static void d_finalizer(SEXP d_ptr) { if(!R_ExternalPtrAddr(d_ptr)) return; cublasFree(R_ExternalPtrAddr(d_ptr)); R_ClearExternalPtr(d_ptr); }
double magma_get_norm_sy(SEXP obj, const char *typstr) { #ifdef HIPLAR_WITH_MAGMA char typnm[] = {'\0', '\0'}; int *dims = INTEGER(GET_SLOT(obj, Matrix_DimSym)); double *work = (double *) NULL; int N = dims[0]; int lda = N; double *A = REAL(GET_SLOT(obj, Matrix_xSym)); typnm[0] = La_norm_type(typstr); const char *c = uplo_P(obj); //Magmablas dlansy only does I & M norms if(GPUFlag == 1 && (*typnm == 'I' || *typnm == 'M')) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing norm using magmablas_dlansy"); #endif double *dwork, *d_A, maxnorm; cublasAlloc(N, sizeof(double), (void**)&dwork); cublasAlloc(lda * N, sizeof(double), (void**)&d_A); cublasSetVector(N * lda, sizeof(double), A, 1, d_A, 1); maxnorm = magmablas_dlansy(typnm[0], *c ,N, d_A, lda, dwork); cublasFree(d_A); cublasFree(dwork); return maxnorm; } else { if (*typnm == 'I' || *typnm == 'O') { work = (double *) R_alloc(dims[0], sizeof(double)); } return F77_CALL(dlansy)(typnm, uplo_P(obj), dims, A, dims, work); } #endif return 0.0; }
long benchmark(int size) { long requestStart, requestEnd; int incx = 1, incy = 1, n = size; double *cuA, *cuB; cublasStatus status; double* a = random_array(size); double* b = random_array(size); status = cublasAlloc(n, sizeof(double),(void**)&cuA); checkStatus("A", status); status = cublasAlloc(n, sizeof(double),(void**)&cuB); checkStatus("B", status); status = cublasSetVector(n, sizeof(double), a, incx, cuA, incx); checkStatus("setA", status); status = cublasSetVector(n, sizeof(double), b, incy, cuB, incy); checkStatus("setB", status); requestStart = currentTimeNanos(); cublasDdot(n, cuA, incx, cuB, incy); requestEnd = currentTimeNanos(); status = cublasFree(cuA); checkStatus("freeA", status); status = cublasFree(cuB); checkStatus("freeB", status); free(a); free(b); return (requestEnd - requestStart); }
SEXP magma_dgeMatrix_matrix_mm(SEXP a, SEXP bP, SEXP right) { #ifdef HIPLAR_WITH_MAGMA SEXP b = PROTECT(mMatrix_as_dgeMatrix(bP)), val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix"))); int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)), *bdims = INTEGER(GET_SLOT(b, Matrix_DimSym)), *cdims = INTEGER(ALLOC_SLOT(val, Matrix_DimSym, INTSXP, 2)); double one = 1.0, zero = 0.0; if (asLogical(right)) { int m = bdims[0], n = adims[1], k = bdims[1]; if (adims[0] != k) error(_("Matrices are not conformable for multiplication")); cdims[0] = m; cdims[1] = n; if (m < 1 || n < 1 || k < 1) { // This was commented out error(_("Matrices with zero extents cannot be multiplied")); ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n); } else { double *B = REAL(GET_SLOT(b, Matrix_xSym)); double *A = REAL(GET_SLOT(a, Matrix_xSym)); double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n)); //TODO add magma here too if(GPUFlag == 1) { double *d_A, *d_B, *d_C; cublasStatus retStatus; #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing matrix multiplication with Right = true using magmablas_dgemm"); #endif cublasAlloc(n * k, sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(m * k, sizeof(double), (void**)&d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(m * n, sizeof(double), (void**)&d_C); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( n * k , sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasSetVector( m * k, sizeof(double), B, 1, d_B, 1 ); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ // ******** magmablas_dgemm call Here ** //magmablas_dgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C, m); //CHANGED 30/07 cublasDgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C, m); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) { error(_("CUBLAS: Error in cublasDgemm routine")); } /********************************************/ cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); cublasFree(d_B); cublasFree(d_C); } else { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing matrix multiplication using dgemm with right = TRUE"); #endif F77_CALL(dgemm) ("N", "N", &m, &n, &k, &one, B, &m, A , &k, &zero, C , &m); } } } else { int m = adims[0], n = bdims[1], k = adims[1]; double *A = REAL(GET_SLOT(a, Matrix_xSym)); double *B = REAL(GET_SLOT(b, Matrix_xSym)); if (bdims[0] != k) error(_("Matrices are not conformable for multiplication")); cdims[0] = m; cdims[1] = n; double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n)); if (m < 1 || n < 1 || k < 1) { // This was commented out error(_("Matrices with zero extents cannot be multiplied")); ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n); } else { if(GPUFlag == 1) { double *d_A, *d_B, *d_C; cublasStatus retStatus; #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing matrix multiplication using magmablas_dgemm"); #endif cublasAlloc(m * k, sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(n * k, sizeof(double), (void**)&d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(m * n, sizeof(double), (void**)&d_C); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( m * k , sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasSetVector( n * k, sizeof(double), B, 1, d_B, 1 ); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ // ******** magmablas_dgemm call Here ** //magmablas_dgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C, m); //CHANGE cublasDgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C, m); retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) { error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ } cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); cublasFree(d_B); cublasFree(d_C); } else { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing matrix multiplication using dgemm"); #endif F77_CALL(dgemm) ("N", "N", &m, &n, &k, &one, A, &m, B, &k, &zero, C, &m); } } } ALLOC_SLOT(val, Matrix_DimNamesSym, VECSXP, 2); UNPROTECT(2); return val; #endif return R_NilValue; }
SEXP magma_dgeMatrix_matrix_solve(SEXP a, SEXP b) { #ifdef HIPLAR_WITH_MAGMA SEXP val = PROTECT(dup_mMatrix_as_dgeMatrix(b)), lu = PROTECT(magma_dgeMatrix_LU_(a, TRUE)); int *adims = INTEGER(GET_SLOT(lu, Matrix_DimSym)), *bdims = INTEGER(GET_SLOT(val, Matrix_DimSym)); int info, n = bdims[0], nrhs = bdims[1]; if (*adims != *bdims || bdims[1] < 1 || *adims < 1 || *adims != adims[1]) error(_("Dimensions of system to be solved are inconsistent")); double *A = REAL(GET_SLOT(lu, Matrix_xSym)); double *B = REAL(GET_SLOT(val, Matrix_xSym)); int *ipiv = INTEGER(GET_SLOT(lu, Matrix_permSym)); if(GPUFlag == 0) { F77_CALL(dgetrs)("N", &n, &nrhs, A, &n, ipiv, B, &n, &info); #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solve using LU using dgetrs;"); #endif }else if(GPUFlag == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solve using LU using magma_dgetrs;"); #endif double *d_A, *d_B; cublasStatus retStatus; cublasAlloc(adims[0] * adims[1], sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation of A on Device")); /********************************************/ cublasAlloc(n * nrhs, sizeof(double), (void**)&d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation of b on Device")); /********************************************/ cublasSetVector(adims[0] * adims[1], sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Transferring data to advice")); /********************************************/ cublasSetVector(n * nrhs, sizeof(double), B, 1, d_B, 1); magma_dgetrs_gpu( 'N', n, nrhs, d_A, n, ipiv, d_B, n, &info ); cublasGetVector(n * nrhs, sizeof(double), d_B, 1, B, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Transferring from to advice")); /********************************************/ cublasFree(d_A); cublasFree(d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in freeing data")); /********************************************/ } if (info) error(_("Lapack routine dgetrs: system is exactly singular")); UNPROTECT(2); return val; #endif return R_NilValue; }
SEXP magma_dgeMatrix_solve(SEXP a) { #ifdef HIPLAR_WITH_MAGMA /* compute the 1-norm of the matrix, which is needed later for the computation of the reciprocal condition number. */ double aNorm = magma_get_norm(a, "1"); /* the LU decomposition : */ /* Given that we may be performing this operation * on the GPU we may put in an optimisation here * where if we call the LU solver we, we do not require * the decomposition to be transferred back to CPU. This is TODO */ SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix"))), lu = magma_dgeMatrix_LU_(a, TRUE); int *dims = INTEGER(GET_SLOT(lu, Matrix_DimSym)), *pivot = INTEGER(GET_SLOT(lu, Matrix_permSym)); /* prepare variables for the dgetri calls */ double *x, tmp; int info, lwork = -1; if (dims[0] != dims[1]) error(_("Solve requires a square matrix")); slot_dup(val, lu, Matrix_xSym); x = REAL(GET_SLOT(val, Matrix_xSym)); slot_dup(val, lu, Matrix_DimSym); int N2 = dims[0] * dims[0]; if(dims[0]) /* the dimension is not zero */ { /* is the matrix is *computationally* singular ? */ double rcond; F77_CALL(dgecon)("1", dims, x, dims, &aNorm, &rcond, (double *) R_alloc(4*dims[0], sizeof(double)), (int *) R_alloc(dims[0], sizeof(int)), &info); if (info) error(_("error [%d] from Lapack 'dgecon()'"), info); if(rcond < DOUBLE_EPS) error(_("Lapack dgecon(): system computationally singular, reciprocal condition number = %g"), rcond); /* only now try the inversion and check if the matrix is *exactly* singular: */ // This is also a work space query. This is not an option in magma F77_CALL(dgetri)(dims, x, dims, pivot, &tmp, &lwork, &info); lwork = (int) tmp; if( GPUFlag == 0){ F77_CALL(dgetri)(dims, x, dims, pivot, (double *) R_alloc((size_t) lwork, sizeof(double)), &lwork, &info); #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solve using LU using dgetri;"); #endif } else if(GPUFlag == 1) { double *d_x, *dwork; cublasStatus retStatus; #ifdef HIPLAR_DBG R_ShowMessage("Solve using LU using magma_dgetri;"); #endif cublasAlloc(N2, sizeof(double), (void**)&d_x); //cublasAlloc(N2 , sizeof(double), (void**)&dtmp); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation on Device")); /********************************************/ cublasSetVector( N2, sizeof(double), x, 1, d_x, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ lwork = dims[0] * magma_get_dgetri_nb( dims[0] ); cublasAlloc(lwork, sizeof(double), (void**)&dwork); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation on Device")); /********************************************/ magma_dgetri_gpu(dims[0], d_x, dims[0], pivot, dwork , lwork, &info); cublasGetVector(N2, sizeof(double), d_x, 1, x, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data From to Device")); /********************************************/ cublasFree(dwork); cublasFree(d_x); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error freeing memory")); /********************************************/ } else error(_("GPUFlag not set correctly")); if (info) error(_("Lapack routine dgetri: system is exactly singular")); } UNPROTECT(1); return val; #endif return R_NilValue; }
SEXP magma_dpoMatrix_chol(SEXP x) { #ifdef HIPLAR_WITH_MAGMA SEXP val = get_factors(x, "Cholesky"), dimP = GET_SLOT(x, Matrix_DimSym), uploP = GET_SLOT(x, Matrix_uploSym); const char *uplo = CHAR(STRING_ELT(uploP, 0)); int *dims = INTEGER(dimP), info; int n = dims[0]; double *vx; cublasStatus retStatus; if (val != R_NilValue) return val; dims = INTEGER(dimP); val = PROTECT(NEW_OBJECT(MAKE_CLASS("Cholesky"))); SET_SLOT(val, Matrix_uploSym, duplicate(uploP)); SET_SLOT(val, Matrix_diagSym, mkString("N")); SET_SLOT(val, Matrix_DimSym, duplicate(dimP)); vx = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, n * n)); AZERO(vx, n * n); //we could put in magmablas_dlacpy but it only //copies all of the matrix F77_CALL(dlacpy)(uplo, &n, &n, REAL(GET_SLOT(x, Matrix_xSym)), &n, vx, &n); if (n > 0) { if(GPUFlag == 0){ #ifdef HIPLAR_DBG R_ShowMessage("DBG: Cholesky decomposition using dpotrf;"); #endif F77_CALL(dpotrf)(uplo, &n, vx, &n, &info); } else if(GPUFlag == 1 && Interface == 0){ #ifdef HIPLAR_DBG R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf;"); #endif int nrows, ncols; nrows = ncols = n; magma_int_t lda; lda = nrows; magma_dpotrf(uplo[0], ncols, vx, lda, &info); /* Error Checking */ retStatus = cudaGetLastError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in magma_dpotrf")); /********************************************/ } else if(GPUFlag == 1 && Interface == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf_gpu;"); #endif double *d_c; int nrows, ncols; nrows = ncols = n; int N2 = nrows * ncols; magma_int_t lda; lda = nrows; cublasAlloc(lda * ncols, sizeof(double), (void**)&d_c); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector(N2, sizeof(double), vx, 1, d_c, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Date Transfer to Device")); /********************************************/ magma_dpotrf_gpu(uplo[0], ncols, d_c, lda, &info); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in magma_dpotrf_gpu")); /********************************************/ cublasGetVector(nrows * ncols, sizeof(double), d_c, 1, vx, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Date Transfer from Device")); /********************************************/ cublasFree(d_c); } else error(_("MAGMA/LAPACK/Interface Flag not defined correctly")); } if (info) { if(info > 0) error(_("the leading minor of order %d is not positive definite"), info); else /* should never happen! */ error(_("Lapack routine %s returned error code %d"), "dpotrf", info); } UNPROTECT(1); return set_factors(x, val, "Cholesky"); #endif return R_NilValue; }
SEXP magma_dpoMatrix_matrix_solve(SEXP a, SEXP b) { #ifdef HIPLAR_WITH_MAGMA SEXP Chol = magma_dpoMatrix_chol(a), val = PROTECT(duplicate(b)); int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)), *bdims = INTEGER(getAttrib(b, R_DimSymbol)), info; if (!(isReal(b) && isMatrix(b))) error(_("Argument b must be a numeric matrix")); if (*adims != *bdims || bdims[1] < 1 || *adims < 1) error(_("Dimensions of system to be solved are inconsistent")); double *A = REAL(GET_SLOT(Chol, Matrix_xSym)); double *B = REAL(val); //const char *uplo = uplo_P(Chol); //int N = bdims[1]; //There is only a GPU interface for this call //so it will be the default setting if the GPU is on if(GPUFlag == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs_gpu;"); #endif double *d_A, *d_B; const char *uplo = uplo_P(Chol); magma_int_t NRHS = bdims[1]; magma_int_t lda = adims[1]; magma_int_t ldb = bdims[0]; magma_int_t N = adims[0]; cublasStatus retStatus; cublasAlloc(N * lda, sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(N * NRHS, sizeof(double), (void**)&d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( N * lda , sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasSetVector( ldb * NRHS, sizeof(double), B, 1, d_B, 1 ); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ magma_dpotrs_gpu(uplo[0], N, NRHS , d_A, lda, d_B, ldb, &info); cublasGetVector( ldb * NRHS, sizeof(double), d_B, 1, B, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); cublasFree(d_B); } else { F77_CALL(dpotrs)(uplo_P(Chol), adims, bdims + 1, REAL(GET_SLOT(Chol, Matrix_xSym)), adims, REAL(val), bdims, &info); } // Error checking of MAGMA/LAPACK calls if (info) { if(info > 0) error(_("the leading minor of order %d is not positive definite"), info); else /* should never happen! */ error(_("Lapack routine %s returned error code %d"), "dpotrf", info); } UNPROTECT(1); return val; #endif return R_NilValue; }
/* Main */ int main(int argc, char** argv) { if (argc!=5){ fprintf (stderr, "Usage: %s <sizeM> <sizeN> <sizeK> <Nb iter>\n",argv[0]); exit(0); } const int M=strtol(argv[1],0,0); const int N=strtol(argv[2],0,0); const int K=strtol(argv[3],0,0); const int NBITER=strtol(argv[4],0,0); const int NA= M * K; const int NB= K * N; const int NC= M * N; real* h_A; real* h_B; real* h_C; const real alpha = 1.0f; const real beta = 0.0f; #ifdef NVIDIA cublasStatus status; real* d_A = 0; real* d_B = 0; real* d_C = 0; #endif #ifdef COMPARE real* h_C_ref; real error_norm; real ref_norm; real diff; #endif /* Allocate host memory for the matrices */ h_A = (real*)malloc(NA * sizeof(h_A[0])); if (h_A == 0) { fprintf (stderr, "!!!! host memory allocation error (A)\n"); return EXIT_FAILURE; } h_B = (real*)malloc(NB * sizeof(h_B[0])); if (h_B == 0) { fprintf (stderr, "!!!! host memory allocation error (B)\n"); return EXIT_FAILURE; } h_C = (real*)malloc(NC * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } for (int i = 0; i < NA; ++i) h_A[i] = M_PI+(real)i; for (int i = 0; i < NB; ++i) h_B[i] = M_PI+(real)i; #ifdef NVIDIA /* Initialize CUBLAS */ status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate device memory for the matrices */ status = cublasAlloc(NA, sizeof(d_A[0]), (void**)&d_A); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (A)\n"); return EXIT_FAILURE; } status = cublasAlloc(NB, sizeof(d_B[0]), (void**)&d_B); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (B)\n"); return EXIT_FAILURE; } status = cublasAlloc(NC, sizeof(d_C[0]), (void**)&d_C); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (C)\n"); return EXIT_FAILURE; } /* Initialize the device matrices with the host matrices */ status = cublasSetVector(NA, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(NB, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write B)\n"); return EXIT_FAILURE; } status = cublasSetVector(NC, sizeof(h_C[0]), h_C, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write C)\n"); return EXIT_FAILURE; } /* Clear last error */ cublasGetError(); #endif #ifdef COMPARE /* Performs operation using plain C code */ for (int i=0;i<NBITER;i++) c_xgemm(M,N,K, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* Allocate host memory for reading back the result from device memory */ h_C = (real*)malloc(NC * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } #endif #ifdef NVIDIA /* Performs operation using cublas */ for (int i=0;i<NBITER;i++) //We must Change the order of the parameter as cublas take //matrix as colomn major and C matrix is row major cublasSgemm('n', 'n', N, M, K, alpha, d_B, N, d_A, K, beta, d_C, N); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(NC, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (read C)\n"); return EXIT_FAILURE; } #elif defined( CXGEMM ) for (int i=0;i<NBITER;i++) c_xgemm(M,N,K, alpha, h_A, h_B, beta, h_C); #else char transa='N', transb='N'; for (int i=0;i<NBITER;i++) sgemm_(&transb, &transa, &N, &M, &K, &alpha, h_B, &N, h_A, &K, &beta, h_C, &N); #endif #ifdef COMPARE /* Check result against reference */ error_norm = 0; ref_norm = 0; for (int i = 0; i < NC; ++i) { diff = h_C_ref[i] - h_C[i]; error_norm += diff * diff; ref_norm += h_C_ref[i] * h_C_ref[i]; } error_norm = (float)sqrt((double)error_norm); ref_norm = (float)sqrt((double)ref_norm); if (fabs(ref_norm) < 1e-7) { fprintf (stderr, "!!!! reference norm is 0\n"); return EXIT_FAILURE; } printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED"); #endif /* Memory clean up */ free(h_A); free(h_B); free(h_C); #ifdef NVIDIA status = cublasFree(d_A); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; } status = cublasFree(d_B); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (B)\n"); return EXIT_FAILURE; } status = cublasFree(d_C); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (C)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasShutdown(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } #endif // if (argc <= 1 || strcmp(argv[1], "-noprompt")) { // printf("\nPress ENTER to exit...\n"); // getchar(); // } return EXIT_SUCCESS; }
SEXP magma_dgeMatrix_LU_(SEXP x, Rboolean warn_sing) { #ifdef HIPLAR_WITH_MAGMA SEXP val = get_factors(x, "LU"); int *dims, npiv, info; if (val != R_NilValue) { // R_ShowMessage("already in slot"); /* nothing to do if it's there in 'factors' slot */ return val; } dims = INTEGER(GET_SLOT(x, Matrix_DimSym)); if (dims[0] < 1 || dims[1] < 1) error(_("Cannot factor a matrix with zero extents")); npiv = (dims[0] < dims[1]) ? dims[0] : dims[1]; val = PROTECT(NEW_OBJECT(MAKE_CLASS("denseLU"))); slot_dup(val, x, Matrix_xSym); slot_dup(val, x, Matrix_DimSym); double *h_R = REAL(GET_SLOT(val, Matrix_xSym)); int *ipiv = INTEGER(ALLOC_SLOT(val, Matrix_permSym, INTSXP, npiv)); if(GPUFlag == 0){ #ifdef HIPLAR_DBG R_ShowMessage("DBG: LU decomposition using dgetrf;"); #endif F77_CALL(dgetrf)(dims, dims + 1, h_R, dims, ipiv, &info); } else if(GPUFlag == 1 && Interface == 0){ #ifdef HIPLAR_DBG R_ShowMessage("DBG: LU decomposition using magma_dgetrf;"); #endif magma_dgetrf(dims[0], dims[1], h_R, dims[0], ipiv, &info); } else if(GPUFlag == 1 && Interface == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: LU decomposition using magma_dgetrf_gpu;"); #endif double *d_A; int N2 = dims[0] * dims[1]; cublasStatus retStatus; cublasAlloc( N2 , sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector(N2, sizeof(double), h_R, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Date Transfer to Device")); /********************************************/ magma_dgetrf_gpu(dims[0],dims[1], d_A, dims[0], ipiv, &info); cublasGetVector( N2, sizeof(double), d_A, 1, h_R, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Date Transfer from Device")); /********************************************/ cublasFree(d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error freeing data")); /********************************************/ } else error(_("MAGMA/LAPACK/Interface Flag not defined correctly")); if (info < 0) error(_("Lapack routine %s returned error code %d"), "dgetrf", info); else if (info > 0 && warn_sing) warning(_("Exact singularity detected during LU decomposition: %s, i=%d."), "U[i,i]=0", info); UNPROTECT(1); return set_factors(x, val, "LU"); #endif return R_NilValue; }
extern "C" magma_int_t magma_zgeqrf2(magma_context *cntxt, magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.5.0-beta3) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date July 2014 Purpose ======= ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments ========= CNTXT (input) MAGMA_CONTEXT CNTXT specifies the MAGMA hardware context for this routine. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using cudaMallocHost. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using cudaMallocHost. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -8, the GPU memory allocation failed Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) #define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1)) int cnt=-1; cuDoubleComplex c_one = MAGMA_Z_ONE; int i, k, lddwork, old_i, old_ib; int nbmin, nx, ib, ldda; *info = 0; magma_qr_params *qr_params = (magma_qr_params *)cntxt->params; int nb = qr_params->nb; int lwkopt = n * nb; work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 ); long int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,n) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } else if (lquery) return MAGMA_SUCCESS; k = min(m,n); if (k == 0) { work[0] = c_one; return MAGMA_SUCCESS; } cublasStatus status; static cudaStream_t stream[2]; cudaStreamCreate(&stream[0]); cudaStreamCreate(&stream[1]); nbmin = 2; nx = nb; lddwork = ((n+31)/32)*32; ldda = ((m+31)/32)*32; cuDoubleComplex *da; status = cublasAlloc((n)*ldda + nb*lddwork, sizeof(cuDoubleComplex), (void**)&da); if (status != CUBLAS_STATUS_SUCCESS) { *info = -8; return 0; } cuDoubleComplex *dwork = da + ldda*(n); if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ cudaMemcpy2DAsync(da_ref(0,nb), ldda*sizeof(cuDoubleComplex), a_ref(0,nb), lda *sizeof(cuDoubleComplex), sizeof(cuDoubleComplex)*(m), (n-nb), cudaMemcpyHostToDevice,stream[0]); old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); if (i>0){ cudaMemcpy2DAsync( a_ref(i,i), lda *sizeof(cuDoubleComplex), da_ref(i,i), ldda*sizeof(cuDoubleComplex), sizeof(cuDoubleComplex)*(m-i), ib, cudaMemcpyDeviceToHost,stream[1]); cudaMemcpy2DAsync( a_ref(0,i), lda *sizeof(cuDoubleComplex), da_ref(0,i), ldda*sizeof(cuDoubleComplex), sizeof(cuDoubleComplex)*i, ib, cudaMemcpyDeviceToHost,stream[0]); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, da_ref(old_i, old_i), ldda, dwork, lddwork, da_ref(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork); } cudaStreamSynchronize(stream[1]); int rows = m-i; cnt++; cntxt->nb = qr_params->ib; magma_zgeqrf_mc(cntxt, &rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info); cntxt->nb = nb; /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, a_ref(i,i), &lda, tau+i, qr_params->t+cnt*nb*nb, &ib); if (cnt < qr_params->np_gpu) { qr_params->p[cnt]=a; } zpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, qr_params->w+cnt*qr_params->nb*qr_params->nb); cublasSetMatrix(rows, ib, sizeof(cuDoubleComplex), a_ref(i,i), lda, da_ref(i,i), ldda); if (qr_params->flag == 1) zq_to_panel(MagmaUpper, ib, a_ref(i,i), lda, qr_params->w+cnt*qr_params->nb*qr_params->nb); if (i + ib < n) { cublasSetMatrix(ib, ib, sizeof(cuDoubleComplex), qr_params->t+cnt*nb*nb, ib, dwork, lddwork); if (i+ib < k-nx) /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, da_ref(i, i ), ldda, dwork, lddwork, da_ref(i, i+ib), ldda, dwork+ib, lddwork); else magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, da_ref(i, i ), ldda, dwork, lddwork, da_ref(i, i+ib), ldda, dwork+ib, lddwork); old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; if (i!=0) cublasGetMatrix(m, ib, sizeof(cuDoubleComplex), da_ref(0,i), ldda, a_ref(0,i), lda); int rows = m-i; cnt++; lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info); if (cnt < qr_params->np_gpu) { int ib2=min(ib,nb); lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib2, a_ref(i,i), &lda, tau+i, qr_params->t+cnt*nb*nb, &ib2); qr_params->p[cnt]=a; } } cudaStreamDestroy( stream[0] ); cudaStreamDestroy( stream[1] ); cublasFree(da); return MAGMA_SUCCESS; } /* magma_zgeqrf */
/* ========================================================================== */ int sci_gpuLU(char *fname) { CheckRhs(1,2); CheckLhs(2,2); #ifdef WITH_CUDA cublasStatus status; #endif SciErr sciErr; int* piAddr_A = NULL; double* h_A = NULL; double* hi_A = NULL; int rows_A; int cols_A; int* piAddr_Opt = NULL; double* option = NULL; int rows_Opt; int cols_Opt; void* d_A = NULL; int na; void* pvPtr = NULL; int size_A = sizeof(double); bool bComplex_A = FALSE; int inputType_A; int inputType_Opt; double res; int posOutput = 1; try { sciErr = getVarAddressFromPosition(pvApiCtx, 1, &piAddr_A); if(sciErr.iErr) throw sciErr; if(Rhs == 2) { sciErr = getVarAddressFromPosition(pvApiCtx, 2, &piAddr_Opt); if(sciErr.iErr) throw sciErr; sciErr = getVarType(pvApiCtx, piAddr_Opt, &inputType_Opt); if(sciErr.iErr) throw sciErr; if(inputType_Opt == sci_matrix) { sciErr = getMatrixOfDouble(pvApiCtx, piAddr_Opt, &rows_Opt, &cols_Opt, &option); if(sciErr.iErr) throw sciErr; } else throw "Option syntax is [number,number]."; } else { rows_Opt=1; cols_Opt=2; option = (double*)malloc(2*sizeof(double)); option[0]=0; option[1]=0; } if(rows_Opt != 1 || cols_Opt != 2) throw "Option syntax is [number,number]."; if((int)option[1] == 1 && !isGpuInit()) throw "gpu is not initialised. Please launch gpuInit() before use this function."; sciErr = getVarType(pvApiCtx, piAddr_A, &inputType_A); if(sciErr.iErr) throw sciErr; #ifdef WITH_CUDA if (useCuda()) { if(inputType_A == sci_pointer) { sciErr = getPointer(pvApiCtx, piAddr_A, (void**)&pvPtr); if(sciErr.iErr) throw sciErr; gpuMat_CUDA* gmat; gmat = static_cast<gpuMat_CUDA*>(pvPtr); if(!gmat->useCuda) throw "Please switch to OpenCL mode before use this data."; rows_A=gmat->rows; cols_A=gmat->columns; if(gmat->complex) { bComplex_A = TRUE; size_A = sizeof(cuDoubleComplex); d_A=(cuDoubleComplex*)gmat->ptr->get_ptr(); } else d_A=(double*)gmat->ptr->get_ptr(); // Initialize CUBLAS status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) throw status; na = rows_A * cols_A; } else if(inputType_A == 1) { // Get size and data if(isVarComplex(pvApiCtx, piAddr_A)) { sciErr = getComplexMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A, &hi_A); if(sciErr.iErr) throw sciErr; size_A = sizeof(cuDoubleComplex); bComplex_A = TRUE; } else { sciErr = getMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A); if(sciErr.iErr) throw sciErr; } na = rows_A * cols_A; // Initialize CUBLAS status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) throw status; // Allocate device memory status = cublasAlloc(na, size_A, (void**)&d_A); if (status != CUBLAS_STATUS_SUCCESS) throw status; // Initialize the device matrices with the host matrices if(!bComplex_A) { status = cublasSetMatrix(rows_A,cols_A, sizeof(double), h_A, rows_A, (double*)d_A, rows_A); if (status != CUBLAS_STATUS_SUCCESS) throw status; } else writecucomplex(h_A, hi_A, rows_A, cols_A, (cuDoubleComplex *)d_A); } else throw "Bad argument type."; cuDoubleComplex resComplex; // Performs operation if(!bComplex_A) status = decomposeBlockedLU(rows_A, cols_A, rows_A, (double*)d_A, 1); // else // resComplex = cublasZtrsm(na,(cuDoubleComplex*)d_A); if (status != CUBLAS_STATUS_SUCCESS) throw status; // Put the result in scilab switch((int)option[0]) { case 2 : case 1 : sciprint("The first option must be 0 for this function. Considered as 0.\n"); case 0 : // Keep the result on the Host. { // Put the result in scilab if(!bComplex_A) { double* h_res = NULL; sciErr=allocMatrixOfDouble(pvApiCtx, Rhs + posOutput, rows_A, cols_A, &h_res); if(sciErr.iErr) throw sciErr; status = cublasGetMatrix(rows_A,cols_A, sizeof(double), (double*)d_A, rows_A, h_res, rows_A); if (status != CUBLAS_STATUS_SUCCESS) throw status; } else { sciErr = createComplexMatrixOfDouble(pvApiCtx, Rhs + posOutput, 1, 1, &resComplex.x,&resComplex.y); if(sciErr.iErr) throw sciErr; } LhsVar(posOutput)=Rhs+posOutput; posOutput++; break; } default : throw "First option argument must be 0 or 1 or 2."; } switch((int)option[1]) { case 0 : // Don't keep the data input on Device. { if(inputType_A == sci_matrix) { status = cublasFree(d_A); if (status != CUBLAS_STATUS_SUCCESS) throw status; d_A = NULL; } break; } case 1 : // Keep data of the fisrt argument on Device and return the Device pointer. { if(inputType_A == sci_matrix) { gpuMat_CUDA* dptr; gpuMat_CUDA tmp={getCudaContext()->genMatrix<double>(getCudaQueue(),rows_A*cols_A),rows_A,cols_A}; dptr=new gpuMat_CUDA(tmp); dptr->useCuda = true; dptr->ptr->set_ptr((double*)d_A); if(bComplex_A) dptr->complex=TRUE; else dptr->complex=FALSE; sciErr = createPointer(pvApiCtx,Rhs+posOutput, (void*)dptr); if(sciErr.iErr) throw sciErr; LhsVar(posOutput)=Rhs+posOutput; } else throw "The first input argument is already a GPU variable."; posOutput++; break; } default : throw "Second option argument must be 0 or 1."; } // Shutdown status = cublasShutdown(); if (status != CUBLAS_STATUS_SUCCESS) throw status; } #endif #ifdef WITH_OPENCL if (!useCuda()) { throw "not implemented with OpenCL."; } #endif if(Rhs == 1) { free(option); option = NULL; } if(posOutput < Lhs+1) throw "Too many output arguments."; if(posOutput > Lhs+1) throw "Too few output arguments."; PutLhsVar(); return 0; } catch(const char* str) { Scierror(999,"%s\n",str); } catch(SciErr E) { printError(&E, 0); } #ifdef WITH_CUDA catch(cudaError_t cudaE) { GpuError::treat_error<CUDAmode>((CUDAmode::Status)cudaE); } catch(cublasStatus CublasE) { GpuError::treat_error<CUDAmode>((CUDAmode::Status)CublasE,1); } if (useCuda()) { if(inputType_A == 1 && d_A != NULL) cudaFree(d_A); } #endif #ifdef WITH_OPENCL if (!useCuda()) { Scierror(999,"not implemented with OpenCL.\n"); } #endif if(Rhs == 1 && option != NULL) free(option); return EXIT_FAILURE; }
void psgetrf_gpu(int *m_in, int *n_in, float *A, int *ia_in, int *ja_in, int *descA, int *ipiv_, int *info) { int m = *m_in; int n = *n_in; int ia = *ia_in; int ja = *ja_in; const int use_setup_desc = TRUE; const int idebug = 0; int use_replicated_storage = FALSE; const int use_broadcast_triangular_matrix = TRUE; int ia_proc, ja_proc; int lrindx, lcindx, rsrc,csrc, irsrc,icsrc; int ictxt, nprow,npcol, myprow,mypcol; int is_root; int minmn; int k1,k2,incx,ip; int mm, nn, kk, ii, jj, mtmp; int mm_lu,nn_lu,ia_lu,ja_lu; int elemSize = sizeof( float ); size_t nbytes; int nnb, jstart,jend,jsize, isize, jb; int icontxt, isizeAtmp; int i,j, iia,jja, ldA, ldhA; int iinfo = 0; int iAtmp, jAtmp, iha,jha, iib,jjb,iic,jjc; int ldAtmp, ldBtmp, lmm,lnn; int lrA1,lcA1, lrA2,lcA2; int desc_hA_[DLEN_]; int *desc_hA = &(desc_hA_[0]); int *ipiv_hA_ = 0; float *hA = 0; float *Atmp = 0; float *dAtmp = 0; int *gipiv_ = 0; int desc_Atmp_[DLEN_]; int *desc_Atmp = &(desc_Atmp_[0]); cublasStatus cu_status; int isok; int use_delayed_left_interchange = 1; int is_mine; int i1,j1,inc1, i2,j2,inc2; int desc_ipiv_hA_[DLEN_]; int *desc_ipiv_hA = &(desc_ipiv_hA_[0]); int desc_ipiv_[DLEN_]; int *desc_ipiv = &(desc_ipiv_[0]); int desc_gipiv_[DLEN_]; int *desc_gipiv = &(desc_gipiv_[0]); int mb,nb, Locp, Locq, lld; char direc = 'F'; char rowcol = 'R'; char left[] = "Left"; char lower[] = "Lower"; char notrans[] = "NoTrans"; char unit[] = "Unit"; char *side = left; char *uplo = lower; char *trans = notrans; char *diag = unit; float zero_[REAL_PART+IMAG_PART+1]; float *zero = &(zero_[0]); float one_[REAL_PART+IMAG_PART+1]; float *one = &(one_[0]); float neg_one_[REAL_PART+IMAG_PART+1]; float *neg_one = &(neg_one_[0]); float beta_[REAL_PART+IMAG_PART+1]; float *beta = &(beta_[0]); float alpha_[REAL_PART+IMAG_PART+1]; float *alpha = &(alpha_[0]); /* * A is a pointer to GPU device memory but conceptually associated * with a scalapack distributed matrix * A is array of complex numbers */ *info = 0; zero[REAL_PART] = 0.0; zero[IMAG_PART] = 0.0; one[REAL_PART] = 1.0; one[IMAG_PART] = 0.0; neg_one[REAL_PART] = -1.0; neg_one[IMAG_PART] = 0.0; /* * setup copy of distributed matrix on CPU host */ hA = 0; Atmp = 0; ictxt = descA[CTXT_]; icontxt = ictxt; Cblacs_gridinfo( ictxt, &nprow, &npcol, &myprow, &mypcol ); is_root = (myprow == 0) && (mypcol == 0); if ((idebug >= 1) && (is_root)) { printf("pcgetrf_gpu: m %d n %d ia %d ja %d \n", m,n, ia,ja ); }; ia_proc = Cindxg2p( ia, descA[MB_], myprow, descA[RSRC_], nprow); ja_proc = Cindxg2p( ja, descA[NB_], mypcol, descA[CSRC_], npcol); /* * setup global pivot vector */ lld = MIN(m,n) + descA[MB_]; nbytes = lld; nbytes *= sizeof(int); if (gipiv_ != 0) { free(gipiv_); gipiv_ = 0; }; gipiv_ = (int *) malloc( nbytes ); assert( gipiv_ != 0 ); desc_gipiv[DTYPE_] = descA[DTYPE_]; desc_gipiv[CTXT_] = descA[CTXT_]; desc_gipiv[M_] = MIN(m,n); desc_gipiv[N_] = 1; desc_gipiv[MB_] = desc_gipiv[M_]; desc_gipiv[NB_] = desc_gipiv[N_]; desc_gipiv[LLD_] = lld; desc_gipiv[RSRC_] = -1; desc_gipiv[CSRC_] = -1; /* * setup distribute array hA on host */ /* * Note, optimal block size on GPU might not be * optimal block size on CPU, but assume to be * the same for simplicity for now */ /* * should nnb = descA[NB_] * npcol ? */ nnb = descA[NB_]; minmn = MIN(m,n); for( jstart=1; jstart <= minmn; jstart = jend + 1) { jend = MIN( minmn, jstart + nnb - 1); jsize = jend - jstart + 1; /* * setup matrix on host */ /* was iia = (ia-1) + 1; */ j = jstart; jb = jsize; iia = (ia-1) + jstart; jja = (ja-1) + jstart; mm = m - jstart + 1; nn = jsize; if (use_setup_desc) { setup_desc( mm,nn, iia,jja,descA, &isize, desc_hA ); } else { irsrc = Cindxg2p( iia, descA[MB_], myprow, descA[RSRC_], nprow ); icsrc = Cindxg2p( jja, descA[NB_], mypcol, descA[CSRC_], npcol ); mb = descA[MB_]; nb = descA[NB_]; Locp = Cnumroc( mm, mb, 0,0,nprow ); Locq = Cnumroc( nn, nb, 0,0,npcol ); lld = MAX(1,Locp); isize = MAX(1,Locp) * MAX(1, Locq ); ictxt = descA[CTXT_]; iinfo = 0; Cdescinit( desc_hA, mm,nn, mb,nb, irsrc,icsrc, ictxt, lld, &iinfo); assert( iinfo == 0); }; nbytes = isize; nbytes *= elemSize; if (hA != 0) { free(hA); hA = 0; }; hA = (float *) malloc( nbytes ); assert( hA != 0 ); /* * distribution of pivot vector is tied to distribution of matrix */ Locp = Cnumroc( desc_hA[M_], desc_hA[MB_], myprow, desc_hA[RSRC_], nprow); lld = Locp + desc_hA[MB_]; nbytes = lld; nbytes *= sizeof(int); if (ipiv_hA_ != 0) { free( ipiv_hA_ ); ipiv_hA_ = 0; }; ipiv_hA_ = (int *) malloc( nbytes ); assert( ipiv_hA_ != 0); Cdescset( desc_ipiv_hA, desc_hA[M_], 1, desc_hA[MB_], 1, desc_hA[RSRC_], icsrc, desc_hA[CTXT_], lld ); /* copy column panel back to CPU host to be factored using scalapack */ jb = jsize; j = jstart; mm = m - j + 1; nn = jb; /* hA(1:mm,1:nn) <- dA(j:(j+mm-1), j:(j+nn-1) ) */ iia = (ia-1) + j; jja = (ja-1) + j; ii = 1; jj = 1; PROFSTART("gpu:hA <- dA"); Cpsgecopy_d2h( mm,nn, A,iia,jja,descA, hA, ii,jj, desc_hA ); PROFEND("gpu:hA <- dA"); /* * factor on host CPU using ScaLAPACK * Note the pivot vector is tied to the distribution of the matrix * Therefore, we need a different "ipiv_hA" pivot vector * that is tied the the distributed matrix hA */ ii = 1; jj = 1; iinfo = 0; mm_lu = mm; nn_lu = nn; ia_lu = ii; ja_lu = jj; PROFSTART("gpu:psgetrf"); scalapack_psgetrf( &mm_lu, &nn_lu, hA, &ia_lu, &ja_lu, desc_hA, &(ipiv_hA(1)), &iinfo ); PROFEND("gpu:psgetrf"); /* * broadcast pivot vector to global vector */ i1 = 1; j1 = 1; inc1 = 1; i2 = jstart; j2 = 1; inc2 = 1; mtmp = MIN(mm,nn); desc_ipiv_hA[CSRC_] = icsrc; use_replicated_storage = FALSE; if (use_replicated_storage) { int ja_lu_proc; ja_lu_proc = Cindxg2p(ja_lu,desc_hA[NB_], mypcol,desc_hA[CSRC_],npcol); desc_ipiv_hA[CSRC_] = ja_lu_proc; desc_gipiv[RSRC_] = -1; desc_gipiv[CSRC_] = -1; scalapack_picopy( &mtmp, &(ipiv_hA(1)), &i1,&j1, desc_ipiv_hA, &inc1, &(gipiv(1)), &i2,&j2, desc_gipiv, &inc2 ); } else { /* * copy to 1 processors (rsrc,csrc), then * broadcast to all processors */ int icontxt = desc_ipiv_hA[CTXT_]; char scope = 'A'; char top = ' '; int ntmp = 1; int lld; int ia_lu_proc,ja_lu_proc; int rsrc, csrc; ia_lu_proc = Cindxg2p( ia_lu, desc_hA[MB_], myprow,desc_hA[RSRC_],nprow); ja_lu_proc = Cindxg2p( ja_lu, desc_hA[NB_], mypcol,desc_hA[CSRC_],npcol); rsrc = ia_lu_proc; csrc = ja_lu_proc; desc_gipiv[RSRC_] = rsrc; desc_gipiv[CSRC_] = csrc; desc_ipiv_hA[CSRC_] = csrc; mtmp = MIN( mm_lu, nn_lu); scalapack_picopy( &mtmp, &(ipiv_hA(1)), &i1,&j1,desc_ipiv_hA,&inc1, &(gipiv(1)), &i2,&j2, desc_gipiv, &inc2 ); if ((myprow == rsrc) && (mypcol == csrc)) { lld = mtmp; ntmp = 1; scalapack_igebs2d( &icontxt, &scope, &top, &mtmp, &ntmp, &(gipiv(i2)), &lld ); } else { lld = mtmp; ntmp = 1; scalapack_igebr2d( &icontxt, &scope, &top, &mtmp, &ntmp, &(gipiv(i2)), &lld, &rsrc,&csrc ); }; }; if (idebug >= 1) { int desctmp[DLEN_]; char name_ipiv_hA[] = "ipiv_hA"; char name_gipiv[] = "gipiv"; if (is_root) { printf("jstart %d jend %d \n", jstart,jend); printf("mm_lu %d nn_lu %d ia_lu %d ja_lu %d\n", mm_lu, nn_lu, ia_lu, ja_lu ); }; Cdescset(desctmp, desc_hA[M_], npcol, desc_hA[MB_],1, desc_hA[RSRC_], desc_hA[CSRC_], desc_hA[CTXT_], desc_hA[LLD_] ); Cpilaprnt( MIN(mm_lu,nn_lu), npcol, &(ipiv_hA(1)), 1,1,desctmp, name_ipiv_hA); Cdescset(desctmp, minmn*nprow, npcol, minmn, 1, 0,0, descA[CTXT_], minmn ); Cpilaprnt( nprow*minmn, npcol, &(gipiv(1)),1,1,desctmp, name_gipiv); }; /* * adjust pivot sequence from 1:min(mm,nn) in ipiv to * jstart:(jstart+min(mm,nn)-1) */ for(int i=1; i <= MIN(mm,nn); i++) { i2 = (jstart-1) + i; gipiv(i2) = gipiv(i2) + (jstart-1); }; if (iinfo < 0) { *info = iinfo; return; }; if ((*info == 0) && (iinfo > 0)) { *info = iinfo + (j-1); return; }; /* * transfer factored panel back to GPU device */ iia = (ia-1) + j; jja = (ja-1) + j; ii = 1; jj = 1; PROFSTART("gpu:A <- hA"); Cpsgecopy_h2d(mm,nn, hA, ii,jj, desc_hA, A, iia,jja, descA ); PROFEND("gpu:A <- hA"); if (use_delayed_left_interchange) { /* * do nothing for now */ } else { /* * apply interchanges to columns 1:(j-1) */ nn = j-1; k1 = j; k2 = j + jb-1; incx = 1; PROFSTART("gpu:left swap"); if (nn >= 1) { iia = (ia-1) + 1; jja = (ja-1) + 1; for(kk=k1; kk <= k2; kk++) { ip = gipiv( kk); assert(ip >= kk ); assert( ip <= m ); if (kk != ip) { inc1 = descA[M_]; inc2 = descA[M_]; i1 = (iia-1) + kk; i2 = (iia-1) + ip; j1 = jja; j2 = jja; Cpsswap_gpu(nn, A,i1,j1,descA,inc1, A,i2,j2,descA,inc2 ); }; }; }; PROFEND("gpu:left swap"); }; /* * apply interchanges to columns (j+jb):n */ nn = n - (jend + 1) + 1; k1 = j; k2 = j + jb - 1; incx = 1; PROFSTART("gpu:right swap"); if (nn >= 1) { iia = (ia-1) + 1; jja = (ja-1) + (jend+1); for(kk=k1; kk <= k2; kk++) { ip = gipiv( kk ); assert( ip >= kk ); assert( ip <= m ); if (ip != kk) { i1 = (iia-1) + kk; i2 = (iia-1) + ip; j1 = jja; j2 = jja; inc1 = descA[M_]; inc2 = descA[M_]; Cpsswap_gpu( nn, A, i1,j1, descA, inc1, A, i2,j2, descA, inc2 ); }; }; }; PROFEND("gpu:right swap"); PROFSTART("gpu:pTRSM"); mm = jb; nn = n - (jend+1) + 1; if ( (1 <= mm) && (1 <= nn)) { /* cublasCtrsm('L','L','N','U', mm,nn, alpha, dA(j,j), lddA, dA(j,j+jb), lddA ); */ if (use_broadcast_triangular_matrix) { /* * broadcast triangular part, then solve locally */ char lscope = 'A'; char ltop = ' '; int msize, nsize, lr1,lc1, lr2,lc2; int ia_lu_proc, ja_lu_proc; /* * copy on local processor */ ia_lu_proc = Cindxg2p(ia_lu, desc_hA[MB_], myprow, desc_hA[RSRC_], nprow ); ja_lu_proc = Cindxg2p(ja_lu, desc_hA[NB_], mypcol, desc_hA[CSRC_], npcol ); /* * complete mm by mm block on Atmp */ ldAtmp = MAX(1,mm); Cdescset(desc_Atmp, mm,mm, mm,mm, ia_lu_proc,ja_lu_proc, icontxt, ldAtmp); isizeAtmp = ldAtmp * MAX(1,mm); nbytes = isizeAtmp; nbytes *= elemSize; if (Atmp != 0) { free(Atmp); Atmp = 0; }; Atmp = (float *) malloc( nbytes ); assert( Atmp != 0); #ifdef USE_CUBLASV2 { cudaError_t ierr; size_t isize = isizeAtmp; isize *= elemSize; ierr = cudaMalloc( (void **) &dAtmp, isize ); assert(ierr == cudaSuccess ); } #else cu_status = cublasAlloc(isizeAtmp, elemSize, (void **) &dAtmp ); CHKERR(cu_status); assert( dAtmp != 0); #endif ii = 1; jj = 1; scalapack_psgeadd( notrans, &mm, &mm, one, hA, &ia_lu, &ja_lu, desc_hA, zero, Atmp, &ii, &jj, desc_Atmp ); rsrc = desc_Atmp[RSRC_]; csrc = desc_Atmp[CSRC_]; if ((myprow == rsrc) && (mypcol == csrc)) { scalapack_cgebs2d( &icontxt, &lscope, <op, &mm, &mm, Atmp, &ldAtmp ); } else { scalapack_cgebr2d( &icontxt, &lscope, <op, &mm, &mm, Atmp, &ldAtmp, &rsrc, &csrc ); }; inc1 = 1; inc2 = 1; cu_status = cublasSetVector(isizeAtmp, elemSize, Atmp, inc1, dAtmp, inc2 ); CHKERR(cu_status); /* * perform local solve on GPU */ iia = (ia-1) + j; jja = (ja-1) + (j+jb); local_extent( mm,nn, iia,jja,descA, &msize,&nsize, &lr1,&lc1, &lr2,&lc2 ); if (msize >= 1) { assert( msize == mm ); }; if ((msize >= 1) && (nsize >= 1)) { char lside = 'L'; char luplo = 'L'; char ltrans = 'N'; char ldiag = 'U'; float zalpha; zalpha = (float)1.0;//make_float(1.0,0.0); CUBLAS_STRSM( ((lside == 'l')||(lside == 'L')) ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT, ((luplo == 'l')||(luplo == 'L')) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER, ((ltrans == 'c')||(ltrans == 'C')) ? CUBLAS_OP_C : ((ltrans == 't')||(ltrans == 'T')) ? CUBLAS_OP_T : CUBLAS_OP_N, ((ldiag == 'u')||(ldiag == 'U')) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT, mm, nsize, zalpha, (float *) dAtmp, ldAtmp, dA(lr1,lc1), descA[LLD_] ); }; if (Atmp != 0) { free(Atmp); Atmp = 0; }; #ifdef USE_CUBLASV2 { cudaError_t ierr; ierr = cudaFree( (void *) dAtmp ); assert(ierr == cudaSuccess ); dAtmp = 0; } #else cu_status = cublasFree( dAtmp ); CHKERR(cu_status ); #endif } else { /* * perform triangular solve using scalapack */ iia = (ia-1) + j; jja = (ja-1) + (j+jb); setup_desc(mm,nn,iia,jja,descA, &isize, desc_Atmp ); nbytes = elemSize; nbytes *= isize; if (Atmp != 0) { free(Atmp); Atmp = 0; }; Atmp = (float *) malloc( nbytes ); assert( Atmp != 0 ); /* * copy to Atmp(1:mm,1:nn) <- dA(j:(j+mm-1),(j+jb):((j+jb)+nn-1)) */ ii = 1; jj = 1; PROFSTART("gpu:Atmp <- dA"); Cpsgecopy_d2h( mm,nn,A,iia,jja,descA, Atmp, ii,jj, desc_Atmp ); PROFEND("gpu:Atmp <- dA"); /* * perform triangular solve using scalapack */ side = left; uplo = lower; trans = notrans; diag = unit; alpha = one; iha = 1; jha = 1; ii = 1; jj = 1; PROFSTART("gpu:pstrsm") scalapack_pstrsm( side, uplo, trans, diag, &mm,&nn, alpha, hA, &iha,&jha, desc_hA, Atmp,&ii,&jj, desc_Atmp ); PROFEND("gpu:pstrsm") /* * copy back to GPU */ iia = (ia-1) + j; jja = (ja-1) + (j+jb); ii = 1; jj = 1; PROFSTART("gpu:A <- Atmp"); Cpsgecopy_h2d( mm,nn, Atmp,ii,jj,desc_Atmp, A, iia,jja, descA ); PROFEND("gpu:A <- Atmp"); }; }; PROFEND("gpu:pTRSM"); /* * update trailing submatrix */ alpha = neg_one; beta = one; mm = m-(jend+1) + 1; nn = n-(jend+1) + 1; kk = jb; if ((1 <= mm) && (1 <= nn) && (1 <= kk)) { /* cublasSgemm('N','N',mm,nn,kk, alpha, dA(j+jb,j),lddA, dA(j,j+jb),lddA, beta, dA(j+jb,j+jb), lddA ); */ if (use_broadcast_triangular_matrix) { /* * Copy from GPU to Atmp */ iia = (ia-1) + j; jja = (ja-1) + (j+jb); setup_desc( kk,nn, iia,jja, descA, &isizeAtmp, desc_Atmp); nbytes = isizeAtmp; nbytes *= elemSize; if (Atmp != 0) { free(Atmp); Atmp = 0; }; Atmp = (float *) malloc( nbytes ); assert( Atmp != 0); PROFSTART("gpu:Atmp <- A"); Cpsgecopy_d2h( kk,nn, A,iia,jja,descA, Atmp,1,1,desc_Atmp ); PROFEND("gpu:Atmp <- A"); }; iic = (ia-1) + (jend+1); jjc = (ja-1) + (jend+1); iha = jsize+1; jha = 1; iAtmp = 1; jAtmp = 1; { char transA = 'N'; char transB = 'N'; PROFSTART("zgetrf_gpu:psgemm"); Cpsgemm_hhd( transA, transB, mm,nn,kk, alpha, hA, iha,jha, desc_hA, Atmp, iAtmp,jAtmp, desc_Atmp, beta, A, iic,jjc, descA ); PROFEND("zgetrf_gpu:psgemm"); }; }; if (Atmp != 0) { free(Atmp); Atmp = 0; }; if (ipiv_hA_ != 0) { free( ipiv_hA_ ); ipiv_hA_ = 0; }; if (hA != 0) { free(hA); hA = 0; }; }; /* for (jstart) */ if (use_delayed_left_interchange) { PROFSTART("gpu:dleft swap"); for(j=1; j <= minmn; j = jend + 1) { jend = MIN( minmn, j+nnb-1); jsize = jend - j + 1; jb = jsize; /* * apply interchanges to columns 1:(j-1) */ nn = j-1; k1 = j; k2 = j+jb-1; incx = 1; if (nn >= 1) { iia = (ia-1) + 1; jja = (ja-1) + 1; for(kk=k1; kk <= k2; kk++) { ip = gipiv(kk); assert( ip >= kk ); if (ip != kk) { inc1 = descA[M_]; inc2 = descA[M_]; i1 = (iia-1) + kk; i2 = (iia-1) + ip; j1 = jja; j2 = jja; Cpsswap_gpu(nn, A, i1,j1,descA, inc1, A, i2,j2,descA, inc2 ); }; }; }; }; /* end for j */ PROFEND("gpu:dleft swap"); }; /* end if use delayed left interchange */ /* * adjust global pivot from 1:MIN(m,n) to ia:(ia + MIN(m,n)-1) * copy global vector back to distributed pivot vector */ for(int j=1; j <= minmn; j++) { gipiv(j) = (ia-1) + gipiv(j); }; lld = descA[MB_] + Cnumroc( descA[M_], descA[MB_], myprow, descA[RSRC_], nprow); Cdescset( desc_ipiv, descA[M_],1, descA[MB_], 1, descA[RSRC_], -1, descA[CTXT_], lld ); i1 = 1; j1 = 1; inc1 = 1; i2 = ia; j2 = 1; inc2 = 1; mtmp = MIN(m,n); PROFSTART("gpu:ipiv"); use_replicated_storage = FALSE; if (use_replicated_storage) { int msize,nsize,lr1,lc1,lr2,lc2, lrindx,iia; local_extent(MIN(m,n),n,ia,ja,descA, &msize,&nsize, &lr1,&lc1, &lr2,&lc2); if (msize >= 1) { for(lrindx=lr1; lrindx <= lr2; lrindx++) { iia = Cindxl2g( lrindx, descA[MB_], myprow, descA[RSRC_], nprow); ipiv(lrindx) = gipiv( (iia-ia) + 1 ); }; }; } else { /* * copy to a column, then broadcast */ char scope = 'R'; char top = ' '; int Locp, Locq; int lld; int icontxt = desc_ipiv[CTXT_]; desc_ipiv[CSRC_] = ja_proc; desc_gipiv[RSRC_] = ia_proc; desc_gipiv[CSRC_] = ja_proc; mtmp = MIN(m,n); scalapack_picopy( &mtmp, &(gipiv(1)), &i1,&j1, desc_gipiv, &inc1, &(ipiv(1)), &i2, &j2, desc_ipiv, &inc2 ); if (idebug >= 1) { char cmatnm[] = "ipiv after picopy"; if (is_root) { printf("ia_proc %d ja_proc %d i2 %d j2 %d \n",ia_proc,ja_proc,i2,j2); }; Cpilaprnt( mtmp,1, &(ipiv(1)), i2,j2,desc_ipiv, cmatnm); }; Locp = Cnumroc( ia + MIN(m,n)-1, desc_ipiv[MB_], myprow, desc_ipiv[RSRC_], nprow); lld = MAX(1,Locp); Locq = 1; if (npcol > 1) { if (mypcol == ja_proc) { scalapack_igebs2d( &icontxt, &scope, &top, &Locp, &Locq, &(ipiv(1)), &lld ); } else { rsrc = myprow; scalapack_igebr2d( &icontxt, &scope, &top, &Locp, &Locq, &(ipiv(1)), &lld, &rsrc, &ja_proc ); }; }; }; PROFEND("gpu:ipiv"); if (idebug >= 1) { int desctmp[DLEN_]; char cmatnm[] = "final ipiv"; Cdescset( desctmp, descA[M_],npcol, descA[MB_],1, descA[RSRC_], descA[CSRC_], descA[CTXT_], descA[LLD_]); Cpilaprnt( MIN(m,n),npcol, &(ipiv(1)), ia,1,desctmp, cmatnm); }; /* * clean up */ if (Atmp != 0) { free(Atmp); Atmp = 0; }; if (hA != 0) { free(hA); hA = 0; }; if (ipiv_hA_ != 0) { free( ipiv_hA_ ); ipiv_hA_ = 0; }; if (gipiv_ != 0) { free(gipiv_); gipiv_ = 0; }; return; }
SEXP magma_dgeMatrix_crossprod(SEXP x, SEXP trans) { #ifdef HIPLAR_WITH_MAGMA int tr = asLogical(trans);/* trans=TRUE: tcrossprod(x) */ SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dpoMatrix"))), nms = VECTOR_ELT(GET_SLOT(x, Matrix_DimNamesSym), tr ? 0 : 1), vDnms = ALLOC_SLOT(val, Matrix_DimNamesSym, VECSXP, 2); int *Dims = INTEGER(GET_SLOT(x, Matrix_DimSym)), *vDims = INTEGER(ALLOC_SLOT(val, Matrix_DimSym, INTSXP, 2)); int k = tr ? Dims[1] : Dims[0], n = tr ? Dims[0] : Dims[1]; double *vx = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, n * n)), one = 1.0, zero = 0.0; double *A = REAL(GET_SLOT(x, Matrix_xSym)); AZERO(vx, n * n); SET_SLOT(val, Matrix_uploSym, mkString("U")); ALLOC_SLOT(val, Matrix_factorSym, VECSXP, 0); vDims[0] = vDims[1] = n; SET_VECTOR_ELT(vDnms, 0, duplicate(nms)); SET_VECTOR_ELT(vDnms, 1, duplicate(nms)); if(n && GPUFlag == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing crossproduct using cublasDsyrk"); #endif cublasStatus retStatus; double *d_A, *d_C; /*retStatus = cublasCreate(&handle); if ( retStatus != CUBLAS_STATUS_SUCCESS ) error(_("CUBLAS initialisation failed")); */ cublasAlloc(n * k, sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(n * n, sizeof(double), (void**)&d_C); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( n * k , sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ //cublasSetVector( n * n , sizeof(double), vx, 1, d_C, 1); /* Error Checking */ //retStatus = cublasGetError (); //if (retStatus != CUBLAS_STATUS_SUCCESS) // error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasDsyrk('U' , tr ? 'N' : 'T', n, k, one, d_A, Dims[0], zero, d_C, n); cublasGetVector( n * n , sizeof(double), d_C, 1, vx, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); cublasFree(d_C); } else if(n){ #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing cross prod with dsyrk"); #endif F77_CALL(dsyrk)("U", tr ? "N" : "T", &n, &k, &one, A, Dims, &zero, vx, &n); } SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0)); UNPROTECT(1); return val; #endif return R_NilValue; }
SEXP magma_dpoMatrix_solve(SEXP x) { #ifdef HIPLAR_WITH_MAGMA SEXP Chol = magma_dpoMatrix_chol(x); SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dpoMatrix"))); int *dims = INTEGER(GET_SLOT(x, Matrix_DimSym)), info; SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0)); slot_dup(val, Chol, Matrix_uploSym); slot_dup(val, Chol, Matrix_xSym); slot_dup(val, Chol, Matrix_DimSym); SET_SLOT(val, Matrix_DimNamesSym, duplicate(GET_SLOT(x, Matrix_DimNamesSym))); double *A = REAL(GET_SLOT(val, Matrix_xSym)); int N = *dims; int lda = N; const char *uplo = uplo_P(val); if(GPUFlag == 0) { F77_CALL(dpotri)(uplo_P(val), dims, A, dims, &info); } else if(GPUFlag == 1 && Interface == 0) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solving using magma_dpotri"); #endif magma_dpotri(uplo[0], N, A, lda, &info); } else if(GPUFlag == 1 && Interface == 1){ double *d_A; cublasStatus retStatus; cublasAlloc( N * lda , sizeof(double), (void**)&d_A); #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solving using magma_dpotri_gpu"); #endif /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( N * lda, sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ magma_dpotri_gpu(uplo[0], N, d_A, lda, &info); cublasGetVector(N * lda, sizeof(double), d_A, 1, val, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); } else error(_("MAGMA/LAPACK/Interface Flag not defined correctly")); if (info) { if(info > 0) error(_("the leading minor of order %d is not positive definite"), info); else /* should never happen! */ error(_("Lapack routine %s returned error code %d"), "dpotrf", info); } UNPROTECT(1); return val; #endif return R_NilValue; }
SEXP magma_dgeMatrix_matrix_crossprod(SEXP x, SEXP y, SEXP trans) { #ifdef HIPLAR_WITH_MAGMA int tr = asLogical(trans);/* trans=TRUE: tcrossprod(x,y) */ SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix"))); int *xDims = INTEGER(GET_SLOT(x, Matrix_DimSym)), *yDims = INTEGER(getAttrib(y, R_DimSymbol)), *vDims, nprot = 1; int m = xDims[!tr], n = yDims[!tr];/* -> result dim */ int xd = xDims[ tr], yd = yDims[ tr];/* the conformable dims */ double one = 1.0, zero = 0.0; if (isInteger(y)) { y = PROTECT(coerceVector(y, REALSXP)); nprot++; } if (!(isMatrix(y) && isReal(y))) error(_("Argument y must be a numeric matrix")); SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0)); SET_SLOT(val, Matrix_DimSym, allocVector(INTSXP, 2)); vDims = INTEGER(GET_SLOT(val, Matrix_DimSym)); if (xd > 0 && yd > 0 && n > 0 && m > 0) { if (xd != yd) error(_("Dimensions of x and y are not compatible for %s"), tr ? "tcrossprod" : "crossprod"); vDims[0] = m; vDims[1] = n; SET_SLOT(val, Matrix_xSym, allocVector(REALSXP, m * n)); double *A = REAL(GET_SLOT(x, Matrix_xSym)); double *B = REAL(y); double *C = REAL(GET_SLOT(val, Matrix_xSym)); if(GPUFlag == 1) { double *d_A, *d_B, *d_C; cublasStatus retStatus; #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing dge/matrix crossprod using magmablas_dgemm"); #endif cublasAlloc(m * xd, sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(n * xd, sizeof(double), (void**)&d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(m * n, sizeof(double), (void**)&d_C); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( m * xd , sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasSetVector( xd * n, sizeof(double), B, 1, d_B, 1 ); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasSetVector( m * n, sizeof(double), C, 1, d_C, 1 ); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ // ******** magmablas_dgemm call Here ** //magmablas_dgemm( tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd, one, d_A, xDims[0], d_B, yDims[0], zero, d_C, m); //CHANGE cublasDgemm( tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd, one, d_A, xDims[0], d_B, yDims[0], zero, d_C, m); cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); cublasFree(d_B); cublasFree(d_C); } else { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Performing dge/matrix cross prod with dgemm"); #endif F77_CALL(dgemm)(tr ? "N" : "T", tr ? "T" : "N", &m, &n, &xd, &one, A , xDims, B , yDims, &zero, C, &m); } } UNPROTECT(nprot); return val; #endif return R_NilValue; }
SEXP magma_dpoMatrix_dgeMatrix_solve(SEXP a, SEXP b) { #ifdef HIPLAR_WITH_MAGMA SEXP Chol = magma_dpoMatrix_chol(a), val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix"))); int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)), *bdims = INTEGER(GET_SLOT(b, Matrix_DimSym)), info; /* Checking Matrix Dimensions */ if (adims[1] != bdims[0]) error(_("Dimensions of system to be solved are inconsistent")); if (adims[0] < 1 || bdims[1] < 1) error(_("Cannot solve() for matrices with zero extents")); /* ****************************************** */ SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0)); slot_dup(val, b, Matrix_DimSym); slot_dup(val, b, Matrix_xSym); double *A = REAL(GET_SLOT(Chol, Matrix_xSym)); double *B = REAL(GET_SLOT(val, Matrix_xSym)); if(GPUFlag == 1) { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs_gpu;"); #endif double *d_A, *d_B; const char *uplo = uplo_P(Chol); magma_int_t NRHS = bdims[1]; magma_int_t lda = adims[1]; magma_int_t ldb = bdims[0]; magma_int_t N = adims[0]; cublasStatus retStatus; /*if(uplo == "U") uplo = MagmaUpperStr; else if(uplo == "L") uplo = MagmaLowerStr; else uplo = MagmaUpperStr; */ cublasAlloc(N * lda, sizeof(double), (void**)&d_A); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasAlloc(N * NRHS, sizeof(double), (void**)&d_B); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Memory Allocation")); /********************************************/ cublasSetVector( N * lda , sizeof(double), A, 1, d_A, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ cublasSetVector( ldb * NRHS, sizeof(double), B, 1, d_B, 1 ); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer to Device")); /********************************************/ magma_dpotrs_gpu(uplo[0], N, NRHS , d_A, lda, d_B, ldb, &info); cublasGetVector( ldb * NRHS, sizeof(double), d_B, 1, B, 1); /* Error Checking */ retStatus = cublasGetError (); if (retStatus != CUBLAS_STATUS_SUCCESS) error(_("CUBLAS: Error in Data Transfer from Device")); /********************************************/ cublasFree(d_A); cublasFree(d_B); } else { #ifdef HIPLAR_DBG R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs;"); #endif F77_CALL(dpotrs)(uplo_P(Chol), adims, bdims + 1, A , adims, B , bdims, &info); } if (info) { if(info > 0) error(_("the leading minor of order %d is not positive definite"), info); else /* should never happen! */ error(_("Lapack routine %s returned error code %d"), "dpotrf", info); } UNPROTECT(1); return val; #endif return R_NilValue; }
double Cholesky(Quark *quark, double *A, int N, int NB, int LDA, size_t memsize) { #define A(ib,jb) A[(size_t)(jb)*NB*LDA+(ib)*NB] #ifndef USE_MIC cublasStatus cu_status; #endif int bb = (N + NB - 1) / NB; int YM, YN; int Ym, Yn; int JB; int jb, jjb; int memBlock = memsize/sizeof(double)/NB/NB; double *X, *Y; #ifdef USE_MIC Y = (double*) offload_Alloc((size_t)memBlock*NB*NB*sizeof(double), 0); assert(Y != NULL); #else #ifdef USE_CUBLASV2 { cudaError_t ierr; ierr = cudaMalloc((void **) &Y, (size_t) memBlock*NB*NB*sizeof(double)); assert(ierr == cudaSuccess); } #else cu_status = cublasAlloc((size_t) memBlock*NB*NB, sizeof(double), (void **) &Y); CHKERR(cu_status); #endif #endif double t1; double llttime = MPI_Wtime(); /*--------------------------------------*/ /* The main Ypanel loop */ // QUARK_Barrier(quark); for (JB = 0, jb = 0; JB < N; JB+=YN, jb+=Yn) { //determine size of Ypanel Ym = bb - jb; Yn = find_Yn(bb, memBlock, jb); YM = N - JB; YN = MIN((jb+Yn)*NB, N) - jb*NB; X = Y + (size_t)(memBlock-Ym)*NB*NB; printf("bb %d jb %d YM %d YN %d Ym %d Yn %d Y %p X %p\n", bb, jb, YM, YN, Ym, Yn, Y, X); /* Copy in data */ A2Y(quark, &A(jb,jb), Y, LDA, NB, YM, YN); /* Left-looking */ for(jjb = 0; jjb < jb; jjb++){ /* copy from A to X */ A2X(quark, &A(jb,jjb), LDA, X, NB, YM); ooc_syrk(quark, X, Y, YM, YN, NB); } /* incore factorization */ ooc_incore(quark, &A(jb,jb), Y, LDA, NB, YM, YN); /* Copy out data */ // Y2A(quark, Y, &A(jb,jb), LDA, NB, YM, YN); // QUARK_Barrier(quark); // reduce parallelism // goto oasdfh; // early stop } oasdfh: QUARK_Barrier(quark); llttime = MPI_Wtime() - llttime; printf("llt time %lf %lf\n", llttime, MPI_Wtime()); printf("%lf %lf\n", A[(N-1)*LDA+N-1], MPI_Wtime()); /*--------------------------------------*/ #ifdef USE_MIC offload_Free(Y,0); #else #ifdef USE_CUBLASV2 { cudaError_t ierr; ierr = cudaFree((void *) Y); assert(ierr == cudaSuccess); Y = 0; } #else cu_status = cublasFree(Y); CHKERR(cu_status); #endif #endif return llttime; #undef A }
int main(void) { cublasStatus status; float* h_image; float* h_covariance; float* d_image; float* d_covariance; float alpha = 1.0f; float beta = 0.0f; int imgsize = N * L; //int i; FILE *fp1, *fp2; /* Initialize CUBLAS */ status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate host memory for the image */ h_image = (float*)malloc(imgsize * sizeof(float)); if (h_image == 0) { fprintf (stderr, "!!!! host memory allocation error (image)\n"); return EXIT_FAILURE; } h_covariance = (float*)calloc(L * L, sizeof(float)); if (h_covariance == 0) { fprintf (stderr, "!!!! host memory allocation error (covariance)\n"); return EXIT_FAILURE; } /* Fill the image with test data for (i = 0; i < imgsize; i++) { h_image[i] = rand() / (float)RAND_MAX; }*/ fp1 = fopen("image.dat","rb"); fread(h_image, sizeof(float), imgsize, fp1); printf("Valor de image[0]: %f\n", h_image[8]); /* Allocate device memory */ status = cublasAlloc(imgsize, sizeof(float), (void**)&d_image); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (image)\n"); return EXIT_FAILURE; } status = cublasAlloc(L * L, sizeof(float), (void**)&d_covariance); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (covariance)\n"); return EXIT_FAILURE; } /* Copy image to device memory */ status = cublasSetVector(imgsize, sizeof(float), h_image, 1, d_image, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(L * L, sizeof(float), h_covariance, 1, d_covariance, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write covariance)\n"); return EXIT_FAILURE; } /* Clear last error */ cublasGetError(); /* Calculate covariance matrix using cublas */ cublasSgemm('n', 't', L, L, N, alpha, d_image, L, d_image, L, beta, d_covariance, L); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(L * L, sizeof(float), d_covariance, 1, h_covariance, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (read covariance)\n"); return EXIT_FAILURE; } fp2 = fopen("covariance.dat","wb"); fwrite(h_covariance, sizeof(float), L*L, fp2); printf("Valor de covariance[8]: %f\n", h_covariance[8]); /* Memory clean up */ free(h_image); free(h_covariance); status = cublasFree(d_image); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (image)\n"); return EXIT_FAILURE; } status = cublasFree(d_covariance); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (covariance)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasShutdown(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } fclose(fp1); fclose(fp2); printf("\nPress ENTER to exit...\n"); getchar(); return EXIT_SUCCESS; }