static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; gpudata *td; size_t t; ASSERT_BUF(X); ASSERT_BUF(Y); ASSERT_BUF(A); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return GA_XLARGE_ERROR; if (order == cb_c) { t = M; M = N; N = t; t = offX; offX = offY; offY = t; t = incX; incX = incY; incY = t; td = X; X = Y; Y = td; } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_ALL)); h->err = cublasDger(h->h, M, N, &alpha, ((double *)X->ptr) + offX, incX, ((double *)Y->ptr) + offY, incY, ((double *)A->ptr) + offA, lda); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; }
static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; gpudata *td; size_t t; cublasStatus_t err; ASSERT_BUF(X); ASSERT_BUF(Y); ASSERT_BUF(A); if (order == cb_c) { t = M; M = N; N = t; t = offX; offX = offY; offY = t; t = incX; incX = incY; incY = t; td = X; X = Y; Y = td; } cuda_enter(ctx); cuda_wait(X, CUDA_WAIT_READ); cuda_wait(Y, CUDA_WAIT_READ); cuda_wait(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); err = cublasDger(((blas_handle *)ctx->blas_handle)->h, M, N, &alpha, ((double *)X->ptr) + offX, incX, ((double *)Y->ptr) + offY, incY, ((double *)A->ptr) + offA, lda); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } cuda_record(X, CUDA_WAIT_READ); cuda_record(Y, CUDA_WAIT_READ); cuda_record(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; }
void magma_dger( magma_int_t m, magma_int_t n, double alpha, const double *dx, magma_int_t incx, const double *dy, magma_int_t incy, double *dA, magma_int_t ldda ) { cublasDger( m, n, alpha, dx, incx, dy, incy, dA, ldda ); }
void d_ger(SEXP ralpha, SEXP rx, SEXP rincx, SEXP ry, SEXP rincy, SEXP ra, SEXP rlda) { double alpha = asReal(ralpha), * a, * x, * y; int rowsa, colsa, lda = asInteger(rlda), nx, ny, incx = asInteger(rincx), incy = asInteger(rincy); unpackVector(rx, &nx, &x); unpackVector(ry, &ny, &y); unpackMatrix(ra, &rowsa, &colsa, &a); cublasDger(rowsa, colsa, alpha, x, incx, y, incy, a, lda); checkCublasError("d_ger"); }
int CORE_dtstrf_cublas(int M, int N, int IB, int NB, double *U, int LDU, double *A, int LDA, double *L, int LDL, int *IPIV, double *WORK, int LDWORK, int *INFO) { static double zzero = 0.0; static double mzone =-1.0; cublasStatus_t status; cudaError_t err; double alpha; int i, j, ii, sb; int im, ip; #if CONFIG_VERBOSE fprintf(stdout, "%s: M=%d N=%d IB=%d NB=%d U=%p LDU=%d A=%p LDA=%d L=%p LDL=%d IPIV=%p WORK=%p LDWORK=%d\n", __FUNCTION__, M, N, IB, NB, U, LDU, A, LDA, L, LDL, IPIV, WORK, LDWORK); fflush(stdout); #endif /* Check input arguments */ *INFO = 0; if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDU < max(1,NB)) && (NB > 0)) { coreblas_error(6, "Illegal value of LDU"); return -6; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(8, "Illegal value of LDA"); return -8; } if ((LDL < max(1,IB)) && (IB > 0)) { coreblas_error(10, "Illegal value of LDL"); return -10; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; /* Set L to 0 */ err = cudaMemset(L, 0, LDL*N*sizeof(double)); PLASMA_CUDA_ASSERT(err); double* dev_ptr = 0; err = cudaMalloc((void**)&dev_ptr, 2*sizeof(double)); PLASMA_CUDA_ASSERT(err); double* host_ptr; err = cudaMallocHost((void**)&host_ptr, 2*sizeof(double)); PLASMA_CUDA_ASSERT(err); int* piv = kaapi_memory_get_host_pointer_and_validate(IPIV); ip = 0; for (ii = 0; ii < N; ii += IB) { sb = min(N-ii, IB); for (i = 0; i < sb; i++) { status = cublasIdamax(kaapi_cuda_cublas_handle(), M, &A[LDA*(ii+i)], 1, &im ); PLASMA_CUBLAS_ASSERT(status); /* get im */ err = cudaStreamSynchronize(kaapi_cuda_kernel_stream()); PLASMA_CUDA_ASSERT(err); /* ajust index, CUBLAS is 1-based indexing */ im--; piv[ip] = ii+i+1; core_dtstrf_cmp(kaapi_cuda_kernel_stream(), &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], dev_ptr, host_ptr); err = cudaStreamSynchronize(kaapi_cuda_kernel_stream()); PLASMA_CUDA_ASSERT(err); if (host_ptr[0] == 1.0f) { /* * Swap behind. */ status = cublasDswap(kaapi_cuda_cublas_handle(), i, &L[LDL*ii+i], LDL, &WORK[im], LDWORK ); PLASMA_CUBLAS_ASSERT(status); /* * Swap ahead. */ status = cublasDswap(kaapi_cuda_cublas_handle(), sb-i, &U[LDU*(ii+i)+ii+i], LDU, &A[LDA*(ii+i)+im], LDA ); PLASMA_CUBLAS_ASSERT(status); /* * Set IPIV. */ piv[ip] = NB + im + 1; core_dtstrf_set_zero(kaapi_cuda_kernel_stream(), A, LDA, i, ii, im, zzero ); } core_dtstrf_cmp_zzero_and_get_alpha(kaapi_cuda_kernel_stream(), &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], zzero, dev_ptr, host_ptr); err = cudaStreamSynchronize(kaapi_cuda_kernel_stream()); PLASMA_CUDA_ASSERT(err); if ((*INFO == 0) && (host_ptr[0] == 1.0f)) { *INFO = ii+i+1; } // alpha = ((double)1. / U[LDU*(ii+i)+ii+i]); alpha = host_ptr[1]; status = cublasDscal(kaapi_cuda_cublas_handle(), M, &alpha, &A[LDA*(ii+i)], 1 ); PLASMA_CUBLAS_ASSERT(status); status = cublasDcopy(kaapi_cuda_cublas_handle(), M, &A[LDA*(ii+i)], 1, &WORK[LDWORK*i], 1 ); PLASMA_CUBLAS_ASSERT(status); status = cublasDger(kaapi_cuda_cublas_handle(), M, sb-i-1, &mzone, &A[LDA*(ii+i)], 1, &U[LDU*(ii+i+1)+ii+i], LDU, &A[LDA*(ii+i+1)], LDA ); PLASMA_CUBLAS_ASSERT(status); ip = ip+1; } /* * Apply the subpanel to the rest of the panel. */ if(ii+i < N) { for(j = ii; j < ii+sb; j++) { if (piv[j] <= NB) { piv[j] = piv[j] - ii; } } CORE_dssssm_cublas_v2( NB, N-(ii+sb), M, N-(ii+sb), sb, sb, &U[LDU*(ii+sb)+ii], LDU, &A[LDA*(ii+sb)], LDA, &L[LDL*ii], LDL, WORK, LDWORK, &piv[ii] ); err = cudaStreamSynchronize(kaapi_cuda_kernel_stream()); PLASMA_CUDA_ASSERT(err); for(j = ii; j < ii+sb; j++) { if (piv[j] <= NB) { piv[j] = piv[j] + ii; } } } } cudaFreeHost(host_ptr); cudaFree(dev_ptr); return PLASMA_SUCCESS; }