/* C <- alpha*A*B + beta*C */
void gemm(double alpha, Mat mA, Mat mB, double beta, Mat mC) {
    const int n = MatN(mA);
    const void* const a = MatElems(mA);
    const void* const b = MatElems(mB);
    void* const c = MatElems(mC);
    const bool dev = MatDev(mA);
    switch (MatElemSize(mA)) {
    case 4:
        if (dev) {
            float alpha32 = alpha, beta32 = beta;
            cublasSgemm(g_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                        &alpha32, a, n, b, n, &beta32, c, n);
        } else {
            cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
                        alpha, a, n, b, n, beta, c, n);
        }
        break;
    case 8:
        if (dev) {
            cublasDgemm(g_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                        &alpha, a, n, b, n, &beta, c, n);
        } else {
            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
                        alpha, a, n, b, n, beta, c, n);
        }
        break;
    }
}
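Both paths of the dispatcher above assume column-major storage with leading dimension n. A minimal sanity sketch of that convention, using plain CBLAS only (no Mat wrapper; the 2x2 values are illustrative):

#include <cblas.h>
#include <stdio.h>

int main(void) {
    /* Column-major 2x2 matrices: A = [1 3; 2 4], B = identity. */
    double A[] = {1.0, 2.0, 3.0, 4.0};
    double B[] = {1.0, 0.0, 0.0, 1.0};
    double C[] = {0.0, 0.0, 0.0, 0.0};
    /* C = 1.0 * A * B + 0.0 * C */
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 2, 2,
                1.0, A, 2, B, 2, 0.0, C, 2);
    printf("%g %g %g %g\n", C[0], C[1], C[2], C[3]); /* prints 1 2 3 4 */
    return 0;
}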
CAMLprim value spoc_cublasDgemm(value transa, value transb, value m, value n,
                                value k, value alpha, value a, value lda,
                                value b, value ldb, value beta, value c,
                                value ldc, value dev)
{
    CAMLparam5(transa, transb, m, n, k);
    CAMLxparam5(alpha, a, lda, b, ldb);
    CAMLxparam4(beta, c, ldc, dev);
    CAMLlocal3(dev_vec_array, dev_vec, gi);
    CUdeviceptr d_A;
    CUdeviceptr d_B;
    CUdeviceptr d_C;
    int id;

    gi = Field(dev, 0);
    id = Int_val(Field(gi, 7));

    GET_VEC(a, d_A);
    GET_VEC(b, d_B);
    GET_VEC(c, d_C);

    CUBLAS_GET_CONTEXT;
    cublasDgemm(Int_val(transa), Int_val(transb),
                Int_val(m), Int_val(n), Int_val(k),
                (double) Double_val(alpha), (double*) d_A, Int_val(lda),
                (double*) d_B, Int_val(ldb),
                (double) Double_val(beta), (double*) d_C, Int_val(ldc));
    CUDA_RESTORE_CONTEXT;

    CAMLreturn(Val_unit);
}
void gpu_cublas1(double *A, double *B, double *C, double *D,
                 double *r, double *nrmC, int N, int N2)
{
    #pragma acc data present(A, B, C, D)
    {
        #pragma acc host_data use_device(A, B, C, D)
        {
            cublasHandle_t handle;
            cublasCreate(&handle);
            const double alpha = 1.0;
            const double beta = 0.0;
            cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N,
                        &alpha, A, N, B, N, &beta, C, N);
            printf(" gpu gemm success \n");
            cublasDdot(handle, N2, C, 1, B, 1, r);
            printf(" gpu dot success \n");
            *r = -1.0 * *r;
            cublasDaxpy(handle, N2, r, B, 1, C, 1);
            printf(" gpu axpy success \n");
            cublasDnrm2(handle, N2, C, 1, nrmC);
            printf(" gpu nrm2 success \n");
            cublasDcopy(handle, N2, C, 1, D, 1);
            printf(" gpu copy success \n");
            *nrmC = 1.0 / *nrmC;
            cublasDscal(handle, N2, nrmC, D, 1);
            printf(" gpu scal success \n");
            cublasDestroy(handle);
            printf(" gpu destroy success \n");
        }
    }
}
static vl::Error
gemm(vl::Context& context,
     char op1, char op2,
     ptrdiff_t m, ptrdiff_t n, ptrdiff_t k,
     type alpha,
     type const * a, ptrdiff_t lda,
     type const * b, ptrdiff_t ldb,
     type beta,
     type * c, ptrdiff_t ldc)
{
    cublasHandle_t handle ;
    cublasStatus_t status ;
    status = context.getCudaHelper().getCublasHandle(&handle) ;
    if (status != CUBLAS_STATUS_SUCCESS) goto done ;
    status = cublasDgemm(handle,
                         (op1 == 't') ? CUBLAS_OP_T : CUBLAS_OP_N,
                         (op2 == 't') ? CUBLAS_OP_T : CUBLAS_OP_N,
                         (int)m, (int)n, (int)k,
                         &alpha, a, (int)lda, b, (int)ldb,
                         &beta, c, (int)ldc);
done:
    return context.setError(
        context.getCudaHelper().catchCublasError(status, "cublasDgemm"),
        __func__) ;
}
void
cube_blas_d_gemm (cube_t *ctx,
                  cube_blas_op_t transa, cube_blas_op_t transb,
                  int m, int n, int k,
                  const double *alpha,
                  const double *A, int lda,
                  const double *B, int ldb,
                  const double *beta,
                  double *C, int ldc)
{
    cublasStatus_t status;
    cublasOperation_t ta, tb;

    if (! cube_context_check (ctx))
        return;

    ta = (cublasOperation_t) transa;
    tb = (cublasOperation_t) transb;

    status = cublasDgemm (ctx->h_blas, ta, tb, m, n, k,
                          alpha, A, lda, B, ldb, beta, C, ldc);
    cube_blas_check (ctx, status);
}
void d_gemm(SEXP rtransa, SEXP rtransb, SEXP ralpha, SEXP ra, SEXP rlda,
            SEXP rb, SEXP rldb, SEXP rbeta, SEXP rc, SEXP rldc)
{
    char transa = getTranspose(rtransa), transb = getTranspose(rtransb);
    double alpha = asReal(ralpha), beta = asReal(rbeta), *a, *b, *c;
    int m, n, k,
        rowsa, colsa, lda = asInteger(rlda),
        rowsb, colsb, ldb = asInteger(rldb),
        rowsc, colsc, ldc = asInteger(rldc);

    unpackMatrix(ra, &rowsa, &colsa, &a);
    unpackMatrix(rb, &rowsb, &colsb, &b);
    unpackMatrix(rc, &rowsc, &colsc, &c);

    m = rowsa; n = colsb; k = colsa;
    if (isTranspose(transa)) { m = colsa; k = rowsa; }
    if (isTranspose(transb)) n = rowsb;

    cublasDgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
    checkCublasError("d_gemm");
}
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb,
                 double beta, gpudata *C, size_t offC, size_t ldc) {
  cuda_context *ctx = A->ctx;
  gpudata *T;
  size_t t;
  cublasStatus_t err;
  cb_transpose transT;

  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);

  if (order == cb_c) {
    /* swap A and B */
    t = N; N = M; M = t;
    T = A; A = B; B = T;
    t = lda; lda = ldb; ldb = t;
    transT = transA; transA = transB; transB = transT;
    t = offA; offA = offB; offB = t;
  }

  cuda_enter(ctx);

  cuda_wait(A, CUDA_WAIT_READ);
  cuda_wait(B, CUDA_WAIT_READ);
  cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h,
                    convT(transA), convT(transB), M, N, K, &alpha,
                    ((double *)A->ptr) + offA, lda,
                    ((double *)B->ptr) + offB, ldb, &beta,
                    ((double *)C->ptr) + offC, ldc);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  cuda_record(A, CUDA_WAIT_READ);
  cuda_record(B, CUDA_WAIT_READ);
  cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
inline void gemm( const Order order, const TransA transa, const TransB transb,
                  const int m, const int n, const int k, const double alpha,
                  const double* a, const int lda, const double* b, const int ldb,
                  const double beta, double* c, const int ldc )
{
    BOOST_STATIC_ASSERT( (is_same<Order, tag::column_major>::value) );
    cublasDgemm( blas_option< TransA >::value, blas_option< TransB >::value,
                 m, n, k, alpha, a, lda, b, ldb, beta, c, ldc );
}
void gemm(bool transa, bool transb, int m, int n, int k, double alpha,
          thrust::device_ptr<const double> A, int lda,
          thrust::device_ptr<const double> B, int ldb, double beta,
          thrust::device_ptr<double> C, int ldc) {
    const cublasOperation_t ctransa = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
    const cublasOperation_t ctransb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
    cublasSetStream(context::get().cublasHandle, context::get().stream);
    cublasDgemm(context::get().cublasHandle, ctransa, ctransb, m, n, k,
                &alpha, A.get(), lda, B.get(), ldb, &beta, C.get(), ldc);
}
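A hypothetical call site for the wrapper above, assuming the project's context singleton (context::get()) is already initialized; thrust::device_vector supplies the device_ptr arguments:

#include <thrust/device_vector.h>

void example(int m, int n, int k) {
    // Column-major device buffers; contents assumed initialized elsewhere.
    thrust::device_vector<double> A(m * k), B(k * n), C(m * n);
    // C = 1.0 * A * B + 0.0 * C, no transposes.
    gemm(false, false, m, n, k, 1.0,
         A.data(), m, B.data(), k, 0.0, C.data(), m);
}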
Darray<double> cudot (const Darray<double>& lhs, const Darray<double>& rhs) {
    // context check
    CHECK_EQ(lhs.getDeviceManager().getDeviceID(), rhs.getDeviceManager().getDeviceID());
    CHECK_EQ(lhs.ndim(), rhs.ndim());
    CHECK_LT(lhs.ndim(), 3);
    CHECK_LT(rhs.ndim(), 3);
    Darray<double> ret;
    // 1D vector dot
    if (lhs.ndim()==1 && rhs.ndim()==1) {
        // shape check
        CHECK_EQ(lhs.size(), rhs.size());
        ret = Darray<double>(lhs.getDeviceManager(), {1});
        // using cublas ddot
        lhs.deviceSet();
        cublasDdot (DeviceManager::handle, lhs.size(),
                    lhs.data, 1, rhs.data, 1, ret.data);
    }
    // 2D matrix dot
    else if (lhs.ndim()==2 && rhs.ndim()==2) {
        // shape check
        CHECK_EQ(lhs.shape()[1], rhs.shape()[0]);
        ret = Darray<double>(lhs.getDeviceManager(), {lhs.shape()[0], rhs.shape()[1]});
        // using cublas dgemm
        lhs.deviceSet();
        const double alpha = 1.;
        const double beta = 0.;
        CUBLAS_SAFE_CALL(
            cublasDgemm (DeviceManager::handle, CUBLAS_OP_N, CUBLAS_OP_N,
                         lhs.shape()[0], rhs.shape()[1], lhs.shape()[1],
                         &alpha, lhs.dev_data, lhs.shape()[0],
                         rhs.dev_data, rhs.shape()[0],
                         &beta, ret.dev_data, ret.shape()[0]) );
    }
    return ret;
}
void contractTensor(cublasHandle_t &handle, sTensorGPU &tensorIn1,
                    sTensorGPU &tensorIn2, sTensorGPU &tensorOut,
                    int numDim = 1)
{
    tensorOut.dim = tensorIn1.dim + tensorIn2.dim - numDim * 2;
    int outW = 1;
    int outH = 1;
    // The first numDim dimensions of both tensors are contracted away;
    // their total sizes must match.
    int contract = tensorIn1.size[0];
    int test = tensorIn2.size[0];
    for (int i = 1; i < numDim; i++) {
        contract *= tensorIn1.size[i];
        test *= tensorIn2.size[i];
    }
    if (test != contract) {
        printf("Unequal size %d != %d\n", contract, test);
    }
    {
        int i = 0;
        for (int j = numDim; j < tensorIn1.dim; j++) {
            tensorOut.size[i++] = tensorIn1.size[j];
            outW *= tensorIn1.size[j];
        }
        for (int j = numDim; j < tensorIn2.dim; j++) {
            tensorOut.size[i++] = tensorIn2.size[j];
            outH *= tensorIn2.size[j];
        }
    }
    tensorOut.dataSize = outW * outH;
    type alpha = 1.0;
    type beta = 0.0;
    cublasStatus_t ret;
    double flops = outW;
    flops *= outH;
    flops *= contract;
    flops *= 2;
    bops += flops;
    // Both inputs store the contracted dimensions first, so A is used
    // transposed:           m     n     k
    ret = cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, outW, outH, contract,
                      &alpha, tensorIn1.deviceData, contract,
                      tensorIn2.deviceData, contract,
                      &beta, tensorOut.deviceData, outW);
    if (ret != CUBLAS_STATUS_SUCCESS) {
        printf("cublasDgemm returned error code %d\n", ret);
    }
    cudaDeviceSynchronize();
}
template <>
void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
                            const CBLAS_TRANSPOSE TransB, const int M,
                            const int N, const int K, const double alpha,
                            const double* A, const double* B,
                            const double beta, double* C) {
  // Note that cublas follows fortran order.
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
                           N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
}
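The operand swap in the call above is the standard row-major adapter: cuBLAS computes in column-major (Fortran) order, and a row-major M x N array reinterpreted as column-major is the transpose of the logical matrix. Asking cuBLAS for B*A with the M and N dimensions exchanged therefore yields

    C^T = B^T * A^T = (A * B)^T,

which, read back as row-major, is exactly C = A*B. The same identity underlies the swapped calls in the libgpuarray and Tiramisu wrappers in this section.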
// Note : cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n,n,n, &alpha, A, n, B, n, &beta, C, n)
// means matrix C = B * A
void cublas_gemm(int n, double *c, double *b, double *a)
{
    #pragma acc data present(a, b, c)
    {
        #pragma acc host_data use_device(a, b, c)
        {
            cublasHandle_t handle;
            cublasCreate(&handle);
            const double alpha = 1.0;
            const double beta = 0.0;
            cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                        &alpha, a, n, b, n, &beta, c, n);
            cublasDestroy(handle);
        }
    }
}
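A hypothetical OpenACC driver for the routine above; the enclosing data region establishes the device copies that present(a, b, c) expects, and per the note the result is c = b * a for row-major host arrays:

void example(int n, double *a, double *b, double *c)
{
    /* a, b, c are n*n host arrays. */
    #pragma acc data copyin(a[0:n*n], b[0:n*n]) copyout(c[0:n*n])
    {
        cublas_gemm(n, c, b, a);
    }
}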
void cublas_gemm(const double *A, const double *B, double *C, int N)
{
    #pragma acc data present(A, B, C)
    {
        #pragma acc host_data use_device(A, B, C)
        {
            cublasHandle_t h;
            cublasCreate(&h);
            const double alpha = 1.0;
            const double beta = 0.0;
            cublasDgemm(h, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N,
                        &alpha, A, N, B, N, &beta, C, N);
            cublasDestroy(h);
        }
    }
}
void magma_dgemm(
    magma_trans_t transA, magma_trans_t transB,
    magma_int_t m, magma_int_t n, magma_int_t k,
    double alpha,
    double const* dA, magma_int_t lda,
    double const* dB, magma_int_t ldb,
    double beta,
    double* dC, magma_int_t ldc )
{
    cublasDgemm(
        cublas_trans_const( transA ),
        cublas_trans_const( transB ),
        m, n, k,
        alpha, dA, lda,
               dB, ldb,
        beta,  dC, ldc );
}
static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB,
                 double alpha,
                 View<const double**,LayoutLeft,Cuda> A,
                 View<const double**,LayoutLeft,Cuda> B,
                 double beta,
                 View<double**,LayoutLeft,Cuda> C)
{
    const int m = static_cast<int>(C.dimension_0()),
              n = static_cast<int>(C.dimension_1()),
              k = (transA == Teuchos::NO_TRANS ? A.dimension_1() : A.dimension_0()),
              lda = static_cast<int>(Impl::getStride2DView(A)),
              ldb = static_cast<int>(Impl::getStride2DView(B)),
              ldc = static_cast<int>(Impl::getStride2DView(C));
    const char char_transA = (transA == Teuchos::NO_TRANS ? 'N' : 'T'),
               char_transB = (transB == Teuchos::NO_TRANS ? 'N' : 'T');
    cublasDgemm(char_transA, char_transB, m, n, k, alpha,
                A.ptr_on_device(), lda, B.ptr_on_device(), ldb,
                beta, C.ptr_on_device(), ldc);
#ifdef HAVE_KOKKOS_DEBUG
    cublasStatus info = cublasGetError();
    TEUCHOS_TEST_FOR_EXCEPTION( info != CUBLAS_STATUS_SUCCESS, std::runtime_error,
        "cublasDgemm failed with status " << info << "." );
#endif
}
// Multiply the arrays A and B on GPU and save the result in C
// C(m,n) = A(m,k) * B(k,n)
void gpu_blas_mmul(const double *A, const double *B, double *C,
                   const int m, const int k, const int n) {
    int lda = m, ldb = k, ldc = m;
    const double alf = 1;
    const double bet = 0;
    const double *alpha = &alf;
    const double *beta = &bet;

    // Create a handle for CUBLAS
    cublasHandle_t handle;
    cublasCreate(&handle);

    // Do the actual multiplication
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                alpha, A, lda, B, ldb, beta, C, ldc);

    // Destroy the handle
    cublasDestroy(handle);
}
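A minimal usage sketch for gpu_blas_mmul above; the routine operates on device pointers, so host data must be staged with cudaMemcpy first (error checking omitted for brevity):

#include <cuda_runtime.h>

void example(const double *hA, const double *hB, double *hC,
             int m, int k, int n) {
    double *dA, *dB, *dC;
    cudaMalloc((void**)&dA, sizeof(double) * m * k);
    cudaMalloc((void**)&dB, sizeof(double) * k * n);
    cudaMalloc((void**)&dC, sizeof(double) * m * n);
    cudaMemcpy(dA, hA, sizeof(double) * m * k, cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, sizeof(double) * k * n, cudaMemcpyHostToDevice);
    gpu_blas_mmul(dA, dB, dC, m, k, n);
    cudaMemcpy(hC, dC, sizeof(double) * m * n, cudaMemcpyDeviceToHost);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
}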
double cublas_gemm_norm(const double *A, const double *B, double *C, int N)
{
    double *norm = (double *) malloc(sizeof(double));
    #pragma acc data present(A, B, C) copyout(norm[0])
    {
        #pragma acc host_data use_device(A, B, C)
        {
            cublasHandle_t h;
            cublasCreate(&h);
            const double alpha = 1.0;
            const double beta = 0.0;
            cublasDgemm(h, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N,
                        &alpha, A, N, B, N, &beta, C, N);
            cublasDnrm2(h, N*N, C, 1, norm);
            cublasDestroy(h);
        }
    }
    double result = *norm;
    free(norm);  /* release the temporary */
    return result;
}
int tiramisu_cublas_dgemm(double *A, double *B, double *C,
                          uint64_t M, uint64_t N, uint64_t K,
                          double alpha, double beta,
                          uint64_t ldA, uint64_t ldB, uint64_t ldC,
                          uint64_t offsetA, uint64_t offsetB, uint64_t offsetC,
                          bool transposeA, bool transposeB)
{
    // TODO: Destroy the handle.
    static bool handle_created = false;
    static cublasHandle_t handle;
    if (!handle_created) {
        cublasCreate(&handle);
        handle_created = true;
    }
    // Default values for tight packing:
    if (ldA == 0) {
        ldA = transposeA ? M : K;
    }
    if (ldB == 0) {
        ldB = transposeB ? K : N;
    }
    if (ldC == 0) {
        ldC = N;
    }
    // The cuBLAS GEMM accepts column major buffers by default. We do a simple
    // trick here to multiply row major matrices. From a row-major perspective,
    // column-major multiplication basically transposes inputs, multiplies, and
    // transposes the output again: cublas(A, B) = ((A^T)x(B^T))^T = BxA
    // So it is actually equivalent to row-major GEMM with inputs swapped.
    // We need to reorder the size parameters as well to make it work:
    cublasSetStream(handle, cudaStreamPerThread);
    handle_cublas_error(
        cublasDgemm(handle,
                    transposeB ? CUBLAS_OP_T : CUBLAS_OP_N,
                    transposeA ? CUBLAS_OP_T : CUBLAS_OP_N,
                    N, M, K,
                    &alpha, B + offsetB, ldB, A + offsetA, ldA,
                    &beta, C + offsetC, ldC),
        __FUNCTION__);
    return 0;
}
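A hypothetical call of the wrapper above for tightly packed row-major device buffers dA, dB, dC (the names and sizes are illustrative), relying on the ld* = 0 defaults:

/* C (MxN) = A (MxK) * B (KxN), all row-major, no transposes: */
tiramisu_cublas_dgemm(dA, dB, dC, M, N, K,
                      1.0, 0.0,    /* alpha, beta */
                      0, 0, 0,     /* ldA, ldB, ldC: tight packing */
                      0, 0, 0,     /* offsets */
                      false, false);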
SEXP magMultmm(SEXP a, SEXP transa, SEXP b, SEXP transb)
{
    SEXP gpu = magGetGPU(a, b),
         c = PROTECT(NEW_OBJECT(MAKE_CLASS("magma")));
    int TA = LOGICAL_VALUE(transa), TB = LOGICAL_VALUE(transb),
        *DIMA = INTEGER(GET_DIM(a)), *DIMB = INTEGER(GET_DIM(b)),
        M = DIMA[TA], N = DIMB[!TB], K = DIMA[!TA],
        LDA = DIMA[0], LDB = DIMB[0], LDC = M;
    char TRANSA = (TA ? 'T' : 'N'), TRANSB = (TB ? 'T' : 'N');
    double *A = REAL(PROTECT(AS_NUMERIC(a))), *B = REAL(PROTECT(AS_NUMERIC(b))),
           *dA, *dB, *dC;

    if (DIMB[TB] != K) error("non-conformable matrices");

    c = SET_SLOT(c, install(".Data"), allocMatrix(REALSXP, M, N));
    SET_SLOT(c, install("gpu"), duplicate(gpu));

    magma_malloc((void**)&dA, (M*K)*sizeof(double));
    magma_malloc((void**)&dB, (K*N)*sizeof(double));
    magma_malloc((void**)&dC, (M*N)*sizeof(double));

    magma_dsetmatrix(DIMA[0], DIMA[1], A, LDA, dA, LDA);
    magma_dsetmatrix(DIMB[0], DIMB[1], B, LDB, dB, LDB);

    if (LOGICAL_VALUE(gpu))
        magmablas_dgemm(TRANSA, TRANSB, M, N, K, 1.0, dA, LDA, dB, LDB, 0.0, dC, LDC);
    else
        cublasDgemm(TRANSA, TRANSB, M, N, K, 1.0, dA, LDA, dB, LDB, 0.0, dC, LDC);

    magma_dgetmatrix(M, N, dC, LDC, REAL(c), LDC);

    magma_free(dA);
    magma_free(dB);
    magma_free(dC);

    UNPROTECT(3);
    return c;
}
void mat_prod_mat(const double* a, cublasOperation_t op_a,
                  const double* b, cublasOperation_t op_b,
                  double* c, int m, int n, int k)
{
    cudaError_t cudaStat;    // cudaMalloc status
    cublasStatus_t stat;     // CUBLAS functions status
    cublasHandle_t handle;   // CUBLAS context

    // device copies of a, b, c
    double* d_a;
    double* d_b;
    double* d_c;
    cudaStat = cudaMalloc((void **)&d_a, m*k*sizeof(*a));  // device memory alloc for a
    cudaStat = cudaMalloc((void **)&d_b, k*n*sizeof(*b));  // device memory alloc for b
    cudaStat = cudaMalloc((void **)&d_c, m*n*sizeof(*c));  // device memory alloc for c

    stat = cublasCreate(&handle);  // initialize CUBLAS context

    // copy matrices from the host to the device
    stat = cublasSetMatrix(m, k, sizeof(*a), a, m, d_a, m);  // a -> d_a
    stat = cublasSetMatrix(k, n, sizeof(*b), b, k, d_b, k);  // b -> d_b
    stat = cublasSetMatrix(m, n, sizeof(*c), c, m, d_c, m);  // c -> d_c

    double al = 1.0;
    double bet = 1.0;
    // matrix-matrix multiplication: d_c = al*d_a*d_b + bet*d_c
    // d_a: mxk matrix, d_b: kxn matrix, d_c: mxn matrix; al, bet: scalars
    stat = cublasDgemm(handle, op_a, op_b, m, n, k, &al, d_a, m, d_b, k, &bet, d_c, m);

    stat = cublasGetMatrix(m, n, sizeof(*c), d_c, m, c, m);  // copy d_c -> c

    cudaFree(d_a);           // free device memory
    cudaFree(d_b);           // free device memory
    cudaFree(d_c);           // free device memory
    cublasDestroy(handle);   // destroy CUBLAS context
}
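Note that the wrapper above hard-codes lda = m, ldb = k, ldc = m and stages the host arrays as m x k and k x n, so it appears correct only for CUBLAS_OP_N operands (or square matrices). A hypothetical call with column-major host arrays a, b, c:

/* c (mxn) = a (mxk) * b (kxn) + c, column-major host arrays: */
mat_prod_mat(a, CUBLAS_OP_N, b, CUBLAS_OP_N, c, m, n, k);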
cublasStatus_t cublasXgemm(cublasOperation_t transa, cublasOperation_t transb,
                           int m, int n, int k,
                           const double *alpha,
                           const double *A, int lda,
                           const double *B, int ldb,
                           const double *beta,
                           double *C, int ldc)
{
    return cublasDgemm(g_context->cublasHandle, transa, transb, m, n, k,
                       alpha, A, lda, B, ldb, beta, C, ldc);
}
int main( int argc, char** argv )
{
    magma_init();
    cublasHandle_t handle;
    cudaSetDevice( 0 );
    cublasCreate( &handle );

    double *A, *B, *C;
    double *dA, *dB, *dC;
    double error, work[1];
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    magma_int_t ione = 1;
    magma_int_t ISEED[4] = { 1, 2, 3, 4 };
    magma_int_t n = 10;
    magma_int_t lda = n;
    magma_int_t ldda = ((n+31)/32)*32;
    magma_int_t size = lda*n;
    magma_int_t info;

    magma_dmalloc_cpu( &A, lda*n );
    magma_dmalloc_cpu( &B, lda*n );
    magma_dmalloc_cpu( &C, lda*n );
    magma_dmalloc( &dA, ldda*n );
    magma_dmalloc( &dB, ldda*n );
    magma_dmalloc( &dC, ldda*n );

    // initialize matrices
    lapackf77_dlarnv( &ione, ISEED, &size, A );
    lapackf77_dlarnv( &ione, ISEED, &size, B );
    lapackf77_dlarnv( &ione, ISEED, &size, C );
    // increase diagonal to be SPD
    for( int i=0; i < n; ++i ) {
        C[i+i*lda] = MAGMA_D_ADD( C[i+i*lda], MAGMA_D_MAKE( n*n, 0 ));
    }

    magma_dsetmatrix( n, n, A, lda, dA, ldda );
    magma_dsetmatrix( n, n, B, lda, dB, ldda );
    magma_dsetmatrix( n, n, C, lda, dC, ldda );

    // compute with cublas
    cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                 &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda );

    magma_dpotrf_gpu( MagmaLower, n, dC, ldda, &info );
    if (info != 0)
        printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info ));

    // compute with LAPACK
    blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n,
                   &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda );

    lapackf77_dpotrf( MagmaLowerStr, &n, C, &lda, &info );
    if (info != 0)
        printf("lapackf77_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info ));

    // compute difference
    magma_dgetmatrix( n, n, dC, ldda, A, lda );
    blasf77_daxpy( &size, &c_neg_one, C, &ione, A, &ione );
    error = lapackf77_dlange( "F", &n, &n, A, &lda, work );
    printf( "n %d, error %8.2e\n", n, error );

    magma_free( dA );
    magma_free( dB );
    magma_free( dC );
    magma_free_cpu( A );
    magma_free_cpu( B );
    magma_free_cpu( C );

    cublasDestroy( handle );
    magma_finalize();
    return 0;
}
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      double beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cuda_context *ctx;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  cb_transpose transT;
  cublasStatus_t err;

  if (batchCount == 0) return GA_NO_ERROR;

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  cuda_enter(ctx);

  if (order == cb_c) {
    /* swap A and B */
    t = N; N = M; M = t;
    T = A; A = B; B = T;
    t = lda; lda = ldb; ldb = t;
    transT = transA; transA = transB; transB = transT;
    lt = offA; offA = offB; offB = lt;
  }

  /* use parallel cublasDgemm calls rather than cublasDgemmBatched for
     large products */
  const size_t threshold = 650;
  const int multiple_dispatch = M * N * K > threshold * threshold * threshold;
  if (multiple_dispatch) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);

      err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h,
                        convT(transA), convT(transB),
                        M, N, K, &alpha,
                        (double*)A[i]->ptr + offA[i], lda,
                        (double*)B[i]->ptr + offB[i], ldb,
                        &beta,
                        (double*)C[i]->ptr + offC[i], ldc);
      if (err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (err == CUBLAS_STATUS_ARCH_MISMATCH)
          return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }

      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  } else {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **B_l = (const double **)T_l + batchCount;
    double **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = ((double *)A[i]->ptr) + offA[i];
      B_l[i] = ((double *)B[i]->ptr) + offB[i];
      C_l[i] = ((double *)C[i]->ptr) + offC[i];
    }

    /* stage the three host-side pointer arrays in one device allocation */
    cuMemAlloc(&Ta, sizeof(double *) * batchCount * 3);
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(double *));
    Ca = Ta + (batchCount * sizeof(double *) * 2);
    cuMemcpyHtoD(Ta, T_l, sizeof(double *) * batchCount * 3);

    err = cublasDgemmBatched(((blas_handle *)ctx->blas_handle)->h,
                             convT(transA), convT(transB),
                             M, N, K, &alpha,
                             (const double **)Aa, lda,
                             (const double **)Ba, ldb,
                             &beta,
                             (double **)Ca, ldc, batchCount);

    cuMemFree(Ta);

    if (err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (err == CUBLAS_STATUS_ARCH_MISMATCH)
        return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }

    for (i = 0; i < batchCount; i++) {
      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
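The threshold heuristic above trades off two dispatch strategies: cublasDgemmBatched amortizes kernel-launch overhead when there are many small multiplies, while one cublasDgemm per matrix already saturates the GPU once the individual products are large enough, and it avoids building and copying the device-side pointer arrays.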
int cublas_multiply_matrix_double2(PGM_Matriz_Double *A, PGM_Matriz_Double *B,
                                   PGM_Matriz_Double *result, PGM_Matriz_Double *work)
{
    cublasStatus_t status;
    cublasHandle_t handle;
    int max_dim = work->n_linhas;
    PGM_Matriz_GPU device_A, device_B, device_C;
    double alpha = 1.0, beta = 0.0;

    if (A->n_linhas != result->n_linhas || B->n_colunas != result->n_colunas)
        return 0;

    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
        return -1;

    if (create_PGM_Matriz_GPU_double(&device_A, A->n_linhas, A->n_colunas, max_dim) != cudaSuccess)
        return -2;

    if (create_PGM_Matriz_GPU_double(&device_B, B->n_linhas, B->n_colunas, max_dim) != cudaSuccess) {
        if (cudaFree(device_A.valor) == cudaSuccess)
            return -2;
        else
            return -11;
    }

    if (create_PGM_Matriz_GPU_double(&device_C, result->n_linhas, result->n_colunas, max_dim) != cudaSuccess) {
        if (cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess)
            return -2;
        else
            return -11;
    }

    if (double_copyMatrixHost2GPU(A, work, &device_A) != CUBLAS_STATUS_SUCCESS) {
        if (cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess
            && cudaFree(device_C.valor) == cudaSuccess)
            return -3;
        else
            return -12;
    }

    if (double_copyMatrixHost2GPU(B, work, &device_B) != CUBLAS_STATUS_SUCCESS) {
        if (cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess
            && cudaFree(device_C.valor) == cudaSuccess)
            return -3;
        else
            return -12;
    }

    if (cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, max_dim, max_dim, max_dim,
                    &alpha, (double*) device_A.valor, max_dim,
                    (double*) device_B.valor, max_dim,
                    &beta, (double*) device_C.valor, max_dim) != CUBLAS_STATUS_SUCCESS) {
        if (cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess
            && cudaFree(device_C.valor) == cudaSuccess)
            return -4;
        else
            return -13;
    }

    if (double_copyMatrixGPU2Host_Transpose(result, &device_C, work) != CUBLAS_STATUS_SUCCESS) {
        if (cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess
            && cudaFree(device_C.valor) == cudaSuccess)
            return -5;
        else
            return -14;
    }

    if (cudaFree(device_A.valor) != cudaSuccess) return -15;
    if (cudaFree(device_B.valor) != cudaSuccess) return -15;
    if (cudaFree(device_C.valor) != cudaSuccess) return -15;

    if (cublasDestroy(handle) != CUBLAS_STATUS_SUCCESS) return -7;

    return 1;
}
// Element-access helpers implied by the calls below (as in MAGMA's testing code):
#define dA(i,j)  (dA + (i) + (j)*ld)
#define dB(i,j)  (dB + (i) + (j)*ld)
#define LU(i,j)  (LU + (i) + (j)*ld)

int main( int argc, char** argv )
{
    TESTING_INIT();

    real_Double_t gflops, t1, t2;
    double c_neg_one = MAGMA_D_NEG_ONE;
    magma_int_t ione = 1;
    const char trans[] = { 'N', 'C', 'T' };
    const char uplo[]  = { 'L', 'U' };
    const char diag[]  = { 'U', 'N' };
    const char side[]  = { 'L', 'R' };

    double *A, *B, *C, *C2, *LU;
    double *dA, *dB, *dC1, *dC2;
    double alpha  = MAGMA_D_MAKE( 0.5, 0.1 );
    double beta   = MAGMA_D_MAKE( 0.7, 0.2 );
    double dalpha = 0.6;
    double dbeta  = 0.8;
    double work[1], error, total_error;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t m, n, k, size, maxn, ld, info;
    magma_int_t *piv;
    magma_err_t err;

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" );

    total_error = 0.;
    for( int i = 0; i < opts.ntest; ++i ) {
        m = opts.msize[i];
        n = opts.nsize[i];
        k = opts.ksize[i];
        printf("=========================================================================\n");
        printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k );

        // allocate matrices
        // over-allocate so they can be any combination of {m,n,k} x {m,n,k}.
        maxn = max( max( m, n ), k );
        ld = maxn;
        size = maxn*maxn;
        err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) );  assert( err == 0 );
        err = magma_dmalloc_pinned( &A,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &B,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &C,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &C2, size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &LU, size );  assert( err == 0 );
        err = magma_dmalloc( &dA,  size );  assert( err == 0 );
        err = magma_dmalloc( &dB,  size );  assert( err == 0 );
        err = magma_dmalloc( &dC1, size );  assert( err == 0 );
        err = magma_dmalloc( &dC2, size );  assert( err == 0 );

        // initialize matrices
        size = maxn*maxn;
        lapackf77_dlarnv( &ione, ISEED, &size, A );
        lapackf77_dlarnv( &ione, ISEED, &size, B );
        lapackf77_dlarnv( &ione, ISEED, &size, C );

        printf( "========== Level 1 BLAS ==========\n" );

        // ----- test DSWAP
        // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A
        assert( n >= 4 );
        magma_dsetmatrix( m, n, A, ld, dA, ld );
        magma_dsetmatrix( m, n, A, ld, dB, ld );
        magma_dswap( m, dA(0,1), 1, dA(0,2), 1 );
        magma_dswap( m, dB(0,1), 1, dB(0,2), 1 );

        // check results, storing diff between magma and cuda calls in C2
        cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 );
        magma_dgetmatrix( m, n, dB, ld, C2, ld );
        error = lapackf77_dlange( "F", &m, &k, C2, &ld, work );
        total_error += error;
        printf( "dswap diff %.2g\n", error );

        // ----- test IDAMAX
        // get argmax of column of A
        magma_dsetmatrix( m, k, A, ld, dA, ld );
        error = 0;
        for( int j = 0; j < k; ++j ) {
            magma_int_t i1 = magma_idamax( m, dA(0,j), 1 );
            magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 );
            assert( i1 == i2 );
            error += abs( i1 - i2 );
        }
        total_error += error;
        gflops = (double)m * k / 1e9;
        printf( "idamax diff %.2g\n", error );
        printf( "\n" );

        printf( "========== Level 2 BLAS ==========\n" );

        // ----- test DGEMV
        // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors
        // try no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
            magma_dsetmatrix( m, n, A, ld, dA, ld );
            magma_dsetvector( maxn, B, 1, dB, 1 );
            magma_dsetvector( maxn, C, 1, dC1, 1 );
            magma_dsetvector( maxn, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            size = (trans[ia] == 'N' ? m : n);
            cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( size, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DGEMV( m, n ) / 1e9;
            printf( "dgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n",
                    trans[ia], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );

        // ----- test DSYMV
        // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors
        // try upper/lower
        for( int iu = 0; iu < 2; ++iu ) {
            magma_dsetmatrix( m, m, A, ld, dA, ld );
            magma_dsetvector( m, B, 1, dB, 1 );
            magma_dsetvector( m, C, 1, dC1, 1 );
            magma_dsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYMV( m ) / 1e9;
            printf( "dsymv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );

        // ----- test DTRSV
        // solve A*c = c, with A m*m triangular; c m-vector
        // try upper/lower, no-trans/trans, unit/non-unit diag
        // Factor A into LU to get well-conditioned triangles, else solve yields garbage.
        // Still can give garbage if solves aren't consistent with LU factors,
        // e.g., using unit diag for U, so copy lower triangle to upper triangle.
        // Also used for trsm later.
        lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld );
        lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info );
        for( int j = 0; j < maxn; ++j ) {
            for( int i = 0; i < j; ++i ) {
                *LU(i,j) = *LU(j,i);
            }
        }
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            magma_dsetmatrix( m, m, LU, ld, dA, ld );
            magma_dsetvector( m, C, 1, dC1, 1 );
            magma_dsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9;
            printf( "dtrsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 );
        }}}
        printf( "\n" );

        printf( "========== Level 3 BLAS ==========\n" );

        // ----- test DGEMM
        // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n
        // try combinations of no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
        for( int ib = 0; ib < 3; ++ib ) {
            bool nta = (trans[ia] == 'N');
            bool ntb = (trans[ib] == 'N');
            magma_dsetmatrix( (nta ? m : k), (nta ? k : m), A, ld, dA, ld );
            magma_dsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dgemm( trans[ia], trans[ib], m, n, k,
                         alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDgemm( trans[ia], trans[ib], m, n, k,
                         alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DGEMM( m, n, k ) / 1e9;
            printf( "dgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n",
                    trans[ia], trans[ib], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );

        // ----- test DSYMM
        // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or
        // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n
        // try left/right, upper/lower
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
            magma_dsetmatrix( m, m, A, ld, dA, ld );
            magma_dsetmatrix( m, n, B, ld, dB, ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9;
            printf( "dsymm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n",
                    side[is], uplo[iu], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );

        // ----- test DSYRK
        // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or
        // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            magma_dsetmatrix( n, k, A, ld, dA, ld );
            magma_dsetmatrix( n, n, C, ld, dC1, ld );
            magma_dsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYRK( k, n ) / 1e9;
            printf( "dsyrk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );

        // ----- test DSYR2K
        // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or
        // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            bool nt = (trans[it] == 'N');
            magma_dsetmatrix( (nt ? n : k), (nt ? k : n), A, ld, dA, ld );
            magma_dsetmatrix( n, n, C, ld, dC1, ld );
            magma_dsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYR2K( k, n ) / 1e9;
            printf( "dsyr2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );

        // ----- test DTRMM
        // C = alpha*A*C (left) with A m*m triangular; C m*n; or
        // C = alpha*C*A (right) with A n*n triangular; C m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_dsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9;
            printf( "dtrmm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );

        // ----- test DTRSM
        // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or
        // solve X*A = alpha*B (right) with A n*n triangular; B m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_dsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;

            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9;
            printf( "dtrsm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );

        // cleanup
        magma_free_cpu( piv );
        magma_free_pinned( A  );
        magma_free_pinned( B  );
        magma_free_pinned( C  );
        magma_free_pinned( C2 );
        magma_free_pinned( LU );
        magma_free( dA  );
        magma_free( dB  );
        magma_free( dC1 );
        magma_free( dC2 );
    }

    if ( total_error != 0. ) {
        printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error );
    } else {
        printf( "all tests passed\n" );
    }

    TESTING_FINALIZE();
    return 0;
}
int main( int argc, char **argv )
{
    double *A, *B, *C;
    double *cu_A, *cu_B, *cu_C;
    cudaError_t cuError;
    cublasStatus_t cuStatus;
    cublasHandle_t cuHandle;

    // seed rand()
    srand(time(NULL));

    // allocate memory on CPU
    A = (double*)malloc(sizeof(double)*MATRIX_N*MATRIX_P);
    B = (double*)malloc(sizeof(double)*MATRIX_P*MATRIX_M);
    C = (double*)malloc(sizeof(double)*MATRIX_N*MATRIX_M);
    if( !A || !B || !C ) {
        perror("Can't allocate CPU matrices");
        exit(EXIT_FAILURE);
    }

    // generate matrices
    for( int i = 0; i < MATRIX_N*MATRIX_P; i++ )
        A[i] = 10.0*((double)rand())/RAND_MAX;
    for( int i = 0; i < MATRIX_P*MATRIX_M; i++ )
        B[i] = 10.0*((double)rand())/RAND_MAX;

    // allocate memory on GPU
    cuError = cudaMalloc( &cu_A, sizeof(double)*MATRIX_N*MATRIX_P );
    if( cuError != cudaSuccess ) {
        fprintf(stderr, "Can't allocate GPU matrices\n");
        exit(EXIT_FAILURE);
    }
    cuError = cudaMalloc( &cu_B, sizeof(double)*MATRIX_P*MATRIX_M );
    if( cuError != cudaSuccess ) {
        fprintf(stderr, "Can't allocate GPU matrices\n");
        exit(EXIT_FAILURE);
    }
    cuError = cudaMalloc( &cu_C, sizeof(double)*MATRIX_N*MATRIX_M );
    if( cuError != cudaSuccess ) {
        fprintf(stderr, "Can't allocate GPU matrices\n");
        exit(EXIT_FAILURE);
    }

    // setup cuBlas
    cuStatus = cublasCreate( &cuHandle );
    if( cuStatus != CUBLAS_STATUS_SUCCESS ) {
        fprintf(stderr, "Error initializing cuBlas\n");
        exit(EXIT_FAILURE);
    }

    // setup matrices
    cuStatus = cublasSetMatrix( MATRIX_N, MATRIX_P, sizeof(double), A, MATRIX_N, cu_A, MATRIX_N );
    if( cuStatus != CUBLAS_STATUS_SUCCESS ) {
        fprintf(stderr, "Error transferring matrix A\n");
        exit(EXIT_FAILURE);
    }
    cuStatus = cublasSetMatrix( MATRIX_P, MATRIX_M, sizeof(double), B, MATRIX_P, cu_B, MATRIX_P );
    if( cuStatus != CUBLAS_STATUS_SUCCESS ) {
        fprintf(stderr, "Error transferring matrix B\n");
        exit(EXIT_FAILURE);
    }

    // multiply
    double one = 1.0;
    double zero = 0.0;
    cuStatus = cublasDgemm( cuHandle, CUBLAS_OP_N, CUBLAS_OP_N,
                            MATRIX_N, MATRIX_M, MATRIX_P,
                            &one, cu_A, MATRIX_N, cu_B, MATRIX_P,
                            &zero, cu_C, MATRIX_N );
    if( cuStatus != CUBLAS_STATUS_SUCCESS ) {
        fprintf(stderr, "Error executing matrix mult\n");
        exit(EXIT_FAILURE);
    }

    // get results
    cuStatus = cublasGetMatrix( MATRIX_N, MATRIX_M, sizeof(double), cu_C, MATRIX_N, C, MATRIX_N );
    if( cuStatus != CUBLAS_STATUS_SUCCESS ) {
        fprintf(stderr, "Error transferring results\n");
        exit(EXIT_FAILURE);
    }

    // check results
    bool good = true;
    for( int i = 0; i < MATRIX_N; i++ ) {
        for( int j = 0; j < MATRIX_M; j++ ) {
            double sum = 0.0;
            for( int k = 0; k < MATRIX_P; k++ ) {
                sum += A[IDX2C(i, k, MATRIX_N)]*B[IDX2C(k, j, MATRIX_P)];
            }
            // check
            if( fabs(sum - C[IDX2C(i,j,MATRIX_N)]) > 0.00001 ) {
                good = false;
                printf("(%i, %i) sum = %f\tcu_C = %f\tMISMATCH\n",
                       i, j, sum, C[IDX2C(i,j,MATRIX_N)]);
            }
        }
    }
    if( good ) printf("Results Match\n");
    else       printf("Results DO NOT Match\n");

    // cleanup
    free( A );
    free( B );
    free( C );
    cudaFree( cu_A );
    cudaFree( cu_B );
    cudaFree( cu_C );
    cublasDestroy( cuHandle );
    return 0;
}
SEXP magma_dgeMatrix_matrix_mm(SEXP a, SEXP bP, SEXP right)
{
#ifdef HIPLAR_WITH_MAGMA
    SEXP b = PROTECT(mMatrix_as_dgeMatrix(bP)),
         val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix")));
    int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)),
        *bdims = INTEGER(GET_SLOT(b, Matrix_DimSym)),
        *cdims = INTEGER(ALLOC_SLOT(val, Matrix_DimSym, INTSXP, 2));
    double one = 1.0, zero = 0.0;

    if (asLogical(right)) {
        int m = bdims[0], n = adims[1], k = bdims[1];
        if (adims[0] != k)
            error(_("Matrices are not conformable for multiplication"));
        cdims[0] = m;
        cdims[1] = n;
        if (m < 1 || n < 1 || k < 1) {
            // This was commented out
            error(_("Matrices with zero extents cannot be multiplied"));
            ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n);
        } else {
            double *B = REAL(GET_SLOT(b, Matrix_xSym));
            double *A = REAL(GET_SLOT(a, Matrix_xSym));
            double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n));
            //TODO add magma here too
            if (GPUFlag == 1) {
                double *d_A, *d_B, *d_C;
                cublasStatus retStatus;
#ifdef HIPLAR_DBG
                R_ShowMessage("DBG: Performing matrix multiplication with Right = true using magmablas_dgemm");
#endif
                cublasAlloc(n * k, sizeof(double), (void**)&d_A);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in Memory Allocation"));

                cublasAlloc(m * k, sizeof(double), (void**)&d_B);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in Memory Allocation"));

                cublasAlloc(m * n, sizeof(double), (void**)&d_C);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in Memory Allocation"));

                cublasSetVector(n * k, sizeof(double), A, 1, d_A, 1);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in Data Transfer to Device"));

                cublasSetVector(m * k, sizeof(double), B, 1, d_B, 1);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in Data Transfer to Device"));

                //magmablas_dgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C, m); //CHANGED 30/07
                cublasDgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C, m);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in cublasDgemm routine"));

                cublasGetVector(m * n, sizeof(double), d_C, 1, C, 1);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in Data Transfer from Device"));

                cublasFree(d_A);
                cublasFree(d_B);
                cublasFree(d_C);
            } else {
#ifdef HIPLAR_DBG
                R_ShowMessage("DBG: Performing matrix multiplication using dgemm with right = TRUE");
#endif
                F77_CALL(dgemm)("N", "N", &m, &n, &k, &one, B, &m, A, &k, &zero, C, &m);
            }
        }
    } else {
        int m = adims[0], n = bdims[1], k = adims[1];
        double *A = REAL(GET_SLOT(a, Matrix_xSym));
        double *B = REAL(GET_SLOT(b, Matrix_xSym));
        if (bdims[0] != k)
            error(_("Matrices are not conformable for multiplication"));
        cdims[0] = m;
        cdims[1] = n;
        double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n));
        if (m < 1 || n < 1 || k < 1) {
            // This was commented out
            error(_("Matrices with zero extents cannot be multiplied"));
            ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n);
        } else {
            if (GPUFlag == 1) {
                double *d_A, *d_B, *d_C;
                cublasStatus retStatus;
#ifdef HIPLAR_DBG
                R_ShowMessage("DBG: Performing matrix multiplication using magmablas_dgemm");
#endif
                cublasAlloc(m * k, sizeof(double), (void**)&d_A);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in Memory Allocation"));

                cublasAlloc(n * k, sizeof(double), (void**)&d_B);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in Memory Allocation"));

                cublasAlloc(m * n, sizeof(double), (void**)&d_C);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in Memory Allocation"));

                cublasSetVector(m * k, sizeof(double), A, 1, d_A, 1);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in Data Transfer to Device"));

                cublasSetVector(n * k, sizeof(double), B, 1, d_B, 1);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in Data Transfer to Device"));

                //magmablas_dgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C, m); //CHANGE
                cublasDgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C, m);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in cublasDgemm routine"));

                cublasGetVector(m * n, sizeof(double), d_C, 1, C, 1);
                retStatus = cublasGetError();
                if (retStatus != CUBLAS_STATUS_SUCCESS)
                    error(_("CUBLAS: Error in Data Transfer from Device"));

                cublasFree(d_A);
                cublasFree(d_B);
                cublasFree(d_C);
            } else {
#ifdef HIPLAR_DBG
                R_ShowMessage("DBG: Performing matrix multiplication using dgemm");
#endif
                F77_CALL(dgemm)("N", "N", &m, &n, &k, &one, A, &m, B, &k, &zero, C, &m);
            }
        }
    }
    ALLOC_SLOT(val, Matrix_DimNamesSym, VECSXP, 2);
    UNPROTECT(2);
    return val;
#endif
    return R_NilValue;
}
SEXP magma_dgeMatrix_matrix_crossprod(SEXP x, SEXP y, SEXP trans)
{
#ifdef HIPLAR_WITH_MAGMA
    int tr = asLogical(trans); /* trans=TRUE: tcrossprod(x,y) */
    SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix")));
    int *xDims = INTEGER(GET_SLOT(x, Matrix_DimSym)),
        *yDims = INTEGER(getAttrib(y, R_DimSymbol)),
        *vDims, nprot = 1;
    int m = xDims[!tr], n = yDims[!tr]; /* -> result dim */
    int xd = xDims[ tr], yd = yDims[ tr]; /* the conformable dims */
    double one = 1.0, zero = 0.0;

    if (isInteger(y)) {
        y = PROTECT(coerceVector(y, REALSXP));
        nprot++;
    }
    if (!(isMatrix(y) && isReal(y)))
        error(_("Argument y must be a numeric matrix"));

    SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
    SET_SLOT(val, Matrix_DimSym, allocVector(INTSXP, 2));
    vDims = INTEGER(GET_SLOT(val, Matrix_DimSym));

    if (xd > 0 && yd > 0 && n > 0 && m > 0) {
        if (xd != yd)
            error(_("Dimensions of x and y are not compatible for %s"),
                  tr ? "tcrossprod" : "crossprod");
        vDims[0] = m;
        vDims[1] = n;
        SET_SLOT(val, Matrix_xSym, allocVector(REALSXP, m * n));
        double *A = REAL(GET_SLOT(x, Matrix_xSym));
        double *B = REAL(y);
        double *C = REAL(GET_SLOT(val, Matrix_xSym));

        if (GPUFlag == 1) {
            double *d_A, *d_B, *d_C;
            cublasStatus retStatus;
#ifdef HIPLAR_DBG
            R_ShowMessage("DBG: Performing dge/matrix crossprod using magmablas_dgemm");
#endif
            cublasAlloc(m * xd, sizeof(double), (void**)&d_A);
            retStatus = cublasGetError();
            if (retStatus != CUBLAS_STATUS_SUCCESS)
                error(_("CUBLAS: Error in Memory Allocation"));

            cublasAlloc(n * xd, sizeof(double), (void**)&d_B);
            retStatus = cublasGetError();
            if (retStatus != CUBLAS_STATUS_SUCCESS)
                error(_("CUBLAS: Error in Memory Allocation"));

            cublasAlloc(m * n, sizeof(double), (void**)&d_C);
            retStatus = cublasGetError();
            if (retStatus != CUBLAS_STATUS_SUCCESS)
                error(_("CUBLAS: Error in Memory Allocation"));

            cublasSetVector(m * xd, sizeof(double), A, 1, d_A, 1);
            retStatus = cublasGetError();
            if (retStatus != CUBLAS_STATUS_SUCCESS)
                error(_("CUBLAS: Error in Data Transfer to Device"));

            cublasSetVector(xd * n, sizeof(double), B, 1, d_B, 1);
            retStatus = cublasGetError();
            if (retStatus != CUBLAS_STATUS_SUCCESS)
                error(_("CUBLAS: Error in Data Transfer to Device"));

            cublasSetVector(m * n, sizeof(double), C, 1, d_C, 1);
            retStatus = cublasGetError();
            if (retStatus != CUBLAS_STATUS_SUCCESS)
                error(_("CUBLAS: Error in Data Transfer to Device"));

            //magmablas_dgemm( tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd, one, d_A, xDims[0], d_B, yDims[0], zero, d_C, m); //CHANGE
            cublasDgemm(tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd,
                        one, d_A, xDims[0], d_B, yDims[0], zero, d_C, m);

            cublasGetVector(m * n, sizeof(double), d_C, 1, C, 1);
            retStatus = cublasGetError();
            if (retStatus != CUBLAS_STATUS_SUCCESS)
                error(_("CUBLAS: Error in Data Transfer from Device"));

            cublasFree(d_A);
            cublasFree(d_B);
            cublasFree(d_C);
        } else {
#ifdef HIPLAR_DBG
            R_ShowMessage("DBG: Performing dge/matrix cross prod with dgemm");
#endif
            F77_CALL(dgemm)(tr ? "N" : "T", tr ? "T" : "N", &m, &n, &xd,
                            &one, A, xDims, B, yDims, &zero, C, &m);
        }
    }
    UNPROTECT(nprot);
    return val;
#endif
    return R_NilValue;
}
void blasx_gpu_dgemm_kernel(int j,
                            int nrowa, int ncola,
                            int nrowb, int ncolb,
                            int nrowc, int ncolc,
                            int current_task, int prior_task,
                            enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                            double* A, double* B, double* C,
                            int lda, int ldb, int ldc,
                            int x, int y, int z,
                            double** C_dev,
                            cudaStream_t *stream, cublasHandle_t *handle_p,
                            int current_stream,
                            double alpha, double beta,
                            int block_dim, int switcher,
                            int* task_batch_counter,
                            LRU_t **LRUs, int GPUs,
                            int *mem_cpy_counter,
                            reader_tracker *addr_track,
                            int GPU_id)
{
    int nrowa_dev, nrowb_dev, nrowc_dev;
    int ncola_dev, ncolb_dev, ncolc_dev;
    int nrow_offset_a, nrow_offset_b;
    int ncol_offset_a, ncol_offset_b;
    int i = current_task/(y+1);
    int k = current_task%(y+1);
    double *A_dev, *B_dev;
    if (TransA != CblasNoTrans) {
        margin_adjustment(nrowa, ncola, block_dim, j, i, &nrowa_dev, &ncola_dev);
    } else {
        margin_adjustment(nrowa, ncola, block_dim, i, j, &nrowa_dev, &ncola_dev);
    }
    if (TransB != CblasNoTrans) {
        margin_adjustment(nrowb, ncolb, block_dim, k, j, &nrowb_dev, &ncolb_dev);
    } else {
        margin_adjustment(nrowb, ncolb, block_dim, j, k, &nrowb_dev, &ncolb_dev);
    }
    margin_adjustment(nrowc, ncolc, block_dim, i, k, &nrowc_dev, &ncolc_dev);
    if (TransA != CblasNoTrans) {
        nrow_offset_a = j*block_dim, ncol_offset_a = i*block_dim;
    } else {
        nrow_offset_a = i*block_dim, ncol_offset_a = j*block_dim;
    }
    if (TransB != CblasNoTrans) {
        nrow_offset_b = k*block_dim, ncol_offset_b = j*block_dim;
    } else {
        nrow_offset_b = j*block_dim, ncol_offset_b = k*block_dim;
    }
    double *starting_point_A = &A[nrow_offset_a+ncol_offset_a*lda];
    double *starting_point_B = &B[nrow_offset_b+ncol_offset_b*ldb];

    // Asynchronously set matrix on GPU
    //----------------LRU&RBT optimization----------------//
    mem_control_kernel_double(starting_point_A, &A_dev, LRUs, GPUs, GPU_id, block_dim,
                              mem_cpy_counter, addr_track, stream, nrowa_dev, ncola_dev, lda);
    mem_control_kernel_double(starting_point_B, &B_dev, LRUs, GPUs, GPU_id, block_dim,
                              mem_cpy_counter, addr_track, stream, nrowb_dev, ncolb_dev, ldb);
    //----------------------------------------------------//

    if (j == 0) {
        margin_adjustment(nrowc, ncolc, block_dim, i, k, &nrowc_dev, &ncolc_dev);
        int nrow_offset_c = i*block_dim;
        int ncol_offset_c = k*block_dim;
        double *starting_point_C = &C[nrow_offset_c+ncol_offset_c*ldc];
        if (beta != 0) {
            assert( cublasSetMatrixAsync(nrowc_dev, ncolc_dev, sizeof(double),
                                         starting_point_C, ldc,
                                         C_dev[switcher*STREAMNUM+current_stream],
                                         block_dim, *stream) == CUBLAS_STATUS_SUCCESS );
        }
        if (*task_batch_counter != 0) {
            // Set matrix back: write the prior task's C tile to the host
            int i_pre = prior_task/(y+1);
            int k_pre = prior_task%(y+1);
            int nrowc_dev_pre, ncolc_dev_pre;
            margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre,
                              &nrowc_dev_pre, &ncolc_dev_pre);
            int nrow_offset_c_pre = i_pre*block_dim;
            int ncol_offset_c_pre = k_pre*block_dim;
            double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
            assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double),
                                         C_dev[current_stream+(1-switcher)*STREAMNUM], block_dim,
                                         starting_point_C_pre, ldc, *stream) == CUBLAS_STATUS_SUCCESS );
        }
    }
    cudaStreamSynchronize(*stream);
    assert( cublasSetStream(*handle_p, *stream) == CUBLAS_STATUS_SUCCESS );
    double beta_inner = (j == 0) ? beta : 1;
    int ka = (TransA != CblasNoTrans) ? nrowa_dev : ncola_dev;
    cublasOperation_t transa, transb;
    CBLasTransToCuBlasTrans(TransA, &transa);
    CBLasTransToCuBlasTrans(TransB, &transb);
    cublasStatus_t status = cublasDgemm(*handle_p, transa, transb,
                                        nrowc_dev, ncolc_dev, ka,
                                        &alpha, A_dev, block_dim,
                                        B_dev, block_dim,
                                        &beta_inner,
                                        C_dev[switcher*STREAMNUM+current_stream], block_dim);
    assert( status == CUBLAS_STATUS_SUCCESS );
}