Beispiel #1
0
/* C <- alpha*A*B + beta*C */
void gemm(double alpha, Mat mA, Mat mB, double beta, Mat mC) {
  const int n = MatN(mA);
  const void* const a = MatElems(mA);
  const void* const b = MatElems(mB);
  void* const c = MatElems(mC);
  const bool dev = MatDev(mA);

  switch (MatElemSize(mA)) {
  case 4:
    if (dev) {
      float alpha32 = alpha, beta32 = beta;
      cublasSgemm(g_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                  &alpha32, a, n, b, n, &beta32, c, n);
    } else {
      cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, n, n,
                  n, alpha, a, n, b, n, beta, c, n);
    }
    break;

  case 8:
    if (dev) {
      cublasDgemm(g_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                  &alpha, a, n, b, n, &beta, c, n);
    } else {
      cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, n, n,
                  n, alpha, a, n, b, n, beta, c, n);
    }
    break;
  }
}
Beispiel #2
0
CAMLprim value spoc_cublasDgemm(value transa, value transb,
		value m, value n, value k,
		value alpha, value a, value lda,
		value b, value ldb, value beta, value c, value ldc, value dev){
	CAMLparam5(transa, transb, m, n, k);
	CAMLxparam5(alpha, a, lda, b, ldb);
	CAMLxparam4(beta, c, ldc, dev);
	CAMLlocal3(dev_vec_array, dev_vec, gi);
	CUdeviceptr d_A;
	CUdeviceptr d_B;
	CUdeviceptr d_C;
	int id;
	gi = Field(dev, 0);
	id = Int_val(Field(gi, 7));

	GET_VEC(a, d_A);
	GET_VEC(b, d_B);
	GET_VEC(c, d_C);

	//CUBLAS_GET_CONTEXT;
	CUBLAS_GET_CONTEXT;


	cublasDgemm (Int_val(transa), Int_val(transb), Int_val(m), Int_val(n),
	Int_val(k), (double)Double_val(alpha), (double*) d_A, Int_val(lda),
	(double*) d_B, Int_val(ldb), (double) Double_val(beta),
	(double *)d_C, Int_val(ldc));

	CUDA_RESTORE_CONTEXT;

	CAMLreturn(Val_unit);
}
Beispiel #3
0
void gpu_cublas1(double *A, double *B, double *C, double *D, double *r, double *nrmC, int N, int N2)
{
	#pragma acc data present(A, B, C, D)
	{
		#pragma acc host_data use_device(A, B, C, D)
		{
			cublasHandle_t handle;
			cublasCreate(&handle);
			const double alpha = 1.0;
			const double beta = 0.0;
			cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N, &alpha, A, N, B, N, &beta, C, N);
			printf(" gpu gemm success \n");
			cublasDdot(handle, N2, C, 1, B, 1, r);
			printf(" gpu dot success \n");
			*r = -1.0 * *r;
			cublasDaxpy(handle, N2, r, B, 1, C, 1);
			printf(" gpu axpy success \n");
			cublasDnrm2(handle, N2, C, 1, nrmC);
			printf(" gpu nrm2 success \n");
			cublasDcopy(handle, N2, C, 1, D, 1);
			printf(" gpu copy success \n");
			*nrmC = 1.0 / *nrmC;
			cublasDscal(handle, N2, nrmC, D, 1);
			printf(" gpu scal success \n");
			cublasDestroy(handle);
			printf(" gpu destroy success \n");
		}
	}
}
  static vl::Error
  gemm(vl::Context& context,
       char op1, char op2,
       ptrdiff_t m, ptrdiff_t n, ptrdiff_t k,
       type alpha,
       type const * a, ptrdiff_t lda,
       type const * b, ptrdiff_t ldb,
       type beta,
       type * c, ptrdiff_t ldc)
  {
    cublasHandle_t handle ;
    cublasStatus_t status ;
    status = context.getCudaHelper().getCublasHandle(&handle) ;
    if (status != CUBLAS_STATUS_SUCCESS) goto done ;

    status = cublasDgemm(handle,
                         (op1 == 't') ? CUBLAS_OP_T : CUBLAS_OP_N,
                         (op2 == 't') ? CUBLAS_OP_T : CUBLAS_OP_N,
                         (int)m, (int)n, (int)k,
                         &alpha,
                         a, (int)lda,
                         b, (int)ldb,
                         &beta,
                         c, (int)ldc);
  done:
    return context.setError
    (context.getCudaHelper().catchCublasError(status, "cublasDgemm"), __func__) ;
  }
Beispiel #5
0
void
cube_blas_d_gemm (cube_t *ctx,
		  cube_blas_op_t transa, cube_blas_op_t transb,
		  int m, int n, int k,
		  const double *alpha,
		  const double *A, int lda,
		  const double *B, int ldb,
		  const double *beta,
		  double *C, int ldc)
{
  cublasStatus_t status;
  cublasOperation_t ta,  tb;
  
  if (! cube_context_check (ctx))
    return;

  ta = (cublasOperation_t) transa;
  tb = (cublasOperation_t) transb;

  status = cublasDgemm (ctx->h_blas,
			ta, tb,
			m, n, k,
			alpha,
			A, lda,
			B, ldb,
			beta,
			C, ldc);

  cube_blas_check (ctx, status);
}
Beispiel #6
0
void d_gemm(SEXP rtransa, SEXP rtransb, SEXP ralpha, SEXP ra, SEXP rlda,
	SEXP rb, SEXP rldb, SEXP rbeta, SEXP rc, SEXP rldc)
{
	char
		transa = getTranspose(rtransa),
		transb = getTranspose(rtransb);
	double
		alpha = asReal(ralpha), beta = asReal(rbeta),
		* a, * b, * c;
	int
		m, n, k,
		rowsa, colsa, lda = asInteger(rlda),
		rowsb, colsb, ldb = asInteger(rldb),
		rowsc, colsc, ldc = asInteger(rldc);	
		
	unpackMatrix(ra, &rowsa, &colsa, &a);
	unpackMatrix(rb, &rowsb, &colsb, &b);
	unpackMatrix(rc, &rowsc, &colsc, &c);
	
	m = rowsa;
	n = colsb;
	k = colsa;
	
	if(isTranspose(transa)) {
		m = colsa;
		k = rowsa;
	}
	
	if(isTranspose(transb))
		n = rowsb;
	
	cublasDgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
	checkCublasError("d_gemm");
}
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb,
                 double beta, gpudata *C, size_t offC, size_t ldc) {
  cuda_context *ctx = A->ctx;
  gpudata *T;
  size_t t;
  cublasStatus_t err;
  cb_transpose transT;

  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    t = offA;
    offA = offB;
    offB = t;
  }

  cuda_enter(ctx);

  cuda_wait(A, CUDA_WAIT_READ);
  cuda_wait(B, CUDA_WAIT_READ);
  cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h,
                    convT(transA), convT(transB), M, N, K,
                    &alpha, ((double *)A->ptr) + offA, lda,
                    ((double *)B->ptr) + offB, ldb, &beta,
                    ((double *)C->ptr) + offC, ldc);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  cuda_record(A, CUDA_WAIT_READ);
  cuda_record(B, CUDA_WAIT_READ);
  cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
Beispiel #8
0
inline void gemm( const Order order, const TransA transa, const TransB transb,
        const int m, const int n, const int k, const double alpha,
        const double* a, const int lda, const double* b, const int ldb,
        const double beta, double* c, const int ldc ) {
    BOOST_STATIC_ASSERT( (is_same<Order, tag::column_major>::value) );
    cublasDgemm( blas_option< TransA >::value, blas_option< TransB >::value,
            m, n, k, alpha, a, lda, b, ldb, beta, c, ldc );
}
Beispiel #9
0
void gemm(bool transa, bool transb, int m, int n, int k, double alpha, thrust::device_ptr<const double> A, int lda,
    thrust::device_ptr<const double> B, int ldb, double beta, thrust::device_ptr<double> C, int ldc)
{
  const cublasOperation_t ctransa = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
  const cublasOperation_t ctransb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;

  cublasSetStream(context::get().cublasHandle, context::get().stream);
  cublasDgemm(context::get().cublasHandle, ctransa, ctransb, m, n, k, &alpha, A.get(), lda, B.get(), ldb, &beta, C.get(), ldc);
}
Beispiel #10
0
Darray<double> cudot (const Darray<double>& lhs, const Darray<double>& rhs)
{
	// context check
	CHECK_EQ(lhs.getDeviceManager().getDeviceID(), rhs.getDeviceManager().getDeviceID());
	
	CHECK_EQ(lhs.ndim(), rhs.ndim());
	CHECK_LT(lhs.ndim(), 3);
	CHECK_LT(rhs.ndim(), 3);

	Darray<double> ret;

	if (lhs.ndim()==1 && rhs.ndim()==1)
	{
		// shape check
		CHECK_EQ(lhs.size(), rhs.size());
		ret = Darray<double>(lhs.getDeviceManager(), {1});
		
		// using cublas ddot
		lhs.deviceSet();
		cublasDdot (DeviceManager::handle,
				    lhs.size(),
				    lhs.data,
				    1,
				    rhs.data,
				    1,
				    ret.data);
	}
	// 2D matrix dot
	else if (lhs.ndim()==2 && rhs.ndim()==2)
	{
		// shape check
		CHECK_EQ(lhs.shape()[1], rhs.shape()[0]);
		ret = Darray<double>(lhs.getDeviceManager(), {lhs.shape()[0], rhs.shape()[1]});
		
		// using cblas dgemm
		lhs.deviceSet();
		const double alpha = 1.;
		const double beta = 0.;
		CUBLAS_SAFE_CALL(
		cublasDgemm (DeviceManager::handle,
					CUBLAS_OP_N,
					CUBLAS_OP_N,
					lhs.shape()[0],
					rhs.shape()[1],
					lhs.shape()[1],
					&alpha,
					lhs.dev_data,
					lhs.shape()[0],
					rhs.dev_data,
					rhs.shape()[0],
					&beta,
					ret.dev_data,
					ret.shape()[0])
		);
	}
	return ret;
}
Beispiel #11
0
void contractTensor(cublasHandle_t &handle, sTensorGPU &tensorIn1, sTensorGPU &tensorIn2, sTensorGPU &tensorOut, int numDim = 1)
{
	//printf("\nContract(%d) ->%d\n", numDim, tensorIn2.id);
	//printTensor(tensorIn1);
	//printTensor(tensorIn2);

	//sTensor tensorOut;
	tensorOut.dim = tensorIn1.dim + tensorIn2.dim - numDim * 2;
	//tensorOut.size = (int*)malloc(sizeof(int)*tensorOut.dim);
	int outW = 1;
	int outH = 1;
	int contract = tensorIn1.size[0];
	int test = tensorIn2.size[0];
	for (int i = 1; i<numDim; i++){
		contract *= tensorIn1.size[i];
		test *= tensorIn2.size[i];
	}
	if (test != contract){
		printf("Unequal Size %d!=%d\n", contract, test);
	}

	{
		int i = 0;
		for (int j = numDim; j<tensorIn1.dim; j++){
			tensorOut.size[i++] = tensorIn1.size[j];
			outW *= tensorIn1.size[j];
		}
		for (int j = numDim; j<tensorIn2.dim; j++){
			tensorOut.size[i++] = tensorIn2.size[j];
			outH *= tensorIn2.size[j];
		}
	}
	tensorOut.dataSize = outW * outH;
	//double* deviceData;
	//handleError(cudaMalloc((void **)&deviceData, sizeof(double)*tensorOut.dataSize));
	//tensorOut.deviceData = deviceData;

	type alpha = 1.f;
	type beta = 0.f;
	cublasStatus_t ret;
	double flops = outW;
	flops *= outH;
	flops *= contract;
	flops *= 2;
	bops += flops;
	//                                                    m     n      k                  lda x k            ldA         ldb x n          ldB                ldc x n            ldC
	ret = cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, outW, outH, contract, &alpha, tensorIn1.deviceData, contract, tensorIn2.deviceData, contract, &beta, tensorOut.deviceData, outW);
	if (ret != CUBLAS_STATUS_SUCCESS)
	{
		printf("cublasSgemm returned error code %d\n", ret);
	}
	cudaDeviceSynchronize();
	//freeTensor(tensorIn1);
	//freeTensor(tensorIn2);
	//return tensorOut;
}
void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
    const double alpha, const double* A, const double* B, const double beta,
    double* C) {
  // Note that cublas follows fortran order.
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
      N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
}
Beispiel #13
0
// Note : cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n,n,n, &alpha, A, n, B, n, &beta, C, n)
// means matrix C = B * A
void cublas_gemm(int n, double *c, double *b, double *a )
{
	#pragma acc data present(a, b, c)
	{
		#pragma acc host_data use_device(a, b, c)
		{
			cublasHandle_t handle;
			cublasCreate(&handle);
			const double alpha = 1.0;
			const double beta = 0.0;
			cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n,n,n, &alpha, a, n, b, n, &beta, c, n);
			cublasDestroy(handle);
		}
	}
}
Beispiel #14
0
void cublas_gemm(const double *A, const double *B, double *C, int N)
{
	#pragma acc data present(A, B, C)
	{
		#pragma acc host_data use_device(A, B, C)
		{
			cublasHandle_t h;
			cublasCreate(&h);
			const double alpha = 1.0;
			const double beta = 0.0;
			cublasDgemm(h, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N, &alpha, A, N, B, N, &beta, C, N);
			cublasDestroy(h);
		}
	}
}
Beispiel #15
0
void magma_dgemm(
    magma_trans_t transA, magma_trans_t transB,
    magma_int_t m, magma_int_t n, magma_int_t k,
    double alpha, double const* dA, magma_int_t lda,
                           double const* dB, magma_int_t ldb,
    double beta,  double*       dC, magma_int_t ldc )
{
    cublasDgemm(
        cublas_trans_const( transA ),
        cublas_trans_const( transB ),
        m, n, k,
        alpha, dA, lda,
               dB, ldb,
        beta,  dC, ldc );
}
Beispiel #16
0
      static void GEMM(Teuchos::ETransp transA, Teuchos::ETransp transB, double alpha,
          View<const double**,LayoutLeft,Cuda> A, View<const double**,LayoutLeft,Cuda> B,
          double beta, View<double**,LayoutLeft,Cuda> C) {
        const int m = static_cast<int>(C.dimension_0()),
                  n = static_cast<int>(C.dimension_1()),
                  k = (transA == Teuchos::NO_TRANS ? A.dimension_1() : A.dimension_0()),
                  lda = static_cast<int>(Impl::getStride2DView(A)),
                  ldb = static_cast<int>(Impl::getStride2DView(B)),
                  ldc = static_cast<int>(Impl::getStride2DView(C));
        const char char_transA = (transA == Teuchos::NO_TRANS ? 'N' : 'T'),
                   char_transB = (transB == Teuchos::NO_TRANS ? 'N' : 'T');
        cublasDgemm(char_transA, char_transB, m, n, k, alpha, A.ptr_on_device(), lda, B.ptr_on_device(), ldb, beta, C.ptr_on_device(), ldc);
#ifdef HAVE_KOKKOS_DEBUG
        cublasStatus info = cublasGetError();
        TEUCHOS_TEST_FOR_EXCEPTION( info != CUBLAS_STATUS_SUCCESS, std::runtime_error, "cublasDgemm failed with status " << info << "." );
#endif
      }
// Multiply the arrays A and B on GPU and save the result in C
// C(m,n) = A(m,k) * B(k,n)
void gpu_blas_mmul(const double *A, const double *B, double *C, const int m, const int k, const int n) {
     int lda=m,ldb=k,ldc=m;
     const double alf = 1;
     const double bet = 0;
     const double *alpha = &alf;
     const double *beta = &bet;
 
     // Create a handle for CUBLAS
     cublasHandle_t handle;
     cublasCreate(&handle);
 
     // Do the actual multiplication
     cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 
     // Destroy the handle
     cublasDestroy(handle);
 }
Beispiel #18
0
double cublas_gemm_norm(const double *A, const double *B, double *C, int N)
{
	double *norm;
	norm = (double *) malloc(1*sizeof(double));

        #pragma acc data present(A, B, C) copyout(norm[0])
        {
                #pragma acc host_data use_device(A, B, C)
                {
                        cublasHandle_t h;
                        cublasCreate(&h);
                        const double alpha = 1.0;
                        const double beta = 0.0;
                        cublasDgemm(h, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N, &alpha, A, N, B, N, &beta, C, N);
						cublasDnrm2(h, N*N, C, 1, norm);
                        cublasDestroy(h);
                }
        }
	return *norm;
}
int tiramisu_cublas_dgemm(double *A, double *B, double *C,
                          uint64_t M, uint64_t N, uint64_t K,
                          double alpha, double beta,
                          uint64_t ldA, uint64_t ldB, uint64_t ldC,
                          uint64_t offsetA, uint64_t offsetB, uint64_t offsetC,
                          bool transposeA, bool transposeB)
{
    // TODO: Destroy the handle.
    static bool handle_created = false;
    static cublasHandle_t handle;
    if (!handle_created) {
        cublasCreate(&handle);
        handle_created = true;
    }
    // Default values for tight packing:
    if (ldA == 0) {
        ldA = transposeA ? M : K;
    }
    if (ldB == 0) {
        ldB = transposeB ? K : N;
    }
    if (ldC == 0) {
        ldC = N;
    }
    // The cuBLAS GEMM accepts column major buffers by default. We do a simple
    // trick here to multiply row major matrices. From a row-major perspective,
    // column-major multiplication basically transposes inputs, multiplies, and
    // transposes the output again: cublas(A, B) = ((A^T)x(B^T))^T = BxA
    // So it is actually equivalent to row-major GEMM with inputs swapped.
    // We need to reorder the size parameters as well to make it work:
    cublasSetStream(handle, cudaStreamPerThread);
    handle_cublas_error(
        cublasDgemm(handle,
                    transposeB ? CUBLAS_OP_T : CUBLAS_OP_N,
                    transposeA ? CUBLAS_OP_T : CUBLAS_OP_N,
                    N, M, K,
                    &alpha, B + offsetB, ldB, A + offsetA, ldA,
                    &beta, C + offsetC, ldC),
         __FUNCTION__);
    return 0;
}
Beispiel #20
0
SEXP magMultmm(SEXP a, SEXP transa, SEXP b, SEXP transb)
{
   SEXP gpu = magGetGPU(a, b),
        c = PROTECT(NEW_OBJECT(MAKE_CLASS("magma")));
   int TA = LOGICAL_VALUE(transa), TB = LOGICAL_VALUE(transb),
       *DIMA = INTEGER(GET_DIM(a)), *DIMB = INTEGER(GET_DIM(b)),
       M = DIMA[TA], N = DIMB[!TB], K = DIMA[!TA],
       LDA = DIMA[0], LDB = DIMB[0], LDC = M;
   char TRANSA = (TA ? 'T' : 'N'), TRANSB = (TB ? 'T' : 'N');
   double *A = REAL(PROTECT(AS_NUMERIC(a))), *B = REAL(PROTECT(AS_NUMERIC(b))),
          *dA, *dB, *dC;
 
   if(DIMB[TB] != K) error("non-conformable matrices");

   c = SET_SLOT(c, install(".Data"), allocMatrix(REALSXP, M, N));
   SET_SLOT(c, install("gpu"), duplicate(gpu));
   
   magma_malloc((void**)&dA, (M*K)*sizeof(double));
   magma_malloc((void**)&dB, (K*N)*sizeof(double));
   magma_malloc((void**)&dC, (M*N)*sizeof(double));

   magma_dsetmatrix(DIMA[0], DIMA[1], A, LDA, dA, LDA);
   magma_dsetmatrix(DIMB[0], DIMB[1], B, LDB, dB, LDB);

   if(LOGICAL_VALUE(gpu))
      magmablas_dgemm(TRANSA, TRANSB, M, N, K, 1.0, dA, LDA, dB, LDB, 0.0, dC, LDC);
   else
      cublasDgemm(TRANSA, TRANSB, M, N, K, 1.0, dA, LDA, dB, LDB, 0.0, dC, LDC);

   magma_dgetmatrix(M, N, dC, LDC, REAL(c), LDC);

   magma_free(dA);
   magma_free(dB);
   magma_free(dC);

   UNPROTECT(3);

   return c;
}
void mat_prod_mat(const double* a, cublasOperation_t op_a, const double* b, cublasOperation_t op_b, double*c, int m, int n, int k){

	cudaError_t cudaStat ; // cudaMalloc status
	cublasStatus_t stat ; // CUBLAS functions status
	cublasHandle_t handle ; // CUBLAS context

	// on the device
	double* d_a; // d_a - a on the device
	double* d_b; // d_b - b on the device
	double* d_c; // d_c - c on the device
	cudaStat = cudaMalloc((void **)&d_a ,m*k*sizeof(*a)); // device
	// memory alloc for a
	cudaStat = cudaMalloc((void **)&d_b ,k*n*sizeof(*b)); // device
	// memory alloc for b
	cudaStat = cudaMalloc((void **)&d_c ,m*n*sizeof(*c)); // device
	// memory alloc for c
	stat = cublasCreate(&handle); // initialize CUBLAS context
// copy matrices from the host to the device
	stat = cublasSetMatrix (m,k, sizeof(*a) ,a,m,d_a ,m); //a -> d_a
	stat = cublasSetMatrix (k,n, sizeof(*b) ,b,k,d_b ,k); //b -> d_b
	stat = cublasSetMatrix (m,n, sizeof(*c) ,c,m,d_c ,m); //c -> d_c
	double al=1.0;	
	double bet=1.0;
// matrix - matrix multiplication : d_c = al*d_a *d_b + bet *d_c
// d_a -mxk matrix , d_b -kxn matrix , d_c -mxn matrix ;
// al ,bet -scalars
	stat=cublasDgemm(handle,op_a,op_b,m,n,k,&al,d_a,m,d_b,k,&bet,d_c,m);
	
	stat = cublasGetMatrix (m, n, sizeof(*c) ,d_c ,m,c,m); // cp d_c - >c

	cudaFree (d_a ); // free device memory
	cudaFree (d_b ); // free device memory
	cudaFree (d_c ); // free device memory
	cublasDestroy ( handle ); // destroy CUBLAS context

}
Beispiel #22
0
		cublasStatus_t cublasXgemm(cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const double *alpha, 
								   const double *A, int lda, const double *B, int ldb, const double *beta, double *C, int ldc) {
			return cublasDgemm(g_context->cublasHandle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
		}
Beispiel #23
0
int main( int argc, char** argv )
{
    magma_init();
    cublasHandle_t handle;
    cudaSetDevice( 0 );
    cublasCreate( &handle );
    
    double *A, *B, *C;
    double *dA, *dB, *dC;
    double error, work[1];
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    magma_int_t ione = 1;
    magma_int_t ISEED[4] = { 1, 2, 3, 4 };
    magma_int_t n = 10;
    magma_int_t lda = n;
    magma_int_t ldda = ((n+31)/32)*32;
    magma_int_t size = lda*n;
    magma_int_t info;
    
    magma_dmalloc_cpu( &A, lda*n );
    magma_dmalloc_cpu( &B, lda*n );
    magma_dmalloc_cpu( &C, lda*n );
    magma_dmalloc( &dA, ldda*n );
    magma_dmalloc( &dB, ldda*n );
    magma_dmalloc( &dC, ldda*n );
    
    // initialize matrices
    lapackf77_dlarnv( &ione, ISEED, &size, A );
    lapackf77_dlarnv( &ione, ISEED, &size, B );
    lapackf77_dlarnv( &ione, ISEED, &size, C );
    // increase diagonal to be SPD
    for( int i=0; i < n; ++i ) {
        C[i+i*lda] = MAGMA_D_ADD( C[i+i*lda], MAGMA_D_MAKE( n*n, 0 ));
    }
    
    magma_dsetmatrix( n, n, A, lda, dA, ldda );
    magma_dsetmatrix( n, n, B, lda, dB, ldda );
    magma_dsetmatrix( n, n, C, lda, dC, ldda );
    
    // compute with cublas
    cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                 &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda );
    
    magma_dpotrf_gpu( MagmaLower, n, dC, ldda, &info );
    if (info != 0)
        printf("magma_dpotrf returned error %d: %s.\n",
               (int) info, magma_strerror( info ));
    
    // compute with LAPACK
    blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n,
                   &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda );
    
    lapackf77_dpotrf( MagmaLowerStr, &n, C, &lda, &info );
    if (info != 0)
        printf("lapackf77_dpotrf returned error %d: %s.\n",
               (int) info, magma_strerror( info ));
    
    // compute difference
    magma_dgetmatrix( n, n, dC, ldda, A, lda );
    blasf77_daxpy( &size, &c_neg_one, C, &ione, A, &ione );
    error = lapackf77_dlange( "F", &n, &n, A, &lda, work );
    printf( "n %d, error %8.2e\n", n, error );
    
    magma_free( dA );
    magma_free( dB );
    magma_free( dC );
    magma_free_cpu( A );
    magma_free_cpu( B );
    magma_free_cpu( C );
    
    cublasDestroy( handle );
    magma_finalize();
    return 0;
}
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      double beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cuda_context *ctx;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  cb_transpose transT;
  cublasStatus_t err;

  if (batchCount == 0) return GA_NO_ERROR;

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  cuda_enter(ctx);

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    lt = offA;
    offA = offB;
    offB = lt;
  }

  // use parallel cublasSgemm calls rather than cublasSgemmBatched for large products
  const size_t threshold = 650;
  const int multiple_dispatch = M * N * K > threshold * threshold * threshold;
  if (multiple_dispatch) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);

      err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h,
                        convT(transA), convT(transB),
                        M, N, K, &alpha,
                        (double*)A[i]->ptr + offA[i], lda,
                        (double*)B[i]->ptr + offB[i], ldb,
                        &beta,
                        (double*)C[i]->ptr + offC[i], ldc);
      if (err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (err == CUBLAS_STATUS_ARCH_MISMATCH)
          return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }

      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  } else {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **B_l = (const double **)T_l + batchCount;
    double **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = ((double *)A[i]->ptr) + offA[i];
      B_l[i] = ((double *)B[i]->ptr) + offB[i];
      C_l[i] = ((double *)C[i]->ptr) + offC[i];
    }

    cuMemAlloc(&Ta, sizeof(double *) * batchCount * 3);
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(double *));
    Ca = Ta + (batchCount * sizeof(double *) * 2);

    cuMemcpyHtoD(Ta, T_l, sizeof(double *) * batchCount * 3);

    err = cublasDgemmBatched(((blas_handle *)ctx->blas_handle)->h,
                             convT(transA), convT(transB),
                             M, N, K, &alpha, (const double **)Aa, lda,
                             (const double **)Ba, ldb, &beta,
                             (double **)Ca, ldc, batchCount);
    cuMemFree(Ta);
    if (err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (err == CUBLAS_STATUS_ARCH_MISMATCH)
        return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }

    for (i = 0; i < batchCount; i++) {
      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
int cublas_multiply_matrix_double2( PGM_Matriz_Double  *A, PGM_Matriz_Double  *B, PGM_Matriz_Double  *result, PGM_Matriz_Double *work){
    cublasStatus_t status;
    cublasHandle_t handle;
    int max_dim = work->n_linhas;
    PGM_Matriz_GPU device_A,
                   device_B,
                   device_C;
    double alpha = 1.0,
           beta = 0.0;

    if (A->n_linhas != result->n_linhas || B->n_colunas != result->n_colunas){
        return 0;
    }

    status = cublasCreate(&handle);

    if (status != CUBLAS_STATUS_SUCCESS){
        return -1;
    }

    if(create_PGM_Matriz_GPU_double(&device_A,A->n_linhas, A->n_colunas,max_dim) != cudaSuccess){
        return -2;
    }
    if(create_PGM_Matriz_GPU_double(&device_B,B->n_linhas, B->n_colunas,max_dim) != cudaSuccess){
        if(cudaFree(device_A.valor) == cudaSuccess) return -2;
        else return -11;
    }
    if(create_PGM_Matriz_GPU_double(&device_C,result->n_linhas, result->n_colunas,max_dim) != cudaSuccess){
        if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess) return -2;
        else return -11;
    }

    if(double_copyMatrixHost2GPU(A,work,&device_A) != CUBLAS_STATUS_SUCCESS){
        if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess && cudaFree(device_C.valor) == cudaSuccess) return -3;
        else return -12;
    }

    if(double_copyMatrixHost2GPU(B,work,&device_B) != CUBLAS_STATUS_SUCCESS){
        if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess && cudaFree(device_C.valor) == cudaSuccess) return -3;
        else return -12;
    }
    if(cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, max_dim,max_dim,max_dim, &alpha,(double*) device_A.valor,max_dim,(double*)device_B.valor, max_dim, &beta, (double*)device_C.valor, max_dim) != CUBLAS_STATUS_SUCCESS){
        if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess && cudaFree(device_C.valor) == cudaSuccess) return -4;
        else return -13;
    }
    if(double_copyMatrixGPU2Host_Transpose(result, &device_C, work) != CUBLAS_STATUS_SUCCESS){
        if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess && cudaFree(device_C.valor) == cudaSuccess) return -5;
        else return -14;
    }

    if(cudaFree(device_A.valor) != cudaSuccess){
        return -15;
    }
    if(cudaFree(device_B.valor) != cudaSuccess){
        return -15;
    }
    if(cudaFree(device_C.valor) != cudaSuccess){
        return -15;
    }
    if(cublasDestroy(handle) != CUBLAS_STATUS_SUCCESS){
        return -7;
    }
    return 1;
}
Beispiel #26
0
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    real_Double_t   gflops, t1, t2;
    double c_neg_one = MAGMA_D_NEG_ONE;
    magma_int_t ione = 1;
    const char trans[] = { 'N', 'C', 'T' };
    const char uplo[]  = { 'L', 'U' };
    const char diag[]  = { 'U', 'N' };
    const char side[]  = { 'L', 'R' };
    
    double  *A,  *B,  *C,   *C2, *LU;
    double *dA, *dB, *dC1, *dC2;
    double alpha = MAGMA_D_MAKE( 0.5, 0.1 );
    double beta  = MAGMA_D_MAKE( 0.7, 0.2 );
    double dalpha = 0.6;
    double dbeta  = 0.8;
    double work[1], error, total_error;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t m, n, k, size, maxn, ld, info;
    magma_int_t *piv;
    magma_err_t err;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" );
    
    total_error = 0.;
    for( int i = 0; i < opts.ntest; ++i ) {
        m = opts.msize[i];
        n = opts.nsize[i];
        k = opts.ksize[i];
        printf("=========================================================================\n");
        printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k );
        
        // allocate matrices
        // over-allocate so they can be any combination of {m,n,k} x {m,n,k}.
        maxn = max( max( m, n ), k );
        ld = maxn;
        size = maxn*maxn;
        err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) );  assert( err == 0 );
        err = magma_dmalloc_pinned( &A,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &B,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &C,  size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &C2, size );  assert( err == 0 );
        err = magma_dmalloc_pinned( &LU, size );  assert( err == 0 );
        err = magma_dmalloc( &dA,  size );        assert( err == 0 );
        err = magma_dmalloc( &dB,  size );        assert( err == 0 );
        err = magma_dmalloc( &dC1, size );        assert( err == 0 );
        err = magma_dmalloc( &dC2, size );        assert( err == 0 );
        
        // initialize matrices
        size = maxn*maxn;
        lapackf77_dlarnv( &ione, ISEED, &size, A  );
        lapackf77_dlarnv( &ione, ISEED, &size, B  );
        lapackf77_dlarnv( &ione, ISEED, &size, C  );
        
        printf( "========== Level 1 BLAS ==========\n" );
        
        // ----- test DSWAP
        // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A
        assert( n >= 4 );
        magma_dsetmatrix( m, n, A, ld, dA, ld );
        magma_dsetmatrix( m, n, A, ld, dB, ld );
        magma_dswap( m, dA(0,1), 1, dA(0,2), 1 );
        magma_dswap( m, dB(0,1), 1, dB(0,2), 1 );
        
        // check results, storing diff between magma and cuda calls in C2
        cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 );
        magma_dgetmatrix( m, n, dB, ld, C2, ld );
        error = lapackf77_dlange( "F", &m, &k, C2, &ld, work );
        total_error += error;
        printf( "dswap             diff %.2g\n", error );
        
        // ----- test IDAMAX
        // get argmax of column of A
        magma_dsetmatrix( m, k, A, ld, dA, ld );
        error = 0;
        for( int j = 0; j < k; ++j ) {
            magma_int_t i1 = magma_idamax( m, dA(0,j), 1 );
            magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 );
            assert( i1 == i2 );
            error += abs( i1 - i2 );
        }
        total_error += error;
        gflops = (double)m * k / 1e9;
        printf( "idamax            diff %.2g\n", error );
        printf( "\n" );
        
        printf( "========== Level 2 BLAS ==========\n" );
        
        // ----- test DGEMV
        // c = alpha*A*b + beta*c,  with A m*n; b,c m or n-vectors
        // try no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
            magma_dsetmatrix( m, n, A,  ld, dA,  ld );
            magma_dsetvector( maxn, B, 1, dB,  1 );
            magma_dsetvector( maxn, C, 1, dC1, 1 );
            magma_dsetvector( maxn, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            size = (trans[ia] == 'N' ? m : n);
            cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( size, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DGEMV( m, n ) / 1e9;
            printf( "dgemv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test DSYMV
        // c = alpha*A*b + beta*c,  with A m*m symmetric; b,c m-vectors
        // try upper/lower
        for( int iu = 0; iu < 2; ++iu ) {
            magma_dsetmatrix( m, m, A, ld, dA, ld );
            magma_dsetvector( m, B, 1, dB,  1 );
            magma_dsetvector( m, C, 1, dC1, 1 );
            magma_dsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYMV( m ) / 1e9;
            printf( "dsymv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test DTRSV
        // solve A*c = c,  with A m*m triangular; c m-vector
        // try upper/lower, no-trans/trans, unit/non-unit diag
        // Factor A into LU to get well-conditioned triangles, else solve yields garbage.
        // Still can give garbage if solves aren't consistent with LU factors,
        // e.g., using unit diag for U, so copy lower triangle to upper triangle.
        // Also used for trsm later.
        lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld );
        lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info );
        for( int j = 0; j < maxn; ++j ) {
            for( int i = 0; i < j; ++i ) {
                *LU(i,j) = *LU(j,i);
            }
        }
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            magma_dsetmatrix( m, m, LU, ld, dA, ld );
            magma_dsetvector( m, C, 1, dC1, 1 );
            magma_dsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9;
            printf( "dtrsv( %c, %c, %c )  diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 );
        }}}
        printf( "\n" );
        
        printf( "========== Level 3 BLAS ==========\n" );
        
        // ----- test DGEMM
        // C = alpha*A*B + beta*C,  with A m*k or k*m; B k*n or n*k; C m*n
        // try combinations of no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
        for( int ib = 0; ib < 3; ++ib ) {
            bool nta = (trans[ia] == 'N');
            bool ntb = (trans[ib] == 'N');
            magma_dsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA,  ld );
            magma_dsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DGEMM( m, n, k ) / 1e9;
            printf( "dgemm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], trans[ib], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DSYMM
        // C = alpha*A*B + beta*C  (left)  with A m*m symmetric; B,C m*n; or
        // C = alpha*B*A + beta*C  (right) with A n*n symmetric; B,C m*n
        // try left/right, upper/lower
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
            magma_dsetmatrix( m, m, A, ld, dA,  ld );
            magma_dsetmatrix( m, n, B, ld, dB,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9;
            printf( "dsymm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    side[is], uplo[iu], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DSYRK
        // C = alpha*A*A^H + beta*C  (no-trans) with A m*k and C m*m symmetric; or
        // C = alpha*A^H*A + beta*C  (trans)    with A k*m and C m*m symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            magma_dsetmatrix( n, k, A, ld, dA,  ld );
            magma_dsetmatrix( n, n, C, ld, dC1, ld );
            magma_dsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYRK( k, n ) / 1e9;
            printf( "dsyrk( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DSYR2K
        // C = alpha*A*B^H + ^alpha*B*A^H + beta*C  (no-trans) with A,B n*k; C n*n symmetric; or
        // C = alpha*A^H*B + ^alpha*B^H*A + beta*C  (trans)    with A,B k*n; C n*n symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            bool nt = (trans[it] == 'N');
            magma_dsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA,  ld );
            magma_dsetmatrix( n, n, C, ld, dC1, ld );
            magma_dsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DSYR2K( k, n ) / 1e9;
            printf( "dsyr2k( %c, %c )    diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test DTRMM
        // C = alpha*A*C  (left)  with A m*m triangular; C m*n; or
        // C = alpha*C*A  (right) with A n*n triangular; C m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_dsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9;
            printf( "dtrmm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // ----- test DTRSM
        // solve A*X = alpha*B  (left)  with A m*m triangular; B m*n; or
        // solve X*A = alpha*B  (right) with A n*n triangular; B m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_dsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA,  ld );
            magma_dsetmatrix( m, n, C, ld, dC1, ld );
            magma_dsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_dgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_dlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9;
            printf( "dtrsm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // cleanup
        magma_free_cpu( piv );
        magma_free_pinned( A  );
        magma_free_pinned( B  );
        magma_free_pinned( C  );
        magma_free_pinned( C2 );
        magma_free_pinned( LU );
        magma_free( dA  );
        magma_free( dB  );
        magma_free( dC1 );
        magma_free( dC2 );
    }
    
    if ( total_error != 0. ) {
        printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n",
                total_error );
    }
    else {
        printf( "all tests passed\n" );
    }
    
    TESTING_FINALIZE();
    return 0;
}
Beispiel #27
0
int main( int argc, char **argv )
{
	double *A, *B, *C;
	double *cu_A, *cu_B, *cu_C;

	cudaError_t    cuError;
	cublasStatus_t cuStatus;
	cublasHandle_t cuHandle;

	// seed rand()
	srand(time(NULL));

	// allocate memory on CPU
	A = (double*)malloc(sizeof(double)*MATRIX_N*MATRIX_P);
	B = (double*)malloc(sizeof(double)*MATRIX_P*MATRIX_M);
	C = (double*)malloc(sizeof(double)*MATRIX_N*MATRIX_M);

	if( !A || !B || !C )
	{
		perror("Can't allocate CPU matrices");
		exit(EXIT_FAILURE);
	}

	// generate matrices
	for( int i = 0; i < MATRIX_N*MATRIX_P; i++ )
		A[i] = 10.0*((double)rand())/RAND_MAX;

	for( int i = 0; i < MATRIX_P*MATRIX_M; i++ )
		B[i] = 10.0*((double)rand())/RAND_MAX;

	// allocate memory on GPU
	cuError = cudaMalloc( &cu_A, sizeof(double)*MATRIX_N*MATRIX_P );

	if( cuError != cudaSuccess )
	{
		fprintf(stderr, "Can't allocate GPU matrices\n");
		exit(EXIT_FAILURE);
	}

	cuError = cudaMalloc( &cu_B, sizeof(double)*MATRIX_P*MATRIX_M );

	if( cuError != cudaSuccess )
	{
		fprintf(stderr, "Can't allocate GPU matrices\n");
		exit(EXIT_FAILURE);
	}

	cuError = cudaMalloc( &cu_C, sizeof(double)*MATRIX_N*MATRIX_M );

	if( cuError != cudaSuccess )
	{
		fprintf(stderr, "Can't allocate GPU matrices\n");
		exit(EXIT_FAILURE);
	}

	// setup cuBlas
	cuStatus = cublasCreate( &cuHandle );
	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error initializing cuBlas\n");
		exit(EXIT_FAILURE);
	}

	// setup matrices
	cuStatus = cublasSetMatrix( MATRIX_N, MATRIX_P, sizeof(double), A, MATRIX_N, cu_A, MATRIX_N );
	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error transferring matrix A\n");
		exit(EXIT_FAILURE);
	}

	cuStatus = cublasSetMatrix( MATRIX_P, MATRIX_M, sizeof(double), B, MATRIX_P, cu_B, MATRIX_P );
	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error transferring matrix B\n");
		exit(EXIT_FAILURE);
	}

	// multiply
	double one  = 1.0;
	double zero = 0.0;
	cuStatus = cublasDgemm( cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, MATRIX_N, MATRIX_M, MATRIX_P, &one, cu_A, MATRIX_N, cu_B, MATRIX_P, &zero, cu_C, MATRIX_N );

	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error executing matrix mult\n");
		exit(EXIT_FAILURE);
	}

	// get results
	cuStatus = cublasGetMatrix( MATRIX_N, MATRIX_M, sizeof(double), cu_C, MATRIX_N, C, MATRIX_N );
	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error transferring results\n");
		exit(EXIT_FAILURE);
	}
	
	// check results
	bool good = true;
	for( int i = 0; i < MATRIX_N; i++ )
	{
		for( int j = 0; j < MATRIX_M; j++ )
		{
			double sum = 0.0;
			for( int k = 0; k < MATRIX_P; k++ )
			{
				sum += A[IDX2C(i, k, MATRIX_N)]*B[IDX2C(k, j, MATRIX_P)];
			}
			// check
			if( fabs(sum - C[IDX2C(i,j,MATRIX_N)]) > 0.00001 )
			{
				good = false;
				printf("(%i, %i) sum = %f\tcu_C = %f\tMISMATCH\n", i, j, sum, C[IDX2C(i,j,MATRIX_N)]);
			}
		}
	}

	if( good )
		printf("Results Match\n");
	else
		printf("Results DO NOT Match\n");

	// cleanup
	free( A ); free( B ); free( C );
	cudaFree( cu_A ); cudaFree( cu_B ); cudaFree( cu_C );
	cublasDestroy( cuHandle );

	return 0;
}
Beispiel #28
0
SEXP magma_dgeMatrix_matrix_mm(SEXP a, SEXP bP, SEXP right)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP b = PROTECT(mMatrix_as_dgeMatrix(bP)),
	     val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix")));
	int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)),
	    *bdims = INTEGER(GET_SLOT(b, Matrix_DimSym)),
	    *cdims = INTEGER(ALLOC_SLOT(val, Matrix_DimSym, INTSXP, 2));
	double one = 1.0, zero = 0.0;

	if (asLogical(right)) {
		int m = bdims[0], n = adims[1], k = bdims[1];
		if (adims[0] != k)
			error(_("Matrices are not conformable for multiplication"));
		cdims[0] = m; cdims[1] = n;
		if (m < 1 || n < 1 || k < 1) {
			// 		This was commented out
				    error(_("Matrices with zero extents cannot be multiplied")); 
			ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n);
		} else {
			double *B = REAL(GET_SLOT(b, Matrix_xSym));
			double *A = REAL(GET_SLOT(a, Matrix_xSym));
			double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n));
			//TODO add magma here too
			if(GPUFlag == 1) {
				double *d_A, *d_B, *d_C;
				cublasStatus retStatus;


#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication with Right = true using magmablas_dgemm");
#endif
				cublasAlloc(n * k, sizeof(double), (void**)&d_A);
			
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(m * k, sizeof(double), (void**)&d_B);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(m * n, sizeof(double), (void**)&d_C);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasSetVector( n  * k , sizeof(double), A, 1, d_A, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				cublasSetVector( m * k, sizeof(double), B, 1, d_B, 1 );

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				// ******** magmablas_dgemm call Here **
				//magmablas_dgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C,  m);
				//CHANGED 30/07
				cublasDgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C, m);
				
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) {
					error(_("CUBLAS: Error in cublasDgemm routine"));
				}
				/********************************************/

				cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer from Device"));
				/********************************************/

				cublasFree(d_A);
				cublasFree(d_B);
				cublasFree(d_C);
			}
			else {
	
#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication using dgemm with right = TRUE");
#endif
				F77_CALL(dgemm) ("N", "N", &m, &n, &k, &one,
						B, &m, A , &k, &zero, C , &m);
			}
		}
	} else {
		int m = adims[0], n = bdims[1], k = adims[1];
		double *A = REAL(GET_SLOT(a, Matrix_xSym));
		double *B = REAL(GET_SLOT(b, Matrix_xSym));


		if (bdims[0] != k)
			error(_("Matrices are not conformable for multiplication"));
		cdims[0] = m; cdims[1] = n;
		double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n));
		
		if (m < 1 || n < 1 || k < 1) {
			//		This was commented out
			error(_("Matrices with zero extents cannot be multiplied")); 
			ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n);
		} else {
			if(GPUFlag == 1) {

				double *d_A, *d_B, *d_C;
				cublasStatus retStatus;

	
#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication using magmablas_dgemm");
#endif			
				cublasAlloc(m * k, sizeof(double), (void**)&d_A);
			
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(n * k, sizeof(double), (void**)&d_B);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(m * n, sizeof(double), (void**)&d_C);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasSetVector( m  * k , sizeof(double), A, 1, d_A, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				cublasSetVector( n * k, sizeof(double), B, 1, d_B, 1 );

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				// ******** magmablas_dgemm call Here **
				//magmablas_dgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C,  m);
				//CHANGE
				cublasDgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C, m);
				
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) {
					error(_("CUBLAS: Error in Data Transfer from Device"));
				/********************************************/
				}
				
				cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer from Device"));
				/********************************************/

				cublasFree(d_A);
				cublasFree(d_B);
				cublasFree(d_C);
				
			}
			else {
	
#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication using dgemm");
#endif
				F77_CALL(dgemm) ("N", "N", &m, &n, &k, &one,
						A, &m,
						B, &k, &zero,
						C,
						&m);	

			}
		}
	}
	ALLOC_SLOT(val, Matrix_DimNamesSym, VECSXP, 2);
	UNPROTECT(2);
	return val;
#endif
	return R_NilValue;
}
Beispiel #29
0
SEXP magma_dgeMatrix_matrix_crossprod(SEXP x, SEXP y, SEXP trans)
{
#ifdef HIPLAR_WITH_MAGMA
	int tr = asLogical(trans);/* trans=TRUE: tcrossprod(x,y) */
	SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix")));
	int *xDims = INTEGER(GET_SLOT(x, Matrix_DimSym)),
	    *yDims = INTEGER(getAttrib(y, R_DimSymbol)),
	    *vDims, nprot = 1;
	int m  = xDims[!tr],  n = yDims[!tr];/* -> result dim */
	int xd = xDims[ tr], yd = yDims[ tr];/* the conformable dims */
	double one = 1.0, zero = 0.0;

	if (isInteger(y)) {
		y = PROTECT(coerceVector(y, REALSXP));
		nprot++;
	}
	if (!(isMatrix(y) && isReal(y)))
		error(_("Argument y must be a numeric matrix"));
	SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
	SET_SLOT(val, Matrix_DimSym, allocVector(INTSXP, 2));
	vDims = INTEGER(GET_SLOT(val, Matrix_DimSym));
	if (xd > 0 && yd > 0 && n > 0 && m > 0) {
		if (xd != yd)
			error(_("Dimensions of x and y are not compatible for %s"),
					tr ? "tcrossprod" : "crossprod");
		vDims[0] = m; vDims[1] = n;
		SET_SLOT(val, Matrix_xSym, allocVector(REALSXP, m * n));

		double *A = REAL(GET_SLOT(x, Matrix_xSym));
		double *B = REAL(y);
		double *C = REAL(GET_SLOT(val, Matrix_xSym));

		if(GPUFlag == 1) {
			
			double *d_A, *d_B, *d_C;
			cublasStatus retStatus;

#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Performing dge/matrix crossprod using magmablas_dgemm");
#endif
			cublasAlloc(m * xd, sizeof(double), (void**)&d_A);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasAlloc(n * xd, sizeof(double), (void**)&d_B);	

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasAlloc(m * n, sizeof(double), (void**)&d_C);	

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector( m  * xd , sizeof(double), A, 1, d_A, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			cublasSetVector( xd * n, sizeof(double), B, 1, d_B, 1 );

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			cublasSetVector( m * n, sizeof(double), C, 1, d_C, 1 );

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/


			// ******** magmablas_dgemm call Here **
			//magmablas_dgemm( tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd, one, d_A, xDims[0], d_B, yDims[0], zero, d_C,  m);
			//CHANGE
			cublasDgemm( tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd, one, d_A, xDims[0], d_B, yDims[0], zero, d_C,  m);
			cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer from Device"));
			/********************************************/

			cublasFree(d_A);
			cublasFree(d_B);
			cublasFree(d_C);
			
		}
		else {

#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Performing dge/matrix cross prod with dgemm");
#endif
			F77_CALL(dgemm)(tr ? "N" : "T", tr ? "T" : "N", &m, &n, &xd, &one,
					A , xDims,
					B , yDims,
					&zero, C, &m);
		}
	}
	UNPROTECT(nprot);
	return val;
#endif
	return R_NilValue;
}
Beispiel #30
0
void blasx_gpu_dgemm_kernel(int j,
                            int nrowa, int ncola,
                            int nrowb, int ncolb,
                            int nrowc, int ncolc,
                            int current_task, int prior_task,
                            enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                            double* A, double* B, double* C,
                            int lda, int ldb, int ldc,
                            int x, int y, int z,
                            double** C_dev,
                            cudaStream_t *stream, cublasHandle_t *handle_p,
                            int current_stream,
                            double alpha, double beta, int block_dim,
                            int switcher, int* task_batch_counter,
                            LRU_t **LRUs, int GPUs,
                            int *mem_cpy_counter,
                            reader_tracker *addr_track,
                            int GPU_id)
{
    int nrowa_dev, nrowb_dev, nrowc_dev;
    int ncola_dev, ncolb_dev, ncolc_dev;
    int nrow_offset_a, nrow_offset_b;
    int ncol_offset_a, ncol_offset_b;
    int i = current_task/(y+1);
    int k = current_task%(y+1);
    double *A_dev, *B_dev;
    if (TransA != CblasNoTrans) {
        margin_adjustment(nrowa,ncola,block_dim,j,i,&nrowa_dev,&ncola_dev);
    }else{
        margin_adjustment(nrowa,ncola,block_dim,i,j,&nrowa_dev,&ncola_dev);
    }
    if (TransB != CblasNoTrans) {
        margin_adjustment(nrowb,ncolb,block_dim,k,j,&nrowb_dev,&ncolb_dev);
    }else{
        margin_adjustment(nrowb,ncolb,block_dim,j,k,&nrowb_dev,&ncolb_dev);
    }
    margin_adjustment(nrowc,ncolc,block_dim,i,k,&nrowc_dev,&ncolc_dev);
    if (TransA != CblasNoTrans) {
        nrow_offset_a = j*block_dim, ncol_offset_a = i*block_dim;
    }else{
        nrow_offset_a = i*block_dim, ncol_offset_a = j*block_dim;
    }
    if (TransB != CblasNoTrans) {
        nrow_offset_b = k*block_dim, ncol_offset_b = j*block_dim;
    }else{
        nrow_offset_b = j*block_dim, ncol_offset_b = k*block_dim;
    }
    double *starting_point_A = &A[nrow_offset_a+ncol_offset_a*lda];
    double *starting_point_B = &B[nrow_offset_b+ncol_offset_b*ldb];
    //Asynchonizing set matrix on GPU
    //----------------LRU&RBT optimization----------------//
    mem_control_kernel_double(starting_point_A, &A_dev, LRUs, GPUs, GPU_id, block_dim, mem_cpy_counter, addr_track, stream, nrowa_dev, ncola_dev, lda);
    mem_control_kernel_double(starting_point_B, &B_dev, LRUs, GPUs, GPU_id, block_dim, mem_cpy_counter, addr_track, stream, nrowb_dev, ncolb_dev, ldb);
    //----------------------------------------------------//
    
    if (j == 0) {
        margin_adjustment(nrowc,ncolc,block_dim,i,k,&nrowc_dev,&ncolc_dev);
        int nrow_offset_c = i*block_dim;
        int ncol_offset_c = k*block_dim;
        double *starting_point_C = &C[nrow_offset_c+ncol_offset_c*ldc];
        if (beta != 0) {
            assert( cublasSetMatrixAsync(nrowc_dev, ncolc_dev, sizeof(double), starting_point_C, ldc, C_dev[switcher*STREAMNUM+current_stream], block_dim, *stream) == CUBLAS_STATUS_SUCCESS );
        }
        if (*task_batch_counter != 0) {//Set matrix back
            int i_pre = prior_task/(y+1);
            int k_pre = prior_task%(y+1);
            int nrowc_dev_pre, ncolc_dev_pre;
            margin_adjustment(nrowc,ncolc,block_dim,i_pre,k_pre,&nrowc_dev_pre,&ncolc_dev_pre);
            int nrow_offset_c_pre = i_pre*block_dim;
            int ncol_offset_c_pre = k_pre*block_dim;
            double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
            assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+(1-switcher)*STREAMNUM], block_dim, starting_point_C_pre, ldc,*stream) == CUBLAS_STATUS_SUCCESS);
        }
    }
    cudaStreamSynchronize(*stream);
    assert( cublasSetStream(*handle_p, *stream) == CUBLAS_STATUS_SUCCESS );

    double beta_inner = (j==0)?(beta):(1);
    int ka = (TransA != CblasNoTrans)?(nrowa_dev):(ncola_dev);
    cublasOperation_t transa, transb;
    CBLasTransToCuBlasTrans(TransA, &transa);
    CBLasTransToCuBlasTrans(TransB, &transb);
    cublasStatus_t status = cublasDgemm(*handle_p,
                                        transa, transb,
                                        nrowc_dev, ncolc_dev, ka,
                                        &alpha,
                                        A_dev, block_dim,
                                        B_dev, block_dim,
                                        &beta_inner,
                                        C_dev[switcher*STREAMNUM+current_stream], block_dim);
    assert( status == CUBLAS_STATUS_SUCCESS );
}