Code Example #1
File: cuMatrix.cpp  Project: YuetianLiu/CUDA-CNN
/*z = x * T(y)*/
void matrixMulTB(cuMatrix<float>* x, cuMatrix<float>*y, cuMatrix<float>*z)
{
	if(x->channels != 1 || y->channels != 1 || z->channels != 1){
		printf("matrix mul channels != 1\n");
		exit(0);
	}
	if(x->cols != y->cols || z->rows != x->rows || z->cols != y->rows){
		printf("matrix mul dimension mismatch\n");
		exit(0);
	}
	cublasStatus_t stat;
	float alpha = 1.0;
	float beta = 0.0;
	stat = cublasSgemm(
		getHandle(), 
		CUBLAS_OP_T,
		CUBLAS_OP_N,
		y->rows,
		x->rows,
		y->cols,
		&alpha,
		y->getDev(),
		y->cols,
		x->getDev(),
		x->cols,
		&beta,
		z->getDev(),
		z->cols);
	cudaDeviceSynchronize();
	if(stat != CUBLAS_STATUS_SUCCESS) {
		printf("matrixMulTB cublasSgemm error\n");
		exit(0);
	}
}
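Why the (OP_T, OP_N) pattern above works: cuBLAS is column-major, so a row-major M x K buffer is read as a K x M column-major matrix. For z = x * T(y) with row-major x (M x K), y (N x K), z (M x N), the call actually computes z^T = y * x^T in column-major terms. A minimal standalone sketch of the same mapping (not taken from the project above; assumes device buffers and an existing handle):

#include <cuda_runtime.h>
#include <cublas_v2.h>

/* Sketch only: Z = X * Y^T for row-major device buffers
 * X (M x K), Y (N x K), Z (M x N).  Column-major cuBLAS sees the same
 * bytes as X^T (K x M), Y^T (K x N), Z^T (N x M), so we request
 * Z^T = Y * X^T via (OP_T, OP_N), exactly as matrixMulTB does. */
static cublasStatus_t sgemm_x_times_yT(cublasHandle_t handle,
                                       int M, int N, int K,
                                       const float *X, const float *Y, float *Z)
{
    const float alpha = 1.0f, beta = 0.0f;
    return cublasSgemm(handle,
                       CUBLAS_OP_T, CUBLAS_OP_N,
                       N, M, K,
                       &alpha,
                       Y, K,   /* column-major view: K x N */
                       X, K,   /* column-major view: K x M */
                       &beta,
                       Z, N);  /* column-major view: N x M */
}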
Code Example #2
  static vl::Error
  gemm(vl::Context& context,
       char op1, char op2,
       ptrdiff_t m, ptrdiff_t n, ptrdiff_t k,
       type alpha,
       type const * a, ptrdiff_t lda,
       type const * b, ptrdiff_t ldb,
       type beta,
       type * c, ptrdiff_t ldc)
  {
    cublasHandle_t handle ;
    cublasStatus_t status ;
    status = context.getCudaHelper().getCublasHandle(&handle) ;
    if (status != CUBLAS_STATUS_SUCCESS) goto done ;

    status = cublasSgemm(handle,
                         (op1 == 't') ? CUBLAS_OP_T : CUBLAS_OP_N,
                         (op2 == 't') ? CUBLAS_OP_T : CUBLAS_OP_N,
                         (int)m, (int)n, (int)k,
                         &alpha,
                         a, (int)lda,
                         b, (int)ldb,
                         &beta,
                         c, (int)ldc);
  done:
    return context.setError
    (context.getCudaHelper().catchCublasError(status, "cublasSgemm"), __func__) ;
  }
Code Example #3
File: linalg.c  Project: zauberkraut/acmi
/* C <- alpha*A*B + beta*C */
void gemm(double alpha, Mat mA, Mat mB, double beta, Mat mC) {
  const int n = MatN(mA);
  const void* const a = MatElems(mA);
  const void* const b = MatElems(mB);
  void* const c = MatElems(mC);
  const bool dev = MatDev(mA);

  switch (MatElemSize(mA)) {
  case 4:
    if (dev) {
      float alpha32 = alpha, beta32 = beta;
      cublasSgemm(g_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                  &alpha32, a, n, b, n, &beta32, c, n);
    } else {
      cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, n, n,
                  n, alpha, a, n, b, n, beta, c, n);
    }
    break;

  case 8:
    if (dev) {
      cublasDgemm(g_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                  &alpha, a, n, b, n, &beta, c, n);
    } else {
      cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, n, n,
                  n, alpha, a, n, b, n, beta, c, n);
    }
    break;
  }
}
Code Example #4
File: Spoc_cublas.c  Project: archonSTB/SPOC
CAMLprim value spoc_cublasSgemm(value transa, value transb,
		value m, value n, value k,
		value alpha, value a, value lda,
		value b, value ldb, value beta, value c, value ldc, value dev){
	CAMLparam5(transa, transb, m, n, k);
	CAMLxparam5(alpha, a, lda, b, ldb);
	CAMLxparam4(beta, c, ldc, dev);
	CAMLlocal3(dev_vec_array, dev_vec, gi);
	CUdeviceptr d_A;
	CUdeviceptr d_B;
	CUdeviceptr d_C;
	int id;

	GET_VEC(a, d_A);
	GET_VEC(b, d_B);
	GET_VEC(c, d_C);
	CUBLAS_GET_CONTEXT;

	cublasSgemm (Int_val(transa), Int_val(transb), Int_val(m), Int_val(n),
	Int_val(k), (float)Double_val(alpha), (float*) d_A, Int_val(lda),
	(float*) d_B, Int_val(ldb), (float) Double_val(beta),
	(float *)d_C, Int_val(ldc));

	CUDA_RESTORE_CONTEXT;

	CAMLreturn(Val_unit);
}
Code Example #5
File: Kokkos_MV_GEMM.hpp  Project: cihanuq/Trilinos
      static void
      GEMM (const Teuchos::ETransp transA,
            const Teuchos::ETransp transB,
            const float alpha,
            const View<const float**,LayoutLeft,Cuda>& A,
            const View<const float**,LayoutLeft,Cuda>& B,
            const float beta,
            const View<float**,LayoutLeft,Cuda>& C)
    {
      const int m = static_cast<int>(C.dimension_0()),
        n = static_cast<int>(C.dimension_1()),
        k = (transA == Teuchos::NO_TRANS ? A.dimension_1() : A.dimension_0()),
        lda = static_cast<int>(Impl::getStride2DView(A)),
        ldb = static_cast<int>(Impl::getStride2DView(B)),
        ldc = static_cast<int>(Impl::getStride2DView(C));
      const char char_transA = (transA == Teuchos::NO_TRANS ? 'N' : 'T'),
        char_transB = (transB == Teuchos::NO_TRANS ? 'N' : 'T');
      cublasSgemm (char_transA, char_transB, m, n, k, alpha,
                   A.ptr_on_device(), lda, B.ptr_on_device(),
                   ldb, beta, C.ptr_on_device(), ldc);

#ifdef HAVE_KOKKOS_DEBUG
      const cublasStatus info = cublasGetError ();
      TEUCHOS_TEST_FOR_EXCEPTION
        (info != CUBLAS_STATUS_SUCCESS, std::runtime_error,
         "cublasSgemm failed with status " << info << "." );
#endif // HAVE_KOKKOS_DEBUG
      }
Code Example #6
static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, float alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb,
                 float beta, gpudata *C, size_t offC, size_t ldc) {
  cuda_context *ctx = A->ctx;
  gpudata *T;
  size_t t;
  cublasStatus_t err;
  cb_transpose transT;

  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    t = offA;
    offA = offB;
    offB = t;
  }

  cuda_enter(ctx);

  cuda_wait(A, CUDA_WAIT_READ);
  cuda_wait(B, CUDA_WAIT_READ);
  cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  err = cublasSgemm(((blas_handle *)ctx->blas_handle)->h,
                    convT(transA), convT(transB), M, N, K,
                    &alpha, ((float *)A->ptr) + offA, lda,
                    ((float *)B->ptr) + offB, ldb, &beta,
                    ((float *)C->ptr) + offC, ldc);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  cuda_record(A, CUDA_WAIT_READ);
  cuda_record(B, CUDA_WAIT_READ);
  cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
Code Example #7
File: gemm.hpp  Project: CQMP/scripts
inline void gemm( const Order order, const TransA transa, const TransB transb,
        const int m, const int n, const int k, const float alpha,
        const float* a, const int lda, const float* b, const int ldb,
        const float beta, float* c, const int ldc ) {
    BOOST_STATIC_ASSERT( (is_same<Order, tag::column_major>::value) );
    cublasSgemm( blas_option< TransA >::value, blas_option< TransB >::value,
            m, n, k, alpha, a, lda, b, ldb, beta, c, ldc );
}
Code Example #8
int
CudaInterface::cublasMatrixMult(float* A, float* B, float* C, MatrixSize_t mSize) {
    checkCudaErrors(cudaSetDevice(mDevID));
    checkCudaErrors(cudaGetDevice(&mDevID));
    checkCudaErrors(cublasSgemm(mCublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, mSize.uiWB, mSize.uiHA, 
                                mSize.uiWA, &mAlpha, B, mSize.uiWB, A, mSize.uiWA, &mBeta, C, mSize.uiWA));
    return 0;
}
Code Example #9
File: numcp_cuda.hpp  Project: DaikiShimada/numcp
Darray<float> cudot (const Darray<float>& lhs, const Darray<float>& rhs)
{
	// context check
	CHECK_EQ(lhs.getDeviceManager().getDeviceID(), rhs.getDeviceManager().getDeviceID());
	
	CHECK_EQ(lhs.ndim(), rhs.ndim());
	CHECK_LT(lhs.ndim(), 3);
	CHECK_LT(rhs.ndim(), 3);

	Darray<float> ret;

	if (lhs.ndim()==1 && rhs.ndim()==1)
	{
		// shape check
		CHECK_EQ(lhs.size(), rhs.size());
		ret = Darray<float>(lhs.getDeviceManager(), {1});
		
		// using cublas sdot
		lhs.deviceSet();
		cublasSdot (DeviceManager::handle,
				    lhs.size(),
				    lhs.data,
				    1,
				    rhs.data,
				    1,
				    ret.data);
	}
	// 2D matrix dot
	else if (lhs.ndim()==2 && rhs.ndim()==2)
	{
		// shape check
		CHECK_EQ(lhs.shape()[1], rhs.shape()[0]);
		ret = Darray<float>(lhs.getDeviceManager(), {lhs.shape()[0], rhs.shape()[1]});
		
		// using cublas sgemm
		lhs.deviceSet();
		const float alpha = 1.;
		const float beta = 0.;
		CUBLAS_SAFE_CALL(
		cublasSgemm (DeviceManager::handle,
					CUBLAS_OP_N,
					CUBLAS_OP_N,
					lhs.shape()[0],
					rhs.shape()[1],
					lhs.shape()[1],
					&alpha,
					lhs.dev_data,
					lhs.shape()[0],
					rhs.dev_data,
					rhs.shape()[0],
					&beta,
					ret.dev_data,
					ret.shape()[0])
		);
	}
	return ret;
}
Code Example #10
File: prod.cpp  Project: e-thereal/capputils
void gemm(bool transa, bool transb, int m, int n, int k, float alpha, thrust::device_ptr<const float> A, int lda,
    thrust::device_ptr<const float> B, int ldb, float beta, thrust::device_ptr<float> C, int ldc)
{
  const cublasOperation_t ctransa = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
  const cublasOperation_t ctransb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;

  cublasSetStream(context::get().cublasHandle, context::get().stream);
  cublasSgemm(context::get().cublasHandle, ctransa, ctransb, m, n, k, &alpha, A.get(), lda, B.get(), ldb, &beta, C.get(), ldc);
}
Code Example #11
		void fully_connected_layer_updater_cuda::enqueue_backward_weights_propagation(
			cudaStream_t stream_id,
			const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
			const std::vector<cuda_linear_buffer_device::ptr>& gradient,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
			const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
			cuda_linear_buffer_device::const_ptr output_errors_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
			cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
			cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
			cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
			unsigned int entry_count)
		{
			// Update weights
			{
				cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
				float alpha = 1.0F;
				float beta = 1.0F;
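				// Column-major GEMM: gradient[0] (input_elem x output_elem) +=
				// input_neurons (input_elem x entry_count) * output_errors^T (entry_count x output_elem);
				// beta = 1 accumulates the per-batch contribution into the existing gradient.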
				cublas_safe_call(cublasSgemm(
					cuda_config->get_cublas_handle(),
					CUBLAS_OP_N,
					CUBLAS_OP_T,
					input_elem_count_per_entry_list[0],
					output_elem_count_per_entry,
					entry_count,
					&alpha,
					*input_neurons_buffers[0],
					input_elem_count_per_entry_list[0],
					*output_errors_buffer,
					output_elem_count_per_entry,
					&beta,
					*gradient[0],
					input_elem_count_per_entry_list[0]));
			}

			// Update biases
			if (bias)
			{
				cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
				cudnn_util::set_tensor_descriptor(
					output_data_desc,
					output_configuration_specific,
					entry_count);
				float alpha = 1.0F;
				float beta = 1.0F;
				cudnn_safe_call(cudnnConvolutionBackwardBias(
					cuda_config->get_cudnn_handle(),
					&alpha,
					output_data_desc,
					*output_errors_buffer,
					&beta,
					bias_desc,
					*gradient[1]));
			}
		}
Code Example #12
File: gemm.c  Project: hyperchris/Yolo
void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, 
        float *A_gpu, int lda, 
        float *B_gpu, int ldb,
        float BETA,
        float *C_gpu, int ldc)
{
    cublasHandle_t handle = blas_handle();
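    /* cuBLAS is column-major, so the row-major C = ALPHA*A*B + BETA*C is computed as
       C^T = ALPHA*op(B)^T*op(A)^T + BETA*C^T: the operands, transpose flags and the
       M/N dimensions are swapped in the call below. */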
    cudaError_t status = (cudaError_t)cublasSgemm(handle, (TB ? CUBLAS_OP_T : CUBLAS_OP_N), 
                        (TA ? CUBLAS_OP_T : CUBLAS_OP_N), N, M, K, &ALPHA, B_gpu, ldb, A_gpu, lda, &BETA, C_gpu, ldc);
    check_error(status);
}
Code Example #13
		void fully_connected_layer_updater_cuda::enqueue_forward_propagation(
			cudaStream_t stream_id,
			cuda_linear_buffer_device::ptr output_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
			const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
			const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
			cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
			cuda_linear_buffer_device::ptr temporary_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_per_entry_buffer,
			unsigned int entry_count)
		{
			{
				cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
				float alpha = 1.0F;
				float beta = 0.0F;
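				// Column-major GEMM: output (output_elem x entry_count) =
				// weights^T (output_elem x input_elem) * input (input_elem x entry_count),
				// with the weights data[0] stored as an input_elem x output_elem matrix.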
				cublas_safe_call(cublasSgemm(
					cuda_config->get_cublas_handle(),
					CUBLAS_OP_T,
					CUBLAS_OP_N,
					output_elem_count_per_entry,
					entry_count,
					input_elem_count_per_entry_list[0],
					&alpha,
					*data[0],
					input_elem_count_per_entry_list[0],
					*input_buffers[0],
					input_elem_count_per_entry_list[0],
					&beta,
					*output_buffer,
					output_elem_count_per_entry));
			}

			if (bias)
			{
				cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
				cudnn_util::set_tensor_descriptor(
					output_data_desc,
					output_configuration_specific,
					entry_count);
				float alpha = 1.0F;
				float beta = 1.0F;
				cudnn_safe_call(cudnnAddTensor(
					cuda_config->get_cudnn_handle(),
					&alpha,
					bias_desc,
					*data[1],
					&beta,
					output_data_desc,
					*output_buffer));
			}
		}
Code Example #14
File: micronn.c  Project: microo8/micronn
micronn_matrix* micronn_matrix_dot(cublasHandle_t handle, cublasOperation_t transA, cublasOperation_t transB, float alpha, micronn_matrix* v, micronn_matrix* w)
{
    float beta = 0.0;
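    /* Allocates and returns x = alpha * op(v) * op(w); beta is 0, so x need not be initialised. */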
    micronn_matrix* x = micronn_matrix_alloc(transA == CUBLAS_OP_N ? v->rows : v->cols,
                        transB == CUBLAS_OP_N ? w->cols : w->rows);
    cublasSgemm(handle, transA, transB,
                transA == CUBLAS_OP_N ? v->rows : v->cols,
                transB == CUBLAS_OP_N ? w->cols : w->rows,
                transA == CUBLAS_OP_N ? v->cols : v->rows,
                &alpha, v->devPtrvals, v->rows,
                w->devPtrvals, w->rows, &beta,
                x->devPtrvals, x->rows);
    return x;
};
Code Example #15
void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
    const float alpha, const float* A, const float* B, const float beta,
    float* C) {
  // Note that cublas follows fortran order.
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
      N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
}
Code Example #16
File: blas_s.cpp  Project: cjy7117/DVFS-MAGMA
void magma_sgemm(
    magma_trans_t transA, magma_trans_t transB,
    magma_int_t m, magma_int_t n, magma_int_t k,
    float alpha, float const* dA, magma_int_t lda,
                           float const* dB, magma_int_t ldb,
    float beta,  float*       dC, magma_int_t ldc )
{
    cublasSgemm(
        cublas_trans_const( transA ),
        cublas_trans_const( transB ),
        m, n, k,
        alpha, dA, lda,
               dB, ldb,
        beta,  dC, ldc );
}
Code Example #17
File: BIDMat_CUMAT.cpp  Project: nomad-ca-us/BIDMat
 JNIEXPORT jint JNICALL Java_edu_berkeley_bid_CUMAT_blockSgemm
 (JNIEnv *env, jobject obj, jint transA, jint transB, jint nr, jint nc, jint kk, jint reps, jobject jA, jint lda, jint astep, 
  jobject jB, jint ldb, jint bstep, jobject jC, jint ldc, jint cstep)
 {
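   /* Legacy cuBLAS v1 API: no handle argument, transpose flags are chars ('n'/'t'),
      and alpha/beta are passed by value. */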
   char at, bt;
   at = (transA) ? 't' : 'n';
   bt = (transB) ? 't' : 'n';
   float *A = (float*)getPointer(env, jA);
   float *B = (float*)getPointer(env, jB);
   float *C = (float*)getPointer(env, jC);
   for (int i = 0; i < reps; i++) {
     cublasSgemm(at, bt, nr, nc, kk, 1.0f, A, lda, B, ldb, 0.0f, C, ldc);
     A += astep;
     B += bstep;
     C += cstep;
   }      
   cudaDeviceSynchronize();
   cudaError_t err = cudaGetLastError();
   return err;
 }
Code Example #18
static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);

	unsigned dx = STARPU_MATRIX_GET_NX(descr[2]);
	unsigned dy = STARPU_MATRIX_GET_NY(descr[2]);
	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);

	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]);
	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);

#ifdef STARPU_USE_CUDA
	cublasStatus status;
#endif

	switch (s)
	{
		case 0:
			STARPU_SGEMM("N", "N",	dy, dx, dz, 
				-1.0f, left, ld21, right, ld12,
					     1.0f, center, ld22);
			break;

#ifdef STARPU_USE_CUDA
		case 1:
			cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21,
					right, ld12, 1.0f, center, ld22);
			status = cublasGetError();
			if (status != CUBLAS_STATUS_SUCCESS)
				STARPU_CUBLAS_REPORT_ERROR(status);

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
Code Example #19
int tiramisu_cublas_sgemm(float *A, float *B, float *C,
                          uint64_t M, uint64_t N, uint64_t K,
                          float alpha, float beta,
                          uint64_t ldA, uint64_t ldB, uint64_t ldC,
                          uint64_t offsetA, uint64_t offsetB, uint64_t offsetC,
                          bool transposeA, bool transposeB)
{
    // TODO: Destroy the handle.
    static bool handle_created = false;
    static cublasHandle_t handle;
    if (!handle_created) {
        cublasCreate(&handle);
        handle_created = true;
    }
    // Default values for tight packing:
    if (ldA == 0) {
        ldA = transposeA ? M : K;
    }
    if (ldB == 0) {
        ldB = transposeB ? K : N;
    }
    if (ldC == 0) {
        ldC = N;
    }
    // The cuBLAS GEMM accepts column major buffers by default. We do a simple
    // trick here to multiply row major matrices. From a row-major perspective,
    // column-major multiplication basically transposes inputs, multiplies, and
    // transposes the output again: cublas(A, B) = ((A^T)x(B^T))^T = BxA
    // So it is actually equivalent to row-major GEMM with inputs swapped.
    // We need to reorder the size parameters as well to make it work:
    cublasSetStream(handle, cudaStreamPerThread);
    handle_cublas_error(
        cublasSgemm(handle,
                    transposeB ? CUBLAS_OP_T : CUBLAS_OP_N,
                    transposeA ? CUBLAS_OP_T : CUBLAS_OP_N,
                    N, M, K,
                    &alpha, B + offsetB, ldB, A + offsetA, ldA,
                    &beta, C + offsetC, ldC),
         __FUNCTION__);
    return 0;
}
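A rough usage sketch (hypothetical sizes and buffer names; assumes the declaration above is visible) for multiplying tightly packed row-major matrices with this wrapper:

#include <cuda_runtime.h>
#include <cstdint>

/* Declaration of the wrapper shown above. */
int tiramisu_cublas_sgemm(float *A, float *B, float *C,
                          uint64_t M, uint64_t N, uint64_t K,
                          float alpha, float beta,
                          uint64_t ldA, uint64_t ldB, uint64_t ldC,
                          uint64_t offsetA, uint64_t offsetB, uint64_t offsetC,
                          bool transposeA, bool transposeB);

void multiply_example()
{
    const uint64_t M = 64, N = 32, K = 128;
    float *dA, *dB, *dC;
    cudaMalloc((void **)&dA, M * K * sizeof(float));
    cudaMalloc((void **)&dB, K * N * sizeof(float));
    cudaMalloc((void **)&dC, M * N * sizeof(float));
    /* ... fill dA and dB ... */
    /* Row-major C = A * B; passing 0 for the leading dimensions selects the
       tightly packed defaults; no offsets, no transposes. */
    tiramisu_cublas_sgemm(dA, dB, dC, M, N, K, 1.0f, 0.0f,
                          0, 0, 0, 0, 0, 0, false, false);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
}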
Code Example #20
File: FloatMatrixSgemm_c.c  Project: hsyl20/HaskellPU
static void sgemm_cuda(void *descr[], void *args) {
    float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
    float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
    float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

    unsigned w = STARPU_MATRIX_GET_NY(descr[2]);
    unsigned h = STARPU_MATRIX_GET_NX(descr[2]);
    unsigned k = STARPU_MATRIX_GET_NY(descr[0]);

    unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
    unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
    unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);

    struct sgemm_arg * arg = (struct sgemm_arg*)args;

    cublasSetStream(cublas_handle, starpu_cuda_get_local_stream());

    cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, h, w, k, &arg->alpha, a, lda, b, ldb, &arg->beta, c, ldc);
    cudaStreamSynchronize(starpu_cuda_get_local_stream());
    free(arg);
}
Code Example #21
void matrixMulti(cublasHandle_t cublasHandle,
                 int m,
                 int n,
                 int batchSize,
                 float alpha,
                 const float *A,
                 const float *x,
                 float beta,
                 float *y)
{
#ifdef DISABLE_GEMV
	checkCublasErrors(cublasSgemm(cublasHandle,
			          CUBLAS_OP_T,
			          CUBLAS_OP_T,
			          n,
			          batchSize,
			          m,
			          &alpha,
			          x,
			          m,
			          A,
			          batchSize,
			          &beta,
			          y,
			          n));

#else

	checkCublasErrors(cublasSgemv(cublasHandle, CUBLAS_OP_T,
			                     m, n,
			                     &alpha,
			                     A, m,
			                     x, 1,
			                     &beta,
			                     y, 1));


#endif

}
Code Example #22
		void fully_connected_layer_updater_cuda::enqueue_backward_data_propagation(
			cudaStream_t stream_id,
			unsigned int input_index,
			cuda_linear_buffer_device::ptr input_errors_buffer,
			cuda_linear_buffer_device::const_ptr output_errors_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
			const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
			cuda_linear_buffer_device::const_ptr output_neurons_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
			cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
			cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
			cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
			bool add_update_to_destination,
			unsigned int entry_count)
		{
			cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
			float alpha = 1.0F;
			float beta = (add_update_to_destination ? 1.0F : 0.0F);
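			// Column-major GEMM: input_errors (input_elem x entry_count) =
			// weights (input_elem x output_elem) * output_errors (output_elem x entry_count);
			// beta = 1 adds the result to the existing buffer when add_update_to_destination is set.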
			cublas_safe_call(cublasSgemm(
				cuda_config->get_cublas_handle(),
				CUBLAS_OP_N,
				CUBLAS_OP_N,
				input_elem_count_per_entry_list[0],
				entry_count,
				output_elem_count_per_entry,
				&alpha,
				*data[0],
				input_elem_count_per_entry_list[0],
				*output_errors_buffer,
				output_elem_count_per_entry,
				&beta,
				*input_errors_buffer,
				input_elem_count_per_entry_list[0]));
		}
Code Example #23
File: Wrapper.cpp  Project: razorx89/nmfgpu
		cublasStatus_t cublasXgemm(cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, 
								   const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc) {
			return cublasSgemm(g_context->cublasHandle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
		}
Code Example #24
File: MatMultTests.cpp  Project: pmullown/kmeans
TEST_F(MatMultTests, SIFT) {

  string fileName("mat-sift");
  int m = 898790;
  int n = 128;
  int k = 256;

  /* allocate data */
  float * data = (float *)malloc(m*n*sizeof(float));
  float * centers = (float *)malloc(k*n*sizeof(float));
  float * result = (float *)malloc(m*k*sizeof(float));
  float * resultCublas = (float *)malloc(m*k*sizeof(float));

  /* read matrix from file */
  FILE * fid = fopen(fileName.c_str(), "rb");
  int nread = fread(data, sizeof(float), m*n, fid);
  ASSERT_EQ(nread, m*n);
  fclose(fid);

  /* initialize centers to 1 */
  for (int i = 0; i<k*n; ++i) centers[i] = (float)1;

  /* allocate device space for the various arrays */
  float * dev_data, *dev_centers, *dev_result;
  int factor = TILESIZE*N_UNROLL_FLOAT;
  int m_padded = ((m + factor - 1) / factor)*factor;

  int nBytes = m_padded*n*sizeof(float);
  cudaMalloc((void**)&dev_data, nBytes);
  cudaMemset(dev_data, 0, nBytes);
  cudaMemcpy(dev_data, data, m*n*sizeof(float), cudaMemcpyHostToDevice);

  nBytes = n*k*sizeof(float);
  cudaMalloc((void**)&dev_centers, nBytes);
  cudaMemcpy(dev_centers, centers, nBytes, cudaMemcpyHostToDevice);

  nBytes = m*k*sizeof(float);
  cudaMalloc((void**)&dev_result, nBytes);
  cudaMemset(dev_result, 0, nBytes);

  /* run MatMatMultF */
  int err = MatMatMultF(m, n, dev_data, k, dev_centers, dev_result);
  if (err) printf("Error in MatMatMultF for mat-sift\n");
  cudaMemcpy(result, dev_result, nBytes, cudaMemcpyDeviceToHost);

  /* run CUBLAS SGEMM */
  float one = 1.f;
  float zero = 0.f;
  cublasHandle_t handle;
  cublasCreate(&handle);
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
	  k, m, n, (const float *)&one,
	  (const float *)dev_centers, k,
	  (const float *)dev_data, n,
	  (const float *)&zero, (float *)dev_result, k);
  cudaMemcpy(resultCublas, dev_result, nBytes, cudaMemcpyDeviceToHost);

#if 1
  /* check results */
  int maxPrintErrors=10;
  int numPrintErrors=0;
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < k; ++j) {
      int index = i*k + j;
      if (result[index] == 0 && resultCublas[index] == 0) continue;
      else {
	float err = fabs(result[index] - resultCublas[index]) / fabs(result[index]);
	if (err >= 1.e-6 || result[index] == 0) {
	  printf("i=%d, j=%d : %1.5g, %1.5g, err=%1.5g\n", i, j, result[index], resultCublas[index], err);
	  if (numPrintErrors<maxPrintErrors) {
	    numPrintErrors++;
	    EXPECT_LT(err, 1.e-6);
	  } else {
	    ASSERT_LT(err, 1.e-6);
	  }
	}
      }
    }
  }
#endif

  /* free data */
  if (dev_data) cudaFree(dev_data);
  if (dev_centers) cudaFree(dev_centers);
  if (dev_result) cudaFree(dev_result);

  if (data) free(data);
  if (centers) free(centers);
  if (result) free(result);
  if (resultCublas) free(resultCublas);
  cublasDestroy(handle);
}
Code Example #25
File: testing_sgemm.cpp  Project: kjbartel/clmagma
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing sgemm
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time;
    float          magma_error, dev_error, Cnorm, work[1];
    magma_int_t M, N, K;
    magma_int_t Am, An, Bm, Bn;
    magma_int_t sizeA, sizeB, sizeC;
    magma_int_t lda, ldb, ldc, ldda, lddb, lddc;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t status = 0;
    
    float *h_A, *h_B, *h_C, *h_Cmagma, *h_Cdev;
    magmaFloat_ptr d_A, d_B, d_C;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float alpha = MAGMA_S_MAKE(  0.29, -0.86 );
    float beta  = MAGMA_S_MAKE( -0.48,  0.38 );
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    float tol = opts.tolerance * lapackf77_slamch("E");

    #ifdef HAVE_CUBLAS
        // for CUDA, we can check MAGMA vs. CUBLAS, without running LAPACK
        printf("If running lapack (option --lapack), MAGMA and %s error are both computed\n"
               "relative to CPU BLAS result. Else, MAGMA error is computed relative to %s result.\n\n",
                g_platform_str, g_platform_str );
        printf("transA = %s, transB = %s\n",
               lapack_trans_const(opts.transA),
               lapack_trans_const(opts.transB) );
        printf("    M     N     K   MAGMA Gflop/s (ms)  %s Gflop/s (ms)   CPU Gflop/s (ms)  MAGMA error  %s error\n",
                g_platform_str, g_platform_str );
    #else
        // for others, we need LAPACK for check
        opts.lapack |= opts.check;  // check (-c) implies lapack (-l)
        printf("transA = %s, transB = %s\n",
               lapack_trans_const(opts.transA),
               lapack_trans_const(opts.transB) );
        printf("    M     N     K   %s Gflop/s (ms)   CPU Gflop/s (ms)  %s error\n",
                g_platform_str, g_platform_str );
    #endif
    printf("=========================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            K = opts.ksize[itest];
            gflops = FLOPS_SGEMM( M, N, K ) / 1e9;

            if ( opts.transA == MagmaNoTrans ) {
                lda = Am = M;
                An = K;
            } else {
                lda = Am = K;
                An = M;
            }
            
            if ( opts.transB == MagmaNoTrans ) {
                ldb = Bm = K;
                Bn = N;
            } else {
                ldb = Bm = N;
                Bn = K;
            }
            ldc = M;
            
            ldda = ((lda+31)/32)*32;
            lddb = ((ldb+31)/32)*32;
            lddc = ((ldc+31)/32)*32;
            
            sizeA = lda*An;
            sizeB = ldb*Bn;
            sizeC = ldc*N;
            
            TESTING_MALLOC_CPU( h_A,       float, lda*An );
            TESTING_MALLOC_CPU( h_B,       float, ldb*Bn );
            TESTING_MALLOC_CPU( h_C,       float, ldc*N  );
            TESTING_MALLOC_CPU( h_Cmagma,  float, ldc*N  );
            TESTING_MALLOC_CPU( h_Cdev,    float, ldc*N  );
            
            TESTING_MALLOC_DEV( d_A, float, ldda*An );
            TESTING_MALLOC_DEV( d_B, float, lddb*Bn );
            TESTING_MALLOC_DEV( d_C, float, lddc*N  );
            
            /* Initialize the matrices */
            lapackf77_slarnv( &ione, ISEED, &sizeA, h_A );
            lapackf77_slarnv( &ione, ISEED, &sizeB, h_B );
            lapackf77_slarnv( &ione, ISEED, &sizeC, h_C );
            
            magma_ssetmatrix( Am, An, h_A, lda, d_A, 0, ldda, opts.queue );
            magma_ssetmatrix( Bm, Bn, h_B, ldb, d_B, 0, lddb, opts.queue );
            
            /* =====================================================================
               Performs operation using MAGMABLAS (currently only with CUDA)
               =================================================================== */
            #ifdef HAVE_CUBLAS
                magma_ssetmatrix( M, N, h_C, ldc, d_C, lddc );
                
                magma_time = magma_sync_wtime( NULL );
                magmablas_sgemm( opts.transA, opts.transB, M, N, K,
                                 alpha, d_A, ldda,
                                        d_B, lddb,
                                 beta,  d_C, lddc );
                magma_time = magma_sync_wtime( NULL ) - magma_time;
                magma_perf = gflops / magma_time;
                
                magma_sgetmatrix( M, N, d_C, lddc, h_Cmagma, ldc );
            #endif
            
            /* =====================================================================
               Performs operation using CUBLAS / clBLAS / Xeon Phi MKL
               =================================================================== */
            magma_ssetmatrix( M, N, h_C, ldc, d_C, 0, lddc, opts.queue );
            
            #ifdef HAVE_CUBLAS
                dev_time = magma_sync_wtime( NULL );
                cublasSgemm( opts.handle, cublas_trans_const(opts.transA), cublas_trans_const(opts.transB), M, N, K,
                             &alpha, d_A, ldda,
                                     d_B, lddb,
                             &beta,  d_C, lddc );
                dev_time = magma_sync_wtime( NULL ) - dev_time;
            #else
                dev_time = magma_sync_wtime( opts.queue );
                magma_sgemm( opts.transA, opts.transB, M, N, K,
                             alpha, d_A, 0, ldda,
                                    d_B, 0, lddb,
                             beta,  d_C, 0, lddc, opts.queue );
                dev_time = magma_sync_wtime( opts.queue ) - dev_time;
            #endif
            dev_perf = gflops / dev_time;
            
            magma_sgetmatrix( M, N, d_C, 0, lddc, h_Cdev, ldc, opts.queue );
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                blasf77_sgemm( lapack_trans_const(opts.transA), lapack_trans_const(opts.transB), &M, &N, &K,
                               &alpha, h_A, &lda,
                                       h_B, &ldb,
                               &beta,  h_C, &ldc );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
            }
            
            /* =====================================================================
               Check the result
               =================================================================== */
            if ( opts.lapack ) {
                // compute relative error for both magma & dev, relative to lapack,
                // |C_magma - C_lapack| / |C_lapack|
                Cnorm = lapackf77_slange( "F", &M, &N, h_C, &ldc, work );
                
                blasf77_saxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cdev, &ione );
                dev_error = lapackf77_slange( "F", &M, &N, h_Cdev, &ldc, work ) / Cnorm;
                
                #ifdef HAVE_CUBLAS
                    blasf77_saxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cmagma, &ione );
                    magma_error = lapackf77_slange( "F", &M, &N, h_Cmagma, &ldc, work ) / Cnorm;
                    
                    printf("%5d %5d %5d   %7.2f (%7.2f)    %7.2f (%7.2f)   %7.2f (%7.2f)    %8.2e     %8.2e   %s\n",
                           (int) M, (int) N, (int) K,
                           magma_perf,  1000.*magma_time,
                           dev_perf,    1000.*dev_time,
                           cpu_perf,    1000.*cpu_time,
                           magma_error, dev_error,
                           (magma_error < tol && dev_error < tol ? "ok" : "failed"));
                    status += ! (magma_error < tol && dev_error < tol);
                #else
                    printf("%5d %5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)    %8.2e   %s\n",
                           (int) M, (int) N, (int) K,
                           dev_perf,    1000.*dev_time,
                           cpu_perf,    1000.*cpu_time,
                           dev_error,
                           (dev_error < tol ? "ok" : "failed"));
                    status += ! (dev_error < tol);
                #endif
            }
            else {
                #ifdef HAVE_CUBLAS
                    // compute relative error for magma, relative to dev (currently only with CUDA)
                    Cnorm = lapackf77_slange( "F", &M, &N, h_Cdev, &ldc, work );
                    
                    blasf77_saxpy( &sizeC, &c_neg_one, h_Cdev, &ione, h_Cmagma, &ione );
                    magma_error = lapackf77_slange( "F", &M, &N, h_Cmagma, &ldc, work ) / Cnorm;
                    
                    printf("%5d %5d %5d   %7.2f (%7.2f)    %7.2f (%7.2f)     ---   (  ---  )    %8.2e        ---    %s\n",
                           (int) M, (int) N, (int) K,
                           magma_perf,  1000.*magma_time,
                           dev_perf,    1000.*dev_time,
                           magma_error,
                           (magma_error < tol ? "ok" : "failed"));
                    status += ! (magma_error < tol);
                #else
                    printf("%5d %5d %5d   %7.2f (%7.2f)     ---   (  ---  )       ---\n",
                           (int) M, (int) N, (int) K,
                           dev_perf,    1000.*dev_time );
                #endif
            }
            
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_B );
            TESTING_FREE_CPU( h_C );
            TESTING_FREE_CPU( h_Cmagma  );
            TESTING_FREE_CPU( h_Cdev    );
            
            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_B );
            TESTING_FREE_DEV( d_C );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
Code Example #26
int cublas_multiply_matrix_float2( PGM_Matriz_Double  *A, PGM_Matriz_Double  *B, PGM_Matriz_Double  *result, PGM_Matriz_Float *work){
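    /* Portuguese identifiers: n_linhas = number of rows, n_colunas = number of columns,
       valor = element buffer. */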
    cublasStatus_t status;
    cublasHandle_t handle;
    int max_dim = work->n_linhas;
    PGM_Matriz_GPU device_A,
                   device_B,
                   device_C;
    float alpha = 1.0,
          beta = 0.0;

    if (A->n_linhas != result->n_linhas || B->n_colunas != result->n_colunas){
        return 0;
    }

    status = cublasCreate(&handle);

    if (status != CUBLAS_STATUS_SUCCESS){
        return -1;
    }

    if(create_PGM_Matriz_GPU_float(&device_A,A->n_linhas, A->n_colunas,max_dim) != cudaSuccess){
        return -2;
    }
    if(create_PGM_Matriz_GPU_float(&device_B,B->n_linhas, B->n_colunas,max_dim) != cudaSuccess){
        if(cudaFree(device_A.valor) == cudaSuccess) return -2;
        else return -11;
    }
    if(create_PGM_Matriz_GPU_float(&device_C,result->n_linhas, result->n_colunas,max_dim) != cudaSuccess){
        if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess) return -2;
        else return -11;
    }

    if(float_copyMatrixHost2GPU(A,work,&device_A) != CUBLAS_STATUS_SUCCESS){
        if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess && cudaFree(device_C.valor) == cudaSuccess) return -3;
        else return -12;
    }

    if(float_copyMatrixHost2GPU(B,work,&device_B) != CUBLAS_STATUS_SUCCESS){
        if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess && cudaFree(device_C.valor) == cudaSuccess) return -3;
        else return -12;
    }
    if(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, max_dim,max_dim,max_dim, &alpha,(float*) device_A.valor,max_dim,(float*)device_B.valor, max_dim, &beta, (float*)device_C.valor, max_dim) != CUBLAS_STATUS_SUCCESS){
        if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess && cudaFree(device_C.valor) == cudaSuccess) return -4;
        else return -13;
    }
    if(float_copyMatrixGPU2Host_Transpose(result, &device_C, work) != CUBLAS_STATUS_SUCCESS){
        if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess && cudaFree(device_C.valor) == cudaSuccess) return -5;
        else return -14;
    }

    if(cudaFree(device_A.valor) != cudaSuccess){
        return -15;
    }
    if(cudaFree(device_B.valor) != cudaSuccess){
        return -15;
    }
    if(cudaFree(device_C.valor) != cudaSuccess){
        return -15;
    }
    if(cublasDestroy(handle) != CUBLAS_STATUS_SUCCESS){
        return -7;
    }
    return 1;
}
Code Example #27
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, float alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      float beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cuda_context *ctx;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  cb_transpose transT;
  cublasStatus_t err;

  if (batchCount == 0) return GA_NO_ERROR;

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  cuda_enter(ctx);

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    lt = offA;
    offA = offB;
    offB = lt;
  }

  // use parallel cublasSgemm calls rather than cublasSgemmBatched for large products
  const size_t threshold = 650;
  const int multiple_dispatch = M * N * K > threshold * threshold * threshold;
  if (multiple_dispatch) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);

      err = cublasSgemm(((blas_handle *)ctx->blas_handle)->h,
                        convT(transA), convT(transB),
                        M, N, K, &alpha,
                        (float*)A[i]->ptr + offA[i], lda,
                        (float*)B[i]->ptr + offB[i], ldb,
                        &beta,
                        (float*)C[i]->ptr + offC[i], ldc);
      if (err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (err == CUBLAS_STATUS_ARCH_MISMATCH)
          return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }

      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  } else {
    float **T_l = alloca(sizeof(float *) * batchCount * 3);
    const float **A_l = (const float **)T_l;
    const float **B_l = (const float **)T_l + batchCount;
    float **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = ((float *)A[i]->ptr) + offA[i];
      B_l[i] = ((float *)B[i]->ptr) + offB[i];
      C_l[i] = ((float *)C[i]->ptr) + offC[i];
    }

    cuMemAlloc(&Ta, sizeof(float *) * batchCount * 3);
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(float *));
    Ca = Ta + (batchCount * sizeof(float *) * 2);

    cuMemcpyHtoD(Ta, T_l, sizeof(float *) * batchCount * 3);

    err = cublasSgemmBatched(((blas_handle *)ctx->blas_handle)->h,
                             convT(transA), convT(transB),
                             M, N, K, &alpha, (const float **)Aa, lda,
                             (const float **)Ba, ldb, &beta,
                             (float **)Ca, ldc, batchCount);
    cuMemFree(Ta);
    if (err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (err == CUBLAS_STATUS_ARCH_MISMATCH)
        return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }

    for (i = 0; i < batchCount; i++) {
      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
Code Example #28
File: micronn.c  Project: microo8/micronn
uint micronn_train(micronn* net, micronn_matrix* inputs, micronn_matrix* targets, uint batch, float eta, float momentum, uint max_iters, float min_error, uint echo_iters)
{
    int j;
    uint i, index, diff;
    float error = DBL_MAX, alpha = 1.0, beta = 0.0, one = 1.0;
    micronn_matrix* tmp, *y;
    micronn_matrix** delta = malloc(sizeof(micronn_matrix*) * (net->nhidden + 1));
    micronn_matrix** grad = malloc(sizeof(micronn_matrix*) * (net->nhidden + 1));
    micronn_matrix** a = malloc(sizeof(micronn_matrix*) * (net->nhidden + 2));
    //micronn_matrix** z = malloc(sizeof(micronn_matrix*) * (net->nhidden + 1));
    //calloc grad
    alpha = 2 * eta / (batch == 0 ? inputs->cols : batch);
    for(i = 0; i <= net->nhidden; i++) {
        grad[i] = micronn_matrix_alloc(net->weights[i]->rows, net->weights[i]->cols);
        micronn_matrix_set_val(grad[i], 0.0);
    }
    micronn_matrix_add_ones(inputs);
    for(i = 0; (max_iters == 0 || i < max_iters) && error > min_error; i++) {
        if(batch == 0) {
            a[0] = inputs;
            y = targets;
        } else {
            index = rand() % (inputs->cols - batch + 1);
            a[0] = malloc(sizeof(micronn_matrix));
            a[0]->rows = inputs->rows;
            a[0]->cols = batch;
            a[0]->devPtrvals = inputs->devPtrvals + index * inputs->rows;
            y = malloc(sizeof(micronn_matrix));
            y->rows = targets->rows;
            y->cols = batch;
            y->devPtrvals = targets->devPtrvals + index * targets->rows;
        }

        //forward and save the outputs of layers
        for(j = 0; j < net->nhidden + 1; j++) {
            if(j > 0) {
                micronn_matrix_add_ones(a[j]);
            }
            tmp = micronn_matrix_dot(net->handle, CUBLAS_OP_N, CUBLAS_OP_N, 1.0, net->weights[j], a[j]);
            //z[j] = tmp;
            a[j + 1] = micronn_matrix_sigmoid(tmp);
            micronn_matrix_free(tmp);
        }

        //calculate error
        if(echo_iters != 0 && i % echo_iters == 0) {
            error = micronn_error(net, inputs, targets, NULL);//a[net->nhidden + 1]);
            diff = micronn_diff(net, inputs, targets, NULL);//a[net->nhidden + 1]);
            printf("\x1B[32miteration \x1B[0m%d\t\t\x1B[31merror: \x1B[0m%.10f\t\t\x1B[35mdiff: \x1B[0m%d/%d\n", i, error, diff, inputs->cols);
        }

        //last delta = (a[last] - y) * f'(z[last])
        delta[net->nhidden] = micronn_matrix_copy(a[net->nhidden + 1]);
        micronn_matrix_sub(delta[net->nhidden], y);
        micronn_matrix_deriv_sigmoid(a[net->nhidden + 1], delta[net->nhidden]);
        //other delta[i] = (W[i])'delta[i+1] * f'(z[i])
        for(j = net->nhidden - 1; j >= 0; j--) {
            delta[j] = micronn_matrix_alloc(net->weights[j + 1]->cols, delta[j + 1]->cols);
            cublasSgemm(net->handle, CUBLAS_OP_T, CUBLAS_OP_N,
                        net->weights[j + 1]->cols, delta[j + 1]->cols, net->weights[j + 1]->rows,
                        &one, net->weights[j + 1]->devPtrvals, net->weights[j + 1]->rows,
                        delta[j + 1]->devPtrvals, delta[j + 1]->rows,
                        &beta, delta[j]->devPtrvals, delta[j]->rows);
            //delta[i] *= f'(z[i+1])
            micronn_matrix_deriv_sigmoid(a[j + 1], delta[j]);
        }
        //compute grad[i] = delta[i+1](a[i])' + momentum*grad[i] and add to weights[i] -= eta/N*grad[i]
        for(j = net->nhidden; j >= 0; j--) {
            //delete the last row from deltas to have correct size of grad
            if(j < net->nhidden) {
                micronn_matrix_remove_last_row(delta[j]);
            }
            cublasSgemm(net->handle, CUBLAS_OP_N, CUBLAS_OP_T,
                        delta[j]->rows,
                        a[j]->rows,
                        a[j]->cols,
                        &alpha, delta[j]->devPtrvals, delta[j]->rows,
                        a[j]->devPtrvals, a[j]->rows, &momentum,
                        grad[j]->devPtrvals, grad[j]->rows);
            micronn_matrix_sub(net->weights[j], grad[j]);
        }

        if(batch != 0) {
            free(a[0]);
            free(y);
        }
        for(j = 1; j < net->nhidden + 2; j++) {
            micronn_matrix_free(a[j]);
        }
        for(j = 0; j <= net->nhidden; j++) {
            micronn_matrix_free(delta[j]);
        }
    }
    for(i = 0; i <= net->nhidden; i++) {
        micronn_matrix_free(grad[i]);
    }
    return 1;
};
Code Example #29
File: ocl_blas.c  Project: E-LLP/QuIP
/* Main */
int test_cublas(void)
{    
    cublasStatus status;
    cudaError_t e;
    float* h_A;
    float* h_B;
    float* h_C;
    float* h_C_ref;
    float* d_A = 0;
    void *vp;
    float* d_B = 0;
    float* d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    float error_norm;
    float ref_norm;
    float diff;

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running..\n");

    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the matrices */
    h_A = (float*)malloc(n2 * sizeof(h_A[0]));
    if (h_A == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    h_B = (float*)malloc(n2 * sizeof(h_B[0]));
    if (h_B == 0) {
        fprintf (stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    h_C = (float*)malloc(n2 * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C[i] = rand() / (float)RAND_MAX;
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc(&vp, n2 * sizeof(d_A[0])) != cudaSuccess) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    d_A = (float *) vp;

    if (cudaMalloc(&vp, n2 * sizeof(d_B[0])) != cudaSuccess) {
        fprintf (stderr, "!!!! device memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    d_B = (float *) vp;

    if (cudaMalloc(&vp, n2 * sizeof(d_C[0])) != cudaSuccess) {
        fprintf (stderr, "!!!! device memory allocation error (C)\n");
        return EXIT_FAILURE;
    }
    d_C = (float *) vp;

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }
    
    /* Performs operation using plain C code */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;

    /* Clear last error */
    cublasGetError();

    /* Performs operation using cublas */
    cublasSgemm('n', 'n', N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N);
    status = cublasGetError();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }
    
    /* Allocate host memory for reading back the result from device memory */
    h_C = (float*)malloc(n2 * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }

    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;
    for (i = 0; i < n2; ++i) {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }
    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);
    if (fabs(ref_norm) < 1e-7) {
        fprintf (stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }
    printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED");

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);
    e = cudaFree(d_A);
    if (e != cudaSuccess) {
        fprintf (stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }
    e = cudaFree(d_B);
    if (e != cudaSuccess) {
        fprintf (stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }
    e = cudaFree(d_C);
    if (e != cudaSuccess) {
        fprintf (stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasShutdown();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
Code Example #30
/* Main */
int main(int argc, char **argv)
{
    cublasStatus_t status;
    float *h_A;
    float *h_B;
    float *h_C;
    float *h_C_ref;
    float *d_A = 0;
    float *d_B = 0;
    float *d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    float error_norm;
    float ref_norm;
    float diff;
    cublasHandle_t handle;

    int dev = findCudaDevice(argc, (const char **) argv);

    if (dev == -1)
    {
        return EXIT_FAILURE;
    }

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running..\n");

    status = cublasCreate(&handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the matrices */
    h_A = (float *)malloc(n2 * sizeof(h_A[0]));

    if (h_A == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }

    h_B = (float *)malloc(n2 * sizeof(h_B[0]));

    if (h_B == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }

    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C[i] = rand() / (float)RAND_MAX;
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }

    /* Performs operation using plain C code */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;

    /* Performs operation using cublas */
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for reading back the result from device memory */
    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }

    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;

    for (i = 0; i < n2; ++i)
    {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }

    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);

    if (fabs(ref_norm) < 1e-7)
    {
        fprintf(stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);

    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_B) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_C) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasDestroy(handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

    printf("CUBLAS program finished\n");

    exit(error_norm / ref_norm < 1e-6f ? EXIT_SUCCESS : EXIT_FAILURE);
}