/* z = x * T(y)
 * Computes z = x * transpose(y) with cuBLAS SGEMM for single-channel
 * row-major cuMatrix operands (the column-major trick: pass y first with
 * CUBLAS_OP_T so the row-major result lands in z).
 * Requires: x->cols == y->cols, z->rows == x->rows, z->cols == y->rows.
 * On any precondition or cuBLAS failure the process exits.
 */
void matrixMulTB(cuMatrix<float>* x, cuMatrix<float>* y, cuMatrix<float>* z)
{
    // All operands must be single-channel.
    if (x->channels != 1 || y->channels != 1 || z->channels != 1) {
        // Fixed typo in the message ("chanels").
        printf("matrixMulTB: channels != 1\n");
        exit(0);
    }
    // Shape check for z = x * T(y).
    if (x->cols != y->cols || z->rows != x->rows || z->cols != y->rows) {
        // Bug fix: this branch previously printed the channel-count message
        // (copy-paste error), hiding the real cause of the failure.
        printf("matrixMulTB: dimension mismatch\n");
        exit(0);
    }
    cublasStatus_t stat;
    float alpha = 1.0;
    float beta = 0.0;
    stat = cublasSgemm(
        getHandle(),
        CUBLAS_OP_T, CUBLAS_OP_N,
        y->rows, x->rows, y->cols,
        &alpha,
        y->getDev(), y->cols,
        x->getDev(), x->cols,
        &beta,
        z->getDev(), z->cols);
    cudaDeviceSynchronize();
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf("matrixMulTB cublasSgemm error\n");
        exit(0);
    }
}
/* Runs cublasSgemm through the context's shared cuBLAS handle.
 * op1/op2 select transposition ('t' => transpose, anything else => none).
 * Any cuBLAS failure (handle lookup or SGEMM itself) is routed through the
 * context error machinery and returned to the caller.
 */
static vl::Error
gemm(vl::Context& context,
     char op1, char op2,
     ptrdiff_t m, ptrdiff_t n, ptrdiff_t k,
     type alpha,
     type const * a, ptrdiff_t lda,
     type const * b, ptrdiff_t ldb,
     type beta,
     type * c, ptrdiff_t ldc)
{
  cublasHandle_t handle ;
  cublasStatus_t status = context.getCudaHelper().getCublasHandle(&handle) ;
  // Only attempt the multiply if the handle lookup succeeded; otherwise the
  // failing status flows straight into the error reporting below.
  if (status == CUBLAS_STATUS_SUCCESS) {
    status = cublasSgemm(handle,
                         (op1 == 't') ? CUBLAS_OP_T : CUBLAS_OP_N,
                         (op2 == 't') ? CUBLAS_OP_T : CUBLAS_OP_N,
                         (int)m, (int)n, (int)k,
                         &alpha,
                         a, (int)lda,
                         b, (int)ldb,
                         &beta,
                         c, (int)ldc) ;
  }
  return context.setError
    (context.getCudaHelper().catchCublasError(status, "cublasSgemm"), __func__) ;
}
/* C <- alpha*A*B + beta*C for square n-by-n matrices.
 * Dispatches on the element width (4 = float, 8 = double) and on whether
 * the matrices live on the device (cuBLAS) or on the host (CBLAS).
 * Any other element size is silently ignored, matching the original switch.
 */
void gemm(double alpha, Mat mA, Mat mB, double beta, Mat mC) {
  const int n = MatN(mA);
  const void* const a = MatElems(mA);
  const void* const b = MatElems(mB);
  void* const c = MatElems(mC);
  const bool dev = MatDev(mA);
  const int elemSize = (int)MatElemSize(mA);

  if (elemSize == 4) {
    if (dev) {
      // cuBLAS wants float scalars by address; narrow the doubles first.
      float alpha32 = alpha, beta32 = beta;
      cublasSgemm(g_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
                  n, n, n, &alpha32, a, n, b, n, &beta32, c, n);
    } else {
      cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                  n, n, n, alpha, a, n, b, n, beta, c, n);
    }
  } else if (elemSize == 8) {
    if (dev) {
      cublasDgemm(g_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
                  n, n, n, &alpha, a, n, b, n, &beta, c, n);
    } else {
      cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                  n, n, n, alpha, a, n, b, n, beta, c, n);
    }
  }
}
/* OCaml (SPOC) FFI stub for cublasSgemm (legacy v1, char/int-coded API).
 * Unboxes the OCaml values, resolves the three vectors to device pointers via
 * GET_VEC, enters the CUDA/cuBLAS context, runs SGEMM, restores the context,
 * and returns unit.
 * NOTE(review): Int_val(transa)/Int_val(transb) are passed where the legacy
 * API expects transpose character codes -- confirm the OCaml side encodes
 * them compatibly. The cublasSgemm return status is not checked.
 */
CAMLprim value spoc_cublasSgemm(value transa, value transb, value m, value n, value k, value alpha, value a, value lda, value b, value ldb, value beta, value c, value ldc, value dev){ CAMLparam5(transa, transb, m, n, k); CAMLxparam5(alpha, a, lda, b, ldb); CAMLxparam4(beta, c, ldc, dev); CAMLlocal3(dev_vec_array, dev_vec, gi); CUdeviceptr d_A; CUdeviceptr d_B; CUdeviceptr d_C; int id; GET_VEC(a, d_A); GET_VEC(b, d_B); GET_VEC(c, d_C); CUBLAS_GET_CONTEXT; cublasSgemm (Int_val(transa), Int_val(transb), Int_val(m), Int_val(n), Int_val(k), (float)Double_val(alpha), (float*) d_A, Int_val(lda), (float*) d_B, Int_val(ldb), (float) Double_val(beta), (float *)d_C, Int_val(ldc)); CUDA_RESTORE_CONTEXT; CAMLreturn(Val_unit); }
/* Kokkos/Teuchos GEMM wrapper over the legacy (v1) cuBLAS API for
 * column-major float views: C = alpha*op(A)*op(B) + beta*C.
 * m,n come from C's dimensions; k is A's column (or row, if transposed)
 * count; leading dimensions come from each view's 2-D stride.
 * Errors are only detected in debug builds via cublasGetError(), which
 * reads the sticky legacy-API error state after the call.
 */
static void GEMM (const Teuchos::ETransp transA, const Teuchos::ETransp transB, const float alpha, const View<const float**,LayoutLeft,Cuda>& A, const View<const float**,LayoutLeft,Cuda>& B, const float beta, const View<float**,LayoutLeft,Cuda>& C) { const int m = static_cast<int>(C.dimension_0()), n = static_cast<int>(C.dimension_1()), k = (transA == Teuchos::NO_TRANS ? A.dimension_1() : A.dimension_0()), lda = static_cast<int>(Impl::getStride2DView(A)), ldb = static_cast<int>(Impl::getStride2DView(B)), ldc = static_cast<int>(Impl::getStride2DView(C)); const char char_transA = (transA == Teuchos::NO_TRANS ? 'N' : 'T'), char_transB = (transB == Teuchos::NO_TRANS ? 'N' : 'T'); cublasSgemm (char_transA, char_transB, m, n, k, alpha, A.ptr_on_device(), lda, B.ptr_on_device(), ldb, beta, C.ptr_on_device(), ldc); #ifdef HAVE_KOKKOS_DEBUG const cublasStatus info = cublasGetError (); TEUCHOS_TEST_FOR_EXCEPTION (info != CUBLAS_STATUS_SUCCESS, std::runtime_error, "cublasSgemm failed with status " << info << "." ); #endif // HAVE_KOKKOS_DEBUG }
/* libgpuarray SGEMM: C = alpha*op(A)*op(B) + beta*C on CUDA buffers.
 * cuBLAS is column-major only, so a row-major (cb_c) request is rewritten as
 * the equivalent column-major problem by swapping A<->B, M<->N, the leading
 * dimensions, the transpose flags, and the buffer offsets.
 * The cuda_wait/cuda_record pairs order this call against other work queued
 * on the same buffers (read deps for A/B, read+write for C).
 * Returns GA_NO_ERROR, GA_DEVSUP_ERROR on an arch mismatch, or
 * GA_BLAS_ERROR on any other cuBLAS failure.
 */
static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { cuda_context *ctx = A->ctx; gpudata *T; size_t t; cublasStatus_t err; cb_transpose transT; ASSERT_BUF(A); ASSERT_BUF(B); ASSERT_BUF(C); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; transT = transA; transA = transB; transB = transT; t = offA; offA = offB; offB = t; } cuda_enter(ctx); cuda_wait(A, CUDA_WAIT_READ); cuda_wait(B, CUDA_WAIT_READ); cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); err = cublasSgemm(((blas_handle *)ctx->blas_handle)->h, convT(transA), convT(transB), M, N, K, &alpha, ((float *)A->ptr) + offA, lda, ((float *)B->ptr) + offB, ldb, &beta, ((float *)C->ptr) + offC, ldc); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } cuda_record(A, CUDA_WAIT_READ); cuda_record(B, CUDA_WAIT_READ); cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; }
/* Boost.Numeric.Bindings-style dispatch of GEMM to the legacy cuBLAS API:
 * C = alpha*op(a)*op(b) + beta*C. The transpose options are resolved at
 * compile time from the TransA/TransB tag types; only column-major ordering
 * is accepted (enforced by the static assertion below).
 */
inline void gemm( const Order order, const TransA transa, const TransB transb,
        const int m, const int n, const int k, const float alpha,
        const float* a, const int lda, const float* b, const int ldb,
        const float beta, float* c, const int ldc ) {
    // cuBLAS has no row-major entry point; refuse anything else at compile time.
    BOOST_STATIC_ASSERT( (is_same<Order, tag::column_major>::value) );
    cublasSgemm( blas_option< TransA >::value,
                 blas_option< TransB >::value,
                 m, n, k,
                 alpha, a, lda,
                 b, ldb,
                 beta, c, ldc );
}
/* Row-major C = A * B via cuBLAS, using the standard column-major trick:
 * computing column-major B*A yields row-major A*B in C.
 * m = uiWB (width of B), n = uiHA (height of A), k = uiWA (width of A).
 * Returns 0 on success; cuBLAS/CUDA failures abort via checkCudaErrors.
 */
int CudaInterface::cublasMatrixMult(float* A, float* B, float* C, MatrixSize_t mSize) {
    checkCudaErrors(cudaSetDevice(mDevID));
    checkCudaErrors(cudaGetDevice(&mDevID));
    // Bug fix: the leading dimension of C must equal m (= uiWB). The previous
    // code passed uiWA, which mis-strides the output whenever uiWA != uiWB
    // (cf. the CUDA matrixMulCUBLAS sample, which uses WB for ldc).
    checkCudaErrors(cublasSgemm(mCublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
                                mSize.uiWB, mSize.uiHA, mSize.uiWA,
                                &mAlpha,
                                B, mSize.uiWB,
                                A, mSize.uiWA,
                                &mBeta,
                                C, mSize.uiWB));
    return 0;
}
/* Device dot product: 1-D x 1-D uses cublasSdot (scalar result), 2-D x 2-D
 * uses cublasSgemm (matrix product). Both operands must live on the same
 * device and have matching/compatible shapes (CHECK_* aborts otherwise).
 * NOTE(review): the 1-D branch reads lhs.data/rhs.data and writes ret.data,
 * while the 2-D branch uses the *.dev_data members -- confirm which member is
 * the device pointer; the asymmetry looks suspicious.
 * NOTE(review): a mixed-rank call (1-D with 2-D) falls through both branches
 * and returns the default-constructed Darray -- verify this is intended.
 * The 2-D path passes lda/ldb/ldc equal to the row counts, i.e. it assumes
 * column-major storage -- TODO confirm against Darray's layout.
 */
Darray<float> cudot (const Darray<float>& lhs, const Darray<float>& rhs) { // context check CHECK_EQ(lhs.getDeviceManager().getDeviceID(), rhs.getDeviceManager().getDeviceID()); CHECK_EQ(lhs.ndim(), rhs.ndim()); CHECK_LT(lhs.ndim(), 3); CHECK_LT(rhs.ndim(), 3); Darray<float> ret; if (lhs.ndim()==1 && rhs.ndim()==1) { // shape check CHECK_EQ(lhs.size(), rhs.size()); ret = Darray<float>(lhs.getDeviceManager(), {1}); // using cublas sdot lhs.deviceSet(); cublasSdot (DeviceManager::handle, lhs.size(), lhs.data, 1, rhs.data, 1, ret.data); } // 2D matrix dot else if (lhs.ndim()==2 && rhs.ndim()==2) { // shape check CHECK_EQ(lhs.shape()[1], rhs.shape()[0]); ret = Darray<float>(lhs.getDeviceManager(), {lhs.shape()[0], rhs.shape()[1]}); // using cublas sgemm lhs.deviceSet(); const float alpha = 1.; const float beta = 0.; CUBLAS_SAFE_CALL( cublasSgemm (DeviceManager::handle, CUBLAS_OP_N, CUBLAS_OP_N, lhs.shape()[0], rhs.shape()[1], lhs.shape()[1], &alpha, lhs.dev_data, lhs.shape()[0], rhs.dev_data, rhs.shape()[0], &beta, ret.dev_data, ret.shape()[0]) ); } return ret; }
/* Thin GEMM wrapper over cuBLAS for thrust device pointers:
 * C = alpha*op(A)*op(B) + beta*C. Boolean flags select transposition.
 * Binds the shared cuBLAS handle to the context's stream before launching,
 * so the call is ordered with other work on that stream.
 */
void gemm(bool transa, bool transb, int m, int n, int k, float alpha,
          thrust::device_ptr<const float> A, int lda,
          thrust::device_ptr<const float> B, int ldb, float beta,
          thrust::device_ptr<float> C, int ldc)
{
    cublasHandle_t handle = context::get().cublasHandle;
    cublasSetStream(handle, context::get().stream);

    const cublasOperation_t opA = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
    const cublasOperation_t opB = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
    cublasSgemm(handle, opA, opB, m, n, k,
                &alpha, A.get(), lda,
                B.get(), ldb,
                &beta, C.get(), ldc);
}
/* Accumulates the weight and bias gradients of a fully-connected layer.
 * Weights: gradient[0] += input * output_errors^T over the whole batch
 * (SGEMM with op_N/op_T; beta = 1 accumulates into the existing gradient).
 * Biases: gradient[1] += sum of output errors via cuDNN's backward-bias
 * reduction (again beta = 1). Both libraries are bound to stream_id first so
 * the work is ordered on the caller's stream.
 */
void fully_connected_layer_updater_cuda::enqueue_backward_weights_propagation( cudaStream_t stream_id, const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data, const std::vector<cuda_linear_buffer_device::ptr>& gradient, const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom, const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers, cuda_linear_buffer_device::const_ptr output_errors_buffer, const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data, cuda_linear_buffer_device::ptr temporary_working_fixed_buffer, cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer, cuda_linear_buffer_device::const_ptr temporary_fixed_buffer, cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer, unsigned int entry_count) { // Update weights { cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id)); float alpha = 1.0F; float beta = 1.0F; cublas_safe_call(cublasSgemm( cuda_config->get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, input_elem_count_per_entry_list[0], output_elem_count_per_entry, entry_count, &alpha, *input_neurons_buffers[0], input_elem_count_per_entry_list[0], *output_errors_buffer, output_elem_count_per_entry, &beta, *gradient[0], input_elem_count_per_entry_list[0])); } // Update biases if (bias) { cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_util::set_tensor_descriptor( output_data_desc, output_configuration_specific, entry_count); float alpha = 1.0F; float beta = 1.0F; cudnn_safe_call(cudnnConvolutionBackwardBias( cuda_config->get_cudnn_handle(), &alpha, output_data_desc, *output_errors_buffer, &beta, bias_desc, *gradient[1])); } }
/* darknet-style GEMM on row-major matrices:
 * C = ALPHA*op(A)*op(B) + BETA*C, implemented by handing column-major cuBLAS
 * the swapped problem (B first, N/M swapped), i.e. C^T = op(B)^T * op(A)^T.
 * NOTE(review): the cublasStatus_t return value is cast to cudaError_t for
 * check_error(); the two enums have different value spaces, so a cuBLAS
 * failure may be reported with a misleading CUDA error string -- confirm
 * check_error() is meant to accept this.
 */
void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, float *A_gpu, int lda, float *B_gpu, int ldb, float BETA, float *C_gpu, int ldc) { cublasHandle_t handle = blas_handle(); cudaError_t status = (cudaError_t)cublasSgemm(handle, (TB ? CUBLAS_OP_T : CUBLAS_OP_N), (TA ? CUBLAS_OP_T : CUBLAS_OP_N), N, M, K, &ALPHA, B_gpu, ldb, A_gpu, lda, &BETA, C_gpu, ldc); check_error(status); }
/* Forward pass of a fully-connected layer for a batch of entry_count entries:
 * output = W^T * input per entry (SGEMM with op_T on the weight matrix,
 * beta = 0 overwrites the output buffer), then optionally adds the bias via
 * cudnnAddTensor (beta = 1 accumulates onto the SGEMM result). Both cuBLAS
 * and cuDNN are bound to stream_id so the two kernels run in order.
 */
void fully_connected_layer_updater_cuda::enqueue_forward_propagation( cudaStream_t stream_id, cuda_linear_buffer_device::ptr output_buffer, const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data, const std::vector<cuda_linear_buffer_device::const_ptr>& data, const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom, const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers, const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data, cuda_linear_buffer_device::ptr temporary_working_fixed_buffer, cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer, cuda_linear_buffer_device::ptr temporary_fixed_buffer, cuda_linear_buffer_device::ptr temporary_per_entry_buffer, unsigned int entry_count) { { cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id)); float alpha = 1.0F; float beta = 0.0F; cublas_safe_call(cublasSgemm( cuda_config->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, output_elem_count_per_entry, entry_count, input_elem_count_per_entry_list[0], &alpha, *data[0], input_elem_count_per_entry_list[0], *input_buffers[0], input_elem_count_per_entry_list[0], &beta, *output_buffer, output_elem_count_per_entry)); } if (bias) { cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id)); cudnn_util::set_tensor_descriptor( output_data_desc, output_configuration_specific, entry_count); float alpha = 1.0F; float beta = 1.0F; cudnn_safe_call(cudnnAddTensor( cuda_config->get_cudnn_handle(), &alpha, bias_desc, *data[1], &beta, output_data_desc, *output_buffer)); } }
/* Allocates and returns x = alpha * op(v) * op(w).
 * The result shape is rows(op(v)) x cols(op(w)); beta is fixed at 0 so the
 * freshly allocated output is fully overwritten. Leading dimensions are the
 * stored row counts (column-major storage), regardless of transposition.
 * Caller owns the returned matrix. The cuBLAS status is not checked.
 */
micronn_matrix* micronn_matrix_dot(cublasHandle_t handle, cublasOperation_t transA, cublasOperation_t transB, float alpha, micronn_matrix* v, micronn_matrix* w)
{
    float beta = 0.0;
    /* Dimensions of the product, hoisted from the repeated ternaries. */
    int m = (transA == CUBLAS_OP_N) ? v->rows : v->cols;   /* rows of op(v)  */
    int n = (transB == CUBLAS_OP_N) ? w->cols : w->rows;   /* cols of op(w)  */
    int k = (transA == CUBLAS_OP_N) ? v->cols : v->rows;   /* inner dimension */

    micronn_matrix* x = micronn_matrix_alloc(m, n);
    cublasSgemm(handle, transA, transB,
                m, n, k,
                &alpha,
                v->devPtrvals, v->rows,
                w->devPtrvals, w->rows,
                &beta,
                x->devPtrvals, x->rows);
    return x;
};
/* Caffe float GEMM on the GPU: C = alpha*op(A)*op(B) + beta*C for row-major
 * M x K and K x N inputs. cuBLAS follows Fortran (column-major) order, so we
 * compute C^T = op(B)^T * op(A)^T -- operands swapped, N/M swapped -- which
 * leaves the row-major product directly in C with leading dimension N.
 */
void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) {
  int lda, ldb;
  cublasOperation_t cuTransA, cuTransB;
  // Leading dimension of each row-major operand is its stored width.
  if (TransA == CblasNoTrans) {
    lda = K;
    cuTransA = CUBLAS_OP_N;
  } else {
    lda = M;
    cuTransA = CUBLAS_OP_T;
  }
  if (TransB == CblasNoTrans) {
    ldb = N;
    cuTransB = CUBLAS_OP_N;
  } else {
    ldb = K;
    cuTransB = CUBLAS_OP_T;
  }
  CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
      N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
}
/* MAGMA front-end for SGEMM: dC = alpha*op(dA)*op(dB) + beta*dC on device
 * memory, forwarding to the legacy (v1) cuBLAS API -- scalars are passed by
 * value and no handle/status is involved. cublas_trans_const translates the
 * MAGMA transpose constants into the legacy transpose codes.
 */
void magma_sgemm( magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, float alpha, float const* dA, magma_int_t lda, float const* dB, magma_int_t ldb, float beta, float* dC, magma_int_t ldc ) { cublasSgemm( cublas_trans_const( transA ), cublas_trans_const( transB ), m, n, k, alpha, dA, lda, dB, ldb, beta, dC, ldc ); }
/* JNI entry point: runs `reps` SGEMM calls (legacy char-coded cuBLAS API,
 * alpha = 1, beta = 0) over strided sub-blocks of A, B and C -- each
 * iteration advances the three pointers by astep/bstep/cstep elements.
 * Synchronizes the device at the end and returns the last CUDA error code
 * to the Java caller (0 == cudaSuccess).
 * NOTE(review): per-call cuBLAS status is not checked; only the final CUDA
 * error is reported.
 */
JNIEXPORT jint JNICALL Java_edu_berkeley_bid_CUMAT_blockSgemm (JNIEnv *env, jobject obj, jint transA, jint transB, jint nr, jint nc, jint kk, jint reps, jobject jA, jint lda, jint astep, jobject jB, jint ldb, jint bstep, jobject jC, jint ldc, jint cstep) { char at, bt; at = (transA) ? 't' : 'n'; bt = (transB) ? 't' : 'n'; float *A = (float*)getPointer(env, jA); float *B = (float*)getPointer(env, jB); float *C = (float*)getPointer(env, jC); for (int i = 0; i < reps; i++) { cublasSgemm(at, bt, nr, nc, kk, 1.0f, A, lda, B, ldb, 0.0f, C, ldc); A += astep; B += bstep; C += cstep; } cudaDeviceSynchronize(); cudaError_t err = cudaGetLastError(); return err; }
/* StarPU LU-factorization trailing-update (U22) codelet:
 * center -= left * right, dispatched to CPU BLAS (s == 0) or legacy cuBLAS
 * (s == 1). Tile sizes come from the matrix handles: dx/dy are the update
 * tile's extents, dz the shared inner dimension.
 * NOTE(review): the CPU path passes (dy, dx, dz) while the CUDA path passes
 * (dx, dy, dz) -- this is only equivalent when dx == dy (square tiles);
 * confirm the asymmetry is intentional for this application.
 */
static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) { float *left = (float *)STARPU_MATRIX_GET_PTR(descr[0]); float *right = (float *)STARPU_MATRIX_GET_PTR(descr[1]); float *center = (float *)STARPU_MATRIX_GET_PTR(descr[2]); unsigned dx = STARPU_MATRIX_GET_NX(descr[2]); unsigned dy = STARPU_MATRIX_GET_NY(descr[2]); unsigned dz = STARPU_MATRIX_GET_NY(descr[0]); unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]); unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]); unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]); #ifdef STARPU_USE_CUDA cublasStatus status; #endif switch (s) { case 0: STARPU_SGEMM("N", "N", dy, dx, dz, -1.0f, left, ld21, right, ld12, 1.0f, center, ld22); break; #ifdef STARPU_USE_CUDA case 1: cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21, right, ld12, 1.0f, center, ld22); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) STARPU_CUBLAS_REPORT_ERROR(status); break; #endif default: STARPU_ABORT(); break; } }
/* Row-major SGEMM for Tiramisu: C = alpha*op(A)*op(B) + beta*C, with element
 * offsets into each buffer. Leading dimensions of 0 mean "tightly packed"
 * and are filled in from the transpose flags. A process-wide cuBLAS handle
 * is created lazily on first use and bound to the per-thread stream.
 * Always returns 0; cuBLAS failures are routed through handle_cublas_error.
 */
int tiramisu_cublas_sgemm(float *A, float *B, float *C,
                          uint64_t M, uint64_t N, uint64_t K,
                          float alpha, float beta,
                          uint64_t ldA, uint64_t ldB, uint64_t ldC,
                          uint64_t offsetA, uint64_t offsetB, uint64_t offsetC,
                          bool transposeA, bool transposeB)
{
    // TODO: Destroy the handle.
    static bool handle_created = false;
    static cublasHandle_t handle;
    if (!handle_created) {
        cublasCreate(&handle);
        handle_created = true;
    }

    // Tight-packing defaults when a leading dimension was given as zero.
    if (ldA == 0)
        ldA = transposeA ? M : K;
    if (ldB == 0)
        ldB = transposeB ? K : N;
    if (ldC == 0)
        ldC = N;

    // cuBLAS is column-major. From a row-major perspective a column-major
    // multiply transposes inputs and output: cublas(A, B) = ((A^T)(B^T))^T
    // = B*A. So swapping the operands (and the M/N sizes) yields row-major
    // GEMM directly.
    cublasSetStream(handle, cudaStreamPerThread);
    handle_cublas_error(
        cublasSgemm(handle,
                    transposeB ? CUBLAS_OP_T : CUBLAS_OP_N,
                    transposeA ? CUBLAS_OP_T : CUBLAS_OP_N,
                    N, M, K,
                    &alpha,
                    B + offsetB, ldB,
                    A + offsetA, ldA,
                    &beta,
                    C + offsetC, ldC),
        __FUNCTION__);
    return 0;
}
/* StarPU CUDA codelet: c = alpha*a*b + beta*c using cuBLAS on the worker's
 * local stream. Sizes come from the StarPU matrix handles: h = NX(c),
 * w = NY(c), k = NY(a). Blocks on the stream before returning, then frees
 * the heap-allocated argument struct (alpha/beta) -- the codelet owns it.
 * NOTE(review): the cublasSgemm status is not checked.
 */
static void sgemm_cuda(void *descr[], void *args) { float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]); float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]); float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]); unsigned w = STARPU_MATRIX_GET_NY(descr[2]); unsigned h = STARPU_MATRIX_GET_NX(descr[2]); unsigned k = STARPU_MATRIX_GET_NY(descr[0]); unsigned lda = STARPU_MATRIX_GET_LD(descr[0]); unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]); unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]); struct sgemm_arg * arg = (struct sgemm_arg*)args; cublasSetStream(cublas_handle, starpu_cuda_get_local_stream()); cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, h, w, k, &arg->alpha, a, lda, b, ldb, &arg->beta, c, ldc); cudaStreamSynchronize(starpu_cuda_get_local_stream()); free(arg); }
/* Batched matrix-vector multiply: y = alpha*A^T*x + beta*y, either as one
 * SGEMM over the whole batch (when GEMV is disabled at compile time) or as
 * a single cublasSgemv call.
 * NOTE(review): the two paths are not obviously equivalent -- the SGEMM path
 * transposes both operands and uses ldb = batchSize for A, while the SGEMV
 * path uses lda = m. Confirm the intended storage layout of A and x for the
 * batch before relying on the DISABLE_GEMV variant.
 */
void matrixMulti(cublasHandle_t cublasHandle, int m, int n, int batchSize, float alpha, const float*A, const float*x, float beta, float *y) { #ifdef DISABLE_GEMV checkCublasErrors(cublasSgemm(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_T, n, batchSize, m, &alpha, x, m, A, batchSize, &beta, y, n)); #else checkCublasErrors(cublasSgemv(cublasHandle, CUBLAS_OP_T, m, n, &alpha, A, m, x, 1, &beta, y, 1)); #endif }
/* Backward data pass of a fully-connected layer:
 * input_errors = W * output_errors per entry (SGEMM, both operands
 * untransposed). beta selects whether the result overwrites the input-error
 * buffer (0) or accumulates into it (1, when add_update_to_destination is
 * set). cuBLAS is bound to stream_id first so the kernel is stream-ordered.
 */
void fully_connected_layer_updater_cuda::enqueue_backward_data_propagation( cudaStream_t stream_id, unsigned int input_index, cuda_linear_buffer_device::ptr input_errors_buffer, cuda_linear_buffer_device::const_ptr output_errors_buffer, const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data, const std::vector<cuda_linear_buffer_device::const_ptr>& data, const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom, const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers, cuda_linear_buffer_device::const_ptr output_neurons_buffer, const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data, cuda_linear_buffer_device::ptr temporary_working_fixed_buffer, cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer, cuda_linear_buffer_device::const_ptr temporary_fixed_buffer, cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer, bool add_update_to_destination, unsigned int entry_count) { cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id)); float alpha = 1.0F; float beta = (add_update_to_destination ? 1.0F : 0.0F); cublas_safe_call(cublasSgemm( cuda_config->get_cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, input_elem_count_per_entry_list[0], entry_count, output_elem_count_per_entry, &alpha, *data[0], input_elem_count_per_entry_list[0], *output_errors_buffer, output_elem_count_per_entry, &beta, *input_errors_buffer, input_elem_count_per_entry_list[0])); }
/* Float overload of the X-prefixed GEMM dispatcher: forwards directly to
 * cublasSgemm using the global context's cuBLAS handle and returns the
 * library status unchanged.
 */
cublasStatus_t cublasXgemm(cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float *alpha, const float *A, int lda, const float *B, int ldb, const float *beta, float *C, int ldc)
{
    cublasHandle_t handle = g_context->cublasHandle;
    return cublasSgemm(handle, transa, transb,
                       m, n, k,
                       alpha, A, lda,
                       B, ldb,
                       beta, C, ldc);
}
/* Regression test: multiplies a 898790 x 128 SIFT descriptor matrix (read
 * from the "mat-sift" fixture file) by 256 all-ones centers, once with the
 * custom MatMatMultF kernel (inputs padded to a TILESIZE*N_UNROLL_FLOAT
 * multiple) and once with cuBLAS SGEMM (row-major trick: centers passed
 * first), then compares every element with a 1e-6 relative tolerance.
 * NOTE(review): fread/fopen results are only partially checked -- a missing
 * fixture file makes fopen return NULL and fread crash; TODO guard it.
 * NOTE(review): the inner `float err` shadows the outer `int err` from
 * MatMatMultF -- legal but easy to misread.
 */
TEST_F(MatMultTests, SIFT) { string fileName("mat-sift"); int m = 898790; int n = 128; int k = 256; /* allocate data */ float * data = (float *)malloc(m*n*sizeof(float)); float * centers = (float *)malloc(k*n*sizeof(float)); float * result = (float *)malloc(m*k*sizeof(float)); float * resultCublas = (float *)malloc(m*k*sizeof(float)); /* read matrix from file */ FILE * fid = fopen(fileName.c_str(), "rb"); int nread = fread(data, sizeof(float), m*n, fid); ASSERT_EQ(nread, m*n); fclose(fid); /* initialize centers to 1 */ for (int i = 0; i<k*n; ++i) centers[i] = (float)1; /* allocate device space for the various arrays */ float * dev_data, *dev_centers, *dev_result; int factor = TILESIZE*N_UNROLL_FLOAT; int m_padded = ((m + factor - 1) / factor)*factor; int nBytes = m_padded*n*sizeof(float); cudaMalloc((void**)&dev_data, nBytes); cudaMemset(dev_data, 0, nBytes); cudaMemcpy(dev_data, data, m*n*sizeof(float), cudaMemcpyHostToDevice); nBytes = n*k*sizeof(float); cudaMalloc((void**)&dev_centers, nBytes); cudaMemcpy(dev_centers, centers, nBytes, cudaMemcpyHostToDevice); nBytes = m*k*sizeof(float); cudaMalloc((void**)&dev_result, nBytes); cudaMemset(dev_result, 0, nBytes); /* run MatMatMultF */ int err = MatMatMultF(m, n, dev_data, k, dev_centers, dev_result); if (err) printf("Error int MatMatMultF for mat-sift\n"); cudaMemcpy(result, dev_result, nBytes, cudaMemcpyDeviceToHost); /* run CUBLAS SGEMM */ float one = 1.f; float zero = 0.f; cublasHandle_t handle; cublasCreate(&handle); cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, k, m, n, (const float *)&one, (const float *)dev_centers, k, (const float *)dev_data, n, (const float *)&zero, (float *)dev_result, k); cudaMemcpy(resultCublas, dev_result, nBytes, cudaMemcpyDeviceToHost); #if 1 /* check results */ int maxPrintErrors=10; int numPrintErrors=0; for (int i = 0; i < m; ++i) { for (int j = 0; j < k; ++j) { int index = i*k + j; if (result[index] == 0 && resultCublas[index] == 0) continue; else { float err =
fabs(result[index] - resultCublas[index]) / fabs(result[index]); if (err >= 1.e-6 || result[index] == 0) { printf("i=%d, j=%d : %1.5g, %1.5g, err=%1.5g\n", i, j, result[index], resultCublas[index], err); if (numPrintErrors<maxPrintErrors) { numPrintErrors++; EXPECT_LT(err, 1.e-6); } else { ASSERT_LT(err, 1.e-6); } } } } } #endif /* free data */ if (dev_data) cudaFree(dev_data); if (dev_centers) cudaFree(dev_centers); if (dev_result) cudaFree(dev_result); if (data) free(data); if (centers) free(centers); if (result) free(result); if (resultCublas) free(resultCublas); cublasDestroy(handle); }
/* MAGMA sgemm benchmark/tester.
 * For every (M, N, K) requested on the command line it:
 *   1. generates random A, B, C with LAPACK slarnv,
 *   2. times magmablas_sgemm (CUDA builds only),
 *   3. times the vendor GEMM (cublasSgemm on CUDA, magma_sgemm elsewhere),
 *   4. optionally times CPU blasf77_sgemm (--lapack, implied by --check on
 *      non-CUDA builds),
 *   5. reports Gflop/s and relative Frobenius-norm errors; each error is
 *      measured against the CPU result when available, otherwise the MAGMA
 *      result is compared against the vendor result.
 * Returns the number of (M,N,K) cases whose error exceeded the tolerance.
 */
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgemm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time; float magma_error, dev_error, Cnorm, work[1]; magma_int_t M, N, K; magma_int_t Am, An, Bm, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; float *h_A, *h_B, *h_C, *h_Cmagma, *h_Cdev; magmaFloat_ptr d_A, d_B, d_C; float c_neg_one = MAGMA_S_NEG_ONE; float alpha = MAGMA_S_MAKE( 0.29, -0.86 ); float beta = MAGMA_S_MAKE( -0.48, 0.38 ); magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); #ifdef HAVE_CUBLAS // for CUDA, we can check MAGMA vs. CUBLAS, without running LAPACK printf("If running lapack (option --lapack), MAGMA and %s error are both computed\n" "relative to CPU BLAS result. Else, MAGMA error is computed relative to %s result.\n\n", g_platform_str, g_platform_str ); printf("transA = %s, transB = %s\n", lapack_trans_const(opts.transA), lapack_trans_const(opts.transB) ); printf(" M N K MAGMA Gflop/s (ms) %s Gflop/s (ms) CPU Gflop/s (ms) MAGMA error %s error\n", g_platform_str, g_platform_str ); #else // for others, we need LAPACK for check opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf("transA = %s, transB = %s\n", lapack_trans_const(opts.transA), lapack_trans_const(opts.transB) ); printf(" M N K %s Gflop/s (ms) CPU Gflop/s (ms) %s error\n", g_platform_str, g_platform_str ); #endif printf("=========================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; K = opts.ksize[itest]; gflops = FLOPS_SGEMM( M, N, K ) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = Am
= M; An = K; } else { lda = Am = K; An = M; } if ( opts.transB == MagmaNoTrans ) { ldb = Bm = K; Bn = N; } else { ldb = Bm = N; Bn = K; } ldc = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*An; sizeB = ldb*Bn; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, float, lda*An ); TESTING_MALLOC_CPU( h_B, float, ldb*Bn ); TESTING_MALLOC_CPU( h_C, float, ldc*N ); TESTING_MALLOC_CPU( h_Cmagma, float, ldc*N ); TESTING_MALLOC_CPU( h_Cdev, float, ldc*N ); TESTING_MALLOC_DEV( d_A, float, ldda*An ); TESTING_MALLOC_DEV( d_B, float, lddb*Bn ); TESTING_MALLOC_DEV( d_C, float, lddc*N ); /* Initialize the matrices */ lapackf77_slarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_slarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_slarnv( &ione, ISEED, &sizeC, h_C ); magma_ssetmatrix( Am, An, h_A, lda, d_A, 0, ldda, opts.queue ); magma_ssetmatrix( Bm, Bn, h_B, ldb, d_B, 0, lddb, opts.queue ); /* ===================================================================== Performs operation using MAGMABLAS (currently only with CUDA) =================================================================== */ #ifdef HAVE_CUBLAS magma_ssetmatrix( M, N, h_C, ldc, d_C, lddc ); magma_time = magma_sync_wtime( NULL ); magmablas_sgemm( opts.transA, opts.transB, M, N, K, alpha, d_A, ldda, d_B, lddb, beta, d_C, lddc ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_sgetmatrix( M, N, d_C, lddc, h_Cmagma, ldc ); #endif /* ===================================================================== Performs operation using CUBLAS / clBLAS / Xeon Phi MKL =================================================================== */ magma_ssetmatrix( M, N, h_C, ldc, d_C, 0, lddc, opts.queue ); #ifdef HAVE_CUBLAS dev_time = magma_sync_wtime( NULL ); cublasSgemm( opts.handle, cublas_trans_const(opts.transA), cublas_trans_const(opts.transB), M, N, K, &alpha, d_A, ldda, d_B, lddb, &beta, d_C, lddc ); dev_time = magma_sync_wtime( NULL ) - dev_time; #else dev_time
= magma_sync_wtime( opts.queue ); magma_sgemm( opts.transA, opts.transB, M, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, opts.queue ); dev_time = magma_sync_wtime( opts.queue ) - dev_time; #endif dev_perf = gflops / dev_time; magma_sgetmatrix( M, N, d_C, 0, lddc, h_Cdev, ldc, opts.queue ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_sgemm( lapack_trans_const(opts.transA), lapack_trans_const(opts.transB), &M, &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & dev, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_slange( "F", &M, &N, h_C, &ldc, work ); blasf77_saxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cdev, &ione ); dev_error = lapackf77_slange( "F", &M, &N, h_Cdev, &ldc, work ) / Cnorm; #ifdef HAVE_CUBLAS blasf77_saxpy( &sizeC, &c_neg_one, h_C, &ione, h_Cmagma, &ione ); magma_error = lapackf77_slange( "F", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, magma_error, dev_error, (magma_error < tol && dev_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && dev_error < tol); #else printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, (int) K, dev_perf, 1000.*dev_time, cpu_perf, 1000.*cpu_time, dev_error, (dev_error < tol ? "ok" : "failed")); status += !
(dev_error < tol); #endif } else { #ifdef HAVE_CUBLAS // compute relative error for magma, relative to dev (currently only with CUDA) Cnorm = lapackf77_slange( "F", &M, &N, h_Cdev, &ldc, work ); blasf77_saxpy( &sizeC, &c_neg_one, h_Cdev, &ione, h_Cmagma, &ione ); magma_error = lapackf77_slange( "F", &M, &N, h_Cmagma, &ldc, work ) / Cnorm; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e --- %s\n", (int) M, (int) N, (int) K, magma_perf, 1000.*magma_time, dev_perf, 1000.*dev_time, magma_error, (magma_error < tol ? "ok" : "failed")); status += ! (magma_error < tol); #else printf("%5d %5d %5d %7.2f (%7.2f) --- ( --- ) ---\n", (int) M, (int) N, (int) K, dev_perf, 1000.*dev_time ); #endif } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Cmagma ); TESTING_FREE_CPU( h_Cdev ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* Multiplies two double-precision host matrices on the GPU in single
 * precision: A and B are converted/copied into padded max_dim x max_dim
 * float buffers (via the `work` scratch matrix), multiplied with SGEMM
 * (both operands transposed), and the result is copied back with a
 * transposing copy into `result`.
 * Return codes: 1 success; 0 shape mismatch; -1 handle creation failed;
 * -2/-11 device allocation failed (with/without cleanup succeeding);
 * -3/-12 host->device copy failed; -4/-13 SGEMM failed; -5/-14 device->host
 * copy failed; -15 a final cudaFree failed; -7 cublasDestroy failed.
 * NOTE(review): the OP_T/OP_T call computes (A^T B^T) in column-major terms;
 * together with the transposing copy-back this is the row-major product --
 * confirm against float_copyMatrixGPU2Host_Transpose's semantics.
 */
int cublas_multiply_matrix_float2( PGM_Matriz_Double *A, PGM_Matriz_Double *B, PGM_Matriz_Double *result, PGM_Matriz_Float *work){ cublasStatus_t status; cublasHandle_t handle; int max_dim = work->n_linhas; PGM_Matriz_GPU device_A, device_B, device_C; float alpha = 1.0, beta = 0.0; if (A->n_linhas != result->n_linhas || B->n_colunas != result->n_colunas){ return 0; } status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS){ return -1; } if(create_PGM_Matriz_GPU_float(&device_A,A->n_linhas, A->n_colunas,max_dim) != cudaSuccess){ return -2; } if(create_PGM_Matriz_GPU_float(&device_B,B->n_linhas, B->n_colunas,max_dim) != cudaSuccess){ if(cudaFree(device_A.valor) == cudaSuccess) return -2; else return -11; } if(create_PGM_Matriz_GPU_float(&device_C,result->n_linhas, result->n_colunas,max_dim) != cudaSuccess){ if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess) return -2; else return -11; } if(float_copyMatrixHost2GPU(A,work,&device_A) != CUBLAS_STATUS_SUCCESS){ if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess && cudaFree(device_C.valor) == cudaSuccess) return -3; else return -12; } if(float_copyMatrixHost2GPU(B,work,&device_B) != CUBLAS_STATUS_SUCCESS){ if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess && cudaFree(device_C.valor) == cudaSuccess) return -3; else return -12; } if(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, max_dim,max_dim,max_dim, &alpha,(float*) device_A.valor,max_dim,(float*)device_B.valor, max_dim, &beta, (float*)device_C.valor, max_dim) != CUBLAS_STATUS_SUCCESS){ if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess && cudaFree(device_C.valor) == cudaSuccess) return -4; else return -13; } if(float_copyMatrixGPU2Host_Transpose(result, &device_C, work) != CUBLAS_STATUS_SUCCESS){ if(cudaFree(device_A.valor) == cudaSuccess && cudaFree(device_B.valor) == cudaSuccess && cudaFree(device_C.valor) ==
cudaSuccess) return -5; else return -14; } if(cudaFree(device_A.valor) != cudaSuccess){ return -15; } if(cudaFree(device_B.valor) != cudaSuccess){ return -15; } if(cudaFree(device_C.valor) != cudaSuccess){ return -15; } if(cublasDestroy(handle) != CUBLAS_STATUS_SUCCESS){ return -7; } return 1; }
/*
 * Batched single-precision GEMM: for each i in [0, batchCount),
 *   C[i] = alpha * op(A[i]) * op(B[i]) + beta * C[i],
 * where each operand is a gpudata buffer plus an element offset.
 *
 * Two dispatch strategies are used: for large per-problem sizes a loop
 * of individual cublasSgemm calls, otherwise a single
 * cublasSgemmBatched call driven by a device-side pointer array.
 *
 * Returns GA_NO_ERROR on success, GA_DEVSUP_ERROR on an architecture
 * mismatch reported by cuBLAS, GA_BLAS_ERROR on any other cuBLAS error.
 */
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) {
  cuda_context *ctx;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  cb_transpose transT;
  cublasStatus_t err;
  /* Nothing to do for an empty batch. */
  if (batchCount == 0) return GA_NO_ERROR;
  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  cuda_enter(ctx);
  if (order == cb_c) {
    /* swap A and B: a row-major (cb_c) request is rewritten as the
     * equivalent column-major problem by exchanging the operands,
     * the M/N dimensions, the leading dimensions, the transpose
     * flags and the offset arrays. */
    t = N; N = M; M = t;
    T = A; A = B; B = T;
    t = lda; lda = ldb; ldb = t;
    transT = transA; transA = transB; transB = transT;
    lt = offA; offA = offB; offB = lt;
  }
  // use parallel cublasSgemm calls rather than cublasSgemmBatched for large products
  const size_t threshold = 650;
  const int multiple_dispatch = M * N * K > threshold * threshold * threshold;
  if (multiple_dispatch) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      /* Order the GEMM after any pending work on these buffers. */
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      err = cublasSgemm(((blas_handle *)ctx->blas_handle)->h, convT(transA), convT(transB), M, N, K, &alpha, (float*)A[i]->ptr + offA[i], lda, (float*)B[i]->ptr + offB[i], ldb, &beta, (float*)C[i]->ptr + offC[i], ldc);
      if (err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }
      /* Publish the completion events for later consumers. */
      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  } else {
    /* Build one host array holding the A, B and C pointer lists
     * back to back, then mirror it to the device for the batched call.
     * NOTE(review): alloca is sized by batchCount — very large batches
     * could overflow the stack. */
    float **T_l = alloca(sizeof(float *) * batchCount * 3);
    const float **A_l = (const float **)T_l;
    const float **B_l = (const float **)T_l + batchCount;
    float **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = ((float *)A[i]->ptr) + offA[i];
      B_l[i] = ((float *)B[i]->ptr) + offB[i];
      C_l[i] = ((float *)C[i]->ptr) + offC[i];
    }
    /* NOTE(review): the return values of cuMemAlloc and cuMemcpyHtoD
     * are not checked; a failed allocation would surface only as a
     * cuBLAS error (or worse) below. */
    cuMemAlloc(&Ta, sizeof(float *) * batchCount * 3);
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(float *));
    Ca = Ta + (batchCount * sizeof(float *) * 2);
    cuMemcpyHtoD(Ta, T_l, sizeof(float *) * batchCount * 3);
    err = cublasSgemmBatched(((blas_handle *)ctx->blas_handle)->h, convT(transA), convT(transB), M, N, K, &alpha, (const float **)Aa, lda, (const float **)Ba, ldb, &beta, (float **)Ca, ldc, batchCount);
    cuMemFree(Ta);
    if (err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }
    for (i = 0; i < batchCount; i++) {
      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  }
  cuda_exit(ctx);
  return GA_NO_ERROR;
}
/*
 * Trains `net` with gradient descent + backpropagation using sigmoid
 * activations on every layer.
 *
 * batch == 0 trains on all of `inputs` each iteration; otherwise a
 * random contiguous block of `batch` columns is used (the column slice
 * arithmetic below implies one sample per column, column-contiguous
 * storage — TODO confirm against micronn_matrix's layout).
 * Training stops after max_iters iterations (0 = unlimited) or when the
 * measured error drops below min_error; error/diff are only recomputed
 * every echo_iters iterations. Always returns 1.
 *
 * Side effects: micronn_matrix_add_ones(inputs) mutates the caller's
 * input matrix (presumably appending a bias row — verify).
 */
uint micronn_train(micronn* net, micronn_matrix* inputs, micronn_matrix* targets, uint batch, float eta, float momentum, uint max_iters, float min_error, uint echo_iters) {
    int j;
    uint i, index, diff;
    /* NOTE(review): `error` is a float initialized from DBL_MAX; the
     * narrowing conversion overflows float range — FLT_MAX would be the
     * matching sentinel. It only serves as "no error measured yet". */
    float error = DBL_MAX, alpha = 1.0, beta = 0.0, one = 1.0;
    micronn_matrix* tmp, *y;
    /* Per-layer work arrays: delta/grad have one entry per weight
     * matrix, `a` additionally holds the input as a[0].
     * NOTE(review): these three pointer arrays are never free()d. */
    micronn_matrix** delta = malloc(sizeof(micronn_matrix*) * (net->nhidden + 1));
    micronn_matrix** grad = malloc(sizeof(micronn_matrix*) * (net->nhidden + 1));
    micronn_matrix** a = malloc(sizeof(micronn_matrix*) * (net->nhidden + 2));
    //micronn_matrix** z = malloc(sizeof(micronn_matrix*) * (net->nhidden + 1));
    //calloc grad
    /* Fold the learning-rate/sample-count scaling into the gradient
     * GEMM's alpha. */
    alpha = 2 * eta / (batch == 0 ? inputs->cols : batch);
    for(i = 0; i <= net->nhidden; i++) {
        grad[i] = micronn_matrix_alloc(net->weights[i]->rows, net->weights[i]->cols);
        micronn_matrix_set_val(grad[i], 0.0);
    }
    micronn_matrix_add_ones(inputs);
    for(i = 0; (max_iters == 0 || i < max_iters) && error > min_error; i++) {
        if(batch == 0) {
            a[0] = inputs;
            y = targets;
        } else {
            /* Pick a random contiguous block of `batch` columns and
             * build shallow header-only views into inputs/targets —
             * they share devPtrvals with the originals and are released
             * with plain free() below, not micronn_matrix_free. */
            index = rand() % (inputs->cols - batch + 1);
            a[0] = malloc(sizeof(micronn_matrix));
            a[0]->rows = inputs->rows;
            a[0]->cols = batch;
            a[0]->devPtrvals = inputs->devPtrvals + index * inputs->rows;
            y = malloc(sizeof(micronn_matrix));
            y->rows = targets->rows;
            y->cols = batch;
            y->devPtrvals = targets->devPtrvals + index * targets->rows;
        }
        //forward and save the outputs of layers
        for(j = 0; j < net->nhidden + 1; j++) {
            if(j > 0) {
                micronn_matrix_add_ones(a[j]);
            }
            tmp = micronn_matrix_dot(net->handle, CUBLAS_OP_N, CUBLAS_OP_N, 1.0, net->weights[j], a[j]);
            //z[j] = tmp;
            a[j + 1] = micronn_matrix_sigmoid(tmp);
            micronn_matrix_free(tmp);
        }
        //calculate error (only every echo_iters iterations, so the stop
        //condition reacts with the same granularity)
        if(echo_iters != 0 && i % echo_iters == 0) {
            error = micronn_error(net, inputs, targets, NULL);//a[net->nhidden + 1]);
            diff = micronn_diff(net, inputs, targets, NULL);//a[net->nhidden + 1]);
            printf("\x1B[32miteration \x1B[0m%d\t\t\x1B[31merror: \x1B[0m%.10f\t\t\x1B[35mdiff: \x1B[0m%d/%d\n", i, error, diff, inputs->cols);
        }
        //last delta = (a[last] - y) * f'(z[last])
        delta[net->nhidden] = micronn_matrix_copy(a[net->nhidden + 1]);
        micronn_matrix_sub(delta[net->nhidden], y);
        micronn_matrix_deriv_sigmoid(a[net->nhidden + 1], delta[net->nhidden]);
        //other delta[i] = (W[i])'delta[i+1] * f'(z[i]): backpropagate
        //through the transposed weights with cuBLAS (beta == 0.0
        //overwrites the freshly allocated delta[j])
        for(j = net->nhidden - 1; j >= 0; j--) {
            delta[j] = micronn_matrix_alloc(net->weights[j + 1]->cols, delta[j + 1]->cols);
            cublasSgemm(net->handle, CUBLAS_OP_T, CUBLAS_OP_N, net->weights[j + 1]->cols, delta[j + 1]->cols, net->weights[j + 1]->rows, &one, net->weights[j + 1]->devPtrvals, net->weights[j + 1]->rows, delta[j + 1]->devPtrvals, delta[j + 1]->rows, &beta, delta[j]->devPtrvals, delta[j]->rows);
            //delta[i] *= f'(z[i+1])
            micronn_matrix_deriv_sigmoid(a[j + 1], delta[j]);
        }
        //compute grad[i] = delta[i+1](a[i])' + momentum*grad[i] and add to weights[i] -= eta/N*grad[i]
        //(alpha carries the 2*eta/N factor; momentum is cuBLAS's beta,
        //so the previous gradient is scaled in place)
        for(j = net->nhidden; j >= 0; j--) {
            //delete the last row from deltas to have correct size of grad
            if(j < net->nhidden) {
                micronn_matrix_remove_last_row(delta[j]);
            }
            cublasSgemm(net->handle, CUBLAS_OP_N, CUBLAS_OP_T, delta[j]->rows, a[j]->rows, a[j]->cols, &alpha, delta[j]->devPtrvals, delta[j]->rows, a[j]->devPtrvals, a[j]->rows, &momentum, grad[j]->devPtrvals, grad[j]->rows);
            micronn_matrix_sub(net->weights[j], grad[j]);
        }
        //release the per-iteration views and activations (a[0]/y are
        //shallow headers, so only the structs are freed)
        if(batch != 0) {
            free(a[0]);
            free(y);
        }
        for(j = 1; j < net->nhidden + 2; j++) {
            micronn_matrix_free(a[j]);
        }
        for(j = 0; j <= net->nhidden; j++) {
            micronn_matrix_free(delta[j]);
        }
    }
    for(i = 0; i <= net->nhidden; i++) {
        micronn_matrix_free(grad[i]);
    }
    return 1;
};
/* Main */ int test_cublas(void) { cublasStatus status; cudaError_t e; float* h_A; float* h_B; float* h_C; float* h_C_ref; float* d_A = 0; void *vp; float* d_B = 0; float* d_C = 0; float alpha = 1.0f; float beta = 0.0f; int n2 = N * N; int i; float error_norm; float ref_norm; float diff; /* Initialize CUBLAS */ printf("simpleCUBLAS test running..\n"); status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate host memory for the matrices */ h_A = (float*)malloc(n2 * sizeof(h_A[0])); if (h_A == 0) { fprintf (stderr, "!!!! host memory allocation error (A)\n"); return EXIT_FAILURE; } h_B = (float*)malloc(n2 * sizeof(h_B[0])); if (h_B == 0) { fprintf (stderr, "!!!! host memory allocation error (B)\n"); return EXIT_FAILURE; } h_C = (float*)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = rand() / (float)RAND_MAX; h_B[i] = rand() / (float)RAND_MAX; h_C[i] = rand() / (float)RAND_MAX; } /* Allocate device memory for the matrices */ if (cudaMalloc(&vp, n2 * sizeof(d_A[0])) != cudaSuccess) { fprintf (stderr, "!!!! device memory allocation error (A)\n"); return EXIT_FAILURE; } d_A = (float *) vp; if (cudaMalloc(&vp, n2 * sizeof(d_B[0])) != cudaSuccess) { fprintf (stderr, "!!!! device memory allocation error (B)\n"); return EXIT_FAILURE; } d_B = (float *) vp; if (cudaMalloc(&vp, n2 * sizeof(d_C[0])) != cudaSuccess) { fprintf (stderr, "!!!! device memory allocation error (C)\n"); return EXIT_FAILURE; } d_C = (float *) vp; /* Initialize the device matrices with the host matrices */ status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! 
device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write B)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write C)\n"); return EXIT_FAILURE; } /* Performs operation using plain C code */ simple_sgemm(N, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* Clear last error */ cublasGetError(); /* Performs operation using cublas */ cublasSgemm('n', 'n', N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Allocate host memory for reading back the result from device memory */ h_C = (float*)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (read C)\n"); return EXIT_FAILURE; } /* Check result against reference */ error_norm = 0; ref_norm = 0; for (i = 0; i < n2; ++i) { diff = h_C_ref[i] - h_C[i]; error_norm += diff * diff; ref_norm += h_C_ref[i] * h_C_ref[i]; } error_norm = (float)sqrt((double)error_norm); ref_norm = (float)sqrt((double)ref_norm); if (fabs(ref_norm) < 1e-7) { fprintf (stderr, "!!!! reference norm is 0\n"); return EXIT_FAILURE; } printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED"); /* Memory clean up */ free(h_A); free(h_B); free(h_C); free(h_C_ref); e = cudaFree(d_A); if (e != cudaSuccess) { fprintf (stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; } e = cudaFree(d_B); if (e != cudaSuccess) { fprintf (stderr, "!!!! 
memory free error (B)\n"); return EXIT_FAILURE; } e = cudaFree(d_C); if (e != cudaSuccess) { fprintf (stderr, "!!!! memory free error (C)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasShutdown(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } return EXIT_SUCCESS; }
/* Main */ int main(int argc, char **argv) { cublasStatus_t status; float *h_A; float *h_B; float *h_C; float *h_C_ref; float *d_A = 0; float *d_B = 0; float *d_C = 0; float alpha = 1.0f; float beta = 0.0f; int n2 = N * N; int i; float error_norm; float ref_norm; float diff; cublasHandle_t handle; int dev = findCudaDevice(argc, (const char **) argv); if (dev == -1) { return EXIT_FAILURE; } /* Initialize CUBLAS */ printf("simpleCUBLAS test running..\n"); status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate host memory for the matrices */ h_A = (float *)malloc(n2 * sizeof(h_A[0])); if (h_A == 0) { fprintf(stderr, "!!!! host memory allocation error (A)\n"); return EXIT_FAILURE; } h_B = (float *)malloc(n2 * sizeof(h_B[0])); if (h_B == 0) { fprintf(stderr, "!!!! host memory allocation error (B)\n"); return EXIT_FAILURE; } h_C = (float *)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = rand() / (float)RAND_MAX; h_B[i] = rand() / (float)RAND_MAX; h_C[i] = rand() / (float)RAND_MAX; } /* Allocate device memory for the matrices */ if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate A)\n"); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate B)\n"); return EXIT_FAILURE; } if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess) { fprintf(stderr, "!!!! device memory allocation error (allocate C)\n"); return EXIT_FAILURE; } /* Initialize the device matrices with the host matrices */ status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! 
device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write B)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (write C)\n"); return EXIT_FAILURE; } /* Performs operation using plain C code */ simple_sgemm(N, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* Performs operation using cublas */ status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Allocate host memory for reading back the result from device memory */ h_C = (float *)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf(stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! device access error (read C)\n"); return EXIT_FAILURE; } /* Check result against reference */ error_norm = 0; ref_norm = 0; for (i = 0; i < n2; ++i) { diff = h_C_ref[i] - h_C[i]; error_norm += diff * diff; ref_norm += h_C_ref[i] * h_C_ref[i]; } error_norm = (float)sqrt((double)error_norm); ref_norm = (float)sqrt((double)ref_norm); if (fabs(ref_norm) < 1e-7) { fprintf(stderr, "!!!! reference norm is 0\n"); return EXIT_FAILURE; } /* Memory clean up */ free(h_A); free(h_B); free(h_C); free(h_C_ref); if (cudaFree(d_A) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; } if (cudaFree(d_B) != cudaSuccess) { fprintf(stderr, "!!!! memory free error (B)\n"); return EXIT_FAILURE; } if (cudaFree(d_C) != cudaSuccess) { fprintf(stderr, "!!!! 
memory free error (C)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasDestroy(handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf(stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } printf("CUBLAS program finished\n"); exit(error_norm / ref_norm < 1e-6f ? EXIT_SUCCESS : EXIT_FAILURE); }