void Caffe::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) { return; } // The call to cudaSetDevice must come before any calls to Get, which // may perform initialization using the GPU. CUDA_CHECK(cudaSetDevice(device_id)); if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); if (Get().cusparse_descr_)CUSPARSE_CHECK(cusparseDestroyMatDescr(Get().cusparse_descr_)); if (Get().cusparse_handle_)CUSPARSE_CHECK(cusparseDestroy(Get().cusparse_handle_)); if (Get().curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); } CUSPARSE_CHECK(cusparseCreate(&Get().cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&Get().cusparse_descr_)); // cusparseSetMatType(cusparse_descr_,CUSPARSE_MATRIX_TYPE_GENERAL); // cusparseSetMatIndexBase(cusparse_descr_,CUSPARSE_INDEX_BASE_ZERO); LOG(INFO)<<"set descr"; CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); }
// Destroys a cuBLAS handle, raising CudaException if teardown fails.
void CudaUtil::cublasClose(cublasHandle_t handle) {
  const cublasStatus_t rc = cublasDestroy(handle);
  if (rc != CUBLAS_STATUS_SUCCESS) {
    throw CudaException("CUBALS destroy error");
  }
}
// Chained cuBLAS demo on OpenACC device-resident data:
//   C = A^T * B^T (gemm), r = <C,B> (dot), C -= r*B (axpy),
//   nrmC = ||C|| (nrm2), D = C (copy), then D scaled by 1/||C|| (scal).
// NOTE(review): r and nrmC are host pointers; with cuBLAS's default
// host pointer mode Ddot/Dnrm2 block until the result is written, so
// reading *r / *nrmC immediately afterwards appears safe — confirm the
// pointer mode is never changed elsewhere.
void gpu_cublas1(double *A, double *B, double *C, double *D, double *r,
                 double *nrmC, int N, int N2) {
#pragma acc data present(A, B, C, D)
  {
#pragma acc host_data use_device(A, B, C, D)
    {
      cublasHandle_t handle;
      cublasCreate(&handle);
      const double alpha = 1.0;
      const double beta = 0.0;
      // C = alpha * A^T * B^T + beta * C (column-major, N x N)
      cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N, &alpha, A, N,
                  B, N, &beta, C, N);
      printf(" gpu gemm success \n");
      cublasDdot(handle, N2, C, 1, B, 1, r);
      printf(" gpu dot success \n");
      *r = -1.0 * *r;  // negate so the following axpy subtracts r*B
      cublasDaxpy(handle, N2, r, B, 1, C, 1);
      printf(" gpu axpy success \n");
      cublasDnrm2(handle, N2, C, 1, nrmC);
      printf(" gpu nrm2 success \n");
      cublasDcopy(handle, N2, C, 1, D, 1);
      printf(" gpu copy success \n");
      *nrmC = 1.0 / *nrmC;  // reciprocal so scal normalizes D in place
      cublasDscal(handle, N2, nrmC, D, 1);
      printf(" gpu scal success \n");
      cublasDestroy(handle);
      printf(" gpu destroy success \n");
    }
  }
}
// Releases every CUDA library resource held by this Caffe instance:
// cuSPARSE matrix descriptor and handle, cuBLAS handle, cuRAND generator.
Caffe::~Caffe() {
  if (cusparse_descr_) {
    CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_descr_));
  }
  if (cublas_handle_) {
    CUBLAS_CHECK(cublasDestroy(cublas_handle_));
  }
  if (cusparse_handle_) {
    CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_));
  }
  if (curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(curand_generator_));
  }
}
// Makes this device current, then destroys each lane's cuDNN handle,
// cuBLAS handle, and stream.
GpuDevice::Impl::~Impl() {
  ActivateDevice();
  for (size_t lane = 0; lane < kParallelism; ++lane) {
    CUDNN_CALL(cudnnDestroy(cudnn_handle[lane]));
    CUBLAS_CALL(cublasDestroy(cublas_handle[lane]));
    CUDA_CALL(cudaStreamDestroy(stream[lane]));
  }
}
// Drops the library handles (if created), then resets the device.
cuda_running_configuration::~cuda_running_configuration() {
  if (cublas_handle) {
    cublasDestroy(cublas_handle);
  }
  if (cusparse_handle) {
    cusparseDestroy(cusparse_handle);
  }
  cudaDeviceReset();
}
// Contracts two length-n tensor trains TT1 and TT2, summing the scalar
// contraction over every pair (i, j) of their boundary free indices,
// then prints timing/throughput and the accumulated result.
// NOTE(review): this function calls exit(0) — it never returns.
void contractTT(sTensorGPU *TT1, sTensorGPU *TT2, const int n, const int size) {
  cublasHandle_t handle;
  cublasCreate(&handle);
  type result = 0;
  // Scratch tensors reused by every contraction step.
  sTensorGPU temp1 = emptyTensor(size*size, 2);
  sTensorGPU temp2 = emptyTensor(size*size*2, 3);
  cudaEvent_t start;
  cudaEventCreate(&start);
  cudaEvent_t stop;
  cudaEventCreate(&stop);
  //printf("Start contractTT\n");
  cudaEventRecord(start, NULL);
  int indA = TT1[0].size[0];
  int indB = TT2[0].size[0];
  // Host copies of the boundary cores so each (i, j) slice can be
  // re-prepared from the unmodified originals.
  sTensorCPU tt1start = copyToCPU(TT1[0]);
  sTensorCPU tt2start = copyToCPU(TT2[0]);
  sTensorCPU tt1end = copyToCPU(TT1[n - 1]);
  sTensorCPU tt2end = copyToCPU(TT2[n - 1]);
  for (int i = 0; i < indA; i++) {
    TT1[0] = prepareTensorStart(tt1start, i);
    TT1[n - 1] = prepareTensorEnd(tt1end, i);
    for (int j = 0; j < indB; j++) {
      TT2[0] = prepareTensorStart(tt2start, j);
      TT2[n - 1] = prepareTensorEnd(tt2end, j);
      contractTensor(handle, TT1[0], TT2[0], temp1);
      // NOTE(review): this inner loop variable shadows the outer `i`;
      // it walks the remaining cores 1..n-1 — confirm intentional.
      for (int i = 1; i < n; i++) {
        contractTensor(handle, temp1, TT1[i], temp2);
        contractTensor(handle, temp2, TT2[i], temp1, 2);
      }
      type add = 0;
      // Blocking D2H copy of the scalar also synchronizes the device.
      cudaMemcpy(&add, temp1.deviceData, sizeof(type), cudaMemcpyDeviceToHost);
      //printf("%e ", add);
      result += add;
    }
  }
  cudaEventRecord(stop, NULL);
  cudaEventSynchronize(stop);
  float msecTotal = 0.0f;
  cudaEventElapsedTime(&msecTotal, start, stop);
  printf("Time: %.3fms\n", msecTotal);
  // NOTE(review): `bops` is not defined in this view — presumably a
  // global operation counter maintained by contractTensor; verify.
  printf("Ops: %.0f\n", bops);
  double gigaFlops = (bops * 1.0e-9f) / (msecTotal / 1000.0f);
  printf("Perf= %.2f GFlop/s\n", gigaFlops);
  cublasDestroy(handle);
  cudaDeviceReset();
  printf("%.5e \n", result);
  exit(0);
}
// Frees the cuBLAS handle, cuRAND generator, and MKL VSL stream
// owned by this instance, skipping any that were never created.
Caffe::~Caffe() {
  if (cublas_handle_) {
    CUBLAS_CHECK(cublasDestroy(cublas_handle_));
  }
  if (curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(curand_generator_));
  }
  if (vsl_stream_) {
    VSL_CHECK(vslDeleteStream(&vsl_stream_));
  }
}
// Drains pending work on this device, then destroys each lane's
// cuDNN/cuBLAS handles and stream before freeing the data store.
GpuDevice::~GpuDevice() {
  CUDA_CALL(cudaSetDevice(device_));
  pool_.WaitForAllFinished();
  for (size_t lane = 0; lane < kParallelism; ++lane) {
    CUDNN_CALL(cudnnDestroy(cudnn_handle_[lane]));
    CUBLAS_CALL(cublasDestroy(cublas_handle_[lane]));
    CUDA_CALL(cudaStreamDestroy(stream_[lane]));
  }
  delete data_store_;
}
// Clears device contexts first so dependent memory blocks are freed
// before the library handles they may rely on, then (CUDA builds only)
// destroys the cuBLAS handle and cuRAND generator.
Caffe::~Caffe() {
  device_contexts_.clear();
#ifdef USE_CUDA
  if (cublas_handle_) {
    CUBLAS_CHECK(cublasDestroy(cublas_handle_));
  }
  if (curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(curand_generator_));
  }
#endif  // USE_CUDA
}
/* Frees every resource owned by a micronn network — weight matrices,
 * hidden-layer bookkeeping, cuBLAS handle, and the struct itself.
 * Always returns 1. */
uint micronn_free(micronn* net) {
    uint layer;
    for (layer = 0; layer <= net->nhidden; layer++) {
        micronn_matrix_free(net->weights[layer]);
    }
    free(net->chidden);
    free(net->weights);
    cublasDestroy(net->handle);
    free(net);
    return 1;
}
void dot_gpu(double *x, double *y, double *result, int N) { #pragma acc data present(x, y) { #pragma acc host_data use_device(x, y) { cublasHandle_t h; cublasCreate(&h); cublasDdot(h, N, x, 1, y, 1, result); cublasDestroy(h); } } }
// Frees scratch device buffers and library handles, then resets the
// device and reports completion.
CUDAManager::~CUDAManager() {
  if (m_tempBuffer) {
    cudaFree(m_tempBuffer);
  }
  cudaFree(m_tempRetBuffer);
#ifdef USE_CUSPARSE
  if (cusparseHandle) {
    cusparseDestroy(cusparseHandle);
  }
#endif
  if (cublasHandle) {
    cublasDestroy(cublasHandle);
  }
  cudaDeviceReset();
  cout << "Cleaned up CUDA." << endl;
}
void norm_gpu(double *x, double *norm, int N) { #pragma acc data present(x) { #pragma acc host_data use_device(x) { cublasHandle_t h; cublasCreate(&h); cublasDnrm2(h, N, x, 1, norm); cublasDestroy(h); } } }
/* Releases all per-GPU BLASX resources: STREAMNUM streams and events
 * per device, 2*STREAMNUM device scratch buffers, and one cuBLAS
 * handle per device. Each device is made current before teardown. */
void blasx_resource_dest(int GPUs, cublasHandle_t* handles,
                         cudaStream_t* streams, cudaEvent_t* events,
                         float** C_dev) {
    int gpu;
    for (gpu = 0; gpu < GPUs; gpu++) {
        cudaSetDevice(gpu);
        int slot = 0;
        for (slot = 0; slot < STREAMNUM; slot++) {
            cudaStreamDestroy(streams[slot + gpu * STREAMNUM]);
            cudaEventDestroy(events[slot + gpu * STREAMNUM]);
        }
        for (slot = 0; slot < STREAMNUM * 2; slot++) {
            cudaFree(C_dev[slot + gpu * STREAMNUM * 2]);
        }
        cublasDestroy(handles[gpu]);
    }
}
// Destroys every cuBLAS handle in every handle group and every cuRAND
// generator, skipping null entries.
Caffe::~Caffe() {
  for (vector<cublasHandle_t>& handle_group : cublas_handles_) {
    for (cublasHandle_t handle : handle_group) {
      if (handle) {
        CUBLAS_CHECK(cublasDestroy(handle));
      }
    }
  }
  for (curandGenerator_t gen : curand_generators_) {
    if (gen) {
      CURAND_CHECK(curandDestroyGenerator(gen));
    }
  }
}
// Tears down all global THC (Torch CUDA) state: RNG, p2p-access
// bookkeeping, then per-device streams, cuBLAS handles, and scratch
// buffers, finally restoring the previously current device.
void THCudaShutdown(THCState* state) {
  THCRandom_shutdown(state);
  free(state->rngState);
  free(state->cudaHostAllocator);
  free(state->deviceProperties);
  int deviceCount = 0;
  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));
  THCudaCheck(cudaGetDeviceCount(&deviceCount));
  /* cleanup p2p access state */
  for (int dev = 0; dev < deviceCount; ++dev) {
    free(state->p2pAccessEnabled[dev]);
  }
  free(state->p2pAccessEnabled);
  /* cleanup per-device state */
  for (int dev = 0; dev < deviceCount; ++dev) {
    // Each device's resources are destroyed with that device current.
    THCudaCheck(cudaSetDevice(dev));
    /* Free Torch-defined streams (0 is the default stream) */
    for (int stream = 1; stream <= state->numUserStreams; ++stream) {
      THCudaCheck(cudaStreamDestroy(
          THCState_getDeviceStream(state, dev, stream)));
    }
    /* Free Torch-defined handles (0 is NULL for consistency with streams API) */
    for (int handle = 1; handle <= state->numUserBlasHandles; ++handle) {
      THCublasCheck(cublasDestroy(
          THCState_getDeviceBlasHandle(state, dev, handle)));
    }
    /* Free per-stream scratch space; starts at 0 because there is space
       for the default stream as well */
    for (int stream = 0; stream <= state->numUserStreams; ++stream) {
      THCudaCheck(THCudaFree(state,
          THCState_getDeviceScratchSpace(state, dev, stream)));
    }
    free(state->resourcesPerDevice[dev].streams);
    free(state->resourcesPerDevice[dev].blasHandles);
    free(state->resourcesPerDevice[dev].devScratchSpacePerStream);
  }
  free(state->resourcesPerDevice);
  state->cudaDeviceAllocator.shutdown(state->cudaDeviceAllocator.state);
  THCThreadLocal_free(state->currentPerDeviceStream);
  THCThreadLocal_free(state->currentPerDeviceBlasHandle);
  THCudaCheck(cudaSetDevice(prevDev));
}
void cublas_gemm(const double *A, const double *B, double *C, int N) { #pragma acc data present(A, B, C) { #pragma acc host_data use_device(A, B, C) { cublasHandle_t h; cublasCreate(&h); const double alpha = 1.0; const double beta = 0.0; cublasDgemm(h, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N, &alpha, A, N, B, N, &beta, C, N); cublasDestroy(h); } } }
void gemv_gpu(double *a, double *b, double *c1, int N) { #pragma acc data present(a, b, c1) { #pragma acc host_data use_device(a, b, c1) { cublasHandle_t handle; cublasCreate(&handle); const double alpha = 1.0; const double beta = 0.0; cublasDgemv(handle, CUBLAS_OP_T, N, N, &alpha, a, N, b, 1, &beta, c1, 1); cublasDestroy(handle); } } // end pragma data }
// Note : cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n,n,n, &alpha, A, n, B, n, &beta, C, n) // means matrix C = B * A void cublas_gemm(int n, double *c, double *b, double *a ) { #pragma acc data present(a, b, c) { #pragma acc host_data use_device(a, b, c) { cublasHandle_t handle; cublasCreate(&handle); const double alpha = 1.0; const double beta = 0.0; cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n,n,n, &alpha, a, n, b, n, &beta, c, n); cublasDestroy(handle); } } }
// Shuts down global THC state (newer resource layout): frees per-device
// cuBLAS/cuSPARSE handles and the current stream, flushes allocator
// caches, and restores the previously current device.
void THCudaShutdown(THCState* state) {
  THCRandom_shutdown(state);
  free(state->rngState);
  free(state->deviceProperties);
  int deviceCount = 0;
  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));
  THCudaCheck(cudaGetDeviceCount(&deviceCount));
  /* cleanup p2p access state */
  for (int dev = 0; dev < deviceCount; ++dev) {
    free(state->p2pAccessEnabled[dev]);
  }
  free(state->p2pAccessEnabled);
  /* cleanup per-device state */
  for (int dev = 0; dev < deviceCount; ++dev) {
    // Handles must be destroyed with their owning device current.
    THCudaCheck(cudaSetDevice(dev));
    THCCudaResourcesPerDevice* res = &(state->resourcesPerDevice[dev]);
    /* Free user defined BLAS handles */
    for (int i = 0; i < res->numBlasHandles; ++i) {
      THCublasCheck(cublasDestroy(res->blasHandles[i]));
    }
    /* Free user defined sparse handles */
    for (int i = 0; i < res->numSparseHandles; ++i) {
      THCusparseCheck(cusparseDestroy(res->sparseHandles[i]));
    }
    free(res->blasHandles);
    free(res->sparseHandles);
    // Release this device's thread-local current stream and its slot.
    THCStream_free((THCStream*)THCThreadLocal_get(state->currentStreams[dev]));
    THCThreadLocal_free(state->currentStreams[dev]);
  }
  free(state->resourcesPerDevice);
  if (state->cudaDeviceAllocator->emptyCache) {
    state->cudaDeviceAllocator->emptyCache(state->cudaDeviceAllocator->state);
  }
  if (state->cudaHostAllocator == &THCCachingHostAllocator) {
    THCCachingHostAllocator_emptyCache();
  }
  free(state->currentStreams);
  THCThreadLocal_free(state->currentPerDeviceBlasHandle);
  THCudaCheck(cudaSetDevice(prevDev));
}
// Solve A * x = b in GPU. void cublas_backsolver(double *A, double *x, double *b, int N) { #pragma acc data present(A, x, b) { #pragma host_data use_device(A, x, b) { cublasHandle_t h; cublasCreate(&h); cublasDcopy(h, N, b, 1, x, 1); // printf(" cublasDcopy success. \n"); cublasDtrsv(h, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, N, A, N, x, 1); // printf(" cublasDtrsv success. \n"); cublasDestroy(h); } } }
void Caffe::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) { return; } if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); if (Get().curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); } CUDA_CHECK(cudaSetDevice(device_id)); CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); }
// Multiply the arrays A and B on GPU and save the result in C // C(m,n) = A(m,k) * B(k,n) void gpu_blas_mmul(const double *A, const double *B, double *C, const int m, const int k, const int n) { int lda=m,ldb=k,ldc=m; const double alf = 1; const double bet = 0; const double *alpha = &alf; const double *beta = &bet; // Create a handle for CUBLAS cublasHandle_t handle; cublasCreate(&handle); // Do the actual multiplication cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); // Destroy the handle cublasDestroy(handle); }
/* Frees the cuBLAS handle and cached batched-GEMV/GER kernels attached
 * to a cuda_context, then releases the blas_handle struct and nulls
 * the context's pointer. No-op when the context has no BLAS handle. */
static void teardown(void *c) {
  cuda_context *ctx = (cuda_context *)c;
  blas_handle *handle = (blas_handle *)ctx->blas_handle;
  if (ctx->blas_handle == NULL)
    return;
  /* Make this context current for the CUDA calls below. */
  cuda_enter(ctx);
  cublasDestroy(handle->h);
  GpuKernel_clear(&handle->sgemvBH_N_a1_b1_small);
  GpuKernel_clear(&handle->sgemvBH_T_a1_b1_small);
  GpuKernel_clear(&handle->dgemvBH_N_a1_b1_small);
  GpuKernel_clear(&handle->dgemvBH_T_a1_b1_small);
  GpuKernel_clear(&handle->sgerBH_gen_small);
  GpuKernel_clear(&handle->dgerBH_gen_small);
  cuda_exit(ctx);
  free(ctx->blas_handle);
  ctx->blas_handle = NULL;
}
void Engine::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) { return; } // The call to cudaSetDevice must come before any calls to Get, which // may perform initialization using the GPU. CUDA_CHECK(cudaSetDevice(device_id)); if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); if (Get().curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); } CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); }
double cublas_gemm_norm(const double *A, const double *B, double *C, int N) { double *norm; norm = (double *) malloc(1*sizeof(double)); #pragma acc data present(A, B, C) copyout(norm[0]) { #pragma acc host_data use_device(A, B, C) { cublasHandle_t h; cublasCreate(&h); const double alpha = 1.0; const double beta = 0.0; cublasDgemm(h, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N, &alpha, A, N, B, N, &beta, C, N); cublasDnrm2(h, N*N, C, 1, norm); cublasDestroy(h); } } return *norm; }
void Caffe::SetSlaveDevice(const int slave_device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == slave_device_id) { return; } if (Get().slave_cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().slave_cublas_handle_)); if (Get().slave_curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().slave_curand_generator_)); } CUDA_CHECK(cudaSetDevice(slave_device_id)); CUDA_CHECK(cudaStreamCreate (&Get().slave_cu_stream_)); CUBLAS_CHECK(cublasCreate(&Get().slave_cublas_handle_)); CUBLAS_CHECK(cublasSetStream(Get().slave_cublas_handle_, Get().slave_cu_stream_)); CURAND_CHECK(curandCreateGenerator(&Get().slave_curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().slave_curand_generator_, cluster_seedgen())); Get().slave_device_id_ = slave_device_id; CUDA_CHECK(cudaSetDevice(current_device)); Caffe::set_gpu_mode(Caffe::MASTER_SLAVE); }
void Caffe::SetDevice(const int device_id) { std::vector<int> devices; devices.push_back(device_id); Caffe::SetDevices(devices); Get().default_device_context_ = GetDeviceContext(device_id); if (Get().default_device_context_->backend() == Backend::BACKEND_CUDA) { #ifdef USE_CUDA int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) { return; } // The call to cudaSetDevice must come before any calls to Get, which // may perform initialization using the GPU. CUDA_CHECK(cudaSetDevice(device_id)); if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); if (Get().curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); } CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); CURAND_CHECK( curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK( curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); #endif // USE_CUDA } else { #ifdef USE_GREENTEA #ifdef USE_CLBLAS clblasSetup(); #endif // USE_CLBLAS #endif // USE_GREENTEA } }
// Releases every backend handle owned by this context — cuBLAS,
// cuDNN, and NNPACK thread pool — nulling each pointer after
// destruction so Clear() is safe to call more than once. Each branch
// is compiled only when its backend is enabled.
void Context::Clear() {
#if defined(USE_CUDA)
  if (blas_handle_ != nullptr) {
    // blas_handle_ is stored type-erased; cast back for cuBLAS.
    CUBLAS_CHECK(cublasDestroy(cublasHandle_t(blas_handle_)));
    blas_handle_ = nullptr;
  }
#endif
#if defined(USE_CUDNN)
  if (cudnn_handle_ != nullptr) {
    CUDNN_CHECK(cudnnDestroy(cudnnHandle_t(cudnn_handle_)));
    cudnn_handle_ = nullptr;
  }
#endif
#if defined(USE_NNPACK)
  if (nnpack_handle_ != nullptr) {
    // Deinitialize NNPACK before tearing down its thread pool.
    CHECK_EQ(nnp_deinitialize(), nnp_status_success);
    pthreadpool_destroy(pthreadpool_t(nnpack_handle_));
    nnpack_handle_ = nullptr;
  }
#endif
}