void Caffe::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) { return; } // The call to cudaSetDevice must come before any calls to Get, which // may perform initialization using the GPU. CUDA_CHECK(cudaSetDevice(device_id)); if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); if (Get().cusparse_descr_)CUSPARSE_CHECK(cusparseDestroyMatDescr(Get().cusparse_descr_)); if (Get().cusparse_handle_)CUSPARSE_CHECK(cusparseDestroy(Get().cusparse_handle_)); if (Get().curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); } CUSPARSE_CHECK(cusparseCreate(&Get().cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&Get().cusparse_descr_)); // cusparseSetMatType(cusparse_descr_,CUSPARSE_MATRIX_TYPE_GENERAL); // cusparseSetMatIndexBase(cusparse_descr_,CUSPARSE_INDEX_BASE_ZERO); LOG(INFO)<<"set descr"; CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); }
void cuda_initialize() { CUDA_CHECK(cudaStreamCreate(&g_context.stream)); CUBLAS_CHECK(cublasCreate_v2(&g_context.cublas_handle)); CUBLAS_CHECK(cublasSetStream(g_context.cublas_handle, g_context.stream)); // CUDNN_CHECK(cudnnCreate(&g_context.cudnn_handle)); // CUDNN_CHECK(cudnnSetStream(g_context.cudnn_handle, g_context.stream)); }
Caffe::Properties::Properties() : init_time_(std::time(nullptr)), main_thread_id_(std::this_thread::get_id()), caffe_version_(AS_STRING(CAFFE_VERSION)) { #ifndef CPU_ONLY int count = 0; CUDA_CHECK(cudaGetDeviceCount(&count)); compute_capabilities_.resize(count); cudaDeviceProp device_prop; for (int gpu = 0; gpu < compute_capabilities_.size(); ++gpu) { CUDA_CHECK(cudaGetDeviceProperties(&device_prop, gpu)); compute_capabilities_[gpu] = device_prop.major * 100 + device_prop.minor; DLOG(INFO) << "GPU " << gpu << " '" << device_prop.name << "' has compute capability " << device_prop.major << "." << device_prop.minor; } #ifdef USE_CUDNN cudnn_version_ = AS_STRING(CUDNN_MAJOR) "." AS_STRING(CUDNN_MINOR) "." AS_STRING(CUDNN_PATCHLEVEL); #else cudnn_version_ = "USE_CUDNN is not defined"; #endif int cublas_version = 0; CUBLAS_CHECK(cublasGetVersion(Caffe::cublas_handle(), &cublas_version)); cublas_version_ = std::to_string(cublas_version); int cuda_version = 0; CUDA_CHECK(cudaRuntimeGetVersion(&cuda_version)); cuda_version_ = std::to_string(cuda_version); int cuda_driver_version = 0; CUDA_CHECK(cudaDriverGetVersion(&cuda_driver_version)); cuda_driver_version_ = std::to_string(cuda_driver_version); #endif }
// Bind this context to `device_id` and lazily create whichever backend
// handles are compiled in (cuBLAS, cuDNN, NNPACK) and not yet initialized.
// Handles that already exist are left untouched.
void Context::Init(int device_id) {
  device_id_ = device_id;
  SwitchDevice();
#if defined(USE_CUDA)
  if (blas_handle_ == nullptr) {
    CUBLAS_CHECK(cublasCreate(reinterpret_cast<cublasHandle_t*>(&blas_handle_)));
    CHECK_NOTNULL(blas_handle_);
  }
#endif
#if defined(USE_CUDNN)
  if (cudnn_handle_ == nullptr) {
    CUDNN_CHECK(cudnnCreate(reinterpret_cast<cudnnHandle_t*>(&cudnn_handle_)));
    CHECK_NOTNULL(cudnn_handle_);
  }
#endif
#if defined(USE_NNPACK)
  if (nnpack_handle_ == nullptr) {
    // NNPACK needs a global init plus a thread pool (0 = default size).
    CHECK_EQ(nnp_initialize(), nnp_status_success);
    nnpack_handle_ = pthreadpool_create(0);
    CHECK_NOTNULL(nnpack_handle_);
  }
#endif
}
// y = alpha * op(A) * x + beta * y for a row-major M x N matrix A.
// cuBLAS is column-major, so the row-major buffer is presented as its
// transpose (leading dimension N) and the requested op is flipped.
void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
    const int N, const double alpha, const double* A, const double* x,
    const double beta, double* y) {
  const bool no_trans = (TransA == CblasNoTrans);
  const cublasOperation_t op = no_trans ? CUBLAS_OP_T : CUBLAS_OP_N;
  CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), op, N, M, &alpha, A, N,
                           x, 1, &beta, y, 1));
}
void Caffe::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) { return; } if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); if (Get().curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); } CUDA_CHECK(cudaSetDevice(device_id)); CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); }
// Release every CUDA library object this singleton owns, skipping any that
// was never created (null handle). Order matches the original: descriptor,
// cuBLAS handle, cuSPARSE handle, cuRAND generator.
Caffe::~Caffe() {
  if (cusparse_descr_) {
    CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_descr_));
  }
  if (cublas_handle_) {
    CUBLAS_CHECK(cublasDestroy(cublas_handle_));
  }
  if (cusparse_handle_) {
    CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_));
  }
  if (curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(curand_generator_));
  }
}
// y = alpha * op(A) * x + beta * y for a row-major M x N matrix A.
// The row-major buffer is handed to column-major cuBLAS as its transpose
// (leading dimension N), so the op flag is inverted to compensate.
void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
    const int N, const float alpha, const float* A, const float* x,
    const float beta, float* y) {
  const bool no_trans = (TransA == CblasNoTrans);
  const cublasOperation_t op = no_trans ? CUBLAS_OP_T : CUBLAS_OP_N;
  CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), op, N, M, &alpha, A, N,
                           x, 1, &beta, y, 1));
}
// Tear down the cuBLAS handle, the cuRAND generator, and the MKL VSL random
// stream; each is skipped if it was never created.
Caffe::~Caffe() {
  if (cublas_handle_) {
    CUBLAS_CHECK(cublasDestroy(cublas_handle_));
  }
  if (curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(curand_generator_));
  }
  if (vsl_stream_) {
    VSL_CHECK(vslDeleteStream(&vsl_stream_));
  }
}
// Construct the singleton in CPU mode / TRAIN phase and eagerly create the
// cuBLAS handle, the cuRAND generator, and the MKL VSL stream. Both RNGs are
// seeded with the fixed value 1701.
Caffe::Caffe()
    : mode_(Caffe::CPU), phase_(Caffe::TRAIN), cublas_handle_(NULL),
      curand_generator_(NULL), vsl_stream_(NULL) {
  CUBLAS_CHECK(cublasCreate(&cublas_handle_));
  //TODO: original caffe code has bug here!
  CURAND_CHECK(curandCreateGenerator(&curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, 1701ULL));
  // MKL VSL stream using the Mersenne Twister BRNG, same seed as cuRAND.
  VSL_CHECK(vslNewStream(&vsl_stream_, VSL_BRNG_MT19937, 1701));
}
void Engine::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) { return; } // The call to cudaSetDevice must come before any calls to Get, which // may perform initialization using the GPU. CUDA_CHECK(cudaSetDevice(device_id)); if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); if (Get().curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); } CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); }
// Destructor: device contexts (and memory blocks that depend on them) are
// released first; the CUDA handles follow, if they were ever created.
Caffe::~Caffe() {
  // Make sure all device contexts and dependent memory blocks are freed
  // properly before the library handles go away.
  device_contexts_.clear();
#ifdef USE_CUDA
  if (cublas_handle_) {
    CUBLAS_CHECK(cublasDestroy(cublas_handle_));
  }
  if (curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(curand_generator_));
  }
#endif  // USE_CUDA
}
void Caffe::SetSlaveDevice(const int slave_device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == slave_device_id) { return; } if (Get().slave_cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().slave_cublas_handle_)); if (Get().slave_curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().slave_curand_generator_)); } CUDA_CHECK(cudaSetDevice(slave_device_id)); CUDA_CHECK(cudaStreamCreate (&Get().slave_cu_stream_)); CUBLAS_CHECK(cublasCreate(&Get().slave_cublas_handle_)); CUBLAS_CHECK(cublasSetStream(Get().slave_cublas_handle_, Get().slave_cu_stream_)); CURAND_CHECK(curandCreateGenerator(&Get().slave_curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().slave_curand_generator_, cluster_seedgen())); Get().slave_device_id_ = slave_device_id; CUDA_CHECK(cudaSetDevice(current_device)); Caffe::set_gpu_mode(Caffe::MASTER_SLAVE); }
void Caffe::SetDevice(const int device_id) { std::vector<int> devices; devices.push_back(device_id); Caffe::SetDevices(devices); Get().default_device_context_ = GetDeviceContext(device_id); if (Get().default_device_context_->backend() == Backend::BACKEND_CUDA) { #ifdef USE_CUDA int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) { return; } // The call to cudaSetDevice must come before any calls to Get, which // may perform initialization using the GPU. CUDA_CHECK(cudaSetDevice(device_id)); if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); if (Get().curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); } CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); CURAND_CHECK( curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK( curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); #endif // USE_CUDA } else { #ifdef USE_GREENTEA #ifdef USE_CLBLAS clblasSetup(); #endif // USE_CLBLAS #endif // USE_GREENTEA } }
// Destroy every per-device, per-group cuBLAS handle and every cuRAND
// generator that was actually created; null slots are skipped.
Caffe::~Caffe() {
  for (vector<cublasHandle_t>& handles_for_device : cublas_handles_) {
    for (cublasHandle_t handle : handles_for_device) {
      if (handle) {
        CUBLAS_CHECK(cublasDestroy(handle));
      }
    }
  }
  for (curandGenerator_t gen : curand_generators_) {
    if (gen) {
      CURAND_CHECK(curandDestroyGenerator(gen));
    }
  }
}
// C = alpha * op(A) + beta * op(B) for row-major M x N output C.
// cuBLAS geam is column-major; passing the row-major buffers with swapped
// dimensions (N, M) computes C^T = alpha * op(A)^T + beta * op(B)^T, which
// is exactly the desired row-major result.
void caffe_gpu_geam<float>(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N, const float alpha,
    const float* A, const float* B, const float beta, float* C) {
  // Note that cublas follows fortran order.
  int lda = (TransA == CblasNoTrans) ? N : M;
  int ldb = (TransB == CblasNoTrans) ? N : M;
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  // BUG FIX: the original swapped the operand slots gemm-style (B with
  // alpha, A with beta), so alpha scaled B and beta scaled A. Unlike gemm,
  // geam is an element-wise addition — no operand swap is needed for
  // row-major data; each scalar must stay attached to its own matrix.
  CUBLAS_CHECK(cublasSgeam(Caffe::get_current_cublas_handle(), cuTransA,
                           cuTransB, N, M, &alpha, A, lda, &beta, B, ldb,
                           C, N));
}
// C = alpha * op(A) * op(B) + beta * C for row-major matrices
// (op(A): M x K, op(B): K x N, C: M x N). cuBLAS is column-major, so we
// compute C^T = op(B)^T * op(A)^T by swapping the operand order.
void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
    const double alpha, const double* A, const double* B, const double beta,
    double* C) {
  const int lda = (TransA == CblasNoTrans) ? K : M;
  const int ldb = (TransB == CblasNoTrans) ? N : K;
  const cublasOperation_t op_a =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  const cublasOperation_t op_b =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), op_b, op_a, N, M, K,
                           &alpha, B, ldb, A, lda, &beta, C, N));
}
// Return (lazily creating) the cuBLAS handle for `group` on the current
// device, with its stream bound to the per-group stream. Thread-safe via
// cublas_mutex_. May return a null handle if creation fails (the program
// keeps running for CPU-only use).
cublasHandle_t Caffe::device_cublas_handle(int group) {
  std::lock_guard<std::mutex> lock(cublas_mutex_);
  vector<cublasHandle_t>& group_cublas_handles =
      cublas_handles_[current_device()];
  // Grow the per-device vector on demand (cast avoids signed/unsigned
  // comparison between `group` and size()).
  if (static_cast<size_t>(group) + 1 > group_cublas_handles.size()) {
    group_cublas_handles.resize(group + 1);
  }
  cublasHandle_t& cublas_handle = group_cublas_handles[group];
  if (!cublas_handle) {
    // Try to create a cublas handler, and report an error if failed (but we
    // will keep the program running as one might just want to run CPU code).
    if (cublasCreate(&cublas_handle) != CUBLAS_STATUS_SUCCESS) {
      LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
    } else {
      // BUG FIX: only bind the stream when creation succeeded. The original
      // unconditionally called CUBLAS_CHECK(cublasSetStream(...)) even after
      // a failed create, aborting on the invalid handle and defeating the
      // graceful fallback documented above.
      CUBLAS_CHECK(cublasSetStream(cublas_handle,
                                   device_pstream(group)->get()));
    }
  }
  return cublas_handle;
}
// Batched C[i] = alpha * op(A[i]) * op(B[i]) + beta * C[i] for row-major
// matrices, i in [0, batch_count). As with the single-matrix gemm wrapper,
// column-major cuBLAS is fed the swapped operand order to realize row-major
// semantics.
void caffe_gpu_gemm_batched<float>(const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
    const float alpha, const float** A, const float** B, const float beta,
    float** C, int batch_count) {
  const int lda = (TransA == CblasNoTrans) ? K : M;
  const int ldb = (TransB == CblasNoTrans) ? N : K;
  const cublasOperation_t op_a =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  const cublasOperation_t op_b =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUBLAS_CHECK(cublasSgemmBatched(Caffe::get_current_cublas_handle(), op_b,
                                  op_a, N, M, K, &alpha, B, ldb, A, lda,
                                  &beta, C, N, batch_count));
}
// Release every backend handle this context owns (cuBLAS, cuDNN, NNPACK),
// nulling each slot afterwards so Clear() is safe to call repeatedly.
void Context::Clear() {
#if defined(USE_CUDA)
  if (blas_handle_ != nullptr) {
    CUBLAS_CHECK(cublasDestroy(static_cast<cublasHandle_t>(blas_handle_)));
    blas_handle_ = nullptr;
  }
#endif
#if defined(USE_CUDNN)
  if (cudnn_handle_ != nullptr) {
    CUDNN_CHECK(cudnnDestroy(static_cast<cudnnHandle_t>(cudnn_handle_)));
    cudnn_handle_ = nullptr;
  }
#endif
#if defined(USE_NNPACK)
  if (nnpack_handle_ != nullptr) {
    // Mirror Init(): global deinit plus thread-pool teardown.
    CHECK_EQ(nnp_deinitialize(), nnp_status_success);
    pthreadpool_destroy(static_cast<pthreadpool_t>(nnpack_handle_));
    nnpack_handle_ = nullptr;
  }
#endif
}
// In-place scale: X[i] *= alpha for i in [0, N), via cuBLAS sscal.
void caffe_gpu_scal<float>(const int N, const float alpha, float *X) {
  cublasHandle_t handle = Caffe::cublas_handle();
  CUBLAS_CHECK(cublasSscal(handle, N, &alpha, X, 1));
}
// y = alpha * x for device vectors of length n: copy x into y, then scale
// y in place.
void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
    double* y) {
  cublasHandle_t handle = Caffe::cublas_handle();
  CUBLAS_CHECK(cublasDcopy(handle, n, x, 1, y, 1));
  CUBLAS_CHECK(cublasDscal(handle, n, &alpha, y, 1));
}
// y = alpha * x for device vectors of length n: copy x into y, then scale
// y in place.
void caffe_gpu_scale<float>(const int n, const float alpha, const float *x,
    float* y) {
  cublasHandle_t handle = Caffe::cublas_handle();
  CUBLAS_CHECK(cublasScopy(handle, n, x, 1, y, 1));
  CUBLAS_CHECK(cublasSscal(handle, n, &alpha, y, 1));
}
// *y = sum_i |x[i]| over the n-element device vector x.
void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
  cublasHandle_t handle = Caffe::cublas_handle();
  CUBLAS_CHECK(cublasDasum(handle, n, x, 1, y));
}
// *y = sum_i |x[i]| over the n-element device vector x.
void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
  cublasHandle_t handle = Caffe::cublas_handle();
  CUBLAS_CHECK(cublasSasum(handle, n, x, 1, y));
}
// *out = dot(x, y) over n-element device vectors, via cuBLAS ddot.
void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
    double * out) {
  cublasHandle_t handle = Caffe::cublas_handle();
  CUBLAS_CHECK(cublasDdot(handle, n, x, 1, y, 1, out));
}
// In-place scale: X[i] *= alpha for i in [0, N), via cuBLAS dscal.
void caffe_gpu_scal<double>(const int N, const double alpha, double *X) {
  cublasHandle_t handle = Caffe::cublas_handle();
  CUBLAS_CHECK(cublasDscal(handle, N, &alpha, X, 1));
}
// Release the GPU math handles owned by this Engine, skipping any that was
// never created.
Engine::~Engine() {
  if (cublas_handle_) {
    CUBLAS_CHECK(cublasDestroy(cublas_handle_));
  }
  if (curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(curand_generator_));
  }
}
// *out = dot(x, y) over n-element device vectors, via cuBLAS sdot.
void caffe_gpu_dot<float>(const int n, const float* x, const float* y,
    float* out) {
  cublasHandle_t handle = Caffe::cublas_handle();
  CUBLAS_CHECK(cublasSdot(handle, n, x, 1, y, 1, out));
}
// Y[i] += alpha * X[i] for i in [0, N), via cuBLAS daxpy.
void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
    double* Y) {
  cublasHandle_t handle = Caffe::cublas_handle();
  CUBLAS_CHECK(cublasDaxpy(handle, N, &alpha, X, 1, Y, 1));
}