Caffe::Caffe()
    : cublas_handle_(NULL), cusparse_handle_(NULL), cusparse_descr_(NULL),
      curand_generator_(NULL), random_generator_(), mode_(Caffe::CPU),
      solver_count_(1), root_solver_(true) {
  // Try to create a cublas handle, and report an error if it fails (but we
  // keep the program running, as one might just want to run CPU code).
  LOG(INFO) << "caffe init.";
  if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
  }
  // Also create a cusparse handle and matrix descriptor.
  if (cusparseCreate(&cusparse_handle_) != CUSPARSE_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Cusparse handle. Cusparse won't be available.";
  }
  if (cusparseCreateMatDescr(&cusparse_descr_) != CUSPARSE_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Cusparse descriptor. It won't be available.";
  } else {
    cusparseSetMatType(cusparse_descr_, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(cusparse_descr_, CUSPARSE_INDEX_BASE_ZERO);
    LOG(INFO) << "init descr";
  }
  // Try to create a curand generator.
  if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)
          != CURAND_STATUS_SUCCESS ||
      curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())
          != CURAND_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
  }
  LOG(INFO) << "caffe finish";
}
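// The matching teardown is not shown above; the sketch below is a hypothetical
// destructor (assumed, not taken from the source) that releases the handles,
// guarding each call since any of the create calls above may have failed. It
// reuses the CUBLAS_CHECK/CUSPARSE_CHECK/CURAND_CHECK macros that appear in
// the SetDevice snippet later in this collection.
Caffe::~Caffe() {
  if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_));
  if (cusparse_descr_) CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_descr_));
  if (cusparse_handle_) CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_));
  if (curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(curand_generator_));
  }
}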
void Context::Init(int device_id) {
  device_id_ = device_id;
  SwitchDevice();
#if defined(USE_CUDA)
  if (blas_handle_ == nullptr) {
    CUBLAS_CHECK(cublasCreate((cublasHandle_t*)&blas_handle_));
    CHECK_NOTNULL(blas_handle_);
  }
#endif
#if defined(USE_CUDNN)
  if (cudnn_handle_ == nullptr) {
    CUDNN_CHECK(cudnnCreate((cudnnHandle_t*)&cudnn_handle_));
    CHECK_NOTNULL(cudnn_handle_);
  }
#endif
#if defined(USE_NNPACK)
  if (nnpack_handle_ == nullptr) {
    CHECK_EQ(nnp_initialize(), nnp_status_success);
    nnpack_handle_ = pthreadpool_create(0);
    CHECK_NOTNULL(nnpack_handle_);
  }
#endif
}
void gpuSetUp(const int maxBlocksPerKernel, const int n) {
  debug("setting up cuBLAS");
  if (cublasCreate(&g_cublasHandle) != CUBLAS_STATUS_SUCCESS) {
    fatal("couldn't open cuBLAS handle");
  }
  cuSetUp(maxBlocksPerKernel, n);
}
micronn* micronn_read(FILE* file) {
  uint i, tmp;
  cublasStatus_t stat;
  micronn* net = malloc(sizeof(micronn));
  stat = cublasCreate(&net->handle);
  if (stat != CUBLAS_STATUS_SUCCESS) {
    fprintf(stderr, "CUBLAS initialization failed\n");
    free(net);  /* don't leak the partially constructed net */
    return NULL;
  }
  fscanf(file, "micro neural network\n");
  fscanf(file, "ninputs: %d\nnoutputs: %d\nnhidden %d\n",
         &net->nin, &net->nout, &net->nhidden);
  fscanf(file, "chidden:");
  net->weights = malloc(sizeof(micronn_matrix*) * (net->nhidden + 1));
  net->chidden = malloc(sizeof(uint) * net->nhidden);
  for (i = 0; i < net->nhidden; i++) {
    fscanf(file, " %d", &net->chidden[i]);
  }
  fscanf(file, "\n");
  for (i = 0; i <= net->nhidden; i++) {
    fscanf(file, "weight %d:\n", &tmp);
    net->weights[i] = micronn_matrix_read(file);
  }
  return net;
}
void blasx_resource_init(int GPUs, cublasHandle_t* handles, cudaStream_t* streams,
                         cudaEvent_t* events, void** C_dev, int floatType_id) {
  // floatType_id selects the element type: 0 = float, 1 = double, otherwise
  // cuDoubleComplex. (The original self-assigning casts of C_dev were no-ops
  // and have been dropped; only the allocation size depends on the type.)
  int GPU_id = 0;
  for (GPU_id = 0; GPU_id < GPUs; GPU_id++) {
    assert(cudaSetDevice(GPU_id) == cudaSuccess);
    // create handles
    assert(cublasCreate(&handles[GPU_id]) == CUBLAS_STATUS_SUCCESS);
    // create streams and events
    int i = 0;
    for (i = 0; i < STREAMNUM; i++) {
      assert(cudaStreamCreate(&streams[i + GPU_id * STREAMNUM]) == cudaSuccess);
      assert(cudaEventCreateWithFlags(&events[i + GPU_id * STREAMNUM],
                                      cudaEventDisableTiming) == cudaSuccess);
    }
    // create C_dev
    for (i = 0; i < STREAMNUM * 2; i++) {
      if (floatType_id == 0) {
        assert(cudaMalloc((void**)&C_dev[i + GPU_id * STREAMNUM * 2],
                          sizeof(float) * BLOCKSIZE_SGEMM * BLOCKSIZE_SGEMM) == cudaSuccess);
      } else if (floatType_id == 1) {
        assert(cudaMalloc((void**)&C_dev[i + GPU_id * STREAMNUM * 2],
                          sizeof(double) * BLOCKSIZE_DGEMM * BLOCKSIZE_DGEMM) == cudaSuccess);
      } else {
        assert(cudaMalloc((void**)&C_dev[i + GPU_id * STREAMNUM * 2],
                          sizeof(cuDoubleComplex) * BLOCKSIZE_ZGEMM * BLOCKSIZE_ZGEMM) == cudaSuccess);
      }
    }
  }
}
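// No teardown counterpart is shown for blasx_resource_init. Below is a
// hypothetical sketch (name and layout assumed) that releases the per-GPU
// resources using the same i + GPU_id * STREAMNUM indexing scheme.
void blasx_resource_destroy(int GPUs, cublasHandle_t* handles, cudaStream_t* streams,
                            cudaEvent_t* events, void** C_dev) {
  for (int GPU_id = 0; GPU_id < GPUs; GPU_id++) {
    assert(cudaSetDevice(GPU_id) == cudaSuccess);
    assert(cublasDestroy(handles[GPU_id]) == CUBLAS_STATUS_SUCCESS);
    for (int i = 0; i < STREAMNUM; i++) {
      assert(cudaStreamDestroy(streams[i + GPU_id * STREAMNUM]) == cudaSuccess);
      assert(cudaEventDestroy(events[i + GPU_id * STREAMNUM]) == cudaSuccess);
    }
    for (int i = 0; i < STREAMNUM * 2; i++) {
      assert(cudaFree(C_dev[i + GPU_id * STREAMNUM * 2]) == cudaSuccess);
    }
  }
}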
void gpu_cublas1(double *A, double *B, double *C, double *D,
                 double *r, double *nrmC, int N, int N2) {
#pragma acc data present(A, B, C, D)
  {
#pragma acc host_data use_device(A, B, C, D)
    {
      cublasHandle_t handle;
      cublasCreate(&handle);
      const double alpha = 1.0;
      const double beta = 0.0;
      cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N,
                  &alpha, A, N, B, N, &beta, C, N);
      printf(" gpu gemm success \n");
      cublasDdot(handle, N2, C, 1, B, 1, r);
      printf(" gpu dot success \n");
      *r = -1.0 * *r;
      cublasDaxpy(handle, N2, r, B, 1, C, 1);
      printf(" gpu axpy success \n");
      cublasDnrm2(handle, N2, C, 1, nrmC);
      printf(" gpu nrm2 success \n");
      cublasDcopy(handle, N2, C, 1, D, 1);
      printf(" gpu copy success \n");
      *nrmC = 1.0 / *nrmC;
      cublasDscal(handle, N2, nrmC, D, 1);
      printf(" gpu scal success \n");
      cublasDestroy(handle);
      printf(" gpu destroy success \n");
    }
  }
}
void Caffe::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().cusparse_descr_) CUSPARSE_CHECK(cusparseDestroyMatDescr(Get().cusparse_descr_));
  if (Get().cusparse_handle_) CUSPARSE_CHECK(cusparseDestroy(Get().cusparse_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUSPARSE_CHECK(cusparseCreate(&Get().cusparse_handle_));
  CUSPARSE_CHECK(cusparseCreateMatDescr(&Get().cusparse_descr_));
  // cusparseSetMatType(cusparse_descr_, CUSPARSE_MATRIX_TYPE_GENERAL);
  // cusparseSetMatIndexBase(cusparse_descr_, CUSPARSE_INDEX_BASE_ZERO);
  LOG(INFO) << "set descr";
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
                                     CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
                                                  cluster_seedgen()));
}
void mpla_init_instance_block_rows(struct mpla_instance* instance, MPI_Comm comm) {
  instance->comm = comm;

  // get the number of processes
  MPI_Comm_size(comm, &(instance->proc_count));

  // find the rank of the current process
  MPI_Comm_rank(comm, &(instance->cur_proc_rank));
  if (instance->cur_proc_rank == 0)
    instance->is_parent = true;
  else
    instance->is_parent = false;

  // compute the process grid, enforcing a parallelization over rows only
  int dims[2];
  dims[0] = instance->proc_count;
  dims[1] = 1;
  MPI_Dims_create(instance->proc_count, 2, dims);
  instance->proc_rows = dims[0];
  instance->proc_cols = dims[1];

  // create a cartesian communicator and retrieve the cartesian coordinates
  int periods[2];
  periods[0] = periods[1] = 0;
  MPI_Cart_create(comm, 2, dims, periods, 0, &(instance->comm));
  int cur_proc_coord[2];
  MPI_Cart_get(instance->comm, 2, dims, periods, cur_proc_coord);
  instance->cur_proc_row = cur_proc_coord[0];
  instance->cur_proc_col = cur_proc_coord[1];

  cublasCreate(&(instance->cublas_handle));
}
void GPUsgemv(int gpuInner, int Md, int Nd, int Kd,
              float* Adevice, float* Bdevice, float* Cdevice,
              float* Ahost, float* Bhost, float* Chost, cudaStream_t* stream) {
  cudaError_t error;
  int memSizeA = sizeof(float) * Md * Nd;
  int memSizeB = sizeof(float) * Nd;
  int memSizeC = sizeof(float) * Md;

  error = cudaMemcpyAsync(Adevice, Ahost, memSizeA, cudaMemcpyHostToDevice, *stream);
  if (error != cudaSuccess) { printf("cudaMemcpy A returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); }
  error = cudaMemcpyAsync(Bdevice, Bhost, memSizeB, cudaMemcpyHostToDevice, *stream);
  if (error != cudaSuccess) { printf("cudaMemcpy B returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); }

  // setup execution parameters
  dim3 threads(block_size, block_size);
  dim3 grid(Nd / threads.x, Md / threads.y);

  // inside CUBLAS
  cublasHandle_t handle;
  cublasStatus_t ret;
  ret = cublasCreate(&handle);
  if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); }
  const float alpha = 1.0f;
  const float beta = 0.0f;
  cublasSetStream(handle, *stream);
  for (int i = 0; i < gpuInner; i++) {
    ret = cublasSgemv(handle, CUBLAS_OP_N, Md, Nd, &alpha, Adevice, Md,
                      Bdevice, 1, &beta, Cdevice, 1);
    if (ret != CUBLAS_STATUS_SUCCESS) {
      printf("cublasSgemv returned error code %d, line(%d)\n", ret, __LINE__);
      exit(EXIT_FAILURE);
    }
  }
  cublasDestroy(handle);  // previously leaked: a handle was created on every call
  // done CUBLAS

  // copy result back to host
  error = cudaMemcpyAsync(Chost, Cdevice, memSizeC, cudaMemcpyDeviceToHost, *stream);
  // printf("GPU Iter queued\n");
}
void contractTT(sTensorGPU* TT1, sTensorGPU* TT2, const int n, const int size) {
  cublasHandle_t handle;
  cublasCreate(&handle);
  type result = 0;
  sTensorGPU temp1 = emptyTensor(size * size, 2);
  sTensorGPU temp2 = emptyTensor(size * size * 2, 3);
  cudaEvent_t start;
  cudaEventCreate(&start);
  cudaEvent_t stop;
  cudaEventCreate(&stop);
  // printf("Start contractTT\n");
  cudaEventRecord(start, NULL);
  int indA = TT1[0].size[0];
  int indB = TT2[0].size[0];
  sTensorCPU tt1start = copyToCPU(TT1[0]);
  sTensorCPU tt2start = copyToCPU(TT2[0]);
  sTensorCPU tt1end = copyToCPU(TT1[n - 1]);
  sTensorCPU tt2end = copyToCPU(TT2[n - 1]);
  for (int i = 0; i < indA; i++) {
    TT1[0] = prepareTensorStart(tt1start, i);
    TT1[n - 1] = prepareTensorEnd(tt1end, i);
    for (int j = 0; j < indB; j++) {
      TT2[0] = prepareTensorStart(tt2start, j);
      TT2[n - 1] = prepareTensorEnd(tt2end, j);
      contractTensor(handle, TT1[0], TT2[0], temp1);
      for (int k = 1; k < n; k++) {  // renamed from i, which shadowed the outer loop index
        contractTensor(handle, temp1, TT1[k], temp2);
        contractTensor(handle, temp2, TT2[k], temp1, 2);
      }
      type add = 0;
      cudaMemcpy(&add, temp1.deviceData, sizeof(type), cudaMemcpyDeviceToHost);
      // printf("%e ", add);
      result += add;
    }
  }
  cudaEventRecord(stop, NULL);
  cudaEventSynchronize(stop);
  float msecTotal = 0.0f;
  cudaEventElapsedTime(&msecTotal, start, stop);
  printf("Time: %.3fms\n", msecTotal);
  printf("Ops: %.0f\n", bops);
  double gigaFlops = (bops * 1.0e-9f) / (msecTotal / 1000.0f);
  printf("Perf= %.2f GFlop/s\n", gigaFlops);
  cublasDestroy(handle);
  cudaDeviceReset();
  printf("%.5e \n", result);
  exit(0);
}
cublasHandle_t CudaUtil::cublasInit() {
  cublasHandle_t handle;
  cublasStatus_t status = cublasCreate(&handle);
  if (status != CUBLAS_STATUS_SUCCESS) {
    throw CudaException("CUBLAS initialisation error");
  }
  return handle;
}
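// A minimal usage sketch (hypothetical caller, not from the source): the
// handle returned by cublasInit() is owned by the caller and must eventually
// be released with cublasDestroy.
void cublasInitExample() {
  try {
    cublasHandle_t handle = CudaUtil::cublasInit();
    // ... issue cuBLAS calls on the handle ...
    cublasDestroy(handle);
  } catch (const CudaException& e) {
    // fall back to a CPU path, or rethrow
  }
}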
GpuDevice::Impl::Impl(int d) : device(d) {
  ActivateDevice();
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle[i], stream[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle[i], stream[i]));
  }
}
void initCublasHandle(cublasHandle_t* handle) {
  cublasStatus_t stat;
  stat = cublasCreate(handle);
  if (stat != CUBLAS_STATUS_SUCCESS) {
    printf("CUBLAS INITIALIZATION FAILED\n");
  }
}
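// Several snippets in this collection check cublasStatus_t by hand. A common
// alternative is a checking macro; this is a minimal sketch (the
// CUBLAS_CHECK/CUBLAS_CALL macros used elsewhere here are presumably similar,
// but their exact definitions are not shown).
#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>

#define CUBLAS_CHECK_SKETCH(call)                                   \
  do {                                                               \
    cublasStatus_t s_ = (call);                                      \
    if (s_ != CUBLAS_STATUS_SUCCESS) {                               \
      fprintf(stderr, "cuBLAS error %d at %s:%d\n",                  \
              (int)s_, __FILE__, __LINE__);                          \
      exit(EXIT_FAILURE);                                            \
    }                                                                \
  } while (0)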
Caffe::Caffe()
    : mode_(Caffe::CPU), phase_(Caffe::TRAIN), cublas_handle_(NULL),
      curand_generator_(NULL), vsl_stream_(NULL) {
  CUBLAS_CHECK(cublasCreate(&cublas_handle_));
  // TODO: the original caffe code has a bug here!
  CURAND_CHECK(curandCreateGenerator(&curand_generator_,
                                     CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, 1701ULL));
  VSL_CHECK(vslNewStream(&vsl_stream_, VSL_BRNG_MT19937, 1701));
}
/* init cublas handle */
bool init(cublasHandle_t& handle) {
  cublasStatus_t stat;
  stat = cublasCreate(&handle);
  if (stat != CUBLAS_STATUS_SUCCESS) {
    printf("init: CUBLAS initialization failed\n");
    exit(EXIT_FAILURE);  // exit(0) would have signaled success on failure
  }
  return true;
}
cublasHandle_t blas_handle() {
  static int init = 0;
  static cublasHandle_t handle;
  if (!init) {
    cublasCreate(&handle);
    init = 1;
  }
  return handle;
}
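// Note that this lazy initialization is not thread-safe: two threads hitting
// the function for the first time concurrently can both run cublasCreate. In
// C++11 and later, a function-local static with an initializer ("magic
// static") is guaranteed to be initialized exactly once; a sketch under that
// assumption (the C++ variant below is illustrative, not from the source):
cublasHandle_t blas_handle_threadsafe() {
  static cublasHandle_t handle = [] {
    cublasHandle_t h;
    cublasCreate(&h);  // one-time, thread-safe initialization
    return h;
  }();
  return handle;
}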
cublasHandle_t blas_handle() {
  // one lazily created handle per device (supports up to 16 devices)
  static int init[16] = {0};
  static cublasHandle_t handle[16];
  int i = cuda_get_device();
  if (!init[i]) {
    cublasCreate(&handle[i]);
    init[i] = 1;
  }
  return handle[i];
}
void CudaInterface::initialize() {
  mDevID = 0;
  checkCudaErrors(cudaSetDevice(mDevID));
  checkCudaErrors(cudaGetDevice(&mDevID));
  checkCudaErrors(cudaGetDeviceProperties(&mDeviceProperty, mDevID));
  checkCudaErrors(cublasCreate(&mCublasHandle));
  printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
         mDevID, mDeviceProperty.name, mDeviceProperty.major, mDeviceProperty.minor);
  // Fermi and above need a larger block size (note: block_size is unused here)
  int block_size = (mDeviceProperty.major < 2) ? 16 : 32;
  (void)block_size;
}
void norm_gpu(double *x, double *norm, int N) {
#pragma acc data present(x)
  {
#pragma acc host_data use_device(x)
    {
      cublasHandle_t h;
      cublasCreate(&h);
      cublasDnrm2(h, N, x, 1, norm);
      cublasDestroy(h);
    }
  }
}
void dot_gpu(double *x, double *y, double *result, int N) {
#pragma acc data present(x, y)
  {
#pragma acc host_data use_device(x, y)
    {
      cublasHandle_t h;
      cublasCreate(&h);
      cublasDdot(h, N, x, 1, y, 1, result);
      cublasDestroy(h);
    }
  }
}
cublasHandle_t& getHandle() {
  static cublasHandle_t handle = NULL;
  if (handle == NULL) {
    cublasStatus_t stat;
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) {
      printf("init: CUBLAS initialization failed\n");
      exit(EXIT_FAILURE);
    }
  }
  return handle;
}
void cublas_gemm(const double *A, const double *B, double *C, int N) {
#pragma acc data present(A, B, C)
  {
#pragma acc host_data use_device(A, B, C)
    {
      cublasHandle_t h;
      cublasCreate(&h);
      const double alpha = 1.0;
      const double beta = 0.0;
      cublasDgemm(h, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N,
                  &alpha, A, N, B, N, &beta, C, N);
      cublasDestroy(h);
    }
  }
}
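// The `present` clause above requires the caller to have already placed A, B,
// and C on the device. A minimal hypothetical caller (not from the source),
// assuming N-by-N matrices stored as flat arrays of N*N doubles:
#include <stdlib.h>

void cublas_gemm_example(int N) {
  double *A = (double*)malloc(N * N * sizeof(double));
  double *B = (double*)malloc(N * N * sizeof(double));
  double *C = (double*)malloc(N * N * sizeof(double));
  // ... fill A and B ...
#pragma acc data copyin(A[0:N*N], B[0:N*N]) copyout(C[0:N*N])
  {
    cublas_gemm(A, B, C, N);  // operates on the device copies
  }
  // C now holds the result on the host
  free(A); free(B); free(C);
}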
void gemv_gpu(double *a, double *b, double *c1, int N) {
#pragma acc data present(a, b, c1)
  {
#pragma acc host_data use_device(a, b, c1)
    {
      cublasHandle_t handle;
      cublasCreate(&handle);
      const double alpha = 1.0;
      const double beta = 0.0;
      cublasDgemv(handle, CUBLAS_OP_T, N, N, &alpha, a, N, b, 1, &beta, c1, 1);
      cublasDestroy(handle);
    }
  } // end pragma data
}
// Note: with row-major data, cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
// n, n, n, &alpha, A, n, B, n, &beta, C, n) computes C = B * A. cuBLAS is
// column-major, so it interprets each row-major matrix X as its transpose X^T
// and computes C^T = A^T * B^T = (B * A)^T, i.e. C = B * A in row-major terms.
void cublas_gemm(int n, double *c, double *b, double *a) {
#pragma acc data present(a, b, c)
  {
#pragma acc host_data use_device(a, b, c)
    {
      cublasHandle_t handle;
      cublasCreate(&handle);
      const double alpha = 1.0;
      const double beta = 0.0;
      cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                  &alpha, a, n, b, n, &beta, c, n);
      cublasDestroy(handle);
    }
  }
}
Engine::Engine()
    : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(),
      mode_(Engine::CPU) {
  // Try to create a cublas handle, and report an error if it fails (but we
  // keep the program running, as one might just want to run CPU code).
  if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
  }
  // Try to create a curand generator.
  if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)
          != CURAND_STATUS_SUCCESS ||
      curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())
          != CURAND_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
  }
}
// Solve A * x = b on the GPU.
void cublas_backsolver(double *A, double *x, double *b, int N) {
#pragma acc data present(A, x, b)
  {
#pragma acc host_data use_device(A, x, b)  // "acc" was missing from this pragma
    {
      cublasHandle_t h;
      cublasCreate(&h);
      cublasDcopy(h, N, b, 1, x, 1);
      // printf(" cublasDcopy success. \n");
      cublasDtrsv(h, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
                  N, A, N, x, 1);
      // printf(" cublasDtrsv success. \n");
      cublasDestroy(h);
    }
  }
}
cublasHandle_t Caffe::device_cublas_handle(int group) {
  std::lock_guard<std::mutex> lock(cublas_mutex_);
  vector<cublasHandle_t>& group_cublas_handles =
      cublas_handles_[current_device()];
  if (group + 1 > group_cublas_handles.size()) {
    group_cublas_handles.resize(group + 1);
  }
  cublasHandle_t& cublas_handle = group_cublas_handles[group];
  if (!cublas_handle) {
    // Try to create a cublas handle, and report an error if it fails (but we
    // keep the program running, as one might just want to run CPU code).
    if (cublasCreate(&cublas_handle) != CUBLAS_STATUS_SUCCESS) {
      LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
    }
    CUBLAS_CHECK(cublasSetStream(cublas_handle, device_pstream(group)->get()));
  }
  return cublas_handle;
}
void Caffe::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUDA_CHECK(cudaSetDevice(device_id));
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
                                     CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
                                                  cluster_seedgen()));
}
void cuda_running_configuration::update_parameters() {
  cuda_safe_call(cudaDriverGetVersion(&driver_version));
  cuda_safe_call(cudaRuntimeGetVersion(&runtime_version));

  int device_count;
  cuda_safe_call(cudaGetDeviceCount(&device_count));
  if (device_count <= 0)
    throw neural_network_exception("No CUDA capable devices are found");
  if (device_id >= device_count)
    throw neural_network_exception((boost::format(
        "Device ID %1% specified while %2% devices are available")
        % device_id % device_count).str());

  cudaDeviceProp device_prop;
  cuda_safe_call(cudaGetDeviceProperties(&device_prop, device_id));
  device_name = device_prop.name;
  compute_capability_major = device_prop.major;
  compute_capability_minor = device_prop.minor;
  clock_rate = device_prop.clockRate;
  memory_clock_rate = device_prop.memoryClockRate;
  memory_bus_width = device_prop.memoryBusWidth;
  global_memory_size = device_prop.totalGlobalMem;
  ecc_enabled = (device_prop.ECCEnabled != 0);
  l2_cache_size = device_prop.l2CacheSize;
  multiprocessor_count = device_prop.multiProcessorCount;
  smem_per_block = device_prop.sharedMemPerBlock;
  max_threads_per_multiprocessor = device_prop.maxThreadsPerMultiProcessor;
  max_threads_per_block = device_prop.maxThreadsPerBlock;
  for (int i = 0; i < sizeof(max_threads_dim) / sizeof(max_threads_dim[0]); ++i)
    max_threads_dim[i] = device_prop.maxThreadsDim[i];
  for (int i = 0; i < sizeof(max_grid_size) / sizeof(max_grid_size[0]); ++i)
    max_grid_size[i] = device_prop.maxGridSize[i];
  max_texture_1d_linear = device_prop.maxTexture1DLinear;
  texture_alignment = device_prop.textureAlignment;
  pci_bus_id = device_prop.pciBusID;
  pci_device_id = device_prop.pciDeviceID;
#ifdef _WIN32
  tcc_mode = (device_prop.tccDriver != 0);
#endif

  cuda_safe_call(cudaSetDevice(device_id));
  cublas_safe_call(cublasCreate(&cublas_handle));
  cusparse_safe_call(cusparseCreate(&cusparse_handle));
}
// Multiply the arrays A and B on the GPU and store the result in C
// C(m,n) = A(m,k) * B(k,n)
void gpu_blas_mmul(const double *A, const double *B, double *C,
                   const int m, const int k, const int n) {
  int lda = m, ldb = k, ldc = m;
  const double alf = 1;
  const double bet = 0;
  const double *alpha = &alf;
  const double *beta = &bet;

  // Create a handle for CUBLAS
  cublasHandle_t handle;
  cublasCreate(&handle);

  // Do the actual multiplication
  cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
              alpha, A, lda, B, ldb, beta, C, ldc);

  // Destroy the handle
  cublasDestroy(handle);
}
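// gpu_blas_mmul expects device pointers. A minimal hypothetical caller (not
// from the source) that allocates device buffers, copies the column-major
// inputs over, multiplies, and copies the result back:
#include <cuda_runtime.h>

void gpu_blas_mmul_example(const double *hA, const double *hB, double *hC,
                           int m, int k, int n) {
  double *dA, *dB, *dC;
  cudaMalloc((void**)&dA, m * k * sizeof(double));
  cudaMalloc((void**)&dB, k * n * sizeof(double));
  cudaMalloc((void**)&dC, m * n * sizeof(double));
  cudaMemcpy(dA, hA, m * k * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, hB, k * n * sizeof(double), cudaMemcpyHostToDevice);
  gpu_blas_mmul(dA, dB, dC, m, k, n);
  cudaMemcpy(hC, dC, m * n * sizeof(double), cudaMemcpyDeviceToHost);
  cudaFree(dA); cudaFree(dB); cudaFree(dC);
}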