Exemple #1
0
Caffe::Caffe()
    : cublas_handle_(NULL),cusparse_handle_(NULL),cusparse_descr_(NULL),curand_generator_(NULL),random_generator_(),mode_(Caffe::CPU), solver_count_(1), root_solver_(true){
  // Try to create a cublas handler, and report an error if failed (but we will
  // keep the program running as one might just want to run CPU code).
    LOG(INFO)<<"caffe init.";
    if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
  }
//add cusparse handler
  if (cusparseCreate(&cusparse_handle_)!=CUSPARSE_STATUS_SUCCESS){
    LOG(ERROR) << "cannot create Cusparse handle,Cusparse won't be available.";
  }
 if(cusparseCreateMatDescr(&cusparse_descr_)!=CUSPARSE_STATUS_SUCCESS){
   LOG(ERROR) << "cannot create Cusparse descr,descr won't be available.";
 }else{
  cusparseSetMatType(cusparse_descr_,CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(cusparse_descr_,CUSPARSE_INDEX_BASE_ZERO);
  LOG(INFO)<<"init descr";
 }
  // Try to create a curand handler.
  if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)
      != CURAND_STATUS_SUCCESS ||
      curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())
      != CURAND_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
  }
  LOG(INFO)<<"caffe finish";
}
Exemple #2
0
void Context::Init(int device_id) {
  device_id_ = device_id;

  SwitchDevice();

#if defined(USE_CUDA)
  if (blas_handle_ == nullptr) {
    CUBLAS_CHECK(cublasCreate((cublasHandle_t*)&blas_handle_));
    CHECK_NOTNULL(blas_handle_);
  }
#endif

#if defined(USE_CUDNN)
  if (cudnn_handle_ == nullptr) {
    CUDNN_CHECK(cudnnCreate((cudnnHandle_t*)&cudnn_handle_));
    CHECK_NOTNULL(cudnn_handle_);
  }
#endif

#if defined(USE_NNPACK)
  if (nnpack_handle_ == nullptr) {
    CHECK_EQ(nnp_initialize(), nnp_status_success);
    nnpack_handle_ = pthreadpool_create(0);
    CHECK_NOTNULL(nnpack_handle_);
  }
#endif
}
Exemple #3
0
void gpuSetUp(const int maxBlocksPerKernel, const int n) {
  debug("setting up cuBLAS");
  if (cublasCreate(&g_cublasHandle) != CUBLAS_STATUS_SUCCESS) {
    fatal("couldn't open cuBLAS handle");
  }
  cuSetUp(maxBlocksPerKernel, n);
}
Exemple #4
0
micronn* micronn_read(FILE* file)
{
    uint i, tmp;
    cublasStatus_t stat;

    micronn* net = malloc(sizeof(micronn));

    stat = cublasCreate(&net->handle);
    if(stat != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "CUBLAS initialization failed\n");
        return NULL;
    }

    fscanf(file, "micro neural network\n");
    fscanf(file, "ninputs: %d\nnoutputs: %d\nnhidden %d\n", &net->nin, &net->nout, &net->nhidden);
    fscanf(file, "chidden:");

    net->weights = malloc(sizeof(micronn_matrix*) * (net->nhidden + 1));
    net->chidden = malloc(sizeof(uint) * net->nhidden);

    for(i = 0; i < net->nhidden; i++) {
        fscanf(file, " %d", &net->chidden[i]);
    }
    fscanf(file, "\n");
    for(i = 0; i <= net->nhidden; i++) {
        fscanf(file, "weight %d:\n", &tmp);
        net->weights[i] = micronn_matrix_read(file);
    }
    return net;
};
Exemple #5
0
void blasx_resource_init(int GPUs, cublasHandle_t* handles, cudaStream_t* streams, cudaEvent_t* events, void** C_dev, int floatType_id) {
    if(floatType_id == 0) C_dev = (float**)  C_dev;
    else if(floatType_id == 1) C_dev = (double**) C_dev;
    else              C_dev = (cuDoubleComplex**) C_dev;
    int GPU_id = 0;
    for (GPU_id = 0; GPU_id < GPUs; GPU_id++) {
        assert( cudaSetDevice(GPU_id) == cudaSuccess );
        //create handles
        assert( cublasCreate(&handles[GPU_id]) == CUBLAS_STATUS_SUCCESS);
        //create streams and event
        int i = 0;
        for (i = 0 ; i < STREAMNUM; i++) {
            assert( cudaStreamCreate(&streams[i+GPU_id*STREAMNUM]) == cudaSuccess );
            assert( cudaEventCreateWithFlags(&events[i+GPU_id*STREAMNUM], cudaEventDisableTiming) == cudaSuccess );
        }
        //create C_dev
        for (i = 0; i < STREAMNUM*2; i++) {
            if (floatType_id == 0) {
                assert( cudaMalloc((void**)&C_dev[i+GPU_id*STREAMNUM*2], sizeof(float)*BLOCKSIZE_SGEMM*BLOCKSIZE_SGEMM) == cudaSuccess );
            }else if (floatType_id == 1) {
                 assert( cudaMalloc((void**)&C_dev[i+GPU_id*STREAMNUM*2], sizeof(double)*BLOCKSIZE_DGEMM*BLOCKSIZE_DGEMM) == cudaSuccess );
            } else {
                 assert( cudaMalloc((void**)&C_dev[i+GPU_id*STREAMNUM*2], sizeof(cuDoubleComplex)*BLOCKSIZE_ZGEMM*BLOCKSIZE_ZGEMM) == cudaSuccess );
            }
        }
    }
}
void gpu_cublas1(double *A, double *B, double *C, double *D, double *r, double *nrmC, int N, int N2)
{
	#pragma acc data present(A, B, C, D)
	{
		#pragma acc host_data use_device(A, B, C, D)
		{
			cublasHandle_t handle;
			cublasCreate(&handle);
			const double alpha = 1.0;
			const double beta = 0.0;
			cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N, &alpha, A, N, B, N, &beta, C, N);
			printf(" gpu gemm success \n");
			cublasDdot(handle, N2, C, 1, B, 1, r);
			printf(" gpu dot success \n");
			*r = -1.0 * *r;
			cublasDaxpy(handle, N2, r, B, 1, C, 1);
			printf(" gpu axpy success \n");
			cublasDnrm2(handle, N2, C, 1, nrmC);
			printf(" gpu nrm2 success \n");
			cublasDcopy(handle, N2, C, 1, D, 1);
			printf(" gpu copy success \n");
			*nrmC = 1.0 / *nrmC;
			cublasDscal(handle, N2, nrmC, D, 1);
			printf(" gpu scal success \n");
			cublasDestroy(handle);
			printf(" gpu destroy success \n");
		}
	}
}
Exemple #7
0
void Caffe::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().cusparse_descr_)CUSPARSE_CHECK(cusparseDestroyMatDescr(Get().cusparse_descr_));
  if (Get().cusparse_handle_)CUSPARSE_CHECK(cusparseDestroy(Get().cusparse_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUSPARSE_CHECK(cusparseCreate(&Get().cusparse_handle_));
  CUSPARSE_CHECK(cusparseCreateMatDescr(&Get().cusparse_descr_));
//  cusparseSetMatType(cusparse_descr_,CUSPARSE_MATRIX_TYPE_GENERAL);
//  cusparseSetMatIndexBase(cusparse_descr_,CUSPARSE_INDEX_BASE_ZERO);
  LOG(INFO)<<"set descr";
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
      cluster_seedgen()));
}
Exemple #8
0
void mpla_init_instance_block_rows(struct mpla_instance* instance, MPI_Comm comm)
{
        instance->comm = comm;

        // get number of process
        MPI_Comm_size(comm, &(instance->proc_count));

        // find number of current process
        MPI_Comm_rank(comm, &(instance->cur_proc_rank));
        if (instance->cur_proc_rank==0)
                instance->is_parent = true;
        else
                instance->is_parent = false;

        // compute the process grid, enforcing only a parallelization over rows
        int dims[2];
        dims[0]=instance->proc_count;
	dims[1]=1;
        MPI_Dims_create(instance->proc_count, 2, dims);
        instance->proc_rows = dims[0];
        instance->proc_cols = dims[1];

        // create cartesian communicator and retrieve cartesian coordinates
        int periods[2];
        periods[0]=periods[1]=0;
        MPI_Cart_create(comm, 2, dims, periods, 0, &(instance->comm));
        int cur_proc_coord[2];
        MPI_Cart_get(instance->comm, 2, dims, periods, cur_proc_coord);
        instance->cur_proc_row = cur_proc_coord[0];
        instance->cur_proc_col = cur_proc_coord[1];;

        cublasCreate(&(instance->cublas_handle));

}
void GPUsgemv(int gpuInner, int Md,int Nd,int Kd,float* Adevice,float *Bdevice,float *Cdevice,float *Ahost,float *Bhost,float *Chost, cudaStream_t *stream) {
  cudaError_t error;
  int memSizeA = sizeof(float)*Md*Nd;
  int memSizeB = sizeof(float)*Nd;
  int memSizeC = sizeof(float)*Md;

  error = cudaMemcpyAsync(Adevice,Ahost,memSizeA,cudaMemcpyHostToDevice,*stream); if (error != cudaSuccess){printf("cudaMemcpy A returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}
  error = cudaMemcpyAsync(Bdevice,Bhost,memSizeB,cudaMemcpyHostToDevice,*stream); if (error != cudaSuccess){printf("cudaMemcpy B returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);}

  // setup execution parameters
  dim3 threads(block_size,block_size);
  dim3 grid(Nd/threads.x,Md/threads.y);

  // inside CUBLAS
  cublasHandle_t handle;
  cublasStatus_t ret;
  ret = cublasCreate(&handle); if (ret != CUBLAS_STATUS_SUCCESS){printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);exit(EXIT_FAILURE);}
  const float alpha = 1.0f;
  const float beta  = 0.0f;
  cublasSetStream(handle,*stream);
  for (int i = 0; i < gpuInner; i++) {
    ret = cublasSgemv(handle, CUBLAS_OP_N, Md, Nd, &alpha, Adevice, Md, Bdevice, 1, &beta, Cdevice, 1);
    if (ret != CUBLAS_STATUS_SUCCESS) {
      printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
      exit(EXIT_FAILURE);
    }
  }
  // done CUBLAS

  // copy result back to host
  error = cudaMemcpyAsync(Chost,Cdevice,memSizeC,cudaMemcpyDeviceToHost,*stream);
  //  printf("GPU Iter queued\n");
}
Exemple #10
0
void contractTT(sTensorGPU *TT1, sTensorGPU *TT2, const int n, const int size)
{
	cublasHandle_t handle;
	cublasCreate(&handle);
	type result=0;

	sTensorGPU temp1 = emptyTensor(size*size,2);
	sTensorGPU temp2 = emptyTensor(size*size*2,3);
	cudaEvent_t start;
	cudaEventCreate(&start);
	cudaEvent_t stop;
	cudaEventCreate(&stop);

	//printf("Start contractTT\n");

	cudaEventRecord(start, NULL);
	int indA = TT1[0].size[0];
	int indB = TT2[0].size[0];

	sTensorCPU tt1start = copyToCPU(TT1[0]);
	sTensorCPU tt2start = copyToCPU(TT2[0]);
	sTensorCPU tt1end = copyToCPU(TT1[n - 1]);
	sTensorCPU tt2end = copyToCPU( TT2[n - 1]);


	for (int i = 0; i < indA; i++){
		TT1[0] = prepareTensorStart(tt1start, i);
		TT1[n - 1] = prepareTensorEnd(tt1end, i);
		for (int j = 0; j < indB; j++){
			TT2[0] = prepareTensorStart(tt2start, j);
			TT2[n - 1] = prepareTensorEnd(tt2end, j);
			contractTensor(handle, TT1[0], TT2[0], temp1);
			for (int i = 1; i < n; i++){
				contractTensor(handle, temp1, TT1[i], temp2);
				contractTensor(handle, temp2, TT2[i], temp1, 2);
			}
			type add = 0;
			cudaMemcpy(&add, temp1.deviceData, sizeof(type), cudaMemcpyDeviceToHost);
			//printf("%e ", add);
			result += add;
		}
	}
	cudaEventRecord(stop, NULL);
	cudaEventSynchronize(stop);
	
	float msecTotal = 0.0f;
	cudaEventElapsedTime(&msecTotal, start, stop);
	printf("Time: %.3fms\n", msecTotal);
	printf("Ops: %.0f\n", bops);
	double gigaFlops = (bops * 1.0e-9f) / (msecTotal / 1000.0f);
	printf("Perf= %.2f GFlop/s\n", gigaFlops);

	cublasDestroy(handle);
	cudaDeviceReset();

	printf("%.5e \n", result);
	exit(0);
}
cublasHandle_t CudaUtil::cublasInit()
{
	cublasHandle_t handle;
	cublasStatus_t status = cublasCreate(&handle);
	if (status != CUBLAS_STATUS_SUCCESS) {
		throw CudaException("CUBALS initialisation error");
	}
	return handle;
}
Exemple #12
0
GpuDevice::Impl::Impl(int d) : device(d) {
  ActivateDevice();
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle[i], stream[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle[i], stream[i]));
  }
}
void initCublasHandle(cublasHandle_t* handle)
{
	cublasStatus_t stat;
	stat = cublasCreate(handle);
	if (stat != CUBLAS_STATUS_SUCCESS)
	{
		printf("CUBLAS INITIALIZATION FAILED");
	}

}
Exemple #14
0
Caffe::Caffe()
  : mode_(Caffe::CPU), phase_(Caffe::TRAIN), cublas_handle_(NULL),
  curand_generator_(NULL), vsl_stream_(NULL)
{
  CUBLAS_CHECK(cublasCreate(&cublas_handle_));
  //TODO: original caffe code has bug here!
  CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, 1701ULL));
  VSL_CHECK(vslNewStream(&vsl_stream_, VSL_BRNG_MT19937, 1701));
}
Exemple #15
0
/*init cublas Handle*/
bool init(cublasHandle_t& handle)
{
	cublasStatus_t stat;
	stat = cublasCreate(&handle);
	if(stat != CUBLAS_STATUS_SUCCESS) {
		printf ("init: CUBLAS initialization failed\n");
		exit(0);
	}
	return true;
}
Exemple #16
0
cublasHandle_t blas_handle()
{
    static int init = 0;
    static cublasHandle_t handle;
    if(!init) {
        cublasCreate(&handle);
        init = 1;
    }
    return handle;
}
Exemple #17
0
cublasHandle_t blas_handle()
{
    static int init[16] = {0};
    static cublasHandle_t handle[16];
    int i = cuda_get_device();
    if(!init[i]) {
        cublasCreate(&handle[i]);
        init[i] = 1;
    }
    return handle[i];
}
void
CudaInterface::initialize() {
    mDevID = 0;
    checkCudaErrors(cudaSetDevice(mDevID));
    checkCudaErrors(cudaGetDevice(&mDevID));
    checkCudaErrors(cudaGetDeviceProperties(&mDeviceProperty, mDevID));
    checkCudaErrors(cublasCreate(&mCublasHandle));
    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", mDevID, mDeviceProperty.name, mDeviceProperty.major, mDeviceProperty.minor);

    // needs a larger block size for Fermi and above
    int block_size = (mDeviceProperty.major < 2) ? 16 : 32;
}
Exemple #19
0
void norm_gpu(double *x, double *norm, int N)
{
	#pragma acc data present(x)
	{
		#pragma acc host_data use_device(x)
		{
			cublasHandle_t h;
			cublasCreate(&h);
			cublasDnrm2(h, N, x, 1, norm);
			cublasDestroy(h);
		}
	}
}
Exemple #20
0
void dot_gpu(double *x, double *y, double *result, int N)
{
	#pragma acc data present(x, y)
	{
		#pragma acc host_data use_device(x, y)
		{
			cublasHandle_t h;
			cublasCreate(&h);
			cublasDdot(h, N, x, 1, y, 1, result);
			cublasDestroy(h);
		}
	}
}
Exemple #21
0
cublasHandle_t& getHandle()
{
	static cublasHandle_t handle = NULL;
	if(handle == NULL){
		cublasStatus_t stat;
		stat = cublasCreate(&handle);
		if(stat != CUBLAS_STATUS_SUCCESS) {
			printf ("init: CUBLAS initialization failed\n");
			exit(0);
		}
	}
	return handle;
}
Exemple #22
0
void cublas_gemm(const double *A, const double *B, double *C, int N)
{
	#pragma acc data present(A, B, C)
	{
		#pragma acc host_data use_device(A, B, C)
		{
			cublasHandle_t h;
			cublasCreate(&h);
			const double alpha = 1.0;
			const double beta = 0.0;
			cublasDgemm(h, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N, &alpha, A, N, B, N, &beta, C, N);
			cublasDestroy(h);
		}
	}
}
Exemple #23
0
void gemv_gpu(double *a, double *b, double *c1, int N)
{
	#pragma acc data present(a, b, c1)
	{
		#pragma acc host_data use_device(a, b, c1)
		{
			cublasHandle_t handle;
			cublasCreate(&handle);
			const double alpha = 1.0;
			const double beta = 0.0;
			cublasDgemv(handle, CUBLAS_OP_T, N, N, &alpha, a, N, b, 1, &beta, c1, 1);
			cublasDestroy(handle);
		}
	} // end pragma data
}
Exemple #24
0
// Note : cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n,n,n, &alpha, A, n, B, n, &beta, C, n)
// means matrix C = B * A
void cublas_gemm(int n, double *c, double *b, double *a )
{
	#pragma acc data present(a, b, c)
	{
		#pragma acc host_data use_device(a, b, c)
		{
			cublasHandle_t handle;
			cublasCreate(&handle);
			const double alpha = 1.0;
			const double beta = 0.0;
			cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n,n,n, &alpha, a, n, b, n, &beta, c, n);
			cublasDestroy(handle);
		}
	}
}
Exemple #25
0
Engine::Engine()
    : cublas_handle_(NULL), curand_generator_(NULL), random_generator_(),
    mode_(Engine::CPU) {
  // Try to create a cublas handler, and report an error if failed (but we will
  // keep the program running as one might just want to run CPU code).
  if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
  }
  // Try to create a curand handler.
  if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)
      != CURAND_STATUS_SUCCESS ||
      curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())
      != CURAND_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
  }
}
Exemple #26
0
// Solve A * x = b in GPU.
void cublas_backsolver(double *A, double *x, double *b, int N)
{
	#pragma acc data present(A, x, b)
	{
		#pragma host_data use_device(A, x, b)
		{
			cublasHandle_t h;
			cublasCreate(&h);
			cublasDcopy(h, N, b, 1, x, 1);
//			printf(" cublasDcopy success. \n");
			cublasDtrsv(h, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, N, A, N, x, 1);
//			printf(" cublasDtrsv success. \n");
			cublasDestroy(h);
		}
	}
}
cublasHandle_t Caffe::device_cublas_handle(int group) {
  std::lock_guard<std::mutex> lock(cublas_mutex_);
  vector<cublasHandle_t>& group_cublas_handles = cublas_handles_[current_device()];
  if (group + 1 > group_cublas_handles.size()) {
    group_cublas_handles.resize(group + 1);
  }
  cublasHandle_t& cublas_handle = group_cublas_handles[group];
  if (!cublas_handle) {
    // Try to create a cublas handler, and report an error if failed (but we will
    // keep the program running as one might just want to run CPU code).
    if (cublasCreate(&cublas_handle) != CUBLAS_STATUS_SUCCESS) {
      LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
    }
    CUBLAS_CHECK(cublasSetStream(cublas_handle, device_pstream(group)->get()));
  }
  return cublas_handle;
}
Exemple #28
0
void Caffe::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUDA_CHECK(cudaSetDevice(device_id));
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
      cluster_seedgen()));
}
		void cuda_running_configuration::update_parameters()
		{
	        cuda_safe_call(cudaDriverGetVersion(&driver_version));
	        cuda_safe_call(cudaRuntimeGetVersion(&runtime_version));

			int device_count;
		    cuda_safe_call(cudaGetDeviceCount(&device_count));
			if (device_count <= 0)
				throw neural_network_exception("No CUDA capable devices are found");

			if (device_id >= device_count)
				throw neural_network_exception((boost::format("Device ID %1% specified while %2% devices are available") % device_id % device_count).str());

			cudaDeviceProp device_prop;
			cuda_safe_call(cudaGetDeviceProperties(&device_prop, device_id));
			device_name = device_prop.name;
			compute_capability_major = device_prop.major;
			compute_capability_minor = device_prop.minor;
			clock_rate = device_prop.clockRate;
			memory_clock_rate = device_prop.memoryClockRate;
			memory_bus_width = device_prop.memoryBusWidth;
			global_memory_size = device_prop.totalGlobalMem;
			ecc_enabled = (device_prop.ECCEnabled != 0);
			l2_cache_size = device_prop.l2CacheSize;
			multiprocessor_count = device_prop.multiProcessorCount;
			smem_per_block = device_prop.sharedMemPerBlock;
			max_threads_per_multiprocessor = device_prop.maxThreadsPerMultiProcessor;
			max_threads_per_block = device_prop.maxThreadsPerBlock;
			for(int i = 0; i < sizeof(max_threads_dim) / sizeof(max_threads_dim[0]); ++i)
				max_threads_dim[i] = device_prop.maxThreadsDim[i];
			for(int i = 0; i < sizeof(max_grid_size) / sizeof(max_grid_size[0]); ++i)
				max_grid_size[i] = device_prop.maxGridSize[i];
			max_texture_1d_linear = device_prop.maxTexture1DLinear;
			texture_alignment = device_prop.textureAlignment;
			pci_bus_id = device_prop.pciBusID;
			pci_device_id = device_prop.pciDeviceID;
		#ifdef _WIN32
			tcc_mode = (device_prop.tccDriver != 0);
		#endif

			cuda_safe_call(cudaSetDevice(device_id));

			cublas_safe_call(cublasCreate(&cublas_handle));

			cusparse_safe_call(cusparseCreate(&cusparse_handle));
		}
// Multiply the arrays A and B on GPU and save the result in C
// C(m,n) = A(m,k) * B(k,n)
void gpu_blas_mmul(const double *A, const double *B, double *C, const int m, const int k, const int n) {
     int lda=m,ldb=k,ldc=m;
     const double alf = 1;
     const double bet = 0;
     const double *alpha = &alf;
     const double *beta = &bet;
 
     // Create a handle for CUBLAS
     cublasHandle_t handle;
     cublasCreate(&handle);
 
     // Do the actual multiplication
     cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 
     // Destroy the handle
     cublasDestroy(handle);
 }