Example #1
static void strmm_cuda(void *descr[], void *args) {
  float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
  float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
  float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

  unsigned w = STARPU_MATRIX_GET_NY(descr[0]);
  unsigned h = STARPU_MATRIX_GET_NX(descr[1]);

  unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
  unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
  unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);
  
  struct strmm_arg * arg = (struct strmm_arg *)args;

  cublasSideMode_t side = arg->side ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
  cublasFillMode_t uplo = arg->uplo ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
  cublasDiagType_t diag = arg->unit ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
  cublasOperation_t trans = CUBLAS_OP_T;

  const float factor = 1.0f;
  
  cublasSetStream(cublas_handle, starpu_cuda_get_local_stream());

  cublasStrmm(cublas_handle, side, uplo, trans, diag, w, h, &factor, a, lda, b, ldb, c, ldc);
  cudaStreamSynchronize(starpu_cuda_get_local_stream());

  free(arg);
}
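For context, a minimal sketch of how a kernel like this is typically hooked up as a StarPU codelet and submitted. This wiring is hypothetical (not from the original project); it assumes the struct strmm_arg fields used above (side, uplo, unit) and heap-allocates the argument block because strmm_cuda frees it.

#include <starpu.h>
#include <stdlib.h>

/* Hypothetical codelet wiring for strmm_cuda above. */
static struct starpu_codelet strmm_cl = {
    .where = STARPU_CUDA,
    .cuda_funcs = { strmm_cuda },
    .nbuffers = 3,
    .modes = { STARPU_R, STARPU_R, STARPU_RW },
};

static void submit_strmm(starpu_data_handle_t a, starpu_data_handle_t b, starpu_data_handle_t c) {
    /* Must be malloc'ed: strmm_cuda calls free(arg) when it finishes. */
    struct strmm_arg *arg = malloc(sizeof(*arg));
    arg->side = 0; arg->uplo = 0; arg->unit = 0;
    struct starpu_task *task = starpu_task_create();
    task->cl = &strmm_cl;
    task->handles[0] = a;
    task->handles[1] = b;
    task->handles[2] = c;
    task->cl_arg = arg;
    task->cl_arg_size = sizeof(*arg);
    starpu_task_submit(task);
}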
Example #2
void GPUsgemv(int gpuInner, int Md,int Nd,int Kd,float* Adevice,float *Bdevice,float *Cdevice,float *Ahost,float *Bhost,float *Chost, cudaStream_t *stream) {
  cudaError_t error;
  size_t memSizeA = sizeof(float)*Md*Nd;
  size_t memSizeB = sizeof(float)*Nd;
  size_t memSizeC = sizeof(float)*Md;

  error = cudaMemcpyAsync(Adevice,Ahost,memSizeA,cudaMemcpyHostToDevice,*stream);
  if (error != cudaSuccess) {
    printf("cudaMemcpy A returned error code %d, line(%d)\n", error, __LINE__);
    exit(EXIT_FAILURE);
  }
  error = cudaMemcpyAsync(Bdevice,Bhost,memSizeB,cudaMemcpyHostToDevice,*stream);
  if (error != cudaSuccess) {
    printf("cudaMemcpy B returned error code %d, line(%d)\n", error, __LINE__);
    exit(EXIT_FAILURE);
  }

  // execution parameters left over from a custom-kernel path; unused by the cuBLAS call below
  dim3 threads(block_size,block_size);
  dim3 grid(Nd/threads.x,Md/threads.y);

  // inside CUBLAS
  cublasHandle_t handle;
  cublasStatus_t ret;
  ret = cublasCreate(&handle);
  if (ret != CUBLAS_STATUS_SUCCESS) {
    printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
    exit(EXIT_FAILURE);
  }
  const float alpha = 1.0f;
  const float beta  = 0.0f;
  cublasSetStream(handle,*stream);
  for (int i = 0; i < gpuInner; i++) {
    ret = cublasSgemv(handle, CUBLAS_OP_N, Md, Nd, &alpha, Adevice, Md, Bdevice, 1, &beta, Cdevice, 1);
    if (ret != CUBLAS_STATUS_SUCCESS) {
      printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
      exit(EXIT_FAILURE);
    }
  }
  // done CUBLAS

  // copy result back to host (the caller is responsible for synchronizing *stream)
  error = cudaMemcpyAsync(Chost,Cdevice,memSizeC,cudaMemcpyDeviceToHost,*stream);
  if (error != cudaSuccess) {
    printf("cudaMemcpy C returned error code %d, line(%d)\n", error, __LINE__);
    exit(EXIT_FAILURE);
  }
  cublasDestroy(handle);  // the handle is created on every call, so release it here
}
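A hypothetical host-side caller for the routine above (not from the original source), assuming block_size divides Md and Nd, and using pinned host memory so the asynchronous copies can actually overlap:

void run_sgemv(int Md, int Nd, int Kd, int iters) {
  float *Ahost, *Bhost, *Chost, *Adevice, *Bdevice, *Cdevice;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // Pinned host buffers: required for truly asynchronous cudaMemcpyAsync.
  cudaMallocHost((void **)&Ahost, sizeof(float) * Md * Nd);
  cudaMallocHost((void **)&Bhost, sizeof(float) * Nd);
  cudaMallocHost((void **)&Chost, sizeof(float) * Md);
  cudaMalloc((void **)&Adevice, sizeof(float) * Md * Nd);
  cudaMalloc((void **)&Bdevice, sizeof(float) * Nd);
  cudaMalloc((void **)&Cdevice, sizeof(float) * Md);
  // ... fill Ahost and Bhost ...
  GPUsgemv(iters, Md, Nd, Kd, Adevice, Bdevice, Cdevice, Ahost, Bhost, Chost, &stream);
  cudaStreamSynchronize(stream);  // wait for the queued device-to-host copy of C
  cudaFree(Adevice); cudaFree(Bdevice); cudaFree(Cdevice);
  cudaFreeHost(Ahost); cudaFreeHost(Bhost); cudaFreeHost(Chost);
  cudaStreamDestroy(stream);
}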
Example #3
void cuda_initialize() {
    CUDA_CHECK(cudaStreamCreate(&g_context.stream));
    CUBLAS_CHECK(cublasCreate_v2(&g_context.cublas_handle));
    CUBLAS_CHECK(cublasSetStream(g_context.cublas_handle, g_context.stream));
    // CUDNN_CHECK(cudnnCreate(&g_context.cudnn_handle));
    // CUDNN_CHECK(cudnnSetStream(g_context.cudnn_handle, g_context.stream));
}
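A minimal matching teardown sketch, assuming the same g_context fields and CHECK macros; the cuDNN line mirrors the commented-out creation calls above:

void cuda_finalize() {
    // CUDNN_CHECK(cudnnDestroy(g_context.cudnn_handle));
    CUBLAS_CHECK(cublasDestroy_v2(g_context.cublas_handle));
    CUDA_CHECK(cudaStreamDestroy(g_context.stream));
}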
Example #4
void gemm(bool transa, bool transb, int m, int n, int k, double alpha, thrust::device_ptr<const double> A, int lda,
    thrust::device_ptr<const double> B, int ldb, double beta, thrust::device_ptr<double> C, int ldc)
{
  const cublasOperation_t ctransa = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
  const cublasOperation_t ctransb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;

  cublasSetStream(context::get().cublasHandle, context::get().stream);
  cublasDgemm(context::get().cublasHandle, ctransa, ctransb, m, n, k, &alpha, A.get(), lda, B.get(), ldb, &beta, C.get(), ldc);
}
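A hypothetical call site for this wrapper, assuming context::get() is already initialized; note the column-major leading dimensions, and that thrust::device_vector::data() yields the thrust::device_ptr the wrapper expects:

// Column-major C (m x n) = A (m x k) * B (k x n).
const int m = 64, n = 32, k = 16;
thrust::device_vector<double> A(m * k, 1.0), B(k * n, 2.0), C(m * n);
gemm(false, false, m, n, k, 1.0, A.data(), m, B.data(), k, 0.0, C.data(), m);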
Example #5
		void fully_connected_layer_updater_cuda::enqueue_backward_weights_propagation(
			cudaStream_t stream_id,
			const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
			const std::vector<cuda_linear_buffer_device::ptr>& gradient,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
			const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
			cuda_linear_buffer_device::const_ptr output_errors_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
			cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
			cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
			cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
			unsigned int entry_count)
		{
			// Update weights
			{
				cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
				float alpha = 1.0F;
				float beta = 1.0F;
				cublas_safe_call(cublasSgemm(
					cuda_config->get_cublas_handle(),
					CUBLAS_OP_N,
					CUBLAS_OP_T,
					input_elem_count_per_entry_list[0],
					output_elem_count_per_entry,
					entry_count,
					&alpha,
					*input_neurons_buffers[0],
					input_elem_count_per_entry_list[0],
					*output_errors_buffer,
					output_elem_count_per_entry,
					&beta,
					*gradient[0],
					input_elem_count_per_entry_list[0]));
			}

			// Update biases
			if (bias)
			{
				cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
				cudnn_util::set_tensor_descriptor(
					output_data_desc,
					output_configuration_specific,
					entry_count);
				float alpha = 1.0F;
				float beta = 1.0F;
				cudnn_safe_call(cudnnConvolutionBackwardBias(
					cuda_config->get_cudnn_handle(),
					&alpha,
					output_data_desc,
					*output_errors_buffer,
					&beta,
					bias_desc,
					*gradient[1]));
			}
		}
Example #6
GpuDevice::Impl::Impl(int d) : device(d) {
  ActivateDevice();
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle[i], stream[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle[i], stream[i]));
  }
}
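A hypothetical matching destructor (not shown in the original), releasing the per-stream resources in reverse order of creation:

GpuDevice::Impl::~Impl() {
  ActivateDevice();
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDNN_CALL(cudnnDestroy(cudnn_handle[i]));
    CUBLAS_CALL(cublasDestroy(cublas_handle[i]));
    CUDA_CALL(cudaStreamDestroy(stream[i]));
  }
}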
Example #7
		void fully_connected_layer_updater_cuda::enqueue_forward_propagation(
			cudaStream_t stream_id,
			cuda_linear_buffer_device::ptr output_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
			const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
			const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
			cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
			cuda_linear_buffer_device::ptr temporary_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_per_entry_buffer,
			unsigned int entry_count)
		{
			{
				cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
				float alpha = 1.0F;
				float beta = 0.0F;
				cublas_safe_call(cublasSgemm(
					cuda_config->get_cublas_handle(),
					CUBLAS_OP_T,
					CUBLAS_OP_N,
					output_elem_count_per_entry,
					entry_count,
					input_elem_count_per_entry_list[0],
					&alpha,
					*data[0],
					input_elem_count_per_entry_list[0],
					*input_buffers[0],
					input_elem_count_per_entry_list[0],
					&beta,
					*output_buffer,
					output_elem_count_per_entry));
			}

			if (bias)
			{
				cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
				cudnn_util::set_tensor_descriptor(
					output_data_desc,
					output_configuration_specific,
					entry_count);
				float alpha = 1.0F;
				float beta = 1.0F;
				cudnn_safe_call(cudnnAddTensor(
					cuda_config->get_cudnn_handle(),
					&alpha,
					bias_desc,
					*data[1],
					&beta,
					output_data_desc,
					*output_buffer));
			}
		}
Example #8
cublasHandle_t Caffe::device_cublas_handle(int group) {
  std::lock_guard<std::mutex> lock(cublas_mutex_);
  vector<cublasHandle_t>& group_cublas_handles = cublas_handles_[current_device()];
  if (group + 1 > group_cublas_handles.size()) {
    group_cublas_handles.resize(group + 1);
  }
  cublasHandle_t& cublas_handle = group_cublas_handles[group];
  if (!cublas_handle) {
    // Try to create a cublas handle, and report an error if this fails (but we
    // keep the program running, as one might just want to run CPU code).
    if (cublasCreate(&cublas_handle) != CUBLAS_STATUS_SUCCESS) {
      LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
    } else {
      CUBLAS_CHECK(cublasSetStream(cublas_handle, device_pstream(group)->get()));
    }
  }
  return cublas_handle;
}
Example #9
void THCState_setBlasHandle(THCState *state, int device, int handle)
{  /* `device` is a CUDA index */
  if (device >= state->numDevices || device < 0)
  {
    THError("%d is not a device", device + 1 /* back to Torch index */);
  }

  if (handle > state->numUserBlasHandles || handle <= 0)
  {
    THError("%d is not a valid handle, valid range is: (1, %d)",
            handle, state->numUserBlasHandles);
  }
  state->currentBlasHandle =
    THCState_getDeviceBlasHandle(state, device, handle);
  state->currentPerDeviceBlasHandle = handle;
  THCublasCheck(cublasSetStream(state->currentBlasHandle, state->currentStream));
}
Example #10
		void reshape_layer_updater_cuda::enqueue_backward_data_propagation(
			cudaStream_t stream_id,
			unsigned int input_index,
			cuda_linear_buffer_device::ptr input_errors_buffer,
			cuda_linear_buffer_device::const_ptr output_errors_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
			const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
			cuda_linear_buffer_device::const_ptr output_neurons_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
			cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
			cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
			cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
			bool add_update_to_destination,
			unsigned int entry_count)
		{
			unsigned int elem_count = entry_count * output_elem_count_per_entry;
			if (add_update_to_destination)
			{
				cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
				float alpha = 1.0F;
				cublas_safe_call(cublasSaxpy(
					cuda_config->get_cublas_handle(),
					elem_count,
					&alpha,
					*output_errors_buffer,
					1,
					*input_errors_buffer,
					1));
			}
			else
			{
				if ((const float *)(*input_errors_buffer) != (const float *)(*output_errors_buffer))
				{
					cuda_util::copy_buffer(
						*cuda_config,
						*output_errors_buffer,
						*input_errors_buffer,
						output_elem_count_per_entry * entry_count,
						stream_id);
				}
			}
		}
Example #11
void THCState_setStream(THCState *state, int device, int stream)
{
  /* `device` is a CUDA index */
  if (device >= state->numDevices || device < 0)
  {
    THError("%d is not a device", device + 1 /* back to Torch index */);
  }

  if (stream > state->numUserStreams || stream < 0)
  {
    THError("%d is not a stream", stream);
  }
  state->currentStream =
    THCState_getDeviceStream(state, device, stream);
  state->currentPerDeviceStream = stream;
  THCublasCheck(cublasSetStream(state->currentBlasHandle,
                                state->currentStream));
}
Example #12
int tiramisu_cublas_sgemm(float *A, float *B, float *C,
                          uint64_t M, uint64_t N, uint64_t K,
                          float alpha, float beta,
                          uint64_t ldA, uint64_t ldB, uint64_t ldC,
                          uint64_t offsetA, uint64_t offsetB, uint64_t offsetC,
                          bool transposeA, bool transposeB)
{
    // TODO: Destroy the handle.
    static bool handle_created = false;
    static cublasHandle_t handle;
    if (!handle_created) {
        cublasCreate(&handle);
        handle_created = true;
    }
    // Default values for tight packing:
    if (ldA == 0) {
        ldA = transposeA ? M : K;
    }
    if (ldB == 0) {
        ldB = transposeB ? K : N;
    }
    if (ldC == 0) {
        ldC = N;
    }
    // The cuBLAS GEMM accepts column major buffers by default. We do a simple
    // trick here to multiply row major matrices. From a row-major perspective,
    // column-major multiplication basically transposes inputs, multiplies, and
    // transposes the output again: cublas(A, B) = ((A^T)x(B^T))^T = BxA
    // So it is actually equivalent to row-major GEMM with inputs swapped.
    // We need to reorder the size parameters as well to make it work:
    cublasSetStream(handle, cudaStreamPerThread);
    handle_cublas_error(
        cublasSgemm(handle,
                    transposeB ? CUBLAS_OP_T : CUBLAS_OP_N,
                    transposeA ? CUBLAS_OP_T : CUBLAS_OP_N,
                    N, M, K,
                    &alpha, B + offsetB, ldB, A + offsetA, ldA,
                    &beta, C + offsetC, ldC),
         __FUNCTION__);
    return 0;
}
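To see why the operand swap above works, a small self-contained check (plain CUDA + cuBLAS, no Tiramisu dependencies; all names are local to this sketch): it computes a row-major C = A x B by calling column-major cuBLAS as B x A with the M/N sizes swapped, and compares against a CPU loop.

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    const int M = 2, N = 3, K = 4;
    // Row-major A (MxK), B (KxN), C (MxN).
    float A[M*K], B[K*N], C[M*N], ref[M*N];
    for (int i = 0; i < M*K; i++) A[i] = (float)(i + 1);
    for (int i = 0; i < K*N; i++) B[i] = (float)(i - 2);
    // CPU reference in row-major order.
    for (int i = 0; i < M; i++)
        for (int j = 0; j < N; j++) {
            float s = 0.0f;
            for (int k = 0; k < K; k++) s += A[i*K + k] * B[k*N + j];
            ref[i*N + j] = s;
        }
    float *dA, *dB, *dC;
    cudaMalloc((void **)&dA, sizeof A); cudaMemcpy(dA, A, sizeof A, cudaMemcpyHostToDevice);
    cudaMalloc((void **)&dB, sizeof B); cudaMemcpy(dB, B, sizeof B, cudaMemcpyHostToDevice);
    cudaMalloc((void **)&dC, sizeof C);
    cublasHandle_t h;
    cublasCreate(&h);
    const float alpha = 1.0f, beta = 0.0f;
    // Row-major C = A*B equals column-major C^T = B^T * A^T, so pass B first,
    // A second, swap M and N, and use the row lengths as leading dimensions
    // (N for B and C, K for A).
    cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K,
                &alpha, dB, N, dA, K, &beta, dC, N);
    cudaMemcpy(C, dC, sizeof C, cudaMemcpyDeviceToHost);
    for (int i = 0; i < M*N; i++)
        if (fabsf(C[i] - ref[i]) > 1e-4f) { printf("mismatch at %d\n", i); return 1; }
    printf("row-major GEMM via operand swap: OK\n");
    cublasDestroy(h);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    return 0;
}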
Example #13
static void sgemm_cuda(void *descr[], void *args) {
    float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
    float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
    float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

    unsigned w = STARPU_MATRIX_GET_NY(descr[2]);
    unsigned h = STARPU_MATRIX_GET_NX(descr[2]);
    unsigned k = STARPU_MATRIX_GET_NY(descr[0]);

    unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
    unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
    unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);

    struct sgemm_arg * arg = (struct sgemm_arg*)args;

    cublasSetStream(cublas_handle, starpu_cuda_get_local_stream());

    cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, h, w, k, &arg->alpha, a, lda, b, ldb, &arg->beta, c, ldc);
    cudaStreamSynchronize(starpu_cuda_get_local_stream());
    free(arg);
}
Example #14
GpuDevice::GpuDevice(uint64_t device_id, DeviceListener* l, int gpu_id) : ThreadedDevice(device_id, l, kParallelism), device_(gpu_id) {
  CUDA_CALL(cudaSetDevice(device_));
  cudaFree(0);  // Initialize
  auto allocator = [this](size_t len) -> void* {
    void* ret;
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaMalloc(&ret, len));
    return ret;
  };
  auto deallocator = [this](void* ptr) {
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaFree(ptr));
  };
  data_store_ = new PooledDataStore(DEFAULT_POOL_SIZE, allocator, deallocator);
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream_[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle_[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle_[i], stream_[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle_[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle_[i], stream_[i]));
  }
}
Example #15
void Caffe::SetSlaveDevice(const int slave_device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == slave_device_id) {
    return;
  }
  if (Get().slave_cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().slave_cublas_handle_));
  if (Get().slave_curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().slave_curand_generator_));
  }
  CUDA_CHECK(cudaSetDevice(slave_device_id));
  CUDA_CHECK(cudaStreamCreate (&Get().slave_cu_stream_));
  CUBLAS_CHECK(cublasCreate(&Get().slave_cublas_handle_));
  CUBLAS_CHECK(cublasSetStream(Get().slave_cublas_handle_, Get().slave_cu_stream_));
  CURAND_CHECK(curandCreateGenerator(&Get().slave_curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().slave_curand_generator_,
      cluster_seedgen()));
  Get().slave_device_id_ = slave_device_id;
  CUDA_CHECK(cudaSetDevice(current_device));
  Caffe::set_gpu_mode(Caffe::MASTER_SLAVE);
}
Example #16
		void fully_connected_layer_updater_cuda::enqueue_backward_data_propagation(
			cudaStream_t stream_id,
			unsigned int input_index,
			cuda_linear_buffer_device::ptr input_errors_buffer,
			cuda_linear_buffer_device::const_ptr output_errors_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data,
			const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
			const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
			cuda_linear_buffer_device::const_ptr output_neurons_buffer,
			const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
			cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
			cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
			cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
			cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
			bool add_update_to_destination,
			unsigned int entry_count)
		{
			cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
			float alpha = 1.0F;
			float beta = (add_update_to_destination ? 1.0F : 0.0F);
			cublas_safe_call(cublasSgemm(
				cuda_config->get_cublas_handle(),
				CUBLAS_OP_N,
				CUBLAS_OP_N,
				input_elem_count_per_entry_list[0],
				entry_count,
				output_elem_count_per_entry,
				&alpha,
				*data[0],
				input_elem_count_per_entry_list[0],
				*output_errors_buffer,
				output_elem_count_per_entry,
				&beta,
				*input_errors_buffer,
				input_elem_count_per_entry_list[0]));
		}
Example #17
File: blas.hpp Project: SuHui/mgpu
 /// associate all following blas commands with stream
 inline void set_stream(dev_stream & stream)
 { MGPU_CUDA_BLAS_CALL(cublasSetStream(handle_, stream.get())); }
Example #18
File: blas.hpp Project: SuHui/mgpu
 /// associate all following blas commands with default stream
 inline void reset_stream()
 { MGPU_CUDA_BLAS_CALL(cublasSetStream(handle_, NULL)); }
Example #19
int TEMPLATE2 (CHOLMOD (gpu_triangular_solve))
(
    Int nsrow2,     /* L1 and S2 are nsrow2-by-nscol2 */
    Int nscol2,     /* L1 is nscol2-by-nscol2 */
    Int nsrow,      /* leading dimension of L1, L2, and S2 */
    Int psx,        /* L1 is at Lx+L_ENTRY*psx;
                     * L2 at Lx+L_ENTRY*(psx+nscol2)*/
    double *Lx,     /* holds L1, L2, and S2 */
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrA, *devPtrB ;
    cudaError_t cudaStat ;
    cublasStatus_t cublasStatus ;
    Int gpu_lda, gpu_ldb, gpu_rowstep ;

    Int gpu_row_start = 0 ;
    Int gpu_row_max_chunk, gpu_row_chunk;
    int ibuf = 0;
    int iblock = 0;
    int iHostBuff = (Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1) %
        CHOLMOD_HOST_SUPERNODE_BUFFERS;
    int i, j;
    Int iidx;
    int iwrap;

#ifndef NTIMER
    double tstart ;
#endif

#ifdef REAL
    double alpha  = 1.0 ;
    gpu_row_max_chunk = 768;
#else
    cuDoubleComplex calpha  = {1.0,0.0} ;
    gpu_row_max_chunk = 256;
#endif

    if ( nsrow2 <= 0 )
    {
        return (0) ;
    }

#ifndef NTIMER
    tstart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_TRSM_CALLS++ ;
#endif

    gpu_lda = ((nscol2+31)/32)*32 ;
    gpu_ldb = ((nsrow2+31)/32)*32 ;

    devPtrA = gpu_p->d_Lx[0];
    devPtrB = gpu_p->d_Lx[1];

    /* make sure the copy of B has completed */
    cudaStreamSynchronize( Common->gpuStream[0] );

    /* ---------------------------------------------------------------------- */
    /* do the CUDA BLAS dtrsm */
    /* ---------------------------------------------------------------------- */

    while ( gpu_row_start < nsrow2 )
    {

        gpu_row_chunk = nsrow2 - gpu_row_start;
        if ( gpu_row_chunk  > gpu_row_max_chunk ) {
            gpu_row_chunk = gpu_row_max_chunk;
        }

        cublasStatus = cublasSetStream ( Common->cublasHandle,
                                         Common->gpuStream[ibuf] );

        if ( cublasStatus != CUBLAS_STATUS_SUCCESS )
        {
            ERROR ( CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream");
        }

#ifdef REAL
        cublasStatus = cublasDtrsm (Common->cublasHandle,
                                    CUBLAS_SIDE_RIGHT,
                                    CUBLAS_FILL_MODE_LOWER,
                                    CUBLAS_OP_T,
                                    CUBLAS_DIAG_NON_UNIT,
                                    gpu_row_chunk,
                                    nscol2,
                                    &alpha,
                                    devPtrA,
                                    gpu_lda,
                                    devPtrB + gpu_row_start,
                                    gpu_ldb) ;
#else
        cublasStatus = cublasZtrsm (Common->cublasHandle,
                                    CUBLAS_SIDE_RIGHT,
                                    CUBLAS_FILL_MODE_LOWER,
                                    CUBLAS_OP_C,
                                    CUBLAS_DIAG_NON_UNIT,
                                    gpu_row_chunk,
                                    nscol2,
                                    &calpha,
                                    (const cuDoubleComplex *) devPtrA,
                                    gpu_lda,
                                    (cuDoubleComplex *)devPtrB + gpu_row_start ,
                                    gpu_ldb) ;
#endif
        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

        /* ------------------------------------------------------------------ */
        /* copy result back to the CPU */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (
            gpu_p->h_Lx[iHostBuff] +
            L_ENTRY*(nscol2+gpu_row_start),
            nsrow * L_ENTRY * sizeof (Lx [0]),
            devPtrB + L_ENTRY*gpu_row_start,
            gpu_ldb * L_ENTRY * sizeof (devPtrB [0]),
            gpu_row_chunk * L_ENTRY *
            sizeof (devPtrB [0]),
            nscol2,
            cudaMemcpyDeviceToHost,
            Common->gpuStream[ibuf]);

        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ;
        }

        cudaEventRecord ( Common->updateCBuffersFree[ibuf],
                          Common->gpuStream[ibuf] );

        gpu_row_start += gpu_row_chunk;
        ibuf++;
        ibuf = ibuf % CHOLMOD_HOST_SUPERNODE_BUFFERS;

        iblock ++;

        if ( iblock >= CHOLMOD_HOST_SUPERNODE_BUFFERS )
        {
            Int gpu_row_start2 ;
            Int gpu_row_end ;

            /* then CHOLMOD_HOST_SUPERNODE_BUFFERS worth of work has been
             *  scheduled, so check for completed events and copy result into
             *  Lx before continuing. */
            cudaEventSynchronize ( Common->updateCBuffersFree
                                   [iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] );

            /* copy into Lx */
            gpu_row_start2 = nscol2 +
                (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS)
                *gpu_row_max_chunk;
            gpu_row_end = gpu_row_start2+gpu_row_max_chunk;

            if ( gpu_row_end > nsrow ) gpu_row_end = nsrow;

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if ( nscol2 > 32 )

            for ( j=0; j<nscol2; j++ ) {
                for ( i=gpu_row_start2*L_ENTRY; i<gpu_row_end*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
                }
            }
        }
    }

    /* Convenient to copy the L1 block here */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
private ( iidx ) if ( nscol2 > 32 )

    for ( j=0; j<nscol2; j++ ) {
        for ( i=j*L_ENTRY; i<nscol2*L_ENTRY; i++ ) {
            iidx = j*nsrow*L_ENTRY + i;
            Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
        }
    }

    /* now account for the last HSTREAMS buffers */
    for ( iwrap=0; iwrap<CHOLMOD_HOST_SUPERNODE_BUFFERS; iwrap++ )
    {
        int i, j;
        Int gpu_row_start2 = nscol2 + (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS)
            *gpu_row_max_chunk;
        if (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS >= 0 &&
            gpu_row_start2 < nsrow )
        {
            Int iidx;
            Int gpu_row_end = gpu_row_start2+gpu_row_max_chunk;
            if ( gpu_row_end > nsrow ) gpu_row_end = nsrow;
            cudaEventSynchronize ( Common->updateCBuffersFree
                                   [iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] );
            /* copy into Lx */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if ( nscol2 > 32 )

            for ( j=0; j<nscol2; j++ ) {
                for ( i=gpu_row_start2*L_ENTRY; i<gpu_row_end*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
                }
            }
        }
        iblock++;
    }

    /* ---------------------------------------------------------------------- */
    /* return */
    /* ---------------------------------------------------------------------- */

#ifndef NTIMER
    Common->CHOLMOD_GPU_TRSM_TIME += SuiteSparse_time ( ) - tstart ;
#endif

    return (1) ;
}
Example #20
int TEMPLATE2 (CHOLMOD (gpu_updateC))
(
    Int ndrow1,         /* C is ndrow2-by-ndrow2 */
    Int ndrow2,
    Int ndrow,          /* leading dimension of Lx */
    Int ndcol,          /* L1 is ndrow1-by-ndcol */
    Int nsrow,
    Int pdx1,           /* L1 starts at Lx + L_ENTRY*pdx1 */
    /* L2 starts at Lx + L_ENTRY*(pdx1 + ndrow1) */
    Int pdi1,
    double *Lx,
    double *C,
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrLx, *devPtrC ;
    double alpha, beta ;
    cublasStatus_t cublasStatus ;
    cudaError_t cudaStat [2] ;
    Int ndrow3 ;
    int icol, irow;
    int iHostBuff, iDevBuff ;

#ifndef NTIMER
    double tstart = 0;
#endif

    if ((ndrow2*L_ENTRY < CHOLMOD_ND_ROW_LIMIT) ||
        (ndcol*L_ENTRY <  CHOLMOD_ND_COL_LIMIT))
    {
        /* too small for the CUDA BLAS; use the CPU instead */
        return (0) ;
    }

    ndrow3 = ndrow2 - ndrow1 ;

#ifndef NTIMER
    Common->syrkStart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_SYRK_CALLS++ ;
#endif

    /* ---------------------------------------------------------------------- */
    /* allocate workspace on the GPU */
    /* ---------------------------------------------------------------------- */

    iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
    iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

    /* cycle the device Lx buffer, d_Lx, through CHOLMOD_DEVICE_STREAMS,
       usually 2, so we can overlap the copy of this descendant supernode
       with the compute of the previous descendant supernode */
    devPtrLx = (double *)(gpu_p->d_Lx[iDevBuff]);
    /* very little overlap between kernels for different descendant supernodes
       (since we enforce the supernodes must be large enough to fill the
       device) so we only need one C buffer */
    devPtrC = (double *)(gpu_p->d_C);

    /* ---------------------------------------------------------------------- */
    /* copy Lx to the GPU */
    /* ---------------------------------------------------------------------- */

    /* copy host data to pinned buffer first for better H2D bandwidth */
#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) if (ndcol > 32)
    for ( icol=0; icol<ndcol; icol++ ) {
        for ( irow=0; irow<ndrow2*L_ENTRY; irow++ ) {
            gpu_p->h_Lx[iHostBuff][icol*ndrow2*L_ENTRY+irow] =
                Lx[pdx1*L_ENTRY+icol*ndrow*L_ENTRY + irow];
        }
    }

    cudaStat[0] = cudaMemcpyAsync ( devPtrLx,
        gpu_p->h_Lx[iHostBuff],
        ndrow2*ndcol*L_ENTRY*sizeof(devPtrLx[0]),
        cudaMemcpyHostToDevice,
        Common->gpuStream[iDevBuff] );

    if ( cudaStat[0] ) {
        CHOLMOD_GPU_PRINTF ((" ERROR cudaMemcpyAsync = %d \n", cudaStat[0]));
        return (0);
    }

    /* make the current stream wait for kernels in previous streams */
    cudaStreamWaitEvent ( Common->gpuStream[iDevBuff],
                          Common->updateCKernelsComplete, 0 ) ;

    /* ---------------------------------------------------------------------- */
    /* create the relative map for this descendant supernode */
    /* ---------------------------------------------------------------------- */

    createRelativeMapOnDevice ( (Int *)(gpu_p->d_Map),
                                (Int *)(gpu_p->d_Ls),
                                (Int *)(gpu_p->d_RelativeMap),
                                pdi1, ndrow2,
                                &(Common->gpuStream[iDevBuff]) );

    /* ---------------------------------------------------------------------- */
    /* do the CUDA SYRK */
    /* ---------------------------------------------------------------------- */

    cublasStatus = cublasSetStream (Common->cublasHandle,
                                    Common->gpuStream[iDevBuff]) ;
    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ;
    }

    alpha  = 1.0 ;
    beta   = 0.0 ;

#ifdef REAL
    cublasStatus = cublasDsyrk (Common->cublasHandle,
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        (int) ndrow1,
        (int) ndcol,    /* N, K: L1 is ndrow1-by-ndcol */
        &alpha,         /* ALPHA:  1 */
        devPtrLx,
        ndrow2,         /* A, LDA: L1, ndrow2 */
        &beta,          /* BETA:   0 */
        devPtrC,
        ndrow2) ;       /* C, LDC: C1 */
#else
    cublasStatus = cublasZherk (Common->cublasHandle,
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        (int) ndrow1,
        (int) ndcol,    /* N, K: L1 is ndrow1-by-ndcol*/
        &alpha,         /* ALPHA:  1 */
        (const cuDoubleComplex *) devPtrLx,
        ndrow2,         /* A, LDA: L1, ndrow2 */
        &beta,          /* BETA:   0 */
        (cuDoubleComplex *) devPtrC,
        ndrow2) ;       /* C, LDC: C1 */
#endif

    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_SYRK_TIME += SuiteSparse_time() - Common->syrkStart;
#endif

    /* ---------------------------------------------------------------------- */
    /* compute remaining (ndrow2-ndrow1)-by-ndrow1 block of C, C2 = L2*L1'    */
    /* ---------------------------------------------------------------------- */

#ifndef NTIMER
    Common->CHOLMOD_GPU_GEMM_CALLS++ ;
    tstart = SuiteSparse_time();
#endif

    if (ndrow3 > 0)
    {
#ifndef REAL
        cuDoubleComplex calpha  = {1.0,0.0} ;
        cuDoubleComplex cbeta   = {0.0,0.0} ;
#endif

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dgemm */
        /* ------------------------------------------------------------------ */

#ifdef REAL
        alpha  = 1.0 ;
        beta   = 0.0 ;
        cublasStatus = cublasDgemm (Common->cublasHandle,
            CUBLAS_OP_N, CUBLAS_OP_T,
            ndrow3, ndrow1, ndcol,          /* M, N, K */
            &alpha,                         /* ALPHA:  1 */
            devPtrLx + L_ENTRY*(ndrow1),    /* A, LDA: L2*/
            ndrow2,                         /* ndrow */
            devPtrLx,                       /* B, LDB: L1 */
            ndrow2,                         /* ndrow */
            &beta,                          /* BETA:   0 */
            devPtrC + L_ENTRY*ndrow1,       /* C, LDC: C2 */
            ndrow2) ;
#else
        cublasStatus = cublasZgemm (Common->cublasHandle,
            CUBLAS_OP_N, CUBLAS_OP_C,
            ndrow3, ndrow1, ndcol,          /* M, N, K */
            &calpha,                        /* ALPHA:  1 */
            (const cuDoubleComplex*) devPtrLx + ndrow1,
            ndrow2,                         /* ndrow */
            (const cuDoubleComplex *) devPtrLx,
            ndrow2,                         /* ndrow */
            &cbeta,                         /* BETA:   0 */
            (cuDoubleComplex *)devPtrC + ndrow1,
            ndrow2) ;
#endif

        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_GEMM_TIME += SuiteSparse_time() - tstart;
#endif

    /* ------------------------------------------------------------------ */
    /* Assemble the update C on the device using the d_RelativeMap */
    /* ------------------------------------------------------------------ */

#ifdef REAL
    addUpdateOnDevice ( gpu_p->d_A[0], devPtrC,
        gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow,
        &(Common->gpuStream[iDevBuff]) );
#else
    addComplexUpdateOnDevice ( gpu_p->d_A[0], devPtrC,
        gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow,
        &(Common->gpuStream[iDevBuff]) );
#endif

    /* Record an event indicating that kernels for
       this descendant are complete */
    cudaEventRecord ( Common->updateCKernelsComplete,
                      Common->gpuStream[iDevBuff]);
    cudaEventRecord ( Common->updateCBuffersFree[iHostBuff],
                      Common->gpuStream[iDevBuff]);

    return (1) ;
}
Example #21
void blasx_gpu_dgemm_kernel(int j,
                            int nrowa, int ncola,
                            int nrowb, int ncolb,
                            int nrowc, int ncolc,
                            int current_task, int prior_task,
                            enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                            double* A, double* B, double* C,
                            int lda, int ldb, int ldc,
                            int x, int y, int z,
                            double** C_dev,
                            cudaStream_t *stream, cublasHandle_t *handle_p,
                            int current_stream,
                            double alpha, double beta, int block_dim,
                            int switcher, int* task_batch_counter,
                            LRU_t **LRUs, int GPUs,
                            int *mem_cpy_counter,
                            reader_tracker *addr_track,
                            int GPU_id)
{
    int nrowa_dev, nrowb_dev, nrowc_dev;
    int ncola_dev, ncolb_dev, ncolc_dev;
    int nrow_offset_a, nrow_offset_b;
    int ncol_offset_a, ncol_offset_b;
    int i = current_task/(y+1);
    int k = current_task%(y+1);
    double *A_dev, *B_dev;
    if (TransA != CblasNoTrans) {
        margin_adjustment(nrowa,ncola,block_dim,j,i,&nrowa_dev,&ncola_dev);
    }else{
        margin_adjustment(nrowa,ncola,block_dim,i,j,&nrowa_dev,&ncola_dev);
    }
    if (TransB != CblasNoTrans) {
        margin_adjustment(nrowb,ncolb,block_dim,k,j,&nrowb_dev,&ncolb_dev);
    }else{
        margin_adjustment(nrowb,ncolb,block_dim,j,k,&nrowb_dev,&ncolb_dev);
    }
    margin_adjustment(nrowc,ncolc,block_dim,i,k,&nrowc_dev,&ncolc_dev);
    if (TransA != CblasNoTrans) {
        nrow_offset_a = j*block_dim, ncol_offset_a = i*block_dim;
    }else{
        nrow_offset_a = i*block_dim, ncol_offset_a = j*block_dim;
    }
    if (TransB != CblasNoTrans) {
        nrow_offset_b = k*block_dim, ncol_offset_b = j*block_dim;
    }else{
        nrow_offset_b = j*block_dim, ncol_offset_b = k*block_dim;
    }
    double *starting_point_A = &A[nrow_offset_a+ncol_offset_a*lda];
    double *starting_point_B = &B[nrow_offset_b+ncol_offset_b*ldb];
    // Asynchronously set the matrices on the GPU
    //----------------LRU&RBT optimization----------------//
    mem_control_kernel_double(starting_point_A, &A_dev, LRUs, GPUs, GPU_id, block_dim, mem_cpy_counter, addr_track, stream, nrowa_dev, ncola_dev, lda);
    mem_control_kernel_double(starting_point_B, &B_dev, LRUs, GPUs, GPU_id, block_dim, mem_cpy_counter, addr_track, stream, nrowb_dev, ncolb_dev, ldb);
    //----------------------------------------------------//
    
    if (j == 0) {
        margin_adjustment(nrowc,ncolc,block_dim,i,k,&nrowc_dev,&ncolc_dev);
        int nrow_offset_c = i*block_dim;
        int ncol_offset_c = k*block_dim;
        double *starting_point_C = &C[nrow_offset_c+ncol_offset_c*ldc];
        if (beta != 0) {
            assert( cublasSetMatrixAsync(nrowc_dev, ncolc_dev, sizeof(double), starting_point_C, ldc, C_dev[switcher*STREAMNUM+current_stream], block_dim, *stream) == CUBLAS_STATUS_SUCCESS );
        }
        if (*task_batch_counter != 0) {//Set matrix back
            int i_pre = prior_task/(y+1);
            int k_pre = prior_task%(y+1);
            int nrowc_dev_pre, ncolc_dev_pre;
            margin_adjustment(nrowc,ncolc,block_dim,i_pre,k_pre,&nrowc_dev_pre,&ncolc_dev_pre);
            int nrow_offset_c_pre = i_pre*block_dim;
            int ncol_offset_c_pre = k_pre*block_dim;
            double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
            assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+(1-switcher)*STREAMNUM], block_dim, starting_point_C_pre, ldc,*stream) == CUBLAS_STATUS_SUCCESS);
        }
    }
    cudaStreamSynchronize(*stream);
    assert( cublasSetStream(*handle_p, *stream) == CUBLAS_STATUS_SUCCESS );

    double beta_inner = (j==0)?(beta):(1);
    int ka = (TransA != CblasNoTrans)?(nrowa_dev):(ncola_dev);
    cublasOperation_t transa, transb;
    CBLasTransToCuBlasTrans(TransA, &transa);
    CBLasTransToCuBlasTrans(TransB, &transb);
    cublasStatus_t status = cublasDgemm(*handle_p,
                                        transa, transb,
                                        nrowc_dev, ncolc_dev, ka,
                                        &alpha,
                                        A_dev, block_dim,
                                        B_dev, block_dim,
                                        &beta_inner,
                                        C_dev[switcher*STREAMNUM+current_stream], block_dim);
    assert( status == CUBLAS_STATUS_SUCCESS );
}
Example #22
int TEMPLATE2 (CHOLMOD (gpu_lower_potrf))
(
    Int nscol2,     /* S is nscol2-by-nscol2 */
    Int nsrow,      /* leading dimension of S */
    Int psx,        /* S is located at Lx + L_ENTRY*psx */
    double *Lx,     /* contains S; overwritten with Cholesky factor */
    Int *info,      /* BLAS info return value */
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrA, *devPtrB, *A ;
    double alpha, beta ;
    cudaError_t cudaStat ;
    cublasStatus_t cublasStatus ;
    Int j, nsrow2, nb, n, gpu_lda, lda, gpu_ldb ;
    int ilda, ijb, iinfo ;
#ifndef NTIMER
    double tstart ;
#endif

    if (nscol2 * L_ENTRY < CHOLMOD_POTRF_LIMIT)
    {
        /* too small for the CUDA BLAS; use the CPU instead */
        return (0) ;
    }

#ifndef NTIMER
    tstart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_POTRF_CALLS++ ;
#endif

    nsrow2 = nsrow - nscol2 ;

    /* ---------------------------------------------------------------------- */
    /* heuristic to get the block size depending on the problem size */
    /* ---------------------------------------------------------------------- */

    nb = 128 ;
    if (nscol2 > 4096) nb = 256 ;
    if (nscol2 > 8192) nb = 384 ;
    n  = nscol2 ;
    gpu_lda = ((nscol2+31)/32)*32 ;
    lda = nsrow ;

    A = gpu_p->h_Lx[(Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1)%
                    CHOLMOD_HOST_SUPERNODE_BUFFERS];

    /* ---------------------------------------------------------------------- */
    /* determine the GPU leading dimension of B */
    /* ---------------------------------------------------------------------- */

    gpu_ldb = 0 ;
    if (nsrow2 > 0)
    {
        gpu_ldb = ((nsrow2+31)/32)*32 ;
    }

    /* ---------------------------------------------------------------------- */
    /* remember where device memory is, to be used by triangular solve later */
    /* ---------------------------------------------------------------------- */

    devPtrA = gpu_p->d_Lx[0];
    devPtrB = gpu_p->d_Lx[1];

    /* ---------------------------------------------------------------------- */
    /* copy A from device to device */
    /* ---------------------------------------------------------------------- */

    cudaStat = cudaMemcpy2DAsync ( devPtrA,
       gpu_lda * L_ENTRY * sizeof (devPtrA[0]),
       gpu_p->d_A[1],
       nsrow * L_ENTRY * sizeof (Lx[0]),
       nscol2 * L_ENTRY * sizeof (devPtrA[0]),
       nscol2,
       cudaMemcpyDeviceToDevice,
       Common->gpuStream[0] );

    if ( cudaStat ) {
        ERROR ( CHOLMOD_GPU_PROBLEM, "GPU memcopy device to device");
    }

    /* ---------------------------------------------------------------------- */
    /* copy B in advance, for gpu_triangular_solve */
    /* ---------------------------------------------------------------------- */

    if (nsrow2 > 0)
    {
        cudaStat = cudaMemcpy2DAsync (devPtrB,
            gpu_ldb * L_ENTRY * sizeof (devPtrB [0]),
            gpu_p->d_A[1] + L_ENTRY*nscol2,
            nsrow * L_ENTRY * sizeof (Lx [0]),
            nsrow2 * L_ENTRY * sizeof (devPtrB [0]),
            nscol2,
            cudaMemcpyDeviceToDevice,
            Common->gpuStream[0]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }
    }

    /* ------------------------------------------------------------------ */
    /* define the dpotrf stream */
    /* ------------------------------------------------------------------ */

    cublasStatus = cublasSetStream (Common->cublasHandle,
                                    Common->gpuStream [0]) ;
    if (cublasStatus != CUBLAS_STATUS_SUCCESS) {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ;
    }

    /* ---------------------------------------------------------------------- */
    /* block Cholesky factorization of S */
    /* ---------------------------------------------------------------------- */

    for (j = 0 ; j < n ; j += nb)
    {
        Int jb = nb < (n-j) ? nb : (n-j) ;

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dsyrk */
        /* ------------------------------------------------------------------ */

        alpha = -1.0 ;
        beta  = 1.0 ;
#ifdef REAL
        cublasStatus = cublasDsyrk (Common->cublasHandle,
            CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j,
            &alpha, devPtrA + j, gpu_lda,
            &beta,  devPtrA + j + j*gpu_lda, gpu_lda) ;

#else
        cublasStatus = cublasZherk (Common->cublasHandle,
            CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j,
            &alpha, (cuDoubleComplex*)devPtrA + j,
            gpu_lda,
            &beta,
            (cuDoubleComplex*)devPtrA + j + j*gpu_lda,
            gpu_lda) ;
#endif

        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

        /* ------------------------------------------------------------------ */

        cudaStat = cudaEventRecord (Common->cublasEventPotrf [0],
                                    Common->gpuStream [0]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
        }

        cudaStat = cudaStreamWaitEvent (Common->gpuStream [1],
                                        Common->cublasEventPotrf [0], 0) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
        }

        /* ------------------------------------------------------------------ */
        /* copy back the jb columns on two different streams */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + j*lda),
            lda * L_ENTRY * sizeof (double),
            devPtrA + L_ENTRY*(j + j*gpu_lda),
            gpu_lda * L_ENTRY * sizeof (double),
            L_ENTRY * sizeof (double)*jb,
            jb,
            cudaMemcpyDeviceToHost,
            Common->gpuStream [1]) ;

        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ;
        }

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dgemm */
        /* ------------------------------------------------------------------ */

        if ((j+jb) < n)
        {

#ifdef REAL
            alpha = -1.0 ;
            beta  = 1.0 ;
            cublasStatus = cublasDgemm (Common->cublasHandle,
                CUBLAS_OP_N, CUBLAS_OP_T,
                (n-j-jb), jb, j,
                &alpha,
                devPtrA + (j+jb), gpu_lda,
                devPtrA + (j)  , gpu_lda,
                &beta,
                devPtrA + (j+jb + j*gpu_lda), gpu_lda) ;

#else
            cuDoubleComplex calpha = {-1.0,0.0} ;
            cuDoubleComplex cbeta  = { 1.0,0.0} ;
            cublasStatus = cublasZgemm (Common->cublasHandle,
                CUBLAS_OP_N, CUBLAS_OP_C,
                (n-j-jb), jb, j,
                &calpha,
                (cuDoubleComplex*)devPtrA + (j+jb),
                gpu_lda,
                (cuDoubleComplex*)devPtrA + (j),
                gpu_lda,
                &cbeta,
                (cuDoubleComplex*)devPtrA +
                (j+jb + j*gpu_lda),
                gpu_lda ) ;
#endif

            if (cublasStatus != CUBLAS_STATUS_SUCCESS)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
            }
        }

        cudaStat = cudaStreamSynchronize (Common->gpuStream [1]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }

        /* ------------------------------------------------------------------ */
        /* compute the Cholesky factorization of the jb-by-jb block on the CPU */
        /* ------------------------------------------------------------------ */

        ilda = (int) lda ;
        ijb  = jb ;
#ifdef REAL
        LAPACK_DPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ;
#else
        LAPACK_ZPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ;
#endif
        *info = iinfo ;

        if (*info != 0)
        {
            *info = *info + j ;
            break ;
        }

        /* ------------------------------------------------------------------ */
        /* copy the result back to the GPU */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (devPtrA + L_ENTRY*(j + j*gpu_lda),
            gpu_lda * L_ENTRY * sizeof (double),
            A + L_ENTRY * (j + j*lda),
            lda * L_ENTRY * sizeof (double),
            L_ENTRY * sizeof (double) * jb,
            jb,
            cudaMemcpyHostToDevice,
            Common->gpuStream [0]) ;

        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dtrsm */
        /* ------------------------------------------------------------------ */

        if ((j+jb) < n)
        {

#ifdef REAL
            alpha  = 1.0 ;
            cublasStatus = cublasDtrsm (Common->cublasHandle,
                CUBLAS_SIDE_RIGHT,
                CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
                (n-j-jb), jb,
                &alpha,
                devPtrA + (j + j*gpu_lda), gpu_lda,
                devPtrA + (j+jb + j*gpu_lda), gpu_lda) ;
#else
            cuDoubleComplex calpha  = {1.0,0.0};
            cublasStatus = cublasZtrsm (Common->cublasHandle,
                CUBLAS_SIDE_RIGHT,
                CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT,
                (n-j-jb), jb,
                &calpha,
                (cuDoubleComplex *)devPtrA +
                (j + j*gpu_lda),
                gpu_lda,
                (cuDoubleComplex *)devPtrA +
                (j+jb + j*gpu_lda),
                gpu_lda) ;
#endif

            if (cublasStatus != CUBLAS_STATUS_SUCCESS)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
            }

            /* -------------------------------------------------------------- */
            /* Copy factored column back to host.                             */
            /* -------------------------------------------------------------- */

            cudaStat = cudaEventRecord (Common->cublasEventPotrf[2],
                                        Common->gpuStream[0]) ;
            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
            }

            cudaStat = cudaStreamWaitEvent (Common->gpuStream[1],
                                            Common->cublasEventPotrf[2], 0) ;
            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
            }

            cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + jb + j * lda),
                  lda * L_ENTRY * sizeof (double),
                  devPtrA + L_ENTRY*
                  (j + jb + j * gpu_lda),
                  gpu_lda * L_ENTRY * sizeof (double),
                  L_ENTRY * sizeof (double)*
                  (n - j - jb), jb,
                  cudaMemcpyDeviceToHost,
                  Common->gpuStream[1]) ;

            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
            }
        }
    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_POTRF_TIME += SuiteSparse_time ( ) - tstart ;
#endif

    return (1) ;
}
Example #23
static int setup(void *c) {
  cuda_context *ctx = (cuda_context *)c;
  blas_handle *handle;
  const char *tmp[2];
  cublasStatus_t err;
  int e;
  int types[10];

  if (ctx->blas_handle != NULL)
    return GA_NO_ERROR;

  handle = calloc(1, sizeof(*handle));
  if (handle == NULL)
    return GA_MEMORY_ERROR;

  cuda_enter(ctx);
  err = cublasCreate(&handle->h);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    free(handle);
    return GA_BLAS_ERROR;
  }

  err = cublasSetStream(handle->h, ctx->s);
  if (err != CUBLAS_STATUS_SUCCESS) {
    e = GA_BLAS_ERROR;
    goto e1;
  }

  cublasSetPointerMode(handle->h, CUBLAS_POINTER_MODE_HOST);
  cublasSetAtomicsMode(handle->h, CUBLAS_ATOMICS_ALLOWED);

  types[0] = GA_BUFFER;
  types[1] = GA_SIZE;
  types[2] = GA_BUFFER;
  types[3] = GA_SIZE;
  types[4] = GA_BUFFER;
  types[5] = GA_SIZE;
  types[6] = GA_SIZE;
  types[7] = GA_SIZE;
  types[8] = GA_SIZE;
  e = GpuKernel_init(&handle->sgemvBH_N_a1_b1_small, &cuda_ops, ctx, 1, &code_sgemvBH_N_a1_b1_small, NULL, "sgemv", 9, types, 0, NULL);
  if (e != GA_NO_ERROR) goto e1;
  e = GpuKernel_init(&handle->sgemvBH_T_a1_b1_small, &cuda_ops, ctx, 1, &code_sgemvBH_T_a1_b1_small, NULL, "sgemv", 9, types, 0, NULL);
  if (e != GA_NO_ERROR) goto e2;
  tmp[0] = atomicadd_double;
  tmp[1] = code_dgemvBH_N_a1_b1_small;
  e = GpuKernel_init(&handle->dgemvBH_N_a1_b1_small, &cuda_ops, ctx, 2, tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e3;
  tmp[0] = atomicadd_double;
  tmp[1] = code_dgemvBH_T_a1_b1_small;
  e = GpuKernel_init(&handle->dgemvBH_T_a1_b1_small, &cuda_ops, ctx, 2, tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e4;

  types[0] = GA_BUFFER;
  types[1] = GA_SIZE;
  types[2] = GA_BUFFER;
  types[3] = GA_SIZE;
  types[4] = GA_FLOAT;
  types[5] = GA_BUFFER;
  types[6] = GA_SIZE;
  types[7] = GA_SIZE;
  types[8] = GA_SIZE;
  types[9] = GA_SIZE;
  e = GpuKernel_init(&handle->sgerBH_gen_small, &cuda_ops, ctx, 1, &code_sgerBH_gen_small, NULL, "_sgerBH_gen_small", 10, types, 0, NULL);
  if (e != GA_NO_ERROR) goto e5;
  types[4] = GA_DOUBLE;
  tmp[0] = atomicadd_double;
  tmp[1] = code_dgerBH_gen_small;
  e = GpuKernel_init(&handle->dgerBH_gen_small, &cuda_ops, ctx, 2, tmp, NULL, "_dgerBH_gen_small", 10, types, GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e6;

  ctx->blas_handle = handle;

  cuda_exit(ctx);

  return GA_NO_ERROR;

 e6:
  GpuKernel_clear(&handle->sgerBH_gen_small);
 e5:
  GpuKernel_clear(&handle->dgemvBH_T_a1_b1_small);
 e4:
  GpuKernel_clear(&handle->dgemvBH_N_a1_b1_small);
 e3:
  GpuKernel_clear(&handle->sgemvBH_T_a1_b1_small);
 e2:
  GpuKernel_clear(&handle->sgemvBH_N_a1_b1_small);
 e1:
  cublasDestroy(handle->h);
  cuda_exit(ctx);
  free(handle);
  return e;
}
Example #24
		void convolution_1x1_layer_tester_cuda::enqueue_test(
			cudaStream_t stream_id,
			const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
			const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data,
			const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data_custom,
			cuda_linear_buffer_device_smart_ptr input_buffer,
			const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
			unsigned int entry_count)
		{
			{
				cuda_util::transpose(
					*cuda_config,
					*input_buffer,
					*additional_buffers[1],
					input_elem_count_per_feature_map,
					input_configuration_specific.feature_map_count,
					entry_count,
					stream_id);

				cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
				float alpha = 1.0F;
				float beta = 0.0F;
				cublas_safe_call(cublasSgemm(
					cuda_config->get_cublas_handle(),
					CUBLAS_OP_T,
					CUBLAS_OP_N,
					output_configuration_specific.feature_map_count,
					entry_count * input_elem_count_per_feature_map,
					input_configuration_specific.feature_map_count,
					&alpha,
					*data[0],
					input_configuration_specific.feature_map_count,
					*additional_buffers[1],
					input_configuration_specific.feature_map_count,
					&beta,
					*additional_buffers[2],
					output_configuration_specific.feature_map_count));

				cuda_util::transpose(
					*cuda_config,
					*additional_buffers[2],
					*additional_buffers[0],
					output_configuration_specific.feature_map_count,
					output_elem_count_per_feature_map,
					entry_count,
					stream_id);
			}

			// Add bias
			{
				cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
				cudnn_safe_call(cudnnSetTensor4dDescriptor(
					output_data_desc,
					CUDNN_TENSOR_NCHW,
					CUDNN_DATA_FLOAT,
					entry_count,
					output_configuration_specific.feature_map_count,
					1,
					output_elem_count_per_feature_map));

				float alpha = 1.0F;
				float beta = 1.0F;
				cudnn_safe_call(cudnnAddTensor(
					cuda_config->get_cudnn_handle(),
					CUDNN_ADD_SAME_C,
					&alpha,
					bias_desc,
					*data[1],
					&beta,
					output_data_desc,
					*additional_buffers[0]));
			}
		}