static void strmm_cuda(void *descr[], void *args)
{
    float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
    float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
    float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
    unsigned w = STARPU_MATRIX_GET_NY(descr[0]);
    unsigned h = STARPU_MATRIX_GET_NX(descr[1]);
    unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
    unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
    unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);
    struct strmm_arg *arg = (struct strmm_arg *)args;

    cublasSideMode_t side = arg->side ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
    cublasFillMode_t uplo = arg->uplo ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
    cublasDiagType_t diag = arg->unit ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
    cublasOperation_t trans = CUBLAS_OP_T;
    const float factor = 1.0f;

    cublasSetStream(cublas_handle, starpu_cuda_get_local_stream());
    cublasStrmm(cublas_handle, side, uplo, trans, diag, w, h, &factor,
                a, lda, b, ldb, c, ldc);
    cudaStreamSynchronize(starpu_cuda_get_local_stream());
    free(arg);
}
void GPUsgemv(int gpuInner, int Md, int Nd, int Kd,
              float *Adevice, float *Bdevice, float *Cdevice,
              float *Ahost, float *Bhost, float *Chost, cudaStream_t *stream)
{
    cudaError_t error;
    int memSizeA = sizeof(float) * Md * Nd;
    int memSizeB = sizeof(float) * Nd;
    int memSizeC = sizeof(float) * Md;

    error = cudaMemcpyAsync(Adevice, Ahost, memSizeA, cudaMemcpyHostToDevice, *stream);
    if (error != cudaSuccess) {
        printf("cudaMemcpy A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }
    error = cudaMemcpyAsync(Bdevice, Bhost, memSizeB, cudaMemcpyHostToDevice, *stream);
    if (error != cudaSuccess) {
        printf("cudaMemcpy B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // setup execution parameters (not used by the cuBLAS path below)
    dim3 threads(block_size, block_size);
    dim3 grid(Nd / threads.x, Md / threads.y);

    // inside CUBLAS
    cublasHandle_t handle;
    cublasStatus_t ret;
    ret = cublasCreate(&handle);
    if (ret != CUBLAS_STATUS_SUCCESS) {
        printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
        exit(EXIT_FAILURE);
    }
    const float alpha = 1.0f;
    const float beta = 0.0f;
    cublasSetStream(handle, *stream);
    for (int i = 0; i < gpuInner; i++) {
        ret = cublasSgemv(handle, CUBLAS_OP_N, Md, Nd, &alpha, Adevice, Md,
                          Bdevice, 1, &beta, Cdevice, 1);
        if (ret != CUBLAS_STATUS_SUCCESS) {
            printf("cublasSgemv returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }
    }
    // done CUBLAS

    // copy result back to host
    error = cudaMemcpyAsync(Chost, Cdevice, memSizeC, cudaMemcpyDeviceToHost, *stream);
    // printf("GPU Iter queued\n");
}
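/*
 * Hedged usage sketch (not from the original source): one possible host driver
 * for GPUsgemv above. The pinned-host allocation, dimensions, and the final
 * cudaStreamSynchronize are illustrative assumptions; the sketch also assumes
 * the global block_size is nonzero, since GPUsgemv divides by it.
 */
void example_drive_sgemv(int Md, int Nd, int iters)
{
    float *Ahost, *Bhost, *Chost;
    float *Adevice, *Bdevice, *Cdevice;
    cudaStream_t stream;

    /* pinned host memory lets cudaMemcpyAsync actually overlap with compute */
    cudaMallocHost((void **)&Ahost, sizeof(float) * Md * Nd);
    cudaMallocHost((void **)&Bhost, sizeof(float) * Nd);
    cudaMallocHost((void **)&Chost, sizeof(float) * Md);
    cudaMalloc((void **)&Adevice, sizeof(float) * Md * Nd);
    cudaMalloc((void **)&Bdevice, sizeof(float) * Nd);
    cudaMalloc((void **)&Cdevice, sizeof(float) * Md);
    cudaStreamCreate(&stream);

    /* ... fill Ahost and Bhost here ... */

    /* Kd is not used by the gemv path above, so Nd is passed for it here */
    GPUsgemv(iters, Md, Nd, Nd, Adevice, Bdevice, Cdevice, Ahost, Bhost, Chost, &stream);

    /* everything GPUsgemv queued is asynchronous; wait before reading Chost */
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    cudaFree(Adevice); cudaFree(Bdevice); cudaFree(Cdevice);
    cudaFreeHost(Ahost); cudaFreeHost(Bhost); cudaFreeHost(Chost);
}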
void cuda_initialize()
{
    CUDA_CHECK(cudaStreamCreate(&g_context.stream));
    CUBLAS_CHECK(cublasCreate_v2(&g_context.cublas_handle));
    CUBLAS_CHECK(cublasSetStream(g_context.cublas_handle, g_context.stream));
    // CUDNN_CHECK(cudnnCreate(&g_context.cudnn_handle));
    // CUDNN_CHECK(cudnnSetStream(g_context.cudnn_handle, g_context.stream));
}
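/*
 * Hedged sketch (not from the original source): a teardown counterpart to
 * cuda_initialize above, assuming g_context holds exactly the stream and
 * cuBLAS handle created there and that CUDA_CHECK/CUBLAS_CHECK are the
 * project's error-checking macros.
 */
void cuda_release()
{
    CUBLAS_CHECK(cublasDestroy_v2(g_context.cublas_handle));
    CUDA_CHECK(cudaStreamDestroy(g_context.stream));
    // If the commented-out cuDNN lines above are enabled,
    // cudnnDestroy(g_context.cudnn_handle) would belong here as well.
}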
void gemm(bool transa, bool transb, int m, int n, int k, double alpha,
          thrust::device_ptr<const double> A, int lda,
          thrust::device_ptr<const double> B, int ldb, double beta,
          thrust::device_ptr<double> C, int ldc)
{
    const cublasOperation_t ctransa = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
    const cublasOperation_t ctransb = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
    cublasSetStream(context::get().cublasHandle, context::get().stream);
    cublasDgemm(context::get().cublasHandle, ctransa, ctransb, m, n, k,
                &alpha, A.get(), lda, B.get(), ldb, &beta, C.get(), ldc);
}
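/*
 * Hedged usage sketch (not from the original source): driving the gemm wrapper
 * above from thrust::device_vector storage. It assumes context::get() exposes
 * the cublasHandle/stream pair the wrapper uses, and that device_ptr<double>
 * converts to device_ptr<const double> as in current Thrust releases.
 */
#include <thrust/device_vector.h>

void example_gemm()
{
    const int m = 4, n = 4, k = 4;
    thrust::device_vector<double> A(m * k, 1.0);
    thrust::device_vector<double> B(k * n, 1.0);
    thrust::device_vector<double> C(m * n, 0.0);
    // column-major, no transposes: C = 1.0 * A * B + 0.0 * C, queued on the
    // context stream; that stream must be synchronized before reading C back.
    gemm(false, false, m, n, k, 1.0, A.data(), m, B.data(), k, 0.0, C.data(), m);
}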
void fully_connected_layer_updater_cuda::enqueue_backward_weights_propagation(
    cudaStream_t stream_id,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::ptr>& gradient,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
    cuda_linear_buffer_device::const_ptr output_errors_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
    cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
    unsigned int entry_count)
{
    // Update weights
    {
        cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
        float alpha = 1.0F;
        float beta = 1.0F;
        cublas_safe_call(cublasSgemm(
            cuda_config->get_cublas_handle(),
            CUBLAS_OP_N,
            CUBLAS_OP_T,
            input_elem_count_per_entry_list[0],
            output_elem_count_per_entry,
            entry_count,
            &alpha,
            *input_neurons_buffers[0],
            input_elem_count_per_entry_list[0],
            *output_errors_buffer,
            output_elem_count_per_entry,
            &beta,
            *gradient[0],
            input_elem_count_per_entry_list[0]));
    }

    // Update biases
    if (bias)
    {
        cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
        cudnn_util::set_tensor_descriptor(
            output_data_desc,
            output_configuration_specific,
            entry_count);
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnConvolutionBackwardBias(
            cuda_config->get_cudnn_handle(),
            &alpha,
            output_data_desc,
            *output_errors_buffer,
            &beta,
            bias_desc,
            *gradient[1]));
    }
}
GpuDevice::Impl::Impl(int d) : device(d)
{
    ActivateDevice();
    for (size_t i = 0; i < kParallelism; ++i) {
        CUDA_CALL(cudaStreamCreate(&stream[i]));
        CUBLAS_CALL(cublasCreate(&cublas_handle[i]));
        CUBLAS_CALL(cublasSetStream(cublas_handle[i], stream[i]));
        CUDNN_CALL(cudnnCreate(&cudnn_handle[i]));
        CUDNN_CALL(cudnnSetStream(cudnn_handle[i], stream[i]));
    }
}
void fully_connected_layer_updater_cuda::enqueue_forward_propagation(
    cudaStream_t stream_id,
    cuda_linear_buffer_device::ptr output_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    cuda_linear_buffer_device::ptr temporary_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_per_entry_buffer,
    unsigned int entry_count)
{
    {
        cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
        float alpha = 1.0F;
        float beta = 0.0F;
        cublas_safe_call(cublasSgemm(
            cuda_config->get_cublas_handle(),
            CUBLAS_OP_T,
            CUBLAS_OP_N,
            output_elem_count_per_entry,
            entry_count,
            input_elem_count_per_entry_list[0],
            &alpha,
            *data[0],
            input_elem_count_per_entry_list[0],
            *input_buffers[0],
            input_elem_count_per_entry_list[0],
            &beta,
            *output_buffer,
            output_elem_count_per_entry));
    }

    if (bias)
    {
        cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
        cudnn_util::set_tensor_descriptor(
            output_data_desc,
            output_configuration_specific,
            entry_count);
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnAddTensor(
            cuda_config->get_cudnn_handle(),
            &alpha,
            bias_desc,
            *data[1],
            &beta,
            output_data_desc,
            *output_buffer));
    }
}
cublasHandle_t Caffe::device_cublas_handle(int group) {
  std::lock_guard<std::mutex> lock(cublas_mutex_);
  vector<cublasHandle_t>& group_cublas_handles = cublas_handles_[current_device()];
  if (group + 1 > group_cublas_handles.size()) {
    group_cublas_handles.resize(group + 1);
  }
  cublasHandle_t& cublas_handle = group_cublas_handles[group];
  if (!cublas_handle) {
    // Try to create a cublas handler, and report an error if failed (but we
    // will keep the program running as one might just want to run CPU code).
    if (cublasCreate(&cublas_handle) != CUBLAS_STATUS_SUCCESS) {
      LOG(ERROR) << "Cannot create Cublas handle. Cublas won't be available.";
    }
    CUBLAS_CHECK(cublasSetStream(cublas_handle, device_pstream(group)->get()));
  }
  return cublas_handle;
}
void THCState_setBlasHandle(THCState *state, int device, int handle)
{
  /* `device` is a CUDA index */
  if (device >= state->numDevices || device < 0)
  {
    THError("%d is not a device", device + 1 /* back to Torch index */);
  }
  if (handle > state->numUserBlasHandles || handle <= 0)
  {
    THError("%d is not a valid handle, valid range is: (1, %d)",
            handle, state->numUserBlasHandles);
  }
  state->currentBlasHandle = THCState_getDeviceBlasHandle(state, device, handle);
  state->currentPerDeviceBlasHandle = handle;
  THCublasCheck(cublasSetStream(state->currentBlasHandle, state->currentStream));
}
void reshape_layer_updater_cuda::enqueue_backward_data_propagation(
    cudaStream_t stream_id,
    unsigned int input_index,
    cuda_linear_buffer_device::ptr input_errors_buffer,
    cuda_linear_buffer_device::const_ptr output_errors_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
    cuda_linear_buffer_device::const_ptr output_neurons_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
    cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
    bool add_update_to_destination,
    unsigned int entry_count)
{
    unsigned int elem_count = entry_count * output_elem_count_per_entry;
    if (add_update_to_destination)
    {
        cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
        float alpha = 1.0F;
        cublas_safe_call(cublasSaxpy(
            cuda_config->get_cublas_handle(),
            elem_count,
            &alpha,
            *output_errors_buffer,
            1,
            *input_errors_buffer,
            1));
    }
    else
    {
        if ((const float *)(*input_errors_buffer) != (const float *)(*output_errors_buffer))
        {
            cuda_util::copy_buffer(
                *cuda_config,
                *output_errors_buffer,
                *input_errors_buffer,
                output_elem_count_per_entry * entry_count,
                stream_id);
        }
    }
}
void THCState_setStream(THCState *state, int device, int stream)
{
  /* `device` is a CUDA index */
  if (device >= state->numDevices || device < 0)
  {
    THError("%d is not a device", device + 1 /* back to Torch index */);
  }
  if (stream > state->numUserStreams || stream < 0)
  {
    THError("%d is not a stream", stream);
  }
  state->currentStream = THCState_getDeviceStream(state, device, stream);
  state->currentPerDeviceStream = stream;
  THCublasCheck(cublasSetStream(state->currentBlasHandle, state->currentStream));
}
int tiramisu_cublas_sgemm(float *A, float *B, float *C,
                          uint64_t M, uint64_t N, uint64_t K,
                          float alpha, float beta,
                          uint64_t ldA, uint64_t ldB, uint64_t ldC,
                          uint64_t offsetA, uint64_t offsetB, uint64_t offsetC,
                          bool transposeA, bool transposeB)
{
    // TODO: Destroy the handle.
    static bool handle_created = false;
    static cublasHandle_t handle;
    if (!handle_created) {
        cublasCreate(&handle);
        handle_created = true;
    }

    // Default values for tight packing:
    if (ldA == 0) {
        ldA = transposeA ? M : K;
    }
    if (ldB == 0) {
        ldB = transposeB ? K : N;
    }
    if (ldC == 0) {
        ldC = N;
    }

    // The cuBLAS GEMM accepts column major buffers by default. We do a simple
    // trick here to multiply row major matrices. From a row-major perspective,
    // column-major multiplication basically transposes inputs, multiplies, and
    // transposes the output again: cublas(A, B) = ((A^T)x(B^T))^T = BxA
    // So it is actually equivalent to row-major GEMM with inputs swapped.
    // We need to reorder the size parameters as well to make it work:
    cublasSetStream(handle, cudaStreamPerThread);
    handle_cublas_error(
        cublasSgemm(handle,
                    transposeB ? CUBLAS_OP_T : CUBLAS_OP_N,
                    transposeA ? CUBLAS_OP_T : CUBLAS_OP_N,
                    N, M, K,
                    &alpha,
                    B + offsetB, ldB,
                    A + offsetA, ldA,
                    &beta,
                    C + offsetC, ldC),
        __FUNCTION__);
    return 0;
}
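/*
 * Hedged illustration of the row-major trick described in the comments above
 * (not part of the original file): for row-major A (2x3) and B (3x2), calling
 * the wrapper with swapped operands and (N, M, K) sizes yields row-major C.
 * All names in this snippet are made up for the example.
 */
static void example_row_major_gemm(void)
{
    const uint64_t M = 2, N = 2, K = 3;
    float hA[2 * 3] = {1, 2, 3,
                       4, 5, 6};       /* row-major 2x3 */
    float hB[3 * 2] = {1, 0,
                       0, 1,
                       1, 1};          /* row-major 3x2 */
    float hC[2 * 2] = {0, 0, 0, 0};
    float *dA, *dB, *dC;

    cudaMalloc((void **)&dA, sizeof(hA));
    cudaMalloc((void **)&dB, sizeof(hB));
    cudaMalloc((void **)&dC, sizeof(hC));
    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);
    cudaMemcpy(dC, hC, sizeof(hC), cudaMemcpyHostToDevice);

    /* leading dimensions of 0 request the tight-packing defaults computed above */
    tiramisu_cublas_sgemm(dA, dB, dC, M, N, K, 1.0f, 0.0f,
                          0, 0, 0, 0, 0, 0, false, false);

    /* the GEMM was queued on cudaStreamPerThread; wait before reading back */
    cudaStreamSynchronize(cudaStreamPerThread);
    cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost);
    /* expected: C = A*B = {{4, 5}, {10, 11}} in row-major order */

    cudaFree(dA); cudaFree(dB); cudaFree(dC);
}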
static void sgemm_cuda(void *descr[], void *args)
{
    float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
    float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
    float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
    unsigned w = STARPU_MATRIX_GET_NY(descr[2]);
    unsigned h = STARPU_MATRIX_GET_NX(descr[2]);
    unsigned k = STARPU_MATRIX_GET_NY(descr[0]);
    unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
    unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
    unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);
    struct sgemm_arg *arg = (struct sgemm_arg *)args;

    cublasSetStream(cublas_handle, starpu_cuda_get_local_stream());
    cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, h, w, k,
                &arg->alpha, a, lda, b, ldb, &arg->beta, c, ldc);
    cudaStreamSynchronize(starpu_cuda_get_local_stream());
    free(arg);
}
GpuDevice::GpuDevice(uint64_t device_id, DeviceListener* l, int gpu_id)
    : ThreadedDevice(device_id, l, kParallelism), device_(gpu_id) {
  CUDA_CALL(cudaSetDevice(device_));
  cudaFree(0);  // Initialize
  auto allocator = [this](size_t len) -> void* {
    void* ret;
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaMalloc(&ret, len));
    return ret;
  };
  auto deallocator = [this](void* ptr) {
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaFree(ptr));
  };
  data_store_ = new PooledDataStore(DEFAULT_POOL_SIZE, allocator, deallocator);
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream_[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle_[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle_[i], stream_[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle_[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle_[i], stream_[i]));
  }
}
void Caffe::SetSlaveDevice(const int slave_device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == slave_device_id) {
    return;
  }
  if (Get().slave_cublas_handle_)
    CUBLAS_CHECK(cublasDestroy(Get().slave_cublas_handle_));
  if (Get().slave_curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().slave_curand_generator_));
  }
  CUDA_CHECK(cudaSetDevice(slave_device_id));
  CUDA_CHECK(cudaStreamCreate(&Get().slave_cu_stream_));
  CUBLAS_CHECK(cublasCreate(&Get().slave_cublas_handle_));
  CUBLAS_CHECK(cublasSetStream(Get().slave_cublas_handle_, Get().slave_cu_stream_));
  CURAND_CHECK(curandCreateGenerator(&Get().slave_curand_generator_,
                                     CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().slave_curand_generator_,
                                                  cluster_seedgen()));
  Get().slave_device_id_ = slave_device_id;
  CUDA_CHECK(cudaSetDevice(current_device));
  Caffe::set_gpu_mode(Caffe::MASTER_SLAVE);
}
void fully_connected_layer_updater_cuda::enqueue_backward_data_propagation(
    cudaStream_t stream_id,
    unsigned int input_index,
    cuda_linear_buffer_device::ptr input_errors_buffer,
    cuda_linear_buffer_device::const_ptr output_errors_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_neurons_buffers,
    cuda_linear_buffer_device::const_ptr output_neurons_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    cuda_linear_buffer_device::const_ptr temporary_fixed_buffer,
    cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
    bool add_update_to_destination,
    unsigned int entry_count)
{
    cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
    float alpha = 1.0F;
    float beta = (add_update_to_destination ? 1.0F : 0.0F);
    cublas_safe_call(cublasSgemm(
        cuda_config->get_cublas_handle(),
        CUBLAS_OP_N,
        CUBLAS_OP_N,
        input_elem_count_per_entry_list[0],
        entry_count,
        output_elem_count_per_entry,
        &alpha,
        *data[0],
        input_elem_count_per_entry_list[0],
        *output_errors_buffer,
        output_elem_count_per_entry,
        &beta,
        *input_errors_buffer,
        input_elem_count_per_entry_list[0]));
}
/// associate all following blas commands with stream
inline void set_stream(dev_stream & stream) {
  MGPU_CUDA_BLAS_CALL(cublasSetStream(handle_, stream.get()));
}
/// associate all following blas commands with default stream
inline void reset_stream() {
  MGPU_CUDA_BLAS_CALL(cublasSetStream(handle_, NULL));
}
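/*
 * Hedged sketch of the pattern the two helpers above wrap (illustrative only,
 * not from the MGPU sources): bind a cuBLAS handle to a stream for a batch of
 * calls, then rebind it to the default (NULL) stream. The handle and stream
 * are assumed to have been created by the caller.
 */
void axpy_on_stream(cublasHandle_t handle, cudaStream_t stream,
                    int n, const float *alpha, const float *x, float *y)
{
    cublasSetStream(handle, stream);   /* following calls are queued on `stream` */
    cublasSaxpy(handle, n, alpha, x, 1, y, 1);
    cublasSetStream(handle, NULL);     /* back to the default stream */
}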
int TEMPLATE2 (CHOLMOD (gpu_triangular_solve))
(
    Int nsrow2,     /* L1 and S2 are nsrow2-by-nscol2 */
    Int nscol2,     /* L1 is nscol2-by-nscol2 */
    Int nsrow,      /* leading dimension of L1, L2, and S2 */
    Int psx,        /* L1 is at Lx+L_ENTRY*psx;
                     * L2 at Lx+L_ENTRY*(psx+nscol2) */
    double *Lx,     /* holds L1, L2, and S2 */
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrA, *devPtrB ;
    cudaError_t cudaStat ;
    cublasStatus_t cublasStatus ;
    Int gpu_lda, gpu_ldb, gpu_rowstep ;

    Int gpu_row_start = 0 ;
    Int gpu_row_max_chunk, gpu_row_chunk;
    int ibuf = 0;
    int iblock = 0;
    int iHostBuff = (Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1) %
        CHOLMOD_HOST_SUPERNODE_BUFFERS;
    int i, j;
    Int iidx;
    int iwrap;

#ifndef NTIMER
    double tstart ;
#endif

#ifdef REAL
    double alpha = 1.0 ;
    gpu_row_max_chunk = 768;
#else
    cuDoubleComplex calpha = {1.0,0.0} ;
    gpu_row_max_chunk = 256;
#endif

    if ( nsrow2 <= 0 )
    {
        return (0) ;
    }

#ifndef NTIMER
    tstart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_TRSM_CALLS++ ;
#endif

    gpu_lda = ((nscol2+31)/32)*32 ;
    gpu_ldb = ((nsrow2+31)/32)*32 ;

    devPtrA = gpu_p->d_Lx[0];
    devPtrB = gpu_p->d_Lx[1];

    /* make sure the copy of B has completed */
    cudaStreamSynchronize( Common->gpuStream[0] );

    /* ---------------------------------------------------------------------- */
    /* do the CUDA BLAS dtrsm */
    /* ---------------------------------------------------------------------- */

    while ( gpu_row_start < nsrow2 )
    {
        gpu_row_chunk = nsrow2 - gpu_row_start;
        if ( gpu_row_chunk > gpu_row_max_chunk ) {
            gpu_row_chunk = gpu_row_max_chunk;
        }

        cublasStatus = cublasSetStream ( Common->cublasHandle,
                                         Common->gpuStream[ibuf] );
        if ( cublasStatus != CUBLAS_STATUS_SUCCESS )
        {
            ERROR ( CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream");
        }

#ifdef REAL
        cublasStatus = cublasDtrsm (Common->cublasHandle,
                                    CUBLAS_SIDE_RIGHT,
                                    CUBLAS_FILL_MODE_LOWER,
                                    CUBLAS_OP_T,
                                    CUBLAS_DIAG_NON_UNIT,
                                    gpu_row_chunk,
                                    nscol2,
                                    &alpha,
                                    devPtrA,
                                    gpu_lda,
                                    devPtrB + gpu_row_start,
                                    gpu_ldb) ;
#else
        cublasStatus = cublasZtrsm (Common->cublasHandle,
                                    CUBLAS_SIDE_RIGHT,
                                    CUBLAS_FILL_MODE_LOWER,
                                    CUBLAS_OP_C,
                                    CUBLAS_DIAG_NON_UNIT,
                                    gpu_row_chunk,
                                    nscol2,
                                    &calpha,
                                    (const cuDoubleComplex *) devPtrA,
                                    gpu_lda,
                                    (cuDoubleComplex *)devPtrB + gpu_row_start,
                                    gpu_ldb) ;
#endif
        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

        /* ------------------------------------------------------------------ */
        /* copy result back to the CPU */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (
            gpu_p->h_Lx[iHostBuff] + L_ENTRY*(nscol2+gpu_row_start),
            nsrow * L_ENTRY * sizeof (Lx [0]),
            devPtrB + L_ENTRY*gpu_row_start,
            gpu_ldb * L_ENTRY * sizeof (devPtrB [0]),
            gpu_row_chunk * L_ENTRY * sizeof (devPtrB [0]),
            nscol2,
            cudaMemcpyDeviceToHost,
            Common->gpuStream[ibuf]);
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ;
        }

        cudaEventRecord ( Common->updateCBuffersFree[ibuf],
                          Common->gpuStream[ibuf] );

        gpu_row_start += gpu_row_chunk;
        ibuf++;
        ibuf = ibuf % CHOLMOD_HOST_SUPERNODE_BUFFERS;

        iblock ++;

        if ( iblock >= CHOLMOD_HOST_SUPERNODE_BUFFERS )
        {
            Int gpu_row_start2 ;
            Int gpu_row_end ;

            /* then CHOLMOD_HOST_SUPERNODE_BUFFERS worth of work has been
             * scheduled, so check for completed events and copy result into
             * Lx before continuing. */
            cudaEventSynchronize (
                Common->updateCBuffersFree[iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] );

            /* copy into Lx */
            gpu_row_start2 = nscol2 +
                (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS)*gpu_row_max_chunk;
            gpu_row_end = gpu_row_start2+gpu_row_max_chunk;
            if ( gpu_row_end > nsrow ) gpu_row_end = nsrow;

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    private(iidx) if ( nscol2 > 32 )
            for ( j=0; j<nscol2; j++ ) {
                for ( i=gpu_row_start2*L_ENTRY; i<gpu_row_end*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
                }
            }
        }
    }

    /* Convenient to copy the L1 block here */
#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    private ( iidx ) if ( nscol2 > 32 )
    for ( j=0; j<nscol2; j++ ) {
        for ( i=j*L_ENTRY; i<nscol2*L_ENTRY; i++ ) {
            iidx = j*nsrow*L_ENTRY + i;
            Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
        }
    }

    /* now account for the last HSTREAMS buffers */
    for ( iwrap=0; iwrap<CHOLMOD_HOST_SUPERNODE_BUFFERS; iwrap++ )
    {
        int i, j;
        Int gpu_row_start2 = nscol2 +
            (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS)*gpu_row_max_chunk;
        if (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS >= 0 && gpu_row_start2 < nsrow )
        {
            Int iidx;
            Int gpu_row_end = gpu_row_start2+gpu_row_max_chunk;
            if ( gpu_row_end > nsrow ) gpu_row_end = nsrow;

            cudaEventSynchronize (
                Common->updateCBuffersFree[iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] );

            /* copy into Lx */
#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    private(iidx) if ( nscol2 > 32 )
            for ( j=0; j<nscol2; j++ ) {
                for ( i=gpu_row_start2*L_ENTRY; i<gpu_row_end*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
                }
            }
        }
        iblock++;
    }

    /* ---------------------------------------------------------------------- */
    /* return */
    /* ---------------------------------------------------------------------- */

#ifndef NTIMER
    Common->CHOLMOD_GPU_TRSM_TIME += SuiteSparse_time ( ) - tstart ;
#endif

    return (1) ;
}
int TEMPLATE2 (CHOLMOD (gpu_updateC))
(
    Int ndrow1,         /* C is ndrow2-by-ndrow2 */
    Int ndrow2,
    Int ndrow,          /* leading dimension of Lx */
    Int ndcol,          /* L1 is ndrow1-by-ndcol */
    Int nsrow,
    Int pdx1,           /* L1 starts at Lx + L_ENTRY*pdx1 */
                        /* L2 starts at Lx + L_ENTRY*(pdx1 + ndrow1) */
    Int pdi1,
    double *Lx,
    double *C,
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrLx, *devPtrC ;
    double alpha, beta ;
    cublasStatus_t cublasStatus ;
    cudaError_t cudaStat [2] ;
    Int ndrow3 ;
    int icol, irow;
    int iHostBuff, iDevBuff ;

#ifndef NTIMER
    double tstart = 0;
#endif

    if ((ndrow2*L_ENTRY < CHOLMOD_ND_ROW_LIMIT) ||
        (ndcol*L_ENTRY < CHOLMOD_ND_COL_LIMIT))
    {
        /* too small for the CUDA BLAS; use the CPU instead */
        return (0) ;
    }

    ndrow3 = ndrow2 - ndrow1 ;

#ifndef NTIMER
    Common->syrkStart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_SYRK_CALLS++ ;
#endif

    /* ---------------------------------------------------------------------- */
    /* allocate workspace on the GPU */
    /* ---------------------------------------------------------------------- */

    iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
    iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

    /* cycle the device Lx buffer, d_Lx, through CHOLMOD_DEVICE_STREAMS,
       usually 2, so we can overlap the copy of this descendant supernode
       with the compute of the previous descendant supernode */
    devPtrLx = (double *)(gpu_p->d_Lx[iDevBuff]);

    /* very little overlap between kernels for different descendant supernodes
       (since we enforce the supernodes must be large enough to fill the
       device) so we only need one C buffer */
    devPtrC = (double *)(gpu_p->d_C);

    /* ---------------------------------------------------------------------- */
    /* copy Lx to the GPU */
    /* ---------------------------------------------------------------------- */

    /* copy host data to pinned buffer first for better H2D bandwidth */
#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) if (ndcol > 32)
    for ( icol=0; icol<ndcol; icol++ ) {
        for ( irow=0; irow<ndrow2*L_ENTRY; irow++ ) {
            gpu_p->h_Lx[iHostBuff][icol*ndrow2*L_ENTRY+irow] =
                Lx[pdx1*L_ENTRY+icol*ndrow*L_ENTRY + irow];
        }
    }

    cudaStat[0] = cudaMemcpyAsync ( devPtrLx,
        gpu_p->h_Lx[iHostBuff],
        ndrow2*ndcol*L_ENTRY*sizeof(devPtrLx[0]),
        cudaMemcpyHostToDevice,
        Common->gpuStream[iDevBuff] );
    if ( cudaStat[0] ) {
        CHOLMOD_GPU_PRINTF ((" ERROR cudaMemcpyAsync = %d \n", cudaStat[0]));
        return (0);
    }

    /* make the current stream wait for kernels in previous streams */
    cudaStreamWaitEvent ( Common->gpuStream[iDevBuff],
                          Common->updateCKernelsComplete, 0 ) ;

    /* ---------------------------------------------------------------------- */
    /* create the relative map for this descendant supernode */
    /* ---------------------------------------------------------------------- */

    createRelativeMapOnDevice ( (Int *)(gpu_p->d_Map),
                                (Int *)(gpu_p->d_Ls),
                                (Int *)(gpu_p->d_RelativeMap),
                                pdi1, ndrow2,
                                &(Common->gpuStream[iDevBuff]) );

    /* ---------------------------------------------------------------------- */
    /* do the CUDA SYRK */
    /* ---------------------------------------------------------------------- */

    cublasStatus = cublasSetStream (Common->cublasHandle,
                                    Common->gpuStream[iDevBuff]) ;
    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ;
    }

    alpha = 1.0 ;
    beta  = 0.0 ;

#ifdef REAL
    cublasStatus = cublasDsyrk (Common->cublasHandle,
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        (int) ndrow1,
        (int) ndcol,    /* N, K: L1 is ndrow1-by-ndcol */
        &alpha,         /* ALPHA: 1 */
        devPtrLx,
        ndrow2,         /* A, LDA: L1, ndrow2 */
        &beta,          /* BETA: 0 */
        devPtrC,
        ndrow2) ;       /* C, LDC: C1 */
#else
    cublasStatus = cublasZherk (Common->cublasHandle,
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        (int) ndrow1,
        (int) ndcol,    /* N, K: L1 is ndrow1-by-ndcol */
        &alpha,         /* ALPHA: 1 */
        (const cuDoubleComplex *) devPtrLx,
        ndrow2,         /* A, LDA: L1, ndrow2 */
        &beta,          /* BETA: 0 */
        (cuDoubleComplex *) devPtrC,
        ndrow2) ;       /* C, LDC: C1 */
#endif

    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_SYRK_TIME += SuiteSparse_time() - Common->syrkStart;
#endif

    /* ---------------------------------------------------------------------- */
    /* compute remaining (ndrow2-ndrow1)-by-ndrow1 block of C, C2 = L2*L1' */
    /* ---------------------------------------------------------------------- */

#ifndef NTIMER
    Common->CHOLMOD_GPU_GEMM_CALLS++ ;
    tstart = SuiteSparse_time();
#endif

    if (ndrow3 > 0)
    {
#ifndef REAL
        cuDoubleComplex calpha = {1.0,0.0} ;
        cuDoubleComplex cbeta  = {0.0,0.0} ;
#endif

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dgemm */
        /* ------------------------------------------------------------------ */

#ifdef REAL
        alpha = 1.0 ;
        beta  = 0.0 ;
        cublasStatus = cublasDgemm (Common->cublasHandle,
            CUBLAS_OP_N, CUBLAS_OP_T,
            ndrow3, ndrow1, ndcol,          /* M, N, K */
            &alpha,                         /* ALPHA: 1 */
            devPtrLx + L_ENTRY*(ndrow1),    /* A, LDA: L2 */
            ndrow2,                         /* ndrow */
            devPtrLx,                       /* B, LDB: L1 */
            ndrow2,                         /* ndrow */
            &beta,                          /* BETA: 0 */
            devPtrC + L_ENTRY*ndrow1,       /* C, LDC: C2 */
            ndrow2) ;
#else
        cublasStatus = cublasZgemm (Common->cublasHandle,
            CUBLAS_OP_N, CUBLAS_OP_C,
            ndrow3, ndrow1, ndcol,          /* M, N, K */
            &calpha,                        /* ALPHA: 1 */
            (const cuDoubleComplex*) devPtrLx + ndrow1,
            ndrow2,                         /* ndrow */
            (const cuDoubleComplex *) devPtrLx,
            ndrow2,                         /* ndrow */
            &cbeta,                         /* BETA: 0 */
            (cuDoubleComplex *)devPtrC + ndrow1,
            ndrow2) ;
#endif

        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }
    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_GEMM_TIME += SuiteSparse_time() - tstart;
#endif

    /* ------------------------------------------------------------------ */
    /* Assemble the update C on the device using the d_RelativeMap */
    /* ------------------------------------------------------------------ */

#ifdef REAL
    addUpdateOnDevice ( gpu_p->d_A[0], devPtrC,
        gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow,
        &(Common->gpuStream[iDevBuff]) );
#else
    addComplexUpdateOnDevice ( gpu_p->d_A[0], devPtrC,
        gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow,
        &(Common->gpuStream[iDevBuff]) );
#endif

    /* Record an event indicating that kernels for this descendant are complete */
    cudaEventRecord ( Common->updateCKernelsComplete,
                      Common->gpuStream[iDevBuff]);
    cudaEventRecord ( Common->updateCBuffersFree[iHostBuff],
                      Common->gpuStream[iDevBuff]);

    return (1) ;
}
void blasx_gpu_dgemm_kernel(int j,
                            int nrowa, int ncola,
                            int nrowb, int ncolb,
                            int nrowc, int ncolc,
                            int current_task, int prior_task,
                            enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
                            double* A, double* B, double* C,
                            int lda, int ldb, int ldc,
                            int x, int y, int z,
                            double** C_dev,
                            cudaStream_t *stream, cublasHandle_t *handle_p,
                            int current_stream,
                            double alpha, double beta,
                            int block_dim,
                            int switcher,
                            int* task_batch_counter,
                            LRU_t **LRUs,
                            int GPUs,
                            int *mem_cpy_counter,
                            reader_tracker *addr_track,
                            int GPU_id)
{
    int nrowa_dev, nrowb_dev, nrowc_dev;
    int ncola_dev, ncolb_dev, ncolc_dev;
    int nrow_offset_a, nrow_offset_b;
    int ncol_offset_a, ncol_offset_b;
    int i = current_task/(y+1);
    int k = current_task%(y+1);
    double *A_dev, *B_dev;
    if (TransA != CblasNoTrans) {
        margin_adjustment(nrowa,ncola,block_dim,j,i,&nrowa_dev,&ncola_dev);
    } else {
        margin_adjustment(nrowa,ncola,block_dim,i,j,&nrowa_dev,&ncola_dev);
    }
    if (TransB != CblasNoTrans) {
        margin_adjustment(nrowb,ncolb,block_dim,k,j,&nrowb_dev,&ncolb_dev);
    } else {
        margin_adjustment(nrowb,ncolb,block_dim,j,k,&nrowb_dev,&ncolb_dev);
    }
    margin_adjustment(nrowc,ncolc,block_dim,i,k,&nrowc_dev,&ncolc_dev);
    if (TransA != CblasNoTrans) {
        nrow_offset_a = j*block_dim, ncol_offset_a = i*block_dim;
    } else {
        nrow_offset_a = i*block_dim, ncol_offset_a = j*block_dim;
    }
    if (TransB != CblasNoTrans) {
        nrow_offset_b = k*block_dim, ncol_offset_b = j*block_dim;
    } else {
        nrow_offset_b = j*block_dim, ncol_offset_b = k*block_dim;
    }
    double *starting_point_A = &A[nrow_offset_a+ncol_offset_a*lda];
    double *starting_point_B = &B[nrow_offset_b+ncol_offset_b*ldb];

    // Asynchronously set the matrix tiles on the GPU
    //----------------LRU&RBT optimization----------------//
    mem_control_kernel_double(starting_point_A, &A_dev, LRUs, GPUs, GPU_id, block_dim,
                              mem_cpy_counter, addr_track, stream,
                              nrowa_dev, ncola_dev, lda);
    mem_control_kernel_double(starting_point_B, &B_dev, LRUs, GPUs, GPU_id, block_dim,
                              mem_cpy_counter, addr_track, stream,
                              nrowb_dev, ncolb_dev, ldb);
    //----------------------------------------------------//

    if (j == 0) {
        margin_adjustment(nrowc,ncolc,block_dim,i,k,&nrowc_dev,&ncolc_dev);
        int nrow_offset_c = i*block_dim;
        int ncol_offset_c = k*block_dim;
        double *starting_point_C = &C[nrow_offset_c+ncol_offset_c*ldc];
        if (beta != 0) {
            assert( cublasSetMatrixAsync(nrowc_dev, ncolc_dev, sizeof(double),
                                         starting_point_C, ldc,
                                         C_dev[switcher*STREAMNUM+current_stream],
                                         block_dim, *stream) == CUBLAS_STATUS_SUCCESS );
        }
        if (*task_batch_counter != 0) { // Set matrix back
            int i_pre = prior_task/(y+1);
            int k_pre = prior_task%(y+1);
            int nrowc_dev_pre, ncolc_dev_pre;
            margin_adjustment(nrowc,ncolc,block_dim,i_pre,k_pre,&nrowc_dev_pre,&ncolc_dev_pre);
            int nrow_offset_c_pre = i_pre*block_dim;
            int ncol_offset_c_pre = k_pre*block_dim;
            double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
            assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double),
                                         C_dev[current_stream+(1-switcher)*STREAMNUM],
                                         block_dim,
                                         starting_point_C_pre, ldc,
                                         *stream) == CUBLAS_STATUS_SUCCESS);
        }
    }
    cudaStreamSynchronize(*stream);
    assert( cublasSetStream(*handle_p, *stream) == CUBLAS_STATUS_SUCCESS );
    double beta_inner = (j==0)?(beta):(1);
    int ka = (TransA != CblasNoTrans)?(nrowa_dev):(ncola_dev);
    cublasOperation_t transa, transb;
    CBLasTransToCuBlasTrans(TransA, &transa);
    CBLasTransToCuBlasTrans(TransB, &transb);
    cublasStatus_t status = cublasDgemm(*handle_p,
                                        transa, transb,
                                        nrowc_dev, ncolc_dev, ka,
                                        &alpha,
                                        A_dev, block_dim,
                                        B_dev, block_dim,
                                        &beta_inner,
                                        C_dev[switcher*STREAMNUM+current_stream],
                                        block_dim);
    assert( status == CUBLAS_STATUS_SUCCESS );
}
int TEMPLATE2 (CHOLMOD (gpu_lower_potrf))
(
    Int nscol2,     /* S is nscol2-by-nscol2 */
    Int nsrow,      /* leading dimension of S */
    Int psx,        /* S is located at Lx + L_ENTRY*psx */
    double *Lx,     /* contains S; overwritten with Cholesky factor */
    Int *info,      /* BLAS info return value */
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrA, *devPtrB, *A ;
    double alpha, beta ;
    cudaError_t cudaStat ;
    cublasStatus_t cublasStatus ;
    Int j, nsrow2, nb, n, gpu_lda, lda, gpu_ldb ;
    int ilda, ijb, iinfo ;
#ifndef NTIMER
    double tstart ;
#endif

    if (nscol2 * L_ENTRY < CHOLMOD_POTRF_LIMIT)
    {
        /* too small for the CUDA BLAS; use the CPU instead */
        return (0) ;
    }

#ifndef NTIMER
    tstart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_POTRF_CALLS++ ;
#endif

    nsrow2 = nsrow - nscol2 ;

    /* ---------------------------------------------------------------------- */
    /* heuristic to get the block size depending on the problem size */
    /* ---------------------------------------------------------------------- */

    nb = 128 ;
    if (nscol2 > 4096) nb = 256 ;
    if (nscol2 > 8192) nb = 384 ;
    n = nscol2 ;
    gpu_lda = ((nscol2+31)/32)*32 ;
    lda = nsrow ;

    A = gpu_p->h_Lx[(Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1)%
                    CHOLMOD_HOST_SUPERNODE_BUFFERS];

    /* ---------------------------------------------------------------------- */
    /* determine the GPU leading dimension of B */
    /* ---------------------------------------------------------------------- */

    gpu_ldb = 0 ;
    if (nsrow2 > 0)
    {
        gpu_ldb = ((nsrow2+31)/32)*32 ;
    }

    /* ---------------------------------------------------------------------- */
    /* remember where device memory is, to be used by triangular solve later */
    /* ---------------------------------------------------------------------- */

    devPtrA = gpu_p->d_Lx[0];
    devPtrB = gpu_p->d_Lx[1];

    /* ---------------------------------------------------------------------- */
    /* copy A from device to device */
    /* ---------------------------------------------------------------------- */

    cudaStat = cudaMemcpy2DAsync ( devPtrA,
        gpu_lda * L_ENTRY * sizeof (devPtrA[0]),
        gpu_p->d_A[1],
        nsrow * L_ENTRY * sizeof (Lx[0]),
        nscol2 * L_ENTRY * sizeof (devPtrA[0]),
        nscol2,
        cudaMemcpyDeviceToDevice,
        Common->gpuStream[0] );
    if ( cudaStat ) {
        ERROR ( CHOLMOD_GPU_PROBLEM, "GPU memcopy device to device");
    }

    /* ---------------------------------------------------------------------- */
    /* copy B in advance, for gpu_triangular_solve */
    /* ---------------------------------------------------------------------- */

    if (nsrow2 > 0)
    {
        cudaStat = cudaMemcpy2DAsync (devPtrB,
            gpu_ldb * L_ENTRY * sizeof (devPtrB [0]),
            gpu_p->d_A[1] + L_ENTRY*nscol2,
            nsrow * L_ENTRY * sizeof (Lx [0]),
            nsrow2 * L_ENTRY * sizeof (devPtrB [0]),
            nscol2,
            cudaMemcpyDeviceToDevice,
            Common->gpuStream[0]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }
    }

    /* ------------------------------------------------------------------ */
    /* define the dpotrf stream */
    /* ------------------------------------------------------------------ */

    cublasStatus = cublasSetStream (Common->cublasHandle,
                                    Common->gpuStream [0]) ;
    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ;
    }

    /* ---------------------------------------------------------------------- */
    /* block Cholesky factorization of S */
    /* ---------------------------------------------------------------------- */

    for (j = 0 ; j < n ; j += nb)
    {
        Int jb = nb < (n-j) ? nb : (n-j) ;

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dsyrk */
        /* ------------------------------------------------------------------ */

        alpha = -1.0 ;
        beta  = 1.0 ;
#ifdef REAL
        cublasStatus = cublasDsyrk (Common->cublasHandle,
            CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j,
            &alpha, devPtrA + j, gpu_lda,
            &beta,  devPtrA + j + j*gpu_lda, gpu_lda) ;
#else
        cublasStatus = cublasZherk (Common->cublasHandle,
            CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j,
            &alpha, (cuDoubleComplex*)devPtrA + j, gpu_lda,
            &beta,  (cuDoubleComplex*)devPtrA + j + j*gpu_lda, gpu_lda) ;
#endif

        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

        /* ------------------------------------------------------------------ */

        cudaStat = cudaEventRecord (Common->cublasEventPotrf [0],
                                    Common->gpuStream [0]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
        }

        cudaStat = cudaStreamWaitEvent (Common->gpuStream [1],
                                        Common->cublasEventPotrf [0], 0) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
        }

        /* ------------------------------------------------------------------ */
        /* copy back the jb columns on two different streams */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + j*lda),
            lda * L_ENTRY * sizeof (double),
            devPtrA + L_ENTRY*(j + j*gpu_lda),
            gpu_lda * L_ENTRY * sizeof (double),
            L_ENTRY * sizeof (double)*jb,
            jb,
            cudaMemcpyDeviceToHost,
            Common->gpuStream [1]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ;
        }

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dgemm */
        /* ------------------------------------------------------------------ */

        if ((j+jb) < n)
        {
#ifdef REAL
            alpha = -1.0 ;
            beta  = 1.0 ;
            cublasStatus = cublasDgemm (Common->cublasHandle,
                CUBLAS_OP_N, CUBLAS_OP_T,
                (n-j-jb), jb, j,
                &alpha,
                devPtrA + (j+jb), gpu_lda,
                devPtrA + (j)   , gpu_lda,
                &beta,
                devPtrA + (j+jb + j*gpu_lda), gpu_lda) ;
#else
            cuDoubleComplex calpha = {-1.0,0.0} ;
            cuDoubleComplex cbeta  = { 1.0,0.0} ;
            cublasStatus = cublasZgemm (Common->cublasHandle,
                CUBLAS_OP_N, CUBLAS_OP_C,
                (n-j-jb), jb, j,
                &calpha,
                (cuDoubleComplex*)devPtrA + (j+jb), gpu_lda,
                (cuDoubleComplex*)devPtrA + (j)  , gpu_lda,
                &cbeta,
                (cuDoubleComplex*)devPtrA + (j+jb + j*gpu_lda), gpu_lda ) ;
#endif

            if (cublasStatus != CUBLAS_STATUS_SUCCESS)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
            }
        }

        cudaStat = cudaStreamSynchronize (Common->gpuStream [1]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }

        /* ------------------------------------------------------------------ */
        /* compute the Cholesky factorization of the jbxjb block on the CPU */
        /* ------------------------------------------------------------------ */

        ilda = (int) lda ;
        ijb  = jb ;
#ifdef REAL
        LAPACK_DPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ;
#else
        LAPACK_ZPOTRF ("L", &ijb, A + L_ENTRY * (j + j*lda), &ilda, &iinfo) ;
#endif
        *info = iinfo ;

        if (*info != 0)
        {
            *info = *info + j ;
            break ;
        }

        /* ------------------------------------------------------------------ */
        /* copy the result back to the GPU */
        /* ------------------------------------------------------------------ */

        cudaStat = cudaMemcpy2DAsync (devPtrA + L_ENTRY*(j + j*gpu_lda),
            gpu_lda * L_ENTRY * sizeof (double),
            A + L_ENTRY * (j + j*lda),
            lda * L_ENTRY * sizeof (double),
            L_ENTRY * sizeof (double) * jb,
            jb,
            cudaMemcpyHostToDevice,
            Common->gpuStream [0]) ;
        if (cudaStat)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
        }

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dtrsm */
        /* ------------------------------------------------------------------ */

        if ((j+jb) < n)
        {
#ifdef REAL
            alpha = 1.0 ;
            cublasStatus = cublasDtrsm (Common->cublasHandle,
                CUBLAS_SIDE_RIGHT,
                CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_T,
                CUBLAS_DIAG_NON_UNIT,
                (n-j-jb), jb,
                &alpha,
                devPtrA + (j + j*gpu_lda), gpu_lda,
                devPtrA + (j+jb + j*gpu_lda), gpu_lda) ;
#else
            cuDoubleComplex calpha = {1.0,0.0};
            cublasStatus = cublasZtrsm (Common->cublasHandle,
                CUBLAS_SIDE_RIGHT,
                CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_C,
                CUBLAS_DIAG_NON_UNIT,
                (n-j-jb), jb,
                &calpha,
                (cuDoubleComplex *)devPtrA + (j + j*gpu_lda), gpu_lda,
                (cuDoubleComplex *)devPtrA + (j+jb + j*gpu_lda), gpu_lda) ;
#endif

            if (cublasStatus != CUBLAS_STATUS_SUCCESS)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
            }

            /* -------------------------------------------------------------- */
            /* Copy factored column back to host. */
            /* -------------------------------------------------------------- */

            cudaStat = cudaEventRecord (Common->cublasEventPotrf[2],
                                        Common->gpuStream[0]) ;
            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
            }

            cudaStat = cudaStreamWaitEvent (Common->gpuStream[1],
                                            Common->cublasEventPotrf[2], 0) ;
            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ;
            }

            cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + jb + j * lda),
                lda * L_ENTRY * sizeof (double),
                devPtrA + L_ENTRY* (j + jb + j * gpu_lda),
                gpu_lda * L_ENTRY * sizeof (double),
                L_ENTRY * sizeof (double)* (n - j - jb),
                jb,
                cudaMemcpyDeviceToHost,
                Common->gpuStream[1]) ;
            if (cudaStat)
            {
                ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ;
            }
        }
    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_POTRF_TIME += SuiteSparse_time ( ) - tstart ;
#endif

    return (1) ;
}
static int setup(void *c) {
  cuda_context *ctx = (cuda_context *)c;
  blas_handle *handle;
  const char *tmp[2];
  cublasStatus_t err;
  int e;
  int types[10];

  if (ctx->blas_handle != NULL)
    return GA_NO_ERROR;

  handle = calloc(1, sizeof(*handle));
  if (handle == NULL)
    return GA_MEMORY_ERROR;

  cuda_enter(ctx);
  err = cublasCreate(&handle->h);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    free(handle);
    return GA_BLAS_ERROR;
  }

  err = cublasSetStream(handle->h, ctx->s);
  if (err != CUBLAS_STATUS_SUCCESS) {
    e = GA_BLAS_ERROR;
    goto e1;
  }

  cublasSetPointerMode(handle->h, CUBLAS_POINTER_MODE_HOST);
  cublasSetAtomicsMode(handle->h, CUBLAS_ATOMICS_ALLOWED);

  types[0] = GA_BUFFER; types[1] = GA_SIZE; types[2] = GA_BUFFER;
  types[3] = GA_SIZE; types[4] = GA_BUFFER; types[5] = GA_SIZE;
  types[6] = GA_SIZE; types[7] = GA_SIZE; types[8] = GA_SIZE;
  e = GpuKernel_init(&handle->sgemvBH_N_a1_b1_small, &cuda_ops, ctx, 1,
                     &code_sgemvBH_N_a1_b1_small, NULL, "sgemv", 9, types,
                     0, NULL);
  if (e != GA_NO_ERROR) goto e1;
  e = GpuKernel_init(&handle->sgemvBH_T_a1_b1_small, &cuda_ops, ctx, 1,
                     &code_sgemvBH_T_a1_b1_small, NULL, "sgemv", 9, types,
                     0, NULL);
  if (e != GA_NO_ERROR) goto e2;
  tmp[0] = atomicadd_double; tmp[1] = code_dgemvBH_N_a1_b1_small;
  e = GpuKernel_init(&handle->dgemvBH_N_a1_b1_small, &cuda_ops, ctx, 2,
                     tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e3;
  tmp[0] = atomicadd_double; tmp[1] = code_dgemvBH_T_a1_b1_small;
  e = GpuKernel_init(&handle->dgemvBH_T_a1_b1_small, &cuda_ops, ctx, 2,
                     tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e4;

  types[0] = GA_BUFFER; types[1] = GA_SIZE; types[2] = GA_BUFFER;
  types[3] = GA_SIZE; types[4] = GA_FLOAT; types[5] = GA_BUFFER;
  types[6] = GA_SIZE; types[7] = GA_SIZE; types[8] = GA_SIZE;
  types[9] = GA_SIZE;
  e = GpuKernel_init(&handle->sgerBH_gen_small, &cuda_ops, ctx, 1,
                     &code_sgerBH_gen_small, NULL, "_sgerBH_gen_small", 10,
                     types, 0, NULL);
  if (e != GA_NO_ERROR) goto e5;
  types[4] = GA_DOUBLE;
  tmp[0] = atomicadd_double; tmp[1] = code_dgerBH_gen_small;
  e = GpuKernel_init(&handle->dgerBH_gen_small, &cuda_ops, ctx, 2,
                     tmp, NULL, "_dgerBH_gen_small", 10, types,
                     GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e6;

  ctx->blas_handle = handle;
  cuda_exit(ctx);

  return GA_NO_ERROR;

 e6:
  GpuKernel_clear(&handle->sgerBH_gen_small);
 e5:
  GpuKernel_clear(&handle->dgemvBH_T_a1_b1_small);
 e4:
  GpuKernel_clear(&handle->dgemvBH_N_a1_b1_small);
 e3:
  GpuKernel_clear(&handle->sgemvBH_T_a1_b1_small);
 e2:
  GpuKernel_clear(&handle->sgemvBH_N_a1_b1_small);
 e1:
  cublasDestroy(handle->h);
  cuda_exit(ctx);
  free(handle);
  return e;
}
void convolution_1x1_layer_tester_cuda::enqueue_test(
    cudaStream_t stream_id,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& schema_data,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data,
    const std::vector<const_cuda_linear_buffer_device_smart_ptr>& data_custom,
    cuda_linear_buffer_device_smart_ptr input_buffer,
    const std::vector<cuda_linear_buffer_device_smart_ptr>& additional_buffers,
    unsigned int entry_count)
{
    {
        cuda_util::transpose(
            *cuda_config,
            *input_buffer,
            *additional_buffers[1],
            input_elem_count_per_feature_map,
            input_configuration_specific.feature_map_count,
            entry_count,
            stream_id);

        cublas_safe_call(cublasSetStream(cuda_config->get_cublas_handle(), stream_id));
        float alpha = 1.0F;
        float beta = 0.0F;
        cublas_safe_call(cublasSgemm(
            cuda_config->get_cublas_handle(),
            CUBLAS_OP_T,
            CUBLAS_OP_N,
            output_configuration_specific.feature_map_count,
            entry_count * input_elem_count_per_feature_map,
            input_configuration_specific.feature_map_count,
            &alpha,
            *data[0],
            input_configuration_specific.feature_map_count,
            *additional_buffers[1],
            input_configuration_specific.feature_map_count,
            &beta,
            *additional_buffers[2],
            output_configuration_specific.feature_map_count));

        cuda_util::transpose(
            *cuda_config,
            *additional_buffers[2],
            *additional_buffers[0],
            output_configuration_specific.feature_map_count,
            output_elem_count_per_feature_map,
            entry_count,
            stream_id);
    }

    // Add bias
    {
        cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
        cudnn_safe_call(cudnnSetTensor4dDescriptor(
            output_data_desc,
            CUDNN_TENSOR_NCHW,
            CUDNN_DATA_FLOAT,
            entry_count,
            output_configuration_specific.feature_map_count,
            1,
            output_elem_count_per_feature_map));
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnAddTensor(
            cuda_config->get_cudnn_handle(),
            CUDNN_ADD_SAME_C,
            &alpha,
            bias_desc,
            *data[1],
            &beta,
            output_data_desc,
            *additional_buffers[0]));
    }
}