void collect_final_result_dsyrk_syr2k(int *tasks_rs, int *tasks_rs_size, int switcher,
                                      cudaStream_t *stream, double **C_dev, int block_dim,
                                      int stream_num, int x, int y, int z,
                                      int nrowc, int ncolc, int ldc, double *C,
                                      enum CBLAS_UPLO Uplo)
{
    switcher = 1 - switcher;
    int temp = 0;
    for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher]; temp++) {
        int prior_task = tasks_rs[temp + stream_num*(1-switcher)];
        int i_pre, k_pre;
        blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre,
                          &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        double *starting_point_C_pre = &C[nrow_offset_c_pre + ncol_offset_c_pre*ldc];
        cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double),
                             C_dev[current_stream + (1-switcher)*stream_num], block_dim,
                             starting_point_C_pre, ldc, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
    for (temp = 0; temp < tasks_rs_size[switcher]; temp++) { // assume 1-switcher
        int prior_task = tasks_rs[temp + stream_num*switcher];
        int i_pre, k_pre;
        blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre,
                          &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        double *starting_point_C_pre = &C[nrow_offset_c_pre + ncol_offset_c_pre*ldc];
        cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double),
                             C_dev[current_stream + switcher*stream_num], block_dim,
                             starting_point_C_pre, ldc, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
}
void collect_final_result_zgemm(int *tasks_rs, int *tasks_rs_size, int switcher,
                                cudaStream_t *stream, cuDoubleComplex **C_dev, int block_dim,
                                int stream_num, int x, int y, int z,
                                int nrowc, int ncolc, int ldc, cuDoubleComplex *C)
{
    switcher = 1 - switcher;
    int temp = 0;
    for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher]; temp++) {
        int prior_task = tasks_rs[temp + stream_num*(1-switcher)];
        int i_pre = prior_task/(y+1);
        int k_pre = prior_task%(y+1);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre,
                          &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        cuDoubleComplex *starting_point_C_pre = &C[nrow_offset_c_pre + ncol_offset_c_pre*ldc];
        assert(cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(cuDoubleComplex),
                                    C_dev[current_stream + (1-switcher)*stream_num], block_dim,
                                    starting_point_C_pre, ldc,
                                    stream[current_stream]) == CUBLAS_STATUS_SUCCESS);
        assert(cudaStreamSynchronize(stream[current_stream]) == cudaSuccess);
    }
    for (temp = 0; temp < tasks_rs_size[switcher]; temp++) {
        int prior_task = tasks_rs[temp + stream_num*switcher];
        int i_pre = prior_task/(y+1);
        int k_pre = prior_task%(y+1);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre,
                          &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        cuDoubleComplex *starting_point_C_pre = &C[nrow_offset_c_pre + ncol_offset_c_pre*ldc];
        assert(cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(cuDoubleComplex),
                                    C_dev[current_stream + switcher*stream_num], block_dim,
                                    starting_point_C_pre, ldc,
                                    stream[current_stream]) == CUBLAS_STATUS_SUCCESS);
        assert(cudaStreamSynchronize(stream[current_stream]) == cudaSuccess);
    }
}
cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
{
    cudaError_t result = cudaSuccess;
    if (abort_flag)
        return result;

    if (situation >= 0) {
        static std::map<int, tsumarray> tsum;

        double a = 0.95, b = 0.05;
        if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence

        double tsync = 0.0;
        double tsleep = 0.95 * tsum[situation].value[thr_id];
        if (cudaStreamQuery(stream) == cudaErrorNotReady) {
            usleep((useconds_t)(1e6*tsleep));
            struct timeval tv_start, tv_end;
            gettimeofday(&tv_start, NULL);
            result = cudaStreamSynchronize(stream);
            gettimeofday(&tv_end, NULL);
            tsync = 1e-6 * (tv_end.tv_usec - tv_start.tv_usec)
                  + (tv_end.tv_sec - tv_start.tv_sec);
        }
        if (tsync >= 0)
            tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep + tsync);
    }
    else
        result = cudaStreamSynchronize(stream);

    return result;
}
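/*
 * A hedged usage sketch for the helper above (my_kernel, d_out, and the
 * situation id are hypothetical, not from the original source): each call
 * site gets its own "situation" slot, so the EWMA learns how long that
 * particular kernel usually runs, sleeps through ~95% of the expected wait,
 * and only then blocks in cudaStreamSynchronize.
 */
__global__ void my_kernel(float *out) { out[threadIdx.x] = (float)threadIdx.x; }

void worker_iteration(cudaStream_t stream, float *d_out, int thr_id)
{
    const int SITUATION_MAIN_LOOP = 0; // one EWMA slot per call site (assumption)
    my_kernel<<<1, 32, 0, stream>>>(d_out);
    // Sleep-then-sync: keeps the CPU mostly idle instead of busy-waiting
    // inside the driver while the GPU finishes.
    MyStreamSynchronize(stream, SITUATION_MAIN_LOOP, thr_id);
}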
void collect_final_result_dtrsm_mode_1(int *tasks_rs, int *tasks_rs_size, int switcher,
                                       int switcher_rs, cudaStream_t *stream, double **buffer_dev,
                                       int block_dim, int stream_num, int x, int y, int z,
                                       int nrowb, int ncolb, int ldb, double *B, int *switch_tracker)
{
    int temp = 0;
    for (temp = tasks_rs_size[switcher_rs]; temp < tasks_rs_size[1-switcher_rs]; temp++) {
        // printf("retrieving B[%d, %d] @stream=%d switcher:%d\n", z, tasks_rs[temp+STREAMNUM*(1-switcher_rs)], temp, switcher);
        int row = z;
        int col = tasks_rs[temp + stream_num*(1-switcher_rs)];
        int current_stream = temp;
        int nrow_offset = row*block_dim;
        int ncol_offset = col*block_dim;
        int nrow_dev, ncol_dev;
        margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
        double *starting_point = &B[nrow_offset + ncol_offset*ldb];
        cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double),
                             buffer_dev[current_stream + switch_tracker[temp]*stream_num], block_dim,
                             starting_point, ldb, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
    for (temp = 0; temp < tasks_rs_size[switcher_rs]; temp++) { // assume 1-switcher
        // printf("retrieving B[%d, %d] @stream=%d\n", z, tasks_rs[temp+STREAMNUM*switcher_rs], temp);
        int row = z;
        int col = tasks_rs[temp + stream_num*switcher_rs];
        int current_stream = temp;
        int nrow_offset = row*block_dim;
        int ncol_offset = col*block_dim;
        int nrow_dev, ncol_dev;
        margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
        double *starting_point = &B[nrow_offset + ncol_offset*ldb];
        cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double),
                             buffer_dev[current_stream + switch_tracker[temp]*stream_num], block_dim,
                             starting_point, ldb, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
}
JNIEXPORT jdouble JNICALL Java_org_apache_spark_mllib_classification_LogisticRegressionNative_predictPoint
  (JNIEnv *env, jobject obj, jdoubleArray data, jdoubleArray weights, jdouble intercept)
{
    // The kernel is written to take multiple data sets and produce a set of results,
    // but we're going to run it as multiple parallel kernels, each producing a single
    // result instead.
    double *d_dataBuffer, *d_weightsBuffer, *d_score;
    int dataCount, dataLen, whichGPU;
    jdouble h_score, *h_dataBuffer, *h_weightsBuffer;
    cudaStream_t stream;

    // select a GPU for *this* specific dataset
    whichGPU = get_gpu();
    checkCudaErrors(cudaSetDevice(whichGPU));
    checkCudaErrors(cudaStreamCreate(&stream));

    // get a pointer to the raw input data, pinning them in memory
    dataCount = env->GetArrayLength(data);
    dataLen = dataCount*sizeof(double);
    assert(dataCount == env->GetArrayLength(weights));
    h_dataBuffer = (jdouble*) env->GetPrimitiveArrayCritical(data, 0);
    h_weightsBuffer = (jdouble*) env->GetPrimitiveArrayCritical(weights, 0);

    // copy input data to the GPU memory
    // TODO: It may be better to access host memory directly, skipping the copy. Investigate.
    checkCudaErrors(mallocBest((void**)&d_dataBuffer, dataLen));
    checkCudaErrors(mallocBest((void**)&d_weightsBuffer, dataLen));
    checkCudaErrors(cudaMemcpyAsync(d_dataBuffer, h_dataBuffer, dataLen,
                                    cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_weightsBuffer, h_weightsBuffer, dataLen,
                                    cudaMemcpyHostToDevice, stream));
    // synchronize before unpinning, and also because there is a device-device
    // transfer in predictKernelDevice
    checkCudaErrors(cudaStreamSynchronize(stream));

    // un-pin the host arrays, as we're done with them
    env->ReleasePrimitiveArrayCritical(data, h_dataBuffer, 0);
    env->ReleasePrimitiveArrayCritical(weights, h_weightsBuffer, 0);

    // allocate storage for the result
    checkCudaErrors(mallocBest((void**)&d_score, sizeof(double)));

    // run the kernel, to produce a result
    predictKernelDevice(d_dataBuffer, d_weightsBuffer, intercept, d_score, 1, dataCount, stream);
    checkCudaErrors(cudaStreamSynchronize(stream));

    // copy result back to host
    checkCudaErrors(cudaMemcpyAsync(&h_score, d_score, sizeof(double),
                                    cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    // free the GPU buffers
    checkCudaErrors(freeBest(d_dataBuffer));
    checkCudaErrors(freeBest(d_weightsBuffer));
    checkCudaErrors(freeBest(d_score));
    checkCudaErrors(cudaStreamDestroy(stream));

    return h_score;
}
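/*
 * Sketch of the zero-copy alternative raised in the TODO above (an assumption,
 * not the project's code): page-lock the JNI buffer in place and hand its
 * device alias to the kernel, skipping the explicit cudaMemcpyAsync. Requires
 * a device with canMapHostMemory; mapHostForDevice is a hypothetical helper.
 */
static double *mapHostForDevice(void *h_buf, size_t len)
{
    double *d_alias = NULL;
    // Pin the existing host allocation and expose it to the GPU.
    checkCudaErrors(cudaHostRegister(h_buf, len, cudaHostRegisterMapped));
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_alias, h_buf, 0));
    return d_alias; // caller must cudaHostUnregister(h_buf) when done
}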
static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
    float *sub11;
    sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);

    unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
    unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);

    unsigned long z;

    switch (s) {
    case 0:
        for (z = 0; z < nx; z++) {
            float pivot;
            pivot = sub11[z+z*ld];
            STARPU_ASSERT(pivot != 0.0f);

            STARPU_SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);

            STARPU_SGER(nx - z - 1, nx - z - 1, -1.0f,
                        &sub11[z+(z+1)*ld], ld,
                        &sub11[(z+1)+z*ld], 1,
                        &sub11[(z+1) + (z+1)*ld], ld);
        }
        break;
#ifdef STARPU_USE_CUDA
    case 1:
        for (z = 0; z < nx; z++) {
            float pivot;
            cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float),
                            cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
            cudaStreamSynchronize(starpu_cuda_get_local_stream());

            STARPU_ASSERT(pivot != 0.0f);

            cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);

            cublasSger(nx - z - 1, nx - z - 1, -1.0f,
                       &sub11[z+(z+1)*ld], ld,
                       &sub11[(z+1)+z*ld], 1,
                       &sub11[(z+1) + (z+1)*ld], ld);
        }
        cudaStreamSynchronize(starpu_cuda_get_local_stream());
        break;
#endif
    default:
        STARPU_ABORT();
        break;
    }
}
void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
  cudaStream_t stream;
  cudaStream_t stream2;
  if (Caffe::mode() == Caffe::GPU) {
    CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    if (untransformed_top_)
      CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));
  }
#endif

  try {
    while (!must_stop()) {
      Batch<Dtype>* batch = prefetch_free_.pop();
      Batch<Dtype>* batch_untransformed = NULL;
      if (untransformed_top_) {
        batch_untransformed = prefetch_free_untransformed_.pop();
        load_batch_and_untransformed_batch(batch, batch_untransformed);
      } else {
        load_batch(batch);
      }
#ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
        batch->data_.data().get()->async_gpu_push(stream);
        CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream));
        if (untransformed_top_) {
          batch_untransformed->data_.data().get()->async_gpu_push(stream2);
          CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream2));
        }
      }
#endif
      prefetch_full_.push(batch);
      if (untransformed_top_)
        prefetch_full_untransformed_.push(batch_untransformed);
    }
  } catch (boost::thread_interrupted&) {
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream));
    if (untransformed_top_)
      CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream2));
  }
#endif
}
static void strmm_cuda(void *descr[], void *args)
{
    float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
    float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
    float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

    unsigned w = STARPU_MATRIX_GET_NY(descr[0]);
    unsigned h = STARPU_MATRIX_GET_NX(descr[1]);

    unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
    unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
    unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);

    struct strmm_arg *arg = (struct strmm_arg *)args;
    cublasSideMode_t side = arg->side ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
    cublasFillMode_t uplo = arg->uplo ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
    cublasDiagType_t diag = arg->unit ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
    cublasOperation_t trans = CUBLAS_OP_T;
    const float factor = 1.0f;

    cublasSetStream(cublas_handle, starpu_cuda_get_local_stream());
    cublasStrmm(cublas_handle, side, uplo, trans, diag, w, h,
                &factor, a, lda, b, ldb, c, ldc);
    cudaStreamSynchronize(starpu_cuda_get_local_stream());

    free(arg);
}
/**
 * @brief This performs the exchanging of all necessary halos between 2 neighboring MPI processes
 *
 * @param[in] cartComm    The Cartesian MPI communicator
 * @param[in] domSize     The 2D size of the local domain
 * @param[in] topIndex    The 2D index of the calling MPI process in the topology
 * @param[in] neighbors   The list of ranks which are direct neighbors to the caller
 * @param[in] copyStream  The stream used to overlap top & bottom halo exchange with side halo copy to host memory
 * @param[in, out] devBlocks     The 2 device blocks that are updated during the Jacobi run
 * @param[in, out] devSideEdges  The 2 side edges (parallel to the Y direction) that hold the packed halo values before sending them
 * @param[in, out] devHaloLines  The 2 halo lines (parallel to the Y direction) that hold the packed halo values after receiving them
 * @param[in, out] hostSendLines The 2 host send buffers that are used during the halo exchange by the normal CUDA & MPI version
 * @param[in, out] hostRecvLines The 2 host receive buffers that are used during the halo exchange by the normal CUDA & MPI version
 * @return The time spent during the MPI transfers
 */
double TransferAllHalos(MPI_Comm cartComm, const int2 *domSize, const int2 *topIndex,
                        const int *neighbors, cudaStream_t copyStream,
                        real *devBlocks[2], real *devSideEdges[2], real *devHaloLines[2],
                        real *hostSendLines[2], real *hostRecvLines[2])
{
    real *devSendLines[2] = {devBlocks[0] + domSize->x + 3,
                             devBlocks[0] + domSize->y * (domSize->x + 2) + 1};
    real *devRecvLines[2] = {devBlocks[0] + 1,
                             devBlocks[0] + (domSize->y + 1) * (domSize->x + 2) + 1};
    int yNeighbors[2] = {neighbors[DIR_TOP], neighbors[DIR_BOTTOM]};
    int xNeighbors[2] = {neighbors[DIR_LEFT], neighbors[DIR_RIGHT]};
    int2 order = make_int2(topIndex->x % 2, topIndex->y % 2);
    double transferTime;

    // Populate the block's side edges
    CopyDevSideEdgesFromBlock(devBlocks[0], devSideEdges, domSize, neighbors, copyStream);

    // Exchange data with the top and bottom neighbors
    transferTime = MPI_Wtime();
    ExchangeHalos(cartComm, devSendLines[ order.y ], hostSendLines[0], hostRecvLines[0],
                  devRecvLines[ order.y ], yNeighbors[ order.y ], domSize->x);
    ExchangeHalos(cartComm, devSendLines[1 - order.y], hostSendLines[0], hostRecvLines[0],
                  devRecvLines[1 - order.y], yNeighbors[1 - order.y], domSize->x);
    SafeCudaCall(cudaStreamSynchronize(copyStream));

    // Exchange data with the left and right neighbors
    ExchangeHalos(cartComm, devSideEdges[ order.x ], hostSendLines[1], hostRecvLines[1],
                  devHaloLines[ order.x ], xNeighbors[ order.x ], domSize->y);
    ExchangeHalos(cartComm, devSideEdges[1 - order.x], hostSendLines[1], hostRecvLines[1],
                  devHaloLines[1 - order.x], xNeighbors[1 - order.x], domSize->y);
    transferTime = MPI_Wtime() - transferTime;

    // Copy the received halos to the device block
    CopyDevHalosToBlock(devBlocks[0], devHaloLines[0], devHaloLines[1], domSize, neighbors);

    return transferTime;
}
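/*
 * Hedged sketch of what an ExchangeHalos helper along these lines could look
 * like (the real implementation is not shown in this section; the MPI tag,
 * the blocking copies, and MPI_CUSTOM_REAL as the MPI datatype for `real`
 * are assumptions): stage the device halo through the host buffers around a
 * single MPI_Sendrecv.
 */
static void ExchangeHalosSketch(MPI_Comm cartComm, real *devSend, real *hostSend,
                                real *hostRecv, real *devRecv, int neighbor, int count)
{
    if (neighbor == MPI_PROC_NULL)
        return; // no neighbor on this side of the topology

    size_t bytes = count * sizeof(real);
    SafeCudaCall(cudaMemcpy(hostSend, devSend, bytes, cudaMemcpyDeviceToHost));
    MPI_Sendrecv(hostSend, count, MPI_CUSTOM_REAL, neighbor, 0,
                 hostRecv, count, MPI_CUSTOM_REAL, neighbor, 0,
                 cartComm, MPI_STATUS_IGNORE);
    SafeCudaCall(cudaMemcpy(devRecv, hostRecv, bytes, cudaMemcpyHostToDevice));
}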
void forward_process_synchronized(struct ixmapfwd_thread *thread, unsigned int port_index,
    struct ixmap_packet *packet, unsigned int num_packets,
    struct ixmap_packet_cuda *result, uint8_t *read_buf)
{
    int fd, i;

    cudaStreamSynchronize(thread->stream);

    for (i = 0; i < num_packets; i++) {
        if (result[i].outif >= 0) {
            ixmap_tx_assign(thread->plane, result[i].outif, thread->buf, &packet[i]);
        } else if (result[i].outif == -1) {
            goto packet_drop;
        } else {
            goto packet_inject;
        }
        continue;

packet_inject:
        memcpy(read_buf, packet[i].slot_buf, packet[i].slot_size);
        fd = thread->tun_plane->ports[port_index].fd;
        write(fd, read_buf, packet[i].slot_size);

packet_drop:
        ixmap_slot_release(thread->buf, packet[i].slot_index);
    }

    return;
}
void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
  cudaStream_t stream;  // create a CUDA stream, non-blocking type
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  }
#endif

  try {
    while (!must_stop()) {  // loop, loading batches of data
      Batch<Dtype>* batch = prefetch_free_.pop();  // take a free batch
      load_batch(batch);  // load a batch of data
#ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
        batch->data_.data().get()->async_gpu_push(stream);
        if (this->output_labels_) {
          batch->label_.data().get()->async_gpu_push(stream);
        }
        CUDA_CHECK(cudaStreamSynchronize(stream));  // synchronize with the GPU
      }
#endif
      prefetch_full_.push(batch);  // add to the queue of loaded batches
    }
  } catch (boost::thread_interrupted&) {  // catch the interrupt and leave the while loop
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamDestroy(stream));  // destroy the CUDA stream
  }
#endif
}
void CUDABLAS1::scal( IndexType n, const double alpha, double* x_d, const IndexType incx, SyncToken* syncToken )
{
    LAMA_CHECK_CUDA_ACCESS

    cudaStream_t stream = NULL;

    if ( syncToken )
    {
        CUDAStreamSyncToken* cudaStreamSyncToken = dynamic_cast<CUDAStreamSyncToken*>( syncToken );
        LAMA_ASSERT_DEBUG( cudaStreamSyncToken, "no cuda stream sync token provided" )
        stream = cudaStreamSyncToken->getCUDAStream();
    }

    cublasSetKernelStream( stream );
    LAMA_CHECK_CUBLAS_ERROR

    cublasDscal( n, alpha, x_d, incx );

    // No error check here possible as kernel is started asynchronously

    if ( !syncToken )
    {
        cudaStreamSynchronize( 0 );
        LAMA_CHECK_CUDA_ERROR
    }
}
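/*
 * For reference, a sketch of the same operation against the cuBLAS v2 API
 * (an addition, not LAMA code): v2 binds the stream to an explicit handle
 * via cublasSetStream, and alpha is passed by pointer.
 */
#include <cublas_v2.h>

void scal_v2_sketch(cublasHandle_t handle, cudaStream_t stream,
                    int n, double alpha, double *x_d, int incx)
{
    cublasSetStream(handle, stream);           // per-handle stream, no global state
    cublasDscal(handle, n, &alpha, x_d, incx); // launched asynchronously on `stream`
    if (stream == NULL)
    {
        cudaStreamSynchronize(0);              // synchronous semantics, as above
    }
}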
/*
 * Advance the simulation by <n> generations by mapping the OpenGL pixel buffer
 * objects for writing from CUDA, executing the kernel <n> times, and unmapping
 * the pixel buffer object.
 */
void advance_generations(unsigned long n)
{
    uint8_t* device_bufs[2];
    size_t size;

    DEBUG2("Mapping CUDA resources and retrieving device buffer pointers\n");
    cudaGraphicsMapResources(2, cuda_graphics_resources, (cudaStream_t)0);
    cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[0], &size,
                                         cuda_graphics_resources[0]);
    cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[1], &size,
                                         cuda_graphics_resources[1]);
    check_cuda_error();

    while (n--) {
        DEBUG2("Launching kernel (grid.width = %u, grid.height = %u)\n",
               grid.width, grid.height);
        launch_kernel(device_bufs[grid.which_buf], device_bufs[!grid.which_buf],
                      grid.width, grid.height);
        grid.which_buf ^= 1;
    }

    DEBUG2("Unmapping CUDA resources\n");
    cudaGraphicsUnmapResources(2, cuda_graphics_resources, (cudaStream_t)0);
    cudaStreamSynchronize(0);
}
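/*
 * Setup sketch for the interop above (assumed, not shown in this section):
 * the two pixel buffer objects have to be registered with CUDA once before
 * they can be mapped. pbo_ids is a hypothetical array of GL buffer names;
 * requires <cuda_gl_interop.h>.
 */
static void register_pbos(const GLuint pbo_ids[2])
{
    for (int i = 0; i < 2; i++) {
        // After this call, cudaGraphicsMapResources() can hand back a device pointer.
        cudaGraphicsGLRegisterBuffer(&cuda_graphics_resources[i], pbo_ids[i],
                                     cudaGraphicsMapFlagsNone);
    }
    check_cuda_error();
}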
void NCCL<Dtype>::run(int layer) {
  CHECK(solver_->param().layer_wise_reduce());
  vector<shared_ptr<Blob<Dtype> > >& blobs =
      solver_->net()->layers()[layer]->blobs();
#ifdef DEBUG
  // Assert blobs are contiguous to reduce in one step (e.g. bias often small)
  for (int i = 1; i < blobs.size(); ++i) {
    CHECK_EQ(blobs[i - 1]->gpu_diff() + blobs[i - 1]->count(),
             blobs[i + 0]->gpu_diff());
  }
#endif
  if (blobs.size() > 0) {
    // Make sure default stream is done computing gradients. Could be
    // replaced by cudaEventRecord+cudaStreamWaitEvent to avoid
    // blocking the default stream, but it's actually slower.
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));

    // Reduce asynchronously
    int size = 0;
    for (int i = 0; i < blobs.size(); ++i) {
      size += blobs[i]->count();
    }
    if (barrier_) {  // NULL in multi process case
      barrier_->wait();
    }
    NCCL_CHECK(ncclAllReduce(blobs[0]->mutable_gpu_diff(),
                             blobs[0]->mutable_gpu_diff(),
                             size,
                             nccl::dataType<Dtype>::type,
                             ncclSum, comm_, stream_));
    caffe_gpu_scal(size, (Dtype) 1.0 / Caffe::solver_count(),
                   blobs[0]->mutable_gpu_diff(), stream_);
  }
}
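/*
 * Sketch of the cudaEventRecord + cudaStreamWaitEvent alternative mentioned
 * in the comment above, which the authors found to be slower in practice
 * (this helper and its event argument are assumptions, not Caffe code): the
 * host never blocks; instead stream_ waits on the default stream GPU-side.
 */
static void wait_default_stream_async(cudaEvent_t ev, cudaStream_t nccl_stream)
{
  // Mark the point where gradient computation on the default stream is done...
  CUDA_CHECK(cudaEventRecord(ev, cudaStreamDefault));
  // ...and make the NCCL stream wait for it without stalling the CPU.
  CUDA_CHECK(cudaStreamWaitEvent(nccl_stream, ev, 0));
}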
void KMeansController<Dtype>::PostProcess() {
  /** Use the cluster centers calculated to get cluster labels **/
  int* label_data = this->int_outputs_[0]->mutable_cpu_data();

  // first restart the data provider
  this->data_providers_[0]->ForceRestart();

  // assign cluster labels
  bool assignment_finished = false;
  while (!assignment_finished) {
    size_t batch_num = 0;
    // getting the current index must precede GetData(); otherwise the current
    // index in the data provider would already have been advanced
    size_t index = this->data_providers_[0]->current_index();
    this->mats_[0] = this->data_providers_[0]->GetData(batch_num);
    if (this->data_providers_[0]->current_index() == 0) {
      assignment_finished = true;
    }

    // execute only the maximization functions for all samples
    this->function_input_vecs_[0][0] = this->mats_[0].get();
    this->funcs_[0]->Execute(this->function_input_vecs_[0],
                             this->function_output_vecs_[0],
                             GKMeans::stream(0));
    CUDA_CHECK(cudaStreamSynchronize(GKMeans::stream(0)));

    // copy out cluster labels
    int* out_data = (int*)this->mats_[2]->cpu_data();
    for (size_t i = 0; i < batch_num; i++) {
      label_data[i + index] = out_data[i];
    }
  }

  // copy out center data
  memcpy(this->numeric_outputs_[0]->mutable_cpu_data(),
         this->mats_[1]->cpu_data(),
         this->mats_[1]->count() * sizeof(Dtype));
}
void octree::resetCompact()
{
    // reset counts to 1 so the next compact proceeds
    cudaStreamSynchronize(execStream->s()); // must make sure any outstanding compact is finished
    this->devMemCountsx[0] = 1;
    this->devMemCountsx.h2d(1, false, copyStream->s());
}
void P2PSync<Dtype>::on_gradients_ready(Timer* timer, ostringstream* timing) {
#ifndef CPU_ONLY
#ifdef DEBUG
  int device;
  CUDA_CHECK(cudaGetDevice(&device));
  CHECK(device == solver_->param().device_id());
#endif

  // Sum children gradients as they appear in the queue
  for (int i = 0; i < children_.size(); ++i) {
    timer->Start();
    P2PSync<Dtype> *child = queue_.pop();
    Dtype* src = child->parent_grads_;
    Dtype* dst = diff_;

#ifdef DEBUG
    bool ok = false;
    for (int j = 0; j < children_.size(); ++j) {
      if (child == children_[j]) {
        ok = true;
      }
    }
    CHECK(ok);
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    CHECK(attributes.device == device);
#endif

    caffe_gpu_add(size_, src, dst, dst);
    *timing << " add_grad: " << timer->MilliSeconds();
  }

  // Send gradients to parent
  if (parent_) {
    timer->Start();
    Dtype* src = diff_;
    Dtype* dst = parent_grads_;

#ifdef DEBUG
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    CHECK(attributes.device == parent_->solver_->param().device_id());
#endif

    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),  //
        cudaMemcpyDeviceToDevice, cudaStreamDefault));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
    parent_->queue_.push(this);
    *timing << " send_grad: " << timer->MilliSeconds();
  } else {
    // Loss functions divide gradients by the batch size, so to compensate
    // for split batch, the root solver divides by number of solvers.
    caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_);
  }
#endif
}
TEST(MemcpyAsync, CheckReturnValues) {
    /**
     * The API documentation states that cudaErrorInvalidDevicePointer is a
     * valid return value for cudaMemcpyAsync.
     *
     * TODO: This needs a test.
     */
    cudaError_t ret;
    cudaStream_t stream;

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    /**
     * Test woefully out of range directions.
     */
    int a = 0;
    ret = cudaMemcpyAsync(&a, &a, sizeof(a), (cudaMemcpyKind) -1, stream);
    EXPECT_EQ(cudaErrorInvalidMemcpyDirection, ret);

    ret = cudaMemcpyAsync(NULL, NULL, sizeof(a), (cudaMemcpyKind) -1, stream);
    EXPECT_EQ(cudaErrorInvalidMemcpyDirection, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
/**
 * CUDA4 introduced the cudaMemcpyDefault direction to cudaMemcpy.
 */
TEST(MemcpyAsync, CheckDefaultDirection) {
    cudaError_t ret;
    cudaStream_t stream;

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    int a1 = 0;
    int a2 = 0;
    int * b;
    ret = cudaMalloc((void**) &b, sizeof(*b));
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, &a2, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, b, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b, &a1, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b, b, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaFree(b);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
TEST_P(MemcpyAsync, H2DTransfers) {
    const size_t param = GetParam();
    const size_t alloc = 1 << param;

    cudaError_t ret;
    void *d1, *h1;

    ret = cudaMalloc(&d1, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaHostAlloc(&h1, alloc, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(d1, h1, alloc, cudaMemcpyHostToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(h1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
TEST_P(MemcpyAsync, D2DTransfers) {
    const size_t param = GetParam();
    const size_t alloc = 1 << param;

    cudaError_t ret;
    void *d1, *d2;

    ret = cudaMalloc(&d1, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMalloc(&d2, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(d2, d1, alloc, cudaMemcpyDeviceToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d2);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
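/*
 * Hedged sketch of the scaffolding the TEST_P cases above rely on (the
 * project's actual fixture, prefix name, and size range are not shown here
 * and are assumptions); the fixture must be defined before the TEST_P
 * bodies. GetParam() is the log2 of the allocation size, so Range(4, 26)
 * would sweep 16 B through 32 MiB.
 */
class MemcpyAsync : public ::testing::TestWithParam<size_t> { };

INSTANTIATE_TEST_CASE_P(TransferSizes, MemcpyAsync,
    ::testing::Range(static_cast<size_t>(4), static_cast<size_t>(26)));

int main(int argc, char **argv) {
    ::testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
}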
shared_ptr<Mat<Dtype>> DataProviderBase<Dtype>::GetData(size_t &num) {
  // first join the async task
  num = num_future_.get();
  // also sync the cuda stream
  CUDA_CHECK(cudaStreamSynchronize(data_stream_));

  // get the mat with data filled
  int ready_idx = data_q_.front();
  shared_ptr<Mat<Dtype>> ready_mat = data_slot_vec_[ready_idx];

  // send the id of the mat to the end of the queue
  data_q_.pop_front();
  data_q_.push_back(ready_idx);

  // get the mat to be filled (id at the front of the queue)
  Mat<Dtype>* working_mat = data_slot_vec_[data_q_.front()].get();

  // kick off the async task (force async launch)
  num_future_ = std::async(std::launch::async, &DataProviderBase::AsyncFunc, this, working_mat);

  // increment the current data cursor
  current_index_ += num;
  if (current_index_ == round_size_) {
    current_index_ = 0;
  }
  return ready_mat;
}
void BasePrefetchingLabelmapDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
  cudaStream_t stream;
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  }
#endif

  try {
    while (!must_stop()) {
      LabelmapBatch<Dtype>* batch = prefetch_free_.pop();
      load_batch(batch);
#ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
        batch->data_.data().get()->async_gpu_push(stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
      }
#endif
      prefetch_full_.push(batch);
    }
  } catch (boost::thread_interrupted&) {
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamDestroy(stream));
  }
#endif
}
void SentinelWalker<DistanceEvaluator, ResultsCalculator>::walk
    (size_t gridSize, size_t blockSize, size_t minWalks,
     DistanceEvaluator& distance, ResultsCalculator& results)
{
    size_t threads = gridSize * blockSize;
    size_t walksPerThread = minWalks / threads + 1;
    srand(time(NULL));

    ResultsCalculator* d_results;
    cuda::check(__FILE__, __LINE__,
                cudaMalloc((void**)&d_results, sizeof(ResultsCalculator)));
    cuda::check(__FILE__, __LINE__,
                cudaMemcpy(d_results, &results, sizeof(ResultsCalculator),
                           cudaMemcpyHostToDevice));

    DistanceEvaluator* d_distance;
    cuda::check(__FILE__, __LINE__,
                cudaMalloc((void**)&d_distance, sizeof(DistanceEvaluator)));
    cuda::check(__FILE__, __LINE__,
                cudaMemcpy(d_distance, &distance, sizeof(DistanceEvaluator),
                           cudaMemcpyHostToDevice));
    cuda::check(__FILE__, __LINE__, cudaDeviceSynchronize());

    cudaStream_t workStream;
    cuda::check(cudaStreamCreateWithFlags(&workStream, cudaStreamNonBlocking));
    cudaStream_t sentinelStream;
    cuda::check(cudaStreamCreateWithFlags(&sentinelStream, cudaStreamNonBlocking));

    double start = seconds();
    gpuWalkWithSentinel<DistanceEvaluator><<<gridSize, blockSize, 0, workStream>>>
        (rand(), d_distance, d_results);

    size_t runningTotal;
    do {
        runningTotal = results.getRunningTotal(sentinelStream);
        //std::cout << runningTotal << std::endl;
    } while (runningTotal < minWalks);

    std::cout << "Stopping." << std::endl;
    results.endRun(sentinelStream);
    std::cout << "Stop signal sent." << std::endl;

    cuda::check(__FILE__, __LINE__, cudaStreamSynchronize(workStream));
    double timeSpent = seconds() - start;

    double totalWalks = walksPerThread * threads;
    double timePerWalk = timeSpent / totalWalks;
    double walksPerSecond = totalWalks / timeSpent;

    std::cout << "Total GPU Time: " << timeSpent << "s" << std::endl;
    std::cout << "Time Per Walk: " << timePerWalk << "s" << std::endl;
    std::cout << "Walks Per Second: " << walksPerSecond << " walks/s" << std::endl;
}
void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, scalar_t value)
{
  THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds");
  cudaStream_t stream = THCState_getCurrentStream(state);
  THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value,
                              sizeof(scalar_t),
                              cudaMemcpyHostToDevice,
                              stream));
  THCudaCheck(cudaStreamSynchronize(stream));
}
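/*
 * A companion getter in the same style (a sketch, not necessarily upstream's
 * THCStorage_(get)): one blocking round trip per scalar read, mirroring the
 * host-to-device write above.
 */
scalar_t THCStorage_(get_sketch)(THCState *state, THCStorage *self, ptrdiff_t index)
{
  THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds");
  scalar_t value;
  cudaStream_t stream = THCState_getCurrentStream(state);
  THCudaCheck(cudaMemcpyAsync(&value, THCStorage_(data)(state, self) + index,
                              sizeof(scalar_t), cudaMemcpyDeviceToHost, stream));
  THCudaCheck(cudaStreamSynchronize(stream)); // value is valid only after sync
  return value;
}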
bool ControlCubeCache::_readElement(NodeLinkedList<index_node_t> * element)
{
#ifndef NDEBUG
    if ((int)element->element > _maxNumCubes)
    {
        std::cerr<<"Control Cube CPU Cache, try to write outside reserved memory"<<std::endl;
        throw;
    }
#endif
    index_node_t idCube = element->id;
    float * cube = (_memory + element->element*_sizeElement);

    if (!checkCubeInside(element->id))
    {
        if (cudaSuccess != cudaMemset((void*)cube, 0, _sizeElement*sizeof(float)))
        {
            std::cout<<"---> "<<idCube<<" "<<_minValue<<" "<<_maxValue<<std::endl;
            LBERROR<<"Control Cube Cache: error copying to a device: "
                   <<cudaGetErrorString(cudaGetLastError())
                   <<" "<<cube<<" "<<_sizeElement<<std::endl;
            throw;
        }
        return true;
    }

    index_node_t idCubeCPU = idCube >> 3*(_levelCube - _cpuCache->getCubeLevel());
    float * pCube = _cpuCache->getAndBlockElement(idCubeCPU);

    if (pCube != 0)
    {
        vmml::vector<3, int> coord = getMinBoxIndex2(idCube, _levelCube, _nLevels);
        vmml::vector<3, int> coordC = getMinBoxIndex2(idCubeCPU, _cpuCache->getCubeLevel(), _nLevels);
        coord -= coordC;
        vmml::vector<3, int> realDimCPU = _cpuCache->getRealCubeDim();

        cudaMemcpy3DParms myParms = {0};
        myParms.srcPtr = make_cudaPitchedPtr((void*)pCube, realDimCPU.z()*sizeof(float), realDimCPU.x(), realDimCPU.y());
        //myParms.dstPtr = make_cudaPitchedPtr((void*)cube, _realcubeDim.z()*sizeof(float), _realcubeDim.x(), _realcubeDim.y());
        myParms.dstPtr = make_cudaPitchedPtr((void*)cube, _dimCube*sizeof(float), _dimCube, _dimCube);
        myParms.extent = make_cudaExtent(_dimCube*sizeof(float), _dimCube, _dimCube);
        myParms.dstPos = make_cudaPos(0, 0, 0);
        myParms.srcPos = make_cudaPos(coord.z()*sizeof(float), coord.y(), coord.x());
        myParms.kind = cudaMemcpyHostToDevice;

        if (cudaSuccess != cudaMemcpy3DAsync(&myParms, _stream) ||
            cudaSuccess != cudaStreamSynchronize(_stream))
        {
            std::cout<<"---> "<<idCube<<" "<<_minValue<<" "<<_maxValue<<std::endl;
            LBERROR<<"Control Cube Cache: error copying to a device: "
                   <<cudaGetErrorString(cudaGetLastError())
                   <<" "<<cube<<" "<<pCube<<" "<<_sizeElement<<std::endl;
            throw;
        }

        _cpuCache->unlockElement(idCubeCPU);
        return true;
    }
    else
    {
        return false;
    }
}
TEST(MemcpyAsync, Pinned) {
    /**
     * Host memory must be pinned in order to be used as an argument to
     * cudaMemcpyAsync. Panoptes only prints a warning about this error
     * rather than actually returning an error via the CUDA API. This test
     * is written so as to check for the absence of an error once the CUDA
     * implementation starts returning one for nonpinned host memory.
     */
    const long page_size_ = sysconf(_SC_PAGESIZE);
    ASSERT_LT(0, page_size_);
    const size_t page_size = page_size_;

    const size_t pages = 3;
    assert(pages > 0);

    cudaError_t ret;
    cudaStream_t stream;
    uint8_t *device_ptr, *host_ptr;

    ret = cudaMalloc((void **) &device_ptr, pages * page_size);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMallocHost((void **) &host_ptr, pages * page_size);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    /* Page aligned transfers */
    for (size_t i = 0; i < pages; i++) {
        for (size_t j = i; j < pages; j++) {
            ret = cudaMemcpyAsync(device_ptr, host_ptr + i * page_size,
                                  (pages - j) * page_size,
                                  cudaMemcpyHostToDevice, stream);
            EXPECT_EQ(cudaSuccess, ret);

            ret = cudaMemcpyAsync(host_ptr + i * page_size, device_ptr,
                                  (pages - j) * page_size,
                                  cudaMemcpyDeviceToHost, stream);
            EXPECT_EQ(cudaSuccess, ret);
        }
    }

    /* Try a nonaligned transfer. */
    ret = cudaMemcpyAsync(device_ptr, host_ptr + (page_size / 2),
                          page_size / 2, cudaMemcpyHostToDevice, stream);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(host_ptr);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(device_ptr);
    ASSERT_EQ(cudaSuccess, ret);
}
void BasePrefetchingDataLayer<Ftype, Btype>::InternalThreadEntryN(size_t thread_id) {
#ifndef CPU_ONLY
  const bool use_gpu_transform = this->is_gpu_transform();
#endif
  static thread_local bool iter0 = this->phase_ == TRAIN;
  if (iter0 && this->net_inititialized_flag_ != nullptr) {
    this->net_inititialized_flag_->wait();
  } else {
    // nothing to wait for -> initialize and start pumping
    std::lock_guard<std::mutex> lock(mutex_in_);
    InitializePrefetch();
    start_reading();
    iter0 = false;
  }
  try {
    while (!must_stop(thread_id)) {
      const size_t qid = this->queue_id(thread_id);
#ifndef CPU_ONLY
      shared_ptr<Batch<Ftype>> batch = prefetches_free_[qid]->pop();
      CHECK_EQ((size_t) -1, batch->id());
      load_batch(batch.get(), thread_id, qid);
      if (Caffe::mode() == Caffe::GPU) {
        if (!use_gpu_transform) {
          batch->data_.async_gpu_push();
        }
        if (this->output_labels_) {
          batch->label_.async_gpu_push();
        }
        CUDA_CHECK(cudaStreamSynchronize(Caffe::th_stream_aux(Caffe::STREAM_ID_ASYNC_PUSH)));
      }
      prefetches_full_[qid]->push(batch);
#else
      shared_ptr<Batch<Ftype>> batch = prefetches_free_[qid]->pop();
      load_batch(batch.get(), thread_id, qid);
      prefetches_full_[qid]->push(batch);
#endif
      if (iter0) {
        if (this->net_iteration0_flag_ != nullptr) {
          this->net_iteration0_flag_->wait();
        }
        std::lock_guard<std::mutex> lock(mutex_out_);
        if (this->net_inititialized_flag_ != nullptr) {
          this->net_inititialized_flag_ = nullptr;  // no wait on the second round
          InitializePrefetch();
          start_reading();
        }
        if (this->auto_mode_) {
          break;
        }  // manual otherwise, thus keep rolling
        iter0 = false;
      }
    }
  } catch (boost::thread_interrupted&) {
  }
}
void GpuDevice::DoExecute(const DataList& in, const DataList& out, PhysicalOp& op, int thrid) {
  Context ctx;
  ctx.impl_type = ImplType::kCuda;
  ctx.stream = impl_->stream[thrid];
  ctx.cublas_handle = impl_->cublas_handle[thrid];
  ctx.cudnn_handle = impl_->cudnn_handle[thrid];
  op.compute_fn->Execute(in, out, ctx);
  CUDA_CALL_MSG(op.compute_fn->Name(), cudaStreamSynchronize(impl_->stream[thrid]));
}
void HistogramWalker<DistanceEvaluator, ResultsCalculator>::walk
    (size_t gridSize, size_t blockSize, size_t minWalks,
     DistanceEvaluator& distance, ResultsCalculator& results)
{
#ifdef PORTION
    std::cout << "Portion Walk Sizes:" << std::endl;
#endif
#ifdef EARLYSENTINEL
    std::cout << "Early Sentinel Walk Sizes:" << std::endl;
#endif
#ifdef WAITSENTINEL
    std::cout << "Wait Sentinel Walk Sizes:" << std::endl;
#endif
    size_t threads = gridSize * blockSize;
    size_t walksPerThread = minWalks / threads + 1;
    srand(time(NULL));

    ResultsCalculator* d_results;
    cuda::check(__FILE__, __LINE__,
                cudaMalloc((void**)&d_results, sizeof(ResultsCalculator)));
    cuda::check(__FILE__, __LINE__,
                cudaMemcpy(d_results, &results, sizeof(ResultsCalculator),
                           cudaMemcpyHostToDevice));

    DistanceEvaluator* d_distance;
    cuda::check(__FILE__, __LINE__,
                cudaMalloc((void**)&d_distance, sizeof(DistanceEvaluator)));
    cuda::check(__FILE__, __LINE__,
                cudaMemcpy(d_distance, &distance, sizeof(DistanceEvaluator),
                           cudaMemcpyHostToDevice));
    cuda::check(__FILE__, __LINE__, cudaDeviceSynchronize());

    cudaStream_t workStream;
    cuda::check(cudaStreamCreateWithFlags(&workStream, cudaStreamNonBlocking));
    cudaStream_t sentinelStream;
    cuda::check(cudaStreamCreateWithFlags(&sentinelStream, cudaStreamNonBlocking));

    double start = seconds();
    gpuWalkWithSentinel<DistanceEvaluator><<<gridSize, blockSize, 0, workStream>>>
        (walksPerThread, rand(), d_distance, d_results);

    size_t runningTotal;
    do {
        runningTotal = results.getRunningTotal(sentinelStream);
    } while (runningTotal < minWalks);

    results.endRun(sentinelStream);
    cuda::check(__FILE__, __LINE__, cudaStreamSynchronize(workStream));
}