SEXP R_auto_cudaMemcpyFromSymbolAsync(SEXP r_dst, SEXP r_symbol, SEXP r_count, SEXP r_offset, SEXP r_kind, SEXP r_stream) { SEXP r_ans = R_NilValue; void * dst = GET_REF(r_dst, void ); const void * symbol = GET_REF(r_symbol, const void ); size_t count = REAL(r_count)[0]; size_t offset = REAL(r_offset)[0]; enum cudaMemcpyKind kind = (enum cudaMemcpyKind) INTEGER(r_kind)[0]; cudaStream_t stream = (cudaStream_t) getRReference(r_stream); cudaError_t ans; ans = cudaMemcpyFromSymbolAsync(dst, symbol, count, offset, kind, stream); r_ans = Renum_convert_cudaError_t(ans) ; return(r_ans); }
void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex) { const dim3 threads(THREADS_X, THREADS_Y); const int blk_x = divup(in.dims[0], threads.x*2); const int blk_y = divup(in.dims[1], threads.y*2); const dim3 blocks(blk_x, blk_y); CUDA_LAUNCH((initial_label<T,n_per_thread>), blocks, threads, out, in); POST_LAUNCH_CHECK(); int h_continue = 1; while (h_continue) { h_continue = 0; CUDA_CHECK(cudaMemcpyToSymbolAsync(continue_flag, &h_continue, sizeof(int), 0, cudaMemcpyHostToDevice, cuda::getActiveStream())); CUDA_LAUNCH((update_equiv<T, 16, n_per_thread, full_conn>), blocks, threads, out, tex); POST_LAUNCH_CHECK(); CUDA_CHECK(cudaMemcpyFromSymbolAsync(&h_continue, continue_flag, sizeof(int), 0, cudaMemcpyDeviceToHost, cuda::getActiveStream())); CUDA_CHECK(cudaStreamSynchronize(cuda::getActiveStream())); } // Now, perform the final relabeling. This converts the equivalency // map from having unique labels based on the lowest pixel in the // component to being sequentially numbered components starting at // 1. int size = in.dims[0] * in.dims[1]; auto tmp = cuda::memAlloc<T>(size); CUDA_CHECK(cudaMemcpyAsync(tmp.get(), out.ptr, size * sizeof(T), cudaMemcpyDeviceToDevice, cuda::getActiveStream())); // Wrap raw device ptr thrust::device_ptr<T> wrapped_tmp = thrust::device_pointer_cast(tmp.get()); // Sort the copy THRUST_SELECT(thrust::sort, wrapped_tmp, wrapped_tmp + size); // Take the max element which is the number // of label assignments to compute. const int num_bins = wrapped_tmp[size - 1] + 1; // If the number of label assignments is two, // then either the entire input image is one big // component(1's) or it has only one component other than // background(0's). Either way, no further // post-processing of labels is required. if (num_bins<=2) return; cuda::ThrustVector<T> labels(num_bins); // Find the end of each section of values thrust::counting_iterator<T> search_begin(0); THRUST_SELECT(thrust::upper_bound, wrapped_tmp, wrapped_tmp + size, search_begin, search_begin + num_bins, labels.begin()); THRUST_SELECT(thrust::adjacent_difference, labels.begin(), labels.end(), labels.begin()); // Operators for the scan clamp_to_one<T> clamp; thrust::plus<T> add; // Perform scan -- this computes the correct labels for each component THRUST_SELECT(thrust::transform_exclusive_scan, labels.begin(), labels.end(), labels.begin(), clamp, 0, add); // Apply the correct labels to the equivalency map CUDA_LAUNCH((final_relabel<T,n_per_thread>), blocks,threads, out, in, thrust::raw_pointer_cast(&labels[0])); POST_LAUNCH_CHECK(); }
cudaError_t WINAPI wine_cudaMemcpyFromSymbolAsync( void *dst, const char *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) { WINE_TRACE("\n"); return cudaMemcpyFromSymbolAsync( dst, symbol, count, offset, kind, stream); }