void transform(Param<T> out, CParam<T> in, CParam<float> tf, const bool inverse)
{
    // Number of images batched along dim 2 (multiplied in src/backend/transform.cpp).
    int nimages = in.dims[2];
    const int ntransforms = out.dims[2] / in.dims[2];

    // Stage the 2x3 transform matrices in constant memory (device-to-device,
    // ordered on the active device's stream).
    CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr,
                                       ntransforms * 6 * sizeof(float), 0,
                                       cudaMemcpyDeviceToDevice,
                                       cuda::getStream(cuda::getActiveDeviceId())));

    const dim3 threads(TX, TY, 1);
    dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

    // Grid width (in blocks) covering a single image, before the batching
    // expansions applied below.
    const int blocksXPerImage = blocks.x;

    if (nimages > TI) {
        // Tile the image batch along grid x, processing TI images per tile.
        const int imageTiles = divup(nimages, TI);
        blocks.x *= imageTiles;
        nimages   = TI;
    }

    if (ntransforms > 1) blocks.y *= ntransforms;

    // The inverse flag selects a compile-time kernel specialization.
    if (inverse) {
        CUDA_LAUNCH((transform_kernel<T, true, method>), blocks, threads,
                    out, in, nimages, ntransforms, blocksXPerImage);
    } else {
        CUDA_LAUNCH((transform_kernel<T, false, method>), blocks, threads,
                    out, in, nimages, ntransforms, blocksXPerImage);
    }
    POST_LAUNCH_CHECK();
}
// Uploads the kernel context to constant memory and launches the flux/source
// (FGH) kernel on the configured grid/block/stream.
// Preconditions: `step` (template/compile-time constant) must be 0 or 1 —
// this launcher only supports the 2-step Runge-Kutta scheme.
void FGHKernelLauncher(const FGHKernelArgs* h_ctx, const KernelConfiguration& config)
{
    if (step > 1) {
        // BUG FIX: diagnostics belong on stderr, not stdout.
        std::cerr << "This kernel is only valid for 2-step RK" << std::endl;
        exit(-1);
    }

    // Upload parameters to the GPU, ordered on the launch stream.
    KPSIMULATOR_CHECK_CUDA(cudaMemcpyToSymbolAsync(fgh_ctx, h_ctx,
            sizeof(FGHKernelArgs), 0, cudaMemcpyHostToDevice, config.stream));

    // Prefer shared memory over L1 cache for this kernel.
    // BUG FIX: the cudaError_t return was previously ignored; check it like
    // every other CUDA call in this launcher.
    KPSIMULATOR_CHECK_CUDA(cudaFuncSetCacheConfig(
            FGHKernel<KPSIMULATOR_FLUX_BLOCK_WIDTH, KPSIMULATOR_FLUX_BLOCK_HEIGHT, step>,
            cudaFuncCachePreferShared));

    // Launch kernel (no dynamic shared memory; arguments come from fgh_ctx).
    FGHKernel<KPSIMULATOR_FLUX_BLOCK_WIDTH, KPSIMULATOR_FLUX_BLOCK_HEIGHT, step>
            <<<config.grid, config.block, 0, config.stream>>>();
    KPSIMULATOR_CHECK_CUDA_ERROR("fluxSourceKernel");
}
/* Auto-generated R binding for cudaMemcpyToSymbolAsync: unmarshals the R
 * arguments, performs the asynchronous copy, and returns the cudaError_t
 * status converted to an R enum value. */
SEXP R_auto_cudaMemcpyToSymbolAsync(SEXP r_symbol, SEXP r_src, SEXP r_count, SEXP r_offset, SEXP r_kind, SEXP r_stream)
{
    /* Extract native CUDA arguments from their R wrappers. */
    const void *symbol = GET_REF(r_symbol, const void );
    const void *src    = GET_REF(r_src, const void );
    size_t count  = REAL(r_count)[0];
    size_t offset = REAL(r_offset)[0];
    enum cudaMemcpyKind kind = (enum cudaMemcpyKind) INTEGER(r_kind)[0];
    cudaStream_t stream = (cudaStream_t) getRReference(r_stream);

    /* Issue the asynchronous copy and hand the status code back to R. */
    cudaError_t status = cudaMemcpyToSymbolAsync(symbol, src, count, offset, kind, stream);
    return Renum_convert_cudaError_t(status);
}
Array<T> morph(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    // The constant-memory filter used by the kernel only supports square
    // structuring elements up to 19x19.
    if (mdims[0] != mdims[1])
        CUDA_NOT_SUPPORTED("Rectangular masks are not supported");
    if (mdims[0] > 19)
        CUDA_NOT_SUPPORTED("Kernels > 19x19 are not supported");

    Array<T> out = createEmptyArray<T>(in.dims());

    // Stage the structuring element in constant memory on the active stream.
    const size_t maskBytes = mdims[0] * mdims[1] * sizeof(T);
    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(), maskBytes,
                                       0, cudaMemcpyDeviceToDevice,
                                       cuda::getActiveStream()));

    // isDilation is a compile-time flag selecting dilate vs. erode.
    if (isDilation)
        kernel::morph<T, true>(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
// Morphological dilate/erode (selected by the compile-time isDilation flag).
// The square mask is staged in constant memory before dispatching the kernel.
Array<T> morph(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    // The constant-memory filter only supports square masks up to 19x19.
    if (mdims[0] != mdims[1])
        AF_ERROR("Only square masks are supported in cuda morph currently",
                 AF_ERR_SIZE);
    if (mdims[0] > 19)
        AF_ERROR("Upto 19x19 square kernels are only supported in cuda currently",
                 AF_ERR_SIZE);

    Array<T> out = createEmptyArray<T>(in.dims());

    // Copy the mask into constant memory on the active stream.
    // CONSISTENCY FIX: use cuda::getActiveStream() like the sibling morph and
    // regions wrappers, instead of the equivalent (and more verbose)
    // cuda::getStream(cuda::getActiveDeviceId()).
    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
                                       mdims[0] * mdims[1] * sizeof(T), 0,
                                       cudaMemcpyDeviceToDevice,
                                       cuda::getActiveStream()));

    if (isDilation)
        kernel::morph<T, true >(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
// Connected-component labeling of a 2D binary image.
// Phase 1: assign an initial label per pixel; phase 2: iterate the
// equivalence-update kernel until a device-side flag reports no changes;
// phase 3: compact the surviving labels into a dense 1..K numbering.
// All work is ordered on the active stream; the only host sync is the
// per-iteration read-back of the convergence flag.
void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex)
{
    const dim3 threads(THREADS_X, THREADS_Y);
    // Each thread handles a 2x2 tile, hence the *2 on both axes
    // (presumably matching n_per_thread — TODO confirm).
    const int blk_x = divup(in.dims[0], threads.x*2);
    const int blk_y = divup(in.dims[1], threads.y*2);
    const dim3 blocks(blk_x, blk_y);

    // Phase 1: seed every foreground pixel with a unique initial label.
    CUDA_LAUNCH((initial_label<T,n_per_thread>), blocks, threads, out, in);
    POST_LAUNCH_CHECK();

    // Phase 2: propagate label equivalences until a full pass makes no
    // change. The kernel sets the device symbol continue_flag to nonzero
    // whenever it updates a label.
    int h_continue = 1;
    while (h_continue) {
        // Clear the device-side flag before each pass.
        h_continue = 0;
        CUDA_CHECK(cudaMemcpyToSymbolAsync(continue_flag, &h_continue, sizeof(int),
                                           0, cudaMemcpyHostToDevice,
                                           cuda::getActiveStream()));
        CUDA_LAUNCH((update_equiv<T, 16, n_per_thread, full_conn>),
                    blocks, threads, out, tex);
        POST_LAUNCH_CHECK();
        // Read the flag back; the stream sync below makes h_continue valid
        // before the loop condition is evaluated.
        CUDA_CHECK(cudaMemcpyFromSymbolAsync(&h_continue, continue_flag, sizeof(int),
                                             0, cudaMemcpyDeviceToHost,
                                             cuda::getActiveStream()));
        CUDA_CHECK(cudaStreamSynchronize(cuda::getActiveStream()));
    }

    // Now, perform the final relabeling. This converts the equivalency
    // map from having unique labels based on the lowest pixel in the
    // component to being sequentially numbered components starting at
    // 1.
    int size = in.dims[0] * in.dims[1];
    auto tmp = cuda::memAlloc<T>(size);
    CUDA_CHECK(cudaMemcpyAsync(tmp.get(), out.ptr, size * sizeof(T),
                               cudaMemcpyDeviceToDevice,
                               cuda::getActiveStream()));

    // Wrap raw device ptr
    thrust::device_ptr<T> wrapped_tmp = thrust::device_pointer_cast(tmp.get());

    // Sort the copy
    THRUST_SELECT(thrust::sort, wrapped_tmp, wrapped_tmp + size);

    // Take the max element which is the number
    // of label assignments to compute.
    // (Dereferencing a device_ptr performs an implicit device-to-host copy.)
    const int num_bins = wrapped_tmp[size - 1] + 1;

    // If the number of label assignments is two,
    // then either the entire input image is one big
    // component(1's) or it has only one component other than
    // background(0's). Either way, no further
    // post-processing of labels is required.
    if (num_bins<=2) return;

    cuda::ThrustVector<T> labels(num_bins);

    // Find the end of each section of values
    thrust::counting_iterator<T> search_begin(0);
    THRUST_SELECT(thrust::upper_bound, wrapped_tmp, wrapped_tmp + size,
                  search_begin, search_begin + num_bins, labels.begin());
    // labels[i] now holds 1 where old label i is present, 0 where absent.
    THRUST_SELECT(thrust::adjacent_difference, labels.begin(), labels.end(),
                  labels.begin());

    // Operators for the scan
    clamp_to_one<T> clamp;
    thrust::plus<T> add;

    // Perform scan -- this computes the correct labels for each component
    THRUST_SELECT(thrust::transform_exclusive_scan, labels.begin(), labels.end(),
                  labels.begin(), clamp, 0, add);

    // Apply the correct labels to the equivalency map
    CUDA_LAUNCH((final_relabel<T,n_per_thread>), blocks,threads, out, in,
                thrust::raw_pointer_cast(&labels[0]));
    POST_LAUNCH_CHECK();
}
/* Wine shim for cudaMemcpyToSymbolAsync: emit a trace entry, then forward
 * every argument verbatim to the native CUDA runtime and propagate its
 * status unchanged. */
cudaError_t WINAPI wine_cudaMemcpyToSymbolAsync(const char *symbol,
                                                const void *src,
                                                size_t count,
                                                size_t offset,
                                                enum cudaMemcpyKind kind,
                                                cudaStream_t stream)
{
    WINE_TRACE("\n");
    return cudaMemcpyToSymbolAsync(symbol, src, count, offset, kind, stream);
}
// Asynchronously uploads a single host-side value into the device
// __constant__ symbol identified by `name`, ordered on `stream`.
// NOTE(review): the source is the caller's host variable and the copy is
// asynchronous — unless the memory is pinned the runtime may still stage it
// lazily; presumably callers keep `value` alive / sync the stream before it
// goes out of scope — confirm.
// NOTE(review): addressing symbols by string name was removed in CUDA 5.0;
// presumably this targets an older toolkit — verify against the build's
// minimum CUDA version.
template<class T> static inline void uploadConstant(const char* name, const T& value, cudaStream_t stream)
{
    cudaSafeCall( cudaMemcpyToSymbolAsync(name, &value, sizeof(T), 0, cudaMemcpyHostToDevice, stream) );
}