Code example #1
File: transform.hpp Project: hxiaox/arrayfire
        // Template header omitted in the original excerpt; T and method
        // are template parameters of the enclosing function template.
        template<typename T, af_interp_type method>
        void transform(Param<T> out, CParam<T> in, CParam<float> tf,
                       const bool inverse)
        {
            int nimages = in.dims[2];
            // Multiplied in src/backend/transform.cpp
            const int ntransforms = out.dims[2] / in.dims[2];

            // Copy transform to constant memory.
            CUDA_CHECK(cudaMemcpyToSymbolAsync(c_tmat, tf.ptr,
                                               ntransforms * 6 * sizeof(float), 0,
                                               cudaMemcpyDeviceToDevice,
                                               cuda::getStream(cuda::getActiveDeviceId())));

            dim3 threads(TX, TY, 1);
            dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

            const int blocksXPerImage = blocks.x;
            if(nimages > TI) {
                int tile_images = divup(nimages, TI);
                nimages = TI;
                blocks.x = blocks.x * tile_images;
            }

            if (ntransforms > 1) { blocks.y *= ntransforms; }

            if(inverse) {
                CUDA_LAUNCH((transform_kernel<T, true, method>), blocks, threads,
                                out, in, nimages, ntransforms, blocksXPerImage);
            } else {
                CUDA_LAUNCH((transform_kernel<T, false, method>), blocks, threads,
                                out, in, nimages, ntransforms, blocksXPerImage);
            }
            POST_LAUNCH_CHECK();
        }
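
The call above stages the affine matrices in constant memory (the c_tmat symbol) instead of passing them as a kernel argument, so every thread reads them through the constant cache. A minimal sketch of the destination that cudaMemcpyToSymbolAsync targets, assuming a file-scope array sized for the largest supported batch; the 512-matrix capacity below is an illustrative assumption, not taken from the project:

// Sketch: file-scope __constant__ storage for up to 512 affine 2x3
// matrices (6 floats each); transform_kernel would read its matrix
// starting at &c_tmat[transform_index * 6].
__constant__ float c_tmat[512 * 6];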
Code example #2
File: FGH.hpp Project: nornamor/Prosjektoppgave
// Template header omitted in the excerpt; 'step' is a non-type template
// parameter (type assumed here) selecting the Runge-Kutta substep.
template<int step>
void FGHKernelLauncher(const FGHKernelArgs* h_ctx, const KernelConfiguration& config) {
	if(step > 1) {
		std::cout << "This kernel is only valid for 2-step RK" << std::endl;
		exit(-1);
	}

	//Upload parameters to the GPU
	KPSIMULATOR_CHECK_CUDA(cudaMemcpyToSymbolAsync(fgh_ctx, h_ctx, sizeof(FGHKernelArgs), 0, cudaMemcpyHostToDevice, config.stream));

	//Launch kernel
	cudaFuncSetCacheConfig(FGHKernel<KPSIMULATOR_FLUX_BLOCK_WIDTH, KPSIMULATOR_FLUX_BLOCK_HEIGHT, step>, cudaFuncCachePreferShared);
	FGHKernel<KPSIMULATOR_FLUX_BLOCK_WIDTH, KPSIMULATOR_FLUX_BLOCK_HEIGHT, step><<<config.grid, config.block, 0, config.stream>>>();
	KPSIMULATOR_CHECK_CUDA_ERROR("fluxSourceKernel");
}
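
Rather than passing a long kernel argument list, the launcher above copies one FGHKernelArgs struct into a __constant__ symbol (fgh_ctx) that the kernel reads directly. A minimal sketch of the pair of declarations this pattern relies on; only the two names appear in the snippet, and the field layout is a made-up illustration:

// Hypothetical layout: a host-side struct mirrored by a device-side
// __constant__ instance, so the kernel can read fgh_ctx.dt etc.
// without per-launch parameter traffic.
struct FGHKernelArgs {
    float* U;      // illustrative: conserved variables
    float  dt;     // illustrative: time-step size
    int    nx, ny; // illustrative: domain dimensions
};
__constant__ FGHKernelArgs fgh_ctx;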
Code example #3
File: autoMemory.c Project: PachoAlvarez/RCUDA
SEXP R_auto_cudaMemcpyToSymbolAsync(SEXP r_symbol, SEXP r_src, SEXP r_count, SEXP r_offset, SEXP r_kind, SEXP r_stream)
{
    SEXP r_ans = R_NilValue;
    /* Unwrap the R external pointers and scalar arguments into C types. */
    const void * symbol = GET_REF(r_symbol, const void );
    const void * src = GET_REF(r_src, const void );
    size_t count = REAL(r_count)[0];
    size_t offset = REAL(r_offset)[0];
    enum cudaMemcpyKind kind = (enum cudaMemcpyKind) INTEGER(r_kind)[0];
    cudaStream_t stream = (cudaStream_t) getRReference(r_stream);

    /* Issue the asynchronous copy, then convert the CUDA status code
       into an R enum value for the caller. */
    cudaError_t ans;
    ans = cudaMemcpyToSymbolAsync(symbol, src, count, offset, kind, stream);

    r_ans = Renum_convert_cudaError_t(ans);

    return(r_ans);
}
Code example #4
File: morph_impl.hpp Project: 9prady9/arrayfire
// Template header omitted in the original excerpt; isDilation selects
// dilation (true) or erosion (false) at compile time.
template<typename T, bool isDilation>
Array<T> morph(const Array<T> &in, const Array<T> &mask) {
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        CUDA_NOT_SUPPORTED("Rectangular masks are not supported");

    if (mdims[0] > 19) CUDA_NOT_SUPPORTED("Kernels > 19x19 are not supported");

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(
        kernel::cFilter, mask.get(), mdims[0] * mdims[1] * sizeof(T), 0,
        cudaMemcpyDeviceToDevice, cuda::getActiveStream()));

    if (isDilation)
        kernel::morph<T, true>(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
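
Note the compile-time dispatch at the end: because isDilation is a template parameter, the runtime if merely picks between two fully specialized kernels, and the compiler can eliminate the dead branch in each one. A sketch of the shape of such a kernel, under the assumption that dilation and erosion differ only in the reduction applied over the structuring-element window (names here are illustrative, not from the project):

// One kernel template, two instantiations; the branch on the
// compile-time flag costs nothing at run time.
template<typename T, bool isDilation>
__global__ void morph_kernel_sketch(T* out, const T* in)
{
    // T acc = isDilation ? (max over window) : (min over window);
    // out[idx] = acc;
}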
Code example #5
File: morph_impl.hpp Project: hxiaox/arrayfire
// Template header omitted in the original excerpt; isDilation selects
// dilation (true) or erosion (false) at compile time.
template<typename T, bool isDilation>
Array<T> morph(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        AF_ERROR("Only square masks are supported in cuda morph currently", AF_ERR_SIZE);
    if (mdims[0] > 19)
        AF_ERROR("Upto 19x19 square kernels are only supported in cuda currently", AF_ERR_SIZE);

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
                                       mdims[0] * mdims[1] * sizeof(T),
                                       0, cudaMemcpyDeviceToDevice,
                                       cuda::getStream(cuda::getActiveDeviceId())));

    if (isDilation)
        kernel::morph<T, true >(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
Code example #6
// Template header omitted in the excerpt; T, full_conn and n_per_thread
// are template parameters of the enclosing function template.
template<typename T, bool full_conn, int n_per_thread>
void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex)
{
    const dim3 threads(THREADS_X, THREADS_Y);

    const int blk_x = divup(in.dims[0], threads.x*2);
    const int blk_y = divup(in.dims[1], threads.y*2);

    const dim3 blocks(blk_x, blk_y);

    CUDA_LAUNCH((initial_label<T,n_per_thread>), blocks, threads, out, in);

    POST_LAUNCH_CHECK();

    int h_continue = 1;

    while (h_continue) {
        h_continue = 0;
        CUDA_CHECK(cudaMemcpyToSymbolAsync(continue_flag, &h_continue, sizeof(int),
                    0, cudaMemcpyHostToDevice,
                    cuda::getActiveStream()));

        CUDA_LAUNCH((update_equiv<T, 16, n_per_thread, full_conn>), blocks, threads, out, tex);

        POST_LAUNCH_CHECK();

        CUDA_CHECK(cudaMemcpyFromSymbolAsync(&h_continue, continue_flag, sizeof(int),
                    0, cudaMemcpyDeviceToHost,
                    cuda::getActiveStream()));
        CUDA_CHECK(cudaStreamSynchronize(cuda::getActiveStream()));
    }

    // Now, perform the final relabeling.  This converts the equivalency
    // map from having unique labels based on the lowest pixel in the
    // component to being sequentially numbered components starting at
    // 1.
    int size = in.dims[0] * in.dims[1];
    auto tmp = cuda::memAlloc<T>(size);
    CUDA_CHECK(cudaMemcpyAsync(tmp.get(), out.ptr, size * sizeof(T),
                          cudaMemcpyDeviceToDevice,
                          cuda::getActiveStream()));

    // Wrap raw device ptr
    thrust::device_ptr<T> wrapped_tmp = thrust::device_pointer_cast(tmp.get());

    // Sort the copy
    THRUST_SELECT(thrust::sort, wrapped_tmp, wrapped_tmp + size);

    // After sorting, the last element is the largest label; adding one
    // gives the number of label bins to relabel.
    const int num_bins = wrapped_tmp[size - 1] + 1;

    // If the number of label assignments is two,
    // then either the entire input image is one big
    // component(1's) or it has only one component other than
    // background(0's). Either way, no further
    // post-processing of labels is required.
    if (num_bins <= 2)
        return;

    cuda::ThrustVector<T> labels(num_bins);

    // Find the end of each section of values
    thrust::counting_iterator<T> search_begin(0);
    THRUST_SELECT(thrust::upper_bound, wrapped_tmp,  wrapped_tmp  + size,
                        search_begin, search_begin + num_bins,
                        labels.begin());

    THRUST_SELECT(thrust::adjacent_difference, labels.begin(), labels.end(), labels.begin());

    // Operators for the scan
    clamp_to_one<T> clamp;
    thrust::plus<T> add;

    // Perform scan -- this computes the correct labels for each component
    THRUST_SELECT(thrust::transform_exclusive_scan,
                                     labels.begin(),
                                     labels.end(),
                                     labels.begin(),
                                     clamp,
                                     0,
                                     add);
    // Apply the correct labels to the equivalency map
    CUDA_LAUNCH((final_relabel<T, n_per_thread>), blocks, threads,
                out, in, thrust::raw_pointer_cast(&labels[0]));

    POST_LAUNCH_CHECK();
}
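
The while loop above is a host-driven fixed-point iteration: clear a device flag, run update_equiv, copy the flag back with cudaMemcpyFromSymbolAsync, synchronize the stream, and repeat until no thread reported a change. A minimal sketch of the device side of that protocol; continue_flag is named in the snippet, but the kernel body here is illustrative:

// The flag is a device-global symbol; any thread that changes a label
// sets it to 1. Concurrent writes of the same value are benign, so no
// atomics are needed.
__device__ int continue_flag;

__global__ void update_equiv_sketch(/* label buffers ... */)
{
    bool changed = false;
    // ... merge neighbouring equivalence classes, setting 'changed'
    //     whenever a label is lowered ...
    if (changed) continue_flag = 1;
}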
Code example #7
File: cudart.c Project: Shelnutt2/cuda-wine-wrapper
cudaError_t WINAPI wine_cudaMemcpyToSymbolAsync( const char *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream ) {
    WINE_TRACE("\n");
    return cudaMemcpyToSymbolAsync( symbol, src, count, offset, kind, stream );
}
Code example #8
File: cuda_shared.hpp Project: SCS-B3C/OpenCV2-2
template<class T> static inline void uploadConstant(const char* name, const T& value, cudaStream_t stream)
{
    cudaSafeCall( cudaMemcpyToSymbolAsync(name, &value, sizeof(T), 0, cudaMemcpyHostToDevice, stream) );
}