Example #1
0
        void sort0_index(Param<T> val, Param<unsigned> idx)
        {
            thrust::device_ptr<T>        val_ptr = thrust::device_pointer_cast(val.ptr);
            thrust::device_ptr<unsigned> idx_ptr = thrust::device_pointer_cast(idx.ptr);

            for(int w = 0; w < val.dims[3]; w++) {
                int valW = w * val.strides[3];
                int idxW = w * idx.strides[3];
                for(int z = 0; z < val.dims[2]; z++) {
                    int valWZ = valW + z * val.strides[2];
                    int idxWZ = idxW + z * idx.strides[2];
                    for(int y = 0; y < val.dims[1]; y++) {

                        int valOffset = valWZ + y * val.strides[1];
                        int idxOffset = idxWZ + y * idx.strides[1];

                        THRUST_SELECT(thrust::sequence, idx_ptr + idxOffset, idx_ptr + idxOffset + idx.dims[0]);
                        if(isAscending) {
                            THRUST_SELECT(thrust::sort_by_key,
                                    val_ptr + valOffset, val_ptr + valOffset + val.dims[0],
                                    idx_ptr + idxOffset);
                        } else {
                            THRUST_SELECT(thrust::sort_by_key,
                                        val_ptr + valOffset, val_ptr + valOffset + val.dims[0],
                                        idx_ptr + idxOffset, thrust::greater<T>());
                        }
                    }
                }
            }
            POST_LAUNCH_CHECK();
        }
void regions(cuda::Param<T> out, cuda::CParam<char> in, cudaTextureObject_t tex)
{
    const dim3 threads(THREADS_X, THREADS_Y);

    const int blk_x = divup(in.dims[0], threads.x*2);
    const int blk_y = divup(in.dims[1], threads.y*2);

    const dim3 blocks(blk_x, blk_y);

    CUDA_LAUNCH((initial_label<T,n_per_thread>), blocks, threads, out, in);

    POST_LAUNCH_CHECK();

    int h_continue = 1;

    while (h_continue) {
        h_continue = 0;
        CUDA_CHECK(cudaMemcpyToSymbolAsync(continue_flag, &h_continue, sizeof(int),
                    0, cudaMemcpyHostToDevice,
                    cuda::getActiveStream()));

        CUDA_LAUNCH((update_equiv<T, 16, n_per_thread, full_conn>), blocks, threads, out, tex);

        POST_LAUNCH_CHECK();

        CUDA_CHECK(cudaMemcpyFromSymbolAsync(&h_continue, continue_flag, sizeof(int),
                    0, cudaMemcpyDeviceToHost,
                    cuda::getActiveStream()));
        CUDA_CHECK(cudaStreamSynchronize(cuda::getActiveStream()));
    }

    // Now, perform the final relabeling.  This converts the equivalency
    // map from having unique labels based on the lowest pixel in the
    // component to being sequentially numbered components starting at
    // 1.
    int size = in.dims[0] * in.dims[1];
    auto tmp = cuda::memAlloc<T>(size);
    CUDA_CHECK(cudaMemcpyAsync(tmp.get(), out.ptr, size * sizeof(T),
                          cudaMemcpyDeviceToDevice,
                          cuda::getActiveStream()));

    // Wrap raw device ptr
    thrust::device_ptr<T> wrapped_tmp = thrust::device_pointer_cast(tmp.get());

    // Sort the copy
    THRUST_SELECT(thrust::sort, wrapped_tmp, wrapped_tmp + size);

    // Take the max element which is the number
    // of label assignments to compute.
    const int num_bins = wrapped_tmp[size - 1] + 1;

    // If the number of label assignments is two,
    // then either the entire input image is one big
    // component(1's) or it has only one component other than
    // background(0's). Either way, no further
    // post-processing of labels is required.
    if (num_bins<=2)
        return;

    cuda::ThrustVector<T> labels(num_bins);

    // Find the end of each section of values
    thrust::counting_iterator<T> search_begin(0);
    THRUST_SELECT(thrust::upper_bound, wrapped_tmp,  wrapped_tmp  + size,
                        search_begin, search_begin + num_bins,
                        labels.begin());

    THRUST_SELECT(thrust::adjacent_difference, labels.begin(), labels.end(), labels.begin());

    // Operators for the scan
    clamp_to_one<T> clamp;
    thrust::plus<T> add;

    // Perform scan -- this computes the correct labels for each component
    THRUST_SELECT(thrust::transform_exclusive_scan,
                                     labels.begin(),
                                     labels.end(),
                                     labels.begin(),
                                     clamp,
                                     0,
                                     add);
    // Apply the correct labels to the equivalency map
    CUDA_LAUNCH((final_relabel<T,n_per_thread>), blocks,threads,
            out, in, thrust::raw_pointer_cast(&labels[0]));

    POST_LAUNCH_CHECK();
}