void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream) 
    {
        CV_Assert(dst);

        bool double_ok = TargetArchs::builtWith(NATIVE_DOUBLE) && 
                         DeviceInfo().supports(NATIVE_DOUBLE);
        CV_Assert(src.depth() != CV_64F || double_ok);

        int depth = src.depth();
        int num_channels = src.channels();
        Size size = src.size();

        if (num_channels == 1)
        {
            src.copyTo(dst[0]);
            return;
        }

        for (int i = 0; i < num_channels; ++i)
            dst[i].create(src.size(), depth);

        CV_Assert(num_channels <= 4);

        DevMem2D dst_as_devmem[4];
        for (int i = 0; i < num_channels; ++i)
            dst_as_devmem[i] = dst[i];

        DevMem2D src_as_devmem(src);
        split_merge::split_caller(src_as_devmem, dst_as_devmem,
                                  num_channels, src.elemSize1(), 
                                  stream);
    }