Example #1
void DeviceMatrix3D_copyFromDevice(const DeviceMatrix3D& self, float* dst)
{
    if ((self.dim_x == 0) || (self.dim_y == 0) || (self.dim_t == 0)) {
        // Bail early if there is nothing to copy
        return;
    }

    if (self.pitch_t == self.dim_y * self.pitch_y) {
        // Shortcut if we're packed in the t direction
        const size_t widthInBytes = self.dim_x * sizeof(float);
        CUDA_SAFE_CALL_NO_SYNC
            (cudaMemcpy2D(dst, widthInBytes,
                          self.data, self.pitch_y * sizeof(float),
                          widthInBytes, self.dim_y * self.dim_t,
                          cudaMemcpyDeviceToHost));

        return;
    }

    // Do a series of copies to fill in the 3D array
    for (size_t t=0; t < self.dim_t; t++) {
        const size_t widthInBytes = self.dim_x * sizeof(float);
        float* host_start = dst + t * self.dim_y * self.dim_x;
        float* device_start = self.data + t * self.pitch_t;
        CUDA_SAFE_CALL_NO_SYNC
            (cudaMemcpy2D(host_start, widthInBytes,
                          device_start, self.pitch_y * sizeof(float),
                          widthInBytes, self.dim_y,
                          cudaMemcpyDeviceToHost));
    }
}
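Both code paths above reduce to the standard pitched-copy idiom. As a point of reference, here is a minimal, self-contained sketch (not from the original code base) that round-trips a dense host buffer through a cudaMallocPitch allocation; dimensions are arbitrary placeholders and error checks are omitted for brevity.

#include <cuda_runtime.h>
#include <cassert>
#include <vector>

int main()
{
    const size_t dim_x = 100, dim_y = 64;            // elements per row, rows
    const size_t widthInBytes = dim_x * sizeof(float);

    float* d_data = nullptr;
    size_t pitch = 0;                                // device row stride in bytes
    cudaMallocPitch(reinterpret_cast<void**>(&d_data), &pitch, widthInBytes, dim_y);

    std::vector<float> host(dim_x * dim_y, 1.0f);
    // Dense host rows (spitch == widthInBytes) into pitched device rows.
    cudaMemcpy2D(d_data, pitch, host.data(), widthInBytes,
                 widthInBytes, dim_y, cudaMemcpyHostToDevice);

    // And back: pitched device rows into dense host rows.
    std::vector<float> back(dim_x * dim_y, 0.0f);
    cudaMemcpy2D(back.data(), widthInBytes, d_data, pitch,
                 widthInBytes, dim_y, cudaMemcpyDeviceToHost);

    assert(back == host);
    cudaFree(d_data);
    return 0;
}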
Example #2
template <typename Dtype>
void PaddingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  // top[n, c, h, w] = bottom[n, c, h - pad_beg, w - pad_beg] if in range
  if (pad_pos_) {
    caffe_gpu_set(top[0]->count(), Dtype(0), top[0]->mutable_gpu_data());
    for (int n = 0; n < num_; ++n) {
      for (int c = 0; c < channels_; ++c) {
        CUDA_CHECK(cudaMemcpy2D(
            top[0]->mutable_gpu_data(n, c, pad_beg_, pad_beg_), sizeof(Dtype) * width_out_,
            bottom[0]->gpu_data(n, c), sizeof(Dtype) * width_in_,
            sizeof(Dtype) * width_in_, height_in_,
            cudaMemcpyDeviceToDevice));
      }
    }
  } else {
    for (int n = 0; n < num_; ++n) {
      for (int c = 0; c < channels_; ++c) {
        CUDA_CHECK(cudaMemcpy2D(
            top[0]->mutable_gpu_data(n, c), sizeof(Dtype) * width_out_,
            bottom[0]->gpu_data(n, c, -pad_beg_, -pad_beg_), sizeof(Dtype) * width_in_,
            sizeof(Dtype) * width_out_, height_out_,
            cudaMemcpyDeviceToDevice));
      }
    }
  }
}
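The gpu_data(n, c, h, w) overloads used above are not part of the stock Caffe Blob interface. Assuming dense NCHW storage (as in stock Caffe), they presumably wrap offset arithmetic like the following hedged sketch of the pad_pos_ branch; the function name and parameters are placeholders.

#include <cuda_runtime.h>
#include <cstddef>

template <typename Dtype>
void pad_copy_sketch(Dtype* top, const Dtype* bottom,
                     int num, int channels,
                     int height_in, int width_in,
                     int height_out, int width_out, int pad_beg)
{
    for (int n = 0; n < num; ++n) {
        for (int c = 0; c < channels; ++c) {
            // NCHW offset: ((n*C + c)*H + h)*W + w, with h = w = pad_beg.
            Dtype* dst = top
                + ((static_cast<size_t>(n) * channels + c) * height_out + pad_beg) * width_out
                + pad_beg;
            const Dtype* src = bottom
                + (static_cast<size_t>(n) * channels + c) * height_in * width_in;
            // One 2D copy per (n, c) slice: width_in columns, height_in rows,
            // row strides expressed in bytes as cudaMemcpy2D requires.
            cudaMemcpy2D(dst, sizeof(Dtype) * width_out,
                         src, sizeof(Dtype) * width_in,
                         sizeof(Dtype) * width_in, height_in,
                         cudaMemcpyDeviceToDevice);
        }
    }
}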
Example #3
void
FreeImageStack::loadImage(unsigned int iSlice, npp::ImageNPP_8u_C1 & rImage)
const
{
    NPP_ASSERT_MSG(iSlice < slices(), "Slice index exceeded number of slices in stack.");
    FIBITMAP * pBitmap = FreeImage_LockPage(pImageStack_, iSlice);
    NPP_ASSERT_NOT_NULL(pBitmap);
            // make sure this is an 8-bit single channel image
    NPP_DEBUG_ASSERT(FreeImage_GetColorType(pBitmap) == FIC_MINISBLACK);
    NPP_DEBUG_ASSERT(FreeImage_GetBPP(pBitmap) == 8);
    
    NPP_DEBUG_ASSERT(FreeImage_GetWidth(pBitmap) == nWidth_);
    NPP_DEBUG_ASSERT(FreeImage_GetHeight(pBitmap) == nHeight_);
    unsigned int    nSrcPitch = FreeImage_GetPitch(pBitmap);
    unsigned char * pSrcData  = FreeImage_GetBits(pBitmap);
    
    if (rImage.width() == nWidth_ && rImage.height() == nHeight_)
    {
        NPP_CHECK_CUDA(cudaMemcpy2D(rImage.data(), rImage.pitch(), pSrcData, nSrcPitch, 
                                    nWidth_, nHeight_, cudaMemcpyHostToDevice));
    }
    else
    {
                // create new NPP image
        npp::ImageNPP_8u_C1 oImage(nWidth_, nHeight_);
                // transfer slice data into new device image
        NPP_CHECK_CUDA(cudaMemcpy2D(oImage.data(), oImage.pitch(), pSrcData, nSrcPitch, 
                                    nWidth_, nHeight_, cudaMemcpyHostToDevice));
                // swap the result image with the reference passed into this method
        rImage.swap(oImage);
    }
                // release locked slice
    FreeImage_UnlockPage(pImageStack_, pBitmap, FALSE);
}
Example #4
oz::gpu_image oz::cpu_image::gpu() const {
    if (!is_valid()) return gpu_image();
    gpu_image dst(size(), format(), type_size());
    cudaMemcpy2D(dst.ptr<void>(), dst.pitch(), ptr<void>(), pitch(),
                 row_size(), h(), cudaMemcpyHostToDevice);
    return dst;
}
Example #5
oz::cpu_image::cpu_image( const float4 *src, unsigned src_pitch, unsigned w, unsigned h, bool ignore_alpha )
{
    d_ = new image_data_cpu(w, h, ignore_alpha? FMT_FLOAT3 : FMT_FLOAT4);
    cudaMemcpy2D(ptr(), pitch(),
                 src, src_pitch? src_pitch : sizeof(float4)*w, sizeof(float4)*w, h,
                 cudaMemcpyHostToHost);
}
Example #6
oz::cpu_image::cpu_image( const uchar4 *src, unsigned src_pitch, unsigned w, unsigned h, bool ignore_alpha )
{
    d_ = new image_data_cpu(w, h, ignore_alpha? FMT_UCHAR3 : FMT_UCHAR4);
    cudaMemcpy2D(ptr(), pitch(),
                 src, src_pitch? src_pitch : sizeof(uchar4)*w, sizeof(uchar4)*w, h,
                 cudaMemcpyHostToHost);
}
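A short usage sketch for the two constructors above: passing src_pitch == 0 selects the tightly packed fallback (sizeof(uchar4) * w). The buffer sizes are placeholders; the oz image header name is not shown in this excerpt, so only the standard includes appear.

#include <cuda_runtime.h>   // uchar4
#include <vector>
// plus the oz image header (name not shown in this excerpt)

void make_images_sketch()
{
    const unsigned w = 320, h = 240;
    std::vector<uchar4> packed(w * h);                        // no row padding
    oz::cpu_image a(packed.data(), 0, w, h, /*ignore_alpha=*/false);

    const unsigned padded_pitch = (w + 16) * sizeof(uchar4);  // bytes per row
    std::vector<unsigned char> padded(padded_pitch * h);
    oz::cpu_image b(reinterpret_cast<const uchar4*>(padded.data()),
                    padded_pitch, w, h, /*ignore_alpha=*/true);
}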
Example #7
    //! @brief Copy the ND-array device array to host array.
    //! You must allocate the array with the appropriate size.
    __host__
    inline void copy_to_host(T *host_data) const
    {
      if (N == 2)
      {
        SHAKTI_SAFE_CUDA_CALL(cudaMemcpy2D(
          host_data, _sizes[0] * sizeof(T), _data, _pitch, _sizes[0] * sizeof(T), _sizes[1],
          cudaMemcpyDeviceToHost));
      }
      else if (N == 3)
      {
        cudaMemcpy3DParms params = { 0 };

        params.srcPtr = make_cudaPitchedPtr(reinterpret_cast<void *>(_data),
                                            _pitch, _sizes[0], _sizes[1]);
        params.dstPtr = make_cudaPitchedPtr(host_data, _sizes[0] * sizeof(T),
                                            _sizes[0], _sizes[1]);
        params.extent = make_cudaExtent(_sizes[0]*sizeof(T), _sizes[1],
                                        _sizes[2]);
        params.kind = cudaMemcpyDeviceToHost;

        SHAKTI_SAFE_CUDA_CALL(cudaMemcpy3D(&params));
      }
      else
        SHAKTI_SAFE_CUDA_CALL(cudaMemcpy(host_data, _data, sizeof(T) * size(),
                                         cudaMemcpyDeviceToHost));
    }
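The N == 3 branch above is the standard cudaMemcpy3D download idiom. Here is a standalone sketch of the same pattern, pairing it with cudaMalloc3D; sizes are arbitrary and error checking is omitted for brevity.

#include <cuda_runtime.h>
#include <vector>

void volume_download_sketch()
{
    const size_t w = 64, h = 32, d = 8;              // elements per axis
    const cudaExtent extent = make_cudaExtent(w * sizeof(float), h, d);

    cudaPitchedPtr dev = {};
    cudaMalloc3D(&dev, extent);                      // rows padded to dev.pitch

    std::vector<float> host(w * h * d);
    cudaMemcpy3DParms params = {};
    params.srcPtr = dev;
    params.dstPtr = make_cudaPitchedPtr(host.data(), w * sizeof(float), w, h);
    params.extent = extent;                          // width in bytes for linear memory
    params.kind   = cudaMemcpyDeviceToHost;
    cudaMemcpy3D(&params);

    cudaFree(dev.ptr);
}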
Example #8
oz::cpu_image oz::gpu_image::cpu() const {
    if (!is_valid()) return cpu_image();
    cpu_image dst(size(), format(), type_size());
    OZ_CUDA_SAFE_CALL(cudaMemcpy2D(dst.ptr<void>(), dst.pitch(), ptr<void>(), pitch(),
                                   row_size(), h(), cudaMemcpyDeviceToHost));
    return dst;
}
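Examples #4 and #8 are inverses, so a round trip is one call each way. A minimal usage sketch (assuming the oz image headers are available):

oz::cpu_image round_trip(const oz::cpu_image& img) {
    oz::gpu_image on_device = img.gpu();   // host -> device (Example #4)
    return on_device.cpu();                // device -> host (Example #8)
}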
Example #9
static
void
HostToDeviceCopy2D(Npp32f * pDst, size_t nDstPitch, const Npp32f * pSrc, size_t nSrcPitch, size_t nWidth, size_t nHeight)
{
    cudaError_t eResult;
    eResult = cudaMemcpy2D(pDst, nDstPitch, pSrc, nSrcPitch, nWidth * sizeof(Npp32f), nHeight, cudaMemcpyHostToDevice);
    NPP_ASSERT(cudaSuccess == eResult);
}
Example #10
template<typename A, typename B, int dim>
inline void Copy(Tensor<A,dim> _dst, Tensor<B,dim> _src, cudaMemcpyKind kind){
  utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" );
  Tensor<A,2> dst = _dst.FlatTo2D();
  Tensor<B,2> src = _src.FlatTo2D();
  cudaError_t err = cudaMemcpy2D( dst.dptr, dst.shape.stride_ * sizeof(real_t),
                                  src.dptr, src.shape.stride_ * sizeof(real_t),
                                  dst.shape[0] * sizeof(real_t),
                                  dst.shape[1], kind );
  utils::Assert( err == cudaSuccess, cudaGetErrorString(err) );
}
Example #11
void CudaUtil::cudaCheckMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch,
        size_t width, size_t height, enum cudaMemcpyKind kind, int line, const char* file)
{
    cudaError_t error = cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind);
    if (error != cudaSuccess) {
        std::ostringstream os;
        os << "cudaMemcpy2D returned error code " << error << ", line " << line << ", in file " << file;
        throw CudaException(os.str());
    }
}
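A call-site macro is the usual way to feed the line/file parameters. This is a hedged sketch not shown in the original source; the macro name is an assumption, and it assumes cudaCheckMemcpy2D is a static member (otherwise call it through an instance).

// Hypothetical convenience macro: forwards the call site so exceptions
// report where the copy was actually issued.
#define CUDA_CHECKED_MEMCPY2D(dst, dpitch, src, spitch, width, height, kind) \
    CudaUtil::cudaCheckMemcpy2D((dst), (dpitch), (src), (spitch),            \
                                (width), (height), (kind), __LINE__, __FILE__)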
Example #12
void operator()(Type* dest, const math::Size_t<1> pitchDest,
                const Type* source, const math::Size_t<1> pitchSource, const math::Size_t<2u>& size,
                flags::Memcopy::Direction direction)
{
    const cudaMemcpyKind kind[] = {cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,
                                   cudaMemcpyHostToHost, cudaMemcpyDeviceToDevice};

    CUDA_CHECK_NO_EXCEP(cudaMemcpy2D(dest, pitchDest.x(), source, pitchSource.x(),
                                     sizeof(Type) * size.x(), size.y(),
                                     kind[direction]));
}
Example #13
void vm::scanner::cuda::DeviceMemory2D::copyTo(DeviceMemory2D& other) const
{
    if (empty())
        other.release();
    else
    {
        other.create(rows_, colsBytes_);    
        cudaSafeCall( cudaMemcpy2D(other.data_, other.step_, data_, step_, colsBytes_, rows_, cudaMemcpyDeviceToDevice) );
        cudaSafeCall( cudaDeviceSynchronize() );
    }
}
Example #14
void DeviceMatrix3D_copyToDevice(DeviceMatrix3D& self, const float* data)
{
    if ((self.dim_x > 0) && (self.dim_y > 0) && (self.dim_t > 0)) {
        // Assumes the matrix is packed in t (pitch_t == dim_y * pitch_y);
        // compare the general per-slice path in DeviceMatrix3D_copyFromDevice.
        const size_t widthInBytes = self.dim_x * sizeof(float);
        CUDA_SAFE_CALL_NO_SYNC
        (cudaMemcpy2D(self.data, self.pitch_y * sizeof(float),
                      data, widthInBytes,
                      widthInBytes, self.dim_y * self.dim_t,
                      cudaMemcpyHostToDevice));
    }
}
Example #15
void DeviceMatrix_copyToDevice(DeviceMatrix& self, const float* data)
{
	if ((self.width > 0) && (self.height > 0)) {
		const size_t widthInBytes = self.width * sizeof(float);
		CUDA_SAFE_CALL_NO_SYNC
			(cudaMemcpy2D(self.data, self.pitch * sizeof(float),
			data, widthInBytes,
			widthInBytes, self.height,
			cudaMemcpyHostToDevice));
	}
}
Example #16
void DeviceMatrix_copyFromDevice(const DeviceMatrix& self, float* dst)
{
	if ((self.width > 0) && (self.height > 0)) {
		const size_t widthInBytes = self.width * sizeof(float);
		CUDA_SAFE_CALL_NO_SYNC
			(cudaMemcpy2D(dst, widthInBytes,
			self.data, self.pitch * sizeof(float),
			widthInBytes, self.height,
			cudaMemcpyDeviceToHost));
	}
}
Example #17
oz::cpu_image oz::cpu_image::copy( int x1, int y1, int x2, int y2 ) const {
    int cw = x2 - x1 + 1;
    int ch = y2 - y1 + 1;
    if ((x1 < 0) || (x2 >= (int)w()) ||
        (y1 < 0) || (y2 >= (int)h()) ||
        (cw <= 0) || (ch <= 0)) OZ_X() << "Invalid region!";

    cpu_image dst(cw, ch, format());
    // The source row stride is pitch(); using row_size() here would only be
    // correct for tightly packed images.
    uchar *src_ptr = ptr<uchar>() + y1 * pitch() + x1 * type_size();
    cudaMemcpy2D(dst.ptr(), dst.pitch(), src_ptr, pitch(), dst.row_size(), dst.h(), cudaMemcpyHostToHost);
    return dst;
}
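Usage sketch for the inclusive-corner convention above: (x1, y1)-(x2, y2) selects a window of (x2 - x1 + 1) by (y2 - y1 + 1) pixels. Assumes the oz headers and an image of at least 40x40.

oz::cpu_image crop_window(const oz::cpu_image& img) {
    // Inclusive corners: a 32x32 window with top-left pixel (8, 8).
    return img.copy(8, 8, 39, 39);
}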
Example #18
void magma_copymatrix(
    magma_int_t m, magma_int_t n, size_t elemSize,
    void const* dA_src, magma_int_t lda,
    void*       dB_dst, magma_int_t ldb )
{
    cudaError_t status;
    status = cudaMemcpy2D(
        dB_dst, ldb*elemSize,
        dA_src, lda*elemSize,
        m*elemSize, n, cudaMemcpyDeviceToDevice );
    check_error( status );
}
Example #19
void magma_scopymatrix_internal(
    magma_int_t m, magma_int_t n,
    float const* dA_src, magma_int_t lda,
    float*       dB_dst, magma_int_t ldb,
    const char* func, const char* file, int line )
{
    cudaError_t status;
    status = cudaMemcpy2D(
        dB_dst, ldb*sizeof(float),
        dA_src, lda*sizeof(float),
        m*sizeof(float), n, cudaMemcpyDeviceToDevice );
    check_xerror( status, func, file, line );
}
Example #20
void magma_copymatrix_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    void const* dA_src, magma_int_t lda,
    void*       dB_dst, magma_int_t ldb,
    const char* func, const char* file, int line )
{
    cudaError_t status;
    status = cudaMemcpy2D(
        dB_dst, ldb*elemSize,
        dA_src, lda*elemSize,
        m*elemSize, n, cudaMemcpyDeviceToDevice );
    check_xerror( status, func, file, line );
}
Example #21
// --------------------
extern "C" void
magma_zcopymatrix_internal(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex_const_ptr dA_src, magma_int_t lda,
    magmaDoubleComplex_ptr       dB_dst, magma_int_t ldb,
    const char* func, const char* file, int line )
{
    cudaError_t status;
    status = cudaMemcpy2D(
        dB_dst, ldb*sizeof(magmaDoubleComplex),
        dA_src, lda*sizeof(magmaDoubleComplex),
        m*sizeof(magmaDoubleComplex), n, cudaMemcpyDeviceToDevice );
    check_xerror( status, func, file, line );
}
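All four MAGMA wrappers above reduce to the same mapping: the matrices are column-major, so each m-element column is one "row" of the 2D copy, n is the row count, and the leading dimensions become the pitches. A hedged standalone equivalent for floats (function name is a placeholder):

#include <cuda_runtime.h>

void copy_block_sketch(float* dB, int ldb, const float* dA, int lda, int m, int n)
{
    // widthInBytes = one column (m elements); height = n columns.
    cudaMemcpy2D(dB, ldb * sizeof(float),
                 dA, lda * sizeof(float),
                 m * sizeof(float), n, cudaMemcpyDeviceToDevice);
}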
Example #22
// This test specifies a single test (where you specify radius and/or iterations)
int runSingleTest(char *ref_file, char *exec_path)
{
    int nTotalErrors = 0;
    char dump_file[256];

    printf("[runSingleTest]: [%s]\n", sSDKsample);

    initCuda();

    unsigned int *dResult;
    unsigned int *hResult = (unsigned int *)malloc(width * height * sizeof(unsigned int));
    size_t pitch;
    checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height));

    // run the sample radius
    {
        printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius, iterations);
        bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);

        // check if kernel execution generated an error
        getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED");
        checkCudaErrors(cudaDeviceSynchronize());

        // read back the results to system memory
        checkCudaErrors(cudaMemcpy2D(hResult, sizeof(unsigned int)*width, dResult, pitch,
                                     sizeof(unsigned int)*width, height, cudaMemcpyDeviceToHost));

        sprintf(dump_file, "nature_%02d.ppm", filter_radius);

        sdkSavePPM4ub((const char *)dump_file, (unsigned char *)hResult, width, height);

        if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, 0.15f, false))
        {
            printf("Image is Different ");
            nTotalErrors++;
        }
        else
        {
            printf("Image is Matching ");
        }

        printf(" <%s>\n", ref_file);
    }
    printf("\n");

    free(hResult);
    checkCudaErrors(cudaFree(dResult));

    return nTotalErrors;
}
Example #23
cudaError_t cudaMemcpy3Dfix(const struct cudaMemcpy3DParms *param) {
    const cudaMemcpy3DParms& p = *param;
    // Use cudaMemcpy3D for 3D copies only; it does not handle 2D or 1D
    // copies well, so dispatch those to the 2D/1D API instead.

    if (1 < p.extent.depth) {
        return cudaMemcpy3D(&p);

    } else if (1 < p.extent.height) {
        // 2D copy

        // Array copy
        if (0 != p.srcArray && 0 == p.dstArray) {
            return cudaMemcpy2DFromArray(p.dstPtr.ptr, p.dstPtr.pitch, p.srcArray, p.srcPos.x, p.srcPos.y, p.extent.width, p.extent.height, p.kind);

        } else if (0 == p.srcArray && 0 != p.dstArray) {
            return cudaMemcpy2DToArray(p.dstArray, p.dstPos.x, p.dstPos.y, p.srcPtr.ptr, p.srcPtr.pitch, p.extent.width, p.extent.height, p.kind);

        } else if (0 != p.srcArray && 0 != p.dstArray) {
            return cudaMemcpy2DArrayToArray(p.dstArray, p.dstPos.x, p.dstPos.y, p.srcArray, p.srcPos.x, p.srcPos.y, p.extent.width, p.extent.height, p.kind);

        } else {
            return cudaMemcpy2D(p.dstPtr.ptr, p.dstPtr.pitch, p.srcPtr.ptr, p.srcPtr.pitch, p.extent.width, p.extent.height, p.kind);
        }

    } else {
        // 1D copy

        // p.extent.width should not include pitch
        EXCEPTION_ASSERT( p.extent.width == p.dstPtr.xsize );
        EXCEPTION_ASSERT( p.extent.width == p.srcPtr.xsize );

        // Array copy
        if (0 != p.srcArray && 0 == p.dstArray) {
            return cudaMemcpyFromArray(p.dstPtr.ptr, p.srcArray, p.srcPos.x, p.srcPos.y, p.extent.width, p.kind);

        } else if (0 == p.srcArray && 0 != p.dstArray) {
            return cudaMemcpyToArray(p.dstArray, p.dstPos.x, p.dstPos.y, p.srcPtr.ptr, p.extent.width, p.kind);

        } else if (0 != p.srcArray && 0 != p.dstArray) {
            return cudaMemcpyArrayToArray(p.dstArray, p.dstPos.x, p.dstPos.y, p.srcArray, p.srcPos.x, p.srcPos.y, p.extent.width, p.kind);

        } else {
            return cudaMemcpy(p.dstPtr.ptr, p.srcPtr.ptr, p.extent.width, p.kind);
        }
    }
}
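Usage sketch for cudaMemcpy3Dfix: a depth-1, height > 1 extent routes through the plain cudaMemcpy2D branch above (srcArray/dstArray stay null via zero-initialization). The pointers and sizes are placeholders.

#include <cuda_runtime.h>

cudaError_t copy2d_via_fix(void* h_dst, size_t dst_pitch,
                           void* d_src, size_t src_pitch,
                           size_t width_bytes, size_t height)
{
    cudaMemcpy3DParms p = {};
    p.srcPtr = make_cudaPitchedPtr(d_src, src_pitch, width_bytes, height);
    p.dstPtr = make_cudaPitchedPtr(h_dst, dst_pitch, width_bytes, height);
    p.extent = make_cudaExtent(width_bytes, height, 1);   // depth 1 -> 2D branch
    p.kind   = cudaMemcpyDeviceToHost;
    return cudaMemcpy3Dfix(&p);
}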
Example #24
void
FreeImageStack::appendImage(const npp::ImageNPP_32f_C1 & rImage)
{
    NPP_ASSERT(rImage.width() == nWidth_);
    NPP_ASSERT(rImage.height() == nHeight_);
    
            // create the result image storage using FreeImage so we can easily 
            // save
    unsigned int nResultPitch   = FreeImage_GetPitch(pBitmap_32f_);
    float * pResultData = reinterpret_cast<float *>(FreeImage_GetBits(pBitmap_32f_));

    NPP_CHECK_CUDA(cudaMemcpy2D(pResultData, nResultPitch, rImage.data(), rImage.pitch(),
                                nWidth_ * sizeof(Npp32f), nHeight_, cudaMemcpyDeviceToHost));
    FreeImage_AppendPage(pImageStack_, pBitmap_32f_);

    appendAzimuthalAnalysis(pResultData, nResultPitch);
}
Example #25
SEXP R_auto_cudaMemcpy2D(SEXP r_dst, SEXP r_dpitch, SEXP r_src, SEXP r_spitch, SEXP r_width, SEXP r_height, SEXP r_kind)
{
    SEXP r_ans = R_NilValue;
    void * dst = GET_REF(r_dst, void );
    size_t dpitch = REAL(r_dpitch)[0];
    const void * src = GET_REF(r_src, const void );
    size_t spitch = REAL(r_spitch)[0];
    size_t width = REAL(r_width)[0];
    size_t height = REAL(r_height)[0];
    enum cudaMemcpyKind kind = (enum cudaMemcpyKind) INTEGER(r_kind)[0];
    
    cudaError_t ans;
    ans = cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind);
    
    r_ans = Renum_convert_cudaError_t(ans) ;
    
    return(r_ans);
}
Example #26
int main(int argc, char *argv[])
{
    printf("%s Starting...\n\n", argv[0]);

    try
    {
        std::string sFilename;
        char *filePath = sdkFindFilePath("person.txt", argv[0]);

        if (filePath)
        {
            sFilename = filePath;
        }
        else
        {
            printf("Error %s was unable to find person.txt\n", argv[0]);
            exit(EXIT_FAILURE);
        }

        cudaDeviceInit(argc, (const char **)argv);

        printfNPPinfo(argc, argv);

        if (g_bQATest == false && (g_nDevice == -1) && argc > 1)
        {
            sFilename = argv[1];
        }

        // if we specify the filename at the command line, then we only test sFilename
        int file_errors = 0;

        std::ifstream infile(sFilename.data(), std::ifstream::in);

        if (infile.good())
        {
            std::cout << "imageSegmentationNPP opened: <" << sFilename.data() << "> successfully!" << std::endl;
            file_errors = 0;
            infile.close();
        }
        else
        {
            std::cout << "imageSegmentationNPP unable to open: <" << sFilename.data() << ">" << std::endl;
            file_errors++;
            infile.close();
        }

        if (file_errors > 0)
        {
            exit(EXIT_FAILURE);
        }

        std::string sResultFilename = sFilename;

        std::string::size_type dot = sResultFilename.rfind('.');

        if (dot != std::string::npos)
        {
            sResultFilename = sResultFilename.substr(0, dot);
        }

        sResultFilename += "_segmentation.pgm";

        if (argc >= 3 && !g_bQATest)
        {
            sResultFilename = argv[2];
        }

        // load MRF declaration
        int width, height, nLabels;
        int *hCue, *vCue, *dataCostArray;

        loadMiddleburyMRFData(sFilename, dataCostArray, hCue, vCue, width, height, nLabels);
        NPP_ASSERT(nLabels == 2);

        std::cout << "Dataset: " << sFilename << std::endl;
        std::cout << "Size: " << width << "x" << height << std::endl;

        NppiSize size;
        size.width = width;
        size.height = height;

        NppiRect roi;
        roi.x=0;
        roi.y=0;
        roi.width=width;
        roi.height=height;

        // Setup flow network
        int step, transposed_step;
        Npp32s *d_source, *d_sink, *d_terminals, *d_left_transposed, *d_right_transposed, *d_top, *d_bottom;

        // Setup terminal capacities
        d_source = nppiMalloc_32s_C1(width, height, &step);
        cudaMemcpy2D(d_source, step, dataCostArray, width * sizeof(int), width*sizeof(int), height, cudaMemcpyHostToDevice);
        d_sink = nppiMalloc_32s_C1(width, height, &step);
        cudaMemcpy2D(d_sink, step, &dataCostArray[width*height], width * sizeof(int), width*sizeof(int), height, cudaMemcpyHostToDevice);

        d_terminals = nppiMalloc_32s_C1(width, height, &step);

        nppiSub_32s_C1RSfs(d_sink, step, d_source, step, d_terminals, step, size, 0);

        // Setup edge capacities
        NppiSize edgeTransposedSize;
        edgeTransposedSize.width = height;
        edgeTransposedSize.height = width-1;

        NppiSize oneRowTransposedSize;
        oneRowTransposedSize.width = height;
        oneRowTransposedSize.height = 1;

        d_right_transposed = nppiMalloc_32s_C1(height, width, &transposed_step);
        cudaMemcpy2D(d_right_transposed, transposed_step, hCue, height * sizeof(int), height * sizeof(int), width, cudaMemcpyHostToDevice);

        d_left_transposed = nppiMalloc_32s_C1(height, width, &transposed_step);
        nppiSet_32s_C1R(0, d_left_transposed, transposed_step, oneRowTransposedSize);
        nppiCopy_32s_C1R(d_right_transposed, transposed_step, d_left_transposed + transposed_step/sizeof(int), transposed_step, edgeTransposedSize);

        NppiSize edgeSize;
        edgeSize.width = width;
        edgeSize.height = height-1;

        NppiSize oneRowSize;
        oneRowSize.width = width;
        oneRowSize.height = 1;

        d_bottom = nppiMalloc_32s_C1(width, height, &step);
        cudaMemcpy2D(d_bottom, step, vCue, width * sizeof(int), width*sizeof(int), height, cudaMemcpyHostToDevice);

        d_top = nppiMalloc_32s_C1(width, height, &step);
        nppiSet_32s_C1R(0, d_top, step, oneRowSize);
        nppiCopy_32s_C1R(d_bottom, step, d_top + step/sizeof(int), step, edgeSize);

        // Allocate temp storage for graphcut computation
        Npp8u *pBuffer;
        int bufferSize;
        nppiGraphcutGetSize(size, &bufferSize);
        cudaMalloc(&pBuffer, bufferSize);

        NppiGraphcutState *pGraphcutState;
        nppiGraphcutInitAlloc(size, &pGraphcutState, pBuffer);

        // Allocate label storage
        npp::ImageNPP_8u_C1 oDeviceDst(width, height);

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        // Compute the graphcut, result is 0 / !=0
        cudaEventRecord(start,0);

        nppiGraphcut_32s8u(d_terminals, d_left_transposed, d_right_transposed,
                           d_top, d_bottom, step, transposed_step,
                           size, oDeviceDst.data(), oDeviceDst.pitch(), pGraphcutState);

        cudaEventRecord(stop,0);
        cudaEventSynchronize(stop);

        float time;
        cudaEventElapsedTime(&time, start, stop);
        std::cout << "Elapsed Time: " << time << " ms" << std::endl;

        // declare a host image object for an 8-bit grayscale image
        npp::ImageCPU_8u_C1 oHostAlpha(width, height);

        // convert graphcut result to 0/255 alpha image using new nppiCompareC_8u_C1R primitive (CUDA 5.0)
        npp::ImageNPP_8u_C1 oDeviceAlpha(width, height);
        nppiCompareC_8u_C1R(oDeviceDst.data(), oDeviceDst.pitch(), 0, oDeviceAlpha.data(), oDeviceAlpha.pitch(), size,
                            NPP_CMP_GREATER);

        // and copy the result to host
        oDeviceAlpha.copyTo(oHostAlpha.data(), oHostAlpha.pitch());

        int E_d, E_s;
        std::cout << "Graphcut Cost: " << computeEnergy(E_d, E_s, oHostAlpha.data(), oHostAlpha.pitch(), hCue, vCue, dataCostArray, width, height) << std::endl;
        std::cout << "(E_d = " << E_d << ", E_s = " << E_s << ")" << std::endl;

        std::cout << "Saving segmentation result as " << sResultFilename << std::endl;
        saveImage(sResultFilename, oHostAlpha);

        nppiGraphcutFree(pGraphcutState);
        cudaFree(pBuffer);
        cudaFree(d_top);
        cudaFree(d_bottom);
        cudaFree(d_left_transposed);
        cudaFree(d_right_transposed);
        cudaFree(d_source);
        cudaFree(d_sink);
        cudaFree(d_terminals);

        exit(EXIT_SUCCESS);
    }
    catch (npp::Exception &rException)
    {
        std::cerr << "Program error! The following exception occurred: \n";
        std::cerr << rException << std::endl;
        std::cerr << "Aborting." << std::endl;
        exit(EXIT_FAILURE);
    }
    catch (...)
    {
        std::cerr << "Program error! An unknow type of exception occurred. \n";
        std::cerr << "Aborting." << std::endl;
        exit(EXIT_FAILURE);
    }

    return 0;
}
Example #27
////////////////////////////////////////////////////////////////////////////////
// Program Main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
	int Nx, Ny, Nz, max_iters;
	int blockX, blockY, blockZ;

	if (argc == 8) {
		Nx = atoi(argv[1]);
		Ny = atoi(argv[2]);
		Nz = atoi(argv[3]);
		max_iters = atoi(argv[4]);
		blockX = atoi(argv[5]);
		blockY = atoi(argv[6]);
		blockZ = atoi(argv[7]);
	}
	else
	{
		printf("Usage: %s nx ny nz i block_x block_y block_z number_of_threads\n", 
			argv[0]);
		exit(1);
	}

	// Get the number of GPUS
	int number_of_devices;
	checkCuda(cudaGetDeviceCount(&number_of_devices));
  
	if (number_of_devices < 2) {
		printf("Less than two devices were found.\n");
		printf("Exiting...\n");

		return -1;
	}

	// Decompose along the Z-axis
	int _Nz = Nz/number_of_devices;

	// Define constants
	const _DOUBLE_ L = 1.0;
	const _DOUBLE_ h = L/(Nx+1);
	const _DOUBLE_ dt = h*h/6.0;
	const _DOUBLE_ beta = dt/(h*h);
	const _DOUBLE_ c0 = beta;
	const _DOUBLE_ c1 = (1-6*beta);

	// Check if ECC is turned on
	ECCCheck(number_of_devices);

	// Set the number of OpenMP threads
	omp_set_num_threads(number_of_devices);

	#pragma omp parallel
	{
		#pragma omp single
		{
			printf("Number of OpenMP threads: %d\n", omp_get_num_threads());
		}
	}

	// CPU memory operations
	int dt_size = sizeof(_DOUBLE_);

	_DOUBLE_ *u_new, *u_old;

	u_new = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
	u_old = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

	init(u_old, u_new, h, Nx, Ny, Nz);

	// Allocate and generate arrays on the host
	size_t pitch_bytes;
	size_t pitch_gc_bytes;

	_DOUBLE_ *h_Unew, *h_Uold;
	_DOUBLE_ *h_s_Uolds[number_of_devices], *h_s_Unews[number_of_devices];
	_DOUBLE_ *left_send_buffer[number_of_devices], *left_receive_buffer[number_of_devices];
	_DOUBLE_ *right_send_buffer[number_of_devices], *right_receive_buffer[number_of_devices];

	h_Unew = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
	h_Uold = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

	init(h_Uold, h_Unew, h, Nx, Ny, Nz);

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		// Allocate pinned (page-locked) host buffers for the sub-domains
		// and the halo-exchange staging areas.
		checkCuda(cudaHostAlloc((void**)&h_s_Unews[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&h_s_Uolds[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&right_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&left_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&right_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
		checkCuda(cudaHostAlloc((void**)&left_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

		init_subdomain(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
	}

	// GPU memory operations
	_DOUBLE_ *d_s_Unews[number_of_devices], *d_s_Uolds[number_of_devices];
	_DOUBLE_ *d_right_send_buffer[number_of_devices], *d_left_send_buffer[number_of_devices];
	_DOUBLE_ *d_right_receive_buffer[number_of_devices], *d_left_receive_buffer[number_of_devices];

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		checkCuda(cudaSetDevice(tid));

		CopyToConstantMemory(c0, c1);

		checkCuda(cudaMallocPitch((void**)&d_s_Uolds[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
		checkCuda(cudaMallocPitch((void**)&d_s_Unews[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
		checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_left_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
		checkCuda(cudaMallocPitch((void**)&d_right_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
	}

	// Copy data from host to the device
	double HtD_timer = 0.;
	HtD_timer -= omp_get_wtime();
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();
		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaMemcpy2D(d_s_Uolds[tid], pitch_bytes, h_s_Uolds[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
		checkCuda(cudaMemcpy2D(d_s_Unews[tid], pitch_bytes, h_s_Unews[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
	}
	HtD_timer += omp_get_wtime();

	int pitch = pitch_bytes/dt_size;
	int gc_pitch = pitch_gc_bytes/dt_size;

	// GPU kernel launch parameters
	dim3 threads_per_block(blockX, blockY, blockZ);
	unsigned int blocksInX = getBlock(Nx, blockX);
	unsigned int blocksInY = getBlock(Ny, blockY);
	unsigned int blocksInZ = getBlock(_Nz-2, k_loop);
	dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
	dim3 thread_blocks_halo(blocksInX, blocksInY);

	double compute_timer = 0.;
	compute_timer -= omp_get_wtime();

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		for(int iterations = 0; iterations < max_iters; iterations++)
		{
			// Compute inner nodes
			checkCuda(cudaSetDevice(tid));
			ComputeInnerPoints(thread_blocks, threads_per_block, d_s_Unews[tid], d_s_Uolds[tid], pitch, Nx, Ny, _Nz);

			// Copy right boundary data to host
			if (tid == 0)
			{
				checkCuda(cudaSetDevice(tid));
				CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
				checkCuda(cudaMemcpy2D(right_send_buffer[tid], dt_size*(Nx+2), d_right_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
			}
			// Copy left boundary data to host
			if (tid == 1)
			{
				checkCuda(cudaSetDevice(tid));
				CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
				checkCuda(cudaMemcpy2D(left_send_buffer[tid], dt_size*(Nx+2), d_left_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
			}

			#pragma omp barrier

			// Copy right boundary data to device 1
			if (tid == 1)
			{
				checkCuda(cudaSetDevice(tid));
				
				checkCuda(cudaMemcpy2D(d_left_receive_buffer[tid], pitch_gc_bytes, right_send_buffer[tid-1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
				CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
			}

			// Copy left boundary data to device 0
			if (tid == 0)
			{
				checkCuda(cudaSetDevice(tid));

				checkCuda(cudaMemcpy2D(d_right_receive_buffer[tid], pitch_gc_bytes, left_send_buffer[tid+1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
				CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
			}

			// Swap pointers on the host
			#pragma omp barrier
			checkCuda(cudaSetDevice(tid));
			checkCuda(cudaDeviceSynchronize());
			swap(_DOUBLE_*, d_s_Unews[tid], d_s_Uolds[tid]);
		}
	}

	compute_timer += omp_get_wtime();

	// Copy data from device to host
	double DtH_timer = 0.;
	DtH_timer -= omp_get_wtime();
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();
		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaMemcpy2D(h_s_Uolds[tid], dt_size*(Nx+2), d_s_Uolds[tid], pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDeviceToHost));
	}
	DtH_timer += omp_get_wtime();

	// Merge sub-domains into a one big domain
	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		merge_domains(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
	}

	// Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
	cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
#endif

	float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
	PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

	_DOUBLE_ t = max_iters * dt;
	CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);

#if defined(DEBUG) || defined(_DEBUG)
    //exportToVTK(h_Uold, h, "heat3D.vtk", Nx, Ny, Nz);
#endif

	#pragma omp parallel
	{
		unsigned int tid = omp_get_thread_num();

		checkCuda(cudaSetDevice(tid));
		checkCuda(cudaFree(d_s_Unews[tid]));
		checkCuda(cudaFree(d_s_Uolds[tid]));
		checkCuda(cudaFree(d_right_send_buffer[tid]));
		checkCuda(cudaFree(d_left_send_buffer[tid]));
		checkCuda(cudaFree(d_right_receive_buffer[tid]));
		checkCuda(cudaFree(d_left_receive_buffer[tid]));
		checkCuda(cudaFreeHost(h_s_Unews[tid]));
		checkCuda(cudaFreeHost(h_s_Uolds[tid]));
		checkCuda(cudaFreeHost(left_send_buffer[tid]));
		checkCuda(cudaFreeHost(right_send_buffer[tid]));
		checkCuda(cudaFreeHost(left_receive_buffer[tid]));
		checkCuda(cudaFreeHost(right_receive_buffer[tid]));
		checkCuda(cudaDeviceReset());
	}

	free(u_old);
	free(u_new);
	free(h_Uold);
	free(h_Unew);

	return 0;
}
Example #28
// dims[0] (row width), ostr and istr (row strides) are all byte counts, as
// cudaMemcpy2D expects; cudaMemcpyDefault lets the driver infer the direction.
void cuda_memcpy_strided(const long dims[2], long ostr, void* dst, long istr, const void* src)
{
	CUDA_ERROR(cudaMemcpy2D(dst, ostr, src, istr, dims[0], dims[1], cudaMemcpyDefault));
}
Example #29
void vm::scanner::cuda::DeviceMemory2D::upload(const void *host_ptr_arg, size_t host_step_arg, int rows_arg, int colsBytes_arg)
{
    create(rows_arg, colsBytes_arg);
    cudaSafeCall( cudaMemcpy2D(data_, step_, host_ptr_arg, host_step_arg, colsBytes_, rows_, cudaMemcpyHostToDevice) );        
    cudaSafeCall( cudaDeviceSynchronize() );
}
Example #30
void vm::scanner::cuda::DeviceMemory2D::download(void *host_ptr_arg, size_t host_step_arg) const
{
    cudaSafeCall( cudaMemcpy2D(host_ptr_arg, host_step_arg, data_, step_, colsBytes_, rows_, cudaMemcpyDeviceToHost) );
    cudaSafeCall( cudaDeviceSynchronize() );
}
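A usage sketch tying the upload/download pair together, round-tripping a dense host buffer. It assumes DeviceMemory2D is default-constructible; the class header include is omitted since its name is not shown in this excerpt.

#include <vector>

void device_memory_round_trip()
{
    const int rows = 480, colsBytes = 640 * sizeof(float);
    std::vector<unsigned char> host(rows * colsBytes, 0);

    vm::scanner::cuda::DeviceMemory2D mem;
    mem.upload(host.data(), colsBytes, rows, colsBytes);  // dense host step
    mem.download(host.data(), colsBytes);                 // back into place
}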