void DeviceMatrix3D_copyFromDevice(const DeviceMatrix3D& self, float* dst)
{
    if ((self.dim_x == 0) || (self.dim_y == 0) || (self.dim_t == 0)) {
        // Bail early if there is nothing to copy
        return;
    }

    if (self.pitch_t == self.dim_y * self.pitch_y) {
        // Shortcut if we're packed in the t direction
        const size_t widthInBytes = self.dim_x * sizeof(float);
        CUDA_SAFE_CALL_NO_SYNC
            (cudaMemcpy2D(dst, widthInBytes,
                          self.data, self.pitch_y * sizeof(float),
                          widthInBytes, self.dim_y * self.dim_t,
                          cudaMemcpyDeviceToHost));
        return;
    }

    // Do a series of copies to fill in the 3D array
    for (size_t t = 0; t < self.dim_t; t++) {
        const size_t widthInBytes = self.dim_x * sizeof(float);
        float* host_start = dst + t * self.dim_y * self.dim_x;
        float* device_start = self.data + t * self.pitch_t;
        CUDA_SAFE_CALL_NO_SYNC
            (cudaMemcpy2D(host_start, widthInBytes,
                          device_start, self.pitch_y * sizeof(float),
                          widthInBytes, self.dim_y,
                          cudaMemcpyDeviceToHost));
    }
}
template <typename Dtype>
void PaddingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
                                      const vector<Blob<Dtype>*>& top) {
  // top[n, c, h, w] = bottom[n, c, h-pad_beg, w-pad_beg] if in range
  if (pad_pos_) {
    caffe_gpu_set(top[0]->count(), Dtype(0), top[0]->mutable_gpu_data());
    for (int n = 0; n < num_; ++n) {
      for (int c = 0; c < channels_; ++c) {
        CUDA_CHECK(cudaMemcpy2D(
            top[0]->mutable_gpu_data(n, c, pad_beg_, pad_beg_),
            sizeof(Dtype) * width_out_,
            bottom[0]->gpu_data(n, c),
            sizeof(Dtype) * width_in_,
            sizeof(Dtype) * width_in_, height_in_,
            cudaMemcpyDeviceToDevice));
      }
    }
  } else {
    for (int n = 0; n < num_; ++n) {
      for (int c = 0; c < channels_; ++c) {
        CUDA_CHECK(cudaMemcpy2D(
            top[0]->mutable_gpu_data(n, c),
            sizeof(Dtype) * width_out_,
            bottom[0]->gpu_data(n, c, -pad_beg_, -pad_beg_),
            sizeof(Dtype) * width_in_,
            sizeof(Dtype) * width_out_, height_out_,
            cudaMemcpyDeviceToDevice));
      }
    }
  }
}
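// A minimal standalone sketch of the same pattern the padding layer uses:
// cudaMemcpy2D into a pitched destination whose pointer is offset by `pad`
// rows and columns. Sizes and names below are hypothetical and not part of
// the Caffe code above; only the CUDA runtime calls are assumed.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const int w_in = 8, h_in = 6, pad = 2;
    const int w_out = w_in + 2 * pad, h_out = h_in + 2 * pad;

    float *d_src = nullptr, *d_dst = nullptr;
    size_t src_pitch = 0, dst_pitch = 0;
    cudaMallocPitch(&d_src, &src_pitch, w_in * sizeof(float), h_in);
    cudaMallocPitch(&d_dst, &dst_pitch, w_out * sizeof(float), h_out);
    cudaMemset2D(d_dst, dst_pitch, 0, w_out * sizeof(float), h_out);

    // Destination origin shifted down `pad` rows and right `pad` columns.
    float* dst_origin = (float*)((char*)d_dst + pad * dst_pitch) + pad;
    cudaMemcpy2D(dst_origin, dst_pitch,
                 d_src, src_pitch,
                 w_in * sizeof(float), h_in,
                 cudaMemcpyDeviceToDevice);

    printf("last CUDA error: %s\n", cudaGetErrorString(cudaGetLastError()));
    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}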
void FreeImageStack::loadImage(unsigned int iSlice, npp::ImageNPP_8u_C1 & rImage) const
{
    NPP_ASSERT_MSG(iSlice < slices(), "Slice index exceeded number of slices in stack.");
    FIBITMAP * pBitmap = FreeImage_LockPage(pImageStack_, iSlice);
    NPP_ASSERT_NOT_NULL(pBitmap);

    // make sure this is an 8-bit single channel image
    NPP_DEBUG_ASSERT(FreeImage_GetColorType(pBitmap) == FIC_MINISBLACK);
    NPP_DEBUG_ASSERT(FreeImage_GetBPP(pBitmap) == 8);
    NPP_DEBUG_ASSERT(FreeImage_GetWidth(pBitmap) == nWidth_);
    NPP_DEBUG_ASSERT(FreeImage_GetHeight(pBitmap) == nHeight_);

    unsigned int nSrcPitch = FreeImage_GetPitch(pBitmap);
    unsigned char * pSrcData = FreeImage_GetBits(pBitmap);

    if (rImage.width() == nWidth_ && rImage.height() == nHeight_)
    {
        NPP_CHECK_CUDA(cudaMemcpy2D(rImage.data(), rImage.pitch(), pSrcData, nSrcPitch,
                                    nWidth_, nHeight_, cudaMemcpyHostToDevice));
    }
    else
    {
        // create new NPP image
        npp::ImageNPP_8u_C1 oImage(nWidth_, nHeight_);
        // transfer slice data into new device image
        NPP_CHECK_CUDA(cudaMemcpy2D(oImage.data(), oImage.pitch(), pSrcData, nSrcPitch,
                                    nWidth_, nHeight_, cudaMemcpyHostToDevice));
        // swap the result image with the reference passed into this method
        rImage.swap(oImage);
    }

    // release locked slice
    FreeImage_UnlockPage(pImageStack_, pBitmap, FALSE);
}
oz::gpu_image oz::cpu_image::gpu() const {
    if (!is_valid()) return gpu_image();
    gpu_image dst(size(), format(), type_size());
    cudaMemcpy2D(dst.ptr<void>(), dst.pitch(), ptr<void>(), pitch(),
                 row_size(), h(), cudaMemcpyHostToDevice);
    return dst;
}
oz::cpu_image::cpu_image( const float4 *src, unsigned src_pitch, unsigned w, unsigned h, bool ignore_alpha ) {
    d_ = new image_data_cpu(w, h, ignore_alpha? FMT_FLOAT3 : FMT_FLOAT4);
    cudaMemcpy2D(ptr(), pitch(), src, src_pitch? src_pitch : sizeof(float4)*w,
                 sizeof(float4)*w, h, cudaMemcpyHostToHost);
}
oz::cpu_image::cpu_image( const uchar4 *src, unsigned src_pitch, unsigned w, unsigned h, bool ignore_alpha ) {
    d_ = new image_data_cpu(w, h, ignore_alpha? FMT_UCHAR3 : FMT_UCHAR4);
    cudaMemcpy2D(ptr(), pitch(), src, src_pitch? src_pitch : sizeof(uchar4)*w,
                 sizeof(uchar4)*w, h, cudaMemcpyHostToHost);
}
//! @brief Copy the ND-array device array to host array.
//! You must allocate the array with the appropriate size.
__host__ inline void copy_to_host(T *host_data) const
{
  if (N == 2)
  {
    SHAKTI_SAFE_CUDA_CALL(cudaMemcpy2D(
        host_data, _sizes[0] * sizeof(T),
        _data, _pitch,
        _sizes[0] * sizeof(T), _sizes[1],
        cudaMemcpyDeviceToHost));
  }
  else if (N == 3)
  {
    cudaMemcpy3DParms params = { 0 };
    params.srcPtr = make_cudaPitchedPtr(reinterpret_cast<void *>(_data),
                                        _pitch, _sizes[0], _sizes[1]);
    params.dstPtr = make_cudaPitchedPtr(host_data, _sizes[0] * sizeof(T),
                                        _sizes[0], _sizes[1]);
    params.extent = make_cudaExtent(_sizes[0] * sizeof(T), _sizes[1], _sizes[2]);
    params.kind = cudaMemcpyDeviceToHost;
    SHAKTI_SAFE_CUDA_CALL(cudaMemcpy3D(&params));
  }
  else
    SHAKTI_SAFE_CUDA_CALL(cudaMemcpy(host_data, _data, sizeof(T) * size(),
                                     cudaMemcpyDeviceToHost));
}
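// For reference, the mirror host-to-device transfer of the 3D case above can
// be written by swapping the pitched pointers. This is a hedged sketch: the
// helper name copy_3d_to_device is hypothetical; only the CUDA runtime API
// (make_cudaPitchedPtr, make_cudaExtent, cudaMemcpy3D) is assumed.
#include <cuda_runtime.h>

// Copy a w x h x d volume of floats from packed host memory into pitched
// device memory (e.g. previously allocated with cudaMalloc3D).
inline cudaError_t copy_3d_to_device(const float* host, cudaPitchedPtr dev,
                                     size_t w, size_t h, size_t d)
{
    cudaMemcpy3DParms params = { 0 };
    params.srcPtr = make_cudaPitchedPtr(const_cast<float*>(host),
                                        w * sizeof(float), w, h);
    params.dstPtr = dev;                      // already pitched on the device
    params.extent = make_cudaExtent(w * sizeof(float), h, d);
    params.kind   = cudaMemcpyHostToDevice;
    return cudaMemcpy3D(&params);
}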
oz::cpu_image oz::gpu_image::cpu() const {
    if (!is_valid()) return cpu_image();
    cpu_image dst(size(), format(), type_size());
    OZ_CUDA_SAFE_CALL(cudaMemcpy2D(dst.ptr<void>(), dst.pitch(), ptr<void>(), pitch(),
                                   row_size(), h(), cudaMemcpyDeviceToHost));
    return dst;
}
static void HostToDeviceCopy2D(Npp32f * pDst, size_t nDstPitch, const Npp32f * pSrc, size_t nSrcPitch,
                               size_t nWidth, size_t nHeight)
{
    cudaError_t eResult;
    eResult = cudaMemcpy2D(pDst, nDstPitch, pSrc, nSrcPitch,
                           nWidth * sizeof(Npp32f), nHeight, cudaMemcpyHostToDevice);
    NPP_ASSERT(cudaSuccess == eResult);
};
template<typename A, typename B, int dim>
inline void Copy(Tensor<A,dim> _dst, Tensor<B,dim> _src, cudaMemcpyKind kind)
{
    utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" );
    Tensor<A,2> dst = _dst.FlatTo2D();
    Tensor<B,2> src = _src.FlatTo2D();
    cudaError_t err = cudaMemcpy2D( dst.dptr, dst.shape.stride_ * sizeof(real_t),
                                    src.dptr, src.shape.stride_ * sizeof(real_t),
                                    dst.shape[0] * sizeof(real_t),
                                    dst.shape[1], kind );
    utils::Assert( err == cudaSuccess, cudaGetErrorString(err) );
}
void CudaUtil::cudaCheckMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch,
                                 size_t width, size_t height, enum cudaMemcpyKind kind,
                                 int line, const char* file)
{
    cudaError_t error = cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind);
    if (error != cudaSuccess)
    {
        std::ostringstream os;
        os << "cudaMemcpy2D returned error code " << error
           << ", line " << line << ", in file " << file;
        throw CudaException(os.str());
    }
}
void operator()(Type* dest,
                const math::Size_t<1> pitchDest,
                const Type* source,
                const math::Size_t<1> pitchSource,
                const math::Size_t<2u>& size,
                flags::Memcopy::Direction direction)
{
    const cudaMemcpyKind kind[] = {cudaMemcpyHostToDevice,
                                   cudaMemcpyDeviceToHost,
                                   cudaMemcpyHostToHost,
                                   cudaMemcpyDeviceToDevice};
    CUDA_CHECK_NO_EXCEP(cudaMemcpy2D(dest,
                                     pitchDest.x(),
                                     source,
                                     pitchSource.x(),
                                     sizeof(Type) * size.x(),
                                     size.y(),
                                     kind[direction]));
}
void vm::scanner::cuda::DeviceMemory2D::copyTo(DeviceMemory2D& other) const
{
    if (empty())
        other.release();
    else
    {
        other.create(rows_, colsBytes_);
        cudaSafeCall( cudaMemcpy2D(other.data_, other.step_, data_, step_,
                                   colsBytes_, rows_, cudaMemcpyDeviceToDevice) );
        cudaSafeCall( cudaDeviceSynchronize() );
    }
}
void DeviceMatrix3D_copyToDevice(DeviceMatrix3D& self, const float* data)
{
    if ((self.dim_x > 0) && (self.dim_y > 0) && (self.dim_t > 0)) {
        const size_t widthInBytes = self.dim_x * sizeof(float);
        CUDA_SAFE_CALL_NO_SYNC
            (cudaMemcpy2D(self.data, self.pitch_y * sizeof(float),
                          data, widthInBytes,
                          widthInBytes, self.dim_y * self.dim_t,
                          cudaMemcpyHostToDevice));
    }
}
void DeviceMatrix_copyToDevice(DeviceMatrix& self, const float* data)
{
    if ((self.width > 0) && (self.height > 0)) {
        const size_t widthInBytes = self.width * sizeof(float);
        CUDA_SAFE_CALL_NO_SYNC
            (cudaMemcpy2D(self.data, self.pitch * sizeof(float),
                          data, widthInBytes,
                          widthInBytes, self.height,
                          cudaMemcpyHostToDevice));
    }
}
void DeviceMatrix_copyFromDevice(const DeviceMatrix& self, float* dst)
{
    if ((self.width > 0) && (self.height > 0)) {
        const size_t widthInBytes = self.width * sizeof(float);
        CUDA_SAFE_CALL_NO_SYNC
            (cudaMemcpy2D(dst, widthInBytes,
                          self.data, self.pitch * sizeof(float),
                          widthInBytes, self.height,
                          cudaMemcpyDeviceToHost));
    }
}
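// A self-contained round-trip sketch of the pattern the DeviceMatrix helpers
// above wrap: pitched allocation, host-to-device copy, device-to-host copy.
// Sizes are hypothetical; only the CUDA runtime calls are assumed.
#include <cuda_runtime.h>
#include <vector>
#include <cstdio>

int main() {
    const int width = 5, height = 4;
    std::vector<float> src(width * height, 1.0f), back(width * height, 0.0f);

    float* d_mat = nullptr;
    size_t pitch = 0;  // returned in bytes
    cudaMallocPitch(&d_mat, &pitch, width * sizeof(float), height);

    cudaMemcpy2D(d_mat, pitch, src.data(), width * sizeof(float),
                 width * sizeof(float), height, cudaMemcpyHostToDevice);
    cudaMemcpy2D(back.data(), width * sizeof(float), d_mat, pitch,
                 width * sizeof(float), height, cudaMemcpyDeviceToHost);

    printf("back[0] = %f\n", back[0]);  // expect 1.0
    cudaFree(d_mat);
    return 0;
}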
oz::cpu_image oz::cpu_image::copy( int x1, int y1, int x2, int y2 ) const {
    int cw = x2 - x1 + 1;
    int ch = y2 - y1 + 1;
    if ((x1 < 0) || (x2 >= (int)w()) || (y1 < 0) || (y2 >= (int)h()) || (cw <= 0) || (ch <= 0))
        OZ_X() << "Invalid region!";
    cpu_image dst(cw, ch, format());
    uchar *src_ptr = ptr<uchar>() + y1 * row_size() + x1 * type_size();
    cudaMemcpy2D(dst.ptr(), dst.pitch(), src_ptr, pitch(),
                 dst.row_size(), dst.h(), cudaMemcpyHostToHost);
    return dst;
}
void magma_copymatrix(
    magma_int_t m, magma_int_t n, size_t elemSize,
    void const* dA_src, magma_int_t lda,
    void*       dB_dst, magma_int_t ldb )
{
    cudaError_t status;
    status = cudaMemcpy2D(
        dB_dst, ldb*elemSize,
        dA_src, lda*elemSize,
        m*elemSize, n, cudaMemcpyDeviceToDevice );
    check_error( status );
}
void magma_scopymatrix_internal(
    magma_int_t m, magma_int_t n,
    float const* dA_src, magma_int_t lda,
    float*       dB_dst, magma_int_t ldb,
    const char* func, const char* file, int line )
{
    cudaError_t status;
    status = cudaMemcpy2D(
        dB_dst, ldb*sizeof(float),
        dA_src, lda*sizeof(float),
        m*sizeof(float), n, cudaMemcpyDeviceToDevice );
    check_xerror( status, func, file, line );
}
void magma_copymatrix_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    void const* dA_src, magma_int_t lda,
    void*       dB_dst, magma_int_t ldb,
    const char* func, const char* file, int line )
{
    cudaError_t status;
    status = cudaMemcpy2D(
        dB_dst, ldb*elemSize,
        dA_src, lda*elemSize,
        m*elemSize, n, cudaMemcpyDeviceToDevice );
    check_xerror( status, func, file, line );
}
// --------------------
extern "C" void
magma_zcopymatrix_internal(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex_const_ptr dA_src, magma_int_t lda,
    magmaDoubleComplex_ptr       dB_dst, magma_int_t ldb,
    const char* func, const char* file, int line )
{
    cudaError_t status;
    status = cudaMemcpy2D(
        dB_dst, ldb*sizeof(magmaDoubleComplex),
        dA_src, lda*sizeof(magmaDoubleComplex),
        m*sizeof(magmaDoubleComplex), n, cudaMemcpyDeviceToDevice );
    check_xerror( status, func, file, line );
}
// This test specifies a single test (where you specify radius and/or iterations)
int runSingleTest(char *ref_file, char *exec_path)
{
    int nTotalErrors = 0;
    char dump_file[256];

    printf("[runSingleTest]: [%s]\n", sSDKsample);

    initCuda();

    unsigned int *dResult;
    unsigned int *hResult = (unsigned int *)malloc(width * height * sizeof(unsigned int));
    size_t pitch;
    checkCudaErrors(cudaMallocPitch((void **)&dResult, &pitch, width*sizeof(unsigned int), height));

    // run the sample radius
    {
        printf("%s (radius=%d) (passes=%d) ", sSDKsample, filter_radius, iterations);
        bilateralFilterRGBA(dResult, width, height, euclidean_delta, filter_radius, iterations, kernel_timer);

        // check if kernel execution generated an error
        getLastCudaError("Error: bilateralFilterRGBA Kernel execution FAILED");
        checkCudaErrors(cudaDeviceSynchronize());

        // readback the results to system memory
        cudaMemcpy2D(hResult, sizeof(unsigned int)*width, dResult, pitch,
                     sizeof(unsigned int)*width, height, cudaMemcpyDeviceToHost);

        sprintf(dump_file, "nature_%02d.ppm", filter_radius);

        sdkSavePPM4ub((const char *)dump_file, (unsigned char *)hResult, width, height);

        if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, 0.15f, false))
        {
            printf("Image is Different ");
            nTotalErrors++;
        }
        else
        {
            printf("Image is Matching ");
        }

        printf(" <%s>\n", ref_file);
    }
    printf("\n");

    free(hResult);
    checkCudaErrors(cudaFree(dResult));

    return nTotalErrors;
}
cudaError_t cudaMemcpy3Dfix(const struct cudaMemcpy3DParms *param)
{
    const cudaMemcpy3DParms& p = *param;

    // Use cudaMemcpy3D for 3D only
    // But it does not handle 2D or 1D copies well
    if (1 < p.extent.depth)
    {
        return cudaMemcpy3D( &p );
    }
    else if (1 < p.extent.height)
    {
        // 2D copy
        // Arraycopy
        if (0 != p.srcArray && 0 == p.dstArray)
        {
            return cudaMemcpy2DFromArray(p.dstPtr.ptr, p.dstPtr.pitch, p.srcArray,
                                         p.srcPos.x, p.srcPos.y,
                                         p.extent.width, p.extent.height, p.kind);
        }
        else if (0 == p.srcArray && 0 != p.dstArray)
        {
            return cudaMemcpy2DToArray(p.dstArray, p.dstPos.x, p.dstPos.y,
                                       p.srcPtr.ptr, p.srcPtr.pitch,
                                       p.extent.width, p.extent.height, p.kind);
        }
        else if (0 != p.srcArray && 0 != p.dstArray)
        {
            return cudaMemcpy2DArrayToArray(p.dstArray, p.dstPos.x, p.dstPos.y,
                                            p.srcArray, p.srcPos.x, p.srcPos.y,
                                            p.extent.width, p.extent.height, p.kind);
        }
        else
        {
            return cudaMemcpy2D(p.dstPtr.ptr, p.dstPtr.pitch,
                                p.srcPtr.ptr, p.srcPtr.pitch,
                                p.extent.width, p.extent.height, p.kind);
        }
    }
    else
    {
        // 1D copy
        // p.extent.width should not include pitch
        EXCEPTION_ASSERT( p.extent.width == p.dstPtr.xsize );
        EXCEPTION_ASSERT( p.extent.width == p.srcPtr.xsize );

        // Arraycopy
        if (0 != p.srcArray && 0 == p.dstArray)
        {
            return cudaMemcpyFromArray(p.dstPtr.ptr, p.srcArray,
                                       p.srcPos.x, p.srcPos.y, p.extent.width, p.kind);
        }
        else if (0 == p.srcArray && 0 != p.dstArray)
        {
            return cudaMemcpyToArray(p.dstArray, p.dstPos.x, p.dstPos.y,
                                     p.srcPtr.ptr, p.extent.width, p.kind);
        }
        else if (0 != p.srcArray && 0 != p.dstArray)
        {
            return cudaMemcpyArrayToArray(p.dstArray, p.dstPos.x, p.dstPos.y,
                                          p.srcArray, p.srcPos.x, p.srcPos.y,
                                          p.extent.width, p.kind);
        }
        else
        {
            return cudaMemcpy( p.dstPtr.ptr, p.srcPtr.ptr, p.extent.width, p.kind );
        }
    }
}
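// A hedged usage sketch for cudaMemcpy3Dfix: a plain 2D host-to-device copy
// (depth == 1, height > 1), which routes to the cudaMemcpy2D branch above.
// The caller name and sizes are hypothetical; widths are given in bytes,
// matching how the helper forwards extent.width straight to cudaMemcpy2D.
#include <cuda_runtime.h>

cudaError_t copy_rows_to_device(float* d_dst, size_t d_pitch,
                                const float* h_src, size_t w, size_t h)
{
    cudaMemcpy3DParms p = { 0 };
    p.srcPtr = make_cudaPitchedPtr(const_cast<float*>(h_src),
                                   w * sizeof(float), w * sizeof(float), h);
    p.dstPtr = make_cudaPitchedPtr(d_dst, d_pitch, w * sizeof(float), h);
    p.extent = make_cudaExtent(w * sizeof(float), h, 1);
    p.kind   = cudaMemcpyHostToDevice;
    return cudaMemcpy3Dfix(&p);
}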
void FreeImageStack::appendImage(const npp::ImageNPP_32f_C1 & rImage)
{
    NPP_ASSERT(rImage.width()  == nWidth_);
    NPP_ASSERT(rImage.height() == nHeight_);

    // create the result image storage using FreeImage so we can easily save
    unsigned int nResultPitch = FreeImage_GetPitch(pBitmap_32f_);
    float * pResultData = reinterpret_cast<float *>(FreeImage_GetBits(pBitmap_32f_));

    NPP_CHECK_CUDA(cudaMemcpy2D(pResultData, nResultPitch,
                                rImage.data(), rImage.pitch(),
                                nWidth_ * 4, // row width in bytes (32-bit float, single channel)
                                nHeight_,
                                cudaMemcpyDeviceToHost));
    FreeImage_AppendPage(pImageStack_, pBitmap_32f_);

    appendAzimuthalAnalysis(pResultData, nResultPitch);
}
SEXP R_auto_cudaMemcpy2D(SEXP r_dst, SEXP r_dpitch, SEXP r_src, SEXP r_spitch,
                         SEXP r_width, SEXP r_height, SEXP r_kind)
{
    SEXP r_ans = R_NilValue;
    void * dst = GET_REF(r_dst, void );
    size_t dpitch = REAL(r_dpitch)[0];
    const void * src = GET_REF(r_src, const void );
    size_t spitch = REAL(r_spitch)[0];
    size_t width = REAL(r_width)[0];
    size_t height = REAL(r_height)[0];
    enum cudaMemcpyKind kind = (enum cudaMemcpyKind) INTEGER(r_kind)[0];

    cudaError_t ans;
    ans = cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind);

    r_ans = Renum_convert_cudaError_t(ans);

    return(r_ans);
}
int main(int argc, char *argv[])
{
    printf("%s Starting...\n\n", argv[0]);

    try
    {
        std::string sFilename;
        char *filePath = sdkFindFilePath("person.txt", argv[0]);

        if (filePath)
        {
            sFilename = filePath;
        }
        else
        {
            printf("Error %s was unable to find person.txt\n", argv[0]);
            exit(EXIT_FAILURE);
        }

        cudaDeviceInit(argc, (const char **)argv);
        printfNPPinfo(argc, argv);

        if (g_bQATest == false && (g_nDevice == -1) && argc > 1)
        {
            sFilename = argv[1];
        }

        // if we specify the filename at the command line, then we only test sFilename
        int file_errors = 0;
        std::ifstream infile(sFilename.data(), std::ifstream::in);

        if (infile.good())
        {
            std::cout << "imageSegmentationNPP opened: <" << sFilename.data() << "> successfully!" << std::endl;
            file_errors = 0;
            infile.close();
        }
        else
        {
            std::cout << "imageSegmentationNPP unable to open: <" << sFilename.data() << ">" << std::endl;
            file_errors++;
            infile.close();
        }

        if (file_errors > 0)
        {
            exit(EXIT_FAILURE);
        }

        std::string sResultFilename = sFilename;
        std::string::size_type dot = sResultFilename.rfind('.');

        if (dot != std::string::npos)
        {
            sResultFilename = sResultFilename.substr(0, dot);
        }

        sResultFilename += "_segmentation.pgm";

        if (argc >= 3 && !g_bQATest)
        {
            sResultFilename = argv[2];
        }

        // load MRF declaration
        int width, height, nLabels;
        int *hCue, *vCue, *dataCostArray;
        loadMiddleburyMRFData(sFilename, dataCostArray, hCue, vCue, width, height, nLabels);
        NPP_ASSERT(nLabels == 2);

        std::cout << "Dataset: " << sFilename << std::endl;
        std::cout << "Size: " << width << "x" << height << std::endl;

        NppiSize size;
        size.width = width;
        size.height = height;

        NppiRect roi;
        roi.x = 0; roi.y = 0; roi.width = width; roi.height = height;

        // Setup flow network
        int step, transposed_step;
        Npp32s *d_source, *d_sink, *d_terminals, *d_left_transposed, *d_right_transposed, *d_top, *d_bottom;

        // Setup terminal capacities
        d_source = nppiMalloc_32s_C1(width, height, &step);
        cudaMemcpy2D(d_source, step, dataCostArray, width * sizeof(int),
                     width * sizeof(int), height, cudaMemcpyHostToDevice);
        d_sink = nppiMalloc_32s_C1(width, height, &step);
        cudaMemcpy2D(d_sink, step, &dataCostArray[width*height], width * sizeof(int),
                     width * sizeof(int), height, cudaMemcpyHostToDevice);

        d_terminals = nppiMalloc_32s_C1(width, height, &step);
        nppiSub_32s_C1RSfs(d_sink, step, d_source, step, d_terminals, step, size, 0);

        // Setup edge capacities
        NppiSize edgeTranposedSize;
        edgeTranposedSize.width = height;
        edgeTranposedSize.height = width-1;

        NppiSize oneRowTranposedSize;
        oneRowTranposedSize.width = height;
        oneRowTranposedSize.height = 1;

        d_right_transposed = nppiMalloc_32s_C1(height, width, &transposed_step);
        cudaMemcpy2D(d_right_transposed, transposed_step, hCue, height * sizeof(int),
                     height * sizeof(int), width, cudaMemcpyHostToDevice);

        d_left_transposed = nppiMalloc_32s_C1(height, width, &transposed_step);
        nppiSet_32s_C1R(0, d_left_transposed, transposed_step, oneRowTranposedSize);
        nppiCopy_32s_C1R(d_right_transposed, transposed_step,
                         d_left_transposed + transposed_step/sizeof(int), transposed_step, edgeTranposedSize);

        NppiSize edgeSize;
        edgeSize.width = width;
        edgeSize.height = height-1;

        NppiSize oneRowSize;
        oneRowSize.width = width;
        oneRowSize.height = 1;

        d_bottom = nppiMalloc_32s_C1(width, height, &step);
        cudaMemcpy2D(d_bottom, step, vCue, width * sizeof(int),
                     width * sizeof(int), height, cudaMemcpyHostToDevice);

        d_top = nppiMalloc_32s_C1(width, height, &step);
        nppiSet_32s_C1R(0, d_top, step, oneRowSize);
        nppiCopy_32s_C1R(d_bottom, step, d_top + step/sizeof(int), step, edgeSize);

        // Allocate temp storage for graphcut computation
        Npp8u *pBuffer;
        int bufferSize;
        nppiGraphcutGetSize(size, &bufferSize);
        cudaMalloc(&pBuffer, bufferSize);

        NppiGraphcutState *pGraphcutState;
        nppiGraphcutInitAlloc(size, &pGraphcutState, pBuffer);

        // Allocate label storage
        npp::ImageNPP_8u_C1 oDeviceDst(width, height);

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        // Compute the graphcut, result is 0 / !=0
        cudaEventRecord(start, 0);

        nppiGraphcut_32s8u(d_terminals, d_left_transposed, d_right_transposed, d_top, d_bottom,
                           step, transposed_step, size, oDeviceDst.data(), oDeviceDst.pitch(), pGraphcutState);

        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);

        float time;
        cudaEventElapsedTime(&time, start, stop);
        std::cout << "Elapsed Time: " << time << " ms" << std::endl;

        // declare a host image object for an 8-bit grayscale image
        npp::ImageCPU_8u_C1 oHostAlpha(width, height);

        // convert graphcut result to 0/255 alpha image using new nppiCompareC_8u_C1R primitive (CUDA 5.0)
        npp::ImageNPP_8u_C1 oDeviceAlpha(width, height);
        nppiCompareC_8u_C1R(oDeviceDst.data(), oDeviceDst.pitch(), 0,
                            oDeviceAlpha.data(), oDeviceAlpha.pitch(), size, NPP_CMP_GREATER);

        // and copy the result to host
        oDeviceAlpha.copyTo(oHostAlpha.data(), oHostAlpha.pitch());

        int E_d, E_s;
        std::cout << "Graphcut Cost: "
                  << computeEnergy(E_d, E_s, oHostAlpha.data(), oHostAlpha.pitch(),
                                   hCue, vCue, dataCostArray, width, height) << std::endl;
        std::cout << "(E_d = " << E_d << ", E_s = " << E_s << ")" << std::endl;

        std::cout << "Saving segmentation result as " << sResultFilename << std::endl;
        saveImage(sResultFilename, oHostAlpha);

        nppiGraphcutFree(pGraphcutState);
        cudaFree(pBuffer);
        cudaFree(d_top);
        cudaFree(d_bottom);
        cudaFree(d_left_transposed);
        cudaFree(d_right_transposed);
        cudaFree(d_source);
        cudaFree(d_sink);
        cudaFree(d_terminals);

        exit(EXIT_SUCCESS);
    }
    catch (npp::Exception &rException)
    {
        std::cerr << "Program error! The following exception occurred: \n";
        std::cerr << rException << std::endl;
        std::cerr << "Aborting." << std::endl;
        exit(EXIT_FAILURE);
    }
    catch (...)
    {
        std::cerr << "Program error! An unknown type of exception occurred. \n";
        std::cerr << "Aborting." << std::endl;
        exit(EXIT_FAILURE);
    }

    return 0;
}
////////////////////////////////////////////////////////////////////////////////
// Program Main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
    int Nx, Ny, Nz, max_iters;
    int blockX, blockY, blockZ;

    if (argc == 8)
    {
        Nx = atoi(argv[1]);
        Ny = atoi(argv[2]);
        Nz = atoi(argv[3]);
        max_iters = atoi(argv[4]);
        blockX = atoi(argv[5]);
        blockY = atoi(argv[6]);
        blockZ = atoi(argv[7]);
    }
    else
    {
        printf("Usage: %s nx ny nz i block_x block_y block_z number_of_threads\n", argv[0]);
        exit(1);
    }

    // Get the number of GPUS
    int number_of_devices;
    checkCuda(cudaGetDeviceCount(&number_of_devices));

    if (number_of_devices < 2)
    {
        printf("Less than two devices were found.\n");
        printf("Exiting...\n");
        return -1;
    }

    // Decompose along the Z-axis
    int _Nz = Nz/number_of_devices;

    // Define constants
    const _DOUBLE_ L = 1.0;
    const _DOUBLE_ h = L/(Nx+1);
    const _DOUBLE_ dt = h*h/6.0;
    const _DOUBLE_ beta = dt/(h*h);
    const _DOUBLE_ c0 = beta;
    const _DOUBLE_ c1 = (1-6*beta);

    // Check if ECC is turned on
    ECCCheck(number_of_devices);

    // Set the number of OpenMP threads
    omp_set_num_threads(number_of_devices);

    #pragma omp parallel
    {
        unsigned int tid = omp_get_num_threads();

        #pragma omp single
        {
            printf("Number of OpenMP threads: %d\n", tid);
        }
    }

    // CPU memory operations
    int dt_size = sizeof(_DOUBLE_);

    _DOUBLE_ *u_new, *u_old;

    u_new = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    u_old = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

    init(u_old, u_new, h, Nx, Ny, Nz);

    // Allocate and generate arrays on the host
    size_t pitch_bytes;
    size_t pitch_gc_bytes;

    _DOUBLE_ *h_Unew, *h_Uold;
    _DOUBLE_ *h_s_Uolds[number_of_devices], *h_s_Unews[number_of_devices];
    _DOUBLE_ *left_send_buffer[number_of_devices], *left_receive_buffer[number_of_devices];
    _DOUBLE_ *right_send_buffer[number_of_devices], *right_receive_buffer[number_of_devices];

    h_Unew = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    h_Uold = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

    init(h_Uold, h_Unew, h, Nx, Ny, Nz);

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();

        h_s_Unews[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2));
        h_s_Uolds[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2));

        right_send_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));
        left_send_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));
        right_receive_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));
        left_receive_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH));

        checkCuda(cudaHostAlloc((void**)&h_s_Unews[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&h_s_Uolds[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));

        checkCuda(cudaHostAlloc((void**)&right_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&left_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&right_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&left_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

        init_subdomain(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
    }

    // GPU memory operations
    _DOUBLE_ *d_s_Unews[number_of_devices], *d_s_Uolds[number_of_devices];
    _DOUBLE_ *d_right_send_buffer[number_of_devices], *d_left_send_buffer[number_of_devices];
    _DOUBLE_ *d_right_receive_buffer[number_of_devices], *d_left_receive_buffer[number_of_devices];

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();

        checkCuda(cudaSetDevice(tid));
        CopyToConstantMemory(c0, c1);

        checkCuda(cudaMallocPitch((void**)&d_s_Uolds[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
        checkCuda(cudaMallocPitch((void**)&d_s_Unews[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
        checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
        checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
        checkCuda(cudaMallocPitch((void**)&d_left_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
        checkCuda(cudaMallocPitch((void**)&d_right_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
    }

    // Copy data from host to the device
    double HtD_timer = 0.;
    HtD_timer -= omp_get_wtime();

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();
        checkCuda(cudaSetDevice(tid));

        checkCuda(cudaMemcpy2D(d_s_Uolds[tid], pitch_bytes, h_s_Uolds[tid], dt_size*(Nx+2),
                               dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
        checkCuda(cudaMemcpy2D(d_s_Unews[tid], pitch_bytes, h_s_Unews[tid], dt_size*(Nx+2),
                               dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
    }

    HtD_timer += omp_get_wtime();

    int pitch = pitch_bytes/dt_size;
    int gc_pitch = pitch_gc_bytes/dt_size;

    // GPU kernel launch parameters
    dim3 threads_per_block(blockX, blockY, blockZ);
    unsigned int blocksInX = getBlock(Nx, blockX);
    unsigned int blocksInY = getBlock(Ny, blockY);
    unsigned int blocksInZ = getBlock(_Nz-2, k_loop);
    dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
    dim3 thread_blocks_halo(blocksInX, blocksInY);

    double compute_timer = 0.;
    compute_timer -= omp_get_wtime();

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();

        for (int iterations = 0; iterations < max_iters; iterations++)
        {
            // Compute inner nodes
            checkCuda(cudaSetDevice(tid));
            ComputeInnerPoints(thread_blocks, threads_per_block, d_s_Unews[tid], d_s_Uolds[tid], pitch, Nx, Ny, _Nz);

            // Copy right boundary data to host
            if (tid == 0)
            {
                checkCuda(cudaSetDevice(tid));
                CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid],
                                              d_right_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
                checkCuda(cudaMemcpy2D(right_send_buffer[tid], dt_size*(Nx+2), d_right_send_buffer[tid], pitch_gc_bytes,
                                       dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
            }

            // Copy left boundary data to host
            if (tid == 1)
            {
                checkCuda(cudaSetDevice(tid));
                CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid],
                                              d_left_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
                checkCuda(cudaMemcpy2D(left_send_buffer[tid], dt_size*(Nx+2), d_left_send_buffer[tid], pitch_gc_bytes,
                                       dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
            }

            #pragma omp barrier

            // Copy right boundary data to device 1
            if (tid == 1)
            {
                checkCuda(cudaSetDevice(tid));
                checkCuda(cudaMemcpy2D(d_left_receive_buffer[tid], pitch_gc_bytes, right_send_buffer[tid-1], dt_size*(Nx+2),
                                       dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
                CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid],
                                              d_left_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
            }

            // Copy left boundary data to device 0
            if (tid == 0)
            {
                checkCuda(cudaSetDevice(tid));
                checkCuda(cudaMemcpy2D(d_right_receive_buffer[tid], pitch_gc_bytes, left_send_buffer[tid+1], dt_size*(Nx+2),
                                       dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
                CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid],
                                              d_right_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
            }

            // Swap pointers on the host
            #pragma omp barrier
            checkCuda(cudaSetDevice(tid));
            checkCuda(cudaDeviceSynchronize());
            swap(_DOUBLE_*, d_s_Unews[tid], d_s_Uolds[tid]);
        }
    }

    compute_timer += omp_get_wtime();

    // Copy data from device to host
    double DtH_timer = 0;
    DtH_timer -= omp_get_wtime();

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();
        checkCuda(cudaSetDevice(tid));

        checkCuda(cudaMemcpy2D(h_s_Uolds[tid], dt_size*(Nx+2), d_s_Uolds[tid], pitch_bytes,
                               dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDeviceToHost));
    }

    DtH_timer += omp_get_wtime();

    // Merge sub-domains into a one big domain
    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();
        merge_domains(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
    }

    // Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
    cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
#endif

    float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
    PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

    _DOUBLE_ t = max_iters * dt;
    CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);

#if defined(DEBUG) || defined(_DEBUG)
    //exportToVTK(h_Uold, h, "heat3D.vtk", Nx, Ny, Nz);
#endif

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();
        checkCuda(cudaSetDevice(tid));

        checkCuda(cudaFree(d_s_Unews[tid]));
        checkCuda(cudaFree(d_s_Uolds[tid]));
        checkCuda(cudaFree(d_right_send_buffer[tid]));
        checkCuda(cudaFree(d_left_send_buffer[tid]));
        checkCuda(cudaFree(d_right_receive_buffer[tid]));
        checkCuda(cudaFree(d_left_receive_buffer[tid]));
        checkCuda(cudaFreeHost(h_s_Unews[tid]));
        checkCuda(cudaFreeHost(h_s_Uolds[tid]));
        checkCuda(cudaFreeHost(left_send_buffer[tid]));
        checkCuda(cudaFreeHost(right_send_buffer[tid]));
        checkCuda(cudaFreeHost(left_receive_buffer[tid]));
        checkCuda(cudaFreeHost(right_receive_buffer[tid]));

        checkCuda(cudaDeviceReset());
    }

    free(u_old);
    free(u_new);

    return 0;
}
void cuda_memcpy_strided(const long dims[2], long ostr, void* dst, long istr, const void* src)
{
    // dims[0]: contiguous row width in bytes, dims[1]: number of rows;
    // ostr/istr: destination/source pitches in bytes. cudaMemcpyDefault lets
    // the runtime infer the transfer direction from the pointers.
    CUDA_ERROR(cudaMemcpy2D(dst, ostr, src, istr, dims[0], dims[1], cudaMemcpyDefault));
}
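// A hypothetical caller for the wrapper above, copying the first 16 floats of
// each 32-float-wide device row into a packed host buffer. All arguments are
// in bytes, as cudaMemcpy2D expects; cudaMemcpyDefault relies on unified
// virtual addressing to infer the device-to-host direction.
#include <cuda_runtime.h>

void gather_left_half_rows(float* host_dst, const float* dev_src, long rows)
{
    long dims[2] = { (long)(16 * sizeof(float)), rows };  // row bytes, row count
    cuda_memcpy_strided(dims,
                        (long)(16 * sizeof(float)),  // ostr: destination pitch in bytes
                        host_dst,
                        (long)(32 * sizeof(float)),  // istr: source pitch in bytes
                        dev_src);
}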
void vm::scanner::cuda::DeviceMemory2D::upload(const void *host_ptr_arg, size_t host_step_arg, int rows_arg, int colsBytes_arg)
{
    create(rows_arg, colsBytes_arg);
    cudaSafeCall( cudaMemcpy2D(data_, step_, host_ptr_arg, host_step_arg,
                               colsBytes_, rows_, cudaMemcpyHostToDevice) );
    cudaSafeCall( cudaDeviceSynchronize() );
}
void vm::scanner::cuda::DeviceMemory2D::download(void *host_ptr_arg, size_t host_step_arg) const
{
    cudaSafeCall( cudaMemcpy2D(host_ptr_arg, host_step_arg, data_, step_,
                               colsBytes_, rows_, cudaMemcpyDeviceToHost) );
    cudaSafeCall( cudaDeviceSynchronize() );
}
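// A minimal round-trip sketch using the upload()/download() methods above
// (assumes vm::scanner::cuda::DeviceMemory2D is default-constructible; sizes
// are hypothetical).
#include <vector>

void roundtrip_example()
{
    const int rows = 4, cols = 16;
    const size_t row_bytes = cols * sizeof(float);

    std::vector<float> host_in(rows * cols, 2.0f), host_out(rows * cols, 0.0f);

    vm::scanner::cuda::DeviceMemory2D buf;
    buf.upload(host_in.data(), row_bytes, rows, (int)row_bytes);  // host -> device
    buf.download(host_out.data(), row_bytes);                     // device -> host
    // host_out now mirrors host_in
}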