//! @brief Copy the ND-array device array to host array. //! You must allocate the array with the appropriate size. __host__ inline void copy_to_host(T *host_data) const { if (N == 2) { SHAKTI_SAFE_CUDA_CALL(cudaMemcpy2D( host_data, _sizes[0] * sizeof(T), _data, _pitch, _sizes[0] * sizeof(T), _sizes[1], cudaMemcpyDeviceToHost)); } else if (N == 3) { cudaMemcpy3DParms params = { 0 }; params.srcPtr = make_cudaPitchedPtr(reinterpret_cast<void *>(_data), _pitch, _sizes[0], _sizes[1]); params.dstPtr = make_cudaPitchedPtr(host_data, _sizes[0] * sizeof(T), _sizes[0], _sizes[1]); params.extent = make_cudaExtent(_sizes[0]*sizeof(T), _sizes[1], _sizes[2]); params.kind = cudaMemcpyDeviceToHost; SHAKTI_SAFE_CUDA_CALL(cudaMemcpy3D(¶ms)); } else SHAKTI_SAFE_CUDA_CALL(cudaMemcpy(host_data, _data, sizeof(T) * size(), cudaMemcpyDeviceToHost)); }
void operator()(Type* dest, const math::Size_t<2u> pitchDest, Type* source, const math::Size_t<2u> pitchSource, const math::Size_t<3>& size, flags::Memcopy::Direction direction) { const cudaMemcpyKind kind[] = {cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyHostToHost, cudaMemcpyDeviceToDevice}; cudaPitchedPtr pitchedPtrDest; pitchedPtrDest.pitch = pitchDest.x(); pitchedPtrDest.ptr = dest; pitchedPtrDest.xsize = size.x(); pitchedPtrDest.ysize = size.y(); cudaPitchedPtr pitchedPtrSource; pitchedPtrSource.pitch = pitchSource.x(); pitchedPtrSource.ptr = source; pitchedPtrSource.xsize = size.x(); pitchedPtrSource.ysize = size.y(); cudaMemcpy3DParms params; params.srcArray = NULL; params.srcPos = make_cudaPos(0,0,0); params.srcPtr = pitchedPtrSource; params.dstArray = NULL; params.dstPos = make_cudaPos(0,0,0); params.dstPtr = pitchedPtrDest; params.extent = make_cudaExtent(size.x() * sizeof(Type), size.y(), size.z()); params.kind = kind[direction]; CUDA_CHECK_NO_EXCEP(cudaMemcpy3D(¶ms)); }
void cpyD2H() { // D2H“]‘— cudaMemcpy3DParms parms = { 0 }; parms.srcPos = make_cudaPos(0, 0, 0); parms.srcPtr = make_cudaPitchedPtr(vdm.dPtr, sizeof(TYPE) * buf.pitch.x, buf.pitch.x, buf.pitch.y); parms.dstPos = make_cudaPos(sizeof(TYPE) * buf.lower.x, buf.lower.y, buf.lower.z); parms.dstPtr = make_cudaPitchedPtr(vdm.hPtr, sizeof(TYPE) * vdm.pitch.x, vdm.pitch.x, vdm.pitch.y); parms.extent = make_cudaExtent(sizeof(TYPE) * buf.pitch.x, buf.pitch.y, buf.pitch.z); parms.kind = cudaMemcpyDeviceToHost; cudaMemcpy3D(&parms); }
SEXP R_auto_cudaMemcpy3D(SEXP r_p) { SEXP r_ans = R_NilValue; const struct cudaMemcpy3DParms * p = GET_REF(r_p, const struct cudaMemcpy3DParms ); cudaError_t ans; ans = cudaMemcpy3D(p); r_ans = Renum_convert_cudaError_t(ans) ; return(r_ans); }
cudaError_t cudaMemcpy3Dfix(const struct cudaMemcpy3DParms *param) { const cudaMemcpy3DParms& p = *param; // Use cudaMemcpy3D for 3D only // But it does not handle 2D or 1D copies well if (1<p.extent.depth) { return cudaMemcpy3D( &p ); } else if (1<p.extent.height) { // 2D copy // Arraycopy if (0 != p.srcArray && 0 == p.dstArray) { return cudaMemcpy2DFromArray(p.dstPtr.ptr, p.dstPtr.pitch, p.srcArray, p.srcPos.x, p.srcPos.y, p.extent.width, p.extent.height, p.kind); } else if(0 == p.srcArray && 0 != p.dstArray) { return cudaMemcpy2DToArray(p.dstArray, p.dstPos.x, p.dstPos.y, p.srcPtr.ptr, p.srcPtr.pitch, p.extent.width, p.extent.height, p.kind); } else if(0 != p.srcArray && 0 != p.dstArray) { return cudaMemcpy2DArrayToArray( p.dstArray, p.dstPos.x, p.dstPos.y, p.srcArray, p.srcPos.x, p.srcPos.y, p.extent.width, p.extent.height, p.kind); } else { return cudaMemcpy2D( p.dstPtr.ptr, p.dstPtr.pitch, p.srcPtr.ptr, p.srcPtr.pitch, p.extent.width, p.extent.height, p.kind ); } } else { // 1D copy // p.extent.width should not include pitch EXCEPTION_ASSERT( p.extent.width == p.dstPtr.xsize ); EXCEPTION_ASSERT( p.extent.width == p.srcPtr.xsize ); // Arraycopy if (0 != p.srcArray && 0 == p.dstArray) { return cudaMemcpyFromArray(p.dstPtr.ptr, p.srcArray, p.srcPos.x, p.srcPos.y, p.extent.width, p.kind); } else if(0 == p.srcArray && 0 != p.dstArray) { return cudaMemcpyToArray(p.dstArray, p.dstPos.x, p.dstPos.y, p.srcPtr.ptr, p.extent.width, p.kind); } else if(0 != p.srcArray && 0 != p.dstArray) { return cudaMemcpyArrayToArray(p.dstArray, p.dstPos.x, p.dstPos.y, p.srcArray, p.srcPos.x, p.srcPos.y, p.extent.width, p.kind); } else { return cudaMemcpy( p.dstPtr.ptr, p.srcPtr.ptr, p.extent.width, p.kind ); } } }
void CudaImagePyramidHost::copyFromHost(int width, int height, const void* source, int layer) { assert(isInitialized()); assert(_textureType == cudaTextureType2DLayered); cudaMemcpy3DParms myParms = {0}; myParms.srcPtr = make_cudaPitchedPtr((void*)source,width*_typeSize,width,height); myParms.srcPos = make_cudaPos(0,0,0); myParms.dstArray = _storage; myParms.dstPos = make_cudaPos(0,0,layer); myParms.extent = make_cudaExtent(width,height,1); myParms.kind = cudaMemcpyHostToDevice; cudaMemcpy3D(&myParms); checkCUDAError("Memcpy error", _name); }
void Buffer3D::setData( void const* srcData, dp::math::Vec3ui const& srcOffset, dp::math::Vec3ui const& srcStride, dp::math::Vec3ui const& srcExtent, dp::math::Vec3ui const& dstOffset ) { #if !defined(NDEBUG) for ( int i=0 ; i<3 ; i++ ) { DP_ASSERT( srcOffset[i] + ( srcExtent[i] ? srcExtent[i] : m_extent[i] ) <= m_extent[i] ); DP_ASSERT( dstOffset[i] + ( srcExtent[i] ? srcExtent[i] : m_extent[i] ) <= m_extent[i] ); } #endif cudaMemcpy3DParms parms = { 0 }; parms.srcPos = make_cudaPos( m_elementSize * srcOffset[0], srcOffset[1], srcOffset[2] ); parms.srcPtr = make_cudaPitchedPtr( const_cast<void *>(srcData), m_elementSize * ( srcStride[0] ? srcStride[0] : m_extent[0] ), srcStride[0] ? srcStride[0] : m_extent[0], srcStride[1] ? srcStride[1] : m_extent[1] ); parms.dstPos = make_cudaPos( m_elementSize * dstOffset[0], dstOffset[1], dstOffset[2] ); parms.dstPtr = m_pitchedPtr; parms.extent = make_cudaExtent( m_elementSize * ( srcExtent[0] ? srcExtent[0] : m_extent[0] ), srcExtent[1] ? srcExtent[1] : m_extent[1], srcExtent[2] ? srcExtent[2] : m_extent[2] ); parms.kind = cudaMemcpyHostToDevice; CUDA_VERIFY( cudaMemcpy3D( &parms ) ); }
CTfactory( const VolumeGPU<T>& src, U& texRef, const cudaTextureFilterMode fm = cudaFilterModePoint, const cudaTextureAddressMode am = cudaAddressModeClamp, const int norm = false ) : dca_data(NULL) { // Check for valid input if( src.d_data.ptr == NULL ) { std::cerr << __FUNCTION__ << ": Source has no data" << std::endl; abort(); } // Allocate memory cudaChannelFormatDesc cd = cudaCreateChannelDesc<T>(); cudaExtent tmpExtent = ExtentFromDims( src.dims ); CUDA_SAFE_CALL( cudaMalloc3DArray( &(this->dca_data), &cd, tmpExtent ) ); // Do the copy cudaMemcpy3DParms cp = {0}; cp.srcPtr = src.d_data; cp.dstArray = this->dca_data; cp.extent = tmpExtent; cp.kind = cudaMemcpyDeviceToDevice; CUDA_SAFE_CALL( cudaMemcpy3D( &cp ) ); // Bind the texture texRef.normalized = norm; texRef.addressMode[0] = am; texRef.addressMode[1] = am; texRef.addressMode[2] = am; texRef.filterMode = fm; CUDA_SAFE_CALL( cudaBindTextureToArray( texRef, this->dca_data ) ); }
void VolSkin::init( int width, int height, TetMesh *tm ) { this->width = width; this->height = height; tetMesh = tm; // TEMP initialize volume data cudaExtent volumeSize = make_cudaExtent(128, 128, 128); //cudaExtent volumeSize = make_cudaExtent(256, 256, 256); // generate raw volume data float *h_densityData = (float*)malloc( sizeof(float)*volumeSize.width*volumeSize.height*volumeSize.depth ); math::PerlinNoise pn; pn.setDepth( 4 ); pn.setFrequency(3.0f); //pn.setInflection(true); for( int k=0;k<volumeSize.depth;++k ) for( int j=0;j<volumeSize.height;++j ) for( int i=0;i<volumeSize.width;++i ) { int index = k*volumeSize.width*volumeSize.height + j*volumeSize.width + i; math::Vec3f uvw( (float)(i)/(float)(volumeSize.width), (float)(j)/(float)(volumeSize.height), (float)(k)/(float)(volumeSize.depth)); float t = (float)(j)/(float)(volumeSize.height); //h_densityData[index] = 0.5f; //h_densityData[index] = (1.0f-t)*1.0f; h_densityData[index] = std::max( 0.0f, pn.perlinNoise_3D( uvw.x, uvw.y*2.0, uvw.z ) )*1.0f; // cylinder //h_densityData[index] = std::max( 0.0f, pn.perlinNoise_3D( uvw.x*2.0f, uvw.y*2.0f, uvw.z*2.0f ))*1.0f; // tetraeder //h_densityData[index] = (uvw.getLength() < 0.2f ? 1.0f : 0.0f)*2.0f; } // create 3D array d_densityArray = 0; cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); cudaMalloc3DArray(&d_densityArray, &channelDesc, volumeSize); // copy data to 3D array cudaMemcpy3DParms copyParams = {0}; copyParams.srcPtr = make_cudaPitchedPtr((void*)h_densityData, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height); copyParams.dstArray = d_densityArray; copyParams.extent = volumeSize; copyParams.kind = cudaMemcpyHostToDevice; cudaMemcpy3D(©Params); // TMP /* h_debugVec.resize( 1000.0f ); d_debugVec = h_debugVec; h_debugInfo.samples = convertToKernel(d_debugVec); h_debugInfo.numSamples = 0; cudaMemcpyToSymbol( d_debugInfo, &h_debugInfo, sizeof(DebugInfo), 0, cudaMemcpyHostToDevice ); */ // setup lighting m_light0.cam = base::CameraPtr( new base::Camera() ); m_light0.cam->m_aspectRatio = 1.0; //m_light0.cam->m_transform = math::createLookAtMatrix( math::Vec3f( -2.0f, -2.0f, 2.0f ), math::Vec3f( 0.0f, 0.0f, 0.0f ), math::Vec3f( 0.0f, 1.0f, 0.0f ), false ); //m_light0.cam->m_transform = math::Matrix44f::TranslationMatrix( 0.3f, 0.15f, 2.0f ); //m_light0.cam->m_transform = math::Matrix44f::TranslationMatrix( -3.0f, 0.0f, 0.0f ); m_light0.cam->m_transform = math::createLookAtMatrix( math::Vec3f( 4.0f, 0.0f, 0.0f ), math::Vec3f( 0.0f, 0.0f, 0.0f ), math::Vec3f( 0.0f, 1.0f, 0.0f ), false ); m_light0.cam->update(); cudaMalloc( &m_light0.d_dctCoefficients, width*height*sizeof(float)*8 );// 8 floats /6 coefficients // set defaults setTotalCrossSection( 10.0f ); setAlbedo( 1.0f ); setAbsorptionColor( math::Vec3f(0.5f,0.5f, 0.5f) ); setScatteringColor(math::Vec3f(0.5f, 0.5f, 0.5f)); setLight(0, math::Vec3f(1.0f, 1.0f, 1.0f), 0.0f); setTime( 0.0f ); setStepSize( 0.01f ); // get tetmesh onto gpu gpuUploadTetMesh(); }
//////////////////////////////////////////////////////////////////////////////// //! Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// void RunKernels() { static float t = 0.0f; // populate the 2d texture { cudaArray *cuArray; cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_2d.cudaResource, 0, 0); getLastCudaError("cudaGraphicsSubResourceGetMappedArray (cuda_texture_2d) failed"); // kick off the kernel and send the staging buffer cudaLinearMemory as an argument to allow the kernel to write to it cuda_texture_2d(g_texture_2d.cudaLinearMemory, g_texture_2d.width, g_texture_2d.height, g_texture_2d.pitch, t); getLastCudaError("cuda_texture_2d failed"); // then we want to copy cudaLinearMemory to the D3D texture, via its mapped form : cudaArray cudaMemcpy2DToArray( cuArray, // dst array 0, 0, // offset g_texture_2d.cudaLinearMemory, g_texture_2d.pitch, // src g_texture_2d.width*4*sizeof(float), g_texture_2d.height, // extent cudaMemcpyDeviceToDevice); // kind getLastCudaError("cudaMemcpy2DToArray failed"); } // populate the volume texture { size_t pitchSlice = g_texture_vol.pitch * g_texture_vol.height; cudaArray *cuArray; cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_vol.cudaResource, 0, 0); getLastCudaError("cudaGraphicsSubResourceGetMappedArray (cuda_texture_3d) failed"); // kick off the kernel and send the staging buffer cudaLinearMemory as an argument to allow the kernel to write to it cuda_texture_volume(g_texture_vol.cudaLinearMemory, g_texture_vol.width, g_texture_vol.height, g_texture_vol.depth, g_texture_vol.pitch, pitchSlice, t); getLastCudaError("cuda_texture_3d failed"); // then we want to copy cudaLinearMemory to the D3D texture, via its mapped form : cudaArray struct cudaMemcpy3DParms memcpyParams = {0}; memcpyParams.dstArray = cuArray; memcpyParams.srcPtr.ptr = g_texture_vol.cudaLinearMemory; memcpyParams.srcPtr.pitch = g_texture_vol.pitch; memcpyParams.srcPtr.xsize = g_texture_vol.width; memcpyParams.srcPtr.ysize = g_texture_vol.height; memcpyParams.extent.width = g_texture_vol.width; memcpyParams.extent.height = g_texture_vol.height; memcpyParams.extent.depth = g_texture_vol.depth; memcpyParams.kind = cudaMemcpyDeviceToDevice; cudaMemcpy3D(&memcpyParams); getLastCudaError("cudaMemcpy3D failed"); } // populate the faces of the cube map for (int face = 0; face < 6; ++face) { cudaArray *cuArray; cudaGraphicsSubResourceGetMappedArray(&cuArray, g_texture_cube.cudaResource, face, 0); getLastCudaError("cudaGraphicsSubResourceGetMappedArray (cuda_texture_cube) failed"); // kick off the kernel and send the staging buffer cudaLinearMemory as an argument to allow the kernel to write to it cuda_texture_cube(g_texture_cube.cudaLinearMemory, g_texture_cube.size, g_texture_cube.size, g_texture_cube.pitch, face, t); getLastCudaError("cuda_texture_cube failed"); // then we want to copy cudaLinearMemory to the D3D texture, via its mapped form : cudaArray cudaMemcpy2DToArray( cuArray, // dst array 0, 0, // offset g_texture_cube.cudaLinearMemory, g_texture_cube.pitch, // src g_texture_cube.size*4, g_texture_cube.size, // extent cudaMemcpyDeviceToDevice); // kind getLastCudaError("cudaMemcpy2DToArray failed"); } t += 0.1f; }
cudaError_t WINAPI wine_cudaMemcpy3D( const struct cudaMemcpy3DParms *p ) { WINE_TRACE("\n"); return cudaMemcpy3D( p ); }