// Ensures capacity for `nstreams` pairs of (pinned host, device) buffers of
// `size_` elements of T each; this is the host-memory dispatch overload.
// Reallocation happens only when the requested element count exceeds the
// current capacity `size` — smaller requests reuse the existing buffers.
// NOTE(review): when size_ <= size but nstreams > ns, the buffer vectors are
// NOT regrown — presumably callers never raise the stream count without also
// raising the element count; verify against call sites.
void allocate(std::size_t size_, std::size_t nstreams, nt2::host_ &)
{
  if (size_ > size)
  {
    // Release the previous allocation (if any) before growing.
    if (size != 0)
    {
      for (std::size_t i = 0; i < device.size(); ++i)
      {
        CUDA_ERROR(cudaFreeHost(host_pinned[i]));
        CUDA_ERROR(cudaFree(device[i]));
      }
    }

    ns = nstreams;
    size = size_;
    std::size_t sizeof_ = size * sizeof(T);

    host_pinned.resize(nstreams);
    device.resize(nstreams);

    // One pinned host buffer and one device buffer per stream, each large
    // enough for a full block of T elements.
    for (std::size_t i = 0; i < nstreams; ++i)
    {
      CUDA_ERROR(cudaMallocHost((void**)&host_pinned[i], sizeof_));
      CUDA_ERROR(cudaMalloc((void**)&device[i], sizeof_));
    }
  }
}
/* Selects and initializes the CUDA device with the most free memory.
 *
 * With one (or zero) devices this is a no-op and device 0 is reported.
 * With multiple devices, each is initialized in turn and queried for free
 * memory; the one with the largest amount is made current.
 *
 * Returns the index of the chosen device.
 */
int cuda_init_memopt(void)
{
	int num_devices = cuda_devices();
	int max_device = 0;

	if (num_devices > 1) {

		size_t mem_max = 0;
		size_t mem_free;
		size_t mem_total;

		for (int device = 0; device < num_devices; device++) {

			cuda_init(device);
			CUDA_ERROR(cudaMemGetInfo(&mem_free, &mem_total));

			if (mem_max < mem_free) {

				mem_max = mem_free;
				max_device = device;
			}
		}

		// Fix for the old FIXME: go through cuda_init() instead of a raw
		// cudaSetDevice() so that last_init tracks the selected device.
		cuda_init(max_device);
	}

	return max_device;
}
/* Enables one-directional peer access from device a to the memory of
 * device b, restoring the previously current device afterwards. */
void cuda_p2p(int a, int b)
{
	int prev;
	CUDA_ERROR(cudaGetDevice(&prev));

	CUDA_ERROR(cudaSetDevice(a));
	CUDA_ERROR(cudaDeviceEnablePeerAccess(b, 0));

	CUDA_ERROR(cudaSetDevice(prev));
}
/* malloc-style wrapper around cudaMalloc: returns the device pointer
 * directly; failures are handled inside CUDA_ERROR. */
static void* cuda_malloc_wrapper(size_t size)
{
	void* mem = NULL;
	CUDA_ERROR(cudaMalloc(&mem, size));
	return mem;
}
/**
   Executes the CUFFT plan.
   This method executes the CUFFT complex-to-real transform plan. CUFFT
   uses as input data the GPU memory specified by the idata parameter.
   The transform result is stored in the odata array. If idata and odata
   refer to the same memory location, this method does an in-place
   transform.
   @param idata input data (complex Fourier coefficients, device memory)
   @param odata output data (real values, device memory)
*/
inline void exec(const DeviceMemory<complex, 1> &idata,
                 DeviceMemory<real, 1> &odata)
{
  // CUFFT operates on a flat buffer, so multi-dimensional data must not
  // contain per-row padding.
  if((Dim > 1) && !(idata.contiguous() && odata.contiguous()))
    CUDA_ERROR("CUFFT can only be used for contiguous memory (i.e., no padding between rows)");

  // const_cast: cufftExecC2R takes a non-const input pointer.
  CUFFT_CHECK(cufftExecC2R(plan,
                           const_cast<complex *>(idata.getBuffer()),
                           odata.getBuffer()));
}
// Allocates linear (unpitched) device memory for the current size,
// releasing any previous allocation first. A zero-element request succeeds
// without allocating, since that is a normal operation within STL-style
// containers.
void DeviceMemoryLinear<Type, Dim>::
alloc()
{
  this->free();

  // Total element count over all dimensions.
  size_t count = 1;
  for(size_t d = 0; d < Dim; ++d)
    count *= this->size[d];

  if(count == 0) {
    // allocating empty data is not considered an error
    this->setPitch(0);
    return;
  }

  CUDA_CHECK(cudaMalloc((void **)&this->buffer, count * sizeof(Type)));
  this->setPitch(0);

  if(this->buffer == 0)
    CUDA_ERROR("cudaMalloc failed");

#ifdef CUDA_DEBUG_INIT_MEMORY
  CUDA_CHECK(cudaMemset(this->buffer, 0, this->getBytes()));
#endif
}
/**
   Returns a single slice from a higher dimensional dataset.
   Keeps region of interest and other information.
   @param slice slice to which reference will be created
   @return non-owning reference to the (Dim-1)-dimensional slice
*/
DeviceMemoryReference<Type, Dim-1> getSlice(unsigned int slice)
{
  CUDA_STATIC_ASSERT(Dim >= 2);

  if (slice >= this->size[Dim-1])
    CUDA_ERROR("out of bounds");

  // Drop the outermost dimension: the slice keeps sizes 0..Dim-2.
  Cuda::Size<Dim-1> slice_size;
  for(int i = Dim-1; i--;)
    slice_size[i] = this->size[i];

  // Element offset of the requested slice within the buffer.
  int offset = this->stride[Dim-2]*slice;

  DeviceMemoryReference<Type, Dim-1> slice_ref(slice_size, this->buffer + offset);

  // Propagate region of interest, strides and spacing for the kept axes.
  for(int i = Dim-1; i--;)
  {
    slice_ref.region_ofs[i] = this->region_ofs[i];
    slice_ref.region_size[i] = this->region_size[i];
    slice_ref.stride[i] = this->stride[i];
    slice_ref.spacing[i] = this->spacing[i];
  }

  return slice_ref;
}
// Releases every device and pinned-host buffer and resets the pool to an
// empty state.
~cu_buffers()
{
  for (auto & dptr : device)
    CUDA_ERROR(cudaFree(dptr));

  for (auto & hptr : host_pinned)
    CUDA_ERROR(cudaFreeHost(hptr));

  size = 0;
  device.clear();
  host_pinned.clear();
}
/**
   Object-specific part of the map action: retrieves the CUDA array that
   backs sub-resource 0 (mip level 0) of the mapped graphics resource.
*/
void mapInternal()
{
  CUDA_CHECK(cudaGraphicsSubResourceGetMappedArray(&this->array, resource, 0, 0));

  if(0 == this->array)
    CUDA_ERROR("map image object failed");
}
// (Re)creates the host-side heap buffer for the current size.
// NOTE(review): the previous buffer is not freed here — presumably the
// caller (or a preceding free()) owns that; confirm to rule out a leak.
void HostMemoryHeap<Type, Dim>::
realloc()
{
  // Heap memory has no row padding.
  this->setPitch(0);
  this->buffer = (Type *)malloc(this->getSize() * sizeof(Type));

  if(this->buffer == 0)
    CUDA_ERROR("out of memory");

#ifdef CUDA_DEBUG_INIT_MEMORY
  // Debug aid: start from deterministic, zeroed memory.
  memset(this->buffer, 0, this->getBytes());
#endif
}
// Copies a.size() elements from container a into container b, in the
// direction selected by the HDI/HDO (host/device) dispatch-tag pair,
// asynchronously on the given stream (default stream when none supplied).
// NOTE(review): b must already hold at least a.size() elements — no bounds
// check is performed here (see TODO below); verify against callers.
inline void copy( In const& a, Out& b
                , HDI const& , HDO const&
                , cudaStream_t stream = 0)
{
  using T = typename Out::value_type;

  //TODO
  CUDA_ERROR(cudaMemcpyAsync( (T*)b.data()
                            , a.data()
                            , a.size() * sizeof(T)
                            , copy_<HDI,HDO>::mode()  // direction from the tag pair
                            , stream
                            ));
}
/* Fills table[i][j] with whether device i can access the memory of
 * device j via peer-to-peer. n must equal the number of CUDA devices. */
void cuda_p2p_table(int n, bool table[n][n])
{
	assert(n == cuda_devices());

	for (int src = 0; src < n; src++) {

		for (int dst = 0; dst < n; dst++) {

			int can_access = 0;
			CUDA_ERROR(cudaDeviceCanAccessPeer(&can_access, src, dst));

			table[src][dst] = (1 == can_access);
		}
	}
}
// Transfers one block of results from the device buffer of stream
// `streamid` back into `out` (pinned-memory dispatch overload), at most
// once per block: block_stream_dth[blockid] records completed transfers.
// `leftover`, when non-zero, is the element count of a trailing partial
// block; otherwise a full `blocksize` block is copied.
inline void transfer_dth( Out & out , int blockid, Stream & stream
                        , std::size_t streamid , std::size_t leftover
                        , nt2::pinned_ &)
{
  // A trailing partial block copies only the leftover elements.
  std::size_t sizeb = blocksize;
  if (leftover != 0) sizeb = leftover;

  if (block_stream_dth[blockid] == false)
  {
    CUDA_ERROR(cudaMemcpyAsync( out.data()
                              , buffers.get_device(streamid)
                              , sizeb * sizeof(T)
                              , cudaMemcpyDeviceToHost
                              , stream
                              ));
    block_stream_dth[blockid] = true;

    // Fix: the synchronization result was previously ignored — check it
    // like every other CUDA call in this file.
    CUDA_ERROR(cudaStreamSynchronize(stream));
  }
}
// Transfers one block of input from pinned host memory `in` into the
// device buffer of stream `streamid` (pinned-memory dispatch overload),
// at most once per block: block_stream_htd[blockid] records queued
// transfers. `leftover`, when non-zero, is the element count of a trailing
// partial block; otherwise a full `blocksize` block is copied.
inline void transfer_htd( In & in, int blockid, Stream & stream
                        , std::size_t streamid , std::size_t leftover
                        , nt2::pinned_ & )
{
  // A trailing partial block copies only the leftover elements.
  std::size_t sizeb = blocksize;
  if (leftover != 0) sizeb = leftover;

  if (block_stream_htd[blockid] == false)
  {
    // Mark first so the block is never queued twice.
    block_stream_htd[blockid] = true;

    CUDA_ERROR(cudaMemcpyAsync( buffers.get_device(streamid)
                              , in.data()
                              , sizeb * sizeof(T)
                              , cudaMemcpyHostToDevice
                              , stream
                              ));

    // Fix: the synchronization result was previously ignored — check it
    // like every other CUDA call in this file.
    CUDA_ERROR(cudaStreamSynchronize(stream));
  }
}
/* Allocates device memory of at least `size` bytes. When the memory cache
 * is enabled, a previously freed cached block of sufficient size is reused
 * instead of calling cudaMalloc. Fresh allocations are registered in the
 * cache bookkeeping via insert(). Returns the device pointer. */
void* cuda_malloc(long size)
{
	if (cuda_memcache) {

		// Try to reuse a previously freed allocation of sufficient size.
		struct cuda_mem_s* nptr = find_free(size);

		if (NULL != nptr) {

			assert(nptr->device);
			assert(!nptr->free);

			// Record the thread that now owns this reused block.
			nptr->thread_id = omp_get_thread_num();

			return (void*)(nptr->ptr);
		}
	}

	void* ptr;
	CUDA_ERROR(cudaMalloc(&ptr, size));

	// Track the fresh allocation so it can later be cached/freed.
	insert(ptr, size, true);

	return ptr;
}
/* Shuts down CUDA for this process: flushes the memory cache (returning
 * cached device allocations) and then resets the current device. */
void cuda_exit(void)
{
	cuda_memcache_clear();

	CUDA_ERROR(cudaDeviceReset());
}
/* Makes the given device current and remembers it in last_init as the
 * most recently initialized device. */
void cuda_init(int device)
{
	last_init = device;

	CUDA_ERROR(cudaSetDevice(device));
}
/* Returns the number of CUDA-capable devices visible to this process. */
int cuda_devices(void)
{
	int n = 0;
	CUDA_ERROR(cudaGetDeviceCount(&n));
	return n;
}
/* Shuts down CUDA for this process: flushes the memory cache (returning
 * cached device allocations) and then tears down the CUDA context. */
void cuda_exit(void)
{
	cuda_memcache_clear();

	// Fix: cudaThreadExit() is deprecated since CUDA 4.0;
	// cudaDeviceReset() is its documented replacement (and matches the
	// other variant of this function in the code base).
	CUDA_ERROR(cudaDeviceReset());
}
/* Strided 2D copy between host and/or device memory.
 * NOTE(review): per the cudaMemcpy2D argument order, dims[0] is the row
 * width in bytes and dims[1] the number of rows, with ostr/istr the
 * destination/source pitches in bytes — confirm against call sites. */
void cuda_memcpy_strided(const long dims[2], long ostr, void* dst, long istr, const void* src)
{
	// cudaMemcpyDefault infers the copy direction from the pointer
	// values (requires unified virtual addressing).
	CUDA_ERROR(cudaMemcpy2D(dst, ostr, src, istr, dims[0], dims[1], cudaMemcpyDefault));
}
/* Copies `size` bytes between any combination of host and device memory;
 * the direction is inferred by the runtime (cudaMemcpyDefault). */
void cuda_memcpy(long size, void* dst, const void* src)
{
//	printf("COPY %x %x %ld\n", dst, src, size);

	CUDA_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
}
/* Zero-fills `size` bytes of device memory at dst. */
void cuda_clear(long size, void* dst)
{
//	printf("CLEAR %x %ld\n", dst, size);

	CUDA_ERROR(cudaMemset(dst, 0, size));
}
/* Allocates device-side storage for queries, candidates and alignments and
 * uploads the host-side data; the result-entry buffer is zero-initialized
 * on the device instead of copied. Returns SUCCESS (CUDA failures are
 * handled inside CUDA_ERROR). */
psaError_t transferCPUtoGPUStream(sequences_t *queries, sequences_t *candidates, alignments_t *alignments)
{
	//allocate & transfer Queries to GPU
	CUDA_ERROR(cudaMalloc((void**)&queries->d_ASCII, queries->numASCIIEntries * sizeof(ASCIIEntry_t)));
	CUDA_ERROR(cudaMemcpy(queries->d_ASCII, queries->h_ASCII, queries->numASCIIEntries * sizeof(ASCIIEntry_t), cudaMemcpyHostToDevice));

	CUDA_ERROR(cudaMalloc((void**)&queries->d_ASCIIposition, queries->num * sizeof(uint32_t)));
	CUDA_ERROR(cudaMemcpy(queries->d_ASCIIposition, queries->h_ASCIIposition, queries->num * sizeof(uint32_t), cudaMemcpyHostToDevice));

	//allocate & transfer Candidates to GPU (the original comment said
	//"FMIndex", but this block uploads the candidate sequences)
	CUDA_ERROR(cudaMalloc((void**)&candidates->d_ASCII, candidates->numASCIIEntries * sizeof(ASCIIEntry_t)));
	CUDA_ERROR(cudaMemcpy(candidates->d_ASCII, candidates->h_ASCII, candidates->numASCIIEntries * sizeof(ASCIIEntry_t), cudaMemcpyHostToDevice));

	CUDA_ERROR(cudaMalloc((void**)&candidates->d_ASCIIposition, candidates->num * sizeof(uint32_t)));
	CUDA_ERROR(cudaMemcpy(candidates->d_ASCIIposition, candidates->h_ASCIIposition, candidates->num * sizeof(uint32_t), cudaMemcpyHostToDevice));

	//allocate & initialize Results: per-alignment info uploaded from the
	//host, result entries zeroed on the device
	CUDA_ERROR(cudaMalloc((void**)&alignments->d_info, alignments->num * sizeof(alignmentInfo_t)));
	CUDA_ERROR(cudaMemcpy(alignments->d_info, alignments->h_info, alignments->num * sizeof(alignmentInfo_t), cudaMemcpyHostToDevice));

	CUDA_ERROR(cudaMalloc((void**)&alignments->d_results, alignments->num * sizeof(alignmentEntry_t)));
	CUDA_ERROR(cudaMemset(alignments->d_results, 0, alignments->num * sizeof(alignmentEntry_t)));

	return (SUCCESS);
}
/* free-style wrapper around cudaFree: accepts a const pointer for caller
 * convenience and casts constness away for the CUDA API. */
static void cuda_free_wrapper(const void* ptr)
{
	void* p = (void*)ptr;

	CUDA_ERROR(cudaFree(p));
}
/* Copies the alignment result entries back from device to host memory.
 * Returns SUCCESS (CUDA failures are handled inside CUDA_ERROR). */
psaError_t transferGPUtoCPU(alignments_t *alignments)
{
	const size_t bytes = alignments->num * sizeof(alignmentEntry_t);

	CUDA_ERROR(cudaMemcpy(alignments->h_results, alignments->d_results,
			      bytes, cudaMemcpyDeviceToHost));

	return (SUCCESS);
}