示例#1
0
      // Grow the per-stream pinned-host and device buffer pools so each
      // holds size_ elements of T. Reallocation happens only when the
      // requested size exceeds the current one; previous allocations are
      // released first.
      void allocate(std::size_t size_,std::size_t nstreams , nt2::host_ &)
      {
        if(size_ <= size) return;

        // release previously allocated buffers, if any
        if(size != 0)
        {
          for(std::size_t i = 0; i < device.size(); ++i)
          {
            CUDA_ERROR(cudaFreeHost(host_pinned[i]));
            CUDA_ERROR(cudaFree(device[i]));
          }
        }

        ns   = nstreams;
        size = size_;

        host_pinned.resize(nstreams);
        device.resize(nstreams);

        std::size_t const bytes = size * sizeof(T);
        for(std::size_t i = 0; i < nstreams; ++i)
        {
          CUDA_ERROR(cudaMallocHost( (void**)&host_pinned[i] , bytes ));
          CUDA_ERROR(cudaMalloc( (void**)&device[i] , bytes ));
        }
      }
示例#2
0
文件: gpuops.c 项目: frankong/bart
/* Selects and activates the CUDA device with the most free memory.
 * Every device is probed via cudaMemGetInfo; with a single device the
 * probe is skipped and device 0 stays active.
 * Returns the index of the selected device. */
int cuda_init_memopt(void) 
{
	int num_devices = cuda_devices();
	int device;
	int max_device = 0;

	if (num_devices > 1) {

		size_t mem_max = 0;
		size_t mem_free;
		size_t mem_total;

		/* probe each device and remember the one with the most free memory */
		for (device = 0; device < num_devices; device++) {

			cuda_init(device);
			CUDA_ERROR(cudaMemGetInfo(&mem_free, &mem_total));

			if (mem_max < mem_free) {

				mem_max = mem_free;
				max_device = device;
			}
		}

		/* activate the winner through cuda_init() so that last_init is
		 * updated as well (resolves the old FIXME about last_init) */
		cuda_init(max_device);
	}

	return max_device;
}
示例#3
0
文件: gpuops.c 项目: frankong/bart
/* Enables unidirectional peer access from device a to the memory of
 * device b, restoring the previously current device afterwards. */
void cuda_p2p(int a, int b)
{
	int previous;
	CUDA_ERROR(cudaGetDevice(&previous));
	CUDA_ERROR(cudaSetDevice(a));
	CUDA_ERROR(cudaDeviceEnablePeerAccess(b, 0));
	CUDA_ERROR(cudaSetDevice(previous));
}
示例#4
0
文件: gpuops.c 项目: mrirecon/bart
/* malloc-style wrapper around cudaMalloc: returns the device pointer
 * directly; failures are handled inside CUDA_ERROR. */
static void* cuda_malloc_wrapper(size_t size)
{
	void* devptr = NULL;
	CUDA_ERROR(cudaMalloc(&devptr, size));
	return devptr;
}
  /**
     Executes the CUFFT complex-to-real transform plan.
     Input Fourier coefficients are read from the GPU memory referenced by
     idata and the real-valued result is written to odata. If both refer to
     the same memory location the transform is performed in-place. For
     multi-dimensional data both buffers must be contiguous (no padding
     between rows).
     @param idata input data (complex coefficients)
     @param odata output data (real samples)
  */
  inline void exec(const DeviceMemory<complex, 1> &idata, DeviceMemory<real, 1> &odata)
  {
    bool const contiguous = idata.contiguous() && odata.contiguous();
    if((Dim > 1) && !contiguous)
      CUDA_ERROR("CUFFT can only be used for contiguous memory (i.e., no padding between rows)");

    complex *in = const_cast<complex *>(idata.getBuffer());
    CUFFT_CHECK(cufftExecC2R(plan, in, odata.getBuffer()));
  }
void DeviceMemoryLinear<Type, Dim>::
alloc()
{
    this->free();

    // total element count = product of all extents
    size_t count = 1;
    for(size_t i = Dim; i--;)
        count *= this->size[i];

    // allocating empty data is not considered an error
    // since this is a normal operation within STL containers
    if(count == 0) {
        this->setPitch(0);
        return;
    }

    CUDA_CHECK(cudaMalloc((void **)&this->buffer, count * sizeof(Type)));
    this->setPitch(0);    // linear memory has no row padding

    if(this->buffer == 0)
        CUDA_ERROR("cudaMalloc failed");

#ifdef CUDA_DEBUG_INIT_MEMORY
    // optional zero-fill makes uninitialized-read bugs deterministic
    CUDA_CHECK(cudaMemset(this->buffer, 0, this->getBytes()));
#endif
}
  /**
    Returns a single slice from a higher dimensional dataset.
    Region-of-interest, stride and spacing information of the inner
    dimensions are carried over to the returned reference.
    @param slice index of the slice along the outermost dimension
  */
  DeviceMemoryReference<Type, Dim-1> getSlice(unsigned int slice)
  {
    CUDA_STATIC_ASSERT(Dim >= 2);

    if (slice>=this->size[Dim-1])
      CUDA_ERROR("out of bounds");

    // slice extents: all dimensions except the outermost one
    Cuda::Size<Dim-1> sub_size;
    for(int d = Dim-1; d--;)
      sub_size[d] = this->size[d];

    // element offset of the requested slice within the buffer
    int offset = this->stride[Dim-2]*slice;
    DeviceMemoryReference<Type, Dim-1> ref(sub_size, this->buffer + offset);

    // carry over ROI, stride and spacing dimension by dimension
    for(int d = Dim-1; d--;)
    {
      ref.region_ofs[d]  = this->region_ofs[d];
      ref.region_size[d] = this->region_size[d];
      ref.stride[d]      = this->stride[d];
      ref.spacing[d]     = this->spacing[d];
    }

    return ref;
  }
示例#8
0
      // Releases every device buffer and every pinned host buffer,
      // then resets the pools to an empty state.
      ~cu_buffers()
      {
        std::size_t const ndev = device.size();
        for(std::size_t i = 0; i < ndev; ++i)
          CUDA_ERROR(cudaFree(device[i]));

        std::size_t const nhost = host_pinned.size();
        for(std::size_t i = 0; i < nhost; ++i)
          CUDA_ERROR(cudaFreeHost(host_pinned[i]));

        size = 0;
        device.resize(0);
        host_pinned.resize(0);
      }
  /**
     Object-specific part of map action.
     Retrieves the CUDA array backing sub-resource 0 / mip level 0 of the
     mapped graphics resource and stores it in this->array.
  */
  void mapInternal()
  {
    CUDA_CHECK(cudaGraphicsSubResourceGetMappedArray(&this->array, resource, 0, 0));

    // belt-and-suspenders: fail loudly if the runtime handed back a null array
    if(this->array == 0)
      CUDA_ERROR("map image object failed");
  }
// Allocates a fresh heap buffer for the container's current size.
// NOTE(review): despite the name this does not call ::realloc and does not
// free a previously held buffer here -- presumably the caller (or a matching
// free()) releases the old allocation first; verify against call sites.
void HostMemoryHeap<Type, Dim>::
realloc()
{
  // heap memory is unpitched (no row padding)
  this->setPitch(0);
  this->buffer = (Type *)malloc(this->getSize() * sizeof(Type));

  // NOTE(review): malloc(0) may legally return NULL, which would trigger
  // this error path for empty containers -- confirm getSize() > 0 here
  if(this->buffer == 0)
    CUDA_ERROR("out of memory");

#ifdef CUDA_DEBUG_INIT_MEMORY
  // optional zero-fill makes uninitialized-read bugs deterministic
  memset(this->buffer, 0, this->getBytes());
#endif
}
示例#11
0
文件: copy.hpp 项目: psiha/nt2
  // Asynchronously copies a.size() elements from a into b on the given
  // stream; the transfer direction (H2D / D2H / ...) is selected at
  // compile time by copy_<HDI,HDO>::mode().
  inline void copy( In const& a, Out& b , HDI const& , HDO const&
                  , cudaStream_t stream = 0)
  {
    using T = typename Out::value_type;
//TODO

    auto const bytes = a.size() * sizeof(T);
    CUDA_ERROR(cudaMemcpyAsync( (T*)b.data(), a.data(), bytes
                              , copy_<HDI,HDO>::mode(), stream ));
  }
示例#12
0
文件: gpuops.c 项目: frankong/bart
/* Fills table[i][j] with whether device i can directly access the
 * memory of device j (peer-to-peer). n must equal the device count. */
void cuda_p2p_table(int n, bool table[n][n])
{
	assert(n == cuda_devices());

	for (int src = 0; src < n; src++) {
		for (int dst = 0; dst < n; dst++) {

			int accessible;
			CUDA_ERROR(cudaDeviceCanAccessPeer(&accessible, src, dst));

			table[src][dst] = (1 == accessible);
		}
	}
}
示例#13
0
      // Copies one block of results from the device buffer bound to
      // streamid into out (device -> pinned host), at most once per block
      // (guarded by block_stream_dth). The copy is enqueued on stream and
      // the stream is then synchronized, so the data is valid on return.
      inline void transfer_dth( Out & out , int blockid, Stream & stream  ,std::size_t streamid
                              , std::size_t leftover , nt2::pinned_ &)
      {
        // the last block may be smaller than the regular block size
        std::size_t sizeb = blocksize;
        if(leftover !=0) sizeb = leftover ;

        if(block_stream_dth[blockid] == false )
        {
          CUDA_ERROR(cudaMemcpyAsync( out.data()
                          , buffers.get_device(streamid)
                          , sizeb * sizeof(T)
                          , cudaMemcpyDeviceToHost
                          , stream
                    ));

          block_stream_dth[blockid] = true;
          // check the synchronization result too -- every other CUDA call
          // here goes through CUDA_ERROR, and this was previously unchecked
          CUDA_ERROR(cudaStreamSynchronize(stream));
        }
      }
示例#14
0
      // Copies one input block from in (pinned host) into the device
      // buffer bound to streamid (host -> device), at most once per block
      // (guarded by block_stream_htd). The copy is enqueued on stream and
      // the stream is then synchronized.
      inline void transfer_htd( In & in, int blockid, Stream & stream ,std::size_t streamid
                              , std::size_t leftover , nt2::pinned_ & )
      {
        // the last block may be smaller than the regular block size
        std::size_t sizeb = blocksize;
        if(leftover !=0) sizeb = leftover ;

        if( block_stream_htd[blockid] == false )
        {
          block_stream_htd[blockid] = true;

          CUDA_ERROR(cudaMemcpyAsync( buffers.get_device(streamid)
                                    , in.data()
                                    , sizeb* sizeof(T)
                                    , cudaMemcpyHostToDevice
                                    , stream
                    ));
          // check the synchronization result too -- every other CUDA call
          // here goes through CUDA_ERROR, and this was previously unchecked
          CUDA_ERROR(cudaStreamSynchronize(stream));
        }
      }
示例#15
0
文件: gpuops.c 项目: frankong/bart
/* Allocates device memory of the given size. When the memory cache is
 * enabled, a previously freed cached block of sufficient size is reused
 * instead of calling cudaMalloc again; new allocations are registered
 * in the cache bookkeeping via insert(). */
void* cuda_malloc(long size)
{
	if (cuda_memcache) {

		struct cuda_mem_s* cached = find_free(size);

		if (NULL != cached) {

			/* sanity: cached entries must be device blocks marked in use */
			assert(cached->device);
			assert(!cached->free);

			cached->thread_id = omp_get_thread_num();

			return (void*)(cached->ptr);
		}
	}

	void* ptr = NULL;
	CUDA_ERROR(cudaMalloc(&ptr, size));

	insert(ptr, size, true);

	return ptr;
}
示例#16
0
文件: gpuops.c 项目: mrirecon/bart
/* Tears down CUDA for this process: flushes the memory cache, then
 * resets the current device. */
void cuda_exit(void)
{
	cuda_memcache_clear();
	CUDA_ERROR(cudaDeviceReset());
}
示例#17
0
文件: gpuops.c 项目: frankong/bart
/* Makes the given device current and remembers it in last_init. */
void cuda_init(int device)
{
	last_init = device;
	CUDA_ERROR(cudaSetDevice(device));
}
示例#18
0
文件: gpuops.c 项目: frankong/bart
/* Returns the number of CUDA-capable devices in the system. */
int cuda_devices(void)
{
	int count = 0;
	CUDA_ERROR(cudaGetDeviceCount(&count));
	return count;
}
示例#19
0
文件: gpuops.c 项目: frankong/bart
/* Tears down CUDA for this process: flushes the memory cache, then
 * resets the current device. */
void cuda_exit(void)
{
	cuda_memcache_clear();
	/* cudaThreadExit() is deprecated; cudaDeviceReset() is its documented
	 * drop-in replacement */
	CUDA_ERROR(cudaDeviceReset());
}
示例#20
0
文件: gpuops.c 项目: frankong/bart
/* Copies a 2-D region between two (possibly strided) buffers.
 * Per the cudaMemcpy2D contract: ostr/istr are the destination/source
 * pitches in bytes, dims[0] is the row width in bytes and dims[1] the
 * number of rows. cudaMemcpyDefault infers the transfer direction from
 * the pointer values (requires unified virtual addressing). */
void cuda_memcpy_strided(const long dims[2], long ostr, void* dst, long istr, const void* src)
{
	CUDA_ERROR(cudaMemcpy2D(dst, ostr, src, istr, dims[0], dims[1], cudaMemcpyDefault));
}
示例#21
0
文件: gpuops.c 项目: frankong/bart
/* Copies size bytes from src to dst; cudaMemcpyDefault lets the runtime
 * infer the direction (host/device) from the pointer values. */
void cuda_memcpy(long size, void* dst, const void* src)
{
//	printf("COPY %x %x %ld\n", dst, src, size);
	CUDA_ERROR(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
}
示例#22
0
文件: gpuops.c 项目: frankong/bart
/* Zeroes size bytes of device memory at dst. */
void cuda_clear(long size, void* dst)
{
//	printf("CLEAR %x %ld\n", dst, size);
	CUDA_ERROR(cudaMemset(dst, 0, size));
}
示例#23
0
/* Allocates device-side buffers for queries, candidates and alignments
 * and copies the corresponding host-side data into them; the result
 * buffer is zero-initialized instead of copied.
 * NOTE(review): despite the "Stream" suffix, all operations here are the
 * synchronous cudaMemcpy / cudaMemset -- confirm whether async transfers
 * on a stream were intended.
 * Returns SUCCESS; CUDA failures are handled inside CUDA_ERROR. */
psaError_t transferCPUtoGPUStream(sequences_t *queries, sequences_t *candidates, alignments_t *alignments)
{
	//allocate & transfer Queries to GPU
	CUDA_ERROR(cudaMalloc((void**)&queries->d_ASCII, queries->numASCIIEntries * sizeof(ASCIIEntry_t)));
	CUDA_ERROR(cudaMemcpy(queries->d_ASCII, queries->h_ASCII, queries->numASCIIEntries * sizeof(ASCIIEntry_t), cudaMemcpyHostToDevice));

	CUDA_ERROR(cudaMalloc((void**)&queries->d_ASCIIposition, queries->num * sizeof(uint32_t)));
	CUDA_ERROR(cudaMemcpy(queries->d_ASCIIposition, queries->h_ASCIIposition, queries->num * sizeof(uint32_t), cudaMemcpyHostToDevice));


	//allocate & transfer Candidates to GPU (comment previously said FMIndex)
	CUDA_ERROR(cudaMalloc((void**)&candidates->d_ASCII, candidates->numASCIIEntries * sizeof(ASCIIEntry_t)));
	CUDA_ERROR(cudaMemcpy(candidates->d_ASCII, candidates->h_ASCII, candidates->numASCIIEntries * sizeof(ASCIIEntry_t), cudaMemcpyHostToDevice));

	CUDA_ERROR(cudaMalloc((void**)&candidates->d_ASCIIposition, candidates->num * sizeof(uint32_t)));
	CUDA_ERROR(cudaMemcpy(candidates->d_ASCIIposition, candidates->h_ASCIIposition, candidates->num * sizeof(uint32_t), cudaMemcpyHostToDevice));


	//allocate & initialize Results
	CUDA_ERROR(cudaMalloc((void**)&alignments->d_info, alignments->num * sizeof(alignmentInfo_t)));
	CUDA_ERROR(cudaMemcpy(alignments->d_info, alignments->h_info, alignments->num * sizeof(alignmentInfo_t), cudaMemcpyHostToDevice));

 	CUDA_ERROR(cudaMalloc((void**)&alignments->d_results, alignments->num * sizeof(alignmentEntry_t)));
 	CUDA_ERROR(cudaMemset(alignments->d_results, 0, alignments->num * sizeof(alignmentEntry_t)));

	return (SUCCESS);
}
示例#24
0
文件: gpuops.c 项目: mrirecon/bart
/* free-style wrapper around cudaFree; casts away const because the
 * CUDA API takes a non-const pointer. */
static void cuda_free_wrapper(const void* ptr)
{
	CUDA_ERROR(cudaFree((void*)ptr));
}
示例#25
0
/* Copies the alignment results from device memory back into the host
 * buffer (blocking copy, so results are valid on return).
 * Returns SUCCESS; CUDA failures are handled inside CUDA_ERROR. */
psaError_t transferGPUtoCPU(alignments_t *alignments)
{	
	CUDA_ERROR(cudaMemcpy(alignments->h_results, alignments->d_results, alignments->num * sizeof(alignmentEntry_t), cudaMemcpyDeviceToHost));

	return (SUCCESS);
}