예제 #1
0
inline typename std::enable_if<
    std::is_convertible<
        typename std::iterator_traits<InputIterator>::iterator_category
      , device_random_access_iterator_tag
    >::value
    && std::is_convertible<
        typename std::iterator_traits<OutputIterator>::iterator_category
      , device_random_access_iterator_tag
    >::value
    && std::is_same<
        typename std::iterator_traits<InputIterator>::value_type
      , typename std::iterator_traits<OutputIterator>::value_type
    >::value
  , OutputIterator>::type copy(InputIterator first, InputIterator last, OutputIterator result)
{
    typename std::iterator_traits<InputIterator>::difference_type size = last - first;
    CUDA_CALL( cudaMemcpy(
        &*result
      , &*first
      , size * sizeof(typename std::iterator_traits<InputIterator>::value_type)
      , cudaMemcpyDeviceToDevice
    ) );
    return result + size;
}
예제 #2
0
GpuDevice::GpuDevice(uint64_t device_id, DeviceListener* l, int gpu_id) : ThreadedDevice(device_id, l, kParallelism), device_(gpu_id) {
  CUDA_CALL(cudaSetDevice(device_));
  cudaFree(0);  // Initialize
  auto allocator = [this](size_t len) -> void* {
    void* ret;
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaMalloc(&ret, len));
    return ret;
  };
  auto deallocator = [this](void* ptr) {
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaFree(ptr));
  };
  data_store_ = new PooledDataStore(DEFAULT_POOL_SIZE, allocator, deallocator);
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream_[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle_[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle_[i], stream_[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle_[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle_[i], stream_[i]));
  }
}
예제 #3
0
파일: plugin-nvptx.c 프로젝트: SuperLu/gcc
static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
	  unsigned num_objs)
{
  CUjit_option opts[6];
  void *optvals[6];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
	 qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char*)ptx_objs->code,
			 ptx_objs->size, 0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
	{
	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
			     cuda_error (r));
	  return false;
	}
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}
예제 #4
0
	void RenderTarget::Map(void) {
		glBindTexture(GL_TEXTURE_2D, 0);
		CUDA_CALL(cudaGraphicsMapResources(1, &_resource, 0));
		CUDA_CALL(cudaGraphicsSubResourceGetMappedArray(&_array,
			_resource, 0, 0));
	}
예제 #5
0
 /**
  * get device on which the active host thread executes device code
  */
 static int get()
 {
     int dev;
     CUDA_CALL(cudaGetDevice(&dev));
     return dev;
 }
예제 #6
0
 /**
  * returns number of devices available for execution
  */
 static int count()
 {
     int count;
     CUDA_CALL(cudaGetDeviceCount(&count));
     return count;
 }
	virtual ~CudaGraphicsResourceMapped()
	{
		cudaGraphicsResource_t cudaGraphicsResource = m_resource->getResource();
		CUDA_CALL(cudaGraphicsUnmapResources(1, &cudaGraphicsResource));
	}
	~CudaGraphicsResource()
	{
		CUDA_CALL(cudaGraphicsUnregisterResource(m_resource));
	}
	// creates and registers an opengl texture to cuda
	CudaGraphicsResource(GLuint image, GLenum target, unsigned int flags)
	{
		CUDA_CALL(cudaGraphicsGLRegisterImage(&m_resource, image, target, flags));
	}
예제 #10
0
파일: device.cpp 프로젝트: AI42/minerva
void GpuDevice::DoCopyRemoteData(float* dst, float* src, size_t size, int thrid) {
  CUDA_CALL(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, impl_->stream[thrid]));
  CUDA_CALL(cudaStreamSynchronize(impl_->stream[thrid]));
}
예제 #11
0
파일: device.cpp 프로젝트: AI42/minerva
void GpuDevice::Barrier(int thrid) {
  CUDA_CALL(cudaStreamSynchronize(impl_->stream[thrid]));
}
예제 #12
0
파일: device.cpp 프로젝트: AI42/minerva
void GpuDevice::Impl::ActivateDevice() const {
  CUDA_CALL(cudaSetDevice(device));
}
예제 #13
0
 /*
  * cleans up all runtime-related resources associated with calling thread
  */
 static void exit()
 {
     CUDA_CALL(cudaThreadExit());
 }
예제 #14
0
 /*
  * blocks until the device has completed all preceding requested tasks
  */
 static void synchronize()
 {
     CUDA_CALL(cudaThreadSynchronize());
 }
예제 #15
0
void GpuDevice::PreExecute() {
  CUDA_CALL(cudaSetDevice(device_));
}
예제 #16
0
int main() 
{

#if VISUAL
    plotGrid* pg = new plotGrid;
#endif

    // number of reps
    int numBlocks = 128;
    // length of grid
    int Nx = 8;
    int N = Nx * Nx;
    int N2 = 0.5 * N;
    int N4 = 0.5 * N2;
    int N_ALL = N * numBlocks;



    dim3 threadGrid(Nx, Nx);
    curandState *devRands;
    CUDA_CALL(cudaMalloc((void **)&devRands, N_ALL * sizeof(curandState)));

    srand (time(NULL));
    initRands(threadGrid, numBlocks, devRands, rand());


    float* d_wg;
    CUDA_CALL(cudaMalloc((void**)&d_wg, sizeof(float) *  (N_ALL) ));
    int* d_states;
    CUDA_CALL(cudaMalloc((void**)&d_states, sizeof(int) * N_ALL));
    int* d_states2;
    CUDA_CALL(cudaMalloc((void**)&d_states2, sizeof(int) * N_ALL));

    float* d_up;
    CUDA_CALL(cudaMalloc((void**)&d_up, sizeof(float) *  (N + 1) ));

    float* h_up = new float [N+1];

    float* d_down;
    CUDA_CALL(cudaMalloc((void**)&d_down, sizeof(float) *  (N + 1) ));

    float* h_down = new float [N+1];

    int* d_upcount;
    CUDA_CALL(cudaMalloc((void**)&d_upcount, sizeof(int) *  (N + 1) ));

    int* h_upcount = new int [N+1];

    int* d_downcount;
    CUDA_CALL(cudaMalloc((void**)&d_downcount, sizeof(int) *  (N + 1) ));

    int* h_downcount = new int [N+1];



    int* d_blockTotals;
    CUDA_CALL(cudaMalloc((void**)&d_blockTotals, sizeof(int) * numBlocks));

    float* h_wg = new float [N_ALL];
    int* h_states = new int[N_ALL];
    int* h_blockTotals = new int[numBlocks];
    int* h_blockTimes = new int[numBlocks];
    int wgCount = 1;

    const unsigned int shape[] = {N+1,2};

    float* results = new float[(N+1)*2];
    for (int i=0;i<(N+1)*2;i++)
       results[i]=0.0f;



    for (int G=0;G<wgCount;G++)
    {
        float wg = 0.25;//5 + 0.2 * float(G);
        for (int i=0;i<N_ALL;i++)
        {
            h_wg[i]=wg;
            float unum =rand()/double(RAND_MAX);
   //         cout<<unum<<endl;
 //           if (unum<0.2345)
 //               h_wg[i]+=0.833;
        }
        CUDA_CALL(cudaMemcpy(d_wg, h_wg, (N_ALL) * sizeof(float), cudaMemcpyHostToDevice));


        for (int b=0;b<numBlocks;b++)
            h_blockTimes[b] = -1;
        int maxTime = 100000;
        int checkTime = 100;
        float sw = 1.0f;

        char fileName[30];
        sprintf(fileName, "potential%d-%d.npy", int(10*wg),int(100.0*sw));
//        cout<<fileName<<endl;

        CUDA_CALL(cudaMemset (d_states, 0, sizeof(int) * (N_ALL)));
        CUDA_CALL(cudaMemset (d_blockTotals, 0, sizeof(int) * (numBlocks)));
        CUDA_CALL(cudaMemset (d_up, 0, sizeof(float) * (N + 1)));
        CUDA_CALL(cudaMemset (d_down, 0, sizeof(float) * (N + 1)));
        CUDA_CALL(cudaMemset (d_upcount, 0, sizeof(int) * (N + 1)));
        CUDA_CALL(cudaMemset (d_downcount, 0, sizeof(int) * (N + 1)));





        for (int t=0;t<maxTime;t++)
        {
            
            advanceTimestep(threadGrid, numBlocks, devRands, d_wg, d_states, Nx, sw, t);
            recordData(threadGrid, numBlocks, d_states, d_states2, Nx, d_up, d_down, d_upcount, d_downcount, t);
            /*
            CUDA_CALL(cudaMemcpy(h_states, d_states, (N_ALL) * sizeof(int), cudaMemcpyDeviceToHost));
            int countUp = 0;
            for (int i=0;i<N_ALL;i++)
                if (h_states[i]>0)
                    countUp++;
            cout<<"~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl<<countUp<<endl;
//            */
#if VISUAL
            CUDA_CALL(cudaMemcpy(h_states, d_states, (N_ALL) * sizeof(int), cudaMemcpyDeviceToHost));
            pg->draw(Nx, h_states);
#endif
            if (t%checkTime == 0 ) 
            {
                countStates(N, numBlocks, d_states, d_blockTotals, N_ALL);
                cout<<t<<" check"<<endl;

                CUDA_CALL(cudaMemcpy(h_blockTotals, d_blockTotals, (numBlocks) * sizeof(int), cudaMemcpyDeviceToHost));
                bool allDone = true;
                for (int b=0;b<numBlocks;b++)
                {
                    if (h_blockTotals[b]>0.75*N)
                    {
    //                    cout<<"block total : "<<h_blockTotals[b]<<endl;
                        if (h_blockTimes[b]<0)
                            h_blockTimes[b]=t;
                    }
                    else
                        allDone = false;
                }
                if (allDone)
                {
                    for (int b=0;b<numBlocks;b++)
                        h_blockTimes[b] = -1;
                    CUDA_CALL(cudaMemset (d_states, 0, sizeof(int) * (N_ALL)));
                } 
            }

        }
        CUDA_CALL(cudaMemcpy(h_up, d_up, (N + 1) * sizeof(float), cudaMemcpyDeviceToHost));
        CUDA_CALL(cudaMemcpy(h_down, d_down, (N + 1) * sizeof(float), cudaMemcpyDeviceToHost));
        CUDA_CALL(cudaMemcpy(h_upcount, d_upcount, (N + 1) * sizeof(int), cudaMemcpyDeviceToHost));

        for (int i=0;i<N+1;i++)
        {
            results[2*i]=h_up[i];
            results[2*i+1]=h_down[i];
            cout<<i/float(N)<<" : "<<h_up[i]<<" : "<<h_down[i]<<" : "<<h_upcount[i]<<endl;
        }

        cnpy::npy_save(fileName,results,shape,2,"w");
    }
    return 0;
}
예제 #17
0
 /**
  * bind CUDA texture to device memory array
  */
 void bind(cuda::vector<T> const& array) const
 {
     CUDA_CALL(cudaBindTexture(NULL, ptr_, array.data(), &desc_));
 }
예제 #18
0
 /**
  * unbind CUDA texture
  */
 void unbind() const
 {
     CUDA_CALL(cudaUnbindTexture(ptr_));
 }
	// creates and registers an opengl buffer to cuda
	CudaGraphicsResource(GLuint buffer, unsigned int flags)
	{
		CUDA_CALL(cudaGraphicsGLRegisterBuffer(&m_resource, buffer, flags));
	}
예제 #20
0
	CudaSurfaceObject(cudaResourceDesc desc)
	{
		CUDA_CALL(cudaCreateSurfaceObject(&m_surfaceObject, &desc));
	}
	CudaGraphicsResourceMapped(CudaGraphicsResource* resource)
		: m_resource(resource)
	{
		cudaGraphicsResource_t cudaGraphicsResource = m_resource->getResource();
		CUDA_CALL(cudaGraphicsMapResources(1, &cudaGraphicsResource));
	}
예제 #22
0
	~CudaSurfaceObject()
	{
		CUDA_CALL(cudaDestroySurfaceObject(m_surfaceObject));
	}
예제 #23
0
static void deleteDeviceMatrix3D(DeviceMatrix3D* mat)
{
	//printf("cudaFree: %p\n", mat->data);
	CUDA_CALL(cudaFree(mat->data));
	delete mat;
}
예제 #24
0
int
main(int argc, char ** argv)
{
  unsigned int  iseed = (unsigned int)time(NULL);
  int           n;
  int           lda;
  PASTIX_FLOAT        *A;
  PASTIX_FLOAT        *B;
  PASTIX_FLOAT        *B_save;
  PASTIX_FLOAT        *B_res;
  CU_FLOAT     *d_A;
  CU_FLOAT     *d_B;
  Clock         clk;
  Clock         clk_wt;
  PASTIX_FLOAT         alpha = 1.0;
  double        time_CPU;
  double        time_CUDA;
  double        time_CUDA_wt;
  int           ops = n*n;

  if (argc != 3)
    {
      usage(argv[0]);
      return 1;
    }

  READ_INT(n, 1);
  READ_INT(lda, 2);
  srand (iseed);

  MALLOC_INTERN(A,      n*lda, PASTIX_FLOAT);
  MALLOC_INTERN(B,      n*lda, PASTIX_FLOAT);
  MALLOC_INTERN(B_save, n*lda, PASTIX_FLOAT);
  MALLOC_INTERN(B_res,  n*lda, PASTIX_FLOAT);

  FILL(A, n*lda);
  FILL(B, n*lda);
  memcpy(B_save, B, n*lda*sizeof(PASTIX_FLOAT));

  clockInit(&(clk));
  clockStart(&(clk));
  DimTrans(A, lda, n, B);
  clockStop(&(clk));
  time_CPU = clockVal(&(clk));
  PRINT_TIME("GETRA on CPU", time_CPU, ops);

  clockInit(&(clk_wt));
  clockStart(&(clk_wt));
  CUDA_CALL(cudaMalloc((void*)&(d_A),
                       lda*n*sizeof(PASTIX_FLOAT)));
  CUDA_CALL(cudaMemcpy((void*)d_A, A,
                       lda*n*sizeof(PASTIX_FLOAT),
                       cudaMemcpyHostToDevice));
  CUDA_CALL(cudaMalloc((void*)&(d_B),
                       lda*n*sizeof(PASTIX_FLOAT)));
  CUDA_CALL(cudaMemcpy((void*)d_B, B_save,
                       lda*n*sizeof(PASTIX_FLOAT),
                       cudaMemcpyHostToDevice));
  clockInit(&(clk));
  clockStart(&(clk));
  getra_cuda(d_A, lda,
             d_B, lda, n);
  clockStop(&(clk));

  CUDA_CALL(cudaMemcpy((void*)B_res, d_B,
                       lda*n*sizeof(PASTIX_FLOAT),
                       cudaMemcpyDeviceToHost));
  CUDA_CALL(cudaFree(d_A));
  CUDA_CALL(cudaFree(d_B));
  clockStop(&(clk_wt));

  time_CUDA    = clockVal(&(clk));
  time_CUDA_wt = clockVal(&(clk_wt));

  COMPARE_TIME("GETRA on GPU",
               time_CUDA, ops, time_CPU);
  COMPARE_TIME("GETRA on GPU with transfer",
               time_CUDA_wt, ops, time_CPU);
  COMPARE_RES(B, B_res);

  memFree_null(A);
  memFree_null(B);
  memFree_null(B_save);
  memFree_null(B_res);

  return EXIT_SUCCESS;
}
예제 #25
0
 /**
  * set device on which the active host thread executes device code
  */
 static void set(int dev)
 {
     CUDA_CALL(cudaSetDevice(dev));
 }
예제 #26
0
파일: plugin-nvptx.c 프로젝트: SuperLu/gcc
static bool
map_fini (struct ptx_stream *s)
{
  CUDA_CALL (cuMemFreeHost, s->h);
  return true;
}
예제 #27
0
 /**
  * retrieve properties of given device
  */
 properties(int dev)
 {
     CUDA_CALL(cudaGetDeviceProperties(&prop, dev));
 }
예제 #28
0
	RenderTarget::~RenderTarget(void) {
		CUDA_CALL(cudaGraphicsUnregisterResource(_resource));
	}