template <typename InputIterator, typename OutputIterator>
inline typename std::enable_if<
    std::is_convertible<
        typename std::iterator_traits<InputIterator>::iterator_category
      , device_random_access_iterator_tag
    >::value
 && std::is_convertible<
        typename std::iterator_traits<OutputIterator>::iterator_category
      , device_random_access_iterator_tag
    >::value
 && std::is_same<
        typename std::iterator_traits<InputIterator>::value_type
      , typename std::iterator_traits<OutputIterator>::value_type
    >::value
  , OutputIterator>::type
copy(InputIterator first, InputIterator last, OutputIterator result)
{
    typename std::iterator_traits<InputIterator>::difference_type size = last - first;
    CUDA_CALL( cudaMemcpy(
        &*result
      , &*first
      , size * sizeof(typename std::iterator_traits<InputIterator>::value_type)
      , cudaMemcpyDeviceToDevice
    ) );
    return result + size;
}
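// Usage sketch (an assumption, not part of the listing above: a
// cuda::vector<T> container whose iterators are tagged with
// device_random_access_iterator_tag, so that the overload above is selected
// and lowers to a single device-to-device cudaMemcpy):
void copy_device_to_device(cuda::vector<float> const& src, cuda::vector<float>& dst)
{
    // precondition: dst holds at least src.size() elements
    copy(src.begin(), src.end(), dst.begin());
}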
GpuDevice::GpuDevice(uint64_t device_id, DeviceListener* l, int gpu_id)
    : ThreadedDevice(device_id, l, kParallelism), device_(gpu_id) {
  CUDA_CALL(cudaSetDevice(device_));
  cudaFree(0);  // Initialize (forces context creation on this device)
  auto allocator = [this](size_t len) -> void* {
    void* ret;
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaMalloc(&ret, len));
    return ret;
  };
  auto deallocator = [this](void* ptr) {
    CUDA_CALL(cudaSetDevice(device_));
    CUDA_CALL(cudaFree(ptr));
  };
  data_store_ = new PooledDataStore(DEFAULT_POOL_SIZE, allocator, deallocator);
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDA_CALL(cudaStreamCreate(&stream_[i]));
    CUBLAS_CALL(cublasCreate(&cublas_handle_[i]));
    CUBLAS_CALL(cublasSetStream(cublas_handle_[i], stream_[i]));
    CUDNN_CALL(cudnnCreate(&cudnn_handle_[i]));
    CUDNN_CALL(cudnnSetStream(cudnn_handle_[i], stream_[i]));
  }
}
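// Teardown sketch (an assumption; the matching destructor is not part of
// this listing). It mirrors the constructor: release the per-worker library
// handles and streams on the owning device, then drop the pooled store.
GpuDevice::~GpuDevice() {
  CUDA_CALL(cudaSetDevice(device_));
  for (size_t i = 0; i < kParallelism; ++i) {
    CUDNN_CALL(cudnnDestroy(cudnn_handle_[i]));
    CUBLAS_CALL(cublasDestroy(cublas_handle_[i]));
    CUDA_CALL(cudaStreamDestroy(stream_[i]));
  }
  delete data_store_;  // returns pooled device memory through the deallocator
}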
static bool
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
          unsigned num_objs)
{
  CUjit_option opts[6];
  void *optvals[6];
  float elapsed = 0.0;
#define LOGSIZE 8192
  char elog[LOGSIZE];
  char ilog[LOGSIZE];
  unsigned long logsize = LOGSIZE;
  CUlinkState linkstate;
  CUresult r;
  void *linkout;
  size_t linkoutsize __attribute__ ((unused));

  opts[0] = CU_JIT_WALL_TIME;
  optvals[0] = &elapsed;

  opts[1] = CU_JIT_INFO_LOG_BUFFER;
  optvals[1] = &ilog[0];

  opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optvals[2] = (void *) logsize;

  opts[3] = CU_JIT_ERROR_LOG_BUFFER;
  optvals[3] = &elog[0];

  opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optvals[4] = (void *) logsize;

  opts[5] = CU_JIT_LOG_VERBOSE;
  optvals[5] = (void *) 1;

  CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);

  for (; num_objs--; ptx_objs++)
    {
      /* cuLinkAddData's 'data' argument erroneously omits the const
         qualifier.  */
      GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
      r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *) ptx_objs->code,
                         ptx_objs->size, 0, 0, 0, 0);
      if (r != CUDA_SUCCESS)
        {
          GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
          GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
                             cuda_error (r));
          return false;
        }
    }

  GOMP_PLUGIN_debug (0, "Linking\n");
  r = cuLinkComplete (linkstate, &linkout, &linkoutsize);

  GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
  GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);

  if (r != CUDA_SUCCESS)
    {
      GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
      return false;
    }

  CUDA_CALL (cuModuleLoadData, module, linkout);
  CUDA_CALL (cuLinkDestroy, linkstate);
  return true;
}
void RenderTarget::Map(void)
{
    glBindTexture(GL_TEXTURE_2D, 0);
    CUDA_CALL(cudaGraphicsMapResources(1, &_resource, 0));
    CUDA_CALL(cudaGraphicsSubResourceGetMappedArray(&_array, _resource, 0, 0));
}
/**
 * get device on which the active host thread executes device code
 */
static int get()
{
    int dev;
    CUDA_CALL(cudaGetDevice(&dev));
    return dev;
}
/**
 * returns number of devices available for execution
 */
static int count()
{
    int count;
    CUDA_CALL(cudaGetDeviceCount(&count));
    return count;
}
virtual ~CudaGraphicsResourceMapped()
{
    cudaGraphicsResource_t cudaGraphicsResource = m_resource->getResource();
    CUDA_CALL(cudaGraphicsUnmapResources(1, &cudaGraphicsResource));
}
~CudaGraphicsResource()
{
    CUDA_CALL(cudaGraphicsUnregisterResource(m_resource));
}
// creates and registers an OpenGL texture with CUDA
CudaGraphicsResource(GLuint image, GLenum target, unsigned int flags)
{
    CUDA_CALL(cudaGraphicsGLRegisterImage(&m_resource, image, target, flags));
}
void GpuDevice::DoCopyRemoteData(float* dst, float* src, size_t size, int thrid) {
  CUDA_CALL(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, impl_->stream[thrid]));
  CUDA_CALL(cudaStreamSynchronize(impl_->stream[thrid]));
}
void GpuDevice::Barrier(int thrid) {
  CUDA_CALL(cudaStreamSynchronize(impl_->stream[thrid]));
}
void GpuDevice::Impl::ActivateDevice() const {
  CUDA_CALL(cudaSetDevice(device));
}
/*
 * cleans up all runtime-related resources associated with calling thread
 */
static void exit()
{
    // cudaThreadExit() is deprecated in newer CUDA releases;
    // cudaDeviceReset() is its direct successor
    CUDA_CALL(cudaThreadExit());
}
/*
 * blocks until the device has completed all preceding requested tasks
 */
static void synchronize()
{
    // cudaThreadSynchronize() is deprecated in newer CUDA releases;
    // cudaDeviceSynchronize() is its direct successor
    CUDA_CALL(cudaThreadSynchronize());
}
void GpuDevice::PreExecute() {
  CUDA_CALL(cudaSetDevice(device_));
}
int main() {
#if VISUAL
  plotGrid* pg = new plotGrid;
#endif
  // number of reps
  int numBlocks = 128;
  // length of grid
  int Nx = 8;
  int N = Nx * Nx;
  int N2 = 0.5 * N;
  int N4 = 0.5 * N2;
  int N_ALL = N * numBlocks;
  dim3 threadGrid(Nx, Nx);

  curandState* devRands;
  CUDA_CALL(cudaMalloc((void**)&devRands, N_ALL * sizeof(curandState)));
  srand(time(NULL));
  initRands(threadGrid, numBlocks, devRands, rand());

  float* d_wg;
  CUDA_CALL(cudaMalloc((void**)&d_wg, sizeof(float) * N_ALL));
  int* d_states;
  CUDA_CALL(cudaMalloc((void**)&d_states, sizeof(int) * N_ALL));
  int* d_states2;
  CUDA_CALL(cudaMalloc((void**)&d_states2, sizeof(int) * N_ALL));
  float* d_up;
  CUDA_CALL(cudaMalloc((void**)&d_up, sizeof(float) * (N + 1)));
  float* h_up = new float[N + 1];
  float* d_down;
  CUDA_CALL(cudaMalloc((void**)&d_down, sizeof(float) * (N + 1)));
  float* h_down = new float[N + 1];
  int* d_upcount;
  CUDA_CALL(cudaMalloc((void**)&d_upcount, sizeof(int) * (N + 1)));
  int* h_upcount = new int[N + 1];
  int* d_downcount;
  CUDA_CALL(cudaMalloc((void**)&d_downcount, sizeof(int) * (N + 1)));
  int* h_downcount = new int[N + 1];
  int* d_blockTotals;
  CUDA_CALL(cudaMalloc((void**)&d_blockTotals, sizeof(int) * numBlocks));

  float* h_wg = new float[N_ALL];
  int* h_states = new int[N_ALL];
  int* h_blockTotals = new int[numBlocks];
  int* h_blockTimes = new int[numBlocks];

  int wgCount = 1;
  const unsigned int shape[] = {N + 1, 2};
  float* results = new float[(N + 1) * 2];
  for (int i = 0; i < (N + 1) * 2; i++)
    results[i] = 0.0f;

  for (int G = 0; G < wgCount; G++) {
    float wg = 0.25; //5 + 0.2 * float(G);
    for (int i = 0; i < N_ALL; i++) {
      h_wg[i] = wg;
      float unum = rand() / double(RAND_MAX);
      // cout<<unum<<endl;
      // if (unum<0.2345)
      //   h_wg[i]+=0.833;
    }
    CUDA_CALL(cudaMemcpy(d_wg, h_wg, N_ALL * sizeof(float), cudaMemcpyHostToDevice));
    for (int b = 0; b < numBlocks; b++)
      h_blockTimes[b] = -1;

    int maxTime = 100000;
    int checkTime = 100;
    float sw = 1.0f;
    char fileName[30];
    sprintf(fileName, "potential%d-%d.npy", int(10 * wg), int(100.0 * sw));
    // cout<<fileName<<endl;

    CUDA_CALL(cudaMemset(d_states, 0, sizeof(int) * N_ALL));
    CUDA_CALL(cudaMemset(d_blockTotals, 0, sizeof(int) * numBlocks));
    CUDA_CALL(cudaMemset(d_up, 0, sizeof(float) * (N + 1)));
    CUDA_CALL(cudaMemset(d_down, 0, sizeof(float) * (N + 1)));
    CUDA_CALL(cudaMemset(d_upcount, 0, sizeof(int) * (N + 1)));
    CUDA_CALL(cudaMemset(d_downcount, 0, sizeof(int) * (N + 1)));

    for (int t = 0; t < maxTime; t++) {
      advanceTimestep(threadGrid, numBlocks, devRands, d_wg, d_states, Nx, sw, t);
      recordData(threadGrid, numBlocks, d_states, d_states2, Nx, d_up, d_down,
                 d_upcount, d_downcount, t);
      /*
      CUDA_CALL(cudaMemcpy(h_states, d_states, N_ALL * sizeof(int), cudaMemcpyDeviceToHost));
      int countUp = 0;
      for (int i = 0; i < N_ALL; i++)
        if (h_states[i] > 0)
          countUp++;
      cout<<"~~~~~~~~~~~~~~~~~~~~~~~~~~~"<<endl<<countUp<<endl;
      // */
#if VISUAL
      CUDA_CALL(cudaMemcpy(h_states, d_states, N_ALL * sizeof(int), cudaMemcpyDeviceToHost));
      pg->draw(Nx, h_states);
#endif
      if (t % checkTime == 0) {
        countStates(N, numBlocks, d_states, d_blockTotals, N_ALL);
        cout << t << " check" << endl;
        CUDA_CALL(cudaMemcpy(h_blockTotals, d_blockTotals, numBlocks * sizeof(int),
                             cudaMemcpyDeviceToHost));
        bool allDone = true;
        for (int b = 0; b < numBlocks; b++) {
          if (h_blockTotals[b] > 0.75 * N) {
            // cout<<"block total : "<<h_blockTotals[b]<<endl;
            if (h_blockTimes[b] < 0)
              h_blockTimes[b] = t;
          } else
            allDone = false;
        }
        if (allDone) {
          for (int b = 0; b < numBlocks; b++)
            h_blockTimes[b] = -1;
          CUDA_CALL(cudaMemset(d_states, 0, sizeof(int) * N_ALL));
        }
      }
    }

    CUDA_CALL(cudaMemcpy(h_up, d_up, (N + 1) * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CALL(cudaMemcpy(h_down, d_down, (N + 1) * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CALL(cudaMemcpy(h_upcount, d_upcount, (N + 1) * sizeof(int), cudaMemcpyDeviceToHost));
    for (int i = 0; i < N + 1; i++) {
      results[2 * i] = h_up[i];
      results[2 * i + 1] = h_down[i];
      cout << i / float(N) << " : " << h_up[i] << " : " << h_down[i]
           << " : " << h_upcount[i] << endl;
    }
    cnpy::npy_save(fileName, results, shape, 2, "w");
  }
  return 0;
}
/**
 * bind CUDA texture to device memory array
 */
void bind(cuda::vector<T> const& array) const
{
    // cudaBindTexture belongs to the legacy texture reference API, which is
    // deprecated in recent CUDA releases in favour of texture objects
    CUDA_CALL(cudaBindTexture(NULL, ptr_, array.data(), &desc_));
}
/**
 * unbind CUDA texture
 */
void unbind() const
{
    CUDA_CALL(cudaUnbindTexture(ptr_));
}
// creates and registers an OpenGL buffer with CUDA
CudaGraphicsResource(GLuint buffer, unsigned int flags)
{
    CUDA_CALL(cudaGraphicsGLRegisterBuffer(&m_resource, buffer, flags));
}
CudaSurfaceObject(cudaResourceDesc desc)
{
    CUDA_CALL(cudaCreateSurfaceObject(&m_surfaceObject, &desc));
}
CudaGraphicsResourceMapped(CudaGraphicsResource* resource)
    : m_resource(resource)
{
    cudaGraphicsResource_t cudaGraphicsResource = m_resource->getResource();
    CUDA_CALL(cudaGraphicsMapResources(1, &cudaGraphicsResource));
}
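// Usage sketch tying the two RAII wrappers together (hypothetical GLuint
// 'texture' naming an existing OpenGL texture; the register flag is one
// valid choice among several):
void access_texture_from_cuda(GLuint texture)
{
    CudaGraphicsResource resource(texture, GL_TEXTURE_2D,
                                  cudaGraphicsRegisterFlagsNone);
    {
        CudaGraphicsResourceMapped mapped(&resource);
        // the resource is mapped for CUDA access within this scope
    }   // ~CudaGraphicsResourceMapped unmaps here
}       // ~CudaGraphicsResource unregisters here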
~CudaSurfaceObject()
{
    CUDA_CALL(cudaDestroySurfaceObject(m_surfaceObject));
}
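// Usage sketch (hypothetical 'array' of type cudaArray_t, e.g. retrieved via
// cudaGraphicsSubResourceGetMappedArray): describe the array and hand it to
// the RAII wrapper so the surface object is destroyed automatically.
void make_surface(cudaArray_t array)
{
    cudaResourceDesc desc = {};
    desc.resType = cudaResourceTypeArray;
    desc.res.array.array = array;
    CudaSurfaceObject surface(desc);
    // launch kernels writing through the surface while 'surface' is alive
}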
static void deleteDeviceMatrix3D(DeviceMatrix3D* mat)
{
    //printf("cudaFree: %p\n", mat->data);
    CUDA_CALL(cudaFree(mat->data));
    delete mat;
}
int main(int argc, char ** argv)
{
  unsigned int  iseed = (unsigned int)time(NULL);
  int           n;
  int           lda;
  PASTIX_FLOAT *A;
  PASTIX_FLOAT *B;
  PASTIX_FLOAT *B_save;
  PASTIX_FLOAT *B_res;
  CU_FLOAT     *d_A;
  CU_FLOAT     *d_B;
  Clock         clk;
  Clock         clk_wt;
  PASTIX_FLOAT  alpha = 1.0;
  double        time_CPU;
  double        time_CUDA;
  double        time_CUDA_wt;
  int           ops;

  if (argc != 3)
    {
      usage(argv[0]);
      return 1;
    }
  READ_INT(n, 1);
  READ_INT(lda, 2);
  ops = n*n; /* operation count for the timing reports */
  srand (iseed);

  MALLOC_INTERN(A,      n*lda, PASTIX_FLOAT);
  MALLOC_INTERN(B,      n*lda, PASTIX_FLOAT);
  MALLOC_INTERN(B_save, n*lda, PASTIX_FLOAT);
  MALLOC_INTERN(B_res,  n*lda, PASTIX_FLOAT);
  FILL(A, n*lda);
  FILL(B, n*lda);
  memcpy(B_save, B, n*lda*sizeof(PASTIX_FLOAT));

  clockInit(&(clk));
  clockStart(&(clk));
  DimTrans(A, lda, n, B);
  clockStop(&(clk));
  time_CPU = clockVal(&(clk));
  PRINT_TIME("GETRA on CPU", time_CPU, ops);

  clockInit(&(clk_wt));
  clockStart(&(clk_wt));
  CUDA_CALL(cudaMalloc((void**)&(d_A), lda*n*sizeof(PASTIX_FLOAT)));
  CUDA_CALL(cudaMemcpy((void*)d_A, A, lda*n*sizeof(PASTIX_FLOAT),
                       cudaMemcpyHostToDevice));
  CUDA_CALL(cudaMalloc((void**)&(d_B), lda*n*sizeof(PASTIX_FLOAT)));
  CUDA_CALL(cudaMemcpy((void*)d_B, B_save, lda*n*sizeof(PASTIX_FLOAT),
                       cudaMemcpyHostToDevice));
  clockInit(&(clk));
  clockStart(&(clk));
  getra_cuda(d_A, lda, d_B, lda, n);
  clockStop(&(clk));
  CUDA_CALL(cudaMemcpy((void*)B_res, d_B, lda*n*sizeof(PASTIX_FLOAT),
                       cudaMemcpyDeviceToHost));
  CUDA_CALL(cudaFree(d_A));
  CUDA_CALL(cudaFree(d_B));
  clockStop(&(clk_wt));
  time_CUDA    = clockVal(&(clk));
  time_CUDA_wt = clockVal(&(clk_wt));
  COMPARE_TIME("GETRA on GPU",               time_CUDA,    ops, time_CPU);
  COMPARE_TIME("GETRA on GPU with transfer", time_CUDA_wt, ops, time_CPU);
  COMPARE_RES(B, B_res);

  memFree_null(A);
  memFree_null(B);
  memFree_null(B_save);
  memFree_null(B_res);
  return EXIT_SUCCESS;
}
/**
 * set device on which the active host thread executes device code
 */
static void set(int dev)
{
    CUDA_CALL(cudaSetDevice(dev));
}
static bool
map_fini (struct ptx_stream *s)
{
  CUDA_CALL (cuMemFreeHost, s->h);
  return true;
}
/**
 * retrieve properties of given device
 */
properties(int dev)
{
    CUDA_CALL(cudaGetDeviceProperties(&prop, dev));
}
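/**
 * usage sketch for the helpers above (assumptions: 'prop' is an accessible
 * cudaDeviceProp member of properties, get()/set()/count() live in the same
 * scope, and <iostream> is included)
 */
static void list_devices()
{
    for (int dev = 0; dev < count(); ++dev) {
        properties p(dev);
        std::cout << dev << ": " << p.prop.name << std::endl;
    }
    set(0);     // make device 0 current for this host thread
}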
RenderTarget::~RenderTarget(void)
{
    CUDA_CALL(cudaGraphicsUnregisterResource(_resource));
}