static void copyOprodFromCPUArrayQuda(FullOprod cudaOprod, void *cpuOprod,
                                      size_t bytes_per_dir, int Vh)
{
  // Use pinned memory
  float2 *packedEven, *packedOdd;
  if (cudaMallocHost(&packedEven, bytes_per_dir) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for packedEven\n");
  }
  if (cudaMallocHost(&packedOdd, bytes_per_dir) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for packedOdd\n");
  }

  for (int dir=0; dir<4; dir++) {
    packOprodFieldDir(packedEven, (float*)cpuOprod, dir, 0, Vh);
    packOprodFieldDir(packedOdd,  (float*)cpuOprod, dir, 1, Vh);

    cudaMemset(cudaOprod.even.data[dir], 0, bytes_per_dir);
    cudaMemset(cudaOprod.odd.data[dir],  0, bytes_per_dir);
    checkCudaError();

    cudaMemcpy(cudaOprod.even.data[dir], packedEven, bytes_per_dir, cudaMemcpyHostToDevice);
    cudaMemcpy(cudaOprod.odd.data[dir],  packedOdd,  bytes_per_dir, cudaMemcpyHostToDevice);
    checkCudaError();
  }

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}
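// The snippets in this section rely on error-check helpers such as checkCudaError()
// and errorQuda(), whose exact signatures vary between projects and are not shown here.
// A minimal sketch, assuming a helper that simply queries and reports the last runtime
// error (the name checkCudaErrorSketch is hypothetical, not the QUDA helper):
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static inline void checkCudaErrorSketch(const char *msg = "")
{
  cudaError_t err = cudaGetLastError();  // fetch and clear the last runtime error
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error %s: %s\n", msg, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}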
void cudaCloverField::createTexObject(cudaTextureObject_t &tex, cudaTextureObject_t &texNorm,
                                      void *field, void *norm)
{
  if (order == QUDA_FLOAT2_CLOVER_ORDER || order == QUDA_FLOAT4_CLOVER_ORDER) {
    // create the texture for the field components
    cudaChannelFormatDesc desc;
    memset(&desc, 0, sizeof(cudaChannelFormatDesc));
    if (precision == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
    else desc.f = cudaChannelFormatKindSigned; // half is short, double is int2

    // always four components regardless of precision
    desc.x = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
    desc.y = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
    desc.z = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;
    desc.w = (precision == QUDA_DOUBLE_PRECISION) ? 8*sizeof(int) : 8*precision;

    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr = field;
    resDesc.res.linear.desc = desc;
    resDesc.res.linear.sizeInBytes = bytes/2;

    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    if (precision == QUDA_HALF_PRECISION) texDesc.readMode = cudaReadModeNormalizedFloat;
    else texDesc.readMode = cudaReadModeElementType;

    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
    checkCudaError();

    // create the texture for the norm components
    if (precision == QUDA_HALF_PRECISION) {
      cudaChannelFormatDesc desc;
      memset(&desc, 0, sizeof(cudaChannelFormatDesc));
      desc.f = cudaChannelFormatKindFloat;
      desc.x = 8*QUDA_SINGLE_PRECISION; desc.y = 0; desc.z = 0; desc.w = 0;

      cudaResourceDesc resDesc;
      memset(&resDesc, 0, sizeof(resDesc));
      resDesc.resType = cudaResourceTypeLinear;
      resDesc.res.linear.devPtr = norm;
      resDesc.res.linear.desc = desc;
      resDesc.res.linear.sizeInBytes = norm_bytes/2;

      cudaTextureDesc texDesc;
      memset(&texDesc, 0, sizeof(texDesc));
      texDesc.readMode = cudaReadModeElementType;

      cudaCreateTextureObject(&texNorm, &resDesc, &texDesc, NULL);
      checkCudaError();
    }
  }
}
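// A hedged usage sketch (not from the QUDA sources): a texture object created as above
// for a QUDA_SINGLE_PRECISION float2 field could be read in a kernel with tex1Dfetch.
// The kernel name, output buffer and launch shape are illustrative assumptions.
__global__ void readCloverTexSketch(cudaTextureObject_t tex, float2 *out, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = tex1Dfetch<float2>(tex, i);  // fetch from the linear-memory texture
}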
void loadOprodToGPU(void *cudaOprodEven, void *cudaOprodOdd, void *oprod, int vol)
{
  checkCudaError();
  int bytes = 4*vol*18*sizeof(float);
  std::cout << "vol = " << vol << std::endl;
  std::cout << "bytes = " << bytes << std::endl;
  checkCudaError();
  loadOprodFromCPUArrayQuda(cudaOprodEven, cudaOprodOdd, oprod, bytes, vol);
}
device::device(int device_id)
{
  cuInit(0);
  checkCudaError("device::device Init");
  cuDeviceGet(&cu_device, device_id);
  checkCudaError("device::device Get device");
  //cuCtxCreate(&cu_context, 0, cu_device);
  //checkCudaError("device::device Create context");
  this->set_device(device_id);
  cudaGetDeviceProperties(&props, device_id);
  checkCudaError("device::device Get properties");
  this->device_name = props.name;
}
void printDeviceColumnMajorMatrix(float *dA, int nrRows, int nrCols)
{
  int size = nrRows * nrCols;
  // use std::vector rather than a variable-length array (non-standard in C++)
  std::vector<float> hA(size);
  checkCudaError(__LINE__, cudaMemcpy(hA.data(), dA, size * sizeof(float), cudaMemcpyDeviceToHost));
  printColumnMajorMatrix(hA.data(), nrRows, nrCols);
}
void cudaCloverField::copy(const CloverField &src, bool inverse)
{
  checkField(src);

  if (typeid(src) == typeid(cudaCloverField)) {
    if (src.V(false)) copyGenericClover(*this, src, false, QUDA_CUDA_FIELD_LOCATION);
    if (src.V(true))  copyGenericClover(*this, src, true,  QUDA_CUDA_FIELD_LOCATION);
  } else if (typeid(src) == typeid(cpuCloverField)) {
    resizeBufferPinned(bytes + norm_bytes);
    void *packClover = bufferPinned;
    void *packCloverNorm = (precision == QUDA_HALF_PRECISION) ? (char*)bufferPinned + bytes : 0;

    if (src.V(false)) {
      copyGenericClover(*this, src, false, QUDA_CPU_FIELD_LOCATION, packClover, 0, packCloverNorm, 0);
      cudaMemcpy(clover, packClover, bytes, cudaMemcpyHostToDevice);
      if (precision == QUDA_HALF_PRECISION)
        cudaMemcpy(norm, packCloverNorm, norm_bytes, cudaMemcpyHostToDevice);
    }

    if (src.V(true) && inverse) {
      copyGenericClover(*this, src, true, QUDA_CPU_FIELD_LOCATION, packClover, 0, packCloverNorm, 0);
      cudaMemcpy(cloverInv, packClover, bytes, cudaMemcpyHostToDevice);
      if (precision == QUDA_HALF_PRECISION)
        cudaMemcpy(invNorm, packCloverNorm, norm_bytes, cudaMemcpyHostToDevice);
    }
  } else {
    errorQuda("Invalid clover field type");
  }

  checkCudaError();
}
cudaChannelFormatDesc getChannelDesc(cudaArray_const_t array)
{
  cudaChannelFormatDesc desc = {};  // zero-initialise so an uninitialised value is never returned
  if (Ctx->isCreated()) {
    checkCudaError(cudaGetChannelDesc(&desc, array));
  }
  return desc;
}
size_t getTextureAlignmentOffset(const textureReference* tex)
{
  size_t offset = 0;
  if (Ctx->isCreated()) {
    checkCudaError(cudaGetTextureAlignmentOffset(&offset, tex));
  }
  return offset;
}
const textureReference* getTextureReference(const void* symbol)
{
  const textureReference* tex = nullptr;
  if (Ctx->isCreated()) {
    checkCudaError(cudaGetTextureReference(&tex, symbol));
  }
  return tex;
}
////////////////////////////////////////////////////////////////////////////////
//! Run the Cuda part of the computation
////////////////////////////////////////////////////////////////////////////////
void runCuda()
{
  cudaStream_t stream = 0;
  const int nbResources = 2;
  cudaGraphicsResource *ppResources[nbResources] = {
    g_histogram.cudaResource,
    g_color.cudaResource,
  };

  // Map resources for Cuda
  checkCudaErrors(cudaGraphicsMapResources(nbResources, ppResources, stream));
  getLastCudaError("cudaGraphicsMapResources(2) failed");

  // Get pointers
  checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&g_histogram.cudaBuffer,
                                                       &g_histogram.size, g_histogram.cudaResource));
  getLastCudaError("cudaGraphicsResourceGetMappedPointer (g_histogram.cudaBuffer) failed");

  cudaGraphicsSubResourceGetMappedArray(&g_color.pCudaArray, g_color.cudaResource, 0, 0);
  getLastCudaError("cudaGraphicsSubResourceGetMappedArray (g_color.pCudaArray) failed");

  // Execute kernel
  createHistogramTex(g_histogram.cudaBuffer, g_WindowWidth, g_WindowHeight, g_color.pCudaArray);
  checkCudaError();

  // Unmap the resources
  checkCudaErrors(cudaGraphicsUnmapResources(nbResources, ppResources, stream));
  getLastCudaError("cudaGraphicsUnmapResources(2) failed");
}
void bindTextureToMipmappedArray(const textureReference* tex, cudaMipmappedArray_const_t mipmappedArray,
                                 const cudaChannelFormatDesc* desc)
{
  if (Ctx->isCreated() && tex && mipmappedArray && desc) {
    checkCudaError(cudaBindTextureToMipmappedArray(tex, mipmappedArray, desc));
  }
}
void CudaDeviceContextPrivate::destroy()
{
  if (isValid()) {
    std::cout << "Reset cuda device" << std::endl;
    checkCudaError(cudaDeviceReset());
    activeDevice = -1;
    activeStream = nullptr;
  }
}
device::device()
{
  cuInit(0);
  checkCudaError("device::device Init");
  cuDeviceGet(&cu_device, 0);
  checkCudaError("device::device Get device");
  //cuCtxCreate(&cu_context, 0, cu_device);
  //checkCudaError("device::device Create context");
  cudaGetDeviceProperties(&props, 0);  // props must be filled before its name is read
  checkCudaError("device::device Get properties");
  device_name = props.name;
}
AsyncCopier::AsyncCopier(size_t bufferSize)
  : bufferSize_(bufferSize),
    buffer_(allocPageLocked(bufferSize)) {
  int deviceCount;
  checkCudaError(cudaGetDeviceCount(&deviceCount), "cudaGetDeviceCount");
  events_.resize(deviceCount);
  freeEvents_.resize(deviceCount);
}
bool CudaDeviceContext::setDevice(int device)
{
  if (device >= 0) {  // device index 0 is a valid CUDA device
    D(CudaDeviceContext);
    checkCudaError(cudaSetDevice(device));
    d->activeDevice = device;
    return true;
  }
  return false;
}
void bindTexture(const textureReference* tex, const void* devPtr, const cudaChannelFormatDesc* desc,
                 size_t* offset, size_t size)
{
  if (Ctx->isCreated() && tex && devPtr && desc) {
    checkCudaError(cudaBindTexture(offset, tex, devPtr, desc, size));
  }
}
AsyncCopier::Event::Event(int d) : device(d), refCount(0) {
  event.emplace();
  checkCudaError(
      cudaEventCreateWithFlags(
          get_pointer(event),
          cudaEventDisableTiming | cudaEventBlockingSync),
      "cudaEventCreateWithFlags");
}
void allocateOprodFields(void **cudaOprodEven, void **cudaOprodOdd, int vol)
{
  int bytes = 4*vol*18*sizeof(float);

  if (cudaMalloc((void **)cudaOprodEven, bytes) == cudaErrorMemoryAllocation) {
    errorQuda("Error allocating even outer product field");
  }
  cudaMemset((*cudaOprodEven), 0, bytes);
  checkCudaError();

  if (cudaMalloc((void **)cudaOprodOdd, bytes) == cudaErrorMemoryAllocation) {
    errorQuda("Error allocating odd outer product field");
  }
  cudaMemset((*cudaOprodOdd), 0, bytes);
  checkCudaError();
}
void masterKendall(const float * x, size_t nx, const float * y, size_t ny,
                   size_t sampleSize, double * results)
{
  size_t
    outputLength = nx * ny,
    outputBytes = outputLength*sizeof(double),
    xBytes = nx*sampleSize*sizeof(float),
    yBytes = ny*sampleSize*sizeof(float);
  float *gpux, *gpuy;
  double *gpuResults;
  dim3 grid(nx, ny), block(NUMTHREADS, NUMTHREADS);

  cudaMalloc((void **)&gpux, xBytes);
  cudaMalloc((void **)&gpuy, yBytes);
  checkCudaError("input vector space allocation");

  cudaMemcpy(gpux, x, xBytes, cudaMemcpyHostToDevice);
  cudaMemcpy(gpuy, y, yBytes, cudaMemcpyHostToDevice);
  checkCudaError("copying input vectors to gpu");

  cudaMalloc((void **)&gpuResults, outputBytes);
  checkCudaError("allocation of space for result matrix");

  void *args[] = { &gpux, &nx, &gpuy, &ny, &sampleSize, &gpuResults };
  cudaLaunch("gpuKendall", args, grid, block);

  cudaFree(gpux);
  cudaFree(gpuy);
  cudaMemcpy(results, gpuResults, outputBytes, cudaMemcpyDeviceToHost);
  cudaFree(gpuResults);
  checkCudaError("copying results from gpu and cleaning up");
}
void bindTexture2D(const textureReference* tex, const void* devPtr, const cudaChannelFormatDesc* desc,
                   size_t* offset, size_t width, size_t height, size_t pitch)
{
  if (Ctx->isCreated() && tex && devPtr && desc) {
    checkCudaError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch));
  }
}
cudaCloverField::~cudaCloverField()
{
  if (clover != cloverInv) {
    if (clover) cudaFree(clover);
    if (norm) cudaFree(norm);
  }
  if (cloverInv) cudaFree(cloverInv);
  if (invNorm) cudaFree(invNorm);
  checkCudaError();
}
static void loadOprodFromCPUArrayQuda(void *cudaOprodEven, void *cudaOprodOdd, void *cpuOprod,
                                      size_t bytes, int Vh)
{
  // Use pinned memory
  float2 *packedEven, *packedOdd;
  checkCudaError();
  if (cudaMallocHost(&packedEven, bytes) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for packedEven\n");
  }
  if (cudaMallocHost(&packedOdd, bytes) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for packedOdd\n");
  }
  checkCudaError();

  packOprodField(packedEven, (float*)cpuOprod, 0, Vh);
  packOprodField(packedOdd,  (float*)cpuOprod, 1, Vh);
  checkCudaError();

  cudaMemset(cudaOprodEven, 0, bytes);
  cudaMemset(cudaOprodOdd, 0, bytes);
  checkCudaError();

  cudaMemcpy(cudaOprodEven, packedEven, bytes, cudaMemcpyHostToDevice);
  checkCudaError();
  cudaMemcpy(cudaOprodOdd, packedOdd, bytes, cudaMemcpyHostToDevice);
  checkCudaError();

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}
void init_GPU_data(GlobalParams &p)
{
  int n_known = p.nRegions;
  int n_total = p.pred_dist.n_rows;
  int n_pred  = n_total - n_known;

  scatr_magma_init();

  arma::mat dist12 = p.pred_dist(arma::span(0, n_known-1),       arma::span(n_known, n_total-1));
  arma::mat dist22 = p.pred_dist(arma::span(n_known, n_total-1), arma::span(n_known, n_total-1));

  checkCublasError( cublasCreate_v2(&p.handle), "handle (Create)" );

  cudaMalloc((void**) &p.d_dist12, n_known*n_pred*sizeof(double));
  checkCudaError("dist12 (Malloc)");
  checkCublasError( cublasSetMatrix(n_known, n_pred, sizeof(double), dist12.memptr(), n_known, p.d_dist12, n_known), "dist12 (Set)" );

  cudaMalloc((void**) &p.d_dist22, n_pred*n_pred * sizeof(double));
  checkCudaError("dist22 (Malloc)");
  checkCublasError( cublasSetMatrix(n_pred, n_pred, sizeof(double), dist22.memptr(), n_pred, p.d_dist22, n_pred), "dist22 (Set)" );

  cudaMalloc((void**) &p.d_cov12, n_known * n_pred * sizeof(double));
  checkCudaError("cov12 (Malloc)");

  cudaMalloc((void**) &p.d_cov22, n_pred * n_pred * sizeof(double));
  checkCudaError("cov22 (Malloc)");

  cudaMalloc((void**) &p.d_invcov11, n_known * n_known * sizeof(double));
  checkCudaError("invcov11 (Malloc)");

  cudaMalloc((void**) &p.d_tmp, n_pred * n_known * sizeof(double));
  checkCudaError("tmp (Malloc)");
}
void device::get_cuda_info()
{
  const int kb = 1024;
  const int mb = kb * kb;

  cudaDeviceProp props;
  cudaError_t error = cudaGetDeviceProperties(&props, this->device_id);
  checkCudaError("device::get_cuda_info Get properties");
  if (error == cudaErrorInvalidDevice) {
    std::cout << "Device does not exist" << std::endl;
  }

  std::cout << props.name << std::endl;
  std::cout << "  Global memory:   " << props.totalGlobalMem / mb << "mb" << std::endl;
  std::cout << "  Shared memory:   " << props.sharedMemPerBlock / kb << "kb" << std::endl;
  std::cout << "  Constant memory: " << props.totalConstMem / kb << "kb" << std::endl;
  std::cout << "  Block registers: " << props.regsPerBlock << std::endl << std::endl;
  std::cout << "  Warp size:             " << props.warpSize << std::endl;
  std::cout << "  Threads per block:     " << props.maxThreadsPerBlock << std::endl;
  std::cout << "  Max block dimensions: [ " << props.maxThreadsDim[0]
            << ", " << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" << std::endl;
  std::cout << "  Max grid dimensions:  [ " << props.maxGridSize[0]
            << ", " << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" << std::endl;
  std::cout << "  Multiprocessor Count:  " << props.multiProcessorCount << std::endl;
  std::cout << std::endl;
  std::cout << "  Unified addressing:    " << props.unifiedAddressing << std::endl;
  std::cout << "  Concurrent kernels:    " << props.concurrentKernels << std::endl;
  std::cout << "  Device Overlap:        " << props.deviceOverlap << std::endl;
  std::cout << "  Memory Clock Rate:     " << props.memoryClockRate << std::endl;
  std::cout << "  Memory Bus Width:      " << props.memoryBusWidth << std::endl;
  std::cout << "  L2 Cache Size:         " << props.l2CacheSize << std::endl;
  std::cout << "  Clock Rate:            " << props.clockRate << std::endl;
  std::cout << "  Exec Time Out:         " << props.kernelExecTimeoutEnabled << std::endl << std::endl;
  std::cout << "  Compute Capability:    " << props.major << "." << props.minor << std::endl;
  std::cout << "  Compute Mode:          " << props.computeMode << std::endl << std::endl;
}
void cudaCloverField::destroyTexObject()
{
  cudaDestroyTextureObject(evenTex);
  cudaDestroyTextureObject(oddTex);
  cudaDestroyTextureObject(evenInvTex);
  cudaDestroyTextureObject(oddInvTex);
  if (precision == QUDA_HALF_PRECISION) {
    cudaDestroyTextureObject(evenNormTex);
    cudaDestroyTextureObject(oddNormTex);     // was evenNormTex twice; the odd-parity objects must also be destroyed
    cudaDestroyTextureObject(evenInvNormTex);
    cudaDestroyTextureObject(oddInvNormTex);
  }
  checkCudaError();
}
int main(int argc, char **argv)
{
  // device memory
  real *psi_d, *z_d;
  size_t fSize = sizeof(real);

  /* grid dimensions */
  unsigned int Nx = 513, Ny = 513;
  // omitting boundaries
  unsigned int nGridPoints = (Nx-2)*(Ny-2);

  cudaMalloc((void **) &psi_d, (nGridPoints+1)*fSize);
  cudaMalloc((void **) &z_d,   (nGridPoints+1)*fSize);

  /* initialization */
  fillArray(psi_d, 0.0, nGridPoints+1);
  fillArray(z_d,   1.0, nGridPoints+1);
  checkCudaError("Initialization of grid");

  // for timing purposes
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  // start timer
  cudaEventRecord(start, 0);

  /* Call the poisson solver: the right-hand side is stored on the device in z_d
   * (make sure the data is copied from CPU to GPU!), and the result is stored in
   * psi_d (on the GPU/device). Here Nx-2 is the width of the grid's interior
   * (without the boundaries). */
  cuPoisson((Nx-2), psi_d, z_d);

  // stop timer
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);

  float computationTime;
  cudaEventElapsedTime(&computationTime, start, stop);  // elapsed time in milliseconds
  printf("Computation time was %.5f seconds.\n\n", computationTime/1000.0);

  printf("Writing result to disk...\n");
  // write result to file
  writeBinaryFile(Nx, Ny, psi_d, "data.dat");
  printf("done\n");

  return EXIT_SUCCESS;
}
static void fetchOprodFromGPUArraysQuda(void *cudaOprodEven, void *cudaOprodOdd, void *cpuOprod,
                                        size_t bytes, int Vh)
{
  float2 *packedEven, *packedOdd;
  if (cudaMallocHost(&packedEven, bytes) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for packedEven\n");
  }
  if (cudaMallocHost(&packedOdd, bytes) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for packedOdd\n");
  }

  cudaMemcpy(packedEven, cudaOprodEven, bytes, cudaMemcpyDeviceToHost);
  checkCudaError();
  cudaMemcpy(packedOdd, cudaOprodOdd, bytes, cudaMemcpyDeviceToHost);
  checkCudaError();

  unpackOprodField((float*)cpuOprod, packedEven, 0, Vh);
  unpackOprodField((float*)cpuOprod, packedOdd, 1, Vh);

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}
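// A hedged round-trip sketch combining allocateOprodFields, loadOprodToGPU and
// fetchOprodFromGPUArraysQuda above, assuming all three are visible in the same
// translation unit (fetchOprodFromGPUArraysQuda is declared static) and that
// cpuOprod points to a host buffer covering both parities (2 * 4 * Vh * 18 floats).
void oprodRoundTripSketch(void *cpuOprod, int Vh)
{
  void *oprodEven = nullptr, *oprodOdd = nullptr;
  allocateOprodFields(&oprodEven, &oprodOdd, Vh);      // allocate and zero the device buffers
  loadOprodToGPU(oprodEven, oprodOdd, cpuOprod, Vh);   // pack and copy host -> device
  size_t bytes = 4 * Vh * 18 * sizeof(float);          // per-parity size used by the loaders
  fetchOprodFromGPUArraysQuda(oprodEven, oprodOdd, cpuOprod, bytes, Vh);  // copy back and unpack
  cudaFree(oprodEven);
  cudaFree(oprodOdd);
}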
void cudaCloverField::destroyTexObject()
{
  if (order == QUDA_FLOAT2_CLOVER_ORDER || order == QUDA_FLOAT4_CLOVER_ORDER) {
    cudaDestroyTextureObject(evenTex);
    cudaDestroyTextureObject(oddTex);
    cudaDestroyTextureObject(evenInvTex);
    cudaDestroyTextureObject(oddInvTex);
    if (precision == QUDA_HALF_PRECISION) {
      cudaDestroyTextureObject(evenNormTex);
      cudaDestroyTextureObject(oddNormTex);
      cudaDestroyTextureObject(evenInvNormTex);
      cudaDestroyTextureObject(oddInvNormTex);
    }
    checkCudaError();
  }
}
cudaCloverField::~cudaCloverField()
{
#ifdef USE_TEXTURE_OBJECTS
  destroyTexObject();
#endif
  if (clover != cloverInv) {
    if (clover) device_free(clover);
    if (norm) device_free(norm);
  }
  if (cloverInv) device_free(cloverInv);
  if (invNorm) device_free(invNorm);
  checkCudaError();
}
cudaCloverField::~cudaCloverField()
{
#ifdef USE_TEXTURE_OBJECTS
  destroyTexObject();
#endif
  if (create != QUDA_REFERENCE_FIELD_CREATE) {
    if (clover != cloverInv) {
      if (clover) device_free(clover);
      if (norm) device_free(norm);
    }
    if (cloverInv) device_free(cloverInv);
    if (invNorm) device_free(invNorm);
  }
  checkCudaError();
}