const char * what() const throw() { return cudaGetErrorString(err_num); }
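The what() override above is only a fragment; below is a minimal, hedged sketch of an exception type it could belong to. The class name cuda_exception and the constructor are assumptions for illustration; only err_num and what() appear in the snippet.
// Hedged sketch, not from the source: one possible wrapper class for the
// what() override above. cuda_exception and its constructor are assumptions.
#include <exception>
#include <cuda_runtime.h>

class cuda_exception : public std::exception {
public:
    explicit cuda_exception(cudaError_t e) : err_num(e) {}
    const char * what() const throw() { return cudaGetErrorString(err_num); }
private:
    cudaError_t err_num;   // the member the original fragment reads from
};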
int APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, PyGpuArrayObject *om, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **output, PyGpuContextObject *c) { cudnnStatus_t err = CUDNN_STATUS_SUCCESS; float af = alpha, bf = beta; void *alpha_p; void *beta_p; if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) { PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size"); return 1; } if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) return 1; if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) return 1; switch (input->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)&alpha; beta_p = (void *)&beta; break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)&af; beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } #ifdef CONV_INPLACE Py_XDECREF(*output); *output = om; Py_INCREF(*output); #else if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om), om->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*output, om)) return 1; #endif if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1) return 1; cudnnConvolutionFwdAlgo_t algo = CONV_ALGO; cuda_enter(c->ctx); #ifdef CHOOSE_ALGO /* Static variables are only initialized once so this will not * reset the previous algo every time */ static int reuse_algo = 0; static cudnnConvolutionFwdAlgo_t prev_algo = CONV_ALGO; #ifndef CHOOSE_ONCE static size_t prev_img_dims[5] = {0}; static size_t prev_kern_dims[5] = {0}; reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]); } #endif if (!reuse_algo) { #ifdef CHOOSE_TIME int count; cudnnConvolutionFwdAlgoPerf_t choice; err = cudnnFindConvolutionForwardAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), 1, &count, &choice); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #else size_t free = 0, total = 0; cudaError_t err2 = cudaMemGetInfo(&free, &total); if (err2 != cudaSuccess) { PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " "memory information on the GPU: %s\n", cudaGetErrorString(err2)); cuda_exit(c->ctx); return 1; } err = cudnnGetConvolutionForwardAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } #endif prev_algo = algo; } else { algo = prev_algo; } #ifdef CHOOSE_ONCE reuse_algo = 1; #else for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); } #endif #endif /* These two algos are not supported for 3d conv */ if (PyGpuArray_NDIM(input) == 5 && (algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM)) algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; #if CUDNN_VERSION > 3000 if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) { int nd; int pad[2]; int stride[2]; int upscale[2]; cudnnConvolutionMode_t mode; err = 
cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (stride[0] != 1 || stride[1] != 1 || PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; } } #endif #if CUDNN_VERSION < 3000 /* cuDNN before v3 does not support kernels larger than input even * if appropriate padding is selected. */ for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) { if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) { PyErr_SetString(PyExc_RuntimeError, "the current version " "of CuDNN does not support kernels larger than the " "inputs in any spatial dimension, even if the inputs " "are padded such that the padded inputs are larger " "than the kernels. Update your installation of CuDNN " "to V3 or more recent to solve the issue."); cuda_exit(c->ctx); return 1; } } #endif { size_t worksize; gpudata *workspace; err = cudnnGetConvolutionForwardWorkspaceSize(APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), algo, &worksize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } /* * This is less than ideal since we need to free it after (which * introduces a synchronization point. But we don't have a module * to place a nice get_work_mem() function in. */ if (worksize != 0) { workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } err = cudnnConvolutionForward( APPLY_SPECIFIC(_handle), alpha_p, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output)); if (worksize != 0) c->ops->buffer_release(workspace); } cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
void oskar_cuda_device_info_scan(oskar_CudaDeviceInfo* device, int id) { int arch, device_count = 0; cudaError_t error; struct cudaDeviceProp device_prop; size_t total_memory = 0, free_memory = 0; /* Set CUDA device. */ cudaSetDevice(id); /* Set default values in case of errors. */ device->name[0] = 0; device->compute.capability.major = 0; device->compute.capability.minor = 0; device->supports_double = 0; device->global_memory_size = 0; device->free_memory = 0; device->num_multiprocessors = 0; device->num_cores = 0; device->gpu_clock = 0; device->memory_clock = 0; device->memory_bus_width = 0; device->level_2_cache_size = 0; device->shared_memory_size = 0; device->num_registers = 0; device->warp_size = 0; device->max_threads_per_block = 0; device->max_threads_dim[0] = 0; device->max_threads_dim[1] = 0; device->max_threads_dim[2] = 0; device->max_grid_size[0] = 0; device->max_grid_size[1] = 0; device->max_grid_size[2] = 0; /* Get device count. */ error = cudaGetDeviceCount(&device_count); if (error != cudaSuccess || device_count == 0) { fprintf(stderr, "Unable to determine number of CUDA devices: %s\n", cudaGetErrorString(error)); return; } /* Check device ID is within range. */ if (id > device_count - 1) { fprintf(stderr, "Error: Device ID out of range.\n"); return; } /* Get device properties. */ cudaGetDeviceProperties(&device_prop, id); strcpy(device->name, device_prop.name); device->compute.capability.major = device_prop.major; device->compute.capability.minor = device_prop.minor; device->supports_double = 0; if (device_prop.major >= 2 || device_prop.minor >= 3) device->supports_double = 1; total_memory = device_prop.totalGlobalMem / 1024; device->global_memory_size = total_memory; device->num_multiprocessors = device_prop.multiProcessorCount; arch = (device_prop.major << 4) + device_prop.minor; switch (arch) { case 0x10: case 0x11: case 0x12: case 0x13: device->num_cores = 8; break; case 0x20: device->num_cores = 32; break; case 0x21: device->num_cores = 48; break; case 0x30: case 0x32: case 0x35: case 0x37: device->num_cores = 192; break; case 0x50: case 0x52: case 0x53: device->num_cores = 128; break; case 0x60: device->num_cores = 64; break; case 0x61: case 0x62: device->num_cores = 128; break; default: device->num_cores = -1; break; } if (device->num_cores > 0) device->num_cores *= device->num_multiprocessors; device->gpu_clock = device_prop.clockRate; #if CUDART_VERSION >= 4000 device->memory_clock = device_prop.memoryClockRate; device->memory_bus_width = device_prop.memoryBusWidth; device->level_2_cache_size = device_prop.l2CacheSize; #else device->memory_clock = -1; device->memory_bus_width = -1; device->level_2_cache_size = -1; #endif /* Get free memory size. */ cudaMemGetInfo(&free_memory, &total_memory); free_memory /= 1024; device->free_memory = free_memory; /* Get block properties. */ device->shared_memory_size = device_prop.sharedMemPerBlock; device->num_registers = device_prop.regsPerBlock; device->warp_size = device_prop.warpSize; device->max_threads_per_block = device_prop.maxThreadsPerBlock; device->max_threads_dim[0] = device_prop.maxThreadsDim[0]; device->max_threads_dim[1] = device_prop.maxThreadsDim[1]; device->max_threads_dim[2] = device_prop.maxThreadsDim[2]; device->max_grid_size[0] = device_prop.maxGridSize[0]; device->max_grid_size[1] = device_prop.maxGridSize[1]; device->max_grid_size[2] = device_prop.maxGridSize[2]; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); shrSetLogFileName ("deviceQuery.txt"); shrLog("%s Starting...\n\n", argv[0]); shrLog(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n"); int deviceCount = 0; cudaError_t error_id = cudaGetDeviceCount(&deviceCount); if (error_id != cudaSuccess) { shrLog( "cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id) ); shrQAFinishExit(*pArgc, (const char **)pArgv, QA_FAILED); } // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) shrLog("There is no device supporting CUDA\n"); else shrLog("Found %d CUDA Capable device(s)\n", deviceCount); int dev, driverVersion = 0, runtimeVersion = 0; for (dev = 0; dev < deviceCount; ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); shrLog("\nDevice %d: \"%s\"\n", dev, deviceProp.name); #if CUDART_VERSION >= 2020 // Console log cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); shrLog(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10); #endif shrLog(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor); char msg[256]; sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem); shrLog(msg); #if CUDART_VERSION >= 2000 shrLog(" (%2d) Multiprocessors x (%3d) CUDA Cores/MP: %d CUDA Cores\n", deviceProp.multiProcessorCount, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); #endif shrLog(" GPU Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f); #if CUDART_VERSION >= 4000 // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output int memoryClock; getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); shrLog(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f); int memBusWidth; getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev ); shrLog(" Memory Bus Width: %d-bit\n", memBusWidth); int L2CacheSize; getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev ); if (L2CacheSize) { shrLog(" L2 Cache Size: %d bytes\n", L2CacheSize); } shrLog(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]); shrLog(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]); #endif shrLog(" Total amount of constant memory: %u bytes\n", deviceProp.totalConstMem); shrLog(" Total amount of shared memory per block: %u bytes\n", deviceProp.sharedMemPerBlock); shrLog(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock); shrLog(" Warp size: %d\n", deviceProp.warpSize); 
shrLog(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor); shrLog(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); shrLog(" Maximum sizes of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); shrLog(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); shrLog(" Maximum memory pitch: %u bytes\n", deviceProp.memPitch); shrLog(" Texture alignment: %u bytes\n", deviceProp.textureAlignment); #if CUDART_VERSION >= 4000 shrLog(" Concurrent copy and execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); #else shrLog(" Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No"); #endif #if CUDART_VERSION >= 2020 shrLog(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); shrLog(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No"); shrLog(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3000 shrLog(" Concurrent kernel execution: %s\n", deviceProp.concurrentKernels ? "Yes" : "No"); shrLog(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3010 shrLog(" Device has ECC support enabled: %s\n", deviceProp.ECCEnabled ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3020 shrLog(" Device is using TCC driver mode: %s\n", deviceProp.tccDriver ? "Yes" : "No"); #endif #if CUDART_VERSION >= 4000 shrLog(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No"); shrLog(" Device PCI Bus ID / PCI location ID: %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID ); #endif #if CUDART_VERSION >= 2020 const char *sComputeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", "Prohibited (no host thread can use ::cudaSetDevice() with this device)", "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", "Unknown", NULL }; shrLog(" Compute Mode:\n"); shrLog(" < %s >\n", sComputeMode[deviceProp.computeMode]); #endif } // csv masterlog info // ***************************** // exe and CUDA driver name shrLog("\n"); std::string sProfileString = "deviceQuery, CUDA Driver = CUDART"; char cTemp[10]; // driver version sProfileString += ", CUDA Driver Version = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #else sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #endif sProfileString += cTemp; // Runtime version sProfileString += ", CUDA Runtime Version = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #else sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #endif sProfileString += cTemp; // Device count sProfileString += ", NumDevs = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d", deviceCount); #else sprintf(cTemp, "%d", deviceCount); #endif sProfileString += cTemp; // First 2 device names, if any for (dev = 0; dev < ((deviceCount > 2) ? 
2 : deviceCount); ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); sProfileString += ", Device = "; sProfileString += deviceProp.name; } sProfileString += "\n"; shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str()); // finish shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
static void _cudaHandleError(cudaError_t err, const char *file, int line)
{
    if (err != cudaSuccess) {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
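Handlers of this shape are normally reached through a call-site macro that injects the file and line; a minimal sketch follows, where the macro name HANDLE_ERROR is an assumption rather than something taken from the source.
/* Hedged sketch: a wrapper macro commonly paired with _cudaHandleError above;
 * the name HANDLE_ERROR is an assumption. It forwards __FILE__ and __LINE__ so
 * the handler can report where the failing CUDA call was made. */
#define HANDLE_ERROR(call) _cudaHandleError((call), __FILE__, __LINE__)

/* Example use (d_buf and n are hypothetical):
 *   HANDLE_ERROR(cudaMalloc((void **)&d_buf, n * sizeof(float)));
 */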
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
    if (cudaSuccess != err)
        error(cudaGetErrorString(err), file, line, func);
}
int APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, PyGpuArrayObject *km, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **kerns, PyGpuContextObject *c) { cudnnStatus_t err = CUDNN_STATUS_SUCCESS; float af = alpha, bf = beta; void *alpha_p; void *beta_p; if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) { PyErr_SetString(PyExc_ValueError, "GpuDnnConv images and kernel must have the same stack size"); return 1; } if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) return 1; if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1) return 1; switch (input->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)&alpha; beta_p = (void *)&beta; break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)&af; beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } #ifdef CONV_INPLACE Py_XDECREF(*kerns); *kerns = km; Py_INCREF(*kerns); #else if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km), km->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*kerns, km)) return 1; #endif if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) return 1; cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO; cuda_enter(c->ctx); #ifdef CHOOSE_ALGO static int reuse_algo = 0; static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO; #ifndef CHOOSE_ONCE static size_t prev_img_dims[5] = {0}; static size_t prev_top_dims[5] = {0}; reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]); } #endif if (!reuse_algo) { #ifdef CHOOSE_TIME int count; cudnnConvolutionBwdFilterAlgoPerf_t choice; err = cudnnFindConvolutionBackwardFilterAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), 1, &count, &choice); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #else size_t free = 0, total = 0; cudaError_t err2 = cudaMemGetInfo(&free, &total); if (err2 != cudaSuccess){ cudaGetLastError(); PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory " "information on the GPU: %s\n", cudaGetErrorString(err2)); cuda_exit(c->ctx); return 1; } err = cudnnGetConvolutionBackwardFilterAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } #endif prev_algo = algo; } else { algo = prev_algo; } #ifdef CHOOSE_ONCE reuse_algo = 1; #else for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_top_dims[i] = PyGpuArray_DIM(output, i); } #endif #endif #if CUDNN_VERSION > 3000 if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) { int nd; int pad[2]; int stride[2]; int upscale[2]; cudnnConvolutionMode_t mode; err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (stride[0] != 1 || stride[1] != 1 || 
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; } } #endif size_t worksize; gpudata *workspace; err = cudnnGetConvolutionBackwardFilterWorkspaceSize( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), algo, &worksize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (worksize != 0) { workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); err = cudnnConvolutionBackwardFilter_v3( APPLY_SPECIFIC(_handle), alpha_p, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns)); if (worksize != 0) c->ops->buffer_release(workspace); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
void CudaSynchronizedMemory<T>::pullFromDevice(size_t nElements)
{
    ASRL_ASSERT_GT_DBG(m_size, 0, "The array is empty");
    if (nElements > m_size)
        nElements = m_size;
    cudaError_t err = cudaMemcpy((void *)m_host, (void *)m_device, nElements * sizeof(T), cudaMemcpyDeviceToHost);
    ASRL_ASSERT_EQ(err, cudaSuccess, "Unable to copy " << typeid(T).name() << " array of size " << m_size
                   << " from device (" << err << "): " << cudaGetErrorString(err));
}
void CudaSynchronizedMemory<T>::pullFromDeviceAsync(cudaStream_t stream, size_t nElements)
{
    ASRL_ASSERT_GT(m_size, 0, "The array is empty");
    ASRL_ASSERT(m_pageLocked, "Asynchronous transfer is only valid for page-locked host memory");
    if (nElements > m_size)
        nElements = m_size;
    cudaError_t err = cudaMemcpyAsync((void *)m_host, (void *)m_device, nElements * sizeof(T), cudaMemcpyDeviceToHost, stream);
    ASRL_ASSERT_EQ(err, cudaSuccess, "Unable to copy " << typeid(T).name() << " array of size " << m_size
                   << " from device. Stream " << stream << ": (" << err << "): " << cudaGetErrorString(err));
}
void compute_process(int agents_total, int nreps, int world_width, int world_height) { int np, pid; MPI_Comm_rank(MPI_COMM_WORLD, &pid); MPI_Comm_size(MPI_COMM_WORLD, &np); int server_process = np - 1; MPI_Status status; /* create a type for struct agent */ const int nitems=5; int blocklengths[5] = {1,1,1,1,1}; MPI_Datatype types[5] = {MPI_INT, MPI_INT, MPI_INT, MPI_FLOAT, MPI_FLOAT}; MPI_Datatype mpi_agent_type; MPI_Aint offsets[5]; offsets[0] = offsetof(agent, id); offsets[1] = offsetof(agent, x); offsets[2] = offsetof(agent, y); offsets[3] = offsetof(agent, z); offsets[4] = offsetof(agent, w); MPI_Type_create_struct(nitems, blocklengths, offsets, types, &mpi_agent_type); MPI_Type_commit(&mpi_agent_type); unsigned int num_bytes = agents_total * sizeof(float4); unsigned int num_halo_points = RADIO * world_width; unsigned int num_halo_bytes = num_halo_points * sizeof(short int); //unsigned int world_node_height = (world_height / (np-1)) + (RADIO * 2); //if(pid == 0 or pid == np - 2) // world_node_height -= RADIO; size_t size_world = world_width * world_height * sizeof(short int); short int *h_world = (short int *)malloc(size_world); *h_world = 0; short int *d_world; for(int j = 0; j < world_width * world_height; j++) { h_world[j] = 0; } /* alloc host memory */ agent *h_agents_in = (agent *)malloc(num_bytes); //agent *d_agents_in; float4 *h_agents_pos; float4 *d_agents_pos; //MPI_Recv(rcv_address, num_points, MPI_FLOAT, server_process, MPI_ANY_TAG, MPI_COMM_WORLD, &status); MPI_Recv(h_agents_in, agents_total, mpi_agent_type, server_process, 0, MPI_COMM_WORLD, &status); //Iniatialize world for( int i = 0; i < agents_total; i++) { h_world[(world_width * (h_agents_in[i].y - 1) ) + h_agents_in[i].x] = (h_agents_in[i].x!=0?1:0); //if(h_world[(world_width * (h_agents_in[i].y - 1) ) + h_agents_in[i].x] == 1) //printf("world x: %d, y: %d\n", h_agents_in[i].x, h_agents_in[i].y); h_agents_pos[i].x = h_agents_in[i].x; h_agents_pos[i].y = h_agents_in[i].y; h_agents_pos[i].z = h_agents_in[i].z; h_agents_pos[i].w = h_agents_in[i].w; } /*** if(pid ==1) { int k=0; for(int j = 0; j < world_width * world_height; j++) { if ( j%96 == 0 and j>0) { k++; printf("%d row: %d\n", h_world[j], k); } else printf("%d ", h_world[j]); } } ***/ // Error code to check return values for CUDA calls cudaError_t err = cudaSuccess; // Allocate the device pointer err = cudaMalloc((void **)&d_world, size_world); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaMemcpy(d_world, h_world, size_world, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } //http://cuda-programming.blogspot.com.es/2013/02/cuda-array-in-cuda-how-to-use-cuda.html //http://stackoverflow.com/questions/17924705/structure-of-arrays-vs-array-of-structures-in-cuda // Allocate the device pointer err = cudaMalloc((void **)&d_agents_pos, num_bytes); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaMemcpy(d_agents_pos, h_agents_pos, num_bytes, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } launch_kernel(d_agents_pos, d_world, world_width, world_height ); MPI_Barrier( MPI_COMM_WORLD); #ifdef DEBUG // 
printf("pid: %d\n", pid); // display_data(h_agents_in, agents_total ); #endif MPI_Send(h_agents_in, agents_total, mpi_agent_type, server_process, DATA_COLLECT, MPI_COMM_WORLD); /* Release resources */ free(h_agents_in); /* free(h_output); cudaFreeHost(h_left_boundary); cudaFreeHost(h_right_boundary); cudaFreeHost(h_left_halo); cudaFreeHost(h_right_halo); cudaFree(d_input); cudaFree(d_output); */ }
void GridGpu::gridding() { cudaError_t status = kernelCall(); if (status != cudaSuccess) qWarning() << cudaGetErrorString(status); }
bool CudaGLVertexBuffer::allocate() { int size = _numElements * _numVertices * sizeof(float); glGenBuffers(1, &_vbo); #if defined(GL_EXT_direct_state_access) if (glNamedBufferDataEXT) { glNamedBufferDataEXT(_vbo, size, 0, GL_DYNAMIC_DRAW); } else { #else { #endif glBindBuffer(GL_ARRAY_BUFFER, _vbo); glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); glBindBuffer(GL_ARRAY_BUFFER, 0); } // register vbo as cuda resource cudaError_t err = cudaGraphicsGLRegisterBuffer( &_cudaResource, _vbo, cudaGraphicsMapFlagsWriteDiscard); if (err != cudaSuccess) return false; return true; } void CudaGLVertexBuffer::map() { if (_devicePtr) return; size_t num_bytes; void *ptr; cudaError_t err = cudaGraphicsMapResources(1, &_cudaResource, 0); if (err != cudaSuccess) Far::Error(Far::FAR_RUNTIME_ERROR, "CudaGLVertexBuffer::map failed.\n%s\n", cudaGetErrorString(err)); err = cudaGraphicsResourceGetMappedPointer(&ptr, &num_bytes, _cudaResource); if (err != cudaSuccess) Far::Error(Far::FAR_RUNTIME_ERROR, "CudaGLVertexBuffer::map failed.\n%s\n", cudaGetErrorString(err)); _devicePtr = ptr; } void CudaGLVertexBuffer::unmap() { if (_devicePtr == NULL) return; cudaError_t err = cudaGraphicsUnmapResources(1, &_cudaResource, 0); if (err != cudaSuccess) Far::Error(Far::FAR_RUNTIME_ERROR, "CudaGLVertexBuffer::unmap failed.\n%s\n", cudaGetErrorString(err)); _devicePtr = NULL; } } // end namespace Osd
void testCuda(int m, int n, int nnz, std::vector<int>& rows, std::vector<int>& cols, std::vector<double>& values, double* matB){ double tol=1e-9; double start, stop, time_to_build, time_to_solve; int cudaDevice = 0; checkCudaErrors(cudaSetDevice(cudaDevice)); cudaDeviceProp prop; cudaGetDeviceProperties(&prop, cudaDevice); printf("Device Number: %d\n", cudaDevice); printf(" Device name: %s\n", prop.name); checkCudaErrors(cudaDeviceReset()); size_t mem_tot = 0; size_t mem_free = 0; cudaMemGetInfo(&mem_free, & mem_tot); printf("\nFree memory: %d", mem_free); MatSparse matA; matA.setSize(m, n); std::vector<int> I, J; std::vector<double> V; for (int k = 0; k < nnz; k++){ double _val = values[k]; int i = rows[k]; int j = cols[k]; if (fabs(_val) > tol){ I.push_back(i-1); J.push_back(j-1); V.push_back(_val); } } start = second(); matA.fromTruples(I, J, V); stop = second(); time_to_build = stop - start; std::cerr << "Time to Build in GPU (second): " << time_to_build << std::endl; // ******************************** GPU SOLVER ******************************** // // --- Initialize cuSPARSE cusolverSpHandle_t cusolver_handle = NULL; checkCudaErrors(cusolverSpCreate(&cusolver_handle)); cusparseHandle_t cusparse_handle = NULL; checkCudaErrors(cusparseCreate(&cusparse_handle)); cudaStream_t cudaStream = NULL; checkCudaErrors(cudaStreamCreate(&cudaStream)); checkCudaErrors(cusolverSpSetStream(cusolver_handle, cudaStream)); checkCudaErrors(cusparseSetStream(cusparse_handle, cudaStream)); cusparseMatDescr_t descrA; checkCudaErrors(cusparseCreateMatDescr(&descrA)); checkCudaErrors(cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); checkCudaErrors(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); printf("\nAlloc GPU memory...\n"); double *d_A; checkCudaErrors(cudaMalloc(&d_A, nnz * sizeof(double))); int *d_A_RowIndices; checkCudaErrors(cudaMalloc(&d_A_RowIndices, (m + 1) * sizeof(int))); int *d_A_ColIndices; checkCudaErrors(cudaMalloc(&d_A_ColIndices, nnz * sizeof(int))); double *d_x; checkCudaErrors(cudaMalloc(&d_x, m * sizeof(double))); double *d_b; checkCudaErrors(cudaMalloc(&d_b, m * sizeof(double))); printf("\nError: %s", cudaGetErrorString(cudaGetLastError())); printf("\nCopying data...\n"); checkCudaErrors(cudaMemcpy(d_A, matA.valuesPtr(), nnz * sizeof(double), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_A_RowIndices, matA.RowPtr(), (m + 1) * sizeof(int), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_A_ColIndices, matA.ColIdxPtr(), nnz * sizeof(int), cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_b, matB, m * sizeof(double), cudaMemcpyHostToDevice)); double *h_x = (double *)malloc(m * sizeof(double)); printf("\nError: %s", cudaGetErrorString(cudaGetLastError())); cudaMemGetInfo(&mem_free, &mem_tot); printf("\nFree memory: %d", mem_free); int reorder = 0; int singularity = 0; start = second(); //checkCudaErrors(cusolverSpDcsrlsvluHost(cusolver_handle, Nrows, nnz, descrA, sparse.Values(), // sparse.RowPtr(), sparse.ColIdx(), mB.values, tol, reorder, h_x, &singularity)); checkCudaErrors(cusolverSpDcsrlsvqr(cusolver_handle, m, nnz, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_b, tol, reorder, d_x, &singularity)); checkCudaErrors(cudaDeviceSynchronize()); stop = second(); time_to_solve = stop - start; checkCudaErrors(cudaMemcpy(h_x, d_x, m * sizeof(double), cudaMemcpyDeviceToHost)); // for (int k=0; k<mA.getNumRows(); k++) solution[k] = h_x[k]; checkCudaErrors(cusparseDestroy(cusparse_handle)); checkCudaErrors(cusolverSpDestroy(cusolver_handle)); 
checkCudaErrors(cudaStreamDestroy(cudaStream)); checkCudaErrors(cudaFree(d_b)); checkCudaErrors(cudaFree(d_x)); checkCudaErrors(cudaFree(d_A)); checkCudaErrors(cudaFree(d_A_RowIndices)); checkCudaErrors(cudaFree(d_A_ColIndices)); free(h_x); std::cerr << "Time to Build in GPU (second): " << time_to_build << std::endl; std::cerr << "Time to Solve in GPU (second): " << time_to_solve << std::endl; std::cerr << "done!"; // ****************************************************************************** // }
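The solver call above stores the singularity flag from cusolverSpDcsrlsvqr but never inspects it; the short, hedged check below shows the usual follow-up (the warning text is illustrative, not from the source).
// Hedged sketch: cusolverSpDcsrlsvqr sets singularity to -1 when the matrix is
// invertible under the given tolerance, otherwise to the row index where a
// zero pivot was met, so it is worth checking after the solve.
if (singularity >= 0) {
    fprintf(stderr, "WARNING: matrix is singular at row %d under tol %g\n",
            singularity, tol);
}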
const char* WINAPI wine_cudaGetErrorString(cudaError_t error) { WINE_TRACE("\n"); return cudaGetErrorString(error); }
static void cuda_error(int line, cudaError_t code) { const char *err_str = cudaGetErrorString(code); error("cuda error: %d %s \n", line, err_str); }
//////////////////////////////////////////////////////////////////////////////// //! Run the Cuda part of the computation //////////////////////////////////////////////////////////////////////////////// void refreshData(struct cudaGraphicsResource **vbo_resource) { int np; MPI_Comm_size(MPI_COMM_WORLD, &np); MPI_Status status; int num_comp_nodes = np -1; unsigned int num_bytes = sizeof(sAgents); sAgents h_agents_in, h_agents_out[num_comp_nodes]; size_t size_agents = agents_total * sizeof(float4); // map OpenGL buffer object for writing from CUDA float4 *d_agents; // Error code to check return values for CUDA calls cudaError_t err = cudaSuccess; checkCudaErrors(cudaGraphicsMapResources(1, vbo_resource, 0)); size_t num_agents_bytes; checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_agents, &num_agents_bytes, *vbo_resource)); //for(int process = 0; process < num_comp_nodes; process++) //while(1) //{ /* Wait for nodes to compute */ MPI_Barrier(MPI_COMM_WORLD); for(int process = 0; process < num_comp_nodes; process++) { MPI_Recv(&h_agents_out[process], num_bytes, MPI_BYTE, process, DATA_COLLECT, MPI_COMM_WORLD, &status); for( int i = 0; i < agents_total; i++) { if( h_agents_out[process].ids[i].y == 1 ) { h_agents_in.pos[i] = h_agents_out[process].pos[i]; h_agents_in.ids[i] = h_agents_out[process].ids[i]; h_agents_in.pos[i].z = 0; h_agents_in.pos[i].w = 1; } } } #ifdef DEBUG printf("Final Data:\n"); display_data(h_agents_in); #endif /*** //pasar los valores al vbo glBindBuffer(GL_ARRAY_BUFFER, vbo); unsigned int size = agent_width * agent_height * 4 * sizeof(float); glBufferSubData(GL_ARRAY_BUFFER, 0, size, &h_agents_in.pos); glBindBuffer(GL_ARRAY_BUFFER, 0); ***/ // Copy the host pointer memory to the device memory printf("Copy pointer from the host memory to the CUDA device\n"); err = cudaMemcpy(d_agents, h_agents_in.pos, size_agents, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } //glutPostRedisplay(); printf("sale de refreshData:\n"); //} // unmap buffer object checkCudaErrors(cudaGraphicsUnmapResources(1, vbo_resource, 0)); /* release resources */ //free(&h_agents_in); //free(&h_agents_out); }
int main(int argc, char **argv) { printf("Computing Game Of Life On %d x %d Board.\n", DIM_X, DIM_Y); int *host_current, *host_future, *host_future_naive, *host_future_cached; int *gpu_current, *gpu_future; clock_t start, stop; cudaMallocHost((void**) &host_current, DIM_X * DIM_Y * sizeof(int)); cudaMallocHost((void**) &host_future, DIM_X * DIM_Y * sizeof(int)); cudaMallocHost((void**) &host_future_naive, DIM_X * DIM_Y * sizeof(int)); cudaMallocHost((void**) &host_future_cached, DIM_X * DIM_Y * sizeof(int)); assert(cudaGetLastError() == cudaSuccess); cudaMalloc((void**) &gpu_current, DIM_X * DIM_Y * sizeof(int)); cudaMalloc((void**) &gpu_future, DIM_X * DIM_Y * sizeof(int)); printf("%s\n", cudaGetErrorString(cudaGetLastError())); assert(cudaGetLastError() == cudaSuccess); fill_board(host_current, 40); add_glider(host_current); cudaMemcpy(gpu_current, host_current, DIM_X * DIM_Y * sizeof(int), cudaMemcpyHostToDevice); // print_board(host_current); float time_naive, time_cached, time_cpu; for(int i = 1; i < STEPS; i++) { printf("=========\n"); start = clock(); naive_game_of_life_wrapper(gpu_current, gpu_future); cudaMemcpy(host_future_naive, gpu_future, DIM_X * DIM_Y * sizeof(int), cudaMemcpyDeviceToHost); stop = clock(); time_naive = (float)(stop - start)/CLOCKS_PER_SEC; printf("Time for Naive GPU To Compute Next Phase: %.5f s\n", time_naive); start = clock(); cached_game_of_life_wrapper(gpu_current, gpu_future); cudaMemcpy(host_future_cached, gpu_future, DIM_X * DIM_Y * sizeof(int), cudaMemcpyDeviceToHost); stop = clock(); time_cached = (float)(stop - start)/CLOCKS_PER_SEC; printf("Time for Cached GPU To Compute Next Phase: %.5f s\n", time_cached); start = clock(); update_board(host_current, host_future); stop = clock(); time_cpu = (float)(stop - start)/CLOCKS_PER_SEC; printf("Time for CPU To Compute Next Phase: %.5f s\n", time_cpu); printf("speedup for naive = %.2f; speedup for cached = %.2f; speedup for cached over naive = %.2f\n", time_cpu/time_naive, time_cpu/time_cached, time_naive/time_cached); check_boards(host_future, host_future_naive); check_boards(host_future, host_future_cached); int *temp; temp = host_current; host_current = host_future; host_future = temp; temp = gpu_current; gpu_current = gpu_future; gpu_future = temp; } cudaFree(host_future); cudaFree(host_future_naive); cudaFree(host_future_cached); cudaFree(host_current); cudaFree(gpu_current); cudaFree(gpu_future); return 0; }
void compute_process() { int np, pid; MPI_Comm_rank(MPI_COMM_WORLD, &pid); MPI_Comm_size(MPI_COMM_WORLD, &np); int server_process = np - 1; MPI_Status status; int num_comp_nodes = np -1; unsigned int num_bytes = sizeof(sAgents); unsigned int num_halo_points = RADIO * world_width; unsigned int num_halo_bytes = num_halo_points * sizeof(int); size_t size_world = world_width * world_height * sizeof(int); int *h_world = (int *)malloc(size_world); int *d_world; int left_neighbor = (pid > 0) ? (pid - 1) : MPI_PROC_NULL; int right_neighbor = (pid < np -2) ? (pid + 1) : MPI_PROC_NULL; for(int j = 0; j < world_width * world_height; j++) { h_world[j] = 0; } sAgents h_agents_in, h_agents_left_node, h_agents_right_node; float4 h_agents_pos[agents_total], h_agents_ids[agents_total]; float4 *d_agents_pos, *d_agents_ids; unsigned int num_bytes_agents = agents_total * sizeof(float4); int world_height_node = world_height / num_comp_nodes; // Error code to check return values for CUDA calls cudaError_t err = cudaSuccess; // Allocate the device pointer err = cudaMalloc((void **)&d_world, size_world); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaMalloc((void **)&d_agents_pos, num_bytes_agents); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaMalloc((void **)&d_agents_ids, num_bytes_agents); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device pointer (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } MPI_Recv(&h_agents_in, num_bytes, MPI_BYTE, server_process, 0, MPI_COMM_WORLD, &status); for(int i = 0; i < agents_total; i++) { //identify the active agents according to the y coordinate and set the busy cells in the world if( ( round(h_agents_in.pos[i].y) >= (pid * world_height_node) ) and ( round(h_agents_in.pos[i].y) < ( (pid + 1) * world_height_node ) ) ) { h_agents_in.ids[i].y = 1; h_world[(int)round( (world_width * (h_agents_in.pos[i].y - 1) ) + h_agents_in.pos[i].x )] = h_agents_in.ids[i].x; } //Copy the data to a local arrays h_agents_pos[i] = h_agents_in.pos[i]; h_agents_ids[i] = h_agents_in.ids[i]; } err = cudaMemcpy(d_world, h_world, size_world, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } //for(int it = 0; it < nreps ; it++) while(1) { int it=4; err = cudaMemcpy(d_agents_pos, h_agents_pos, num_bytes_agents, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaMemcpy(d_agents_ids, h_agents_ids, num_bytes_agents, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy pointer from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } launch_kernel(d_agents_pos, d_agents_ids, d_world, world_width, world_height, agent_width, agent_height, world_height_node, pid ); cudaMemcpy(h_agents_pos, d_agents_pos, num_bytes_agents, cudaMemcpyDeviceToHost); cudaMemcpy(h_agents_ids, d_agents_ids, num_bytes_agents, cudaMemcpyDeviceToHost); //copy the data to the struct for( int i = 0; i < agents_total; i++) { h_agents_in.pos[i] = h_agents_pos[i]; h_agents_in.ids[i] = h_agents_ids[i]; } MPI_Barrier(MPI_COMM_WORLD); MPI_Send(&h_agents_in, num_bytes, 
MPI_BYTE, server_process, DATA_COLLECT, MPI_COMM_WORLD); #ifdef DEBUG //printf("pid: %d\n", pid); //display_data(h_agents_in); #endif // send data to left, get data from right MPI_Sendrecv(&h_agents_in, num_bytes, MPI_BYTE, left_neighbor, it, &h_agents_right_node, num_bytes, MPI_BYTE, right_neighbor, it, MPI_COMM_WORLD, &status); // send data to right, get data from left MPI_Sendrecv(&h_agents_in, num_bytes, MPI_BYTE, right_neighbor, it, &h_agents_left_node, num_bytes, MPI_BYTE, left_neighbor, it, MPI_COMM_WORLD, &status); for( int i = 0; i < agents_total; i++) { if(pid != np-2) { if(h_agents_right_node.ids[i].y == 2) { h_agents_in.pos[i] = h_agents_right_node.pos[i]; h_agents_pos[i] = h_agents_right_node.pos[i]; h_agents_in.ids[i].y = 1; h_agents_ids[i].y = 1; } } if(pid != 0) { if(h_agents_left_node.ids[i].y == 3) { h_agents_in.pos[i] = h_agents_left_node.pos[i]; h_agents_pos[i] = h_agents_left_node.pos[i]; h_agents_in.ids[i].y = 1; h_agents_ids[i].y = 1; } } } /*** if(pid == 1) { printf("pid: %d\n", pid); display_data(h_agents_in); display_data(h_agents_right_node); display_data(h_agents_left_node); } ***/ } /* Release resources */ // free(h_agents_in); /* free(h_output); cudaFreeHost(h_left_boundary); cudaFreeHost(h_right_boundary); cudaFreeHost(h_left_halo); cudaFreeHost(h_right_halo); cudaFree(d_input); cudaFree(d_output); */ }
int main(int argc, char *argv[]) { // needed to work correctly with piped benchmarkrunner setlinebuf(stdout); setlinebuf(stdin); int n_indices = 1; int n_dimensions = 1; char inBuf[200]; // ridiculously large input buffer. bool isFirst = true; do { // Allocate memory for the arrays int *h_indices = 0; double *h_outputGPU = 0; try { h_indices = new int [n_indices * n_dimensions]; h_outputGPU = new double [n_indices * n_dimensions]; } catch (std::exception e) { std::cerr << "Caught exception: " << e.what() << std::endl; std::cerr << "Unable to allocate CPU memory (try running with fewer vectors/dimensions)" << std::endl; return -1; } int *d_indices; double *d_output; try { cudaError_t cudaResult; cudaResult = cudaMalloc((void **)&d_indices, n_dimensions * n_indices * sizeof(int)); if (cudaResult != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(cudaResult)); } } catch (std::runtime_error e) { std::cerr << "Caught exception: " << e.what() << std::endl; std::cerr << "Unable to allocate GPU memory (try running with fewer vectors/dimensions)" << std::endl; return -1; } // Initialize the indices (done on the host) for(int i = 0; i < n_indices; i++) { h_indices[i] = i; } // Copy the indices to the device cudaMemcpy(d_indices, h_indices, n_dimensions * n_indices * sizeof(int), cudaMemcpyHostToDevice); cudaDeviceSynchronize(); // Execute the QRNG on the device int n_vec; sobol_nikola_unsimplified(n_indices, d_indices, n_indices, &d_output, &n_vec); cudaDeviceSynchronize(); cudaMemcpy(h_outputGPU, d_output, n_indices * n_dimensions * sizeof(double), cudaMemcpyDeviceToHost); // Cleanup and terminate delete h_indices; cudaFree(d_indices); cudaFree(d_output); if(!isFirst) { printf("RESULT "); for(int i = 0; i < std::min(n_indices,10); i++) printf("%f ", h_outputGPU[i]); printf("\n"); } else { printf("OK\n"); isFirst = false; } delete h_outputGPU; fgets(inBuf, 200, stdin); if (sscanf(inBuf, "%u", &n_indices) == 0) { // if input is not a number, it has to be "EXIT" if (strncmp("EXIT",inBuf,4)==0) { printf("OK\n"); break; } else { printf("ERROR. Bad input: %s\n", inBuf); break; } } } while (true); cudaDeviceReset(); return -1; }
void check_error(cudaError_t ret) const { if( ret != cudaSuccess ) { throw std::runtime_error(cudaGetErrorString(ret)); } }
//-------------------------------------------------------------------------- // CUDA init //-------------------------------------------------------------------------- bool CUDAContext::configInit( ) { #ifdef EQUALIZER_USE_CUDA cudaDeviceProp props; uint32_t device = getPipe()->getDevice(); // Setup the CUDA device if( device == LB_UNDEFINED_UINT32 ) { device = _getFastestDeviceID(); LBWARN << "No CUDA device, using the fastest device: " << device << std::endl; } int device_count = 0; cudaGetDeviceCount( &device_count ); LBINFO << "CUDA devices found: " << device_count << std::endl; LBASSERT( static_cast< uint32_t >( device_count ) > device ); if( static_cast< uint32_t >( device_count ) <= device ) { sendError( ERROR_CUDACONTEXT_DEVICE_NOTFOUND ) << lexical_cast< std::string >( device ); return false; } // We assume GL interop here, otherwise use cudaSetDevice( device ); // Attention: this call requires a valid GL context! cudaGLSetGLDevice( device ); int usedDevice = static_cast< int >( device ); #ifdef _WIN32 HGPUNV handle = 0; if( !WGLEW_NV_gpu_affinity ) { LBWARN <<"WGL_NV_gpu_affinity unsupported, ignoring device setting" << std::endl; return true; } if( !wglEnumGpusNV( device, &handle )) { LBWARN << "wglEnumGpusNV failed : " << lunchbox::sysError << std::endl; return false; } cudaWGLGetDevice( &usedDevice, handle ); #else cudaGetDevice( &usedDevice ); #endif LBASSERT( device == static_cast< uint32_t >( device )); cudaGetDeviceProperties( &props, usedDevice ); cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { sendError( ERROR_CUDACONTEXT_INIT_FAILED ) << std::string( cudaGetErrorString( err )); return false; } LBINFO << "Using CUDA device: " << device << std::endl; return true; #else sendError( ERROR_CUDACONTEXT_MISSING_SUPPORT ); return false; #endif }
static inline void checkCudaError(cudaError_t err, const char* file, const int line, const char* func) { if (cudaSuccess != err) cv::error(cv::Error::GpuApiCallError, cudaGetErrorString(err), func, file, line); }
void ControlCubeCache::_reSizeCache() { _nLevels = _nextnLevels; _levelCube = _nextLevelCube; _offset = _nextOffset; _nextnLevels = 0; _nextLevelCube = 0; _dimCube = exp2(_nLevels - _levelCube) + 2 * CUBE_INC; _sizeElement = pow(_dimCube, 3); int dimV = exp2(_nLevels); _minValue = coordinateToIndex(vmml::vector<3,int>(0,0,0), _levelCube, _nLevels); _maxValue = coordinateToIndex(vmml::vector<3,int>(dimV-1,dimV-1,dimV-1), _levelCube, _nLevels); int dc = exp2(_nLevels - _levelCube); vmml::vector<3,int> mn = _cpuCache->getMinCoord(); vmml::vector<3,int> mx = _cpuCache->getMaxCoord(); _maxC = mx - mn; if ((mx.x() - mn.x()) % dc != 0) _maxC[0] += dc; if ((mx.y() - mn.y()) % dc != 0) _maxC[1] += dc; if ((mx.z() - mn.z()) % dc != 0) _maxC[2] += dc; if (cudaSuccess != cudaSetDevice(_device)) { std::cerr<<"Control Cube Cache, error setting device: "<<cudaGetErrorString(cudaGetLastError())<<std::endl; throw; } if (_memory != 0) if (cudaSuccess != cudaFree((void*)_memory)) { std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl; throw; } size_t total = 0; size_t free = 0; if (cudaSuccess != cudaMemGetInfo(&free, &total)) { std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl; throw; } float memorySize = (0.80f*free); // Get 80% of free memory _maxNumCubes = memorySize/ (_sizeElement*sizeof(float)); if (_maxNumCubes == 0) { std::cerr<<"Control Cube Cache: Memory aviable is not enough "<<memorySize/1024/1024<<" MB"<<std::endl; throw; } if (cudaSuccess != cudaMalloc((void**)&_memory, _maxNumCubes*_sizeElement*sizeof(float))) { std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl; throw; } _freeSlots = _maxNumCubes; ControlElementCache::_reSizeCache(); }
static void HandleError(const char *file, int line, cudaError_t err)
{
    printf("ERROR in %s:%d: %s (%d)\n", file, line, cudaGetErrorString(err), err);
    exit(1);
}
void cudaAssert(cudaError code, const char *file, int line)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "cudaErrChk: %s %s %d\n", cudaGetErrorString(code), file, line);
        //exit(code);
    }
}
void MFNHashTypePlainCUDA::freeThreadAndDeviceMemory() { trace_printf("MFNHashTypePlainCUDA::freeThreadAndDeviceMemory()\n"); cudaError_t err; // Free all the memory, then look for errors. cudaFree((void *)this->DeviceHashlistAddress); cudaFreeHost((void *)this->HostSuccessAddress); delete[] this->HostSuccessReportedAddress; // Only cudaFree if zeroCopy is in use. if (!this->useZeroCopy) { cudaFree((void *)this->DeviceSuccessAddress); cudaFree((void *)this->DeviceFoundPasswordsAddress); } cudaFreeHost((void *)this->HostFoundPasswordsAddress); cudaFreeHost((void*)this->HostStartPointAddress); cudaFree((void *)this->DeviceStartPointAddress); cudaFree((void *)this->DeviceStartPasswords32Address); // Free salted hashes if in use. if (this->hashAttributes.hashUsesWordlist) { cudaFree((void *)this->DeviceWordlistBlocks); cudaFree((void *)this->DeviceWordlistLengths); } if (this->hashAttributes.hashUsesSalt) { cudaFree((void *)this->DeviceSaltLengthsAddress); cudaFree((void *)this->DeviceSaltValuesAddress); } // Only free the bitmap memory if it has been allocated. if (this->DeviceBitmap256kb_Address) { cudaFree((void *)this->DeviceBitmap256kb_Address); this->DeviceBitmap256kb_Address = 0; } if (this->DeviceBitmap128mb_a_Address) { cudaFree((void *)this->DeviceBitmap128mb_a_Address); this->DeviceBitmap128mb_a_Address = 0; } if (this->DeviceBitmap128mb_b_Address) { cudaFree((void *)this->DeviceBitmap128mb_b_Address); this->DeviceBitmap128mb_b_Address = 0; } if (this->DeviceBitmap128mb_c_Address) { cudaFree((void *)this->DeviceBitmap128mb_c_Address); this->DeviceBitmap128mb_c_Address = 0; } if (this->DeviceBitmap128mb_d_Address) { cudaFree((void *)this->DeviceBitmap128mb_d_Address); this->DeviceBitmap128mb_d_Address = 0; } // Get any error that occurred above and report it. err = cudaGetLastError(); if (err != cudaSuccess) { printf("Thread %d: CUDA error freeing memory: %s. Exiting.\n", this->threadId, cudaGetErrorString( err)); exit(1); } }
void gpu_data:: set_size( size_t new_size ) { if (new_size == 0) { if (device_in_use) { // Wait for any possible CUDA kernels that might be using our memory block to // complete before we free the memory. synchronize_stream(0); device_in_use = false; } wait_for_transfer_to_finish(); data_size = 0; host_current = true; device_current = true; device_in_use = false; data_host.reset(); data_device.reset(); } else if (new_size != data_size) { if (device_in_use) { // Wait for any possible CUDA kernels that might be using our memory block to // complete before we free the memory. synchronize_stream(0); device_in_use = false; } wait_for_transfer_to_finish(); data_size = new_size; host_current = true; device_current = true; device_in_use = false; try { CHECK_CUDA(cudaGetDevice(&the_device_id)); // free memory blocks before we allocate new ones. data_host.reset(); data_device.reset(); void* data; CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float))); // Note that we don't throw exceptions since the free calls are invariably // called in destructors. They also shouldn't fail anyway unless someone // is resetting the GPU card in the middle of their program. data_host.reset((float*)data, [](float* ptr){ auto err = cudaFreeHost(ptr); if(err!=cudaSuccess) std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl; }); CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float))); data_device.reset((float*)data, [](float* ptr){ auto err = cudaFree(ptr); if(err!=cudaSuccess) std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl; }); if (!cuda_stream) { cudaStream_t cstream; CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking)); cuda_stream.reset(cstream, [](void* ptr){ auto err = cudaStreamDestroy((cudaStream_t)ptr); if(err!=cudaSuccess) std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl; }); } } catch(...) { set_size(0); throw; } } }
extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, uint32_t *ptarget, uint32_t max_nonce, uint32_t *hashes_done) { static THREAD uint32_t *foundNounce = nullptr; uint32_t start_nonce = pdata[19]; unsigned int intensity = (device_sm[device_map[thr_id]] > 500) ? 24 : 23; uint32_t throughputmax = device_intensity(device_map[thr_id], __func__, 1U << intensity); uint32_t throughput = min(throughputmax, max_nonce - start_nonce) & 0xfffffc00; if (opt_benchmark) ptarget[7] = 0x0000000f; // init static THREAD volatile bool init = false; if(!init) { CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id])); groestlcoin_cpu_init(thr_id, throughputmax); CUDA_SAFE_CALL(cudaMallocHost(&foundNounce, 2 * 4)); init = true; } // The endian swap is required uint32_t endiandata[32]; for (int kk=0; kk < 32; kk++) be32enc(&endiandata[kk], pdata[kk]); // Prepare the context with the endian-swapped block header (the nonce is replaced later) groestlcoin_cpu_setBlock(thr_id, endiandata); do { // GPU const uint32_t Htarg = ptarget[7]; groestlcoin_cpu_hash(thr_id, throughput, pdata[19], foundNounce, ptarget[7]); if(stop_mining) {mining_has_stopped[thr_id] = true; cudaStreamDestroy(gpustream[thr_id]); pthread_exit(nullptr);} if(foundNounce[0] < 0xffffffff) { uint32_t tmpHash[8]; endiandata[19] = SWAP32(foundNounce[0]); groestlhash(tmpHash, endiandata); if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) { int res = 1; if(opt_benchmark) applog(LOG_INFO, "GPU #%d Found nonce %08x", device_map[thr_id], foundNounce[0]); *hashes_done = pdata[19] - start_nonce + throughput; if(foundNounce[1] != 0xffffffff) { endiandata[19] = SWAP32(foundNounce[1]); groestlhash(tmpHash, endiandata); if(tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) { pdata[21] = foundNounce[1]; res++; if(opt_benchmark) applog(LOG_INFO, "GPU #%d Found second nonce %08x", device_map[thr_id], foundNounce[1]); } else { if(tmpHash[7] != Htarg) { applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[1]); } } } pdata[19] = foundNounce[0]; return res; } else { if(tmpHash[7] != Htarg) { applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNounce[0]); } } } pdata[19] += throughput; cudaError_t err = cudaGetLastError(); if(err != cudaSuccess) { applog(LOG_ERR, "GPU #%d: %s", device_map[thr_id], cudaGetErrorString(err)); exit(EXIT_FAILURE); } } while(!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); *hashes_done = pdata[19] - start_nonce; return 0; }
PetscErrorCode PetscOptionsCheckInitial_Private(void) { char string[64],mname[PETSC_MAX_PATH_LEN],*f; MPI_Comm comm = PETSC_COMM_WORLD; PetscBool flg1 = PETSC_FALSE,flg2 = PETSC_FALSE,flg3 = PETSC_FALSE,flag; PetscErrorCode ierr; PetscReal si; PetscInt intensity; int i; PetscMPIInt rank; char version[256]; #if !defined(PETSC_HAVE_THREADSAFETY) PetscReal logthreshold; PetscBool flg4 = PETSC_FALSE; #endif PetscFunctionBegin; ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr); #if !defined(PETSC_HAVE_THREADSAFETY) /* Setup the memory management; support for tracing malloc() usage */ ierr = PetscOptionsHasName(NULL,"-malloc_log",&flg3);CHKERRQ(ierr); logthreshold = 0.0; ierr = PetscOptionsGetReal(NULL,"-malloc_log_threshold",&logthreshold,&flg1);CHKERRQ(ierr); if (flg1) flg3 = PETSC_TRUE; #if defined(PETSC_USE_DEBUG) ierr = PetscOptionsGetBool(NULL,"-malloc",&flg1,&flg2);CHKERRQ(ierr); if ((!flg2 || flg1) && !petscsetmallocvisited) { if (flg2 || !(PETSC_RUNNING_ON_VALGRIND)) { /* turn off default -malloc if valgrind is being used */ ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr); } } #else ierr = PetscOptionsGetBool(NULL,"-malloc_dump",&flg1,NULL);CHKERRQ(ierr); ierr = PetscOptionsGetBool(NULL,"-malloc",&flg2,NULL);CHKERRQ(ierr); if (flg1 || flg2 || flg3) {ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr);} #endif if (flg3) { ierr = PetscMallocSetDumpLogThreshold((PetscLogDouble)logthreshold);CHKERRQ(ierr); } flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-malloc_debug",&flg1,NULL);CHKERRQ(ierr); if (flg1) { ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr); ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr); } flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-malloc_test",&flg1,NULL);CHKERRQ(ierr); #if defined(PETSC_USE_DEBUG) if (flg1 && !PETSC_RUNNING_ON_VALGRIND) { ierr = PetscSetUseTrMalloc_Private();CHKERRQ(ierr); ierr = PetscMallocSetDumpLog();CHKERRQ(ierr); ierr = PetscMallocDebug(PETSC_TRUE);CHKERRQ(ierr); } #endif flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-malloc_info",&flg1,NULL);CHKERRQ(ierr); if (!flg1) { flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-memory_view",&flg1,NULL);CHKERRQ(ierr); } if (flg1) { ierr = PetscMemorySetGetMaximumUsage();CHKERRQ(ierr); } #endif #if defined(PETSC_USE_LOG) ierr = PetscOptionsHasName(NULL,"-objects_dump",&PetscObjectsLog);CHKERRQ(ierr); #endif /* Set the display variable for graphics */ ierr = PetscSetDisplay();CHKERRQ(ierr); /* Print the PETSc version information */ ierr = PetscOptionsHasName(NULL,"-v",&flg1);CHKERRQ(ierr); ierr = PetscOptionsHasName(NULL,"-version",&flg2);CHKERRQ(ierr); ierr = PetscOptionsHasName(NULL,"-help",&flg3);CHKERRQ(ierr); if (flg1 || flg2 || flg3) { /* Print "higher-level" package version message */ if (PetscExternalVersionFunction) { ierr = (*PetscExternalVersionFunction)(comm);CHKERRQ(ierr); } ierr = PetscGetVersion(version,256);CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\ ------------------------------\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"%s\n",version);CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"%s",PETSC_AUTHOR_INFO);CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"See docs/changes/index.html for recent updates.\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"See docs/faq.html for problems.\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"See docs/manualpages/index.html for help. 
\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"Libraries linked from %s\n",PETSC_LIB_DIR);CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"--------------------------------------------\ ------------------------------\n");CHKERRQ(ierr); } /* Print "higher-level" package help message */ if (flg3) { if (PetscExternalHelpFunction) { ierr = (*PetscExternalHelpFunction)(comm);CHKERRQ(ierr); } } /* Setup the error handling */ flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-on_error_abort",&flg1,NULL);CHKERRQ(ierr); if (flg1) { ierr = MPI_Comm_set_errhandler(PETSC_COMM_WORLD,MPI_ERRORS_ARE_FATAL);CHKERRQ(ierr); ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr); } flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-on_error_mpiabort",&flg1,NULL);CHKERRQ(ierr); if (flg1) { ierr = PetscPushErrorHandler(PetscMPIAbortErrorHandler,0);CHKERRQ(ierr);} flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-mpi_return_on_error",&flg1,NULL);CHKERRQ(ierr); if (flg1) { ierr = MPI_Comm_set_errhandler(comm,MPI_ERRORS_RETURN);CHKERRQ(ierr); } flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-no_signal_handler",&flg1,NULL);CHKERRQ(ierr); if (!flg1) {ierr = PetscPushSignalHandler(PetscSignalHandlerDefault,(void*)0);CHKERRQ(ierr);} flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-fp_trap",&flg1,NULL);CHKERRQ(ierr); if (flg1) {ierr = PetscSetFPTrap(PETSC_FP_TRAP_ON);CHKERRQ(ierr);} ierr = PetscOptionsGetInt(NULL,"-check_pointer_intensity",&intensity,&flag);CHKERRQ(ierr); if (flag) {ierr = PetscCheckPointerSetIntensity(intensity);CHKERRQ(ierr);} /* Setup debugger information */ ierr = PetscSetDefaultDebugger();CHKERRQ(ierr); ierr = PetscOptionsGetString(NULL,"-on_error_attach_debugger",string,64,&flg1);CHKERRQ(ierr); if (flg1) { MPI_Errhandler err_handler; ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr); ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_DebuggerOnError,&err_handler);CHKERRQ(ierr); ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr); ierr = PetscPushErrorHandler(PetscAttachDebuggerErrorHandler,0);CHKERRQ(ierr); } ierr = PetscOptionsGetString(NULL,"-debug_terminal",string,64,&flg1);CHKERRQ(ierr); if (flg1) { ierr = PetscSetDebugTerminal(string);CHKERRQ(ierr); } ierr = PetscOptionsGetString(NULL,"-start_in_debugger",string,64,&flg1);CHKERRQ(ierr); ierr = PetscOptionsGetString(NULL,"-stop_for_debugger",string,64,&flg2);CHKERRQ(ierr); if (flg1 || flg2) { PetscMPIInt size; PetscInt lsize,*nodes; MPI_Errhandler err_handler; /* we have to make sure that all processors have opened connections to all other processors, otherwise once the debugger has stated it is likely to receive a SIGUSR1 and kill the program. 
*/ ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); if (size > 2) { PetscMPIInt dummy = 0; MPI_Status status; for (i=0; i<size; i++) { if (rank != i) { ierr = MPI_Send(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD);CHKERRQ(ierr); } } for (i=0; i<size; i++) { if (rank != i) { ierr = MPI_Recv(&dummy,1,MPI_INT,i,109,PETSC_COMM_WORLD,&status);CHKERRQ(ierr); } } } /* check if this processor node should be in debugger */ ierr = PetscMalloc1(size,&nodes);CHKERRQ(ierr); lsize = size; ierr = PetscOptionsGetIntArray(NULL,"-debugger_nodes",nodes,&lsize,&flag);CHKERRQ(ierr); if (flag) { for (i=0; i<lsize; i++) { if (nodes[i] == rank) { flag = PETSC_FALSE; break; } } } if (!flag) { ierr = PetscSetDebuggerFromString(string);CHKERRQ(ierr); ierr = PetscPushErrorHandler(PetscAbortErrorHandler,0);CHKERRQ(ierr); if (flg1) { ierr = PetscAttachDebugger();CHKERRQ(ierr); } else { ierr = PetscStopForDebugger();CHKERRQ(ierr); } ierr = MPI_Comm_create_errhandler((MPI_Handler_function*)Petsc_MPI_AbortOnError,&err_handler);CHKERRQ(ierr); ierr = MPI_Comm_set_errhandler(comm,err_handler);CHKERRQ(ierr); } ierr = PetscFree(nodes);CHKERRQ(ierr); } ierr = PetscOptionsGetString(NULL,"-on_error_emacs",emacsmachinename,128,&flg1);CHKERRQ(ierr); if (flg1 && !rank) {ierr = PetscPushErrorHandler(PetscEmacsClientErrorHandler,emacsmachinename);CHKERRQ(ierr);} /* Setup profiling and logging */ #if defined(PETSC_USE_INFO) { char logname[PETSC_MAX_PATH_LEN]; logname[0] = 0; ierr = PetscOptionsGetString(NULL,"-info",logname,250,&flg1);CHKERRQ(ierr); if (flg1 && logname[0]) { ierr = PetscInfoAllow(PETSC_TRUE,logname);CHKERRQ(ierr); } else if (flg1) { ierr = PetscInfoAllow(PETSC_TRUE,NULL);CHKERRQ(ierr); } } #endif #if defined(PETSC_USE_LOG) mname[0] = 0; ierr = PetscOptionsGetString(NULL,"-history",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr); if (flg1) { if (mname[0]) { ierr = PetscOpenHistoryFile(mname,&petsc_history);CHKERRQ(ierr); } else { ierr = PetscOpenHistoryFile(NULL,&petsc_history);CHKERRQ(ierr); } } #if defined(PETSC_HAVE_MPE) flg1 = PETSC_FALSE; ierr = PetscOptionsHasName(NULL,"-log_mpe",&flg1);CHKERRQ(ierr); if (flg1) {ierr = PetscLogMPEBegin();CHKERRQ(ierr);} #endif flg1 = PETSC_FALSE; flg2 = PETSC_FALSE; flg3 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-log_all",&flg1,NULL);CHKERRQ(ierr); ierr = PetscOptionsGetBool(NULL,"-log",&flg2,NULL);CHKERRQ(ierr); ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr); ierr = PetscOptionsHasName(NULL,"-log_view",&flg4);CHKERRQ(ierr); if (flg1) { ierr = PetscLogAllBegin();CHKERRQ(ierr); } else if (flg2 || flg3 || flg4) { ierr = PetscLogBegin();CHKERRQ(ierr);} ierr = PetscOptionsGetString(NULL,"-log_trace",mname,250,&flg1);CHKERRQ(ierr); if (flg1) { char name[PETSC_MAX_PATH_LEN],fname[PETSC_MAX_PATH_LEN]; FILE *file; if (mname[0]) { sprintf(name,"%s.%d",mname,rank); ierr = PetscFixFilename(name,fname);CHKERRQ(ierr); file = fopen(fname,"w"); if (!file) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_FILE_OPEN,"Unable to open trace file: %s",fname); } else file = PETSC_STDOUT; ierr = PetscLogTraceBegin(file);CHKERRQ(ierr); } #endif ierr = PetscOptionsGetBool(NULL,"-saws_options",&PetscOptionsPublish,NULL);CHKERRQ(ierr); #if defined(PETSC_HAVE_CUDA) ierr = PetscOptionsHasName(NULL,"-cuda_show_devices",&flg1);CHKERRQ(ierr); if (flg1) { struct cudaDeviceProp prop; int devCount; int device; cudaError_t err = cudaSuccess; err = cudaGetDeviceCount(&devCount); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err)); 
for (device = 0; device < devCount; ++device) { err = cudaGetDeviceProperties(&prop, device); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceProperties %s",cudaGetErrorString(err)); ierr = PetscPrintf(PETSC_COMM_WORLD, "CUDA device %d: %s\n", device, prop.name);CHKERRQ(ierr); } } { int size; ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr); if (size>1) { int devCount, device, rank; cudaError_t err = cudaSuccess; /* check to see if we force multiple ranks to hit the same GPU */ ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr); if (flg1) { err = cudaSetDevice(device); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err)); } else { /* we're not using the same GPU on multiple MPI processes, so try to allocate different GPUs to different processes */ /* First get the device count */ err = cudaGetDeviceCount(&devCount); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaGetDeviceCount %s",cudaGetErrorString(err)); /* next determine the rank and then set the device via a mod */ ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr); device = rank % devCount; err = cudaSetDevice(device); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err)); } /* set the device flags so that it can map host memory ... do NOT throw exception on err!=cudaSuccess multiple devices may try to set the flags on the same device. So long as one of them succeeds, things are ok. */ err = cudaSetDeviceFlags(cudaDeviceMapHost); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err)); } else { int device; cudaError_t err = cudaSuccess; /* the code below works for serial GPU simulations */ ierr = PetscOptionsGetInt(NULL,"-cuda_set_device", &device, &flg1);CHKERRQ(ierr); if (flg1) { err = cudaSetDevice(device); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDevice %s",cudaGetErrorString(err)); } /* set the device flags so that it can map host memory ... here, we error check. */ err = cudaSetDeviceFlags(cudaDeviceMapHost); if (err != cudaSuccess) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SYS,"error in cudaSetDeviceFlags %s",cudaGetErrorString(err)); } } #endif /* Print basic help message */ ierr = PetscOptionsHasName(NULL,"-help",&flg1);CHKERRQ(ierr); if (flg1) { ierr = (*PetscHelpPrintf)(comm,"Options for all PETSc programs:\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -help: prints help message for each option\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -on_error_abort: cause an abort when an error is detected. Useful \n ");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," only when run in the debugger\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -on_error_attach_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," start the debugger in new xterm\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," unless noxterm is given\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -start_in_debugger [gdb,dbx,xxgdb,ups,noxterm]\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," start all processes in the debugger\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -on_error_emacs <machinename>\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," emacs jumps to error file\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -debugger_nodes [n1,n2,..] 
Nodes to start in debugger\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -debugger_pause [m] : delay (in seconds) to attach debugger\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -stop_for_debugger : prints message on how to attach debugger manually\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," waits the delay for you to attach\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -display display: Location where X window graphics and debuggers are displayed\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -no_signal_handler: do not trap error signals\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -mpi_return_on_error: MPI returns error code, rather than abort on internal error\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -fp_trap: stop on floating point exceptions\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," note on IBM RS6000 this slows run greatly\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc_dump <optional filename>: dump list of unfreed memory at conclusion\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc: use our error checking malloc\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc no: don't use error checking malloc\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc_info: prints total memory usage\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc_log: keeps log of all memory allocations\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -malloc_debug: enables extended checking for memory corruption\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -options_table: dump list of options inputted\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -options_left: dump list of unused options\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -options_left no: don't dump list of unused options\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -tmp tmpdir: alternative /tmp directory\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -shared_tmp: tmp directory is shared by all processors\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -not_shared_tmp: each processor has separate tmp directory\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -memory_view: print memory usage at end of run\n");CHKERRQ(ierr); #if defined(PETSC_USE_LOG) ierr = (*PetscHelpPrintf)(comm," -get_total_flops: total flops over all processors\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -log[_summary _summary_python]: logging objects and events\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -log_trace [filename]: prints trace of all PETSc calls\n");CHKERRQ(ierr); #if defined(PETSC_HAVE_MPE) ierr = (*PetscHelpPrintf)(comm," -log_mpe: Also create logfile viewable through Jumpshot\n");CHKERRQ(ierr); #endif ierr = (*PetscHelpPrintf)(comm," -info <optional filename>: print informative messages about the calculations\n");CHKERRQ(ierr); #endif ierr = (*PetscHelpPrintf)(comm," -v: prints PETSc version number and release date\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -options_file <file>: reads options from file\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm," -petsc_sleep n: sleeps n seconds before running program\n");CHKERRQ(ierr); ierr = (*PetscHelpPrintf)(comm,"-----------------------------------------------\n");CHKERRQ(ierr); } #if defined(PETSC_HAVE_POPEN) { char machine[128]; ierr = PetscOptionsGetString(NULL,"-popen_machine",machine,128,&flg1);CHKERRQ(ierr); if (flg1) { ierr = PetscPOpenSetMachine(machine);CHKERRQ(ierr); } } #endif ierr = 
PetscOptionsGetReal(NULL,"-petsc_sleep",&si,&flg1);CHKERRQ(ierr); if (flg1) { ierr = PetscSleep(si);CHKERRQ(ierr); } ierr = PetscOptionsGetString(NULL,"-info_exclude",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr); if (flg1) { ierr = PetscStrstr(mname,"null",&f);CHKERRQ(ierr); if (f) { ierr = PetscInfoDeactivateClass(0);CHKERRQ(ierr); } } #if defined(PETSC_HAVE_CUSP) || defined(PETSC_HAVE_VIENNACL) ierr = PetscOptionsHasName(NULL,"-log_summary",&flg3);CHKERRQ(ierr); if (!flg3) { ierr = PetscOptionsHasName(NULL,"-log_view",&flg3);CHKERRQ(ierr); } #endif #if defined(PETSC_HAVE_CUSP) ierr = PetscOptionsGetBool(NULL,"-cusp_synchronize",&flg3,NULL);CHKERRQ(ierr); PetscCUSPSynchronize = flg3; #elif defined(PETSC_HAVE_VIENNACL) ierr = PetscOptionsGetBool(NULL,"-viennacl_synchronize",&flg3,NULL);CHKERRQ(ierr); PetscViennaCLSynchronize = flg3; #endif PetscFunctionReturn(0); }
int main(int argc, char** argv)
{
    printf("hello world\n");
    if (!InitCUDA()) {
        return 0;
    }

    int iter = 1000;
    int trainnum = 20;
    bool isProfiler = false;
    int intProfiler = 0;
    int testnum = -1;
    float maxtime = 0.0f;

    cutGetCmdLineArgumenti(argc, (const char**) argv, "train", &trainnum);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "iter", &iter);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "profiler", &intProfiler);
    cutGetCmdLineArgumenti(argc, (const char**) argv, "test", &testnum);
    cutGetCmdLineArgumentf(argc, (const char**) argv, "maxtime", &maxtime);
    printf("%d\n", intProfiler);
    if (intProfiler) {
        isProfiler = true;
    }
    if (testnum == -1)
        testnum = trainnum / 2;
    printf("Iter = %d\n", iter);
    printf("TrainNum = %d\n", trainnum);
    printf("TestNum = %d\n", testnum);

    CUT_DEVICE_INIT(argc, argv);

    cublasStatus status;
    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("Can't init cublas\n");
        printf("%s\n", cudaGetErrorString(cudaGetLastError()));
        return -1;
    }

    Image* imageList = new Image[trainnum + testnum];
    read64("my_optdigits.tra", imageList, trainnum + testnum);

    const int warmUpTime = 3;
    if (!isProfiler) {
        freopen("verbose.txt", "w", stdout);
        for (int i = 0; i < warmUpTime; i++) {
            runImage(argc, argv, imageList, trainnum < warmUpTime ? trainnum : warmUpTime, 0, 10, false, 0.0f);
        }
        freopen("CON", "w", stdout);
        printf("Warm-up complete.\n\n\n");
    }
#ifdef _DEBUG
    freopen("out.txt", "w", stdout);
#endif // _DEBUG
    runImage(argc, argv, imageList, trainnum, testnum, iter, true, maxtime);
    freopen("CON", "w", stdout);
    delete[] imageList;
    //TestReduce();
    cublasShutdown();
    if (!isProfiler) {
        CUT_EXIT(argc, argv);
    }
    //getchar();
    return 0;
}
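/* InitCUDA() above is provided elsewhere in this project. In the old CUDA SDK samples such a
   helper typically just verifies that a CUDA-capable device exists and selects one; the sketch
   below shows that assumed pattern (a hypothetical stand-in, not this project's implementation). */
#include <cuda_runtime.h>
#include <stdio.h>

static bool InitCUDA_sketch(void)
{
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "No CUDA-capable device found.\n");
        return false;
    }
    /* Select the first device; real implementations may scan for the most capable one. */
    if (cudaSetDevice(0) != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(0) failed.\n");
        return false;
    }
    return true;
}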