void THCudaInit(THCudaState* state)
{
  int count = 0;
  THCudaCheck(cudaGetDeviceCount(&count));

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  state->rngState = (THCudaRNGState*)malloc(sizeof(THCudaRNGState));
  THCRandom_init(state->rngState, count, device);

  THCudaBlas_init(count, device);

  int i, j;
  for (i = 0; i < count; ++i)
  {
    THCudaCheck(cudaSetDevice(i));
    for (j = 0; j < count; ++j)
    {
      if (i != j)
      {
        int can = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&can, i, j));
        if (can)
          THCudaCheck(cudaDeviceEnablePeerAccess(j, 0));
      }
    }
  }
  THCudaCheck(cudaSetDevice(device));
}
void THCudaInit(THCState* state)
{
  int count = 0;
  THCudaCheck(cudaGetDeviceCount(&count));

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState));
  THCRandom_init(state, count, device);

  state->blasState = (THCBlasState*)malloc(sizeof(THCBlasState));
  THCudaBlas_init(state, count, device);

  state->numDevices = count;
  state->deviceProperties =
    (struct cudaDeviceProp*)malloc(count * sizeof(struct cudaDeviceProp));

  THCState_setDeviceMode(state, THCStateDeviceModeManual);

  state->numUserStreams = 0;
  state->streamsPerDevice =
    (cudaStream_t**)malloc(count * sizeof(cudaStream_t*));

  /* Enable P2P access between all pairs, if possible */
  THCudaEnablePeerToPeerAccess(state);

  for (int i = 0; i < count; ++i)
  {
    THCudaCheck(cudaSetDevice(i));
    THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i));

    /* Stream index 0 will be the default stream for convenience; by
       default no user streams are reserved */
    state->streamsPerDevice[i] = (cudaStream_t*)malloc(sizeof(cudaStream_t));
    state->streamsPerDevice[i][0] = NULL;
  }

  /* Restore to previous device */
  THCudaCheck(cudaSetDevice(device));

  /* Start in the default stream on the current device */
  state->currentPerDeviceStream = 0;
  state->currentStream = NULL;
}
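/* A minimal standalone sketch (not THC code) of the "stream index 0 is the
   default stream" convention above: a NULL cudaStream_t is CUDA's legacy
   default stream, so work submitted with it runs without any explicit
   cudaStreamCreate call. */
#include <cuda_runtime.h>
#include <stdio.h>

int main(void)
{
  cudaStream_t stream = NULL; /* slot 0 in streamsPerDevice holds NULL */

  int* devPtr = NULL;
  if (cudaMalloc((void**)&devPtr, sizeof(int)) != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed\n");
    return 1;
  }

  /* An async memset issued into the NULL stream is ordered with all other
     default-stream work on this device. */
  cudaMemsetAsync(devPtr, 0, sizeof(int), stream);
  cudaStreamSynchronize(stream); /* legal on the NULL/default stream */

  cudaFree(devPtr);
  return 0;
}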
void THCudaInit(THCState* state)
{
  int count = 0;
  THCudaCheck(cudaGetDeviceCount(&count));

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState));
  THCRandom_init(state, count, device);

  state->blasState = (THCBlasState*)malloc(sizeof(THCBlasState));
  THCudaBlas_init(state, count, device);

  int i, j;
  for (i = 0; i < count; ++i)
  {
    THCudaCheck(cudaSetDevice(i));
    for (j = 0; j < count; ++j)
    {
      if (i != j)
      {
        int can = 0;
        THCudaCheck(cudaDeviceCanAccessPeer(&can, i, j));
        if (can)
        {
          cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
          if (err == cudaErrorPeerAccessAlreadyEnabled)
          {
            // Any future call to cudaGetLastError will now return an error,
            // even though we've already dealt with this specific error here.
            // Call cudaGetLastError once to reset the last error state.
            cudaGetLastError();
            continue;
          }
          THCudaCheck(err);
        }
      }
    }
  }
  THCudaCheck(cudaSetDevice(device));
}
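/* A minimal standalone sketch of the error-reset pattern above (assumes at
   least two peer-capable GPUs; enablePeerOnce is a hypothetical helper, not
   part of THC). cudaDeviceEnablePeerAccess records
   cudaErrorPeerAccessAlreadyEnabled in CUDA's sticky last-error slot even
   when the caller treats it as benign, so it must be drained with
   cudaGetLastError() or an unrelated check later will report it. */
#include <cuda_runtime.h>
#include <stdio.h>

static void enablePeerOnce(int dev, int peer)
{
  cudaSetDevice(dev);
  cudaError_t err = cudaDeviceEnablePeerAccess(peer, 0);
  if (err == cudaErrorPeerAccessAlreadyEnabled) {
    /* Benign: access was enabled by an earlier call. Drain the sticky
       error so it does not surface from a later cudaGetLastError(). */
    cudaGetLastError();
  } else if (err != cudaSuccess) {
    fprintf(stderr, "peer access %d -> %d failed: %s\n",
            dev, peer, cudaGetErrorString(err));
  }
}

int main(void)
{
  enablePeerOnce(0, 1);
  enablePeerOnce(0, 1); /* second call hits the already-enabled path */
  return 0;
}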
void THCudaInit(THCState* state)
{
  int count = 0;
  THCudaCheck(cudaGetDeviceCount(&count));

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState));
  THCRandom_init(state, count, device);

  THCAllocator_init(state);

  state->numDevices = count;
  state->deviceProperties =
    (struct cudaDeviceProp*)malloc(count * sizeof(struct cudaDeviceProp));

  state->numUserStreams = 0;
  state->numUserBlasHandles = 0;

  /* Enable P2P access between all pairs, if possible */
  THCudaEnablePeerToPeerAccess(state);

  state->resourcesPerDevice = (THCCudaResourcesPerDevice*)
    malloc(count * sizeof(THCCudaResourcesPerDevice));

  for (int i = 0; i < count; ++i)
  {
    THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, i);

    THCudaCheck(cudaSetDevice(i));
    THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i));

    /* Stream index 0 will be the default stream for convenience; by
       default no user streams are reserved */
    res->streams = NULL;
    res->blasHandles = NULL;

    /* The scratch space that we want to have available per device is
       based on the number of SMs available on that device */
    int numSM = state->deviceProperties[i].multiProcessorCount;
    size_t sizePerStream = numSM * GLOBAL_SCRATCH_SPACE_PER_SM_STREAM;
    res->scratchSpacePerStream = sizePerStream;

    /* Allocate scratch space for each stream */
    res->devScratchSpacePerStream = (void**) malloc(sizeof(void*));
    THCudaCheck(THCudaMalloc(state, &res->devScratchSpacePerStream[0],
                             sizePerStream));
  }

  /* Restore to previous device */
  THCudaCheck(cudaSetDevice(device));

  /* Start in the default stream on the current device */
  state->currentPerDeviceStream = 0;
  state->currentStream = NULL;

  /* There is no such thing as a default cublas handle.
     To maintain consistency with the streams API, handle 0 is always NULL
     and we start counting at 1 */
  THCState_reserveBlasHandles(state, 1);
  state->currentPerDeviceBlasHandle = 1;
  state->currentBlasHandle = THCState_getDeviceBlasHandle(state, device, 1);

  state->cutorchGCFunction = NULL;
  state->cutorchGCData = NULL;
  state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically
  state->heapDelta = 0;
}
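/* A standalone sketch of the sizing step above: query multiProcessorCount
   with cudaGetDeviceProperties and scale it by a per-SM scratch constant.
   The 16-byte constant (4 floats per SM) is an assumption for illustration,
   not taken from the THC headers. */
#include <cuda_runtime.h>
#include <stdio.h>

#define SCRATCH_PER_SM (4 * sizeof(float)) /* assumed value */

int main(void)
{
  struct cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
    fprintf(stderr, "no CUDA device 0\n");
    return 1;
  }
  /* e.g. 24 SMs * 16 bytes = 384 bytes of scratch per stream */
  size_t sizePerStream = (size_t)prop.multiProcessorCount * SCRATCH_PER_SM;
  printf("%d SMs -> %zu bytes scratch per stream\n",
         prop.multiProcessorCount, sizePerStream);
  return 0;
}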
void THCudaInit(THCState* state)
{
  if (!state->cudaDeviceAllocator) {
    state->cudaDeviceAllocator = &defaultDeviceAllocator;
  }
  if (!state->cudaHostAllocator) {
    state->cudaHostAllocator = &THCudaHostAllocator;
  }
  if (!state->cudaUVAAllocator) {
    state->cudaUVAAllocator = &THCUVAAllocator;
  }

  int numDevices = 0;
  THCudaCheck(cudaGetDeviceCount(&numDevices));
  state->numDevices = numDevices;

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  /* Start in the default stream on the current device */
  state->currentStreams = (THCThreadLocal*)
    malloc(numDevices * sizeof(THCThreadLocal));
  for (int i = 0; i < numDevices; ++i) {
    state->currentStreams[i] = THCThreadLocal_alloc();
  }
  state->currentPerDeviceBlasHandle = THCThreadLocal_alloc();
  state->currentPerDeviceSparseHandle = THCThreadLocal_alloc();

  state->resourcesPerDevice = (THCCudaResourcesPerDevice*)
    malloc(numDevices * sizeof(THCCudaResourcesPerDevice));
  memset(state->resourcesPerDevice, 0,
         numDevices * sizeof(THCCudaResourcesPerDevice));

  state->deviceProperties =
    (struct cudaDeviceProp*)malloc(numDevices * sizeof(struct cudaDeviceProp));

  state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState));
  THCRandom_init(state, numDevices, device);

  // By default, all direct p2p kernel access (besides copy) is disallowed,
  // since direct access without knowing whether or not a certain operation
  // should be cross-GPU leads to synchronization errors. The user can choose
  // to enable direct p2p kernel access, however.
  state->p2pKernelAccessEnabled = 0;

  // p2pAccessEnabled records if p2p copies are allowed between pairs of
  // devices. Values include "1" (copy allowed), "0" (copy not allowed), and
  // "-1" (unknown).
  // Currently the maximum number of GPUs in a P2P group is 8, so if there
  // are more devices we enable P2P in groups of 8.
  state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices);
  for (int i = 0; i < numDevices; ++i) {
    state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices);
    for (int j = 0; j < numDevices; ++j) {
      if (i == j) {
        state->p2pAccessEnabled[i][j] = 1;
      } else if (j / THC_CUDA_MAX_PEER_SIZE != i / THC_CUDA_MAX_PEER_SIZE) {
        state->p2pAccessEnabled[i][j] = 0;
      } else {
        state->p2pAccessEnabled[i][j] = -1;
      }
    }
  }

  for (int i = 0; i < numDevices; ++i) {
    THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, i);

    THCudaCheck(cudaSetDevice(i));
    THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i));

    /* The scratch space that we want to have available per device is based
       on the number of SMs available on that device. We guarantee a minimum
       of 128kb of space per device, but to future-proof against
       architectures that may have huge numbers of SMs, we guarantee that we
       have at least 16 bytes for each SM. */
    int numSM = state->deviceProperties[i].multiProcessorCount;
    size_t sizePerStream =
      MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE >=
              numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM
          ? MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE
          : numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM;
    res->scratchSpacePerStream = sizePerStream;
  }

  /* Restore to previous device */
  THCudaCheck(cudaSetDevice(device));

  // Unlike CUDA streams, there is no NULL cuBLAS handle. The default THC
  // cuBLAS handle is the first user BLAS handle. Note that the actual BLAS
  // handles are created lazily.
  state->numUserBlasHandles = 1;
  state->numUserSparseHandles = 1;

  state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically
  state->heapDelta = 0;
}
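/* A worked sketch of the P2P grouping rule above, assuming
   THC_CUDA_MAX_PEER_SIZE is 8 (the "groups of 8" from the comment):
   devices i and j are peer candidates only when i / 8 == j / 8. With 16
   devices, 0..7 and 8..15 form separate groups, so (3, 5) starts as
   "unknown" (-1) while (3, 9) is ruled out (0). */
#include <stdio.h>

#define THC_CUDA_MAX_PEER_SIZE 8 /* assumed to match the THC constant */

static int initialP2PFlag(int i, int j)
{
  if (i == j)
    return 1;  /* a device can always copy to itself */
  if (j / THC_CUDA_MAX_PEER_SIZE != i / THC_CUDA_MAX_PEER_SIZE)
    return 0;  /* different groups: copy not allowed */
  return -1;   /* same group: unknown until queried */
}

int main(void)
{
  printf("(3,5) -> %d\n", initialP2PFlag(3, 5)); /* -1 */
  printf("(3,9) -> %d\n", initialP2PFlag(3, 9)); /*  0 */
  printf("(4,4) -> %d\n", initialP2PFlag(4, 4)); /*  1 */
  return 0;
}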
void THCudaInit(THCState* state)
{
  if (!state->cudaDeviceAllocator.malloc) {
    THCState_initDefaultDeviceAllocator(&state->cudaDeviceAllocator);
  }

  int numDevices = 0;
  THCudaCheck(cudaGetDeviceCount(&numDevices));
  state->numDevices = numDevices;

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  /* Start in the default stream on the current device */
  state->currentPerDeviceStream = THCThreadLocal_alloc();
  state->currentPerDeviceBlasHandle = THCThreadLocal_alloc();

  state->resourcesPerDevice = (THCCudaResourcesPerDevice*)
    malloc(numDevices * sizeof(THCCudaResourcesPerDevice));
  memset(state->resourcesPerDevice, 0,
         numDevices * sizeof(THCCudaResourcesPerDevice));

  state->deviceProperties =
    (struct cudaDeviceProp*)malloc(numDevices * sizeof(struct cudaDeviceProp));

  state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState));
  THCRandom_init(state, numDevices, device);

  state->cudaHostAllocator = (THAllocator*)malloc(sizeof(THAllocator));
  THCAllocator_init(state->cudaHostAllocator);

  /* Enable P2P access between all pairs, if possible */
  THCudaEnablePeerToPeerAccess(state);

  for (int i = 0; i < numDevices; ++i) {
    THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, i);

    THCudaCheck(cudaSetDevice(i));
    THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i));

    /* The scratch space that we want to have available per device is based
       on the number of SMs available on that device */
    int numSM = state->deviceProperties[i].multiProcessorCount;
    size_t sizePerStream = numSM * GLOBAL_SCRATCH_SPACE_PER_SM_STREAM;
    res->scratchSpacePerStream = sizePerStream;

    /* Allocate scratch space for each stream */
    res->devScratchSpacePerStream = (void**) malloc(sizeof(void*));
    THCudaCheck(THCudaMalloc(state, &res->devScratchSpacePerStream[0],
                             sizePerStream));
  }

  /* Restore to previous device */
  THCudaCheck(cudaSetDevice(device));

  /* There is no such thing as a default cublas handle.
     To maintain consistency with the streams API, handle 0 is always NULL
     and we start counting at 1. If currentPerDeviceBlasHandle is 0 (the
     default thread-local value), then we assume it means 1. */
  THCState_reserveBlasHandles(state, 1);

  state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically
  state->heapDelta = 0;
}
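/* A minimal sketch (hypothetical helper, not the THC code) of the handle
   numbering convention above: BLAS handle slot 0 mirrors the NULL default
   stream and never holds a real cublasHandle_t, so a freshly allocated
   thread-local value of 0 is read as handle 1. */
#include <stdio.h>

static int resolveBlasHandleIndex(int threadLocalValue)
{
  /* 0 is the uninitialized thread-local default; treat it as handle 1 */
  return threadLocalValue == 0 ? 1 : threadLocalValue;
}

int main(void)
{
  /* prints "1 3": the unset thread-local 0 resolves to the first handle */
  printf("%d %d\n", resolveBlasHandleIndex(0), resolveBlasHandleIndex(3));
  return 0;
}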