void THCudaShutdown(THCState* state) { THCRandom_shutdown(state); free(state->rngState); free(state->cudaHostAllocator); free(state->deviceProperties); int deviceCount = 0; int prevDev = -1; THCudaCheck(cudaGetDevice(&prevDev)); THCudaCheck(cudaGetDeviceCount(&deviceCount)); /* cleanup p2p access state */ for (int dev = 0; dev < deviceCount; ++dev) { free(state->p2pAccessEnabled[dev]); } free(state->p2pAccessEnabled); /* cleanup per-device state */ for (int dev = 0; dev < deviceCount; ++dev) { THCudaCheck(cudaSetDevice(dev)); /* Free Torch-defined streams (0 is the default stream) */ for (int stream = 1; stream <= state->numUserStreams; ++stream) { THCudaCheck(cudaStreamDestroy( THCState_getDeviceStream(state, dev, stream))); } /* Free Torch-defined handles (0 is NULL for consistency with streams API) */ for (int handle = 1; handle <= state->numUserBlasHandles; ++handle) { THCublasCheck(cublasDestroy( THCState_getDeviceBlasHandle(state, dev, handle))); } /* Free per-stream scratch space; starts at 0 because there is space for the default stream as well*/ for (int stream = 0; stream <= state->numUserStreams; ++stream) { THCudaCheck(THCudaFree(state, THCState_getDeviceScratchSpace(state, dev, stream))); } free(state->resourcesPerDevice[dev].streams); free(state->resourcesPerDevice[dev].blasHandles); free(state->resourcesPerDevice[dev].devScratchSpacePerStream); } free(state->resourcesPerDevice); state->cudaDeviceAllocator.shutdown(state->cudaDeviceAllocator.state); THCThreadLocal_free(state->currentPerDeviceStream); THCThreadLocal_free(state->currentPerDeviceBlasHandle); THCudaCheck(cudaSetDevice(prevDev)); }
cublasHandle_t THCState_getCurrentBlasHandle(THCState *state) { /* This is called at the point of kernel execution. For some debugging code or improperly instrumented kernels, `state` is null */ if (state) { int device; THCudaCheck(cudaGetDevice(&device)); int handle = THCState_getCurrentBlasHandleIndex(state); return THCState_getDeviceBlasHandle(state, device, handle); } THError("THCState and blasHandles must be set as there is no default blasHandle"); return NULL; }
void THCState_setBlasHandle(THCState *state, int device, int handle) { /* `device` is a CUDA index */ if (device >= state->numDevices || device < 0) { THError("%d is not a device", device + 1 /* back to Torch index */); } if (handle > state->numUserBlasHandles || handle <= 0) { THError("%d is not a valid handle, valid range is: (1, %d)", handle, state->numUserBlasHandles); } state->currentBlasHandle = THCState_getDeviceBlasHandle(state, device, handle); state->currentPerDeviceBlasHandle = handle; THCublasCheck(cublasSetStream(state->currentBlasHandle, state->currentStream)); }
void THCState_reserveBlasHandles(THCState* state, int numBlasHandles) { if (numBlasHandles <= state->numUserBlasHandles) { return; } int prevDev = -1; THCudaCheck(cudaGetDevice(&prevDev)); /* Otherwise, we have to allocate a new set of blasHandles */ for (int dev = 0; dev < state->numDevices; ++dev) { THCudaCheck(cudaSetDevice(dev)); /* +1 to be consistent with stream API, blas handle 0 is NULL and unused */ cublasHandle_t* newBlasHandles = (cublasHandle_t*) malloc((numBlasHandles + 1) * sizeof(cublasHandle_t)); /* Copy over old blasHandles (0 is NULL, 1 ... numUserBlasHandles are rest) */ newBlasHandles[0] = NULL; for (int hndl = 1; hndl <= state->numUserBlasHandles; ++hndl) { newBlasHandles[hndl] = THCState_getDeviceBlasHandle(state, dev, hndl); } /* Allocate new handles */ for (int hndl = state->numUserBlasHandles + 1; hndl <= numBlasHandles; ++hndl) { newBlasHandles[hndl] = NULL; THCublasCheck(cublasCreate(newBlasHandles + hndl)); } THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev); free(res->blasHandles); res->blasHandles = newBlasHandles; } state->numUserBlasHandles = numBlasHandles; THCudaCheck(cudaSetDevice(prevDev)); }
void THCudaInit(THCState* state) { int count = 0; THCudaCheck(cudaGetDeviceCount(&count)); int device = 0; THCudaCheck(cudaGetDevice(&device)); state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState)); THCRandom_init(state, count, device); THCAllocator_init(state); state->numDevices = count; state->deviceProperties = (struct cudaDeviceProp*)malloc(count * sizeof(struct cudaDeviceProp)); state->numUserStreams = 0; state->numUserBlasHandles = 0; /* Enable P2P access between all pairs, if possible */ THCudaEnablePeerToPeerAccess(state); state->resourcesPerDevice = (THCCudaResourcesPerDevice*) malloc(count * sizeof(THCCudaResourcesPerDevice)); for (int i = 0; i < count; ++i) { THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, i); THCudaCheck(cudaSetDevice(i)); THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i)); /* Stream index 0 will be the default stream for convenience; by default no user streams are reserved */ res->streams = NULL; res->blasHandles = NULL; /* The scratch space that we want to have available per each device is based on the number of SMs available per device */ int numSM = state->deviceProperties[i].multiProcessorCount; size_t sizePerStream = numSM * GLOBAL_SCRATCH_SPACE_PER_SM_STREAM; res->scratchSpacePerStream = sizePerStream; /* Allocate scratch space for each stream */ res->devScratchSpacePerStream = (void**) malloc(sizeof(void*)); THCudaCheck(THCudaMalloc(state, &res->devScratchSpacePerStream[0], sizePerStream)); } /* Restore to previous device */ THCudaCheck(cudaSetDevice(device)); /* Start in the default stream on the current device */ state->currentPerDeviceStream = 0; state->currentStream = NULL; /* There is no such thing as a default cublas handle. To maintain consistency with streams API, handle 0 is always NULL and we start counting at 1 */ THCState_reserveBlasHandles(state, 1); state->currentPerDeviceBlasHandle = 1; state->currentBlasHandle = THCState_getDeviceBlasHandle(state, device, 1); state->cutorchGCFunction = NULL; state->cutorchGCData = NULL; state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically state->heapDelta = 0; }