/*
 * Releases the resources referenced by `state`.
 *
 * Order of operations:
 *   1. shut down the random-number subsystem and free the host-side
 *      bookkeeping buffers (rngState, cudaHostAllocator, deviceProperties);
 *   2. free the per-device peer-to-peer access table;
 *   3. for every device reported by cudaGetDeviceCount: destroy the
 *      user streams and cuBLAS handles, release the per-stream scratch
 *      space, and free the per-device resource arrays;
 *   4. shut down the device allocator and free the thread-local slots;
 *   5. make the device that was current on entry current again.
 *
 * Every CUDA runtime call is wrapped in THCudaCheck and every cuBLAS call
 * in THCublasCheck.
 */
void THCudaShutdown(THCState* state)
{
  int savedDevice = -1;
  int numDevices = 0;
  int d;
  int s;
  int h;

  THCRandom_shutdown(state);

  free(state->rngState);
  free(state->cudaHostAllocator);
  free(state->deviceProperties);

  /* Remember the current device so it can be restored at the very end. */
  THCudaCheck(cudaGetDevice(&savedDevice));
  THCudaCheck(cudaGetDeviceCount(&numDevices));

  /* Peer-to-peer access table: one row per device, then the table itself. */
  for (d = 0; d < numDevices; ++d) {
    free(state->p2pAccessEnabled[d]);
  }
  free(state->p2pAccessEnabled);

  /* Per-device resources; each device is made current before its cleanup. */
  for (d = 0; d < numDevices; ++d) {
    THCudaCheck(cudaSetDevice(d));

    /* User streams are numbered from 1; index 0 is the default stream
       and is not destroyed here. */
    for (s = 1; s <= state->numUserStreams; ++s) {
      THCudaCheck(cudaStreamDestroy(THCState_getDeviceStream(state, d, s)));
    }

    /* User cuBLAS handles are numbered from 1 as well; index 0 is NULL,
       mirroring the streams API. */
    for (h = 1; h <= state->numUserBlasHandles; ++h) {
      THCublasCheck(cublasDestroy(THCState_getDeviceBlasHandle(state, d, h)));
    }

    /* Scratch space exists for the default stream too, so this loop
       starts at 0. */
    for (s = 0; s <= state->numUserStreams; ++s) {
      THCudaCheck(THCudaFree(state,
                             THCState_getDeviceScratchSpace(state, d, s)));
    }

    free(state->resourcesPerDevice[d].streams);
    free(state->resourcesPerDevice[d].blasHandles);
    free(state->resourcesPerDevice[d].devScratchSpacePerStream);
  }
  free(state->resourcesPerDevice);

  /* The device allocator is shut down only after the scratch space has
     been released through THCudaFree above. */
  state->cudaDeviceAllocator.shutdown(state->cudaDeviceAllocator.state);

  THCThreadLocal_free(state->currentPerDeviceStream);
  THCThreadLocal_free(state->currentPerDeviceBlasHandle);

  THCudaCheck(cudaSetDevice(savedDevice));
}
/*
 * Reads the single element at position `index` of `self` and returns it
 * to the host as a `hostreal`.
 *
 * `index` must satisfy 0 <= index < self->size; otherwise THArgCheck
 * reports "index out of bounds" against argument 2.
 *
 * Non-half build (THC_REAL_IS_HALF undefined): the element is copied
 * straight to the host with a device-to-host cudaMemcpy and converted
 * with realToHostreal.
 *
 * Half build (THC_REAL_IS_HALF defined): a one-float temporary is
 * allocated with THCudaMalloc, THCHalf2Float is called with a count of 1
 * to fill it from the element (presumably a half -> float conversion;
 * confirm against THCHalf2Float), the float is copied to the host, and
 * the temporary is released.
 *
 * All CUDA calls, including the release of the temporary, are checked
 * with THCudaCheck.
 */
hostreal THCStorage_(get)(THCState *state, const THCStorage *self, long index)
{
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
#ifndef THC_REAL_IS_HALF
  real value;
  THCudaCheck(cudaMemcpy(&value, self->data + index, sizeof(real),
                         cudaMemcpyDeviceToHost));
  return realToHostreal(value);
#else
  float *ret_d;
  float ret;
  THCudaCheck(THCudaMalloc(state, (void**)&ret_d, sizeof(float)));
  THCHalf2Float(state, ret_d, self->data + index, 1);
  THCudaCheck(cudaMemcpy(&ret, ret_d, sizeof(float),
                         cudaMemcpyDeviceToHost));
  /* Check the free as well: an ignored failure here would otherwise
     resurface as an unrelated error on a later CUDA call. */
  THCudaCheck(THCudaFree(state, ret_d));
  return ret;
#endif
}