/*
 * Grows the per-device stream tables so that `numStreams` user streams are
 * available on every device tracked by `state`.
 *
 * state       - THC state whose per-device resources are resized.
 * numStreams  - requested number of user streams; if it does not exceed
 *               state->numUserStreams the call is a no-op.
 * nonBlocking - non-zero creates the new streams with cudaStreamNonBlocking,
 *               zero with cudaStreamDefault.
 *
 * For each device the function builds new stream / scratch-space arrays,
 * copies the existing entries (index 0 is the default stream, indices
 * 1..numUserStreams are the user streams), creates a stream and a scratch
 * allocation for each added index, and then swaps the arrays into the
 * device's resource record. state->numUserStreams is updated only after every
 * device has been processed. The device that was current on entry is made
 * current again before returning.
 *
 * Failures are reported through THCudaCheck; a failed host allocation is
 * reported as cudaErrorMemoryAllocation.
 */
void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking)
{
  if (numStreams <= state->numUserStreams) {
    return;
  }

  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  /* +1 for the default stream as well; computed in size_t so the addition
     cannot overflow int */
  size_t numSlots = (size_t) numStreams + 1;

  /* Otherwise, we have to allocate a new set of streams and stream data */
  for (int dev = 0; dev < state->numDevices; ++dev) {
    THCudaCheck(cudaSetDevice(dev));

    cudaStream_t* newStreams =
      (cudaStream_t*) malloc(numSlots * sizeof(cudaStream_t));
    void** newScratchSpace =
      (void**) malloc(numSlots * sizeof(void*));

    if (newStreams == NULL || newScratchSpace == NULL) {
      /* Host allocation failed: release whichever half succeeded
         (free(NULL) is a no-op), put the caller's device back, and raise the
         failure through the same channel as every other error here. */
      free(newStreams);
      free(newScratchSpace);
      THCudaCheck(cudaSetDevice(prevDev));
      THCudaCheck(cudaErrorMemoryAllocation);
      /* NOTE(review): THCudaCheck presumably does not return on error;
         the return below is a defensive guard in case it does. */
      return;
    }

    /* Copy over old stream data
       (0 is default stream, 1 ... numUserStreams are rest) */
    for (int stream = 0; stream <= state->numUserStreams; ++stream) {
      newStreams[stream] =
        THCState_getDeviceStream(state, dev, stream);
      newScratchSpace[stream] =
        THCState_getDeviceScratchSpace(state, dev, stream);
    }

    /* Allocate new stream resources */
    size_t scratchSpaceSize = THCState_getDeviceScratchSpaceSize(state, dev);
    unsigned int flags =
      nonBlocking ? cudaStreamNonBlocking : cudaStreamDefault;

    for (int stream = state->numUserStreams + 1; stream <= numStreams; ++stream) {
      /* Pre-clear each slot so it holds NULL if the call that fills it fails */
      newStreams[stream] = NULL;
      THCudaCheck(cudaStreamCreateWithFlags(newStreams + stream, flags));
      newScratchSpace[stream] = NULL;
      THCudaCheck(THCudaMalloc(state, &newScratchSpace[stream], scratchSpaceSize));
    }

    /* Swap the enlarged arrays in; only the old arrays themselves are freed,
       the streams and scratch buffers they referenced were copied above */
    THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
    free(res->streams);
    res->streams = newStreams;
    free(res->devScratchSpacePerStream);
    res->devScratchSpacePerStream = newScratchSpace;
  }

  state->numUserStreams = numStreams;

  THCudaCheck(cudaSetDevice(prevDev));
}
/*
 * Returns the scratch space size that `state` records for the device that is
 * current on the calling thread, as reported by cudaGetDevice.
 *
 * A failure of cudaGetDevice is routed through THCudaCheck.
 */
size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state)
{
  int currentDevice = -1;
  THCudaCheck(cudaGetDevice(&currentDevice));

  /* Delegate to the per-device query using the device we just looked up */
  size_t size = THCState_getDeviceScratchSpaceSize(state, currentDevice);
  return size;
}