/* Usage:
   n = cutorch.getBlasHandle()
   Returns the current blasHandle for all devices in use (as previously
   set via cutorch.setBlasHandle(n)). */
static int cutorch_getBlasHandle(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  lua_pushnumber(L, THCState_getCurrentBlasHandleIndex(state));
  return 1;
}

/* Usage:
   cutorch.setDefaultStream()
   Equivalent to cutorch.setStream(0). */
static int cutorch_setDefaultStream(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  THCState_setStreamForCurrentDevice(state, 0);
  return 0;
}

/* Usage:
   n = cutorch.getNumBlasHandles()
   Returns the number of user blasHandles allocated for every device
   present. By default, this is 1. */
static int cutorch_getNumBlasHandles(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  lua_pushnumber(L, THCState_getNumBlasHandles(state));
  return 1;
}

/* now we overwrite some methods specific to CudaTensor */
static int cutorch_CudaTensor_copy(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  THCudaTensor *tensor = luaT_checkudata(L, 1, "torch.CudaTensor");
  void *src;
  if( (src = luaT_toudata(L, 2, "torch.CudaTensor")) )
    THCudaTensor_copy(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.ByteTensor")) )
    THCudaTensor_copyByte(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CharTensor")) )
    THCudaTensor_copyChar(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.ShortTensor")) )
    THCudaTensor_copyShort(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.IntTensor")) )
    THCudaTensor_copyInt(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.LongTensor")) )
    THCudaTensor_copyLong(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.FloatTensor")) )
    THCudaTensor_copyFloat(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.DoubleTensor")) )
    THCudaTensor_copyDouble(state, tensor, src);
  else
    luaL_typerror(L, 2, "torch.*Tensor");

  lua_settop(L, 1);
  return 1;
}

/* Usage:
   cutorch.streamWaitFor(waiterStream, {waitForStream1, ..., waitForStreamN})
   for streams on the current device. Creates a one-way barrier where
   waiterStream waits for waitForStream1-N to reach the current point. */
static int cutorch_streamWaitFor(lua_State *L)
{
  THCState *state = cutorch_getstate(L);

  int curDev = -1;
  THCudaCheck(cudaGetDevice(&curDev));

  /* Check that the waiting stream is in bounds; this will error out if not */
  int waitingId = (int) luaL_checknumber(L, 1);
  cudaStream_t streamWaiting =
    THCState_getDeviceStream(state, curDev, waitingId);

  /* Validate the streams that we are waiting on */
  int streams = checkAndCountListOfStreams(L, state, 2, curDev);

  if (streams < 1) {
    /* nothing to synchronize */
    return 0;
  }

  /* One-way dependency; streamWaiting will wait for the list of streams to
     wait on to complete execution of pending scheduled kernels/events */
  cudaEvent_t *events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);
  createSingleDeviceEvents(L, state, 2, curDev, events);

  /* Then, wait on them */
  for (int i = 0; i < streams; i++) {
    THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0));
    THCudaCheck(cudaEventDestroy(events[i]));
  }
  free(events);

  return 0;
}

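/* Illustrative Lua-side sketch of streamWaitFor (not from the original
   source; the tensors and operations are hypothetical placeholders):
   ---
   cutorch.reserveStreams(3)
   cutorch.setStream(1); a:cmul(b)      -- queue work on stream 1
   cutorch.setStream(2); c:cmul(d)      -- queue work on stream 2
   cutorch.streamWaitFor(3, {1, 2})     -- stream 3 waits for 1 and 2
   cutorch.setStream(3); e:add(a, c)    -- safe to consume a and c here
   ---
*/
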
/* Usage:
   cutorch.streamBarrier({stream1, stream2, ..., streamN})
   applies to streams for the current device. Creates an N-way barrier to
   synchronize all of the streams given. */
static int cutorch_streamBarrier(lua_State *L)
{
  THCState *state = cutorch_getstate(L);

  int curDev = -1;
  THCudaCheck(cudaGetDevice(&curDev));

  int streams = checkAndCountListOfStreams(L, state, 1, curDev);

  if (streams < 2) {
    /* nothing to synchronize together */
    return 0;
  }

  /* Multi-way dependency (barrier); all streams must complete execution
     of pending scheduled kernels/events */
  cudaEvent_t *events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);

  /* First, create an event and record it for each stream */
  int eventsCreated = createSingleDeviceEvents(L, state, 1, curDev, events);

  /* Then, wait on the events. Each stream is actually waiting on itself
     here too, but that's harmless and isn't worth weeding out. */
  waitSingleDeviceEvents(L, state, 1, curDev, events, eventsCreated);

  for (int i = 0; i < eventsCreated; i++) {
    THCudaCheck(cudaEventDestroy(events[i]));
  }
  free(events);

  return 0;
}

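/* Illustrative Lua-side sketch of streamBarrier (not from the original
   source; the tensors and operations are hypothetical placeholders):
   ---
   cutorch.reserveStreams(2)
   cutorch.setStream(1); x:mul(2)       -- queue work on stream 1
   cutorch.setStream(2); y:mul(3)       -- queue work on stream 2
   cutorch.streamBarrier({1, 2})        -- neither stream proceeds past
                                        -- this point until both drain
   ---
*/
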
static int cutorch_getRNGState(lua_State *L)
{
  THByteTensor* t = THByteTensor_new();
  THCRandom_getRNGState(cutorch_getstate(L), t);
  luaT_pushudata(L, t, "torch.ByteTensor");
  return 1;
}

/* Usage:
   n = cutorch.getStream()
   Returns the current user stream for all devices in use (as previously
   set via cutorch.setStream(n)). 0 is the default stream on the device
   and is its initial value. */
static int cutorch_getStream(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  lua_pushnumber(L, THCState_getCurrentStreamIndex(state));
  return 1;
}

/* Usage:
   cutorch.setBlasHandle(n)
   For all devices, sets the current blasHandle in use to the index
   specified. e.g.,
   ---
   cutorch.setDevice(1)
   cutorch.setBlasHandle(3)
   -- device 1 blasHandle 3 in use here
   cutorch.setDevice(2)
   -- device 2 blasHandle 3 in use here
   ---
*/
static int cutorch_setBlasHandle(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  int handle = (int) luaL_checknumber(L, 1);
  THCState_setBlasHandleForCurrentDevice(state, handle);
  return 0;
}

/* Usage:
   cutorch.reserveBlasHandles(n)
   Allocates n blasHandles for every device present. If fewer than n
   blasHandles are currently allocated, enough additional ones are created
   to reach n. If n or more blasHandles are currently allocated, does
   nothing. Unlike for streams, there is no default blasHandle. */
static int cutorch_reserveBlasHandles(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  int numHandles = (int) luaL_checknumber(L, 1);
  THCState_reserveBlasHandles(state, numHandles);
  return 0;
}

/* Usage:
   cutorch.reserveStreams(n)
   Allocates n user streams for every device present. If fewer than n
   streams are currently allocated, enough additional ones are created to
   reach n. If n or more streams are currently allocated, does nothing.
   The default CUDA stream is assumed to be stream 0 and is always present;
   the allocated streams are user streams on top of the CUDA streams
   (thus, reserveStreams(1) creates 1 user stream, leaving two available
   on each device: the default stream 0 and the user stream 1). */
static int cutorch_reserveStreams(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  int numStreams = (int) luaL_checknumber(L, 1);
  THCState_reserveStreams(state, numStreams);
  return 0;
}

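/* Illustrative Lua-side sketch of reserving and selecting user streams
   (follows the semantics described above; not from the original source):
   ---
   cutorch.reserveStreams(2)   -- streams 0 (default), 1 and 2 now available
   cutorch.setStream(1)        -- subsequent kernels are queued on stream 1
   cutorch.setStream(0)        -- back to the default stream
   ---
*/
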
/* Usage:
   cutorch.setStream(n)
   For all devices, sets the current user stream in use to the index
   specified. e.g.,
   ---
   cutorch.setDevice(1)
   cutorch.setStream(3)
   -- device 1 stream 3 in use here
   cutorch.setDevice(2)
   -- device 2 stream 3 in use here
   ---
   0 is the default stream on the device. */
static int cutorch_setStream(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  int stream = (int) luaL_checknumber(L, 1);
  THCState_setStreamForCurrentDevice(state, stream);
  return 0;
}

static int cutorch_Event_waitOn(lua_State *L)
{
  cudaEvent_t *event = luaT_checkudata(L, 1, "cutorch.Event");
  THCState *state = cutorch_getstate(L);
  THCudaCheck(cudaStreamWaitEvent(THCState_getCurrentStream(state), *event, 0));
  return 0;
}

static int cutorch_Event_new(lua_State *L)
{
  cudaEvent_t *event = luaT_alloc(L, sizeof(cudaEvent_t));
  THCudaCheck(cudaEventCreate(event));

  THCState *state = cutorch_getstate(L);
  THCudaCheck(cudaEventRecord(*event, THCState_getCurrentStream(state)));
  luaT_pushudata(L, event, "cutorch.Event");

  return 1;
}

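/* Illustrative Lua-side sketch of the Event API (not from the original
   source; assumes the usual luaT binding of the two functions above as
   the cutorch.Event() constructor and the :waitOn() method; the tensor
   ops are hypothetical placeholders):
   ---
   cutorch.setStream(1); a:fill(1)   -- queue work on stream 1
   local e = cutorch.Event()         -- records an event on the current stream
   cutorch.setStream(2)
   e:waitOn()                        -- stream 2 waits for the event
   b:copy(a)                         -- now ordered after a:fill(1)
   ---
*/
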
static int cutorch_setHeapTracking(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  int enabled = luaT_checkboolean(L, 1);
  if(enabled) {
    THCSetGCHandler(state, luaCutorchGCFunction, L);
  } else {
    THCSetGCHandler(state, NULL, NULL);
  }
  return 0;
}

static int cutorch_setPeerToPeerAccess(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  int dev = (int) luaL_checknumber(L, 1) - 1;
  int devToAccess = (int) luaL_checknumber(L, 2) - 1;
  int enable = lua_toboolean(L, 3);

  /* device bounds checking is performed within */
  THCState_setPeerToPeerAccess(state, dev, devToAccess, enable);

  return 0;
}

static int cutorch_getPeerToPeerAccess(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  int dev = (int) luaL_checknumber(L, 1) - 1;
  int devToAccess = (int) luaL_checknumber(L, 2) - 1;

  /* device bounds checking is performed within */
  int enabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);
  lua_pushboolean(L, enabled);

  return 1;
}

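/* Illustrative Lua-side sketch of the peer-to-peer API (not from the
   original source). Device indices are 1-based on the Lua side:
   ---
   if cutorch.getPeerToPeerAccess(1, 2) then
     -- GPU 1 may access GPU 2's memory directly; turn it off if desired:
     cutorch.setPeerToPeerAccess(1, 2, false)
   end
   ---
*/
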
static int torch_Tensor_(storage)(lua_State *L)
{
  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
  if(tensor->storage)
  {
    THStorage_(retain)(cutorch_getstate(L), tensor->storage);
    luaT_pushudata(L, tensor->storage, torch_Storage);
  }
  else
    lua_pushnil(L);

  return 1;
}

static int cutorch_setDevice(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  int device = (int) luaL_checknumber(L, 1) - 1;
  THCudaCheck(cudaSetDevice(device));
  THCRandom_setGenerator(state, device);

  /* The stream is per device, so update the stream as well */
  THCState_setStream(state, device, THCState_getCurrentStreamIndex(state));
  THCState_setBlasHandle(state, device, THCState_getCurrentBlasHandleIndex(state));

  return 0;
}

/* Usage:
   cutorch.streamWaitForMultiDevice(gpuWaiter, streamWaiter,
     {[gpu1]={stream1_1, ..., stream1_N}, [gpuK]={streamK_1, ..., streamK_M}})
   with a specified GPU per list of streams.
   Stream (gpuWaiter, streamWaiter) will wait on all of the other streams
   (gpu1, stream1_1), ..., (gpu1, stream1_N), ...,
   (gpuK, streamK_1), ..., (gpuK, streamK_M) to complete fully, as a
   one-way barrier only (only streamWaiter is blocked).
   The streams to wait on are bucketed per device. Equivalent to
   streamWaitFor() if only one GPU's streams are listed. */
static int cutorch_streamWaitForMultiDevice(lua_State *L)
{
  THCState *state = cutorch_getstate(L);

  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  /* Validate the waiting (gpu, stream) pair; this will error out if invalid */
  int gpuWaiter = (int) luaL_checknumber(L, 1) - 1;
  int streamWaiter = (int) luaL_checknumber(L, 2);
  cudaStream_t streamWaiting =
    THCState_getDeviceStream(state, gpuWaiter, streamWaiter);

  /* Validate and count the set of {gpu={streams...}} we are waiting on */
  int gpus = 0;
  int streams = 0;
  checkAndCountListOfGPUStreamPairs(L, state, 3, &gpus, &streams);

  if (streams < 1) {
    /* nothing to synchronize */
    return 0;
  }

  /* Events can only be recorded on the same device on which they are created.
     - For each GPU, create and record an event per stream given for that GPU.
     - For (gpuWaiter, streamWaiter), wait on all of the above events. */
  cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);

  /* First, create an event per GPU and record events for the specified
     streams on that GPU */
  createMultiDeviceEvents(L, state, 3, events);

  /* Then, wait on the events */
  THCudaCheck(cudaSetDevice(gpuWaiter));
  for (int i = 0; i < streams; ++i) {
    THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0));
  }

  /* Clean up events */
  for (int i = 0; i < streams; ++i) {
    THCudaCheck(cudaEventDestroy(events[i]));
  }
  free(events);

  THCudaCheck(cudaSetDevice(prevDev));
  return 0;
}

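/* Illustrative Lua-side sketch of streamWaitForMultiDevice (argument shape
   as documented above; not from the original source):
   ---
   -- stream 2 on GPU 1 waits for streams 1 and 2 on GPU 2 and for
   -- stream 1 on GPU 3 to complete their pending work:
   cutorch.streamWaitForMultiDevice(1, 2, {[2]={1, 2}, [3]={1}})
   ---
*/
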
/* Usage:
   cutorch.streamSynchronize(n)
   For the current device, blocks the host until the given stream only has
   completed its pending work (cudaStreamSynchronize). 0 is the default
   stream on the device. */
static int cutorch_streamSynchronize(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  int streamId = (int) luaL_checknumber(L, 1);

  int curDev = -1;
  THCudaCheck(cudaGetDevice(&curDev));

  /* This also validates the stream */
  cudaStream_t stream = THCState_getDeviceStream(state, curDev, streamId);
  THCudaCheck(cudaStreamSynchronize(stream));

  return 0;
}

static int cutorch_Tensor_(copyAsyncCPU)(lua_State *L)
{
#define STRINGIFY_TENSOR(x) TH_CONCAT_STRING_3(torch.,x,Tensor)
  THCState *state = cutorch_getstate(L);
  THCTensor *tensor = luaT_checkudata(L, 1, STRINGIFY_TENSOR(CReal));
  void *src;
  if( (src = luaT_toudata(L, 2, STRINGIFY_TENSOR(CReal))) )
    THCTensor_(copy)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, STRINGIFY_TENSOR(Real))) )
    THCTensor_(copyAsyncCPU)(state, tensor, src);
  else
    luaL_typerror(L, 2, STRINGIFY_TENSOR(Real) " or " STRINGIFY_TENSOR(CReal));

  lua_settop(L, 1);
  return 1;
#undef STRINGIFY_TENSOR
}

static int cutorch_Storage_(copy)(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
  void *src;
  if( (src = luaT_toudata(L, 2, "torch.CudaByteStorage")) )
    THCStorage_(copyCudaByte)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaCharStorage")) )
    THCStorage_(copyCudaChar)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaShortStorage")) )
    THCStorage_(copyCudaShort)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaIntStorage")) )
    THCStorage_(copyCudaInt)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaLongStorage")) )
    THCStorage_(copyCudaLong)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaStorage")) )
    THCStorage_(copyCudaFloat)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaDoubleStorage")) )
    THCStorage_(copyCudaDouble)(state, storage, src);
#if CUDA_VERSION >= 7050
  else if( (src = luaT_toudata(L, 2, "torch.CudaHalfStorage")) )
    THCStorage_(copyCudaHalf)(state, storage, src);
#endif
  else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) )
    THCStorage_(copyByte)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) )
    THCStorage_(copyChar)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) )
    THCStorage_(copyShort)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) )
    THCStorage_(copyInt)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) )
    THCStorage_(copyLong)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) )
    THCStorage_(copyFloat)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) )
    THCStorage_(copyDouble)(state, storage, src);
  else
    luaL_typerror(L, 2, "torch.*Storage");

  lua_settop(L, 1);
  return 1;
}

/* Usage:
   cutorch.streamBarrierMultiDevice(
     {[gpu1]={stream1_1, ..., stream1_N}, [gpuK]={streamK_1, ..., streamK_M}})
   with a specified GPU per list of streams.
   Each stream (gpu1, stream1_1), ..., (gpu1, stream1_N), ...,
   (gpuK, streamK_1), ..., (gpuK, streamK_M) will wait for all others to
   complete fully. Streams are bucketed per device. Equivalent to
   streamBarrier() if only one GPU is specified. */
static int cutorch_streamBarrierMultiDevice(lua_State *L)
{
  THCState *state = cutorch_getstate(L);

  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  /* Validate and count the set of {gpu={streams...}} that are mutually waiting */
  int gpus = 0;
  int streams = 0;
  checkAndCountListOfGPUStreamPairs(L, state, 1, &gpus, &streams);

  if (streams < 2) {
    /* nothing to synchronize together */
    return 0;
  }

  /* Events can only be recorded on the same device on which they are created.
     - For each GPU, create an event, and record that event on each stream
       given for that GPU.
     - For each GPU, for each stream, wait on the event created by each
       other GPU. */
  cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);

  /* First, create an event per GPU and record events for the specified
     streams on that GPU */
  createMultiDeviceEvents(L, state, 1, events);

  /* Then, wait on the events. Each stream is actually waiting on itself
     here too, but that's harmless and isn't worth weeding out. */
  waitMultiDeviceEvents(L, state, 1, events, streams);

  /* Clean up events */
  for (int i = 0; i < streams; ++i) {
    THCudaCheck(cudaEventDestroy(events[i]));
  }
  free(events);

  THCudaCheck(cudaSetDevice(prevDev));
  return 0;
}

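/* Illustrative Lua-side sketch of streamBarrierMultiDevice (argument shape
   as documented above; not from the original source):
   ---
   -- streams 1 and 2 on GPU 1 and stream 1 on GPU 2 all wait for
   -- one another before any of them proceeds:
   cutorch.streamBarrierMultiDevice({[1]={1, 2}, [2]={1}})
   ---
*/
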
/* now we overwrite some methods specific to CudaTensor */
static int cutorch_Tensor_(copy)(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
  void *src;
  if( (src = luaT_toudata(L, 2, "torch.CudaTensor")) )
    THCTensor_(copyCudaFloat)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaByteTensor")) )
    THCTensor_(copyCudaByte)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaCharTensor")) )
    THCTensor_(copyCudaChar)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaShortTensor")) )
    THCTensor_(copyCudaShort)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaIntTensor")) )
    THCTensor_(copyCudaInt)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaLongTensor")) )
    THCTensor_(copyCudaLong)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaDoubleTensor")) )
    THCTensor_(copyCudaDouble)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.ByteTensor")) )
    THCTensor_(copyByte)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CharTensor")) )
    THCTensor_(copyChar)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.ShortTensor")) )
    THCTensor_(copyShort)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.IntTensor")) )
    THCTensor_(copyInt)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.LongTensor")) )
    THCTensor_(copyLong)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.FloatTensor")) )
    THCTensor_(copyFloat)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.DoubleTensor")) )
    THCTensor_(copyDouble)(state, tensor, src);
  else
    luaL_typerror(L, 2, "torch.*Tensor");

  lua_settop(L, 1);
  return 1;
}

#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Storage.c"
#else

static int torch_Storage_(new)(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  THStorage *storage;
  if(lua_type(L, 1) == LUA_TSTRING)
  {
    const char *fileName = luaL_checkstring(L, 1);
    int isShared = luaT_optboolean(L, 2, 0);
    long size = luaL_optlong(L, 3, 0);
    storage = THStorage_(newWithMapping)(state, fileName, size, isShared);
  }
  else if(lua_type(L, 1) == LUA_TTABLE)
  {
    long size = lua_objlen(L, 1);
    long i;
    storage = THStorage_(newWithSize)(state, size);
    for(i = 1; i <= size; i++)
    {
      lua_rawgeti(L, 1, i);
      if(!lua_isnumber(L, -1))
      {
        THStorage_(free)(state, storage);
        luaL_error(L, "element at index %d is not a number", i);
      }
      THStorage_(set)(state, storage, i-1, (real)lua_tonumber(L, -1));
      lua_pop(L, 1);
    }
  }
  else
  {
    /* fallback: allocate a storage of the given (possibly zero) size */
    long size = luaL_optlong(L, 1, 0);
    storage = THStorage_(newWithSize)(state, size);
  }
  luaT_pushudata(L, storage, torch_Storage);
  return 1;
}

static int cutorch_CudaTensor_getDevice(lua_State *L)
{
  THCudaTensor *tensor = luaT_checkudata(L, 1, "torch.CudaTensor");
  lua_pushinteger(L, THCudaTensor_getDevice(cutorch_getstate(L), tensor) + 1);
  return 1;
}

static int cutorch_setRNGState(lua_State *L)
{
  THByteTensor* t = luaT_checkudata(L, 1, "torch.ByteTensor");
  THCRandom_setRNGState(cutorch_getstate(L), t);
  return 0;
}

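/* Illustrative Lua-side sketch of saving and restoring the RNG state
   (not from the original source):
   ---
   local rng = cutorch.getRNGState()        -- snapshot as a ByteTensor
   local x = torch.CudaTensor(4):uniform()
   cutorch.setRNGState(rng)                 -- rewind the generator
   local y = torch.CudaTensor(4):uniform()  -- y repeats the same values as x
   ---
*/
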
static int cutorch_initialSeed(lua_State *L)
{
  unsigned long seed = THCRandom_initialSeed(cutorch_getstate(L));
  lua_pushnumber(L, seed);
  return 1;
}

static int cutorch_manualSeedAll(lua_State* L)
{
  unsigned long seed = luaL_checknumber(L, 1);
  THCRandom_manualSeedAll(cutorch_getstate(L), seed);
  return 0;
}

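/* Illustrative Lua-side sketch of seeding for reproducibility (not from
   the original source):
   ---
   cutorch.manualSeedAll(1234)     -- same seed for every device's generator
   print(cutorch.initialSeed())    -- seed of the current device's generator
   ---
*/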