Example #1
0
/*
   Usage:
   n = cutorch.getBlasHandle()
   Returns the current blasHandle for all devices in use (as previously
   set via cutorch.setBlasHandle(n).
*/
static int cutorch_getBlasHandle(lua_State *L)
{
  /* Push the currently-selected BLAS handle index onto the Lua stack. */
  lua_pushnumber(L, THCState_getCurrentBlasHandleIndex(cutorch_getstate(L)));
  return 1;
}
Example #2
0
/*
   Usage:
   cutorch.setDefaultStream()
   Equivalent to cutorch.setStream(0).
*/
static int cutorch_setDefaultStream(lua_State *L)
{
  /* Stream index 0 denotes the device's default stream. */
  THCState_setStreamForCurrentDevice(cutorch_getstate(L), 0);
  return 0;
}
Example #3
0
/*
   Usage:
   n = cutorch.getNumBlasHandles()
   Returns the number of user blasHandles allocated for every device present.
   By default, is 1.
*/
static int cutorch_getNumBlasHandles(lua_State *L)
{
  /* Push the per-device BLAS handle count onto the Lua stack. */
  lua_pushnumber(L, THCState_getNumBlasHandles(cutorch_getstate(L)));
  return 1;
}
Example #4
0
/* now we overwrite some methods specific to CudaTensor */
/*
   tensor:copy(src) for torch.CudaTensor destinations.
   Dispatches on the runtime type of arg 2 to the matching typed copy
   function; raises a Lua type error for unrecognized source types.
   Returns the destination tensor (arg 1) so the call can be chained.

   Fix: removed a second `torch.CudaTensor` branch (calling
   THCudaTensor_copyCuda) that appeared after the identical check at the
   top of the chain — it could never execute. Behavior is unchanged.
*/
static int cutorch_CudaTensor_copy(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  THCudaTensor *storage = luaT_checkudata(L, 1, "torch.CudaTensor");
  void *src;
  if( (src = luaT_toudata(L, 2, "torch.CudaTensor")) )
    THCudaTensor_copy(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.ByteTensor")) )
    THCudaTensor_copyByte(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CharTensor")) )
    THCudaTensor_copyChar(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.ShortTensor")) )
    THCudaTensor_copyShort(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.IntTensor")) )
    THCudaTensor_copyInt(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.LongTensor")) )
    THCudaTensor_copyLong(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.FloatTensor")) )
    THCudaTensor_copyFloat(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.DoubleTensor")) )
    THCudaTensor_copyDouble(state, storage, src);
  else
    luaL_typerror(L, 2, "torch.*Tensor");

  /* Pop everything but the destination tensor and return it. */
  lua_settop(L, 1);
  return 1;
}
Example #5
0
/*
   Usage:
   cutorch.streamWaitFor(waiterStream, {waitForStream1, ..., waitForStreamN})
   for streams on the current device. Creates a one-way barrier where
   waiterStream waits for waitForStream1-N to reach the current point.
*/
static int cutorch_streamWaitFor(lua_State *L)
{
  THCState *state = cutorch_getstate(L);

  /* All streams involved belong to the device current at call time. */
  int curDev = -1;
  THCudaCheck(cudaGetDevice(&curDev));

  /* Check that the waiting stream is in bounds; this will error out if not */
  int waitingId = (int) luaL_checknumber(L, 1);
  cudaStream_t streamWaiting =
    THCState_getDeviceStream(state, curDev, waitingId);

  /* Validate the streams that we are waiting on */
  int streams = checkAndCountListOfStreams(L, state, 2, curDev);

  if (streams < 1) {
    /* nothing to synchronize */
    return 0;
  }
  /* One-way dependency; streamWaiting will wait for the list of streams to
     wait on to complete execution of pending scheduled kernels/events */
  cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams);

  /* Record an event at the current point of each stream we wait on.
     Fix: loop over the count actually returned by createSingleDeviceEvents
     (as cutorch_streamBarrier does) rather than assuming it equals
     `streams`, so no uninitialized event slot can be referenced. */
  int eventsCreated = createSingleDeviceEvents(L, state, 2, curDev, events);

  /* Then, wait on them. Each event may be destroyed as soon as its wait is
     queued; the dependency stays in effect until it resolves. */
  for (int i = 0; i < eventsCreated; i++) {
    THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0));
    THCudaCheck(cudaEventDestroy(events[i]));
  }
  free(events);
  return 0;
}
Example #6
0
/*
   Usage:
   cutorch.streamBarrier({stream1, stream2, ..., streamN})
   applies to streams for the current device. Creates a N-way barrier
   to synchronize all of the streams given
*/
static int cutorch_streamBarrier(lua_State *L)
{
  THCState *state = cutorch_getstate(L);

  /* The barrier applies to the device current at call time. */
  int curDev = -1;
  THCudaCheck(cudaGetDevice(&curDev));

  /* Validates every stream id in the table at arg 1; returns how many. */
  int streams = checkAndCountListOfStreams(L, state, 1, curDev);

  if (streams < 2) {
    /* nothing to synchronize together */
    return 0;
  }
  /* Multi-way dependency (barrier); all streams must complete execution
     of pending scheduled kernels/events */
  /* NOTE(review): malloc result is not checked before use; consistent with
     the rest of this file, but an OOM here would crash. */
  cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams);
  /* First, create an event and record them for all streams */
  int eventsCreated =  createSingleDeviceEvents(L, state, 1, curDev, events);

  /* Then, wait on the event. Each stream is actually waiting on itself here
     too, but that's harmless and isn't worth weeding out. */
  waitSingleDeviceEvents(L, state, 1, curDev, events, eventsCreated);
  /* Events may be destroyed once the waits are queued; the dependencies
     remain in effect until they resolve. */
  for (int i = 0; i < eventsCreated; i++)
    THCudaCheck(cudaEventDestroy(events[i]));

  free(events);
  return 0;
}
Example #7
0
/* cutorch.getRNGState(): serialize the RNG state into a new ByteTensor. */
static int cutorch_getRNGState(lua_State *L)
{
  THByteTensor *rngState = THByteTensor_new();
  THCRandom_getRNGState(cutorch_getstate(L), rngState);
  /* Hand ownership of the freshly-created tensor to Lua. */
  luaT_pushudata(L, rngState, "torch.ByteTensor");
  return 1;
}
Example #8
0
/*
   Usage:
   n = cutorch.getStream()
   Returns the current user stream for all devices in use (as previously
   set via cutorch.setStream(n). 0 is the default stream on the device
   and is its initial value.
*/
static int cutorch_getStream(lua_State *L)
{
  /* Push the index of the currently-selected user stream. */
  lua_pushnumber(L, THCState_getCurrentStreamIndex(cutorch_getstate(L)));
  return 1;
}
Example #9
0
/*
   Usage:
   cutorch.setBlasHandle(n)
   For all devices, sets the current blasHandle in use to the index
   specified. e.g.,
   ---
   cutorch.setDevice(1)
   cutorch.setBlasHandle(3)
   -- device 1 blasHandle 3 in use here
   cutorch.setDevice(2)
   -- device 2 blasHandle 3 in use here
   ---
*/
static int cutorch_setBlasHandle(lua_State *L)
{
  /* Handle index comes straight from Lua; bounds checking happens in THC. */
  int handleIndex = (int) luaL_checknumber(L, 1);
  THCState_setBlasHandleForCurrentDevice(cutorch_getstate(L), handleIndex);
  return 0;
}
Example #10
0
/*
   Usage:
   cutorch.reserveBlasHandles(n)
   Allocates n blasHandles for every device present. If fewer than
   n blasHandles are currently allocated, an additional number will be added.
   If more than n blasHandles are currently allocated, does nothing.
   Unlike for streams, there is no default blasHandle.
*/
static int cutorch_reserveBlasHandles(lua_State *L)
{
  /* Grow-only reservation; shrinking is never performed. */
  int requested = (int) luaL_checknumber(L, 1);
  THCState_reserveBlasHandles(cutorch_getstate(L), requested);
  return 0;
}
Example #11
0
/*
   Usage:
   cutorch.reserveStreams(n)
   Allocates n user streams for every device present. If fewer than
   n streams are currently allocated, an additional number will be added.
   If more than n streams are currently allocated, does nothing.
   The default CUDA stream is assumed to be stream 0 and is always present;
   the allocated streams are user streams on top of the CUDA streams
   (thus, reserveStreams(1) will create 1 user stream with two being available,
   the default stream 0 and the user stream 1, on each device).
*/
static int cutorch_reserveStreams(lua_State *L)
{
  /* Grow-only reservation of user streams on every device. */
  int requested = (int) luaL_checknumber(L, 1);
  THCState_reserveStreams(cutorch_getstate(L), requested);
  return 0;
}
Example #12
0
/*
   Usage:
   cutorch.setStream(n)
   For all devices, sets the current user stream in use to the index
   specified. e.g.,
   ---
   cutorch.setDevice(1)
   cutorch.setStream(3)
   -- device 1 stream 3 in use here
   cutorch.setDevice(2)
   -- device 2 stream 3 in use here
   ---
   0 is the default stream on the device.
*/
static int cutorch_setStream(lua_State *L)
{
  /* Stream indices are 0-based; 0 selects the device's default stream. */
  int streamIndex = (int) luaL_checknumber(L, 1);
  THCState_setStreamForCurrentDevice(cutorch_getstate(L), streamIndex);
  return 0;
}
Example #13
0
/* event:waitOn(): make the current stream wait on this event (non-blocking
   for the host; the dependency is enqueued on the stream). */
static int cutorch_Event_waitOn(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  cudaEvent_t *event = luaT_checkudata(L, 1, "cutorch.Event");
  cudaStream_t current = THCState_getCurrentStream(state);
  THCudaCheck(cudaStreamWaitEvent(current, *event, 0));
  return 0;
}
Example #14
0
/* cutorch.Event(): create a CUDA event, record it on the current stream,
   and return it to Lua as a "cutorch.Event" userdata. */
static int cutorch_Event_new(lua_State *L)
{
  THCState *state = cutorch_getstate(L);

  cudaEvent_t *event = luaT_alloc(L, sizeof(cudaEvent_t));
  THCudaCheck(cudaEventCreate(event));
  /* Capture the current point of the current stream in the new event. */
  THCudaCheck(cudaEventRecord(*event, THCState_getCurrentStream(state)));

  luaT_pushudata(L, event, "cutorch.Event");
  return 1;
}
Example #15
0
/* cutorch.setHeapTracking(flag): install or clear the THC GC handler that
   lets THC trigger Lua garbage collection under memory pressure. */
static int cutorch_setHeapTracking(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  if (!luaT_checkboolean(L, 1)) {
    /* Disable: clear both the callback and its userdata. */
    THCSetGCHandler(state, NULL, NULL);
  } else {
    THCSetGCHandler(state, luaCutorchGCFunction, L);
  }
  return 0;
}
Example #16
0
/* cutorch.setPeerToPeerAccess(dev, devToAccess, enable): toggle P2P access
   from `dev` to `devToAccess` (both 1-indexed from Lua). */
static int cutorch_setPeerToPeerAccess(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  /* Convert 1-based Lua device indices to 0-based CUDA device ids. */
  int accessor   = (int) luaL_checknumber(L, 1) - 1;
  int accessed   = (int) luaL_checknumber(L, 2) - 1;
  int enableFlag = lua_toboolean(L, 3);

  /* device bounds checking is performed within */
  THCState_setPeerToPeerAccess(state, accessor, accessed, enableFlag);
  return 0;
}
Example #17
0
/* cutorch.getPeerToPeerAccess(dev, devToAccess): query whether P2P access
   from `dev` to `devToAccess` is enabled; returns a boolean. */
static int cutorch_getPeerToPeerAccess(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  /* Convert 1-based Lua device indices to 0-based CUDA device ids. */
  int accessor = (int) luaL_checknumber(L, 1) - 1;
  int accessed = (int) luaL_checknumber(L, 2) - 1;

  /* device bounds checking is performed within */
  lua_pushboolean(L, THCState_getPeerToPeerAccess(state, accessor, accessed));
  return 1;
}
Example #18
0
File: Tensor.c Project: d11/cutorch
/* tensor:storage(): return the tensor's underlying storage (retained so the
   Lua reference is counted), or nil if the tensor has none. */
static int torch_Tensor_(storage)(lua_State *L)
{
  THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
  if (tensor->storage == NULL) {
    lua_pushnil(L);
  } else {
    /* Retain before pushing so Lua holds its own reference. */
    THStorage_(retain)(cutorch_getstate(L), tensor->storage);
    luaT_pushudata(L, tensor->storage, torch_Storage);
  }
  return 1;
}
Example #19
0
/*
   Usage:
   cutorch.setDevice(n)
   Makes device n (1-indexed from Lua) current and switches the RNG
   generator, current stream, and current BLAS handle to that device's
   instances.
*/
static int cutorch_setDevice(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  /* Lua devices are 1-indexed; CUDA device ids are 0-indexed. */
  int device = (int)luaL_checknumber(L, 1)-1;
  THCudaCheck(cudaSetDevice(device));
  THCRandom_setGenerator(state, device);

  /* The stream is per device, so update the stream as well */
  THCState_setStream(state, device, THCState_getCurrentStreamIndex(state));
  THCState_setBlasHandle(state, device, THCState_getCurrentBlasHandleIndex(state));

  return 0;
}
Example #20
0
/*
   Usage:
   cutorch.streamWaitForMultiDevice(gpuWaiter, streamWaiter,
                                    {[gpu1]={stream1_1, ..., stream1_N},
                                    [gpuK]={streamK_1, ..., streamK_M}})
   with a specified GPU per each list of streams.
   Stream (gpuWaiter, streamWaiter) will wait on all of the other streams
   (gpu1, stream1_1), ..., (gpu1, stream1_N), ...,
   (gpuK, streamK_1), ..., (gpuK, streamK_M) to complete fully, as a one-way
   barrier only (only streamWaiter is blocked).
   The streams to wait on are bucketed per device. Equivalent to
   streamWaitFor() if only one GPU's streams are listed.
*/
static int cutorch_streamWaitForMultiDevice(lua_State *L)
{
  THCState *state = cutorch_getstate(L);

  /* Remember the caller's current device so it can be restored on exit. */
  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  /* Validate waiting (gpu, stream); this will error out if not */
  /* gpu index is 1-based from Lua; stream index is 0-based (0 = default). */
  int gpuWaiter = (int) luaL_checknumber(L, 1) - 1;
  int streamWaiter = (int) luaL_checknumber(L, 2);
  cudaStream_t streamWaiting =
    THCState_getDeviceStream(state, gpuWaiter, streamWaiter);

  /* Validate and count set of {gpu={streams...}} we are waiting on */
  int gpus = 0;
  int streams = 0;
  checkAndCountListOfGPUStreamPairs(L, state, 3, &gpus, &streams);

  if (streams < 1) {
    /* nothing to synchronize together */
    /* Device has not been switched yet, so no restore is needed here. */
    return 0;
  }

  /*
     Events can only be recorded on the same device on which they are created.
     -For each GPU, create and record event per each stream given
     for that GPU.
     -For (gpuWaiter, streamWaiter), wait on all of the above events.
  */
  cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);

  /* First, create an event per GPU and record events for the specified stream
     on that GPU */
  createMultiDeviceEvents(L, state, 3, events);

  /* Then, wait on the events */
  /* Waits must be issued with the waiter's device current. */
  THCudaCheck(cudaSetDevice(gpuWaiter));
  for (int i = 0; i < streams; ++i) {
    THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0));
  }

  /* Clean up events */
  /* Safe once the waits are queued; dependencies persist until resolved. */
  for (int i = 0; i < streams; ++i) {
    THCudaCheck(cudaEventDestroy(events[i]));
  }
  free(events);
  THCudaCheck(cudaSetDevice(prevDev));

  return 0;
}
Example #21
0
/*
   Usage:
   cutorch.streamSynchronize(n)
   For the current device, synchronizes with the given stream only
   (cudaStreamSynchronize).
   0 is the default stream on the device.
*/
static int cutorch_streamSynchronize(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  int streamId = (int) luaL_checknumber(L, 1);

  int device = -1;
  THCudaCheck(cudaGetDevice(&device));

  /* Looking up the stream also validates the index. */
  cudaStream_t stream = THCState_getDeviceStream(state, device, streamId);
  /* Block the host until all work queued on this stream has finished. */
  THCudaCheck(cudaStreamSynchronize(stream));
  return 0;
}
Example #22
0
/*
   tensor:copyAsync(src) for this CUDA tensor type.
   Accepts a tensor of the same CUDA type (copied via THCTensor_(copy)) or
   the matching CPU type (copied via THCTensor_(copyAsyncCPU), presumably
   asynchronous — see that function for the exact semantics). Any other
   source type raises a Lua type error. Returns the destination tensor.
*/
static int cutorch_Tensor_(copyAsyncCPU)(lua_State *L)
{
/* Expands to e.g. "torch.CudaTensor" / "torch.FloatTensor" for this type. */
#define STRINGIFY_TENSOR(x) TH_CONCAT_STRING_3(torch.,x,Tensor)
  THCState *state = cutorch_getstate(L);
  THCTensor *tensor = luaT_checkudata(L, 1, STRINGIFY_TENSOR(CReal));
  void *src;
  if( (src = luaT_toudata(L, 2, STRINGIFY_TENSOR(CReal))))
    THCTensor_(copy)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, STRINGIFY_TENSOR(Real))))
    THCTensor_(copyAsyncCPU)(state, tensor, src);
  else
    luaL_typerror(L, 2, STRINGIFY_TENSOR(Real) " or " STRINGIFY_TENSOR(CReal));

  /* Leave only the destination tensor on the stack and return it. */
  lua_settop(L, 1);
  return 1;
#undef STRINGIFY_TENSOR
}
Example #23
0
/*
   storage:copy(src) for this storage type.
   Dispatches on the runtime type of arg 2 — CUDA storage types first, then
   CPU storage types — to the matching typed copy function. Unrecognized
   source types raise a Lua type error. Returns the destination storage
   (arg 1) so the call can be chained.
*/
static int cutorch_Storage_(copy)(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  THCStorage *storage = luaT_checkudata(L, 1, torch_Storage);
  void *src;
  if( (src = luaT_toudata(L, 2, "torch.CudaByteStorage")) )
    THCStorage_(copyCudaByte)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaCharStorage")) )
    THCStorage_(copyCudaChar)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaShortStorage")) )
    THCStorage_(copyCudaShort)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaIntStorage")) )
    THCStorage_(copyCudaInt)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaLongStorage")) )
    THCStorage_(copyCudaLong)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaStorage")) )
    THCStorage_(copyCudaFloat)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaDoubleStorage")) )
    THCStorage_(copyCudaDouble)(state, storage, src);
/* Half-precision storage exists only on CUDA 7.5+. */
#if CUDA_VERSION >= 7050
  else if( (src = luaT_toudata(L, 2, "torch.CudaHalfStorage")) )
    THCStorage_(copyCudaHalf)(state, storage, src);
#endif

  else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) )
    THCStorage_(copyByte)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) )
    THCStorage_(copyChar)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) )
    THCStorage_(copyShort)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) )
    THCStorage_(copyInt)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) )
    THCStorage_(copyLong)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) )
    THCStorage_(copyFloat)(state, storage, src);
  else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) )
    THCStorage_(copyDouble)(state, storage, src);
  else
    luaL_typerror(L, 2, "torch.*Storage");

  /* Leave only the destination storage on the stack and return it. */
  lua_settop(L, 1);
  return 1;
}
Example #24
0
/* usage:
   cutorch.streamBarrierMultiDevice({[gpu1]={stream1_1, ..., stream1_N},
                                     [gpuK]={streamK_1, ..., streamK_M}})
   with a specified GPU per each list of streams.
   Each stream (gpu1, stream1_1), ..., (gpu1, stream1_N), ...,
               (gpuK, streamK_1), ..., (gpuK, streamK_M) will wait
   for all others to complete fully.
   Streams are bucketed per device. Equivalent to streamBarrier() if only
   one GPU is specified.
 */
static int cutorch_streamBarrierMultiDevice(lua_State *L)
{
  THCState *state = cutorch_getstate(L);

  /* Remember the caller's current device so it can be restored on exit. */
  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  /* Validate and count set of {gpu={streams...}} that are mutually waiting */
  int gpus = 0;
  int streams = 0;
  checkAndCountListOfGPUStreamPairs(L, state, 1, &gpus, &streams);

  if (streams < 2) {
    /* nothing to synchronize together */
    /* Device has not been switched yet, so no restore is needed here. */
    return 0;
  }

  /*
     Events can only be recorded on the same device on which they are created.
     -For each GPU, create an event, and record that event on each stream given
     for that GPU.
     -For each GPU, for each stream, wait on the event created by each other
     GPU.
  */
  cudaEvent_t* events = (cudaEvent_t*) malloc(sizeof(cudaEvent_t) * streams);

  /* First, create an event per GPU and record events for the specified stream
     on that GPU */
  createMultiDeviceEvents(L, state, 1, events);

  /* Then, wait on the events. Each stream is actually waiting on itself here
     too, but that's harmless and isn't worth weeding out. */
  waitMultiDeviceEvents(L, state, 1, events, streams);

  /* Clean up events */
  /* Safe once the waits are queued; dependencies persist until resolved. */
  for (int i = 0; i < streams; ++i) {
    THCudaCheck(cudaEventDestroy(events[i]));
  }
  free(events);
  THCudaCheck(cudaSetDevice(prevDev));

  return 0;
}
Example #25
0
/* now we overwrite some methods specific to CudaTensor */
/*
   tensor:copy(src) for this CUDA tensor type.
   Dispatches on the runtime type of arg 2 — CUDA tensor types first
   (note: "torch.CudaTensor" maps to the CudaFloat copy), then CPU tensor
   types. Unrecognized source types raise a Lua type error. Returns the
   destination tensor (arg 1) so the call can be chained.
*/
static int cutorch_Tensor_(copy)(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
  void *src;
  if( (src = luaT_toudata(L, 2, "torch.CudaTensor")) )
    THCTensor_(copyCudaFloat)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaByteTensor")) )
    THCTensor_(copyCudaByte)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaCharTensor")) )
    THCTensor_(copyCudaChar)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaShortTensor")) )
    THCTensor_(copyCudaShort)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaIntTensor")) )
    THCTensor_(copyCudaInt)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaLongTensor")) )
    THCTensor_(copyCudaLong)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CudaDoubleTensor")) )
    THCTensor_(copyCudaDouble)(state, tensor, src);

  else if( (src = luaT_toudata(L, 2, "torch.ByteTensor")) )
    THCTensor_(copyByte)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CharTensor")) )
    THCTensor_(copyChar)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.ShortTensor")) )
    THCTensor_(copyShort)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.IntTensor")) )
    THCTensor_(copyInt)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.LongTensor")) )
    THCTensor_(copyLong)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.FloatTensor")) )
    THCTensor_(copyFloat)(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.DoubleTensor")) )
    THCTensor_(copyDouble)(state, tensor, src);
  else
    luaL_typerror(L, 2, "torch.*Tensor");

  /* Leave only the destination tensor on the stack and return it. */
  lua_settop(L, 1);
  return 1;
}
Example #26
0
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/Storage.c"
#else

static int torch_Storage_(new)(lua_State *L)
{
    THCState *state = cutorch_getstate(L);
    THStorage *storage;
    if(lua_type(L, 1) == LUA_TSTRING)
    {
        const char *fileName = luaL_checkstring(L, 1);
        int isShared = luaT_optboolean(L, 2, 0);
        long size = luaL_optlong(L, 3, 0);
        storage = THStorage_(newWithMapping)(state, fileName, size, isShared);
    }
    else if(lua_type(L, 1) == LUA_TTABLE)
    {
        long size = lua_objlen(L, 1);
        long i;
        storage = THStorage_(newWithSize)(state, size);
        for(i = 1; i <= size; i++)
        {
            lua_rawgeti(L, 1, i);
            if(!lua_isnumber(L, -1))
            {
                THStorage_(free)(state, storage);
                luaL_error(L, "element at index %d is not a number", i);
            }
            THStorage_(set)(state, storage, i-1, (real)lua_tonumber(L, -1));
            lua_pop(L, 1);
        }
Example #27
0
/* tensor:getDevice(): return the 1-indexed device holding this tensor. */
static int cutorch_CudaTensor_getDevice(lua_State *L) {
  THCudaTensor *self = luaT_checkudata(L, 1, "torch.CudaTensor");
  /* THC reports 0-based device ids; Lua uses 1-based. */
  int device = THCudaTensor_getDevice(cutorch_getstate(L), self);
  lua_pushinteger(L, device + 1);
  return 1;
}
Example #28
0
/* cutorch.setRNGState(t): restore RNG state from a ByteTensor previously
   produced by cutorch.getRNGState(). */
static int cutorch_setRNGState(lua_State *L)
{
  THByteTensor *rngState = luaT_checkudata(L, 1, "torch.ByteTensor");
  THCRandom_setRNGState(cutorch_getstate(L), rngState);
  return 0;
}
Example #29
0
/* cutorch.initialSeed(): return the seed the RNG was initialized with. */
static int cutorch_initialSeed(lua_State *L)
{
  lua_pushnumber(L, THCRandom_initialSeed(cutorch_getstate(L)));
  return 1;
}
Example #30
0
/* cutorch.manualSeedAll(seed): seed the RNG of every device with `seed`. */
static int cutorch_manualSeedAll(lua_State* L)
{
  unsigned long seed = (unsigned long) luaL_checknumber(L, 1);
  THCRandom_manualSeedAll(cutorch_getstate(L), seed);
  return 0;
}