Пример #1
0
TH_API void THLab_(syev)(THTensor *a_, THTensor *w_, const char *jobz, const char *uplo)
{
  int n, lda, lwork, info;
  THTensor *A;
  THTensor *work;
  real wkopt;

  THArgCheck(a_->nDimension == 2, 2, "A should be 2 dimensional");
  A = THTensor_(newContiguous)(a_);
  n = A->size[1];
  lda = n;
  THTensor_(resize1d)(w_,n);
  // get optimal workspace size
  THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(A), lda,
		  THTensor_(data)(w_), &wkopt, -1, &info);
  lwork = (int)wkopt;
  work = THTensor_(newWithSize1d)(lwork);
  THLapack_(syev)(jobz[0], uplo[0], n, THTensor_(data)(A), lda,
		  THTensor_(data)(w_), THTensor_(data)(work), lwork, &info);

  if (info > 0)
  {
    THError(" Lapack syev : Failed to converge. %d off-diagonal elements of an didn't converge to zero",info);
  }
  else if (info < 0)
  {
    THError("Lapack syev : Argument %d : illegal value", -info);
  }
  THTensor_(free)(A);
  THTensor_(free)(work);
}
Пример #2
0
void THFloatTensor_addr(THFloatTensor *r_, float beta, THFloatTensor *t, float alpha, THFloatTensor *vec1, THFloatTensor *vec2)
{
	if( (vec1->nDimension != 1) || (vec2->nDimension != 1) )
		THError("vector and vector expected, got %dD, %dD tensors", vec1->nDimension, vec2->nDimension);

	if(t->nDimension != 2)
		THError("expected matrix, got %dD tensor for t", t->nDimension);

	if( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) )
		THError("size mismatch, t: %ld, vec1: %ld, t: %ld, vec2: %ld", t->size[0], vec1->size[0], t->size[1], vec2->size[0]);

	if(r_ != t)
		THError("r_ != t not implemented");

	if(beta != 1)
		THFloatTensor_mul(r_, r_, beta);

  if(r_->stride[0] == 1)
  {
    THBlas_ger(vec1->size[0], vec2->size[0],
                 alpha, THFloatTensor_data(vec1), vec1->stride[0],
                 THFloatTensor_data(vec2), vec2->stride[0],
                 THFloatTensor_data(r_), r_->stride[1]);
  }
  else if(r_->stride[1] == 1)
  {
    THBlas_ger(vec2->size[0], vec1->size[0],
                 alpha, THFloatTensor_data(vec2), vec2->stride[0],
                 THFloatTensor_data(vec1), vec1->stride[0],
                 THFloatTensor_data(r_), r_->stride[0]);
  }
  else THError("addr for non-contiguous not implemented");
}
Пример #3
0
void* THRealloc(void *ptr, ptrdiff_t size)
{
  if(!ptr)
    return(THAlloc(size));

  if(size == 0)
  {
    THFree(ptr);
    return NULL;
  }

  if(size < 0)
    THError("$ Torch: invalid memory size -- maybe an overflow?");

  ptrdiff_t oldSize = -getAllocSize(ptr);
  void *newptr = realloc(ptr, size);

  if(!newptr && torchGCFunction) {
    torchGCFunction(torchGCData);
    newptr = realloc(ptr, size);
  }

  if(!newptr)
    THError("$ Torch: not enough memory: you tried to reallocate %dGB. Buy new RAM!", size/1073741824);

  // update heapSize only after successfully reallocated
  THHeapUpdate(oldSize + getAllocSize(newptr));

  return newptr;
}
Пример #4
0
Файл: init.c Проект: 0wu/cutorch
int checkAndCountListOfStreams(lua_State *L, THCState *state, int arg,
                               int device)
{
  if (!lua_istable(L, arg)) {
    THError("expecting table of device streams");
  }

  /* Push table to top */
  lua_pushvalue(L, arg);

  /* Check that all values in the table are numeric and in bounds */
  int streams = 0;
  lua_pushnil(L);
  while (lua_next(L, -2)) {
    if (!lua_isnumber(L, -1)) {
      THError("streamWaitFor: list of streams must be numeric");
    }
    int streamId = (int) lua_tonumber(L, -1);

    /* This will error out if the stream is not in bounds */
    THCState_getDeviceStream(state, device, streamId);

    ++streams;
    lua_pop(L, 1);
  }

  /* Pop table from top */
  lua_pop(L, 1);
  return streams;
}
Пример #5
0
void THNN_(ClassNLLCriterion_updateOutput)(THNNState *state, THTensor *input,
                                           THIndexTensor *target,
                                           THTensor *output, bool sizeAverage,
                                           THTensor *weights,
                                           THTensor *total_weight)
{
  int n_dims = THTensor_(nDimension)(input);
  int n_classes = THTensor_(size)(input, n_dims - 1);

  if (THIndexTensor_(nDimension)(target) > 1) {
    THError("multi-target not supported");
  }
  if (THTensor_(nDimension)(input) > 2) {
    THError("input tensor should be 1D or 2D");
  }

  input = THTensor_(newContiguous)(input);
  target = THIndexTensor_(newContiguous)(target);
  weights = weights ? THTensor_(newContiguous)(weights) : NULL;

  real *input_data = THTensor_(data)(input);
  THIndex_t *target_data = THIndexTensor_(data)(target);
  real *weights_data = weights ? THTensor_(data)(weights) : NULL;
  real *output_data = THTensor_(data)(output);
  real *total_weight_data = THTensor_(data)(total_weight);

  output_data[0] = total_weight_data[0] = 0.0;

  if (THTensor_(nDimension)(input) == 1) {
    int cur_target = target_data[0] - 1;
    THAssert(cur_target >= 0 && cur_target < n_classes);
    total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f;
    output_data[0] = -input_data[cur_target] * total_weight_data[0];
  } else if (THTensor_(nDimension)(input) == 2) {
    int batch_size = THTensor_(size)(input, 0);
    THAssert(THIndexTensor_(size)(target, 0) == batch_size);

    int n_target = THTensor_(size)(input, 1);

    int i;
    for (i = 0; i < batch_size; i++) {
      int cur_target = target_data[i] - 1;
      THAssert(cur_target >= 0 && cur_target < n_classes);

      real cur_weight = weights ? weights_data[cur_target] : 1.0f;
      total_weight_data[0] += cur_weight;
      output_data[0] -= input_data[i * n_target + cur_target] * cur_weight;
    }
  }

  if (sizeAverage && total_weight_data[0]) {
    output_data[0] /= total_weight_data[0];
  }

  if (weights) {
    THTensor_(free)(weights);
  }
  THTensor_(free)(input);
  THIndexTensor_(free)(target);
}
Пример #6
0
static void THMapAllocator_free(void* ctx_, void* data) {
  THMapAllocatorContext *ctx = ctx_;

#ifdef _WIN32
  if(UnmapViewOfFile(data) == 0)
    THError("could not unmap the shared memory file");
#else /* _WIN32 */
  if (ctx->flags & TH_ALLOCATOR_MAPPED_KEEPFD) {
    if (close(ctx->fd) == -1)
      THError("could not close file descriptor %d", ctx->fd);
  }

  if (munmap(data, ctx->size))
    THError("could not unmap the shared memory file");

  if (!(ctx->flags & (TH_ALLOCATOR_MAPPED_FROMFD | TH_ALLOCATOR_MAPPED_UNLINK)))
  {
    if (ctx->flags & TH_ALLOCATOR_MAPPED_SHAREDMEM)
    {
#ifdef HAVE_SHM_UNLINK
      if (shm_unlink(ctx->filename) == -1)
        THError("could not unlink the shared memory file %s", ctx->filename);
#else
      THError("could not unlink the shared memory file %s, shm_unlink not available on platform", ctx->filename);
#endif
    }
  }
#endif /* _WIN32 */

  THMapAllocatorContext_free(ctx);
}
Пример #7
0
void* THRealloc(void *ptr, long size)
{
  if(!ptr)
    return(THAlloc(size));
  
  if(size == 0)
  {
    THFree(ptr);
    return NULL;
  }

  if(size < 0)
    THError("$ Torch: invalid memory size -- maybe an overflow?");

  THHeapUpdate(-getAllocSize(ptr));
  void *newptr = realloc(ptr, size);

  if(!newptr && torchGCFunction) {
    torchGCFunction(torchGCData);
    newptr = realloc(ptr, size);
  }
  THHeapUpdate(getAllocSize(newptr ? newptr : ptr));

  if(!newptr)
    THError("$ Torch: not enough memory: you tried to reallocate %dGB. Buy new RAM!", size/1073741824);

  return newptr;
}
Пример #8
0
void THNN_(LookupTable_renorm)(
          THNNState *state,
          THIndexTensor *idx,
          THTensor *weight,
          real maxNorm,
          real normType)
{
  if (!THTensor_(isContiguous)(weight))
    THError("weight must be contiguous");
  if (!THIndexTensor_(isContiguous)(idx))
    THError("input must be contiguous");
  if (THIndexTensor_(nDimension)(idx) != 1)
    THError("idx must be a vector");
  if (normType <= 0)
    THError("non-positive-norm not supported");

  long i;
  THIndex_t *row_idx = THIndexTensor_(data)(idx);
  long numel = THIndexTensor_(nElement)(idx);

  long numw = THTensor_(size)(weight, 0);
  long stride = THTensor_(stride)(weight, 0);
  real *gw = THTensor_(data)(weight);
  for (i=0; i<numel; i++)
    if (row_idx[i] < 1 || row_idx[i] > numw)
      THError("input out of range");
  // get unique indices
  qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex));
  long ptr = 0;
  for (i=0; i<numel; i++)
    if (i == 0 || row_idx[i] != row_idx[i-1])
      row_idx[ptr++] = row_idx[i];
  numel = ptr;

#ifdef _OPENMP
  if (numel > 1000)
  {
    // The strategy is to parallelize over the rows that appear in
    // row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads].
    // This distributes the work evenly to each thread.
    #pragma omp parallel for private(i)
    for (i=0; i<numel; i++)
    {
      long k = row_idx[i] - 1;
      THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
    }
    return;
  }
#endif
  for (i=0; i<numel; i++)
  {
    long k = row_idx[i] - 1;
    THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
  }
}
Пример #9
0
TH_API void THLab_(gesvd)(THTensor *a_, THTensor *s_, THTensor *u_, THTensor *vt_, char jobu)
{
  int k,m, n, lda, ldu, ldvt, lwork, info;
  THTensor *A, *work;
  real wkopt;
  char jobvt = jobu;

  THArgCheck(a_->nDimension == 2, 2, "A should be 2 dimensional");
  THArgCheck(jobu == 'A' || jobu == 'S',4, "jobu can be 'A' or 'S'");
  A = THTensor_(newContiguous)(a_);
  m = A->size[1];
  n = A->size[0];
  k = (m < n ? m : n);

  lda = m;
  ldu = m;
  ldvt = n;
  THTensor_(resize1d)(s_,k);
  THTensor_(resize2d)(vt_,n,ldvt);
  if (jobu == 'A')
  {
    THTensor_(resize2d)(u_,m,ldu);
  }
  else
  {
    THTensor_(resize2d)(u_,k,ldu);
  }
  THLapack_(gesvd)(jobu,jobvt,
		   m,n,THTensor_(data)(A),lda,
		   THTensor_(data)(s_),
		   THTensor_(data)(u_),
		   ldu,
		   THTensor_(data)(vt_), ldvt,
		   &wkopt, -1, &info);
  lwork = (int)wkopt;
  work = THTensor_(newWithSize1d)(lwork);
  THLapack_(gesvd)(jobu,jobvt,
		   m,n,THTensor_(data)(A),lda,
		   THTensor_(data)(s_),
		   THTensor_(data)(u_),
		   ldu,
		   THTensor_(data)(vt_), ldvt,
		   THTensor_(data)(work),lwork, &info);
  if (info > 0)
  {
    THError(" Lapack gesvd : %d superdiagonals failed to converge.",info);
  }
  else if (info < 0)
  {
    THError("Lapack gesvd : Argument %d : illegal value", -info);
  }
  THTensor_(free)(A);
  THTensor_(free)(work);
}
Пример #10
0
static ptrdiff_t applyHeapDelta() {
  ptrdiff_t oldHeapSize = THAtomicAddPtrdiff(&heapSize, heapDelta);
#ifdef DEBUG
  if (heapDelta > 0 && oldHeapSize > PTRDIFF_MAX - heapDelta)
    THError("applyHeapDelta: heapSize(%td) + increased(%td) > PTRDIFF_MAX, heapSize overflow!", oldHeapSize, heapDelta);
  if (heapDelta < 0 && oldHeapSize < PTRDIFF_MIN - heapDelta)
    THError("applyHeapDelta: heapSize(%td) + decreased(%td) < PTRDIFF_MIN, heapSize underflow!", oldHeapSize, heapDelta);
#endif
  ptrdiff_t newHeapSize = oldHeapSize + heapDelta;
  heapDelta = 0;
  return newHeapSize;
}
Пример #11
0
int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
{
  if (dev < 0 || dev >= state->numDevices) {
    THError("%d is not a device", dev);
  }

  if (devToAccess < 0 || dev >= state->numDevices) {
    THError("%d is not a device", devToAccess);
  }

  return state->p2pAccessEnabled[dev][devToAccess];
}
Пример #12
0
int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
{
  int numDevices = 0;
  THCudaCheck(cudaGetDeviceCount(&numDevices));
  if (dev < 0 || dev >= numDevices) {
    THError("%d is not a device", dev);
  }

  if (devToAccess < 0 || dev >= numDevices) {
    THError("%d is not a device", devToAccess);
  }

  return state->p2pAccessEnabled[dev][devToAccess];
}
Пример #13
0
cublasHandle_t THCState_getCurrentBlasHandle(THCState *state)
{
  /* This is called at the point of kernel execution.
     For some debugging code or improperly instrumented kernels,
     `state` is null */
  if (state) {
    if (state->currentBlasHandle <= 0) {
      THError("%d is not a valid handle, valid range is: (1, %d)",
              state->currentBlasHandle, state->numUserBlasHandles);
    }
    return state->currentBlasHandle;
  }
  THError("THCState and blasHandles must be set as there is no default blasHandle");
  return NULL;
}
Пример #14
0
TH_API void THLab_(gesv)(THTensor *a_, THTensor *b_)
{
  int n, nrhs, lda, ldb, info;
  THIntTensor *ipiv;
  THTensor *A, *B;
  
  THArgCheck(a_->nDimension == 2, 2, "A should be 2 dimensional");
  THArgCheck(a_->size[0] == a_->size[1], 2, "A should be symmetric");

  n = (int)a_->size[1];
  lda = n;
  ldb = n;
  if (b_->nDimension == 1)
  {
    nrhs = 1;
    THArgCheck(n == b_->size[0], 1, "size incompatible A,b");
  }
  else
  {
    nrhs = b_->size[0];
    THArgCheck(n == b_->size[1], 1, "size incompatible A,b");
  }

  A = THTensor_(newContiguous)(a_);
  B = THTensor_(newContiguous)(b_);
  ipiv = THIntTensor_newWithSize1d((long)n);
  THLapack_(gesv)(n, nrhs, 
		  THTensor_(data)(A), lda, THIntTensor_data(ipiv),
		  THTensor_(data)(B), ldb, &info);

  if(!THTensor_(isContiguous)(b_))
  {
    THTensor_(copy)(b_,B);
  }

  if (info < 0)
  {
    THError("Lapack gesv : Argument %d : illegal value", -info);
  }
  else if (info > 0)
  {
    THError("Lapack gesv : U(%d,%d) is zero, singular U.", info,info);
  }

  THIntTensor_free(ipiv);
  THTensor_(free)(A);
  THTensor_(free)(B);
}
Пример #15
0
static void THCudaTensor_rawSet(THCState *state, THCudaTensor *self, THCudaStorage *storage, long storageOffset, int nDimension, long *size, long *stride)
{
  THAssert(self->storage != NULL);
  /* storage */
  if(self->storage != storage)
  {
    if(self->storage)
      THCudaStorage_free(state, self->storage);

    if(storage)
    {
      self->storage = storage;
      THCudaStorage_retain(state, self->storage);
    }
    else
      self->storage = THCudaStorage_new(state);
  }

  /* storageOffset */
  if(storageOffset < 0)
    THError("Tensor: invalid storage offset");
  self->storageOffset = storageOffset;

  /* size and stride */
  THCudaTensor_rawResize(state, self, nDimension, size, stride);
}
Пример #16
0
static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(scalar_t *gradInput_p, scalar_t *gradOutput_p,
                                                         THIndex_t *ind_p,
                                                         int nslices,
                                                         int iwidth, int iheight,
                                                         int owidth, int oheight)
{
  at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) {
    for (auto k = start; k < end; k++)
    {
      scalar_t *gradInput_p_k = gradInput_p + k*iwidth*iheight;
      scalar_t *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
      THIndex_t *ind_p_k = ind_p + k*iwidth*iheight;

      int i, j;
      THIndex_t maxp;
      for(i = 0; i < iheight; i++)
      {
        for(j = 0; j < iwidth; j++)
        {
          maxp = ind_p_k[i*iwidth + j]; /* retrieve position of max */
          if(maxp < 0 || maxp >= owidth * oheight) {
              THError("invalid max index %ld, owidth= %d, oheight= %d", maxp, owidth, oheight);
          }
          gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */
        }
      }
    }
  });
}
THFloatTensor *nn_SpatialConvolutionMM_updateOutput(struct module *module, THFloatTensor *input)
{
	int kW = module->SpatialConvolution.kW;
	int kH = module->SpatialConvolution.kH;
	int dW = module->SpatialConvolution.dW;
	int dH = module->SpatialConvolution.dH;
	int padW = module->SpatialConvolution.padW;
	int padH = module->SpatialConvolution.padH;

	THFloatTensor *finput = module->SpatialConvolution.finput;
	THFloatTensor *weight = module->SpatialConvolution.weight;
	THFloatTensor *bias   = module->SpatialConvolution.bias;
	THFloatTensor *output = module->output;

	int batch = 1;
	if (input->nDimension == 3) {
		batch = 0;
		THFloatTensor_resize4d(input, 1, input->size[0], input->size[1], input->size[2]);
	}

	long batchSize = input->size[0];
	long nInputPlane  = module->SpatialConvolution.nInputPlane;
	long nOutputPlane = module->SpatialConvolution.nOutputPlane;
	long inputWidth   = input->size[3];
	long inputHeight  = input->size[2];
	long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
	long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;


	if (outputWidth < 1 || outputHeight < 1)
		THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
		nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);

	THFloatTensor_resize3d(finput, batchSize, kW*kH*nInputPlane, outputHeight*outputWidth);
	THFloatTensor_resize4d(output, batchSize, nOutputPlane, outputHeight, outputWidth);

	long t;
#pragma omp parallel for if(batchSize >= 4) private(t)
	for (t = 0; t < batchSize; t++) {
		THFloatTensor *input_t = THFloatTensor_newSelect(input, 0, t);
		THFloatTensor *output_t = THFloatTensor_newSelect(output, 0, t);
		THFloatTensor *finput_t = THFloatTensor_newSelect(finput, 0, t);

		nn_SpatialConvolutionMM_updateOutput_frame(input_t, output_t, weight, bias, finput_t,
			kW, kH, dW, dH, padW, padH,
			nInputPlane, inputWidth, inputHeight,
			nOutputPlane, outputWidth, outputHeight);

		THFloatTensor_free(input_t);
		THFloatTensor_free(output_t);
		THFloatTensor_free(finput_t);
	}

	if (batch == 0) {
		THFloatTensor_resize3d(output, nOutputPlane, outputHeight, outputWidth);
		THFloatTensor_resize3d(input, nInputPlane, inputHeight, inputWidth);
	}

	return output;
}
Пример #18
0
static inline void THNN_(VolumetricFullConvolution_shapeCheck)(
                         THTensor *input, THTensor *gradOutput,
                         THTensor *weight, THTensor *bias,
                         int dT, int dW, int dH, int pT, int pW, int pH,
                         int aT, int aW, int aH) {
  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
                "4D or 5D (batch mode) tensor expected for input, but got: %s");
  // number of input & output planes and kernel size is indirectly defined by the weight tensor
  THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
                "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
                "expected for weight, but got: %s");
  THArgCheck(dT > 0 && dW > 0 && dH > 0, 11,
             "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
  THArgCheck(aT < dT && aW < dW && aH < dH, 15,
             "output adjustment must be smaller than stride, but got "
             "adjT: %d adjH: %d adjW: %d dT: %d dH: %d dW: %d",
             aT, aH, aW, dT, dH, dW);

  int ndim = input->nDimension;
  const int nInputPlane  = (int)weight->size[0];
  const int nOutputPlane = (int)weight->size[1];
  const int kT           = (int)weight->size[2];
  const int kH           = (int)weight->size[3];
  const int kW           = (int)weight->size[4];

  if (bias != NULL) {
    THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]);
  }

  int dimf = 0;
  int dimd = 1;
  int dimh = 2;
  int dimw = 3;

  if (ndim == 5) {
    dimf++;
    dimd++;
    dimh++;
    dimw++;
  }

  const long inputWidth   = input->size[dimw];
  const long inputHeight  = input->size[dimh];
  const long inputDepth   = input->size[dimd];
  const long outputWidth  = (inputWidth  - 1) * dW - 2*pW + kW + aW;
  const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
  const long outputDepth  = (inputDepth  - 1) * dT - 2*pT + kT + aT;

  if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
    THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
            nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);

  THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
  if (gradOutput != NULL) {
    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth);
    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
  }
}
Пример #19
0
static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p,
                                                         THIndex_t *ind_p,
                                                         long nslices,
                                                         long iwidth, long iheight,
                                                         long owidth, long oheight)
{
  long k;
#pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
    real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
    THIndex_t *ind_p_k = ind_p + k*iwidth*iheight;

    long i, j, maxp;
    for(i = 0; i < iheight; i++)
    {
      for(j = 0; j < iwidth; j++)
      {
        maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */
        if(maxp<0 || maxp>=owidth*oheight){
            THError("invalid max index %d, owidth= %d, oheight= %d",maxp,owidth,oheight);
        }
        gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */
      }
    }
  }
}
Пример #20
0
void THCudaGetGridSize(int *nBlockPerColumn_, int *nBlockPerRow_, int *nThreadPerBlock_, long size)
{
  const int nThreadPerBlock = 256;
  long nBlockPerGrid = size / nThreadPerBlock;
  long nBlockPerColumn = 0L;
  long nBlockPerRow = 0L;

  if(size % nThreadPerBlock)
    nBlockPerGrid++;

  if(nBlockPerGrid <= 65535)
  {
    nBlockPerRow = nBlockPerGrid;
    nBlockPerColumn = 1;
  }
  else if(nBlockPerGrid <= (65355L * 65355L))
  {
    unsigned int uiSqrt = (unsigned int)(sqrt((float)nBlockPerGrid));
    nBlockPerRow = uiSqrt;
    nBlockPerColumn = uiSqrt;
    while((nBlockPerRow * nBlockPerColumn) < nBlockPerGrid)
      nBlockPerRow++;
  }
  else
    THError("too large vector for Cuda, sorry");

  *nBlockPerColumn_ = (int)nBlockPerColumn;
  *nBlockPerRow_ = (int)nBlockPerRow;
  *nThreadPerBlock_ = (int)nThreadPerBlock;
}
Пример #21
0
cudaStream_t THCState_getDeviceStream(THCState *state, int device, int stream)
{
  /* `device` is a CUDA index */
  if (device >= state->numDevices || device < 0)
  {
    THError("%d is not a device", device + 1 /* back to Torch index */);
  }

  /* Stream 0 is the default stream, 1 ... `numUserStreams` are Torch streams */
  if (stream > state->numUserStreams || stream < 0)
  {
    THError("%d is not a stream", stream);
  }

  return state->streamsPerDevice[device][stream];
}
Пример #22
0
THMapAllocatorContext *THMapAllocatorContext_new(const char *filename, int flags)
{
  THMapAllocatorContext *ctx = THAlloc(sizeof(THMapAllocatorContext));

  if (!(flags & TH_ALLOCATOR_MAPPED_SHARED) && !(flags & TH_ALLOCATOR_MAPPED_SHAREDMEM))
    flags &= ~TH_ALLOCATOR_MAPPED_NOCREATE;
  if ((flags ^ TH_ALLOCATOR_MAPPED_EXCLUSIVE) == 0)
    THError("TH_ALLOCATOR_MAPPED_EXCLUSIVE flag requires opening the file "
        "in shared mode");

  if (filename) {
    ctx->filename = THAlloc(strlen(filename)+1);
    strcpy(ctx->filename, filename);
  } else {
    ctx->filename = unknown_filename;
  }
  ctx->flags = flags;
  ctx->size = 0;
#ifdef _WIN32
  ctx->handle = INVALID_HANDLE_VALUE;
#else
  ctx->fd = -1;
#endif

  return ctx;
}
Пример #23
0
static int loadnpz_l(lua_State *L){
	try{
		const char *filename = lua_tostring(L, 1);

		std::string fpath = std::string(filename);

		cnpy::npz_t npzData = cnpy::npz_load(filename);

		// create a new table
		lua_newtable(L);  
		int tbl = lua_gettop(L);

		for (cnpy::npz_t::iterator i=npzData.begin(); i!=npzData.end(); ++i){
			std::string name = i->first;
			cnpy::NpyArray arr = i->second;

			lua_pushstring(L, name.c_str());
			load_array_to_lua(L, arr);
			lua_rawset(L, tbl);
		}


	}catch (std::exception& e){
		THError(e.what());
	}

	return 1;
}
Пример #24
0
static void nn_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p,
                                                      real *ind_p,
                                                      long nslices,
                                                      long iwidth, long iheight,
                                                      long owidth, long oheight)
{
  long k;
#pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {    
    real *output_p_k = output_p + k*owidth*oheight;
    real *input_p_k = input_p + k*iwidth*iheight;
    real *ind_p_k = ind_p + k*iwidth*iheight;

    long i, j, maxp;
    for(i = 0; i < iheight; i++)
    {
      for(j = 0; j < iwidth; j++)
      {
        maxp = ind_p_k[i*iwidth + j] - 1;  /* retrieve position of max */
        if(maxp<0 || maxp>=owidth*oheight){
            THError("invalid max index %d, owidth= %d, oheight= %d",maxp,owidth,oheight);
        }
        output_p_k[maxp] = input_p_k[i*iwidth + j]; /* update output */
      }
    }
  }
}
Пример #25
0
void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess,
                                  int enable)
{
  /* This will perform device bounds checking for us */
  int prevEnabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);

  if (enable != prevEnabled) {
    /* If we're attempting to enable p2p access but p2p access isn't */
    /* supported, throw an error */
    if (enable) {
      int access = 0;
      THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));

      if (!access) {
        THError("p2p access not supported for %d accessing %d",
                dev, devToAccess);
      }
    }

    state->p2pAccessEnabled[dev][devToAccess] = enable;

    int prevDev = 0;
    THCudaCheck(cudaGetDevice(&prevDev));
    THCudaCheck(cudaSetDevice(dev));

    /* This should be in sync with the current access state */
    if (enable) {
      THCudaCheck(cudaDeviceEnablePeerAccess(devToAccess, 0));
    } else {
      THCudaCheck(cudaDeviceDisablePeerAccess(devToAccess));
    }

    THCudaCheck(cudaSetDevice(prevDev));
  }
}
Пример #26
0
int THProcessYUYV(THNETWORK *network, unsigned char *image, int width, int height, float **results, int *outwidth, int *outheight)
{
	THFloatTensor *out;
	THFloatStorage *st;

#ifdef CUDNN
	if(network->net->cuda)
		THError("This function is not supported with CUDNN");
#endif
	st = THFloatStorage_new(width * height * 3);
	yuyv2fRGB(image, st->data, width*height, width, width, height, network->mean, network->std);
	THFloatTensor *t = THFloatTensor_new();
	t->storage = st;
	t->nDimension = 3;
	t->size[0] = 3;
	t->size[1] = height;
	t->size[2] = width;
	t->stride[0] = width * height;
	t->stride[1] = width;
	t->stride[2] = 1;
	out = forward(network->net, t);
	THFloatTensor_free(t);
	*results = out->storage->data;
	if(out->nDimension >= 3)
	{
		*outwidth = out->size[out->nDimension - 1];
		*outheight = out->size[out->nDimension - 2];
	} else *outwidth = *outheight = 1;
	return THFloatTensor_nElement(out);
}
Пример #27
0
void* THAlloc(long size)
{
  void *ptr;

  if(size < 0)
    THError("$ Torch: invalid memory size -- maybe an overflow?");

  if(size == 0)
    return NULL;

  ptr = malloc(size);
  if(!ptr)
    THError("$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", size/1073741824);

  return ptr;
}
Пример #28
0
static void THGPUTensor_rawSet(THGPUTensor *self, THGPUStorage *storage, long storageOffset, int nDimension, long *size, long *stride)
{
  /* storage */
  if (self->storage != storage)
  {
    if (self->storage)
      THGPUStorage_free(self->storage);

    if (storage)
    {
      self->storage = storage;
      THGPUStorage_retain(self->storage);
    }
    else
      self->storage = NULL;
  }

  /* storageOffset */
  if (storageOffset < 0)
    THError("Tensor: invalid storage offset");
  self->storageOffset = storageOffset;

  /* size and stride */
  THGPUTensor_rawResize(self, nDimension, size, stride);
}
Пример #29
0
static inline void THNN_(Col2Im_shapeCheck)(
                         THNNState *state,
                         THTensor *input,
                         THTensor *gradOutput,
                         int64_t outputHeight, int64_t outputWidth,
                         int64_t kH, int64_t kW, int64_t dilationH, int64_t dilationW,
                         int64_t padH, int64_t padW, int64_t dH, int64_t dW) {

  THArgCheck(kW > 0 && kH > 0, 6,
             "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
  THArgCheck(dW > 0 && dH > 0, 12,
             "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
  THArgCheck(dilationW > 0 && dilationH > 0, 8,
             "dilation should be greater than zero, but got dilationH: %d dilationW: %d", dilationH, dilationW);

  int64_t ndim = THTensor_(nDimensionLegacyNoScalars)(input);
  THNN_ARGCHECK(!input->is_empty() && (ndim == 2 || ndim == 3), 2, input,
                "Expected non-empty 2D or 3D input tensor, but got input of shape %s");

  int64_t batch_dim = (ndim == 3) ? 0 : -1;
  int64_t nInputPlane  = input->size(batch_dim + 1);

  if (nInputPlane % (kW * kH) != 0) {
    THError("Expected size of input's dimension 1 to be divisible by the "
            "product of kernel_size, but got input.size(1)=%lld and "
            "kernel_size=(%d, %d).", (long long) nInputPlane, kH, kW);
  }

  int64_t inputLength  = input->size(batch_dim + 2);
  int64_t nBlocksH = div_rtn<int64_t>(outputHeight + 2 * padH - dilationH * (kH - 1) - 1, dH) + 1;
  int64_t nBlocksW = div_rtn<int64_t>(outputWidth + 2 * padW - dilationW * (kW - 1) - 1, dW) + 1;

  if (inputLength != (nBlocksH * nBlocksW)) {
    THError("Given output_size=(%d, %d), kernel_size=(%d, %d), "
            "dilation=(%d, %d), padding=(%d, %d), stride=(%d, %d), expected "
            "size of input's dimension 2 to match the calculated number of "
            "sliding blocks %lld * %lld = %lld, but got input.size(2)=%lld.",
            outputHeight, outputWidth, kH, kW, dilationH, dilationW, padH, padW, dH, dW,
            (long long) nBlocksH, (long long) nBlocksW,
            (long long) (nBlocksH * nBlocksW), (long long) inputLength);
  }

  if (outputWidth < 1 || outputHeight < 1) {
    THError("Expected output spatial size to be positive, but got: output_size=(%d, %d).",
            outputHeight, outputWidth);
  }
}
Пример #30
0
static void jhu_THLogSum1d_init(lua_State *L) {
    int ret = luaT_pushmetatable(L, "torch.DoubleTensor");
    if(ret == 0) {
        THError("problem pushing metatable");
    }
    luaT_registeratname(L, jhu_THLogSum1d__, "jhu");
    lua_pop(L, 1);
}