Example #1
/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
static void nn_(unfolded_acc)(THTensor *finput, THTensor *input,
                               int kW, int kH,
                               int dW, int dH,
                               int padW, int padH,
                               int nInputPlane,
                               int inputWidth, int inputHeight,
                               int outputWidth, int outputHeight)
{
  int nip;
  real *input_data = THTensor_(data)(input);
  real *finput_data = THTensor_(data)(finput);
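  /* Accumulate each unfolded column of finput back into its source position in input
     (the col2im-style counterpart of unfolded_copy). */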

#pragma omp parallel for private(nip)
  for(nip = 0; nip < nInputPlane; nip++)
  {
    int kw, kh, y, x, ix, iy;
    for(kh = 0; kh < kH; kh++)
    {
      for(kw = 0; kw < kW; kw++)
      {
        real *src = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth);
        real *dst = input_data + nip*(inputHeight*inputWidth);
        if (padW > 0 || padH > 0) {
          int lpad,rpad;
          for(y = 0; y < outputHeight; y++) {
            iy = y*dH - padH + kh;
            if (iy < 0 || iy >= inputHeight) {
            } else {
              if (dW==1){
                 ix = 0 - padW + kw;
                 lpad = fmaxf(0,padW-kw);
                 rpad = fmaxf(0,padW-(kW-kw-1));
                 THVector_(add)(dst+iy*inputWidth+ix+lpad, src+y*outputWidth+lpad, 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
              }
              else{
                for (x=0; x<outputWidth; x++){
                   ix = x*dW - padW + kw;
                   if (ix < 0 || ix >= inputWidth){
                   }else
                     THVector_(add)(dst+iy*inputWidth+ix, src+y*outputWidth+x, 1, 1);
                }
              }
            }
          }
        } else {
          for(y = 0; y < outputHeight; y++) {
            iy = y*dH + kh;
            ix = 0 + kw;
            if (dW == 1 )
               THVector_(add)(dst+iy*inputWidth+ix, src+y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */
            else{
              for(x = 0; x < outputWidth; x++)
                THVector_(add)(dst+iy*inputWidth+ix+x*dW, src+y*outputWidth+x, 1, 1);
            }
          }
        }
      }
    }
  }
}
Example #2
void THVector_(normal_fill_DEFAULT)(real *data,
                                    int64_t size,
                                    THGenerator *generator,
                                    const real mean,
                                    const real stddev)
{
  THAssert(size >= 16 && "Size must be >= 16 for normal fill");
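  /* First fill the whole buffer with uniform(0, 1) draws; the second loop below
     converts them to normal samples with the given mean/stddev, 16 values at a time. */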

  for (int64_t i = 0; i < size; ++i) {
#ifdef TH_REAL_IS_FLOAT
    data[i] = THRandom_uniformFloat(generator, 0, 1);
#else
    data[i] = THRandom_uniform(generator, 0, 1);
#endif
  }

  for (int64_t i = 0; i < size - 15; i += 16) {
    THVector_(interleaved_normal_fill_16)(data + i, mean, stddev);
  }

  if (size % 16 != 0) {
    // Recompute the last 16 values.
    data = data + size - 16;
    for (int64_t i = 0; i < 16; ++i) {
#ifdef TH_REAL_IS_FLOAT
      data[i] = THRandom_uniformFloat(generator, 0, 1);
#else
      data[i] = THRandom_uniform(generator, 0, 1);
#endif
    }
    THVector_(interleaved_normal_fill_16)(data, mean, stddev);
  }
}
Example #3
void THVector_(normal_fill)(real *data,
                            const int64_t size,
                            struct THGenerator *generator,
                            const real mean,
                            const real stddev) {
  THVector_(normal_fill_DISPATCHPTR)(data, size, generator, mean, stddev);
}
Example #4
/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
static void nn_(unfolded_acc)(THTensor *finput, THTensor *input,
                               int kW, int kH,
                               int nInputPlane,
                               int inputWidth, int inputHeight,
                               int outputWidth, int outputHeight)
{
  int nip;
  real *input_data = THTensor_(data)(input);
  real *finput_data = THTensor_(data)(finput);

#pragma omp parallel for private(nip)
  for(nip = 0; nip < nInputPlane; nip++)
  {
    int kw, kh, y;
    for(kh = 0; kh < kH; kh++)
    {
      for(kw = 0; kw < kW; kw++)
      {
        real *src = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth);
        real *dst = input_data + nip*(inputHeight*inputWidth) + kh*inputWidth + kw;
        for(y = 0; y < outputHeight; y++)
          THVector_(add)(dst+y*inputWidth, src+y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */
      }
    }
  }
}
Example #5
static void THNN_(VolumetricConvolutionMM_updateOutput_frame)(
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias,
          THTensor *finput,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH,
          int64_t nInputPlane,
          int64_t inputDepth,
          int64_t inputWidth,
          int64_t inputHeight,
          int64_t nOutputPlane,
          int64_t outputDepth,
          int64_t outputWidth,
          int64_t outputHeight)
{
  int64_t i;
  THTensor *output2d;

  THNN_(unfolded_copy_vol)(
    finput, input,
    kT, kW, kH,
    dT, dW, dH,
    pT, pW, pH,
    nInputPlane,
    inputDepth, inputWidth, inputHeight,
    outputDepth, outputWidth, outputHeight
  );

  output2d = THTensor_(newWithStorage2d)(
    output->storage, output->storageOffset, nOutputPlane, -1,
    outputDepth*outputHeight*outputWidth, -1
  );

  if (bias) {
      for (i = 0; i < nOutputPlane; i++)
      {
        THVector_(fill)(
          output->storage->data+output->storageOffset+output->stride[0]*i,
          THTensor_(get1d)(bias, i),
          outputDepth*outputHeight*outputWidth
        );
      }
  } else {
    THTensor_(zero)(output);
  }

  THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);

  THTensor_(free)(output2d);
}
Example #6
static void THNN_(unfolded_acc_row)(
	THTensor *finput,
	THTensor *input,
	int kW,
	int dW,
	int padW,
	int64_t inputFrameSize,
	int64_t nInputFrame,
	int64_t nOutputFrame) {

	int64_t c;
	real *input_data = THTensor_(data)(input);
	real *finput_data = THTensor_(data)(finput);
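	/* Accumulate the unfolded temporal windows in finput back into input;
	   when dW == 1 a whole output frame is added with a single cadd. */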

// #pragma omp parallel for private(c)
	for (c = 0; c < inputFrameSize; c++) {
		int64_t kw, x;
		int64_t ix = 0;

		for (kw = 0; kw < kW; kw++) {
			real *src = finput_data
			            + c * (kW * nOutputFrame)
			            + kw * (nOutputFrame);
			real *dst = input_data + c * (nInputFrame);

			ix = (size_t)(kw);
			if (dW == 1) {
			  real *dst_slice = dst + (size_t)(ix);
			  THVector_(cadd)(dst_slice, dst_slice, src, 1, nOutputFrame);
			} else {
				for (x = 0; x < nOutputFrame; x++) {
				  real *dst_slice = dst + (size_t)(ix + x * dW);
				  THVector_(cadd)(dst_slice, dst_slice,
						  src + (size_t)(x), 1, 1);
				}
			}
		}
	}
}
Example #7
File: THTensorCopy.c Project: MlWoo/torch7
void THTensor_(copy)(THTensor *tensor, THTensor *src)
{
  if (THTensor_(isContiguous)(tensor) && THTensor_(isContiguous)(src) && THTensor_(nElement)(tensor) == THTensor_(nElement)(src)) {
    real *sp = THTensor_(data)(src);
    real *rp = THTensor_(data)(tensor);
    ptrdiff_t sz = THTensor_(nElement)(tensor);
#ifndef TH_REAL_IS_HALF
    THVector_(copy)(rp, sp, sz); 
#else
    memcpy(rp, sp, sz * sizeof(real));
#endif
  } else {
    TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = *src_data;)
  }
}
Example #8
void THNN_(SparseLinear_legacyZeroGradParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput)
{
  int64_t h, i, j;

  int64_t outDim = gradWeight->size[0];
  int64_t inDim = gradWeight->size[1];

  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
  THArgCheck(THNN_(checkLegacyInput)(lastInput), 4,
             "input size must be batchsize x nnz x 2");

  THTensor_(zero)(gradBias);

  int64_t batchSize = THTensor_(size)(lastInput, 0);
  int64_t nnz = THTensor_(size)(lastInput, 1);

#pragma omp parallel for private(h, i, j) schedule(static) if (   \
  batchSize > 1 && batchSize * nnz * outDim > 10000)
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      if (THNN_(get3d)(lastInput, h, i, 1) == 0 ) {
        continue;
      }

      int64_t offset = (int64_t)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        real* pGradWeight = COL_PTR2(gradWeight, offset);
        if (gradWeight->stride[0] == 1) {
          THVector_(fill)(pGradWeight, 0, outDim);
        } else {
          int64_t stride = gradWeight->stride[0];
          for (j = 0; j < outDim; ++j) {
            pGradWeight[j * stride] = 0;
          }
        }
      } else {
        THError(
          "index out of bound. zeroGradParameters: %d not between 1 and %d",
          offset + 1,
          inDim);
      }
    }
  }
}
Example #9
static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias,
          THTensor *finput,
          int kW,
          int kH,
          int dW,
          int dH,
          int padW,
          int padH,
          long nInputPlane,
          long inputWidth,
          long inputHeight,
          long nOutputPlane,
          long outputWidth,
          long outputHeight)
{
  long i;
  THTensor *output2d;

  THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
		       nInputPlane, inputWidth, inputHeight,
		       outputWidth, outputHeight);

  output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset,
                                         nOutputPlane, -1,
                                         outputHeight*outputWidth, -1);
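  /* Initialize the output: broadcast each plane's bias across the plane (or zero
     everything), then the addmm below accumulates weight * finput on top of it. */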
  if (bias) {
    for(i = 0; i < nOutputPlane; i++)
        THVector_(fill)
	  (output->storage->data + output->storageOffset + output->stride[0] * i,
	   THTensor_(get1d)(bias, i), outputHeight*outputWidth);
  } else {
    THTensor_(zero)(output);
  }

  THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);

  THTensor_(free)(output2d);
}
Example #10
static void THNN_(TemporalRowConvolution_updateOutput_frame)(
	THTensor *input,
	THTensor *output,
	THTensor *weight,
	THTensor *bias,
	THTensor *finput,
	int kW,
	int dW,
	int padW,
	int64_t inputFrameSize,
	int64_t nInputFrame,
	int64_t nOutputFrame) {

	int64_t i;

	THTensor *output3d = THTensor_(newWithStorage3d)(
		output->storage, output->storageOffset,
		inputFrameSize, -1,
		1, -1,
		nOutputFrame, -1);

	THNN_(unfolded_copy_row)(finput, input, kW, dW, padW,
	                         inputFrameSize, nInputFrame, nOutputFrame);

	THTensor_(zero)(output);

	if (bias != NULL) {
		for (i = 0; i < inputFrameSize; i++)
			THVector_(fill)
			        (THStorage_(data)(output->storage) + output->storageOffset
			        + output->stride[0] * i,
			        THTensor_(get1d)(bias, i), nOutputFrame);
	}

	THTensor_(baddbmm)(output3d, 1, output3d, 1, weight, finput);

	THTensor_(free)(output3d);
}
Example #11
void THVector_(divs)(real *y, const real *x, const real c, const ptrdiff_t n) {
  THVector_(divs_DISPATCHPTR)(y, x, c, n);
}
Example #12
void THVector_(cdiv)(real *z, const real *x, const real *y, const ptrdiff_t n) {
  THVector_(cdiv_DISPATCHPTR)(z, x, y, n);
}
Example #13
// Dispatch stubs that just call the pointers
TH_API void THVector_(adds)(real *r_, const real *t, const real value, const ptrdiff_t n) {
  THVector_(adds_DISPATCHPTR)(r_, t, value, n);
}
Example #14
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/THVectorDispatch.cpp"
#else

/* For now there are only SIMD implementations for FLOAT and DOUBLE.
 * Hopefully in the future this can be made totally generic (e.g., there are SIMD implementations
 * for a lot of functions). */
/* Each function with multiple implementations has:
 * 1. A DISPATCHPTR which will be initialized to point to the best available implementation for the host
 * 2. A DISPATCHTABLE which holds pointers to each implementation of a function, and a value indicating
 *    which SIMD extension a given implementation uses
 * 3. A dispatch stub, which is what is actually called by clients, that simply wraps the dispatch pointer.
 */

static void (*THVector_(fill_DISPATCHPTR))(real *, const real, const ptrdiff_t) = &THVector_(fill_DEFAULT);
static FunctionDescription THVector_(fill_DISPATCHTABLE)[] = {
  #if defined(__NEON__)
    #if defined(TH_REAL_IS_FLOAT)
      FUNCTION_IMPL(THVector_(fill_NEON), SIMDExtension_NEON),
    #endif
  #endif

  #if defined(__PPC64__)
    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
      FUNCTION_IMPL(THVector_(fill_VSX), SIMDExtension_VSX),
    #endif
  #endif

  #if defined(USE_AVX)
    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
      FUNCTION_IMPL(THVector_(fill_AVX), SIMDExtension_AVX),
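The comment above names the three pieces each multi-implementation function gets: a dispatch pointer, a dispatch table, and a stub. As a rough, self-contained sketch of how those pieces fit together outside the TH macro machinery, the plain C below uses only hypothetical names (my_fill_DEFAULT, my_fill_AVX, my_fill_init_dispatch, SIMD_*); none of them are part of the TH API, and capability detection is faked with a hard-coded argument.

#include <stddef.h>
#include <stdio.h>

/* Two candidate implementations of the same operation. */
static void my_fill_DEFAULT(float *x, const float c, const ptrdiff_t n) {
  for (ptrdiff_t i = 0; i < n; i++) x[i] = c;
}
static void my_fill_AVX(float *x, const float c, const ptrdiff_t n) {
  /* Stand-in for a vectorized version; a real one would use AVX intrinsics. */
  for (ptrdiff_t i = 0; i < n; i++) x[i] = c;
}

/* 1. Dispatch pointer, initialized to the portable default. */
static void (*my_fill_DISPATCHPTR)(float *, const float, const ptrdiff_t) = &my_fill_DEFAULT;

/* 2. Dispatch table: each implementation is paired with the SIMD level it requires. */
enum { SIMD_DEFAULT = 0, SIMD_AVX = 1 };
typedef struct {
  void (*fn)(float *, const float, const ptrdiff_t);
  int required_simd;
} FillImpl;
static const FillImpl my_fill_DISPATCHTABLE[] = {
  { my_fill_AVX,     SIMD_AVX },
  { my_fill_DEFAULT, SIMD_DEFAULT },
};

/* Pick the first (most specialized) table entry the host can run. */
static void my_fill_init_dispatch(int host_simd) {
  size_t count = sizeof(my_fill_DISPATCHTABLE) / sizeof(my_fill_DISPATCHTABLE[0]);
  for (size_t i = 0; i < count; i++) {
    if (my_fill_DISPATCHTABLE[i].required_simd <= host_simd) {
      my_fill_DISPATCHPTR = my_fill_DISPATCHTABLE[i].fn;
      return;
    }
  }
}

/* 3. Dispatch stub: the only thing callers ever see. */
void my_fill(float *x, const float c, const ptrdiff_t n) {
  my_fill_DISPATCHPTR(x, c, n);
}

int main(void) {
  float buf[8];
  my_fill_init_dispatch(SIMD_DEFAULT); /* pretend the host has no AVX */
  my_fill(buf, 3.0f, 8);
  printf("buf[0] = %f\n", (double)buf[0]);
  return 0;
}

Once the pointer has been set, every call through a stub (as in Examples #3, #11, #12, #13, #19, #23, #25, #27, and #29) costs only a single indirect call.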
Example #15
static int nnconv1d_(HorizontalConvolution_updateGradInput)(lua_State *L)
{
   THTensor *input = luaT_checkudata(L, 2, torch_Tensor);
   THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor);

   int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane");
   int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane");
   int kL = luaT_getfieldcheckint(L, 1, "kL");

   THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
   THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor);

   THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 1,
              "Number of output features is not equal to nOutputPlane" );

   // change to batch mode
   int batch = 1;
   if (input->nDimension == 3) {
      batch = 0;
      THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
      THTensor_(resize4d)(gradOutput, 1, nOutputPlane, gradOutput->size[1], gradOutput->size[2]);
   }

   long batchSize    = input->size[0];
   long inputHeight  = input->size[2];
   long inputWidth   = input->size[3];
   long outputHeight = inputHeight;
   long outputWidth  = inputWidth - kL + 1;

   THTensor_(resizeAs)(gradInput, input);
   THTensor_(zero)(gradInput);

   int elt;
#pragma omp parallel for private(elt)
   for (elt = 0; elt < batchSize; elt++) {

      // select each batch
      THTensor *gradInput_t  = THTensor_(newSelect)(gradInput, 0, elt);
      THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt);

      // convolve horizontally
      int i, j, k;
      for (i = 0; i < nOutputPlane; i++) {
         for (j = 0; j < outputHeight; j++) {
            for (k = 0; k < kL; k++) {
               THVector_(add)(gradInput_t->storage->data + gradInput_t->storageOffset +
                              gradInput_t->stride[0]*i + gradInput_t->stride[1]*j + k,
                              gradOutput_t->storage->data + gradOutput_t->storageOffset +
                              gradOutput_t->stride[0]*i + gradOutput_t->stride[1]*j,
                              *(THTensor_(data)(weight)+i*kL+k), outputWidth);   // needs to change
            }
         }
      }

      // release temp tensors
      THTensor_(free)(gradInput_t);
      THTensor_(free)(gradOutput_t);
   }

   // revert to single batch
   if (batch == 0) {
      THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
      THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
      THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
   }

   return 1;
}
Example #16
static int nnconv1d_(HorizontalConvolution_updateOutput)(lua_State *L)
{
   THTensor *input = luaT_checkudata(L, 2, torch_Tensor);

   int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane");
   int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane");
   int kL = luaT_getfieldcheckint(L, 1, "kL");

   THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
   THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor);
   THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor);

   luaL_argcheck(L, input->nDimension == 3 ||
                    input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");

   // change to batch mode
   int batch = 1;
   if (input->nDimension == 3) {
      batch = 0;
      THTensor_(resize4d)(input, 1, nInputPlane, input->size[1], input->size[2]);
   }

   long batchSize    = input->size[0];
   long inputHeight  = input->size[2];
   long inputWidth   = input->size[3];
   long outputHeight = inputHeight;
   long outputWidth  = inputWidth - kL + 1;

   THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);

   int elt;
#pragma omp parallel for private(elt)
   for (elt = 0; elt < batchSize; elt++) {

      // select each batch
      THTensor *input_t  = THTensor_(newSelect)(input, 0, elt);
      THTensor *output_t = THTensor_(newSelect)(output, 0, elt);

      // fill biases
      int i, j, k;
      for (i = 0; i < nOutputPlane; i++) {
         THVector_(fill)(output_t->storage->data+output_t->storageOffset+output_t->stride[0]*i,
                         THTensor_(get1d)(bias, i), outputHeight*outputWidth);
      }

      // convolve horizontally
      for (i = 0; i < nInputPlane; i++) {
         for (j = 0; j < inputHeight; j++) {
            for (k = 0; k < kL; k++) {
               THVector_(add)(output_t->storage->data + output_t->storageOffset +
                              output_t->stride[0]*i + output_t->stride[1]*j,
                              input_t->storage->data + input_t->storageOffset +
                              input_t->stride[0]*i + input_t->stride[1]*j + k,
                              *(THTensor_(data)(weight)+i*kL+k), outputWidth);
            }
         }
      }

      // release temp tensors
      THTensor_(free)(input_t);
      THTensor_(free)(output_t);
   }

   // revert to single batch
   if (batch == 0) {
      THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
      THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
   }

   return 1;
}
Example #17
void THNN_(SpatialConvolutionMM_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias,
          THTensor *finput,
          THTensor *fgradInput,
          int kW,
          int kH,
          int dW,
          int dH,
          int padW,
          int padH)
{

  int dimf = 0;
  int dimw = 2;
  int dimh = 1;

  long nInputPlane;
  long inputWidth;
  long inputHeight;
  long nOutputPlane;
  long outputWidth;
  long outputHeight;

  THArgCheck( input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");
  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");

  if (input->nDimension == 4) {
    dimf++;
    dimw++;
    dimh++;
  }

  nInputPlane = input->size[dimf];
  inputWidth   = input->size[dimw];
  inputHeight  = input->size[dimh];
  nOutputPlane = weight->size[0];
  outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
  outputHeight = (inputHeight + 2*padH - kH) / dH + 1;

  if (outputWidth < 1 || outputHeight < 1)
    THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
        nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);

  if (nInputPlane*kW*kH != weight->size[1])
    THError("Wrong number of input channels! Input has %d channels, expected %d",nInputPlane,weight->size[1]/(kW*kH));



  if(input->nDimension == 3)
  {
    THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
    THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);

    THNN_(SpatialConvolutionMM_updateOutput_frame)(input, output, weight, bias, finput,
                                                 kW, kH, dW, dH, padW, padH,
                                                 nInputPlane, inputWidth, inputHeight,
                                                 nOutputPlane, outputWidth, outputHeight);
  }
  else
  {
    long T = input->size[0];
    long t;

    THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth);
    THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth);

    THNN_(batch_unfolded_copy)(finput, input, T, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);


    long i;
    if (bias) {
      #pragma omp parallel for collapse(2) private(i,t)
      for(t = 0; t < T; t++)
    	for(i = 0; i < nOutputPlane; i++)
    	  THVector_(fill)(output->storage->data+output->storageOffset+output->stride[0]*t+output->stride[1]*i, THTensor_(get1d)(bias, i), outputHeight*outputWidth);
    } else {
      THTensor_(zero)(output);
    }

#pragma omp parallel for private(t)
    for(t = 0; t < T; t++)
    {
      THTensor *input_t = THTensor_(newSelect)(input, 0, t);
      THTensor *output_t = THTensor_(newSelect)(output, 0, t);
      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);

    /* THNN_(SpatialConvolutionMM_updateOutput_frame)(input_t, output_t, weight, bias, finput_t, */
    /*                                              kW, kH, dW, dH, padW, padH, */
    /*                                              nInputPlane, inputWidth, inputHeight, */
    /*                                              nOutputPlane, outputWidth, outputHeight); */


      THTensor *output2d;
      
      /* THNN_(unfolded_copy)(finput_t, input_t, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight); */
      
      output2d = THTensor_(newWithStorage2d)(output_t->storage, output_t->storageOffset,
      					     nOutputPlane, -1,
      					     outputHeight*outputWidth, -1);

      /* long i; */
      /* if (bias) { */
      /* 	for(i = 0; i < nOutputPlane; i++) */
      /* 	  THVector_(fill)(output_t->storage->data+output_t->storageOffset+output_t->stride[0]*i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); */
      /* } else { */
      /* 	THTensor_(zero)(output_t); */
      /* } */
      
      
      THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput_t);
            
      THTensor_(free)(output2d);
  
      THTensor_(free)(input_t);
      THTensor_(free)(output_t);
      THTensor_(free)(finput_t);
    }
  }

}
Example #18
/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
static void THNN_(unfolded_acc_vol)(
          THTensor *finput,
          THTensor *input,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int pT,
          int pW,
          int pH,
          long nInputPlane,
          long inputDepth,
          long inputWidth,
          long inputHeight,
          long outputDepth,
          long outputWidth,
          long outputHeight)
{
  long nip;
  real *input_data = THTensor_(data)(input);
  real *finput_data = THTensor_(data)(finput);

//#pragma omp parallel for private(nip)
  for (nip = 0; nip < nInputPlane; nip++)
  {
    long kt, kw, kh, t, y, x, it, ix, iy;
    for (kt = 0; kt < kT; kt++)
    {
      for (kh = 0; kh < kH; kh++)
      {
        for (kw = 0; kw < kW; kw++)
        {
          real *src = finput_data
            + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
            + kt  * (kH*kW*outputDepth*outputHeight*outputWidth)
            + kh  * (kW*outputDepth*outputHeight*outputWidth)
            + kw  * (outputDepth*outputHeight*outputWidth);

          real *dst = input_data + nip*(inputDepth*inputHeight*inputWidth);
          if (pT > 0 || pH > 0 || pW > 0)
          {
            for (t = 0; t < outputDepth; t++)
            {
              it = t*dT - pT + kt;
              for (y = 0; y < outputHeight; y++)
              {
                iy = y*dH - pH + kh;
                for (x = 0; x < outputWidth; x++)
                {
                  ix = x*dW - pW + kw;
                  if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth)
                  {
                  }
                  else
                  {
                    real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix;
                    THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
                  }
                }
              }
            }
          }
          else
          {
            for (t = 0; t < outputDepth; t++)
            {
              it = t*dT + kt;
              for (y = 0; y < outputHeight; y++)
              {
                iy = y*dH + kh;
                for(x = 0; x < outputWidth; x++)
                {
                  ix = x*dW + kw;
                  real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix;
                  THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
                }
              }
            }
          }
        }
      }
    }
  }
}
Example #19
void THVector_(fill)(real *x, const real c, const ptrdiff_t n) {
  THVector_(fill_DISPATCHPTR)(x, c, n);
}
Example #20
void THNN_(IndexLinear_updateOutput)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *output,
          THTensor *weight,
          THTensor *bias,
          THTensor *normalizedValues,
          int  train)
{
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t keysSize = THLongTensor_size(keys, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  int64_t* sizesData = THLongTensor_data(sizes);
  int64_t* cumSumSizesData = THLongTensor_data(cumSumSizes);

  /* Define/resize the normalized values tensor if maxNormalize is  > 0 */
  scalar_t* normalizedValuesData = NULL;
  if (maxNormalize)
  {
    THTensor_(resize1d)(normalizedValues, keysSize);
    normalizedValuesData = normalizedValues->data<scalar_t>();
  }

  /* Resize the output */
  THTensor_(resize2d)(output, batchSize, outDim);

  /* Access the storage data/strides */
  scalar_t* outputData = output->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* weightData = weight->data<scalar_t>();
  int64_t weightStride0 = weight->stride(0);
  scalar_t* biasData = bias->data<scalar_t>();
  int64_t* keysData = THLongTensor_data(keys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(output), 6, "output vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
  THArgCheck(THTensor_(isContiguous)(normalizedValues), 9, "normalizedValues vector must be contiguous");

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations. */
  if (outDim == 1)
  {
    THVector_(fill)(outputData, *biasData, batchSize);
    if (maxNormalize)
    {
      /* Parallelize on the batch itself */
      auto loop = [&](int64_t start, int64_t end) {
        for (auto j = start; j < end; j++)
        {
          scalar_t* loutputData = outputData + j;
          scalar_t val = 0;
          scalar_t absVal = 0;
          int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1];

          for (auto i = 0; i < sizesData[j]; i++)
          {
            int64_t woffset = weightStride0*(keysData[offset] + keysOffset);
            absVal = fabs(valuesData[offset]);
            if (train)
            {
              if (absVal > weightData[woffset])
              {
                weightData[woffset] = absVal;
                weightData[woffset+1] = 1/absVal;
              }

              /*
               * The following can be used to scale the size of the updates
               * depending on some rule, e.g. the frequency of a feature, ...
               * This is used at update time.
               * TODO: implement a smarter update scale.
               */
              weightData[woffset+2] = 1;
            }
            normalizedValuesData[offset] = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(valuesData[offset]):valuesData[offset]*weightData[woffset+1]) + weightData[woffset+3];
            val += normalizedValuesData[offset] * weightData[woffset+maxNormalize];
            offset++;
          }
          *loutputData += val;
        }
      };
      if (keysSize * outDim > THNN_SPARSE_OMP_THRESHOLD) {
        at::parallel_for(0, batchSize, 1, loop);
      } else {
        loop(0, batchSize);
      }
    }
    else
    {
      /* Parallelize on the batch itself */
      auto loop = [&](int64_t start, int64_t end) {
        for (auto j = start; j < end; j++)
        {
          int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1];
          scalar_t* loutputData = outputData + j;
          scalar_t val = 0;

          for (auto i = 0; i < sizesData[j]; i++)
          {
            val += weightData[weightStride0*(keysData[offset] + keysOffset)] * valuesData[offset];
            offset++;
          }
          *loutputData += val;
        }
      };
      if (keysSize * outDim > THNN_SPARSE_OMP_THRESHOLD) {
        at::parallel_for(0, batchSize, 1, loop);
      } else {
        loop(0, batchSize);
      }
    }
  }
  else {
    auto loop = [&](int64_t start, int64_t end) {
      for (auto j = start; j < end; j++)
      {
        int64_t offset = j == 0 ? 0 : cumSumSizesData[j -  1];
        scalar_t val;
        scalar_t* loutputData = outputData + j*outDim;
        scalar_t* lweightData = weightData;
        memcpy(loutputData, biasData, outDim*sizeof(scalar_t));
        for (auto i = 0; i < sizesData[j]; i++)
        {
          int64_t woffset = weightStride0*(keysData[offset] + keysOffset);
          if (maxNormalize)
          {
            val = valuesData[offset];
            scalar_t absVal = fabs(val);
            if (train)
            {
              if (absVal > weightData[woffset])
              {
                weightData[woffset] = absVal;
                weightData[woffset+1] = 1/absVal;
              }

              /*
               * The following can be used to scale the size of the updates
               * depending on some rule, e.g. the frequency of a feature, ...
               * The commented section thereafter is just an example of what can be done:
               *
               *```
               * weightData[woffset+2] = weightData[woffset+2]==0?1:(weightData[woffset+2] / (weightData[woffset+2] + 1));
               * scalar_t alpha = 1;
               * scalar_t beta = 0.01;
               * scalar_t gamma = 1 - 0.000001;
               * scalar_t l = weightData[woffset+2]==0?1/gamma:(weightData[woffset+2] - beta) / (alpha - beta);
               * l = gamma*l;
               * weightData[woffset+2] = (alpha-beta)*l + beta;
               * ```
               *
               * TODO: implement a smarter update scale.
               */
              weightData[woffset+2] = 1;
            }

            /* Normalize + Clamp */
            val = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(val):val*weightData[woffset+1]) + weightData[woffset+3];
            normalizedValuesData[offset] = val;

            lweightData = weightData + woffset + maxNormalize;
          }
          else
          {
            val = valuesData[offset];
            lweightData = weightData + woffset;
          }
          if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
          {
            THBlas_(axpy)(outDim, val, lweightData, 1, loutputData, 1);
          }
          else
          {
            for (auto k = 0; k < outDim; k++)
            {
              loutputData[k] += lweightData[k] * val;
            }
          }
          offset++;
        }
      }
    };
    if (keysSize * outDim > THNN_SPARSE_OMP_THRESHOLD) {
      at::parallel_for(0, batchSize, 1, loop);
    } else {
      loop(0, batchSize);
    }

  }
  return;
}
Example #21
void THNN_(IndexLinear_accUpdateGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  scalar_t weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");

  /* Access the storage data/strides */
  scalar_t* gradOutputData = gradOutput->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* weightData = weight->data<scalar_t>();
  scalar_t* biasData = bias->data<scalar_t>();
  int64_t weightStride0 = weight->stride(0);
  int64_t* keysData = THLongTensor_data(keys);
  int64_t* sizesData = THLongTensor_data(sizes);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");

  int i,j,k;

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    if (maxNormalize)
    {
        int64_t offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          scalar_t* lgradOutputData = gradOutputData + j;
          *biasData -= *lgradOutputData * scale;
          scalar_t val = *lgradOutputData * scale;
          for (i = 0; i < sizesData[j]; i++)
          {
            int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
            weightData[idx-1] -= weightData[idx]*val*weightData[idx-2];
            weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2];
            offset++;
          }
        }

        offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          for (i = 0; i < sizesData[j]; i++)
          {
            int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
            weightData[idx-2] = 0;
            offset++;
          }
        }
    }
    else
    {
      if (weightDecay)
      {
        int64_t offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          scalar_t* lgradOutputData = gradOutputData + j;
          *biasData -= *lgradOutputData * scale;
          scalar_t val = *lgradOutputData * scale;
          for (i = 0; i < sizesData[j]; i++)
          {
            int64_t idx = weightStride0*(keysData[offset] + keysOffset);
            weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay;
            offset++;
          }
        }
      }
      else
      {
        int64_t offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          scalar_t val = gradOutputData[j] * scale;
          for (i = 0; i < sizesData[j]; i++)
          {
            weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset];
            offset++;
          }
          *biasData -= val;
        }
      }
    }
  }
  else {
    int64_t offset = 0;
    for (j = 0; j < batchSize; j++)
    {
      scalar_t* lgradOutputData = gradOutputData + j*outDim;
      scalar_t* lweightData = weightData;
      THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim);
      for (i = 0; i < sizesData[j]; i++)
      {
        scalar_t val = valuesData[offset] * scale;
        scalar_t wd = weightDecay;

        // Max normalize case
        if (maxNormalize)
        {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          val *= lweightData[0];
          wd *= lweightData[0];
          for (k=0; k < outDim; k++)
          {
            lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0];
          }
          lweightData += 2;
        }
        else
        {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset);
        }

        /* We do sparse weight decay.
         * We think it makes more sense. */
        if (weightDecay)
        {
          if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
          {
            THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1);
          }
          else
          {
            for (k=0; k < outDim; k++)
            {
              lweightData[k] -= wd * lweightData[k];
            }
          }
        }

        if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
        {
          THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1);
        }
        else
        {
          for (k=0; k < outDim; k++)
          {
            lweightData[k] -= val * lgradOutputData[k];
          }
        }
        offset++;
      }
    }

    /* Max Normalize case:
     * Reset the smart update scaling if
     * one does it batch-wise.
     * TODO: Decide what to do with that piece of code.
     * NB: If the code below is uncommented, the commented-out code
     * in IndexLinear:zeroGradParameters() should be uncommented as well. */

    /*
    if (maxNormalize)
    {
      offset = 0;
      for (j = 0; j < batchSize; j++)
      {
        scalar_t* lweightData = weightData;
        for (i = 0; i < sizesData[j]; i++)
        {
          scalar_t val = valuesData[offset] * scale;
          scalar_t wd = weightDecay;

          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          lweightData[0] = 0;
          offset++;
        }
      }
    }
    */
  }
  return;
}
Example #22
void THTensor_(copy)(THTensor *tensor, THTensor *src)
{
  if (tensor == src) return;
  ptrdiff_t tensorSize = THTensor_(nElement)(tensor);
  ptrdiff_t srcSize = THTensor_(nElement)(src);
  int tensorContig = THTensor_(isContiguous)(tensor);
  int srcContig = THTensor_(isContiguous)(src);

  int serial_path = 0;
#ifdef _OPENMP
  int inOMP = omp_in_parallel();
#endif
  if (tensorSize == srcSize) {
    if ( tensorContig && srcContig) {
      real *sp = THTensor_(data)(src);
      real *rp = THTensor_(data)(tensor);
#ifndef TH_REAL_IS_HALF
#ifdef _OPENMP
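      /* Split the contiguous copy across the OpenMP team: each thread copies its own
         equal-sized chunk, with the last thread also taking the remainder. */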
      #pragma omp parallel if ( (tensorSize > TH_OMP_OVERHEAD_THRESHOLD_COPY) && (!inOMP) )
      {
        size_t num_threads = omp_get_num_threads();
        size_t tid = omp_get_thread_num();
        ptrdiff_t offset = tid * (tensorSize / num_threads);
        ptrdiff_t end = (tid == num_threads - 1) ? tensorSize : offset + tensorSize / num_threads;
        ptrdiff_t len = end - offset;
        real *tensorData = rp + offset;
        real *srcData = sp + offset;
        THVector_(copy)(tensorData, srcData, len);
      }
#else
        THVector_(copy)(rp, sp, srcSize);
#endif

#else

#ifdef _OPENMP
      if ((srcSize > TH_OMP_OVERHEAD_THRESHOLD_COPY) && (!inOMP)) {
        ptrdiff_t i;
        #pragma omp parallel for private (i)
        for(i=0; i<srcSize; i++){
          rp[i] = sp[i];
        }
      } else {
        memcpy(rp, sp, srcSize * sizeof(real));
      }
#else
      memcpy(rp, sp, srcSize * sizeof(real));
#endif

#endif

#ifndef TH_REAL_IS_HALF
    } else if (THTensor_(copyTransposeValid)(tensor, src)) {
      THTensor_(copyTranspose)(tensor, src);
#endif
    } else {
#ifdef _OPENMP
      if (inOMP) {
        serial_path = 1;
      } else {
        TH_TENSOR_APPLY2_OMP(srcSize, tensorContig, srcContig, real, tensor, real, src, *tensor_data = *src_data;, TH_OMP_OVERHEAD_THRESHOLD_COPY)
      }
#else
      serial_path = 1;
#endif
    }
  } else {
Example #23
void THVector_(copy)(real *y, const real *x, const ptrdiff_t n) {
  THVector_(copy_DISPATCHPTR)(y, x, n);
}
Example #24
void THNN_(IndexLinear_updateParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THLongTensor *runningKeys,
          THLongTensor *cumSumSizes,
          int64_t keysOffset,
          accreal weightDecay_,
          accreal learningRate_)
{
  scalar_t weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  scalar_t learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  /* Retrieve all the dimensions of the problem */
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  int64_t keysSize = THLongTensor_size(runningKeys, 0);

  /* Access the storage data/strides */
  scalar_t* gradWeightData = gradWeight->data<scalar_t>();
  scalar_t* weightData = weight->data<scalar_t>();
  int64_t weightStride0 = weight->stride(0);
  scalar_t* gradBiasData = gradBias->data<scalar_t>();
  scalar_t* biasData = bias->data<scalar_t>();
  int64_t* keysData = THLongTensor_data(runningKeys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 3, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 4, "bias vector must be contiguous");
  THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous");

  int j, k;

  /* Update the bias first */
  THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim);

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    if (maxNormalize)
    {
      if (weightDecay)
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          scalar_t lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset];
        }
      }
      else
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          scalar_t lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr;
        }
      }
    }
    else
    {
      if (weightDecay)
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset);
          weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset];
        }
      }
      else
      {
        for (j = 0; j < keysSize; j++)
        {
          weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate;
        }
      }
    }
  }
  else
  {
    for (j = 0; j < keysSize; j++)
    {
      scalar_t lr = learningRate;
      scalar_t wd = weightDecay;
      scalar_t* lweightData;
      int64_t woffset = weightStride0*(keysData[j] + keysOffset);
      scalar_t* lgradWeightData = gradWeightData + j*outDim;
      if (maxNormalize)
      {
        lgradWeightData += j*outDim;
        /* weightData[woffset + 2] */
        lweightData = weightData + woffset + maxNormalize - 2;
        lr = lr*lweightData[0];
        wd = weightDecay*lweightData[0];
        /* weightData[woffset + 3] */
        lweightData++;
        for (k=0; k < outDim; k++)
        {
            lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr;
        }
        lweightData++;
        lgradWeightData += outDim;
      }
      else
      {
        lweightData = weightData + woffset;
      }

      /* We do sparse weight decay.
       * We think it makes more sense. */
      if (weightDecay)
      {
        for (k=0; k < outDim; k++)
        {
            lweightData[k] -= lweightData[k]*wd;
        }
      }

      if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
      {
        THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1);
      }
      else
      {
        for (k=0; k < outDim; k++)
        {
          lweightData[k] -= lgradWeightData[k]*lr;
        }
      }
    }
  }
}
Example #25
void THVector_(cvtFromInt)(real *y, const int *x, const ptrdiff_t n) {
  THVector_(cvtFromInt_DISPATCHPTR)(y, x, n);
}
Example #26
void THNN_(IndexLinear_accGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THTensor *valuesBuffer,
          accreal weightDecay_,
          accreal scale_)
{
  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t keysSize = THLongTensor_size(keys, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int64_t maxNormalize = (woutDim - outDim) > 0 ?1:0;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
  int64_t* sizesData = THLongTensor_data(sizes);

  /* Compute the cumulative sizes */
  THLongTensor* cumSizes = THLongTensor_new();
  THLongTensor_cumsum(cumSizes, sizes, 0);
  int64_t* cumSizesData = THLongTensor_data(cumSizes);

  /* Resize the gradWeight buffer to keep it dense.
   * That speeds up updates A LOT assuming random mem access. */
  THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1));

  /* Access the storage data/strides */
  scalar_t* gradOutputData = gradOutput->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* gradWeightData = gradWeight->data<scalar_t>();
  scalar_t* gradBiasData = gradBias->data<scalar_t>();

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous");

  int i,j,k;

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    for (j = 0; j < batchSize; j++)
    {
      int64_t offset = j==0?0:cumSizesData[j-1];
      scalar_t val = gradOutputData[j] * scale;
      scalar_t* lgradWeightData = gradWeightData + offset;
      scalar_t* lvaluesData = valuesData + offset;
      int64_t end = sizesData[j];

      if (maxNormalize)
      {
        lgradWeightData += offset;
        i = 0;
        for(;i < end; i++)
        {
          lgradWeightData[2*i] = val;
          lgradWeightData[2*i+1] = val * lvaluesData[i];
        }
      }
      else
      {
        i = 0;
        for(;i < end-4; i += 4)
        {
          lgradWeightData[i] = val * lvaluesData[i];
          lgradWeightData[i+1] = val * lvaluesData[i+1];
          lgradWeightData[i+2] = val * lvaluesData[i+2];
          lgradWeightData[i+3] = val * lvaluesData[i+3];
        }

        for(; i < end; i++)
        {
          lgradWeightData[i] = val * lvaluesData[i];
        }
      }
      *gradBiasData += val;
      offset += end;
    }
  }
  else {
    for (j = 0; j < batchSize; j++)
    {
      int64_t offset = j==0?0:cumSizesData[j-1];
      scalar_t* lgradOutputData = gradOutputData + j*outDim;
      scalar_t* lgradWeightData = gradWeightData;
      THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim);
      for (i = 0; i < sizesData[j]; i++)
      {
        scalar_t val = valuesData[offset] * scale;
        lgradWeightData = gradWeightData + offset*outDim;
        if (maxNormalize)
        {
          lgradWeightData += offset*outDim;
          k = 0;
          for(;k < outDim-4; k += 4)
          {
            lgradWeightData[k] = lgradOutputData[k]*scale;
            lgradWeightData[k+1] = lgradOutputData[k+1]*scale;
            lgradWeightData[k+2] = lgradOutputData[k+2]*scale;
            lgradWeightData[k+3] = lgradOutputData[k+3]*scale;
          }

          for(; k < outDim; k++)
          {
            lgradWeightData[k] = lgradOutputData[k]*scale;
          }
          lgradWeightData += outDim;
        }
        k = 0;
        for(;k < outDim-4; k += 4)
        {
          lgradWeightData[k] = val * lgradOutputData[k];
          lgradWeightData[k+1] = val * lgradOutputData[k+1];
          lgradWeightData[k+2] = val * lgradOutputData[k+2];
          lgradWeightData[k+3] = val * lgradOutputData[k+3];
        }

        for(; k < outDim; k++)
        {
          lgradWeightData[k] = val * lgradOutputData[k];
        }
        offset++;
      }
    }
  }
  THLongTensor_free(cumSizes);
  return;
}
Example #27
void THVector_(sigmoid)(real *y, const real *x, const ptrdiff_t n) {
  THVector_(sigmoid_DISPATCHPTR)(y, x, n);
}
Example #28
File: unfold.c Project: AkankshaJ/nn
/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
void THNN_(unfolded_acc)(THTensor *finput, THTensor *input,
                         int kW, int kH, int dW, int dH, int padW, int padH,
                         int nInputPlane, int inputWidth, int inputHeight,
                         int outputWidth, int outputHeight)
{
#ifdef _WIN32
  LONG_PTR nip;
#else
  size_t nip;
#endif

  real *input_data = THTensor_(data)(input);
  real *finput_data = THTensor_(data)(finput);

#pragma omp parallel for private(nip)
  for(nip = 0; nip < nInputPlane; nip++)
  {
    size_t kw, kh, y, x; 
    long long ix = 0, iy = 0;
    for(kh = 0; kh < kH; kh++)
    {
      for(kw = 0; kw < kW; kw++)
      {
        real *src = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth);
        real *dst = input_data + nip*(inputHeight*inputWidth);
        if (padW > 0 || padH > 0) {
          size_t lpad,rpad;
          for(y = 0; y < outputHeight; y++) {
            iy = (long long)(y*dH - padH + kh);
            if (iy < 0 || iy >= inputHeight) {
            } else {
              if (dW==1){
                 ix = (long long)(0 - padW + kw);
                 lpad = fmaxf(0,(int)(padW-kw));
                 rpad = fmaxf(0,(int)(padW-(kW-kw-1)));
                 THVector_(add)(dst+(size_t)(iy*inputWidth+ix+lpad), src+(size_t)(y*outputWidth+lpad), 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
              }
              else{
                for (x=0; x<outputWidth; x++){
                   ix = (long long)(x*dW - padW + kw);
                   if (ix < 0 || ix >= inputWidth){
                   }else
                     THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth+x), 1, 1);
                }
              }
            }
          }
        } else {
          for(y = 0; y < outputHeight; y++) {
            iy = (long long)(y*dH + kh);
            ix = (long long)(0 + kw);
            if (dW == 1 )
               THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth), 1, outputWidth); /* note: THVector_add could handle 1 value better */
            else{
              for(x = 0; x < outputWidth; x++)
                THVector_(add)(dst+(size_t)(iy*inputWidth+ix+x*dW), src+(size_t)(y*outputWidth+x), 1, 1);
            }
          }
        }
      }
    }
  }
}
Example #29
void THVector_(cadd)(real *z, const real *x, const real *y, const real c, const ptrdiff_t n) {
  THVector_(cadd_DISPATCHPTR)(z, x, y, c, n);
}
Example #30
static int nnconv1d_(LateralConvolution_updateOutput)(lua_State *L)
{
   THTensor *input = luaT_checkudata(L, 2, torch_Tensor);

   int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane");
   int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane");

   THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor);
   THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor);
   THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor);

   luaL_argcheck(L, input->nDimension == 3 ||
                    input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");

   // change to batch mode
   int batch = 1;
   if (input->nDimension == 3) {
      batch = 0;
      THTensor_(resize4d)(input, 1, nInputPlane, input->size[1], input->size[2]);
   }

   long batchSize    = input->size[0];
   long inputHeight  = input->size[2];
   long inputWidth   = input->size[3];
   long outputHeight = inputHeight;
   long outputWidth  = inputWidth;

   THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);

   int elt;
#pragma omp parallel for private(elt)
   for (elt = 0; elt < batchSize; elt++) {

      // select each batch in 2D
      THTensor *input_t  = THTensor_(newSelect)(input, 0, elt);
      THTensor *output_t = THTensor_(newSelect)(output, 0, elt);
      THTensor *input2d  = THTensor_(newWithStorage2d)
                              (input_t->storage, input_t->storageOffset,
                               nInputPlane, -1, inputHeight*inputWidth, -1);
      THTensor *output2d = THTensor_(newWithStorage2d)
                              (output_t->storage, output_t->storageOffset,
                               nOutputPlane, -1, outputHeight*outputWidth, -1);

      // fill biases
      int i;
      for (i = 0; i < nOutputPlane; i++)
         THVector_(fill)(output_t->storage->data+output_t->storageOffset+output_t->stride[0]*i,
                         THTensor_(get1d)(bias, i), outputHeight*outputWidth);

      // convolve
      THTensor_(addmm)(output2d, 1, output2d, 1, weight, input2d);

      // release temp tensors
      THTensor_(free)(input2d);
      THTensor_(free)(output2d);
      THTensor_(free)(input_t);
      THTensor_(free)(output_t);
   }

   // revert to single batch
   if (batch == 0) {
      THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
      THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
   }

   return 1;
}