Example 1
void THNN_(TemporalRowConvolution_accGradParameters)(
	THNNState *state,
	THTensor *input,
	THTensor *gradOutput,
	THTensor *gradWeight,
	THTensor *gradBias,
	THTensor *finput,
	THTensor *fgradInput,
	int kW,
	int dW,
	int padW,
	bool featFirst,
	accreal scale_) {

	real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
	int ndim = input->nDimension;

	THTensor *tinput, *tgradOutput;

	if (!featFirst) {
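		/* input is not feature-first: transpose the last two dimensions so the
		 * row (feature) dimension leads, and work on contiguous copies */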
		tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
		tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);

		input = THTensor_(newContiguous)(tinput);
		gradOutput = THTensor_(newContiguous)(tgradOutput);
	} else {
		input = THTensor_(newContiguous)(input);
		gradOutput = THTensor_(newContiguous)(gradOutput);
	}

	THNN_(TemporalRowConvolution_shapeCheck)
	        (state, input, gradOutput, gradWeight, gradBias, kW, dW, padW);

	if (ndim == 2) {
		THNN_(TemporalRowConvolution_accGradParameters_frame)(
			gradOutput, gradWeight, gradBias, finput, scale);
	} else {
		int64_t T = input->size[0];
		int64_t t;

		for (t = 0; t < T; t++) {
			THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
			THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);

			THNN_(TemporalRowConvolution_accGradParameters_frame)(
				gradOutput_t, gradWeight, gradBias, finput_t, scale);

			THTensor_(free)(gradOutput_t);
			THTensor_(free)(finput_t);
		}
	}

	if (!featFirst) {
		THTensor_(free)(tinput);
		THTensor_(free)(tgradOutput);
	}

	THTensor_(free)(input);
	THTensor_(free)(gradOutput);
}
void THNN_(VolumetricConvolutionMM_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *finput,
          THTensor *fgradInput,
          int kT, int kW, int kH,
          int dT, int dW, int dH,
          int pT, int pW, int pH,
          accreal scale_)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);

  THNN_(VolumetricConvolutionMM_shapeCheck)(
        state, input, gradOutput, gradWeight, gradBias,
        kT, kW, kH, dT, dW, dH, pT, pW, pH, 1);
  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);

  if (gradWeight) {
    gradWeight = THNN_(newViewWeight)(gradWeight);
  }

  if (input->nDimension == 4)   // non-batch mode
  {
    THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
  }
  else  // batch mode
  {
    int64_t T = input->size[0];
    int64_t t;

#ifdef _OPENMP
    #pragma omp parallel for if(T > CONV3D_OMP_THRESHOLD) private(t)
#endif
    for (t = 0; t < T; t++)
    {
      THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
      THTensor *finput_t = NULL;
      if (gradWeight) {
        finput_t = THTensor_(newSelect)(finput, 0, t);
      }

      THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale);

      THTensor_(free)(gradOutput_t);
      if (gradWeight) {
        THTensor_(free)(finput_t);
      }
    }
  }

  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
  if (gradWeight) {
    THTensor_(free)(gradWeight);
  }
}
Example 3
void THNN_(Threshold_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          accreal threshold_,
          accreal val_,
          bool inplace)
{
  real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
  real val = TH_CONVERT_ACCREAL_TO_REAL(val_);
  if (inplace)
  {
    TH_TENSOR_APPLY(real, input,
      if (*input_data <= threshold)
        *input_data = val;
    );
    THTensor_(set)(output, input);
  }
  else
  {
    THTensor_(resizeAs)(output, input);
    TH_TENSOR_APPLY2(real, output, real, input,
      *output_data = (*input_data > threshold) ? *input_data : val;
    );
  }
}
void THNN_(VolumetricConvolutionMM_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *finput,
          int kT, int kW, int kH,
          int dT, int dW, int dH,
          int pT, int pW, int pH,
          accreal scale_)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  int nOutputPlane = (int)gradWeight->size[0];

  THNN_(VolumetricConvolutionMM_shapeCheck)(
        state, input, gradOutput, gradWeight, gradBias,
        kT, kW, kH, dT, dW, dH, pT, pW, pH);
  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);

  gradWeight = THNN_(view_weight)(gradWeight);

  if (input->nDimension == 4)   // non-batch mode
  {
    THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
  }
  else  // batch mode
  {
    int64_t T = input->size[0];
    int64_t t;

    for (t = 0; t < T; t++)
    {
      THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);

      THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale);

      THTensor_(free)(gradOutput_t);
      THTensor_(free)(finput_t);
    }
  }

  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
  THTensor_(free)(gradWeight);
}
void THNN_(SpatialFullDilatedConvolution_accGradParameters)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradWeight,
    THTensor *gradBias,
    THTensor *columns,
    THTensor *ones,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    int dilationW, int dilationH,
    int adjW, int adjH,
    accreal scale_)
{
  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  THNN_(SpatialFullDilatedConvolution_shapeCheck)
    (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
     dilationH, dilationW, adjH, adjW, 1);

  int64_t nOutputPlane;
  if (gradWeight) {
    nOutputPlane = THTensor_(size)(gradWeight, 1);
  } else if (gradBias) {
    nOutputPlane = THTensor_sizeLegacyNoScalars(gradBias, 0);
  } else {
    return;
  }

  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);
  if (gradWeight) {
    THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
  }
  THArgCheck(THTensor_(isContiguous)(columns), 6, "columns needs to be contiguous");
  if (gradBias) {
    THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
    THArgCheck(THTensor_(isContiguous)(ones), 7, "ones needs to be contiguous");
  }

  int is_batch = 1;
  if (input->dim() == 3) {
    // Force batch
    is_batch = 0;
    THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2));
    THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2));
  }

  int64_t inputWidth   = input->size(3);
  int64_t inputHeight  = input->size(2);
  int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH;
  int64_t outputWidth  = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW;

  // Batch size + input planes
  int64_t batchSize = input->size(0);

  // Define a buffer of ones, for bias accumulation
  if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) {
    // Resize plane and fill with ones...
    THTensor_(resize2d)(ones, outputHeight, outputWidth);
    THTensor_(fill)(ones, 1);
  }

  // Resize temporary columns
  THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);

  // Helpers
  THTensor *input_n = THTensor_(new)();
  THTensor *gradOutput_n = THTensor_(new)();

  int64_t elt;
  // For each elt in batch, do:
  for (elt = 0; elt < batchSize; elt ++) {
    // Matrix multiply per output:
    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);

    // Do Weight:
    if (gradWeight) {
      // Matrix multiply per output:
      THTensor_(select)(input_n, input, 0, elt);

      // Extract columns:
      THNN_(im2col)(
        gradOutput_n->data<scalar_t>(),
        nOutputPlane, outputHeight, outputWidth,
        inputHeight, inputWidth,
        kH, kW, padH, padW, dH, dW,
        dilationH, dilationW,
        columns->data<scalar_t>()
      );

      // M,N,K are dims of matrix A and B
      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
      int64_t n = columns->size(0);   // nOutputPlane * kh * kw
      int64_t m = THTensor_sizeLegacyNoScalars(input_n, 0);   // nInputPlane
      int64_t k = columns->size(1);   // inputHeight * inputWidth

      // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
      THBlas_(gemm)(
          't', 'n',
          n, m, k,
          scale,
          columns->data<scalar_t>(), k,
          input_n->data<scalar_t>(), k,
          1,
          gradWeight->data<scalar_t>(), n
      );
    }

    // Do Bias:
    if (gradBias) {
      // M,N,K are dims of matrix A and B
      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
      int64_t m_ = nOutputPlane;
      int64_t k_ = outputHeight * outputWidth;

      // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
      THBlas_(gemv)(
          't',
          k_, m_,
          scale,
          gradOutput_n->data<scalar_t>(), k_,
          ones->data<scalar_t>(), 1,
          1,
          gradBias->data<scalar_t>(), 1
      );
    }
  }

  // Free
  c10::raw::intrusive_ptr::decref(input_n);
  c10::raw::intrusive_ptr::decref(gradOutput_n);

  // Resize
  if (is_batch == 0) {
    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
    THTensor_(resize3d)(input, input->size(1), inputHeight, inputWidth);
  }

  c10::raw::intrusive_ptr::decref(input);
  c10::raw::intrusive_ptr::decref(gradOutput);
}
Example 6
void THNN_(SparseLinear_legacyUpdateParameters)(
          THNNState *state,
          THTensor *weight,
          THTensor *bias,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput,
          accreal learningRate_)
{
  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  int64_t h, i;
  int64_t outDim = weight->size[0];
  int64_t inDim = weight->size[1];

  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
  THArgCheck(THNN_(checkLegacyInput)(lastInput), 6,
             "input size must be batchsize x nnz x 2");


  int64_t batchSize = THTensor_(size)(lastInput, 0);
  int64_t nnz = THTensor_(size)(lastInput, 1);

  // collect unique offsets of non-0 val in input
  THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz);
  int64_t cnt = 0;
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      real val = THNN_(get3d)(lastInput, h, i, 1);
      if (val == 0 ) {
        continue;
      }
      int64_t offset = (int64_t)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        THNN_(set1d)(offsets, cnt++, offset);
      } else {
        THError(
          "index out of bound. updateParameters: %d not between 1 and %d",
          offset + 1,
          inDim);
      }
    }
  }
  THTensor_(resize1d)(offsets, cnt);

  THTensor* uniqueOffsets = THTensor_(new)();
  THLongTensor* ri = THLongTensor_new();
  THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
  THLongTensor_free(ri);
  THTensor_(free)(offsets);

  cnt = 1;
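  // compact the sorted offsets in place so that each referenced column
  // appears exactly once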
  real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
  for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
    if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
      uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
    }
  }
  THTensor_(resize1d)(uniqueOffsets, cnt);

  // bias += -learningRate * gradBias
  THTensor_(cadd)(bias, bias, -learningRate, gradBias);
  // weight += -learningRate * gradWeight, but only for the unique columns
  // touched by the last input (collected above)
#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
  for (i = 0; i < cnt; i++) {
    int64_t offset = (int64_t)uniqueOffsets_p[i];
    THBlas_(axpy)(outDim,
                  -learningRate,
                  COL_PTR2(gradWeight, offset), gradWeight->stride[0],
                  COL_PTR2(weight, offset), weight->stride[0]);
  }

  THTensor_(free)(uniqueOffsets);
}
Example 7
void THNN_(SparseLinear_legacyAccGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  int64_t h, i;
  int64_t outDim = THTensor_(size)(weight, 0);
  int64_t inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkLegacyInput)(input), 2,
             "input size must be batchsize x nnz x 2");
  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
             "gradBias size wrong");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
             "gradOutput must be contiguous");

  int64_t batchSize = THTensor_(size)(input, 0);
  int64_t nnz = THTensor_(size)(input, 1);
  THTensor_(resize2d)(gradOutput, batchSize, outDim);

  // gradWeight += gradOutput * input
#pragma omp parallel for private(h, i) schedule(static) if (\
  batchSize * nnz * outDim > 10000)
  for (i = 0; i < nnz; i++) {
    for (h = 0; h < batchSize; h++) {
      real val = scale * THNN_(get3d)(input, h, i, 1);
      if (val == 0) {
        continue;
      }

      int64_t offset = (int64_t)(THNN_(get3d)(input, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        THBlas_(axpy)(outDim,
                      val,
                      ROW_PTR2(gradOutput, h), gradOutput->stride[1],
                      COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
      } else {
        THError(
          "index out of bound. accGradParameters: %d not between 1 and %d",
          offset + 1,
          inDim);
      }
    }
  }

  // gradBias += gradOutput
  THTensor* gradOutput_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(gradOutput_row, gradOutput, 0, h);
    THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row);
  }
  THTensor_(free)(gradOutput_row);

  if (weightDecay != 0) {
    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
  }
}
Example 8
void THNN_(SparseLinear_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  int64_t h, i, col, hp0, hp1;
  int64_t outDim = THTensor_(size)(weight, 0);
  int64_t inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkInput)(input), 2,
             "input must be in coo format, nnz x 3");
  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
             "gradBias size wrong");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
             "gradOutput must be contiguous");

  int64_t nnz = THTensor_(size)(input, 0);

  THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1);
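  // csc[col] .. csc[col+1] will delimit the nonzeros that belong to column
  // `col`; this assumes the COO input is sorted by its column index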
  THLongTensor_zero(csc);
  weight = THTensor_(newContiguous)(weight);

#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
  for (i = 0; i < nnz; i++) {
    hp0 = (int64_t)(THNN_(get2d)(input, i, 1)) - 1;
    hp1 = (i+1 == nnz) ?
            inDim :
            (int64_t)(THNN_(get2d)(input, i+1, 1)) - 1;
    if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
      THLongTensor_set1d(csc, h+1, i+1);
    }
  }

  // gradWeight += gradOutput * input
#pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000)
  for (col = 0; col < inDim; col++) {
    int64_t i_start = THLongTensor_get1d(csc, col);
    int64_t i_end = THLongTensor_get1d(csc, col+1);
    for (i = i_start; i < i_end; i++) {
      real val = scale * THNN_(get2d)(input, i, 2);

      h = (int64_t)(THNN_(get2d)(input, i, 0)) - 1;
      int64_t offset = (int64_t)(THNN_(get2d)(input, i, 1)) - 1;
      if (offset >= 0 && offset < inDim) {
        THBlas_(axpy)(outDim,
            val,
            ROW_PTR2(gradOutput, h), gradOutput->stride[1],
            COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
      } else {
        THError(
            "index out of bound. accGradParameters: %d not between 1 and %d",
            offset + 1,
            inDim);
      }
    }
  }

  // gradBias += gradOutput
  THTensor* buf = THTensor_(new)();
  THTensor_(sum)(buf, gradOutput, 0, 1);
  THTensor_(cadd)(gradBias, gradBias, scale, buf);
  THTensor_(free)(buf);
  THLongTensor_free(csc);

  if (weightDecay != 0) {
    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
  }
  THTensor_(free)(weight);
}
Example 9
void THNN_(LookupTable_accGradParameters)(
          THNNState *state,
          THIndexTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THIntegerTensor *count,
          THTensor *sorted,
          THIndexTensor *indices,
          bool scaleGradByFreq,
          int paddingValue,
          accreal ascale)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(ascale);
  ptrdiff_t i;
  THInteger_t *count_data = NULL;

  if (scaleGradByFreq)
  {
    THIntegerTensor_(resize1d)(count, gradWeight->size[0]);
    count_data = THIntegerTensor_(data)(count);
  }

  if (!THTensor_(isContiguous)(gradWeight))
    THError("gradWeight must be contiguous");
  if (!THIndexTensor_(isContiguous)(input))
    THError("input must be contiguous");
  if (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2) {
    THDescBuff s1 = THIndexTensor_(sizeDesc)(input);
    THError("input must be a vector or matrix, but is of shape: %s", s1.str);
  }

  THIndex_t *input_data = THIndexTensor_(data)(input);
  ptrdiff_t numel = THIndexTensor_(nElement)(input);
  long numw = THTensor_(size)(gradWeight, 0);

  // check that inputs are all within range
  for (i=0; i<numel; i++)
    if (input_data[i] < TH_INDEX_BASE || input_data[i] >= numw + TH_INDEX_BASE) {
      THError("inputs need to be in the range %ld <= input < %ld, "
	      "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE),
	      input_data[i]);
    }

  gradOutput = THTensor_(newContiguous)(gradOutput);

  real *gw = THTensor_(data)(gradWeight);
  real *go = THTensor_(data)(gradOutput);
  long stride = THTensor_(stride)(gradWeight, 0);

  if (count_data)
    THNN_(LookupTable_resetCount)(count_data, input);

#ifdef _OPENMP
  if (numel > 1000)
  {
    // The strategy is to parallelize over sections of the vocabulary, so that
    // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread
    // has to traverse the entire input, but the dominating factor is the axpy
    // BLAS call.
    #pragma omp parallel private(i)
    {
      int tid = omp_get_thread_num();
      int nthreads = omp_get_num_threads();

      long start = tid * (numw/nthreads + 1);
      long end = start + (numw/nthreads + 1);
      for (i=0; i<numel; i++)
      {
        if (input_data[i] != paddingValue)
        {
            long k = input_data[i] - TH_INDEX_BASE;
            if (k >= start && k < end)
            {
                real scale_ = scale;
                if (count_data) scale_ /= count_data[k];
                THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
            }
        }
      }
    }

    THTensor_(free)(gradOutput);
    return;
  }
#endif

  for (i=0; i<numel; i++)
  {
    if (input_data[i] != paddingValue)
    {
        long k = input_data[i] - TH_INDEX_BASE;
        real scale_ = scale;
        if (count_data) scale_ /= count_data[k];
        THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
     }
  }

  THTensor_(free)(gradOutput);
}
Example 10
void THNN_(LookupTable_renorm)(
          THNNState *state,
          THIndexTensor *idx,
          THTensor *weight,
          accreal maxNorm_,
          accreal normType_)
{
  real maxNorm = TH_CONVERT_ACCREAL_TO_REAL(maxNorm_);
  real normType = TH_CONVERT_ACCREAL_TO_REAL(normType_);
  if (!THTensor_(isContiguous)(weight))
    THError("weight must be contiguous");
  if (!THIndexTensor_(isContiguous)(idx))
    THError("input must be contiguous");
  if (THIndexTensor_(nDimension)(idx) != 1)
    THError("idx must be a vector");
  if (normType <= 0)
    THError("non-positive-norm not supported");

  ptrdiff_t i;
  THIndex_t *row_idx = THIndexTensor_(data)(idx);
  ptrdiff_t numel = THIndexTensor_(nElement)(idx);

  long numw = THTensor_(size)(weight, 0);
  long stride = THTensor_(stride)(weight, 0);
  real *gw = THTensor_(data)(weight);
  for (i=0; i<numel; i++) {
    if (row_idx[i] < TH_INDEX_BASE || row_idx[i] >= numw + TH_INDEX_BASE) {
      THError("input need to be in the range %ld <= input < %ld, "
	      "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE),
	      row_idx[i]);
    }
  }
  // get unique indices
  qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex));
  ptrdiff_t ptr = 0;
  for (i=0; i<numel; i++)
    if (i == 0 || row_idx[i] != row_idx[i-1])
      row_idx[ptr++] = row_idx[i];
  numel = ptr;

#ifdef _OPENMP
  if (numel > 1000)
  {
    // The strategy is to parallelize over the rows that appear in
    // row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads].
    // This distributes the work evenly to each thread.
    #pragma omp parallel for private(i)
    for (i=0; i<numel; i++)
    {
      long k = row_idx[i] - TH_INDEX_BASE;
      THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
    }
    return;
  }
#endif
  for (i=0; i<numel; i++)
  {
    long k = row_idx[i] - TH_INDEX_BASE;
    THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
  }
}
Example 11
void THNN_(SpatialSubSampling_accGradParameters)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradWeight,
    THTensor *gradBias,
    int kW, int kH,
    int dW, int dH,
    accreal scale_)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, gradWeight, kW, kH);

  long nbatch = 1;
  long dimw = 2;
  long dimh = 1;

  long inputWidth;
  long inputHeight;
  long outputWidth;
  long outputHeight;

  int nInputPlane = THTensor_(size)(gradWeight,0);

  real *gradWeight_data;
  real *gradBias_data;
  real *gradOutput_data;
  real *input_data;

  long k;

  if (input->nDimension == 4) {
    dimw++;
    dimh++;
    nbatch = input->size[0];
  }

  inputWidth = input->size[dimw];
  inputHeight = input->size[dimh];
  outputWidth = (inputWidth - kW) / dW + 1;
  outputHeight = (inputHeight - kH) / dH + 1;

  gradWeight_data = THTensor_(data)(gradWeight);
  gradBias_data = THTensor_(data)(gradBias);
  gradOutput = THTensor_(newContiguous)(gradOutput);
  gradOutput_data = THTensor_(data)(gradOutput);

  input = THTensor_(newContiguous)(input);
  input_data = THTensor_(data)(input);

#pragma omp parallel for private(k)
  for(k = 0; k < nInputPlane; k++)
  {
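    /* each plane k touches only gradWeight_data[k] and gradBias_data[k],
       so the OpenMP loop over planes does not race */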
    long p;
    for(p = 0; p < nbatch; p++)
    {
      real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
      real sum;
      long xx, yy;
      long i;

      sum = 0;
      for(i = 0; i < outputWidth*outputHeight; i++)
        sum += ptr_gradOutput[i];
      gradBias_data[k] += scale*sum;

      sum = 0;
      for(yy = 0; yy < outputHeight; yy++)
      {
        for(xx = 0; xx < outputWidth; xx++)
        {
          real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
          real z = *ptr_gradOutput++;
          long kx, ky;

          for(ky = 0; ky < kH; ky++)
          {
            for(kx = 0; kx < kW; kx++)
              sum += z * ptr_input[kx];
            ptr_input += inputWidth;
          }
        }
      }
      gradWeight_data[k] += scale*sum;
    }
  }

  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
}
Example 12
void THNN_(SpatialFullConvolutionMap_accGradParameters)(
  THNNState *state,
  THTensor *input,
  THTensor *gradOutput,
  THTensor *gradWeight,
  THTensor *gradBias,
  THTensor *connTable,
  int nInputPlane,
  int nOutputPlane,
  int dW, int dH,
  accreal scale_)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  THArgCheck(
    gradWeight != NULL && !gradWeight->is_empty() && gradWeight->dim() == 3
    && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
    "non-empty 3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
  );

  /* contiguous */
  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);

  /* get raw pointers */
  real *input_data = THTensor_(data)(input);
  real *gradOutput_data = THTensor_(data)(gradOutput);
  real *gradWeight_data = THTensor_(data)(gradWeight);
  real *gradBias_data = THTensor_(data)(gradBias);

  /* and dims */
  const int64_t input_h  = input->size[1];
  const int64_t input_w  = input->size[2];
  const int64_t output_h = gradOutput->size[1];
  const int64_t output_w = gradOutput->size[2];
  const int64_t weight_h = gradWeight->size[1];
  const int64_t weight_w = gradWeight->size[2];

  /* gradients wrt bias */
  int64_t k;
#pragma omp parallel for private(k)
  for (k = 0; k < nOutputPlane; k++)
  {
    real *ptr_gradOutput = gradOutput_data + k*output_w*output_h;
    int64_t l;
    for (l = 0; l < output_h*output_w; l++)
      gradBias_data[k] += scale*ptr_gradOutput[l];
  }

  /* gradients wrt weight */
  int nkernel = connTable->size[0];
#pragma omp parallel for private(k)
  for (k = 0; k < nkernel; k++)
  {
    int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
    int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;

    /* gradient to kernel */
    THTensor_(validXCorr2DRevptr)(
      gradWeight_data + k*weight_w*weight_h,
      scale,
      gradOutput_data + o*output_w*output_h, output_h, output_w,
      input_data + i*input_w*input_h, input_h, input_w,
      dH, dW
    );
  }

  /* clean up */
  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
}
Example 13
void THNN_(IndexLinear_accGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THTensor *valuesBuffer,
          accreal weightDecay_,
          accreal scale_)
{
  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t keysSize = THLongTensor_size(keys, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int64_t maxNormalize = (woutDim - outDim) > 0 ? 1 : 0;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
  int64_t* sizesData = THLongTensor_data(sizes);

  /* Compute the cumulative sizes */
  THLongTensor* cumSizes = THLongTensor_new();
  THLongTensor_cumsum(cumSizes, sizes, 0);
  int64_t* cumSizesData = THLongTensor_data(cumSizes);

  /* Resize the gradWeight buffer to keep it dense.
   * That speeds up updates A LOT assuming random mem access. */
  THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1));

  /* Access the storage data/strides */
  scalar_t* gradOutputData = gradOutput->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* gradWeightData = gradWeight->data<scalar_t>();
  scalar_t* gradBiasData = gradBias->data<scalar_t>();

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous");

  int i,j,k;

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    for (j = 0; j < batchSize; j++)
    {
      int64_t offset = j==0?0:cumSizesData[j-1];
      scalar_t val = gradOutputData[j] * scale;
      scalar_t* lgradWeightData = gradWeightData + offset;
      scalar_t* lvaluesData = valuesData + offset;
      int64_t end = sizesData[j];

      if (maxNormalize)
      {
        lgradWeightData += offset;
        i = 0;
        for(;i < end; i++)
        {
          lgradWeightData[2*i] = val;
          lgradWeightData[2*i+1] = val * lvaluesData[i];
        }
      }
      else
      {
        i = 0;
        for(;i < end-4; i += 4)
        {
          lgradWeightData[i] = val * lvaluesData[i];
          lgradWeightData[i+1] = val * lvaluesData[i+1];
          lgradWeightData[i+2] = val * lvaluesData[i+2];
          lgradWeightData[i+3] = val * lvaluesData[i+3];
        }

        for(; i < end; i++)
        {
          lgradWeightData[i] = val * lvaluesData[i];
        }
      }
      *gradBiasData += val;
      offset += end;
    }
  }
  else {
    for (j = 0; j < batchSize; j++)
    {
      int64_t offset = j==0?0:cumSizesData[j-1];
      scalar_t* lgradOutputData = gradOutputData + j*outDim;
      scalar_t* lgradWeightData = gradWeightData;
      THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim);
      for (i = 0; i < sizesData[j]; i++)
      {
        scalar_t val = valuesData[offset] * scale;
        lgradWeightData = gradWeightData + offset*outDim;
        if (maxNormalize)
        {
          lgradWeightData += offset*outDim;
          k = 0;
          for(;k < outDim-4; k += 4)
          {
            lgradWeightData[k] = lgradOutputData[k]*scale;
            lgradWeightData[k+1] = lgradOutputData[k+1]*scale;
            lgradWeightData[k+2] = lgradOutputData[k+2]*scale;
            lgradWeightData[k+3] = lgradOutputData[k+3]*scale;
          }

          for(; k < outDim; k++)
          {
            lgradWeightData[k] = lgradOutputData[k]*scale;
          }
          lgradWeightData += outDim;
        }
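        /* write val * gradOutput into this key's slot of the dense gradWeight
         * buffer, 4-way unrolled with a scalar tail loop */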
        k = 0;
        for(;k < outDim-4; k += 4)
        {
          lgradWeightData[k] = val * lgradOutputData[k];
          lgradWeightData[k+1] = val * lgradOutputData[k+1];
          lgradWeightData[k+2] = val * lgradOutputData[k+2];
          lgradWeightData[k+3] = val * lgradOutputData[k+3];
        }

        for(; k < outDim; k++)
        {
          lgradWeightData[k] = val * lgradOutputData[k];
        }
        offset++;
      }
    }
  }
  THLongTensor_free(cumSizes);
  return;
}
Example 14
void THNN_(IndexLinear_accUpdateGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  scalar_t weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");

  /* Access the storage data/strides */
  scalar_t* gradOutputData = gradOutput->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* weightData = weight->data<scalar_t>();
  scalar_t* biasData = bias->data<scalar_t>();
  int64_t weightStride0 = weight->stride(0);
  int64_t* keysData = THLongTensor_data(keys);
  int64_t* sizesData = THLongTensor_data(sizes);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias matrix must be contiguous");

  int i,j,k;

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    if (maxNormalize)
    {
        int64_t offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          scalar_t* lgradOutputData = gradOutputData + j;
          *biasData -= *lgradOutputData * scale;
          scalar_t val = *lgradOutputData * scale;
          for (i = 0; i < sizesData[j]; i++)
          {
            int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
            weightData[idx-1] -= weightData[idx]*val*weightData[idx-2];
            weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2];
            offset++;
          }
        }

        offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          for (i = 0; i < sizesData[j]; i++)
          {
            int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
            weightData[idx-2] = 0;
            offset++;
          }
        }
    }
    else
    {
      if (weightDecay)
      {
        int64_t offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          scalar_t* lgradOutputData = gradOutputData + j;
          *biasData -= *lgradOutputData * scale;
          scalar_t val = *lgradOutputData * scale;
          for (i = 0; i < sizesData[j]; i++)
          {
            int64_t idx = weightStride0*(keysData[offset] + keysOffset);
            weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay;
            offset++;
          }
        }
      }
      else
      {
        int64_t offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          scalar_t val = gradOutputData[j] * scale;
          for (i = 0; i < sizesData[j]; i++)
          {
            weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset];
            offset++;
          }
          *biasData -= val;
        }
      }
    }
  }
  else {
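    /* outDim > 1: apply the update directly to the touched rows of weight,
     * one (key, sample) pair at a time */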
    int64_t offset = 0;
    for (j = 0; j < batchSize; j++)
    {
      scalar_t* lgradOutputData = gradOutputData + j*outDim;
      scalar_t* lweightData = weightData;
      THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim);
      for (i = 0; i < sizesData[j]; i++)
      {
        scalar_t val = valuesData[offset] * scale;
        scalar_t wd = weightDecay;

        // Max normalize case
        if (maxNormalize)
        {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          val *= lweightData[0];
          wd *= lweightData[0];
          for (k=0; k < outDim; k++)
          {
            lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0];
          }
          lweightData += 2;
        }
        else
        {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset);
        }

        /* We do sparse weight decay.
         * We think it makes more sense. */
        if (weightDecay)
        {
          if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
          {
            THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1);
          }
          else
          {
            for (k=0; k < outDim; k++)
            {
              lweightData[k] -= wd * lweightData[k];
            }
          }
        }

        if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
        {
          THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1);
        }
        else
        {
          for (k=0; k < outDim; k++)
          {
            lweightData[k] -= val * lgradOutputData[k];
          }
        }
        offset++;
      }
    }

    /* Max Normalize case:
     * Reset the smart update scaling if
     * one does it batch-wise.
     * TODO: Decide what to do with that piece of code.
     * NB: If the code below is uncommented, the commented-out code in
     * IndexLinear:zeroGradParameters() should be uncommented as well. */

    /*
    if (maxNormalize)
    {
      offset = 0;
      for (j = 0; j < batchSize; j++)
      {
        scalar_t* lweightData = weightData;
        for (i = 0; i < sizesData[j]; i++)
        {
          scalar_t val = valuesData[offset] * scale;
          scalar_t wd = weightDecay;

          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          lweightData[0] = 0;
          offset++;
        }
      }
    }
    */
  }
  return;
}
Example 15
void THNN_(IndexLinear_updateParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THLongTensor *runningKeys,
          THLongTensor *cumSumSizes,
          int64_t keysOffset,
          accreal weightDecay_,
          accreal learningRate_)
{
  scalar_t weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  scalar_t learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  /* Retrieve all the dimensions of the problem */
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  int64_t keysSize = THLongTensor_size(runningKeys, 0);

  /* Access the storage data/strides */
  scalar_t* gradWeightData = gradWeight->data<scalar_t>();
  scalar_t* weightData = weight->data<scalar_t>();
  int64_t weightStride0 = weight->stride(0);
  scalar_t* gradBiasData = gradBias->data<scalar_t>();
  scalar_t* biasData = bias->data<scalar_t>();
  int64_t* keysData = THLongTensor_data(runningKeys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 3, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 4, "bias vector must be contiguous");
  THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous");

  int j, k;

  /* Update the bias first */
  THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim);

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    if (maxNormalize)
    {
      if (weightDecay)
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          scalar_t lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset];
        }
      }
      else
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          scalar_t lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr;
        }
      }
    }
    else
    {
      if (weightDecay)
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset);
          weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset];
        }
      }
      else
      {
        for (j = 0; j < keysSize; j++)
        {
          weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate;
        }
      }
    }
  }
  else
  {
    for (j = 0; j < keysSize; j++)
    {
      scalar_t lr = learningRate;
      scalar_t wd = weightDecay;
      scalar_t* lweightData;
      int64_t woffset = weightStride0*(keysData[j] + keysOffset);
      scalar_t* lgradWeightData = gradWeightData + j*outDim;
      if (maxNormalize)
      {
        lgradWeightData += j*outDim;
        /* weightData[woffset + 2] */
        lweightData = weightData + woffset + maxNormalize - 2;
        lr = lr*lweightData[0];
        wd = weightDecay*lweightData[0];
        /* weightData[woffset + 3] */
        lweightData++;
        for (k=0; k < outDim; k++)
        {
            lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr;
        }
        lweightData++;
        lgradWeightData += outDim;
      }
      else
      {
        lweightData = weightData + woffset;
      }

      /* We do sparse weight decay.
       * We think it makes more sense. */
      if (weightDecay)
      {
        for (k=0; k < outDim; k++)
        {
            lweightData[k] -= lweightData[k]*wd;
        }
      }

      if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
      {
        THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1);
      }
      else
      {
        for (k=0; k < outDim; k++)
        {
          lweightData[k] -= lgradWeightData[k]*lr;
        }
      }
    }
  }
}
void THNN_(SpatialConvolutionMap_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *connTable,
          int nInputPlane,
          int nOutputPlane,
          int dW, int dH,
          accreal scale_)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  THArgCheck(
    gradWeight != NULL && gradWeight->nDimension == 3
    && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
    "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
  );

  /* and dims */
  int dimw = 2;
  int dimh = 1;
  int64_t nbatch = 1;
  if (input->nDimension == 4)
  {
    nbatch = input->size[0];
    dimw++;
    dimh++;
  }

  const int64_t input_h  = input->size[dimh];
  const int64_t input_w  = input->size[dimw];
  const int64_t output_h = gradOutput->size[dimh];
  const int64_t output_w = gradOutput->size[dimw];
  const int64_t kH       = gradWeight->size[1];
  const int64_t kW       = gradWeight->size[2];

  /* contiguous */
  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);
  THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");

  /* get raw pointers */
  real *input_data = THTensor_(data)(input);
  real *gradOutput_data = THTensor_(data)(gradOutput);
  real *gradWeight_data = THTensor_(data)(gradWeight);
  real *gradBias_data = THTensor_(data)(gradBias);


  int64_t k;
  /* gradients wrt bias */
#pragma omp parallel for private(k)
  for (k = 0; k < nOutputPlane; k++)
  {
    int64_t m;
    for (m = 0; m < nbatch; m++)
    {
      real *ptr_gradOutput = gradOutput_data + k*output_w*output_h + m*nOutputPlane*output_w*output_h;
      int64_t l;
      for (l = 0; l < output_h*output_w; l++)
        gradBias_data[k] += scale*ptr_gradOutput[l];
    }
  }

  /* gradients wrt weight */
  const int nkernel = connTable->size[0];
#pragma omp parallel for private(k)
  for (k = 0; k < nkernel; k++)
  {
    int64_t m;
    for (m = 0; m < nbatch; m++)
    {
      int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
      int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;

      /* gradient to kernel */
      THTensor_(validXCorr2DRevptr)(
        gradWeight_data + k*kW*kH,
        scale,
        input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w,
        gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h , output_h, output_w,
        dH, dW
      );
    }
  }

  /* clean up */
  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
}
void THNN_(SpatialDilatedConvolution_accGradParameters)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradWeight,
    THTensor *gradBias,
    THTensor *columns,
    THTensor *ones,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    int dilationW, int dilationH,
    accreal scale_)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  THNN_(SpatialDilatedConvolution_shapeCheck)
    (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
     dilationH, dilationW);

  // Params
  int nInputPlane = gradWeight->size[1];
  int nOutputPlane = gradWeight->size[0];

  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);
  THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
  if (gradBias)
    THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
  int batch = 1;
  if (input->nDimension == 3) {
    // Force batch
    batch = 0;
    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0],
			gradOutput->size[1], gradOutput->size[2]);
  }

  long inputWidth   = input->size[3];
  long inputHeight  = input->size[2];
  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;

  // Batch size + input planes
  long batchSize = input->size[0];

  // Define a buffer of ones, for bias accumulation
  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
    // Resize plane and fill with ones...
    THTensor_(resize2d)(ones, outputHeight, outputWidth);
    THTensor_(fill)(ones, 1);
  }

  // Resize temporary columns
  THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);

  // Helpers
  THTensor *input_n = THTensor_(new)();
  THTensor *gradOutput_n = THTensor_(new)();

  // For each elt in batch, do:
  for (int elt = 0; elt < batchSize; elt ++) {
    // Matrix multiply per output:
    THTensor_(select)(input_n, input, 0, elt);
    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);

    // Extract columns:
    THNN_(im2col)(
      THTensor_(data)(input_n),
      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
      dilationH, dilationW,
      THTensor_(data)(columns)
    );

    // M,N,K are dims of matrix A and B
    long m = nOutputPlane;
    long n = nInputPlane*kW*kH;
    long k = columns->size[1];

    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    THBlas_(gemm)(
        't', 'n',
        n, m, k,
        scale,
        THTensor_(data)(columns), k,
        THTensor_(data)(gradOutput_n), k,
        1,
        THTensor_(data)(gradWeight), n
    );

    // Do Bias:
    // M,N,K are dims of matrix A and B
    long m_ = nOutputPlane;
    long k_ = outputHeight * outputWidth;

    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
    if (gradBias) {
      THBlas_(gemv)(
          't',
          k_, m_,
          scale,
          THTensor_(data)(gradOutput_n), k_,
          THTensor_(data)(ones), 1,
          1,
          THTensor_(data)(gradBias), 1
      );
    }
  }

  // Free
  THTensor_(free)(input_n);
  THTensor_(free)(gradOutput_n);

  // Resize
  if (batch == 0) {
    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
  }

  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
}