void THNN_(TemporalRowConvolution_accGradParameters)(
  THNNState *state,
  THTensor *input,
  THTensor *gradOutput,
  THTensor *gradWeight,
  THTensor *gradBias,
  THTensor *finput,
  THTensor *fgradInput,
  int kW,
  int dW,
  int padW,
  bool featFirst,
  accreal scale_) {

  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  int ndim = input->nDimension;

  THTensor *tinput, *tgradOutput;

  if (!featFirst) {
    tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
    tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);

    input = THTensor_(newContiguous)(tinput);
    gradOutput = THTensor_(newContiguous)(tgradOutput);
  } else {
    input = THTensor_(newContiguous)(input);
    gradOutput = THTensor_(newContiguous)(gradOutput);
  }

  THNN_(TemporalRowConvolution_shapeCheck)
    (state, input, gradOutput, gradWeight, gradBias, kW, dW, padW);

  if (ndim == 2) {
    THNN_(TemporalRowConvolution_accGradParameters_frame)(
      gradOutput, gradWeight, gradBias, finput, scale);
  } else {
    int64_t T = input->size[0];
    int64_t t;

    for (t = 0; t < T; t++) {
      THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);

      THNN_(TemporalRowConvolution_accGradParameters_frame)(
        gradOutput_t, gradWeight, gradBias, finput_t, scale);

      THTensor_(free)(gradOutput_t);
      THTensor_(free)(finput_t);
    }
  }

  if (!featFirst) {
    THTensor_(free)(tinput);
    THTensor_(free)(tgradOutput);
  }

  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
}
void THNN_(VolumetricConvolutionMM_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *finput,
          THTensor *fgradInput,
          int kT, int kW, int kH,
          int dT, int dW, int dH,
          int pT, int pW, int pH,
          accreal scale_)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  THNN_(VolumetricConvolutionMM_shapeCheck)(
        state, input, gradOutput, gradWeight, gradBias,
        kT, kW, kH, dT, dW, dH, pT, pW, pH, 1);

  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);

  if (gradWeight) {
    gradWeight = THNN_(newViewWeight)(gradWeight);
  }

  if (input->nDimension == 4)   // non-batch mode
  {
    THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
  }
  else  // batch mode
  {
    int64_t T = input->size[0];
    int64_t t;

#ifdef _OPENMP
    #pragma omp parallel for if(T > CONV3D_OMP_THRESHOLD) private(t)
#endif
    for (t = 0; t < T; t++)
    {
      THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
      THTensor *finput_t = NULL;
      if (gradWeight) {
        finput_t = THTensor_(newSelect)(finput, 0, t);
      }

      THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale);

      THTensor_(free)(gradOutput_t);
      if (gradWeight) {
        THTensor_(free)(finput_t);
      }
    }
  }

  THTensor_(free)(input);
  THTensor_(free)(gradOutput);

  if (gradWeight) {
    THTensor_(free)(gradWeight);
  }
}
void THNN_(Threshold_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          accreal threshold_,
          accreal val_,
          bool inplace)
{
  real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
  real val = TH_CONVERT_ACCREAL_TO_REAL(val_);
  if (inplace)
  {
    TH_TENSOR_APPLY(real, input,
      if (*input_data <= threshold)
        *input_data = val;
    );
    THTensor_(set)(output, input);
  }
  else
  {
    /* non-inplace path: write thresholded values into a freshly sized output */
    THTensor_(resizeAs)(output, input);
    TH_TENSOR_APPLY2(real, output, real, input,
      *output_data = (*input_data <= threshold) ? val : *input_data;
    );
  }
}
void THNN_(VolumetricConvolutionMM_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *finput,
          int kT, int kW, int kH,
          int dT, int dW, int dH,
          int pT, int pW, int pH,
          accreal scale_)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  int nOutputPlane = (int)gradWeight->size[0];

  THNN_(VolumetricConvolutionMM_shapeCheck)(
        state, input, gradOutput, gradWeight, gradBias,
        kT, kW, kH, dT, dW, dH, pT, pW, pH);

  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);

  gradWeight = THNN_(view_weight)(gradWeight);

  if (input->nDimension == 4)   // non-batch mode
  {
    THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
  }
  else  // batch mode
  {
    int64_t T = input->size[0];
    int64_t t;

    for (t = 0; t < T; t++)
    {
      THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);

      THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale);

      THTensor_(free)(gradOutput_t);
      THTensor_(free)(finput_t);
    }
  }

  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
  THTensor_(free)(gradWeight);
}
void THNN_(SpatialFullDilatedConvolution_accGradParameters)(
    THNNState *state,
    THTensor *input,
    THTensor *gradOutput,
    THTensor *gradWeight,
    THTensor *gradBias,
    THTensor *columns,
    THTensor *ones,
    int kW, int kH,
    int dW, int dH,
    int padW, int padH,
    int dilationW, int dilationH,
    int adjW, int adjH,
    accreal scale_)
{
  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  THNN_(SpatialFullDilatedConvolution_shapeCheck)
    (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
     dilationH, dilationW, adjH, adjW, 1);

  int64_t nOutputPlane;
  if (gradWeight) {
    nOutputPlane = THTensor_(size)(gradWeight, 1);
  } else if (gradBias) {
    nOutputPlane = THTensor_sizeLegacyNoScalars(gradBias, 0);
  } else {
    return;
  }

  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);
  if (gradWeight) {
    THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
  }
  THArgCheck(THTensor_(isContiguous)(columns), 6, "columns needs to be contiguous");
  if (gradBias) {
    THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
    THArgCheck(THTensor_(isContiguous)(ones), 7, "ones needs to be contiguous");
  }

  int is_batch = 1;
  if (input->dim() == 3) {
    // Force batch
    is_batch = 0;
    THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2));
    THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2));
  }

  int64_t inputWidth   = input->size(3);
  int64_t inputHeight  = input->size(2);
  int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH;
  int64_t outputWidth  = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW;

  // Batch size + input planes
  int64_t batchSize = input->size(0);

  // Define a buffer of ones, for bias accumulation
  if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) {
    // Resize plane and fill with ones...
    THTensor_(resize2d)(ones, outputHeight, outputWidth);
    THTensor_(fill)(ones, 1);
  }

  // Resize temporary columns
  THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);

  // Helpers
  THTensor *input_n = THTensor_(new)();
  THTensor *gradOutput_n = THTensor_(new)();

  int elt;
  // For each elt in batch, do:
  for (elt = 0; elt < batchSize; elt ++) {
    // Matrix multiply per output:
    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);

    // Do Weight:
    if (gradWeight) {
      // Matrix multiply per output:
      THTensor_(select)(input_n, input, 0, elt);

      // Extract columns:
      THNN_(im2col)(
        gradOutput_n->data<scalar_t>(),
        nOutputPlane, outputHeight, outputWidth,
        inputHeight, inputWidth,
        kH, kW, padH, padW, dH, dW,
        dilationH, dilationW,
        columns->data<scalar_t>()
      );

      // M,N,K are dims of matrix A and B
      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
      int64_t n = columns->size(0);   // nOutputPlane * kh * kw
      int64_t m = THTensor_sizeLegacyNoScalars(input_n, 0);   // nInputPlane
      int64_t k = columns->size(1);   // inputHeight * inputWidth

      // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
      THBlas_(gemm)(
          't', 'n',
          n, m, k,
          scale,
          columns->data<scalar_t>(), k,
          input_n->data<scalar_t>(), k,
          1,
          gradWeight->data<scalar_t>(), n
      );
    }

    // Do Bias:
    if (gradBias) {
      // M,N,K are dims of matrix A and B
      // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
      int64_t m_ = nOutputPlane;
      int64_t k_ = outputHeight * outputWidth;

      // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
      THBlas_(gemv)(
          't',
          k_, m_,
          scale,
          gradOutput_n->data<scalar_t>(), k_,
          ones->data<scalar_t>(), 1,
          1,
          gradBias->data<scalar_t>(), 1
      );
    }
  }

  // Free
  c10::raw::intrusive_ptr::decref(input_n);
  c10::raw::intrusive_ptr::decref(gradOutput_n);

  // Resize
  if (is_batch == 0) {
    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
    THTensor_(resize3d)(input, input->size(1), inputHeight, inputWidth);
  }

  c10::raw::intrusive_ptr::decref(input);
  c10::raw::intrusive_ptr::decref(gradOutput);
}
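/* Aside: the 't','n' GEMM above relies on the usual row/column-major duality:
 * a row-major M x K buffer handed to a column-major BLAS with leading
 * dimension K is read as the K x M transpose. The standalone sketch below
 * (illustrative only, not part of THNN; a hand-rolled column-major gemm
 * stands in for THBlas_(gemm)) checks that calling gemm('t','n', n, m, k, ...)
 * on row-major buffers computes C[m][n] += alpha * A[m][k] * B[n][k]^T,
 * i.e. gradWeight += scale * input_n * columns^T. */
#include <assert.h>
#include <math.h>
#include <stdio.h>

/* Column-major gemm, restricted to the transa='t', transb='n' case used
 * above: C(MxN, ldc) = alpha * A^T * B + beta * C, with A stored KxM
 * (lda=K) and B stored KxN (ldb=K). */
static void gemm_t_n(int M, int N, int K, double alpha,
                     const double *A, int lda,
                     const double *B, int ldb,
                     double beta, double *C, int ldc) {
  for (int j = 0; j < N; j++)
    for (int i = 0; i < M; i++) {
      double acc = 0;
      for (int l = 0; l < K; l++)
        acc += A[i * lda + l] * B[j * ldb + l];  /* A^T(i,l) = A(l,i) */
      C[j * ldc + i] = beta * C[j * ldc + i] + alpha * acc;
    }
}

int main(void) {
  enum { m = 2 /* nInputPlane */, n = 3 /* nOutputPlane*kH*kW */, k = 4 };
  double input[m][k], columns[n][k], gw[m][n], ref[m][n];
  for (int i = 0; i < m; i++) for (int l = 0; l < k; l++) input[i][l] = i + 0.1 * l;
  for (int j = 0; j < n; j++) for (int l = 0; l < k; l++) columns[j][l] = j - 0.2 * l;
  for (int i = 0; i < m; i++) for (int j = 0; j < n; j++) gw[i][j] = ref[i][j] = 1.0;

  /* Same call shape as the THBlas_(gemm) above: ('t','n', n, m, k, ...). */
  gemm_t_n(n, m, k, 0.5, &columns[0][0], k, &input[0][0], k, 1.0, &gw[0][0], n);

  /* Row-major reference: gradWeight += scale * input * columns^T. */
  for (int i = 0; i < m; i++)
    for (int j = 0; j < n; j++)
      for (int l = 0; l < k; l++)
        ref[i][j] += 0.5 * input[i][l] * columns[j][l];

  for (int i = 0; i < m; i++)
    for (int j = 0; j < n; j++)
      assert(fabs(gw[i][j] - ref[i][j]) < 1e-9);
  printf("row-major C += alpha*A*B^T matches gemm('t','n')\n");
  return 0;
}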
void THNN_(SparseLinear_legacyUpdateParameters)(
          THNNState *state,
          THTensor *weight,
          THTensor *bias,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput,
          accreal learningRate_)
{
  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  int64_t h, i;
  int64_t outDim = weight->size[0];
  int64_t inDim = weight->size[1];

  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
  THArgCheck(THNN_(checkLegacyInput)(lastInput), 6,
             "input size must be batchsize x nnz x 2");

  int64_t batchSize = THTensor_(size)(lastInput, 0);
  int64_t nnz = THTensor_(size)(lastInput, 1);

  // collect unique offsets of non-0 val in input
  THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz);
  int64_t cnt = 0;
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      real val = THNN_(get3d)(lastInput, h, i, 1);
      if (val == 0) {
        continue;
      }
      int64_t offset = (int64_t)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        THNN_(set1d)(offsets, cnt++, offset);
      } else {
        THError(
          "index out of bound. updateParameters: %lld not between 1 and %lld",
          (long long)(offset + 1), (long long)inDim);
      }
    }
  }
  THTensor_(resize1d)(offsets, cnt);

  THTensor* uniqueOffsets = THTensor_(new)();
  THLongTensor* ri = THLongTensor_new();
  THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
  THLongTensor_free(ri);
  THTensor_(free)(offsets);

  // deduplicate the sorted offsets in place
  cnt = 1;
  real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
  for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
    if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
      uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
    }
  }
  THTensor_(resize1d)(uniqueOffsets, cnt);

  // bias += -learningRate * gradBias
  THTensor_(cadd)(bias, bias, -learningRate, gradBias);

  // weight += -learningRate * gradWeight, only for the columns that were touched
  #pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
  for (i = 0; i < cnt; i++) {
    int64_t offset = (int64_t)uniqueOffsets_p[i];
    THBlas_(axpy)(outDim,
                  -learningRate,
                  COL_PTR2(gradWeight, offset), gradWeight->stride[0],
                  COL_PTR2(weight, offset), weight->stride[0]);
  }

  THTensor_(free)(uniqueOffsets);
}
void THNN_(SparseLinear_legacyAccGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  int64_t h, i;
  int64_t outDim = THTensor_(size)(weight, 0);
  int64_t inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkLegacyInput)(input), 2,
             "input size must be batchsize x nnz x 2");
  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
             "gradOutput must be contiguous");

  int64_t batchSize = THTensor_(size)(input, 0);
  int64_t nnz = THTensor_(size)(input, 1);
  THTensor_(resize2d)(gradOutput, batchSize, outDim);

  // gradWeight += gradOutput * input
  #pragma omp parallel for private(h, i) schedule(static) if (batchSize * nnz * outDim > 10000)
  for (i = 0; i < nnz; i++) {
    for (h = 0; h < batchSize; h++) {
      real val = scale * THNN_(get3d)(input, h, i, 1);
      if (val == 0) {
        continue;
      }

      int64_t offset = (int64_t)(THNN_(get3d)(input, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        THBlas_(axpy)(outDim,
                      val,
                      ROW_PTR2(gradOutput, h), gradOutput->stride[1],
                      COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
      } else {
        THError(
          "index out of bound. accGradParameters: %lld not between 1 and %lld",
          (long long)(offset + 1), (long long)inDim);
      }
    }
  }

  // gradBias += gradOutput
  THTensor* gradOutput_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(gradOutput_row, gradOutput, 0, h);
    THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row);
  }
  THTensor_(free)(gradOutput_row);

  if (weightDecay != 0) {
    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
  }
}
void THNN_(SparseLinear_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  int64_t h, i, col, hp0, hp1;
  int64_t outDim = THTensor_(size)(weight, 0);
  int64_t inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkInput)(input), 2,
             "input must be in coo format, nnz x 3");
  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
             "gradOutput must be contiguous");

  int64_t nnz = THTensor_(size)(input, 0);

  // build a CSC-style column-pointer array from the (sorted-by-column) COO input
  THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1);
  THLongTensor_zero(csc);
  weight = THTensor_(newContiguous)(weight);

  #pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
  for (i = 0; i < nnz; i++) {
    hp0 = (int64_t)(THNN_(get2d)(input, i, 1)) - 1;
    hp1 = (i+1 == nnz) ?
            inDim :
            (int64_t)(THNN_(get2d)(input, i+1, 1)) - 1;
    if (hp0 != hp1) {
      for (h = hp0; h < hp1; h++) {
        THLongTensor_set1d(csc, h+1, i+1);
      }
    }
  }

  // gradWeight += gradOutput * input
  #pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000)
  for (col = 0; col < inDim; col++) {
    int64_t i_start = THLongTensor_get1d(csc, col);
    int64_t i_end = THLongTensor_get1d(csc, col+1);
    for (i = i_start; i < i_end; i++) {
      real val = scale * THNN_(get2d)(input, i, 2);

      h = (int64_t)(THNN_(get2d)(input, i, 0)) - 1;
      int64_t offset = (int64_t)(THNN_(get2d)(input, i, 1)) - 1;
      if (offset >= 0 && offset < inDim) {
        THBlas_(axpy)(outDim,
                      val,
                      ROW_PTR2(gradOutput, h), gradOutput->stride[1],
                      COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
      } else {
        THError(
          "index out of bound. accGradParameters: %lld not between 1 and %lld",
          (long long)(offset + 1), (long long)inDim);
      }
    }
  }

  // gradBias += gradOutput
  THTensor* buf = THTensor_(new)();
  THTensor_(sum)(buf, gradOutput, 0, 1);
  THTensor_(cadd)(gradBias, gradBias, scale, buf);
  THTensor_(free)(buf);
  THLongTensor_free(csc);

  if (weightDecay != 0) {
    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
  }
  THTensor_(free)(weight);
}
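/* Aside: the csc tensor above is a CSC-style column-pointer array built from
 * COO entries that are assumed pre-sorted by column (1-indexed in the input).
 * A minimal standalone sketch of the same construction and how it is
 * consumed; names here are illustrative, not part of THNN. */
#include <stdio.h>

int main(void) {
  /* Column index (0-based here) of each of the nnz sorted COO entries. */
  long cols[] = {0, 0, 2, 2, 2, 3};
  long nnz = 6, inDim = 5;
  long csc[6] = {0};  /* inDim + 1 boundaries, zero-initialized */

  /* Same scheme as above: when the column changes between entry i and i+1,
   * every boundary in between points just past entry i. */
  for (long i = 0; i < nnz; i++) {
    long hp0 = cols[i];
    long hp1 = (i + 1 == nnz) ? inDim : cols[i + 1];
    for (long h = hp0; h < hp1; h++)
      csc[h + 1] = i + 1;
  }

  /* Entries of column c now live in the half-open range [csc[c], csc[c+1]),
   * which is exactly what the gradWeight loop above iterates over. */
  for (long c = 0; c < inDim; c++)
    printf("col %ld: entries [%ld, %ld)\n", c, csc[c], csc[c + 1]);
  return 0;
}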
void THNN_(LookupTable_accGradParameters)(
          THNNState *state,
          THIndexTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THIntegerTensor *count,
          THTensor *sorted,
          THIndexTensor *indices,
          bool scaleGradByFreq,
          int paddingValue,
          accreal ascale)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(ascale);
  ptrdiff_t i;
  THInteger_t *count_data = NULL;

  if (scaleGradByFreq)
  {
    THIntegerTensor_(resize1d)(count, gradWeight->size[0]);
    count_data = THIntegerTensor_(data)(count);
  }

  if (!THTensor_(isContiguous)(gradWeight))
    THError("gradWeight must be contiguous");
  if (!THIndexTensor_(isContiguous)(input))
    THError("input must be contiguous");
  if (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2) {
    THDescBuff s1 = THIndexTensor_(sizeDesc)(input);
    THError("input must be a vector or matrix, but is of shape: %s", s1.str);
  }

  THIndex_t *input_data = THIndexTensor_(data)(input);
  ptrdiff_t numel = THIndexTensor_(nElement)(input);
  long numw = THTensor_(size)(gradWeight, 0);

  // check that inputs are all within range
  for (i=0; i<numel; i++)
    if (input_data[i] < TH_INDEX_BASE || input_data[i] >= numw + TH_INDEX_BASE) {
      THError("inputs need to be in the range %ld <= input < %ld, "
              "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE),
              input_data[i]);
    }

  gradOutput = THTensor_(newContiguous)(gradOutput);

  real *gw = THTensor_(data)(gradWeight);
  real *go = THTensor_(data)(gradOutput);
  long stride = THTensor_(stride)(gradWeight, 0);

  if (count_data)
    THNN_(LookupTable_resetCount)(count_data, input);

#ifdef _OPENMP
  if (numel > 1000)
  {
    // The strategy is to parallelize over sections of the vocabulary, so that
    // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread
    // has to traverse the entire input, but the dominating factor is the axpy
    // BLAS call.
    #pragma omp parallel private(i)
    {
      int tid = omp_get_thread_num();
      int nthreads = omp_get_num_threads();

      long start = tid * (numw/nthreads + 1);
      long end = start + (numw/nthreads + 1);
      for (i=0; i<numel; i++)
      {
        if (input_data[i] != paddingValue)
        {
          long k = input_data[i] - TH_INDEX_BASE;
          if (k >= start && k < end)
          {
            real scale_ = scale;
            if (count_data) scale_ /= count_data[k];
            THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
          }
        }
      }
    }

    THTensor_(free)(gradOutput);
    return;
  }
#endif

  for (i=0; i<numel; i++)
  {
    if (input_data[i] != paddingValue)
    {
      long k = input_data[i] - TH_INDEX_BASE;
      real scale_ = scale;
      if (count_data) scale_ /= count_data[k];
      THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
    }
  }

  THTensor_(free)(gradOutput);
}
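/* Aside: a minimal standalone sketch (illustrative only, not THNN API) of the
 * lock-free partitioning used above: every thread scans the full index list,
 * but only applies the updates whose destination row falls in its own slice
 * of the table, so no two threads ever write to the same row and no locks or
 * atomics are needed. Assumes OpenMP. */
#include <omp.h>
#include <stdio.h>

#define NUM_ROWS 8   /* vocabulary size */
#define DIM 4        /* embedding width */

int main(void) {
  float table[NUM_ROWS][DIM] = {{0}};
  float grad[3][DIM] = {{1,1,1,1},{2,2,2,2},{3,3,3,3}};
  long idx[3] = {1, 5, 1};  /* rows to accumulate into (row 1 twice) */

  #pragma omp parallel
  {
    int tid = omp_get_thread_num();
    int nthreads = omp_get_num_threads();
    long rows_per_thread = NUM_ROWS / nthreads + 1;
    long start = tid * rows_per_thread, end = start + rows_per_thread;

    /* Each thread walks all indices but touches only its own rows. */
    for (long i = 0; i < 3; i++) {
      long k = idx[i];
      if (k >= start && k < end)
        for (long d = 0; d < DIM; d++)
          table[k][d] += grad[i][d];
    }
  }

  for (long r = 0; r < NUM_ROWS; r++)
    printf("row %ld: %g %g %g %g\n",
           r, table[r][0], table[r][1], table[r][2], table[r][3]);
  return 0;
}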
void THNN_(LookupTable_renorm)(
          THNNState *state,
          THIndexTensor *idx,
          THTensor *weight,
          accreal maxNorm_,
          accreal normType_)
{
  real maxNorm = TH_CONVERT_ACCREAL_TO_REAL(maxNorm_);
  real normType = TH_CONVERT_ACCREAL_TO_REAL(normType_);
  if (!THTensor_(isContiguous)(weight))
    THError("weight must be contiguous");
  if (!THIndexTensor_(isContiguous)(idx))
    THError("input must be contiguous");
  if (THIndexTensor_(nDimension)(idx) != 1)
    THError("idx must be a vector");
  if (normType <= 0)
    THError("non-positive-norm not supported");

  ptrdiff_t i;
  THIndex_t *row_idx = THIndexTensor_(data)(idx);
  ptrdiff_t numel = THIndexTensor_(nElement)(idx);

  long numw = THTensor_(size)(weight, 0);
  long stride = THTensor_(stride)(weight, 0);
  real *gw = THTensor_(data)(weight);
  for (i=0; i<numel; i++) {
    if (row_idx[i] < TH_INDEX_BASE || row_idx[i] >= numw + TH_INDEX_BASE) {
      THError("inputs need to be in the range %ld <= input < %ld, "
              "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE),
              row_idx[i]);
    }
  }
  // get unique indices
  qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex));
  ptrdiff_t ptr = 0;
  for (i=0; i<numel; i++)
    if (i == 0 || row_idx[i] != row_idx[i-1])
      row_idx[ptr++] = row_idx[i];
  numel = ptr;

#ifdef _OPENMP
  if (numel > 1000)
  {
    // The strategy is to parallelize over the rows that appear in
    // row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads].
    // This distributes the work evenly to each thread.
    #pragma omp parallel for private(i)
    for (i=0; i<numel; i++)
    {
      long k = row_idx[i] - TH_INDEX_BASE;
      THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
    }
    return;
  }
#endif
  for (i=0; i<numel; i++)
  {
    long k = row_idx[i] - TH_INDEX_BASE;
    THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
  }
}
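/* Aside: THNN_(LookupTable_renormRow) is not shown in this section. The
 * standalone sketch below is an assumed implementation of the usual renorm
 * rule (scale a row down when its p-norm exceeds maxNorm); it is for
 * illustration only and renorm_row is a hypothetical name. */
#include <math.h>
#include <stdio.h>

static void renorm_row(double *row, long n, double max_norm, double norm_type) {
  /* p-norm of the row: (sum_j |row[j]|^p)^(1/p) */
  double norm = 0;
  for (long j = 0; j < n; j++)
    norm += pow(fabs(row[j]), norm_type);
  norm = pow(norm, 1.0 / norm_type);

  if (norm > max_norm) {
    /* Scale the row so its norm lands (just under) max_norm. */
    double s = max_norm / (norm + 1e-7);
    for (long j = 0; j < n; j++)
      row[j] *= s;
  }
}

int main(void) {
  double row[4] = {3, 4, 0, 0};       /* L2 norm = 5 */
  renorm_row(row, 4, 1.0, 2.0);       /* clamp L2 norm to 1 */
  printf("%g %g\n", row[0], row[1]);  /* ~0.6 ~0.8 */
  return 0;
}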
void THNN_(SpatialSubSampling_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          int kW, int kH,
          int dW, int dH,
          accreal scale_)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, gradWeight, kW, kH);

  long nbatch = 1;
  long dimw = 2;
  long dimh = 1;

  long inputWidth;
  long inputHeight;
  long outputWidth;
  long outputHeight;

  int nInputPlane = THTensor_(size)(gradWeight,0);

  real *gradWeight_data;
  real *gradBias_data;
  real *gradOutput_data;
  real *input_data;

  long k;

  if (input->nDimension == 4) {
    dimw++;
    dimh++;
    nbatch = input->size[0];
  }

  inputWidth = input->size[dimw];
  inputHeight = input->size[dimh];
  outputWidth = (inputWidth - kW) / dW + 1;
  outputHeight = (inputHeight - kH) / dH + 1;

  gradWeight_data = THTensor_(data)(gradWeight);
  gradBias_data = THTensor_(data)(gradBias);
  gradOutput = THTensor_(newContiguous)(gradOutput);
  gradOutput_data = THTensor_(data)(gradOutput);

  input = THTensor_(newContiguous)(input);
  input_data = THTensor_(data)(input);

#pragma omp parallel for private(k)
  for(k = 0; k < nInputPlane; k++)
  {
    long p;
    for(p = 0; p < nbatch; p++)
    {
      real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
      real sum;
      long xx, yy;
      long i;

      sum = 0;
      for(i = 0; i < outputWidth*outputHeight; i++)
        sum += ptr_gradOutput[i];
      gradBias_data[k] += scale*sum;

      sum = 0;
      for(yy = 0; yy < outputHeight; yy++)
      {
        for(xx = 0; xx < outputWidth; xx++)
        {
          real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth + xx*dW;
          real z = *ptr_gradOutput++;
          long kx, ky;

          for(ky = 0; ky < kH; ky++)
          {
            for(kx = 0; kx < kW; kx++)
              sum += z * ptr_input[kx];
            ptr_input += inputWidth;
          }
        }
      }
      gradWeight_data[k] += scale*sum;
    }
  }

  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
}
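/* Aside: SpatialSubSampling has one scalar weight and one scalar bias per
 * plane; assuming the standard definition (forward output = weight[k] *
 * sum(kW x kH window) + bias[k]), dL/dbias[k] is the plain sum of gradOutput
 * over the plane and dL/dweight[k] sums gradOutput * (window sum), which is
 * what the nested loops above accumulate. A minimal single-plane numeric
 * check, illustrative only: */
#include <stdio.h>

int main(void) {
  /* 3x3 input, 2x2 kernel, stride 1 -> 2x2 output. */
  double in[3][3] = {{1,2,3},{4,5,6},{7,8,9}};
  double gout[2][2] = {{0.1,0.2},{0.3,0.4}};
  double gradBias = 0, gradWeight = 0;

  for (int y = 0; y < 2; y++)
    for (int x = 0; x < 2; x++) {
      double window_sum = 0;
      for (int ky = 0; ky < 2; ky++)
        for (int kx = 0; kx < 2; kx++)
          window_sum += in[y + ky][x + kx];
      gradBias   += gout[y][x];
      gradWeight += gout[y][x] * window_sum;
    }

  /* window sums: 12, 16, 24, 28 -> gradWeight = 1.2+3.2+7.2+11.2 = 22.8 */
  printf("gradBias=%g gradWeight=%g\n", gradBias, gradWeight);
  return 0;
}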
void THNN_(SpatialFullConvolutionMap_accGradParameters)(
  THNNState *state,
  THTensor *input,
  THTensor *gradOutput,
  THTensor *gradWeight,
  THTensor *gradBias,
  THTensor *connTable,
  int nInputPlane,
  int nOutputPlane,
  int dW, int dH,
  accreal scale_)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  THArgCheck(
    gradWeight != NULL && !gradWeight->is_empty() && gradWeight->dim() == 3
    && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
    "non-empty 3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
  );

  /* contiguous */
  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);

  /* get raw pointers */
  real *input_data = THTensor_(data)(input);
  real *gradOutput_data = THTensor_(data)(gradOutput);
  real *gradWeight_data = THTensor_(data)(gradWeight);
  real *gradBias_data = THTensor_(data)(gradBias);

  /* and dims */
  const int64_t input_h = input->size[1];
  const int64_t input_w = input->size[2];
  const int64_t output_h = gradOutput->size[1];
  const int64_t output_w = gradOutput->size[2];
  const int64_t weight_h = gradWeight->size[1];
  const int64_t weight_w = gradWeight->size[2];

  /* gradients wrt bias */
  int64_t k;
#pragma omp parallel for private(k)
  for (k = 0; k < nOutputPlane; k++)
  {
    real *ptr_gradOutput = gradOutput_data + k*output_w*output_h;
    int64_t l;
    for (l = 0; l < output_h*output_w; l++)
      gradBias_data[k] += scale*ptr_gradOutput[l];
  }

  /* gradients wrt weight */
  int nkernel = connTable->size[0];
#pragma omp parallel for private(k)
  for (k = 0; k < nkernel; k++)
  {
    int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
    int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;

    /* gradient to kernel */
    THTensor_(validXCorr2DRevptr)(
      gradWeight_data + k*weight_w*weight_h,
      scale,
      gradOutput_data + o*output_w*output_h, output_h, output_w,
      input_data + i*input_w*input_h, input_h, input_w,
      dH, dW
    );
  }

  /* clean up */
  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
}
void THNN_(IndexLinear_accGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THTensor *valuesBuffer,
          accreal weightDecay_,
          accreal scale_)
{
  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t keysSize = THLongTensor_size(keys, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int64_t maxNormalize = (woutDim - outDim) > 0 ? 1 : 0;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1,
             "Keys and values should have the same number of elements");
  int64_t* sizesData = THLongTensor_data(sizes);

  /* Compute the cumulative sizes */
  THLongTensor* cumSizes = THLongTensor_new();
  THLongTensor_cumsum(cumSizes, sizes, 0);
  int64_t* cumSizesData = THLongTensor_data(cumSizes);

  /* Resize the gradWeight buffer to keep it dense.
   * That speeds up updates A LOT assuming random mem access. */
  THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize > 0 ? 2 : 1));

  /* Access the storage data/strides */
  scalar_t* gradOutputData = gradOutput->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* gradWeightData = gradWeight->data<scalar_t>();
  scalar_t* gradBiasData = gradBias->data<scalar_t>();

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous");

  int i,j,k;

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    for (j = 0; j < batchSize; j++)
    {
      int64_t offset = j == 0 ? 0 : cumSizesData[j-1];
      scalar_t val = gradOutputData[j] * scale;
      scalar_t* lgradWeightData = gradWeightData + offset;
      scalar_t* lvaluesData = valuesData + offset;
      int64_t end = sizesData[j];

      if (maxNormalize)
      {
        lgradWeightData += offset;
        i = 0;
        for(; i < end; i++)
        {
          lgradWeightData[2*i] = val;
          lgradWeightData[2*i+1] = val * lvaluesData[i];
        }
      }
      else
      {
        i = 0;
        for(; i < end-4; i += 4)
        {
          lgradWeightData[i] = val * lvaluesData[i];
          lgradWeightData[i+1] = val * lvaluesData[i+1];
          lgradWeightData[i+2] = val * lvaluesData[i+2];
          lgradWeightData[i+3] = val * lvaluesData[i+3];
        }
        for(; i < end; i++)
        {
          lgradWeightData[i] = val * lvaluesData[i];
        }
      }
      *gradBiasData += val;
      offset += end;
    }
  }
  else
  {
    for (j = 0; j < batchSize; j++)
    {
      int64_t offset = j == 0 ? 0 : cumSizesData[j-1];
      scalar_t* lgradOutputData = gradOutputData + j*outDim;
      scalar_t* lgradWeightData = gradWeightData;
      THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim);
      for (i = 0; i < sizesData[j]; i++)
      {
        scalar_t val = valuesData[offset] * scale;
        lgradWeightData = gradWeightData + offset*outDim;
        if (maxNormalize)
        {
          lgradWeightData += offset*outDim;
          k = 0;
          for(; k < outDim-4; k += 4)
          {
            lgradWeightData[k] = lgradOutputData[k]*scale;
            lgradWeightData[k+1] = lgradOutputData[k+1]*scale;
            lgradWeightData[k+2] = lgradOutputData[k+2]*scale;
            lgradWeightData[k+3] = lgradOutputData[k+3]*scale;
          }
          for(; k < outDim; k++)
          {
            lgradWeightData[k] = lgradOutputData[k]*scale;
          }
          lgradWeightData += outDim;
        }
        k = 0;
        for(; k < outDim-4; k += 4)
        {
          lgradWeightData[k] = val * lgradOutputData[k];
          lgradWeightData[k+1] = val * lgradOutputData[k+1];
          lgradWeightData[k+2] = val * lgradOutputData[k+2];
          lgradWeightData[k+3] = val * lgradOutputData[k+3];
        }
        for(; k < outDim; k++)
        {
          lgradWeightData[k] = val * lgradOutputData[k];
        }
        offset++;
      }
    }
  }
  THLongTensor_free(cumSizes);
  return;
}
void THNN_(IndexLinear_accUpdateGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  scalar_t weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1,
             "Keys and values should have the same number of elements");

  /* Access the storage data/strides */
  scalar_t* gradOutputData = gradOutput->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* weightData = weight->data<scalar_t>();
  scalar_t* biasData = bias->data<scalar_t>();
  int64_t weightStride0 = weight->stride(0);
  int64_t* keysData = THLongTensor_data(keys);
  int64_t* sizesData = THLongTensor_data(sizes);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias matrix must be contiguous");

  int i,j,k;

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    if (maxNormalize)
    {
      int64_t offset = 0;
      for (j = 0; j < batchSize; j++)
      {
        scalar_t* lgradOutputData = gradOutputData + j;
        *biasData -= *lgradOutputData * scale;
        scalar_t val = *lgradOutputData * scale;
        for (i = 0; i < sizesData[j]; i++)
        {
          int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
          weightData[idx-1] -= weightData[idx]*val*weightData[idx-2];
          weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2];
          offset++;
        }
      }

      offset = 0;
      for (j = 0; j < batchSize; j++)
      {
        for (i = 0; i < sizesData[j]; i++)
        {
          int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
          weightData[idx-2] = 0;
          offset++;
        }
      }
    }
    else
    {
      if (weightDecay)
      {
        int64_t offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          scalar_t* lgradOutputData = gradOutputData + j;
          *biasData -= *lgradOutputData * scale;
          scalar_t val = *lgradOutputData * scale;
          for (i = 0; i < sizesData[j]; i++)
          {
            int64_t idx = weightStride0*(keysData[offset] + keysOffset);
            weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay;
            offset++;
          }
        }
      }
      else
      {
        int64_t offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          scalar_t val = gradOutputData[j] * scale;
          for (i = 0; i < sizesData[j]; i++)
          {
            weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset];
            offset++;
          }
          *biasData -= val;
        }
      }
    }
  }
  else
  {
    int64_t offset = 0;
    for (j = 0; j < batchSize; j++)
    {
      scalar_t* lgradOutputData = gradOutputData + j*outDim;
      scalar_t* lweightData = weightData;
      THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim);
      for (i = 0; i < sizesData[j]; i++)
      {
        scalar_t val = valuesData[offset] * scale;
        scalar_t wd = weightDecay;

        // Max normalize case
        if (maxNormalize)
        {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          val *= lweightData[0];
          wd *= lweightData[0];
          for (k=0; k < outDim; k++)
          {
            lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0];
          }
          lweightData += 2;
        }
        else
        {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset);
        }

        /* We do sparse weight decay.
         * We think it makes more sense. */
        if (weightDecay)
        {
          if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
          {
            THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1);
          }
          else
          {
            for (k=0; k < outDim; k++)
            {
              lweightData[k] -= wd * lweightData[k];
            }
          }
        }

        if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
        {
          THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1);
        }
        else
        {
          for (k=0; k < outDim; k++)
          {
            lweightData[k] -= val * lgradOutputData[k];
          }
        }
        offset++;
      }
    }

    /* Max Normalize case:
     * Reset the smart update scaling if
     * one does it batch-wise.
     * TODO: Decide what to do with that piece of code.
     * NB: If the code below is uncommented, so should the commented
     * code in IndexLinear:zeroGradParameters() */
    /*
    if (maxNormalize)
    {
      offset = 0;
      for (j = 0; j < batchSize; j++)
      {
        scalar_t* lweightData = weightData;
        for (i = 0; i < sizesData[j]; i++)
        {
          scalar_t val = valuesData[offset] * scale;
          scalar_t wd = weightDecay;
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          lweightData[0] = 0;
          offset++;
        }
      }
    }
    */
  }
  return;
}
void THNN_(IndexLinear_updateParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THLongTensor *runningKeys,
          THLongTensor *cumSumSizes,
          int64_t keysOffset,
          accreal weightDecay_,
          accreal learningRate_)
{
  scalar_t weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  scalar_t learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  /* Retrieve all the dimensions of the problem */
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  int64_t keysSize = THLongTensor_size(runningKeys, 0);

  /* Access the storage data/strides */
  scalar_t* gradWeightData = gradWeight->data<scalar_t>();
  scalar_t* weightData = weight->data<scalar_t>();
  int64_t weightStride0 = weight->stride(0);
  scalar_t* gradBiasData = gradBias->data<scalar_t>();
  scalar_t* biasData = bias->data<scalar_t>();
  int64_t* keysData = THLongTensor_data(runningKeys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 3, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 4, "bias vector must be contiguous");
  THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous");

  int j, k;

  /* Update the bias first */
  THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim);

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    if (maxNormalize)
    {
      if (weightDecay)
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          scalar_t lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset];
        }
      }
      else
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          scalar_t lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr;
        }
      }
    }
    else
    {
      if (weightDecay)
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset);
          weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset];
        }
      }
      else
      {
        for (j = 0; j < keysSize; j++)
        {
          weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate;
        }
      }
    }
  }
  else
  {
    for (j = 0; j < keysSize; j++)
    {
      scalar_t lr = learningRate;
      scalar_t wd = weightDecay;
      scalar_t* lweightData;
      int64_t woffset = weightStride0*(keysData[j] + keysOffset);
      scalar_t* lgradWeightData = gradWeightData + j*outDim;

      if (maxNormalize)
      {
        lgradWeightData += j*outDim;
        /* weightData[woffset + 2] */
        lweightData = weightData + woffset + maxNormalize - 2;
        lr = lr*lweightData[0];
        wd = weightDecay*lweightData[0];
        /* weightData[woffset + 3] */
        lweightData++;
        for (k=0; k < outDim; k++)
        {
          lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr;
        }
        lweightData++;
        lgradWeightData += outDim;
      }
      else
      {
        lweightData = weightData + woffset;
      }

      /* We do sparse weight decay.
       * We think it makes more sense. */
      if (weightDecay)
      {
        for (k=0; k < outDim; k++)
        {
          lweightData[k] -= lweightData[k]*wd;
        }
      }

      if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
      {
        THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1);
      }
      else
      {
        for (k=0; k < outDim; k++)
        {
          lweightData[k] -= lgradWeightData[k]*lr;
        }
      }
    }
  }
}
void THNN_(SpatialConvolutionMap_accGradParameters)(
  THNNState *state,
  THTensor *input,
  THTensor *gradOutput,
  THTensor *gradWeight,
  THTensor *gradBias,
  THTensor *connTable,
  int nInputPlane,
  int nOutputPlane,
  int dW, int dH,
  accreal scale_)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  THArgCheck(
    gradWeight != NULL && gradWeight->nDimension == 3
    && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
    "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
  );

  /* and dims */
  int dimw = 2;
  int dimh = 1;
  int64_t nbatch = 1;
  if (input->nDimension == 4)
  {
    nbatch = input->size[0];
    dimw++;
    dimh++;
  }

  const int64_t input_h = input->size[dimh];
  const int64_t input_w = input->size[dimw];
  const int64_t output_h = gradOutput->size[dimh];
  const int64_t output_w = gradOutput->size[dimw];
  const int64_t kH = gradWeight->size[1];
  const int64_t kW = gradWeight->size[2];

  /* contiguous */
  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);
  THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");

  /* get raw pointers */
  real *input_data = THTensor_(data)(input);
  real *gradOutput_data = THTensor_(data)(gradOutput);
  real *gradWeight_data = THTensor_(data)(gradWeight);
  real *gradBias_data = THTensor_(data)(gradBias);

  int64_t k;

  /* gradients wrt bias */
#pragma omp parallel for private(k)
  for (k = 0; k < nOutputPlane; k++)
  {
    int64_t m;
    for (m = 0; m < nbatch; m++)
    {
      real *ptr_gradOutput = gradOutput_data + k*output_w*output_h + m*nOutputPlane*output_w*output_h;
      int64_t l;
      for (l = 0; l < output_h*output_w; l++)
        gradBias_data[k] += scale*ptr_gradOutput[l];
    }
  }

  /* gradients wrt weight */
  const int nkernel = connTable->size[0];
#pragma omp parallel for private(k)
  for (k = 0; k < nkernel; k++)
  {
    int64_t m;
    for (m = 0; m < nbatch; m++)
    {
      int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
      int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;

      /* gradient to kernel */
      THTensor_(validXCorr2DRevptr)(
        gradWeight_data + k*kW*kH,
        scale,
        input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w,
        gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, output_h, output_w,
        dH, dW
      );
    }
  }

  /* clean up */
  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
}
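/* Aside: connTable is an nkernel x 2 table of (inputPlane, outputPlane)
 * pairs, 1-indexed via TH_INDEX_BASE; each row selects one input/output
 * plane pair that shares a kH x kW kernel. The standalone sketch below
 * builds a fully-connected table in that spirit (illustrative only; the
 * exact row ordering Torch's table constructors used is not guaranteed
 * here): */
#include <stdio.h>

int main(void) {
  int nInputPlane = 2, nOutputPlane = 3;
  /* One row per (input, output) pair: nkernel = nInputPlane * nOutputPlane. */
  long connTable[2 * 3][2];
  int k = 0;
  for (int i = 1; i <= nInputPlane; i++)
    for (int o = 1; o <= nOutputPlane; o++) {
      connTable[k][0] = i;  /* column 0: input plane (1-indexed)  */
      connTable[k][1] = o;  /* column 1: output plane (1-indexed) */
      k++;
    }
  for (int r = 0; r < k; r++)
    printf("kernel %d: in=%ld out=%ld\n", r, connTable[r][0], connTable[r][1]);
  return 0;
}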
void THNN_(SpatialDilatedConvolution_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *columns,
          THTensor *ones,
          int kW, int kH,
          int dW, int dH,
          int padW, int padH,
          int dilationW, int dilationH,
          accreal scale_)
{
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  THNN_(SpatialDilatedConvolution_shapeCheck)
    (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
     dilationH, dilationW);

  // Params
  int nInputPlane = gradWeight->size[1];
  int nOutputPlane = gradWeight->size[0];

  input = THTensor_(newContiguous)(input);
  gradOutput = THTensor_(newContiguous)(gradOutput);
  THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
  if (gradBias)
    THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");

  int batch = 1;
  if (input->nDimension == 3) {
    // Force batch
    batch = 0;
    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
  }

  long inputWidth   = input->size[3];
  long inputHeight  = input->size[2];
  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;

  // Batch size + input planes
  long batchSize = input->size[0];

  // Define a buffer of ones, for bias accumulation
  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
    // Resize plane and fill with ones...
    THTensor_(resize2d)(ones, outputHeight, outputWidth);
    THTensor_(fill)(ones, 1);
  }

  // Resize temporary columns
  THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);

  // Helpers
  THTensor *input_n = THTensor_(new)();
  THTensor *gradOutput_n = THTensor_(new)();

  // For each elt in batch, do:
  for (int elt = 0; elt < batchSize; elt ++) {
    // Matrix multiply per output:
    THTensor_(select)(input_n, input, 0, elt);
    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);

    // Extract columns:
    THNN_(im2col)(
      THTensor_(data)(input_n),
      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
      dilationH, dilationW,
      THTensor_(data)(columns)
    );

    // M,N,K are dims of matrix A and B
    long m = nOutputPlane;
    long n = nInputPlane*kW*kH;
    long k = columns->size[1];

    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    THBlas_(gemm)(
        't', 'n',
        n, m, k,
        scale,
        THTensor_(data)(columns), k,
        THTensor_(data)(gradOutput_n), k,
        1,
        THTensor_(data)(gradWeight), n
    );

    // Do Bias:
    // M,N,K are dims of matrix A and B
    long m_ = nOutputPlane;
    long k_ = outputHeight * outputWidth;

    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
    if (gradBias) {
      THBlas_(gemv)(
          't',
          k_, m_,
          scale,
          THTensor_(data)(gradOutput_n), k_,
          THTensor_(data)(ones), 1,
          1,
          THTensor_(data)(gradBias), 1
      );
    }
  }

  // Free
  THTensor_(free)(input_n);
  THTensor_(free)(gradOutput_n);

  // Resize
  if (batch == 0) {
    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
  }

  THTensor_(free)(input);
  THTensor_(free)(gradOutput);
}
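/* Aside: a quick numeric check of the dilated output-size formula used above,
 * outputHeight = (inputHeight + 2*padH - (dilationH*(kH-1)+1)) / dH + 1.
 * Standalone and illustrative only: */
#include <stdio.h>

static long conv_out_size(long in, long pad, long k, long d, long stride) {
  long effective_k = d * (k - 1) + 1;  /* extent of the dilated kernel */
  return (in + 2 * pad - effective_k) / stride + 1;
}

int main(void) {
  /* 32x32 input, 3x3 kernel, dilation 2, pad 2, stride 1:
   * effective kernel = 2*(3-1)+1 = 5, output = (32+4-5)/1 + 1 = 32. */
  printf("out = %ld\n", conv_out_size(32, 2, 3, 2, 1));
  return 0;
}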