static int nn_(SparseLinear_updateOutput)(lua_State *L)
{
  long i;
  THTensor * input = luaT_checkudata(L, 2, torch_(Tensor_id));
  THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_(Tensor_id));
  THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_(Tensor_id));
  THTensor * output = luaT_getfieldcheckudata(L, 1, "output", torch_(Tensor_id));
  long dim = weight->size[0]; /* number of weights.. */

  THTensor_(copy)(output, bias);
  for(i = 0; i < input->size[1]; i++)
  {
    long offset = (long)(THTensor_(get2d)(input, 0, i))-1;

    if(offset >= 0 && offset < dim) /* make sure indices are in bounds.. */
    {
      real val = THTensor_(get2d)(input, 1, i);
      THBlas_(axpy)(output->size[0],
                    val,
                    THTensor_(data)(weight)+offset*weight->stride[0],
                    weight->stride[1],
                    THTensor_(data)(output),
                    output->stride[0]);
    }
    else
      luaL_error(L, "index out of bound");
  }
  return 1;
}
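/*
 * Sketch (not part of the module): the legacy SparseLinear input above is a
 * 2 x nnz tensor -- row 0 holds 1-based feature indices, row 1 the matching
 * values -- and, as indexed by the code, weight behaves as an inDim x outDim
 * matrix, so the update is
 *   output = bias + sum_i value[i] * weight[index[i]-1, :]
 * The plain-C loop below mirrors that accumulation on flat arrays; names and
 * row-major storage are illustrative assumptions only.
 */
static void sparse_linear_forward_ref(const double *weight, /* inDim x outDim, row-major */
                                      const double *bias,   /* outDim */
                                      const long *index,    /* nnz, 1-based */
                                      const double *value,  /* nnz */
                                      long nnz, long inDim, long outDim,
                                      double *output)       /* outDim */
{
  long i, j;
  for (j = 0; j < outDim; j++)
    output[j] = bias[j];
  for (i = 0; i < nnz; i++) {
    long row = index[i] - 1;                /* convert to 0-based */
    if (row < 0 || row >= inDim) continue;  /* the module raises an error here */
    for (j = 0; j < outDim; j++)
      output[j] += value[i] * weight[row * outDim + j];
  }
}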
int nn_(SparseLinear_updateParameters)(lua_State *L)
{
  long i;
  real learningRate = luaL_checknumber(L, 2);
  THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_(Tensor_id));
  THTensor * output = luaT_getfieldcheckudata(L, 1, "output", torch_(Tensor_id));
  THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_(Tensor_id));
  THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_(Tensor_id));
  THTensor * gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_(Tensor_id));
  THTensor * lastInput = luaT_getfieldcheckudata(L, 1, "lastInput", torch_(Tensor_id));
  real weightDecay = luaT_getfieldchecknumber(L, 1, "weightDecay");
  long dim = weight->size[0]; /* number of weights.. */

  THTensor_(cadd)(bias, bias, -learningRate, gradBias);
  for(i = 0; i < lastInput->size[1]; i++)
  {
    long offset = (long)(THTensor_(get2d)(lastInput, 0, i))-1;

    if(offset >= 0 && offset < dim) /* make sure indices are in bounds.. */
    {
      THBlas_(axpy)(bias->size[0],
                    -learningRate,
                    THTensor_(data)(gradWeight)+offset*gradWeight->stride[0],
                    gradWeight->stride[1],
                    THTensor_(data)(weight)+offset*weight->stride[0],
                    weight->stride[1]);
    }
    else
      luaL_error(L, "index out of bound");
  }
  return 0;
}
void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, real *x, long incx, real beta, real *y, long incy)
{
  if(n == 1)
    lda = m;

#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
  if( (m <= INT_MAX) && (n <= INT_MAX) &&
      (lda > 0) && (lda <= INT_MAX) &&
      (incx > 0) && (incx <= INT_MAX) &&
      (incy > 0) && (incy <= INT_MAX) )
  {
    /* CBLAS enums only exist when the BLAS path is compiled in, so the
       translation of 'trans' lives inside the guard. */
    int cblas_trans = CblasNoTrans;
    if((trans == 't') || (trans == 'T'))
      cblas_trans = CblasTrans;

    int i_m = (int)m;
    int i_n = (int)n;
    int i_lda = (int)lda;
    int i_incx = (int)incx;
    int i_incy = (int)incy;

#if defined(TH_REAL_IS_DOUBLE)
    cblas_dgemv(CblasColMajor, cblas_trans, i_m, i_n, alpha, a, i_lda, x, i_incx, beta, y, i_incy);
#else
    cblas_sgemv(CblasColMajor, cblas_trans, i_m, i_n, alpha, a, i_lda, x, i_incx, beta, y, i_incy);
#endif
    return;
  }
#endif
  {
    long i, j;

    if( (trans == 'T') || (trans == 't') )
    {
      for(i = 0; i < n; i++)
      {
        real sum = 0;
        real *row_ = a+lda*i;
        for(j = 0; j < m; j++)
          sum += x[j*incx]*row_[j];
        y[i*incy] = beta*y[i*incy] + alpha*sum;
      }
    }
    else
    {
      if(beta != 1)
        THBlas_(scal)(m, beta, y, incy);

      for(j = 0; j < n; j++)
      {
        real *column_ = a+lda*j;
        real z = alpha*x[j*incx];
        for(i = 0; i < m; i++)
          y[i*incy] += z*column_[i];
      }
    }
  }
}
static int nn_(SparseLinear_accGradParameters)(lua_State *L)
{
  long i;
  THTensor * input = luaT_checkudata(L, 2, torch_(Tensor_id));
  THTensor * gradOutput = luaT_checkudata(L, 3, torch_(Tensor_id));
  real scale = luaL_optnumber(L, 4, 1);
  THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_(Tensor_id));
  THTensor * output = luaT_getfieldcheckudata(L, 1, "output", torch_(Tensor_id));
  THTensor * gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_(Tensor_id));
  THTensor * gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_(Tensor_id));
  THTensor * lastInput = luaT_getfieldcheckudata(L, 1, "lastInput", torch_(Tensor_id));
  real weightDecay = luaT_getfieldchecknumber(L, 1, "weightDecay");
  long dim = gradWeight->size[0]; /* number of weights.. */

  for(i = 0; i < input->size[1]; i++)
  {
    long offset = (long)(THTensor_(get2d)(input, 0, i))-1;

    if(offset >= 0 && offset < dim) /* make sure indices are in bounds.. */
    {
      real val = scale*THTensor_(get2d)(input, 1, i);

      THBlas_(scal)(gradOutput->size[0],
                    0,
                    THTensor_(data)(gradWeight)+offset*gradWeight->stride[0],
                    gradWeight->stride[1]); /* zero */

      THBlas_(axpy)(gradOutput->size[0],
                    val,
                    THTensor_(data)(gradOutput),
                    gradOutput->stride[0],
                    THTensor_(data)(gradWeight)+offset*gradWeight->stride[0],
                    gradWeight->stride[1]);
    }
    else
      luaL_error(L, "index out of bound");
  }

  THTensor_(cadd)(gradBias, gradBias, 1, gradOutput);

  if(weightDecay != 0)
    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);

  THTensor_(resizeAs)(lastInput, input);
  THTensor_(copy)(lastInput, input);

  return 0;
}
void THBlas_(gemv)(char trans, int64_t m, int64_t n, real alpha, real *a, int64_t lda, real *x, int64_t incx, real beta, real *y, int64_t incy)
{
  if(n == 1)
    lda = m;

#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT))
  if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) &&
      (incx > 0) && (incx <= INT_MAX) &&
      (incy > 0) && (incy <= INT_MAX) )
  {
    THArgCheck(lda >= THMax(1, m), 6,
      "lda should be at least max(1, m=%d), but have %d", m, lda);
    int i_m = (int)m;
    int i_n = (int)n;
    int i_lda = (int)lda;
    int i_incx = (int)incx;
    int i_incy = (int)incy;

#if defined(TH_REAL_IS_DOUBLE)
    dgemv_(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy);
#else
    sgemv_(&trans, &i_m, &i_n, &alpha, a, &i_lda, x, &i_incx, &beta, y, &i_incy);
#endif
    return;
  }
#endif
  {
    int64_t i, j;

    if( (trans == 'T') || (trans == 't') )
    {
      for(i = 0; i < n; i++)
      {
        real sum = 0;
        real *row_ = a+lda*i;
        for(j = 0; j < m; j++)
          sum += x[j*incx]*row_[j];
        if (beta == 0)
          y[i*incy] = alpha*sum;
        else
          y[i*incy] = beta*y[i*incy] + alpha*sum;
      }
    }
    else
    {
      if(beta != 1)
        THBlas_(scal)(m, beta, y, incy);

      for(j = 0; j < n; j++)
      {
        real *column_ = a+lda*j;
        real z = alpha*x[j*incx];
        for(i = 0; i < m; i++)
          y[i*incy] += z*column_[i];
      }
    }
  }
}
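/*
 * Sketch: a self-contained double-precision reference for what the fallback
 * branch above computes -- y = alpha*op(A)*x + beta*y with A stored
 * column-major and leading dimension lda, unit increments assumed.  When
 * beta == 0 the destination is overwritten without being read, which is the
 * BLAS convention the newer fallback honours.  Illustrative only; not the
 * TH implementation.
 */
static void gemv_ref(char trans, long m, long n, double alpha,
                     const double *a, long lda,
                     const double *x, double beta, double *y)
{
  long i, j;
  if (trans == 't' || trans == 'T') {      /* y has length n */
    for (i = 0; i < n; i++) {
      double sum = 0;
      for (j = 0; j < m; j++)
        sum += a[i * lda + j] * x[j];      /* walk down column i of A */
      y[i] = (beta == 0) ? alpha * sum : beta * y[i] + alpha * sum;
    }
  } else {                                 /* y has length m */
    for (i = 0; i < m; i++)
      y[i] = (beta == 0) ? 0 : beta * y[i];
    for (j = 0; j < n; j++)
      for (i = 0; i < m; i++)
        y[i] += alpha * x[j] * a[j * lda + i];
  }
}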
void THNN_(SparseLinear_accGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, THTensor *bias, real weightDecay, real scale) { long h, i; long outDim = THTensor_(size)(weight, 0); long inDim = THTensor_(size)(weight, 1); THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3"); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); THArgCheck(THTensor_(isContiguous)(gradOutput), 1, "gradOutput must be contiguous"); long nnz = THTensor_(size)(input, 0); // THTensor_(resize2d)(gradOutput, batchSize, outDim); // gradWeight += gradOutput * input #pragma omp parallel for private(h, i) schedule(static) if (\ nnz * outDim > 10000) for (i = 0; i < nnz; i++) { real val = scale * THNN_(get2d)(input, i, 2); long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; long h = (long)(THNN_(get2d)(input, i, 0)) - 1; if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, ROW_PTR2(gradOutput, h), gradOutput->stride[1], COL_PTR2(gradWeight, offset), gradWeight->stride[0]); } else { THError( "index out of bound. accGradParameters: %d not between 1 and %d", offset + 1, inDim); } } // gradBias += gradOutput THTensor* buf = THTensor_(new)(); THTensor_(sum)(buf, gradOutput, 0); THTensor_(cadd)(gradBias, gradBias, scale, buf); THTensor_(free)(buf); if (weightDecay != 0) { THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); } }
static int nn_(SparseLinear_updateOutput)(lua_State *L) { long i; THTensor * input = luaT_checkudata(L, 2, torch_Tensor); THTensor * weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); THTensor * bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); THTensor * output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); long outDim = weight->size[0]; long inDim = weight->size[1]; luaL_argcheck(L, nn_(checkInput)(input), 2, "input size must be nnz x 2"); luaL_argcheck(L, nn_(checkSize1D)(output, outDim), 1, "output size wrong"); luaL_argcheck(L, nn_(checkSize1D)(bias, outDim), 1, "bias size wrong"); lua_getfield(L, 1, "shardBuffer"); if (!lua_isnil(L, -1)) { THTensor *buffer = luaT_getfieldcheckudata(L, 1, "shardBuffer", torch_Tensor); long num_shards = buffer->size[1]; luaL_argcheck(L, buffer->nDimension == 2 && buffer->size[0] == outDim && num_shards > 0, 1, "shardBuffer size wrong"); THTensor_(zero)(buffer); #pragma omp parallel for private(i) schedule(static) num_threads(num_shards) for (i = 0; i < input->size[0]; i++) { #ifdef _OPENMP int shardId = omp_get_thread_num(); #else int shardId = 1; #endif long offset = (long)(THTensor_(get2d)(input, i, 0)) - 1; if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, THTensor_(get2d)(input, i, 1), THTensor_(data)(weight) + offset * weight->stride[1], weight->stride[0], THTensor_(data)(buffer) + shardId * buffer->stride[1], buffer->stride[0]); } else { luaL_error(L, "index out of bound. updateOutput: \ %ld not between 1 and %ld", offset + 1, inDim); } } THTensor_(sum)(output, buffer, 1); THTensor_(cadd)(output, bias, 1.0, output); lua_getfield(L, 1, "output"); return 1; }
void THNN_(SparseLinear_legacyUpdateOutput)( THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias) { int64_t h, i; int64_t outDim = THTensor_(size)(weight, 0); int64_t inDim = THTensor_(size)(weight, 1); THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2"); THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); weight = THTensor_(newContiguous)(weight); int64_t batchSize = THTensor_(size)(input, 0); int64_t nnz = THTensor_(size)(input, 1); THTensor_(resize2d)(output, batchSize, outDim); // output = weight * input + bias THTensor_(zero)(output); #pragma omp parallel for private(h, i) schedule(static) if ( \ batchSize > 1 && batchSize * nnz * outDim > 10000) for (h = 0; h < batchSize; h++) { for (i = 0; i < nnz; i++) { real val = THNN_(get3d)(input, h, i, 1); if (val == 0) { continue; } int64_t offset = (int64_t)(THNN_(get3d)(input, h, i, 0)) - 1; if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, COL_PTR2(weight, offset), weight->stride[0], ROW_PTR2(output, h), output->stride[1]); } else { THError("index out of bound. updateOutput: %d not between 1 and %d", offset + 1, inDim); } } } THTensor* output_row = THTensor_(new)(); for (h = 0; h < batchSize; h++) { THTensor_(select)(output_row, output, 0, h); THTensor_(cadd)(output_row, bias, 1.0, output_row); } THTensor_(free)(output_row); THTensor_(free)(weight); }
void THNN_(SparseLinear_updateOutput)( THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias) { long h, i; long outDim = THTensor_(size)(weight, 0); long inDim = THTensor_(size)(weight, 1); long batchSize = THTensor_(size)(output, 0); THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3"); THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); long nnz = THTensor_(size)(input, 0); // output = weight * input + bias THTensor_(zero)(output); #pragma omp parallel for private(i) schedule(static) if (nnz * outDim > 10000) for (i = 0; i < nnz; i++) { real val = THNN_(get2d)(input, i, 2); if (val == 0) { continue; } long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; long h = (long)(THNN_(get2d)(input, i, 0)) - 1; if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, COL_PTR2(weight, offset), weight->stride[0], ROW_PTR2(output, h), output->stride[1]); } else { THError("index out of bound. updateOutput: %d not between 1 and %d", offset + 1, inDim); } } THTensor* output_row = THTensor_(new)(); for (h = 0; h < batchSize; h++) { THTensor_(select)(output_row, output, 0, h); THTensor_(cadd)(output_row, bias, 1.0, output_row); } THTensor_(free)(output_row); }
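/*
 * Sketch: the THNN SparseLinear input is an nnz x 3 COO matrix -- column 0
 * is the 1-based sample (batch row) index, column 1 the 1-based feature
 * index, column 2 the value.  For every entry the kernel above does
 *   output[sample, :] += value * weight[:, feature]
 * and bias is added to every output row afterwards.  Flat-array equivalent
 * below; names and row-major storage are illustrative assumptions only.
 */
static void sparse_linear_coo_forward_ref(const double *weight,  /* outDim x inDim, row-major */
                                          const double *bias,    /* outDim */
                                          const double (*coo)[3],/* nnz rows: {sample, feature, value}, 1-based */
                                          long nnz, long batchSize, long inDim, long outDim,
                                          double *output)        /* batchSize x outDim, row-major */
{
  long i, h, o;
  for (h = 0; h < batchSize; h++)
    for (o = 0; o < outDim; o++)
      output[h * outDim + o] = 0;
  for (i = 0; i < nnz; i++) {
    long h = (long)coo[i][0] - 1;
    long f = (long)coo[i][1] - 1;
    double v = coo[i][2];
    if (v == 0 || f < 0 || f >= inDim) continue;   /* the kernel errors on bad indices */
    for (o = 0; o < outDim; o++)
      output[h * outDim + o] += v * weight[o * inDim + f];  /* column f of weight */
  }
  for (h = 0; h < batchSize; h++)
    for (o = 0; o < outDim; o++)
      output[h * outDim + o] += bias[o];
}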
void THNN_(VolumetricDilatedConvolution_updateGradInput)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *gradColumns, int kT, int kW, int kH, int dT, int dW, int dH, int padT, int padW, int padH, int dilationT, int dilationW, int dilationH) { THNN_(VolumetricDilatedConvolution_shapeCheck)( input, gradOutput, weight, NULL, kT, kH, kW, dT, dH, dW, padT, padH, padW, dilationT, dilationH, dilationW, 0); // Params int nInputPlane = weight->size[1]; int nOutputPlane = weight->size[0]; input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); weight = THTensor_(newContiguous)(weight); THArgCheck(THTensor_(isContiguous)(gradColumns), 5, "gradColumns needs to be contiguous"); int is_batch = 1; if (input->_dim() == 4) { // Force batch is_batch = 0; THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); } int64_t inputDepth = input->size[2]; int64_t inputWidth = input->size[4]; int64_t inputHeight = input->size[3]; int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes int64_t batchSize = input->size[0]; // Resize output THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); // Resize temporary columns THTensor_(resize2d)(gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); THTensor_(zero)(gradColumns); // Helpers THTensor *gradInput_n = THTensor_(new)(); THTensor *gradOutput_n = THTensor_(new)(); // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per sample: THTensor_(select)(gradInput_n, gradInput, 0, elt); THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // M,N,K are dims of matrix A and B int64_t m = nInputPlane*kT*kW*kH; int64_t n = gradColumns->size[1]; int64_t k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 'n', 't', n, m, k, 1, THTensor_(data)(gradOutput_n), n, THTensor_(data)(weight), m, 0, THTensor_(data)(gradColumns), n ); // Unpack columns back into input: THNN_(col2vol)( THTensor_(data)(gradColumns), nInputPlane, inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, dilationT, dilationH, dilationW, THTensor_(data)(gradInput_n) ); } // Free THTensor_(free)(gradInput_n); THTensor_(free)(gradOutput_n); // Resize output if (is_batch == 0) { THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); } THTensor_(free)(input); THTensor_(free)(gradOutput); THTensor_(free)(weight); }
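/*
 * Sketch: the output spatial size used by the dilated convolution kernels in
 * this file (depth, height and width all follow the same formula).  Helper
 * name is illustrative only.
 */
static int64_t dilated_conv_out_size(int64_t in, int pad, int dilation, int k, int d)
{
  return (in + 2 * pad - (dilation * (k - 1) + 1)) / d + 1;
}
/* e.g. in=32, pad=1, dilation=2, k=3, d=1  ->  (32 + 2 - 5) / 1 + 1 = 30 */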
void THNN_(SpatialFullDilatedConvolution_updateOutput)( THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *columns, THTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int adjW, int adjH) { THNN_(SpatialFullDilatedConvolution_shapeCheck) (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, dilationH, dilationW, adjH, adjW, 0); int nInputPlane = THTensor_(size)(weight,0); int nOutputPlane = THTensor_(size)(weight,1); input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); THArgCheck(THTensor_(isContiguous)(columns), 5, "columns needs to be contiguous"); if (bias) { bias = THTensor_(newContiguous)(bias); THArgCheck(THTensor_(isContiguous)(ones), 6, "ones needs to be contiguous"); } int is_batch = 1; if (input->dim() == 3) { // Force batch is_batch = 0; THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); } int64_t inputHeight = input->size(2); int64_t inputWidth = input->size(3); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes int64_t batchSize = input->size(0); // Resize output THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); // Resize temporary columns THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth); THTensor_(zero)(columns); // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... 
THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); } // Helpers THTensor *input_n = THTensor_(new)(); THTensor *output_n = THTensor_(new)(); int elt; // For each elt in batch, do: for (elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per output: THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(output_n, output, 0, elt); // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = weight->size(1) * weight->size(2) * weight->size(3); int64_t n = columns->size(1); int64_t k = weight->size(0); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 'n', 't', n, m, k, 1, input_n->data<scalar_t>(), n, weight->data<scalar_t>(), m, 0, columns->data<scalar_t>(), n ); // Unpack columns back into input: THNN_(col2im)( columns->data<scalar_t>(), nOutputPlane, outputHeight, outputWidth, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, output_n->data<scalar_t>() ); // Do Bias after: // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m_ = nOutputPlane; int64_t n_ = outputHeight * outputWidth; int64_t k_ = 1; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) if (bias) { THBlas_(gemm)( 't', 'n', n_, m_, k_, 1, ones->data<scalar_t>(), k_, bias->data<scalar_t>(), k_, 1, output_n->data<scalar_t>(), n_ ); } } // Free c10::raw::intrusive_ptr::decref(input_n); c10::raw::intrusive_ptr::decref(output_n); // Resize output if (is_batch == 0) { THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); } c10::raw::intrusive_ptr::decref(input); c10::raw::intrusive_ptr::decref(weight); if (bias) c10::raw::intrusive_ptr::decref(bias); }
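/*
 * Sketch: the output spatial size used above for the full (transposed)
 * dilated convolution, including the extra output adjustment adj.  Helper
 * name is illustrative only.
 */
static int64_t full_dilated_conv_out_size(int64_t in, int d, int pad,
                                          int k, int dilation, int adj)
{
  return (in - 1) * d - 2 * pad + (dilation * (k - 1) + 1) + adj;
}
/* e.g. in=8, d=2, pad=1, k=3, dilation=1, adj=0  ->  (8-1)*2 - 2 + 3 = 15 */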
void THNN_(SpatialFullDilatedConvolution_accGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *columns, THTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int adjW, int adjH, accreal scale_) { scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); THNN_(SpatialFullDilatedConvolution_shapeCheck) (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, dilationH, dilationW, adjH, adjW, 1); int64_t nOutputPlane; if (gradWeight) { nOutputPlane = THTensor_(size)(gradWeight, 1); } else if (gradBias) { nOutputPlane = THTensor_sizeLegacyNoScalars(gradBias, 0); } else { return; } input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); if (gradWeight) { THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); } THArgCheck(THTensor_(isContiguous)(columns), 6, "columns needs to be contiguous"); if (gradBias) { THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); THArgCheck(THTensor_(isContiguous)(ones), 7, "ones needs to be contiguous"); } int is_batch = 1; if (input->dim() == 3) { // Force batch is_batch = 0; THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } int64_t inputWidth = input->size(3); int64_t inputHeight = input->size(2); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes int64_t batchSize = input->size(0); // Define a buffer of ones, for bias accumulation if (ones->dim() != 2 || ones->size(0)*ones->size(1) < outputHeight*outputWidth) { // Resize plane and fill with ones... 
THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); } // Resize temporary columns THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth); // Helpers THTensor *input_n = THTensor_(new)(); THTensor *gradOutput_n = THTensor_(new)(); int elt; // For each elt in batch, do: for (elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per output: THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // Do Weight: if (gradWeight) { // Matrix mulitply per output: THTensor_(select)(input_n, input, 0, elt); // Extract columns: THNN_(im2col)( gradOutput_n->data<scalar_t>(), nOutputPlane, outputHeight, outputWidth, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, columns->data<scalar_t>() ); // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t n = columns->size(0); // nOutputPlane * kh * kw int64_t m = THTensor_sizeLegacyNoScalars(input_n, 0); // nInputPlane int64_t k = columns->size(1); // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 't', 'n', n, m, k, scale, columns->data<scalar_t>(), k, input_n->data<scalar_t>(), k, 1, gradWeight->data<scalar_t>(), n ); } // Do Bias: if (gradBias) { // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m_ = nOutputPlane; int64_t k_ = outputHeight * outputWidth; // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) THBlas_(gemv)( 't', k_, m_, scale, gradOutput_n->data<scalar_t>(), k_, ones->data<scalar_t>(), 1, 1, gradBias->data<scalar_t>(), 1 ); } } // Free c10::raw::intrusive_ptr::decref(input_n); c10::raw::intrusive_ptr::decref(gradOutput_n); // Resize if (is_batch == 0) { THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, input->size(1), inputHeight, inputWidth); } c10::raw::intrusive_ptr::decref(input); c10::raw::intrusive_ptr::decref(gradOutput); }
void THNN_(SpatialFullDilatedConvolution_updateGradInput)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *gradColumns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int adjW, int adjH) { THNN_(SpatialFullDilatedConvolution_shapeCheck) (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, dilationH, dilationW, adjH, adjW, 0); int64_t nInputPlane = THTensor_(size)(weight,0); int64_t nOutputPlane = THTensor_(size)(weight,1); input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); weight = THTensor_(newContiguous)(weight); THArgCheck(THTensor_(isContiguous)(gradColumns), 5, "gradColumns needs to be contiguous"); int is_batch = 1; if (input->dim() == 3) { // Force batch is_batch = 0; THTensor_(resize4d)(input, 1, input->size(0), input->size(1), input->size(2)); THTensor_(resize4d)(gradOutput, 1, gradOutput->size(0), gradOutput->size(1), gradOutput->size(2)); } int64_t inputWidth = input->size(3); int64_t inputHeight = input->size(2); int64_t outputHeight = (inputHeight - 1) * dH - 2*padH + (dilationH * (kH - 1) + 1) + adjH; int64_t outputWidth = (inputWidth - 1) * dW - 2*padW + (dilationW * (kW - 1) + 1) + adjW; // Batch size + input planes int64_t batchSize = input->size(0); // Resize output THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); THTensor_(zero)(gradInput); // Resize temporary columns THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth); // Helpers THTensor *gradInput_n = THTensor_(new)(); THTensor *gradOutput_n = THTensor_(new)(); int elt; // For each elt in batch, do: for (elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per sample: THTensor_(select)(gradInput_n, gradInput, 0, elt); THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // Extract columns: THNN_(im2col)( gradOutput_n->data<scalar_t>(), nOutputPlane, outputHeight, outputWidth, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, gradColumns->data<scalar_t>() ); // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) int64_t m = weight->size(0); int64_t n = gradColumns->size(1); int64_t k = weight->size(1) * weight->size(2) * weight->size(3); // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 'n', 'n', n, m, k, 1, gradColumns->data<scalar_t>(), n, weight->data<scalar_t>(), k, 0, gradInput_n->data<scalar_t>(), n ); } // Free c10::raw::intrusive_ptr::decref(gradInput_n); c10::raw::intrusive_ptr::decref(gradOutput_n); // Resize output if (is_batch == 0) { THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); } c10::raw::intrusive_ptr::decref(input); c10::raw::intrusive_ptr::decref(gradOutput); c10::raw::intrusive_ptr::decref(weight); }
void THNN_(SparseLinear_updateOutput)( THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias) { int64_t h, i, hp0, hp1; int64_t outDim = THTensor_(size)(weight, 0); int64_t inDim = THTensor_(size)(weight, 1); int64_t batchSize = THTensor_(size)(output, 0); THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3"); THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); int64_t nnz = THTensor_(size)(input, 0); THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1); THLongTensor_zero(csr); weight = THTensor_(newContiguous)(weight); //#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) for (i=0; i<nnz; i++) { hp0 = (int64_t)(THNN_(get2d)(input, i, 0)) - 1; hp1 = (i+1 == nnz) ? batchSize : (int64_t)(THNN_(get2d)(input, i+1, 0)) - 1; if (hp0 != hp1) for (h = hp0; h < hp1; h++) { THLongTensor_set1d(csr, h+1, i+1); } } // output = weight * input + bias THTensor_(zero)(output); #pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000) for (h = 0; h < batchSize; h++) { int64_t i_start = THLongTensor_get1d(csr, h); int64_t i_end = THLongTensor_get1d(csr, h+1); for (i = i_start; i < i_end; i++) { real val = THNN_(get2d)(input, i, 2); if (val == 0) { continue; } int64_t offset = (int64_t)(THNN_(get2d)(input, i, 1)) - 1; if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, COL_PTR2(weight, offset), weight->stride[0], ROW_PTR2(output, h), output->stride[1]); } else { THError("index out of bound. updateOutput: %d not between 1 and %d", offset + 1, inDim); } } } THTensor* output_row = THTensor_(new)(); for (h = 0; h < batchSize; h++) { THTensor_(select)(output_row, output, 0, h); THTensor_(cadd)(output_row, bias, 1.0, output_row); } THTensor_(free)(output_row); THLongTensor_free(csr); THTensor_(free)(weight); }
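/*
 * Sketch: the csr tensor above is a row-pointer array of length batchSize+1
 * built from COO entries that are assumed to be sorted by sample index, so
 * that the entries of sample h live in [csr[h], csr[h+1]).  Plain-array
 * version of the same construction; illustrative only.
 */
static void build_row_pointers(const long *sample, /* nnz, 0-based, sorted ascending */
                               long nnz, long batchSize,
                               long *csr)          /* batchSize + 1 entries */
{
  long i, h;
  for (i = 0; i < batchSize + 1; i++)
    csr[i] = 0;
  for (i = 0; i < nnz; i++) {
    long hp0 = sample[i];
    long hp1 = (i + 1 == nnz) ? batchSize : sample[i + 1];
    for (h = hp0; h < hp1; h++)   /* also fills pointers for empty rows */
      csr[h + 1] = i + 1;
  }
}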
void THNN_(SparseLinear_legacyUpdateParameters)( THNNState *state, THTensor *weight, THTensor *bias, THTensor *gradWeight, THTensor *gradBias, THTensor *lastInput, accreal learningRate_) { real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); int64_t h, i; int64_t outDim = weight->size[0]; int64_t inDim = weight->size[1]; THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong"); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); THArgCheck(THNN_(checkLegacyInput)(lastInput), 6, "input size must be batchsize x nnz x 2"); int64_t batchSize = THTensor_(size)(lastInput, 0); int64_t nnz = THTensor_(size)(lastInput, 1); // collect unique offsets of non-0 val in input THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz); int64_t cnt = 0; for (h = 0; h < batchSize; h++) { for (i = 0; i < nnz; i++) { real val = THNN_(get3d)(lastInput, h, i, 1); if (val == 0 ) { continue; } int64_t offset = (int64_t)(THNN_(get3d)(lastInput, h, i, 0)) - 1; if (offset >= 0 && offset < inDim) { THNN_(set1d)(offsets, cnt++, offset); } else { THError( "index out of bound. updateParameters: %d not between 1 and %d", offset + 1, inDim); } } } THTensor_(resize1d)(offsets, cnt); THTensor* uniqueOffsets = THTensor_(new)(); THLongTensor* ri = THLongTensor_new(); THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0); THLongTensor_free(ri); THTensor_(free)(offsets); cnt = 1; real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets); for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) { if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) { uniqueOffsets_p[cnt++] = uniqueOffsets_p[i]; } } THTensor_(resize1d)(uniqueOffsets, cnt); // weight += -learningRate * gradWeight THTensor_(cadd)(bias, bias, -learningRate, gradBias); #pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000) for (i = 0; i < cnt; i++) { int64_t offset = (int64_t)uniqueOffsets_p[i]; THBlas_(axpy)(outDim, -learningRate, COL_PTR2(gradWeight, offset), gradWeight->stride[0], COL_PTR2(weight, offset), weight->stride[0]); } THTensor_(free)(uniqueOffsets); }
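/*
 * Sketch: legacyUpdateParameters above only touches weight columns that
 * actually appeared in lastInput.  It gathers the offsets, sorts them, then
 * compacts adjacent duplicates in place -- the same pattern as below (the
 * comparison is on doubles because the offsets live in a real tensor).
 * Illustrative only.
 */
static long compact_sorted(double *v, long n)
{
  long i, cnt;
  if (n == 0) return 0;
  cnt = 1;                          /* the first element is always kept */
  for (i = 1; i < n; i++)
    if (v[i] != v[i - 1])
      v[cnt++] = v[i];
  return cnt;                       /* unique values now sit at the front of v */
}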
void THNN_(SpatialDilatedConvolution_updateOutput)( THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *columns, THTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH) { THNN_(SpatialDilatedConvolution_shapeCheck) (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, dilationH, dilationW); // Params: int nInputPlane = weight->size[1]; int nOutputPlane = weight->size[0]; input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); bias = bias ? THTensor_(newContiguous)(bias) : bias; int batch = 1; if (input->nDimension == 3) { // Force batch batch = 0; THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); } long inputWidth = input->size[3]; long inputHeight = input->size[2]; long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes long batchSize = input->size[0]; // Resize output THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); THTensor_(zero)(output); // Resize temporary columns THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); } // Helpers THTensor *input_n = THTensor_(new)(); THTensor *output_n = THTensor_(new)(); // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per output: THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(output_n, output, 0, elt); // Do Bias first: // M,N,K are dims of matrix A and B long m_ = nOutputPlane; long n_ = outputHeight * outputWidth; long k_ = 1; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) if (bias) { THBlas_(gemm)( 't', 'n', n_, m_, k_, 1, THTensor_(data)(ones), k_, THTensor_(data)(bias), k_, 0, THTensor_(data)(output_n), n_ ); } else { THTensor_(zero)(output_n); } // Extract columns: THNN_(im2col)( THTensor_(data)(input_n), nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, THTensor_(data)(columns) ); // M,N,K are dims of matrix A and B long m = nOutputPlane; long n = columns->size[1]; long k = nInputPlane*kH*kW; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 'n', 'n', n, m, k, 1, THTensor_(data)(columns), n, THTensor_(data)(weight), k, 1, THTensor_(data)(output_n), n ); } // Free THTensor_(free)(input_n); THTensor_(free)(output_n); // Resize output if (batch == 0) { THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); } THTensor_(free)(input); THTensor_(free)(weight); if (bias) THTensor_(free)(bias); }
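/*
 * Sketch: the recurring "gemm assumes column-major matrices" comment.  A
 * row-major matrix reinterpreted as column-major is its transpose, so the
 * row-major product C = A * B (A: m x k, B: k x n, C: m x n) is obtained
 * from a column-major gemm by swapping the operands:
 *   gemm('n', 'n', n, m, k, alpha, B, n, A, k, beta, C, n)
 * which is the operand order used for output_n = weight * columns above
 * (there with beta = 1 so the bias already written into output_n is kept).
 * Naive column-major reference and the swap wrapper below; illustrative only.
 */
static void gemm_colmajor_nn(long m, long n, long k, double alpha,
                             const double *a, long lda,        /* m x k, column-major */
                             const double *b, long ldb,        /* k x n, column-major */
                             double beta, double *c, long ldc) /* m x n, column-major */
{
  long i, j, p;
  for (j = 0; j < n; j++)
    for (i = 0; i < m; i++) {
      double sum = 0;
      for (p = 0; p < k; p++)
        sum += a[p * lda + i] * b[j * ldb + p];
      c[j * ldc + i] = (beta == 0 ? 0 : beta * c[j * ldc + i]) + alpha * sum;
    }
}

/* Row-major C = A * B, computed through the column-major routine: */
static void matmul_rowmajor(long m, long n, long k,
                            const double *a, const double *b, double *c)
{
  gemm_colmajor_nn(n, m, k, 1.0, b, n, a, k, 0.0, c, n);
}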
static int nnconv1d_(HorizontalConvolution_accGradParameters)(lua_State *L) { THTensor *input = luaT_checkudata(L, 2, torch_Tensor); THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); real scale = luaL_optnumber(L, 4, 1); int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); int kL = luaT_getfieldcheckint(L, 1, "kL"); THTensor *ones = luaT_getfieldcheckudata(L, 1, "ones", torch_Tensor); THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); // change to batch mode int batch = 1; if (input->nDimension == 3) { batch = 0; THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); } long batchSize = input->size[0]; long inputHeight = input->size[2]; long inputWidth = input->size[3]; long outputHeight = inputHeight; long outputWidth = inputWidth - kL + 1; if (ones->nDimension != 1 || ones->size[0] < outputHeight*outputWidth) { THTensor_(resize1d)(ones, outputHeight*outputWidth); THTensor_(fill)(ones, 1); } int elt; for (elt = 0; elt < batchSize; elt++) { // select each batch in 2D THTensor *input_t = THTensor_(newSelect)(input, 0, elt); THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput->storage, gradOutput->storageOffset, nOutputPlane, -1, outputWidth*outputHeight, -1); // dot products int i, j, k; for (i = 0; i < nInputPlane; i++) { for (k = 0; k < kL; k++) { for (j = 0; j < outputHeight; j++) { *(gradWeight->storage->data + gradWeight->storageOffset + i*gradWeight->stride[0] + k) += scale*THBlas_(dot) (outputWidth, gradOutput_t->storage->data + gradOutput_t->storageOffset + i*gradOutput_t->stride[0] + j*gradOutput_t->stride[1], gradOutput_t->stride[2], input_t->storage->data + input_t->storageOffset + i*input_t->stride[0] + j*input_t->stride[1] + k, input_t->stride[2]); } } } // fill biases THTensor_(addmv)(gradBias, 1, gradBias, scale, gradOutput2d, ones); THTensor_(free)(gradOutput2d); THTensor_(free)(input_t); THTensor_(free)(gradOutput_t); } // revert to single batch if (batch == 0) { THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); } return 0; }
static int nn_(SpatialFullConvolution_accGradParameters)(lua_State *L) { // Inputs THTensor *input = (THTensor *)luaT_checkudata(L, 2, torch_Tensor); THTensor *gradOutput = (THTensor *)luaT_checkudata(L, 3, torch_Tensor); // Params int dW = luaT_getfieldcheckint(L, 1, "dW"); int dH = luaT_getfieldcheckint(L, 1, "dH"); int kW = luaT_getfieldcheckint(L, 1, "kW"); int kH = luaT_getfieldcheckint(L, 1, "kH"); int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); int padW = luaT_getfieldcheckint(L, 1, "padW"); int padH = luaT_getfieldcheckint(L, 1, "padH"); int adjW = luaT_getfieldcheckint(L, 1, "adjW"); int adjH = luaT_getfieldcheckint(L, 1, "adjH"); float scale = luaL_optnumber(L, 4, 1); THTensor *gradWeight = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); THTensor *gradBias = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); THTensor *columns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); THTensor *ones = (THTensor*)luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); int batch = 1; if (input->nDimension == 3) { // Force batch batch = 0; THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); } long inputWidth = input->size[3]; long inputHeight = input->size[2]; long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; // Batch size + input planes long batchSize = input->size[0]; // Define a buffer of ones, for bias accumulation if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { // Resize plane and fill with ones... 
THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); } // Resize temporary columns THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth); // Helpers THTensor *input_n = THTensor_(new)(); THTensor *gradOutput_n = THTensor_(new)(); int elt; // For each elt in batch, do: for (elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per output: THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // Extract columns: nn_(im2col)( THTensor_(data)(gradOutput_n), nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, THTensor_(data)(columns) ); // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) long n = columns->size[0]; // nOutputPlane * kh * kw long m = input_n->size[0]; // nInputPlane long k = columns->size[1]; // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 't', 'n', n, m, k, scale, THTensor_(data)(columns), k, THTensor_(data)(input_n), k, 1, THTensor_(data)(gradWeight), n ); // Do Bias: // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) long m_ = nOutputPlane; long k_ = outputHeight * outputWidth; // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) THBlas_(gemv)( 't', k_, m_, scale, THTensor_(data)(gradOutput_n), k_, THTensor_(data)(ones), 1, 1, THTensor_(data)(gradBias), 1 ); } // Free THTensor_(free)(input_n); THTensor_(free)(gradOutput_n); // Resize if (batch == 0) { THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); } // Return nothing return 0; }
void THNN_(LookupTable_accGradParameters)( THNNState *state, THIndexTensor *input, THTensor *gradOutput, THTensor *gradWeight, THIntegerTensor *count, THTensor *sorted, THTensor *indices, bool scaleGradByFreq, int paddingValue, real scale) { long i; THInteger_t *count_data = NULL; if (scaleGradByFreq) { THIntegerTensor_(resize1d)(count, gradWeight->size[0]); count_data = THIntegerTensor_(data)(count); } if (!THTensor_(isContiguous)(gradWeight)) THError("gradWeight must be contiguous"); if (!THIndexTensor_(isContiguous)(input)) THError("input must be contiguous"); if (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2) THError("input must be a vector or matrix"); THIndex_t *input_data = THIndexTensor_(data)(input); long numel = THIndexTensor_(nElement)(input); long numw = THTensor_(size)(gradWeight, 0); // check that inputs are all within range for (i=0; i<numel; i++) if (input_data[i] < 1 || input_data[i] > numw) THError("input out of range"); gradOutput = THTensor_(newContiguous)(gradOutput); real *gw = THTensor_(data)(gradWeight); real *go = THTensor_(data)(gradOutput); long stride = THTensor_(stride)(gradWeight, 0); if (count_data) THNN_(LookupTable_resetCount)(count_data, input); #ifdef _OPENMP if (numel > 1000) { // The strategy is to parallelize over sections of the vocabulary, so that // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread // has to traverse the entire input, but the dominating factor is the axpy // BLAS call. #pragma omp parallel private(i) { int tid = omp_get_thread_num(); int nthreads = omp_get_num_threads(); long start = tid * (numw/nthreads + 1); long end = start + (numw/nthreads + 1); for (i=0; i<numel; i++) { if (input_data[i] != paddingValue) { long k = input_data[i] - 1; if (k >= start && k < end) { real scale_ = scale; if (count_data) scale_ /= count_data[k]; THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1); } } } } THTensor_(free)(gradOutput); return; } #endif for (i=0; i<numel; i++) { if (input_data[i] != paddingValue) { long k = input_data[i] - 1; real scale_ = scale; if (count_data) scale_ /= count_data[k]; THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1); } } THTensor_(free)(gradOutput); }
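/*
 * Sketch: what scaleGradByFreq does above -- each index's gradient
 * contribution is divided by the number of times that index occurs in the
 * input, so repeated indices do not receive proportionally larger updates.
 * Flat-array equivalent with 1-based indices and a padding index that is
 * skipped; names are illustrative only.
 */
static void lookup_table_acc_grad_ref(const long *input,       /* numel, 1-based indices */
                                      const double *gradOutput,/* numel x stride, row-major */
                                      double *gradWeight,      /* numw x stride, row-major */
                                      long *count,             /* numw scratch */
                                      long numel, long numw, long stride,
                                      double scale, long paddingValue,
                                      int scaleGradByFreq)
{
  long i, d;
  if (scaleGradByFreq) {
    for (i = 0; i < numw; i++) count[i] = 0;
    for (i = 0; i < numel; i++)
      if (input[i] != paddingValue)
        count[input[i] - 1]++;
  }
  for (i = 0; i < numel; i++) {
    if (input[i] == paddingValue) continue;
    long k = input[i] - 1;                 /* 1-based -> 0-based row */
    double s = scaleGradByFreq ? scale / count[k] : scale;
    for (d = 0; d < stride; d++)
      gradWeight[k * stride + d] += s * gradOutput[i * stride + d];
  }
}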
void THNN_(VolumetricDilatedConvolution_updateOutput)( THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *columns, THTensor *ones, int kT, int kW, int kH, int dT, int dW, int dH, int padT, int padW, int padH, int dilationT, int dilationW, int dilationH) { THNN_(VolumetricDilatedConvolution_shapeCheck)( input, NULL, weight, bias, kT, kH, kW, dT, dH, dW, padT, padH, padW, dilationT, dilationH, dilationW, 0); // Params: int nInputPlane = weight->size[1]; int nOutputPlane = weight->size[0]; input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); THArgCheck(THTensor_(isContiguous)(columns), 5, "columns needs to be contiguous"); if (bias) { bias = THTensor_(newContiguous)(bias); THArgCheck(THTensor_(isContiguous)(ones), 6, "ones needs to be contiguous"); } int is_batch = 1; if (input->_dim() == 4) { // Force batch is_batch = 0; THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); } int64_t inputDepth = input->size[2]; int64_t inputHeight = input->size[3]; int64_t inputWidth = input->size[4]; int64_t outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; int64_t outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; int64_t outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; // Batch size + input planes int64_t batchSize = input->size[0]; // Resize output THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); THTensor_(zero)(output); // Resize temporary columns THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. if (ones->_dim() != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... 
THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); THTensor_(fill)(ones, 1); } // Helpers THTensor *input_n = THTensor_(new)(); THTensor *output_n = THTensor_(new)(); // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per output: THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(output_n, output, 0, elt); // Do Bias first: // M,N,K are dims of matrix A and B int64_t m_ = nOutputPlane; int64_t n_ = outputDepth * outputHeight * outputWidth; int64_t k_ = 1; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) if (bias) { THBlas_(gemm)( 't', 'n', n_, m_, k_, 1, THTensor_(data)(ones), k_, THTensor_(data)(bias), k_, 0, THTensor_(data)(output_n), n_ ); } else { THTensor_(zero)(output_n); } // Extract columns: THNN_(vol2col)( THTensor_(data)(input_n), nInputPlane, inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, dilationT, dilationH, dilationW, THTensor_(data)(columns) ); // M,N,K are dims of matrix A and B int64_t m = nOutputPlane; int64_t n = columns->size[1]; int64_t k = nInputPlane*kT*kH*kW; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 'n', 'n', n, m, k, 1, THTensor_(data)(columns), n, THTensor_(data)(weight), k, 1, THTensor_(data)(output_n), n ); } // Free THTensor_(free)(input_n); THTensor_(free)(output_n); // Resize output if (is_batch == 0) { THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); } THTensor_(free)(input); THTensor_(free)(weight); if (bias) THTensor_(free)(bias); }
void THNN_(SpatialDilatedConvolution_updateGradInput)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *gradColumns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH) { THNN_(SpatialDilatedConvolution_shapeCheck) (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, dilationH, dilationW); // Params int nInputPlane = weight->size[1]; int nOutputPlane = weight->size[0]; input = THTensor_(newContiguous)(input); weight = THTensor_(newContiguous)(weight); gradOutput = THTensor_(newContiguous)(gradOutput); int batch = 1; if (input->nDimension == 3) { // Force batch batch = 0; THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); } long inputWidth = input->size[3]; long inputHeight = input->size[2]; long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes long batchSize = input->size[0]; // Resize output THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); // Resize temporary columns THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); THTensor_(zero)(gradColumns); // Helpers THTensor *gradInput_n = THTensor_(new)(); THTensor *gradOutput_n = THTensor_(new)(); // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per sample: THTensor_(select)(gradInput_n, gradInput, 0, elt); THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // M,N,K are dims of matrix A and B long m = nInputPlane*kW*kH; long n = gradColumns->size[1]; long k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 'n', 't', n, m, k, 1, THTensor_(data)(gradOutput_n), n, THTensor_(data)(weight), m, 0, THTensor_(data)(gradColumns), n ); // Unpack columns back into input: THNN_(col2im)( THTensor_(data)(gradColumns), nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, THTensor_(data)(gradInput_n) ); } // Free THTensor_(free)(gradInput_n); THTensor_(free)(gradOutput_n); // Resize output if (batch == 0) { THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); } THTensor_(free)(input); THTensor_(free)(gradOutput); THTensor_(free)(weight); }
void THNN_(SpatialDilatedConvolution_updateOutput)( THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *columns, THTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH) { THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input, "3D or 4D (batch mode) tensor expected for input, but got: %s"); THNN_ARGCHECK(weight->nDimension == 4, 4, weight, "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " "but got: %s"); THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias"); THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); // Params: int nInputPlane = weight->size[1]; int nOutputPlane = weight->size[0]; int batch = 1; if (input->nDimension == 3) { THArgCheck(input->size[0] == nInputPlane, 2, "input channels %d and nInputPlane %d dont match.", input->size[0], nInputPlane); // Force batch batch = 0; THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); } else { THArgCheck(input->size[1] == nInputPlane, 2, "input channels %d and nInputPlane %d dont match", input->size[1], nInputPlane); } long inputWidth = input->size[3]; long inputHeight = input->size[2]; long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; if (outputWidth < 1 || outputHeight < 1) THError("Given input size: (%dx%dx%d). " "Calculated output size: (%dx%dx%d). Output size is too small", nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth); // Batch size + input planes long batchSize = input->size[0]; // Resize output THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); THTensor_(zero)(output); // Resize temporary columns THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { // Resize plane and fill with ones... 
THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); } // Helpers THTensor *input_n = THTensor_(new)(); THTensor *output_n = THTensor_(new)(); // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per output: THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(output_n, output, 0, elt); // Do Bias first: // M,N,K are dims of matrix A and B long m_ = nOutputPlane; long n_ = outputHeight * outputWidth; long k_ = 1; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) if (bias) { THBlas_(gemm)( 't', 'n', n_, m_, k_, 1, THTensor_(data)(ones), k_, THTensor_(data)(bias), k_, 0, THTensor_(data)(output_n), n_ ); } else { THTensor_(zero)(output_n); } // Extract columns: THNN_(im2col)( THTensor_(data)(input_n), nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, THTensor_(data)(columns) ); // M,N,K are dims of matrix A and B long m = nOutputPlane; long n = columns->size[1]; long k = nInputPlane*kH*kW; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 'n', 'n', n, m, k, 1, THTensor_(data)(columns), n, THTensor_(data)(weight), k, 1, THTensor_(data)(output_n), n ); } // Free THTensor_(free)(input_n); THTensor_(free)(output_n); // Resize output if (batch == 0) { THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); } }
void THNN_(SpatialDilatedConvolution_accGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *columns, THTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, real scale) { THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input, "3D or 4D (batch mode) tensor expected for input, but got: %s"); THNN_ARGCHECK(gradWeight->nDimension == 4, 4, gradWeight, "4D gradWeight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " "but got: %s"); THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias"); THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); // Params int nInputPlane = gradWeight->size[1]; int nOutputPlane = gradWeight->size[0]; int batch = 1; if (input->nDimension == 3) { // Force batch batch = 0; THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); } long inputWidth = input->size[3]; long inputHeight = input->size[2]; long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes long batchSize = input->size[0]; // Define a buffer of ones, for bias accumulation if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); } // Resize temporary columns THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); // Helpers THTensor *input_n = THTensor_(new)(); THTensor *gradOutput_n = THTensor_(new)(); // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per output: THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // Extract columns: THNN_(im2col)( THTensor_(data)(input_n), nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, THTensor_(data)(columns) ); // M,N,K are dims of matrix A and B long m = nOutputPlane; long n = nInputPlane*kW*kH; long k = columns->size[1]; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 't', 'n', n, m, k, scale, THTensor_(data)(columns), k, THTensor_(data)(gradOutput_n), k, 1, THTensor_(data)(gradWeight), n ); // Do Bias: // M,N,K are dims of matrix A and B long m_ = nOutputPlane; long k_ = outputHeight * outputWidth; // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) if (gradBias) { THBlas_(gemv)( 't', k_, m_, scale, THTensor_(data)(gradOutput_n), k_, THTensor_(data)(ones), 1, 1, THTensor_(data)(gradBias), 1 ); } } // Free THTensor_(free)(input_n); THTensor_(free)(gradOutput_n); // Resize if (batch == 0) { THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); } }
void THNN_(SpatialDilatedConvolution_updateGradInput)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *gradColumns, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH) { THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input, "3D or 4D (batch mode) tensor expected for input, but got: %s"); THNN_ARGCHECK(weight->nDimension == 4, 4, weight, "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " "but got: %s"); THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); // Params int nInputPlane = weight->size[1]; int nOutputPlane = weight->size[0]; int batch = 1; if (input->nDimension == 3) { // Force batch batch = 0; THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); } long inputWidth = input->size[3]; long inputHeight = input->size[2]; long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes long batchSize = input->size[0]; // Resize output THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); // Resize temporary columns THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); THTensor_(zero)(gradColumns); // Helpers THTensor *gradInput_n = THTensor_(new)(); THTensor *gradOutput_n = THTensor_(new)(); // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per sample: THTensor_(select)(gradInput_n, gradInput, 0, elt); THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // M,N,K are dims of matrix A and B long m = nInputPlane*kW*kH; long n = gradColumns->size[1]; long k = nOutputPlane; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 'n', 't', n, m, k, 1, THTensor_(data)(gradOutput_n), n, THTensor_(data)(weight), m, 0, THTensor_(data)(gradColumns), n ); // Unpack columns back into input: THNN_(col2im)( THTensor_(data)(gradColumns), nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, THTensor_(data)(gradInput_n) ); } // Free THTensor_(free)(gradInput_n); THTensor_(free)(gradOutput_n); // Resize output if (batch == 0) { THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); } }
static int nn_(SpatialFullConvolution_updateGradInput)(lua_State *L) { // Inputs THTensor *input = (THTensor *)luaT_checkudata(L, 2, torch_Tensor); THTensor *gradOutput = (THTensor *)luaT_checkudata(L, 3, torch_Tensor); // Params int dW = luaT_getfieldcheckint(L, 1, "dW"); int dH = luaT_getfieldcheckint(L, 1, "dH"); int kW = luaT_getfieldcheckint(L, 1, "kW"); int kH = luaT_getfieldcheckint(L, 1, "kH"); int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); int padW = luaT_getfieldcheckint(L, 1, "padW"); int padH = luaT_getfieldcheckint(L, 1, "padH"); int adjW = luaT_getfieldcheckint(L, 1, "adjW"); int adjH = luaT_getfieldcheckint(L, 1, "adjH"); THTensor *weight = (THTensor *)luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); THTensor *gradColumns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); THTensor *gradInput = (THTensor *)luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); int batch = 1; if (input->nDimension == 3) { // Force batch batch = 0; THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); } long inputWidth = input->size[3]; long inputHeight = input->size[2]; long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; // Batch size + input planes long batchSize = input->size[0]; // Resize output THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth); // Resize temporary columns THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth); // Helpers THTensor *gradInput_n = THTensor_(new)(); THTensor *gradOutput_n = THTensor_(new)(); int elt; // For each elt in batch, do: for (elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per sample: THTensor_(select)(gradInput_n, gradInput, 0, elt); THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // Extract columns: nn_(im2col)( THTensor_(data)(gradOutput_n), nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, THTensor_(data)(gradColumns) ); // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) long m = weight->size[0]; long n = gradColumns->size[1]; long k = weight->size[1] * weight->size[2] * weight->size[3]; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 'n', 'n', n, m, k, 1, THTensor_(data)(gradColumns), n, THTensor_(data)(weight), k, 0, THTensor_(data)(gradInput_n), n ); } // Free THTensor_(free)(gradInput_n); THTensor_(free)(gradOutput_n); // Resize output if (batch == 0) { THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); } // Return gradInput return 1; }
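/* The "full" (transposed) convolution above sizes its output as
 * (in - 1) * stride - 2*pad + k + adj, which is what is needed to invert the
 * ordinary convolution size (in + 2*pad - k) / stride + 1; adj only recovers the
 * rounding lost by the integer division.  A minimal standalone check of that
 * round trip (illustrative names, not the nn API): */
#include <stdio.h>

static long conv_out(long in, int k, int stride, int pad)
{ return (in + 2L * pad - k) / stride + 1; }

static long full_conv_out(long in, int k, int stride, int pad, int adj)
{ return (in - 1L) * stride - 2L * pad + k + adj; }

int main(void)
{
  long in = 13; int k = 4, stride = 3, pad = 1;
  long down = conv_out(in, k, stride, pad);                       /* 4 */
  /* pick adj so the transposed convolution restores the original size: */
  int adj = (int)(in - full_conv_out(down, k, stride, pad, 0));
  printf("down=%ld up=%ld adj=%d\n",
         down, full_conv_out(down, k, stride, pad, adj), adj);    /* down=4 up=13 adj=2 */
  return 0;
}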
void THNN_(VolumetricFullConvolution_accGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, THTensor *fgradInput, int dT, int dW, int dH, // stride int pT, int pW, int pH, // padding int aT, int aW, int aH, // extra output adjustment real scale) { // number of input & output planes and kernel size is indirectly defined by the gradWeight tensor THNN_(VolumetricFullConvolution_shapeCheck)( input, gradOutput, gradWeight, gradBias, dT, dW, dH, pT, pW, pH, aT, aW, aH); int nInputPlane = (int)gradWeight->size[0]; int nOutputPlane = (int)gradWeight->size[1]; int kT = (int)gradWeight->size[2]; int kH = (int)gradWeight->size[3]; int kW = (int)gradWeight->size[4]; THTensor *columns = finput; THTensor *ones = fgradInput; input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); int batch = 1; if (input->nDimension == 4) { // Force batch batch = 0; THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); } const long inputWidth = input->size[4]; const long inputHeight = input->size[3]; const long inputDepth = input->size[2]; const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW; const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH; const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT; // Batch size + input planes const long batchSize = input->size[0]; // Define a buffer of ones, for bias accumulation if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); THTensor_(fill)(ones, 1); } // Resize temporary columns THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); // Helpers THTensor *input_n = THTensor_(new)(); THTensor *gradOutput_n = THTensor_(new)(); int elt; // For each elt in batch, do: for (elt = 0; elt < batchSize; ++elt) { // Matrix mulitply per output: THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // Extract columns: THNN_(vol2col)( THTensor_(data)(gradOutput_n), nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, pT, pH, pW, dT, dH, dW, 1, 1, 1, THTensor_(data)(columns) ); // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) const long n = columns->size[0]; // nOutputPlane * kt * kh * kw const long m = input_n->size[0]; // nInputPlane const long k = columns->size[1]; // inputHeight * inputWidth // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 't', 'n', n, m, k, scale, THTensor_(data)(columns), k, THTensor_(data)(input_n), k, 1, THTensor_(data)(gradWeight), n ); // Do Bias: // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) const long m_ = nOutputPlane; const long k_ = outputDepth * outputHeight * outputWidth; // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) THBlas_(gemv)( 't', k_, m_, scale, THTensor_(data)(gradOutput_n), k_, THTensor_(data)(ones), 1, 1, THTensor_(data)(gradBias), 1 ); } // Free THTensor_(free)(input_n); THTensor_(free)(gradOutput_n); // Resize if (batch == 0) { THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, 
outputWidth); THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); } THTensor_(free)(input); THTensor_(free)(gradOutput); }
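/* The "this is a bit confusing" notes above refer to handing row-major TH buffers to
 * a Fortran-convention (column-major) gemm.  The trick: a row-major matrix read as
 * column-major is its transpose, so row-major  C(mxn) += alpha * A(mxk) * B(nxk)^T
 * is obtained by asking the column-major routine for C^T = B * A^T, i.e.
 * gemm('t','n', n, m, k, alpha, B, k, A, k, beta, C, n) -- the call pattern used for
 * gradWeight above (with A = input_n and B = columns).  A self-contained check with
 * a naive column-major gemm (only the 't','n' case): */
#include <stdio.h>

static void cm_gemm_tn(long m, long n, long k, double alpha,
                       const double *a, long lda,
                       const double *b, long ldb,
                       double beta, double *c, long ldc)
{
  /* column-major: C(i,j) = beta*C(i,j) + alpha * sum_l A(l,i) * B(l,j) */
  for (long j = 0; j < n; j++)
    for (long i = 0; i < m; i++) {
      double sum = 0;
      for (long l = 0; l < k; l++)
        sum += a[l + i * lda] * b[l + j * ldb];
      c[i + j * ldc] = beta * c[i + j * ldc] + alpha * sum;
    }
}

int main(void)
{
  long m = 2, n = 2, k = 3;
  double A[6] = { 1, 2, 3, 4, 5, 6 };   /* row-major m x k */
  double B[6] = { 1, 0, 1, 0, 1, 0 };   /* row-major n x k */
  double C[4] = { 0 };                  /* row-major m x n */
  /* column-major call that yields row-major C = A * B^T: */
  cm_gemm_tn(n, m, k, 1.0, B, k, A, k, 0.0, C, n);
  printf("%g %g / %g %g\n", C[0], C[1], C[2], C[3]);  /* 4 2 / 10 5 */
  return 0;
}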
static int nn_(SpatialFullConvolution_updateOutput)(lua_State *L) { // Input THTensor *input = (THTensor*)luaT_checkudata(L, 2, torch_Tensor); // Params: int dW = luaT_getfieldcheckint(L, 1, "dW"); int dH = luaT_getfieldcheckint(L, 1, "dH"); int kW = luaT_getfieldcheckint(L, 1, "kW"); int kH = luaT_getfieldcheckint(L, 1, "kH"); int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); int padW = luaT_getfieldcheckint(L, 1, "padW"); int padH = luaT_getfieldcheckint(L, 1, "padH"); int adjW = luaT_getfieldcheckint(L, 1, "adjW"); int adjH = luaT_getfieldcheckint(L, 1, "adjH"); THTensor *weight = (THTensor*)luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); THTensor *bias = (THTensor*)luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); THTensor *columns = (THTensor*)luaT_getfieldcheckudata(L, 1, "finput", torch_Tensor); THTensor *ones = (THTensor*)luaT_getfieldcheckudata(L, 1, "fgradInput", torch_Tensor); THTensor *output = (THTensor*)luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); int batch = 1; if (input->nDimension == 3) { luaL_argcheck(L, input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); // Force batch batch = 0; THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); } else { luaL_argcheck(L, input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); } long inputWidth = input->size[3]; long inputHeight = input->size[2]; long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; // Batch size + input planes long batchSize = input->size[0]; // Resize output THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); // Resize temporary columns THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth); // Define a buffer of ones, for bias accumulation // Note: this buffer can be shared with other modules, it only ever gets increased, // and always contains ones. if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { // Resize plane and fill with ones... 
THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); } // Helpers THTensor *input_n = THTensor_(new)(); THTensor *output_n = THTensor_(new)(); int elt; // For each elt in batch, do: for (elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per output: THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(output_n, output, 0, elt); // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) long m = weight->size[1] * weight->size[2] * weight->size[3]; long n = columns->size[1]; long k = weight->size[0]; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 'n', 't', n, m, k, 1, THTensor_(data)(input_n), n, THTensor_(data)(weight), m, 0, THTensor_(data)(columns), n ); // Unpack columns back into input: nn_(col2im)( THTensor_(data)(columns), nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, THTensor_(data)(output_n) ); // Do Bias after: // M,N,K are dims of matrix A and B // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) long m_ = nOutputPlane; long n_ = outputHeight * outputWidth; long k_ = 1; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 't', 'n', n_, m_, k_, 1, THTensor_(data)(ones), k_, THTensor_(data)(bias), k_, 1, THTensor_(data)(output_n), n_ ); } // Free THTensor_(free)(input_n); THTensor_(free)(output_n); // Resize output if (batch == 0) { THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); } // return output return 1; }
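/* The final GEMM above adds the bias: with k_ = 1 it reduces to the outer product
 * output(plane, pixel) += bias[plane] * ones[pixel], i.e. each bias value is
 * broadcast over all outputHeight*outputWidth positions of its plane, which is why
 * the shared "ones" buffer only ever needs to grow and stay filled with 1.
 * A minimal standalone sketch of that broadcast (illustrative names only): */
#include <stdio.h>

static void add_bias(double *out, const double *bias, long planes, long pixels)
{
  for (long p = 0; p < planes; p++)
    for (long q = 0; q < pixels; q++)
      out[p * pixels + q] += bias[p] * 1.0;   /* multiply by the "ones" entry */
}

int main(void)
{
  double out[2 * 3] = { 0, 0, 0, 0, 0, 0 };   /* 2 planes, 3 pixels each */
  double bias[2] = { 0.5, -1.0 };
  add_bias(out, bias, 2, 3);
  for (int i = 0; i < 6; i++) printf("%g ", out[i]);  /* 0.5 0.5 0.5 -1 -1 -1 */
  printf("\n");
  return 0;
}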
void THNN_(SparseLinear_accGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, THTensor *bias, accreal weightDecay_, accreal scale_) { real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); int64_t h, i, col, hp0, hp1; int64_t outDim = THTensor_(size)(weight, 0); int64_t inDim = THTensor_(size)(weight, 1); THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3"); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); THArgCheck(THTensor_(isContiguous)(gradOutput), 1, "gradOutput must be contiguous"); int64_t nnz = THTensor_(size)(input, 0); THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1); THLongTensor_zero(csc); weight = THTensor_(newContiguous)(weight); #pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) for (i = 0; i < nnz; i++) { hp0 = (int64_t)(THNN_(get2d)(input, i, 1)) - 1; hp1 = (i+1 == nnz) ? inDim : (int64_t)(THNN_(get2d)(input, i+1, 1)) - 1; if (hp0 != hp1) for (h = hp0; h < hp1; h++) { THLongTensor_set1d(csc, h+1, i+1); } } // gradWeight += gradOutput * input #pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000) for (col = 0; col < inDim; col++) { int64_t i_start = THLongTensor_get1d(csc, col); int64_t i_end = THLongTensor_get1d(csc, col+1); for (i = i_start; i < i_end; i++) { real val = scale * THNN_(get2d)(input, i, 2); h = (int64_t)(THNN_(get2d)(input, i, 0)) - 1; int64_t offset = (int64_t)(THNN_(get2d)(input, i, 1)) - 1; if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, ROW_PTR2(gradOutput, h), gradOutput->stride[1], COL_PTR2(gradWeight, offset), gradWeight->stride[0]); } else { THError( "index out of bound. accGradParameters: %d not between 1 and %d", offset + 1, inDim); } } } // gradBias += gradOutput THTensor* buf = THTensor_(new)(); THTensor_(sum)(buf, gradOutput, 0, 1); THTensor_(cadd)(gradBias, gradBias, scale, buf); THTensor_(free)(buf); THLongTensor_free(csc); if (weightDecay != 0) { THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); } THTensor_(free)(weight); }
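/* Each row of the nnz x 3 COO input above is (exampleIndex, featureIndex, value),
 * 1-indexed, and the loop performs the dense update
 *   gradWeight[:, featureIndex] += scale * value * gradOutput[exampleIndex, :]
 * one nonzero at a time (the CSC pass only groups nonzeros by column so the update
 * can be parallelised per column).  A minimal dense reference of that accumulation
 * -- illustrative only; the real buffers are TH tensors, not plain arrays: */
#include <stdio.h>

typedef struct { int example, feature; double value; } Nonzero;  /* 0-indexed here */

static void acc_grad_weight(double *gradWeight,           /* outDim x inDim, row-major */
                            const double *gradOutput,     /* batch x outDim, row-major */
                            const Nonzero *nnz, int n,
                            long outDim, long inDim, double scale)
{
  for (int i = 0; i < n; i++) {
    const Nonzero *e = &nnz[i];
    for (long r = 0; r < outDim; r++)
      gradWeight[r * inDim + e->feature] +=
          scale * e->value * gradOutput[e->example * outDim + r];
  }
}

int main(void)
{
  double gradWeight[2 * 3] = { 0 };                /* outDim = 2, inDim = 3 */
  double gradOutput[1 * 2] = { 1.0, -2.0 };        /* a single example */
  Nonzero nnz[1] = { { 0, 2, 0.5 } };              /* example 0, feature 2, value 0.5 */
  acc_grad_weight(gradWeight, gradOutput, nnz, 1, 2, 3, 1.0);
  printf("%g %g\n", gradWeight[0 * 3 + 2], gradWeight[1 * 3 + 2]);  /* 0.5 -1 */
  return 0;
}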
void THNN_(SparseLinear_legacyAccGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, THTensor *bias, accreal weightDecay_, accreal scale_) { real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); int64_t h, i; int64_t outDim = THTensor_(size)(weight, 0); int64_t inDim = THTensor_(size)(weight, 1); THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2"); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); THArgCheck(THTensor_(isContiguous)(gradOutput), 1, "gradOutput must be contiguous"); int64_t batchSize = THTensor_(size)(input, 0); int64_t nnz = THTensor_(size)(input, 1); THTensor_(resize2d)(gradOutput, batchSize, outDim); // gradWeight += gradOutput * input #pragma omp parallel for private(h, i) schedule(static) if (\ batchSize * nnz * outDim > 10000) for (i = 0; i < nnz; i++) { for (h = 0; h < batchSize; h++) { real val = scale * THNN_(get3d)(input, h, i, 1); if (val == 0) { continue; } int64_t offset = (int64_t)(THNN_(get3d)(input, h, i, 0)) - 1; if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, ROW_PTR2(gradOutput, h), gradOutput->stride[1], COL_PTR2(gradWeight, offset), gradWeight->stride[0]); } else { THError( "index out of bound. accGradParameters: %d not between 1 and %d", offset + 1, inDim); } } } // gradBias += gradOutput THTensor* gradOutput_row = THTensor_(new)(); for (h = 0; h < batchSize; h++) { THTensor_(select)(gradOutput_row, gradOutput, 0, h); THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row); } THTensor_(free)(gradOutput_row); if (weightDecay != 0) { THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); } }
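/* In the legacy layout above, input is batchSize x nnz x 2 and each pair is
 * (featureIndex, value), 1-indexed; there is no per-entry example index because the
 * first dimension already selects the example, and entries with value 0 are skipped
 * (the code treats them as padding).  A minimal sketch that densifies one such
 * sparse sample (illustrative names only, not the THNN API): */
#include <stdio.h>
#include <string.h>

static void densify(double (*pairs)[2], int nnz, double *dense, long inDim)
{
  memset(dense, 0, sizeof(double) * inDim);
  for (int i = 0; i < nnz; i++) {
    long idx = (long)pairs[i][0] - 1;      /* 1-indexed feature -> 0-indexed slot */
    if (pairs[i][1] == 0) continue;        /* zero-valued entries are ignored */
    if (idx >= 0 && idx < inDim)
      dense[idx] = pairs[i][1];
  }
}

int main(void)
{
  double pairs[3][2] = { { 2, 0.5 }, { 5, -1.0 }, { 1, 0.0 } };  /* last entry is padding */
  double dense[6];
  densify(pairs, 3, dense, 6);
  for (int i = 0; i < 6; i++) printf("%g ", dense[i]);  /* 0 0.5 0 0 -1 0 */
  printf("\n");
  return 0;
}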
void THNN_(SpatialDilatedConvolution_accGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *columns, THTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, accreal scale_) { real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); THNN_(SpatialDilatedConvolution_shapeCheck) (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, dilationH, dilationW); // Params int nInputPlane = gradWeight->size[1]; int nOutputPlane = gradWeight->size[0]; input = THTensor_(newContiguous)(input); gradOutput = THTensor_(newContiguous)(gradOutput); THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); if (gradBias) THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); int batch = 1; if (input->nDimension == 3) { // Force batch batch = 0; THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); } long inputWidth = input->size[3]; long inputHeight = input->size[2]; long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; // Batch size + input planes long batchSize = input->size[0]; // Define a buffer of ones, for bias accumulation if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { // Resize plane and fill with ones... THTensor_(resize2d)(ones, outputHeight, outputWidth); THTensor_(fill)(ones, 1); } // Resize temporary columns THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); // Helpers THTensor *input_n = THTensor_(new)(); THTensor *gradOutput_n = THTensor_(new)(); // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { // Matrix mulitply per output: THTensor_(select)(input_n, input, 0, elt); THTensor_(select)(gradOutput_n, gradOutput, 0, elt); // Extract columns: THNN_(im2col)( THTensor_(data)(input_n), nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW, THTensor_(data)(columns) ); // M,N,K are dims of matrix A and B long m = nOutputPlane; long n = nInputPlane*kW*kH; long k = columns->size[1]; // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) THBlas_(gemm)( 't', 'n', n, m, k, scale, THTensor_(data)(columns), k, THTensor_(data)(gradOutput_n), k, 1, THTensor_(data)(gradWeight), n ); // Do Bias: // M,N,K are dims of matrix A and B long m_ = nOutputPlane; long k_ = outputHeight * outputWidth; // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) if (gradBias) { THBlas_(gemv)( 't', k_, m_, scale, THTensor_(data)(gradOutput_n), k_, THTensor_(data)(ones), 1, 1, THTensor_(data)(gradBias), 1 ); } } // Free THTensor_(free)(input_n); THTensor_(free)(gradOutput_n); // Resize if (batch == 0) { THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); } THTensor_(free)(input); THTensor_(free)(gradOutput); }
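/* The gemv with the all-ones vector above is just a per-plane reduction:
 * gradBias[j] += scale * (sum over all output pixels of gradOutput[j][pixel]),
 * since multiplying a matrix by a ones vector sums along one dimension.  That is
 * the only reason the shared "ones" buffer exists.  A minimal standalone sketch
 * (illustrative names, not the THNN API): */
#include <stdio.h>

static void acc_grad_bias(double *gradBias, const double *gradOutput,
                          long planes, long pixels, double scale)
{
  for (long j = 0; j < planes; j++) {
    double sum = 0;
    for (long q = 0; q < pixels; q++)
      sum += gradOutput[j * pixels + q];   /* dot product with the ones vector */
    gradBias[j] += scale * sum;
  }
}

int main(void)
{
  double gradOutput[2 * 4] = { 1, 1, 1, 1,   2, -1, 0, 3 };  /* 2 planes, 4 pixels */
  double gradBias[2] = { 0, 0 };
  acc_grad_bias(gradBias, gradOutput, 2, 4, 1.0);
  printf("%g %g\n", gradBias[0], gradBias[1]);  /* 4 4 */
  return 0;
}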