/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */ static void nn_(unfolded_acc)(THTensor *finput, THTensor *input, int kW, int kH, int dW, int dH, int padW, int padH, int nInputPlane, int inputWidth, int inputHeight, int outputWidth, int outputHeight) { int nip; real *input_data = THTensor_(data)(input); real *finput_data = THTensor_(data)(finput); #pragma omp parallel for private(nip) for(nip = 0; nip < nInputPlane; nip++) { int kw, kh, y, x, ix, iy; for(kh = 0; kh < kH; kh++) { for(kw = 0; kw < kW; kw++) { real *src = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth); real *dst = input_data + nip*(inputHeight*inputWidth); if (padW > 0 || padH > 0) { int lpad,rpad; for(y = 0; y < outputHeight; y++) { iy = y*dH - padH + kh; if (iy < 0 || iy >= inputHeight) { } else { if (dW==1){ ix = 0 - padW + kw; lpad = fmaxf(0,padW-kw); rpad = fmaxf(0,padW-(kW-kw-1)); THVector_(add)(dst+iy*inputWidth+ix+lpad, src+y*outputWidth+lpad, 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */ } else{ for (x=0; x<outputWidth; x++){ ix = x*dW - padW + kw; if (ix < 0 || ix >= inputWidth){ }else THVector_(add)(dst+iy*inputWidth+ix, src+y*outputWidth+x, 1, 1); } } } } } else { for(y = 0; y < outputHeight; y++) { iy = y*dH + kh; ix = 0 + kw; if (dW == 1 ) THVector_(add)(dst+iy*inputWidth+ix, src+y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */ else{ for(x = 0; x < outputWidth; x++) THVector_(add)(dst+iy*inputWidth+ix+x*dW, src+y*outputWidth+x, 1, 1); } } } } } } }
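/* How the padded, dW == 1 fast path above works: instead of bounds-checking ix for every
 * output column, the row is cropped once. lpad = max(0, padW - kw) skips the columns that
 * would land left of the input, rpad = max(0, padW - (kW - kw - 1)) skips the ones that would
 * land right of it, and a single THVector_(add) then covers the remaining
 * outputWidth - lpad - rpad contiguous elements. */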
void THVector_(normal_fill_DEFAULT)(real *data, int64_t size, THGenerator *generator, const real mean, const real stddev) { THAssert(size >= 16 && "Size must be >= 16 for normal fill"); for (int64_t i = 0; i < size; ++i) { #ifdef TH_REAL_IS_FLOAT data[i] = THRandom_uniformFloat(generator, 0, 1); #else data[i] = THRandom_uniform(generator, 0, 1); #endif } for (int64_t i = 0; i < size - 15; i += 16) { THVector_(interleaved_normal_fill_16)(data + i, mean, stddev); } if (size % 16 != 0) { // Recompute the last 16 values. data = data + size - 16; for (int64_t i = 0; i < 16; ++i) { #ifdef TH_REAL_IS_FLOAT data[i] = THRandom_uniformFloat(generator, 0, 1); #else data[i] = THRandom_uniform(generator, 0, 1); #endif } THVector_(interleaved_normal_fill_16)(data, mean, stddev); } }
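/* THVector_(interleaved_normal_fill_16) is referenced above but not shown here. A minimal
 * scalar sketch of what such a helper can look like, assuming it applies the Box-Muller
 * transform in place to the 16 uniform samples (pairing data[j] with data[j + 8]); the
 * shipped kernel may be vectorized differently. Requires <math.h>. */
static void THVector_(interleaved_normal_fill_16_sketch)(real *data, const real mean, const real stddev)
{
  for (int j = 0; j < 8; ++j) {
    const real u1 = 1 - data[j];        /* map [0, 1) to (0, 1] so log() stays finite */
    const real u2 = data[j + 8];
    const real radius = sqrt(-2 * log(u1));
    const real theta = 2.0f * M_PI * u2;
    data[j]     = radius * cos(theta) * stddev + mean;
    data[j + 8] = radius * sin(theta) * stddev + mean;
  }
}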
void THVector_(normal_fill)(real *data, const int64_t size, struct THGenerator *generator, const real mean, const real stddev) { THVector_(normal_fill_DISPATCHPTR)(data, size, generator, mean, stddev); }
/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */ static void nn_(unfolded_acc)(THTensor *finput, THTensor *input, int kW, int kH, int nInputPlane, int inputWidth, int inputHeight, int outputWidth, int outputHeight) { int nip; real *input_data = THTensor_(data)(input); real *finput_data = THTensor_(data)(finput); #pragma omp parallel for private(nip) for(nip = 0; nip < nInputPlane; nip++) { int kw, kh, y; for(kh = 0; kh < kH; kh++) { for(kw = 0; kw < kW; kw++) { real *src = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth); real *dst = input_data + nip*(inputHeight*inputWidth) + kh*inputWidth + kw; for(y = 0; y < outputHeight; y++) THVector_(add)(dst+y*inputWidth, src+y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */ } } } }
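/* Equivalent scalar form of the accumulation above (a hypothetical reference helper, kept
 * only for clarity): every finput row (nip, kh, kw) is added back into the input plane at a
 * (kh, kw) offset, which is the dW == dH == 1, zero-padding case of col2im. */
static void nn_(unfolded_acc_reference)(const real *finput_data, real *input_data,
                                        int kW, int kH, int nInputPlane,
                                        int inputWidth, int inputHeight,
                                        int outputWidth, int outputHeight)
{
  for (int nip = 0; nip < nInputPlane; nip++)
    for (int kh = 0; kh < kH; kh++)
      for (int kw = 0; kw < kW; kw++)
        for (int y = 0; y < outputHeight; y++)
          for (int x = 0; x < outputWidth; x++)
            input_data[(size_t)nip*inputHeight*inputWidth + (size_t)(y + kh)*inputWidth + (x + kw)] +=
              finput_data[(((size_t)nip*kH + kh)*kW + kw)*outputHeight*outputWidth + (size_t)y*outputWidth + x];
}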
static void THNN_(VolumetricConvolutionMM_updateOutput_frame)( THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, int kT, int kW, int kH, int dT, int dW, int dH, int pT, int pW, int pH, int64_t nInputPlane, int64_t inputDepth, int64_t inputWidth, int64_t inputHeight, int64_t nOutputPlane, int64_t outputDepth, int64_t outputWidth, int64_t outputHeight) { int64_t i; THTensor *output2d; THNN_(unfolded_copy_vol)( finput, input, kT, kW, kH, dT, dW, dH, pT, pW, pH, nInputPlane, inputDepth, inputWidth, inputHeight, outputDepth, outputWidth, outputHeight ); output2d = THTensor_(newWithStorage2d)( output->storage, output->storageOffset, nOutputPlane, -1, outputDepth*outputHeight*outputWidth, -1 ); if (bias) { for (i = 0; i < nOutputPlane; i++) { THVector_(fill)( output->storage->data+output->storageOffset+output->stride[0]*i, THTensor_(get1d)(bias, i), outputDepth*outputHeight*outputWidth ); } } else { THTensor_(zero)(output); } THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); THTensor_(free)(output2d); }
static void THNN_(unfolded_acc_row)( THTensor *finput, THTensor *input, int kW, int dW, int padW, int64_t inputFrameSize, int64_t nInputFrame, int64_t nOutputFrame) { int64_t c; real *input_data = THTensor_(data)(input); real *finput_data = THTensor_(data)(finput); // #pragma omp parallel for private(c) for (c = 0; c < inputFrameSize; c++) { int64_t kw, x; int64_t ix = 0; for (kw = 0; kw < kW; kw++) { real *src = finput_data + c * (kW * nOutputFrame) + kw * (nOutputFrame); real *dst = input_data + c * (nInputFrame); ix = (size_t)(kw); if (dW == 1) { real *dst_slice = dst + (size_t)(ix); THVector_(cadd)(dst_slice, dst_slice, src, 1, nOutputFrame); } else { for (x = 0; x < nOutputFrame; x++) { real *dst_slice = dst + (size_t)(ix + x * dW); THVector_(cadd)(dst_slice, dst_slice, src + (size_t)(x), 1, 1); } } } } }
void THTensor_(copy)(THTensor *tensor, THTensor *src) { if (THTensor_(isContiguous)(tensor) && THTensor_(isContiguous)(src) && THTensor_(nElement)(tensor) == THTensor_(nElement)(src)) { real *sp = THTensor_(data)(src); real *rp = THTensor_(data)(tensor); ptrdiff_t sz = THTensor_(nElement)(tensor); #ifndef TH_REAL_IS_HALF THVector_(copy)(rp, sp, sz); #else memcpy(rp, sp, sz * sizeof(real)); #endif } else { TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = *src_data;) } }
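/* Usage sketch for the float instantiation (hypothetical demo, not part of the library):
 * only the element counts have to match, not the shapes, so a 2x3 tensor can be copied
 * into a flat 6-element tensor and still take the vectorized THVector_(copy) path. */
static void THTensor_copy_demo(void)
{
  THFloatTensor *src = THFloatTensor_newWithSize2d(2, 3);
  THFloatTensor *dst = THFloatTensor_newWithSize1d(6);
  THFloatTensor_fill(src, 1.0f);
  THFloatTensor_copy(dst, src);   /* contiguous + same nElement -> fast contiguous branch */
  THFloatTensor_free(src);
  THFloatTensor_free(dst);
}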
void THNN_(SparseLinear_legacyZeroGradParameters)( THNNState *state, THTensor *gradWeight, THTensor *gradBias, THTensor *lastInput) { int64_t h, i, j; int64_t outDim = gradWeight->size[0]; int64_t inDim = gradWeight->size[1]; THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); THArgCheck(THNN_(checkLegacyInput)(lastInput), 4, "input size must be batchsize x nnz x 2"); THTensor_(zero)(gradBias); int64_t batchSize = THTensor_(size)(lastInput, 0); int64_t nnz = THTensor_(size)(lastInput, 1); #pragma omp parallel for private(h, i, j) schedule(static) if (batchSize > 1 && batchSize * nnz * outDim > 10000) for (h = 0; h < batchSize; h++) { for (i = 0; i < nnz; i++) { if (THNN_(get3d)(lastInput, h, i, 1) == 0) { continue; } int64_t offset = (int64_t)(THNN_(get3d)(lastInput, h, i, 0)) - 1; if (offset >= 0 && offset < inDim) { real* pGradWeight = COL_PTR2(gradWeight, offset); if (gradWeight->stride[0] == 1) { THVector_(fill)(pGradWeight, 0, outDim); } else { int64_t stride = gradWeight->stride[0]; for (j = 0; j < outDim; ++j) { pGradWeight[j * stride] = 0; } } } else { THError( "index out of bounds. zeroGradParameters: %lld not between 1 and %lld", (long long)(offset + 1), (long long)inDim); } } } }
static void THNN_(SpatialConvolutionMM_updateOutput_frame)( THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, int kW, int kH, int dW, int dH, int padW, int padH, long nInputPlane, long inputWidth, long inputHeight, long nOutputPlane, long outputWidth, long outputHeight) { long i; THTensor *output2d; THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight); output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset, nOutputPlane, -1, outputHeight*outputWidth, -1); if (bias) { for(i = 0; i < nOutputPlane; i++) THVector_(fill) (output->storage->data + output->storageOffset + output->stride[0] * i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); } else { THTensor_(zero)(output); } THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); THTensor_(free)(output2d); }
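/* Shapes behind the addmm above: weight is nOutputPlane x (nInputPlane*kH*kW), the unfolded
 * finput is (nInputPlane*kH*kW) x (outputHeight*outputWidth), and output2d views the output
 * block as nOutputPlane x (outputHeight*outputWidth), so the whole convolution for one frame
 * reduces to output2d += weight * finput on top of the broadcast bias. */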
static void THNN_(TemporalRowConvolution_updateOutput_frame)( THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, int kW, int dW, int padW, int64_t inputFrameSize, int64_t nInputFrame, int64_t nOutputFrame) { int64_t i; THTensor *output3d = THTensor_(newWithStorage3d)( output->storage, output->storageOffset, inputFrameSize, -1, 1, -1, nOutputFrame, -1); THNN_(unfolded_copy_row)(finput, input, kW, dW, padW, inputFrameSize, nInputFrame, nOutputFrame); THTensor_(zero)(output); if (bias != NULL) { for (i = 0; i < inputFrameSize; i++) THVector_(fill) (THStorage_(data)(output->storage) + output->storageOffset + output->stride[0] * i, THTensor_(get1d)(bias, i), nOutputFrame); } THTensor_(baddbmm)(output3d, 1, output3d, 1, weight, finput); THTensor_(free)(output3d); }
void THVector_(divs)(real *y, const real *x, const real c, const ptrdiff_t n) { THVector_(divs_DISPATCHPTR)(y, x, c, n); }
void THVector_(cdiv)(real *z, const real *x, const real *y, const ptrdiff_t n) { THVector_(cdiv_DISPATCHPTR)(z, x, y, n); }
// Dispatch stubs that just call the pointers TH_API void THVector_(adds)(real *r_, const real *t, const real value, const ptrdiff_t n) { THVector_(adds_DISPATCHPTR)(r_, t, value, n); }
#ifndef TH_GENERIC_FILE #define TH_GENERIC_FILE "generic/THVectorDispatch.cpp" #else /* For now there are only SIMD implementations for FLOAT and DOUBLE. * Hopefully in the future this can be made totally generic (e.g., there are SIMD implementations * for a lot of functions). */ /* Each function with multiple implementations has: * 1. A DISPATCHPTR which will be initialized to point to the best available implementation for the host * 2. A DISPATCHTABLE which holds pointers to each implementation of a function, and a value indicating * which SIMD extension a given implementation uses * 3. A dispatch stub, which is what is actually called by clients, that simply wraps the dispatch pointer. */ static void (*THVector_(fill_DISPATCHPTR))(real *, const real, const ptrdiff_t) = &THVector_(fill_DEFAULT); static FunctionDescription THVector_(fill_DISPATCHTABLE)[] = { #if defined(__NEON__) #if defined(TH_REAL_IS_FLOAT) FUNCTION_IMPL(THVector_(fill_NEON), SIMDExtension_NEON), #endif #endif #if defined(__PPC64__) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) FUNCTION_IMPL(THVector_(fill_VSX), SIMDExtension_VSX), #endif #endif #if defined(USE_AVX) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) FUNCTION_IMPL(THVector_(fill_AVX), SIMDExtension_AVX),
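/* The fill dispatch table above is truncated. A plausible completion, assuming it ends the
 * way such tables conventionally do: close the preprocessor guards, register the scalar
 * DEFAULT fallback last, and terminate the initializer. */
#endif
#endif
  FUNCTION_IMPL(THVector_(fill_DEFAULT), SIMDExtension_DEFAULT)
};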
static int nnconv1d_(HorizontalConvolution_updateGradInput)(lua_State *L) { THTensor *input = luaT_checkudata(L, 2, torch_Tensor); THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); int kL = luaT_getfieldcheckint(L, 1, "kL"); THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 1 : 0], 1, "Number of output features is not equal to nOutputPlane" ); // change to batch mode int batch = 1; if (input->nDimension == 3) { batch = 0; THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); THTensor_(resize4d)(gradOutput, 1, nOutputPlane, gradOutput->size[1], gradOutput->size[2]); } long batchSize = input->size[0]; long inputHeight = input->size[2]; long inputWidth = input->size[3]; long outputHeight = inputHeight; long outputWidth = inputWidth - kL + 1; THTensor_(resizeAs)(gradInput, input); THTensor_(zero)(gradInput); int elt; #pragma omp parallel for private(elt) for (elt = 0; elt < batchSize; elt++) { // select each batch THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, elt); THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); // convolve horizontally int i, j, k; for (i = 0; i < nOutputPlane; i++) { for (j = 0; j < outputHeight; j++) { for (k = 0; k < kL; k++) { THVector_(add)(gradInput_t->storage->data + gradInput_t->storageOffset + gradInput_t->stride[0]*i + gradInput_t->stride[1]*j + k, gradOutput_t->storage->data + gradOutput_t->storageOffset + gradOutput_t->stride[0]*i + gradOutput_t->stride[1]*j, *(THTensor_(data)(weight)+i*kL+k), outputWidth); // needs to change } } } // release temp tensors THTensor_(free)(gradInput_t); THTensor_(free)(gradOutput_t); } // revert to single batch if (batch == 0) { THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); } return 1; }
static int nnconv1d_(HorizontalConvolution_updateOutput)(lua_State *L) { THTensor *input = luaT_checkudata(L, 2, torch_Tensor); int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); int kL = luaT_getfieldcheckint(L, 1, "kL"); THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected"); // change to batch mode int batch = 1; if (input->nDimension == 3) { batch = 0; THTensor_(resize4d)(input, 1, nInputPlane, input->size[1], input->size[2]); } long batchSize = input->size[0]; long inputHeight = input->size[2]; long inputWidth = input->size[3]; long outputHeight = inputHeight; long outputWidth = inputWidth - kL + 1; THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); int elt; #pragma omp parallel for private(elt) for (elt = 0; elt < batchSize; elt++) { // select each batch THTensor *input_t = THTensor_(newSelect)(input, 0, elt); THTensor *output_t = THTensor_(newSelect)(output, 0, elt); // fill biases int i, j, k; for (i = 0; i < nOutputPlane; i++) { THVector_(fill)(output_t->storage->data+output_t->storageOffset+output_t->stride[0]*i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); } // convolve horizontally for (i = 0; i < nInputPlane; i++) { for (j = 0; j < inputHeight; j++) { for (k = 0; k < kL; k++) { THVector_(add)(output_t->storage->data + output_t->storageOffset + output_t->stride[0]*i + output_t->stride[1]*j, input_t->storage->data + input_t->storageOffset + input_t->stride[0]*i + input_t->stride[1]*j + k, *(THTensor_(data)(weight)+i*kL+k), outputWidth); } } } // release temp tensors THTensor_(free)(input_t); THTensor_(free)(output_t); } // revert to single batch if (batch == 0) { THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); } return 1; }
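/* The loops above implement a per-plane 1 x kL horizontal convolution (no padding, stride 1,
 * so outputWidth = inputWidth - kL + 1): each output row starts as the bias and accumulates
 * kL shifted copies of the corresponding input row, each scaled by weight[i*kL + k] through
 * a single THVector_(add) call. */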
void THNN_(SpatialConvolutionMM_updateOutput)( THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput, THTensor *fgradInput, int kW, int kH, int dW, int dH, int padW, int padH) { int dimf = 0; int dimw = 2; int dimh = 1; long nInputPlane; long inputWidth; long inputHeight; long nOutputPlane; long outputWidth; long outputHeight; THArgCheck( input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected"); THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero"); if (input->nDimension == 4) { dimf++; dimw++; dimh++; } nInputPlane = input->size[dimf]; inputWidth = input->size[dimw]; inputHeight = input->size[dimh]; nOutputPlane = weight->size[0]; outputWidth = (inputWidth + 2*padW - kW) / dW + 1; outputHeight = (inputHeight + 2*padH - kH) / dH + 1; if (outputWidth < 1 || outputHeight < 1) THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small", nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth); if (nInputPlane*kW*kH != weight->size[1]) THError("Wrong number of input channels! Input has %d channels, expected %d",nInputPlane,weight->size[1]/(kW*kH)); if(input->nDimension == 3) { THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth); THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); THNN_(SpatialConvolutionMM_updateOutput_frame)(input, output, weight, bias, finput, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, nOutputPlane, outputWidth, outputHeight); } else { long T = input->size[0]; long t; THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth); THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth); THNN_(batch_unfolded_copy)(finput, input, T, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight); long i; if (bias) { #pragma omp parallel for collapse(2) private(i,t) for(t = 0; t < T; t++) for(i = 0; i < nOutputPlane; i++) THVector_(fill)(output->storage->data+output->storageOffset+output->stride[0]*t+output->stride[1]*i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); } else { THTensor_(zero)(output); } #pragma omp parallel for private(t) for(t = 0; t < T; t++) { THTensor *input_t = THTensor_(newSelect)(input, 0, t); THTensor *output_t = THTensor_(newSelect)(output, 0, t); THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); /* THNN_(SpatialConvolutionMM_updateOutput_frame)(input_t, output_t, weight, bias, finput_t, */ /* kW, kH, dW, dH, padW, padH, */ /* nInputPlane, inputWidth, inputHeight, */ /* nOutputPlane, outputWidth, outputHeight); */ THTensor *output2d; /* THNN_(unfolded_copy)(finput_t, input_t, kW, kH, dW, dH, padW, padH, nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight); */ output2d = THTensor_(newWithStorage2d)(output_t->storage, output_t->storageOffset, nOutputPlane, -1, outputHeight*outputWidth, -1); /* long i; */ /* if (bias) { */ /* for(i = 0; i < nOutputPlane; i++) */ /* THVector_(fill)(output_t->storage->data+output_t->storageOffset+output_t->stride[0]*i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); */ /* } else { */ /* THTensor_(zero)(output_t); */ /* } */ THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput_t); THTensor_(free)(output2d); THTensor_(free)(input_t); THTensor_(free)(output_t); THTensor_(free)(finput_t); } } }
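/* Minimal sketch of the output-size arithmetic used above (hypothetical helper, not part of
 * the module): the standard floor((in + 2*pad - kernel) / stride) + 1 formula. */
static long nn_(conv_out_size_sketch)(long inputSize, int pad, int kernel, int stride)
{
  /* e.g. inputSize = 32, pad = 1, kernel = 3, stride = 1  ->  (32 + 2 - 3) / 1 + 1 = 32 */
  return (inputSize + 2 * (long)pad - kernel) / stride + 1;
}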
/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */ static void THNN_(unfolded_acc_vol)( THTensor *finput, THTensor *input, int kT, int kW, int kH, int dT, int dW, int dH, int pT, int pW, int pH, long nInputPlane, long inputDepth, long inputWidth, long inputHeight, long outputDepth, long outputWidth, long outputHeight) { long nip; real *input_data = THTensor_(data)(input); real *finput_data = THTensor_(data)(finput); //#pragma omp parallel for private(nip) for (nip = 0; nip < nInputPlane; nip++) { long kt, kw, kh, t, y, x, it, ix, iy; for (kt = 0; kt < kT; kt++) { for (kh = 0; kh < kH; kh++) { for (kw = 0; kw < kW; kw++) { real *src = finput_data + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth) + kt * (kH*kW*outputDepth*outputHeight*outputWidth) + kh * (kW*outputDepth*outputHeight*outputWidth) + kw * (outputDepth*outputHeight*outputWidth); real *dst = input_data + nip*(inputDepth*inputHeight*inputWidth); if (pT > 0 || pH > 0 || pW > 0) { for (t = 0; t < outputDepth; t++) { it = t*dT - pT + kt; for (y = 0; y < outputHeight; y++) { iy = y*dH - pH + kh; for (x = 0; x < outputWidth; x++) { ix = x*dW - pW + kw; if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth) { } else { real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix; THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1); } } } } } else { for (t = 0; t < outputDepth; t++) { it = t*dT + kt; for (y = 0; y < outputHeight; y++) { iy = y*dH + kh; for(x = 0; x < outputWidth; x++) { ix = x*dW + kw; real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix; THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1); } } } } } } } } }
void THVector_(fill)(real *x, const real c, const ptrdiff_t n) { THVector_(fill_DISPATCHPTR)(x, c, n); }
void THNN_(IndexLinear_updateOutput)( THNNState *state, THLongTensor *keys, int64_t keysOffset, THTensor *values, THLongTensor *sizes, THLongTensor *cumSumSizes, THTensor *output, THTensor *weight, THTensor *bias, THTensor *normalizedValues, int train) { /* Retrieve all the dimensions of the problem */ int64_t batchSize = THLongTensor_size(sizes, 0); int64_t keysSize = THLongTensor_size(keys, 0); int64_t outDim = THTensor_(size)(bias, 0); int64_t woutDim = THTensor_(size)(weight, 1); int maxNormalize = woutDim - outDim; int64_t* sizesData = THLongTensor_data(sizes); int64_t* cumSumSizesData = THLongTensor_data(cumSumSizes); /* Define/resize the normalized values tensor if maxNormalize is > 0 */ scalar_t* normalizedValuesData = NULL; if (maxNormalize) { THTensor_(resize1d)(normalizedValues, keysSize); normalizedValuesData = normalizedValues->data<scalar_t>(); } /* Resize the output */ THTensor_(resize2d)(output, batchSize, outDim); /* Access the storage data/strides */ scalar_t* outputData = output->data<scalar_t>(); scalar_t* valuesData = values->data<scalar_t>(); scalar_t* weightData = weight->data<scalar_t>(); int64_t weightStride0 = weight->stride(0); scalar_t* biasData = bias->data<scalar_t>(); int64_t* keysData = THLongTensor_data(keys); /* Make sure these inputs are contiguous to accelerate computations */ THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous"); THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous"); THArgCheck(THTensor_(isContiguous)(output), 6, "output vector must be contiguous"); THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous"); THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous"); THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements"); THArgCheck(THTensor_(isContiguous)(normalizedValues), 9, "normalizedValues vector must be contiguous"); /* Separate cases: output dimension is == 1, or > 1 * This allows for some optimizations. */ if (outDim == 1) { THVector_(fill)(outputData, *biasData, batchSize); if (maxNormalize) { /* Parallelize on the batch itself */ auto loop = [&](int64_t start, int64_t end) { for (auto j = start; j < end; j++) { scalar_t* loutputData = outputData + j; scalar_t val = 0; scalar_t absVal = 0; int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1]; for (auto i = 0; i < sizesData[j]; i++) { int64_t woffset = weightStride0*(keysData[offset] + keysOffset); absVal = fabs(valuesData[offset]); if (train) { if (absVal > weightData[woffset]) { weightData[woffset] = absVal; weightData[woffset+1] = 1/absVal; } /* * The following can be used to scale the size of the updates * depending on some rule, e.g. the frequency of a feature, ... * This is used at update time. * TODO: implement a smarter update scale. */ weightData[woffset+2] = 1; } normalizedValuesData[offset] = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(valuesData[offset]):valuesData[offset]*weightData[woffset+1]) + weightData[woffset+3]; val += normalizedValuesData[offset] * weightData[woffset+maxNormalize]; offset++; } *loutputData += val; } }; if (keysSize * outDim > THNN_SPARSE_OMP_THRESHOLD) { at::parallel_for(0, batchSize, 1, loop); } else { loop(0, batchSize); } } else { /* Parallelize on the batch itself */ auto loop = [&](int64_t start, int64_t end) { for (auto j = start; j < end; j++) { int64_t offset = j == 0 ? 
0 : cumSumSizesData[j - 1]; scalar_t* loutputData = outputData + j; scalar_t val = 0; for (auto i = 0; i < sizesData[j]; i++) { val += weightData[weightStride0*(keysData[offset] + keysOffset)] * valuesData[offset]; offset++; } *loutputData += val; } }; if (keysSize * outDim > THNN_SPARSE_OMP_THRESHOLD) { at::parallel_for(0, batchSize, 1, loop); } else { loop(0, batchSize); } } } else { auto loop = [&](int64_t start, int64_t end) { for (auto j = start; j < end; j++) { int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1]; scalar_t val; scalar_t* loutputData = outputData + j*outDim; scalar_t* lweightData = weightData; memcpy(loutputData, biasData, outDim*sizeof(scalar_t)); for (auto i = 0; i < sizesData[j]; i++) { int64_t woffset = weightStride0*(keysData[offset] + keysOffset); if (maxNormalize) { val = valuesData[offset]; scalar_t absVal = fabs(val); if (train) { if (absVal > weightData[woffset]) { weightData[woffset] = absVal; weightData[woffset+1] = 1/absVal; } /* * The following can be used to scale the size of the updates * depending on some rule, e.g. the frequency of a feature, ... * The commented section thereafter is just an example of what can be done: * *``` * weightData[woffset+2] = weightData[woffset+2]==0?1:(weightData[woffset+2] / (weightData[woffset+2] + 1)); * scalar_t alpha = 1; * scalar_t beta = 0.01; * scalar_t gamma = 1 - 0.000001; * scalar_t l = weightData[woffset+2]==0?1/gamma:(weightData[woffset+2] - beta) / (alpha - beta); * l = gamma*l; * weightData[woffset+2] = (alpha-beta)*l + beta; * ``` * * TODO: implement a smarter update scale. */ weightData[woffset+2] = 1; } /* Normalize + Clamp */ val = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(val):val*weightData[woffset+1]) + weightData[woffset+3]; normalizedValuesData[offset] = val; lweightData = weightData + woffset + maxNormalize; } else { val = valuesData[offset]; lweightData = weightData + woffset; } if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) { THBlas_(axpy)(outDim, val, lweightData, 1, loutputData, 1); } else { for (auto k = 0; k < outDim; k++) { loutputData[k] += lweightData[k] * val; } } offset++; } } }; if (keysSize * outDim > THNN_SPARSE_OMP_THRESHOLD) { at::parallel_for(0, batchSize, 1, loop); } else { loop(0, batchSize); } } return; }
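/* Illustrative layout (hypothetical data) of the CSR-style sparse batch consumed above:
 * a batch of 2 samples with 3 and 2 active features is flattened as
 *   keys        = {4, 7, 9, 1, 7}             feature ids, later shifted by keysOffset
 *   values      = {0.5, 2.0, 1.0, 3.0, 0.25}  matching feature values
 *   sizes       = {3, 2}                      nnz per sample
 *   cumSumSizes = {3, 5}                      sample j starts at j == 0 ? 0 : cumSumSizes[j-1]
 * When maxNormalize > 0, each weight row additionally keeps bookkeeping in its first
 * maxNormalize slots, exactly as the code above uses them: [0] running max |value|,
 * [1] its reciprocal, [2] an update scale, [3] an additive correction, and the actual
 * outDim weights follow at offset maxNormalize. */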
void THNN_(IndexLinear_accUpdateGradParameters)( THNNState *state, THLongTensor *keys, int64_t keysOffset, THTensor *values, THLongTensor *sizes, THLongTensor *cumSumSizes, THTensor *gradOutput, THTensor *weight, THTensor *bias, accreal weightDecay_, accreal scale_) { scalar_t weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); /* Retrieve all the dimensions of the problem */ int64_t batchSize = THLongTensor_size(sizes, 0); int64_t outDim = THTensor_(size)(bias, 0); int64_t woutDim = THTensor_(size)(weight, 1); int maxNormalize = woutDim - outDim; THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements"); /* Access the storage data/strides */ scalar_t* gradOutputData = gradOutput->data<scalar_t>(); scalar_t* valuesData =values->data<scalar_t>(); scalar_t* weightData = weight->data<scalar_t>(); scalar_t* biasData = bias->data<scalar_t>(); int64_t weightStride0 = weight->stride(0); int64_t* keysData = THLongTensor_data(keys); int64_t* sizesData = THLongTensor_data(sizes); /* Make sure these inputs are contiguous to accelerate computations */ THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous"); THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous"); THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous"); THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous"); THArgCheck(THTensor_(isContiguous)(bias), 8, "bias matrix must be contiguous"); int i,j,k; /* Separate cases: output dimension is == 1, or > 1 * This allows for some optimizations. * No multithreading here as this could * corrupt the results (hogwild style) */ if (outDim == 1) { if (maxNormalize) { int64_t offset = 0; for (j = 0; j < batchSize; j++) { scalar_t* lgradOutputData = gradOutputData + j; *biasData -= *lgradOutputData * scale; scalar_t val = *lgradOutputData * scale; for (i = 0; i < sizesData[j]; i++) { int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize; weightData[idx-1] -= weightData[idx]*val*weightData[idx-2]; weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2]; offset++; } } offset = 0; for (j = 0; j < batchSize; j++) { for (i = 0; i < sizesData[j]; i++) { int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize; weightData[idx-2] = 0; offset++; } } } else { if (weightDecay) { int64_t offset = 0; for (j = 0; j < batchSize; j++) { scalar_t* lgradOutputData = gradOutputData + j; *biasData -= *lgradOutputData * scale; scalar_t val = *lgradOutputData * scale; for (i = 0; i < sizesData[j]; i++) { int64_t idx = weightStride0*(keysData[offset] + keysOffset); weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay; offset++; } } } else { int64_t offset = 0; for (j = 0; j < batchSize; j++) { scalar_t val = gradOutputData[j] * scale; for (i = 0; i < sizesData[j]; i++) { weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset]; offset++; } *biasData -= val; } } } } else { int64_t offset = 0; for (j = 0; j < batchSize; j++) { scalar_t* lgradOutputData = gradOutputData + j*outDim; scalar_t* lweightData = weightData; THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim); for (i = 0; i < sizesData[j]; i++) { scalar_t val = valuesData[offset] * scale; scalar_t wd = weightDecay; // Max normalize case if (maxNormalize) { lweightData = weightData + 
weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2); val *= lweightData[0]; wd *= lweightData[0]; for (k=0; k < outDim; k++) { lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0]; } lweightData += 2; } else { lweightData = weightData + weightStride0*(keysData[offset] + keysOffset); } /* We do sparse weight decay. * We think it makes more sense. */ if (weightDecay) { if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) { THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1); } else { for (k=0; k < outDim; k++) { lweightData[k] -= wd * lweightData[k]; } } } if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) { THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1); } else { for (k=0; k < outDim; k++) { lweightData[k] -= val * lgradOutputData[k]; } } offset++; } } /* Max Normalize case: * Reset the smart update scaling if * one does it batch-wise. * TODO: Decide what to do with that piece of code. * NB: If the code belowe is uncommented, so should the commented * code in IndexLinear:zeroGradParameters() */ /* if (maxNormalize) { offset = 0; for (j = 0; j < batchSize; j++) { scalar_t* lweightData = weightData; for (i = 0; i < sizesData[j]; i++) { scalar_t val = valuesData[offset] * scale; scalar_t wd = weightDecay; lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2); lweightData[0] = 0; offset++; } } } */ } return; }
void THTensor_(copy)(THTensor *tensor, THTensor *src) { if (tensor == src) return; ptrdiff_t tensorSize = THTensor_(nElement)(tensor); ptrdiff_t srcSize = THTensor_(nElement)(src); int tensorContig = THTensor_(isContiguous)(tensor); int srcContig = THTensor_(isContiguous)(src); int serial_path = 0; #ifdef _OPENMP int inOMP = omp_in_parallel(); #endif if (tensorSize == srcSize) { if ( tensorContig && srcContig) { real *sp = THTensor_(data)(src); real *rp = THTensor_(data)(tensor); #ifndef TH_REAL_IS_HALF #ifdef _OPENMP #pragma omp parallel if ( (tensorSize > TH_OMP_OVERHEAD_THRESHOLD_COPY) && (!inOMP) ) { size_t num_threads = omp_get_num_threads(); size_t tid = omp_get_thread_num(); ptrdiff_t offset = tid * (tensorSize / num_threads); ptrdiff_t end = (tid == num_threads - 1) ? tensorSize : offset + tensorSize / num_threads; ptrdiff_t len = end - offset; real *tensorData = rp + offset; real *srcData = sp + offset; THVector_(copy)(tensorData, srcData, len); } #else THVector_(copy)(rp, sp, srcSize); #endif #else #ifdef _OPENMP if ((srcSize > TH_OMP_OVERHEAD_THRESHOLD_COPY) && (!inOMP)) { ptrdiff_t i; #pragma omp parallel for private (i) for(i=0; i<srcSize; i++){ rp[i] = sp[i]; } } else { memcpy(rp, sp, srcSize * sizeof(real)); } #else memcpy(rp, sp, srcSize * sizeof(real)); #endif #endif #ifndef TH_REAL_IS_HALF } else if (THTensor_(copyTransposeValid)(tensor, src)) { THTensor_(copyTranspose)(tensor, src); #endif } else { #ifdef _OPENMP if (inOMP) { serial_path = 1; } else { TH_TENSOR_APPLY2_OMP(srcSize, tensorContig, srcContig, real, tensor, real, src, *tensor_data = *src_data;, TH_OMP_OVERHEAD_THRESHOLD_COPY) } #else serial_path = 1; #endif } } else {
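/* The body above is cut off here. A plausible completion, assuming the remaining cases
 * simply fall back to the element-wise serial path (which also reports mismatched element
 * counts): */
    serial_path = 1;
  }
  if (serial_path) {
    TH_TENSOR_APPLY2(real, tensor, real, src, *tensor_data = *src_data;)
  }
}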
void THVector_(copy)(real *y, const real *x, const ptrdiff_t n) { THVector_(copy_DISPATCHPTR)(y, x, n); }
void THNN_(IndexLinear_updateParameters)( THNNState *state, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, THTensor *bias, THLongTensor *runningKeys, THLongTensor *cumSumSizes, int64_t keysOffset, accreal weightDecay_, accreal learningRate_) { scalar_t weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); scalar_t learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); /* Retrieve all the dimensions of the problem */ int64_t outDim = THTensor_(size)(bias, 0); int64_t woutDim = THTensor_(size)(weight, 1); int maxNormalize = woutDim - outDim; int64_t keysSize = THLongTensor_size(runningKeys, 0); /* Access the storage data/strides */ scalar_t* gradWeightData = gradWeight->data<scalar_t>(); scalar_t* weightData = weight->data<scalar_t>(); int64_t weightStride0 = weight->stride(0); scalar_t* gradBiasData = gradBias->data<scalar_t>(); scalar_t* biasData = bias->data<scalar_t>(); int64_t* keysData = THLongTensor_data(runningKeys); /* Make sure these inputs are contiguous to accelerate computations */ THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous"); THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous"); THArgCheck(THTensor_(isContiguous)(weight), 3, "gradBias vector must be contiguous"); THArgCheck(THTensor_(isContiguous)(bias), 4, "gradBias vector must be contiguous"); THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous"); int j, k; /* Update the bias first */ THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim); /* Separate cases: output dimension is == 1, or > 1 * This allows for some optimizations. * No multithreading here as this could * corrupt the results (hogwild style) */ if (outDim == 1) { if (maxNormalize) { if (weightDecay) { for (j = 0; j < keysSize; j++) { int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize; scalar_t lr = learningRate*weightData[woffset-2]; weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr; weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset]; } } else { for (j = 0; j < keysSize; j++) { int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize; scalar_t lr = learningRate*weightData[woffset-2]; weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr; weightData[woffset] -= gradWeightData[2*j+1]*lr; } } } else { if (weightDecay) { for (j = 0; j < keysSize; j++) { int64_t woffset = weightStride0*(keysData[j] + keysOffset); weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset]; } } else { for (j = 0; j < keysSize; j++) { weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate; } } } } else { for (j = 0; j < keysSize; j++) { scalar_t lr = learningRate; scalar_t wd = weightDecay; scalar_t* lweightData; int64_t woffset = weightStride0*(keysData[j] + keysOffset); scalar_t* lgradWeightData = gradWeightData + j*outDim; if (maxNormalize) { lgradWeightData += j*outDim; /* weightData[woffset + 2] */ lweightData = weightData + woffset + maxNormalize - 2; lr = lr*lweightData[0]; wd = weightDecay*lweightData[0]; /* weightData[woffset + 3] */ lweightData++; for (k=0; k < outDim; k++) { lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr; } lweightData++; lgradWeightData += outDim; } else { lweightData = weightData + woffset; } /* We do sparse weight decay. * We think it makes more sense. 
*/ if (weightDecay) { for (k=0; k < outDim; k++) { lweightData[k] -= lweightData[k]*wd; } } if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) { THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1); } else { for (k=0; k < outDim; k++) { lweightData[k] -= lgradWeightData[k]*lr; } } } } }
void THVector_(cvtFromInt)(real *y, const int *x, const ptrdiff_t n) { THVector_(cvtFromInt_DISPATCHPTR)(y, x, n); }
void THNN_(IndexLinear_accGradParameters)( THNNState *state, THLongTensor *keys, int64_t keysOffset, THTensor *values, THLongTensor *sizes, THLongTensor *cumSumSizes, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, THTensor *bias, THTensor *valuesBuffer, accreal weightDecay_, accreal scale_) { scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); /* Retrieve all the dimensions of the problem */ int64_t batchSize = THLongTensor_size(sizes, 0); int64_t keysSize = THLongTensor_size(keys, 0); int64_t outDim = THTensor_(size)(bias, 0); int64_t woutDim = THTensor_(size)(weight, 1); int64_t maxNormalize = (woutDim - outDim) > 0 ?1:0; THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements"); int64_t* sizesData = THLongTensor_data(sizes); /* COmpute the cumulative sizes */ THLongTensor* cumSizes = THLongTensor_new(); THLongTensor_cumsum(cumSizes, sizes, 0); int64_t* cumSizesData = THLongTensor_data(cumSizes); /* Resize the gradWeight buffer to keep it dense. * That speeds up updates A LOT assuming random mem access. */ THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1)); /* Access the storage data/strides */ scalar_t* gradOutputData = gradOutput->data<scalar_t>(); scalar_t* valuesData =values->data<scalar_t>(); scalar_t* gradWeightData = gradWeight->data<scalar_t>(); scalar_t* gradBiasData = gradBias->data<scalar_t>(); /* Make sure these inputs are contiguous to accelerate computations */ THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous"); THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous"); THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous"); THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous"); THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous"); THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous"); THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous"); THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous"); int i,j,k; /* Separate cases: output dimension is == 1, or > 1 * This allows for some optimizations. 
* No multithreading here as this could * corrupt the results (hogwild style) */ if (outDim == 1) { for (j = 0; j < batchSize; j++) { int64_t offset = j==0?0:cumSizesData[j-1]; scalar_t val = gradOutputData[j] * scale; scalar_t* lgradWeightData = gradWeightData + offset; scalar_t* lvaluesData = valuesData + offset; int64_t end = sizesData[j]; if (maxNormalize) { lgradWeightData += offset; i = 0; for(;i < end; i++) { lgradWeightData[2*i] = val; lgradWeightData[2*i+1] = val * lvaluesData[i]; } } else { i = 0; for(;i < end-4; i += 4) { lgradWeightData[i] = val * lvaluesData[i]; lgradWeightData[i+1] = val * lvaluesData[i+1]; lgradWeightData[i+2] = val * lvaluesData[i+2]; lgradWeightData[i+3] = val * lvaluesData[i+3]; } for(; i < end; i++) { lgradWeightData[i] = val * lvaluesData[i]; } } *gradBiasData += val; offset += end; } } else { for (j = 0; j < batchSize; j++) { int64_t offset = j==0?0:cumSizesData[j-1]; scalar_t* lgradOutputData = gradOutputData + j*outDim; scalar_t* lgradWeightData = gradWeightData; THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim); for (i = 0; i < sizesData[j]; i++) { scalar_t val = valuesData[offset] * scale; lgradWeightData = gradWeightData + offset*outDim; if (maxNormalize) { lgradWeightData += offset*outDim; k = 0; for(;k < outDim-4; k += 4) { lgradWeightData[k] = lgradOutputData[k]*scale; lgradWeightData[k+1] = lgradOutputData[k+1]*scale; lgradWeightData[k+2] = lgradOutputData[k+2]*scale; lgradWeightData[k+3] = lgradOutputData[k+3]*scale; } for(; k < outDim; k++) { lgradWeightData[k] = lgradOutputData[k]*scale; } lgradWeightData += outDim; } k = 0; for(;k < outDim-4; k += 4) { lgradWeightData[k] = val * lgradOutputData[k]; lgradWeightData[k+1] = val * lgradOutputData[k+1]; lgradWeightData[k+2] = val * lgradOutputData[k+2]; lgradWeightData[k+3] = val * lgradOutputData[k+3]; } for(; k < outDim; k++) { lgradWeightData[k] = val * lgradOutputData[k]; } offset++; } } } THLongTensor_free(cumSizes); return; }
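/* Layout of the dense gradWeight buffer produced above (later consumed by
 * IndexLinear_updateParameters): one row per nonzero key, of width outDim when
 * maxNormalize == 0 (holding scale * value * gradOutput), and of width 2*outDim otherwise
 * (first scale * gradOutput, then scale * value * gradOutput), so the parameter update never
 * has to walk the sparse layout again. */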
void THVector_(sigmoid)(real *y, const real *x, const ptrdiff_t n) { THVector_(sigmoid_DISPATCHPTR)(y, x, n); }
/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */ void THNN_(unfolded_acc)(THTensor *finput, THTensor *input, int kW, int kH, int dW, int dH, int padW, int padH, int nInputPlane, int inputWidth, int inputHeight, int outputWidth, int outputHeight) { #ifdef _WIN32 LONG_PTR nip; #else size_t nip; #endif real *input_data = THTensor_(data)(input); real *finput_data = THTensor_(data)(finput); #pragma omp parallel for private(nip) for(nip = 0; nip < nInputPlane; nip++) { size_t kw, kh, y, x; long long ix = 0, iy = 0; for(kh = 0; kh < kH; kh++) { for(kw = 0; kw < kW; kw++) { real *src = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth); real *dst = input_data + nip*(inputHeight*inputWidth); if (padW > 0 || padH > 0) { size_t lpad,rpad; for(y = 0; y < outputHeight; y++) { iy = (long long)(y*dH - padH + kh); if (iy < 0 || iy >= inputHeight) { } else { if (dW==1){ ix = (long long)(0 - padW + kw); lpad = fmaxf(0,(int)(padW-kw)); rpad = fmaxf(0,(int)(padW-(kW-kw-1))); THVector_(add)(dst+(size_t)(iy*inputWidth+ix+lpad), src+(size_t)(y*outputWidth+lpad), 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */ } else{ for (x=0; x<outputWidth; x++){ ix = (long long)(x*dW - padW + kw); if (ix < 0 || ix >= inputWidth){ }else THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth+x), 1, 1); } } } } } else { for(y = 0; y < outputHeight; y++) { iy = (long long)(y*dH + kh); ix = (long long)(0 + kw); if (dW == 1 ) THVector_(add)(dst+(size_t)(iy*inputWidth+ix), src+(size_t)(y*outputWidth), 1, outputWidth); /* note: THVector_add could handle 1 value better */ else{ for(x = 0; x < outputWidth; x++) THVector_(add)(dst+(size_t)(iy*inputWidth+ix+x*dW), src+(size_t)(y*outputWidth+x), 1, 1); } } } } } } }
void THVector_(cadd)(real *z, const real *x, const real *y, const real c, const ptrdiff_t n) { THVector_(cadd_DISPATCHPTR)(z, x, y, c, n); }
static int nnconv1d_(LateralConvolution_updateOutput)(lua_State *L) { THTensor *input = luaT_checkudata(L, 2, torch_Tensor); int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected"); // change to batch mode int batch = 1; if (input->nDimension == 3) { batch = 0; THTensor_(resize4d)(input, 1, nInputPlane, input->size[1], input->size[2]); } long batchSize = input->size[0]; long inputHeight = input->size[2]; long inputWidth = input->size[3]; long outputHeight = inputHeight; long outputWidth = inputWidth; THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); int elt; #pragma omp parallel for private(elt) for (elt = 0; elt < batchSize; elt++) { // select each batch in 2D THTensor *input_t = THTensor_(newSelect)(input, 0, elt); THTensor *output_t = THTensor_(newSelect)(output, 0, elt); THTensor *input2d = THTensor_(newWithStorage2d) (input_t->storage, input_t->storageOffset, nInputPlane, -1, inputHeight*inputWidth, -1); THTensor *output2d = THTensor_(newWithStorage2d) (output_t->storage, output_t->storageOffset, nOutputPlane, -1, outputHeight*outputWidth, -1); // fill biases int i; for (i = 0; i < nOutputPlane; i++) THVector_(fill)(output_t->storage->data+output_t->storageOffset+output_t->stride[0]*i, THTensor_(get1d)(bias, i), outputHeight*outputWidth); // convolve THTensor_(addmm)(output2d, 1, output2d, 1, weight, input2d); // release temp tensors THTensor_(free)(input2d); THTensor_(free)(output2d); THTensor_(free)(input_t); THTensor_(free)(output_t); } // revert to single batch if (batch == 0) { THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); } return 1; }
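/* Scalar reference sketch (hypothetical helper, for clarity only) of what the addmm above
 * computes: a "lateral" (1x1) convolution is a plain matrix product over flattened pixels,
 * output[o][p] = bias[o] + sum_i weight[o][i] * input[i][p]. */
static void nnconv1d_(LateralConvolution_reference)(const real *input, real *output,
                                                    const real *weight, const real *bias,
                                                    int nInputPlane, int nOutputPlane,
                                                    long height, long width)
{
  long n = height * width;
  for (int o = 0; o < nOutputPlane; o++)
    for (long p = 0; p < n; p++) {
      real sum = bias[o];
      for (int i = 0; i < nInputPlane; i++)
        sum += weight[(size_t)o * nInputPlane + i] * input[(size_t)i * n + p];
      output[(size_t)o * n + p] = sum;
    }
}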