/*
 * Based on the implementation of THTensor_(indexCopy) in torch7
 */
static void THCudaTensor_indexCopy(THCudaTensor *tensor, int dim, THLongTensor *index, THCudaTensor *src)
{
  long i, numel;
  THCudaTensor *tSlice, *sSlice;
  long *index_data;

  numel = THLongTensor_nElement(index);
  THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector");
  THArgCheck(dim < src->nDimension, 4, "Indexing dim is out of bounds");

  index = THLongTensor_newContiguous(index);
  index_data = THLongTensor_data(index);

  for (i = 0; i < numel; i++)
  {
    if (tensor->nDimension > 1)
    {
      tSlice = THCudaTensor_new();
      sSlice = THCudaTensor_new();
      THCudaTensor_select(tSlice, tensor, dim, index_data[i]-1);
      THCudaTensor_select(sSlice, src, dim, i);
      THCudaTensor_copy(tSlice, sSlice);
      THCudaTensor_free(tSlice);
      THCudaTensor_free(sSlice);
    }
    else
    {
      /* Copying a single float from one device address to another is faster
       * than copying it back to host memory and then back to the device. */
      THCudaCheck(cudaMemcpy(tensor->storage->data + tensor->storageOffset + index_data[i]-1,
                             src->storage->data + src->storageOffset + i,
                             sizeof(float), cudaMemcpyDeviceToDevice));
    }
  }
  THLongTensor_free(index);
}
/*
 * Based on the implementation of THTensor_(indexFill) in torch7
 */
static void THCudaTensor_indexFill(THCudaTensor *tensor, int dim, THLongTensor *index, float val)
{
  long i, numel;
  THCudaTensor *tSlice;
  long *index_data;

  numel = THLongTensor_nElement(index);
  THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector");
  THArgCheck(dim < tensor->nDimension, 4, "Indexing dim is out of bounds");

  index = THLongTensor_newContiguous(index);
  index_data = THLongTensor_data(index);

  for (i = 0; i < numel; i++)
  {
    if (tensor->nDimension > 1)
    {
      /* create a new CudaTensor */
      tSlice = THCudaTensor_new();
      /* set its storage to point to the corresponding slice of tensor */
      THCudaTensor_select(tSlice, tensor, dim, index_data[i]-1);
      THCudaTensor_fill(tSlice, val);
      THCudaTensor_free(tSlice);
    }
    else
    {
      THCudaTensor_set1d(tensor, index_data[i]-1, val);
    }
  }
  THLongTensor_free(index);
}
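
/*
 * A minimal usage sketch, not part of the original source. It assumes the
 * old, pre-THCState cutorch API used above; the function and tensor names
 * are hypothetical and only illustrate the 1-based index convention
 * (the helpers above subtract 1 before indexing).
 */
static void example_indexFill_usage(void)
{
  THCudaTensor *dst = THCudaTensor_newWithSize2d(4, 3);   /* 4x3 CUDA tensor */
  THLongTensor *idx = THLongTensor_newWithSize1d(2);      /* indices of rows to fill */
  THCudaTensor_fill(dst, 0.0f);
  THLongTensor_set1d(idx, 0, 1);                          /* 1-based: first row */
  THLongTensor_set1d(idx, 1, 3);                          /* 1-based: third row */
  THCudaTensor_indexFill(dst, 0, idx, 7.0f);              /* rows 1 and 3 become 7 */
  THLongTensor_free(idx);
  THCudaTensor_free(dst);
}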
// Stitch takes the following args:
// pano - a torch tensor in RGB with dims (3 x height x width)
//
// offset_map - a torch tensor with the same h and w as pano storing offsets
// and image indices. The two feature dimensions are:
// -- image number (starting at 1) and
// -- pixel offset into the image tensor
// nimages - the number of images used to make the panorama
// image1, ..., imagen - the images as torch tensors
static int Lstitch_(stitch)(lua_State *L)
{
  int nargs = lua_gettop(L);
  THTensor *pano =
    (THTensor *)luaT_checkudata(L, 1, torch_(Tensor_id));
  THLongTensor *offset_map =
    (THLongTensor *)luaT_checkudata(L, 2, luaT_checktypename2id(L, "torch.LongTensor"));
  THTensor *images[MAXIMAGES];
  int i = 0;
  long npixels = offset_map->size[1]*offset_map->size[2];
  real *pano_pt   = THTensor_(data)(pano);
  long *offset_pt = THLongTensor_data(offset_map);
  real *images_pt[MAXIMAGES];
  long images_npixels[MAXIMAGES];
  long images_Goff[MAXIMAGES];
  long images_Boff[MAXIMAGES];
  real *panoR = pano_pt;
  real *panoG = pano_pt +    pano->stride[0];
  real *panoB = pano_pt + (2*pano->stride[0]);
  real *curImg_pt = NULL;
  long unsigned int XYoffset = 0;
  long *offImg     = offset_pt;
  long *offIndexXY = offset_pt + offset_map->stride[0];
  int nimages = 0;
  long cImgOff = 0;

  /* Finish processing the input image tensors:
   * either a table is passed,
   * or a count followed by a variable number of image args. */
  if (nargs == 3){
    if (lua_istable(L,3)){
      nimages = lua_objlen(L, 3);
      /* table is in the stack at index 3 */
      lua_pushnil(L);  /* first key */
      i = 0;
      while (lua_next(L, 3) != 0) {
        /* 'key' (at index -2) and 'value' (at index -1) */
        images[i] = (THTensor *)luaT_checkudata(L, -1, torch_(Tensor_id));
        images_npixels[i] = images[i]->size[1]*images[i]->size[2];
        images_Goff[i] = images[i]->stride[0];
        images_Boff[i] = 2*images[i]->stride[0];
        images_pt[i] = THTensor_(data)(images[i]);
        /* removes 'value'; keeps 'key' for next iteration */
        lua_pop(L, 1);
        i = i+1;
      }
    } else {
      lua_pushstring(L, "with 3 args last argument is a table");
      lua_error(L);
    }
  } else {
    nimages = lua_tonumber(L,3);
    for(i=0;i<nimages;i++){
      images[i] = (THTensor *)luaT_checkudata(L, i+4, torch_(Tensor_id));
      images_npixels[i] = images[i]->size[1]*images[i]->size[2];
      images_Goff[i] = images[i]->stride[0];
      images_Boff[i] = 2*images[i]->stride[0];
      images_pt[i] = THTensor_(data)(images[i]);
    }
  }

  for(i=0;i<npixels;i++){
    cImgOff   = (long unsigned int)*offImg - 1;
    curImg_pt = images_pt[cImgOff];
    if ((*offIndexXY > 0) && (*offIndexXY < images_npixels[cImgOff])){
      XYoffset = (long unsigned int)*offIndexXY;
      *panoR = curImg_pt[XYoffset];
      *panoG = curImg_pt[XYoffset + images_Goff[cImgOff]];
      *panoB = curImg_pt[XYoffset + images_Boff[cImgOff]];
    }
    panoR++;
    panoG++;
    panoB++;
    offImg++;
    offIndexXY++;
  }
  return 0;
}
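
/*
 * A minimal sketch, not part of the original source, of how one entry of
 * offset_map could be written from C. It assumes offset_map has dims
 * (2 x height x width), with channel 0 holding the 1-based image number and
 * channel 1 the linear pixel offset into that image, which is how the
 * stitch loop above reads it. The helper name and arguments are hypothetical.
 */
static void example_set_offset_entry(THLongTensor *offset_map,
                                     long y, long x,
                                     long image_number,   /* 1-based */
                                     long img_y, long img_x,
                                     long img_width)
{
  /* channel 0: which source image this panorama pixel comes from */
  THLongTensor_set3d(offset_map, 0, y, x, image_number);
  /* channel 1: linear pixel offset inside that image's plane */
  THLongTensor_set3d(offset_map, 1, y, x, img_y*img_width + img_x);
}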
static void load_array_to_lua(lua_State *L, cnpy::NpyArray& arr){
  int ndims = arr.shape.size();

  // based on code from mattorch with stride fix
  int k;
  THLongStorage *size = THLongStorage_newWithSize(ndims);
  THLongStorage *stride = THLongStorage_newWithSize(ndims);
  for (k=0; k<ndims; k++) {
    THLongStorage_set(size, k, arr.shape[k]);
    if (k > 0)
      THLongStorage_set(stride, ndims-k-1, arr.shape[ndims-k]*THLongStorage_get(stride,ndims-k));
    else
      THLongStorage_set(stride, ndims-k-1, 1);
  }

  void *tensorDataPtr = NULL;
  size_t numBytes = 0;

  if ( arr.arrayType == 'f' ){ // float32/64
    if ( arr.word_size == 4 ){ // float32
      THFloatTensor *tensor = THFloatTensor_newWithSize(size, stride);
      tensorDataPtr = (void *)(THFloatTensor_data(tensor));
      numBytes = THFloatTensor_nElement(tensor) * arr.word_size;
      luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.FloatTensor"));
    }else if ( arr.word_size == 8 ){ // float64
      THDoubleTensor *tensor = THDoubleTensor_newWithSize(size, stride);
      tensorDataPtr = (void *)(THDoubleTensor_data(tensor));
      numBytes = THDoubleTensor_nElement(tensor) * arr.word_size;
      luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.DoubleTensor"));
    }
  }else if ( arr.arrayType == 'i' || arr.arrayType == 'u' ){
    // does torch have unsigned types? need to check
    if ( arr.word_size == 1 ){ // int8
      THByteTensor *tensor = THByteTensor_newWithSize(size, stride);
      tensorDataPtr = (void *)(THByteTensor_data(tensor));
      numBytes = THByteTensor_nElement(tensor) * arr.word_size;
      luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.ByteTensor"));
    }else if ( arr.word_size == 2 ){ // int16
      THShortTensor *tensor = THShortTensor_newWithSize(size, stride);
      tensorDataPtr = (void *)(THShortTensor_data(tensor));
      numBytes = THShortTensor_nElement(tensor) * arr.word_size;
      luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.ShortTensor"));
    }else if ( arr.word_size == 4 ){ // int32
      THIntTensor *tensor = THIntTensor_newWithSize(size, stride);
      tensorDataPtr = (void *)(THIntTensor_data(tensor));
      numBytes = THIntTensor_nElement(tensor) * arr.word_size;
      luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.IntTensor"));
    }else if ( arr.word_size == 8 ){ // int64
      THLongTensor *tensor = THLongTensor_newWithSize(size, stride);
      tensorDataPtr = (void *)(THLongTensor_data(tensor));
      numBytes = THLongTensor_nElement(tensor) * arr.word_size;
      luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.LongTensor"));
    }
  }else{
    printf("array type unsupported");
    throw std::runtime_error("unsupported data type");
  }

  // now copy the data
  assert(tensorDataPtr);
  memcpy(tensorDataPtr, (void *)(arr.data<void>()), numBytes);
}
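
/*
 * A minimal usage sketch, not part of the original source. It assumes the
 * cnpy variant bundled with this loader exposes cnpy::npy_load and the
 * arrayType/word_size fields used above, and that lauxlib is available.
 * The binding name load_npy is hypothetical; the real module may register a
 * different entry point.
 */
static int load_npy(lua_State *L)
{
  const char *fname = luaL_checkstring(L, 1);
  cnpy::NpyArray arr = cnpy::npy_load(fname); // read the .npy file into host memory
  load_array_to_lua(L, arr);                  // wrap it in a torch tensor on the Lua stack
  return 1;                                   // one return value: the tensor
}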
void THNN_(IndexLinear_accGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THTensor *valuesBuffer,
          accreal weightDecay_,
          accreal scale_)
{
  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t keysSize = THLongTensor_size(keys, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int64_t maxNormalize = (woutDim - outDim) > 0 ? 1 : 0;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
  int64_t* sizesData = THLongTensor_data(sizes);

  /* Compute the cumulative sizes */
  THLongTensor* cumSizes = THLongTensor_new();
  THLongTensor_cumsum(cumSizes, sizes, 0);
  int64_t* cumSizesData = THLongTensor_data(cumSizes);

  /* Resize the gradWeight buffer to keep it dense.
   * That speeds up updates A LOT assuming random mem access. */
  THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1));

  /* Access the storage data/strides */
  scalar_t* gradOutputData = gradOutput->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* gradWeightData = gradWeight->data<scalar_t>();
  scalar_t* gradBiasData = gradBias->data<scalar_t>();

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous");

  int i,j,k;

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    for (j = 0; j < batchSize; j++)
    {
      int64_t offset = j==0?0:cumSizesData[j-1];
      scalar_t val = gradOutputData[j] * scale;
      scalar_t* lgradWeightData = gradWeightData + offset;
      scalar_t* lvaluesData = valuesData + offset;
      int64_t end = sizesData[j];

      if (maxNormalize)
      {
        lgradWeightData += offset;
        i = 0;
        for(;i < end; i++)
        {
          lgradWeightData[2*i] = val;
          lgradWeightData[2*i+1] = val * lvaluesData[i];
        }
      }
      else
      {
        i = 0;
        for(;i < end-4; i += 4)
        {
          lgradWeightData[i] = val * lvaluesData[i];
          lgradWeightData[i+1] = val * lvaluesData[i+1];
          lgradWeightData[i+2] = val * lvaluesData[i+2];
          lgradWeightData[i+3] = val * lvaluesData[i+3];
        }

        for(; i < end; i++)
        {
          lgradWeightData[i] = val * lvaluesData[i];
        }
      }
      *gradBiasData += val;
      offset += end;
    }
  }
  else
  {
    for (j = 0; j < batchSize; j++)
    {
      int64_t offset = j==0?0:cumSizesData[j-1];
      scalar_t* lgradOutputData = gradOutputData + j*outDim;
      scalar_t* lgradWeightData = gradWeightData;
      THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim);
      for (i = 0; i < sizesData[j]; i++)
      {
        scalar_t val = valuesData[offset] * scale;
        lgradWeightData = gradWeightData + offset*outDim;
        if (maxNormalize)
        {
          lgradWeightData += offset*outDim;
          k = 0;
          for(;k < outDim-4; k += 4)
          {
            lgradWeightData[k] = lgradOutputData[k]*scale;
            lgradWeightData[k+1] = lgradOutputData[k+1]*scale;
            lgradWeightData[k+2] = lgradOutputData[k+2]*scale;
            lgradWeightData[k+3] = lgradOutputData[k+3]*scale;
          }

          for(; k < outDim; k++)
          {
            lgradWeightData[k] = lgradOutputData[k]*scale;
          }
          lgradWeightData += outDim;
        }
        k = 0;
        for(;k < outDim-4; k += 4)
        {
          lgradWeightData[k] = val * lgradOutputData[k];
          lgradWeightData[k+1] = val * lgradOutputData[k+1];
          lgradWeightData[k+2] = val * lgradOutputData[k+2];
          lgradWeightData[k+3] = val * lgradOutputData[k+3];
        }

        for(; k < outDim; k++)
        {
          lgradWeightData[k] = val * lgradOutputData[k];
        }
        offset++;
      }
    }
  }
  THLongTensor_free(cumSizes);
  return;
}
void THNN_(IndexLinear_accUpdateGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  scalar_t weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");

  /* Access the storage data/strides */
  scalar_t* gradOutputData = gradOutput->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* weightData = weight->data<scalar_t>();
  scalar_t* biasData = bias->data<scalar_t>();
  int64_t weightStride0 = weight->stride(0);
  int64_t* keysData = THLongTensor_data(keys);
  int64_t* sizesData = THLongTensor_data(sizes);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias matrix must be contiguous");

  int i,j,k;

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    if (maxNormalize)
    {
      int64_t offset = 0;
      for (j = 0; j < batchSize; j++)
      {
        scalar_t* lgradOutputData = gradOutputData + j;
        *biasData -= *lgradOutputData * scale;
        scalar_t val = *lgradOutputData * scale;
        for (i = 0; i < sizesData[j]; i++)
        {
          int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
          weightData[idx-1] -= weightData[idx]*val*weightData[idx-2];
          weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2];
          offset++;
        }
      }

      offset = 0;
      for (j = 0; j < batchSize; j++)
      {
        for (i = 0; i < sizesData[j]; i++)
        {
          int64_t idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
          weightData[idx-2] = 0;
          offset++;
        }
      }
    }
    else
    {
      if (weightDecay)
      {
        int64_t offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          scalar_t* lgradOutputData = gradOutputData + j;
          *biasData -= *lgradOutputData * scale;
          scalar_t val = *lgradOutputData * scale;
          for (i = 0; i < sizesData[j]; i++)
          {
            int64_t idx = weightStride0*(keysData[offset] + keysOffset);
            weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay;
            offset++;
          }
        }
      }
      else
      {
        int64_t offset = 0;
        for (j = 0; j < batchSize; j++)
        {
          scalar_t val = gradOutputData[j] * scale;
          for (i = 0; i < sizesData[j]; i++)
          {
            weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset];
            offset++;
          }
          *biasData -= val;
        }
      }
    }
  }
  else
  {
    int64_t offset = 0;
    for (j = 0; j < batchSize; j++)
    {
      scalar_t* lgradOutputData = gradOutputData + j*outDim;
      scalar_t* lweightData = weightData;
      THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim);
      for (i = 0; i < sizesData[j]; i++)
      {
        scalar_t val = valuesData[offset] * scale;
        scalar_t wd = weightDecay;

        // Max normalize case
        if (maxNormalize)
        {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          val *= lweightData[0];
          wd *= lweightData[0];
          for (k=0; k < outDim; k++)
          {
            lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0];
          }
          lweightData += 2;
        }
        else
        {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset);
        }

        /* We do sparse weight decay.
         * We think it makes more sense. */
        if (weightDecay)
        {
          if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
          {
            THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1);
          }
          else
          {
            for (k=0; k < outDim; k++)
            {
              lweightData[k] -= wd * lweightData[k];
            }
          }
        }

        if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
        {
          THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1);
        }
        else
        {
          for (k=0; k < outDim; k++)
          {
            lweightData[k] -= val * lgradOutputData[k];
          }
        }
        offset++;
      }
    }

    /* Max Normalize case:
     * Reset the smart update scaling if
     * one does it batch-wise.
     * TODO: Decide what to do with that piece of code.
     * NB: If the code below is uncommented, so should the commented
     * code in IndexLinear:zeroGradParameters() */

    /*
    if (maxNormalize)
    {
      offset = 0;
      for (j = 0; j < batchSize; j++)
      {
        scalar_t* lweightData = weightData;
        for (i = 0; i < sizesData[j]; i++)
        {
          scalar_t val = valuesData[offset] * scale;
          scalar_t wd = weightDecay;

          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          lweightData[0] = 0;
          offset++;
        }
      }
    }
    */
  }
  return;
}
void THNN_(IndexLinear_updateOutput)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *output,
          THTensor *weight,
          THTensor *bias,
          THTensor *normalizedValues,
          int train)
{
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t keysSize = THLongTensor_size(keys, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  int64_t* sizesData = THLongTensor_data(sizes);
  int64_t* cumSumSizesData = THLongTensor_data(cumSumSizes);

  /* Define/resize the normalized values tensor if maxNormalize is > 0 */
  scalar_t* normalizedValuesData = NULL;
  if (maxNormalize)
  {
    THTensor_(resize1d)(normalizedValues, keysSize);
    normalizedValuesData = normalizedValues->data<scalar_t>();
  }

  /* Resize the output */
  THTensor_(resize2d)(output, batchSize, outDim);

  /* Access the storage data/strides */
  scalar_t* outputData = output->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* weightData = weight->data<scalar_t>();
  int64_t weightStride0 = weight->stride(0);
  scalar_t* biasData = bias->data<scalar_t>();
  int64_t* keysData = THLongTensor_data(keys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(output), 6, "output vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
  THArgCheck(THTensor_(isContiguous)(normalizedValues), 9, "normalizedValues vector must be contiguous");

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations. */
  if (outDim == 1)
  {
    THVector_(fill)(outputData, *biasData, batchSize);
    if (maxNormalize)
    {
      /* Parallelize on the batch itself */
      auto loop = [&](int64_t start, int64_t end)
      {
        for (auto j = start; j < end; j++)
        {
          scalar_t* loutputData = outputData + j;
          scalar_t val = 0;
          scalar_t absVal = 0;
          int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1];

          for (auto i = 0; i < sizesData[j]; i++)
          {
            int64_t woffset = weightStride0*(keysData[offset] + keysOffset);
            absVal = fabs(valuesData[offset]);
            if (train)
            {
              if (absVal > weightData[woffset])
              {
                weightData[woffset] = absVal;
                weightData[woffset+1] = 1/absVal;
              }

              /*
               * The following can be used to scale the size of the updates
               * depending on some rule, e.g. the frequency of a feature, ...
               * This is used at update time.
               * TODO: implement a smarter update scale.
               */
              weightData[woffset+2] = 1;
            }
            normalizedValuesData[offset] = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(valuesData[offset]):valuesData[offset]*weightData[woffset+1]) + weightData[woffset+3];
            val += normalizedValuesData[offset] * weightData[woffset+maxNormalize];
            offset++;
          }
          *loutputData += val;
        }
      };

      if (keysSize * outDim > THNN_SPARSE_OMP_THRESHOLD) {
        at::parallel_for(0, batchSize, 1, loop);
      } else {
        loop(0, batchSize);
      }
    }
    else
    {
      /* Parallelize on the batch itself */
      auto loop = [&](int64_t start, int64_t end)
      {
        for (auto j = start; j < end; j++)
        {
          int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1];
          scalar_t* loutputData = outputData + j;
          scalar_t val = 0;

          for (auto i = 0; i < sizesData[j]; i++)
          {
            val += weightData[weightStride0*(keysData[offset] + keysOffset)] * valuesData[offset];
            offset++;
          }
          *loutputData += val;
        }
      };

      if (keysSize * outDim > THNN_SPARSE_OMP_THRESHOLD) {
        at::parallel_for(0, batchSize, 1, loop);
      } else {
        loop(0, batchSize);
      }
    }
  }
  else
  {
    auto loop = [&](int64_t start, int64_t end)
    {
      for (auto j = start; j < end; j++)
      {
        int64_t offset = j == 0 ? 0 : cumSumSizesData[j - 1];
        scalar_t val;
        scalar_t* loutputData = outputData + j*outDim;
        scalar_t* lweightData = weightData;
        memcpy(loutputData, biasData, outDim*sizeof(scalar_t));
        for (auto i = 0; i < sizesData[j]; i++)
        {
          int64_t woffset = weightStride0*(keysData[offset] + keysOffset);
          if (maxNormalize)
          {
            val = valuesData[offset];
            scalar_t absVal = fabs(val);
            if (train)
            {
              if (absVal > weightData[woffset])
              {
                weightData[woffset] = absVal;
                weightData[woffset+1] = 1/absVal;
              }

              /*
               * The following can be used to scale the size of the updates
               * depending on some rule, e.g. the frequency of a feature, ...
               * The commented section thereafter is just an example of what can be done:
               *
               *```
               * weightData[woffset+2] = weightData[woffset+2]==0?1:(weightData[woffset+2] / (weightData[woffset+2] + 1));
               * scalar_t alpha = 1;
               * scalar_t beta = 0.01;
               * scalar_t gamma = 1 - 0.000001;
               * scalar_t l = weightData[woffset+2]==0?1/gamma:(weightData[woffset+2] - beta) / (alpha - beta);
               * l = gamma*l;
               * weightData[woffset+2] = (alpha-beta)*l + beta;
               *```
               *
               * TODO: implement a smarter update scale.
               */
              weightData[woffset+2] = 1;
            }

            /* Normalize + Clamp */
            val = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(val):val*weightData[woffset+1]) + weightData[woffset+3];
            normalizedValuesData[offset] = val;

            lweightData = weightData + woffset + maxNormalize;
          }
          else
          {
            val = valuesData[offset];
            lweightData = weightData + woffset;
          }
          if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
          {
            THBlas_(axpy)(outDim, val, lweightData, 1, loutputData, 1);
          }
          else
          {
            for (auto k = 0; k < outDim; k++)
            {
              loutputData[k] += lweightData[k] * val;
            }
          }
          offset++;
        }
      }
    };

    if (keysSize * outDim > THNN_SPARSE_OMP_THRESHOLD) {
      at::parallel_for(0, batchSize, 1, loop);
    } else {
      loop(0, batchSize);
    }
  }
  return;
}
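
/*
 * A minimal reference sketch, not part of the original source, of what the
 * sparse forward pass above computes for a single sample when maxNormalize
 * is off: output = bias + sum_i values[i] * weight[keys[i] + keysOffset].
 * The function and parameter names below are hypothetical, for illustration
 * only.
 */
static void example_dense_forward_one_sample(
    const scalar_t* weightData, int64_t weightStride0,
    const scalar_t* biasData,
    const int64_t* keysData, const scalar_t* valuesData,
    int64_t nKeys, int64_t keysOffset, int64_t outDim,
    scalar_t* outputData)
{
  memcpy(outputData, biasData, outDim*sizeof(scalar_t)); /* start from the bias */
  for (int64_t i = 0; i < nKeys; i++)
  {
    const scalar_t* row = weightData + weightStride0*(keysData[i] + keysOffset);
    for (int64_t k = 0; k < outDim; k++)
    {
      outputData[k] += valuesData[i] * row[k];           /* accumulate the sparse product */
    }
  }
}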
void THNN_(IndexLinear_updateParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THLongTensor *runningKeys,
          THLongTensor *cumSumSizes,
          int64_t keysOffset,
          accreal weightDecay_,
          accreal learningRate_)
{
  scalar_t weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  scalar_t learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  /* Retrieve all the dimensions of the problem */
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  int64_t keysSize = THLongTensor_size(runningKeys, 0);

  /* Access the storage data/strides */
  scalar_t* gradWeightData = gradWeight->data<scalar_t>();
  scalar_t* weightData = weight->data<scalar_t>();
  int64_t weightStride0 = weight->stride(0);
  scalar_t* gradBiasData = gradBias->data<scalar_t>();
  scalar_t* biasData = bias->data<scalar_t>();
  int64_t* keysData = THLongTensor_data(runningKeys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 3, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 4, "bias vector must be contiguous");
  THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous");

  int j, k;

  /* Update the bias first */
  THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim);

  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style) */
  if (outDim == 1)
  {
    if (maxNormalize)
    {
      if (weightDecay)
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          scalar_t lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset];
        }
      }
      else
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          scalar_t lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr;
        }
      }
    }
    else
    {
      if (weightDecay)
      {
        for (j = 0; j < keysSize; j++)
        {
          int64_t woffset = weightStride0*(keysData[j] + keysOffset);
          weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset];
        }
      }
      else
      {
        for (j = 0; j < keysSize; j++)
        {
          weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate;
        }
      }
    }
  }
  else
  {
    for (j = 0; j < keysSize; j++)
    {
      scalar_t lr = learningRate;
      scalar_t wd = weightDecay;
      scalar_t* lweightData;
      int64_t woffset = weightStride0*(keysData[j] + keysOffset);
      scalar_t* lgradWeightData = gradWeightData + j*outDim;
      if (maxNormalize)
      {
        lgradWeightData += j*outDim;
        /* weightData[woffset + 2] */
        lweightData = weightData + woffset + maxNormalize - 2;
        lr = lr*lweightData[0];
        wd = weightDecay*lweightData[0];
        /* weightData[woffset + 3] */
        lweightData++;
        for (k=0; k < outDim; k++)
        {
          lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr;
        }
        lweightData++;
        lgradWeightData += outDim;
      }
      else
      {
        lweightData = weightData + woffset;
      }

      /* We do sparse weight decay.
       * We think it makes more sense. */
      if (weightDecay)
      {
        for (k=0; k < outDim; k++)
        {
          lweightData[k] -= lweightData[k]*wd;
        }
      }

      if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
      {
        THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1);
      }
      else
      {
        for (k=0; k < outDim; k++)
        {
          lweightData[k] -= lgradWeightData[k]*lr;
        }
      }
    }
  }
}
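
/*
 * A minimal sketch, not part of the original source, of the per-row update
 * applied above when maxNormalize is off: w <- w - wd*w - lr*g, with the
 * weight decay applied only to rows touched by the batch ("sparse weight
 * decay"). The helper and its names are hypothetical, for illustration only.
 */
static void example_sparse_sgd_row_update(
    scalar_t* lweightData, const scalar_t* lgradWeightData,
    int64_t outDim, scalar_t learningRate, scalar_t weightDecay)
{
  for (int64_t k = 0; k < outDim; k++)
  {
    /* decay only the rows that appear in the batch */
    lweightData[k] -= lweightData[k]*weightDecay;
    /* standard SGD step on this row */
    lweightData[k] -= lgradWeightData[k]*learningRate;
  }
}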