void THNN_(SparseLinear_legacyUpdateParameters)(
          THNNState *state,
          THTensor *weight,
          THTensor *bias,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput,
          accreal learningRate_)
{
  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  int64_t h, i;
  int64_t outDim = weight->size[0];
  int64_t inDim = weight->size[1];

  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
  THArgCheck(THNN_(checkLegacyInput)(lastInput), 6,
             "input size must be batchsize x nnz x 2");

  int64_t batchSize = THTensor_(size)(lastInput, 0);
  int64_t nnz = THTensor_(size)(lastInput, 1);

  // Collect unique offsets of non-0 values in the input
  THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz);
  int64_t cnt = 0;
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      real val = THNN_(get3d)(lastInput, h, i, 1);
      if (val == 0) {
        continue;
      }
      int64_t offset = (int64_t)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        THNN_(set1d)(offsets, cnt++, offset);
      } else {
        THError(
            "index out of bounds. updateParameters: %ld not between 1 and %ld",
            (long)(offset + 1), (long)inDim);
      }
    }
  }
  THTensor_(resize1d)(offsets, cnt);

  // Sort the offsets, then compact consecutive duplicates in place
  THTensor* uniqueOffsets = THTensor_(new)();
  THLongTensor* ri = THLongTensor_new();
  THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
  THLongTensor_free(ri);
  THTensor_(free)(offsets);

  cnt = 1;
  real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
  for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
    if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
      uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
    }
  }
  THTensor_(resize1d)(uniqueOffsets, cnt);

  // bias += -learningRate * gradBias
  THTensor_(cadd)(bias, bias, -learningRate, gradBias);

  // weight += -learningRate * gradWeight, restricted to the touched columns
#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
  for (i = 0; i < cnt; i++) {
    int64_t offset = (int64_t)uniqueOffsets_p[i];
    THBlas_(axpy)(outDim,
                  -learningRate,
                  COL_PTR2(gradWeight, offset), gradWeight->stride[0],
                  COL_PTR2(weight, offset), weight->stride[0]);
  }

  THTensor_(free)(uniqueOffsets);
}
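/* Usage sketch (illustrative only, not part of the library): the legacy
 * sparse layout consumed above is a batchSize x nnz x 2 tensor, where
 * entry [h][i][0] holds a 1-based column index and entry [h][i][1] holds
 * the corresponding value. The hypothetical helper below builds a
 * single-sample input with nonzeros at columns 3 and 7, using the
 * standard TH accessors; the name buildLegacySample is ours, not THNN's.
 */
static THTensor* THNN_(buildLegacySample)(void)
{
  /* 1 sample, 2 nonzero entries, 2 fields (index, value) per entry */
  THTensor *input = THTensor_(newWithSize3d)(1, 2, 2);
  THTensor_(set3d)(input, 0, 0, 0, 3);    /* 1-based column index */
  THTensor_(set3d)(input, 0, 0, 1, 0.5);  /* value at column 3 */
  THTensor_(set3d)(input, 0, 1, 0, 7);    /* 1-based column index */
  THTensor_(set3d)(input, 0, 1, 1, 1.25); /* value at column 7 */
  return input;
}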
void THNN_(IndexLinear_accGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          int64_t keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THTensor *valuesBuffer,
          accreal weightDecay_,
          accreal scale_)
{
  scalar_t scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  /* Retrieve all the dimensions of the problem */
  int64_t batchSize = THLongTensor_size(sizes, 0);
  int64_t keysSize = THLongTensor_size(keys, 0);
  int64_t outDim = THTensor_(size)(bias, 0);
  int64_t woutDim = THTensor_(size)(weight, 1);
  int64_t maxNormalize = (woutDim - outDim) > 0 ? 1 : 0;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1,
             "Keys and values should have the same number of elements");
  int64_t* sizesData = THLongTensor_data(sizes);

  /* Compute the cumulative sizes */
  THLongTensor* cumSizes = THLongTensor_new();
  THLongTensor_cumsum(cumSizes, sizes, 0);
  int64_t* cumSizesData = THLongTensor_data(cumSizes);

  /* Resize the gradWeight buffer to keep it dense.
   * That speeds up updates A LOT assuming random mem access. */
  THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize > 0 ? 2 : 1));

  /* Access the storage data/strides */
  scalar_t* gradOutputData = gradOutput->data<scalar_t>();
  scalar_t* valuesData = values->data<scalar_t>();
  scalar_t* gradWeightData = gradWeight->data<scalar_t>();
  scalar_t* gradBiasData = gradBias->data<scalar_t>();

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous");

  int64_t i, j, k;

  /* Separate cases: output dimension is == 1, or > 1.
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style). */
  if (outDim == 1) {
    for (j = 0; j < batchSize; j++) {
      int64_t offset = j == 0 ? 0 : cumSizesData[j - 1];
      scalar_t val = gradOutputData[j] * scale;
      scalar_t* lgradWeightData = gradWeightData + offset;
      scalar_t* lvaluesData = valuesData + offset;
      int64_t end = sizesData[j];

      if (maxNormalize) {
        /* Two entries per key: the raw scaled gradient (for max-norm
         * bookkeeping) and the value-weighted gradient */
        lgradWeightData += offset;
        i = 0;
        for (; i < end; i++) {
          lgradWeightData[2 * i] = val;
          lgradWeightData[2 * i + 1] = val * lvaluesData[i];
        }
      } else {
        /* Unrolled by 4, with a scalar tail loop */
        i = 0;
        for (; i < end - 4; i += 4) {
          lgradWeightData[i] = val * lvaluesData[i];
          lgradWeightData[i + 1] = val * lvaluesData[i + 1];
          lgradWeightData[i + 2] = val * lvaluesData[i + 2];
          lgradWeightData[i + 3] = val * lvaluesData[i + 3];
        }
        for (; i < end; i++) {
          lgradWeightData[i] = val * lvaluesData[i];
        }
      }
      *gradBiasData += val;
    }
  } else {
    for (j = 0; j < batchSize; j++) {
      int64_t offset = j == 0 ? 0 : cumSizesData[j - 1];
      scalar_t* lgradOutputData = gradOutputData + j * outDim;
      scalar_t* lgradWeightData = gradWeightData;
      THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim);
      for (i = 0; i < sizesData[j]; i++) {
        scalar_t val = valuesData[offset] * scale;
        lgradWeightData = gradWeightData + offset * outDim;
        if (maxNormalize) {
          /* First outDim-sized block: raw scaled gradient */
          lgradWeightData += offset * outDim;
          k = 0;
          for (; k < outDim - 4; k += 4) {
            lgradWeightData[k] = lgradOutputData[k] * scale;
            lgradWeightData[k + 1] = lgradOutputData[k + 1] * scale;
            lgradWeightData[k + 2] = lgradOutputData[k + 2] * scale;
            lgradWeightData[k + 3] = lgradOutputData[k + 3] * scale;
          }
          for (; k < outDim; k++) {
            lgradWeightData[k] = lgradOutputData[k] * scale;
          }
          lgradWeightData += outDim;
        }
        /* Value-weighted gradient block, unrolled by 4 */
        k = 0;
        for (; k < outDim - 4; k += 4) {
          lgradWeightData[k] = val * lgradOutputData[k];
          lgradWeightData[k + 1] = val * lgradOutputData[k + 1];
          lgradWeightData[k + 2] = val * lgradOutputData[k + 2];
          lgradWeightData[k + 3] = val * lgradOutputData[k + 3];
        }
        for (; k < outDim; k++) {
          lgradWeightData[k] = val * lgradOutputData[k];
        }
        offset++;
      }
    }
  }
  THLongTensor_free(cumSizes);
  return;
}
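/* Layout sketch (illustrative only, not used by the library): when
 * maxNormalize is enabled, the dense gradWeight buffer resized above
 * stores two outDim-sized blocks per key slot -- first scale * gradOutput
 * (the max-norm bookkeeping block), then scale * value * gradOutput (the
 * actual weight-gradient block), matching the two copy loops in the
 * outDim > 1 branch. The hypothetical helper below fills one such row.
 */
static void THNN_(fillGradRowSketch)(scalar_t *row, const scalar_t *gradOut,
                                     scalar_t value, scalar_t scale,
                                     int64_t outDim)
{
  int64_t k;
  for (k = 0; k < outDim; k++) {
    row[k] = scale * gradOut[k];                  /* max-norm block */
    row[outDim + k] = scale * value * gradOut[k]; /* weight-gradient block */
  }
}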