void THNN_(SparseLinear_accGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, THTensor *bias, real weightDecay, real scale) { long h, i; long outDim = THTensor_(size)(weight, 0); long inDim = THTensor_(size)(weight, 1); THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3"); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); THArgCheck(THTensor_(isContiguous)(gradOutput), 1, "gradOutput must be contiguous"); long nnz = THTensor_(size)(input, 0); // THTensor_(resize2d)(gradOutput, batchSize, outDim); // gradWeight += gradOutput * input #pragma omp parallel for private(h, i) schedule(static) if (\ nnz * outDim > 10000) for (i = 0; i < nnz; i++) { real val = scale * THNN_(get2d)(input, i, 2); long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; long h = (long)(THNN_(get2d)(input, i, 0)) - 1; if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, ROW_PTR2(gradOutput, h), gradOutput->stride[1], COL_PTR2(gradWeight, offset), gradWeight->stride[0]); } else { THError( "index out of bound. accGradParameters: %d not between 1 and %d", offset + 1, inDim); } } // gradBias += gradOutput THTensor* buf = THTensor_(new)(); THTensor_(sum)(buf, gradOutput, 0); THTensor_(cadd)(gradBias, gradBias, scale, buf); THTensor_(free)(buf); if (weightDecay != 0) { THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); } }
/*
 * Forward pass for SparseLinear with the legacy batched-sparse input layout.
 *
 * input:  batchSize x nnz x 2 tensor; each (index, value) pair uses a
 *         1-based column index; a value of exactly 0 is skipped.
 * output: resized here to batchSize x outDim; must be contiguous.
 * weight: outDim x inDim (a contiguous copy is taken and freed locally).
 * bias:   outDim, added row-wise at the end.
 *
 * Computes output = input * weight^T + bias, row by row via axpy.
 */
void THNN_(SparseLinear_legacyUpdateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output,
    THTensor *weight,
    THTensor *bias)
{
  int64_t h, i;
  int64_t outDim = THTensor_(size)(weight, 0);
  int64_t inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkLegacyInput)(input), 2,
             "input size must be batchsize x nnz x 2");
  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

  weight = THTensor_(newContiguous)(weight);

  int64_t batchSize = THTensor_(size)(input, 0);
  int64_t nnz = THTensor_(size)(input, 1);
  THTensor_(resize2d)(output, batchSize, outDim);

  // output = weight * input + bias
  THTensor_(zero)(output);
#pragma omp parallel for private(h, i) schedule(static) if ( \
  batchSize > 1 && batchSize * nnz * outDim > 10000)
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      real val = THNN_(get3d)(input, h, i, 1);
      if (val == 0) {
        continue;
      }

      // Legacy format uses 1-based column indices.
      int64_t offset = (int64_t)(THNN_(get3d)(input, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        THBlas_(axpy)(outDim,
                      val,
                      COL_PTR2(weight, offset), weight->stride[0],
                      ROW_PTR2(output, h), output->stride[1]);
      } else {
        // %lld + (long long) cast matches the int64_t operands (was %d: UB).
        THError("index out of bound. updateOutput: %lld not between 1 and %lld",
                (long long)(offset + 1), (long long)inDim);
      }
    }
  }

  // Add bias to every output row.
  THTensor* output_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(output_row, output, 0, h);
    THTensor_(cadd)(output_row, bias, 1.0, output_row);
  }
  THTensor_(free)(output_row);
  THTensor_(free)(weight);
}
/*
 * Forward pass for SparseLinear with COO-format input.
 *
 * input:  nnz x 3 tensor, rows of (1-based sample index, 1-based column, value);
 *         a value of exactly 0 is skipped.
 * output: batchSize x outDim (batchSize read from output), must be contiguous.
 * weight: outDim x inDim.
 * bias:   outDim, added row-wise at the end.
 *
 * NOTE(review): the parallel loop may write the same output row from two
 * threads when two nonzeros share a sample index -- this mirrors the
 * original behavior; confirm upstream whether rows are grouped per thread.
 */
void THNN_(SparseLinear_updateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output,
    THTensor *weight,
    THTensor *bias)
{
  long h, i;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);
  long batchSize = THTensor_(size)(output, 0);

  THArgCheck(THNN_(checkInput)(input), 2,
             "input must be in coo format, nnz x 3");
  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

  long nnz = THTensor_(size)(input, 0);

  // output = weight * input + bias
  THTensor_(zero)(output);
#pragma omp parallel for private(i) schedule(static) if (nnz * outDim > 10000)
  for (i = 0; i < nnz; i++) {
    real val = THNN_(get2d)(input, i, 2);
    if (val == 0) {
      continue;
    }

    // COO indices are 1-based; convert to 0-based.
    long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
    long h = (long)(THNN_(get2d)(input, i, 0)) - 1;
    if (offset >= 0 && offset < inDim) {
      THBlas_(axpy)(outDim,
                    val,
                    COL_PTR2(weight, offset), weight->stride[0],
                    ROW_PTR2(output, h), output->stride[1]);
    } else {
      // %ld matches the long-typed arguments (was %d: UB per C11 fprintf).
      THError("index out of bound. updateOutput: %ld not between 1 and %ld",
              offset + 1, inDim);
    }
  }

  // Add bias to every output row.
  THTensor* output_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(output_row, output, 0, h);
    THTensor_(cadd)(output_row, bias, 1.0, output_row);
  }
  THTensor_(free)(output_row);
}
/*
 * Forward pass for SparseLinear with COO-format input, CSR-accelerated.
 *
 * input:  nnz x 3 tensor, rows of (1-based sample index, 1-based column, value),
 *         assumed sorted by sample index (the CSR build below relies on this);
 *         a value of exactly 0 is skipped.
 * output: batchSize x outDim (batchSize read from output), must be contiguous.
 * weight: outDim x inDim (a contiguous copy is taken and freed locally).
 * bias:   outDim, added row-wise at the end.
 *
 * Builds a csr row-pointer array (batchSize+1 entries) so each output row is
 * processed by exactly one thread, then computes output = input*weight^T + bias.
 */
void THNN_(SparseLinear_updateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output,
    THTensor *weight,
    THTensor *bias)
{
  int64_t h, i, hp0, hp1;
  int64_t outDim = THTensor_(size)(weight, 0);
  int64_t inDim = THTensor_(size)(weight, 1);
  int64_t batchSize = THTensor_(size)(output, 0);

  THArgCheck(THNN_(checkInput)(input), 2,
             "input must be in coo format, nnz x 3");
  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

  int64_t nnz = THTensor_(size)(input, 0);

  // csr[h] .. csr[h+1] delimits the nonzeros belonging to sample h.
  THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1);
  THLongTensor_zero(csr);
  weight = THTensor_(newContiguous)(weight);

//#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
  for (i = 0; i < nnz; i++) {
    hp0 = (int64_t)(THNN_(get2d)(input, i, 0)) - 1;
    hp1 = (i+1 == nnz) ? batchSize : (int64_t)(THNN_(get2d)(input, i+1, 0)) - 1;
    if (hp0 != hp1) {
      // Row boundary: record the end offset for every row ending here.
      for (h = hp0; h < hp1; h++) {
        THLongTensor_set1d(csr, h+1, i+1);
      }
    }
  }

  // output = weight * input + bias
  THTensor_(zero)(output);
#pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000)
  for (h = 0; h < batchSize; h++) {
    int64_t i_start = THLongTensor_get1d(csr, h);
    int64_t i_end = THLongTensor_get1d(csr, h+1);
    for (i = i_start; i < i_end; i++) {
      real val = THNN_(get2d)(input, i, 2);
      if (val == 0) {
        continue;
      }

      // COO column indices are 1-based; convert to 0-based.
      int64_t offset = (int64_t)(THNN_(get2d)(input, i, 1)) - 1;
      if (offset >= 0 && offset < inDim) {
        THBlas_(axpy)(outDim,
                      val,
                      COL_PTR2(weight, offset), weight->stride[0],
                      ROW_PTR2(output, h), output->stride[1]);
      } else {
        // %lld + (long long) cast matches the int64_t operands (was %d: UB).
        THError("index out of bound. updateOutput: %lld not between 1 and %lld",
                (long long)(offset + 1), (long long)inDim);
      }
    }
  }

  // Add bias to every output row.
  THTensor* output_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(output_row, output, 0, h);
    THTensor_(cadd)(output_row, bias, 1.0, output_row);
  }
  THTensor_(free)(output_row);
  THLongTensor_free(csr);
  THTensor_(free)(weight);
}
void THNN_(SparseLinear_legacyAccGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, THTensor *bias, accreal weightDecay_, accreal scale_) { real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); int64_t h, i; int64_t outDim = THTensor_(size)(weight, 0); int64_t inDim = THTensor_(size)(weight, 1); THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2"); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); THArgCheck(THTensor_(isContiguous)(gradOutput), 1, "gradOutput must be contiguous"); int64_t batchSize = THTensor_(size)(input, 0); int64_t nnz = THTensor_(size)(input, 1); THTensor_(resize2d)(gradOutput, batchSize, outDim); // gradWeight += gradOutput * input #pragma omp parallel for private(h, i) schedule(static) if (\ batchSize * nnz * outDim > 10000) for (i = 0; i < nnz; i++) { for (h = 0; h < batchSize; h++) { real val = scale * THNN_(get3d)(input, h, i, 1); if (val == 0) { continue; } int64_t offset = (int64_t)(THNN_(get3d)(input, h, i, 0)) - 1; if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, ROW_PTR2(gradOutput, h), gradOutput->stride[1], COL_PTR2(gradWeight, offset), gradWeight->stride[0]); } else { THError( "index out of bound. accGradParameters: %d not between 1 and %d", offset + 1, inDim); } } } // gradBias += gradOutput THTensor* gradOutput_row = THTensor_(new)(); for (h = 0; h < batchSize; h++) { THTensor_(select)(gradOutput_row, gradOutput, 0, h); THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row); } THTensor_(free)(gradOutput_row); if (weightDecay != 0) { THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); } }
void THNN_(SparseLinear_accGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, THTensor *bias, accreal weightDecay_, accreal scale_) { real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); int64_t h, i, col, hp0, hp1; int64_t outDim = THTensor_(size)(weight, 0); int64_t inDim = THTensor_(size)(weight, 1); THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3"); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); THArgCheck(THTensor_(isContiguous)(gradOutput), 1, "gradOutput must be contiguous"); int64_t nnz = THTensor_(size)(input, 0); THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1); THLongTensor_zero(csc); weight = THTensor_(newContiguous)(weight); #pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) for (i = 0; i < nnz; i++) { hp0 = (int64_t)(THNN_(get2d)(input, i, 1)) - 1; hp1 = (i+1 == nnz) ? inDim : (int64_t)(THNN_(get2d)(input, i+1, 1)) - 1; if (hp0 != hp1) for (h = hp0; h < hp1; h++) { THLongTensor_set1d(csc, h+1, i+1); } } // gradWeight += gradOutput * input #pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000) for (col = 0; col < inDim; col++) { int64_t i_start = THLongTensor_get1d(csc, col); int64_t i_end = THLongTensor_get1d(csc, col+1); for (i = i_start; i < i_end; i++) { real val = scale * THNN_(get2d)(input, i, 2); h = (int64_t)(THNN_(get2d)(input, i, 0)) - 1; int64_t offset = (int64_t)(THNN_(get2d)(input, i, 1)) - 1; if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, ROW_PTR2(gradOutput, h), gradOutput->stride[1], COL_PTR2(gradWeight, offset), gradWeight->stride[0]); } else { THError( "index out of bound. 
accGradParameters: %d not between 1 and %d", offset + 1, inDim); } } } // gradBias += gradOutput THTensor* buf = THTensor_(new)(); THTensor_(sum)(buf, gradOutput, 0, 1); THTensor_(cadd)(gradBias, gradBias, scale, buf); THTensor_(free)(buf); THLongTensor_free(csc); if (weightDecay != 0) { THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); } THTensor_(free)(weight); }