void THNN_(SparseLinear_accGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, THTensor *bias, real weightDecay, real scale) { long h, i; long outDim = THTensor_(size)(weight, 0); long inDim = THTensor_(size)(weight, 1); THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3"); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); THArgCheck(THTensor_(isContiguous)(gradOutput), 1, "gradOutput must be contiguous"); long nnz = THTensor_(size)(input, 0); // THTensor_(resize2d)(gradOutput, batchSize, outDim); // gradWeight += gradOutput * input #pragma omp parallel for private(h, i) schedule(static) if (\ nnz * outDim > 10000) for (i = 0; i < nnz; i++) { real val = scale * THNN_(get2d)(input, i, 2); long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; long h = (long)(THNN_(get2d)(input, i, 0)) - 1; if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, ROW_PTR2(gradOutput, h), gradOutput->stride[1], COL_PTR2(gradWeight, offset), gradWeight->stride[0]); } else { THError( "index out of bound. accGradParameters: %d not between 1 and %d", offset + 1, inDim); } } // gradBias += gradOutput THTensor* buf = THTensor_(new)(); THTensor_(sum)(buf, gradOutput, 0); THTensor_(cadd)(gradBias, gradBias, scale, buf); THTensor_(free)(buf); if (weightDecay != 0) { THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); } }
/*
 * Forward pass for SparseLinear with the legacy batched-sparse input layout.
 *
 * input:  batchSize x nnz x 2 tensor; each (index, value) pair uses a
 *         1-based column index; a value of exactly 0 is skipped.
 * output: resized here to batchSize x outDim; must be contiguous.
 * weight: outDim x inDim (a contiguous copy is taken and freed locally).
 * bias:   outDim, added row-wise at the end.
 *
 * Computes output = input * weight^T + bias, row by row via axpy.
 */
void THNN_(SparseLinear_legacyUpdateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output,
    THTensor *weight,
    THTensor *bias)
{
  int64_t h, i;
  int64_t outDim = THTensor_(size)(weight, 0);
  int64_t inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkLegacyInput)(input), 2,
             "input size must be batchsize x nnz x 2");
  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

  weight = THTensor_(newContiguous)(weight);

  int64_t batchSize = THTensor_(size)(input, 0);
  int64_t nnz = THTensor_(size)(input, 1);
  THTensor_(resize2d)(output, batchSize, outDim);

  // output = weight * input + bias
  THTensor_(zero)(output);
#pragma omp parallel for private(h, i) schedule(static) if ( \
  batchSize > 1 && batchSize * nnz * outDim > 10000)
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      real val = THNN_(get3d)(input, h, i, 1);
      if (val == 0) {
        continue;
      }

      // Legacy format uses 1-based column indices.
      int64_t offset = (int64_t)(THNN_(get3d)(input, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        THBlas_(axpy)(outDim,
                      val,
                      COL_PTR2(weight, offset), weight->stride[0],
                      ROW_PTR2(output, h), output->stride[1]);
      } else {
        // %lld + (long long) cast matches the int64_t operands (was %d: UB).
        THError("index out of bound. updateOutput: %lld not between 1 and %lld",
                (long long)(offset + 1), (long long)inDim);
      }
    }
  }

  // Add bias to every output row.
  THTensor* output_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(output_row, output, 0, h);
    THTensor_(cadd)(output_row, bias, 1.0, output_row);
  }
  THTensor_(free)(output_row);
  THTensor_(free)(weight);
}
/*
 * Forward pass for SparseLinear with COO-format input.
 *
 * input:  nnz x 3 tensor, rows of (1-based sample index, 1-based column, value);
 *         a value of exactly 0 is skipped.
 * output: batchSize x outDim (batchSize read from output), must be contiguous.
 * weight: outDim x inDim.
 * bias:   outDim, added row-wise at the end.
 *
 * NOTE(review): the parallel loop may write the same output row from two
 * threads when two nonzeros share a sample index -- this mirrors the
 * original behavior; confirm upstream whether rows are grouped per thread.
 */
void THNN_(SparseLinear_updateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output,
    THTensor *weight,
    THTensor *bias)
{
  long h, i;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);
  long batchSize = THTensor_(size)(output, 0);

  THArgCheck(THNN_(checkInput)(input), 2,
             "input must be in coo format, nnz x 3");
  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

  long nnz = THTensor_(size)(input, 0);

  // output = weight * input + bias
  THTensor_(zero)(output);
#pragma omp parallel for private(i) schedule(static) if (nnz * outDim > 10000)
  for (i = 0; i < nnz; i++) {
    real val = THNN_(get2d)(input, i, 2);
    if (val == 0) {
      continue;
    }

    // COO indices are 1-based; convert to 0-based.
    long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
    long h = (long)(THNN_(get2d)(input, i, 0)) - 1;
    if (offset >= 0 && offset < inDim) {
      THBlas_(axpy)(outDim,
                    val,
                    COL_PTR2(weight, offset), weight->stride[0],
                    ROW_PTR2(output, h), output->stride[1]);
    } else {
      // %ld matches the long-typed arguments (was %d: UB per C11 fprintf).
      THError("index out of bound. updateOutput: %ld not between 1 and %ld",
              offset + 1, inDim);
    }
  }

  // Add bias to every output row.
  THTensor* output_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(output_row, output, 0, h);
    THTensor_(cadd)(output_row, bias, 1.0, output_row);
  }
  THTensor_(free)(output_row);
}
/*
 * Forward pass for SparseLinear with COO-format input, CSR-accelerated.
 *
 * input:  nnz x 3 tensor, rows of (1-based sample index, 1-based column, value),
 *         assumed sorted by sample index (the CSR build below relies on this);
 *         a value of exactly 0 is skipped.
 * output: batchSize x outDim (batchSize read from output), must be contiguous.
 * weight: outDim x inDim (a contiguous copy is taken and freed locally).
 * bias:   outDim, added row-wise at the end.
 *
 * Builds a csr row-pointer array (batchSize+1 entries) so each output row is
 * processed by exactly one thread, then computes output = input*weight^T + bias.
 */
void THNN_(SparseLinear_updateOutput)(
    THNNState *state,
    THTensor *input,
    THTensor *output,
    THTensor *weight,
    THTensor *bias)
{
  int64_t h, i, hp0, hp1;
  int64_t outDim = THTensor_(size)(weight, 0);
  int64_t inDim = THTensor_(size)(weight, 1);
  int64_t batchSize = THTensor_(size)(output, 0);

  THArgCheck(THNN_(checkInput)(input), 2,
             "input must be in coo format, nnz x 3");
  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

  int64_t nnz = THTensor_(size)(input, 0);

  // csr[h] .. csr[h+1] delimits the nonzeros belonging to sample h.
  THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1);
  THLongTensor_zero(csr);
  weight = THTensor_(newContiguous)(weight);

//#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
  for (i = 0; i < nnz; i++) {
    hp0 = (int64_t)(THNN_(get2d)(input, i, 0)) - 1;
    hp1 = (i+1 == nnz) ? batchSize : (int64_t)(THNN_(get2d)(input, i+1, 0)) - 1;
    if (hp0 != hp1) {
      // Row boundary: record the end offset for every row ending here.
      for (h = hp0; h < hp1; h++) {
        THLongTensor_set1d(csr, h+1, i+1);
      }
    }
  }

  // output = weight * input + bias
  THTensor_(zero)(output);
#pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000)
  for (h = 0; h < batchSize; h++) {
    int64_t i_start = THLongTensor_get1d(csr, h);
    int64_t i_end = THLongTensor_get1d(csr, h+1);
    for (i = i_start; i < i_end; i++) {
      real val = THNN_(get2d)(input, i, 2);
      if (val == 0) {
        continue;
      }

      // COO column indices are 1-based; convert to 0-based.
      int64_t offset = (int64_t)(THNN_(get2d)(input, i, 1)) - 1;
      if (offset >= 0 && offset < inDim) {
        THBlas_(axpy)(outDim,
                      val,
                      COL_PTR2(weight, offset), weight->stride[0],
                      ROW_PTR2(output, h), output->stride[1]);
      } else {
        // %lld + (long long) cast matches the int64_t operands (was %d: UB).
        THError("index out of bound. updateOutput: %lld not between 1 and %lld",
                (long long)(offset + 1), (long long)inDim);
      }
    }
  }

  // Add bias to every output row.
  THTensor* output_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(output_row, output, 0, h);
    THTensor_(cadd)(output_row, bias, 1.0, output_row);
  }
  THTensor_(free)(output_row);
  THLongTensor_free(csr);
  THTensor_(free)(weight);
}
void THNN_(SparseLinear_legacyAccGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, THTensor *bias, accreal weightDecay_, accreal scale_) { real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); int64_t h, i; int64_t outDim = THTensor_(size)(weight, 0); int64_t inDim = THTensor_(size)(weight, 1); THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2"); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); THArgCheck(THTensor_(isContiguous)(gradOutput), 1, "gradOutput must be contiguous"); int64_t batchSize = THTensor_(size)(input, 0); int64_t nnz = THTensor_(size)(input, 1); THTensor_(resize2d)(gradOutput, batchSize, outDim); // gradWeight += gradOutput * input #pragma omp parallel for private(h, i) schedule(static) if (\ batchSize * nnz * outDim > 10000) for (i = 0; i < nnz; i++) { for (h = 0; h < batchSize; h++) { real val = scale * THNN_(get3d)(input, h, i, 1); if (val == 0) { continue; } int64_t offset = (int64_t)(THNN_(get3d)(input, h, i, 0)) - 1; if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, ROW_PTR2(gradOutput, h), gradOutput->stride[1], COL_PTR2(gradWeight, offset), gradWeight->stride[0]); } else { THError( "index out of bound. accGradParameters: %d not between 1 and %d", offset + 1, inDim); } } } // gradBias += gradOutput THTensor* gradOutput_row = THTensor_(new)(); for (h = 0; h < batchSize; h++) { THTensor_(select)(gradOutput_row, gradOutput, 0, h); THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row); } THTensor_(free)(gradOutput_row); if (weightDecay != 0) { THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); } }
void THNN_(SparseLinear_accGradParameters)( THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *weight, THTensor *bias, accreal weightDecay_, accreal scale_) { real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); int64_t h, i, col, hp0, hp1; int64_t outDim = THTensor_(size)(weight, 0); int64_t inDim = THTensor_(size)(weight, 1); THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3"); THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); THArgCheck(THTensor_(isContiguous)(gradOutput), 1, "gradOutput must be contiguous"); int64_t nnz = THTensor_(size)(input, 0); THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1); THLongTensor_zero(csc); weight = THTensor_(newContiguous)(weight); #pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) for (i = 0; i < nnz; i++) { hp0 = (int64_t)(THNN_(get2d)(input, i, 1)) - 1; hp1 = (i+1 == nnz) ? inDim : (int64_t)(THNN_(get2d)(input, i+1, 1)) - 1; if (hp0 != hp1) for (h = hp0; h < hp1; h++) { THLongTensor_set1d(csc, h+1, i+1); } } // gradWeight += gradOutput * input #pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000) for (col = 0; col < inDim; col++) { int64_t i_start = THLongTensor_get1d(csc, col); int64_t i_end = THLongTensor_get1d(csc, col+1); for (i = i_start; i < i_end; i++) { real val = scale * THNN_(get2d)(input, i, 2); h = (int64_t)(THNN_(get2d)(input, i, 0)) - 1; int64_t offset = (int64_t)(THNN_(get2d)(input, i, 1)) - 1; if (offset >= 0 && offset < inDim) { THBlas_(axpy)(outDim, val, ROW_PTR2(gradOutput, h), gradOutput->stride[1], COL_PTR2(gradWeight, offset), gradWeight->stride[0]); } else { THError( "index out of bound. 
accGradParameters: %d not between 1 and %d", offset + 1, inDim); } } } // gradBias += gradOutput THTensor* buf = THTensor_(new)(); THTensor_(sum)(buf, gradOutput, 0, 1); THTensor_(cadd)(gradBias, gradBias, scale, buf); THTensor_(free)(buf); THLongTensor_free(csc); if (weightDecay != 0) { THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); } THTensor_(free)(weight); }