void MKLPackedRecurrentLayer::backwardBatch(int batchSize,
                                            size_t numSequences,
                                            const int* starts) {
  if (!batchGrad_) {
    batchGrad_.reset(new SequenceToBatch(useGpu_));
  }
  batchGrad_->shareIndexWith(*batchValue_);

  size_t numBatch = batchGrad_->getNumBatch();
  bool backwardByBatch = numBatch < numSequences;

  batchGrad_->copyFromSeq(*output_.grad);
  {
    REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str());
    /* backward one batch */
    for (int n = (int)numBatch - 1; n >= 0; n--) {
      MatrixPtr batchGrad = batchGrad_->getBatchValue(n);
      MatrixPtr batchValue =
          batchValue_->getBatchValue(n, batchGrad->getHeight());

      Argument arg;
      arg.value = batchValue;
      arg.grad = batchGrad;
      activation_->backward(arg).check();

      if (n != 0) {
        batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight());
        packed_weightT_->gemm_compute(batchGrad, batchValue);
      }

      if (backwardByBatch && weight_->getWGrad()) {
        if (n != 0) {
          /* backward weight */
          batchValue =
              batchValue_->getBatchValue(n - 1, batchGrad->getHeight());
          weight_->getWGrad()->mul(
              *batchValue->getTranspose(), *batchGrad, 1, 1);
        }
      }
    }
  }

  batchGrad_->copyBackSeq(*output_.grad);

  if (!backwardByBatch && weight_->getWGrad()) {
    REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str());
    for (size_t seq = 0; seq < numSequences; ++seq) {
      int len = starts[seq + 1] - starts[seq];
      weight_->getWGrad()->mul(
          *output_.value
               ->subMatrix(reversed_ ? starts[seq] + 1 : starts[seq], len - 1)
               ->getTranspose(),
          *output_.grad->subMatrix(reversed_ ? starts[seq] : starts[seq] + 1,
                                   len - 1),
          1,
          1);
    }
  }
}
示例#2
0
/**
 * Expects that two matrices agree in width, height, transposed flag and in
 * every element (elements compared with EXPECT_FLOAT_EQ).
 *
 * Elements are indexed by the dimensions of a, visited row by row.
 */
void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) {
  EXPECT_EQ(a->getWidth(), b->getWidth());
  EXPECT_EQ(a->getHeight(), b->getHeight());
  EXPECT_EQ(a->isTransposed(), b->isTransposed());

  const size_t height = a->getHeight();
  const size_t width = a->getWidth();
  // Single flat index walked in row-major order.
  for (size_t idx = 0; idx < height * width; ++idx) {
    const size_t r = idx / width;
    const size_t c = idx % width;
    EXPECT_FLOAT_EQ(a->getElement(r, c), b->getElement(r, c));
  }
}
示例#3
0
void ExpandLayer::backward(const UpdateCallback& callback) {
  if (biases_ && biases_->getWGrad()) {
    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
    /* Increasing the number of gradient */
    biases_->getParameterPtr()->incUpdate(callback);
  }

  if (!getInputGrad(0)) return;
  MatrixPtr inputGrad = getInputGrad(0);
  MatrixPtr outputGrad = getOutputGrad();
  auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
                              : getInput(1).sequenceStartPositions;
  size_t numSequences = cpuSeqStartPos->getSize() - 1;
  const int* starts = cpuSeqStartPos->getData(false);

  CHECK_EQ(inputGrad->getWidth(), outputGrad->getWidth());
  CHECK_EQ(outputGrad->getHeight(), (size_t)starts[numSequences]);

  AsyncGpuBlock asyncGpuBlock;

  // sum to get the grad
  real scale = 1;
  for (size_t sequenceId = 0; sequenceId < numSequences; sequenceId++) {
    // TODO(Dangqingqing) optimization for GPU
    int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
    if (sequenceLength == 0) {
      // empty sequence
      continue;
    }
    MatrixPtr copyData = inputGrad->subMatrix(sequenceId, 1);
    copyData->collectBias(
        *outputGrad->subMatrix(starts[sequenceId], sequenceLength), scale);
  }
}
示例#4
0
/**
 * Forward pass of the scale-sub-region layer.
 *
 * Takes the image height/width from input 0's frame size, falling back to the
 * layer configuration when either is zero. Derives the channel count from the
 * input width, sizes the output like the input, and calls forward_[0] with
 * the image (input 0) and a batchSize x 6 indices matrix (input 1).
 *
 * @param passType  forwarded to Layer::forward().
 */
void ScaleSubRegionLayer::forward(PassType passType) {
  Layer::forward(passType);

  const Argument& image = getInput(0);
  imgH_ = image.getFrameHeight();
  imgW_ = image.getFrameWidth();
  if (imgH_ == 0 || imgW_ == 0) {
    // Frame size missing on the input: use the configured image size.
    const auto& imageConf =
        config_.inputs(0).scale_sub_region_conf().image_conf();
    imgH_ = imageConf.img_size_y();
    imgW_ = imageConf.img_size();
  }

  MatrixPtr imageValue = image.value;
  const size_t batchSize = imageValue->getHeight();
  const size_t pixelsPerChannel = imgH_ * imgW_;
  channelsNum_ = imageValue->getWidth() / pixelsPerChannel;
  shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_});

  resetOutput(batchSize, imageValue->getWidth());
  auto& result = getOutput();
  result.setFrameHeight(imgH_);
  result.setFrameWidth(imgW_);

  MatrixPtr indices = getInputValue(1);
  indicesShape_ = TensorShape({batchSize, 6});

  REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str());
  BufferArgs inputs;
  inputs.addArg(*imageValue, shape_);
  inputs.addArg(*indices, indicesShape_);
  BufferArgs outputs;
  outputs.addArg(*result.value, shape_, ASSIGN_TO);
  forward_[0]->calc(inputs, outputs);
}
void MKLPackedRecurrentLayer::forwardBatch(int batchSize,
                                           size_t numSequences,
                                           const int* starts) {
  if (!batchValue_) {
    batchValue_.reset(new SequenceToBatch(useGpu_));
  }

  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);

  batchValue_->copyFromSeq(*output_.value);

  {
    REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str());
    /* forward one batch */
    for (size_t n = 0; n < batchValue_->getNumBatch(); n++) {
      MatrixPtr batchValue = batchValue_->getBatchValue(n);

      if (n != 0) {
        MatrixPtr preBatchValue =
            batchValue_->getBatchValue(n - 1, batchValue->getHeight());

        packed_weight_->gemm_compute(preBatchValue, batchValue);
      }
      Argument arg;
      arg.value = batchValue;
      activation_->forward(arg).check();
    }
  }
  batchValue_->copyBackSeq(*output_.value);
}
示例#6
0
/**
 * Softmax backward computed with CPU scratch matrices.
 *
 * Fills sftMaxDot_ with the element-wise product of act.grad and act.value,
 * merges it column-wise into the one-column sftMaxSum_, and then updates
 * act.grad in place through softmaxDerivative().
 *
 * @param act  argument whose value is read and whose grad is overwritten.
 * @return a default-constructed Error.
 */
Error __must_check MKLDNNSoftmaxActivation::backward(Argument& act) {
  MatrixPtr value = act.value;
  MatrixPtr grad = act.grad;
  const size_t rows = grad->getHeight();
  const size_t cols = grad->getWidth();

  // Scratch buffers are always non-transposed CPU matrices.
  Matrix::resizeOrCreate(
      sftMaxDot_, rows, cols, /* trans */ false, /* useGpu */ false);
  Matrix::resizeOrCreate(
      sftMaxSum_, rows, 1, /* trans */ false, /* useGpu */ false);

  sftMaxDot_->dotMul(*grad, *value);
  sftMaxSum_->colMerge(*sftMaxDot_);
  act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
  return Error();
}
示例#7
0
/**
 * Forward pass of the GRU over a batch-major layout.
 *
 * Rearranges inputValue into gate_.value (sequence-to-batch), adds the bias
 * when the layer has one, runs GruCompute::forward for each batch in order,
 * and copies the batch results back into output_.value.
 *
 * The bias is added whenever bias_ exists. Forward output must not depend on
 * whether a gradient buffer is allocated; guarding on bias_->getWGrad() would
 * drop the bias when the parameter has no gradient buffer.
 *
 * @param batchSize     forwarded to SequenceToBatch::resizeOrCreateBatch().
 * @param numSequences  forwarded to SequenceToBatch::resizeOrCreateBatch().
 * @param starts        forwarded to SequenceToBatch::resizeOrCreateBatch().
 * @param inputValue    input in sequence order; copied into gate_.value.
 */
void GatedRecurrentLayer::forwardBatch(int batchSize,
                                       size_t numSequences,
                                       const int* starts,
                                       MatrixPtr inputValue) {
  REGISTER_TIMER_INFO("GruFwBatchTime", getName().c_str());
  hl_gru_value gruValue;
  gruValue.gateWeight = (gateWeight_->getW())->getData();
  gruValue.stateWeight = (stateWeight_->getW())->getData();

  if (!batchValue_) {
    batchValue_.reset(new SequenceToBatch(useGpu_));
  }
  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts,
                                   reversed_);

  batchValue_->resizeOrCreate(*output_.value);
  batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */true);
  if (bias_) {
    gate_.value->addBias(*(bias_->getW()), 1);
  }

  {
    const int numBatch = batchValue_->getNumBatch();
    AsyncGpuBlock asyncGpuBlock;
    for (int n = 0; n < numBatch; n++) {
      MatrixPtr outputValueTmp = batchValue_->getBatchValue(n);
      gruValue.outputValue = outputValueTmp->getData();
      gruValue.gateValue =
        (batchValue_->getBatchValue(*gate_.value, n))->getData();
      gruValue.resetOutputValue =
        (batchValue_->getBatchValue(*resetOutput_.value, n))->getData();

      // Row count of this batch; named apart from the batchSize parameter
      // so the parameter is no longer shadowed.
      const int curBatchSize = outputValueTmp->getHeight();
      // The first batch has no previous output.
      gruValue.prevOutValue =
        (n == 0 ? nullptr
                : (batchValue_->getBatchValue(n - 1, curBatchSize))
                      ->getData());

      if (useGpu_) {
        GruCompute::forward<1>(gruValue, getSize(), curBatchSize);
      } else {
        GruCompute::forward<0>(gruValue, getSize(), curBatchSize);
      }
    }
  }

  batchValue_->copyBackSeq(*output_.value);
}
void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) {
  backwardActivation();
  MatrixPtr oGrad = getOutputGrad();
  if (!fullOutput_) {
    interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(),
                                               interOutput_->getRows(),
                                               interOutput_->getCols(),
                                               interOutput_->getHeight(),
                                               interOutput_->getWidth(),
                                               interOutput_->getElementCnt(),
                                               FLOAT_VALUE,
                                               SPARSE_CSR,
                                               /*trans=*/false,
                                               /*useGpu=*/useGpu_);
  } else {
    interOutGrad_ = Matrix::create(oGrad->getData(),
                                   oGrad->getHeight(),
                                   oGrad->getWidth(),
                                   /*trans=*/false,
                                   /*useGpu=*/useGpu_);
  }

  if (biases_ && biases_->getWGrad()) {
    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
    biases_->getWGrad()->collectBias(*interOutGrad_, 1);
    biases_->getParameterPtr()->incUpdate(callback);
  }

  // backward is different from FullyConnectedLayer
  // because the weight is transposed
  for (size_t i = 0; i < inputNum_; i++) {
    AsyncGpuBlock block;
    MatrixPtr preGrad = getInputGrad(i);
    if (preGrad) {
      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
      preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1);
    }

    MatrixPtr wGrad = weights_[i]->getWGrad();
    if (wGrad) {
      REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
      MatrixPtr input = getInputValue(i);
      wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1);
    }

    {
      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
      weights_[i]->getParameterPtr()->incUpdate(callback);
    }
  }
}
示例#9
0
/**
 * Forward pass of the k-max sequence score layer.
 *
 * Requires input 0 to be a (nested) sequence of width-1 scores. Prepares a
 * CPU output matrix with one row per (sub-)sequence and beamSize_ columns,
 * fills it with -1, and hands the scores, the output buffer and the start
 * positions to kmaxScorePerSeq().
 *
 * @param passType  forwarded to Layer::forward().
 */
void KmaxSeqScoreLayer::forward(PassType passType) {
  Layer::forward(passType);

  const Argument& input = getInput(0);
  const MatrixPtr inputScore = getInputValue(0);

  CHECK(input.hasSeq() || input.hasSubseq())
      << "input of " << getName()
      << " must be a sequence or a nested sequence.";
  CHECK_EQ(input.value->getWidth(), 1UL)
      << "input of " << getName() << " are scores over a sequence or "
      << "a nested sequence, so its width must be 1.";

  if (!useGpu_) {
    scores_ = inputScore;
  } else {
    /*
     * Currently this layer only runs on CPU. If the rest of the model is
     * running on GPU, copy this layer's input from GPU to CPU.
     */
    Matrix::resizeOrCreate(scores_,
                           inputScore->getHeight(),
                           1,
                           false /* trans */,
                           false /* useGpu */);
    scores_->copyFrom(*inputScore);
  }

  const bool nested = input.hasSubseq();

  /*
   * TODO(caoying)
   * In PaddlePaddle all matrices currently hold real numbers, but the output
   * of this layer is a set of selected indices of the given sequence, i.e.
   * integers. Storing integer information in a real-number matrix is
   * dangerous, because the reals are later converted back to ints.
   */
  Matrix::resizeOrCreate(
      output_.value,
      nested ? input.getNumSubSequences() : input.getNumSequences(),
      beamSize_,
      false,
      false);
  // Fill the whole output with -1 before the selection runs.
  output_.value->one();
  output_.value->mulScalar(-1.);

  kmaxScorePerSeq(scores_->getData(),
                  output_.value->getData(),
                  nested ? input.subSequenceStartPositions
                         : input.sequenceStartPositions);
}
示例#10
0
// Adds a 100 x 200 matrix to BufferArgs and verifies that the resulting
// BufferArg reports a 2-D shape of {100, 200} and exposes the same data
// pointer and dimensions as the source matrix.
TEST(Arguments, Matrix) {
  MatrixPtr matrix = Matrix::create(100, 200);

  // The checker holds its own copy of the matrix pointer.
  CheckBufferArg check = [matrix](const BufferArg& arg) {
    EXPECT_EQ(arg.shape().ndims(), 2U);
    EXPECT_EQ(arg.shape()[0], 100U);
    EXPECT_EQ(arg.shape()[1], 200U);
    EXPECT_EQ(arg.data(), matrix->getData());

    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
  };

  BufferArgs bufferArgs;
  bufferArgs.addArg(*matrix);

  std::vector<CheckBufferArg> checkers;
  checkers.push_back(check);
  testBufferArgs(bufferArgs, checkers);
}
示例#11
0
/**
 * Forward pass: output = input * slope + intercept, with slope and
 * intercept read from the layer configuration.
 *
 * The input width must equal the layer size.
 *
 * @param passType  forwarded to Layer::forward().
 */
void SlopeInterceptLayer::forward(PassType passType) {
  Layer::forward(passType);

  MatrixPtr inV = getInputValue(0);
  const size_t size = getSize();

  CHECK_EQ(size, inV->getWidth());

  {
    /* reserve the output buffer: one row per input row */
    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
    reserveOutput(inV->getHeight(), size);
  }

  MatrixPtr result = getOutputValue();
  {
    REGISTER_TIMER_INFO("FwSlopeInterceptTimer", getName().c_str());
    result->mulScalar(*inV, config_.slope());
    result->add(config_.intercept());
  }
}
示例#12
0
void GatedRecurrentLayer::backwardBatch(int batchSize,
                                        MatrixPtr inputGrad) {
  REGISTER_TIMER_INFO("GruBwBatchTime", getName().c_str());
  hl_gru_value gruValue;
  gruValue.gateWeight = (gateWeight_->getW())->getData();
  gruValue.stateWeight = (stateWeight_->getW())->getData();

  hl_gru_grad gruGrad;
  gruGrad.gateWeightGrad =
    (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr);
  gruGrad.stateWeightGrad =
    (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData() : nullptr);

  if (!batchGrad_) {
    batchGrad_.reset(new SequenceToBatch(useGpu_));
  }
  batchGrad_->shareIndexWith(*batchValue_);

  {
    batchGrad_->copyFromSeq(*output_.grad);
  }

  {
    int numBatch = batchGrad_->getNumBatch();
    int batchSize = 0;
    AsyncGpuBlock asyncGpuBlock;
    for (int n = (int)numBatch - 1; n >= 0; n--) {
      gruValue.gateValue =
        (batchGrad_->getBatchValue(*gate_.value, n))->getData();
      gruValue.resetOutputValue =
        (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData();

      MatrixPtr outputGradTmp  = batchGrad_->getBatchValue(n);
      gruGrad.outputGrad = outputGradTmp->getData();
      gruGrad.gateGrad =
        (batchGrad_->getBatchValue(*gate_.grad , n))->getData();
      gruGrad.resetOutputGrad =
        (batchGrad_->getBatchValue(*resetOutput_.grad , n))->getData();

      {
        batchSize = outputGradTmp->getHeight();
        gruValue.prevOutValue =
          (n == 0 ? nullptr
                  : (batchValue_->getBatchValue(n - 1, batchSize))->getData());
        gruGrad.prevOutGrad =
          (n == 0 ? nullptr
                  : (batchGrad_->getBatchValue(n - 1, batchSize))->getData());

        if (useGpu_) {
          GruCompute::backward<1>(gruValue, gruGrad, getSize(),
                                  batchSize);
        } else {
          GruCompute::backward<0>(gruValue, gruGrad, getSize(),
                                  batchSize);
        }
      }
    }
  }

  if (inputGrad) {
    batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */false);
  }
  if (bias_ && bias_->getWGrad()) {
    bias_->getWGrad()->collectBias(*gate_.grad, /* scale */ 1);
  }
}
/**
 * Forward pass of the selective fully connected layer.
 *
 * For every input, multiplies the input by the transposed weight into
 * interOutput_. When the output is not full and the selected element count
 * is below selective_fc_full_mul_ratio times the dense size, the product is
 * written straight into interOutput_; otherwise a dense product is computed
 * (into mmat_ when the output is not full) and merged with add3(). The bias
 * and the activation are applied afterwards.
 *
 * @param passType  forwarded to Layer::forward().
 */
void SelectiveFullyConnectedLayer::forward(PassType passType) {
  REGISTER_TIMER("selective_fc.forward");
  Layer::forward(passType);

  getSelectiveCols();
  size_t height = getInput(0).getBatchSize();
  size_t width = getSize();
  size_t nnz = height * width;
  if (!fullOutput_) {
    CHECK(selCols_);
    CHECK(height == selCols_->getHeight());
    CHECK(width == selCols_->getWidth());
    nnz = selCols_->getElementCnt();
  }

  // Instead of Layer::ResetOutput(), outV/outG are set up here manually as
  // SparseMatrix; this outV is meant to feed MaxIdLayer and the softmax
  // activation.
  reserveOutput(height, width, nnz);

  for (size_t idx = 0; idx < inputNum_; ++idx) {
    MatrixPtr inValue = getInputValue(idx);
    MatrixPtr weightValue = weights_[idx]->getW();
    const size_t inRows = inValue->getHeight();
    const size_t weightRows = weightValue->getHeight();
    // The first input overwrites interOutput_, later inputs accumulate.
    const real accumulate = idx == 0 ? real(0) : real(1);

    const bool sparseEnough =
        nnz < (inRows * weightRows) * config_.selective_fc_full_mul_ratio() &&
        !fullOutput_;
    if (sparseEnough) {
      // If the indices are highly sparse, manually compute the
      // multiplication of the input vector and the selected rows.
      REGISTER_TIMER("selective.plain");
      interOutput_->mul(
          *inValue, *weightValue->getTranspose(), 1, accumulate);
    } else {
      // If the indices are not sparse enough, use a full mul instead.
      REGISTER_TIMER("selective.mul");
      if (!fullOutput_) {
        Matrix::resizeOrCreate(mmat_,
                               inRows,
                               weightRows,
                               /*trans=*/false,
                               /*useGpu=*/useGpu_);
        mmat_->mul(*inValue, *weightValue->getTranspose());
        interOutput_->add3(mmat_);
      } else {
        interOutput_->mul(
            *inValue, *weightValue->getTranspose(), 1, accumulate);
      }
    }
  }

  if (biases_) {
    interOutput_->addBias(*(biases_->getW()), 1);
  }

  const bool generating =
      (passType_ == PASS_TEST && config_.selective_fc_pass_generation() &&
       !fullOutput_);
  if (!generating) {
    // Train, and test-in-train (not generating): the output value of this
    // layer is a *Matrix*, which is the input of e.g.
    // multi-class-cross-entropy.
    //
    // While training, every sample has an equal number of selected columns
    // to be activated. Note that indices of multi-class-cross-entropy need
    // to be remapped to this index,
    // e.g. sample = [1,3,5] and 3 is gold, then label is 1.
    forwardActivation();
  } else {
    // During generation, the output of this layer is a sparse CSR matrix,
    // which is probably the input of a maxid layer. If the model is trained
    // with multi-class-cross-entropy-with-selfnorm, the activation of this
    // layer should be exponential, not softmax.
    Argument act;
    act.value = Matrix::create(interOutput_->getData(),
                               1,
                               nnz,
                               /*trans=*/false,
                               /*useGpu=*/useGpu_);
    //! TODO(yuyang18): Why we cannot invoke forwardActivation here?
    activation_->forward(act).check();
  }
}
示例#14
0
  /**
   * Accumulates detection statistics for one mini-batch.
   *
   * arguments[0].value holds the detections, read as rows of 7 reals whose
   * first element is the image index within the batch. arguments[1] holds
   * the ground-truth labels, read as rows of 6 reals whose first element is
   * the class, grouped per image by its sequence start positions.
   *
   * Both matrices are copied into CPU buffers, grouped per image and per
   * class, the positive counts in numPos_ are updated, and calcTFPos() is
   * called with the grouped boxes.
   *
   * @param arguments  [0] detection output, [1] labels (with sequence
   *                   start positions).
   * @return always 0.
   */
  virtual real evalImp(std::vector<Argument>& arguments) {
    overlapThreshold_ = config_.overlap_threshold();
    backgroundId_ = config_.background_id();
    evaluateDifficult_ = config_.evaluate_difficult();
    apType_ = config_.ap_type();

    MatrixPtr detectTmpValue = arguments[0].value;
    Matrix::resizeOrCreate(cpuOutput_,
                           detectTmpValue->getHeight(),
                           detectTmpValue->getWidth(),
                           false,
                           false);

    MatrixPtr labelTmpValue = arguments[1].value;
    Matrix::resizeOrCreate(cpuLabel_,
                           labelTmpValue->getHeight(),
                           labelTmpValue->getWidth(),
                           false,
                           false);

    cpuOutput_->copyFrom(*detectTmpValue);
    cpuLabel_->copyFrom(*labelTmpValue);

    const Argument& labelArg = arguments[1];
    const int* labelIndex = labelArg.sequenceStartPositions->getData(false);
    size_t batchSize = labelArg.getNumSequences();

    vector<map<size_t, vector<NormalizedBBox>>> allGTBBoxes;
    vector<map<size_t, vector<pair<real, NormalizedBBox>>>> allDetectBBoxes;

    // Group ground-truth boxes per image, keyed by class.
    for (size_t n = 0; n < batchSize; ++n) {
      map<size_t, vector<NormalizedBBox>> bboxes;
      for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) {
        vector<NormalizedBBox> bbox;
        getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox);
        int c = cpuLabel_->getData()[i * 6];
        bboxes[c].push_back(bbox[0]);
      }
      allGTBBoxes.push_back(bboxes);
    }

    // Group detections per image, keyed by label. The row index is checked
    // against the row count BEFORE the row is read, so nothing is read past
    // the end of the buffer once all detections are consumed (or when there
    // are none).
    size_t n = 0;
    const size_t numDetections = cpuOutput_->getHeight();
    const real* cpuOutputData = cpuOutput_->getData();
    for (size_t imgId = 0; imgId < batchSize; ++imgId) {
      map<size_t, vector<pair<real, NormalizedBBox>>> bboxes;
      while (n < numDetections &&
             static_cast<size_t>(cpuOutputData[n * 7]) == imgId) {
        vector<real> label;
        vector<real> score;
        vector<NormalizedBBox> bbox;
        getBBoxFromDetectData(cpuOutputData + n * 7, 1, label, score, bbox);
        bboxes[label[0]].push_back(make_pair(score[0], bbox[0]));
        ++n;
      }
      allDetectBBoxes.push_back(bboxes);
    }

    // Count positives per class; difficult boxes are counted only when
    // evaluateDifficult_ is set.
    for (size_t n = 0; n < batchSize; ++n) {
      for (map<size_t, vector<NormalizedBBox>>::iterator it =
               allGTBBoxes[n].begin();
           it != allGTBBoxes[n].end();
           ++it) {
        size_t count = 0;
        if (evaluateDifficult_) {
          count = it->second.size();
        } else {
          for (size_t i = 0; i < it->second.size(); ++i)
            if (!(it->second[i].isDifficult)) ++count;
        }
        if (numPos_.find(it->first) == numPos_.end() && count != 0) {
          numPos_[it->first] = count;
        } else {
          numPos_[it->first] += count;
        }
      }
    }

    // calcTFPos
    calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes);

    return 0;
  }