void MKLPackedRecurrentLayer::backwardBatch(int batchSize,
                                            size_t numSequences,
                                            const int* starts) {
  if (!batchGrad_) {
    batchGrad_.reset(new SequenceToBatch(useGpu_));
  }
  batchGrad_->shareIndexWith(*batchValue_);

  size_t numBatch = batchGrad_->getNumBatch();
  bool backwardByBatch = numBatch < numSequences;

  batchGrad_->copyFromSeq(*output_.grad);
  {
    REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str());
    /* backward one batch */
    for (int n = (int)numBatch - 1; n >= 0; n--) {
      MatrixPtr batchGrad = batchGrad_->getBatchValue(n);
      MatrixPtr batchValue =
          batchValue_->getBatchValue(n, batchGrad->getHeight());

      Argument arg;
      arg.value = batchValue;
      arg.grad = batchGrad;
      activation_->backward(arg).check();

      if (n != 0) {
        batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight());
        packed_weightT_->gemm_compute(batchGrad, batchValue);
      }

      if (backwardByBatch && weight_->getWGrad()) {
        if (n != 0) {
          /* backward weight */
          batchValue =
              batchValue_->getBatchValue(n - 1, batchGrad->getHeight());
          weight_->getWGrad()->mul(
              *batchValue->getTranspose(), *batchGrad, 1, 1);
        }
      }
    }
  }

  batchGrad_->copyBackSeq(*output_.grad);

  if (!backwardByBatch && weight_->getWGrad()) {
    REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str());
    for (size_t seq = 0; seq < numSequences; ++seq) {
      int len = starts[seq + 1] - starts[seq];
      weight_->getWGrad()->mul(
          *output_.value
               ->subMatrix(reversed_ ? starts[seq] + 1 : starts[seq], len - 1)
               ->getTranspose(),
          *output_.grad->subMatrix(reversed_ ? starts[seq] : starts[seq] + 1,
                                   len - 1),
          1,
          1);
    }
  }
}
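// Note (inferred from the control flow above, not stated in the source): the
// weight gradient is accumulated inside the per-batch loop only when the
// number of batch steps is smaller than the number of sequences
// (backwardByBatch); otherwise the per-sequence loop at the end performs the
// same accumulation with fewer, larger GEMM calls.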
void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) {
  EXPECT_EQ(a->getWidth(), b->getWidth());
  EXPECT_EQ(a->getHeight(), b->getHeight());
  EXPECT_EQ(a->isTransposed(), b->isTransposed());
  for (size_t r = 0; r < a->getHeight(); ++r) {
    for (size_t c = 0; c < a->getWidth(); ++c) {
      EXPECT_FLOAT_EQ(a->getElement(r, c), b->getElement(r, c));
    }
  }
}
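// The test below is a hypothetical usage sketch of the helper above, not a
// test from the source; Matrix::create, randomizeUniform, and copyFrom follow
// the Paddle Matrix API.
TEST(Matrix, checkMatrixEqualExample) {
  MatrixPtr a = Matrix::create(2, 3, /*trans=*/false, /*useGpu=*/false);
  MatrixPtr b = Matrix::create(2, 3, /*trans=*/false, /*useGpu=*/false);
  a->randomizeUniform();   // fill a with random values
  b->copyFrom(*a);         // b becomes an exact copy of a
  checkMatrixEqual(a, b);  // same shape, same elements: all EXPECTs pass
}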
void ExpandLayer::backward(const UpdateCallback& callback) {
  if (biases_ && biases_->getWGrad()) {
    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
    /* increase the gradient update count */
    biases_->getParameterPtr()->incUpdate(callback);
  }

  if (!getInputGrad(0)) return;

  MatrixPtr inputGrad = getInputGrad(0);
  MatrixPtr outputGrad = getOutputGrad();
  auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
                              : getInput(1).sequenceStartPositions;
  size_t numSequences = cpuSeqStartPos->getSize() - 1;
  const int* starts = cpuSeqStartPos->getData(false);

  CHECK_EQ(inputGrad->getWidth(), outputGrad->getWidth());
  CHECK_EQ(outputGrad->getHeight(), (size_t)starts[numSequences]);

  AsyncGpuBlock asyncGpuBlock;

  // sum the expanded output gradients back into the input gradient
  real scale = 1;
  for (size_t sequenceId = 0; sequenceId < numSequences; sequenceId++) {
    // TODO(Dangqingqing) optimization for GPU
    int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
    if (sequenceLength == 0) {
      // empty sequence
      continue;
    }
    MatrixPtr copyData = inputGrad->subMatrix(sequenceId, 1);
    copyData->collectBias(
        *outputGrad->subMatrix(starts[sequenceId], sequenceLength), scale);
  }
}
void ScaleSubRegionLayer::forward(PassType passType) {
  Layer::forward(passType);
  auto in0 = getInput(0);
  imgH_ = in0.getFrameHeight();
  imgW_ = in0.getFrameWidth();
  if (imgH_ == 0 || imgW_ == 0) {
    auto& conf = config_.inputs(0).scale_sub_region_conf();
    imgH_ = conf.image_conf().img_size_y();
    imgW_ = conf.image_conf().img_size();
  }
  MatrixPtr imgV = in0.value;
  size_t batchSize = imgV->getHeight();
  size_t spatialSize = imgH_ * imgW_;
  channelsNum_ = imgV->getWidth() / spatialSize;
  shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_});

  resetOutput(batchSize, imgV->getWidth());
  auto& out = getOutput();
  out.setFrameHeight(imgH_);
  out.setFrameWidth(imgW_);

  MatrixPtr indicesV = getInputValue(1);
  indicesShape_ = TensorShape({batchSize, 6});

  REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str());
  BufferArgs inArgs;
  BufferArgs outArgs;
  inArgs.addArg(*imgV, shape_);
  inArgs.addArg(*indicesV, indicesShape_);
  outArgs.addArg(*out.value, shape_, ASSIGN_TO);
  forward_[0]->calc(inArgs, outArgs);
}
void MKLPackedRecurrentLayer::forwardBatch(int batchSize,
                                           size_t numSequences,
                                           const int* starts) {
  if (!batchValue_) {
    batchValue_.reset(new SequenceToBatch(useGpu_));
  }

  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);

  batchValue_->copyFromSeq(*output_.value);

  {
    REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str());
    /* forward one batch */
    for (size_t n = 0; n < batchValue_->getNumBatch(); n++) {
      MatrixPtr batchValue = batchValue_->getBatchValue(n);

      if (n != 0) {
        MatrixPtr preBatchValue =
            batchValue_->getBatchValue(n - 1, batchValue->getHeight());

        packed_weight_->gemm_compute(preBatchValue, batchValue);
      }
      Argument arg;
      arg.value = batchValue;
      activation_->forward(arg).check();
    }
  }
  batchValue_->copyBackSeq(*output_.value);
}
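// A minimal standalone sketch (illustrative, not Paddle code) of the
// sequence-to-batch reordering that SequenceToBatch performs above: step t of
// every still-active sequence is gathered into one "batch", so the recurrence
// can run one gemm_compute per time step instead of one per sequence. All
// names below are local to this sketch.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // three sequences of lengths 3, 1, and 2, stored back to back
  std::vector<int> starts = {0, 3, 4, 6};
  size_t numSeq = starts.size() - 1;
  int maxLen = 0;
  for (size_t s = 0; s < numSeq; ++s)
    maxLen = std::max(maxLen, starts[s + 1] - starts[s]);

  // batch t collects the flat row index of step t for every sequence that is
  // still alive; batch sizes shrink as shorter sequences end
  for (int t = 0; t < maxLen; ++t) {
    std::printf("batch %d:", t);
    for (size_t s = 0; s < numSeq; ++s) {
      if (t < starts[s + 1] - starts[s]) {
        std::printf(" row %d", starts[s] + t);
      }
    }
    std::printf("\n");
  }
  // prints: batch 0: row 0 row 3 row 4 / batch 1: row 1 row 5 / batch 2: row 2
  return 0;
}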
Error __must_check MKLDNNSoftmaxActivation::backward(Argument& act) {
  MatrixPtr outputV = act.value;
  MatrixPtr outputG = act.grad;
  Matrix::resizeOrCreate(sftMaxDot_,
                         outputG->getHeight(),
                         outputG->getWidth(),
                         /* trans */ false,
                         /* useGpu */ false);
  Matrix::resizeOrCreate(sftMaxSum_,
                         outputG->getHeight(),
                         1,
                         /* trans */ false,
                         /* useGpu */ false);

  sftMaxDot_->dotMul(*outputG, *outputV);
  sftMaxSum_->colMerge(*sftMaxDot_);
  act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
  return Error();
}
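// A reading of the math above (standard softmax backprop, not taken from the
// source): for softmax output y and incoming gradient g, the input gradient is
//   dx_i = y_i * (g_i - sum_j g_j * y_j).
// sftMaxDot_ holds the elementwise product g .* y, colMerge() reduces each of
// its rows to the scalar sum_j g_j * y_j, and softmaxDerivative() applies the
// final per-element combination in place on act.grad.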
void GatedRecurrentLayer::forwardBatch(int batchSize,
                                       size_t numSequences,
                                       const int* starts,
                                       MatrixPtr inputValue) {
  REGISTER_TIMER_INFO("GruFwBatchTime", getName().c_str());
  hl_gru_value gruValue;
  gruValue.gateWeight = (gateWeight_->getW())->getData();
  gruValue.stateWeight = (stateWeight_->getW())->getData();

  if (!batchValue_) {
    batchValue_.reset(new SequenceToBatch(useGpu_));
  }
  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);

  batchValue_->resizeOrCreate(*output_.value);
  batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true);
  // the bias must be added in the forward pass whether or not it has a
  // gradient buffer, so do not gate this on bias_->getWGrad()
  if (bias_) {
    gate_.value->addBias(*(bias_->getW()), 1);
  }

  {
    int numBatch = batchValue_->getNumBatch();
    int curBatchSize = 0;
    AsyncGpuBlock asyncGpuBlock;
    for (int n = 0; n < numBatch; n++) {
      MatrixPtr outputValueTmp = batchValue_->getBatchValue(n);
      gruValue.outputValue = outputValueTmp->getData();
      gruValue.gateValue =
          (batchValue_->getBatchValue(*gate_.value, n))->getData();
      gruValue.resetOutputValue =
          (batchValue_->getBatchValue(*resetOutput_.value, n))->getData();

      curBatchSize = outputValueTmp->getHeight();
      gruValue.prevOutValue =
          (n == 0
               ? nullptr
               : (batchValue_->getBatchValue(n - 1, curBatchSize))->getData());

      if (useGpu_) {
        GruCompute::forward<1>(gruValue, getSize(), curBatchSize);
      } else {
        GruCompute::forward<0>(gruValue, getSize(), curBatchSize);
      }
    }
  }

  batchValue_->copyBackSeq(*output_.value);
}
void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) {
  backwardActivation();
  MatrixPtr oGrad = getOutputGrad();
  if (!fullOutput_) {
    interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(),
                                               interOutput_->getRows(),
                                               interOutput_->getCols(),
                                               interOutput_->getHeight(),
                                               interOutput_->getWidth(),
                                               interOutput_->getElementCnt(),
                                               FLOAT_VALUE,
                                               SPARSE_CSR,
                                               /*trans=*/false,
                                               /*useGpu=*/useGpu_);
  } else {
    interOutGrad_ = Matrix::create(oGrad->getData(),
                                   oGrad->getHeight(),
                                   oGrad->getWidth(),
                                   /*trans=*/false,
                                   /*useGpu=*/useGpu_);
  }

  if (biases_ && biases_->getWGrad()) {
    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
    biases_->getWGrad()->collectBias(*interOutGrad_, 1);
    biases_->getParameterPtr()->incUpdate(callback);
  }

  // backward is different from FullyConnectedLayer
  // because the weight is transposed
  for (size_t i = 0; i < inputNum_; i++) {
    AsyncGpuBlock block;
    MatrixPtr preGrad = getInputGrad(i);
    if (preGrad) {
      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
      preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1);
    }

    MatrixPtr wGrad = weights_[i]->getWGrad();
    if (wGrad) {
      REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
      MatrixPtr input = getInputValue(i);
      wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1);
    }

    {
      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
      weights_[i]->getParameterPtr()->incUpdate(callback);
    }
  }
}
void KmaxSeqScoreLayer::forward(PassType passType) {
  Layer::forward(passType);

  const Argument& input = getInput(0);
  const MatrixPtr inputScore = getInputValue(0);

  CHECK(input.hasSeq() || input.hasSubseq())
      << "input of " << getName()
      << " must be a sequence or a nested sequence.";
  CHECK_EQ(input.value->getWidth(), 1UL)
      << "input of " << getName() << " are scores over a sequence or "
      << "a nested sequence, so its width must be 1.";

  if (useGpu_) {
    /*
     * currently, this layer only runs on CPU; if the rest of the model is
     * running on GPU, copy this layer's input from GPU to CPU.
     */
    Matrix::resizeOrCreate(scores_,
                           inputScore->getHeight(),
                           1,
                           false /* trans */,
                           false /* useGpu */);
    scores_->copyFrom(*inputScore);
  } else {
    scores_ = inputScore;
  }

  /*
   * TODO(caoying)
   * In PaddlePaddle, all matrices currently hold real numbers, but the output
   * of this layer consists of selected indices of the given sequence, which
   * are integers. Storing integer information in a real-valued matrix is
   * dangerous, since the real numbers will be converted to integers.
   */
  Matrix::resizeOrCreate(
      output_.value,
      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
      beamSize_,
      false,
      false);
  output_.value->one();
  output_.value->mulScalar(-1.);

  kmaxScorePerSeq(scores_->getData(),
                  output_.value->getData(),
                  input.hasSubseq() ? input.subSequenceStartPositions
                                    : input.sequenceStartPositions);
}
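// Note (inferred from the code above, not stated in the source): the output
// is filled with -1 before kmaxScorePerSeq() runs, so for a sequence shorter
// than beamSize_ the unfilled trailing slots stay -1 and serve as an explicit
// "no index" padding value.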
TEST(Arguments, Matrix) {
  MatrixPtr matrix = Matrix::create(100, 200);
  CheckBufferArg check = [=](const BufferArg& arg) {
    EXPECT_EQ(arg.shape().ndims(), 2U);
    EXPECT_EQ(arg.shape()[0], 100U);
    EXPECT_EQ(arg.shape()[1], 200U);
    EXPECT_EQ(arg.data(), matrix->getData());

    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
  };

  BufferArgs arguments;
  arguments.addArg(*matrix);
  std::vector<CheckBufferArg> checkFunc;
  checkFunc.push_back(check);
  testBufferArgs(arguments, checkFunc);
}
void SlopeInterceptLayer::forward(PassType passType) {
  Layer::forward(passType);

  MatrixPtr inV = getInputValue(0);

  /* malloc memory for the output_ if necessary */
  size_t batchSize = inV->getHeight();
  size_t size = getSize();

  CHECK_EQ(size, inV->getWidth());

  {
    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
    reserveOutput(batchSize, size);
  }

  MatrixPtr outV = getOutputValue();
  {
    REGISTER_TIMER_INFO("FwSlopeInterceptTimer", getName().c_str());
    outV->mulScalar(*inV, config_.slope());
    outV->add(config_.intercept());
  }
}
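// A small worked example of the two operations above (illustrative numbers,
// not from the source): with config_.slope() == 2 and config_.intercept() == 1,
// an input row [0, 1, 2] first becomes [0, 2, 4] via mulScalar and then
// [1, 3, 5] via add, i.e. outV = slope * inV + intercept elementwise.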
void GatedRecurrentLayer::backwardBatch(int batchSize, MatrixPtr inputGrad) {
  REGISTER_TIMER_INFO("GruBwBatchTime", getName().c_str());
  hl_gru_value gruValue;
  gruValue.gateWeight = (gateWeight_->getW())->getData();
  gruValue.stateWeight = (stateWeight_->getW())->getData();

  hl_gru_grad gruGrad;
  gruGrad.gateWeightGrad =
      (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr);
  gruGrad.stateWeightGrad =
      (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData()
                                : nullptr);

  if (!batchGrad_) {
    batchGrad_.reset(new SequenceToBatch(useGpu_));
  }
  batchGrad_->shareIndexWith(*batchValue_);
  batchGrad_->copyFromSeq(*output_.grad);

  {
    int numBatch = batchGrad_->getNumBatch();
    int curBatchSize = 0;
    AsyncGpuBlock asyncGpuBlock;
    for (int n = (int)numBatch - 1; n >= 0; n--) {
      gruValue.gateValue =
          (batchGrad_->getBatchValue(*gate_.value, n))->getData();
      gruValue.resetOutputValue =
          (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData();

      MatrixPtr outputGradTmp = batchGrad_->getBatchValue(n);
      gruGrad.outputGrad = outputGradTmp->getData();
      gruGrad.gateGrad = (batchGrad_->getBatchValue(*gate_.grad, n))->getData();
      gruGrad.resetOutputGrad =
          (batchGrad_->getBatchValue(*resetOutput_.grad, n))->getData();

      curBatchSize = outputGradTmp->getHeight();
      gruValue.prevOutValue =
          (n == 0
               ? nullptr
               : (batchValue_->getBatchValue(n - 1, curBatchSize))->getData());
      gruGrad.prevOutGrad =
          (n == 0
               ? nullptr
               : (batchGrad_->getBatchValue(n - 1, curBatchSize))->getData());

      if (useGpu_) {
        GruCompute::backward<1>(gruValue, gruGrad, getSize(), curBatchSize);
      } else {
        GruCompute::backward<0>(gruValue, gruGrad, getSize(), curBatchSize);
      }
    }
  }

  if (inputGrad) {
    batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false);
  }
  if (bias_ && bias_->getWGrad()) {
    bias_->getWGrad()->collectBias(*gate_.grad, /* scale */ 1);
  }
}
void SelectiveFullyConnectedLayer::forward(PassType passType) {
  REGISTER_TIMER("selective_fc.forward");
  Layer::forward(passType);

  getSelectiveCols();
  size_t height = getInput(0).getBatchSize();
  size_t width = getSize();
  size_t nnz = height * width;
  if (!fullOutput_) {
    CHECK(selCols_);
    CHECK(height == selCols_->getHeight());
    CHECK(width == selCols_->getWidth());
    nnz = selCols_->getElementCnt();
  }

  // Layer::ResetOutput(), here we set outV/outG as SparseMatrix manually
  // this outV should be used as input of MaxIdLayer and softmax activation
  reserveOutput(height, width, nnz);

  bool flag = true;
  for (size_t i = 0; i < inputNum_; i++) {
    MatrixPtr input = getInputValue(i);
    MatrixPtr weight = weights_[i]->getW();
    size_t hsize = input->getHeight();
    size_t wsize = weight->getHeight();
    real scaleT = i == 0 ? real(0) : real(1);

    flag = nnz < (hsize * wsize) * config_.selective_fc_full_mul_ratio() &&
           !fullOutput_;
    if (flag) {
      // if the indices are highly sparse,
      // manually compute the multiplication of
      // the input vector and the selected rows
      REGISTER_TIMER("selective.plain");
      interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
    } else {
      // if the indices are not sparse enough,
      // use a full multiplication instead
      REGISTER_TIMER("selective.mul");
      if (fullOutput_) {
        interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
      } else {
        Matrix::resizeOrCreate(mmat_,
                               hsize,
                               wsize,
                               /*trans=*/false,
                               /*useGpu=*/useGpu_);
        mmat_->mul(*input, *weight->getTranspose());
        interOutput_->add3(mmat_);
      }
    }
  }

  if (biases_) {
    interOutput_->addBias(*(biases_->getW()), 1);
  }

  flag = (passType_ == PASS_TEST && config_.selective_fc_pass_generation() &&
          !fullOutput_);
  if (flag) {
    // during generation, the output of this layer is a sparse csr matrix,
    // which is probably the input of a maxid layer;
    // if the model is trained with multi-class-cross-entropy-with-selfnorm,
    // the activation of this layer should be exponential, not softmax.
    Argument arg;
    arg.value = Matrix::create(interOutput_->getData(),
                               1,
                               nnz,
                               /*trans=*/false,
                               /*useGpu=*/useGpu_);
    //! TODO(yuyang18): Why we cannot invoke forwardActivation here?
    activation_->forward(arg).check();
  } else /* train and test in train, not generating */ {
    // during training, this layer's output value is a *Matrix*, which is the
    // input of e.g. multi-class-cross-entropy;
    // while training, every sample has an equal number of selected
    // columns to be activated;
    // note that the indices of multi-class-cross-entropy need to be remapped
    // to this index, e.g. sample = [1,3,5] and 3 is gold, then label is 1
    forwardActivation();
  }
}
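// Note (inferred from the branch above, not stated in the source): the layer
// switches between the row-wise "selected columns" product and a dense GEMM
// based on nnz < hsize * wsize * selective_fc_full_mul_ratio, i.e. the sparse
// path is only worth its irregular memory access when the selected entries
// are a small enough fraction of the full weight matrix.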
virtual real evalImp(std::vector<Argument>& arguments) {
  overlapThreshold_ = config_.overlap_threshold();
  backgroundId_ = config_.background_id();
  evaluateDifficult_ = config_.evaluate_difficult();
  apType_ = config_.ap_type();

  MatrixPtr detectTmpValue = arguments[0].value;
  Matrix::resizeOrCreate(cpuOutput_,
                         detectTmpValue->getHeight(),
                         detectTmpValue->getWidth(),
                         false,
                         false);

  MatrixPtr labelTmpValue = arguments[1].value;
  Matrix::resizeOrCreate(cpuLabel_,
                         labelTmpValue->getHeight(),
                         labelTmpValue->getWidth(),
                         false,
                         false);

  cpuOutput_->copyFrom(*detectTmpValue);
  cpuLabel_->copyFrom(*labelTmpValue);

  Argument label = arguments[1];
  const int* labelIndex = label.sequenceStartPositions->getData(false);
  size_t batchSize = label.getNumSequences();

  vector<map<size_t, vector<NormalizedBBox>>> allGTBBoxes;
  vector<map<size_t, vector<pair<real, NormalizedBBox>>>> allDetectBBoxes;

  for (size_t n = 0; n < batchSize; ++n) {
    map<size_t, vector<NormalizedBBox>> bboxes;
    for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) {
      vector<NormalizedBBox> bbox;
      getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox);
      int c = cpuLabel_->getData()[i * 6];
      bboxes[c].push_back(bbox[0]);
    }
    allGTBBoxes.push_back(bboxes);
  }

  size_t n = 0;
  const real* cpuOutputData = cpuOutput_->getData();
  for (size_t imgId = 0; imgId < batchSize; ++imgId) {
    map<size_t, vector<pair<real, NormalizedBBox>>> bboxes;
    // check the bound before reading the image id, so the last detection row
    // cannot trigger an out-of-bounds read
    while (n < cpuOutput_->getHeight() &&
           static_cast<size_t>((cpuOutputData + n * 7)[0]) == imgId) {
      vector<real> label;
      vector<real> score;
      vector<NormalizedBBox> bbox;
      getBBoxFromDetectData(cpuOutputData + n * 7, 1, label, score, bbox);
      bboxes[label[0]].push_back(make_pair(score[0], bbox[0]));
      ++n;
    }
    allDetectBBoxes.push_back(bboxes);
  }

  for (size_t i = 0; i < batchSize; ++i) {
    for (map<size_t, vector<NormalizedBBox>>::iterator it =
             allGTBBoxes[i].begin();
         it != allGTBBoxes[i].end();
         ++it) {
      size_t count = 0;
      if (evaluateDifficult_) {
        count = it->second.size();
      } else {
        for (size_t j = 0; j < it->second.size(); ++j)
          if (!(it->second[j].isDifficult)) ++count;
      }
      if (numPos_.find(it->first) == numPos_.end() && count != 0) {
        numPos_[it->first] = count;
      } else {
        numPos_[it->first] += count;
      }
    }
  }

  // calcTFPos
  calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes);

  return 0;
}