// Checks batchTranspose() against a reference transpose computed on the host.
// Each of the `kSamples` rows stores one ny-by-nx block (row-major); the
// expected output stores, per row, the nx-by-ny transpose of that block.
TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
  const int nx = 100;
  const int ny = 50;
  const int kSamples = 50;
  const int kBlock = nx * ny;

  // Matrices created with the last flag `false` are read/written directly on
  // the host below; those created with `true` are only touched through
  // copyFrom()/batchTranspose().
  MatrixPtr hostInput = Matrix::create(kSamples, kBlock, false, false);
  MatrixPtr devInput = Matrix::create(kSamples, kBlock, false, true);
  MatrixPtr hostExpected = Matrix::create(kSamples, kBlock, false, false);
  MatrixPtr devOutput = Matrix::create(kSamples, kBlock, false, true);
  MatrixPtr hostActual = Matrix::create(kSamples, kBlock, false, false);

  real* in = hostInput->getData();
  real* expected = hostExpected->getData();

  // Fill the input and its reference transpose in one sweep: element (j, i)
  // of a sample's block lands at (i, j) of the expected block.
  for (int s = 0; s < kSamples; ++s) {
    const int base = s * nx * ny;
    for (int j = 0; j < ny; ++j) {
      for (int i = 0; i < nx; ++i) {
        in[base + j * nx + i] = j * nx + i;
        expected[base + i * ny + j] = in[base + j * nx + i];
      }
    }
  }

  // Run the transpose through the device-side matrices and copy the result
  // back for comparison.
  devInput->copyFrom(*hostInput, HPPL_STREAM_DEFAULT);
  batchTranspose(
      devInput->getData(), devOutput->getData(), nx, ny, kSamples);
  hostActual->copyFrom(*devOutput, HPPL_STREAM_DEFAULT);

  checkMatrixEqual(hostExpected, hostActual);
}
void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) { backwardActivation(); MatrixPtr oGrad = getOutputGrad(); if (!fullOutput_) { interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(), interOutput_->getRows(), interOutput_->getCols(), interOutput_->getHeight(), interOutput_->getWidth(), interOutput_->getElementCnt(), FLOAT_VALUE, SPARSE_CSR, /*trans=*/false, /*useGpu=*/useGpu_); } else { interOutGrad_ = Matrix::create(oGrad->getData(), oGrad->getHeight(), oGrad->getWidth(), /*trans=*/false, /*useGpu=*/useGpu_); } if (biases_ && biases_->getWGrad()) { REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str()); biases_->getWGrad()->collectBias(*interOutGrad_, 1); biases_->getParameterPtr()->incUpdate(callback); } // backward is different from FullyConnectedLayer // because the weight is transposed for (size_t i = 0; i < inputNum_; i++) { AsyncGpuBlock block; MatrixPtr preGrad = getInputGrad(i); if (preGrad) { REGISTER_TIMER_INFO("BpMulTimer", getName().c_str()); preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1); } MatrixPtr wGrad = weights_[i]->getWGrad(); if (wGrad) { REGISTER_TIMER_INFO("GradMulTimer", getName().c_str()); MatrixPtr input = getInputValue(i); wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1); } { REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); weights_[i]->getParameterPtr()->incUpdate(callback); } } }
// Adding a 100x200 Matrix to BufferArgs must yield a 2-D BufferArg that
// reports the same shape and aliases the same data pointer as the matrix.
TEST(Arguments, Matrix) {
  MatrixPtr matrix = Matrix::create(100, 200);

  CheckBufferArg verify = [matrix](const BufferArg& arg) {
    // Shape as seen through the BufferArg itself.
    EXPECT_EQ(arg.shape().ndims(), 2U);
    EXPECT_EQ(arg.shape()[0], 100U);
    EXPECT_EQ(arg.shape()[1], 200U);
    EXPECT_EQ(arg.data(), matrix->getData());

    // Shape and storage as seen through the CPU matrix view of the argument.
    const auto& cpuView = arg.matrix<DEVICE_TYPE_CPU>();
    EXPECT_EQ(cpuView.getHeight(), matrix->getHeight());
    EXPECT_EQ(cpuView.getWidth(), matrix->getWidth());
    EXPECT_EQ(cpuView.getData(), matrix->getData());
  };

  BufferArgs args;
  args.addArg(*matrix);

  std::vector<CheckBufferArg> checkers(1, verify);
  testBufferArgs(args, checkers);
}
/**
 * Returns a pointer through which the contents of `matrix` can be read.
 *
 * When `matrix.useGpu()` is false the matrix's own buffer is returned.
 * Otherwise the contents are copied into a freshly created matrix (created
 * with useGpu = false) and a pointer into that copy is returned.
 *
 * Lifetime: the previous implementation kept the copy in a local MatrixPtr,
 * so the returned pointer referred to a matrix that was released as soon as
 * the function returned. The copy is now held in a function-local
 * thread_local holder; the returned pointer stays valid until the next call
 * to getData() with a GPU matrix on the same thread (which replaces the
 * holder). Callers that need two such buffers alive at once must copy the
 * first one before requesting the second.
 */
const real* getData(const Matrix& matrix) {
  if (!matrix.useGpu()) {
    return matrix.getData();
  }

  // Keeps the most recent host-side copy alive after this function returns.
  static thread_local MatrixPtr hostCopy;
  hostCopy = Matrix::create(matrix.getHeight(),
                            matrix.getWidth(),
                            matrix.isTransposed(),
                            false);
  hostCopy->copyFrom(matrix);
  return hostCopy->getData();
}
/**
 * Forward pass: for every sample, the input row of width `imgSize` is added
 * into an output region of `imgSize * numFilters_` values.
 *
 * With `asRowVector_` set, the region is viewed as numFilters_ x imgSize and
 * the input is added as a row vector; otherwise the region is viewed as
 * imgSize x numFilters_ and the input is added as a column vector.
 * The activation is applied afterwards.
 */
void FeatureMapExpandLayer::forward(PassType passType) {
  Layer::forward(passType);

  MatrixPtr inValue = getInputValue(0);
  size_t batchSize = getInput(0).getBatchSize();
  int imgSize = inValue->getWidth();
  resetOutput(batchSize, imgSize * numFilters_);
  MatrixPtr outValue = getOutputValue();

  {
    AsyncGpuBlock asyncGpuBlock;
    for (size_t sample = 0; sample < batchSize; ++sample) {
      // Per-sample views over the shared output / input buffers.
      real* outPtr = outValue->getData() + sample * imgSize * numFilters_;
      real* inPtr = inValue->getData() + sample * imgSize;

      if (asRowVector_) {
        MatrixPtr outView =
            Matrix::create(outPtr, numFilters_, imgSize, false, useGpu_);
        MatrixPtr inView = Matrix::create(inPtr, 1, imgSize, false, useGpu_);
        outView->addRowVector(*inView);
      } else {
        MatrixPtr outView =
            Matrix::create(outPtr, imgSize, numFilters_, false, useGpu_);
        MatrixPtr inView = Matrix::create(inPtr, imgSize, 1, false, useGpu_);
        outView->addColVector(*inView);
      }
    }
  }

  /* activation */
  {
    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
    forwardActivation();
  }
}
void FeatureMapExpandLayer::backward(const UpdateCallback& callback) { MatrixPtr inGrad = getInputGrad(0); if (NULL == inGrad) { return; } MatrixPtr outGrad = getOutputGrad(); size_t batchSize = getInput(0).getBatchSize(); int imgSize = inGrad->getWidth(); /* Do activation */ { REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); backwardActivation(); } { AsyncGpuBlock asyncGpuBlock; if (asRowVector_) { for (size_t i = 0; i < batchSize; i++) { MatrixPtr outGradTmp = Matrix::create(outGrad->getData() + i * imgSize * numFilters_, numFilters_, imgSize, false, useGpu_); MatrixPtr inGradTmp = Matrix::create( inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_); inGradTmp->collectBias(*outGradTmp, 1); } } else { for (size_t i = 0; i < batchSize; i++) { MatrixPtr outGradTmp = Matrix::create(outGrad->getData() + i * imgSize * numFilters_, imgSize, numFilters_, false, useGpu_); MatrixPtr inGradTmp = Matrix::create( inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_); inGradTmp->sumRows(*outGradTmp, 1, 1); } } } }
void GatedRecurrentLayer::forwardBatch(int batchSize, size_t numSequences, const int* starts, MatrixPtr inputValue) { REGISTER_TIMER_INFO("GruFwBatchTime", getName().c_str()); hl_gru_value gruValue; gruValue.gateWeight = (gateWeight_->getW())->getData(); gruValue.stateWeight = (stateWeight_->getW())->getData(); if (!batchValue_) { batchValue_.reset(new SequenceToBatch(useGpu_)); } batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_); batchValue_->resizeOrCreate(*output_.value); batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */true); if (bias_ && bias_->getWGrad()) { gate_.value->addBias(*(bias_->getW()), 1); } { int numBatch = batchValue_->getNumBatch(); int batchSize = 0; AsyncGpuBlock asyncGpuBlock; for (int n = 0; n < numBatch; n++) { MatrixPtr outputValueTmp = batchValue_->getBatchValue(n); gruValue.outputValue = outputValueTmp->getData(); gruValue.gateValue = (batchValue_->getBatchValue(*gate_.value, n))->getData(); gruValue.resetOutputValue = (batchValue_->getBatchValue(*resetOutput_.value, n))->getData(); batchSize = outputValueTmp->getHeight(); gruValue.prevOutValue = (n == 0 ? nullptr : (batchValue_->getBatchValue(n - 1, batchSize))->getData()); { if (useGpu_) { GruCompute::forward<1>(gruValue, getSize(), batchSize); } else { GruCompute::forward<0>(gruValue, getSize(), batchSize); } } } } { batchValue_->copyBackSeq(*output_.value); } }
/**
 * Forward pass: validates that the input is a (nested) sequence of width-1
 * scores, makes the scores available in a CPU matrix, prepares an output
 * matrix with one row per (sub-)sequence and `beamSize_` columns filled with
 * -1, and hands both to kmaxScorePerSeq().
 */
void KmaxSeqScoreLayer::forward(PassType passType) {
  Layer::forward(passType);

  const Argument& input = getInput(0);
  const MatrixPtr inputScore = getInputValue(0);

  CHECK(input.hasSeq() || input.hasSubseq())
      << "input of " << getName()
      << " must be a sequence or a nested sequence.";
  CHECK_EQ(input.value->getWidth(), 1UL)
      << "input of " << getName() << " are scores over a sequence or "
      << "a nested sequence, so its width must be 1.";

  if (!useGpu_) {
    scores_ = inputScore;
  } else {
    /*
     * This layer currently only runs on the CPU. When the rest of the model
     * runs on the GPU, the input scores are first copied into a CPU matrix.
     */
    Matrix::resizeOrCreate(scores_,
                           inputScore->getHeight(),
                           1,
                           false /* trans */,
                           false /* useGpu */);
    scores_->copyFrom(*inputScore);
  }

  /*
   * TODO(caoying)
   * All matrices in PaddlePaddle currently hold real numbers, but the output
   * of this layer consists of selected indices of the given sequence, i.e.
   * integers. Storing integer information in a real-valued matrix is
   * dangerous because the real numbers will later be converted back to
   * integers.
   */
  const bool nested = input.hasSubseq();
  const auto numRows =
      nested ? input.getNumSubSequences() : input.getNumSequences();
  Matrix::resizeOrCreate(output_.value, numRows, beamSize_, false, false);

  // Initialise every output entry to -1 (fill with ones, then negate).
  output_.value->one();
  output_.value->mulScalar(-1.);

  const auto& startPositions =
      nested ? input.subSequenceStartPositions : input.sequenceStartPositions;
  kmaxScorePerSeq(scores_->getData(), output_.value->getData(), startPositions);
}
/**
 * Forward pass of the 3-D deconvolution.
 *
 * For every input layer and every sample, each group's weight sub-matrix is
 * multiplied with the matching M x N slice of the input into the column
 * buffer `colBuf_`; the buffer is then scattered into the sample's output row
 * with col2Vol(). Bias (if any) and the activation are applied last.
 */
void DeConv3DLayer::forward(PassType passType) {
  Layer::forward(passType);

  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
  int outWidth = getSize();
  resetOutput(batchSize, outWidth);
  const MatrixPtr output = getOutputValue();

  REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());

  for (size_t layerIdx = 0; layerIdx != inputLayers_.size(); ++layerIdx) {
    const MatrixPtr &input = getInputValue(layerIdx);
    int M = M_[layerIdx];
    int N = N_[layerIdx];
    int K = K_[layerIdx];
    MatrixPtr weight = weights_[layerIdx]->getW();

    // Column buffer holds K rows per group.
    Matrix::resizeOrCreate(colBuf_, K * groups_[layerIdx], N, false, useGpu_);

    for (int sample = 0; sample < batchSize; ++sample) {
      // Walks through this sample's input row one M x N group slice at a time.
      real *groupInput = input->getData() + sample * input->getStride();

      for (int group = 0; group < groups_[layerIdx]; ++group) {
        MatrixPtr inputSlice =
            Matrix::create(groupInput, M, N, false, useGpu_);
        MatrixPtr weightSlice = weight->subMatrix(group * K, K);
        MatrixPtr colBufSlice = colBuf_->subMatrix(group * K, K);
        colBufSlice->mul(*weightSlice, *inputSlice, 1.0, 0.0);
        groupInput += M * N;
      }

      real *sampleOutput = output->getData() + sample * output->getStride();
      colBuf_->col2Vol(sampleOutput,
                       numFilters_,
                       imgSizeD_[layerIdx],
                       imgSizeH_[layerIdx],
                       imgSizeW_[layerIdx],
                       filterSizeZ_[layerIdx],
                       filterSizeY_[layerIdx],
                       filterSize_[layerIdx],
                       strideZ_[layerIdx],
                       strideY_[layerIdx],
                       stride_[layerIdx],
                       paddingZ_[layerIdx],
                       paddingY_[layerIdx],
                       padding_[layerIdx],
                       1.0,
                       1.0);
    }
  }

  if (this->biasParameter_ != nullptr) {
    this->addBias();
  }
  forwardActivation();
}
void ConvProjection::backward(const UpdateCallback &callback) { REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str()); void *workSpace = NULL; if (workSpaceInBytes_ > 0) { workSpace = getSpaceBytes(workSpaceInBytes_); } for (int g = 0; g < groups_; ++g) { real *outGrad = out_->grad->getData() + g * outputOffset_; if (weight_->getWGrad()) { real *inputData = in_->value->getData() + g * inputOffset_; real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_; hl_convolution_backward_filter(imageDesc_, inputData, outputDesc_, outGrad, filterDesc_, weightGrad, convDesc_, workSpace, bwdFilterLimitBytes_, bwdFilterAlgo_); } MatrixPtr preGrad = in_->grad; if (NULL != preGrad) { real *inputGrad = preGrad->getData() + g * inputOffset_; real *wgtData = weight_->getW()->getData() + g * weightOffset_; hl_convolution_backward_data(imageDesc_, inputGrad, outputDesc_, outGrad, filterDesc_, wgtData, convDesc_, workSpace, bwdDataLimitBytes_, bwdDataAlgo_); } } weight_->getParameterPtr()->incUpdate(callback); }
void GatedRecurrentLayer::backwardBatch(int batchSize, MatrixPtr inputGrad) { REGISTER_TIMER_INFO("GruBwBatchTime", getName().c_str()); hl_gru_value gruValue; gruValue.gateWeight = (gateWeight_->getW())->getData(); gruValue.stateWeight = (stateWeight_->getW())->getData(); hl_gru_grad gruGrad; gruGrad.gateWeightGrad = (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr); gruGrad.stateWeightGrad = (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData() : nullptr); if (!batchGrad_) { batchGrad_.reset(new SequenceToBatch(useGpu_)); } batchGrad_->shareIndexWith(*batchValue_); { batchGrad_->copyFromSeq(*output_.grad); } { int numBatch = batchGrad_->getNumBatch(); int batchSize = 0; AsyncGpuBlock asyncGpuBlock; for (int n = (int)numBatch - 1; n >= 0; n--) { gruValue.gateValue = (batchGrad_->getBatchValue(*gate_.value, n))->getData(); gruValue.resetOutputValue = (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData(); MatrixPtr outputGradTmp = batchGrad_->getBatchValue(n); gruGrad.outputGrad = outputGradTmp->getData(); gruGrad.gateGrad = (batchGrad_->getBatchValue(*gate_.grad , n))->getData(); gruGrad.resetOutputGrad = (batchGrad_->getBatchValue(*resetOutput_.grad , n))->getData(); { batchSize = outputGradTmp->getHeight(); gruValue.prevOutValue = (n == 0 ? nullptr : (batchValue_->getBatchValue(n - 1, batchSize))->getData()); gruGrad.prevOutGrad = (n == 0 ? nullptr : (batchGrad_->getBatchValue(n - 1, batchSize))->getData()); if (useGpu_) { GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize); } else { GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize); } } } } if (inputGrad) { batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */false); } if (bias_ && bias_->getWGrad()) { bias_->getWGrad()->collectBias(*gate_.grad, /* scale */ 1); } }