TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
  const int nx = 100;
  const int ny = 50;
  const int numSamples = 50;

  MatrixPtr cMat = Matrix::create(numSamples, nx * ny, false, false);
  MatrixPtr gMat = Matrix::create(numSamples, nx * ny, false, true);

  MatrixPtr cBatchTransMat = Matrix::create(numSamples, nx * ny, false, false);
  MatrixPtr gBatchTransMat = Matrix::create(numSamples, nx * ny, false, true);
  MatrixPtr cMat_d2h = Matrix::create(numSamples, nx * ny, false, false);

  real* cData = cMat->getData();
  real* gold = cBatchTransMat->getData();

  // host
  for (int sample_id = 0; sample_id < numSamples; ++sample_id)
    for (int j = 0; j < ny; j++)
      for (int i = 0; i < nx; i++)
        cData[sample_id * nx * ny + j * nx + i] = j * nx + i;

  // correct result for error checking
  for (int sample_id = 0; sample_id < numSamples; ++sample_id)
    for (int j = 0; j < ny; j++)
      for (int i = 0; i < nx; i++)
        gold[sample_id * nx * ny + i * ny + j] =
            cData[sample_id * nx * ny + j * nx + i];
  // device
  gMat->copyFrom(*cMat, HPPL_STREAM_DEFAULT);
  batchTranspose(
      gMat->getData(), gBatchTransMat->getData(), nx, ny, numSamples);
  cMat_d2h->copyFrom(*gBatchTransMat, HPPL_STREAM_DEFAULT);
  checkMatrixEqual(cBatchTransMat, cMat_d2h);
}
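// For reference, the gold-computation loop above encodes a per-sample
// transpose. A minimal CPU sketch of the same mapping (illustrative only,
// assuming `real` is float; the batchTranspose under test is the GPU kernel):
void batchTransposeRef(
    const float* in, float* out, int nx, int ny, int numSamples) {
  for (int s = 0; s < numSamples; ++s) {
    const float* src = in + s * nx * ny;
    float* dst = out + s * nx * ny;
    for (int j = 0; j < ny; ++j) {
      for (int i = 0; i < nx; ++i) {
        // element (j, i) of the ny x nx input becomes (i, j) of the nx x ny output
        dst[i * ny + j] = src[j * nx + i];
      }
    }
  }
}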
void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) {
  backwardActivation();
  MatrixPtr oGrad = getOutputGrad();
  if (!fullOutput_) {
    interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(),
                                               interOutput_->getRows(),
                                               interOutput_->getCols(),
                                               interOutput_->getHeight(),
                                               interOutput_->getWidth(),
                                               interOutput_->getElementCnt(),
                                               FLOAT_VALUE,
                                               SPARSE_CSR,
                                               /*trans=*/false,
                                               /*useGpu=*/useGpu_);
  } else {
    interOutGrad_ = Matrix::create(oGrad->getData(),
                                   oGrad->getHeight(),
                                   oGrad->getWidth(),
                                   /*trans=*/false,
                                   /*useGpu=*/useGpu_);
  }

  if (biases_ && biases_->getWGrad()) {
    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
    biases_->getWGrad()->collectBias(*interOutGrad_, 1);
    biases_->getParameterPtr()->incUpdate(callback);
  }

  // backward is different from FullyConnectedLayer
  // because the weight is transposed
  for (size_t i = 0; i < inputNum_; i++) {
    AsyncGpuBlock block;
    MatrixPtr preGrad = getInputGrad(i);
    if (preGrad) {
      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
      preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1);
    }

    MatrixPtr wGrad = weights_[i]->getWGrad();
    if (wGrad) {
      REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
      MatrixPtr input = getInputValue(i);
      wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1);
    }

    {
      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
      weights_[i]->getParameterPtr()->incUpdate(callback);
    }
  }
}
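// Shape sketch for the backward pass above (illustrative only; Dims and
// matmulDims are hypothetical helpers, not part of the layer). Because the
// selective FC weight is stored transposed, i.e. W is outputSize x inputSize,
// the input gradient uses W directly and only the output gradient is
// transposed for the weight gradient, unlike FullyConnectedLayer:
//   inputGrad  (batch x inputSize)      += interOutGrad   * W
//   weightGrad (outputSize x inputSize) += interOutGrad^T * input
#include <cassert>
#include <cstddef>
struct Dims {
  size_t rows, cols;
};
inline Dims matmulDims(Dims a, Dims b) {
  assert(a.cols == b.rows);  // inner dimensions must agree for A * B
  return {a.rows, b.cols};
}
// e.g. matmulDims({batch, outputSize}, {outputSize, inputSize}) -> {batch, inputSize}
//      matmulDims({outputSize, batch}, {batch, inputSize})      -> {outputSize, inputSize}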
TEST(Arguments, Matrix) {
  MatrixPtr matrix = Matrix::create(100, 200);
  CheckBufferArg check = [=](const BufferArg& arg) {
    EXPECT_EQ(arg.shape().ndims(), 2U);
    EXPECT_EQ(arg.shape()[0], 100U);
    EXPECT_EQ(arg.shape()[1], 200U);
    EXPECT_EQ(arg.data(), matrix->getData());

    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
  };

  BufferArgs arguments;
  arguments.addArg(*matrix);
  std::vector<CheckBufferArg> checkFunc;
  checkFunc.push_back(check);
  testBufferArgs(arguments, checkFunc);
}
const real* getData(const Matrix& matrix) {
  if (matrix.useGpu()) {
    // Keep the CPU copy alive after this function returns; returning the data
    // pointer of a local MatrixPtr would leave the caller with a dangling
    // pointer. The returned pointer stays valid until the next call.
    static thread_local MatrixPtr cpuMatrix;
    Matrix::resizeOrCreate(cpuMatrix,
                           matrix.getHeight(),
                           matrix.getWidth(),
                           matrix.isTransposed(),
                           false);
    cpuMatrix->copyFrom(matrix);
    return cpuMatrix->getData();
  } else {
    return matrix.getData();
  }
}
void FeatureMapExpandLayer::forward(PassType passType) {
  Layer::forward(passType);
  MatrixPtr inputV = getInputValue(0);
  size_t batchSize = getInput(0).getBatchSize();
  int imgSize = inputV->getWidth();
  resetOutput(batchSize, imgSize * numFilters_);

  MatrixPtr outputV = getOutputValue();

  {
    AsyncGpuBlock asyncGpuBlock;
    if (asRowVector_) {
      for (size_t i = 0; i < batchSize; i++) {
        MatrixPtr outVTmp =
            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
                           numFilters_,
                           imgSize,
                           false,
                           useGpu_);
        MatrixPtr inVTmp = Matrix::create(
            inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_);
        outVTmp->addRowVector(*inVTmp);
      }
    } else {
      for (size_t i = 0; i < batchSize; i++) {
        MatrixPtr outVTmp =
            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
                           imgSize,
                           numFilters_,
                           false,
                           useGpu_);
        MatrixPtr inVTmp = Matrix::create(
            inputV->getData() + i * imgSize, imgSize, 1, false, useGpu_);
        outVTmp->addColVector(*inVTmp);
      }
    }
  }
  /* activation */ {
    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
    forwardActivation();
  }
}
void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
  MatrixPtr inGrad = getInputGrad(0);
  if (NULL == inGrad) {
    return;
  }
  MatrixPtr outGrad = getOutputGrad();
  size_t batchSize = getInput(0).getBatchSize();
  int imgSize = inGrad->getWidth();
  /* Do activation */ {
    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
    backwardActivation();
  }
  {
    AsyncGpuBlock asyncGpuBlock;
    if (asRowVector_) {
      for (size_t i = 0; i < batchSize; i++) {
        MatrixPtr outGradTmp =
            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
                           numFilters_,
                           imgSize,
                           false,
                           useGpu_);
        MatrixPtr inGradTmp = Matrix::create(
            inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_);
        inGradTmp->collectBias(*outGradTmp, 1);
      }
    } else {
      for (size_t i = 0; i < batchSize; i++) {
        MatrixPtr outGradTmp =
            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
                           imgSize,
                           numFilters_,
                           false,
                           useGpu_);
        MatrixPtr inGradTmp = Matrix::create(
            inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_);
        inGradTmp->sumRows(*outGradTmp, 1, 1);
      }
    }
  }
}
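// Standalone sketch of what the row-vector branch in forward() above computes
// for a single sample (illustrative only, plain arrays instead of Matrix): the
// input row of length imgSize is broadcast-added onto each of the numFilters
// rows of that sample's output block, so input [a, b, c] with two filters
// yields the flat row [a, b, c, a, b, c]; the column-vector branch repeats
// each element instead, yielding [a, a, b, b, c, c].
void expandAsRowVectorRef(const float* in,
                          float* out,
                          int imgSize,
                          int numFilters) {
  for (int f = 0; f < numFilters; ++f) {
    for (int k = 0; k < imgSize; ++k) {
      out[f * imgSize + k] += in[k];  // mirrors outVTmp->addRowVector(*inVTmp)
    }
  }
}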
void GatedRecurrentLayer::forwardBatch(int batchSize,
                                       size_t numSequences,
                                       const int* starts,
                                       MatrixPtr inputValue) {
  REGISTER_TIMER_INFO("GruFwBatchTime", getName().c_str());
  hl_gru_value gruValue;
  gruValue.gateWeight = (gateWeight_->getW())->getData();
  gruValue.stateWeight = (stateWeight_->getW())->getData();

  if (!batchValue_) {
    batchValue_.reset(new SequenceToBatch(useGpu_));
  }
  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts,
                                   reversed_);

  batchValue_->resizeOrCreate(*output_.value);
  batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */true);
  if (bias_) {
    // Add the bias whenever it exists; gating this on getWGrad() would skip
    // the bias during inference or when the bias parameter is frozen.
    gate_.value->addBias(*(bias_->getW()), 1);
  }

  {
    int numBatch = batchValue_->getNumBatch();
    int batchSize = 0;
    AsyncGpuBlock asyncGpuBlock;
    for (int n = 0; n < numBatch; n++) {
      MatrixPtr outputValueTmp = batchValue_->getBatchValue(n);
      gruValue.outputValue = outputValueTmp->getData();
      gruValue.gateValue =
        (batchValue_->getBatchValue(*gate_.value, n))->getData();
      gruValue.resetOutputValue =
        (batchValue_->getBatchValue(*resetOutput_.value, n))->getData();

      batchSize = outputValueTmp->getHeight();
      gruValue.prevOutValue =
        (n == 0 ? nullptr
                : (batchValue_->getBatchValue(n - 1, batchSize))->getData());

      {
        if (useGpu_) {
          GruCompute::forward<1>(gruValue, getSize(), batchSize);
        } else {
          GruCompute::forward<0>(gruValue, getSize(), batchSize);
        }
      }
    }
  }
  {
    batchValue_->copyBackSeq(*output_.value);
  }
}
void KmaxSeqScoreLayer::forward(PassType passType) {
  Layer::forward(passType);

  const Argument& input = getInput(0);
  const MatrixPtr inputScore = getInputValue(0);

  CHECK(input.hasSeq() || input.hasSubseq())
      << "input of " << getName()
      << " must be a sequence or a nested sequence.";
  CHECK_EQ(input.value->getWidth(), 1UL)
      << "input of " << getName() << " are scores over a sequence or "
      << "a nested sequence, so its width must be 1.";

  if (useGpu_) {
    /*
     * Currently this layer only runs on CPU. If the rest of the model is
     * running on GPU, copy this layer's input from GPU to CPU first.
     */
    Matrix::resizeOrCreate(scores_,
                           inputScore->getHeight(),
                           1,
                           false /* trans */,
                           false /* useGpu */);
    scores_->copyFrom(*inputScore);
  } else {
    scores_ = inputScore;
  }

  /*
   * TODO(caoying)
   * In PaddlePaddle, all matrices currently hold real-valued types, but the
   * output of this layer consists of selected indices of the given sequence,
   * which are integers. Storing integer information in a real-valued matrix
   * is therefore fragile, since the values must later be converted back to
   * int.
   */
  Matrix::resizeOrCreate(
      output_.value,
      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
      beamSize_,
      false,
      false);
  output_.value->one();
  output_.value->mulScalar(-1.);

  kmaxScorePerSeq(scores_->getData(),
                  output_.value->getData(),
                  input.hasSubseq() ? input.subSequenceStartPositions
                                    : input.sequenceStartPositions);
}
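// Given the TODO above, a cautious way to read the selected indices back from
// the real-valued output is to round explicitly instead of truncating
// (hypothetical helper, assuming `real` is float). Unused slots keep the -1
// pre-fill written by output_.value->one() / mulScalar(-1.) above.
#include <cmath>
inline int kmaxIndexAt(const float* outputRow, int k) {
  return static_cast<int>(std::lround(outputRow[k]));  // -1 means "no index"
}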
void DeConv3DLayer::forward(PassType passType) {
  Layer::forward(passType);
  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
  int outWidth = getSize();
  resetOutput(batchSize, outWidth);
  const MatrixPtr outMat = getOutputValue();

  REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
  for (size_t i = 0; i != inputLayers_.size(); ++i) {
    const MatrixPtr &inMat = getInputValue(i);
    int M = M_[i];
    int N = N_[i];
    int K = K_[i];
    MatrixPtr wMat = weights_[i]->getW();
    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
    for (int n = 0; n < batchSize; ++n) {
      real *inData = inMat->getData() + n * inMat->getStride();
      for (int g = 0; g < groups_[i]; ++g) {
        MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
        MatrixPtr wMatSub = wMat->subMatrix(g * K, K);
        MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
        colBufDataSub->mul(*wMatSub, *inMatSub, 1.0, 0.0);
        inData += M * N;
      }
      colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(),
                       numFilters_,
                       imgSizeD_[i],
                       imgSizeH_[i],
                       imgSizeW_[i],
                       filterSizeZ_[i],
                       filterSizeY_[i],
                       filterSize_[i],
                       strideZ_[i],
                       strideY_[i],
                       stride_[i],
                       paddingZ_[i],
                       paddingY_[i],
                       padding_[i],
                       1.0,
                       1.0);
    }
  }
  if (nullptr != this->biasParameter_) {
    this->addBias();
  }
  forwardActivation();
}
void ConvProjection::backward(const UpdateCallback &callback) {
  REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str());

  void *workSpace = NULL;
  if (workSpaceInBytes_ > 0) {
    workSpace = getSpaceBytes(workSpaceInBytes_);
  }

  for (int g = 0; g < groups_; ++g) {
    real *outGrad = out_->grad->getData() + g * outputOffset_;
    if (weight_->getWGrad()) {
      real *inputData = in_->value->getData() + g * inputOffset_;
      real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
      hl_convolution_backward_filter(imageDesc_,
                                     inputData,
                                     outputDesc_,
                                     outGrad,
                                     filterDesc_,
                                     weightGrad,
                                     convDesc_,
                                     workSpace,
                                     bwdFilterLimitBytes_,
                                     bwdFilterAlgo_);
    }

    MatrixPtr preGrad = in_->grad;
    if (NULL != preGrad) {
      real *inputGrad = preGrad->getData() + g * inputOffset_;
      real *wgtData = weight_->getW()->getData() + g * weightOffset_;
      hl_convolution_backward_data(imageDesc_,
                                   inputGrad,
                                   outputDesc_,
                                   outGrad,
                                   filterDesc_,
                                   wgtData,
                                   convDesc_,
                                   workSpace,
                                   bwdDataLimitBytes_,
                                   bwdDataAlgo_);
    }
  }

  weight_->getParameterPtr()->incUpdate(callback);
}
void GatedRecurrentLayer::backwardBatch(int batchSize,
                                        MatrixPtr inputGrad) {
  REGISTER_TIMER_INFO("GruBwBatchTime", getName().c_str());
  hl_gru_value gruValue;
  gruValue.gateWeight = (gateWeight_->getW())->getData();
  gruValue.stateWeight = (stateWeight_->getW())->getData();

  hl_gru_grad gruGrad;
  gruGrad.gateWeightGrad =
    (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr);
  gruGrad.stateWeightGrad =
    (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData() : nullptr);

  if (!batchGrad_) {
    batchGrad_.reset(new SequenceToBatch(useGpu_));
  }
  batchGrad_->shareIndexWith(*batchValue_);

  {
    batchGrad_->copyFromSeq(*output_.grad);
  }

  {
    int numBatch = batchGrad_->getNumBatch();
    int batchSize = 0;
    AsyncGpuBlock asyncGpuBlock;
    for (int n = (int)numBatch - 1; n >= 0; n--) {
      gruValue.gateValue =
        (batchGrad_->getBatchValue(*gate_.value, n))->getData();
      gruValue.resetOutputValue =
        (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData();

      MatrixPtr outputGradTmp = batchGrad_->getBatchValue(n);
      gruGrad.outputGrad = outputGradTmp->getData();
      gruGrad.gateGrad =
        (batchGrad_->getBatchValue(*gate_.grad, n))->getData();
      gruGrad.resetOutputGrad =
        (batchGrad_->getBatchValue(*resetOutput_.grad, n))->getData();

      {
        batchSize = outputGradTmp->getHeight();
        gruValue.prevOutValue =
          (n == 0 ? nullptr
                  : (batchValue_->getBatchValue(n - 1, batchSize))->getData());
        gruGrad.prevOutGrad =
          (n == 0 ? nullptr
                  : (batchGrad_->getBatchValue(n - 1, batchSize))->getData());

        if (useGpu_) {
          GruCompute::backward<1>(gruValue, gruGrad, getSize(),
                                  batchSize);
        } else {
          GruCompute::backward<0>(gruValue, gruGrad, getSize(),
                                  batchSize);
        }
      }
    }
  }

  if (inputGrad) {
    batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */false);
  }
  if (bias_ && bias_->getWGrad()) {
    bias_->getWGrad()->collectBias(*gate_.grad, /* scale */ 1);
  }
}