Example #1: testProtoSequenceDataProvider
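// Exercises the "proto_sequence" DataProvider: prepares an in-memory DataBatch,
// writes it to proto files, then reads it back batch by batch with shuffling
// disabled.  The reference data is iid (no sequence start positions), while
// every batch returned by the provider is expected to carry sequence
// information; samples are compared with checkSampleSequence().
// (prepareData, writeData, checkSampleSequence, and the kTestDir/kProtoFileList
// constants are assumed to be provided by the surrounding test code.)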
void testProtoSequenceDataProvider(int* numPerSlotType, bool async,
                                   bool useGpu) {
  mkDir(kTestDir);
  DataBatch data;

  prepareData(&data, numPerSlotType,
              /* iid */ true, useGpu);
  writeData(data, useGpu, /* dataCompression */ false);

  DataConfig config;
  config.set_type("proto_sequence");
  config.set_files(kProtoFileList);
  config.set_async_load_data(async);

  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
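  // Read the data back in its original order (no shuffling) so that samples
  // can be matched one-to-one against the reference batch.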
  dataProvider->setSkipShuffle();

  EXPECT_EQ(data.getSize(), dataProvider->getSize());

  int64_t batchSize = 10;
  DataBatch batch;

  vector<Argument>& args1 = data.getStreams();
  ICpuGpuVectorPtr sequenceStartPositions1 =
      args1[0].sequenceStartPositions;

  dataProvider->reset();

  size_t args1Offset = 0;
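  // Consume the whole dataset batch by batch; args1Offset tracks how many
  // sequences of the reference data have been verified so far.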
  while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
    CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
    vector<Argument>& args2 = batch.getStreams();
    ICpuGpuVectorPtr sequenceStartPositions2 =
        args2[0].sequenceStartPositions;
    for (auto& arg : args1) {
      // args1 (the reference data) should not have sequence start positions
      EXPECT_EQ(true, !arg.sequenceStartPositions);
    }
    for (auto& arg : args2) {
      // args2 (the provider output) should have sequence start positions
      EXPECT_NE(true, !arg.sequenceStartPositions);
    }
    size_t numSeqs = batch.getNumSequences();
    checkSampleSequence(args1, args2, args1Offset, numSeqs, useGpu);
    args1Offset += numSeqs;
  }

  EXPECT_EQ(args1Offset, (size_t)data.getNumSequences());
  rmDir(kTestDir);
}
Example #2: writeData
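// Serializes the DataBatch into the test proto files: a DataHeader describing
// each slot, then one DataSample per position.  Sequences are split across the
// output files as evenly as possible; the compressed file list is used when
// dataCompression is set.  (makeSample, getSlotType/getSlotDim, and the
// protoFiles/protoFilesCompressed lists are assumed to be provided by the
// surrounding test code.)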
void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) {
  DataHeader header;
  const vector<Argument>& arguments = batch.getStreams();
  for (auto& argument : arguments) {
    SlotDef* slotDef = header.add_slot_defs();
    slotDef->set_type(getSlotType(argument));
    slotDef->set_dim(getSlotDim(argument));
  }
  VLOG(1) << "header=" << header.DebugString();

  int64_t totalSeqs = batch.getNumSequences();
  int64_t seq = 0;
  ICpuGpuVectorPtr sequenceStartPositions =
      arguments[0].sequenceStartPositions;
  int64_t numWritten = 0;
  vector<string> curProtoFiles =
      dataCompression ? protoFilesCompressed : protoFiles;
  for (size_t i = 0; i < curProtoFiles.size(); ++i) {
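    // This file's share of the sequences: the difference of two rounded-down
    // prefix sums spreads totalSeqs across the files as evenly as possible.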
    int64_t numSeqs = totalSeqs * (i + 1) / curProtoFiles.size() -
                      totalSeqs * i / curProtoFiles.size();
    ofstream os(curProtoFiles[i]);
    CHECK(os) << "Fail to open " << curProtoFiles[i];
    unique_ptr<ProtoWriter> writer(new ProtoWriter(&os, dataCompression));
    CHECK(writer->write(header));
    for (int j = 0; j < numSeqs; ++j, ++seq) {
      int64_t begin = seq;
      int64_t end = seq + 1;
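      // For sequence data, [begin, end) spans the whole sequence; otherwise
      // each sample is written as its own sequence of length one.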
      if (sequenceStartPositions) {
        begin = sequenceStartPositions->getElement(seq);
        end = sequenceStartPositions->getElement(seq + 1);
      }
      for (int64_t pos = begin; pos < end; ++pos) {
        DataSample sample;
        makeSample(arguments, pos, pos == begin, &sample, useGpu);
        CHECK(writer->write(sample));
        ++numWritten;
      }
    }

    writer.reset(nullptr);
    os.close();
  }
  CHECK_EQ(arguments[0].getBatchSize(), numWritten);
}
Example #3: testProtoDataProvider
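// Exercises the plain "proto" DataProvider with iid or sequence data, optional
// async loading, optional compression, and optional constant slots.  The data
// written by writeData() is read back batch by batch, and every sequence and
// sample is compared against the reference DataBatch.
// (prepareData, writeData, checkSample, and the kTestDir/kProtoFileList*
// constants are assumed to be provided by the surrounding test code.)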
void testProtoDataProvider(int* numPerSlotType, bool iid, bool async,
                           bool useGpu, bool dataCompression,
                           int numConstantSlots = 0) {
  mkDir(kTestDir);
  DataBatch data;

  prepareData(&data, numPerSlotType, iid, useGpu);
  writeData(data, useGpu, dataCompression);

  DataConfig config;
  config.set_type("proto");
  config.set_files(dataCompression ? kProtoFileListCompressed : kProtoFileList);
  config.set_async_load_data(async);

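  // Optionally append constant slots: slot i always holds the value i + 11,
  // both in the config and in the reference data it is checked against.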
  for (int i = 0; i < numConstantSlots; ++i) {
    config.add_constant_slots(i + 11);
    MatrixPtr w = Matrix::create(data.getSize(), 1, /* trans= */ false,
                                 /* useGpu= */ false);
    w->assign(config.constant_slots(i));
    data.appendData(w);
  }

  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
  dataProvider->setSkipShuffle();

  EXPECT_EQ(data.getSize(), dataProvider->getSize());

  int64_t batchSize = 10;
  DataBatch batch;

  size_t seq1 = 0;
  vector<Argument>& args1 = data.getStreams();
  ICpuGpuVectorPtr sequenceStartPositions1 =
      args1[0].sequenceStartPositions;

  dataProvider->reset();

  while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
    CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
    vector<Argument>& args2 = batch.getStreams();
    ICpuGpuVectorPtr sequenceStartPositions2 =
        args2[0].sequenceStartPositions;
    for (auto& arg : args2) {
      EXPECT_EQ(iid, !arg.sequenceStartPositions);
    }
    size_t numSeqs = batch.getNumSequences();
    VLOG(1) << "numSeqs=" << numSeqs;
    for (size_t seq2 = 0; seq2 < numSeqs; ++seq1, ++seq2) {
      int64_t begin1 = seq1;
      int64_t end1 = seq1 + 1;
      if (sequenceStartPositions1) {
        begin1 = sequenceStartPositions1->getElement(seq1);
        end1 = sequenceStartPositions1->getElement(seq1 + 1);
        EXPECT_LT(seq1, sequenceStartPositions1->getSize() - 1);
      }

      int64_t begin2 = seq2;
      int64_t end2 = seq2 + 1;
      if (sequenceStartPositions2) {
        begin2 = sequenceStartPositions2->getElement(seq2);
        end2 = sequenceStartPositions2->getElement(seq2 + 1);
      }
      VLOG(1) << " begin1=" << begin1 << " end1=" << end1
              << " begin2=" << begin2 << " end2=" << end2;
      EXPECT_EQ(end1 - begin1, end2 - begin2);
      for (int i = 0; i < end1 - begin1; ++i) {
        checkSample(args1, begin1 + i, args2, begin2 + i, useGpu);
      }
    }
  }

  EXPECT_EQ(seq1, (size_t)data.getNumSequences());
  rmDir(kTestDir);
}