Beispiel #1
0
/**
 * Command-line entry point: reads a text phrase table line by line and
 * writes it out through an OnDiskWrapper.
 *
 * Arguments (argv[1]..argv[7]):
 *   numSourceFactors numTargetFactors numScores tableLimit sortScoreIndex
 *   inputPath outputPath
 *
 * @return 0 on success; 1 on a usage error, an out-of-range sortScoreIndex,
 *         a failed BeginSave, or an input line that could not be read in full.
 */
int main (int argc, char * const argv[])
{
  Moses::ResetUserTime();
  Moses::PrintUserTime("Starting");

  if (argc != 8) {
    std::cerr << "Usage: " << argv[0] << " numSourceFactors numTargetFactors numScores tableLimit sortScoreIndex inputPath outputPath" << std::endl;
    return 1;
  }

  int numSourceFactors = Moses::Scan<int>(argv[1]);
  int numTargetFactors = Moses::Scan<int>(argv[2]);
  int numScores = Moses::Scan<int>(argv[3]);
  int tableLimit = Moses::Scan<int>(argv[4]);

  // Validate at runtime (not via assert) so the check survives NDEBUG builds.
  int sortScoreInd = Moses::Scan<int>(argv[5]);
  if (sortScoreInd < 0 || sortScoreInd >= numScores) {
    std::cerr << "sortScoreIndex (" << sortScoreInd << ") must be >= 0 and < numScores ("
              << numScores << ")" << std::endl;
    return 1;
  }
  TargetPhraseCollection::s_sortScoreInd = sortScoreInd;

  const std::string filePath = argv[6];
  const std::string destPath = argv[7];

  Moses::InputFileStream inStream(filePath);

  OnDiskWrapper onDiskWrapper;
  if (!onDiskWrapper.BeginSave(destPath, numSourceFactors, numTargetFactors, numScores)) {
    std::cerr << "BeginSave failed for " << destPath << std::endl;
    return 1;
  }

  PhraseNode &rootNode = onDiskWrapper.GetRootSourceNode();
  size_t lineNum = 0;

  // Fixed-size line buffer; Tokenize receives it as a char array.
  const size_t kMaxLineLength = 100000;
  char line[kMaxLineLength];

  while (inStream.getline(line, kMaxLineLength)) {
    lineNum++;

    // progress indicator on stderr
    if (lineNum%1000 == 0) std::cerr << "." << std::flush;
    if (lineNum%10000 == 0) std::cerr << ":" << std::flush;
    if (lineNum%100000 == 0) std::cerr << lineNum << std::flush;

    std::vector<float> misc(1);
    SourcePhrase sourcePhrase;
    // NOTE(review): the raw pointer is handed to AddTargetPhrase below, which
    // presumably takes ownership -- confirm against PhraseNode.
    TargetPhrase *targetPhrase = new TargetPhrase(numScores);
    OnDiskPt::PhrasePtr spShort = Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc);
    assert(misc.size() == onDiskWrapper.GetNumCounts());

    rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc, spShort);
  }

  // getline() also stops (failbit, no eofbit) when a line does not fit in the
  // buffer. Without this check the remainder of the input would be dropped
  // silently and a truncated table saved with a success exit code.
  if (!inStream.eof()) {
    std::cerr << std::endl
              << "Error reading " << filePath << " after line " << lineNum
              << ": line longer than " << (kMaxLineLength - 1)
              << " characters or read failure" << std::endl;
    return 1;
  }

  rootNode.Save(onDiskWrapper, 0, tableLimit);
  onDiskWrapper.EndSave();

  Moses::PrintUserTime("Finished");

  return 0;

} // main()
/**
 * Decodes the ind-th child record from the in-memory image of this node.
 *
 * The image (m_memLoad) starts with two UINT64 values and one float per
 * count, followed by fixed-size child records of (source word, UINT64).
 *
 * @param wordFound      receives the child's word (filled by ReadChild)
 * @param childFilePos   receives the child's file position (filled by ReadChild)
 * @param ind            zero-based index of the child record to decode
 * @param onDiskWrapper  supplies the source word size and the number of counts
 */
void PhraseNode::GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const
{
  // one child record = serialized word + file position of the child node
  const size_t recordSize = onDiskWrapper.GetSourceWordSize() + sizeof(UINT64);

  // bytes preceding the first child record:
  // size & file pos of target phrase coll, then the count info
  const size_t headerSize = 2 * sizeof(UINT64)
                            + onDiskWrapper.GetNumCounts() * sizeof(float);

  char *record = m_memLoad + headerSize + ind * recordSize;

  const size_t bytesRead = ReadChild(wordFound, childFilePos, record);
  CHECK(bytesRead == recordSize);
}
/**
 * Loads a previously saved node from the source file of onDiskWrapper.
 *
 * Reads the child count stored at filePos, sizes a buffer with GetNodeSize,
 * then reads the whole node into m_memLoad and extracts m_value (second
 * UINT64 of the image) and the single count that follows it.
 *
 * @param filePos        byte offset of the node in the source file
 * @param onDiskWrapper  supplies the file stream, source word size and
 *                       number of counts (must be exactly 1)
 */
PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper)
  :m_counts(onDiskWrapper.GetNumCounts())
{
  // load saved node
  m_filePos = filePos;

  size_t countSize = onDiskWrapper.GetNumCounts();

  std::fstream &file = onDiskWrapper.GetFileSource();
  file.seekg(filePos);
  CHECK(filePos == (UINT64)file.tellg());

  // the first UINT64 of a node is its number of children
  file.read((char*) &m_numChildrenLoad, sizeof(UINT64));
  // a failed/short read would leave the child count unset and the
  // allocation below would be sized from garbage
  CHECK(!file.fail());

  size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize);
  m_memLoad = (char*) malloc(memAlloc);
  // malloc returns NULL on failure; never read into a null buffer
  CHECK(m_memLoad != NULL);

  // go to start of node again
  file.seekg(filePos);
  CHECK(filePos == (UINT64)file.tellg());

  // read everything into memory
  file.read(m_memLoad, memAlloc);
  CHECK(filePos + memAlloc == (UINT64)file.tellg());

  // get value: second UINT64 of the node image
  m_value = ((UINT64*)m_memLoad)[1];

  // get counts: stored right after the two leading UINT64 values
  float *memFloat = (float*) (m_memLoad + sizeof(UINT64) * 2);

  CHECK(countSize == 1);
  m_counts[0] = memFloat[0];

  // one past the last byte of the loaded image
  m_memLoadLast = m_memLoad + memAlloc;
}
/**
 * Serializes this node, and recursively any unsaved children, to the source
 * file of onDiskWrapper.
 *
 * Node image layout, in write order:
 *   UINT64  number of children
 *   UINT64  file position of the node's target phrase collection
 *   float   count (exactly one count is supported)
 *   then per child: serialized word followed by the child's UINT64 file position
 *
 * Children are saved before this node is written because their file
 * positions are embedded in this node's image. On return the child map is
 * cleared and the node is marked as saved; saving twice trips a CHECK.
 *
 * @param onDiskWrapper  supplies the output stream, word size and count number
 * @param pos            value passed as pos + 1 to each child's Save; not
 *                       otherwise used here
 * @param tableLimit     forwarded to the target phrase collection's Sort and
 *                       to the children
 */
void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimit)
{
  CHECK(!m_saved);

  // save the target phrases of this node
  m_targetPhraseColl.Sort(tableLimit);
  m_targetPhraseColl.Save(onDiskWrapper);
  m_value = m_targetPhraseColl.GetFilePos();

  size_t numCounts = onDiskWrapper.GetNumCounts();

  size_t memAlloc = GetNodeSize(GetSize(), onDiskWrapper.GetSourceWordSize(), numCounts);
  char *mem = (char*) malloc(memAlloc);
  // malloc returns NULL on failure; never write through a null buffer
  CHECK(mem != NULL);

  size_t memUsed = 0;
  UINT64 *memArray = (UINT64*) mem;
  memArray[0] = GetSize(); // num of children
  memArray[1] = m_value;   // file pos of corresponding target phrases
  memUsed += 2 * sizeof(UINT64);

  // count info
  float *memFloat = (float*) (mem + memUsed);
  CHECK(numCounts == 1);
  // Original author's note: if there is no count, put in a very large number
  // (DEFAULT_COUNT) to make sure the entry is still used. HACK
  memFloat[0] = (m_counts.size() == 0) ? DEFAULT_COUNT : m_counts[0];
  memUsed += sizeof(float) * numCounts;

  // recursively save children
  ChildColl::iterator iter;
  for (iter = m_children.begin(); iter != m_children.end(); ++iter) {
    const Word &childWord = iter->first;
    PhraseNode &childNode = iter->second;

    // the child must be on disk first so that its file position is known
    if (!childNode.Saved())
      childNode.Save(onDiskWrapper, pos + 1, tableLimit);

    char *currMem = mem + memUsed;
    size_t wordMemUsed = childWord.WriteToMemory(currMem);
    memUsed += wordMemUsed;

    UINT64 *childPosMem = (UINT64*) (mem + memUsed);
    childPosMem[0] = childNode.GetFilePos();
    memUsed += sizeof(UINT64);
  }

  // the image must fill the buffer exactly
  CHECK(memUsed == memAlloc);

  // Append the node. Move to end-of-file BEFORE recording the position so
  // m_filePos is where the bytes actually land, regardless of where the put
  // pointer was left by earlier operations on this stream.
  std::fstream &file = onDiskWrapper.GetFileSource();
  file.seekp(0, std::ios::end);
  m_filePos = file.tellp();
  file.write(mem, memUsed);

  UINT64 endPos = file.tellp();
  CHECK(m_filePos + memUsed == endPos);

  free(mem);

  m_children.clear();
  m_saved = true;
}