/**
 * Populate this collection from the on-disk target-phrase files.
 *
 * A UINT64 phrase count is read from the target-collection file at filePos and
 * capped at tableLimit. For each phrase a new TargetPhrase is allocated,
 * ReadOtherInfoFromFile() is called with the running offset into the
 * collection file (it returns how many bytes that record occupies, which
 * advances the offset), ReadFromFile() is called with the target-index file,
 * and the phrase is appended to m_coll.
 *
 * If the count cannot be read, the collection is left unchanged.
 *
 * @param tableLimit     maximum number of phrases to load
 * @param filePos        offset of the collection record in the target-collection file
 * @param onDiskWrapper  supplies the two file streams and the score count
 */
void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnDiskWrapper &onDiskWrapper)
{
  fstream &fileTPColl = onDiskWrapper.GetFileTargetColl();
  fstream &fileTP = onDiskWrapper.GetFileTargetInd();

  size_t numScores = onDiskWrapper.GetNumScores();

  // Start from a defined value and verify the read: a failed or short read
  // would otherwise leave the loop bound indeterminate.
  UINT64 numPhrases = 0;

  UINT64 currFilePos = filePos;
  fileTPColl.seekg(filePos);
  if (!fileTPColl.read((char*) &numPhrases, sizeof(UINT64))) {
    return;
  }

  // table limit
  numPhrases = std::min(numPhrases, (UINT64) tableLimit);

  // per-phrase records follow directly after the count
  currFilePos += sizeof(UINT64);

  for (size_t ind = 0; ind < numPhrases; ++ind) {
    TargetPhrase *tp = new TargetPhrase(numScores);

    UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl);
    tp->ReadFromFile(fileTP);

    currFilePos += sizeOtherInfo;

    m_coll.push_back(tp);
  }
}
// Example no. 2
// 0
/**
 * Interactive query tool for an on-disk phrase table.
 *
 * Options:
 *   -tlimit N   maximum number of target phrases to fetch per query (default 20)
 *   -t PATH     phrase table to load (required)
 * Any other argument, a missing option value, or a missing -t calls usage().
 *
 * Each line read from stdin is split on single spaces and looked up. The
 * number of matches (or "Not found") goes to stdout; the matching target
 * phrases and progress messages go to stderr.
 */
int main(int argc, char **argv)
{
    int tableLimit = 20;
    std::string ttable = "";
    bool useAlignments = false;

    // command line
    for (int argInd = 1; argInd < argc; argInd++) {
        const bool isLastArg = (argInd + 1 == argc);

        if (strcmp(argv[argInd], "-tlimit") == 0) {
            if (isLastArg)
                usage();
            ++argInd;
            tableLimit = atoi(argv[argInd]);
        } else if (strcmp(argv[argInd], "-t") == 0) {
            if (isLastArg)
                usage();
            ++argInd;
            ttable = argv[argInd];
        } else {
            usage();
        }
    }

    if (ttable.empty())
        usage();

    // open the table
    OnDiskWrapper onDiskWrapper;
    bool retDb = onDiskWrapper.BeginLoad(ttable);
    CHECK(retDb);
    OnDiskQuery onDiskQuery(onDiskWrapper);

    cerr << "Ready..." << endl;

    // one query per input line
    std::string line;
    while (getline(std::cin, line)) {
        std::vector<std::string> tokens = Moses::Tokenize(line, " ");

        cerr << "line: " << line << endl;
        const PhraseNode* node = onDiskQuery.Query(tokens);

        if (!node) {
            cout << "Not found" << endl;
        } else {
            // source phrase points to a bunch of rules
            const TargetPhraseCollection *coll = node->GetTargetPhraseCollection(tableLimit, onDiskWrapper);
            string str = coll->GetDebugStr(); // result is not used further
            cout << "Found " << coll->GetSize() << endl;

            size_t ind = 0;
            while (ind < coll->GetSize()) {
                const TargetPhrase &targetPhrase = coll->GetTargetPhrase(ind);
                cerr << "  ";
                targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab());
                cerr << endl;
                ++ind;
            }
        }

        // blank line terminates each answer
        std::cout << '\n';
        std::cout.flush();
    }

    cerr << "Finished." << endl;
}
// Example no. 3
// 0
/**
 * Build an on-disk phrase table from a text phrase table.
 *
 * Arguments (all required):
 *   numSourceFactors numTargetFactors numScores tableLimit sortScoreIndex inputPath outputPath
 *
 * Every input line is tokenised into a source phrase and a target phrase and
 * added under the root source node; the tree is then saved and the wrapper
 * finalised.
 *
 * @return 0 on success, 1 on bad arguments or if the output cannot be set up.
 */
int main (int argc, char * const argv[])
{
  Moses::ResetUserTime();
  Moses::PrintUserTime("Starting");

  if (argc != 8) {
    std::cerr << "Usage: " << argv[0] << " numSourceFactors numTargetFactors numScores tableLimit sortScoreIndex inputPath outputPath" << std::endl;
    return 1;
  }

  int numSourceFactors = Moses::Scan<int>(argv[1]);
  int numTargetFactors = Moses::Scan<int>(argv[2]);
  int numScores = Moses::Scan<int>(argv[3]);
  int tableLimit = Moses::Scan<int>(argv[4]);
  TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]);

  // User input: validate at run time rather than with assert, which is
  // compiled out under NDEBUG.
  if (!(TargetPhraseCollection::s_sortScoreInd < numScores)) {
    std::cerr << "sortScoreIndex must be smaller than numScores" << std::endl;
    return 1;
  }

  const std::string filePath = argv[6];
  const std::string destPath = argv[7];

  Moses::InputFileStream inStream(filePath);

  OnDiskWrapper onDiskWrapper;
  bool retDb = onDiskWrapper.BeginSave(destPath, numSourceFactors, numTargetFactors, numScores);
  if (!retDb) {
    std::cerr << "Failed to initialise on-disk phrase table at " << destPath << std::endl;
    return 1;
  }

  PhraseNode &rootNode = onDiskWrapper.GetRootSourceNode();
  size_t lineNum = 0;

  // Read into a std::string so there is no line-length limit: with a fixed
  // char buffer, an over-long line makes getline fail and the loop would end
  // early, silently producing a truncated table.
  std::string line;
  // Writable, NUL-terminated copy handed to Tokenize, which previously
  // received a mutable char buffer.
  std::vector<char> lineBuf;

  while (getline(inStream, line)) {
    lineNum++;
    if (lineNum%1000 == 0) cerr << "." << flush;
    if (lineNum%10000 == 0) cerr << ":" << flush;
    if (lineNum%100000 == 0) cerr << lineNum << flush;

    lineBuf.assign(line.begin(), line.end());
    lineBuf.push_back('\0');

    std::vector<float> misc(1);
    SourcePhrase sourcePhrase;
    // passed by pointer to AddTargetPhrase below; not deleted here
    TargetPhrase *targetPhrase = new TargetPhrase(numScores);
    OnDiskPt::PhrasePtr spShort = Tokenize(sourcePhrase, *targetPhrase, &lineBuf[0], onDiskWrapper, numScores, misc);
    assert(misc.size() == onDiskWrapper.GetNumCounts());

    rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc, spShort);
  }

  rootNode.Save(onDiskWrapper, 0, tableLimit);
  onDiskWrapper.EndSave();

  Moses::PrintUserTime("Finished");

  return 0;

} // main()
/**
 * Decode the ind-th child record from this node's in-memory image (m_memLoad).
 *
 * The image starts with two UINT64 values and one float per count; the child
 * records follow, each made of a source word plus a UINT64.
 *
 * @param wordFound     receives the child's word (filled in by ReadChild)
 * @param childFilePos  receives the UINT64 stored with the child
 * @param ind           zero-based index of the child record
 * @param onDiskWrapper supplies the source word size and the number of counts
 */
void PhraseNode::GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const
{
  // bytes occupied by a single child record
  const size_t bytesPerChild = onDiskWrapper.GetSourceWordSize() + sizeof(UINT64);

  // bytes preceding the first child record:
  // size & file pos of target phrase coll, then the count info
  const size_t headerBytes = sizeof(UINT64) * 2
                             + sizeof(float) * onDiskWrapper.GetNumCounts();

  char *childMem = m_memLoad + headerBytes + bytesPerChild * ind;

  const size_t bytesRead = ReadChild(wordFound, childFilePos, childMem);
  CHECK(bytesRead == bytesPerChild);
}
// Example no. 5
// 0
/**
 * Load the vocabulary from the wrapper's vocab file.
 *
 * Each line must tokenise into exactly two fields: the word and its numeric
 * id. The pairs are stored in m_vocabColl, then m_lookup is built so that
 * m_lookup[id] yields the word.
 *
 * @return true on success; false if a line does not have exactly two fields
 *         (only reachable when assert is compiled out).
 */
bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
{
	fstream &file = onDiskWrapper.GetFileVocab();
	
	string line;
	while(getline(file, line))
	{
		vector<string> tokens;
		Moses::Tokenize(tokens, line);
		assert(tokens.size() == 2);
		if (tokens.size() != 2)
		{ // with asserts compiled out, never index past the parsed tokens
			return false;
		}
		const string &key = tokens[0];
		m_vocabColl[key] =  Moses::Scan<UINT64>(tokens[1]);
	}
	
	// create lookup
	// ids are expected to be contiguous, so this is normally large enough
	m_lookup.resize(m_vocabColl.size() + 1);
	
	CollType::const_iterator iter;
	for (iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter)
	{
		// keep the full width the id was parsed with (UINT64)
		const UINT64 vocabId = iter->second;
		const std::string &word = iter->first;
		
		// ids outside the expected range would otherwise write past the end
		if (vocabId >= m_lookup.size())
		{
			m_lookup.resize(static_cast<size_t>(vocabId) + 1);
		}
		
		m_lookup[vocabId] = word;
	}
	
	return true;
}
// Example no. 6
// 0
/**
 * Write the vocabulary to the wrapper's vocab file, one "word id" pair per
 * line, in m_vocabColl iteration order.
 */
void Vocab::Save(OnDiskWrapper &onDiskWrapper)
{
  fstream &file = onDiskWrapper.GetFileVocab();
  CollType::const_iterator iterVocab;
  for (iterVocab = m_vocabColl.begin(); iterVocab != m_vocabColl.end(); ++iterVocab) {
    const string &word = iterVocab->first;
    // Load() parses ids as UINT64; write them at the same width rather than
    // narrowing to 32 bits.
    const UINT64 vocabId = iterVocab->second;

    file << word << " " << vocabId << endl;
  }
}
/**
 * Construct a node by loading its saved image from the source file.
 *
 * The first UINT64 at filePos (the stored number of children) is read to
 * work out the size of the node, then the complete node is read into a
 * malloc'ed buffer (m_memLoad). The second UINT64 of the image becomes
 * m_value and the float that follows becomes the single count.
 *
 * @param filePos        offset of the node in the source file
 * @param onDiskWrapper  supplies the source file, word size and count size
 */
PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper)
  :m_counts(onDiskWrapper.GetNumCounts())
{
  m_filePos = filePos;

  const size_t numCounts = onDiskWrapper.GetNumCounts();
  std::fstream &srcFile = onDiskWrapper.GetFileSource();

  // step 1: read only the child count, which determines the node size
  srcFile.seekg(filePos);
  CHECK(filePos == (UINT64)srcFile.tellg());
  srcFile.read((char*) &m_numChildrenLoad, sizeof(UINT64));

  const size_t nodeBytes = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), numCounts);
  m_memLoad = (char*) malloc(nodeBytes);

  // step 2: rewind to the node start and pull the whole image into memory
  srcFile.seekg(filePos);
  CHECK(filePos == (UINT64)srcFile.tellg());
  srcFile.read(m_memLoad, nodeBytes);
  CHECK(filePos + nodeBytes == (UINT64)srcFile.tellg());

  // step 3: decode the fixed-size header
  const UINT64 *header = reinterpret_cast<const UINT64*>(m_memLoad);
  m_value = header[1];

  const float *countMem = reinterpret_cast<const float*>(m_memLoad + sizeof(UINT64) * 2);

  // only a single count is supported
  CHECK(numCounts == 1);
  m_counts[0] = countMem[0];

  // one past the last byte of the loaded image
  m_memLoadLast = m_memLoad + nodeBytes;
}
/**
 * Append this collection to the target-collection file.
 *
 * Each target phrase is first saved through TargetPhrase::Save(). The record
 * written here is a uint64_t phrase count followed by the bytes returned by
 * each phrase's WriteOtherInfoToMemory(). The offset at which the record
 * starts is stored in m_filePos.
 */
void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
{
  std::fstream &file = onDiskWrapper.GetFileTargetColl();

  size_t memUsed = sizeof(uint64_t);
  char *mem = (char*) malloc(memUsed);

  // size of coll
  uint64_t numPhrases = GetSize();
  ((uint64_t*)mem)[0] = numPhrases;

  // MAIN LOOP
  CollType::iterator iter;
  for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
    // save phrase
    TargetPhrase &targetPhrase = **iter;
    targetPhrase.Save(onDiskWrapper);

    // save coll
    size_t memUsedTPOtherInfo;
    char *memTPOtherInfo = targetPhrase.WriteOtherInfoToMemory(onDiskWrapper, memUsedTPOtherInfo);

    // expand existing mem and append this phrase's bytes
    mem = (char*) realloc(mem, memUsed + memUsedTPOtherInfo);
    memcpy(mem + memUsed, memTPOtherInfo, memUsedTPOtherInfo);
    memUsed += memUsedTPOtherInfo;

    free(memTPOtherInfo);
  }

  // Seek to the end BEFORE sampling the position: the record is appended, so
  // its start offset is the end of the file, wherever the put pointer was.
  file.seekp(0, ios::end);
  uint64_t startPos = file.tellp();
  file.write((char*) mem, memUsed);

  free(mem);

#ifndef NDEBUG
  uint64_t endPos = file.tellp();
  assert(startPos + memUsed == endPos);
#endif
  m_filePos = startPos;

}
// Example no. 9
// 0
/**
 * Append this target phrase to the target-index file.
 *
 * The bytes come from WriteToMemory(); the offset at which they are written
 * is stored in m_filePos.
 */
void TargetPhrase::Save(OnDiskWrapper &onDiskWrapper)
{
  // save in target ind
  size_t memUsed;
  char *mem = WriteToMemory(onDiskWrapper, memUsed);

  std::fstream &file = onDiskWrapper.GetFileTargetInd();

  // Seek to the end BEFORE sampling the position, so the recorded offset is
  // where the bytes really go even if the put pointer was elsewhere.
  file.seekp(0, ios::end);
  UINT64 startPos = file.tellp();

  file.write(mem, memUsed);

  UINT64 endPos = file.tellp();
  CHECK(startPos + memUsed == endPos);

  m_filePos = startPos;
  free(mem);
}
/**
 * Save this node, and recursively any unsaved children, to the source file.
 *
 * The node's target phrases are sorted (with tableLimit) and saved first.
 * The node image written afterwards is:
 *   UINT64  number of children
 *   UINT64  file position of the target phrase collection
 *   float   count (exactly one count is supported)
 *   then, per child: the child's word followed by a UINT64 with the child's
 *   file position.
 * The offset of the image is stored in m_filePos. On return the children are
 * cleared and the node is marked as saved.
 *
 * @param onDiskWrapper  supplies the source file, word size and count size
 * @param pos            passed as pos + 1 to each child; not otherwise used here
 * @param tableLimit     forwarded to the sort and to the children
 */
void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimit)
{
  CHECK(!m_saved);

  // save this node's target phrases
  m_targetPhraseColl.Sort(tableLimit);
  m_targetPhraseColl.Save(onDiskWrapper);
  m_value = m_targetPhraseColl.GetFilePos();

  size_t numCounts = onDiskWrapper.GetNumCounts();

  size_t memAlloc = GetNodeSize(GetSize(), onDiskWrapper.GetSourceWordSize(), numCounts);
  char *mem = (char*) malloc(memAlloc);
  //memset(mem, 0xfe, memAlloc);

  size_t memUsed = 0;
  UINT64 *memArray = (UINT64*) mem;
  memArray[0] = GetSize(); // num of children
  memArray[1] = m_value;   // file pos of corresponding target phrases
  memUsed += 2 * sizeof(UINT64);

  // count info
  float *memFloat = (float*) (mem + memUsed);
  CHECK(numCounts == 1);
  memFloat[0] = (m_counts.size() == 0) ? DEFAULT_COUNT : m_counts[0]; // if count = 0, put in very large num to make sure its still used. HACK
  memUsed += sizeof(float) * numCounts;

  // recursively save children
  ChildColl::iterator iter;
  for (iter = m_children.begin(); iter != m_children.end(); ++iter) {
    const Word &childWord = iter->first;
    PhraseNode &childNode = iter->second;

    // recursive: a child must be on disk before its file position is recorded
    if (!childNode.Saved())
      childNode.Save(onDiskWrapper, pos + 1, tableLimit);

    char *currMem = mem + memUsed;
    size_t wordMemUsed = childWord.WriteToMemory(currMem);
    memUsed += wordMemUsed;

    // NOTE(review): this store is at a byte offset that may not be a multiple
    // of sizeof(UINT64) - confirm the target platforms tolerate that.
    UINT64 *childPosMem = (UINT64*) (mem + memUsed);
    childPosMem[0] = childNode.GetFilePos();
    memUsed += sizeof(UINT64);

  }

  // save this node
  //Moses::DebugMem(mem, memAlloc);
  CHECK(memUsed == memAlloc);

  std::fstream &file = onDiskWrapper.GetFileSource();
  // Seek to the end BEFORE sampling the position, so m_filePos is where the
  // node really lands even if the put pointer was elsewhere.
  file.seekp(0, ios::end);
  m_filePos = file.tellp();
  file.write(mem, memUsed);

  UINT64 endPos = file.tellp();
  CHECK(m_filePos + memUsed == endPos);

  free(mem);

  m_children.clear();
  m_saved = true;
}
/**
 * Interactive query tool that walks the on-disk source trie by hand.
 *
 * Options:
 *   -tlimit N   maximum number of target phrases to fetch per query (default 20)
 *   -t PATH     phrase table to load (required)
 * Any other argument, a missing option value, or a missing -t calls usage().
 *
 * Each stdin line is split on single spaces. Every token is added to a source
 * phrase, the last one with a different flag from the others. The trie is
 * then descended word by word from the root. The number of matches (or
 * "Not found") goes to stdout; debugging output and the matching target
 * phrases go to stderr.
 */
int main(int argc, char **argv)
{
  int tableLimit = 20;
  std::string ttable = "";
  bool useAlignments = false;

  // command line
  for (int argInd = 1; argInd < argc; argInd++) {
    const bool isLastArg = (argInd + 1 == argc);

    if (strcmp(argv[argInd], "-tlimit") == 0) {
      if (isLastArg)
        usage();
      ++argInd;
      tableLimit = atoi(argv[argInd]);
    } else if (strcmp(argv[argInd], "-t") == 0) {
      if (isLastArg)
        usage();
      ++argInd;
      ttable = argv[argInd];
    } else {
      usage();
    }
  }

  if (ttable.empty())
    usage();

  OnDiskWrapper onDiskWrapper;
  bool retDb = onDiskWrapper.BeginLoad(ttable);
  CHECK(retDb);

  cerr << "Ready..." << endl;

  std::string line;
  while (getline(std::cin, line)) {
    std::vector<std::string> tokens = Moses::Tokenize(line, " ");

    cerr << "line: " << line << endl;

    // create source phrase
    SourcePhrase sourcePhrase;
    const size_t numTokens = tokens.size();
    for (size_t pos = 0; pos < numTokens; ++pos) {
      const string &tok = tokens[pos];

      // last position is the LHS non-term and gets `false`; all others `true`
      const bool isLastToken = (pos + 1 == numTokens);
      Tokenize(sourcePhrase, tok, !isLastToken, true, onDiskWrapper);
    }

    // descend from the root, one word at a time
    const PhraseNode *node = &onDiskWrapper.GetRootSourceNode();
    cerr << "node=" << node << endl;
    assert(node);

    for (size_t pos = 0; node != NULL && pos < sourcePhrase.GetSize(); ++pos) {
      const Word &word = sourcePhrase.GetWord(pos);
      cerr << word << " ";
      node = node->GetChild(word, onDiskWrapper);
      cerr << "node=" << node << endl;
    }

    if (node == NULL) {
      cout << "Not found" << endl;
    } else {
      // source phrase points to a bunch of rules
      const TargetPhraseCollection *coll = node->GetTargetPhraseCollection(tableLimit, onDiskWrapper);
      string str = coll->GetDebugStr(); // result is not used further
      cout << "Found " << coll->GetSize() << endl;

      size_t ind = 0;
      while (ind < coll->GetSize()) {
        const TargetPhrase &targetPhrase = coll->GetTargetPhrase(ind);
        cerr << "  ";
        targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab());
        cerr << endl;
        ++ind;
      }
    }

    // blank line terminates each answer
    std::cout << '\n';
    std::cout.flush();
  }

  cerr << "Finished." << endl;

}