void RuleTableLoaderCompact::LoadPhraseSection( LineReader &reader, const std::vector<Word> &vocab, std::vector<Phrase> &rhsPhrases, std::vector<size_t> &lhsIds) { // Read phrase count. reader.ReadLine(); const size_t phraseCount = std::atoi(reader.m_line.c_str()); // Reads lines, storing Phrase object for each RHS and vocab ID for each LHS. rhsPhrases.resize(phraseCount, Phrase(0)); lhsIds.resize(phraseCount); std::vector<size_t> tokenPositions; for (size_t i = 0; i < phraseCount; ++i) { reader.ReadLine(); tokenPositions.clear(); FindTokens(tokenPositions, reader.m_line); const char *charLine = reader.m_line.c_str(); lhsIds[i] = std::atoi(charLine+tokenPositions[0]); for (size_t j = 1; j < tokenPositions.size(); ++j) { rhsPhrases[i].AddWord(vocab[std::atoi(charLine+tokenPositions[j])]); } } }
void RunInputThread() { RString line; LineReader * linereader; LOG->Trace("Input thread started; getting line reader"); linereader = getLineReader(); if(linereader == NULL) { LOG->Warn("Could not open line reader for SextetStream input"); } else { LOG->Trace("Got line reader"); while(continueInputThread) { LOG->Trace("Reading line"); if(linereader->ReadLine(line)) { LOG->Trace("Got line: '%s'", line.c_str()); if(line.length() > 0) { uint8_t newStateBuffer[STATE_BUFFER_SIZE]; GetNewState(newStateBuffer, line); ReactToChanges(newStateBuffer); } } else { // Error or EOF condition. LOG->Trace("Reached end of SextetStream input"); continueInputThread = false; } } LOG->Info("SextetStream input stopped"); delete linereader; } }
// Reads the vocabulary section of the compact rule table.
//
// The first line read holds the number of symbols; each following line holds
// one symbol. A symbol written as "[...]" is flagged as a non-terminal and
// has its enclosing brackets removed before the Word is created.
//
// reader       line source; one line is consumed for the count plus one per
//              symbol (reader.m_line is overwritten for bracketed symbols)
// factorTypes  forwarded unchanged to Word::CreateFromString
// vocabulary   resized to the symbol count and filled in file order
void RuleTableLoaderCompact::LoadVocabularySection(
  LineReader &reader,
  const std::vector<FactorType> &factorTypes,
  std::vector<Word> &vocabulary)
{
  // Section header: number of symbol lines that follow.
  reader.ReadLine();
  const size_t numSymbols = std::atoi(reader.m_line.c_str());

  vocabulary.resize(numSymbols);

  for (size_t symbolIdx = 0; symbolIdx < numSymbols; ++symbolIdx) {
    reader.ReadLine();
    const size_t lineLength = reader.m_line.size();

    // The closing bracket is only inspected once the opening one matched.
    bool bracketed = false;
    if (reader.m_line[0] == '[') {
      bracketed = (reader.m_line[lineLength - 1] == ']');
    }

    if (bracketed) {
      // Keep only what sits between '[' and ']'.
      reader.m_line = reader.m_line.substr(1, lineLength - 2);
    }

    Word &entry = vocabulary[symbolIdx];
    entry.CreateFromString(Input, factorTypes, reader.m_line, bracketed);
  }
}
void RuleTableLoaderCompact::LoadAlignmentSection( LineReader &reader, std::vector<const AlignmentInfo *> &alignmentSets, std::vector<Phrase> &sourcePhrases) { // Read alignment set count. reader.ReadLine(); const size_t alignmentSetCount = std::atoi(reader.m_line.c_str()); alignmentSets.resize(alignmentSetCount * 2); AlignmentInfo::CollType alignTerm, alignNonTerm; std::vector<std::string> tokens; std::vector<size_t> points; for (size_t i = 0; i < alignmentSetCount; ++i) { // Read alignment set, lookup in collection, and store pointer. alignTerm.clear(); alignNonTerm.clear(); tokens.clear(); reader.ReadLine(); Tokenize(tokens, reader.m_line); std::vector<std::string>::const_iterator p; for (p = tokens.begin(); p != tokens.end(); ++p) { points.clear(); Tokenize<size_t>(points, *p, "-"); std::pair<size_t, size_t> alignmentPair(points[0], points[1]); if (sourcePhrases[i].GetWord(alignmentPair.first).IsNonTerminal()) { alignNonTerm.insert(alignmentPair); } else { alignTerm.insert(alignmentPair); } } alignmentSets[i*2] = AlignmentInfoCollection::Instance().Add(alignNonTerm); alignmentSets[i*2 + 1] = AlignmentInfoCollection::Instance().Add(alignTerm); } }
bool RuleTableLoaderCompact::LoadRuleSection( LineReader &reader, const std::vector<Word> &vocab, const std::vector<Phrase> &sourcePhrases, const std::vector<Phrase> &targetPhrases, const std::vector<size_t> &targetLhsIds, const std::vector<const AlignmentInfo *> &alignmentSets, RuleTableTrie &ruleTable) { // Read rule count. reader.ReadLine(); const size_t ruleCount = std::atoi(reader.m_line.c_str()); // Read rules and add to table. const size_t numScoreComponents = ruleTable.GetNumScoreComponents(); std::vector<float> scoreVector(numScoreComponents); std::vector<size_t> tokenPositions; for (size_t i = 0; i < ruleCount; ++i) { reader.ReadLine(); tokenPositions.clear(); FindTokens(tokenPositions, reader.m_line); const char *charLine = reader.m_line.c_str(); // The first three tokens are IDs for the source phrase, target phrase, // and alignment set. const int sourcePhraseId = std::atoi(charLine+tokenPositions[0]); const int targetPhraseId = std::atoi(charLine+tokenPositions[1]); const int alignmentSetId = std::atoi(charLine+tokenPositions[2]); const Phrase &sourcePhrase = sourcePhrases[sourcePhraseId]; const Phrase &targetPhrasePhrase = targetPhrases[targetPhraseId]; const Word *targetLhs = new Word(vocab[targetLhsIds[targetPhraseId]]); Word sourceLHS("X"); // TODO not implemented for compact const AlignmentInfo *alignNonTerm = alignmentSets[alignmentSetId]; // Then there should be one score for each score component. for (size_t j = 0; j < numScoreComponents; ++j) { float score = std::atof(charLine+tokenPositions[3+j]); scoreVector[j] = FloorScore(TransformScore(score)); } if (reader.m_line[tokenPositions[3+numScoreComponents]] != ':') { std::stringstream msg; msg << "Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << reader.m_lineNum; UserMessage::Add(msg.str()); return false; } // The remaining columns are currently ignored. // Create and score target phrase. 
TargetPhrase *targetPhrase = new TargetPhrase(targetPhrasePhrase); targetPhrase->SetAlignNonTerm(alignNonTerm); targetPhrase->SetTargetLHS(targetLhs); targetPhrase->SetSourcePhrase(sourcePhrase); targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply()); // Insert rule into table. TargetPhraseCollection &coll = GetOrCreateTargetPhraseCollection( ruleTable, sourcePhrase, *targetPhrase, &sourceLHS); coll.Add(targetPhrase); } return true; }