// Initialize context parameters from the command-line parameters
// "context-string" and "context-window". The window specification may be:
//   "<n>"             : look n words back AND n words ahead
//   "-<n>"            : look n words back only
//   "+<n>"            : look n words ahead only
//   "+-<n>" / "-+<n>" : look n words in both directions
// NOTE(review): a spec containing no digit at all (p == npos), or with more
// than two characters before the first digit, is silently ignored rather
// than rejected — confirm this is intended.
void
ContextParameters::
init(Parameter& params)
{
  look_back = look_ahead = 0;
  params.SetParameter(context_string, "context-string", std::string(""));
  std::string context_window;
  params.SetParameter(context_window, "context-window", std::string(""));
  if (context_window == "")
    return;
  // Position of the first digit decides how the sign prefix is interpreted.
  size_t p = context_window.find_first_of("0123456789");
  if (p == 0)
    look_back = look_ahead = atoi(context_window.c_str());
  if (p == 1) {
    if (context_window[0] == '-')
      look_back = atoi(context_window.substr(1).c_str());
    else if (context_window[0] == '+')
      look_ahead = atoi(context_window.substr(1).c_str());
    else
      UTIL_THROW2("Invalid specification of context window.");
  }
  if (p == 2) {
    if (context_window.substr(0,2) == "+-" ||
        context_window.substr(0,2) == "-+")
      look_back = look_ahead = atoi(context_window.substr(p).c_str());
    else
      UTIL_THROW2("Invalid specification of context window.");
  }
}
// Factory for the KenLM feature function: probe the model file for its
// binary format and instantiate the KENLM wrapper templated on the matching
// KenLM model class. Files that are not recognized as KenLM binary (e.g.
// ARPA text) fall back to the probing model, which KenLM can build from text.
FeatureFunction *ConstructKenLM(size_t startInd, const std::string &line,
                                const std::string &file, FactorType factorType,
                                util::LoadMethod load_method)
{
  lm::ngram::ModelType model_type;
  if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
    switch (model_type) {
    case lm::ngram::PROBING:
      return new KENLM<lm::ngram::ProbingModel>(startInd, line, file, factorType, load_method);
    case lm::ngram::REST_PROBING:
      return new KENLM<lm::ngram::RestProbingModel>(startInd, line, file, factorType, load_method);
    case lm::ngram::TRIE:
      return new KENLM<lm::ngram::TrieModel>(startInd, line, file, factorType, load_method);
    case lm::ngram::QUANT_TRIE:
      return new KENLM<lm::ngram::QuantTrieModel>(startInd, line, file, factorType, load_method);
    case lm::ngram::ARRAY_TRIE:
      return new KENLM<lm::ngram::ArrayTrieModel>(startInd, line, file, factorType, load_method);
    case lm::ngram::QUANT_ARRAY_TRIE:
      return new KENLM<lm::ngram::QuantArrayTrieModel>(startInd, line, file, factorType, load_method);
    default:
      UTIL_THROW2("Unrecognized kenlm model type " << model_type);
    }
  } else {
    // Not a KenLM binary file: assume text, load into a probing model.
    return new KENLM<lm::ngram::ProbingModel>(startInd, line, file, factorType, load_method);
  }
}
// Generate the target tree of the derivation d. TreePointer KBestExtractor::GetOutputTree(const Derivation &d) { const TargetPhrase &phrase = *(d.edge->shyperedge.label.translation); if (const PhraseProperty *property = phrase.GetProperty("Tree")) { const std::string *tree = property->GetValueString(); TreePointer mytree (boost::make_shared<InternalTree>(*tree)); //get subtrees (in target order) std::vector<TreePointer> previous_trees; for (size_t pos = 0; pos < phrase.GetSize(); ++pos) { const Word &word = phrase.GetWord(pos); if (word.IsNonTerminal()) { size_t nonTermInd = phrase.GetAlignNonTerm().GetNonTermIndexMap()[pos]; const Derivation &subderivation = *d.subderivations[nonTermInd]; const TreePointer prev_tree = GetOutputTree(subderivation); previous_trees.push_back(prev_tree); } } mytree->Combine(previous_trees); return mytree; } else { UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found"); } }
// Factory for the coarse language model: detect the KenLM binary format of
// the file and instantiate the CoarseLMModel wrapper templated on the
// matching model class. Unrecognized (non-binary) files fall back to the
// probing model.
LM* ConstructCoarseLM(const std::string &file)
{
  lm::ngram::ModelType model_type;
  if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
    switch(model_type) {
    case lm::ngram::PROBING:
      return new CoarseLMModel<lm::ngram::ProbingModel>(file);
    case lm::ngram::REST_PROBING:
      return new CoarseLMModel<lm::ngram::RestProbingModel>(file);
    case lm::ngram::TRIE:
      return new CoarseLMModel<lm::ngram::TrieModel>(file);
    case lm::ngram::QUANT_TRIE:
      return new CoarseLMModel<lm::ngram::QuantTrieModel>(file);
    case lm::ngram::ARRAY_TRIE:
      return new CoarseLMModel<lm::ngram::ArrayTrieModel>(file);
    case lm::ngram::QUANT_ARRAY_TRIE:
      return new CoarseLMModel<lm::ngram::QuantArrayTrieModel>(file);
    default:
      UTIL_THROW2("Unrecognized kenlm model type " << model_type);
    }
  } else {
    // Not a KenLM binary file: assume text, load into a probing model.
    return new CoarseLMModel<lm::ngram::ProbingModel>(file);
  }
}
void OpSequenceModel::SetParameter(const std::string& key, const std::string& value) { if (key == "path") { m_lmPath = value; } else if (key == "support-features") { if(value == "no") numFeatures = 1; else numFeatures = 5; } else if (key == "input-factor") { sFactor = Scan<int>(value); } else if (key == "output-factor") { tFactor = Scan<int>(value); } else if (key == "load") { if (value == "lazy") { load_method = util::LAZY; } else if (value == "populate_or_lazy") { load_method = util::POPULATE_OR_LAZY; } else if (value == "populate_or_read" || value == "populate") { load_method = util::POPULATE_OR_READ; } else if (value == "read") { load_method = util::READ; } else if (value == "parallel_read") { load_method = util::PARALLEL_READ; } else { UTIL_THROW2("Unknown KenLM load method " << value); } } else { StatefulFeatureFunction::SetParameter(key, value); } }
// Factory for the operation-sequence-model LM: detect the KenLM binary
// format and instantiate the KenOSM wrapper templated on the matching model
// class, passing the requested load method through the KenLM config.
// Unrecognized (non-binary) files fall back to the probing model.
OSMLM* ConstructOSMLM(const char *file, util::LoadMethod load_method)
{
  lm::ngram::ModelType model_type;
  lm::ngram::Config config;
  config.load_method = load_method;
  if (lm::ngram::RecognizeBinary(file, model_type)) {
    switch(model_type) {
    case lm::ngram::PROBING:
      return new KenOSM<lm::ngram::ProbingModel>(file, config);
    case lm::ngram::REST_PROBING:
      return new KenOSM<lm::ngram::RestProbingModel>(file, config);
    case lm::ngram::TRIE:
      return new KenOSM<lm::ngram::TrieModel>(file, config);
    case lm::ngram::QUANT_TRIE:
      return new KenOSM<lm::ngram::QuantTrieModel>(file, config);
    case lm::ngram::ARRAY_TRIE:
      return new KenOSM<lm::ngram::ArrayTrieModel>(file, config);
    case lm::ngram::QUANT_ARRAY_TRIE:
      return new KenOSM<lm::ngram::QuantArrayTrieModel>(file, config);
    default:
      UTIL_THROW2("Unrecognized kenlm model type " << model_type);
    }
  } else {
    // Not a KenLM binary file: assume text, load into a probing model.
    return new KenOSM<lm::ngram::ProbingModel>(file, config);
  }
}
// Default implementation: hypergraph output of the search graph is only
// available in derived managers that override this method; the base class
// unconditionally throws.
void
BaseManager::
OutputSearchGraphAsHypergraph(std::ostream& out) const
{
  // This virtual function that may not be implemented everywhere, but it should for
  // derived classes that use it
  UTIL_THROW2("Not implemented.");
}
// Chart (hierarchical) decoding is not supported by the operation sequence
// model; this overload exists only to satisfy the StatefulFeatureFunction
// interface and always throws.
// @throws util::Exception unconditionally.
FFState* OpSequenceModel::EvaluateWhenApplied(
  const ChartHypothesis& /* cur_hypo */,
  int /* featureID - used to index the state in the previous hypotheses */,
  ScoreComponentCollection* accumulator) const
{
  // Fix: the original message read "Chart decoding not support by
  // UTIL_THROW2" — a copy-paste error naming the throw macro instead of the
  // feature, plus a grammar slip.
  UTIL_THROW2("Chart decoding not supported by OpSequenceModel");
}
// Load the global lexical model table from m_filePath.
// Expected line format: three space-separated columns
//   <output-word> <input-word> <score>
// where each word may carry multiple factors joined by the factor delimiter.
// Scores are stored in the nested hash m_hash[outputWord][inputWord].
void GlobalLexicalModel::Load()
{
  FactorCollection &factorCollection = FactorCollection::Instance();
  const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();

  VERBOSE(2, "Loading global lexical model from file " << m_filePath << endl);

  m_inputFactors = FactorMask(m_inputFactorsVec);
  m_outputFactors = FactorMask(m_outputFactorsVec);
  InputFileStream inFile(m_filePath);

  // reading in data one line at a time
  size_t lineNum = 0;
  string line;
  while(getline(inFile, line)) {
    ++lineNum;
    vector<string> token = Tokenize<string>(line, " ");

    if (token.size() != 3) { // format checking
      UTIL_THROW2("Syntax error at " << m_filePath << ":" << lineNum << ":" << line);
    }

    // create the output word (heap-allocated: the hash keys own it)
    Word *outWord = new Word();
    vector<string> factorString = Tokenize( token[0], factorDelimiter );
    for (size_t i=0 ; i < m_outputFactorsVec.size() ; i++) {
      const FactorDirection& direction = Output;
      const FactorType& factorType = m_outputFactorsVec[i];
      const Factor* factor = factorCollection.AddFactor( direction, factorType, factorString[i] );
      outWord->SetFactor( factorType, factor );
    }

    // create the input word
    Word *inWord = new Word();
    factorString = Tokenize( token[1], factorDelimiter );
    for (size_t i=0 ; i < m_inputFactorsVec.size() ; i++) {
      const FactorDirection& direction = Input;
      const FactorType& factorType = m_inputFactorsVec[i];
      const Factor* factor = factorCollection.AddFactor( direction, factorType, factorString[i] );
      inWord->SetFactor( factorType, factor );
    }

    // maximum entropy feature score
    float score = Scan<float>(token[2]);

    // std::cerr << "storing word " << *outWord << " " << *inWord << " " << score << endl;

    // store feature in hash
    DoubleHash::iterator keyOutWord = m_hash.find( outWord );
    if( keyOutWord == m_hash.end() ) {
      m_hash[outWord][inWord] = score;
    } else {
      // already have hash for outword, delete the word to avoid leaks
      (keyOutWord->second)[inWord] = score;
      delete outWord;
      // NOTE(review): if the same (outWord, inWord) pair occurs twice, the
      // second inWord allocation appears to leak — confirm duplicates cannot
      // occur in well-formed model files.
    }
  }
}
// Parse a "KENLM key=value ..." feature line: consume the factor, path and
// load-method arguments here, forward every other key=value pair untouched,
// then delegate to ConstructKenLM(startInd, line, file, factorType,
// load_method) to build the concrete feature.
FeatureFunction *ConstructKenLM(size_t startInd, const std::string &lineOrig)
{
  FactorType factorType = 0;
  string filePath;
  util::LoadMethod load_method = util::POPULATE_OR_READ;

  util::TokenIter<util::SingleCharacter, true> argument(lineOrig, ' ');
  ++argument; // KENLM

  // Rebuild the feature line minus the arguments consumed here.
  util::StringStream line;
  line << "KENLM";

  for (; argument; ++argument) {
    const char *equals = std::find(argument->data(),
                                   argument->data() + argument->size(), '=');
    UTIL_THROW_IF2(equals == argument->data() + argument->size(),
                   "Expected = in KenLM argument " << *argument);
    StringPiece name(argument->data(), equals - argument->data());
    StringPiece value(equals + 1,
                      argument->data() + argument->size() - equals - 1);

    if (name == "factor") {
      factorType = boost::lexical_cast<FactorType>(value);
    } else if (name == "order") {
      // Ignored
    } else if (name == "path") {
      filePath.assign(value.data(), value.size());
    } else if (name == "lazyken") {
      // deprecated: use load instead.
      load_method = boost::lexical_cast<bool>(value) ? util::LAZY : util::POPULATE_OR_READ;
    } else if (name == "load") {
      if (value == "lazy") {
        load_method = util::LAZY;
      } else if (value == "populate_or_lazy") {
        load_method = util::POPULATE_OR_LAZY;
      } else if (value == "populate_or_read" || value == "populate") {
        load_method = util::POPULATE_OR_READ;
      } else if (value == "read") {
        load_method = util::READ;
      } else if (value == "parallel_read") {
        load_method = util::PARALLEL_READ;
      } else {
        UTIL_THROW2("Unknown KenLM load method " << value);
      }
    } else {
      // pass to base class to interpret
      line << " " << name << "=" << value;
    }
  }

  return ConstructKenLM(startInd, line.str(), filePath, factorType, load_method);
}
// Initialize context parameters from the command-line parameters
// "context-string" and "context-window". The window specification may be:
//   "all"             : unlimited context in both directions
//   "<n>"             : look n words back AND n words ahead
//   "-<n>"            : look n words back only
//   "+<n>"            : look n words ahead only
//   "+-<n>" / "-+<n>" : look n words in both directions
// Always returns true (a window of "" is accepted and leaves both limits 0).
bool
ContextParameters::
init(Parameter const& params)
{
  look_back = look_ahead = 0;
  params.SetParameter(context_string, "context-string", std::string(""));
  std::string context_window;
  params.SetParameter(context_window, "context-window", std::string(""));
  if (context_window == "")
    return true;

  // "all" (or any string starting with it) means unlimited context.
  if (context_window.substr(0,3) == "all") {
    look_back = look_ahead = std::numeric_limits<size_t>::max();
    return true;
  }

  // Position of the first digit decides how the sign prefix is interpreted.
  size_t p = context_window.find_first_of("0123456789");
  if (p == 0)
    look_back = look_ahead = atoi(context_window.c_str());
  if (p == 1) {
    if (context_window[0] == '-')
      look_back = atoi(context_window.substr(1).c_str());
    else if (context_window[0] == '+')
      look_ahead = atoi(context_window.substr(1).c_str());
    else
      UTIL_THROW2("Invalid specification of context window.");
  }
  if (p == 2) {
    if (context_window.substr(0,2) == "+-" ||
        context_window.substr(0,2) == "-+")
      look_back = look_ahead = atoi(context_window.substr(p).c_str());
    else
      UTIL_THROW2("Invalid specification of context window.");
  }
  return true;
}
// Handle BleuScoreFeature-specific parameters. "references" is a
// comma-separated list of reference translation files; every file must have
// the same number of lines (one reference per input sentence). All other
// keys are delegated to the base class.
void BleuScoreFeature::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "references") {
    vector<string> referenceFiles = Tokenize(value, ",");
    UTIL_THROW_IF2(referenceFiles.size() == 0, "No reference file");
    vector<vector<string> > references(referenceFiles.size());

    for (size_t i =0; i < referenceFiles.size(); ++i) {
      ifstream in(referenceFiles[i].c_str());
      if (!in) {
        UTIL_THROW2("Unable to load references from " << referenceFiles[i]);
      }
      string line;
      while (getline(in,line)) {
        /*  if (GetSearchAlgorithm() == ChartDecoding) {
        stringstream tmp;
        tmp << "<s> " << line << " </s>";
        line = tmp.str();
        }
        */
        references[i].push_back(line);
      }
      // All reference files must align line-for-line with the first one.
      if (i > 0) {
        if (references[i].size() != references[i-1].size()) {
          UTIL_THROW2("Reference files are of different lengths");
        }
      }
      in.close();
    } // for (size_t i =0; i < referenceFiles.size(); ++i) {

    //Set the references in the bleu feature
    LoadReferences(references);
  } else {
    StatefulFeatureFunction::SetParameter(key, value);
  }
}
// Factory: instantiate the search strategy requested by searchAlgorithm.
// Throws on algorithms this factory does not know about.
Search *Search::CreateSearch(Manager& manager, const InputType &source,
                             SearchAlgorithm searchAlgorithm,
                             const TranslationOptionCollection &transOptColl)
{
  if (searchAlgorithm == Normal) {
    return new SearchNormal(manager,source, transOptColl);
  }
  if (searchAlgorithm == CubePruning) {
    return new SearchCubePruning(manager, source, transOptColl);
  }
  if (searchAlgorithm == NormalBatch) {
    return new SearchNormalBatch(manager, source, transOptColl);
  }
  UTIL_THROW2("ERROR: search. Aborting\n");
  return NULL;
}
void FeatureFunction::SetParameter(const std::string& key, const std::string& value) { if (key == "tuneable") { m_tuneable = Scan<bool>(value); } else if (key == "tuneable-components") { UTIL_THROW_IF2(!m_tuneable, GetScoreProducerDescription() << ": tuneable-components cannot be set if tuneable=false"); SetTuneableComponents(value); } else if (key == "require-sorting-after-source-context") { m_requireSortingAfterSourceContext = Scan<bool>(value); } else if (key == "verbosity") { m_verbosity = Scan<size_t>(value); } else if (key == "filterable") { //ignore } else { UTIL_THROW2(GetScoreProducerDescription() << ": Unknown argument " << key << "=" << value); } }
// Generate the target-side yield of the derivation d. Phrase KBestExtractor::GetOutputPhrase(const Derivation &d) { FactorType placeholderFactor = StaticData::Instance().options()->input.placeholder_factor; Phrase ret(ARRAY_SIZE_INCR); const TargetPhrase &phrase = *(d.edge->shyperedge.label.translation); const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = phrase.GetAlignNonTerm().GetNonTermIndexMap(); for (std::size_t pos = 0; pos < phrase.GetSize(); ++pos) { const Word &word = phrase.GetWord(pos); if (word.IsNonTerminal()) { std::size_t nonTermInd = nonTermIndexMap[pos]; const Derivation &subderivation = *d.subderivations[nonTermInd]; Phrase subPhrase = GetOutputPhrase(subderivation); ret.Append(subPhrase); } else { ret.AddWord(word); if (placeholderFactor == NOT_FOUND) { continue; } // FIXME UTIL_THROW2("placeholders are not currently supported by the S2T decoder"); /* std::set<std::size_t> sourcePosSet = phrase.GetAlignTerm().GetAlignmentsForTarget(pos); if (sourcePosSet.size() == 1) { const std::vector<const Word*> *ruleSourceFromInputPath = hypo.GetTranslationOption().GetSourceRuleFromInputPath(); UTIL_THROW_IF2(ruleSourceFromInputPath == NULL, "Source Words in of the rules hasn't been filled out"); std::size_t sourcePos = *sourcePosSet.begin(); const Word *sourceWord = ruleSourceFromInputPath->at(sourcePos); UTIL_THROW_IF2(sourceWord == NULL, "Null source word at position " << sourcePos); const Factor *factor = sourceWord->GetFactor(placeholderFactor); if (factor) { ret.Back()[0] = factor; } } */ } } return ret; }
// Construct a multi-model phrase table from its feature line.
// In "interpolate" mode the number of weights must equal either the number
// of component tables (one weight shared across all score components) or
// tables x score-components (one weight per table per component). The old
// "all"/"all-restrict" modes have been replaced by PhraseDictionaryGroup.
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
  :PhraseDictionary(line, true)
{
  ReadParameters();

  if (m_mode == "interpolate") {
    size_t numWeights = m_numScoreComponents;
    UTIL_THROW_IF2(m_pdStr.size() != m_multimodelweights.size() &&
                   m_pdStr.size()*numWeights != m_multimodelweights.size(),
                   "Number of scores and weights are not equal");
  } else if (m_mode == "all" || m_mode == "all-restrict") {
    UTIL_THROW2("Implementation has moved: use PhraseDictionaryGroup with restrict=true/false");
  } else {
    ostringstream msg;
    msg << "combination mode unknown: " << m_mode;
    throw runtime_error(msg.str());
  }
}
// Evaluate the tree-structure feature when a chart hypothesis is applied:
// read the "Tree" phrase property, optionally attach non-terminal labels,
// collect the subtrees of previous hypotheses (in target order), score any
// sparse syntactic-constraint features, and combine everything into the
// tree state returned for this hypothesis.
// Throws if the target phrase carries no "Tree" property.
FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
    , int featureID /* used to index the state in the previous hypotheses */
    , ScoreComponentCollection* accumulator) const
{
  if (const PhraseProperty *property = cur_hypo.GetCurrTargetPhrase().GetProperty("Tree")) {
    const std::string *tree = property->GetValueString();
    TreePointer mytree (new InternalTree(*tree));

    if (m_labelset) {
      AddNTLabels(mytree);
    }

    //get subtrees (in target order)
    std::vector<TreePointer> previous_trees;
    for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
      const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos);
      if (word.IsNonTerminal()) {
        size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
        const ChartHypothesis *prevHypo = cur_hypo.GetPrevHypo(nonTermInd);
        const TreeState* prev = dynamic_cast<const TreeState*>(prevHypo->GetFFState(featureID));
        const TreePointer prev_tree = prev->GetTree();
        previous_trees.push_back(prev_tree);
      }
    }

    std::vector<std::string> sparse_features;
    if (m_constraints) {
      sparse_features = m_constraints->SyntacticRules(mytree, previous_trees);
    }
    mytree->Combine(previous_trees);

    //sparse scores
    for (std::vector<std::string>::const_iterator feature=sparse_features.begin();
         feature != sparse_features.end(); ++feature) {
      accumulator->PlusEquals(this, *feature, 1);
    }

    return new TreeState(mytree);
  } else {
    UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
  }
}
void TargetConstituentBoundariesRightAdjacentPhraseProperty::ProcessValue(const std::string &value) { FactorCollection &factorCollection = FactorCollection::Instance(); std::vector<std::string> tokens; Tokenize(tokens, value, " "); std::vector<std::string>::const_iterator tokenIter = tokens.begin(); while (tokenIter != tokens.end()) { try { std::vector<std::string> constituents; Tokenize(constituents, *tokenIter, "<"); ++tokenIter; float count = std::atof( tokenIter->c_str() ); ++tokenIter; std::set<const Factor* > dedup; for ( std::vector<std::string>::iterator constituentIter = constituents.begin(); constituentIter != constituents.end(); ++constituentIter ) { const Factor* constituentFactor = factorCollection.AddFactor(*constituentIter,false); std::pair< std::set<const Factor* >::iterator, bool > dedupIns = dedup.insert(constituentFactor); if ( dedupIns.second ) { std::pair< TargetConstituentBoundariesRightAdjacentCollection::iterator, bool > inserted = m_constituentsCollection.insert(std::make_pair(constituentFactor,count)); if ( !inserted.second ) { (inserted.first)->second += count; } } } } catch (const std::exception &e) { UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: Read error. Flawed property? " << value); } } };
std::string LexicalReorderingTableCreator::EncodeLine(std::vector<std::string>& tokens) { std::string scoresString = tokens.back(); std::stringstream scoresStream; std::vector<float> scores; Tokenize<float>(scores, scoresString); if(!m_numScoreComponent) { m_numScoreComponent = scores.size(); m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1); for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin(); it != m_scoreCounters.end(); it++) *it = new ScoreCounter(); m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1); } if(m_numScoreComponent != scores.size()) { std::stringstream strme; strme << "Error: Wrong number of scores detected (" << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl; strme << "Line: " << tokens[0] << " ||| ... ||| " << scoresString << std::endl; UTIL_THROW2(strme.str()); } size_t c = 0; float score; while(c < m_numScoreComponent) { score = scores[c]; score = FloorScore(TransformScore(score)); scoresStream.write((char*)&score, sizeof(score)); m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score); c++; } return scoresStream.str(); }
// Evaluate the tree-structure feature when a chart hypothesis is applied:
// read the "Tree" phrase property, collect the subtrees of previous
// hypotheses (in target order), apply any syntactic-constraint scoring,
// combine into a full tree, and unbinarize once the tree spans a complete
// sentence. Returns the new tree state for this hypothesis.
// Throws if the target phrase carries no "Tree" property.
FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
    , int featureID /* used to index the state in the previous hypotheses */
    , ScoreComponentCollection* accumulator) const
{
  if (const PhraseProperty *property = cur_hypo.GetCurrTargetPhrase().GetProperty("Tree")) {
    const std::string *tree = property->GetValueString();
    TreePointer mytree (boost::make_shared<InternalTree>(*tree));

    //get subtrees (in target order)
    std::vector<TreePointer> previous_trees;
    for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
      const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos);
      if (word.IsNonTerminal()) {
        size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
        const ChartHypothesis *prevHypo = cur_hypo.GetPrevHypo(nonTermInd);
        const TreeState* prev = dynamic_cast<const TreeState*>(prevHypo->GetFFState(featureID));
        const TreePointer prev_tree = prev->GetTree();
        previous_trees.push_back(prev_tree);
      }
    }

    if (m_constraints) {
      m_constraints->SyntacticRules(mytree, previous_trees, this, accumulator);
    }
    mytree->Combine(previous_trees);

    // A sentence is complete when the last child is the end-of-sentence
    // symbol, possibly wrapped in its non-terminal.
    bool full_sentence =
      (mytree->GetChildren().back()->GetLabel() == m_send ||
       (mytree->GetChildren().back()->GetLabel() == m_send_nt &&
        mytree->GetChildren().back()->GetChildren().back()->GetLabel() == m_send));
    if (m_binarized && full_sentence) {
      mytree->Unbinarize();
    }

    return new TreeState(mytree);
  } else {
    UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
  }
}
// Load an STSG rule table (pipe-separated fields: source ||| target |||
// scores ||| alignment ||| counts ||| sparse ||| properties) into the
// HyperTree trie, collecting all source terminal ids into sourceTermSet.
// Scores are log-transformed and floored; each target phrase is evaluated
// in isolation against a dummy source phrase. Returns true on success.
bool HyperTreeLoader::Load(AllOptions const& opts,
                           const std::vector<FactorType> &input,
                           const std::vector<FactorType> &output,
                           const std::string &inFile,
                           const RuleTableFF &ff,
                           HyperTree &trie,
                           boost::unordered_set<std::size_t> &sourceTermSet)
{
  PrintUserTime(std::string("Start loading HyperTree"));

  sourceTermSet.clear();
  std::size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  std::vector<float> scoreVector;
  StringPiece line;

  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  HyperPathLoader hyperPathLoader;

  // Dummy source phrase used only so EvaluateInIsolation has something to
  // evaluate against; its content is irrelevant.
  Phrase dummySourcePhrase;
  {
    Word *lhs = NULL;
    dummySourcePhrase.CreateFromString(Input, input, "hello", &lhs);
    delete lhs;
  }

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourceString(*pipes);
    StringPiece targetString(*++pipes);
    StringPiece scoreString(*++pipes);

    StringPiece alignString;
    if (++pipes) {
      StringPiece temp(*pipes);
      alignString = temp;
    }

    ++pipes;  // counts

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      int processed;
      float score = converter.StringToFloat(s->data(), s->length(), &processed);
      UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
      scoreVector.push_back(FloorScore(TransformScore(score)));
    }
    const std::size_t numScoreComponents = ff.GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                  << numScoreComponents << ") of score components on line " << count);
    }

    // Source-side
    HyperPath sourceFragment;
    hyperPathLoader.Load(sourceString, sourceFragment);
    ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet);

    // Target-side
    TargetPhrase *targetPhrase = new TargetPhrase(&ff);
    Word *targetLHS = NULL;
    targetPhrase->CreateFromString(Output, output, targetString, &targetLHS);
    targetPhrase->SetTargetLHS(targetLHS);
    targetPhrase->SetAlignmentInfo(alignString);

    // Optional trailing fields: sparse scores, then properties.
    if (++pipes) {
      StringPiece sparseString(*pipes);
      targetPhrase->SetSparseScore(&ff, sparseString);
    }

    if (++pipes) {
      StringPiece propertiesString(*pipes);
      targetPhrase->SetProperties(propertiesString);
    }

    targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
    targetPhrase->EvaluateInIsolation(dummySourcePhrase, ff.GetFeaturesToApply());

    // Add rule to trie.
    TargetPhraseCollection::shared_ptr phraseColl = GetOrCreateTargetPhraseCollection(trie, sourceFragment);
    phraseColl->Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  if (ff.GetTableLimit()) {
    SortAndPrune(trie, ff.GetTableLimit());
  }

  return true;
}
// SCFG (chart) hypothesis evaluation is not implemented for this language
// model; the stub exists only to satisfy the interface and always throws.
void LanguageModel::EvaluateWhenApplied(const SCFG::Manager &mgr,
                                        const SCFG::Hypothesis &hypo,
                                        int featureID,
                                        Scores &scores,
                                        FFState &state) const
{
  UTIL_THROW2("Not implemented");
}
void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence) { #if defined __MINGW32__ char dirName[] = "moses.XXXXXX"; #else char dirName[] = "/tmp/moses.XXXXXX"; #endif // defined char *temp = mkdtemp(dirName); UTIL_THROW_IF2(temp == NULL, "Couldn't create temporary directory " << dirName); string dirNameStr(dirName); string inFileName(dirNameStr + "/in"); ofstream inFile(inFileName.c_str()); for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) { inFile << inputSentence.GetWord(i); } inFile << endl; inFile.close(); long translationId = inputSentence.GetTranslationId(); string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr); // populate with rules for this sentence PhraseDictionaryNodeMemory &rootNode = m_collection[translationId]; FormatType format = MosesFormat; // data from file InputFileStream inStream(ptFileName); // copied from class LoaderStandard PrintUserTime("Start loading fuzzy-match phrase model"); const StaticData &staticData = StaticData::Instance(); const std::string& factorDelimiter = staticData.GetFactorDelimiter(); string lineOrig; size_t count = 0; while(getline(inStream, lineOrig)) { const string *line; if (format == HieroFormat) { // reformat line UTIL_THROW(util::Exception, "Cannot be Hiero format"); //line = ReformatHieroRule(lineOrig); } else { // do nothing to format of line line = &lineOrig; } vector<string> tokens; vector<float> scoreVector; TokenizeMultiCharSeparator(tokens, *line , "|||" ); if (tokens.size() != 4 && tokens.size() != 5) { UTIL_THROW2("Syntax error at " << ptFileName << ":" << count); } const string &sourcePhraseString = tokens[0] , &targetPhraseString = tokens[1] , &scoreString = tokens[2] , &alignString = tokens[3]; bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n"); continue; } 
Tokenize<float>(scoreVector, scoreString); const size_t numScoreComponents = GetNumScoreComponents(); if (scoreVector.size() != numScoreComponents) { UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << count); } UTIL_THROW_IF2(scoreVector.size() != numScoreComponents, "Number of scores incorrectly specified"); // parse source & find pt node // constituent labels Word *sourceLHS; Word *targetLHS; // source Phrase sourcePhrase( 0); // sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, factorDelimiter, &sourceLHS); sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS); // create target phrase obj TargetPhrase *targetPhrase = new TargetPhrase(this); // targetPhrase->CreateFromString(Output, m_output, targetPhraseString, factorDelimiter, &targetLHS); targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS); // rest of target phrase targetPhrase->SetAlignmentInfo(alignString); targetPhrase->SetTargetLHS(targetLHS); //targetPhrase->SetDebugOutput(string("New Format pt ") + line); // component score, for n-best output std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore); std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore); targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); targetPhrase->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply()); TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS); phraseColl.Add(targetPhrase); count++; if (format == HieroFormat) { // reformat line delete line; } else { // do nothing } } // sort and prune each target phrase collection SortAndPrune(rootNode); //removedirectoryrecursively(dirName); }
int main(int argc, char* argv[]) { std::cerr << "Consolidate v2.0 written by Philipp Koehn" << std::endl << "consolidating direct and indirect rule tables" << std::endl; if (argc < 4) { std::cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] [--PartsOfSpeech parts-of-speech-file] [--MinScore id:threshold[,id:threshold]*]" << std::endl; exit(1); } const std::string fileNameDirect = argv[1]; const std::string fileNameIndirect = argv[2]; const std::string fileNameConsolidated = argv[3]; std::string fileNameCountOfCounts; std::string fileNameSourceLabelSet; std::string fileNamePartsOfSpeechVocabulary; for(int i=4; i<argc; i++) { if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; std::cerr << "processing hierarchical rules" << std::endl; } else if (strcmp(argv[i],"--OnlyDirect") == 0) { onlyDirectFlag = true; std::cerr << "only including direct translation scores p(e|f)" << std::endl; } else if (strcmp(argv[i],"--PhraseCount") == 0) { phraseCountFlag = true; std::cerr << "including the phrase count feature" << std::endl; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; UTIL_THROW_IF2(i+1==argc, "specify count of count files for Good Turing discounting!"); fileNameCountOfCounts = argv[++i]; std::cerr << "adjusting phrase translation probabilities with Good Turing discounting" << std::endl; } else if (strcmp(argv[i],"--KneserNey") == 0) { kneserNeyFlag = true; UTIL_THROW_IF2(i+1==argc, "specify count of count files for Kneser Ney discounting!"); fileNameCountOfCounts = argv[++i]; std::cerr << "adjusting phrase translation probabilities with Kneser Ney discounting" << std::endl; } else if (strcmp(argv[i],"--LowCountFeature") == 0) { lowCountFlag = true; std::cerr << "including the low count feature" << 
std::endl; } else if (strcmp(argv[i],"--CountBinFeature") == 0 || strcmp(argv[i],"--SparseCountBinFeature") == 0) { if (strcmp(argv[i],"--SparseCountBinFeature") == 0) sparseCountBinFeatureFlag = true; std::cerr << "include "<< (sparseCountBinFeatureFlag ? "sparse " : "") << "count bin feature:"; int prev = 0; while(i+1<argc && argv[i+1][0]>='0' && argv[i+1][0]<='9') { int binCount = Moses::Scan<int>(argv[++i]); countBin.push_back( binCount ); if (prev+1 == binCount) { std::cerr << " " << binCount; } else { std::cerr << " " << (prev+1) << "-" << binCount; } prev = binCount; } std::cerr << " " << (prev+1) << "+" << std::endl; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; std::cerr << "using log-probabilities" << std::endl; } else if (strcmp(argv[i],"--Counts") == 0) { countsProperty = true; std::cerr << "output counts as a property" << std::endl;; } else if (strcmp(argv[i],"--SourceLabels") == 0) { sourceLabelsFlag = true; UTIL_THROW_IF2(i+1==argc, "specify source label set file!"); fileNameSourceLabelSet = argv[++i]; std::cerr << "processing source labels property" << std::endl; } else if (strcmp(argv[i],"--PartsOfSpeech") == 0) { partsOfSpeechFlag = true; UTIL_THROW_IF2(i+1==argc, "specify parts-of-speech file!"); fileNamePartsOfSpeechVocabulary = argv[++i]; std::cerr << "processing parts-of-speech property" << std::endl; } else if (strcmp(argv[i],"--MinScore") == 0) { std::string setting = argv[++i]; bool done = false; while (!done) { std::string single_setting; size_t pos; if ((pos = setting.find(",")) != std::string::npos) { single_setting = setting.substr(0, pos); setting.erase(0, pos + 1); } else { single_setting = setting; done = true; } pos = single_setting.find(":"); UTIL_THROW_IF2(pos == std::string::npos, "faulty MinScore setting '" << single_setting << "' in '" << argv[i] << "'"); unsigned int field = Moses::Scan<unsigned int>( single_setting.substr(0,pos) ); float threshold = Moses::Scan<float>( single_setting.substr(pos+1) ); if 
(field == 0) { minScore0 = threshold; std::cerr << "setting minScore0 to " << threshold << std::endl; } else if (field == 2) { minScore2 = threshold; std::cerr << "setting minScore2 to " << threshold << std::endl; } else { UTIL_THROW2("MinScore currently only supported for indirect (0) and direct (2) phrase translation probabilities"); } } } else { UTIL_THROW2("unknown option " << argv[i]); } } processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary ); }
// State equality for hypothesis recombination — not yet implemented for
// coverage states; always throws.
bool CoveredReferenceState::operator==(const FFState& other) const
{
  UTIL_THROW2("TODO:Haven't figure this out yet");
}
// State hash for hypothesis recombination — not yet implemented for
// coverage states; always throws.
size_t CoveredReferenceState::hash() const
{
  UTIL_THROW2("TODO:Haven't figure this out yet");
}
// Load a Moses-format SCFG rule table (pipe-separated fields: source |||
// target ||| scores ||| alignment [||| counts ||| sparse ||| properties])
// into the rule trie. Scores are log-transformed and floored; rules with an
// empty source side are skipped unless word deletion is enabled. Returns
// true on success.
bool RuleTrieLoader::Load(const std::vector<FactorType> &input,
                          const std::vector<FactorType> &output,
                          const std::string &inFile,
                          const RuleTableFF &ff,
                          RuleTrie &trie)
{
  PrintUserTime(std::string("Start loading text phrase table. Moses format"));

  const StaticData &staticData = StaticData::Instance();
  // const std::string &factorDelimiter = staticData.GetFactorDelimiter();

  std::size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  std::vector<float> scoreVector;
  StringPiece line;

  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourcePhraseString(*pipes);
    StringPiece targetPhraseString(*++pipes);
    StringPiece scoreString(*++pipes);

    StringPiece alignString;
    if (++pipes) {
      StringPiece temp(*pipes);
      alignString = temp;
    }

    if (++pipes) {
      StringPiece str(*pipes); //counts
    }

    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      int processed;
      float score = converter.StringToFloat(s->data(), s->length(), &processed);
      UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
      scoreVector.push_back(FloorScore(TransformScore(score)));
    }
    const std::size_t numScoreComponents = ff.GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                  << numScoreComponents << ") of score components on line " << count);
    }

    // parse source & find pt node

    // constituent labels
    Word *sourceLHS = NULL;
    Word *targetLHS;

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase(&ff);
    // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);
    targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
    // source
    Phrase sourcePhrase;
    // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);
    sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // Optional trailing fields: sparse scores, then properties.
    if (++pipes) {
      StringPiece sparseString(*pipes);
      targetPhrase->SetSparseScore(&ff, sparseString);
    }

    if (++pipes) {
      StringPiece propertiesString(*pipes);
      targetPhrase->SetProperties(propertiesString);
    }

    targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
    targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply());

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(
                                           trie, *sourceLHS, sourcePhrase);
    phraseColl.Add(targetPhrase);

    // not implemented correctly in memory pt. just delete it for now
    delete sourceLHS;

    count++;
  }

  // sort and prune each target phrase collection
  if (ff.GetTableLimit()) {
    SortAndPrune(trie, ff.GetTableLimit());
  }

  return true;
}