/**
 * \brief Method inherited from ucam::util::TaskInterface. Applies every language model
 * registered under lmkey_ to the lattice found under latticeloadkey_ and publishes the
 * resulting lattice under latticestorekey_.
 *
 * The input lattice is copied into the member mylmfst_, optionally stripped of its
 * previous LM scores (deletelmscores_), and then rewritten once per language model.
 * The pointer stored in d.fsts[latticestorekey_] refers to that member, so it is only
 * meaningful while this task object is alive and until the next call to run().
 *
 * \param d Data object. d.klm, d.fsts[latticeloadkey_] and d.stats are read;
 *          d.fsts[latticestorekey_] is written.
 * \returns true as soon as a precondition fails (no language models, lmkey_ or
 *          latticeloadkey_ missing, or a model pointer is NULL); false once all
 *          models have been applied (false does not break the chain of tasks).
 */
bool run ( Data& d ) {
  mylmfst_.DeleteStates();
  if ( !USER_CHECK ( d.klm.size() , "No language models available" ) ) return true;
  if ( !USER_CHECK ( d.klm.find ( lmkey_ ) != d.klm.end() , "No language models available (key not initialized) " ) ) return true;
  if ( !USER_CHECK ( d.fsts.find ( latticeloadkey_ ) != d.fsts.end() , " Input fst not available!" ) ) return true;
  // Work on a private copy so the lattice stored under latticeloadkey_ is left untouched.
  mylmfst_ = * (static_cast<fst::VectorFst<Arc> * > ( d.fsts[latticeloadkey_] ) );
  if (deletelmscores_) {
    LINFO ( "Delete old LM scores first" );
    //Deletes LM scores if using lexstdarc. Note -- will copy through on stdarc!
    fst::MakeWeight2<Arc> mwcopy;
    fst::Map<Arc> ( &mylmfst_, fst::GenericWeightAutoMapper<Arc, fst::MakeWeight2<Arc> > ( mwcopy ) );
  }
  LINFO ( "Input lattice loaded with key=" << latticeloadkey_ << ", NS=" << mylmfst_.NumStates() );
  typedef fst::ApplyLanguageModelOnTheFly<Arc, fst::MakeWeight<Arc>, KenLMModelT> LmApplierT;
  // mw is shared by all models and advanced with update() after each one.
  fst::MakeWeight<Arc> mw;
  for ( uint k = 0; k < d.klm[lmkey_].size(); ++k ) {
    if ( !USER_CHECK ( d.klm[lmkey_][k]->model != NULL, "Language model " << k << " not available!" ) ) return true;
    KenLMModelT& model = *d.klm[lmkey_][k]->model;
#ifndef USE_GOOGLE_SPARSE_HASH
    unordered_set<Label> epsilons;
#else
    google::dense_hash_set<Label> epsilons;
    epsilons.set_empty_key ( numeric_limits<Label>::max() );
#endif
    ///We want the language model to ignore these guys:
    epsilons.insert ( DR );
    epsilons.insert ( OOV );
    epsilons.insert ( EPSILON );
    epsilons.insert ( SEP );
    LINFO ( "Applying language model " << k << " with lmkey=" << lmkey_ << ", using lmscale=" << d.klm[lmkey_][k]->lmscale );
    LDEBUG ( "lattice NS=" << mylmfst_.NumStates() );
    {
      // The applier lives in this scope as an automatic object, so it is released
      // even if applying the model throws. The scope closes before setTimeEnd so
      // that its destruction stays inside the timed section.
      LmApplierT f ( mylmfst_
                     , model
                     , epsilons
                     , natlog_
                     , d.klm[lmkey_][k]->lmscale
                     , d.klm[lmkey_][k]->lmwp
                     , d.klm[lmkey_][k]->idb);
      f.setMakeWeight ( mw );
      d.stats->setTimeStart ( "on-the-fly-composition " + ucam::util::toString ( k ) );
      // Copy the result into the member before the applier goes away.
      mylmfst_ = * ( f () );
    }
    d.stats->setTimeEnd ( "on-the-fly-composition " + ucam::util::toString ( k ) );
    LDEBUG ( mylmfst_.NumStates() );
    mw.update();
  }
  d.fsts[latticestorekey_] = &mylmfst_;
  LINFO ( "Done!" );
  return false;
};
///Constructor with registry object and several keys to access data object and registry HiFSTTask ( const ucam::util::RegistryPO& rg , const std::string& outputkey = HifstConstants::kHifstLatticeStore, const std::string& locallmkey = HifstConstants::kHifstLocalpruneLmLoad, const std::string& fullreferencelatticekey = HifstConstants::kReferencefilterNosubstringStore , const std::string& lmkey = HifstConstants::kLmLoad ) : optimize_ (rg.getBool (HifstConstants::kHifstOptimizecells) ), numlocallm_ (rg.getVectorString (locallmkey).size() ), warned_ (false), rtnfiles_ (rg.get<std::string> (HifstConstants::kHifstWritertn) ), fullreferencelatticekey_ ( fullreferencelatticekey ), lmkey_ ( lmkey ), locallmkey_ ( locallmkey ), outputkey_ ( outputkey ), piscount_ ( 0 ), aligner_ ( rg.getBool ( HifstConstants::kHifstAlilatsmode ) ), // cellredm_ ( rg.getBool ( "hifst.cellredm" ) ), // finalredm_ ( rg.getBool ( "hifst.finalredm" ) ), hipdtmode_ (rg.getBool (HifstConstants::kHifstUsepdt) ), rtnopt_ (rg.getBool (HifstConstants::kHifstRtnopt) ), replacefstbyarc_ ( rg.getSetString ( HifstConstants::kHifstReplacefstbyarcNonterminals ) ), replacefstbyarcexceptions_ ( rg.getSetString ( HifstConstants::kHifstReplacefstbyarcExceptions ) ), replacefstbynumstates_ ( rg.get<unsigned> ( HifstConstants::kHifstReplacefstbyarcNumstates ) ), localprune_ ( rg.getBool ( HifstConstants::kHifstLocalpruneEnable ) ), pruneweight_ ( rg.get<float> ( HifstConstants::kHifstPrune ) ), numstatesthreshold_ ( rg.get<unsigned> ( HifstConstants::kHifstLocalpruneNumstates ) ), lpctuples_ ( rg.getVectorString ( HifstConstants::kHifstLocalpruneConditions ) ) { LINFO ("Number of local language models=" << numlocallm_); LINFO ("aligner mode=" << aligner_); LINFO ("localprune mode=" << localprune_); USER_CHECK ( ! ( lpc_.size() % 4 ), "local pruning conditions are defined by tuples of 4 elements: category,x,y,Number-of-states. Category is a string and x,y are int. 
Number of states is unsigned" ); USER_CHECK ( (localprune_ && numlocallm_) || ( localprune_ && !numlocallm_ && aligner_ ) || (!localprune_) , "If you want to do cell pruning in translation, you should use a language model for local pruning. Check --hifst.localprune.lm.load and --hifst.localprune.enable.\n"); optimize.setAlignMode (aligner_); if (hipdtmode_) { LINFO ("Hipdt mode enabled!"); } if (!rtnopt_) { LINFO ("RTN openfst optimizations will not be applied"); } LDEBUG ( "Hifst constructor done!" ); };
void buildNextElementFromPattern ( std::vector<std::string>& spattern, std::vector<std::string>& ss, std::vector< std::vector<std::string> >& pinstances, unsigned ps, unsigned pp, unsigned gaphistory = 0 ) { LDEBUG ( "startingword:" << ss[ps] << ",thisword:" << ss[ps + pp + gaphistory ] << ",thiselement:" << spattern[pp] << ",ps=" << ps << ",pp=" << pp << ",spatternsize=" << spattern.size() << ",gaphistory=" << gaphistory ); if ( spattern[pp] == "w" ) { pinstances[pinstances.size() - 1].push_back ( ucam::util::toString ( ss[ps + pp + gaphistory] ) ); if ( ( pp + 1 < spattern.size() ) && ( ps + spattern.size() + gaphistory <= ss.size() ) ) buildNextElementFromPattern ( spattern, ss, pinstances, ps, pp + 1, gaphistory ); } else if ( spattern[pp] == "X" ) { LDEBUG ( "X,with gapmaxspan=" << gapmaxspan_ ); pinstances[pinstances.size() - 1].push_back ( "X" ); std::vector<std::string> replicate = pinstances[pinstances.size() - 1]; for ( unsigned k = 1; ( k <= gapmaxspan_ ) && ( pp + 1 < spattern.size() ) && ( ps + spattern.size() - 1 + gaphistory + k - 1 < ss.size() ) && ( spattern.size() + gaphistory + k - 1 <= maxspan_ ); ++k ) { LDEBUG ( "GAPSPAN=" << k ); if ( k > 1 ) pinstances.push_back ( replicate ); //clone previous one and run recursively. buildNextElementFromPattern ( spattern, ss, pinstances, ps, pp + 1, gaphistory + k - 1 ); } } else { ///Bad news if you get here... Expliciting failed condition to provide enough user information... USER_CHECK ( spattern[pp] == "X" || spattern[pp] == "w", "Incorrect pattern!" ); } };
/** * \brief Opens a [file] (pipe, or a text/compressed file). * using boost: * using fdstream: All three cases are handled with zcat -f,, which is piped (i.e. handled by another processor). * \param filename File name to be opened. */ inline void open ( const std::string& filename ) { close(); #ifdef USE_FDSTREAM_HPP if ( filename != "-" ) { LDEBUG ( "Test file=[" << filename << "]" ); sfile_ = fopen ( filename.c_str(), "r" ); USER_CHECK ( sfile_ != NULL, "Error while opening file" ); fclose ( sfile_ ); //Now we can assume more or less safely that file really exists (sigh) } //lets open this with the pipe. std::string command = "zcat -f "; command += filename; LINFO ( "Opening (fd)" << command ); sfile_ = popen ( command.c_str(), "r" ); USER_CHECK ( sfile_ != NULL, "Error while opening pipe" ); filestream_ = new boost::fdistream ( fileno ( sfile_ ) ); USER_CHECK (filestream_, "File Stream allocation failed!"); #else LINFO ( "Opening " << filename ); std::string auxfilename = filename; if (auxfilename == "-" ) auxfilename = "/dev/stdin"; file.reset (new std::ifstream (auxfilename.c_str(), std::ios_base::in | std::ios_base::binary) ); if (!USER_CHECK (file->is_open(), "Error while opening file:") ) exit (EXIT_FAILURE); in.reset (new boost::iostreams::filtering_streambuf<boost::iostreams::input>); if (auxfilename.substr (0, 5) != "/dev/" ) { if (ends_with (auxfilename, ".gz") ) in->push (boost::iostreams::gzip_decompressor() ); } in->push (*file); filestream_ = new std::istream (&*in); if (!USER_CHECK (filestream_, "File Stream allocation failed!") ) exit (EXIT_FAILURE); //Try to detect whether it actually _is_ a compressed file (sigh) . if (ends_with (auxfilename, ".gz") ) { filestream_->peek(); if (!USER_CHECK (filestream_->good(), "File not open/doesn't exist... Or possibly not compressed but ends with .gz? " ) ) exit (EXIT_FAILURE); } #endif };
/**
 * \brief Runs the lattice building procedure.
 *
 * Outline, as implemented below:
 *  1. Reset the per-sentence members (result lattice, index bookkeeping, one-time
 *     warning flag, pdt parentheses).
 *  2. If no cyk data is available, report it and return true. If the parse failed,
 *     publish the (emptied) result lattice and return false.
 *  3. Build the RTN for category "S" over the whole sentence with buildRTN and
 *     optimize its head lattice.
 *  4. Optionally optimize the RTN (rtnopt_), write it out (writeRTN), and expand it
 *     with Replace into either an FSA or, in hipdt mode, a PDT plus parentheses.
 *  5. Remove epsilons, apply filters, optionally compose with the full reference
 *     lattice, apply the language model, and prune/expand with pruneweight_.
 *  6. Release the helper objects and publish the result in d.
 *
 * \param d Contains the data structure with all the necessary elements (i.e. cykdata) and in which will be store a pointer to the
 * output lattice. The pointer stored under outputkey_ refers to the member
 * cykfstresult_; in hipdt mode a pointer to pdtparens_ may also be stored under
 * outputkey_ + ".parens".
 * \returns true only when d.cykdata is not set; false otherwise.
 */
bool run ( Data& d ) {
  cykfstresult_.DeleteStates();
  this->d_ = &d;
  hieroindexexistence_.clear();
  LINFO ( "Running HiFST" );
  //Reset one-time warnings for inexistent language models.
  warned_ = false;
  pdtparens_.clear();
  cykdata_ = d.cykdata;
  if ( !USER_CHECK ( cykdata_, "cyk parse has not been executed previously?" ) ) {
    resetExternalData (d);
    return true;
  }
  if ( d.cykdata->success == CYK_RETURN_FAILURE ) {
    ///Keep calm, return empty lattice and carry on
    // NOTE(review): aux is never used in this branch; the lattice published is
    // cykfstresult_, which was emptied at the top of this method.
    fst::VectorFst<Arc> aux;
    d.fsts[outputkey_] = &cykfstresult_;
    d.vcat = cykdata_->vcat;
    resetExternalData (d);
    return false;
  }
  ///If not yet, initialize now functor with local conditions.
  initLocalConditions();
  // Helper objects for this sentence; they are deleted at the end of this method.
  // NOTE(review): they are held by raw pointers, so they would not be released if
  // anything in between throws.
  rtn_ = new RTNT;
  if ( localprune_ ) rtnnumstates_ = new ExpandedNumStatesRTNT;
  rfba_ = new ReplaceFstByArcT ( cykdata_->vcat, replacefstbyarc_, replacefstbyarcexceptions_, aligner_, replacefstbynumstates_ );
  piscount_ = 0; //reset pruning-in-search count to 0
  LINFO ( "Second Pass: FST-building!" );
  d.stats->setTimeStart ( "lattice-construction" );
  //Owned by rtn_;
  // Top cell: category "S" spanning the sentence from position 0 with span size() - 1.
  // NOTE(review): size() - 1 assumes a non-empty sentence — confirm upstream guarantees it.
  fst::Fst<Arc> *sfst = buildRTN ( cykdata_->categories["S"], 0, cykdata_->sentence.size() - 1 );
  d.stats->setTimeEnd ( "lattice-construction" );
  // Copy the head lattice into the member so it can be modified and published.
  cykfstresult_ = (*sfst);
  LINFO ( "Final - RTN head optimizations !" );
  optimize ( &cykfstresult_ , std::numeric_limits<unsigned>::max() , !hipdtmode_ && optimize_ );
  FORCELINFO ("Stats for Sentence " << d.sidx << ": local pruning, number of times=" << piscount_);
  d.stats->lpcount = piscount_; //store local pruning counts in stats
  LINFO ("RTN expansion starts now!");
  //Expand...
  {
    ///Define hieroindex
    // Label identifying the top cell (same category index 1, x = 0, y = size() - 1 terms as above).
    Label hieroindex = APBASETAG + 1 * APCCTAG + 0 * APXTAG + ( cykdata_->sentence.size() - 1 ) * APYTAG;
    // Register the head lattice as the root of the RTN unless its index is already known.
    if ( hieroindexexistence_.find ( hieroindex ) == hieroindexexistence_.end() ) pairlabelfsts_.push_back ( pair< Label, const fst::Fst<Arc> * > ( hieroindex, &cykfstresult_ ) );
    ///Optimizations over the rtn -- they are generally worth doing...
    fst::ReplaceUtil<Arc> replace_util (pairlabelfsts_, hieroindex, !aligner_); //has ownership of modified rtn fsts
    if (rtnopt_) {
      LINFO ("rtn optimizations...");
      d_->stats->setTimeStart ("replace-opts");
      replace_util.ReplaceTrivial();
      replace_util.ReplaceUnique();
      replace_util.Connect();
      // Replace the pair list with the optimized one produced by replace_util.
      pairlabelfsts_.clear();
      replace_util.GetFstPairs (&pairlabelfsts_);
      d_->stats->setTimeEnd ("replace-opts");
    }
    //After optimizations, we can write RTN if required by user
    writeRTN();
    // efst receives the expanded lattice; it is released automatically at the end of this scope.
    boost::scoped_ptr< fst::VectorFst<Arc> > efst (new fst::VectorFst<Arc>);
    if (!hipdtmode_ ) {
      LINFO ("Final Replace (RTN->FSA), main index=" << hieroindex);
      d_->stats->setTimeStart ("replace-rtn-final");
      Replace (pairlabelfsts_, &*efst, hieroindex, !aligner_);
      d_->stats->setTimeEnd ("replace-rtn-final");
    } else {
      LINFO ("Final Replace (RTN->PDA)");
      d_->stats->setTimeStart ("replace-pdt-final");
      // This overload also fills pdtparens_.
      Replace (pairlabelfsts_, &*efst, &pdtparens_, hieroindex);
      d_->stats->setTimeEnd ("replace-pdt-final");
      LINFO ("Number of pdtparens=" << pdtparens_.size() );
    }
    LINFO ("Removing Epsilons...");
    fst::RmEpsilon<Arc> ( &*efst );
    LINFO ("Done! NS=" << efst->NumStates() );
    //Apply filters
    applyFilters ( &*efst );
    //Compose with full reference lattice to ensure that final lattice is correct.
    // Only done when a lattice is stored under fullreferencelatticekey_ and it has states.
    if ( d.fsts.find ( fullreferencelatticekey_ ) != d.fsts.end() ) {
      if ( static_cast< fst::VectorFst<Arc> * > (d.fsts[fullreferencelatticekey_])->NumStates() > 0 ) {
        LINFO ( "Composing with full reference lattice, NS=" << static_cast< fst::VectorFst<Arc> * > (d.fsts[fullreferencelatticekey_])->NumStates() );
        fst::Compose<Arc> ( *efst, * ( static_cast<fst::VectorFst<Arc> * > (d.fsts[fullreferencelatticekey_]) ), &*efst );
        LINFO ( "After composition: NS=" << efst->NumStates() );
      } else {
        LINFO ( "No composition with full ref lattice" );
      };
    } else {
      LINFO ( "No composition with full ref lattice" );
    };
    //Apply language model
    // res stays NULL when the lattice is empty; applyLanguageModel may presumably
    // also return NULL (see the res != NULL test below).
    fst::VectorFst<Arc> *res = NULL;
    if (efst->NumStates() ) res = applyLanguageModel ( *efst );
    else {
      LWARN ("Empty lattice -- skipping LM application");
    }
    if ( res != NULL ) {
      // latlm takes ownership of res and deletes it when this branch ends.
      boost::shared_ptr<fst::VectorFst<Arc> >latlm ( res );
      // If res were the very object held by efst, both smart pointers would delete it.
      if ( latlm.get() == efst.get() ) {
        LWARN ( "Yikes! Unexpected situation! Will it crash? (muhahaha) " );
      }
      //Todo: union with shortest path...
      // A pruneweight_ equal to the float maximum means "no pruning".
      if ( pruneweight_ < std::numeric_limits<float>::max() ) {
        if (!hipdtmode_ || pdtparens_.empty() ) {
          LINFO ("Pruning, weight=" << pruneweight_);
          fst::Prune<Arc> (*latlm, &cykfstresult_, mw_ ( pruneweight_ ) );
        } else {
          LINFO ("Expanding, weight=" << pruneweight_);
          fst::ExpandOptions<Arc> eopts (true, false, mw_ ( pruneweight_ ) );
          Expand ( *latlm, pdtparens_, &cykfstresult_, eopts);
          // The parentheses are dropped once the PDT has been expanded.
          pdtparens_.clear();
        }
      } else {
        LINFO ("Copying through full lattice with lm scores");
        cykfstresult_ = *latlm;
      }
    } else {
      LINFO ("Copying through full lattice (no lm)");
      cykfstresult_ = *efst;
    }
    // NOTE(review): meant to undo the push_back above, but when rtnopt_ is set the
    // list was refilled by GetFstPairs, so the last entry may be a different one.
    // The list is cleared right after this scope anyway — confirm intent.
    if ( hieroindexexistence_.find ( hieroindex ) == hieroindexexistence_.end() ) pairlabelfsts_.pop_back();
  }
  pairlabelfsts_.clear();
  LINFO ( "Reps" );
  fst::RmEpsilon ( &cykfstresult_ );
  LINFO ( "NS=" << cykfstresult_.NumStates() );
  //This should delete all pertinent fsas...
  LINFO ( "deleting data stuff..." );
  delete rtn_;
  if ( localprune_ ) delete rtnnumstates_;
  delete rfba_;
  d.vcat = cykdata_->vcat;
  resetExternalData (d);
  // Publish the result; the pointers refer to members of this task object.
  d.fsts[outputkey_] = &cykfstresult_;
  // NOTE(review): pdtparens_ is cleared after Expand above, so this only fires in
  // hipdt mode when the Expand branch was not taken.
  if (hipdtmode_ && pdtparens_.size() ) d.fsts[outputkey_ + ".parens" ] = &pdtparens_;
  LINFO ( "done..." );
  FORCELINFO ( "End Sentence ******************************************************" );
  // The matching setTimeStart for "sent-dec" is not in this method.
  d.stats->setTimeEnd ( "sent-dec" );
  d.stats->message += "[" + ucam::util::getTimestamp() + "] End Sentence\n";
  return false;
};