Beispiel #1
0
  /**
   * \brief Method inherited from ucam::util::TaskInterface. Applies, one after another, every
   * language model stored under lmkey_ to a copy of the lattice stored under latticeloadkey_,
   * and publishes a pointer to the resulting lattice under latticestorekey_.
   * \param d Data object holding the language models (d.klm), the lattices (d.fsts) and
   * the timing statistics (d.stats). It receives the pointer to the output lattice.
   * \returns true if no language model or no input lattice is available, or if one of the
   * models is NULL; false once all language models have been applied.
   */
  bool run ( Data& d ) {
    mylmfst_.DeleteStates();
    //Nothing can be done without language models or without an input lattice.
    if ( !USER_CHECK ( d.klm.size() ,
                       "No language models available" ) ) return true;
    if ( !USER_CHECK ( d.klm.find ( lmkey_ ) != d.klm.end() ,
                       "No language models available (key not initialized) " ) ) return true;
    if ( !USER_CHECK ( d.fsts.find ( latticeloadkey_ ) != d.fsts.end() ,
                       " Input fst not available!" ) ) return true;
    //Work on a private copy: the input lattice itself is left untouched.
    mylmfst_ = * (static_cast<fst::VectorFst<Arc> * > ( d.fsts[latticeloadkey_] ) );
    if (deletelmscores_) {
      LINFO ( "Delete old LM scores first" );
      //Deletes LM scores if using lexstdarc. Note -- will copy through on stdarc!
      fst::MakeWeight2<Arc> mwcopy;
      fst::Map<Arc> ( &mylmfst_,
                      fst::GenericWeightAutoMapper<Arc, fst::MakeWeight2<Arc> > ( mwcopy ) );
    }
    LINFO ( "Input lattice loaded with key=" << latticeloadkey_ << ", NS=" <<
            mylmfst_.NumStates() );
    fst::MakeWeight<Arc> mw;
    for ( unsigned k = 0; k < d.klm[lmkey_].size(); ++k ) {
      if ( !USER_CHECK ( d.klm[lmkey_][k]->model != NULL,
                         "Language model " << k << " not available!" ) ) return true;
      KenLMModelT& model = *d.klm[lmkey_][k]->model;
#ifndef USE_GOOGLE_SPARSE_HASH
      unordered_set<Label> epsilons;
#else
      google::dense_hash_set<Label> epsilons;
      epsilons.set_empty_key ( numeric_limits<Label>::max() );
#endif
      ///We want the language model to ignore these guys:
      epsilons.insert ( DR );
      epsilons.insert ( OOV );
      epsilons.insert ( EPSILON );
      epsilons.insert ( SEP );
      LINFO ( "Applying language model " << k
              << " with lmkey=" << lmkey_
              << ", using lmscale=" << d.klm[lmkey_][k]->lmscale );
      LDEBUG ( "lattice NS=" << mylmfst_.NumStates() );
      {
        //Automatic object instead of new/delete: the functor is released even if the
        //composition or the statistics calls throw. It goes out of scope before the
        //timer is stopped, exactly where it used to be deleted.
        fst::ApplyLanguageModelOnTheFly<Arc, fst::MakeWeight<Arc>, KenLMModelT> f
        ( mylmfst_
          , model
          , epsilons
          , natlog_
          , d.klm[lmkey_][k]->lmscale
          , d.klm[lmkey_][k]->lmwp
          , d.klm[lmkey_][k]->idb);
        f.setMakeWeight ( mw );
        d.stats->setTimeStart ( "on-the-fly-composition " +  ucam::util::toString (
                                  k ) );
        //NOTE(review): the lattice returned by the functor is copied and the pointer is
        //dropped; confirm that the functor keeps ownership of it, otherwise it leaks.
        mylmfst_ = * ( f () );
      }
      d.stats->setTimeEnd ( "on-the-fly-composition " + ucam::util::toString ( k ) );
      LDEBUG ( mylmfst_.NumStates() );
      mw.update();
    }
    d.fsts[latticestorekey_] = &mylmfst_;
    LINFO ( "Done!" );
    return false;
  }
Beispiel #2
0
 /**
  * \brief Constructor with registry object and several keys to access data object and registry.
  * Reads all the hifst options from the registry, logs the main modes and sanity-checks
  * the local pruning configuration.
  * \param rg                       Registry object with the program options.
  * \param outputkey                Key stored in outputkey_.
  * \param locallmkey               Registry key listing the language models for local pruning;
  *                                 only the number of entries is read here.
  * \param fullreferencelatticekey  Key stored in fullreferencelatticekey_.
  * \param lmkey                    Key stored in lmkey_.
  */
 HiFSTTask ( const ucam::util::RegistryPO& rg ,
             const std::string& outputkey = HifstConstants::kHifstLatticeStore,
             const std::string& locallmkey = HifstConstants::kHifstLocalpruneLmLoad,
             const std::string& fullreferencelatticekey =
               HifstConstants::kReferencefilterNosubstringStore ,
             const std::string& lmkey = HifstConstants::kLmLoad
           ) :
   optimize_ (rg.getBool (HifstConstants::kHifstOptimizecells) ),
   numlocallm_ (rg.getVectorString (locallmkey).size() ),
   warned_ (false),
   rtnfiles_ (rg.get<std::string> (HifstConstants::kHifstWritertn) ),
   fullreferencelatticekey_ ( fullreferencelatticekey ),
   lmkey_ ( lmkey ),
   locallmkey_ ( locallmkey ),
   outputkey_ ( outputkey ),
   piscount_ ( 0 ),
   aligner_ ( rg.getBool ( HifstConstants::kHifstAlilatsmode ) ),
   //    cellredm_ ( rg.getBool ( "hifst.cellredm" ) ),
   //    finalredm_ ( rg.getBool ( "hifst.finalredm" ) ),
   hipdtmode_ (rg.getBool (HifstConstants::kHifstUsepdt) ),
   rtnopt_ (rg.getBool (HifstConstants::kHifstRtnopt) ),
   replacefstbyarc_ ( rg.getSetString (
                        HifstConstants::kHifstReplacefstbyarcNonterminals ) ),
   replacefstbyarcexceptions_ ( rg.getSetString (
                                  HifstConstants::kHifstReplacefstbyarcExceptions ) ),
   replacefstbynumstates_ ( rg.get<unsigned>
                            ( HifstConstants::kHifstReplacefstbyarcNumstates ) ),
   localprune_ ( rg.getBool ( HifstConstants::kHifstLocalpruneEnable ) ),
   pruneweight_ ( rg.get<float> ( HifstConstants::kHifstPrune ) ),
   numstatesthreshold_ ( rg.get<unsigned>
                         ( HifstConstants::kHifstLocalpruneNumstates ) ),
   lpctuples_ ( rg.getVectorString (
                  HifstConstants::kHifstLocalpruneConditions ) ) {
   LINFO ("Number of local language models=" << numlocallm_);
   LINFO ("aligner mode=" << aligner_);
   LINFO ("localprune mode=" << localprune_);
   //Validate the raw option strings just read from the registry. The check used to look
   //at lpc_, which this constructor never fills, so it presumably could not fail.
   USER_CHECK ( ! ( lpctuples_.size() % 4 ),
                "local pruning conditions are defined by tuples of 4 elements: category,x,y,Number-of-states. Category is a string and x,y are int. Number of states is unsigned" );
   //Local pruning needs a local language model, unless we are in alignment mode.
   USER_CHECK ( (localprune_ && numlocallm_) || ( localprune_ && !numlocallm_
                && aligner_ ) || (!localprune_) ,
                "If you want to do cell pruning in translation, you should  use a language model for local pruning. Check --hifst.localprune.lm.load and --hifst.localprune.enable.\n");
   optimize.setAlignMode (aligner_);
   if (hipdtmode_) {
     LINFO ("Hipdt mode enabled!");
   }
   if (!rtnopt_) {
     LINFO ("RTN openfst optimizations will not be applied");
   }
   LDEBUG ( "Hifst constructor done!" );
 }
 /**
  * \brief Recursively instantiates one pattern over a sentence, appending to the last
  * entry of pinstances and cloning that entry for every additional gap span.
  * \param spattern    Pattern elements; each one must be "w" (word) or "X" (gap).
  * \param ss          Sentence words.
  * \param pinstances  Instances built so far; must not be empty.
  * \param ps          Index in ss at which the pattern starts.
  * \param pp          Index of the pattern element to process now.
  * \param gaphistory  Extra words consumed so far by the gaps preceding pp.
  */
 void buildNextElementFromPattern ( std::vector<std::string>& spattern,
                                    std::vector<std::string>& ss,
                                    std::vector< std::vector<std::string> >& pinstances,
                                    unsigned ps,
                                    unsigned pp,
                                    unsigned gaphistory = 0 ) {
   LDEBUG ( "startingword:" << ss[ps] << ",thisword:" <<   ss[ps + pp +
            gaphistory ] << ",thiselement:" << spattern[pp] << ",ps=" << ps << ",pp=" << pp
            << ",spatternsize=" << spattern.size() << ",gaphistory=" << gaphistory );
   const std::string& element = spattern[pp];
   //Neither spattern nor ss changes during the recursion, so this holds for the whole call.
   const bool morepattern = ( pp + 1 < spattern.size() );
   if ( element == "w" ) {
     pinstances.back().push_back ( ucam::util::toString (
                                     ss[ps + pp + gaphistory] ) );
     if ( !morepattern ) return;
     if ( ps + spattern.size() + gaphistory > ss.size() ) return;
     buildNextElementFromPattern ( spattern, ss, pinstances, ps, pp + 1,
                                   gaphistory );
     return;
   }
   if ( element == "X" ) {
     LDEBUG ( "X,with gapmaxspan=" << gapmaxspan_ );
     pinstances.back().push_back ( "X" );
     //Snapshot of the instance up to and including this gap, cloned for every wider span.
     const std::vector<std::string> prefix = pinstances.back();
     for ( unsigned gap = 1; morepattern && ( gap <= gapmaxspan_ ); ++gap ) {
       //Stop as soon as the pattern would run past the sentence or exceed the maximum span.
       if ( ps + spattern.size() - 1 + gaphistory + gap - 1 >= ss.size() ) break;
       if ( spattern.size() + gaphistory + gap - 1 > maxspan_ ) break;
       LDEBUG ( "GAPSPAN=" << gap );
       if ( gap > 1 ) pinstances.push_back ( prefix );
       buildNextElementFromPattern ( spattern, ss, pinstances, ps, pp + 1,
                                     gaphistory + gap - 1 );
     }
     return;
   }
   ///Unknown pattern element: the check below necessarily fails and reports it to the user.
   USER_CHECK ( spattern[pp] == "X" || spattern[pp] == "w", "Incorrect pattern!" );
 }
Beispiel #4
0
  /**
   * \brief Opens a [file] (pipe, or a text/compressed file) for reading, after calling close().
   * With fdstream (USE_FDSTREAM_HPP): all cases are handled by an external `zcat -f`
   * process whose output is read through a pipe.
   * With boost iostreams (default): the file is opened directly; a gzip decompressor is
   * added if the name ends with ".gz" and does not start with "/dev/".
   * Exits the program with EXIT_FAILURE if the file or the pipe cannot be opened.
   * \param filename File name to be opened; "-" stands for standard input.
   */
  inline void open ( const std::string& filename ) {
    close();
#ifdef USE_FDSTREAM_HPP
    if ( filename != "-" ) {
      LDEBUG ( "Test file=[" << filename << "]" );
      sfile_ = fopen ( filename.c_str(), "r" );
      //fclose(NULL) is undefined behaviour: give up right here if the file is not readable.
      if ( !USER_CHECK ( sfile_ != NULL,
                         "Error while opening file" ) ) exit (EXIT_FAILURE);
      fclose ( sfile_ );
      sfile_ = NULL;
      //Now we can assume more or less safely that file really exists (sigh)
    }
    //lets open this with the pipe. The file name is single-quoted so that spaces or
    //shell metacharacters in it are passed to zcat literally.
    std::string command = "zcat -f '";
    for ( std::string::size_type k = 0; k < filename.size(); ++k ) {
      if ( filename[k] == '\'' ) command += "'\\''";
      else command += filename[k];
    }
    command += "'";
    LINFO ( "Opening (fd)" << command );
    sfile_ = popen ( command.c_str(), "r" );
    //fileno(NULL) is undefined behaviour as well.
    if ( !USER_CHECK ( sfile_ != NULL,
                       "Error while opening pipe" ) ) exit (EXIT_FAILURE);
    filestream_ = new boost::fdistream ( fileno ( sfile_ ) );
    USER_CHECK (filestream_, "File Stream allocation failed!");
#else
    LINFO ( "Opening " << filename );
    std::string auxfilename = filename;
    if (auxfilename == "-" ) auxfilename = "/dev/stdin";
    file.reset (new std::ifstream (auxfilename.c_str(),
                                   std::ios_base::in | std::ios_base::binary) );
    if (!USER_CHECK (file->is_open(),
                     "Error while opening file:") ) exit (EXIT_FAILURE);
    in.reset (new boost::iostreams::filtering_streambuf<boost::iostreams::input>);
    const bool gzname = ends_with (auxfilename, ".gz");
    //Anything under /dev/ (e.g. /dev/stdin) is read as is, whatever its name.
    if ( gzname && auxfilename.substr (0, 5) != "/dev/" )
      in->push (boost::iostreams::gzip_decompressor() );
    in->push (*file);
    filestream_ = new std::istream (&*in);
    if (!USER_CHECK (filestream_,
                     "File Stream allocation failed!") ) exit (EXIT_FAILURE);
    //Try to detect whether it actually _is_ a compressed file (sigh) .
    if ( gzname ) {
      filestream_->peek();
      if (!USER_CHECK (filestream_->good(),
                       "File not open/doesn't exist... Or possibly not compressed but ends with .gz? " ) )
        exit (EXIT_FAILURE);
    }
#endif
  }
Beispiel #5
0
 /**
  * \brief Runs the lattice building procedure.
  * Builds an RTN from the cyk data, expands it into an FSA (or a PDA in hipdt mode),
  * applies filters, composes with the full reference lattice if there is one, applies the
  * language model and finally prunes/expands the result into cykfstresult_.
  * \param d          Contains the data structure with all the necessary elements (i.e. cykdata) and in which will be stored a pointer to the
  * output lattice.
  * \returns true if d.cykdata is not available; false otherwise. If the cyk parse failed,
  * an empty lattice is stored and false is returned.
  */
 bool run ( Data& d ) {
   cykfstresult_.DeleteStates();
   this->d_ = &d;
   hieroindexexistence_.clear();
   LINFO ( "Running HiFST" );
   //Reset one-time warnings for inexistent language models.
   warned_ = false;
   pdtparens_.clear();
   cykdata_ = d.cykdata;
   if ( !USER_CHECK ( cykdata_, "cyk parse has not been executed previously?" ) ) {
     resetExternalData (d);
     return true;
   }
   if ( d.cykdata->success == CYK_RETURN_FAILURE ) {
     ///Keep calm, return empty lattice and carry on
     d.fsts[outputkey_] = &cykfstresult_;
     d.vcat = cykdata_->vcat;
     resetExternalData (d);
     return false;
   }
   ///If not yet, initialize now functor with local conditions.
   initLocalConditions();
   //NOTE(review): these three objects are released by hand at the end of this method;
   //an exception thrown in between would leak them.
   rtn_ = new RTNT;
   if ( localprune_ )
     rtnnumstates_ = new ExpandedNumStatesRTNT;
   rfba_ = new ReplaceFstByArcT ( cykdata_->vcat, replacefstbyarc_,
                                  replacefstbyarcexceptions_, aligner_, replacefstbynumstates_ );
   piscount_ = 0; //reset pruning-in-search count to 0
   LINFO ( "Second Pass: FST-building!" );
   d.stats->setTimeStart ( "lattice-construction" );
   //Owned by rtn_;
   fst::Fst<Arc> *sfst = buildRTN ( cykdata_->categories["S"], 0,
                                    cykdata_->sentence.size() - 1 );
   d.stats->setTimeEnd ( "lattice-construction" );
   cykfstresult_ = (*sfst);
   LINFO ( "Final - RTN head optimizations !" );
   optimize ( &cykfstresult_ ,
              std::numeric_limits<unsigned>::max() ,
              !hipdtmode_  && optimize_
            );
   FORCELINFO ("Stats for Sentence " << d.sidx <<
               ": local pruning, number of times=" << piscount_);
   d.stats->lpcount = piscount_; //store local pruning counts in stats
   LINFO ("RTN expansion starts now!");
   //Expand...
   {
     ///Define hieroindex
     Label hieroindex = APBASETAG + 1 * APCCTAG + 0 * APXTAG +
                        ( cykdata_->sentence.size() - 1 ) * APYTAG;
     //Register the root fst, unless its index is already known.
     if ( hieroindexexistence_.find ( hieroindex ) == hieroindexexistence_.end() )
       pairlabelfsts_.push_back ( pair< Label, const fst::Fst<Arc> * > ( hieroindex,
                                  &cykfstresult_ ) );
     ///Optimizations over the rtn -- they are generally worth doing...
     fst::ReplaceUtil<Arc> replace_util (pairlabelfsts_, hieroindex,
                                         !aligner_); //has ownership of modified rtn fsts
     if (rtnopt_) {
       LINFO ("rtn optimizations...");
       d_->stats->setTimeStart ("replace-opts");
       replace_util.ReplaceTrivial();
       replace_util.ReplaceUnique();
       replace_util.Connect();
       //From here on the pairs are the ones handed out by replace_util.
       pairlabelfsts_.clear();
       replace_util.GetFstPairs (&pairlabelfsts_);
       d_->stats->setTimeEnd ("replace-opts");
     }
     //After optimizations, we can write RTN if required by user
     writeRTN();
     boost::scoped_ptr< fst::VectorFst<Arc> > efst (new fst::VectorFst<Arc>);
     if (!hipdtmode_ ) {
       LINFO ("Final Replace (RTN->FSA), main index=" << hieroindex);
       d_->stats->setTimeStart ("replace-rtn-final");
       Replace (pairlabelfsts_, &*efst, hieroindex, !aligner_);
       d_->stats->setTimeEnd ("replace-rtn-final");
     } else {
       LINFO ("Final Replace (RTN->PDA)");
       d_->stats->setTimeStart ("replace-pdt-final");
       Replace (pairlabelfsts_, &*efst, &pdtparens_, hieroindex);
       d_->stats->setTimeEnd ("replace-pdt-final");
       LINFO ("Number of pdtparens=" << pdtparens_.size() );
     }
     LINFO ("Removing Epsilons...");
     fst::RmEpsilon<Arc> ( &*efst );
     LINFO ("Done! NS=" << efst->NumStates() );
     //Apply filters
     applyFilters ( &*efst );
     //Compose with full reference lattice to ensure that final lattice is correct.
     if ( d.fsts.find ( fullreferencelatticekey_ ) != d.fsts.end() ) {
       if ( static_cast< fst::VectorFst<Arc> * >
            (d.fsts[fullreferencelatticekey_])->NumStates() > 0 ) {
         LINFO ( "Composing with full reference lattice, NS=" <<
                 static_cast< fst::VectorFst<Arc> * >
                 (d.fsts[fullreferencelatticekey_])->NumStates() );
         fst::Compose<Arc> ( *efst,
                             * ( static_cast<fst::VectorFst<Arc> * > (d.fsts[fullreferencelatticekey_]) ),
                             &*efst );
         LINFO ( "After composition: NS=" << efst->NumStates() );
       } else {
         LINFO ( "No composition with full ref lattice" );
       };
     } else {
       LINFO ( "No composition with full ref lattice" );
     };
     //Apply language model
     fst::VectorFst<Arc> *res = NULL;
     if (efst->NumStates() )
       res = applyLanguageModel ( *efst  );
     else {
       LWARN ("Empty lattice -- skipping LM application");
     }
     if ( res != NULL ) {
       //latlm takes ownership of the lattice returned by applyLanguageModel.
       boost::shared_ptr<fst::VectorFst<Arc> >latlm ( res );
       if ( latlm.get() == efst.get() ) {
         //Both smart pointers would then own (and delete) the same lattice.
         LWARN ( "Yikes! Unexpected situation! Will it crash? (muhahaha) " );
       }
       //Todo: union with shortest path...
       if ( pruneweight_ < std::numeric_limits<float>::max() ) {
         if (!hipdtmode_ || pdtparens_.empty() ) {
           LINFO ("Pruning, weight=" << pruneweight_);
           fst::Prune<Arc> (*latlm, &cykfstresult_, mw_ ( pruneweight_ ) );
         } else {
           LINFO ("Expanding, weight=" << pruneweight_);
           fst::ExpandOptions<Arc> eopts (true, false, mw_ ( pruneweight_ ) );
           Expand ( *latlm, pdtparens_, &cykfstresult_, eopts);
           pdtparens_.clear();
         }
       } else {
         LINFO ("Copying through full lattice with lm scores");
         cykfstresult_ = *latlm;
       }
     } else {
       LINFO ("Copying through full lattice (no lm)");
       cykfstresult_ = *efst;
     }
   }
   //Drop every (label, fst) pair, the root pair included: none is needed after the expansion.
   pairlabelfsts_.clear();
   LINFO ( "Reps" );
   fst::RmEpsilon ( &cykfstresult_ );
   LINFO ( "NS=" << cykfstresult_.NumStates() );
   //This should delete all pertinent fsas...
   LINFO ( "deleting data stuff..." );
   delete rtn_;
   if ( localprune_ )
     delete rtnnumstates_;
   delete rfba_;
   d.vcat = cykdata_->vcat;
   resetExternalData (d);
   d.fsts[outputkey_] = &cykfstresult_;
   //In hipdt mode the parentheses are published too, unless they were consumed by Expand.
   if (hipdtmode_ && pdtparens_.size() )
     d.fsts[outputkey_ + ".parens" ] = &pdtparens_;
   LINFO ( "done..." );
   FORCELINFO ( "End Sentence ******************************************************" );
   d.stats->setTimeEnd ( "sent-dec" );
   d.stats->message += "[" + ucam::util::getTimestamp() + "] End Sentence\n";
   return false;
 }