Exemple #1
0
 /**
  * \brief Indexed constructor (described as private in the original notes), taking a
  * ucam::util::RegistryPO object and the position of one particular language model.
  *
  * It is meant for the case in which several language models are requested
  * (e.g. --lm.load=lm1,lm2,lm3,lm4): the second and following instances of
  * LoadLanguageModelTask are created through this constructor, each one
  * receiving the index of the language model it is responsible for.
  * Whenever the list stored under \p lmload has more entries after \p index,
  * the constructor itself appends a new LoadLanguageModelTask for index + 1.
  *
  * \param rg          ucam::util::RegistryPO object, containing user parameters.
  * \param index       Index to the actual language model in the \p lmload list.
  * \param lmload      Key word to access the registry object for language models.
  * \param lmscale     Key word to access the registry object for language model scales
  *                    (handed over to setLanguageModelScale).
  * \param lmwp        Key word handed over to setLanguageModelWordPenalty.
  * \param wordmapkey  Key word for the word map. If the key does not exist in the
  *                    registry, or its value is an empty string, the task is
  *                    flagged as integer mapped.
  */
 LoadLanguageModelTask ( const ucam::util::RegistryPO& rg
                         , uint index
                         , const std::string& lmload = HifstConstants::kLmLoad
                         , const std::string& lmscale = HifstConstants::kLmFeatureweights
                         , const std::string& lmwp = HifstConstants::kLmWordPenalty
                         , const std::string& wordmapkey = HifstConstants::kLmWordmap
                       )
     : rg_ ( rg )
     , lmkey_ ( lmload )
     , previous_ ( "" )
     , built_ ( false )
     , index_ ( index )
     , isintegermapped_ ( !rg.exists ( wordmapkey )
                          || rg.get<std::string> ( wordmapkey ) == "" )
     , wordmapkey_ ( wordmapkey )
     , lmfile_ ( rg.getVectorString ( lmload , index ) )
 {
   LDEBUG ( "LM loader using parameters " << lmload << "/" << lmscale
            << ", and key " << lmkey_
            << ",index=" << index_
            << ",wordmapkey=" << wordmapkey_ );
   setLanguageModelScale ( lmscale );
   setLanguageModelWordPenalty ( lmwp );
   // Is there at least one more language model listed after this one?
   const bool morelms = rg.getVectorString ( lmload ).size() > index_ + 1;
   if ( morelms ) {
     LDEBUG ( "Appending Language model..." );
     // The next loader receives exactly the same keys, only the index moves on.
     this->appendTask ( new LoadLanguageModelTask ( rg, index_ + 1, lmload,
                        lmscale, lmwp, wordmapkey ) );
   }
   LDEBUG ( "." );
 }
 /// Constructor with RegistryPO object: reads the CYK-grid statistics switch,
 /// the cell width and the stats output setting from the registry, and appends
 /// a SpeedStatsTask built from the same registry.
 HifstStatsTask ( const ucam::util::RegistryPO& rg )
     : d_ ( NULL )
     , writeCYKStats_ ( rg.getBool ( HifstConstants::kStatsHifstCykgridEnable ) )
     , width_ ( rg.get<unsigned> ( HifstConstants::kStatsHifstCykgridCellwidth ) )
     , statsoutput_ ( rg.get<std::string> ( HifstConstants::kStatsHifstWrite ) )
 {
   // The speed statistics task is created here and handed over to appendTask.
   this->appendTask ( new ucam::fsttools::SpeedStatsTask<Data> ( rg ) );
 }
 /**
  * \brief Constructor with registry object, offset and keys.
  * \param rg                 ucam::util::RegistryPO object, containing user parameters.
  * \param offset             Stored as is; defaults to 1, the minimum offset
  *                           considering only one language model.
  * \param alignmentlattices  Registry key for the alignment lattices. Filtering by
  *                           alignment lattices is flagged only if this key exists
  *                           in the registry; otherwise the stored value is "".
  * \param grammarloadkey     Registry key from which the grammar is read.
  * \param grammarstorekey    Key kept for later use (stored in grammarstorekey_).
  */
 LoadSparseWeightFlowerLatticeTask ( const ucam::util::RegistryPO& rg
                                     , const unsigned offset = 1
                                     , const std::string& alignmentlattices =
                                         HifstConstants::kRuleflowerlatticeFilterbyalilats
                                     , const std::string& grammarloadkey =
                                         HifstConstants::kRuleflowerlatticeLoad
                                     , const std::string& grammarstorekey =
                                         HifstConstants::kRuleflowerlatticeStore
                                   )
     : offset_ ( offset )
     , rg_ ( rg )
     , alilats_ ( rg.exists ( alignmentlattices )
                  ? rg.get<std::string> ( alignmentlattices )
                  : "" )
     , grammar_ ( rg.get<std::string> ( grammarloadkey ) )
     , fscales_ ( fst::TropicalSparseTupleWeight<float>::Params() )
     , filterbyalilats_ ( rg.exists ( alignmentlattices ) )
     , grammarstorekey_ ( grammarstorekey )
 {
   // Nothing else to do: all the state is set up by the initializer list.
 }
Exemple #4
0
 /**
  * \brief Constructor with registry object and several keys to access data object and registry.
  *
  * All the hifst options are read from the registry in the initializer list.
  * The body then logs the main modes, validates the local pruning
  * configuration through USER_CHECK and forwards the aligner mode to the
  * optimizer.
  *
  * \param rg                       ucam::util::RegistryPO object, containing user parameters.
  * \param outputkey                Key stored in outputkey_.
  * \param locallmkey               Key stored in locallmkey_; the number of entries listed
  *                                 under it in the registry is taken as the number of
  *                                 local language models.
  * \param fullreferencelatticekey  Key stored in fullreferencelatticekey_.
  * \param lmkey                    Key stored in lmkey_.
  */
 HiFSTTask ( const ucam::util::RegistryPO& rg ,
             const std::string& outputkey = HifstConstants::kHifstLatticeStore,
             const std::string& locallmkey = HifstConstants::kHifstLocalpruneLmLoad,
             const std::string& fullreferencelatticekey =
               HifstConstants::kReferencefilterNosubstringStore ,
             const std::string& lmkey = HifstConstants::kLmLoad
           ) :
   optimize_ (rg.getBool (HifstConstants::kHifstOptimizecells) ),
   numlocallm_ (rg.getVectorString (locallmkey).size() ),
   warned_ (false),
   rtnfiles_ (rg.get<std::string> (HifstConstants::kHifstWritertn) ),
   fullreferencelatticekey_ ( fullreferencelatticekey ),
   lmkey_ ( lmkey ),
   locallmkey_ ( locallmkey ),
   outputkey_ ( outputkey ),
   piscount_ ( 0 ),
   aligner_ ( rg.getBool ( HifstConstants::kHifstAlilatsmode ) ),
   //    cellredm_ ( rg.getBool ( "hifst.cellredm" ) ),
   //    finalredm_ ( rg.getBool ( "hifst.finalredm" ) ),
   hipdtmode_ (rg.getBool (HifstConstants::kHifstUsepdt) ),
   rtnopt_ (rg.getBool (HifstConstants::kHifstRtnopt) ),
   replacefstbyarc_ ( rg.getSetString (
                        HifstConstants::kHifstReplacefstbyarcNonterminals ) ),
   replacefstbyarcexceptions_ ( rg.getSetString (
                                  HifstConstants::kHifstReplacefstbyarcExceptions ) ),
   replacefstbynumstates_ ( rg.get<unsigned>
                            ( HifstConstants::kHifstReplacefstbyarcNumstates ) ),
   localprune_ ( rg.getBool ( HifstConstants::kHifstLocalpruneEnable ) ),
   pruneweight_ ( rg.get<float> ( HifstConstants::kHifstPrune ) ),
   numstatesthreshold_ ( rg.get<unsigned>
                         ( HifstConstants::kHifstLocalpruneNumstates ) ),
   lpctuples_ ( rg.getVectorString (
                  HifstConstants::kHifstLocalpruneConditions ) ) {
   LINFO ("Number of local language models=" << numlocallm_);
   LINFO ("aligner mode=" << aligner_);
   LINFO ("localprune mode=" << localprune_);
   // Validate the conditions the user actually supplied: they were read into
   // lpctuples_ just above. The previous check looked at lpc_, which this
   // constructor never initializes, so it could not catch a malformed list.
   // NOTE(review): assumes lpc_ is filled later from lpctuples_ in groups of
   // four -- confirm against the rest of the class.
   USER_CHECK ( ! ( lpctuples_.size() % 4 ),
                "local pruning conditions are defined by tuples of 4 elements: category,x,y,Number-of-states. Category is a string and x,y are int. Number of states is unsigned" );
   // Local pruning needs a local language model unless we are in aligner mode.
   // Logically equivalent to the longer form
   //   (lp && nlm) || (lp && !nlm && ali) || !lp.
   USER_CHECK ( !localprune_ || numlocallm_ || aligner_ ,
                "If you want to do cell pruning in translation, you should  use a language model for local pruning. Check --hifst.localprune.lm.load and --hifst.localprune.enable.\n");
   optimize.setAlignMode (aligner_);
   if (hipdtmode_) {
     LINFO ("Hipdt mode enabled!");
   }
   if (!rtnopt_) {
     LINFO ("RTN openfst optimizations will not be applied");
   }
   LDEBUG ( "Hifst constructor done!" );
 }
Exemple #5
0
 /**
  * \brief Constructor with ucam::util::RegistryPO object.
  * \param rg               ucam::util::RegistryPO object, containing user parameters.
  *                         Only queried for the presence of HifstConstants::kLmLogTen:
  *                         natlog_ is set to true when that option is absent.
  * \param lmkey            Key stored in lmkey_.
  * \param latticeloadkey   Key stored in latticeloadkey_.
  * \param latticestorekey  Key stored in latticestorekey_.
  * \param deletelmscores   Flag stored in deletelmscores_.
  */
 ApplyLanguageModelTask ( const ucam::util::RegistryPO& rg
                          , const std::string& lmkey = HifstConstants::kLmLoad
                          , const std::string& latticeloadkey = "lm.lattice.load"
                          , const std::string& latticestorekey = "lm.lattice.store"
                          , bool deletelmscores = false
                        )
     : lmkey_ ( lmkey )
     , latticeloadkey_ ( latticeloadkey )
     , latticestorekey_ ( latticestorekey )
     , natlog_ ( !rg.exists ( HifstConstants::kLmLogTen ) )
     , deletelmscores_ ( deletelmscores )
 {
   // Nothing else to do: all the state is set up by the initializer list.
 }
Exemple #6
0
 /**
  * \brief Public constructor; it always takes care of the first language model (index 0).
  *
  * If the user wants to load several language models
  * (e.g. --lm.load=lm1,lm2,lm3,lm4), the second and following instances of
  * LoadLanguageModelTask are created with the indexed constructor, starting
  * here with index 1, unless \p forceone is set.
  *
  * \param rg          ucam::util::RegistryPO object, containing user parameters.
  * \param lmload      Key word to access the registry object for language models.
  * \param lmscale     Key word to access the registry object for language model scales
  *                    (handed over to setLanguageModelScale).
  * \param lmwp        Key word handed over to setLanguageModelWordPenalty.
  * \param wordmapkey  Key word for the word map. If the key does not exist in the
  *                    registry, or its value is an empty string, the task is
  *                    flagged as integer mapped.
  * \param forceone    To force the loading of only one language model: when several
  *                    are listed, a warning is logged and no further loader is appended.
  */
 LoadLanguageModelTask ( const ucam::util::RegistryPO& rg,
                         const std::string& lmload = HifstConstants::kLmLoad,
                         // Original note: if rg.get(lmscale)=="" the scale will default to 1
                         const std::string& lmscale = HifstConstants::kLmFeatureweights,
                         // Original note: if rg.get(wps)=="" the scale will default to 0
                         const std::string& lmwp = HifstConstants::kLmWordPenalty,
                         const std::string& wordmapkey = HifstConstants::kLmWordmap,
                         bool forceone = false
                       ) :
   rg_ ( rg ),
   lmkey_ ( lmload ),
   previous_ ( "" ),
   built_ ( false ),
   index_ ( 0 ),
   isintegermapped_ ( !rg.exists ( wordmapkey )
                      || rg.get<std::string> ( wordmapkey ) == "" ),
   wordmapkey_ ( wordmapkey ),
   lmfile_ ( rg.getVectorString ( lmload , 0 ) ) {
   LDEBUG ( "LM loader using parameters " << lmload << "/" << lmscale << "/" << lmwp
            << ", and key " << lmkey_
            << ",index=" << index_
            << ",wordmap=" << wordmapkey_ );
   FORCELINFO ( "Language model loader for " << lmfile_() );
   setLanguageModelScale ( lmscale );
   setLanguageModelWordPenalty ( lmwp );
   if ( rg_.getVectorString ( lmload ).size() > 1 ) {
     if ( forceone ) {
       // The caller asked for a single model: the rest of the list is skipped.
       LWARN ( "Only one loaded for " << lmload <<
               ". Extra language models are being ignored" );
     } else {
       LINFO ( "Appending Language model..." );
       // Chain the loader in charge of the second language model (index 1).
       this->appendTask ( new LoadLanguageModelTask ( rg_, 1, lmload, lmscale,
                          lmwp, wordmapkey ) );
     }
   }
   LINFO ( "Finished constructor!" );
 }
/**
 * \brief Samples labeled feature vectors from the hypotheses of a set of lattices.
 *
 * A BLEU scorer is built from the reference files named in the registry and
 * the set-level BLEU of the tune set is logged. Then, for every cached lattice
 * of the tune set, the lattice is projected, its n shortest (unique) paths are
 * extracted and turned into hypotheses, ProSBLEUSample is run on them, and one
 * line "<target> <features>" per returned sample is written to the output
 * file given by the output pattern for that lattice index. Empty lattices are
 * logged and skipped.
 *
 * NOTE(review): Arc and HypT are not declared anywhere in this block; they are
 * presumably template parameters declared on a line that is not visible
 * here -- confirm.
 *
 * \param rg  ucam::util::RegistryPO object, containing user parameters.
 * \return 0 once every lattice has been processed.
 */
int SampleWFSAs( ucam::util::RegistryPO const& rg) {
  using ucam::util::oszfstream;
  using ucam::util::PatternAddress;
  // 'input' is not used below; it is kept because its construction reads the
  // input option from the registry.
  PatternAddress<unsigned> input(rg.get<std::string>(HifstConstants::kInput.c_str()));
  PatternAddress<unsigned> output(rg.get<std::string>(HifstConstants::kOutput.c_str()));
  unsigned n = rg.get<unsigned>(HifstConstants::kNbest.c_str());
  unsigned ns = rg.get<unsigned>(HifstConstants::kNSamples.c_str());
  float alpha = rg.get<float>(HifstConstants::kAlpha.c_str());
  bool negatives = rg.exists(HifstConstants::kNegativeExamples.c_str());
  bool binarytarget = rg.exists(HifstConstants::kBinaryTarget.c_str());
  bool negate = !rg.exists(HifstConstants::kDontNegate.c_str());
  std::string extTok = rg.getString(HifstConstants::kExternalTokenizer.c_str());
  std::string wMap   = rg.getString(HifstConstants::kWordMap.c_str());
  bool printOutputLabels = rg.exists(HifstConstants::kPrintOutputLabels.c_str());
  // References: integer references take precedence over word references when
  // both options are present. intRefs starts as false so that it is never
  // read uninitialized when neither option is given.
  std::string refFiles;
  bool intRefs = false;
  if (rg.exists(HifstConstants::kWordRefs)) {
    refFiles = rg.getString(HifstConstants::kWordRefs);
    intRefs = false;
  }
  if (rg.exists(HifstConstants::kIntRefs)) {
    refFiles = rg.getString(HifstConstants::kIntRefs);
    intRefs = true;
  }
  std::cerr << refFiles << "**" <<std::endl;
  ucam::fsttools::BleuScorer bleuScorer(refFiles, extTok, n, intRefs, wMap);
  ucam::fsttools::TuneSet< Arc > tuneSet(rg);
  ucam::fsttools::Bleu ibs = tuneSet.ComputeBleu(bleuScorer);
  FORCELINFO("Set level Bleu: " << ibs);
  // Seed the C random generator: wall-clock time unless the user fixes a seed.
  unsigned seed = time(NULL);
  if (rg.exists(HifstConstants::kRandomSeed.c_str()))
    seed = rg.get<unsigned>(HifstConstants::kRandomSeed.c_str());
  FORCELINFO("random seed: " << seed);
  srand(seed);
  boost::scoped_ptr<oszfstream> out;
  std::string old;
  for (unsigned i = 0; i < tuneSet.cachedLats.size(); ++i) {
    fst::VectorFst<Arc> ifst(*tuneSet.cachedLats[i]);
    fst::VectorFst<Arc> nfst;
    // Expand the output pattern once per lattice; a new file is opened only
    // when the expanded name differs from the previous one.
    const std::string outfile = output(i);
    if (old != outfile) {
      out.reset(new oszfstream (outfile));
      old = outfile;
    }
    if (!ifst.NumStates() ) {
      FORCELINFO("EMPTY: " << i);
      continue;
    }
    // Projecting allows unique to work for all cases.
    fst::Project(&ifst, (printOutputLabels?PROJECT_OUTPUT:PROJECT_INPUT));
    ShortestPath (ifst, &nfst, n, true );
    std::vector<HypT> hyps;
    fst::printStrings<Arc> (nfst, &hyps);
    std::vector< LabeledFeature< float, typename Arc::Weight> > fea =
      ProSBLEUSample<typename Arc::Weight, HypT>(bleuScorer, hyps, i, n, ns, alpha, negatives, negate);
    for (unsigned s = 0; s < fea.size(); ++s) {
      // With binary targets the sample value is reduced to 1 (positive) or 0.
      *out << (binarytarget ? (fea[s].value > 0.0 ? 1 : 0) : fea[s].value);
      *out << " " << fea[s].fea << std::endl;
    }
  }
  FORCELINFO("Done Sample WFSAs");
  // The function is declared to return int: flowing off the end without a
  // return statement would be undefined behaviour.
  return 0;
}