// Parse the "context-string" and "context-window" parameters and set
// look_back / look_ahead accordingly.
// Accepted "context-window" formats:
//   "<n>"             : look_back = look_ahead = n
//   "-<n>"            : look_back only
//   "+<n>"            : look_ahead only
//   "+-<n>" / "-+<n>" : look_back = look_ahead = n
// NOTE(review): a non-empty value with no digits (p == std::string::npos)
// is silently accepted and leaves both limits at 0 — confirm intended.
void
ContextParameters::
init(Parameter& params)
{
  look_back = look_ahead = 0;
  // SetParameter(out, key, default) reads `key` from params into `out`.
  params.SetParameter(context_string, "context-string", std::string(""));
  std::string context_window;
  params.SetParameter(context_window, "context-window", std::string(""));

  if (context_window == "")
    return;

  // p = index of first digit; the prefix before it selects the direction.
  size_t p = context_window.find_first_of("0123456789");
  if (p == 0)
    look_back = look_ahead = atoi(context_window.c_str());
  if (p == 1) {
    // One-character prefix: '-' limits look-back, '+' limits look-ahead.
    if (context_window[0] == '-')
      look_back  = atoi(context_window.substr(1).c_str());
    else if (context_window[0] == '+')
      look_ahead = atoi(context_window.substr(1).c_str());
    else
      UTIL_THROW2("Invalid specification of context window.");
  }
  if (p == 2) {
    // Two-character prefix: "+-" or "-+" sets both directions.
    if (context_window.substr(0,2) == "+-" ||
        context_window.substr(0,2) == "-+")
      look_back = look_ahead = atoi(context_window.substr(p).c_str());
    else
      UTIL_THROW2("Invalid specification of context window.");
  }
}
Example #2
0
// Factory: build the KENLM feature function for the language model in `file`.
// If the file is a recognized KenLM binary, instantiate the matching model
// template; otherwise (e.g. ARPA text) fall back to the probing model.
FeatureFunction *ConstructKenLM(size_t startInd, const std::string &line,
                                const std::string &file, FactorType factorType,
                                util::LoadMethod load_method)
{
  lm::ngram::ModelType model_type;
  if (!lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
    // Not a KenLM binary: the probing model reads text input.
    return new KENLM<lm::ngram::ProbingModel>(startInd, line, file,
                                              factorType, load_method);
  }

  switch (model_type) {
  case lm::ngram::PROBING:
    return new KENLM<lm::ngram::ProbingModel>(startInd, line, file,
                                              factorType, load_method);
  case lm::ngram::REST_PROBING:
    return new KENLM<lm::ngram::RestProbingModel>(startInd, line, file,
                                                  factorType, load_method);
  case lm::ngram::TRIE:
    return new KENLM<lm::ngram::TrieModel>(startInd, line, file,
                                           factorType, load_method);
  case lm::ngram::QUANT_TRIE:
    return new KENLM<lm::ngram::QuantTrieModel>(startInd, line, file,
                                                factorType, load_method);
  case lm::ngram::ARRAY_TRIE:
    return new KENLM<lm::ngram::ArrayTrieModel>(startInd, line, file,
                                                factorType, load_method);
  case lm::ngram::QUANT_ARRAY_TRIE:
    return new KENLM<lm::ngram::QuantArrayTrieModel>(startInd, line, file,
                                                     factorType, load_method);
  default:
    UTIL_THROW2("Unrecognized kenlm model type " << model_type);
  }
}
// Generate the target tree of the derivation d.
// The tree stored in the rule's "Tree" property contains slots for
// non-terminals; these are filled (in target order) with the trees of the
// corresponding subderivations. Throws if the rule has no tree property.
TreePointer KBestExtractor::GetOutputTree(const Derivation &d)
{
  const TargetPhrase &phrase = *(d.edge->shyperedge.label.translation);
  const PhraseProperty *property = phrase.GetProperty("Tree");
  if (!property) {
    UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
  }

  const std::string *treeRepr = property->GetValueString();
  TreePointer result(boost::make_shared<InternalTree>(*treeRepr));

  // Collect subtrees of all non-terminals, in target order.
  std::vector<TreePointer> subtrees;
  for (size_t i = 0; i < phrase.GetSize(); ++i) {
    if (!phrase.GetWord(i).IsNonTerminal()) continue;
    size_t ntIndex = phrase.GetAlignNonTerm().GetNonTermIndexMap()[i];
    const Derivation &sub = *d.subderivations[ntIndex];
    subtrees.push_back(GetOutputTree(sub));
  }

  result->Combine(subtrees);
  return result;
}
// Factory for the coarse LM: pick the CoarseLMModel instantiation matching
// the binary format of `file`. Non-binary (text) input defaults to the
// probing model.
LM* ConstructCoarseLM(const std::string &file)
{
  lm::ngram::ModelType model_type;
  if (!lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
    return new CoarseLMModel<lm::ngram::ProbingModel>(file);
  }

  switch (model_type) {
  case lm::ngram::PROBING:
    return new CoarseLMModel<lm::ngram::ProbingModel>(file);
  case lm::ngram::REST_PROBING:
    return new CoarseLMModel<lm::ngram::RestProbingModel>(file);
  case lm::ngram::TRIE:
    return new CoarseLMModel<lm::ngram::TrieModel>(file);
  case lm::ngram::QUANT_TRIE:
    return new CoarseLMModel<lm::ngram::QuantTrieModel>(file);
  case lm::ngram::ARRAY_TRIE:
    return new CoarseLMModel<lm::ngram::ArrayTrieModel>(file);
  case lm::ngram::QUANT_ARRAY_TRIE:
    return new CoarseLMModel<lm::ngram::QuantArrayTrieModel>(file);
  default:
    UTIL_THROW2("Unrecognized kenlm model type " << model_type);
  }
}
// Configure OpSequenceModel from a key=value pair of the feature line.
// Recognized keys: path, support-features, input-factor, output-factor,
// load. Anything else is delegated to StatefulFeatureFunction.
void OpSequenceModel::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "path") {
    m_lmPath = value;
    return;
  }
  if (key == "support-features") {
    // "no" keeps only the single core feature; otherwise all five are used.
    numFeatures = (value == "no") ? 1 : 5;
    return;
  }
  if (key == "input-factor") {
    sFactor = Scan<int>(value);
    return;
  }
  if (key == "output-factor") {
    tFactor = Scan<int>(value);
    return;
  }
  if (key == "load") {
    // Map the textual load strategy onto util::LoadMethod.
    if (value == "lazy")
      load_method = util::LAZY;
    else if (value == "populate_or_lazy")
      load_method = util::POPULATE_OR_LAZY;
    else if (value == "populate_or_read" || value == "populate")
      load_method = util::POPULATE_OR_READ;
    else if (value == "read")
      load_method = util::READ;
    else if (value == "parallel_read")
      load_method = util::PARALLEL_READ;
    else
      UTIL_THROW2("Unknown KenLM load method " << value);
    return;
  }
  // Unknown key: let the base class interpret it.
  StatefulFeatureFunction::SetParameter(key, value);
}
Example #6
0
// Factory for the KenLM-backed operation-sequence-model LM: choose the
// KenOSM instantiation matching the binary format of `file`; non-binary
// input defaults to the probing model. The load method is passed via the
// KenLM config.
OSMLM* ConstructOSMLM(const char *file, util::LoadMethod load_method)
{
  lm::ngram::ModelType model_type;
  lm::ngram::Config config;
  config.load_method = load_method;

  if (!lm::ngram::RecognizeBinary(file, model_type)) {
    return new KenOSM<lm::ngram::ProbingModel>(file, config);
  }

  switch (model_type) {
  case lm::ngram::PROBING:
    return new KenOSM<lm::ngram::ProbingModel>(file, config);
  case lm::ngram::REST_PROBING:
    return new KenOSM<lm::ngram::RestProbingModel>(file, config);
  case lm::ngram::TRIE:
    return new KenOSM<lm::ngram::TrieModel>(file, config);
  case lm::ngram::QUANT_TRIE:
    return new KenOSM<lm::ngram::QuantTrieModel>(file, config);
  case lm::ngram::ARRAY_TRIE:
    return new KenOSM<lm::ngram::ArrayTrieModel>(file, config);
  case lm::ngram::QUANT_ARRAY_TRIE:
    return new KenOSM<lm::ngram::QuantArrayTrieModel>(file, config);
  default:
    UTIL_THROW2("Unrecognized kenlm model type " << model_type);
  }
}
Example #7
0
void
BaseManager::
OutputSearchGraphAsHypergraph(std::ostream& out) const
{
  // Virtual hook with no base implementation: derived managers that support
  // hypergraph output must override this; the base class always throws.
  UTIL_THROW2("Not implemented.");
}
/**
 * Chart-based decoding entry point for OpSequenceModel.
 *
 * The operation sequence model only supports phrase-based decoding, so this
 * overload unconditionally throws. (UTIL_THROW2 throws, so no return value
 * is needed.)
 */
FFState* OpSequenceModel::EvaluateWhenApplied(
  const ChartHypothesis& /* cur_hypo */,
  int /* featureID - used to index the state in the previous hypotheses */,
  ScoreComponentCollection* accumulator) const
{
  // Fixed error message: it previously read "not support by UTIL_THROW2",
  // naming the throw macro instead of this feature function.
  UTIL_THROW2("Chart decoding not supported by OpSequenceModel");
}
// Load the global lexical model table from m_filePath.
// Expected format: one entry per line, "<output-word> <input-word> <score>".
// Entries are stored as m_hash[outWord][inWord] = score.
void GlobalLexicalModel::Load()
{
  FactorCollection &factorCollection = FactorCollection::Instance();
  const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();

  VERBOSE(2, "Loading global lexical model from file " << m_filePath << endl);

  m_inputFactors = FactorMask(m_inputFactorsVec);
  m_outputFactors = FactorMask(m_outputFactorsVec);
  InputFileStream inFile(m_filePath);

  // reading in data one line at a time
  size_t lineNum = 0;
  string line;
  while(getline(inFile, line)) {
    ++lineNum;
    vector<string> token = Tokenize<string>(line, " ");

    if (token.size() != 3) { // format checking
      UTIL_THROW2("Syntax error at " << m_filePath << ":" << lineNum << ":" << line);
    }

    // create the output word from the factor string in column 0
    Word *outWord = new Word();
    vector<string> factorString = Tokenize( token[0], factorDelimiter );
    for (size_t i=0 ; i < m_outputFactorsVec.size() ; i++) {
      const FactorDirection& direction = Output;
      const FactorType& factorType = m_outputFactorsVec[i];
      const Factor* factor = factorCollection.AddFactor( direction, factorType, factorString[i] );
      outWord->SetFactor( factorType, factor );
    }

    // create the input word from the factor string in column 1
    Word *inWord = new Word();
    factorString = Tokenize( token[1], factorDelimiter );
    for (size_t i=0 ; i < m_inputFactorsVec.size() ; i++) {
      const FactorDirection& direction = Input;
      const FactorType& factorType = m_inputFactorsVec[i];
      const Factor* factor = factorCollection.AddFactor( direction, factorType, factorString[i] );
      inWord->SetFactor( factorType, factor );
    }

    // maximum entropy feature score (column 2)
    float score = Scan<float>(token[2]);

    // std::cerr << "storing word " << *outWord << " " << *inWord << " " << score << endl;

    // store feature in hash
    // NOTE(review): outWord is deleted when an entry for it already exists,
    // but inWord never is — if the hash compares Word values (find() above
    // suggests it does) a duplicate (out,in) pair leaks the new inWord
    // allocation. Confirm whether duplicate lines can occur in practice.
    DoubleHash::iterator keyOutWord = m_hash.find( outWord );
    if( keyOutWord == m_hash.end() ) {
      m_hash[outWord][inWord] = score;
    } else { // already have hash for outword, delete the word to avoid leaks
      (keyOutWord->second)[inWord] = score;
      delete outWord;
    }
  }
}
Example #10
0
// Parse a "KENLM key=value ..." feature-configuration line: consume the
// KenLM-specific arguments (factor, path, load / deprecated lazyken) here,
// pass every other key=value pair through on a rebuilt line, then delegate
// to the main ConstructKenLM overload.
FeatureFunction *ConstructKenLM(size_t startInd, const std::string &lineOrig)
{
  FactorType factorType = 0;
  string filePath;
  util::LoadMethod load_method = util::POPULATE_OR_READ;

  util::TokenIter<util::SingleCharacter, true> argument(lineOrig, ' ');
  ++argument; // skip the leading feature-name token ("KENLM")

  // Rebuild the line containing only the arguments the base class handles.
  util::StringStream line;
  line << "KENLM";

  for (; argument; ++argument) {
    // Every argument must have the form name=value; split at the '='.
    const char *equals = std::find(argument->data(),
                                   argument->data() + argument->size(), '=');
    UTIL_THROW_IF2(equals == argument->data() + argument->size(),
                   "Expected = in KenLM argument " << *argument);
    StringPiece name(argument->data(), equals - argument->data());
    StringPiece value(equals + 1,
                      argument->data() + argument->size() - equals - 1);
    if (name == "factor") {
      factorType = boost::lexical_cast<FactorType>(value);
    } else if (name == "order") {
      // Ignored
    } else if (name == "path") {
      filePath.assign(value.data(), value.size());
    } else if (name == "lazyken") {
      // deprecated: use load instead.
      load_method =
        boost::lexical_cast<bool>(value) ?
        util::LAZY : util::POPULATE_OR_READ;
    } else if (name == "load") {
      // Map the textual load strategy onto util::LoadMethod.
      if (value == "lazy") {
        load_method = util::LAZY;
      } else if (value == "populate_or_lazy") {
        load_method = util::POPULATE_OR_LAZY;
      } else if (value == "populate_or_read" || value == "populate") {
        load_method = util::POPULATE_OR_READ;
      } else if (value == "read") {
        load_method = util::READ;
      } else if (value == "parallel_read") {
        load_method = util::PARALLEL_READ;
      } else {
        UTIL_THROW2("Unknown KenLM load method " << value);
      }
    } else {
      // pass to base class to interpret
      line << " " << name << "=" << value;
    }
  }

  return ConstructKenLM(startInd, line.str(), filePath, factorType, load_method);
}
// Parse the "context-string" and "context-window" parameters and set
// look_back / look_ahead accordingly. Always returns true.
// Accepted "context-window" values:
//   ""                : no context (default)
//   "all"             : unlimited context in both directions
//   "<n>"             : look_back = look_ahead = n
//   "-<n>"            : look_back only
//   "+<n>"            : look_ahead only
//   "+-<n>" / "-+<n>" : look_back = look_ahead = n
bool
ContextParameters::
init(Parameter const& params)
{
  look_back = look_ahead = 0;
  // SetParameter(out, key, default) reads `key` from params into `out`.
  params.SetParameter(context_string, "context-string", std::string(""));
  std::string context_window;
  params.SetParameter(context_window, "context-window", std::string(""));

  if (context_window == "")
    return true;

  // "all": no limit on either side.
  // NOTE(review): substr(0,3) also matches any longer string starting with
  // "all" — confirm that is acceptable.
  if (context_window.substr(0,3) == "all") {
    look_back = look_ahead = std::numeric_limits<size_t>::max();
    return true;
  }

  // p = index of first digit; the prefix before it selects the direction.
  size_t p = context_window.find_first_of("0123456789");
  if (p == 0)
    look_back = look_ahead = atoi(context_window.c_str());

  if (p == 1) {
    // One-character prefix: '-' limits look-back, '+' limits look-ahead.
    if (context_window[0] == '-')
      look_back  = atoi(context_window.substr(1).c_str());
    else if (context_window[0] == '+')
      look_ahead = atoi(context_window.substr(1).c_str());
    else
      UTIL_THROW2("Invalid specification of context window.");
  }

  if (p == 2) {
    // Two-character prefix: "+-" or "-+" sets both directions.
    if (context_window.substr(0,2) == "+-" ||
        context_window.substr(0,2) == "-+")
      look_back = look_ahead = atoi(context_window.substr(p).c_str());
    else
      UTIL_THROW2("Invalid specification of context window.");
  }
  // NOTE(review): a non-empty value with no digit at all (p == npos) is
  // silently accepted — confirm intended.
  return true;
}
void BleuScoreFeature::SetParameter(const std::string& key, const std::string& value)
{
    if (key == "references") {
        vector<string> referenceFiles = Tokenize(value, ",");
        UTIL_THROW_IF2(referenceFiles.size() == 0, "No reference file");
        vector<vector<string> > references(referenceFiles.size());

        for (size_t i =0; i < referenceFiles.size(); ++i) {
            ifstream in(referenceFiles[i].c_str());
            if (!in) {
                UTIL_THROW2("Unable to load references from " << referenceFiles[i]);
            }
            string line;
            while (getline(in,line)) {
                /*  if (GetSearchAlgorithm() == ChartDecoding) {
                stringstream tmp;
                tmp << "<s> " << line << " </s>";
                line = tmp.str();
                }
                */
                references[i].push_back(line);
            }
            if (i > 0) {
                if (references[i].size() != references[i-1].size()) {
                    UTIL_THROW2("Reference files are of different lengths");
                }
            }
            in.close();
        } // for (size_t i =0; i < referenceFiles.size(); ++i) {

        //Set the references in the bleu feature
        LoadReferences(references);

    } else {
        StatefulFeatureFunction::SetParameter(key, value);
    }

}
Example #13
0
// Factory: instantiate the search implementation matching the requested
// algorithm. Throws on an unrecognized algorithm (the trailing return NULL
// only silences compiler warnings).
Search *Search::CreateSearch(Manager& manager, const InputType &source,
                             SearchAlgorithm searchAlgorithm, const TranslationOptionCollection &transOptColl)
{
  if (searchAlgorithm == Normal) {
    return new SearchNormal(manager, source, transOptColl);
  }
  if (searchAlgorithm == CubePruning) {
    return new SearchCubePruning(manager, source, transOptColl);
  }
  if (searchAlgorithm == NormalBatch) {
    return new SearchNormalBatch(manager, source, transOptColl);
  }
  UTIL_THROW2("ERROR: search. Aborting\n");
  return NULL;
}
Example #14
0
// Handle the generic arguments shared by every feature function
// (tuneable, tuneable-components, require-sorting-after-source-context,
// verbosity, filterable). Subclasses call this as their fallback; any
// unrecognized key is an error.
void FeatureFunction::SetParameter(const std::string& key, const std::string& value)
{
  if (key == "tuneable") {
    m_tuneable = Scan<bool>(value);
    return;
  }
  if (key == "tuneable-components") {
    // Per-component tuning only makes sense when the feature is tuneable.
    UTIL_THROW_IF2(!m_tuneable, GetScoreProducerDescription()
                   << ": tuneable-components cannot be set if tuneable=false");
    SetTuneableComponents(value);
    return;
  }
  if (key == "require-sorting-after-source-context") {
    m_requireSortingAfterSourceContext = Scan<bool>(value);
    return;
  }
  if (key == "verbosity") {
    m_verbosity = Scan<size_t>(value);
    return;
  }
  if (key == "filterable") {
    return; // recognized but intentionally ignored
  }
  UTIL_THROW2(GetScoreProducerDescription() << ": Unknown argument " << key << "=" << value);
}
// Generate the target-side yield (surface phrase) of the derivation d.
// Terminals are appended directly; non-terminals are expanded recursively
// via the corresponding subderivation.
Phrase KBestExtractor::GetOutputPhrase(const Derivation &d)
{
  // Placeholder factor used to substitute source tokens into the output
  // (NOT_FOUND when the placeholder feature is disabled).
  FactorType placeholderFactor = StaticData::Instance().options()->input.placeholder_factor;

  Phrase ret(ARRAY_SIZE_INCR);

  const TargetPhrase &phrase = *(d.edge->shyperedge.label.translation);
  const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
    phrase.GetAlignNonTerm().GetNonTermIndexMap();
  for (std::size_t pos = 0; pos < phrase.GetSize(); ++pos) {
    const Word &word = phrase.GetWord(pos);
    if (word.IsNonTerminal()) {
      // Non-terminal: append the yield of the matching subderivation.
      std::size_t nonTermInd = nonTermIndexMap[pos];
      const Derivation &subderivation = *d.subderivations[nonTermInd];
      Phrase subPhrase = GetOutputPhrase(subderivation);
      ret.Append(subPhrase);
    } else {
      ret.AddWord(word);
      if (placeholderFactor == NOT_FOUND) {
        continue;
      }
      // FIXME
      // Placeholder substitution (below, disabled) is not ported to the S2T
      // decoder yet; using placeholders here is an error.
      UTIL_THROW2("placeholders are not currently supported by the S2T decoder");
      /*
            std::set<std::size_t> sourcePosSet =
              phrase.GetAlignTerm().GetAlignmentsForTarget(pos);
            if (sourcePosSet.size() == 1) {
              const std::vector<const Word*> *ruleSourceFromInputPath =
                hypo.GetTranslationOption().GetSourceRuleFromInputPath();
              UTIL_THROW_IF2(ruleSourceFromInputPath == NULL,
                             "Source Words in of the rules hasn't been filled out");
              std::size_t sourcePos = *sourcePosSet.begin();
              const Word *sourceWord = ruleSourceFromInputPath->at(sourcePos);
              UTIL_THROW_IF2(sourceWord == NULL,
                             "Null source word at position " << sourcePos);
              const Factor *factor = sourceWord->GetFactor(placeholderFactor);
              if (factor) {
                ret.Back()[0] = factor;
              }
            }
      */
    }
  }

  return ret;
}
// Construct a multi-model phrase dictionary from a feature-config line.
// Only the "interpolate" combination mode is still handled here; the
// "all"/"all-restrict" modes moved to PhraseDictionaryGroup.
PhraseDictionaryMultiModel::PhraseDictionaryMultiModel(const std::string &line)
  :PhraseDictionary(line, true)
{
  ReadParameters();

  if (m_mode == "interpolate") {
    // Weights may be supplied either one per component table or one per
    // table per score component; only if BOTH counts mismatch is it an error.
    size_t numWeights = m_numScoreComponents;
    UTIL_THROW_IF2(m_pdStr.size() != m_multimodelweights.size() &&
                   m_pdStr.size()*numWeights != m_multimodelweights.size(),
                   "Number of scores and weights are not equal");
  } else if (m_mode == "all" || m_mode == "all-restrict") {
    UTIL_THROW2("Implementation has moved: use PhraseDictionaryGroup with restrict=true/false");
  } else {
    ostringstream msg;
    msg << "combination mode unknown: " << m_mode;
    throw runtime_error(msg.str());
  }
}
// Chart-decoding hook: rebuild the partial target tree for this hypothesis
// from the rule's "Tree" phrase property, splice in the trees of the
// predecessor hypotheses (stored in their TreeState), optionally add NT
// labels and fire sparse syntactic-constraint features, and return the
// combined tree as this hypothesis' state. Throws if the rule carries no
// tree property.
FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
                                   , int featureID /* used to index the state in the previous hypotheses */
                                   , ScoreComponentCollection* accumulator) const
{
  if (const PhraseProperty *property = cur_hypo.GetCurrTargetPhrase().GetProperty("Tree")) {
    const std::string *tree = property->GetValueString();
    TreePointer mytree (new InternalTree(*tree));

    if (m_labelset) {
        AddNTLabels(mytree);
    }

    //get subtrees (in target order)
    std::vector<TreePointer> previous_trees;
    for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
      const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos);
      if (word.IsNonTerminal()) {
        size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
        const ChartHypothesis *prevHypo = cur_hypo.GetPrevHypo(nonTermInd);
        // NOTE(review): the dynamic_cast result is dereferenced without a
        // NULL check — assumes every previous state at featureID is a
        // TreeState; confirm.
        const TreeState* prev = dynamic_cast<const TreeState*>(prevHypo->GetFFState(featureID));
        const TreePointer prev_tree = prev->GetTree();
        previous_trees.push_back(prev_tree);
      }
    }

    // Sparse features from syntactic constraints, scored below.
    std::vector<std::string> sparse_features;
    if (m_constraints) {
      sparse_features = m_constraints->SyntacticRules(mytree, previous_trees);
    }
    mytree->Combine(previous_trees);

    //sparse scores
    for (std::vector<std::string>::const_iterator feature=sparse_features.begin(); feature != sparse_features.end(); ++feature) {
      accumulator->PlusEquals(this, *feature, 1);
    }
    return new TreeState(mytree);
  }
  else {
    UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
  }

}
// Parse the property value: a sequence of "<constituent-list> <count>" token
// pairs, where the constituent list is "<"-separated. Counts are accumulated
// per constituent factor in m_constituentsCollection; duplicates within one
// list are counted only once (via the dedup set).
void TargetConstituentBoundariesRightAdjacentPhraseProperty::ProcessValue(const std::string &value)
{
  FactorCollection &factorCollection = FactorCollection::Instance();
  std::vector<std::string> tokens;
  Tokenize(tokens, value, " ");
  std::vector<std::string>::const_iterator tokenIter = tokens.begin();
  while (tokenIter != tokens.end()) {
    try {

      // Each iteration consumes a constituent-list token and a count token.
      std::vector<std::string> constituents;
      Tokenize(constituents, *tokenIter, "<");
      ++tokenIter;
      float count = std::atof( tokenIter->c_str() );
      ++tokenIter;

      // Count each distinct constituent of this list only once.
      std::set<const Factor* > dedup;

      for ( std::vector<std::string>::iterator constituentIter = constituents.begin();
            constituentIter != constituents.end(); ++constituentIter ) {

        const Factor* constituentFactor = factorCollection.AddFactor(*constituentIter,false);

        std::pair< std::set<const Factor* >::iterator, bool > dedupIns =
          dedup.insert(constituentFactor);
        if ( dedupIns.second ) {

          // Accumulate the count for this constituent across the property.
          std::pair< TargetConstituentBoundariesRightAdjacentCollection::iterator, bool > inserted =
            m_constituentsCollection.insert(std::make_pair(constituentFactor,count));
          if ( !inserted.second ) {
            (inserted.first)->second += count;
          }
        }
      }

    } catch (const std::exception &e) {
      // NOTE(review): the catch block does not advance tokenIter — if
      // anything above threw before the increments, this loop would spin
      // forever. Confirm Tokenize/atof cannot throw here, or advance the
      // iterator before retrying.
      UTIL_THROW2("TargetConstituentBoundariesRightAdjacentPhraseProperty: Read error. Flawed property?  " << value);
    }
  }
};
// Encode the score column of a reordering-table line as packed binary
// floats. The last token holds the whitespace-separated scores; each score
// is transformed (TransformScore then FloorScore), written raw into the
// output buffer, and registered with the per-component score counters.
std::string LexicalReorderingTableCreator::EncodeLine(std::vector<std::string>& tokens)
{
  std::string scoresString = tokens.back();

  std::vector<float> scores;
  Tokenize<float>(scores, scoresString);

  // First line seen: fix the component count and allocate the counters and
  // score trees (one per component, or a single shared one).
  if(!m_numScoreComponent) {
    m_numScoreComponent = scores.size();
    m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
    for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
        it != m_scoreCounters.end(); ++it)
      *it = new ScoreCounter();
    m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
  }

  // Every subsequent line must carry the same number of scores.
  if(m_numScoreComponent != scores.size()) {
    std::stringstream strme;
    strme << "Error: Wrong number of scores detected ("
              << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
    strme << "Line: " << tokens[0] << " ||| ... ||| " << scoresString << std::endl;
    UTIL_THROW2(strme.str());
  }

  std::stringstream scoresStream;
  for(size_t c = 0; c < m_numScoreComponent; ++c) {
    float score = FloorScore(TransformScore(scores[c]));
    scoresStream.write((char*)&score, sizeof(score));
    m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score);
  }

  return scoresStream.str();
}
// Chart-decoding hook: rebuild the partial target tree for this hypothesis
// from the rule's "Tree" phrase property, splice in the predecessor
// hypotheses' trees (from their TreeState), apply syntactic-constraint
// scoring, and — once the tree spans a full sentence — undo binarization if
// configured. Returns the combined tree as the new state; throws if the
// rule carries no tree property.
FFState* TreeStructureFeature::EvaluateWhenApplied(const ChartHypothesis& cur_hypo
    , int featureID /* used to index the state in the previous hypotheses */
    , ScoreComponentCollection* accumulator) const
{
  if (const PhraseProperty *property = cur_hypo.GetCurrTargetPhrase().GetProperty("Tree")) {
    const std::string *tree = property->GetValueString();
    TreePointer mytree (boost::make_shared<InternalTree>(*tree));

    //get subtrees (in target order)
    std::vector<TreePointer> previous_trees;
    for (size_t pos = 0; pos < cur_hypo.GetCurrTargetPhrase().GetSize(); ++pos) {
      const Word &word = cur_hypo.GetCurrTargetPhrase().GetWord(pos);
      if (word.IsNonTerminal()) {
        size_t nonTermInd = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap()[pos];
        const ChartHypothesis *prevHypo = cur_hypo.GetPrevHypo(nonTermInd);
        // NOTE(review): the dynamic_cast result is dereferenced without a
        // NULL check — assumes every previous state at featureID is a
        // TreeState; confirm.
        const TreeState* prev = dynamic_cast<const TreeState*>(prevHypo->GetFFState(featureID));
        const TreePointer prev_tree = prev->GetTree();
        previous_trees.push_back(prev_tree);
      }
    }

    if (m_constraints) {
      m_constraints->SyntacticRules(mytree, previous_trees, this, accumulator);
    }
    mytree->Combine(previous_trees);

    // Full sentence: the last child is the sentence-end label, possibly
    // wrapped in the sentence-end nonterminal. Only then is it safe to
    // unbinarize the tree.
    bool full_sentence = (mytree->GetChildren().back()->GetLabel() == m_send || (mytree->GetChildren().back()->GetLabel() == m_send_nt && mytree->GetChildren().back()->GetChildren().back()->GetLabel() == m_send));
    if (m_binarized && full_sentence) {
      mytree->Unbinarize();
    }

    return new TreeState(mytree);
  } else {
    UTIL_THROW2("Error: TreeStructureFeature active, but no internal tree structure found");
  }

}
// Load a tree-to-tree rule table in Moses text format
//   source ||| target ||| scores ||| alignment ||| counts [||| sparse ||| properties]
// into the HyperTree trie. Also collects the set of source terminal ids into
// sourceTermSet. Returns true on success; throws on malformed input.
bool HyperTreeLoader::Load(AllOptions const& opts,
                           const std::vector<FactorType> &input,
                           const std::vector<FactorType> &output,
                           const std::string &inFile,
                           const RuleTableFF &ff,
                           HyperTree &trie,
                           boost::unordered_set<std::size_t> &sourceTermSet)
{
  PrintUserTime(std::string("Start loading HyperTree"));

  sourceTermSet.clear();

  std::size_t count = 0;

  // Progress output only at verbosity >= 1.
  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  std::vector<float> scoreVector;
  StringPiece line;

  // Locale-independent float parser; "inf"/"nan" recognized, no flags.
  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  HyperPathLoader hyperPathLoader;

  // Dummy source phrase used when evaluating target phrases in isolation.
  Phrase dummySourcePhrase;
  {
    Word *lhs = NULL;
    dummySourcePhrase.CreateFromString(Input, input, "hello", &lhs);
    delete lhs;
  }

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    // Split the line on "|||": source, target, scores, then optional fields.
    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourceString(*pipes);
    StringPiece targetString(*++pipes);
    StringPiece scoreString(*++pipes);

    StringPiece alignString;
    if (++pipes) {
      StringPiece temp(*pipes);
      alignString = temp;
    }

    ++pipes;  // counts

    // Parse and transform the score column.
    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      int processed;
      float score = converter.StringToFloat(s->data(), s->length(), &processed);
      UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
      scoreVector.push_back(FloorScore(TransformScore(score)));
    }
    const std::size_t numScoreComponents = ff.GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                  << numScoreComponents << ") of score components on line " << count);
    }

    // Source-side
    HyperPath sourceFragment;
    hyperPathLoader.Load(sourceString, sourceFragment);
    ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet);

    // Target-side
    TargetPhrase *targetPhrase = new TargetPhrase(&ff);
    Word *targetLHS = NULL;
    targetPhrase->CreateFromString(Output, output, targetString, &targetLHS);
    targetPhrase->SetTargetLHS(targetLHS);
    targetPhrase->SetAlignmentInfo(alignString);

    // Optional sparse scores field.
    if (++pipes) {
      StringPiece sparseString(*pipes);
      targetPhrase->SetSparseScore(&ff, sparseString);
    }

    // Optional properties field.
    if (++pipes) {
      StringPiece propertiesString(*pipes);
      targetPhrase->SetProperties(propertiesString);
    }

    targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
    targetPhrase->EvaluateInIsolation(dummySourcePhrase,
                                      ff.GetFeaturesToApply());

    // Add rule to trie.
    TargetPhraseCollection::shared_ptr phraseColl
    = GetOrCreateTargetPhraseCollection(trie, sourceFragment);
    phraseColl->Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  if (ff.GetTableLimit()) {
    SortAndPrune(trie, ff.GetTableLimit());
  }

  return true;
}
Example #22
0
// SCFG (chart) hypothesis scoring is not implemented for this LM wrapper;
// reaching this overload always throws.
void LanguageModel::EvaluateWhenApplied(const SCFG::Manager &mgr,
                                        const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
                                        FFState &state) const
{
  UTIL_THROW2("Not implemented");
}
// Build a sentence-specific phrase table: write the input sentence into a
// fresh temporary directory, run the fuzzy-match extractor over it, then
// parse the resulting rule file into this sentence's
// PhraseDictionaryNodeMemory (keyed by translation id).
void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)
{
#if defined __MINGW32__
  char dirName[] = "moses.XXXXXX";
#else
  char dirName[] = "/tmp/moses.XXXXXX";
#endif // defined
  // mkdtemp replaces the XXXXXX suffix and creates the directory.
  char *temp = mkdtemp(dirName);
  UTIL_THROW_IF2(temp == NULL,
          "Couldn't create temporary directory " << dirName);

  string dirNameStr(dirName);

  string inFileName(dirNameStr + "/in");

  ofstream inFile(inFileName.c_str());

  // Write the sentence body (positions 1 .. size-2, skipping the boundary
  // positions) for the extractor.
  for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) {
    inFile << inputSentence.GetWord(i);
  }
  inFile << endl;
  inFile.close();

  long translationId = inputSentence.GetTranslationId();
  string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);

  // populate with rules for this sentence
  PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];
  FormatType format = MosesFormat;

  // data from file
  InputFileStream inStream(ptFileName);

  // copied from class LoaderStandard
  PrintUserTime("Start loading fuzzy-match phrase model");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();


  string lineOrig;
  size_t count = 0;

  while(getline(inStream, lineOrig)) {
    const string *line;
    if (format == HieroFormat) { // reformat line
      // format is hard-coded to MosesFormat above, so this branch is dead.
      UTIL_THROW(util::Exception, "Cannot be Hiero format");
      //line = ReformatHieroRule(lineOrig);
    } else {
      // do nothing to format of line
      line = &lineOrig;
    }

    vector<string> tokens;
    vector<float> scoreVector;

    // Fields: source ||| target ||| scores ||| alignment [||| extra]
    TokenizeMultiCharSeparator(tokens, *line , "|||" );

    if (tokens.size() != 4 && tokens.size() != 5) {
      UTIL_THROW2("Syntax error at " << ptFileName << ":" << count);
    }

    const string &sourcePhraseString = tokens[0]
                                       , &targetPhraseString = tokens[1]
                                           , &scoreString        = tokens[2]
                                               , &alignString        = tokens[3];

    // Skip rules with an empty source side unless word deletion is enabled.
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    Tokenize<float>(scoreVector, scoreString);
    const size_t numScoreComponents = GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count);
    }

    // NOTE(review): this re-checks the condition already thrown on above;
    // it can never fire and looks like leftover duplication.
    UTIL_THROW_IF2(scoreVector.size() != numScoreComponents,
            "Number of scores incorrectly specified");

    // parse source & find pt node

    // constituent labels
    Word *sourceLHS;
    Word *targetLHS;

    // source
    Phrase sourcePhrase( 0);
    // sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, factorDelimiter, &sourceLHS);
    sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase(this);
    // targetPhrase->CreateFromString(Output, m_output, targetPhraseString, factorDelimiter, &targetLHS);
    targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // component score, for n-best output
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
    targetPhrase->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;

    if (format == HieroFormat) { // reformat line
      delete line;
    } else {
      // do nothing
    }

  }

  // sort and prune each target phrase collection
  SortAndPrune(rootNode);

  // NOTE(review): the temporary directory is never removed (cleanup call is
  // commented out below) — confirm whether the leak of /tmp dirs is intended.
  //removedirectoryrecursively(dirName);
}
Example #24
0
// Entry point for the `consolidate` tool: merges a direct and an indirect
// phrase/rule table into one consolidated table, applying the smoothing and
// feature options selected on the command line.  Positional arguments:
//   argv[1] = direct table, argv[2] = indirect table, argv[3] = output table.
// All remaining arguments are optional flags that set file-scope globals
// (hierarchicalFlag, goodTuringFlag, ...) consumed by processFiles().
int main(int argc, char* argv[])
{
  std::cerr << "Consolidate v2.0 written by Philipp Koehn" << std::endl
            << "consolidating direct and indirect rule tables" << std::endl;

  if (argc < 4) {
    std::cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] [--PartsOfSpeech parts-of-speech-file] [--MinScore id:threshold[,id:threshold]*]" << std::endl;
    exit(1);
  }
  const std::string fileNameDirect = argv[1];
  const std::string fileNameIndirect = argv[2];
  const std::string fileNameConsolidated = argv[3];
  std::string fileNameCountOfCounts;
  std::string fileNameSourceLabelSet;
  std::string fileNamePartsOfSpeechVocabulary;

  for(int i=4; i<argc; i++) {
    if (strcmp(argv[i],"--Hierarchical") == 0) {
      hierarchicalFlag = true;
      std::cerr << "processing hierarchical rules" << std::endl;
    } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
      onlyDirectFlag = true;
      std::cerr << "only including direct translation scores p(e|f)" << std::endl;
    } else if (strcmp(argv[i],"--PhraseCount") == 0) {
      phraseCountFlag = true;
      std::cerr << "including the phrase count feature" << std::endl;
    } else if (strcmp(argv[i],"--GoodTuring") == 0) {
      goodTuringFlag = true;
      UTIL_THROW_IF2(i+1==argc, "specify count of count files for Good Turing discounting!");
      fileNameCountOfCounts = argv[++i];
      std::cerr << "adjusting phrase translation probabilities with Good Turing discounting" << std::endl;
    } else if (strcmp(argv[i],"--KneserNey") == 0) {
      kneserNeyFlag = true;
      UTIL_THROW_IF2(i+1==argc, "specify count of count files for Kneser Ney discounting!");
      fileNameCountOfCounts = argv[++i];
      std::cerr << "adjusting phrase translation probabilities with Kneser Ney discounting" << std::endl;
    } else if (strcmp(argv[i],"--LowCountFeature") == 0) {
      lowCountFlag = true;
      std::cerr << "including the low count feature" << std::endl;
    } else if (strcmp(argv[i],"--CountBinFeature") == 0 ||
               strcmp(argv[i],"--SparseCountBinFeature") == 0) {
      if (strcmp(argv[i],"--SparseCountBinFeature") == 0)
        sparseCountBinFeatureFlag = true;
      std::cerr << "include "<< (sparseCountBinFeatureFlag ? "sparse " : "") << "count bin feature:";
      // consume the run of numeric bin boundaries that follows the flag
      int prev = 0;
      while(i+1<argc && argv[i+1][0]>='0' && argv[i+1][0]<='9') {
        int binCount = Moses::Scan<int>(argv[++i]);
        countBin.push_back( binCount );
        if (prev+1 == binCount) {
          std::cerr << " " << binCount;
        } else {
          std::cerr << " " << (prev+1) << "-" << binCount;
        }
        prev = binCount;
      }
      std::cerr << " " << (prev+1) << "+" << std::endl;
    } else if (strcmp(argv[i],"--LogProb") == 0) {
      logProbFlag = true;
      std::cerr << "using log-probabilities" << std::endl;
    } else if (strcmp(argv[i],"--Counts") == 0) {
      countsProperty = true;
      std::cerr << "output counts as a property" << std::endl;
    } else if (strcmp(argv[i],"--SourceLabels") == 0) {
      sourceLabelsFlag = true;
      UTIL_THROW_IF2(i+1==argc, "specify source label set file!");
      fileNameSourceLabelSet = argv[++i];
      std::cerr << "processing source labels property" << std::endl;
    } else if (strcmp(argv[i],"--PartsOfSpeech") == 0) {
      partsOfSpeechFlag = true;
      UTIL_THROW_IF2(i+1==argc, "specify parts-of-speech file!");
      fileNamePartsOfSpeechVocabulary = argv[++i];
      std::cerr << "processing parts-of-speech property" << std::endl;
    } else if (strcmp(argv[i],"--MinScore") == 0) {
      // guard against a missing value, matching the other value-taking
      // options above; without it argv[++i] would read past argv.
      UTIL_THROW_IF2(i+1==argc, "specify minimum score settings!");
      // value is a comma-separated list of id:threshold pairs
      std::string setting = argv[++i];
      bool done = false;
      while (!done) {
        std::string single_setting;
        size_t pos;
        if ((pos = setting.find(",")) != std::string::npos) {
          single_setting = setting.substr(0, pos);
          setting.erase(0, pos + 1);
        } else {
          single_setting = setting;
          done = true;
        }
        pos = single_setting.find(":");
        UTIL_THROW_IF2(pos == std::string::npos, "faulty MinScore setting '" << single_setting << "' in '" << argv[i] << "'");
        unsigned int field = Moses::Scan<unsigned int>( single_setting.substr(0,pos) );
        float threshold = Moses::Scan<float>( single_setting.substr(pos+1) );
        if (field == 0) {
          minScore0 = threshold;
          std::cerr << "setting minScore0 to " << threshold << std::endl;
        } else if (field == 2) {
          minScore2 = threshold;
          std::cerr << "setting minScore2 to " << threshold << std::endl;
        } else {
          UTIL_THROW2("MinScore currently only supported for indirect (0) and direct (2) phrase translation probabilities");
        }
      }
    } else {
      UTIL_THROW2("unknown option " << argv[i]);
    }
  }

  processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary );
  return 0;
}
// Equality comparison required by the FFState interface (used for
// hypothesis recombination).  Not implemented: any call aborts via
// UTIL_THROW2, so states of this feature cannot currently be compared.
bool CoveredReferenceState::operator==(const FFState& other) const
{
  UTIL_THROW2("TODO:Haven't figure this out yet");
}
// Hash function required by the FFState interface (used for hypothesis
// recombination).  Not implemented: any call aborts via UTIL_THROW2,
// consistent with the unimplemented operator== above.
size_t CoveredReferenceState::hash() const
{
  UTIL_THROW2("TODO:Haven't figure this out yet");
}
bool RuleTrieLoader::Load(const std::vector<FactorType> &input,
                          const std::vector<FactorType> &output,
                          const std::string &inFile,
                          const RuleTableFF &ff,
                          RuleTrie &trie)
{
    PrintUserTime(std::string("Start loading text phrase table. Moses format"));

    const StaticData &staticData = StaticData::Instance();
    // const std::string &factorDelimiter = staticData.GetFactorDelimiter();

    std::size_t count = 0;

    std::ostream *progress = NULL;
    IFVERBOSE(1) progress = &std::cerr;
    util::FilePiece in(inFile.c_str(), progress);

    // reused variables
    std::vector<float> scoreVector;
    StringPiece line;

    double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

    while(true) {
        try {
            line = in.ReadLine();
        } catch (const util::EndOfFileException &e) {
            break;
        }

        util::TokenIter<util::MultiCharacter> pipes(line, "|||");
        StringPiece sourcePhraseString(*pipes);
        StringPiece targetPhraseString(*++pipes);
        StringPiece scoreString(*++pipes);

        StringPiece alignString;
        if (++pipes) {
            StringPiece temp(*pipes);
            alignString = temp;
        }

        if (++pipes) {
            StringPiece str(*pipes); //counts
        }

        bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos);
        if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
            TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
            continue;
        }

        scoreVector.clear();
        for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
            int processed;
            float score = converter.StringToFloat(s->data(), s->length(), &processed);
            UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
            scoreVector.push_back(FloorScore(TransformScore(score)));
        }
        const std::size_t numScoreComponents = ff.GetNumScoreComponents();
        if (scoreVector.size() != numScoreComponents) {
            UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                        << numScoreComponents << ") of score components on line " << count);
        }

        // parse source & find pt node

        // constituent labels
        Word *sourceLHS = NULL;
        Word *targetLHS;

        // create target phrase obj
        TargetPhrase *targetPhrase = new TargetPhrase(&ff);
        // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);
        targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
        // source
        Phrase sourcePhrase;
        // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);
        sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);

        // rest of target phrase
        targetPhrase->SetAlignmentInfo(alignString);
        targetPhrase->SetTargetLHS(targetLHS);

        //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

        if (++pipes) {
            StringPiece sparseString(*pipes);
            targetPhrase->SetSparseScore(&ff, sparseString);
        }

        if (++pipes) {
            StringPiece propertiesString(*pipes);
            targetPhrase->SetProperties(propertiesString);
        }

        targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
        targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply());

        TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(
                trie, *sourceLHS, sourcePhrase);
        phraseColl.Add(targetPhrase);

        // not implemented correctly in memory pt. just delete it for now
        delete sourceLHS;

        count++;
    }

    // sort and prune each target phrase collection
    if (ff.GetTableLimit()) {
        SortAndPrune(trie, ff.GetTableLimit());
    }

    return true;
}