Esempio n. 1
0
/**
 * Open all on-disk phrase-table component files under \p filePath for
 * reading, then load the Misc.dat key/value table and cache the factor and
 * score counts from it.
 *
 * Source.dat, TargetInd.dat and TargetColl.dat are opened in binary mode;
 * Vocab.dat and Misc.dat are opened in text mode.
 *
 * @param filePath directory containing the *.dat files
 * @return always true; failure is reported by exception
 * @throws util::FileOpenException as soon as one of the files fails to open
 */
bool OnDiskWrapper::OpenForLoad(const std::string &filePath)
{
  const std::string sourcePath = filePath + "/Source.dat";
  m_fileSource.open(sourcePath.c_str(), ios::in | ios::binary);
  UTIL_THROW_IF(!m_fileSource.is_open(),
                util::FileOpenException,
                "Couldn't open file " << sourcePath);

  const std::string targetIndPath = filePath + "/TargetInd.dat";
  m_fileTargetInd.open(targetIndPath.c_str(), ios::in | ios::binary);
  UTIL_THROW_IF(!m_fileTargetInd.is_open(),
                util::FileOpenException,
                "Couldn't open file " << targetIndPath);

  const std::string targetCollPath = filePath + "/TargetColl.dat";
  m_fileTargetColl.open(targetCollPath.c_str(), ios::in | ios::binary);
  UTIL_THROW_IF(!m_fileTargetColl.is_open(),
                util::FileOpenException,
                "Couldn't open file " << targetCollPath);

  const std::string vocabPath = filePath + "/Vocab.dat";
  m_fileVocab.open(vocabPath.c_str(), ios::in);
  UTIL_THROW_IF(!m_fileVocab.is_open(),
                util::FileOpenException,
                "Couldn't open file " << vocabPath);

  const std::string miscPath = filePath + "/Misc.dat";
  m_fileMisc.open(miscPath.c_str(), ios::in);
  UTIL_THROW_IF(!m_fileMisc.is_open(),
                util::FileOpenException,
                "Couldn't open file " << miscPath);

  // Misc.dat must be loaded before the GetMisc() lookups below.
  LoadMisc();
  m_numSourceFactors = GetMisc("NumSourceFactors");
  m_numTargetFactors = GetMisc("NumTargetFactors");
  m_numScores = GetMisc("NumScores");

  return true;
}
void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString)
{
	AlignmentInfo::CollType alignTerm, alignNonTerm;
  for (util::TokenIter<util::AnyCharacter, true> token(alignString, util::AnyCharacter(" \t")); token; ++token) {
    util::TokenIter<util::SingleCharacter, false> dash(*token, util::SingleCharacter('-'));

    char *endptr;
    size_t sourcePos = strtoul(dash->data(), &endptr, 10);
    UTIL_THROW_IF(endptr != dash->data() + dash->size(), util::ErrnoException, "Error parsing alignment" << *dash);
    ++dash;
    size_t targetPos = strtoul(dash->data(), &endptr, 10);
    UTIL_THROW_IF(endptr != dash->data() + dash->size(), util::ErrnoException, "Error parsing alignment" << *dash);
    UTIL_THROW_IF(++dash, util::Exception, "Extra gunk in alignment " << *token);


    if (GetWord(targetPos).IsNonTerminal()) {
    	alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
    }
  	else {
  		alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
  	}
  }
  SetAlignTerm(alignTerm);
  SetAlignNonTerm(alignNonTerm);

}
Esempio n. 3
0
/**
 * BLEU for one sentence, computed over the element-wise sum of the
 * sentence statistics and a set of background statistics.
 *
 * Both vectors must hold kBleuNgramOrder * 2 + 1 entries. For each order j
 * the pair (2j, 2j+1) contributes log(stats[2j]) - log(stats[2j+1]); the
 * final entry is used as the reference length and entry 1 as the length it
 * is compared against for the brevity term.
 *
 * @return exp(log-BLEU) multiplied by the summed reference length
 * @throws util::Exception if the vectors differ in size or have the wrong size
 */
float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vector<float>& bg)
{
  UTIL_THROW_IF(sent.size()!=bg.size(), util::Exception, "Error");
  UTIL_THROW_IF(sent.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");

  // Combine sentence and background statistics.
  std::vector<float> stats(sent.size());
  for (size_t idx = 0; idx < stats.size(); ++idx) {
    stats[idx] = sent[idx]+bg[idx];
  }

  // Mean log ratio over all n-gram orders.
  float logbleu = 0.0;
  int order = 0;
  while (order < kBleuNgramOrder) {
    logbleu += log(stats[2 * order]) - log(stats[2 * order + 1]);
    ++order;
  }
  logbleu /= kBleuNgramOrder;

  // Brevity term: only applied when it is negative.
  const float refLength = stats[(kBleuNgramOrder * 2)];
  const float brevity = 1.0 - refLength / stats[1];
  if (brevity < 0.0) {
    logbleu += brevity;
  }

  // Exponentiate and scale by reference length (as per Chiang et al 08)
  return exp(logbleu) * refLength;
}
Esempio n. 4
0
void ScoreFeatureManager::configure(const std::vector<std::string> args)
{
  bool domainAdded = false;
  bool sparseDomainAdded = false;

  for (size_t i = 0; i < args.size(); ++i) {
  	if (args[i] == "--IgnoreSentenceId") {
      m_includeSentenceId = true;
    } else if (args[i].substr(0,8) == "--Domain") {
      string type = args[i].substr(8);
      ++i;
      UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
      string domainFile = args[i];
      UTIL_THROW_IF(domainAdded, ScoreFeatureArgumentException,
                    "Only allowed one domain feature");
      if (type == "Subset") {
        m_features.push_back(ScoreFeaturePtr(new SubsetDomainFeature(domainFile)));
      } else if (type == "Ratio") {
        m_features.push_back(ScoreFeaturePtr(new RatioDomainFeature(domainFile)));
      } else if (type == "Indicator") {
        m_features.push_back(ScoreFeaturePtr(new IndicatorDomainFeature(domainFile)));
      } else {
        UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
      }
      domainAdded = true;
      m_includeSentenceId = true;
    } else if (args[i].substr(0,14) == "--SparseDomain") {
      string type = args[i].substr(14);
      ++i;
      UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
      string domainFile = args[i];
      UTIL_THROW_IF(sparseDomainAdded, ScoreFeatureArgumentException,
                    "Only allowed one sparse domain feature");
      if (type == "Subset") {
        m_features.push_back(ScoreFeaturePtr(new SparseSubsetDomainFeature(domainFile)));
      } else if (type == "Ratio") {
        m_features.push_back(ScoreFeaturePtr(new SparseRatioDomainFeature(domainFile)));
      } else if (type == "Indicator") {
        m_features.push_back(ScoreFeaturePtr(new SparseIndicatorDomainFeature(domainFile)));
      } else {
        UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
      }
      sparseDomainAdded = true;
      m_includeSentenceId = true;
    } else if(args[i] == "--GHKMFeatureSparse"){
    	//MARIA
    	m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureSparse()));
    } else if(args[i] == "--GHKMFeatureDense"){
    	//MARIA
    	m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureDense()));
    } else {
      UTIL_THROW(ScoreFeatureArgumentException,"Unknown score argument " << args[i]);
    	}

  }

}
void WordTranslationFeature::Load()
{
  // load word list for restricted feature set
  if (m_filePathSource.empty()) {
    return;
  } //else if (tokens.size() == 8) {

  cerr << "loading word translation word lists from " << m_filePathSource << " and " << m_filePathTarget << endl;
  if (m_domainTrigger) {
    // domain trigger terms for each input document
    ifstream inFileSource(m_filePathSource.c_str());
    UTIL_THROW_IF(!inFileSource, util::Exception, "could not open file " << m_filePathSource);

    std::string line;
    while (getline(inFileSource, line)) {
      m_vocabDomain.resize(m_vocabDomain.size() + 1);
      vector<string> termVector;
      boost::split(termVector, line, boost::is_any_of("\t "));
      for (size_t i=0; i < termVector.size(); ++i)
        m_vocabDomain.back().insert(termVector[i]);
    }

    inFileSource.close();
  } else {
    // restricted source word vocabulary
    ifstream inFileSource(m_filePathSource.c_str());
    UTIL_THROW_IF(!inFileSource, util::Exception, "could not open file " << m_filePathSource);

    std::string line;
    while (getline(inFileSource, line)) {
      m_vocabSource.insert(line);
    }

    inFileSource.close();

    // restricted target word vocabulary
    ifstream inFileTarget(m_filePathTarget.c_str());
    UTIL_THROW_IF(!inFileTarget, util::Exception, "could not open file " << m_filePathTarget);

    while (getline(inFileTarget, line)) {
      m_vocabTarget.insert(line);
    }

    inFileTarget.close();

    m_unrestricted = false;
  }
}
Esempio n. 6
0
/**
 * Pre-calculate the n-gram probabilities for the words in the specified phrase.
 *
 * Note that when this method is called, we do not have access to the context
 * in which this phrase will eventually be applied.
 *
 * In other words, we know what words are in this phrase,
 * but we do not know what words will come before or after this phrase.
 *
 * The parameters fullScore, ngramScore, and oovCount are all output parameters.
 * All three are reset to zero first, and stay zero for an empty phrase.
 *
 * The value stored in oovCount is the number of words in the phrase
 * whose translated language model index is 0.
 *
 * The sum of the ngram scores for all words in this phrase are stored in fullScore.
 *
 * The value stored in ngramScore is similar, but only full-order ngram scores are included.
 *
 * This is best shown by example:
 *
 * Assume a trigram backward language model and a phrase "a b c d e f g"
 *
 * fullScore would represent the sum of the logprob scores for the following values:
 *
 * p(g)
 * p(f | g)
 * p(e | g f)
 * p(d | f e)
 * p(c | e d)
 * p(b | d c)
 * p(a | c b)
 *
 * ngramScore would represent the sum of the logprob scores for the following values:
 *
 * p(g)
 * p(f | g)
 * p(e | g f)
 * p(d | f e)
 * p(c | e d)
 * p(b | d c)
 * p(a | c b)
 *
 * NOTE(review): the ngramScore list above is identical to the fullScore list,
 * which contradicts "only full-order ngram scores are included". The code
 * computes ngramScore as TransformLMScore(fullScore - before_boundary), where
 * before_boundary is the scorer's Finish() value captured at the boundary
 * position; confirm which terms that actually leaves and correct the list.
 *
 * Throws util::Exception if the first word of the phrase is the
 * begin-of-sentence factor or if any word is a non-terminal.
 */
template <class Model> void BackwardLanguageModel<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  // Empty phrase: leave all outputs at zero.
  if (!phrase.GetSize()) return;

  // The chart state the scorer writes into is not used after this call.
  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  UTIL_THROW_IF(
    (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)),
    util::Exception,
    "BackwardLanguageModel does not currently support rules that include <s>"
  );

  // Scorer value captured when the loop reaches `boundary`.
  float before_boundary = 0.0f;

  int lastWord = phrase.GetSize() - 1;
  int ngramBoundary = m_ngram->Order() - 1;
  // Phrases with fewer words than the model order use position 0 as the
  // boundary, so the snapshot is taken on the final loop iteration.
  int boundary = ( lastWord < ngramBoundary ) ? 0 : ngramBoundary;

  // Feed the words to the scorer from the last word of the phrase to the first.
  int position;
  for (position = lastWord; position >= 0; position-=1) {
    const Word &word = phrase.GetWord(position);
    UTIL_THROW_IF(
      (word.IsNonTerminal()),
      util::Exception,
      "BackwardLanguageModel does not currently support rules that include non-terminals "
    );

    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    // An index of 0 is counted as out-of-vocabulary.
    if (!index) ++oovCount;

    // NOTE(review): Finish() is called here and again after the loop on the
    // same scorer; this relies on Finish() being safe to call mid-way -- confirm.
    if (position==boundary) {
      before_boundary = scorer.Finish();
    }

  }

  fullScore = scorer.Finish();

  // ngramScore must be derived from the untransformed fullScore, so it is
  // computed before fullScore itself is transformed.
  ngramScore = TransformLMScore(fullScore - before_boundary);
  fullScore = TransformLMScore(fullScore);

}
Esempio n. 7
0
/**
 * std::fwrite() wrapper that throws instead of returning a short count.
 *
 * @param ptr    data to write
 * @param size   size in bytes of one item; must be non-zero (asserted)
 * @param count  number of items to write
 * @param stream destination stream
 * @return the number of items written, which always equals \p count
 * @throws util::ErrnoException if fewer than \p count items were written;
 *         the message reports the item size, the requested item count and
 *         the number of items actually written
 */
size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream)
{
  assert(size);
  size_t returnValue = std::fwrite(ptr, size, count, stream);
  // The item size alone does not describe the request when count != 1, so
  // report the counts as well.
  UTIL_THROW_IF(count != returnValue, util::ErrnoException,
                "Short fwrite; requested size " << size << " x " << count
                << " items, wrote " << returnValue << " items");
  return returnValue;
}
Esempio n. 8
0
/**
 * Rebuild a Moses::Word from this on-disk word.
 *
 * \p overwrite is replaced by a fresh Moses::Word carrying this word's
 * non-terminal flag. The vocabulary string for m_vocabId is split on '|'
 * and the i-th piece is registered with the factor collection and stored
 * under the i-th entry of \p outputFactorsVec.
 *
 * @throws util::Exception if the string has fewer or more '|'-separated
 *         pieces than \p outputFactorsVec has entries
 */
void Word::ConvertToMoses(
    const std::vector<Moses::FactorType> &outputFactorsVec, 
    const Vocab &vocab,
    Moses::Word &overwrite) const {
  Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
  overwrite = Moses::Word(m_isNonTerminal);

  // TODO: this conversion should have been done at load time.
  util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');

  typedef std::vector<Moses::FactorType>::const_iterator FactorTypeIter;
  FactorTypeIter wanted = outputFactorsVec.begin();
  const FactorTypeIter wantedEnd = outputFactorsVec.end();
  while (wanted != wantedEnd) {
    UTIL_THROW_IF(!tok, util::Exception, "Too few factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
    overwrite.SetFactor(*wanted, factorColl.AddFactor(*tok));
    ++wanted;
    ++tok;
  }

  // Anything left in the tokenizer is a surplus factor.
  UTIL_THROW_IF(tok, util::Exception, "Too many factors in \"" << vocab.GetString(m_vocabId) << "\"; was expecting " << outputFactorsVec.size());
}
Esempio n. 9
0
/**
 * Load the DALM language model described by the ini file at m_filePath.
 *
 * Reads the three paths from the ini file, creates the logger, vocabulary
 * and model objects (stored in m_logger, m_vocab and m_lm), looks up the
 * ids of the sentence-boundary symbols and finally builds the vocabulary
 * mapping from the text-format vocabulary file.
 *
 * @throws util::FileOpenException if any of the three paths comes back empty
 */
void LanguageModelDALM::Load()
{
  // --- Read the ini file -------------------------------------------------
  string model;     // path to the double-array file
  string words;     // path to the vocabulary file
  string wordstxt;  // path to the vocabulary file in text format
  read_ini(m_filePath.c_str(), model, words, wordstxt);

  UTIL_THROW_IF(model.empty() || words.empty() || wordstxt.empty(),
                util::FileOpenException,
                "Failed to read DALM ini file " << m_filePath << ". Probably doesn't exist");

  // --- Load the language model -------------------------------------------
  // The logger writes to stderr at INFO level.
  m_logger = new DALM::Logger(stderr);
  m_logger->setLevel(DALM::LOGGER_INFO);

  // The vocabulary must exist before the model, which is constructed from it.
  m_vocab = new DALM::Vocabulary(words, *m_logger);
  m_lm = new DALM::LM(model, *m_vocab, *m_logger);

  // Ids of the begin- and end-of-sentence symbols.
  wid_start = m_vocab->lookup(BOS_);
  wid_end = m_vocab->lookup(EOS_);

  // --- Vocabulary mapping ------------------------------------------------
  CreateVocabMapping(wordstxt);
}
Esempio n. 10
0
void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig,
                        vector<Data>& shards)
{
  UTIL_THROW_IF(shard_count == 0, util::Exception, "Must have at least 1 shard");
  UTIL_THROW_IF(shard_size < 0 || shard_size > 1,
		  util::Exception,
		  "Shard size must be between 0 and 1, inclusive. Currently " << shard_size);

  size_t data_size = m_score_data->size();
  UTIL_THROW_IF(data_size != m_feature_data->size(),
  	  	  util::Exception,
  	  	  "Error");

  shard_size *= data_size;
  const float coeff = static_cast<float>(data_size) / shard_count;

  for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) {
    vector<size_t> shard_contents;
    if (shard_size == 0) {
      //split into roughly equal size shards
      const size_t shard_start = floor(0.5 + shard_id * coeff);
      const size_t shard_end = floor(0.5 + (shard_id + 1) * coeff);
      for (size_t i = shard_start; i < shard_end; ++i) {
        shard_contents.push_back(i);
      }
    } else {
      //create shards by randomly sampling
      for (size_t i = 0; i < floor(shard_size+0.5); ++i) {
        shard_contents.push_back(rand() % data_size);
      }
    }

    Scorer* scorer = ScorerFactory::getScorer(m_score_type, scorerconfig);

    shards.push_back(Data(scorer));
    shards.back().m_score_type = m_score_type;
    shards.back().m_num_scores = m_num_scores;
    for (size_t i = 0; i < shard_contents.size(); ++i) {
      shards.back().m_feature_data->add(m_feature_data->get(shard_contents[i]));
      shards.back().m_score_data->add(m_score_data->get(shard_contents[i]));
    }
    //cerr << endl;
  }
}
Esempio n. 11
0
/**
 * Fill this word's factors from a delimited string.
 *
 * With a non-empty factor delimiter configured, \p str is split on it and
 * up to MAX_NUM_FACTORS pieces are kept; with an empty delimiter the whole
 * string is the single factor. The k-th piece is stored under factor type
 * factorOrder[k].
 *
 * @param direction     not used by this function
 * @param factorOrder   factor types to assign, in string order
 * @param str           the delimited word string
 * @param isNonTerminal recorded on the word and passed to AddFactor
 * @param strict        when true, leftover pieces after splitting are an error
 * @throws StrayFactorException if pieces remain after MAX_NUM_FACTORS were
 *         read, or (strict only) if any pieces remain at all
 * @throws util::Exception if a terminal has fewer pieces than factorOrder
 *         has entries, or a factor type is >= MAX_NUM_FACTORS
 */
void
Word::
CreateFromString(FactorDirection direction
                 , const std::vector<FactorType> &factorOrder
                 , const StringPiece &str
                 , bool isNonTerminal
                 , bool strict)
{
  FactorCollection &factorCollection = FactorCollection::Instance();
  vector<StringPiece> bits(MAX_NUM_FACTORS);
  string factorDelimiter = StaticData::Instance().GetFactorDelimiter();

  if (factorDelimiter.empty()) {
    // No delimiter configured: the whole string is one factor.
    bits[0] = str;
  } else {
    util::TokenIter<util::MultiCharacter> fit(str, factorDelimiter);
    size_t i = 0;
    while (i < MAX_NUM_FACTORS && fit) {
      bits[i] = *fit;
      ++i;
      ++fit;
    }
    if (i == MAX_NUM_FACTORS) {
      UTIL_THROW_IF(fit, StrayFactorException,
                    "The hard limit for factors is " << MAX_NUM_FACTORS
                    << ". The word " << str << " contains factor delimiter "
                    << StaticData::Instance().GetFactorDelimiter()
                    << " too many times.");
    }
    if (strict) {
      UTIL_THROW_IF(fit, StrayFactorException,
                    "You have configured " << factorOrder.size()
                    << " factors but the word " << str
                    << " contains factor delimiter "
                    << StaticData::Instance().GetFactorDelimiter()
                    << " too many times.");
    }
    UTIL_THROW_IF(!isNonTerminal && i < factorOrder.size(),util::Exception,
                  "Too few factors in string '" << str << "'.");
  }

  for (size_t k = 0; k < factorOrder.size(); ++k) {
    UTIL_THROW_IF(factorOrder[k] >= MAX_NUM_FACTORS, util::Exception,
                  "Factor order out of bounds.");
    m_factorArray[factorOrder[k]] = factorCollection.AddFactor(bits[k], isNonTerminal);
  }

  // The terminal/non-terminal flag is shared by all factors of the word.
  m_isNonTerminal = isNonTerminal;
}
/**
 * Read a vocabulary file, one entry per line, into \p vocab.
 *
 * Each line is registered with the factor collection and the resulting
 * factor is inserted into \p vocab. An empty \p filename is a no-op.
 *
 * @throws util::Exception if the file cannot be opened
 */
void SparseHieroReorderingFeature::LoadVocabulary(const std::string& filename, Vocab& vocab)
{
  if (filename.empty()) {
    return;
  }

  ifstream in(filename.c_str());
  UTIL_THROW_IF(!in, util::Exception, "Unable to open vocab file: " << filename);

  for (string line; getline(in, line); ) {
    vocab.insert(FactorCollection::Instance().AddFactor(line));
  }
  in.close();
}
Esempio n. 13
0
/**
 * Look up a value in the misc key/value table.
 *
 * @param key name of the entry
 * @return the value stored under \p key in m_miscInfo
 * @throws util::Exception if \p key is not present
 */
UINT64 OnDiskWrapper::GetMisc(const std::string &key) const
{
  const std::map<std::string, UINT64>::const_iterator iter(m_miscInfo.find(key));
  UTIL_THROW_IF(iter == m_miscInfo.end(), util::Exception, "Couldn't find value for key " << key);
  return iter->second;
}
Esempio n. 14
0
/**
 * Build the path obtained by applying \p detour to its base path.
 *
 * The score breakdown starts as a copy of the base path's breakdown and is
 * adjusted by (replacement hypothesis breakdown - substituted hypothesis
 * breakdown); the total score is the weighted score of the result.
 *
 * m_deviationPoint is handed to the ChartTrellisNode constructor before it
 * is checked here -- presumably that constructor fills it in; confirm
 * against ChartTrellisNode.
 *
 * @throws util::Exception if m_deviationPoint is NULL after construction
 *         of the final node
 */
ChartTrellisPath::ChartTrellisPath(const ChartTrellisDetour &detour)
  : m_finalNode(new ChartTrellisNode(detour, m_deviationPoint))
  , m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown)
  , m_totalScore(0)
{
  UTIL_THROW_IF(m_deviationPoint == NULL, util::Exception, "No deviation point");

  // Score delta introduced by swapping in the replacement hypothesis.
  ScoreComponentCollection scoreChange(detour.GetReplacementHypo().GetScoreBreakdown());
  scoreChange.MinusEquals(detour.GetSubstitutedNode().GetHypothesis().GetScoreBreakdown());

  m_scoreBreakdown.PlusEquals(scoreChange);
  m_totalScore = m_scoreBreakdown.GetWeightedScore();
}
/**
 * Configure sparse reordering features from key/value options.
 *
 * Each key is split on '-'; the first field selects the option:
 *   words-(source|target)-<id>     value is a word list file
 *   clusters-(source|target)-<id>  value is a cluster map file
 *   phrase / stack / between       switch on the corresponding feature
 *
 * The three feature switches are explicitly set to false before the options
 * are read, so an option that is absent from \p config leaves its feature
 * off rather than leaving the flag unset.
 *
 * @param config   option name -> value
 * @param producer stored in m_producer
 * @throws util::Exception for a key that cannot be parsed, a words/clusters
 *         key without exactly three fields, or a side that is neither
 *         "source" nor "target"
 */
SparseReordering::SparseReordering(const map<string,string>& config, const LexicalReordering* producer)
  : m_producer(producer)
{
  static const string kSource= "source";
  static const string kTarget = "target";

  // These flags are only ever assigned true below; give them a defined
  // default first.
  m_usePhrase = false;
  m_useStack = false;
  m_useBetween = false;

  for (map<string,string>::const_iterator i = config.begin(); i != config.end(); ++i) {
    vector<string> fields = Tokenize(i->first, "-");
    // Guard the fields[0] accesses below against a key with no fields.
    UTIL_THROW_IF(fields.empty(), util::Exception, "Unable to parse sparse reordering option: " << i->first);

    if (fields[0] == "words") {
      UTIL_THROW_IF(!(fields.size() == 3), util::Exception, "Sparse reordering word list name should be sparse-words-(source|target)-<id>");
      if (fields[1] == kSource) {
        ReadWordList(i->second,fields[2], SparseReorderingFeatureKey::Source, &m_sourceWordLists);
      } else if (fields[1] == kTarget) {
        ReadWordList(i->second,fields[2],SparseReorderingFeatureKey::Target, &m_targetWordLists);
      } else {
        UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
      }
    } else if (fields[0] == "clusters") {
      UTIL_THROW_IF(!(fields.size() == 3), util::Exception, "Sparse reordering cluster name should be sparse-clusters-(source|target)-<id>");
      if (fields[1] == kSource) {
        ReadClusterMap(i->second,fields[2], SparseReorderingFeatureKey::Source, &m_sourceClusterMaps);
      } else if (fields[1] == kTarget) {
        ReadClusterMap(i->second,fields[2],SparseReorderingFeatureKey::Target, &m_targetClusterMaps);
      } else {
        UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
      }
    } else if (fields[0] == "phrase") {
      m_usePhrase = true;
    } else if (fields[0] == "stack") {
      m_useStack = true;
    } else if (fields[0] == "between") {
      m_useBetween = true;
    } else {
      UTIL_THROW(util::Exception, "Unable to parse sparse reordering option: " << i->first);
    }
  }
}
void SparseReordering::ReadWordList(const string& filename, const string& id, SparseReorderingFeatureKey::Side side, vector<WordList>* pWordLists) {
  ifstream fh(filename.c_str());
  UTIL_THROW_IF(!fh, util::Exception, "Unable to open: " << filename);
  string line;
  pWordLists->push_back(WordList());
  pWordLists->back().first = id;
  while (getline(fh,line)) {
    //TODO: StringPiece
    const Factor* factor = FactorCollection::Instance().AddFactor(line);
    pWordLists->back().second.insert(factor);
    PreCalculateFeatureNames(pWordLists->size()-1, id, side, factor, false); 

  }
}
Esempio n. 17
0
/**
 * Convert a Moses word into a newly allocated on-disk Word.
 *
 * The strings of the factors listed in \p factorsVec are joined with '|'
 * and looked up in the vocabulary. The first factor is mandatory; the join
 * stops at the first later factor that is missing from \p origWord.
 *
 * The Word is allocated only once the vocabulary lookup has succeeded, so
 * nothing is leaked when this function throws or returns NULL.
 *
 * @param factorsVec factor types to read from \p origWord; must not be empty
 * @param origWord   the Moses word to convert
 * @return a heap-allocated Word owned by the caller, or NULL if the joined
 *         string is not in the vocabulary
 * @throws util::Exception if \p factorsVec is empty or the first factor is
 *         missing from \p origWord
 */
Word *OnDiskWrapper::ConvertFromMoses(const std::vector<Moses::FactorType> &factorsVec
                                      , const Moses::Word &origWord) const
{
  UTIL_THROW_IF(factorsVec.empty(), util::Exception, "No factors specified");

  bool isNonTerminal = origWord.IsNonTerminal();
  stringstream strme;

  size_t factorType = factorsVec[0];
  const Moses::Factor *factor = origWord.GetFactor(factorType);
  UTIL_THROW_IF(factor == NULL, util::Exception, "Expecting factor " << factorType);
  strme << factor->GetString();

  for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) {
    const Moses::Factor *extraFactor = origWord.GetFactor(factorsVec[ind]);
    if (extraFactor == NULL) {
      // The word can have fewer factors than factorsVec lists.
      break;
    }
    strme << "|" << extraFactor->GetString();
  }

  bool found;
  UINT64 vocabId = m_vocab.GetVocabId(strme.str(), found);
  if (!found) {
    // Factor string not in the phrase table, so the phrase cannot be either.
    return NULL;
  }

  Word *newWord = new Word(isNonTerminal);
  newWord->SetVocabId(vocabId);
  return newWord;
}
Esempio n. 18
0
/**
 * Fill this word's factors from a delimited string.
 *
 * \p str is split on the configured factor delimiter; the n-th piece is
 * registered with the factor collection and stored under factor type
 * factorOrder[n]. Splitting stops once factorOrder is exhausted.
 *
 * @param direction     not used by this function
 * @param factorOrder   factor types to assign, in string order
 * @param str           the delimited word string
 * @param isNonTerminal recorded on the word
 * @throws StrayFactorException if the string has more pieces than
 *         factorOrder has entries
 */
void Word::CreateFromString(FactorDirection direction
                            , const std::vector<FactorType> &factorOrder
                            , const StringPiece &str
                            , bool isNonTerminal)
{
  FactorCollection &factorCollection = FactorCollection::Instance();
  util::TokenIter<util::MultiCharacter> fit(str, StaticData::Instance().GetFactorDelimiter());

  size_t slot = 0;
  while (slot < factorOrder.size() && fit) {
    m_factorArray[factorOrder[slot]] = factorCollection.AddFactor(*fit);
    ++slot;
    ++fit;
  }
  UTIL_THROW_IF(fit, StrayFactorException, "You have configured " << factorOrder.size() << " factors but the word " << str << " contains factor delimiter " << StaticData::Instance().GetFactorDelimiter() << " too many times.");

  // The terminal/non-terminal flag is shared by all factors of the word.
  m_isNonTerminal = isNonTerminal;
}
Esempio n. 19
0
/**
 * BLEU computed from a statistics vector without any smoothing.
 *
 * \p stats must hold kBleuNgramOrder * 2 + 1 entries. For each order j the
 * pair (2j, 2j+1) contributes log(stats[2j]) - log(stats[2j+1]); the final
 * entry divided by entry 1 feeds the brevity term, which is added to the
 * log score only when negative.
 *
 * @return exp of the resulting log score
 * @throws util::Exception if \p stats has the wrong size
 */
float unsmoothedBleu(const std::vector<float>& stats)
{
  UTIL_THROW_IF(stats.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");

  // Mean log ratio over all n-gram orders.
  float logbleu = 0.0;
  int order = 0;
  while (order < kBleuNgramOrder) {
    logbleu += log(stats[2 * order]) - log(stats[2 * order + 1]);
    ++order;
  }
  logbleu /= kBleuNgramOrder;

  const float refLength = stats[(kBleuNgramOrder * 2)];
  const float brevity = 1.0 - refLength / stats[1];
  if (brevity < 0.0) {
    logbleu += brevity;
  }

  return exp(logbleu);
}
Esempio n. 20
0
/**
 * Finish writing the on-disk phrase table.
 *
 * Checks that the root source node reports itself as saved, writes the
 * vocabulary and the misc table, then closes all six data files.
 *
 * @throws util::Exception if the root source node has not been saved
 */
void OnDiskWrapper::EndSave()
{
  bool ret = m_rootSourceNode->Saved();
  UTIL_THROW_IF(!ret, util::Exception, "Root node not saved");

  // Write the vocabulary through this wrapper.
  GetVocab().Save(*this);

  // Write the misc key/value table.
  SaveMisc();

  // All writes are done; close every component file.
  m_fileMisc.close();
  m_fileVocab.close();
  m_fileSource.close();
  m_fileTarget.close();
  m_fileTargetInd.close();
  m_fileTargetColl.close();
}
Esempio n. 21
0
/**
 * Score the leading words of \p phrase against the state carried in \p ps.
 *
 * Starting at position min(last word, order - 2) and moving towards
 * position 0, each word is fed to the scorer as a terminal; the previous
 * chart state is then fed as a non-terminal and the scorer's result is
 * written to \p returnedScore.
 *
 * NOTE: an earlier, now disabled, check in this function described it as
 * "not yet fully implemented".
 *
 * @param phrase        phrase whose first words are scored
 * @param ps            previous state; must refer to a BackwardLMState
 * @param returnedScore output: the scorer's final value
 * @return a newly allocated BackwardLMState owned by the caller
 * @throws util::Exception if one of the scored words is a non-terminal
 */
template <class Model> FFState *BackwardLanguageModel<Model>::Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const
{
  returnedScore = 0.0f;

  const lm::ngram::ChartState &previous = static_cast<const BackwardLMState&>(*ps).state;

  // The scorer writes the resulting chart state straight into the new state.
  std::auto_ptr<BackwardLMState> ret(new BackwardLMState());
  lm::ngram::RuleScore<Model> scorer(*m_ngram, ret->state);

  int ngramBoundary = m_ngram->Order() - 1;
  int lastWord = phrase.GetSize() - 1;

  // Get scores for words at the end of the previous phrase
  // that are now adjacent to words at the beginning of this phrase
  int position = std::min( lastWord,  ngramBoundary - 1);
  while (position >= 0) {
    const Word &word = phrase.GetWord(position);
    UTIL_THROW_IF(
      (word.IsNonTerminal()),
      util::Exception,
      "BackwardLanguageModel does not currently support rules that include non-terminals "
    );

    lm::WordIndex lmIndex = TranslateID(word);
    scorer.Terminal(lmIndex);
    --position;
  }

  scorer.NonTerminal(previous);
  returnedScore = scorer.Finish();

  return ret.release();
}
Esempio n. 22
0
/**
 * Add an edge for an out-of-vocabulary target phrase.
 *
 * The phrase may hold at most one word. The edge score is the phrase's
 * future score plus the weighted language model probability of that word
 * plus the OOV count times the OOV weight; the edge's note points back at
 * \p phrase.
 *
 * @throws util::Exception if \p phrase has more than one word
 */
template <class Model> void Fill<Model>::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &, const WordsRange &)
{
  UTIL_THROW_IF(phrase.GetSize() > 1, util::Exception,
                "OOV target phrase should be 0 or 1 word in length");

  // Language model ids of the phrase: empty or a single entry.
  std::vector<lm::WordIndex> words;
  if (phrase.GetSize()) {
    words.push_back(Convert(phrase.GetWord(0)));
  }

  search::PartialEdge edge(edges_.AllocateEdge(0));

  // Appears to be a bug that FutureScore does not already include language model.
  search::ScoreRuleRet scored(search::ScoreRule(context_.LanguageModel(), words, edge.Between()));
  edge.SetScore(phrase.GetFutureScore() + scored.prob * context_.LMWeight() + static_cast<search::Score>(scored.oov) * oov_weight_);

  // Let the edge refer back to the phrase it was built from.
  search::Note note;
  note.vp = &phrase;
  edge.SetNote(note);

  edges_.AddEdge(edge);
}
Esempio n. 23
0
/**
 * Sentence-level BLEU with additive smoothing.
 *
 * \p stats must hold kBleuNgramOrder * 2 + 1 entries. \p smoothing is added
 * to both entries of every (2j, 2j+1) pair before taking logs, and also to
 * the reference length (the final entry) when \p smoothBP is true.
 *
 * @return exp of the resulting log score
 * @throws util::Exception if \p stats has the wrong size
 */
float smoothedSentenceBleu
(const std::vector<float>& stats, float smoothing, bool smoothBP)
{
  UTIL_THROW_IF(stats.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");

  // Mean smoothed log ratio over all n-gram orders.
  float logbleu = 0.0;
  int order = 0;
  while (order < kBleuNgramOrder) {
    logbleu += log(stats[2 * order] + smoothing) - log(stats[2 * order + 1] + smoothing);
    ++order;
  }
  logbleu /= kBleuNgramOrder;

  // Optionally smooth the reference length used in the brevity term.
  const float lengthSmoothing = smoothBP ? smoothing : 0.0f;
  const float reflength = stats[(kBleuNgramOrder * 2)] + lengthSmoothing;
  const float brevity = 1.0 - reflength / stats[1];
  if (brevity < 0.0) {
    logbleu += brevity;
  }

  return exp(logbleu);
}
Esempio n. 24
0
/**
 * Append one dense value per non-empty subset of the configured domains.
 *
 * A bitmap is built with bit b set when the b-th configured domain occurs
 * as a key in \p domainCount. For every subset id from 1 up to (but not
 * including) 2^(number of domains), maybeLog(2.718) is appended when the id
 * equals the bitmap and maybeLog(1) otherwise.
 *
 * \p count and \p sparseValues are not used.
 *
 * @throws ScoreFeatureArgumentException if more than 6 domains are configured
 */
void SubsetDomainFeature::add(const map<string,float>& domainCount,
                              float count,
                              const MaybeLog& maybeLog,
                              std::vector<float>& denseValues,
                              std::map<std::string,float>& sparseValues)  const
{
  UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException,
                "too many domains for core domain subset features");

  // Which of the configured domains occur for this entry?
  size_t bitmap = 0;
  for (size_t bit = 0; bit < m_domain.list.size(); ++bit) {
    if (domainCount.count( m_domain.list[ bit ] ) != 0) {
      bitmap += 1 << bit;
    }
  }

  // One indicator per non-empty subset; only the matching one fires.
  const size_t subsetCount = 1 << m_domain.list.size();
  for (size_t subset = 1; subset < subsetCount; ++subset) {
    const double indicator = (bitmap == subset) ? 2.718 : 1;
    denseValues.push_back(maybeLog( indicator ));
  }
}
Esempio n. 25
0
/**
 * Corpus BLEU from integer statistics.
 *
 * \p comps must hold kBleuNgramOrder * 2 + 1 entries. The score is 0 as
 * soon as any even-indexed entry of a (2i, 2i+1) pair is 0. Otherwise each
 * pair contributes log(comps[2i]) - log(comps[2i+1]), and the final entry
 * divided by entry 1 feeds the brevity term, which is added to the log
 * score only when negative.
 *
 * @return exp of the resulting log score, or 0.0 as described above
 * @throws util::Exception if \p comps has the wrong size
 */
statscore_t BleuScorer::calculateScore(const vector<int>& comps) const
{
  UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");

  float logbleu = 0.0;
  int order = 0;
  while (order < kBleuNgramOrder) {
    // A zero in the even slot would give log(0); the score is defined as 0.
    if (comps[2*order] == 0) {
      return 0.0;
    }
    logbleu += log(comps[2*order]) - log(comps[2*order+1]);
    ++order;
  }
  logbleu /= kBleuNgramOrder;

  // reflength divided by test length
  const float brevity = 1.0 - static_cast<float>(comps[kBleuNgramOrder * 2]) / comps[1];
  if (brevity < 0.0) {
    logbleu += brevity;
  }

  return exp(logbleu);
}
Esempio n. 26
0
/**
 * Insert the words of \p newSent into the suffix array at corpus position
 * \p newIndex, updating the F, L, SA and ISA arrays, and then run the
 * reordering stage.
 *
 * The words are inserted one per loop iteration, from the last word of
 * \p newSent to the first.
 *
 * Throws util::Exception if \p newIndex is greater than the current size of
 * the suffix array.
 */
void DynSuffixArray::Insert(vuint_t* newSent, unsigned newIndex)
{
  // for sentences
  //stages 1, 2, 4 stay same from 1char case
  //(use last word of new text in step 2 and save Ltmp until last insert?)
  //stage 3...all words of new sentence are inserted backwards
  // stage 2: k=ISA[newIndex], tmp= L[k], L[k]  = newChar
  //PrintAuxArrays();
  UTIL_THROW_IF(newIndex > m_SA->size(), util::Exception, "Error");
  int k(-1), kprime(-1);
  // Appending at the very end (newIndex == size) uses ISA[0] instead.
  k = (newIndex < m_SA->size() ? m_ISA->at(newIndex) : m_ISA->at(0)); // k is now index of the cycle that starts at newindex
  int true_pos = LastFirstFunc(k); // track cycle shift (newIndex - 1)
  // Remember the word being overwritten in L; it is re-inserted on the final
  // loop iteration (j == 0).
  int Ltmp = m_L->at(k);
  m_L->at(k) = newSent->at(newSent->size()-1);  // cycle k now ends with correct word
  // Stage 3: insert the new words, last word first.
  for(int j = newSent->size()-1; j > -1; --j) {
    kprime = LastFirstFunc(k);  // find cycle that starts with (newindex - 1)
    //kprime += ((m_L[k] == Ltmp) && (k > isa[k]) ? 1 : 0); // yada yada
    // only terminal char can be 0 so add new vocab at end
    kprime = (kprime > 0 ? kprime : m_SA->size());
    true_pos += (kprime <= true_pos ? 1 : 0); // track changes
    // insert everything
    m_F->insert(m_F->begin() + kprime, newSent->at(j));
    // L gets the preceding word of the sentence, or the saved word for j == 0.
    int theLWord = (j == 0 ? Ltmp : newSent->at(j-1));

    m_L->insert(m_L->begin() + kprime, theLWord);
    // Shift every stored corpus position at or after newIndex up by one.
    for (vuint_t::iterator itr = m_SA->begin(); itr != m_SA->end(); ++itr) {
      if(*itr >= newIndex) ++(*itr);
    }
    m_SA->insert(m_SA->begin() + kprime, newIndex);
    // Shift every stored row index at or after kprime up by one.
    for (vuint_t::iterator itr = m_ISA->begin(); itr != m_ISA->end(); ++itr) {
      if((int)*itr >= kprime) ++(*itr);
    }

    m_ISA->insert(m_ISA->begin() + newIndex, kprime);
    k = kprime;
    //PrintAuxArrays();
  }
  // Begin stage 4
  Reorder(true_pos, LastFirstFunc(kprime)); // actual position vs computed position of cycle (newIndex-1)
}
Esempio n. 27
0
/**
 * Reordering stage of a suffix array update: repeatedly move the row at
 * actual position \p j to its computed position \p jprime until the two
 * agree.
 *
 * Each step moves the entries of L and SA from j to jprime, decrements the
 * ISA values in (j, jprime], records jprime in ISA for the moved row, and
 * continues with the LastFirstFunc images of j and jprime.
 *
 * If the same (j, jprime) pair comes up a second time the loop would never
 * terminate. In that case the suffix array is checked for being ordered by
 * corpus word: the process exits with status 1 if it is not, and the
 * function returns otherwise.
 *
 * @throws util::Exception if a step would have to move a row backwards
 *         (j > jprime)
 */
void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
{
  set<pair<unsigned, unsigned> > seen;
  while(j != jprime) {
    // this 'seenit' check added for data with many loops. will remove after double
    // checking.
    // set::insert() reports true in .second when the pair was newly inserted,
    // i.e. NOT seen before, so the result has to be negated. Without the
    // negation the very first iteration took the early return below and no
    // reordering was ever performed.
    const bool seenit = !seen.insert(std::make_pair(j, jprime)).second;
    if(seenit) {
      for(size_t i=1; i < m_SA->size(); ++i) {
        if(m_corpus->at(m_SA->at(i)) < m_corpus->at(m_SA->at(i-1))) {
          cerr << "PROBLEM WITH SUFFIX ARRAY REORDERING. EXITING...\n";
          exit(1);
        }
      }
      return;
    }
    //cerr << "j=" << j << "\tj'=" << jprime << endl;
    int isaIdx(-1);
    // LastFirstFunc(j) must be evaluated before L is modified below.
    int new_j = LastFirstFunc(j);
    UTIL_THROW_IF(j > jprime, util::Exception, "Error");
    // for SA and L, the element at pos j is moved to pos j'
    m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
    m_L->erase(m_L->begin() + j);
    m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
    m_SA->erase(m_SA->begin() + j);
    // all ISA values between (j...j'] decremented
    for(size_t i = 0; i < m_ISA->size(); ++i) {
      if((m_ISA->at(i) == j) && (isaIdx == -1))
        isaIdx = i; // store index of ISA[i] = j
      if((m_ISA->at(i) > j) && (m_ISA->at(i) <= jprime)) --(*m_ISA)[i];
    }
    // replace j with j' in ISA
    m_ISA->at(isaIdx) = jprime;
    j = new_j;
    jprime = LastFirstFunc(jprime);
  }
  //cerr << "j=" << j << "\tj'=" << jprime << endl;
}
Esempio n. 28
0
/**
 * Parse one feature function configuration line.
 *
 * The first token is the feature's name stub; every following token must be
 * of the form key=value (split at the first '='). "num-features" sets the
 * number of score components, "name" sets the description, and every other
 * pair is appended to m_args. When no name was given, the description
 * becomes the name stub followed by the number of earlier features
 * registered under the same stub.
 *
 * @throws util::Exception if a key occurs twice on the line
 */
void FeatureFunction::ParseLine(const std::string &line)
{
  vector<string> toks = Tokenize(line);
  CHECK(toks.size());

  string nameStub = toks[0];

  // Keys already encountered on this line, for duplicate detection.
  set<string> keys;

  for (size_t i = 1; i < toks.size(); ++i) {
    vector<string> args = TokenizeFirstOnly(toks[i], "=");
    CHECK(args.size() == 2);

    pair<set<string>::iterator,bool> ret = keys.insert(args[0]);
    UTIL_THROW_IF(!ret.second, util::Exception, "Duplicate key in line " << line);

    const string &key = args[0];
    const string &value = args[1];
    if (key == "num-features") {
      m_numScoreComponents = Scan<size_t>(value);
    } else if (key == "name") {
      m_description = value;
    } else {
      m_args.push_back(args);
    }
  }

  // No explicit name: derive one from the stub and its running count.
  if (m_description.empty()) {
    size_t index = description_counts.count(nameStub);

    ostringstream dstream;
    dstream << nameStub << index;

    description_counts.insert(nameStub);
    m_description = dstream.str();
  }
}
Esempio n. 29
0
/**
 * Load a text SCFG phrase table into ruleTable.
 *
 * Every line is split on "|||" and must have the form
 *   source ||| target ||| scores ||| alignment [||| rule counts]
 * (Hiero-format lines are first rewritten by ReformatHieroRule). For each line
 * a TargetPhrase is created, scored and added to the target phrase collection
 * of its source phrase; after the whole file has been read, SortAndPrune() is
 * run on the table.
 *
 * Error handling:
 *  - fewer than four or more than five fields: reported through UserMessage,
 *    then abort();
 *  - number of scores different from the feature's score-component count:
 *    reported through UserMessage, then abort();
 *  - a score token that is not entirely a number: util::Exception.
 * Line numbers in diagnostics are 1-based and count every line read.
 *
 * @param format          MosesFormat or HieroFormat
 * @param input           factor types used when parsing the source side
 * @param output          factor types used when parsing the target side
 * @param inFile          path of the rule table file
 * @param weight          passed on to TargetPhrase::SetScoreChart
 * @param languageModels  passed on to TargetPhrase::SetScoreChart
 * @param wpProducer      passed on to TargetPhrase::SetScoreChart
 * @param ruleTable       trie that receives the loaded rules
 * @return always true
 */
bool RuleTableLoaderStandard::Load(FormatType format
                                , const std::vector<FactorType> &input
                                , const std::vector<FactorType> &output
                                , const std::string &inFile
                                , const std::vector<float> &weight
                                , size_t /* tableLimit */
                                , const LMList &languageModels
                                , const WordPenaltyProducer* wpProducer
                                , RuleTableTrie &ruleTable)
{
  PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  // 1-based number of the line currently being processed (diagnostics only)
  size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  vector<float> scoreVector;
  StringPiece line;
  std::string hiero_before, hiero_after;

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) { break; }

    // Count every line that was read, including the ones skipped further
    // down, so the numbers printed in diagnostics match the file.
    ++count;

    if (format == HieroFormat) { // inefficiently reformat line
      hiero_before.assign(line.data(), line.size());
      ReformatHieroRule(hiero_before, hiero_after);
      line = hiero_after;
    }

    // Collect the four mandatory fields while checking the iterator, so an
    // exhausted token iterator is never dereferenced on a truncated line.
    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece fields[4];
    size_t numFields = 0;
    for (; numFields < 4 && pipes; ++pipes) {
      fields[numFields++] = *pipes;
    }

    // pipes now rests on the optional fifth field (rule counts), if present
    // TODO(bhaddow) efficiently handle default instead of parsing this string every time.  
    StringPiece ruleCountString = pipes ? *pipes : StringPiece("1 1");

    // anything after the rule counts is a syntax error
    bool tooManyFields = false;
    if (pipes) {
      if (++pipes) {
        tooManyFields = true;
      }
    }

    if (numFields < 4 || tooManyFields) {
      stringstream strme;
      strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

    StringPiece sourcePhraseString(fields[0]);
    StringPiece targetPhraseString(fields[1]);
    StringPiece scoreString(fields[2]);
    StringPiece alignString(fields[3]);

    // NOTE(review): the test below looks at the SOURCE side while the message
    // talks about an empty target -- confirm which of the two is intended.
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      char *err_ind;
      const double score = strtod(s->data(), &err_ind);
      // the whole token has to be consumed; this rejects both non-numbers and
      // numbers followed by trailing junk
      UTIL_THROW_IF(err_ind != s->data() + s->size(), util::Exception, "Bad score " << *s << " on line " << count);
      scoreVector.push_back(score);
    }
    const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels
    Word sourceLHS, targetLHS;

    // source
    Phrase sourcePhrase( 0);
    sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase(Output);
    targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS);
    targetPhrase->SetSourcePhrase(sourcePhrase);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString, sourcePhrase);
    targetPhrase->SetTargetLHS(targetLHS);
    
    // the rule count is set from the raw (untransformed) first score
    targetPhrase->SetRuleCount(ruleCountString, scoreVector[0]);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);
    
    // component score, for n-best output
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->SetScoreChart(ruleTable.GetFeature(), scoreVector, weight, languageModels,wpProducer);

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);
  }

  // sort and prune each target phrase collection
  SortAndPrune(ruleTable);

  return true;
}
Esempio n. 30
0
/**
 * Load a text SCFG phrase table into ruleTable.
 *
 * Every line is split on "|||" into
 *   source ||| target ||| scores [||| alignment [||| counts
 *          [||| sparse scores [||| properties]]]]
 * (Hiero-format lines are first rewritten by ReformatHieroRule). The first
 * three fields are mandatory; the counts field is read but not used here, and
 * fields beyond the seventh are ignored. For each line a TargetPhrase is
 * created, given its scores, evaluated against the table's features and added
 * to the target phrase collection of its source phrase; after the whole file
 * has been read, SortAndPrune() is run on the table.
 *
 * Error handling:
 *  - fewer than three fields: util::Exception;
 *  - a score the converter cannot parse (NaN result): util::Exception;
 *  - number of scores different from the table's score-component count:
 *    reported through UserMessage, then abort().
 * Line numbers in diagnostics are 1-based and count every line read.
 *
 * @param format     MosesFormat or HieroFormat
 * @param input      factor types used when parsing the source side
 * @param output     factor types used when parsing the target side
 * @param inFile     path of the rule table file
 * @param ruleTable  trie that receives the loaded rules
 * @return always true
 */
bool RuleTableLoaderStandard::Load(FormatType format
                                   , const std::vector<FactorType> &input
                                   , const std::vector<FactorType> &output
                                   , const std::string &inFile
                                   , size_t /* tableLimit */
                                   , RuleTableTrie &ruleTable)
{
  PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  // 1-based number of the line currently being processed (diagnostics only)
  size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  vector<float> scoreVector;
  StringPiece line;
  std::string hiero_before, hiero_after;

  // junk and empty strings both convert to NaN, which is rejected below
  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    // Count every line that was read, including the ones skipped further
    // down, so the numbers printed in diagnostics match the file.
    ++count;

    if (format == HieroFormat) { // inefficiently reformat line
      hiero_before.assign(line.data(), line.size());
      ReformatHieroRule(hiero_before, hiero_after);
      line = hiero_after;
    }

    // Gather up to seven fields while checking the iterator, so an exhausted
    // token iterator is neither dereferenced nor advanced any further.
    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece fields[7];
    size_t numFields = 0;
    for (; numFields < 7 && pipes; ++pipes) {
      fields[numFields++] = *pipes;
    }
    UTIL_THROW_IF(numFields < 3, util::Exception,
                  "Syntax error at " << ruleTable.GetFilePath() << ":" << count);

    StringPiece sourcePhraseString(fields[0]);
    StringPiece targetPhraseString(fields[1]);
    StringPiece scoreString(fields[2]);
    // stays default-constructed when the line carries no alignment field
    StringPiece alignString(fields[3]);
    // fields[4] holds the counts, which this loader does not use

    // NOTE(review): the test below looks at the SOURCE side while the message
    // talks about an empty target -- confirm which of the two is intended.
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      int processed;
      float score = converter.StringToFloat(s->data(), s->length(), &processed);
      UTIL_THROW_IF(isnan(score), util::Exception, "Bad score " << *s << " on line " << count);
      scoreVector.push_back(FloorScore(TransformScore(score)));
    }
    const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels; filled in through the CreateFromString calls below
    Word *sourceLHS = NULL;
    Word *targetLHS = NULL;

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase();
    targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);

    // source
    Phrase sourcePhrase;
    sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);

    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // optional sixth field: sparse scores
    if (numFields > 5) {
      StringPiece sparseString(fields[5]);
      targetPhrase->SetSparseScore(&ruleTable, sparseString);
    }

    // optional seventh field: properties
    if (numFields > 6) {
      StringPiece propertiesString(fields[6]);
      targetPhrase->SetProperties(propertiesString);
    }

    targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector);
    targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply());

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);
  }

  // sort and prune each target phrase collection
  SortAndPrune(ruleTable);

  return true;
}