Ejemplo n.º 1
0
/**
 * Open the on-disk phrase table at m_filePath, check that it is compatible
 * with the configuration, and install it in m_implementation.
 *
 * The table's "Version", "NumSourceFactors", "NumTargetFactors" and
 * "NumScores" entries are compared against OnDiskWrapper::VERSION_NUM,
 * m_input.size(), m_output.size() and m_numScoreComponents respectively.
 * Any mismatch is reported by throwing via UTIL_THROW_IF2.
 *
 * @param source the input about to be processed (not read by this function)
 */
void PhraseDictionaryOnDisk::InitializeForInput(InputType const& source)
{
  ReduceCache();

  OnDiskPt::OnDiskWrapper *obj = new OnDiskPt::OnDiskWrapper();
  try {
    obj->BeginLoad(m_filePath);

    UTIL_THROW_IF2(obj->GetMisc("Version") != OnDiskPt::OnDiskWrapper::VERSION_NUM,
                   "On-disk phrase table is version " <<  obj->GetMisc("Version")
                   << ". It is not compatible with version " << OnDiskPt::OnDiskWrapper::VERSION_NUM);

    UTIL_THROW_IF2(obj->GetMisc("NumSourceFactors") != m_input.size(),
                   "On-disk phrase table has " <<  obj->GetMisc("NumSourceFactors") << " source factors."
                   << ". The ini file specified " << m_input.size() << " source factors");

    UTIL_THROW_IF2(obj->GetMisc("NumTargetFactors") != m_output.size(),
                   "On-disk phrase table has " <<  obj->GetMisc("NumTargetFactors") << " target factors."
                   << ". The ini file specified " << m_output.size() << " target factors");

    UTIL_THROW_IF2(obj->GetMisc("NumScores") != m_numScoreComponents,
                   "On-disk phrase table has " <<  obj->GetMisc("NumScores") << " scores."
                   << ". The ini file specified " << m_numScoreComponents << " scores");
  } catch (...) {
    // Nothing owns the wrapper until reset() below, so release it here
    // before propagating the failure; otherwise it would be leaked.
    delete obj;
    throw;
  }

  // Ownership passes to m_implementation only after every check succeeded.
  m_implementation.reset(obj);
}
Ejemplo n.º 2
0
/**
 * Load the word <-> id mapping from an already opened vocabulary stream.
 *
 * The first line is read as the number of entries. Each following line holds
 * a word, optionally followed by its numeric id. When no id is present (or it
 * is 0) and the word is not the OOV word, the next sequential id
 * (m_ids2words.size() + 1) is assigned.
 *
 * @param vcbin     stream to read from
 * @param direction passed through to Word::CreateFromString
 * @param factors   passed through to Word::CreateFromString
 * @param closed    value stored in m_closed after loading
 * @return always true; an unreadable first line or a duplicate id/word is
 *         reported by throwing via UTIL_THROW_IF2
 */
bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction,
                 const FactorList& factors, bool closed)
{
  // load vocab id -> word mapping
  m_words2ids.clear();	// reset mapping
  m_ids2words.clear();
  std::string line, word_str;

  std::istream &ret = getline(*vcbin, line);
  UTIL_THROW_IF2(!ret, "Couldn't read file");
  std::istringstream first(line.c_str());
  uint32_t vcbsize(0);
  first >> vcbsize;
  uint32_t loadedsize = 0;
  while (loadedsize++ < vcbsize && getline(*vcbin, line)) {
    std::istringstream entry(line.c_str());
    entry >> word_str;
    Word word;
    word.CreateFromString( direction, factors, word_str, false); // TODO set correctly isNonTerminal

    // Start from 0 on every line: a failed extraction must not leave the id
    // of the previous line (or an uninitialised value) behind, otherwise a
    // plain word list would trip the duplicate check below.
    wordID_t id = 0;
    entry >> id;
    // may be no id (i.e. file may just be a word list)
    if (id == 0 && word != GetkOOVWord())
      id = m_ids2words.size() + 1;	// assign ids sequentially starting from 1
    UTIL_THROW_IF2(m_ids2words.count(id) != 0 || m_words2ids.count(word) != 0,
                   "Error");

    m_ids2words[id] = word;
    m_words2ids[word] = id;
  }
  m_closed = closed;	// once loaded fix vocab ?
  std::cerr << "Loaded vocab with " << m_ids2words.size() << " words." << std::endl;
  return true;
}
Ejemplo n.º 3
0
/**
 * Read a vocabulary file and store every entry.
 *
 * Each line must consist of exactly three tokens; the first is the numeric id
 * and the second the word. The very first line is skipped when it is the
 * special entry with id 1 and word "UNK". A malformed line, or an entry for
 * which Store() reports failure, throws via UTIL_THROW_IF2.
 *
 * @param fileName path of the vocabulary file
 */
void Model1Vocabulary::Load(const std::string& fileName)
{
  InputFileStream inFile(fileName);
  FactorCollection &factorCollection = FactorCollection::Instance();

  std::string row;
  for (unsigned lineNo = 1; getline(inFile, row); ++lineNo) {
    const std::vector<std::string> fields = Tokenize(row);
    UTIL_THROW_IF2(fields.size()!=3, "Line " << lineNo << " in " << fileName << " has wrong number of tokens.");
    const unsigned id = atoll( fields[0].c_str() );

    // first line of MGIZA vocabulary files seems to be special : "1       UNK     0"  -- skip if it's this
    const bool isUnkHeader = (lineNo == 1) && (id == 1) && (fields[1] == "UNK");
    if (isUnkHeader) {
      continue;
    }

    // TODO: can we assume that the vocabulary is know and filter the model on loading?
    const Factor* factor = factorCollection.AddFactor(fields[1],false);
    const bool stored = Store(factor, id);
    UTIL_THROW_IF2(!stored, "Line " << lineNo << " in " << fileName << " overwrites existing vocabulary entry.");
  }
  inFile.Close();
}
Ejemplo n.º 4
0
/***
 * print surface factor only for the given phrase
 */
void BaseManager::OutputSurface(std::ostream &out, const Phrase &phrase,
                                const std::vector<FactorType> &outputFactorOrder,
                                bool reportAllFactors) const
{
  UTIL_THROW_IF2(outputFactorOrder.size() == 0,
                 "Cannot be empty phrase");
  if (reportAllFactors == true) {
    out << phrase;
  } else {
    size_t size = phrase.GetSize();
    for (size_t pos = 0 ; pos < size ; pos++) {
      const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
      out << *factor;
      UTIL_THROW_IF2(factor == NULL,
                     "Empty factor 0 at position " << pos);

      for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
        const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
        UTIL_THROW_IF2(factor == NULL,
                       "Empty factor " << i << " at position " << pos);

        out << "|" << *factor;
      }
      out << " ";
    }
  }
}
/**
 * Store the options and, when a source word-list path is configured, load
 * the word lists used to restrict the feature set.
 *
 * - No source path: nothing is loaded.
 * - m_domainTrigger set: every line of the source file is split on tabs and
 *   spaces and its terms are inserted into a new entry appended to
 *   m_vocabDomain.
 * - Otherwise: the function returns immediately (see the review note below).
 *
 * Throws via UTIL_THROW_IF2 when the source file cannot be opened.
 */
void WordTranslationFeature::Load(AllOptions::ptr const& opts)
{
  m_options = opts;
  // load word list for restricted feature set
  if (m_filePathSource.empty()) {
    return;
  } //else if (tokens.size() == 8) {

  FEATUREVERBOSE(1, "Loading word translation word lists from " << m_filePathSource << " and " << m_filePathTarget << std::endl);
  if (m_domainTrigger) {
    // domain trigger terms for each input document
    ifstream inFileSource(m_filePathSource.c_str());
    UTIL_THROW_IF2(!inFileSource, "could not open file " << m_filePathSource);

    std::string line;
    while (getline(inFileSource, line)) {
      // one new term collection per input line
      m_vocabDomain.resize(m_vocabDomain.size() + 1);
      vector<string> termVector;
      boost::split(termVector, line, boost::is_any_of("\t "));
      for (size_t i=0; i < termVector.size(); ++i)
        m_vocabDomain.back().insert(termVector[i]);
    }

    inFileSource.close();
  } else if (!m_filePathSource.empty() || !m_filePathTarget.empty()) {
    // The condition is always true here: an empty m_filePathSource already
    // returned at the top of the function.
    // NOTE(review): the unconditional return below makes the rest of this
    // branch unreachable, so m_vocabSource / m_vocabTarget are never filled
    // and m_unrestricted is never cleared by this function. Confirm whether
    // the restricted-vocabulary loading was disabled on purpose.
    return;
    // restricted source word vocabulary
    ifstream inFileSource(m_filePathSource.c_str());
    UTIL_THROW_IF2(!inFileSource, "could not open file " << m_filePathSource);

    std::string line;
    while (getline(inFileSource, line)) {
      m_vocabSource.insert(line);
    }

    inFileSource.close();

    // restricted target word vocabulary
    ifstream inFileTarget(m_filePathTarget.c_str());
    UTIL_THROW_IF2(!inFileTarget, "could not open file " << m_filePathTarget);

    while (getline(inFileTarget, line)) {
      m_vocabTarget.insert(line);
    }

    inFileTarget.close();

    m_unrestricted = false;
  }
}
Ejemplo n.º 6
0
/**
 * Load the word list at m_filePathSource.
 *
 * With m_domainTrigger set, each line is split on tabs and spaces and the
 * resulting set of terms is appended to m_vocabDomain (one set per line).
 * Otherwise each line is inserted into m_vocabSource and m_unrestricted is
 * cleared. No target-side vocabulary is read.
 *
 * Throws via UTIL_THROW_IF2 when the file cannot be opened.
 */
void PhrasePairFeature::Load()
{
  // Both modes read the same source-side file.
  std::ifstream sourceFile(m_filePathSource.c_str());
  UTIL_THROW_IF2(!sourceFile, "could not open file " << m_filePathSource);

  std::string row;
  if (!m_domainTrigger) {
    // restricted source word vocabulary: one entry per line
    while (getline(sourceFile, row)) {
      m_vocabSource.insert(row);
    }
    sourceFile.close();
    m_unrestricted = false;
    return;
  }

  // domain trigger terms for each input document
  while (getline(sourceFile, row)) {
    std::vector<std::string> pieces;
    boost::split(pieces, row, boost::is_any_of("\t "));

    // add term set for current document
    m_vocabDomain.push_back(std::set<std::string>(pieces.begin(), pieces.end()));
  }
  sourceFile.close();
}
Ejemplo n.º 7
0
void ReformatHieroRule(int sourceTarget, string &phrase, map<size_t, pair<size_t, size_t> > &ntAlign)
{
  vector<string> toks;
  Tokenize(toks, phrase, " ");

  for (size_t i = 0; i < toks.size(); ++i) {
    string &tok = toks[i];
    size_t tokLen = tok.size();
    if (tok.substr(0, 1) == "[" && tok.substr(tokLen - 1, 1) == "]") {
      // no-term
      vector<string> split = Tokenize(tok, ",");
      UTIL_THROW_IF2(split.size() != 2,
    		  "Incorrectly formmatted non-terminal: " << tok);

      tok = "[X]" + split[0] + "]";
      size_t coIndex = Scan<size_t>(split[1]);

      pair<size_t, size_t> &alignPoint = ntAlign[coIndex];
      if (sourceTarget == 0) {
        alignPoint.first = i;
      } else {
        alignPoint.second = i;
      }
    }
  }

  phrase = Join(" ", toks) + " [X]";

}
bool SoftMatchingFeature::Load(const std::string& filePath)
{

  StaticData &SD = StaticData::InstanceNonConst();

  InputFileStream inStream(filePath);
  std::string line;
  while(getline(inStream, line)) {
    std::vector<std::string> tokens = Tokenize(line);
    UTIL_THROW_IF2(tokens.size() != 2, "Error: wrong format of SoftMatching file: must have two nonterminals per line");

    // no soft matching necessary if LHS and RHS are the same
    if (tokens[0] == tokens[1]) {
      continue;
    }

    Word LHS, RHS;
    LHS.CreateFromString(Output, SD.options()->output.factor_order, tokens[0], true);
    RHS.CreateFromString(Output, SD.options()->output.factor_order, tokens[1], true);

    m_softMatches[RHS[0]->GetId()].push_back(LHS);
    GetOrSetFeatureName(RHS, LHS);
  }

  SD.SetSoftMatches(m_softMatches);

  return true;
}
Ejemplo n.º 9
0
/**
 * Default weight vector for this feature: a single weight of 1.
 * Throws via UTIL_THROW_IF2 unless exactly one score component is configured.
 */
std::vector<float> ConstrainedDecoding::DefaultWeights() const
{
  UTIL_THROW_IF2(m_numScoreComponents != 1,
                 "ConstrainedDecoding must only have 1 score");
  return std::vector<float>(1, 1.0f);
}
Ejemplo n.º 10
0
/**
 * Default weight vector for this feature: empty.
 * Throws via UTIL_THROW_IF2 when any score component is configured.
 */
std::vector<float> ControlRecombination::DefaultWeights() const
{
  UTIL_THROW_IF2(m_numScoreComponents != 0,
                 "ControlRecombination should not have any scores");
  return std::vector<float>();
}
Ejemplo n.º 11
0
void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString)
{
  AlignmentInfo::CollType alignTerm, alignNonTerm;
  for (util::TokenIter<util::AnyCharacter, true> token(alignString, util::AnyCharacter(" \t")); token; ++token) {
    util::TokenIter<util::SingleCharacter, false> dash(*token, util::SingleCharacter('-'));

    char *endptr;
    size_t sourcePos = strtoul(dash->data(), &endptr, 10);
    UTIL_THROW_IF(endptr != dash->data() + dash->size(), util::ErrnoException, "Error parsing alignment" << *dash);
    ++dash;
    size_t targetPos = strtoul(dash->data(), &endptr, 10);
    UTIL_THROW_IF(endptr != dash->data() + dash->size(), util::ErrnoException, "Error parsing alignment" << *dash);
    UTIL_THROW_IF2(++dash, "Extra gunk in alignment " << *token);


    if (GetWord(targetPos).IsNonTerminal()) {
      alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
    } else {
      alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
    }
  }
  SetAlignTerm(alignTerm);
  SetAlignNonTerm(alignNonTerm);

}
Ejemplo n.º 12
0
void TargetPhraseImpl::SetAlignmentInfo(const std::string &alignString)
{
  AlignmentInfo::CollType alignTerm, alignNonTerm;

  vector<string> toks = Tokenize(alignString);
  for (size_t i = 0; i < toks.size(); ++i) {
    vector<size_t> alignPair = Tokenize<size_t>(toks[i], "-");
    UTIL_THROW_IF2(alignPair.size() != 2, "Wrong alignment format");

    size_t sourcePos = alignPair[0];
    size_t targetPos = alignPair[1];

    if ((*this)[targetPos].isNonTerminal) {
      alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
    } else {
      alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
    }
  }

  SetAlignTerm(alignTerm);
  SetAlignNonTerm(alignNonTerm);
  //    cerr << "TargetPhrase::SetAlignmentInfo(const StringPiece &alignString) this:|" << *this << "|\n";

  //cerr << "alignTerm=" << alignTerm.size() << endl;
  //cerr << "alignNonTerm=" << alignNonTerm.size() << endl;

}
Ejemplo n.º 13
0
/**
 * Hand the stored hypothesis pointer to the caller and clear the member.
 * Throws via UTIL_THROW_IF2 when no hypothesis is stored.
 *
 * @return the pointer previously held in m_hypothesis
 */
ChartHypothesis *RuleCubeItem::ReleaseHypothesis()
{
  UTIL_THROW_IF2(m_hypothesis == NULL, "Hypothesis is NULL");

  ChartHypothesis *const released = m_hypothesis;
  m_hypothesis = NULL;  // this item no longer refers to the hypothesis
  return released;
}
Ejemplo n.º 14
0
void ChartParser::CreateInputPaths(const InputType &input)
{
  size_t size = input.GetSize();
  m_inputPathMatrix.resize(size);

  UTIL_THROW_IF2(input.GetType() != SentenceInput && input.GetType() != TreeInputType,
		  "Input must be a sentence or a tree, not lattice or confusion networks");
  for (size_t phaseSize = 1; phaseSize <= size; ++phaseSize) {
    for (size_t startPos = 0; startPos < size - phaseSize + 1; ++startPos) {
      size_t endPos = startPos + phaseSize -1;
      vector<InputPath*> &vec = m_inputPathMatrix[startPos];

      WordsRange range(startPos, endPos);
      Phrase subphrase(input.GetSubString(WordsRange(startPos, endPos)));
      const NonTerminalSet &labels = input.GetLabelSet(startPos, endPos);

      InputPath *node;
      if (range.GetNumWordsCovered() == 1) {
        node = new InputPath(subphrase, labels, range, NULL, NULL);
        vec.push_back(node);
      } else {
        const InputPath &prevNode = GetInputPath(startPos, endPos - 1);
        node = new InputPath(subphrase, labels, range, &prevNode, NULL);
        vec.push_back(node);
      }

      //m_inputPathQueue.push_back(node);
    }
  }
}
/**
 * Look up the root node stored in m_collection for a translation id.
 * Throws via UTIL_THROW_IF2 when no entry exists for that id.
 *
 * @param translationId key into m_collection
 * @return the node stored under translationId
 */
const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(long translationId) const
{
  typedef std::map<long, PhraseDictionaryNodeMemory>::const_iterator NodeIter;

  const NodeIter found = m_collection.find(translationId);
  UTIL_THROW_IF2(found == m_collection.end(),
                 "Couldn't find root node for input: " << translationId);
  return found->second;
}
std::vector<TargetPhrase*> PhraseDictionaryTransliteration::CreateTargetPhrases(const Phrase &sourcePhrase, const string &outDir) const
{
	std::vector<TargetPhrase*> ret;

	string outPath = outDir + "/out.txt";
	ifstream outStream(outPath.c_str());

	string line;
	while (getline(outStream, line)) {
		vector<string> toks;
		Tokenize(toks, line, "\t");
		UTIL_THROW_IF2(toks.size() != 2, "Error in transliteration output file. Expecting word\tscore");

	  TargetPhrase *tp = new TargetPhrase();
	  Word &word = tp->AddWord();
	  word.CreateFromString(Output, m_output, toks[0], false);

	  float score = Scan<float>(toks[1]);
	  tp->GetScoreBreakdown().PlusEquals(this, score);

	  // score of all other ff when this rule is being loaded
	  tp->Evaluate(sourcePhrase, GetFeaturesToApply());

	  ret.push_back(tp);
	}

	outStream.close();

  return ret;
}
/**
 * Load a compact lexical reordering table from filePath.
 *
 * Reads, in this order: the hash (m_hash), the number of score components,
 * the multiple-score-trees flag, one Huffman tree per score component (or a
 * single tree when the flag is unset), and finally the scores, into
 * m_scoresMemory or m_scoresMapped depending on m_inMemory.
 *
 * Throws via UTIL_THROW_IF2 when the file cannot be opened or when the two
 * header fields cannot be read.
 *
 * @param filePath path of the table file
 */
void
LexicalReorderingTableCompact::
Load(std::string filePath)
{
  std::FILE* pFile = std::fopen(filePath.c_str(), "r");
  UTIL_THROW_IF2(pFile == NULL, "File " << filePath << " could not be opened");

  //if(m_inMemory)
  m_hash.Load(pFile);
  //else
  //m_hash.LoadIndex(pFile);

  size_t read = 0;
  read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, pFile);
  read += std::fread(&m_multipleScoreTrees,
                     sizeof(m_multipleScoreTrees), 1, pFile);
  // Each fread returns the number of items read (1 on success). Anything
  // short of 2 means the header is missing and the fields above must not be
  // used to size m_scoreTrees.
  UTIL_THROW_IF2(read != 2, "File " << filePath
                 << " is truncated or unreadable: could not read the score header");

  if(m_multipleScoreTrees) {
    m_scoreTrees.resize(m_numScoreComponent);
    for(size_t i = 0; i < m_numScoreComponent; i++)
      m_scoreTrees[i] = new CanonicalHuffman<float>(pFile);
  } else {
    m_scoreTrees.resize(1);
    m_scoreTrees[0] = new CanonicalHuffman<float>(pFile);
  }

  // NOTE(review): pFile is not closed here; the loaders below receive the
  // handle and may keep using it — confirm before adding an fclose.
  if(m_inMemory)
    m_scoresMemory.load(pFile, false);
  else
    m_scoresMapped.load(pFile, true);
}
Ejemplo n.º 18
0
/**
 * Return the final word of this path's phrase.
 * Throws via UTIL_THROW_IF2 when the phrase is empty.
 */
const Word &InputPath::GetLastWord() const
{
  const size_t numWords = m_phrase.GetSize();
  UTIL_THROW_IF2(numWords == 0, "Input path phrase cannot be empty");
  return m_phrase.GetWord(numWords - 1);
}
Ejemplo n.º 19
0
/**
 * Write the best translation of the current input to the collector as one
 * line, keyed by the input's translation id.
 *
 * When hypothesis scores are requested the line starts with the score
 * followed by a space ("0 " when there is no translation). The first and last
 * word of the best yield are removed before printing. Without a best
 * hyperedge only the optional score and the newline are written.
 *
 * @param collector destination; when null nothing is done
 *
 * Throws via UTIL_THROW_IF2 when the best yield has fewer than two words.
 */
void Manager::OutputBest(OutputCollector *collector) const
{
  if (collector == NULL) {
    return;
  }

  const StaticData &staticData = StaticData::Instance();

  std::ostringstream line;
  FixPrecision(line);

  const SHyperedge *bestEdge = GetBestSHyperedge();
  if (bestEdge != NULL) {
    if (staticData.GetOutputHypoScore()) {
      line << bestEdge->label.score << " ";
    }
    Phrase yield = GetOneBestTargetYield(*bestEdge);
    // delete 1st & last
    UTIL_THROW_IF2(yield.GetSize() < 2,
                   "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
    yield.RemoveWord(0);
    yield.RemoveWord(yield.GetSize()-1);
    line << yield.GetStringRep(staticData.GetOutputFactorOrder());
  } else {
    VERBOSE(1, "NO BEST TRANSLATION" << std::endl);
    if (staticData.GetOutputHypoScore()) {
      line << "0 ";
    }
  }
  line << '\n';

  collector->Write(m_source.GetTranslationId(), line.str());
}
Ejemplo n.º 20
0
/**
 * Pick the score of the candidate that matches the given context.
 *
 * Without context factors (m_FactorsC empty) there may be at most one
 * candidate; its score is returned, or an empty Scores when there is none.
 *
 * Otherwise the context is converted to an id phrase and its suffixes are
 * tried from longest to shortest, ending with the empty suffix. The score of
 * the first candidate whose phrase equals the current suffix is returned. An
 * empty Scores is returned when nothing matches.
 *
 * @param cands   candidates retrieved for a phrase pair
 * @param context context phrase
 *
 * Throws via UTIL_THROW_IF2 when there is no context factor but more than one
 * candidate.
 */
Scores
LexicalReorderingTableTree::
auxFindScoreForContext(const Candidates& cands, const Phrase& context)
{
  if(m_FactorsC.empty()) {
    UTIL_THROW_IF2(cands.size() > 1, "Error");
    return (cands.size() == 1) ? cands[0].GetScore(0) : Scores();
  } else {
    std::vector<std::string> cvec;
    for(size_t i = 0; i < context.GetSize(); ++i)
      cvec.push_back(context.GetWord(i).GetString(m_FactorsC, false));

    IPhrase c = m_Table->ConvertPhrase(cvec,TargetVocId);
    IPhrase sub_c;
    // Walk the suffixes of c itself and stop at c.end(). The loop used to be
    // bounded by context.GetSize(), which advanced the iterator beyond
    // c.end() after the last suffix and is only in range at all when
    // ConvertPhrase returns exactly one id per context word.
    // NOTE(review): confirm whether ConvertPhrase can return a shorter phrase.
    IPhrase::iterator start = c.begin();
    for(;;) {
      sub_c.assign(start, c.end());
      for(size_t cand = 0; cand < cands.size(); ++cand) {
        if(cands[cand].GetPhrase(0) == sub_c)
          return cands[cand].GetScore(0);
      }
      if(start == c.end()) break; // the empty suffix was the last one to try
      ++start;
    }
    return Scores();
  }
}
Ejemplo n.º 21
0
/**
 * Look up the reordering scores for a phrase pair in a given context.
 *
 * Returns an empty Scores when a side that has factors configured is empty,
 * or when the table has no candidates for the pair. Candidates already
 * present in m_Cache are used without consulting the table. With m_UseCache
 * set, an entry for the pair is inserted into the cache before the table
 * lookup and filled with the candidates afterwards.
 *
 * @param f source phrase
 * @param e target phrase
 * @param c context phrase (may be empty, e.g. at the start of a sentence)
 */
Scores
LexicalReorderingTableTree::
GetScore(const Phrase& f, const Phrase& e, const Phrase& c)
{
  //NOTE: no check for c as c might be empty, e.g. start of sentence
  const bool missingF = !m_FactorsF.empty() && f.GetSize() == 0;
  const bool missingE = !m_FactorsE.empty() && e.GetSize() == 0;
  if (missingF || missingE) {
    // not a proper key
    return Scores();
  }

  CacheType::iterator cachePos;
  if (m_UseCache) {
    std::pair<CacheType::iterator, bool> inserted =
      m_Cache.insert(std::make_pair(MakeCacheKey(f,e),Candidates()));
    if (!inserted.second) {
      // entry existed already: answer from the cache
      return auxFindScoreForContext(inserted.first->second, c);
    }
    cachePos = inserted.first;
  } else {
    // although we might not be caching now, cache might be none empty!
    cachePos = m_Cache.find(MakeCacheKey(f,e));
    if (cachePos != m_Cache.end()) {
      return auxFindScoreForContext(cachePos->second, c);
    }
  }

  // not in cache => go to file...
  Candidates cands;
  m_Table->GetCandidates(MakeTableKey(f,e), &cands);
  if (cands.empty()) {
    return Scores();
  }
  if (m_UseCache) {
    cachePos->second = cands;
  }

  if (!m_FactorsC.empty()) {
    return auxFindScoreForContext(cands, c);
  }
  UTIL_THROW_IF2(1 != cands.size(), "Error");
  return cands[0].GetScore(0);
}
/**
 * Set up one dotted-rule collection per start position of the input.
 *
 * Each collection is sized sourceSize - position + 1 and seeded, at index 0,
 * with an initial dotted rule that refers to the root node of the rule table.
 *
 * Throws via UTIL_THROW_IF2 when m_dottedRuleColls is not empty on entry.
 */
ChartRuleLookupManagerMemory::ChartRuleLookupManagerMemory(
  const ChartParser &parser,
  const ChartCellCollectionBase &cellColl,
  const PhraseDictionaryMemory &ruleTable)
  : ChartRuleLookupManagerCYKPlus(parser, cellColl)
  , m_ruleTable(ruleTable)
{
  UTIL_THROW_IF2(m_dottedRuleColls.size() != 0,
                 "Dotted rule collection not correctly initialized");

  const size_t sourceSize = parser.GetSize();
  m_dottedRuleColls.resize(sourceSize);

  const PhraseDictionaryNodeMemory &rootNode = m_ruleTable.GetRootNode();

  for (size_t startPos = 0; startPos < m_dottedRuleColls.size(); ++startPos) {
#ifdef USE_BOOST_POOL
    // pool allocation followed by placement construction
    DottedRuleInMemory *seedRule = m_dottedRulePool.malloc();
    new (seedRule) DottedRuleInMemory(rootNode);
#else
    DottedRuleInMemory *seedRule = new DottedRuleInMemory(rootNode);
#endif

    DottedRuleColl *rulesFromHere = new DottedRuleColl(sourceSize - startPos + 1);
    rulesFromHere->Add(0, seedRule); // init rule. stores the top node in tree

    m_dottedRuleColls[startPos] = rulesFromHere;
  }
}
Ejemplo n.º 23
0
bool SoftMatchingFeature::Load(const std::string& filePath)
{

    StaticData &staticData = StaticData::InstanceNonConst();

    InputFileStream inStream(filePath);
    std::string line;
    while(getline(inStream, line)) {
      std::vector<std::string> tokens = Tokenize(line);
      UTIL_THROW_IF2(tokens.size() != 2, "Error: wrong format of SoftMatching file: must have two nonterminals per line");

      // no soft matching necessary if LHS and RHS are the same
      if (tokens[0] == tokens[1]) {
          continue;
      }

      Word LHS, RHS;
      LHS.CreateFromString(Output, staticData.GetOutputFactorOrder(), tokens[0], true);
      RHS.CreateFromString(Output, staticData.GetOutputFactorOrder(), tokens[1], true);

      m_soft_matches[LHS].insert(RHS);
      m_soft_matches_reverse[RHS].insert(LHS);
    }

    staticData.Set_Soft_Matches(Get_Soft_Matches());
    staticData.Set_Soft_Matches_Reverse(Get_Soft_Matches_Reverse());

   return true;
}
Ejemplo n.º 24
0
/**
 * Return the input path covering [startPos, endPos].
 *
 * Paths are stored in m_inputPathMatrix[startPos] at index endPos - startPos.
 *
 * @param startPos first position of the span
 * @param endPos   last position of the span (inclusive, >= startPos)
 *
 * Throws via UTIL_THROW_IF2 when the span does not address a stored path.
 */
InputPath &ChartParser::GetInputPath(size_t startPos, size_t endPos)
{
  // Validate the row index before touching the matrix.
  UTIL_THROW_IF2(startPos >= m_inputPathMatrix.size(),
                 "Out of bound: " << startPos);
  // endPos - startPos would wrap around for a reversed span.
  UTIL_THROW_IF2(endPos < startPos,
                 "Out of bound: end position " << endPos
                 << " precedes start position " << startPos);

  size_t offset = endPos - startPos;
  UTIL_THROW_IF2(offset >= m_inputPathMatrix[startPos].size(),
                 "Out of bound: " << offset);
  return *m_inputPathMatrix[startPos][offset];
}
/**
 * Set up one dotted-rule stack per start position of the input.
 *
 * Each stack is sized sourceSize - position + 1 and seeded, at index 0, with
 * an initial dotted rule built from the root source node of the on-disk
 * table.
 *
 * Throws via UTIL_THROW_IF2 when m_expandableDottedRuleListVec is not empty
 * on entry.
 */
ChartRuleLookupManagerOnDisk::ChartRuleLookupManagerOnDisk(
  const ChartParser &parser,
  const ChartCellCollectionBase &cellColl,
  const PhraseDictionaryOnDisk &dictionary,
  OnDiskPt::OnDiskWrapper &dbWrapper,
  const std::vector<FactorType> &inputFactorsVec,
  const std::vector<FactorType> &outputFactorsVec)
  : ChartRuleLookupManagerCYKPlus(parser, cellColl)
  , m_dictionary(dictionary)
  , m_dbWrapper(dbWrapper)
  , m_inputFactorsVec(inputFactorsVec)
  , m_outputFactorsVec(outputFactorsVec)
{
  UTIL_THROW_IF2(m_expandableDottedRuleListVec.size() != 0,
                 "Dotted rule collection not correctly initialized");

  const size_t sourceSize = parser.GetSize();
  m_expandableDottedRuleListVec.resize(sourceSize);

  for (size_t startPos = 0; startPos < m_expandableDottedRuleListVec.size(); ++startPos) {
    DottedRuleOnDisk *seedRule = new DottedRuleOnDisk(m_dbWrapper.GetRootSourceNode());

    DottedRuleStackOnDisk *stackFromHere = new DottedRuleStackOnDisk(sourceSize - startPos + 1);
    stackFromHere->Add(0, seedRule); // init rule. stores the top node in tree

    m_expandableDottedRuleListVec[startPos] = stackFromHere;
  }
}
Ejemplo n.º 26
0
/**
 * Return the child reached through the given target non-terminal, creating
 * the entry in m_nonTermMap when it does not exist yet.
 * Throws via UTIL_THROW_IF2 when the word is not a non-terminal.
 */
RuleTrieCYKPlus::Node *RuleTrieCYKPlus::Node::GetOrCreateNonTerminalChild(const Word &targetNonTerm)
{
  UTIL_THROW_IF2(!targetNonTerm.IsNonTerminal(),
                 "Not a non-terminal: " << targetNonTerm);

  // operator[] inserts a default entry on first access
  Node &child = m_nonTermMap[targetNonTerm];
  return &child;
}
Ejemplo n.º 27
0
/**
 * Return the on-disk wrapper currently held by m_implementation.
 * Throws via UTIL_THROW_IF2 when none has been installed yet.
 */
const OnDiskPt::OnDiskWrapper &PhraseDictionaryOnDisk::GetImplementation() const
{
  const OnDiskPt::OnDiskWrapper *wrapper = m_implementation.get();
  UTIL_THROW_IF2(wrapper == NULL, "Dictionary object not yet created for this thread");
  return *wrapper;
}
/**
 * Return the factor stored for factorString, inserting it when it is not
 * present yet.
 *
 * A newly inserted factor gets the current value of the matching id counter
 * (m_factorIdNonTerminal or m_factorId), which is then incremented. Its
 * string is copied into memory taken from m_string_backing, so the stored
 * factor does not refer to the caller's buffer.
 *
 * @param factorString  text of the factor
 * @param isNonTerminal selects the id counter and the set to use
 * @return pointer to the factor held inside the set
 *
 * Throws via UTIL_THROW_IF2 when the non-terminal id counter reaches
 * moses_MaxNumNonterminals.
 */
const Factor *FactorCollection::AddFactor(const StringPiece &factorString, bool isNonTerminal)
{
  // Candidate entry; it initially refers to the caller's string.
  FactorFriend to_ins;
  to_ins.in.m_string = factorString;
  to_ins.in.m_id = (isNonTerminal) ? m_factorIdNonTerminal : m_factorId;
  // NOTE(review): the member names look swapped here (non-terminals go into
  // m_set, terminals into m_setNonTerminal). Other users of these sets are
  // not visible from this function; confirm they follow the same mapping
  // before changing it.
  Set & set = (isNonTerminal) ? m_set : m_setNonTerminal;
  // If we're threaded, hope a read-only lock is sufficient.
#ifdef WITH_THREADS
  {
    // read=lock scope
    boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
    Set::const_iterator i = set.find(to_ins);
    if (i != set.end()) return &i->in;
  }
  // Not found under the shared lock: take the exclusive lock for insertion.
  // insert() below reports whether the entry was added in the meantime.
  boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
#endif // WITH_THREADS
  std::pair<Set::iterator, bool> ret(set.insert(to_ins));
  if (ret.second) {
    // Newly inserted: repoint the stored string at a private copy.
    ret.first->in.m_string.set(
      memcpy(m_string_backing.Allocate(factorString.size()), factorString.data(), factorString.size()),
      factorString.size());
    if (isNonTerminal) {
      m_factorIdNonTerminal++;
      UTIL_THROW_IF2(m_factorIdNonTerminal >= moses_MaxNumNonterminals, "Number of non-terminals exceeds maximum size reserved. Adjust parameter moses_MaxNumNonterminals, then recompile");
    } else {
      m_factorId++;
    }
  }
  return &ret.first->in;
}
/**
 * Walk (and extend where necessary) the trie below rootNode along the source
 * side of a rule and return the node reached.
 *
 * Terminals descend by the source word alone. For a non-terminal the next
 * entry of the target's non-terminal alignment must refer to the current
 * source position; the aligned target word is then used for the descent
 * (together with the source word unless UNLABELLED_SOURCE is defined).
 *
 * The source and target phrases are also printed to cerr on every call.
 *
 * @param rootNode  node to start from
 * @param source    source side of the rule
 * @param target    target side; supplies the non-terminal alignment
 * @param sourceLHS not used by this function
 * @return the node for the complete source side
 *
 * Throws via UTIL_THROW_IF2 when a non-terminal has no alignment point, when
 * the alignment point does not match its position, or when a child lookup
 * yields NULL.
 */
PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
    , const Phrase &source
    , const TargetPhrase &target
    , const Word *sourceLHS)
{
  cerr << source << endl << target << endl;

  const size_t numWords = source.GetSize();
  const AlignmentInfo &ntAlignment = target.GetAlignNonTerm();
  AlignmentInfo::const_iterator nextLink = ntAlignment.begin();

  PhraseDictionaryNodeMemory *node = &rootNode;
  for (size_t pos = 0; pos < numWords; ++pos) {
    const Word &sourceWord = source.GetWord(pos);

    if (!sourceWord.IsNonTerminal()) {
      node = node->GetOrCreateChild(sourceWord);
    } else {
      // indexed by source label 1st
      UTIL_THROW_IF2(nextLink == ntAlignment.end(),
                     "No alignment for non-term at position " << pos);
      UTIL_THROW_IF2(nextLink->first != pos,
                     "Alignment info incorrect at position " << pos);

      const size_t targetIndex = nextLink->second;
      ++nextLink;
      const Word &targetNonTerm = target.GetWord(targetIndex);

#if defined(UNLABELLED_SOURCE)
      node = node->GetOrCreateNonTerminalChild(targetNonTerm);
#else
      node = node->GetOrCreateChild(sourceWord, targetNonTerm);
#endif
    }

    UTIL_THROW_IF2(node == NULL,
                   "Node not found at position " << pos);
  }

  // the source LHS is not added to the path
  return *node;
}
void TranslationOptionCollectionLattice::CreateTranslationOptions()
{
  GetTargetPhraseCollectionBatch();

  VERBOSE(2,"Translation Option Collection\n " << *this << endl);
  const vector <DecodeGraph*> &decodeGraphs = StaticData::Instance().GetDecodeGraphs();
  UTIL_THROW_IF2(decodeGraphs.size() != 1, "Multiple decoder graphs not supported yet");
  const DecodeGraph &decodeGraph = *decodeGraphs[0];
  UTIL_THROW_IF2(decodeGraph.GetSize() != 1, "Factored decomposition not supported yet");

  const DecodeStep &decodeStep = **decodeGraph.begin();
  const PhraseDictionary &phraseDictionary = *decodeStep.GetPhraseDictionaryFeature();

  for (size_t i = 0; i < m_inputPathQueue.size(); ++i) {
    const InputPath &path = *m_inputPathQueue[i];
    const TargetPhraseCollection *tpColl = path.GetTargetPhrases(phraseDictionary);
    const WordsRange &range = path.GetWordsRange();

    if (tpColl) {
    	TargetPhraseCollection::const_iterator iter;
    	for (iter = tpColl->begin(); iter != tpColl->end(); ++iter) {
    		const TargetPhrase &tp = **iter;
    		TranslationOption *transOpt = new TranslationOption(range, tp);
    		transOpt->SetInputPath(path);
    		transOpt->Evaluate(m_source);

    		Add(transOpt);
    	}
    }
    else if (path.GetPhrase().GetSize() == 1) {
    	// unknown word processing
    	ProcessOneUnknownWord(path, path.GetWordsRange().GetEndPos(), 1, path.GetInputScore());
    }
  }

  // Prune
  Prune();

  Sort();

  // future score matrix
  CalcFutureScore();

  // Cached lex reodering costs
  CacheLexReordering();

}