void CacheBasedLanguageModel::Load(const std::string file)
{
  // Load cache entries from `file` and feed them to Update().
  //
  // File format (one entry per line):
  //   age || n-gram
  //   age || n-gram || n-gram || n-gram || ...
  // Each n-gram is a sequence of n words (n is unbounded).
  // Entries may repeat; a later entry overwrites an earlier one.
  //
  // Malformed lines (fewer than two "||"-separated fields) are fatal.

  VERBOSE(2,"Loading data from the cache file " << file << std::endl);
  InputFileStream cacheFile(file);

  std::string line;
  while (getline(cacheFile, line)) {
    std::vector<std::string> vecStr = TokenizeMultiCharSeparator( line , "||" );
    if (vecStr.size() >= 2) {
      // First field is the age; the remaining fields are the n-grams.
      const int age = Scan<int>(vecStr[0]);
      vecStr.erase(vecStr.begin());
      Update(vecStr, age);
    } else {
      TRACE_ERR("ERROR: The format of the loaded file is wrong: " << line << std::endl);
      CHECK(false);
    }
  }
  IFVERBOSE(2) Print();
}
Esempio n. 2
0
void ReformatHieroRule(const string &lineOrig, string &out)
{
  // Convert one Hiero-format rule line into Moses "|||"-separated format:
  // normalise the source and target phrase fields in place, rewrite the
  // score field, and append the non-terminal alignment collected from both
  // sides. (Removed: unused local `scoreVector`.)
  vector<string> tokens;
  TokenizeMultiCharSeparator(tokens, lineOrig, "|||" );

  // NOTE(review): assumes the line has at least 4 "|||"-separated fields;
  // a malformed line would index past the end of `tokens` — confirm that
  // callers validate the input format.
  string &sourcePhraseString = tokens[1]
              , &targetPhraseString = tokens[2]
              , &scoreString        = tokens[3];

  // Collect non-terminal co-indexation while reformatting each side
  // (0 = source side, 1 = target side).
  map<size_t, pair<size_t, size_t> > ntAlign;
  ReformatHieroRule(0, sourcePhraseString, ntAlign);
  ReformatHieroRule(1, targetPhraseString, ntAlign);
  ReformateHieroScore(scoreString);

  // Serialise the collected alignment points as "src-tgt" pairs.
  stringstream align;
  map<size_t, pair<size_t, size_t> >::const_iterator iterAlign;
  for (iterAlign = ntAlign.begin(); iterAlign != ntAlign.end(); ++iterAlign)
  {
    const pair<size_t, size_t> &alignPoint = iterAlign->second;
    align << alignPoint.first << "-" << alignPoint.second << " ";
  }

  stringstream ret;
  ret << sourcePhraseString << " ||| "
      << targetPhraseString << " ||| "
      << scoreString << " ||| "
      << align.str();

  out = ret.str();
}
Esempio n. 3
0
vector< vector<string> > Phrase::Parse(const std::string &phraseString, const std::vector<FactorType> &factorOrder, const std::string& factorDelimiter)
{
	// Split an annotated phrase such as
	//   KOMMA|none ART|Def.Z NN|Neut.NotGen.Sg VVFIN|none
	// first into words ("KOMMA|none", "ART|Def.Z", ...), then each word into
	// its factor strings. Aborts if a word's factor count does not match
	// factorOrder.
	const bool multiCharDelim = factorDelimiter.size() > 1;

	vector<string> annotatedWords = Tokenize(phraseString);
	vector< vector<string> > result;

	for (size_t pos = 0 ; pos < annotatedWords.size() ; pos++)
	{
		string &word = annotatedWords[pos];
		// e.g. "KOMMA|none" -> {"KOMMA", "none"}
		vector<string> factors = multiCharDelim
		    ? TokenizeMultiCharSeparator(word, factorDelimiter)
		    : Tokenize(word, factorDelimiter);

		if (factors.size() != factorOrder.size()) {
			TRACE_ERR( "[ERROR] Malformed input at " << /*StaticData::Instance().GetCurrentInputPosition() <<*/ std::endl
			          << "  Expected input to have words composed of " << factorOrder.size() << " factor(s) (form FAC1|FAC2|...)" << std::endl
								<< "  but instead received input with " << factors.size() << " factor(s).\n");
			abort();
		}
		result.push_back(factors);
	}
	return result;
}
void
LexicalReorderingTableMemory::
LoadFromFile(const std::string& filePath)
{
  // Load a lexicalized-reordering table into the in-memory map m_Table.
  //
  // Each line has the form "[f |||] [e |||] [c |||] p1 p2 ..." where the
  // f/e/c fields are present only when the corresponding factor set
  // (m_FactorsF / m_FactorsE / m_FactorsC) is non-empty; the last field is
  // always the whitespace-separated probability list.
  std::string fileName = filePath;
  // Fall back to a gzipped copy if the plain file does not exist.
  if(!FileExists(fileName) && FileExists(fileName+".gz"))
    fileName += ".gz";

  InputFileStream file(fileName);
  std::string line(""); // (removed: unused local `key`)
  int numScores = -1;   // probabilities per line; fixed by the first line
  std::cerr << "Loading table into memory...";
  // BUGFIX: the previous condition `!getline(file, line).eof()` silently
  // dropped the final line of a file without a trailing newline, because
  // that read succeeds while also setting eofbit. Testing the stream state
  // directly processes every successfully read line.
  while(getline(file, line)) {
    std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
    int t = 0 ;
    std::string f(""),e(""),c("");

    if(!m_FactorsF.empty()) {
      //there should be something for f
      f = auxClearString(tokens.at(t));
      ++t;
    }
    if(!m_FactorsE.empty()) {
      //there should be something for e
      e = auxClearString(tokens.at(t));
      ++t;
    }
    if(!m_FactorsC.empty()) {
      //there should be something for c
      c = auxClearString(tokens.at(t));
      ++t;
    }
    //last token are the probs
    std::vector<float> p = Scan<float>(Tokenize(tokens.at(t)));
    //sanity check: all lines must have equal number of probs
    if(-1 == numScores) {
      numScores = (int)p.size(); //set in first line
    }
    if((int)p.size() != numScores) {
      TRACE_ERR( "found inconsistent number of probabilities... found "
                 << p.size() << " expected " << numScores << std::endl);
      exit(0);
    }
    // Convert to log-scores and clamp to the score floor before storing.
    std::transform(p.begin(),p.end(),p.begin(),TransformScore);
    std::transform(p.begin(),p.end(),p.begin(),FloorScore);
    //save it all into our map
    m_Table[MakeKey(f,e,c)] = p;
  }
  std::cerr << "done.\n";
}
Esempio n. 5
0
void DeleteRules::Load()
{
  std::vector<FactorType> factorOrder;
  factorOrder.push_back(0); // unfactored for now

  InputFileStream strme(m_path);

  string line;
  while (getline(strme, line)) {
    vector<string> toks = TokenizeMultiCharSeparator(line, "|||");
    UTIL_THROW_IF2(toks.size() != 2, "Line must be source ||| target");
    Phrase source, target;
    source.CreateFromString(Input, factorOrder, toks[0], NULL);
    target.CreateFromString(Output, factorOrder, toks[1], NULL);

    size_t hash = 0;
    boost::hash_combine(hash, source);
    boost::hash_combine(hash, target);
    m_ruleHashes.insert(hash);
  }
}
Esempio n. 6
0
void Word::CreateFromString(FactorDirection direction
                            , const std::vector<FactorType> &factorOrder
                            , const std::string &str
                            , bool isNonTerminal)
{
    // Split `str` on the configured factor delimiter and register each factor
    // string with the global FactorCollection, storing the resulting Factor
    // pointers in this word's factor array (indexed by factorOrder).
    FactorCollection &factorCollection = FactorCollection::Instance();

    const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
    vector<string> factorStrings;
    TokenizeMultiCharSeparator(factorStrings, str, factorDelimiter);
    //Tokenize(factorStrings, str, "|");
    CHECK(factorStrings.size() <= factorOrder.size());

    for (size_t i = 0; i < factorStrings.size(); ++i) {
        const FactorType type = factorOrder[i];
        m_factorArray[type] = factorCollection.AddFactor(direction, type, factorStrings[i]);
    }

    // assume term/non-term same for all factors
    m_isNonTerminal = isNonTerminal;
}
Esempio n. 7
0
void TargetPhrase::SetProperties(const StringPiece &str)
{
  if (str.size() == 0) {
    return;
  }

  vector<string> toks;
  TokenizeMultiCharSeparator(toks, str.as_string(), "{{");
  for (size_t i = 0; i < toks.size(); ++i) {
    string &tok = toks[i];
    if (tok.empty()) {
      continue;
    }
    size_t endPos = tok.rfind("}");

    tok = tok.substr(0, endPos - 1);

    vector<string> keyValue = TokenizeFirstOnly(tok, " ");
    UTIL_THROW_IF2(keyValue.size() != 2,
    		"Incorrect format of property: " << str);
    SetProperty(keyValue[0], keyValue[1]);
  }
}
Esempio n. 8
0
/**
 * Process a sentence with xml annotation
 * Xml tags may specify additional/replacing translation options
 * and reordering constraints
 *
 * \param line in: sentence, out: sentence without the xml
 * \param res vector with translation options specified by xml
 * \param reorderingConstraint reordering constraint zones specified by xml
 * \param walls reordering constraint walls specified by xml
 * \param lbrackStr xml tag's left bracket string, typically "<"
 * \param rbrackStr xml tag's right bracket string, typically ">"
 */
bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls,
                            std::vector< std::pair<size_t, std::string> > &placeholders,
                            int offset,
                            const std::string& lbrackStr, const std::string& rbrackStr)
{
  //parse XML markup in translation line

  const StaticData &staticData = StaticData::Instance();

  // hack. What pt should XML trans opt be assigned to?
  PhraseDictionary *firstPt = NULL;
  if (PhraseDictionary::GetColl().size() == 0) {
    firstPt = PhraseDictionary::GetColl()[0];
  }

  // no xml tag? we're done.
//if (line.find_first_of('<') == string::npos) {
  if (line.find(lbrackStr) == string::npos) {
    return true;
  }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line, lbrackStr, rbrackStr);

  // we need to store opened tags, until they are closed
  // tags are stored as tripled (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags

  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)

  const vector<FactorType> &outputFactorOrder = staticData.GetOutputFactorOrder();
  // const string &factorDelimiter = staticData.GetFactorDelimiter();

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)) {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ') {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }

    // process xml tag
    else {
      // *** get essential information about tag ***

      // strip extra boundary spaces and "<" and ">"
      string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr));
      VERBOSE(3,"XML TAG IS: " << tag << std::endl);

      if (tag.size() == 0) {
        TRACE_ERR("ERROR: empty tag name: " << line << endl);
        return false;
      }

      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );

      // check if opening tag (e.g. "<a>", not "</a>")
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;

      if (isClosed && isUnary) {
        TRACE_ERR("ERROR: can't have both closed and unary tag " << lbrackStr << tag << rbrackStr << ": " << line << endl);
        return false;
      }

      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end

      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }

      // *** process new tag ***

      if (isOpen || isUnary) {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
      }

      // *** process completed tag ***

      if (isClosed || isUnary) {
        // pop last opened tag from stack;
        if (tagStack.size() == 0) {
          TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();

        // tag names have to match
        if (openedTag.first != tagName) {
          TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
          return false;
        }

        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;

        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty()) {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1;
        }

        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);

        // special tag: wall
        if (tagName == "wall") {
          size_t start = (startPos == 0) ? 0 : startPos-1;
          for(size_t pos = start; pos < endPos; pos++)
            walls.push_back( pos );
        }

        // special tag: zone
        else if (tagName == "zone") {
          if (startPos >= endPos) {
            TRACE_ERR("ERROR: zone must span at least one word: " << line << endl);
            return false;
          }
          reorderingConstraint.SetZone( startPos, endPos-1 );
        }

        // name-entity placeholder
        else if (tagName == "ne") {
          if (startPos != (endPos - 1)) {
            TRACE_ERR("ERROR: Placeholder must only span 1 word: " << line << endl);
            return false;
          }
          string entity = ParseXmlTagAttribute(tagContent,"entity");
          placeholders.push_back(std::pair<size_t, std::string>(startPos, entity));
        }

        // update: add new aligned sentence pair to Mmsapt identified by name
        else if (tagName == "update") {
#if PT_UG
          // get model name and aligned sentence pair
          string pdName = ParseXmlTagAttribute(tagContent,"name");
          string source = ParseXmlTagAttribute(tagContent,"source");
          string target = ParseXmlTagAttribute(tagContent,"target");
          string alignment = ParseXmlTagAttribute(tagContent,"alignment");
          // find PhraseDictionary by name
          const vector<PhraseDictionary*> &pds = PhraseDictionary::GetColl();
          PhraseDictionary* pd = NULL;
          for (vector<PhraseDictionary*>::const_iterator i = pds.begin(); i != pds.end(); ++i) {
            PhraseDictionary* curPd = *i;
            if (curPd->GetScoreProducerDescription() == pdName) {
              pd = curPd;
              break;
            }
          }
          if (pd == NULL) {
            TRACE_ERR("ERROR: No PhraseDictionary with name " << pdName << ", no update" << endl);
            return false;
          }
          // update model
          VERBOSE(3,"Updating " << pdName << " ||| " << source << " ||| " << target << " ||| " << alignment << endl);
          Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(pd);
          pdsa->add(source, target, alignment);
#else
          TRACE_ERR("ERROR: recompile with --with-mm to update PhraseDictionary at runtime" << endl);
          return false;
#endif
        }

        // weight-overwrite: update feature weights, unspecified weights remain unchanged
        // IMPORTANT: translation models that cache phrases or apply table-limit during load
        // based on initial weights need to be reset.  Sending an empty update will do this
        // for PhraseDictionaryBitextSampling (Mmsapt) models:
        // <update name="TranslationModelName" source=" " target=" " alignment=" " />
        else if (tagName == "weight-overwrite") {

          // is a name->ff map stored anywhere so we don't have to build it every time?
          const vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
          boost::unordered_map<string, FeatureFunction*> map;
          BOOST_FOREACH(FeatureFunction* const& ff, ffs) {
            map[ff->GetScoreProducerDescription()] = ff;
          }

          // update each weight listed
          ScoreComponentCollection allWeights = StaticData::Instance().GetAllWeights();
          boost::unordered_map<string, FeatureFunction*>::iterator ffi;
          string ffName("");
          vector<float> ffWeights;
          vector<string> toks = Tokenize(ParseXmlTagAttribute(tagContent,"weights"));
          BOOST_FOREACH(string const& tok, toks) {
            if (tok.substr(tok.size() - 1, 1) == "=") {
              // start new feature
              if (ffName != "") {
                // set previous feature weights
                if (ffi != map.end()) {
                  allWeights.Assign(ffi->second, ffWeights);
                }
                ffWeights.clear();
              }
              ffName = tok.substr(0, tok.size() - 1);
              ffi = map.find(ffName);
              if (ffi == map.end()) {
                TRACE_ERR("ERROR: No FeatureFunction with name " << ffName << ", no weight update" << endl);
              }
            } else {
              // weight for current feature
              ffWeights.push_back(Scan<float>(tok));
            }
          }
          if (ffi != map.end()) {
            allWeights.Assign(ffi->second, ffWeights);
          }
          StaticData::InstanceNonConst().SetAllWeights(allWeights);
        }

        // default: opening tag that specifies translation options
        else {
          if (startPos > endPos) {
            TRACE_ERR("ERROR: tag " << tagName << " startPos > endPos: " << line << endl);
            return false;
          } else if (startPos == endPos) {
            TRACE_ERR("WARNING: tag " << tagName << " 0 span: " << line << endl);
            continue;
          }

          // specified translations -> vector of phrases
          // multiple translations may be specified, separated by "||"
          vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"translation"), "||");
          if( altTexts.size() == 1 && altTexts[0] == "" )
            altTexts.pop_back(); // happens when nothing specified
          // deal with legacy annotations: "translation" was called "english"
          vector<string> moreAltTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"english"), "||");
          if (moreAltTexts.size()>1 || moreAltTexts[0] != "") {
            for(vector<string>::iterator translation=moreAltTexts.begin();
                translation != moreAltTexts.end();
                translation++) {
              string t = *translation;
              altTexts.push_back( t );
            }
          }

          // specified probabilities for the translations -> vector of probs
          vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
          if( altProbs.size() == 1 && altProbs[0] == "" )
            altProbs.pop_back(); // happens when nothing specified

          // report what we have processed so far
          VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
          VERBOSE(3,"XML TAG TRANSLATION IS: '" << altTexts[0] << "'" << endl);
          VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
          VERBOSE(3,"XML TAG SPAN IS: " << startPos << "-" << (endPos-1) << endl);
          if (altProbs.size() > 0 && altTexts.size() != altProbs.size()) {
            TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
            return false;
          }

          // store translation options into members
          if (staticData.GetXmlInputType() != XmlIgnore) {
            // only store options if we aren't ignoring them
            for (size_t i=0; i<altTexts.size(); ++i) {
              Phrase sourcePhrase; // TODO don't know what the source phrase is

              // set default probability
              float probValue = 1;
              if (altProbs.size() > 0) probValue = Scan<float>(altProbs[i]);
              // convert from prob to log-prob
              float scoreValue = FloorScore(TransformScore(probValue));

              WordsRange range(startPos + offset,endPos-1 + offset); // span covered by phrase
              TargetPhrase targetPhrase(firstPt);
              // targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);
              targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL);

              // lhs
              const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
              if (!lhsList.empty()) {
                const Factor *factor = FactorCollection::Instance().AddFactor(lhsList[0].first, true);
                Word *targetLHS = new Word(true);
                targetLHS->SetFactor(0, factor); // TODO - other factors too?
                targetPhrase.SetTargetLHS(targetLHS);
              }

              targetPhrase.SetXMLScore(scoreValue);
              targetPhrase.EvaluateInIsolation(sourcePhrase);

              XmlOption *option = new XmlOption(range,targetPhrase);
              assert(option);

              res.push_back(option);
            }
            altTexts.clear();
            altProbs.clear();
          }
        }
      }
    }
  }
bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
                                  , const std::vector<FactorType> &output
                                  , const string &filePath
                                  , const vector<float> &weight
                                  , size_t tableLimit
                                  , const LMList &languageModels
                                  , float weightWP)
{
    const StaticData &staticData = StaticData::Instance();

    m_tableLimit = tableLimit;


    // data from file
    InputFileStream inFile(filePath);

    // create hash file if necessary
    ofstream tempFile;
    string tempFilePath;

    vector< vector<string> >	phraseVector;
    string line, prevSourcePhrase = "";
    size_t count = 0;
    size_t line_num = 0;
    size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info

    while(getline(inFile, line))
    {
        ++line_num;
        vector<string> tokens = TokenizeMultiCharSeparator( line , "|||" );

        if (numElement == NOT_FOUND)
        {   // init numElement
            numElement = tokens.size();
            assert(numElement >= 3);
            // extended style: source ||| target ||| scores ||| [alignment] ||| [counts]
        }

        if (tokens.size() != numElement)
        {
            stringstream strme;
            strme << "Syntax error at " << filePath << ":" << line_num;
            UserMessage::Add(strme.str());
            abort();
        }

        const string &sourcePhraseString=tokens[0]
                                         ,&targetPhraseString=tokens[1]
                                                 ,&scoreString = tokens[2];

        bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
        if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
            TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty target, skipping\n");
            continue;
        }

        const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
        if (sourcePhraseString != prevSourcePhrase)
            phraseVector = Phrase::Parse(sourcePhraseString, input, factorDelimiter);

        vector<float> scoreVector = Tokenize<float>(scoreString);
        if (scoreVector.size() != m_numScoreComponent)
        {
            stringstream strme;
            strme << "Size of scoreVector != number (" <<scoreVector.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num;
            UserMessage::Add(strme.str());
            abort();
        }

        // source
        Phrase sourcePhrase(Input);
        sourcePhrase.CreateFromString( input, phraseVector);
        //target
        TargetPhrase targetPhrase(Output);
        targetPhrase.SetSourcePhrase(&sourcePhrase);
        targetPhrase.CreateFromString( output, targetPhraseString, factorDelimiter);

        if (tokens.size() > 3)
            targetPhrase.SetAlignmentInfo(tokens[3]);

        // component score, for n-best output
        std::vector<float> scv(scoreVector.size());
        std::transform(scoreVector.begin(),scoreVector.end(),scv.begin(),TransformScore);
        std::transform(scv.begin(),scv.end(),scv.begin(),FloorScore);
        targetPhrase.SetScore(m_feature, scv, weight, weightWP, languageModels);

        AddEquivPhrase(sourcePhrase, targetPhrase);

        count++;
    }

    // sort each target phrase collection
    m_collection.Sort(m_tableLimit);

    return true;
}
bool LexicalReorderingTableTree::Create(std::istream& inFile,
                                        const std::string& outFileName)
{
  std::string line;
  //TRACE_ERR("Entering Create...\n");
  std::string
  ofn(outFileName+".binlexr.srctree"),
      oft(outFileName+".binlexr.tgtdata"),
      ofi(outFileName+".binlexr.idx"),
      ofsv(outFileName+".binlexr.voc0"),
      oftv(outFileName+".binlexr.voc1");


  FILE *os = fOpen(ofn.c_str(),"wb");
  FILE *ot = fOpen(oft.c_str(),"wb");

  //TRACE_ERR("opend files....\n");

  typedef PrefixTreeSA<LabelId,OFF_T> PSA;
  PSA *psa = new PSA;
  PSA::setDefault(InvalidOffT);
  WordVoc* voc[3];

  LabelId currFirstWord = InvalidLabelId;
  IPhrase currKey;

  Candidates         cands;
  std::vector<OFF_T> vo;
  size_t lnc = 0;
  size_t numTokens    = 0;
  size_t numKeyTokens = 0;
  while(getline(inFile, line)) {
    ++lnc;
    if(0 == lnc % 10000) {
      TRACE_ERR(".");
    }
    IPhrase key;
    Scores   score;

    std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
    std::string w;
    if(1 == lnc) {
      //do some init stuff in the first line
      numTokens = tokens.size();
      if(tokens.size() == 2) { //f ||| score
        numKeyTokens = 1;
        voc[0] = new WordVoc();
        voc[1] = 0;
      } else if(3 == tokens.size() || 4 == tokens.size()) { //either f ||| e ||| score or f ||| e ||| c ||| score
        numKeyTokens = 2;
        voc[0] = new WordVoc(); //f voc
        voc[1] = new WordVoc(); //e voc
        voc[2] = voc[1];        //c & e share voc
      }
    } else {
      //sanity check ALL lines must have same number of tokens
      CHECK(numTokens == tokens.size());
    }
    size_t phrase = 0;
    for(; phrase < numKeyTokens; ++phrase) {
      //conditioned on more than just f... need |||
      if(phrase >=1) {
        key.push_back(PrefixTreeMap::MagicWord);
      }
      std::istringstream is(tokens[phrase]);
      while(is >> w) {
        key.push_back(voc[phrase]->add(w));
      }
    }
    //collect all non key phrases, i.e. c
    std::vector<IPhrase> tgt_phrases;
    tgt_phrases.resize(numTokens - numKeyTokens - 1);
    for(size_t j = 0; j < tgt_phrases.size(); ++j, ++phrase) {
      std::istringstream is(tokens[numKeyTokens + j]);
      while(is >> w) {
        tgt_phrases[j].push_back(voc[phrase]->add(w));
      }
    }
    //last token is score
    std::istringstream is(tokens[numTokens-1]);
    while(is >> w) {
      score.push_back(atof(w.c_str()));
    }
    //transform score now...
    std::transform(score.begin(),score.end(),score.begin(),TransformScore);
    std::transform(score.begin(),score.end(),score.begin(),FloorScore);
    std::vector<Scores> scores;
    scores.push_back(score);

    if(key.empty()) {
      TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n");
      continue;
    }
    //first time inits
    if(currFirstWord == InvalidLabelId) {
      currFirstWord = key[0];
    }
    if(currKey.empty()) {
      currKey = key;
      //insert key into tree
      CHECK(psa);
      PSA::Data& d = psa->insert(key);
      if(d == InvalidOffT) {
        d = fTell(ot);
      } else {
        TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
        return false;
      }
    }
    if(currKey != key) {
      //ok new key
      currKey = key;
      //a) write cands for old key
      cands.writeBin(ot);
      cands.clear();
      //b) check if we need to move on to new tree root
      if(key[0] != currFirstWord) {
        // write key prefix tree to file and clear
        PTF pf;
        if(currFirstWord >= vo.size()) {
          vo.resize(currFirstWord+1,InvalidOffT);
        }
        vo[currFirstWord] = fTell(os);
        pf.create(*psa, os);
        // clear
        delete psa;
        psa = new PSA;
        currFirstWord = key[0];
      }
      //c) insert key into tree
      CHECK(psa);
      PSA::Data& d = psa->insert(key);
      if(d == InvalidOffT) {
        d = fTell(ot);
      } else {
        TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
        return false;
      }
    }
    cands.push_back(GenericCandidate(tgt_phrases, scores));
  }
  if (lnc == 0) {
    TRACE_ERR("ERROR: empty lexicalised reordering file\n" << std::endl);
    return false;
  }
  //flush remainders
  cands.writeBin(ot);
  cands.clear();
  //process last currFirstWord
  PTF pf;
  if(currFirstWord >= vo.size()) {
    vo.resize(currFirstWord+1,InvalidOffT);
  }
  vo[currFirstWord] = fTell(os);
  pf.create(*psa,os);
  delete psa;
  psa=0;

  fClose(os);
  fClose(ot);
  /*
  std::vector<size_t> inv;
  for(size_t i = 0; i < vo.size(); ++i){
    if(vo[i] == InvalidOffT){
      inv.push_back(i);
    }
  }
  if(inv.size()) {
    TRACE_ERR("WARNING: there are src voc entries with no phrase "
        "translation: count "<<inv.size()<<"\n"
        "There exists phrase translations for "<<vo.size()-inv.size()
        <<" entries\n");
  }
  */
  FILE *oi = fOpen(ofi.c_str(),"wb");
  fWriteVector(oi,vo);
  fClose(oi);

  if(voc[0]) {
    voc[0]->Write(ofsv);
    delete voc[0];
  }
  if(voc[1]) {
    voc[1]->Write(oftv);
    delete voc[1];
  }
  return true;
}
// Build the per-sentence rule collection for fuzzy-match decoding.
// Writes the input sentence (minus its first and last positions) to a file
// in a fresh temporary directory, runs the fuzzy-match extractor on it, and
// loads the resulting phrase-table file into m_collection[translationId].
// NOTE(review): the temporary directory is never removed -- see the
// commented-out removedirectoryrecursively() at the bottom; confirm whether
// cleanup is handled elsewhere.
void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)
{
  // mkdtemp replaces the XXXXXX template in place; returns NULL on failure
  char dirName[] = "/tmp/moses.XXXXXX";
  char *temp = mkdtemp(dirName);
  UTIL_THROW_IF2(temp == NULL,
		  "Couldn't create temporary directory " << dirName);

  string dirNameStr(dirName);

  string inFileName(dirNameStr + "/in");

  ofstream inFile(inFileName.c_str());

  // skips positions 0 and GetSize()-1 -- presumably sentence boundary
  // markers; TODO confirm against InputType
  for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) {
    inFile << inputSentence.GetWord(i);
  }
  inFile << endl;
  inFile.close();

  // run the extractor; it returns the path of the phrase table it produced
  long translationId = inputSentence.GetTranslationId();
  string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);

  // populate with rules for this sentence
  PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];
  FormatType format = MosesFormat;

  // data from file
  InputFileStream inStream(ptFileName);

  // copied from class LoaderStandard
  PrintUserTime("Start loading fuzzy-match phrase model");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();


  string lineOrig;
  // entries loaded so far; also used as the line index in error messages
  size_t count = 0;

  while(getline(inStream, lineOrig)) {
    const string *line;
    if (format == HieroFormat) { // reformat line
      UTIL_THROW(util::Exception, "Cannot be Hiero format");
      //line = ReformatHieroRule(lineOrig);
    } else {
      // do nothing to format of line
      line = &lineOrig;
    }

    vector<string> tokens;
    vector<float> scoreVector;

    // entry layout here: source ||| target ||| scores ||| alignment
    TokenizeMultiCharSeparator(tokens, *line , "|||" );

    if (tokens.size() != 4 && tokens.size() != 5) {
      stringstream strme;
      strme << "Syntax error at " << ptFileName << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

    const string &sourcePhraseString = tokens[0]
                                       , &targetPhraseString = tokens[1]
                                           , &scoreString        = tokens[2]
                                               , &alignString        = tokens[3];

    // NOTE(review): this tests the SOURCE side for emptiness, but the
    // message below says "empty target" -- one of the two looks wrong
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    Tokenize<float>(scoreVector, scoreString);
    const size_t numScoreComponents = GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // redundant with the abort() path just above; kept as belt-and-braces
    UTIL_THROW_IF2(scoreVector.size() != numScoreComponents,
    		"Number of scores incorrectly specified");

    // parse source & find pt node

    // constituent labels (filled in by CreateFromString below)
    Word *sourceLHS;
    Word *targetLHS;

    // source
    Phrase sourcePhrase( 0);
    sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, factorDelimiter, &sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase();
    targetPhrase->CreateFromString(Output, m_output, targetPhraseString, factorDelimiter, &targetLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // component score, for n-best output
    // probabilities -> floored log-probabilities
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
    targetPhrase->Evaluate(sourcePhrase, GetFeaturesToApply());

    // ownership of targetPhrase passes to the collection here
    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;

    if (format == HieroFormat) { // reformat line
      delete line;
    } else {
      // do nothing
    }

  }

  // sort and prune each target phrase collection
  SortAndPrune(rootNode);

  //removedirectoryrecursively(dirName);
}
Esempio n. 12
0
/**
 * Parse XML markup in an input sentence, collecting source-span labels.
 * Only the "label" and "span" attributes are honoured here; specified
 * translation options are handled by the three-argument overload.
 *
 * \param line in: sentence, out: sentence without the xml
 * \param sourceLabels out: span labels specified by xml
 */
bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput> &sourceLabels)
{
  //parse XML markup in translation line

  // no xml tag? we're done.
  if (line.find_first_of('<') == string::npos) {
    return true;
  }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line);

  // we need to store opened tags, until they are closed
  // tags are stored as triples (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags

  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos])) {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ') {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }

    // process xml tag
    else {
      // *** get essential information about tag ***

      // strip extra boundary spaces and "<" and ">"
      string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos]));
      VERBOSE(3,"XML TAG IS: " << tag << std::endl);

      if (tag.size() == 0) {
        TRACE_ERR("ERROR: empty tag name: " << line << endl);
        return false;
      }

      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );

      // check if opening tag (e.g. "<a>", not "</a>")
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;

      if (isClosed && isUnary) {
        TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
        return false;
      }

      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end

      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }

      // *** process new tag ***

      if (isOpen || isUnary) {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
      }

      // *** process completed tag ***

      if (isClosed || isUnary) {
        // pop last opened tag from stack;
        if (tagStack.size() == 0) {
          TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();

        // tag names have to match
        if (openedTag.first != tagName) {
          TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
          return false;
        }

        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;

        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty()) {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1;
        }

        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);

        if (startPos >= endPos) {
          TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
          return false;
        }

        WordsRange range(startPos,endPos-1);
        // specified labels -> this overload supports exactly one label per
        // tag; report malformed markup like every other error path above
        // instead of aborting the whole process (was CHECK()).
        vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"label"), "||");
        if (altTexts.size() != 1) {
          TRACE_ERR("ERROR: tag " << tagName << " must specify exactly one label: " << line << endl);
          return false;
        }

        XMLParseOutput item(altTexts[0], range);
        sourceLabels.push_back(item);
      }
    }
  }
  // we are done. check if there are tags that are still open
  if (tagStack.size() > 0) {
    TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
    return false;
  }

  // return de-xml'ed sentence in line
  line = cleanLine;
  return true;
}
Esempio n. 13
0
/**
 * Process a sentence with xml annotation.
 * Xml tags may specify source-span labels and additional/replacing
 * translation options (with optional probabilities).
 *
 * \param line in: sentence, out: sentence without the xml
 * \param sourceLabels out: span labels specified by xml
 * \param xmlOptions out: translation options specified by xml
 */
bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput> &sourceLabels, std::vector<XmlOption*> &xmlOptions)
{
  //parse XML markup in translation line

  // no xml tag? we're done.
  if (line.find_first_of('<') == string::npos) {
    return true;
  }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line);

  // we need to store opened tags, until they are closed
  // tags are stored as triples (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags

  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)

  // keep this handy for later
  const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
  const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos])) {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ') {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }

    // process xml tag
    else {
      // *** get essential information about tag ***

      // strip extra boundary spaces and "<" and ">"
      string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos]));
      VERBOSE(3,"XML TAG IS: " << tag << std::endl);

      if (tag.size() == 0) {
        TRACE_ERR("ERROR: empty tag name: " << line << endl);
        return false;
      }

      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );

      // check if opening tag (e.g. "<a>", not "</a>")
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;

      if (isClosed && isUnary) {
        TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
        return false;
      }

      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end

      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }

      // *** process new tag ***

      if (isOpen || isUnary) {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
      }

      // *** process completed tag ***

      if (isClosed || isUnary) {
        // pop last opened tag from stack;
        if (tagStack.size() == 0) {
          TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();

        // tag names have to match
        if (openedTag.first != tagName) {
          TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
          return false;
        }

        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;

        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty()) {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1;
        }

        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);

        if (startPos >= endPos) {
          TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
          return false;
        }

	// may be either a input span label ("label"), or a specified output translation "translation"
        string label = ParseXmlTagAttribute(tagContent,"label");
        string translation = ParseXmlTagAttribute(tagContent,"translation");

        // specified label
        if (translation.length() == 0 && label.length() > 0) {
          WordsRange range(startPos,endPos-1); // really?
          XMLParseOutput item(label, range);
          sourceLabels.push_back(item);
        }

        // specified translations -> vector of phrases, separated by "||"
        // labels and probs are parallel "||"-separated lists; missing
        // entries fall back to the default LHS / probability 1
        if (translation.length() > 0 && StaticData::Instance().GetXmlInputType() != XmlIgnore) {
          vector<string> altTexts = TokenizeMultiCharSeparator(translation, "||");
          vector<string> altLabel = TokenizeMultiCharSeparator(label, "||");
          vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
	  //TRACE_ERR("number of translations: " << altTexts.size() << endl);
          for (size_t i=0; i<altTexts.size(); ++i) {
            // set target phrase
            TargetPhrase targetPhrase(Output);
            targetPhrase.CreateFromString(outputFactorOrder,altTexts[i],factorDelimiter);

            // set constituent label
	    string targetLHSstr;
            if (altLabel.size() > i && altLabel[i].size() > 0) {
              targetLHSstr = altLabel[i];
            }
            else {
              // no label given: use the first configured unknown-word LHS
              const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS();
              UnknownLHSList::const_iterator iterLHS = lhsList.begin();
              targetLHSstr = iterLHS->first;
            }
            Word targetLHS(true);
            targetLHS.CreateFromString(Output, outputFactorOrder, targetLHSstr, true);
            // aborts if the label string maps to no known factor
            CHECK(targetLHS.GetFactor(0) != NULL);
            targetPhrase.SetTargetLHS(targetLHS);

            // get probability
            float probValue = 1;
            if (altProbs.size() > i && altProbs[i].size() > 0) {
              probValue = Scan<float>(altProbs[i]);
            }
            // convert from prob to log-prob
            float scoreValue = FloorScore(TransformScore(probValue));
            targetPhrase.SetScore(scoreValue);

            // set span and create XmlOption
            // NOTE(review): option ranges are shifted by one relative to the
            // label spans above (startPos+1,endPos) -- confirm intended
            WordsRange range(startPos+1,endPos);
            // caller takes ownership of the heap-allocated XmlOption
            XmlOption *option = new XmlOption(range,targetPhrase);
            CHECK(option);
            xmlOptions.push_back(option);

            VERBOSE(2,"xml translation = [" << range << "] " << targetLHSstr << " -> " << altTexts[i] << " prob: " << probValue << endl);
          }
          // locals go out of scope anyway; kept from original
          altTexts.clear();
          altProbs.clear();
        }
      }
    }
  }
  // we are done. check if there are tags that are still open
  if (tagStack.size() > 0) {
    TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
    return false;
  }

  // return de-xml'ed sentence in line
  line = cleanLine;
  return true;
}
bool PhraseDictionaryNewFormat::Load(const std::vector<FactorType> &input
																			 , const std::vector<FactorType> &output
																			 , std::istream &inStream
																			 , const std::vector<float> &weight
																			 , size_t tableLimit
																			 , const LMList &languageModels
																			 , float weightWP)
{
	// Load a "new format" phrase table from inStream.
	// Entry layout: source ||| target ||| alignment ||| scores [||| counts]
	// Returns true on success; malformed entries abort() with a message.
	PrintUserTime("Start loading new format pt model");
	
	const StaticData &staticData = StaticData::Instance();
	const std::string& factorDelimiter = staticData.GetFactorDelimiter();
	
	VERBOSE(2,"PhraseDictionaryNewFormat: input=" << m_inputFactors << "  output=" << m_outputFactors << std::endl);
	
	string line;
	// entries loaded so far; used as the line index in error messages
	size_t count = 0;
	
	while(getline(inStream, line))
	{
		vector<string> tokens;
		vector<float> scoreVector;
		
		TokenizeMultiCharSeparator(tokens, line , "|||" );
					
		// four mandatory fields plus an optional count-info field
		if (tokens.size() != 4 && tokens.size() != 5)
		{
			stringstream strme;
			strme << "Syntax error at " << m_filePath << ":" << count;
			UserMessage::Add(strme.str());
			abort();
		}
		
		const string &sourcePhraseString	= tokens[0]
								, &targetPhraseString	= tokens[1]
								, &alignString				= tokens[2]
								, &scoreString				= tokens[3];

		// skip entries with an empty source side unless word deletion is on
		bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
		if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
			TRACE_ERR( m_filePath << ":" << count << ": pt entry contains empty target, skipping\n");
			continue;
		}
		
		Tokenize<float>(scoreVector, scoreString);
		if (scoreVector.size() != m_numScoreComponent)
		{
			stringstream strme;
			strme << "Size of scoreVector != number (" <<scoreVector.size() << "!=" <<m_numScoreComponent<<") of score components on line " << count;
			UserMessage::Add(strme.str());
			abort();
		}
		assert(scoreVector.size() == m_numScoreComponent);
		
		// parse source & find pt node
		
		// head word (filled in by CreateFromStringNewFormat below)
		Word sourceLHS, targetLHS;

		// source
		Phrase sourcePhrase(Input);
		sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS);
		
		// create target phrase obj
		TargetPhrase *targetPhrase = new TargetPhrase(Output);
		targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS);
		
		// alignment
		list<pair<size_t,size_t> > alignmentInfo;
		CreateAlignmentInfo(alignmentInfo, alignString);

		// rest of target phrase
		targetPhrase->SetAlignmentInfo(alignmentInfo);
		targetPhrase->SetTargetLHS(targetLHS);
		//targetPhrase->SetDebugOutput(string("New Format pt ") + line);
		
		// component score, for n-best output: probs -> floored log-probs
		std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
		std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
		
		targetPhrase->SetScoreChart(GetFeature(), scoreVector, weight, languageModels);
		
		// count info for backoff: optional fifth field.
		// BUGFIX: was `tokens.size() >= 6 ... tokens[5]`, which was
		// unreachable because the size check above only admits 4 or 5 fields.
		if (tokens.size() == 5)
			targetPhrase->CreateCountInfo(tokens[4]);

		// the collection takes ownership of targetPhrase
		TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(sourcePhrase, *targetPhrase);
		AddEquivPhrase(phraseColl, targetPhrase);
		
		count++;
	}
	
	// cleanup cache
	
	// sort each target phrase collection
	m_collection.Sort(m_tableLimit);
	
	return true;
}
Esempio n. 15
0
// Load a text SCFG rule table (Moses or Hiero format) into ruleTable.
// Entry layout after any Hiero reformatting:
//   source ||| target ||| scores ||| alignment [||| extra]
// Malformed entries abort() with a message; returns true on success.
bool RuleTableLoaderStandard::Load(FormatType format
                                , const std::vector<FactorType> &input
                                , const std::vector<FactorType> &output
                                , std::istream &inStream
                                , const std::vector<float> &weight
                                , size_t /* tableLimit */
                                , const LMList &languageModels
                                , const WordPenaltyProducer* wpProducer
                                , RuleTableTrie &ruleTable)
{
  PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();


  string lineOrig;
  // entries loaded so far; used as the line index in error messages
  size_t count = 0;

  while(getline(inStream, lineOrig)) {
    const string *line;
    if (format == HieroFormat) { // reformat line
      // ReformatHieroRule allocates a new string; deleted at loop end
      line = ReformatHieroRule(lineOrig);
    }
    else
    { // do nothing to format of line
      line = &lineOrig;
    }
    
    vector<string> tokens;
    vector<float> scoreVector;

    TokenizeMultiCharSeparator(tokens, *line , "|||" );

    if (tokens.size() != 4 && tokens.size() != 5) {
      stringstream strme;
      strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

    const string &sourcePhraseString = tokens[0]
               , &targetPhraseString = tokens[1]
               , &scoreString        = tokens[2]
               , &alignString        = tokens[3];

    // NOTE(review): this tests the SOURCE side for emptiness, but the
    // message below says "empty target" -- one of the two looks wrong
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    Tokenize<float>(scoreVector, scoreString);
    const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }
    // redundant with the abort() path just above; kept as belt-and-braces
    CHECK(scoreVector.size() == numScoreComponents);

    // parse source & find pt node

    // constituent labels (filled in by CreateFromStringNewFormat below)
    Word sourceLHS, targetLHS;

    // source
    Phrase sourcePhrase( 0);
    sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase(Output);
    targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // component score, for n-best output
    // probabilities -> floored log-probabilities
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->SetScoreChart(ruleTable.GetFeature(), scoreVector, weight, languageModels,wpProducer);

    // ownership of targetPhrase passes to the collection here
    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;

    if (format == HieroFormat) { // reformat line
      delete line;
    }
    else
    { // do nothing
    }

  }

  // sort and prune each target phrase collection
  SortAndPrune(ruleTable);

  return true;
}