//Called when we start translating a new sentence
void PhraseDictionaryFeature::InitDictionary(const TranslationSystem* system, const InputType& source)
{
  PhraseDictionary* dict;
  if (m_useThreadSafePhraseDictionary) {
    //thread safe dictionary should already be loaded
    dict = m_threadSafePhraseDictionary.get();
  } else {
    //thread-unsafe dictionary may need to be loaded if this is a new thread.
    if (!m_threadUnsafePhraseDictionary.get()) {
      m_threadUnsafePhraseDictionary.reset(LoadPhraseTable(system));
    }
    dict = m_threadUnsafePhraseDictionary.get();
  }
  CHECK(dict);
  dict->InitializeForInput(source);
}
PhraseDictionary *FindPhraseDictionary(const string &ptName)
{
  const std::vector<PhraseDictionary*> &pts = PhraseDictionary::GetColl();

  PhraseDictionary *pt = NULL;
  std::vector<PhraseDictionary*>::const_iterator iter;
  for (iter = pts.begin(); iter != pts.end(); ++iter) {
    PhraseDictionary *currPt = *iter;
    if (currPt->GetScoreProducerDescription() == ptName) {
      pt = currPt;
      break;
    }
  }

  return pt;
}
ChartParser::ChartParser(InputType const &source, ChartCellCollectionBase &cells) :
  m_decodeGraphList(StaticData::Instance().GetDecodeGraphs()),
  m_source(source)
{
  const StaticData &staticData = StaticData::Instance();

  staticData.InitializeForInput(source);
  CreateInputPaths(m_source);

  const std::vector<PhraseDictionary*> &dictionaries = PhraseDictionary::GetColl();
  assert(dictionaries.size() == m_decodeGraphList.size());
  m_ruleLookupManagers.reserve(dictionaries.size());
  for (std::size_t i = 0; i < dictionaries.size(); ++i) {
    const PhraseDictionary *dict = dictionaries[i];
    PhraseDictionary *nonConstDict = const_cast<PhraseDictionary*>(dict);
    std::size_t maxChartSpan = m_decodeGraphList[i]->GetMaxChartSpan();
    ChartRuleLookupManager *lookupMgr = nonConstDict->CreateRuleLookupManager(*this, cells, maxChartSpan);
    m_ruleLookupManagers.push_back(lookupMgr);
  }

}
     void TranslationSystem::CleanUpAfterSentenceProcessing(const InputType& source) const {
        
        for(size_t i=0;i<m_phraseDictionaries.size();++i)
        {
          PhraseDictionaryFeature &phraseDictionaryFeature = *m_phraseDictionaries[i];
          PhraseDictionary* phraseDictionary = const_cast<PhraseDictionary*>(phraseDictionaryFeature.GetDictionary());
          phraseDictionary->CleanUp(source);

        }
  
        for(size_t i=0;i<m_generationDictionaries.size();++i)
            m_generationDictionaries[i]->CleanUp(source);
  
        //something LMs could do after each sentence 
        LMList::const_iterator iterLM;
        for (iterLM = m_languageModels.begin() ; iterLM != m_languageModels.end() ; ++iterLM)
        {
          LanguageModel &languageModel = **iterLM;
          languageModel.CleanUpAfterSentenceProcessing(source);
        }
     }
ChartParser::ChartParser(InputType const &source, ChartCellCollectionBase &cells) :
  m_decodeGraphList(StaticData::Instance().GetDecodeGraphs()),
  m_source(source)
{
  const StaticData &staticData = StaticData::Instance();

  staticData.InitializeForInput(source);
  CreateInputPaths(m_source);

  const std::vector<PhraseDictionary*> &dictionaries = PhraseDictionary::GetColl();
  m_ruleLookupManagers.reserve(dictionaries.size());
  for (std::vector<PhraseDictionary*>::const_iterator p = dictionaries.begin();
       p != dictionaries.end(); ++p) {

    const PhraseDictionary *dict = *p;
    PhraseDictionary *nonConstDict = const_cast<PhraseDictionary*>(dict);

    ChartRuleLookupManager *lookupMgr = nonConstDict->CreateRuleLookupManager(*this, cells);

    m_ruleLookupManagers.push_back(lookupMgr);
  }

}
Beispiel #6
0
/**
 * Process a sentence with xml annotation
 * Xml tags may specifiy additional/replacing translation options
 * and reordering constraints
 *
 * \param line in: sentence, out: sentence without the xml
 * \param res vector with translation options specified by xml
 * \param reorderingConstraint reordering constraint zones specified by xml
 * \param walls reordering constraint walls specified by xml
 * \param lbrackStr xml tag's left bracket string, typically "<"
 * \param rbrackStr xml tag's right bracket string, typically ">"
 */
bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls,
                            std::vector< std::pair<size_t, std::string> > &placeholders,
                            int offset,
                            const std::string& lbrackStr, const std::string& rbrackStr)
{
  //parse XML markup in translation line

  const StaticData &staticData = StaticData::Instance();

  // hack. What pt should XML trans opt be assigned to?
  PhraseDictionary *firstPt = NULL;
  if (PhraseDictionary::GetColl().size() == 0) {
    firstPt = PhraseDictionary::GetColl()[0];
  }

  // no xml tag? we're done.
//if (line.find_first_of('<') == string::npos) {
  if (line.find(lbrackStr) == string::npos) {
    return true;
  }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line, lbrackStr, rbrackStr);

  // we need to store opened tags, until they are closed
  // tags are stored as tripled (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags

  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)

  const vector<FactorType> &outputFactorOrder = staticData.GetOutputFactorOrder();
  // const string &factorDelimiter = staticData.GetFactorDelimiter();

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)) {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ') {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }

    // process xml tag
    else {
      // *** get essential information about tag ***

      // strip extra boundary spaces and "<" and ">"
      string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr));
      VERBOSE(3,"XML TAG IS: " << tag << std::endl);

      if (tag.size() == 0) {
        TRACE_ERR("ERROR: empty tag name: " << line << endl);
        return false;
      }

      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );

      // check if opening tag (e.g. "<a>", not "</a>")g
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;

      if (isClosed && isUnary) {
        TRACE_ERR("ERROR: can't have both closed and unary tag " << lbrackStr << tag << rbrackStr << ": " << line << endl);
        return false;
      }

      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end

      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }

      // *** process new tag ***

      if (isOpen || isUnary) {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
      }

      // *** process completed tag ***

      if (isClosed || isUnary) {
        // pop last opened tag from stack;
        if (tagStack.size() == 0) {
          TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();

        // tag names have to match
        if (openedTag.first != tagName) {
          TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
          return false;
        }

        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;

        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty()) {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1;
        }

        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);

        // special tag: wall
        if (tagName == "wall") {
          size_t start = (startPos == 0) ? 0 : startPos-1;
          for(size_t pos = start; pos < endPos; pos++)
            walls.push_back( pos );
        }

        // special tag: zone
        else if (tagName == "zone") {
          if (startPos >= endPos) {
            TRACE_ERR("ERROR: zone must span at least one word: " << line << endl);
            return false;
          }
          reorderingConstraint.SetZone( startPos, endPos-1 );
        }

        // name-entity placeholder
        else if (tagName == "ne") {
          if (startPos != (endPos - 1)) {
            TRACE_ERR("ERROR: Placeholder must only span 1 word: " << line << endl);
            return false;
          }
          string entity = ParseXmlTagAttribute(tagContent,"entity");
          placeholders.push_back(std::pair<size_t, std::string>(startPos, entity));
        }

        // update: add new aligned sentence pair to Mmsapt identified by name
        else if (tagName == "update") {
#if PT_UG
          // get model name and aligned sentence pair
          string pdName = ParseXmlTagAttribute(tagContent,"name");
          string source = ParseXmlTagAttribute(tagContent,"source");
          string target = ParseXmlTagAttribute(tagContent,"target");
          string alignment = ParseXmlTagAttribute(tagContent,"alignment");
          // find PhraseDictionary by name
          const vector<PhraseDictionary*> &pds = PhraseDictionary::GetColl();
          PhraseDictionary* pd = NULL;
          for (vector<PhraseDictionary*>::const_iterator i = pds.begin(); i != pds.end(); ++i) {
            PhraseDictionary* curPd = *i;
            if (curPd->GetScoreProducerDescription() == pdName) {
              pd = curPd;
              break;
            }
          }
          if (pd == NULL) {
            TRACE_ERR("ERROR: No PhraseDictionary with name " << pdName << ", no update" << endl);
            return false;
          }
          // update model
          VERBOSE(3,"Updating " << pdName << " ||| " << source << " ||| " << target << " ||| " << alignment << endl);
          Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(pd);
          pdsa->add(source, target, alignment);
#else
          TRACE_ERR("ERROR: recompile with --with-mm to update PhraseDictionary at runtime" << endl);
          return false;
#endif
        }

        // weight-overwrite: update feature weights, unspecified weights remain unchanged
        // IMPORTANT: translation models that cache phrases or apply table-limit during load
        // based on initial weights need to be reset.  Sending an empty update will do this
        // for PhraseDictionaryBitextSampling (Mmsapt) models:
        // <update name="TranslationModelName" source=" " target=" " alignment=" " />
        else if (tagName == "weight-overwrite") {

          // is a name->ff map stored anywhere so we don't have to build it every time?
          const vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
          boost::unordered_map<string, FeatureFunction*> map;
          BOOST_FOREACH(FeatureFunction* const& ff, ffs) {
            map[ff->GetScoreProducerDescription()] = ff;
          }

          // update each weight listed
          ScoreComponentCollection allWeights = StaticData::Instance().GetAllWeights();
          boost::unordered_map<string, FeatureFunction*>::iterator ffi;
          string ffName("");
          vector<float> ffWeights;
          vector<string> toks = Tokenize(ParseXmlTagAttribute(tagContent,"weights"));
          BOOST_FOREACH(string const& tok, toks) {
            if (tok.substr(tok.size() - 1, 1) == "=") {
              // start new feature
              if (ffName != "") {
                // set previous feature weights
                if (ffi != map.end()) {
                  allWeights.Assign(ffi->second, ffWeights);
                }
                ffWeights.clear();
              }
              ffName = tok.substr(0, tok.size() - 1);
              ffi = map.find(ffName);
              if (ffi == map.end()) {
                TRACE_ERR("ERROR: No FeatureFunction with name " << ffName << ", no weight update" << endl);
              }
            } else {
              // weight for current feature
              ffWeights.push_back(Scan<float>(tok));
            }
          }
          if (ffi != map.end()) {
            allWeights.Assign(ffi->second, ffWeights);
          }
          StaticData::InstanceNonConst().SetAllWeights(allWeights);
        }

        // default: opening tag that specifies translation options
        else {
          if (startPos > endPos) {
            TRACE_ERR("ERROR: tag " << tagName << " startPos > endPos: " << line << endl);
            return false;
          } else if (startPos == endPos) {
            TRACE_ERR("WARNING: tag " << tagName << " 0 span: " << line << endl);
            continue;
          }

          // specified translations -> vector of phrases
          // multiple translations may be specified, separated by "||"
          vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"translation"), "||");
          if( altTexts.size() == 1 && altTexts[0] == "" )
            altTexts.pop_back(); // happens when nothing specified
          // deal with legacy annotations: "translation" was called "english"
          vector<string> moreAltTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"english"), "||");
          if (moreAltTexts.size()>1 || moreAltTexts[0] != "") {
            for(vector<string>::iterator translation=moreAltTexts.begin();
                translation != moreAltTexts.end();
                translation++) {
              string t = *translation;
              altTexts.push_back( t );
            }
          }

          // specified probabilities for the translations -> vector of probs
          vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
          if( altProbs.size() == 1 && altProbs[0] == "" )
            altProbs.pop_back(); // happens when nothing specified

          // report what we have processed so far
          VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
          VERBOSE(3,"XML TAG TRANSLATION IS: '" << altTexts[0] << "'" << endl);
          VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
          VERBOSE(3,"XML TAG SPAN IS: " << startPos << "-" << (endPos-1) << endl);
          if (altProbs.size() > 0 && altTexts.size() != altProbs.size()) {
            TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
            return false;
          }

          // store translation options into members
          if (staticData.GetXmlInputType() != XmlIgnore) {
            // only store options if we aren't ignoring them
            for (size_t i=0; i<altTexts.size(); ++i) {
              Phrase sourcePhrase; // TODO don't know what the source phrase is

              // set default probability
              float probValue = 1;
              if (altProbs.size() > 0) probValue = Scan<float>(altProbs[i]);
              // convert from prob to log-prob
              float scoreValue = FloorScore(TransformScore(probValue));

              WordsRange range(startPos + offset,endPos-1 + offset); // span covered by phrase
              TargetPhrase targetPhrase(firstPt);
              // targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);
              targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL);

              // lhs
              const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
              if (!lhsList.empty()) {
                const Factor *factor = FactorCollection::Instance().AddFactor(lhsList[0].first, true);
                Word *targetLHS = new Word(true);
                targetLHS->SetFactor(0, factor); // TODO - other factors too?
                targetPhrase.SetTargetLHS(targetLHS);
              }

              targetPhrase.SetXMLScore(scoreValue);
              targetPhrase.EvaluateInIsolation(sourcePhrase);

              XmlOption *option = new XmlOption(range,targetPhrase);
              assert(option);

              res.push_back(option);
            }
            altTexts.clear();
            altProbs.clear();
          }
        }
      }
    }
  }
int main(int argc, char** argv) 
{
  bool help;
  string input_file;
  string config_file;


  po::options_description desc("Allowed options");
  desc.add_options()
  ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
  ("input-file,i", po::value<string>(&input_file), "Input file")
  ("config-file,f", po::value<string>(&config_file), "Config file")
  ;

  po::options_description cmdline_options;
  cmdline_options.add(desc);
  po::variables_map vm;
  po::parsed_options parsed = po::command_line_parser(argc,argv).
            options(cmdline_options).allow_unregistered().run();
  po::store(parsed, vm);
  po::notify(vm);
  if (help) {
    usage(desc, argv);
    exit(0);
  }
  if (input_file.empty()) {
    cerr << "ERROR: Please specify an input file" << endl << endl;
    usage(desc, argv);
    exit(1);
  }
  if (config_file.empty()) {
    cerr << "ERROR: Please specify a config file" << endl << endl;
    usage(desc, argv);
    exit(1);
  }

  vector<string> mosesargs;
  mosesargs.push_back(argv[0]);
  mosesargs.push_back("-f");
  mosesargs.push_back(config_file);
  for (size_t i = 0; i < parsed.options.size(); ++i) {
    if (parsed.options[i].position_key == -1 && !parsed.options[i].unregistered) continue;
    /*
    const string& key = parsed.options[i].string_key;
    if (!key.empty()) {
      mosesargs.push_back(key);
    }
    for (size_t j = 0; j < parsed.options[i].value.size(); ++j) {
      const string& value = parsed.options[i].value[j];
      if (!value.empty()) {
        mosesargs.push_back(value);
      }
    }*/

    for (size_t j = 0; j < parsed.options[i].original_tokens.size(); ++j) {
      mosesargs.push_back(parsed.options[i].original_tokens[j]);
    }
  }

  boost::scoped_ptr<Parameter> params(new Parameter());  
  char** mosesargv = new char*[mosesargs.size()];
  for (size_t i = 0; i < mosesargs.size(); ++i) {
    mosesargv[i] = new char[mosesargs[i].length() + 1];
    strcpy(mosesargv[i], mosesargs[i].c_str());
  }

  if (!params->LoadParam(mosesargs.size(), mosesargv)) {
    params->Explain();
    exit(1);
  }

  if (!StaticData::LoadDataStatic(params.get(),argv[0])) {
    exit(1);
  }

  const StaticData &staticData = StaticData::Instance();
  const std::vector<FactorType> & input = staticData.GetInputFactorOrder();

  //Find the phrase table to evaluate with
  PhraseDictionary* phraseTable = NULL;
  const vector<FeatureFunction*>& ffs = FeatureFunction::GetFeatureFunctions();
  for (size_t i = 0; i < ffs.size(); ++i) {
    PhraseDictionary* maybePhraseTable = dynamic_cast< PhraseDictionary*>(ffs[i]);
    if (maybePhraseTable) {
      UTIL_THROW_IF(phraseTable,util::Exception,"Can only score translations with one phrase table");
      phraseTable = maybePhraseTable;
    }
  }
  UTIL_THROW_IF(!phraseTable,util::Exception,"Unable to find scoring phrase table");

  Sentence sentence;
  phraseTable->InitializeForInput(sentence);

  //
  //Load and prune the phrase table. This is taken (with mods) from moses/TranslationModel/RuleTable/LoaderStandard.cpp
  //

  string lineOrig;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(input_file.c_str(), progress);

  // reused variables
  vector<float> scoreVector;
  StringPiece line;

  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  StringPiece previous;

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourcePhraseString(*pipes);
    if (sourcePhraseString != previous) {
      outputTopN(previous, phraseTable, input, cout);
      previous = sourcePhraseString;
    }
  }
  outputTopN(previous, phraseTable, input, cout);





  return 0;
}