//! populate this InputType with data from in stream int TreeInput:: Read(std::istream& in, const std::vector<FactorType>& factorOrder, AllOptions const& opts) { const StaticData &staticData = StaticData::Instance(); string line; if (getline(in, line, '\n').eof()) return 0; // remove extra spaces //line = Trim(line); m_labelledSpans.clear(); ProcessAndStripXMLTags(line, m_labelledSpans, m_xmlOptions); // do words 1st - hack stringstream strme; strme << line << endl; Sentence::Read(strme, factorOrder, opts); // size input chart size_t sourceSize = GetSize(); m_sourceChart.resize(sourceSize); for (size_t pos = 0; pos < sourceSize; ++pos) { m_sourceChart[pos].resize(sourceSize - pos); } // do source labels vector<XMLParseOutput>::const_iterator iterLabel; for (iterLabel = m_labelledSpans.begin(); iterLabel != m_labelledSpans.end(); ++iterLabel) { const XMLParseOutput &labelItem = *iterLabel; const Range &range = labelItem.m_range; const string &label = labelItem.m_label; AddChartLabel(range.GetStartPos() + 1, range.GetEndPos() + 1, label, factorOrder); } // default label for (size_t startPos = 0; startPos < sourceSize; ++startPos) { for (size_t endPos = startPos; endPos < sourceSize; ++endPos) { NonTerminalSet &list = GetLabelSet(startPos, endPos); if (list.size() == 0 || !staticData.GetDefaultNonTermOnlyForEmptyRange()) { AddChartLabel(startPos, endPos, staticData.GetInputDefaultNonTerminal(), factorOrder); } } } return 1; }
std::auto_ptr<ParseTree> XmlTreeParser::Parse(const std::string &line) { m_line = line; m_tree.Clear(); try { if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) { throw Exception(""); } } catch (const XmlException &e) { throw Exception(e.getMsg()); } m_tree.ConnectNodes(); SyntaxNode *root = m_tree.GetTop(); assert(root); m_words = tokenize(m_line.c_str()); return ConvertTree(*root, m_words); }
bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules) { if (!m_options.sourceSyntax) { return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules); } string sourceStringCPP(sourceString); try { ProcessAndStripXMLTags(sourceStringCPP, sourceTree, m_sourceLabelCollection , m_sourceTopLabelCollection); } catch (const XmlException & e) { std::cerr << "WARNING: failed to process source sentence at line " << sentenceID << ": " << e.getMsg() << std::endl; return false; } source = tokenize(sourceStringCPP.c_str()); return true; }
bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules) { if (!m_options.targetSyntax) { return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules); } string targetStringCPP(targetString); try { ProcessAndStripXMLTags(targetStringCPP, targetTree, m_targetLabelCollection, m_targetTopLabelCollection); } catch (const XmlException & e) { std::cerr << "WARNING: failed to process target sentence at line " << sentenceID << ": " << e.getMsg() << std::endl; return false; } target = tokenize(targetStringCPP.c_str()); return true; }
int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder) { // const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); std::string line; std::map<std::string, std::string> meta; if (getline(in, line, '\n').eof()) return 0; //get covered words - if continual-partial-translation is switched on, parse input const StaticData &staticData = StaticData::Instance(); m_frontSpanCoveredLength = 0; m_sourceCompleted.resize(0); if (staticData.ContinuePartialTranslation()) { string initialTargetPhrase; string sourceCompletedStr; int loc1 = line.find( "|||", 0 ); int loc2 = line.find( "|||", loc1 + 3 ); if (loc1 > -1 && loc2 > -1) { initialTargetPhrase = line.substr(0, loc1); sourceCompletedStr = line.substr(loc1 + 3, loc2 - loc1 - 3); line = line.substr(loc2 + 3); sourceCompletedStr = Trim(sourceCompletedStr); initialTargetPhrase = Trim(initialTargetPhrase); m_initialTargetPhrase = initialTargetPhrase; int len = sourceCompletedStr.size(); m_sourceCompleted.resize(len); int contiguous = 1; for (int i = 0; i < len; ++i) { if (sourceCompletedStr.at(i) == '1') { m_sourceCompleted[i] = true; if (contiguous) m_frontSpanCoveredLength ++; } else { m_sourceCompleted[i] = false; contiguous = 0; } } } } // remove extra spaces line = Trim(line); // if sentences is specified as "<seg id=1> ... </seg>", extract id meta = ProcessAndStripSGML(line); if (meta.find("id") != meta.end()) { this->SetTranslationId(atol(meta["id"].c_str())); } if (meta.find("docid") != meta.end()) { this->SetDocumentId(atol(meta["docid"].c_str())); this->SetUseTopicId(false); this->SetUseTopicIdAndProb(false); } if (meta.find("topic") != meta.end()) { vector<string> topic_params; boost::split(topic_params, meta["topic"], boost::is_any_of("\t ")); if (topic_params.size() == 1) { this->SetTopicId(atol(topic_params[0].c_str())); this->SetUseTopicId(true); this->SetUseTopicIdAndProb(false); } else { this->SetTopicIdAndProb(topic_params); this->SetUseTopicId(false); this->SetUseTopicIdAndProb(true); } } if (meta.find("weight-setting") != meta.end()) { this->SetWeightSetting(meta["weight-setting"]); this->SetSpecifiesWeightSetting(true); staticData.SetWeightSetting(meta["weight-setting"]); } else { this->SetSpecifiesWeightSetting(false); } // parse XML markup in translation line //const StaticData &staticData = StaticData::Instance(); std::vector< size_t > xmlWalls; std::vector< std::pair<size_t, std::string> > placeholders; if (staticData.GetXmlInputType() != XmlPassThrough) { int offset = 0; if (staticData.IsChart()) { offset = 1; } if (!ProcessAndStripXMLTags(line, m_xmlOptions, m_reorderingConstraint, xmlWalls, placeholders, offset, staticData.GetXmlBrackets().first, staticData.GetXmlBrackets().second)) { const string msg("Unable to parse XML in line: " + line); TRACE_ERR(msg << endl); throw runtime_error(msg); } } // Phrase::CreateFromString(Input, factorOrder, line, factorDelimiter, NULL); Phrase::CreateFromString(Input, factorOrder, line, NULL); // placeholders ProcessPlaceholders(placeholders); if (staticData.IsChart()) { InitStartEndWord(); } //now that we have final word positions in phrase (from CreateFromString), //we can make input phrase objects to go with our XmlOptions and create TranslationOptions //only fill the vector if we are parsing XML if (staticData.GetXmlInputType() != XmlPassThrough ) { for (size_t i=0; i<GetSize(); i++) { m_xmlCoverageMap.push_back(false); } //iterXMLOpts will be empty for XmlIgnore //look at each column for(std::vector<XmlOption*>::const_iterator iterXmlOpts = m_xmlOptions.begin(); iterXmlOpts != m_xmlOptions.end(); iterXmlOpts++) { const XmlOption *xmlOption = *iterXmlOpts; const WordsRange &range = xmlOption->range; for(size_t j=range.GetStartPos(); j<=range.GetEndPos(); j++) { m_xmlCoverageMap[j]=true; } } } // reordering walls and zones m_reorderingConstraint.InitializeWalls( GetSize() ); // set reordering walls, if "-monotone-at-punction" is set if (staticData.UseReorderingConstraint() && GetSize()>0) { m_reorderingConstraint.SetMonotoneAtPunctuation( GetSubString( WordsRange(0,GetSize()-1 ) ) ); } // set walls obtained from xml for(size_t i=0; i<xmlWalls.size(); i++) if( xmlWalls[i] < GetSize() ) // no buggy walls, please m_reorderingConstraint.SetWall( xmlWalls[i], true ); m_reorderingConstraint.FinalizeWalls(); return 1; }
//! populate this InputType with data from in stream int TreeInput::Read(std::istream& in,const std::vector<FactorType>& factorOrder) { const StaticData &staticData = StaticData::Instance(); string line; if (getline(in, line, '\n').eof()) return 0; // remove extra spaces //line = Trim(line); std::vector<XMLParseOutput> sourceLabels; std::vector<XmlOption*> xmlOptionsList; ProcessAndStripXMLTags(line, sourceLabels, xmlOptionsList); // do words 1st - hack stringstream strme; strme << line << endl; Sentence::Read(strme, factorOrder); // size input chart size_t sourceSize = GetSize(); m_sourceChart.resize(sourceSize); for (size_t pos = 0; pos < sourceSize; ++pos) { m_sourceChart[pos].resize(sourceSize - pos); } // do source labels vector<XMLParseOutput>::const_iterator iterLabel; for (iterLabel = sourceLabels.begin(); iterLabel != sourceLabels.end(); ++iterLabel) { const XMLParseOutput &labelItem = *iterLabel; const WordsRange &range = labelItem.m_range; const string &label = labelItem.m_label; AddChartLabel(range.GetStartPos() + 1, range.GetEndPos() + 1, label, factorOrder); } // default label for (size_t startPos = 0; startPos < sourceSize; ++startPos) { for (size_t endPos = startPos; endPos < sourceSize; ++endPos) { AddChartLabel(startPos, endPos, staticData.GetInputDefaultNonTerminal(), factorOrder); } } // XML Options //only fill the vector if we are parsing XML if (staticData.GetXmlInputType() != XmlPassThrough ) { //TODO: needed to handle exclusive //for (size_t i=0; i<GetSize(); i++) { // m_xmlCoverageMap.push_back(false); //} //iterXMLOpts will be empty for XmlIgnore //look at each column for(std::vector<XmlOption*>::const_iterator iterXmlOpts = xmlOptionsList.begin(); iterXmlOpts != xmlOptionsList.end(); iterXmlOpts++) { const XmlOption *xmlOption = *iterXmlOpts; TargetPhrase *targetPhrase = new TargetPhrase(xmlOption->targetPhrase); *targetPhrase = xmlOption->targetPhrase; // copy everything WordsRange *range = new WordsRange(xmlOption->range); const StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted TargetPhraseCollection *tpc = new TargetPhraseCollection; tpc->Add(targetPhrase); ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f); m_xmlChartOptionsList.push_back(transOpt); //TODO: needed to handle exclusive //for(size_t j=transOpt->GetSourceWordsRange().GetStartPos(); j<=transOpt->GetSourceWordsRange().GetEndPos(); j++) { // m_xmlCoverageMap[j]=true; //} delete xmlOption; } } return 1; }