// Converts the first GetSize() nodes of realTree to Token objects and returns
// whatever the disambiguator produces for that token sequence.
vector<DisambiguatedData> PredictedMorphologyTreeRecoverer::GetMorphology(
    const SyntaxTree& realTree)
{
    const size_t tokenCount = static_cast<size_t>(realTree.GetSize());
    const auto& treeNodes = realTree.GetNodes();

    // Tokens are default-constructed first and then overwritten one by one
    // with the Token view of the corresponding node.
    vector<Token> tokens(tokenCount);
    size_t position = 0;
    while (position < tokenCount)
    {
        tokens[position] = static_cast<Token>(treeNodes[position]);
        ++position;
    }

    return disambiguator->Disambiguate(tokens);
}
// Returns one StringBatch per node of the tree, in node order. Each batch is
// produced by the per-node Calculate(node, tree) overload.
ROOT_FEATURE_CALCULATOR_TEMPLATE
vector<typename ROOT_FEATURE_CALCULATOR_TEMPLATE1::StringBatch>
ROOT_FEATURE_CALCULATOR_TEMPLATE1::Calculate(
    const SyntaxTree& tree)
{
    vector<StringBatch> batches;
    const vector<SyntaxNode>& treeNodes = tree.GetNodes();
    for (const SyntaxNode& node : treeNodes)
    {
        batches.emplace_back(Calculate(node, tree));
    }
    return batches;
}
// Adds left-binarization nodes to the tree.
//
// For every entry of `parents` that has more than three split points, the
// label of the first node spanning [first split, last split - 1] is looked up
// and, for each split index i in [2, size - 2], a node labelled
// "^" + <that label> is added over [first split, split[i] - 1].
void LeftBinarize( SyntaxTree &tree, ParentNodes &parents )
{
  for (ParentNodes::const_iterator parent = parents.begin();
       parent != parents.end(); parent++) {
    const SplitPoints &splits = *parent;

    // three split points or fewer: nothing to binarize
    if (splits.size() <= 3) {
      continue;
    }

    const size_t lastSplit = splits.size() - 1;
    const vector< SyntaxNode* >& spanningNodes =
      tree.GetNodes( splits[0], splits[lastSplit]-1 );
    const string binarizedLabel = "^" + spanningNodes[0]->GetLabel();

    for (size_t splitIndex = 2; splitIndex < lastSplit; splitIndex++) {
      // cerr << "LeftBin  " << splits[0] << "-" << (splits[lastSplit]-1) << ": " << splits[0] << "-" << splits[splitIndex]-1 << " " << binarizedLabel << endl;
      tree.AddNode( splits[0], splits[splitIndex]-1, binarizedLabel );
    }
  }
}
// Computes string features for every ordered pair of distinct tree nodes.
//
// @param tree      tree whose nodes are paired up
// @param features  output matrix; resized to size x size, where
//                  (*features)[left][right] receives the features of the
//                  edge candidate left -> right. Entries with left == right
//                  are only created by the resize and never filled.
//
// Features are added to each batch in a fixed order: label-context features,
// the "L"/"R" label features, the per-node features (prefixed with the label
// pair), and finally the lemma/content features.
//
// NOTE: posLeftMin1 / posRightMin1 now read the label of the node at
// index - 1. They previously read the node at `index` itself, which
// duplicated the current label instead of providing left context; feature
// strings that involve these two values therefore change.
EDGE_FEATURE_CALCULATOR_TEMPLATE
void EDGE_FEATURE_CALCULATOR_TEMPLATE1::CalculateFeatures(
    const SyntaxTree& tree
    , vector<vector<StringBatch> >* features)
{
    int size = tree.GetSize();
    const vector<SyntaxNode>& nodes = tree.GetNodes();

    // Calculate features for nodes
    vector<vector<wstring> > nodeFeatures = nodeFeatureCalculator.Calculate(tree);

    // Calculate features
    features->resize(size);
    for (int leftIndex = 0; leftIndex < size; ++leftIndex)
    {
        (*features)[leftIndex].resize(size);
        for (int rightIndex = 0; rightIndex < size; ++rightIndex)
        {
            if (leftIndex == rightIndex)
            {
                continue;
            }
            StringBatch& currentFeatures = (*features)[leftIndex][rightIndex];

            // Abbreviated signed distance between the two nodes
            wstring distanceAbbr = Ceil(
                nodes[rightIndex].index - nodes[leftIndex].index);

            // Labels in a +-2 window around the left node;
            // L"none" stands in for positions outside the sentence.
            wstring posLeftMin2 = (leftIndex > 1
                ? nodes[leftIndex - 2].label : L"none");
            wstring posLeftMin1 = (leftIndex > 0
                ? nodes[leftIndex - 1].label : L"none");
            wstring posLeft = L"!" + nodes[leftIndex].label;
            wstring posLeftPlus1 = (leftIndex + 1 < size
                ? nodes[leftIndex + 1].label : L"none");
            wstring posLeftPlus2 = (leftIndex + 2 < size
                ? nodes[leftIndex + 2].label : L"none");

            // Same window around the right node
            wstring posRightMin2 = (rightIndex > 1
                ? nodes[rightIndex - 2].label : L"none");
            wstring posRightMin1 = (rightIndex > 0
                ? nodes[rightIndex - 1].label : L"none");
            wstring posRight = L"!" + nodes[rightIndex].label;
            wstring posRightPlus1 = (rightIndex + 1 < size
                ? nodes[rightIndex + 1].label : L"none");
            wstring posRightPlus2 = (rightIndex + 2 < size
                ? nodes[rightIndex + 2].label : L"none");

            // Label-context features
            currentFeatures.Add(posLeft + L"->");
            currentFeatures.Add(L"->" + posRight);
            currentFeatures.Add(posLeftMin2 + posLeftMin1 + posLeft + posLeftPlus1
                + L"->" + posRight + L"_" + distanceAbbr);
            currentFeatures.Add(posLeftMin2 + posLeftMin1 + posLeft
                + L"->" + posRight + L"_" + distanceAbbr);
            currentFeatures.Add(posLeftMin1 + posLeft
                + L"->" + posRight + L"_" + distanceAbbr);
            currentFeatures.Add(posLeft + posLeftPlus1
                + L"->" + posRight + L"_" + distanceAbbr);
            currentFeatures.Add(posLeft
                + L"->" + posRight + L"_" + distanceAbbr);
            currentFeatures.Add(posLeft + L"->"
                + posRightMin2 + posRightMin1 + posRight + posRightPlus1
                + L"_" + distanceAbbr);
            currentFeatures.Add(posLeft + L"->"
                + posRightMin2 + posRightMin1 + posRight
                + L"_" + distanceAbbr);
            currentFeatures.Add(posLeft + L"->"
                + posRightMin1 + posRight + L"_" + distanceAbbr);
            currentFeatures.Add(posLeft + L"->"
                + posRight + posRightPlus1 + L"_" + distanceAbbr);

            // Add features
            wstring prefix = nodes[leftIndex].label + L"_" + nodes[rightIndex].label;
            currentFeatures.Add(L"L" + nodes[leftIndex].label);
            currentFeatures.Add(L"R" + nodes[rightIndex].label);

            // Add left node features
            const vector<wstring>& leftFeatures = nodeFeatures[leftIndex];
            for (size_t it = 0; it < leftFeatures.size(); ++it)
            {
                currentFeatures.Add(
                    prefix + L"L" + leftFeatures[it]);
            }

            // Add right node features
            const vector<wstring>& rightFeatures = nodeFeatures[rightIndex];
            for (size_t it = 0; it < rightFeatures.size(); ++it)
            {
                currentFeatures.Add(prefix + L"R" + rightFeatures[it]);
            }

            // Lemma / content against label, with distance
            currentFeatures.Add(nodes[leftIndex].lemma + L"->"
                + nodes[rightIndex].label + L"_" + distanceAbbr);
            currentFeatures.Add(nodes[leftIndex].label + L"->"
                + nodes[rightIndex].lemma + L"_" + distanceAbbr);
            currentFeatures.Add(nodes[leftIndex].content + L"->"
                + nodes[rightIndex].label + L"_" + distanceAbbr);
            currentFeatures.Add(nodes[leftIndex].label + L"->"
                + nodes[rightIndex].content + L"_" + distanceAbbr);

            // Lemma / content against label, without distance
            currentFeatures.Add(nodes[leftIndex].lemma + L"->"
                + nodes[rightIndex].label);
            currentFeatures.Add(nodes[leftIndex].label + L"->"
                + nodes[rightIndex].lemma);
            currentFeatures.Add(nodes[leftIndex].content + L"->"
                + nodes[rightIndex].label);
            currentFeatures.Add(nodes[leftIndex].label + L"->"
                + nodes[rightIndex].content);
        }
    }
}
/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector is so we can link things up afterwards. We can't create TranslationOptions as we parse because we don't have the completed source parsed until after this function removes all the markup from it (CreateFromString in Sentence::Read). */ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection ) { //parse XML markup in translation line // no xml tag? we're done. if (line.find_first_of('<') == string::npos) { return true; } // break up input into a vector of xml tags and text // example: (this), (<b>), (is a), (</b>), (test .) vector<string> xmlTokens = TokenizeXml(line); // we need to store opened tags, until they are closed // tags are stored as tripled (tagname, startpos, contents) typedef pair< string, pair< size_t, string > > OpenedTag; vector< OpenedTag > tagStack; // stack that contains active opened tags string cleanLine; // return string (text without xml) size_t wordPos = 0; // position in sentence (in terms of number of words) bool isLinked = false; // loop through the tokens for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) { // not a xml tag, but regular text (may contain many words) if(!isXmlTag(xmlTokens[xmlTokenPos])) { // add a space at boundary, if necessary if (cleanLine.size()>0 && cleanLine[cleanLine.size() - 1] != ' ' && xmlTokens[xmlTokenPos][0] != ' ') { cleanLine += " "; } cleanLine += unescape(xmlTokens[xmlTokenPos]); // add to output wordPos = Tokenize(cleanLine).size(); // count all the words } // process xml tag else { // *** get essential information about tag *** // strip extra boundary spaces and "<" and ">" string tag = Trim(TrimXml(xmlTokens[xmlTokenPos])); // cerr << "XML TAG IS: " << tag << std::endl; if (tag.size() == 0) { cerr << "ERROR: empty tag name: " << line << endl; return false; } // check if unary (e.g., "<wall/>") bool isUnary = ( 
tag[tag.size() - 1] == '/' ); // check if opening tag (e.g. "<a>", not "</a>")g bool isClosed = ( tag[0] == '/' ); bool isOpen = !isClosed; if (isClosed && isUnary) { cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl; return false; } if (isClosed) tag = tag.substr(1); // remove "/" at the beginning if (isUnary) tag = tag.substr(0,tag.size()-1); // remove "/" at the end // find the tag name and contents string::size_type endOfName = tag.find_first_of(' '); string tagName = tag; string tagContent = ""; if (endOfName != string::npos) { tagName = tag.substr(0,endOfName); tagContent = tag.substr(endOfName+1); } // *** process new tag *** if (isOpen || isUnary) { // put the tag on the tag stack OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) ); tagStack.push_back( openedTag ); // cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl; } // *** process completed tag *** if (isClosed || isUnary) { // pop last opened tag from stack; if (tagStack.size() == 0) { cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl; return false; } OpenedTag openedTag = tagStack.back(); tagStack.pop_back(); // tag names have to match if (openedTag.first != tagName) { cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl; return false; } // assemble remaining information about tag size_t startPos = openedTag.second.first; string tagContent = openedTag.second.second; size_t endPos = wordPos; // span attribute overwrites position string span = ParseXmlTagAttribute(tagContent,"span"); if (! 
span.empty()) { vector<string> ij = Tokenize(span, "-"); if (ij.size() != 1 && ij.size() != 2) { cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl; return false; } startPos = atoi(ij[0].c_str()); if (ij.size() == 1) endPos = startPos + 1; else endPos = atoi(ij[1].c_str()) + 1; } // cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl; if (startPos >= endPos) { cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << endl; return false; } string label = ParseXmlTagAttribute(tagContent,"label"); labelCollection.insert( label ); string scoreString = ParseXmlTagAttribute(tagContent,"score"); float score = scoreString == "" ? 0.0f : std::atof(scoreString.c_str()); // report what we have processed so far if (0) { cerr << "XML TAG NAME IS: '" << tagName << "'" << endl; cerr << "XML TAG LABEL IS: '" << label << "'" << endl; cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; } SyntaxNode *node = tree.AddNode( startPos, endPos-1, label ); node->SetScore(score); } } } // we are done. check if there are tags that are still open if (tagStack.size() > 0) { cerr << "ERROR: some opened tags were never closed: " << line << endl; return false; } // collect top labels const vector< SyntaxNode* >& topNodes = tree.GetNodes( 0, wordPos-1 ); for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) { SyntaxNode *n = *node; const string &label = n->GetLabel(); if (topLabelCollection.find( label ) == topLabelCollection.end()) topLabelCollection[ label ] = 0; topLabelCollection[ label ]++; } // return de-xml'ed sentence in line line = cleanLine; return true; }
/**
 * Adds SAMT-style combined labels to the tree.
 *
 * Phase 1 walks the split points of every parent:
 *   - each pair of neighbouring children A, B yields a node "A+B";
 *   - with at least four split points, the parent P additionally yields
 *     "P\A" (parent minus its first child) and "P/Z" (parent minus its last
 *     child).
 * Phase 2 (only when SAMTLevel > 1) visits every span of two or more words
 * that is shorter than the sentence and has no node yet, and labels it with
 * the first rule that matches:
 *   - two adjacent constituents A, B covering the span:   "A++B"
 *   - constituent A starting at the span minus a constituent B that follows
 *     the span:                                           "A//B"
 *   - constituent A ending at the span minus a constituent B that precedes
 *     the span:                                           "A\\B"
 *   - otherwise, when SAMTLevel >= 4:                     "_FAIL"
 *
 * New nodes are collected in a scratch tree first, so lookups in `tree` only
 * ever see the original constituents; they are copied into `tree` at the end.
 *
 * @param tree     tree to read constituents from and to add the new nodes to
 * @param parents  split points of each parent node
 */
void SAMT( SyntaxTree &tree, ParentNodes &parents )
{
  int numWords = tree.GetNumWords();

  SyntaxTree newTree; // to store new nodes

  // look through parents to combine children
  for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); ++p) {
    const SplitPoints &point = *p;
    const size_t ps = point.size();

    // neighboring children: DET+ADJ
    if (ps >= 3) {
      // cerr << "complex parent: ";
      // for(size_t i=0;i<ps;i++) cerr << point[i] << " ";
      // cerr << endl;
      for(size_t i = 0; i+2 < ps; i++) {
        // cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i  ],point[i+1]-1)[0]->GetLabel() << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() << endl;

        newTree.AddNode( point[i],point[i+2]-1,
                         tree.GetNodes(point[i  ],point[i+1]-1)[0]->GetLabel()
                         + "+" +
                         tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() );
      }
    }

    // parent minus first child (P\A) and parent minus last child (P/Z)
    if (ps >= 4) {
      string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->GetLabel();

      // cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() << endl;
      newTree.AddNode( point[1],point[ps-1]-1,
                       topLabel
                       + "\\" +
                       tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() );

      // cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() << endl;
      newTree.AddNode( point[0],point[ps-2]-1,
                       topLabel
                       + "/" +
                       tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() );
    }
  }

  // rules for any bordering constituents...
  // none of them apply at SAMT level 1 or below, so the span scan is skipped
  if (SAMTLevel > 1) {
    for(int size = 2; size < numWords; size++) {
      for(int start = 0; start < numWords-size+1; start++) {
        int end = start+size-1;

        // span already has a label
        if (tree.HasNode( start,end ) || newTree.HasNode( start,end )) {
          continue;
        }

        bool done = false;

        // if matching two adjacent parse constituents: use ++
        for(int mid=start+1; mid<=end && !done; mid++) {
          if (tree.HasNode(start,mid-1) && tree.HasNode(mid,end)) {
            // cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->GetLabel() << "++" << tree.GetNodes(mid,  end  )[0]->GetLabel() << endl;

            newTree.AddNode( start, end,
                             tree.GetNodes(start,mid-1)[0]->GetLabel()
                             + "++" +
                             tree.GetNodes(mid,  end  )[0]->GetLabel() );
            done = true;
          }
        }
        if (done) continue;

        // if matching a constituent A right-minus const. B: use A//B
        for(int postEnd=end+1; postEnd<numWords && !done; postEnd++) {
          if (tree.HasNode(start,postEnd) && tree.HasNode(end+1,postEnd)) {
            newTree.AddNode( start, end,
                             tree.GetNodes(start,postEnd)[0]->GetLabel()
                             + "//" +
                             tree.GetNodes(end+1,postEnd)[0]->GetLabel() );
            done = true;
          }
        }
        if (done) continue;

        // if matching a constituent A left-minus constituent B: use A\\B
        // (stops at the first match, like the two scans above; without the
        // !done guard every matching preStart added another node for this span)
        for(int preStart=start-1; preStart>=0 && !done; preStart--) {
          if (tree.HasNode(preStart,end) && tree.HasNode(preStart,start-1)) {
            // cerr << "\tadding " << tree.GetNodes(preStart,end    )[0]->GetLabel() << "\\\\" <<tree.GetNodes(preStart,start-1)[0]->GetLabel() << endl;
            newTree.AddNode( start, end,
                             tree.GetNodes(preStart,end    )[0]->GetLabel()
                             + "\\\\" +
                             tree.GetNodes(preStart,start-1)[0]->GetLabel() );
            done = true;
          }
        }
        if (done) continue;

        // if matching three consecutive constituents, use double-plus
        // SAMT Level 3, not yet implemented

        // else: assign default category _FAIL
        if (SAMTLevel>=4) {
          newTree.AddNode( start, end, "_FAIL" );
        }
      }
    }
  }

  // adding all new nodes
  vector< SyntaxNode* > nodes = newTree.GetAllNodes();
  for( size_t i=0; i<nodes.size(); i++ ) {
    tree.AddNode( nodes[i]->GetStart(), nodes[i]->GetEnd(), nodes[i]->GetLabel());
  }
}