/**
 * Left-binarize every parent that has more than two children.
 *
 * Each entry of `parents` is a list of split points: point[0] is the start
 * of the parent span and the last entry minus one is used as its end
 * (presumably one entry per child boundary, as built by a routine such as
 * SyntaxTree::Parse -- confirm against the caller).
 *
 * For a parent with more than three split points, a node labelled
 * "^" + <label of the first node spanning the parent> is added for each
 * prefix point[0] .. point[i]-1 with i = 2 .. point.size()-2.
 *
 * @param tree    tree that is queried for the parent label and receives the
 *                new nodes
 * @param parents split-point lists to process (not modified)
 */
void LeftBinarize( SyntaxTree &tree, ParentNodes &parents )
{
  for (ParentNodes::const_iterator p = parents.begin(); p != parents.end(); ++p) {
    const SplitPoints &point = *p;

    // three or fewer split points: no intermediate node to insert
    if (point.size() <= 3) {
      continue;
    }

    const std::vector< SyntaxNode* > &topNodes =
      tree.GetNodes( point[0], point[point.size()-1]-1 );

    // Indexing an empty vector is undefined behaviour; skip parents for
    // which the tree reports no node over the full span.
    if (topNodes.empty()) {
      continue;
    }

    // Copy the label before the first AddNode call: topNodes refers to
    // storage handed out by the tree, and whether AddNode keeps that
    // reference valid is not visible from here.
    const std::string topLabel = topNodes[0]->GetLabel();
    const std::string binarizedLabel = "^" + topLabel;

    for (size_t i = 2; i < point.size()-1; ++i) {
      tree.AddNode( point[0], point[i]-1, binarizedLabel );
    }
  }
}
/**
 * Right-binarize every parent that has more than two children.
 *
 * Each entry of `parents` is a list of split points: point[0] is the start
 * of the parent span and the last entry minus one is used as its end
 * (presumably one entry per child boundary -- confirm against the caller).
 *
 * For a parent with more than three split points, a node labelled
 * "^" + <label of the first node spanning the parent> is added for each
 * suffix point[i] .. end with i = 1 .. point.size()-3.
 *
 * @param tree    node collection that is queried for the parent label and
 *                receives the new nodes
 * @param parents split-point lists to process (not modified)
 */
void RightBinarize( SyntaxNodeCollection &tree, ParentNodes &parents )
{
  for (ParentNodes::const_iterator p = parents.begin(); p != parents.end(); ++p) {
    const SplitPoints &point = *p;

    // three or fewer split points: no intermediate node to insert
    if (point.size() <= 3) {
      continue;
    }

    const int endPoint = point[point.size()-1]-1;
    const std::vector< SyntaxNode* > &topNodes = tree.GetNodes( point[0], endPoint );

    // Indexing an empty vector is undefined behaviour; skip parents for
    // which the collection reports no node over the full span.
    if (topNodes.empty()) {
      continue;
    }

    // Copy the label before the first AddNode call: topNodes refers to
    // storage handed out by the collection, and whether AddNode keeps that
    // reference valid is not visible from here.
    const std::string topLabel = topNodes[0]->label;
    const std::string binarizedLabel = "^" + topLabel;

    for (size_t i = 1; i < point.size()-2; ++i) {
      tree.AddNode( point[i], endPoint, binarizedLabel );
    }
  }
}
/**
 * Compute, for every node span of length >= 2, how it splits into children.
 *
 * Spans are visited by increasing length and, within a length, by
 * increasing start position. For each span that has a node, the result
 * holds one SplitPoints list: the span's start position, followed by the
 * position one past the end of each child. Children are picked greedily
 * from left to right: at each uncovered position the longest sub-span that
 * has a node is taken (the first child may not be the parent span itself).
 *
 * If no node starts at the first uncovered position, the search for that
 * parent stops and the partial list is stored, so the last split point is
 * not guaranteed to be the parent's end.
 *
 * @param nodeColl collection queried via GetNumWords() and HasNode()
 * @return one split-point list per parent span, in visiting order
 */
ParentNodes determineSplitPoints(const SyntaxNodeCollection &nodeColl)
{
  ParentNodes parents;

  // Convert once so that all span arithmetic below is done in int rather
  // than mixing signed loop counters with an unsigned word count.
  const int numWords = static_cast<int>(nodeColl.GetNumWords());

  // looping through all spans of size >= 2
  for (int length = 2; length <= numWords; ++length) {
    for (int startPos = 0; startPos + length <= numWords; ++startPos) {
      if (!nodeColl.HasNode( startPos, startPos+length-1 )) {
        continue;
      }

      // processing one (parent) span
      SplitPoints splitPoints;
      splitPoints.push_back( startPos );

      bool beforeFirstChild = true; // excludes the parent span from the first scan
      bool foundChild = true;       // ends the loop once a scan finds nothing
      int covered = 0;              // number of parent positions already assigned

      while (covered < length && foundChild) {
        foundChild = false;

        // find the largest sub-span (child) starting at the last covered
        // position
        const int longest = beforeFirstChild ? length-1 : length;
        for (int midPos = longest; midPos > covered; --midPos) {
          if (nodeColl.HasNode( startPos+covered, startPos+midPos-1 )) {
            covered = midPos;
            splitPoints.push_back( startPos+covered );
            beforeFirstChild = false;
            foundChild = true;
            break;
          }
        }
      }

      parents.push_back( splitPoints );
    }
  }

  return parents;
}
/**
 * Compute, for every node span of length >= 2, how it splits into children.
 *
 * Spans are visited by increasing length and, within a length, by
 * increasing start position; the number of positions is taken from
 * m_index.size(). For each span that has a node, the result holds one
 * SplitPoints list: the span's start position, followed by the position
 * one past the end of each child. Children are picked greedily from left
 * to right: at each uncovered position the longest sub-span that has a
 * node is taken (the first child may not be the parent span itself).
 *
 * If no node starts at the first uncovered position, the search for that
 * parent stops and the partial list is stored; without this check the
 * covering loop would never terminate. In that case the last split point
 * is not the parent's end, so callers should not rely on full coverage.
 *
 * @return one split-point list per parent span, in visiting order
 */
ParentNodes SyntaxTree::Parse()
{
  ParentNodes parents;

  const int size = static_cast<int>(m_index.size());

  // looping through all spans of size >= 2
  for (int length = 2; length <= size; ++length) {
    for (int startPos = 0; startPos <= size-length; ++startPos) {
      if (!HasNode( startPos, startPos+length-1 )) {
        continue;
      }

      // processing one (parent) span
      SplitPoints splitPoints;
      splitPoints.push_back( startPos );

      bool beforeFirstChild = true; // excludes the parent span from the first scan
      bool foundChild = true;       // ends the loop once a scan finds nothing
      int covered = 0;              // number of parent positions already assigned

      while (covered < length && foundChild) {
        foundChild = false;

        // find the largest sub-span (child) starting at the last covered
        // position
        const int longest = beforeFirstChild ? length-1 : length;
        for (int midPos = longest; midPos > covered; --midPos) {
          if (HasNode( startPos+covered, startPos+midPos-1 )) {
            covered = midPos;
            splitPoints.push_back( startPos+covered );
            beforeFirstChild = false;
            foundChild = true;
            break;
          }
        }
      }

      parents.push_back( splitPoints );
    }
  }

  return parents;
}
/**
 * Add SAMT-style relaxed labels to `tree`.
 *
 * New nodes are first collected in a scratch tree and copied into `tree`
 * at the end, so the lookups below only ever see the nodes `tree` had on
 * entry.
 *
 * Step 1 (per parent split-point list):
 *   - every two neighbouring children X Y get a node "X+Y";
 *   - with at least three children, the parent P minus its first child F
 *     gets "P\F" and the parent minus its last child L gets "P/L".
 *
 * Step 2 (only if the externally defined SAMTLevel is > 1), for every span
 * of length 2 .. numWords-1 that has no node in `tree` or the scratch tree:
 *   - "A++B" if the span is exactly two adjacent nodes A B;
 *   - else "A//B" if the span is a node A with a node B cut off its right;
 *   - else "A\\B" if the span is a node A with a node B cut off its left;
 *   - else "_FAIL" if SAMTLevel >= 4.
 *
 * @param tree    tree that is queried and finally extended
 * @param parents split-point lists (start, then one-past-end of each child
 *                -- presumably as built by SyntaxTree::Parse; confirm
 *                against the caller); not modified
 */
void SAMT( SyntaxTree &tree, ParentNodes &parents )
{
  const int numWords = tree.GetNumWords();

  SyntaxTree newTree; // to store new nodes

  // look through parents to combine children
  for (ParentNodes::const_iterator p = parents.begin(); p != parents.end(); ++p) {
    const SplitPoints &point = *p;
    const int numPoints = static_cast<int>(point.size());

    // neighboring children: DET+ADJ
    // (the loop body runs only when there are at least three split points)
    for (int i = 0; i+2 < numPoints; ++i) {
      const std::string leftLabel  = tree.GetNodes(point[i  ],point[i+1]-1)[0]->GetLabel();
      const std::string rightLabel = tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel();
      newTree.AddNode( point[i], point[i+2]-1, leftLabel + "+" + rightLabel );
    }

    // parent minus its first child / minus its last child
    if (numPoints >= 4) {
      const int parentEnd = point[numPoints-1]-1;
      const std::vector< SyntaxNode* > &topNodes = tree.GetNodes(point[0],parentEnd);

      // Indexing an empty vector is undefined behaviour; skip if the tree
      // reports no node over the span described by the split points.
      if (!topNodes.empty()) {
        const std::string topLabel = topNodes[0]->GetLabel();

        newTree.AddNode( point[1], parentEnd,
                         topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() );

        newTree.AddNode( point[0], point[numPoints-2]-1,
                         topLabel + "/" + tree.GetNodes(point[numPoints-2],parentEnd)[0]->GetLabel() );
      }
    }
  }

  // rules for any bordering constituents...
  // SAMTLevel does not change inside the loops, so test it once instead of
  // once per span.
  if (SAMTLevel > 1) {
    for (int size = 2; size < numWords; ++size) {
      for (int start = 0; start+size <= numWords; ++start) {
        const int end = start+size-1;

        // span already has a label
        if (tree.HasNode( start,end ) || newTree.HasNode( start,end )) {
          continue;
        }

        bool done = false;

        // if matching two adjacent parse constituents: use ++
        for (int mid = start+1; mid <= end; ++mid) {
          if (tree.HasNode(start,mid-1) && tree.HasNode(mid,end)) {
            newTree.AddNode( start, end,
                             tree.GetNodes(start,mid-1)[0]->GetLabel()
                             + "++"
                             + tree.GetNodes(mid,end)[0]->GetLabel() );
            done = true;
            break; // first match only
          }
        }
        if (done) continue;

        // if matching a constituent A right-minus const. B: use A//B
        for (int postEnd = end+1; postEnd < numWords; ++postEnd) {
          if (tree.HasNode(start,postEnd) && tree.HasNode(end+1,postEnd)) {
            newTree.AddNode( start, end,
                             tree.GetNodes(start,postEnd)[0]->GetLabel()
                             + "//"
                             + tree.GetNodes(end+1,postEnd)[0]->GetLabel() );
            done = true;
            break; // first match only
          }
        }
        if (done) continue;

        // if matching a constituent A left-minus constituent B: use A\\B
        // NOTE(review): unlike the two loops above, this one does not stop
        // at the first match, so one span can receive several A\\B labels.
        // Kept as in the original -- confirm whether this is intended.
        for (int preStart = start-1; preStart >= 0; --preStart) {
          if (tree.HasNode(preStart,end) && tree.HasNode(preStart,start-1)) {
            newTree.AddNode( start, end,
                             tree.GetNodes(preStart,end)[0]->GetLabel()
                             + "\\\\"
                             + tree.GetNodes(preStart,start-1)[0]->GetLabel() );
            done = true;
          }
        }
        if (done) continue;

        // if matching three consecutive constituents, use double-plus
        // SAMT Level 3, not yet implemented

        // else: assign default category _FAIL
        if (SAMTLevel >= 4) {
          newTree.AddNode( start, end, "_FAIL" );
        }
      }
    }
  }

  // adding all new nodes
  const std::vector< SyntaxNode* > &nodes = newTree.GetAllNodes();
  for (size_t i = 0; i < nodes.size(); ++i) {
    tree.AddNode( nodes[i]->GetStart(), nodes[i]->GetEnd(), nodes[i]->GetLabel() );
  }
}