std::string *AttachContextGenerator::getContext(Parse constituents[], int index, std::vector<Parse*> &rightFrontier, int rfi) { std::vector<std::string> features = std::vector<std::string>(100); int nodeDistance = rfi; Parse *fn = rightFrontier[rfi]; Parse *fp = 0; if (rfi + 1 < rightFrontier.size()) { fp = rightFrontier[rfi + 1]; } Parse *p_1 = 0; if (rightFrontier.size() > 0) { p_1 = rightFrontier[0]; } Parse *p0 = constituents[index]; Parse *p1 = 0; if (index + 1 < sizeof(constituents) / sizeof(constituents[0])) { p1 = constituents[index + 1]; } Collection<Parse*> *punct1s = 0; Collection<Parse*> *punct_1s = 0; Collection<Parse*> *punct_1fs = 0; punct_1fs = fn->getPreviousPunctuationSet(); punct_1s = p0->getPreviousPunctuationSet(); punct1s = p0->getNextPunctuationSet(); std::string consfp = cons(fp,-3); std::string consf = cons(fn,-2); std::string consp_1 = cons(p_1,-1); std::string consp0 = cons(p0,0); std::string consp1 = cons(p1,1); std::string consbofp = consbo(fp,-3); std::string consbof = consbo(fn,-2); std::string consbop_1 = consbo(p_1,-1); std::string consbop0 = consbo(p0,0); std::string consbop1 = consbo(p1,1); Cons *cfp = new Cons(consfp,consbofp,-3,true); Cons *cf = new Cons(consf,consbof,-2,true); Cons *c_1 = new Cons(consp_1,consbop_1,-1,true); Cons *c0 = new Cons(consp0,consbop0,0,true); Cons *c1 = new Cons(consp1,consbop1,1,true); //default features.push_back("default"); //unigrams features.push_back(consfp); features.push_back(consbofp); features.push_back(consf); features.push_back(consbof); features.push_back(consp_1); features.push_back(consbop_1); features.push_back(consp0); features.push_back(consbop0); features.push_back(consp1); features.push_back(consbop1); //productions std::string prod = production(fn,false); //String punctProd = production(fn,true,punctSet); features.push_back("pn=" + prod); features.push_back("pd=" + prod + "," + p0->getType()); features.push_back("ps=" + fn->getType() + "->" + fn->getType() + "," + p0->getType()); if (punct_1s != 0) { StringBuffer *punctBuf = new StringBuffer(5); for (Collection<Parse*>::const_iterator pi = punct_1s->begin(); pi != punct_1s->end(); ++pi) { Parse *punct = *pi; punctBuf->append(punct->getType())->append(","); } //features.add("ppd="+punctProd+","+punctBuf.toString()+p0.getType()); //features.add("pps="+fn.getType()+"->"+fn.getType()+","+punctBuf.toString()+p0.getType()); } //bi-grams //cons(fn),cons(0) cons2(features,cfp,c0,punct_1s,true); cons2(features,cf,c0,punct_1s,true); cons2(features,c_1,c0,punct_1s,true); cons2(features,c0,c1,punct1s,true); cons3(features,cf,c_1,c0,0,punct_1s,true,true,true); cons3(features,cf,c0,c1,punct_1s,punct1s,true,true,true); cons3(features,cfp,cf,c0,0,punct_1s,true,true,true); /* for (int ri=0;ri<rfi;ri++) { Parse jn = (Parse) rightFrontier.get(ri); features.add("jn="+jn.getType()); } */ int headDistance = (p0->getHeadIndex() - fn->getHeadIndex()); features.push_back("hd=" + headDistance); features.push_back("nd=" + nodeDistance); features.push_back("nd=" + p0->getType() + "." + nodeDistance); features.push_back("hd=" + p0->getType() + "." + headDistance); //features.add("fs="+rightFrontier.size()); //paired punct features if (containsPunct(punct_1s,"''")) { if (containsPunct(punct_1fs,"``")) { features.push_back("quotematch"); //? not generating feature correctly } else { //features.add("noquotematch"); } } return features.toArray(new std::string[features.size()]); }
void ParserEventStream::addParseEvents(std::vector<Event*> &parseEvents, Parse chunks[]) { /// <summary> /// Frontier nodes built from node in a completed parse. Specifically, /// they have all their children regardless of the stage of parsing. /// </summary> std::vector<Parse*> rightFrontier = std::vector<Parse*>(); std::vector<Parse*> builtNodes = std::vector<Parse*>(); /// <summary> /// Nodes which characterize what the parse looks like to the parser as its being built. /// Specifically, these nodes don't have all their children attached like the parents of /// the chunk nodes do. /// </summary> Parse currentChunks[sizeof(chunks) / sizeof(chunks[0])]; for (int ci = 0;ci < sizeof(chunks) / sizeof(chunks[0]);ci++) { currentChunks[ci] = static_cast<Parse*>(chunks[ci]->clone()); currentChunks[ci]->setPrevPunctuation(chunks[ci]->getPreviousPunctuationSet()); currentChunks[ci]->setNextPunctuation(chunks[ci]->getNextPunctuationSet()); currentChunks[ci]->setLabel(Parser::COMPLETE); chunks[ci]->setLabel(Parser::COMPLETE); } for (int ci = 0;ci < sizeof(chunks) / sizeof(chunks[0]);ci++) { //System.err.println("parserEventStream.addParseEvents: chunks="+Arrays.asList(chunks)); Parse *parent = chunks[ci]->getParent(); Parse *prevParent = chunks[ci]; int off = 0; //build un-built parents if (!chunks[ci]->isPosTag()) { builtNodes.push_back(off++,chunks[ci]); } //perform build stages while (parent->getType() != AbstractBottomUpParser::TOP_NODE && parent->getLabel() == "") { if (parent->getLabel() == "" && prevParent->getType() != parent->getType()) { //build level if (debug) System::err::println("Build: " + parent->getType() + " for: " + currentChunks[ci]); if (etype == opennlp::tools::parser::BUILD) { parseEvents.push_back(new Event(parent->getType(), buildContextGenerator->getContext(currentChunks, ci))); } builtNodes.push_back(off++,parent); Parse *newParent = new Parse(currentChunks[ci]->getText(),currentChunks[ci]->getSpan(),parent->getType(),1,0); newParent->add(currentChunks[ci],rules); newParent->setPrevPunctuation(currentChunks[ci]->getPreviousPunctuationSet()); newParent->setNextPunctuation(currentChunks[ci]->getNextPunctuationSet()); currentChunks[ci]->setParent(newParent); currentChunks[ci] = newParent; newParent->setLabel(Parser::BUILT); //see if chunk is complete if (lastChild(chunks[ci], parent)) { if (etype == opennlp::tools::parser::CHECK) { parseEvents.push_back(new Event(Parser::COMPLETE, checkContextGenerator->getContext(currentChunks[ci],currentChunks, ci,false))); } currentChunks[ci]->setLabel(Parser::COMPLETE); parent->setLabel(Parser::COMPLETE); } else { if (etype == opennlp::tools::parser::CHECK) { parseEvents.push_back(new Event(Parser::INCOMPLETE, checkContextGenerator->getContext(currentChunks[ci],currentChunks,ci,false))); } currentChunks[ci]->setLabel(Parser::INCOMPLETE); parent->setLabel(Parser::COMPLETE); } chunks[ci] = parent; //System.err.println("build: "+newParent+" for "+parent); } //TODO: Consider whether we need to set this label or train parses at all. parent->setLabel(Parser::BUILT); prevParent = parent; parent = parent->getParent(); } //decide to attach if (etype == opennlp::tools::parser::BUILD) { parseEvents.push_back(new Event(Parser::DONE, buildContextGenerator->getContext(currentChunks, ci))); } //attach node std::string attachType = ""; /// <summary> /// Node selected for attachment. </summary> Parse *attachNode = 0; int attachNodeIndex = -1; if (ci == 0) { Parse *top = new Parse(currentChunks[ci]->getText(),new Span(0,currentChunks[ci]->getText()->length()),AbstractBottomUpParser::TOP_NODE,1,0); top->insert(currentChunks[ci]); } else { /// <summary> /// Right frontier consisting of partially-built nodes based on current state of the parse. </summary> std::vector<Parse*> currentRightFrontier = Parser::getRightFrontier(currentChunks[0],punctSet); if (currentRightFrontier.size() != rightFrontier.size()) { System::err::println("fontiers mis-aligned: " + currentRightFrontier.size() + " != " + rightFrontier.size() + " " + currentRightFrontier + " " + rightFrontier); exit(1); } Map<Parse*, int> *parents = getNonAdjoinedParent(chunks[ci]); //try daughters first. for (int cfi = 0;cfi < currentRightFrontier.size();cfi++) { Parse *frontierNode = rightFrontier[cfi]; Parse *cfn = currentRightFrontier[cfi]; if (!Parser::checkComplete || Parser::COMPLETE != cfn->getLabel()) { int i = parents->get(frontierNode); if (debug) System::err::println("Looking at attachment site (" + cfi + "): " + cfn->getType() + " ci=" + i + " cs=" + nonPunctChildCount(cfn) + ", " + cfn + " :for " + currentChunks[ci]->getType() + " " + currentChunks[ci] + " -> " + parents); if (attachNode == 0 && i != 0 && i == nonPunctChildCount(cfn)) { attachType = Parser::ATTACH_DAUGHTER; attachNodeIndex = cfi; attachNode = cfn; if (etype == opennlp::tools::parser::ATTACH) { parseEvents.push_back(new Event(attachType, attachContextGenerator->getContext(currentChunks, ci, currentRightFrontier, attachNodeIndex))); } //System.err.println("daughter attach "+attachNode+" at "+fi); } } else { if (debug) System::err::println("Skipping (" + cfi + "): " + cfn->getType() + "," + cfn->getPreviousPunctuationSet() + " " + cfn + " :for " + currentChunks[ci]->getType() + " " + currentChunks[ci] + " -> " + parents); } // Can't attach past first incomplete node. if (Parser::checkComplete && cfn->getLabel() == Parser::INCOMPLETE) { if (debug) System::err::println("breaking on incomplete:" + cfn->getType() + " " + cfn); break; } } //try sisters, and generate non-attach events. for (int cfi = 0;cfi < currentRightFrontier.size();cfi++) { Parse *frontierNode = rightFrontier[cfi]; Parse *cfn = currentRightFrontier[cfi]; if (attachNode == 0 && parents->containsKey(frontierNode->getParent()) && frontierNode->getType() == frontierNode->getParent()->getType()) //&& frontierNode.getParent().getLabel() == null) { { attachType = Parser::ATTACH_SISTER; attachNode = cfn; attachNodeIndex = cfi; if (etype == opennlp::tools::parser::ATTACH) { parseEvents.push_back(new Event(Parser::ATTACH_SISTER, attachContextGenerator->getContext(currentChunks, ci, currentRightFrontier, cfi))); } chunks[ci]->getParent()->setLabel(Parser::BUILT); //System.err.println("in search sister attach "+attachNode+" at "+cfi); } else if (cfi == attachNodeIndex) { //skip over previously attached daughter. } else { if (etype == opennlp::tools::parser::ATTACH) { parseEvents.push_back(new Event(Parser::NON_ATTACH, attachContextGenerator->getContext(currentChunks, ci, currentRightFrontier, cfi))); } } //Can't attach past first incomplete node. if (Parser::checkComplete && cfn->getLabel() == Parser::INCOMPLETE) { if (debug) System::err::println("breaking on incomplete:" + cfn->getType() + " " + cfn); break; } } //attach Node if (attachNode != 0) { if (attachType == Parser::ATTACH_DAUGHTER) { Parse *daughter = currentChunks[ci]; if (debug) System::err::println("daughter attach a=" + attachNode->getType() + ":" + attachNode + " d=" + daughter + " com=" + lastChild(chunks[ci], rightFrontier[attachNodeIndex])); attachNode->add(daughter,rules); daughter->setParent(attachNode); if (lastChild(chunks[ci], rightFrontier[attachNodeIndex])) { if (etype == opennlp::tools::parser::CHECK) { parseEvents.push_back(new Event(Parser::COMPLETE, checkContextGenerator->getContext(attachNode,currentChunks,ci,true))); } attachNode->setLabel(Parser::COMPLETE); } else { if (etype == opennlp::tools::parser::CHECK) { parseEvents.push_back(new Event(Parser::INCOMPLETE, checkContextGenerator->getContext(attachNode,currentChunks,ci,true))); } } } else if (attachType == Parser::ATTACH_SISTER) { Parse *frontierNode = rightFrontier[attachNodeIndex]; rightFrontier[attachNodeIndex] = frontierNode->getParent(); Parse *sister = currentChunks[ci]; if (debug) System::err::println("sister attach a=" + attachNode->getType() + ":" + attachNode + " s=" + sister + " ap=" + attachNode->getParent() + " com=" + lastChild(chunks[ci], rightFrontier[attachNodeIndex])); Parse *newParent = attachNode->getParent()->adjoin(sister,rules); newParent->setParent(attachNode->getParent()); attachNode->setParent(newParent); sister->setParent(newParent); if (attachNode == currentChunks[0]) { currentChunks[0] = newParent; } if (lastChild(chunks[ci], rightFrontier[attachNodeIndex])) { if (etype == opennlp::tools::parser::CHECK) { parseEvents.push_back(new Event(Parser::COMPLETE, checkContextGenerator->getContext(newParent,currentChunks,ci,true))); } newParent->setLabel(Parser::COMPLETE); } else { if (etype == opennlp::tools::parser::CHECK) { parseEvents.push_back(new Event(Parser::INCOMPLETE, checkContextGenerator->getContext(newParent,currentChunks,ci,true))); } newParent->setLabel(Parser::INCOMPLETE); } } //update right frontier for (int ni = 0;ni < attachNodeIndex;ni++) { //System.err.println("removing: "+rightFrontier.get(0)); rightFrontier.remove(0); } } else { //System.err.println("No attachment!"); throw std::exception("No Attachment: " + chunks[ci]); } } rightFrontier.addAll(0,builtNodes); builtNodes.clear(); } }