/// <summary>
/// Builds the feature strings describing a candidate attachment of
/// constituents[index] to the right-frontier node rightFrontier[rfi].
/// The features are emitted in this order: "default", the cons/consbo
/// unigrams of the five context nodes, the production features
/// ("pn=", "pd=", "ps="), whatever cons2/cons3 append, the head/node
/// distance features ("hd=", "nd=") and, when applicable, "quotematch".
/// </summary>
/// <param name="constituents"> Array of parse nodes; element index is the node being attached. </param>
/// <param name="index"> Position in constituents of the node being attached. </param>
/// <param name="rightFrontier"> Candidate attachment sites; must contain element rfi. </param>
/// <param name="rfi"> Index into rightFrontier of the candidate site; also used as the node distance. </param>
/// <returns> An array allocated with new[] that holds the features. The caller
/// owns it (delete[]); the element count is not reported through this interface. </returns>
std::string *AttachContextGenerator::getContext(Parse constituents[], int index, std::vector<Parse*> &rightFrontier, int rfi)
                {
                  // Reserve capacity only: the vector(count) constructor would insert
                  // 100 empty strings ahead of the real features.
                  std::vector<std::string> features;
                  features.reserve(100);

                  int nodeDistance = rfi;

                  // fn: the candidate frontier node, fp: the frontier node after it (if any),
                  // p_1: the first frontier node (if any).
                  Parse *fn = rightFrontier[rfi];
                  Parse *fp = 0;
                  if (rfi + 1 < rightFrontier.size())
                  {
                    fp = rightFrontier[rfi + 1];
                  }
                  Parse *p_1 = 0;
                  if (rightFrontier.size() > 0)
                  {
                    p_1 = rightFrontier[0];
                  }

                  // p0: the node being attached, p1: the node following it (if any).
                  Parse *p0 = constituents[index];
                  Parse *p1 = 0;
                  // FIXME(review): constituents is an array parameter, so sizeof measures the
                  // decayed pointer rather than the caller's array; this bound does not
                  // reflect the real element count. The true length is not available
                  // through this signature, so the original check is kept as-is.
                  if (index + 1 < sizeof(constituents) / sizeof(constituents[0]))
                  {
                    p1 = constituents[index + 1];
                  }

                  Collection<Parse*> *punct_1fs = fn->getPreviousPunctuationSet();
                  Collection<Parse*> *punct_1s = p0->getPreviousPunctuationSet();
                  Collection<Parse*> *punct1s = p0->getNextPunctuationSet();

                  std::string consfp = cons(fp,-3);
                  std::string consf = cons(fn,-2);
                  std::string consp_1 = cons(p_1,-1);
                  std::string consp0 = cons(p0,0);
                  std::string consp1 = cons(p1,1);

                  std::string consbofp = consbo(fp,-3);
                  std::string consbof = consbo(fn,-2);
                  std::string consbop_1 = consbo(p_1,-1);
                  std::string consbop0 = consbo(p0,0);
                  std::string consbop1 = consbo(p1,1);

                  // Automatic storage instead of new: these objects were never deleted.
                  // Assumes cons2/cons3 only read their Cons arguments during the call
                  // and do not keep the pointers - TODO confirm.
                  Cons cfp(consfp,consbofp,-3,true);
                  Cons cf(consf,consbof,-2,true);
                  Cons c_1(consp_1,consbop_1,-1,true);
                  Cons c0(consp0,consbop0,0,true);
                  Cons c1(consp1,consbop1,1,true);

                  //default
                  features.push_back("default");

                  //unigrams
                  features.push_back(consfp);
                  features.push_back(consbofp);
                  features.push_back(consf);
                  features.push_back(consbof);
                  features.push_back(consp_1);
                  features.push_back(consbop_1);
                  features.push_back(consp0);
                  features.push_back(consbop0);
                  features.push_back(consp1);
                  features.push_back(consbop1);

                  //productions
                  std::string prod = production(fn,false);
                  //String punctProd = production(fn,true,punctSet);
                  features.push_back("pn=" + prod);
                  features.push_back("pd=" + prod + "," + p0->getType());
                  features.push_back("ps=" + fn->getType() + "->" + fn->getType() + "," + p0->getType());
                  // The punctuation-aware production features below are disabled, so the
                  // punctuation buffer that only fed them is no longer built (it was leaked).
                  //features.add("ppd="+punctProd+","+punctBuf.toString()+p0.getType());
                  //features.add("pps="+fn.getType()+"->"+fn.getType()+","+punctBuf.toString()+p0.getType());

                  //bi-grams
                  //cons(fn),cons(0)
                  cons2(features,&cfp,&c0,punct_1s,true);
                  cons2(features,&cf,&c0,punct_1s,true);
                  cons2(features,&c_1,&c0,punct_1s,true);
                  cons2(features,&c0,&c1,punct1s,true);
                  //tri-grams
                  cons3(features,&cf,&c_1,&c0,0,punct_1s,true,true,true);
                  cons3(features,&cf,&c0,&c1,punct_1s,punct1s,true,true,true);
                  cons3(features,&cfp,&cf,&c0,0,punct_1s,true,true,true);
                  /*
                  for (int ri=0;ri<rfi;ri++) {
                    Parse jn = (Parse) rightFrontier.get(ri);
                    features.add("jn="+jn.getType());
                  }
                  */

                  // Convert the distances explicitly: "hd=" + int is pointer arithmetic on
                  // the literal, and std::string + int does not exist.
                  int headDistance = (p0->getHeadIndex() - fn->getHeadIndex());
                  const std::string headDistanceText = std::to_string(headDistance);
                  const std::string nodeDistanceText = std::to_string(nodeDistance);
                  features.push_back("hd=" + headDistanceText);
                  features.push_back("nd=" + nodeDistanceText);

                  features.push_back("nd=" + p0->getType() + "." + nodeDistanceText);
                  features.push_back("hd=" + p0->getType() + "." + headDistanceText);
                  //features.add("fs="+rightFrontier.size());
                  //paired punct features
                  if (containsPunct(punct_1s,"''") && containsPunct(punct_1fs,"``"))
                  {
                    features.push_back("quotematch"); //? not generating feature correctly
                  }
                  //else: features.add("noquotematch") is disabled.

                  // std::vector has no toArray(); copy into the heap array the caller receives.
                  std::string *result = new std::string[features.size()];
                  for (std::vector<std::string>::size_type fi = 0; fi < features.size(); ++fi)
                  {
                    result[fi] = features[fi];
                  }
                  return result;
                }
// Example #2
// 0
                /// <summary>
                /// Appends training events for the given chunks to parseEvents.
                /// For every chunk it walks up the chunk's parents, emitting BUILD and
                /// CHECK events (depending on etype) while mirroring each built level in
                /// currentChunks, then picks a daughter or sister attachment site on the
                /// right frontier and emits ATTACH / CHECK events for it.
                /// Labels of the nodes reachable from chunks are modified, and chunks[ci]
                /// is replaced by its built parent as levels are built.
                /// </summary>
                /// <param name="parseEvents"> Output list; new Event objects are appended to it. </param>
                /// <param name="chunks"> The chunk nodes to process; modified in place. </param>
                void ParserEventStream::addParseEvents(std::vector<Event*> &parseEvents, Parse chunks[])
                {
                  /// <summary>
                  /// Frontier nodes built from node in a completed parse.  Specifically,
                  /// they have all their children regardless of the stage of parsing.
                  /// </summary>
                  std::vector<Parse*> rightFrontier = std::vector<Parse*>();
                  std::vector<Parse*> builtNodes = std::vector<Parse*>();
                  /// <summary>
                  /// Nodes which characterize what the parse looks like to the parser as its being built.
                  /// Specifically, these nodes don't have all their children attached like the parents of
                  /// the chunk nodes do.
                  /// </summary>
                  // FIXME(review): chunks is an array parameter, so sizeof(chunks) measures the
                  // decayed pointer, not the caller's array. The real length is not available
                  // through this signature, so the original expressions are kept as-is.
                  Parse currentChunks[sizeof(chunks) / sizeof(chunks[0])];
                  for (int ci = 0;ci < sizeof(chunks) / sizeof(chunks[0]);ci++)
                  {
                    currentChunks[ci] = static_cast<Parse*>(chunks[ci]->clone());
                    currentChunks[ci]->setPrevPunctuation(chunks[ci]->getPreviousPunctuationSet());
                    currentChunks[ci]->setNextPunctuation(chunks[ci]->getNextPunctuationSet());
                    currentChunks[ci]->setLabel(Parser::COMPLETE);
                    chunks[ci]->setLabel(Parser::COMPLETE);
                  }
                  for (int ci = 0;ci < sizeof(chunks) / sizeof(chunks[0]);ci++)
                  {
                    //System.err.println("parserEventStream.addParseEvents: chunks="+Arrays.asList(chunks));
                    Parse *parent = chunks[ci]->getParent();
                    Parse *prevParent = chunks[ci];
                    // Insertion index into builtNodes for this chunk.
                    int off = 0;
                    //build un-built parents
                    if (!chunks[ci]->isPosTag())
                    {
                      // std::vector has no two-argument push_back; insert at the index instead.
                      builtNodes.insert(builtNodes.begin() + off++,chunks[ci]);
                    }
                    //perform build stages
                    while (parent->getType() != AbstractBottomUpParser::TOP_NODE && parent->getLabel() == "")
                    {
                      if (parent->getLabel() == "" && prevParent->getType() != parent->getType())
                      {
                        //build level
                        if (debug)
                            System::err::println("Build: " + parent->getType() + " for: " + currentChunks[ci]);
                        if (etype == opennlp::tools::parser::BUILD)
                        {
                          parseEvents.push_back(new Event(parent->getType(), buildContextGenerator->getContext(currentChunks, ci)));
                        }
                        builtNodes.insert(builtNodes.begin() + off++,parent);
                        // Mirror the built level in the partially-built tree.
                        Parse *newParent = new Parse(currentChunks[ci]->getText(),currentChunks[ci]->getSpan(),parent->getType(),1,0);
                        newParent->add(currentChunks[ci],rules);
                        newParent->setPrevPunctuation(currentChunks[ci]->getPreviousPunctuationSet());
                        newParent->setNextPunctuation(currentChunks[ci]->getNextPunctuationSet());
                        currentChunks[ci]->setParent(newParent);
                        currentChunks[ci] = newParent;
                        newParent->setLabel(Parser::BUILT);
                        //see if chunk is complete
                        if (lastChild(chunks[ci], parent))
                        {
                          if (etype == opennlp::tools::parser::CHECK)
                          {
                            parseEvents.push_back(new Event(Parser::COMPLETE, checkContextGenerator->getContext(currentChunks[ci],currentChunks, ci,false)));
                          }
                          currentChunks[ci]->setLabel(Parser::COMPLETE);
                          parent->setLabel(Parser::COMPLETE);
                        }
                        else
                        {
                          if (etype == opennlp::tools::parser::CHECK)
                          {
                            parseEvents.push_back(new Event(Parser::INCOMPLETE, checkContextGenerator->getContext(currentChunks[ci],currentChunks,ci,false)));
                          }
                          currentChunks[ci]->setLabel(Parser::INCOMPLETE);
                          parent->setLabel(Parser::COMPLETE);
                        }

                        chunks[ci] = parent;
                        //System.err.println("build: "+newParent+" for "+parent);
                      }
                      //TODO: Consider whether we need to set this label or train parses at all.
                      parent->setLabel(Parser::BUILT);
                      prevParent = parent;
                      parent = parent->getParent();
                    }
                    //decide to attach
                    if (etype == opennlp::tools::parser::BUILD)
                    {
                      parseEvents.push_back(new Event(Parser::DONE, buildContextGenerator->getContext(currentChunks, ci)));
                    }
                    //attach node
                    std::string attachType = "";
                    /// <summary>
                    /// Node selected for attachment. </summary>
                    Parse *attachNode = 0;
                    int attachNodeIndex = -1;
                    if (ci == 0)
                    {
                      // The first chunk has nothing to attach to; it is placed under a new TOP node.
                      Parse *top = new Parse(currentChunks[ci]->getText(),new Span(0,currentChunks[ci]->getText()->length()),AbstractBottomUpParser::TOP_NODE,1,0);
                      top->insert(currentChunks[ci]);
                    }
                    else
                    {
                      /// <summary>
                      /// Right frontier consisting of partially-built nodes based on current state of the parse. </summary>
                      std::vector<Parse*> currentRightFrontier = Parser::getRightFrontier(currentChunks[0],punctSet);
                      // Both frontiers are indexed in parallel below, so they must have the same size.
                      if (currentRightFrontier.size() != rightFrontier.size())
                      {
                        System::err::println("fontiers mis-aligned: " + currentRightFrontier.size() + " != " + rightFrontier.size() + " " + currentRightFrontier + " " + rightFrontier);
                        exit(1);
                      }
                      Map<Parse*, int> *parents = getNonAdjoinedParent(chunks[ci]);
                      //try daughters first.
                      for (int cfi = 0;cfi < currentRightFrontier.size();cfi++)
                      {
                        Parse *frontierNode = rightFrontier[cfi];
                        Parse *cfn = currentRightFrontier[cfi];
                        if (!Parser::checkComplete || Parser::COMPLETE != cfn->getLabel())
                        {
                          int i = parents->get(frontierNode);
                          if (debug)
                              System::err::println("Looking at attachment site (" + cfi + "): " + cfn->getType() + " ci=" + i + " cs=" + nonPunctChildCount(cfn) + ", " + cfn + " :for " + currentChunks[ci]->getType() + " " + currentChunks[ci] + " -> " + parents);
                          if (attachNode == 0 && i != 0 && i == nonPunctChildCount(cfn))
                          {
                            attachType = Parser::ATTACH_DAUGHTER;
                            attachNodeIndex = cfi;
                            attachNode = cfn;
                            if (etype == opennlp::tools::parser::ATTACH)
                            {
                              parseEvents.push_back(new Event(attachType, attachContextGenerator->getContext(currentChunks, ci, currentRightFrontier, attachNodeIndex)));
                            }
                            //System.err.println("daughter attach "+attachNode+" at "+fi);
                          }
                        }
                        else
                        {
                          if (debug)
                              System::err::println("Skipping (" + cfi + "): " + cfn->getType() + "," + cfn->getPreviousPunctuationSet() + " " + cfn + " :for " + currentChunks[ci]->getType() + " " + currentChunks[ci] + " -> " + parents);
                        }
                        // Can't attach past first incomplete node.
                        if (Parser::checkComplete && cfn->getLabel() == Parser::INCOMPLETE)
                        {
                          if (debug)
                              System::err::println("breaking on incomplete:" + cfn->getType() + " " + cfn);
                          break;
                        }
                      }
                      //try sisters, and generate non-attach events.
                      for (int cfi = 0;cfi < currentRightFrontier.size();cfi++)
                      {
                        Parse *frontierNode = rightFrontier[cfi];
                        Parse *cfn = currentRightFrontier[cfi];
                        if (attachNode == 0 && parents->containsKey(frontierNode->getParent()) && frontierNode->getType() == frontierNode->getParent()->getType()) //&& frontierNode.getParent().getLabel() == null) {
                        {
                          attachType = Parser::ATTACH_SISTER;
                          attachNode = cfn;
                          attachNodeIndex = cfi;
                          if (etype == opennlp::tools::parser::ATTACH)
                          {
                            parseEvents.push_back(new Event(Parser::ATTACH_SISTER, attachContextGenerator->getContext(currentChunks, ci, currentRightFrontier, cfi)));
                          }
                          chunks[ci]->getParent()->setLabel(Parser::BUILT);
                          //System.err.println("in search sister attach "+attachNode+" at "+cfi);
                        }
                        else if (cfi == attachNodeIndex)
                        {
                          //skip over previously attached daughter.
                        }
                        else
                        {
                          if (etype == opennlp::tools::parser::ATTACH)
                          {
                            parseEvents.push_back(new Event(Parser::NON_ATTACH, attachContextGenerator->getContext(currentChunks, ci, currentRightFrontier, cfi)));
                          }
                        }
                        //Can't attach past first incomplete node.
                        if (Parser::checkComplete && cfn->getLabel() == Parser::INCOMPLETE)
                        {
                          if (debug)
                              System::err::println("breaking on incomplete:" + cfn->getType() + " " + cfn);
                          break;
                        }
                      }
                      //attach Node
                      if (attachNode != 0)
                      {
                        if (attachType == Parser::ATTACH_DAUGHTER)
                        {
                          Parse *daughter = currentChunks[ci];
                          if (debug)
                              System::err::println("daughter attach a=" + attachNode->getType() + ":" + attachNode + " d=" + daughter + " com=" + lastChild(chunks[ci], rightFrontier[attachNodeIndex]));
                          attachNode->add(daughter,rules);
                          daughter->setParent(attachNode);
                          if (lastChild(chunks[ci], rightFrontier[attachNodeIndex]))
                          {
                            if (etype == opennlp::tools::parser::CHECK)
                            {
                              parseEvents.push_back(new Event(Parser::COMPLETE, checkContextGenerator->getContext(attachNode,currentChunks,ci,true)));
                            }
                            attachNode->setLabel(Parser::COMPLETE);
                          }
                          else
                          {
                            if (etype == opennlp::tools::parser::CHECK)
                            {
                              parseEvents.push_back(new Event(Parser::INCOMPLETE, checkContextGenerator->getContext(attachNode,currentChunks,ci,true)));
                            }
                          }
                        }
                        else if (attachType == Parser::ATTACH_SISTER)
                        {
                          // The frontier slot now refers to the parent of the node that was adjoined to.
                          Parse *frontierNode = rightFrontier[attachNodeIndex];
                          rightFrontier[attachNodeIndex] = frontierNode->getParent();
                          Parse *sister = currentChunks[ci];
                          if (debug)
                              System::err::println("sister attach a=" + attachNode->getType() + ":" + attachNode + " s=" + sister + " ap=" + attachNode->getParent() + " com=" + lastChild(chunks[ci], rightFrontier[attachNodeIndex]));
                          Parse *newParent = attachNode->getParent()->adjoin(sister,rules);

                          newParent->setParent(attachNode->getParent());
                          attachNode->setParent(newParent);
                          sister->setParent(newParent);
                          if (attachNode == currentChunks[0])
                          {
                            currentChunks[0] = newParent;
                          }
                          if (lastChild(chunks[ci], rightFrontier[attachNodeIndex]))
                          {
                            if (etype == opennlp::tools::parser::CHECK)
                            {
                              parseEvents.push_back(new Event(Parser::COMPLETE, checkContextGenerator->getContext(newParent,currentChunks,ci,true)));
                            }
                            newParent->setLabel(Parser::COMPLETE);
                          }
                          else
                          {
                            if (etype == opennlp::tools::parser::CHECK)
                            {
                              parseEvents.push_back(new Event(Parser::INCOMPLETE, checkContextGenerator->getContext(newParent,currentChunks,ci,true)));
                            }
                            newParent->setLabel(Parser::INCOMPLETE);
                          }

                        }
                        //update right frontier
                        // Drop the attachNodeIndex leading entries in one range erase
                        // (std::vector has no remove(index)).
                        rightFrontier.erase(rightFrontier.begin(), rightFrontier.begin() + attachNodeIndex);
                      }
                      else
                      {
                        //System.err.println("No attachment!");
                        // NOTE(review): standard std::exception has no constructor taking a
                        // message; this line is left untouched because a replacement type
                        // would need a header that is not visible from here.
                        throw std::exception("No Attachment: " + chunks[ci]);
                      }
                    }
                    // Prepend the nodes built for this chunk to the right frontier
                    // (std::vector has no addAll).
                    rightFrontier.insert(rightFrontier.begin(), builtNodes.begin(), builtNodes.end());
                    builtNodes.clear();
                  }
                }