Exemple #1
0
Val*
Val::
newIth(int ith, Val* oval, bool& stop)
{
  int ithc = oval->vec(ith);
  if(ithc > 0) stop = true;
  short nxtI = ithc +1;;
  //cerr<< "Wnt " << nxtI << "th var on pos " << ith << " of " << *oval<< endl;
  if(oval->wrd() >= 0) return NULL; //???;
  double ovalprob = oval->prob();
  //if(oval->status == TERMINALVAL) return NULL;
  Val* nval = ithBst(ith,oval->bsts()).next(nxtI);
  if(!nval) return NULL;
  double ovalcompprob = ithBst(ith, oval->bsts()).nth(ithc)->prob();
  double nprob = nval->prob();
  //cerr << "Its prob is " << nprob << endl;
  if(nprob < 0) return NULL;
  Val* ans = new Val(oval);
  ans->vec(ith) = nxtI;
  double frac = nprob/ovalcompprob;
  ans->prob() *= frac;
  assert(nxtI <= ithBst(ith,ans->bsts()).num());
  //cerr << "ith is " << ith << " " << ans->vec().size() << " "
  //   << ans->vec()[ith] << " " << frac << endl;
  //cerr << "The var is " << *ans << endl;
  //assert(ans->check());
  return ans;
}
Bst&
MeChart::
bestParseGivenHead(int posInt, const Wrd& wd, Item* itm,
		   FullHist* h, ItmGHeadInfo& ighInfo, Val* cval, Val* gcval)
{
  EdgeSet& es = ighInfo.first;
  BstMap&  atm = ighInfo.second;
  curVal = cval;
  gcurVal = gcval;
  Bst& bst = recordedBPGH(itm, atm, h);
  if(bst.explored())
    {
      if(printDebug() > 19)
	{
	  int subfv[MAXNUMFS];
	  getHt(h, subfv);
	  CntxArray ca(subfv);
	  prDp();
	  cerr << "bpknown for " << posInt << ", " << wd
	       << ", " << *itm << ") : " << bst.prob() << " " << ca <<endl;
	}
      curVal=gcurVal=NULL;
      return bst;
    }
  bst.explored() = true;
  curVal=gcurVal=NULL;
  const Term* trm = itm->term();
  if(trm->terminal_p())
    {
      Val* nval = new Val;
      nval->prob() = 1;
      nval->trm1() = itm->term()->toInt();
      nval->wrd1() = itm->word()->toInt();
      nval->status = TERMINALVAL;
      bst.addnth(nval);
      bst.sum() = nval->prob();
      return bst;
    }
  if(printDebug() > 10)
    {
      prDp();
      cerr << "bestParseGivenHead(" << posInt << ", " << wd
	   << ", " << *itm  << ")" << endl;
    }
  double bestP = 0;
  double sumP = 0;
  EdgeSetIter ei = es.begin();
  for( ; ei != es.end() ; ei++)
    {
      Edge* e = *ei;
      if(!sufficiently_likely(e))
	{
	  continue;
	}

      float edgePg = 1;
      int finish = e->loc();

      int effVal = effEnd(finish);
      if(itm->term()->isRoot()) edgePg = 1;
      else if(Feature::isLM) edgePg == 1;
      else if(effVal == 1)
	edgePg = endFactor;
      else if(effVal == 0) edgePg = midFactor;
      h->e = e;
      if(printDebug() > 20)
	{
	  prDp();
	  cerr << "consid " << *e << endl;
	}
      
      gcurVal = gcval;
      float prob = meRuleProb(e,h);
      gcurVal=NULL;
 
      double nextP = prob * edgePg;
      double nextPs = nextP;
      Item* sitm;
      //LeftRightGotIter gi(e); 
      MiddleOutGotIter gi(e);
      Val* val = new Val(e, nextPs);
      val->trm1() = itm->term()->toInt();
      val->wrd1() = wd.toInt();
      int pos = 0;
      depth++;
      h = h->extendByEdge(e);
      bool zeroProb = false;
      while( gi.next(sitm,pos) )
	{
	  //cerr << "Looking at " << *sitm << endl;
	  if(zeroProb)
	    {
	      h = h->extendBySubConstit();
	      continue;
	    }
	  if(sitm->term() == Term::stopTerm)
	    {
	      h = h->extendBySubConstit(); 
	      continue;
	    }
	  if(pos == 0)
	    {
	      h->preTerm = posInt; 
	      h->hd = &wd;
	      ItmGHeadInfo& ighi = sitm->posAndheads()[posInt][wd]; 
	      Bst&
		bst2 = bestParseGivenHead(posInt,wd,sitm,h,ighi,val,cval);
              curVal = gcurVal = NULL;
              curDir = -1;
	      if(bst2.empty())
		{
		  zeroProb = true;
		}
              val->extendTrees(bst2,pos); 
              nextPs *= bst2.sum();

	    }
	  else
	    {
	      Bst& bst2 = bestParse(sitm,h,val,cval,pos);
	      if(bst2.empty())
		{
		  zeroProb = true;
		}
              val->extendTrees(bst2,pos); 
              nextPs *= bst2.sum();
	    }
	  if(printDebug() > 39)
	    {
	      prDp();
	      cerr << "FullHist from " << *h;
	    }
	  h = h->extendBySubConstit(); 
	  if(printDebug() > 39)
	    cerr << " -> " << *h << endl;
	}
      if(!zeroProb) bst.push(val);
      if(printDebug() > 20)
	{
	  prDp();
	  cerr << "P(" << *e << " | " << wd << " ) = " ;
	  cerr << bestP;
	  cerr << "\n"; 
	}
      depth--;
      sumP += nextPs;
      h->retractByEdge(); 
      if(printDebug() > 20)
        {
          prDp();
          cerr << "Val: " << *val << endl;
        }
    }
  Val* vbest = bst.pop();
  if(vbest) bst.addnth(vbest);
  bst.sum() = sumP;
  if(printDebug() > 10)
    {
      prDp();
      cerr << "Bestpgh for "<<*itm << ", " << wd << " = " << bst.prob()<< endl;
    }
  return bst;
}
Bst&
MeChart::
bestParse(Item* itm, FullHist* h, Val* cval, Val* gcval, int cdir)
{
  curVal = cval;
  gcurVal = gcval;
  curDir = cdir;
  Bst& bst = recordedBP(itm, h);
  curVal = gcurVal = NULL;
  curDir = -1;
  if(bst.explored())
    {
      if(printDebug() > 19)
	{
	  prDp();
	  cerr << "already known bestParse(" << *itm << ", ...) has p = "
	       << bst.prob() << endl;
	}
      return bst;
    }
  if(printDebug() > 10)
    {
      prDp();
      cerr << "bestParse(" << *itm << ", ...)" << endl;
    }
  bst.explored() = true;  //David McClosky bug;
  int itermInt = itm->term()->toInt();
  PosMap& pm = itm->posAndheads();
  PosIter pi = pm.begin();
  ECString bestW;
  for( ; pi != pm.end() ; pi++ )
    {
      int posInt = (*pi).first;
      if(printDebug() > 16)
	{
	  prDp();
	  cerr << "consider Pos(" << *itm << ") = " << posInt << endl;
	}
      HeadMap& hm = (*pi).second;
      /* we are using collected counts for p(u|t) */
      float hposprob = 1;
      /* if we have reached a preterminal, then termInt == posInt
	 and p(posInt|termInt) == 1 */
      if( itermInt != posInt)
	{
          curVal = cval;
          gcurVal = gcval;
          curDir = cdir;
	  hposprob = meProb(posInt, h, UCALC); 
	  if(hposprob == 0) hposprob = .00001; //??? this can happen;
          curVal = gcurVal = NULL;
          curDir = -1;
	  if(printDebug() > 16)
	    {
	      prDp();
	      cerr <<  "p(pos) = " <<  hposprob << endl;
	    }
	}
      h->preTerm = posInt;
      HeadIter hi = hm.begin(); 
      for( ;hi != hm.end();hi++)
	{
	  const Wrd& subhw = (*hi).first;
	  int wrdInt = subhw.toInt();
	  ECString subh = subhw.lexeme();
	  if(printDebug() > 16)
	    {
	      prDp();
	      cerr << "consider head(" << *itm << ") = " << subh << endl;
	    }
	  float hprob = 0;
	  if(wrdInt >= 0 && wrdInt <= lastKnownWord)
	    {
	      hprob = pCapgt(&subhw,posInt); 
	      hprob *= (1 - pHugt(posInt)); 
              curVal = cval;
              gcurVal = gcval;
              curDir = cdir;
	      float hprob2 = meHeadProb(wrdInt, h);
              curVal = gcurVal = NULL;
              curDir = -1;
	      hprob *= hprob2;
	      if(hprob < 0)
		{
		  cerr << posInt << " " << pHugt(posInt) <<" "<<hprob2 << endl;
		  assert(hprob >=0);
		}
	    }
	  //hprob can be zero if lower case NNPS.
	  if(wrdInt > lastKnownWord || hprob == 0)
	    {
	      hprob = psutt(&subhw,posInt);
	    }
	  if(printDebug() > 16)
	    {
	      prDp();
	      cerr << "p(hd) = "<< hprob << endl;
	    }
	  float hhprob = (hposprob * hprob);
	  if(hhprob < 0)
	    {
	      cerr << hposprob << " " << hprob << endl;
	      assert(hhprob >= 0);
	    }
	  h->hd = &subhw;
	  Bst&  
	    bst2 = bestParseGivenHead(posInt,subhw,itm,h,(*hi).second,cval,gcval);
          if(bst2.empty()) continue;
          Val* nval = new Val();
	  Val* oldval0 = bst2.nth(0);
          nval->prob() = oldval0->prob()*hhprob;
          nval->bsts().push_back(&bst2);
	  nval->status = EXTRAVAL;
          bst.push(nval);
          bst.sum() += bst2.sum()*hhprob;
	}
    }
  Val* nbest = bst.pop();
  if(nbest) bst.addnth(nbest);
  if(printDebug() > 10)
    {
      prDp();
      cerr << "Bestp for " << *itm << " = " << bst.prob() <<endl;
    }
  return bst;
}
Exemple #4
0
/* the function called by each thread is "mainLoop" */
void*
mainLoop(void* arg)
{
  loopArg *loopA = (loopArg*)arg;
  istream* testSStream = loopA->inpt;
  ostream* pstatStream = loopA->outpt;
  int id = loopA->id;
  double log600 = log2(600.0);
  PrintStack printStack;
  for( ;  ; )
    {
      InputTree     correct;  
      InputTree*    cuse;

      /* first lock to read in the material */
      pthread_mutex_lock(&readlock);
      if( !*testSStream ) {
	pthread_mutex_unlock(&readlock);
	break;
      }
      *testSStream >> correct;
      if( !*testSStream ){
	pthread_mutex_unlock(&readlock);
	break;
      }
      totWords += correct.length()+1;
      int locCount = sentenceCount++;
      list<ECString>  wtList;
      correct.make(wtList);
      SentRep sr( wtList );  // used in precision calc

      ExtPos extPos;
      if(params.extPosIfstream)
	extPos.read(params.extPosIfstream,sr);
      pthread_mutex_unlock(&readlock);

      cuse = &correct;
      int len = correct.length();
      if(len > params.maxSentLen) continue;
      //cerr << "Len = " << len << endl;
      /*
	if( !params.field().in(sentenceCount) )
	{
	sentenceCount++;
	continue;
	}
	if(sentenceCount < -1)
	{
	sentenceCount++;
	continue;
	}
	sentenceCount++;
      */
      vector<ECString> poslist;
      correct.makePosList(poslist);
      ScoreTree sc;
      sc.setEquivInts(poslist);
      MeChart*	chart = new MeChart( sr,extPos,id );
       
      chart->parse( );
      Item* topS = chart->topS();
      if(!topS)
	{
	  cerr << "Parse failed" << endl;
	  cerr << correct << endl;
	  error(" could not parse "); 
	  delete chart;
	  continue;
	}
       
      // compute the outside probabilities on the items so that we can
      // skip doing detailed computations on the really bad ones 

      chart->set_Alphas();

      Bst& bst = chart->findMapParse();
      if( bst.empty()) error( "mapProbs did not return answer");
      float bestF = -1;
      int i;
      int numVersions = 0;
      Link diffs(0);
      //cerr << "Need num diff: " << Bchart::Nth << endl;
      printStruct printS;
      printS.sentenceCount = locCount;
      printS.numDiff = 0;
      for(numVersions = 0 ; ; numVersions++)
	{
	  short pos = 0;
	  Val* val = bst.next(numVersions);
	  if(!val)
	    {
	      //cerr << "Breaking" << endl;
	      break;
	    }
	  InputTree*  mapparse = inputTreeFromBsts(val,pos,sr);
	  bool isU;
	  int dummy = 0;
	  diffs.is_unique(mapparse, isU, dummy);
	  // cerr << "V " << isU << " " << numVersions << *mapparse << endl;
	  if(isU)
	    {
	      printS.probs.push_back(val->prob());
	      printS.trees.push_back(mapparse);
	      printS.numDiff++;
	    }
	  else
	    {
	      delete mapparse;
	    }
	  if(printS.numDiff >= Bchart::Nth) break;
	  if(numVersions > 20000) break;
	}

      ParseStats* locPst = new ParseStats[Bchart::Nth];
      ParseStats bestPs;
      for(i = 0 ; i <printS.numDiff ; i++)
	{
	  InputTree *mapparse = printS.trees[i];
	  assert(mapparse);
	  sc.trips.clear();
	  ParseStats pSt;
	  sc.recordGold(cuse,pSt);
	  sc.precisionRecall(mapparse,pSt);
	  float newF = pSt.fMeasure();
	  cerr << printS.sentenceCount << "\t" << newF << endl;
	  if(newF > bestF)
	    {
	      bestF = newF;
	      bestPs = pSt;
	    }
	  if(histPoints[i])
	    {
	      locPst[i] += bestPs;
	    }
	}
      if(printS.numDiff < Bchart::Nth)
	{
	  for(i = printS.numDiff ; i < Bchart::Nth ; i++)
	    {
	      if(histPoints[i]) locPst[i] += bestPs;
	    }
	}

      pthread_mutex_lock(&scorelock);
      for(i = 0 ; i < Bchart::Nth ; i++) totPst[i]+=locPst[i];
      pthread_mutex_unlock(&scorelock);

      int numPrinted;

      /* put the sentence with which we just finished at the end of the printStack*/
      printStack.push_back(printS);
      PrintStack::iterator psi = printStack.begin();
      /* now look at each item from the front of the print stack
	 to see if it should be printed now */
      pthread_mutex_lock(&writelock);
      for( numPrinted =0; psi != printStack.end(); numPrinted++ )
	{
	  printStruct& pstr=(*psi);
	  if(pstr.sentenceCount != printCount) break;
	  *pstatStream << pstr.sentenceCount << "\t" << pstr.numDiff << "\n";
	  printCount++;
	  for(i = 0 ; i < pstr.numDiff ; i++)
	    {
	      InputTree*  mapparse = pstr.trees[i];
	      assert(mapparse);
	      double logP =log2(pstr.probs[i]);
	      logP -= (sr.length()*log600);
	      *pstatStream <<  logP << "\n";
	      if(Bchart::prettyPrint) *pstatStream << *mapparse << "\n\n";
	      else
		{
		  mapparse->printproper(*pstatStream);
		  *pstatStream << "\n";
		}
	      delete mapparse;
	    }
	  *pstatStream << endl;
	  psi++;
	}
      pthread_mutex_unlock(&writelock);
      for(i = 0 ; i < numPrinted ; i++) printStack.pop_front();
      if(Feature::isLM)
	{
	  double lgram = log2(bst.sum());
	  lgram -= (sr.length()*log600);
	  double pgram = pow(2,lgram);
	  double iptri = chart->triGram();;
	  double ltri = (log2(iptri)-sr.length()*log600);
	  double ptri = pow(2.0,ltri);
	  double pcomb1 = (0.667 * pgram)+(0.333 * ptri);
	  double lcom1 = log2(pcomb1);
	  totGram -= lgram;
	  totTri -= ltri;
	  totMix -= lcom1;
	  if(locCount%10 == 9)
	    {
	      cerr << locCount << "\t";
	      cerr << pow(2.0,totGram/(double)totWords);
	      cerr <<"\t" <<  pow(2.0,totTri/(double)totWords);
	      cerr << "\t" << pow(2.0,totMix/(double)(totWords));
	      cerr << endl;
	    }
	}
      if(locCount%50 == 1)
	{
	  cerr << sentenceCount << "\t";
	  for(int i = 0 ; i < Bchart::Nth ; i++)
	    if(histPoints[i])
	      {
		cerr << i << " " << totPst[i].fMeasure() << "\t";
	      }
	  cerr << endl;
	}

      delete chart;
      delete [] locPst;
    }
  return 0;
}