예제 #1
0
/* Parser driver.

   Positional arguments: args.arg(0) is the path handed to generalInit();
   an optional args.arg(1) names the input file.  With only one
   positional argument the sentences are read from standard input.
   Bchart::tokenize selects whether the input goes through an
   ewDciTokStrm or is read from a plain istream.

   For every sentence that passes the length and field filters the
   sentence is parsed and up to NTH parses are written to cout.  Parse
   failures are reported on cerr.  Reading stops at the first sentence
   of length 0.  Always returns 0. */
int
main(int argc, char *argv[])
{
   ECArgs args( argc, argv );
   /* l = length of sentence to be processed 0-100 is default
      n = work on each #'th line.
      d = print out debugging info at level #
      t = report timings */

   params.init( args );
   TimeIt timeIt;
   ECString  path( args.arg( 0 ) );
   generalInit(path);

   int      sentenceCount = 0;  //counts all sentences so we can use e.g,1/50;
   int totUnparsed = 0;
   double log600 = log2(600.0);

   ECString flnm = "dummy";
   if(args.nargs()==2) flnm = args.arg(1);
   ewDciTokStrm* tokStream = NULL;
   if(Bchart::tokenize)
     {
       tokStream = new ewDciTokStrm(flnm);
       if(args.nargs() ==1) tokStream->useCin = 1;
     }
   istream* nontokStream = NULL;
   if(args.nargs()==2) nontokStream = new ifstream(args.arg(1).c_str());
   else nontokStream = &cin;
   
   for( ;  ; sentenceCount++)
     {
       // The sentence is heap allocated because the two input modes use
       // different constructors; every exit from this iteration below
       // must therefore release it.
       SentRep* srp;
       if(Bchart::tokenize) srp = new SentRep(*tokStream, SentRep::SGML);
       else srp = new SentRep(*nontokStream, SentRep::SGML);
       int len = srp->length();
       if(len > params.maxSentLen)
         {
           delete srp;
           continue;
         }
       // A zero-length sentence ends the run.
       if(len == 0)
         {
           delete srp;
           break;
         }
       if( !params.field().in(sentenceCount) )
         {
           delete srp;
           continue;
         }

       if(args.isset('t')) timeIt.befSent();

       MeChart* chart = new MeChart( *srp );
       curChart = chart;
       
       if(args.isset('t') ) timeIt.lastTime = clock();

       chart->parse( );

       Item* topS = chart->topS();
       if(!topS)
         {
           totUnparsed++;
           cerr << "Parse failed" << endl;
           cerr << *srp << endl;
           // The chart was built from *srp, so it goes first.
           delete chart;
           delete srp;
           continue;
         }
       if(args.isset('t') ) timeIt.betweenSent(chart);

       // compute the outside probabilities on the items so that we can
       // skip doing detailed computations on the really bad ones 
       chart->set_Alphas();

       AnsTreeStr& at = chart->findMapParse();
       if( at.probs[0] <= 0 ) error( "mapProbs did not return answer" );

       if(Feature::isLM)
         {
           // Language-model mode: print the log2 grammar score, the log2
           // trigram score and the log2 of their 0.667/0.333 mixture,
           // each reduced by length*log2(600).
           double lgram = log2(at.sum);
           lgram -= (srp->length()*log600);
           double pgram = pow(2,lgram);
           double iptri = chart->triGram();
           double ltri = (log2(iptri)-srp->length()*log600);
           double ptri = pow(2.0,ltri);
           double pcomb = (0.667 * pgram)+(0.333 * ptri);
           double lmix = log2(pcomb);
           cout << lgram << "\t" << ltri << "\t" << lmix << endl;
         }

       // Count the leading entries of at.probs that are positive; those
       // are the parses that get printed.
       int numVersions = 0;
       for(numVersions = 0 ; numVersions < NTH ; numVersions++)
         if(at.probs[numVersions] <= 0) break;
       if(NTH > 1)cout << sentenceCount << "\t" << numVersions << endl;
       for(int i = 0 ; i < numVersions ; i++)
         {
           short pos = 0;
           InputTree*  mapparse = inputTreeFromAnsTree(&at.trees[i], pos ,*srp);
           // NOTE(review): this is a natural log, while log600 is a
           // base-2 log.  Left as is because consumers of this output
           // may depend on the current numbers -- confirm before changing.
           double logP =log(at.probs[i]);
           logP -= (srp->length()*log600);
           if(NTH > 1) cout <<  logP << endl;
           cout << *mapparse << endl << endl;
           delete mapparse;
         }
       cout << endl;
       if(args.isset('t') ) timeIt.aftSent();

       // `at` refers into the chart and the chart was built from *srp,
       // so release them only now, chart first.
       delete chart;
       delete srp;
     }
   if( args.isset('t') ) timeIt.finish(sentenceCount);

   // Release the streams allocated above; cin is not ours to delete.
   if(nontokStream != &cin) delete nontokStream;
   delete tokStream;
   return 0;
}
예제 #2
0
/* Returns the processor time, in seconds, between the two clock()
   readings FROM and TO.  A negative difference is bumped by 2147
   seconds; presumably that compensates for a wrapped 32-bit clock_t
   counting microseconds (2^31 / 10^6 ~= 2147) -- TODO confirm. */
static double
elapsedClockSecs(clock_t from, clock_t to)
{
   double fromSec = (double)from/(double)CLOCKS_PER_SEC;
   double toSec = (double)to/(double)CLOCKS_PER_SEC;
   double elapsed = toSec - fromSec;
   if(elapsed < 0) elapsed += 2147;
   return elapsed;
}

/* Parser driver.

   Requires exactly two positional arguments: args.arg(0) is the path
   from which the model data is loaded (readHeadInfo, Term::init,
   "pSgT.txt", Bchart::readTermProbs, MeChart::init) and args.arg(1)
   names the file of sentences to parse.

   Each sentence that passes the length and field filters is parsed; if
   the first attempt yields no top-level item the parse is retried once
   with a lower Edge::DemFac and a larger rule-count timeout.  The
   resulting tree is written to cout, failures and timings to cerr.
   Always returns 0. */
int
main(int argc, char *argv[])
{
   ECArgs args( argc, argv );
   /* o = basic, but not debugging, output.
      l = maximum length of sentence to be processed (70 is default)
      n = work on each #'th line.
      d = print out debugging info at level #
      W = use wwclasses
      R = use rwclasses
      t = report timings (requires o)
      s = maximum sleep time
      T = T# sets Bchart::timeFactor to #/10
      f = f# says multiply ctl2 counts by #
      p = p# use prepFactor #
      P = which types of prob models to use */

   // prevent core file creation;
   struct rlimit 	core_limits;
   core_limits.rlim_cur = 0;
   core_limits.rlim_max = 0;
   setrlimit( RLIMIT_CORE, &core_limits );

   params.init( args );
   if(args.isset('s'))
     {
       // Sleep for a pseudo-random time below the given maximum, seeded
       // with params.whichSent().
       int  maxDelay = atoi(args.value('s').c_str());
       srand(params.whichSent());
       int randN = rand();
       // atoi yields 0 for "-s0" or a non-numeric value, and a modulo by
       // zero is undefined, so treat that case as "no delay".
       int delay = (maxDelay != 0) ? randN%maxDelay : 0;
       sleep(delay);
     }

   if(args.isset('T'))
     {
       int fac = atoi(args.value('T').c_str());
       float ffac = (float)fac;
       ffac /= 10;
       Bchart::timeFactor = ffac;
     }
	 
   int maxSentLen = 70;
   if(args.isset('l'))
     {
       maxSentLen = atoi(args.value('l').c_str());
     }
   int    totEdges = 0;
   int    totPopedEdges = 0;
   double totAccessTime = 0;
   double totParseTime = 0;
   double totSemParseTime = 0;
   const bool reportTimes = args.isset('t');

   endFactor = 1.2;
   midFactor = (1.0 - (.3684 * endFactor))/(1.0 - .3684);

   // Both the path name and the sentence file are read unconditionally
   // below, so anything other than two arguments is an error.
   if( args.nargs() != 2 )
     error( "Need exactly two arg." );
   ECString  path( args.arg( 0 ) );
   readHeadInfo(path);
   Term::init( path );
   InputTree::init();

   ECString testSString( args.arg(1) );

   ewDciTokStrm testSStream(testSString);
   //ifstream testSStream(testSString.c_str());
   if( !testSStream ) error( "No testSstream" );
   int      sentenceCount = 0;  //counts all sentences so we can use 1/50;

   ECString  probSumString( path );
   probSumString += "pSgT.txt";
   ifstream    probSumStream( probSumString.c_str() );
   if( !probSumStream ) error( "Failed to find probSum file" );

   Bchart::readTermProbs(path);

   if( args.isset('d') )
     {
       int lev = atoi(args.value('d').c_str());
       Bchart::printDebug() = lev;
     }
   int totSents = 0;
   int totUnparsed = 0;

   MeChart::init(path);
   Bchart::setPosStarts();
   for( ; !(!testSStream) ; )
     {
       // Stamp the clock before the sentence is read so that the
       // "Reading data time" reported below covers the read itself.
       clock_t phaseStart = 0;
       if(reportTimes) phaseStart = clock();

       SentRep sr(testSStream, SentRep::SGML); 
       int len = sr.length();
       if(len == 0) continue;
       if(len > maxSentLen) continue;
       if( !params.field().in(sentenceCount) )
	 {
	   sentenceCount++;
	   continue;
	 }
       // A lone document terminator is not a sentence and is not counted.
       if(len == 1 && sr[0].lexeme() == "</DOC>") continue;
       sentenceCount++;

       //SentRep orgsr( wtList );  // used in precision calc;

       if(reportTimes)
	 {
	   double readSecs = elapsedClockSecs(phaseStart, clock());
	   cerr << "Reading data time = " << readSecs << endl;
	   totAccessTime += readSecs;
	 }

       MeChart*	chart = new MeChart( sr );
       curChart = chart;
       chart->ruleCountTimeout() = 250000;
       
       totSents++;
       if(reportTimes) phaseStart = clock();
       chart->parse( );
       Item* topS = chart->topS();

       if(!topS)
	 {
	   // One-word sentences are dropped without a retry and without
	   // being counted as unparsed.
	   if(len == 1)
	     {
	       delete chart;
	       continue;
	     }
	   // Second attempt on a fresh chart with Edge::DemFac lowered to
	   // .9 and a larger rule-count timeout; DemFac is set to .999
	   // again afterwards.
	   Edge::DemFac = .9;
	   delete chart;
	   chart = new MeChart(sr);
	   chart->ruleCountTimeout() = 350000;
	   curChart = chart;
	   chart->parse( );
	   topS = chart->topS();
	   Edge::DemFac = .999;
	   if(!topS)
	     {
	       totUnparsed++;
	       cerr << "Parse failed on: " << sr << endl;

	       delete chart;
	       continue;
	     }
	 }
       
       if(reportTimes)
	 {
	   // Covers both attempts when the parse had to be retried.
	   double parseSecs = elapsedClockSecs(phaseStart, clock());
	   cerr << "Parsing time = " << parseSecs
	     << "\tEdges created = " << chart->totEdgeCountAtS()
	       << "\tEdges poped = " << chart->popedEdgeCountAtS() << endl;
	   totParseTime += parseSecs;
	   totEdges += chart->totEdgeCountAtS();
	   totPopedEdges += chart->popedEdgeCountAtS();
	   phaseStart = clock();
	 }

       // compute the outside probabilities on the items so that we can
       // skip doing detailed computations on the really bad ones 
       chart->set_Alphas();

       AnswerTree* at = chart->findMapParse();
       if( !at ) 
	 {
	   totUnparsed++;
	   cerr << "MapParse failed on: " << sr << endl;
	   delete chart;
	   continue;
	 }
       InputTree*  mapparse = inputTreeFromAnswerTree(at,topS);
       // NOTE(review): `at` is not released here (the calls below are
       // disabled); who owns it is not visible from this function.
       //at->deleteSubTrees();
       //delete at;
       cout << *mapparse << endl;
       delete mapparse;

       if(reportTimes)
	 {
	   double semSecs = elapsedClockSecs(phaseStart, clock());
	   cerr << "Sem Parsing time = " << semSecs << endl;
	   totSemParseTime += semSecs;
	 }

       delete chart;
     }
   if(reportTimes)
     cout << "Av access time = " << totAccessTime/totSents
       << "\t Av parse time = "
	 << totParseTime/totSents
       << "\t Av stats time = "
	 << totSemParseTime/totSents
       << "\nAv edges created = "
	 << (float)totEdges/totSents
       << "\tAv edges poped = "
	 << (float)totPopedEdges/totSents
	   << endl;

   return 0;
}
예제 #3
0
/* the function called by each thread is "mainLoop" */
void*
mainLoop(void* arg)
{
  loopArg *loopA = (loopArg*)arg;
  istream* testSStream = loopA->inpt;
  ostream* pstatStream = loopA->outpt;
  int id = loopA->id;
  double log600 = log2(600.0);
  PrintStack printStack;
  for( ;  ; )
    {
      InputTree     correct;  
      InputTree*    cuse;

      /* first lock to read in the material */
      pthread_mutex_lock(&readlock);
      if( !*testSStream ) {
	pthread_mutex_unlock(&readlock);
	break;
      }
      *testSStream >> correct;
      if( !*testSStream ){
	pthread_mutex_unlock(&readlock);
	break;
      }
      totWords += correct.length()+1;
      int locCount = sentenceCount++;
      list<ECString>  wtList;
      correct.make(wtList);
      SentRep sr( wtList );  // used in precision calc

      ExtPos extPos;
      if(params.extPosIfstream)
	extPos.read(params.extPosIfstream,sr);
      pthread_mutex_unlock(&readlock);

      cuse = &correct;
      int len = correct.length();
      if(len > params.maxSentLen) continue;
      //cerr << "Len = " << len << endl;
      /*
	if( !params.field().in(sentenceCount) )
	{
	sentenceCount++;
	continue;
	}
	if(sentenceCount < -1)
	{
	sentenceCount++;
	continue;
	}
	sentenceCount++;
      */
      vector<ECString> poslist;
      correct.makePosList(poslist);
      ScoreTree sc;
      sc.setEquivInts(poslist);
      MeChart*	chart = new MeChart( sr,extPos,id );
       
      chart->parse( );
      Item* topS = chart->topS();
      if(!topS)
	{
	  cerr << "Parse failed" << endl;
	  cerr << correct << endl;
	  error(" could not parse "); 
	  delete chart;
	  continue;
	}
       
      // compute the outside probabilities on the items so that we can
      // skip doing detailed computations on the really bad ones 

      chart->set_Alphas();

      Bst& bst = chart->findMapParse();
      if( bst.empty()) error( "mapProbs did not return answer");
      float bestF = -1;
      int i;
      int numVersions = 0;
      Link diffs(0);
      //cerr << "Need num diff: " << Bchart::Nth << endl;
      printStruct printS;
      printS.sentenceCount = locCount;
      printS.numDiff = 0;
      for(numVersions = 0 ; ; numVersions++)
	{
	  short pos = 0;
	  Val* val = bst.next(numVersions);
	  if(!val)
	    {
	      //cerr << "Breaking" << endl;
	      break;
	    }
	  InputTree*  mapparse = inputTreeFromBsts(val,pos,sr);
	  bool isU;
	  int dummy = 0;
	  diffs.is_unique(mapparse, isU, dummy);
	  // cerr << "V " << isU << " " << numVersions << *mapparse << endl;
	  if(isU)
	    {
	      printS.probs.push_back(val->prob());
	      printS.trees.push_back(mapparse);
	      printS.numDiff++;
	    }
	  else
	    {
	      delete mapparse;
	    }
	  if(printS.numDiff >= Bchart::Nth) break;
	  if(numVersions > 20000) break;
	}

      ParseStats* locPst = new ParseStats[Bchart::Nth];
      ParseStats bestPs;
      for(i = 0 ; i <printS.numDiff ; i++)
	{
	  InputTree *mapparse = printS.trees[i];
	  assert(mapparse);
	  sc.trips.clear();
	  ParseStats pSt;
	  sc.recordGold(cuse,pSt);
	  sc.precisionRecall(mapparse,pSt);
	  float newF = pSt.fMeasure();
	  cerr << printS.sentenceCount << "\t" << newF << endl;
	  if(newF > bestF)
	    {
	      bestF = newF;
	      bestPs = pSt;
	    }
	  if(histPoints[i])
	    {
	      locPst[i] += bestPs;
	    }
	}
      if(printS.numDiff < Bchart::Nth)
	{
	  for(i = printS.numDiff ; i < Bchart::Nth ; i++)
	    {
	      if(histPoints[i]) locPst[i] += bestPs;
	    }
	}

      pthread_mutex_lock(&scorelock);
      for(i = 0 ; i < Bchart::Nth ; i++) totPst[i]+=locPst[i];
      pthread_mutex_unlock(&scorelock);

      int numPrinted;

      /* put the sentence with which we just finished at the end of the printStack*/
      printStack.push_back(printS);
      PrintStack::iterator psi = printStack.begin();
      /* now look at each item from the front of the print stack
	 to see if it should be printed now */
      pthread_mutex_lock(&writelock);
      for( numPrinted =0; psi != printStack.end(); numPrinted++ )
	{
	  printStruct& pstr=(*psi);
	  if(pstr.sentenceCount != printCount) break;
	  *pstatStream << pstr.sentenceCount << "\t" << pstr.numDiff << "\n";
	  printCount++;
	  for(i = 0 ; i < pstr.numDiff ; i++)
	    {
	      InputTree*  mapparse = pstr.trees[i];
	      assert(mapparse);
	      double logP =log2(pstr.probs[i]);
	      logP -= (sr.length()*log600);
	      *pstatStream <<  logP << "\n";
	      if(Bchart::prettyPrint) *pstatStream << *mapparse << "\n\n";
	      else
		{
		  mapparse->printproper(*pstatStream);
		  *pstatStream << "\n";
		}
	      delete mapparse;
	    }
	  *pstatStream << endl;
	  psi++;
	}
      pthread_mutex_unlock(&writelock);
      for(i = 0 ; i < numPrinted ; i++) printStack.pop_front();
      if(Feature::isLM)
	{
	  double lgram = log2(bst.sum());
	  lgram -= (sr.length()*log600);
	  double pgram = pow(2,lgram);
	  double iptri = chart->triGram();;
	  double ltri = (log2(iptri)-sr.length()*log600);
	  double ptri = pow(2.0,ltri);
	  double pcomb1 = (0.667 * pgram)+(0.333 * ptri);
	  double lcom1 = log2(pcomb1);
	  totGram -= lgram;
	  totTri -= ltri;
	  totMix -= lcom1;
	  if(locCount%10 == 9)
	    {
	      cerr << locCount << "\t";
	      cerr << pow(2.0,totGram/(double)totWords);
	      cerr <<"\t" <<  pow(2.0,totTri/(double)totWords);
	      cerr << "\t" << pow(2.0,totMix/(double)(totWords));
	      cerr << endl;
	    }
	}
      if(locCount%50 == 1)
	{
	  cerr << sentenceCount << "\t";
	  for(int i = 0 ; i < Bchart::Nth ; i++)
	    if(histPoints[i])
	      {
		cerr << i << " " << totPst[i].fMeasure() << "\t";
	      }
	  cerr << endl;
	}

      delete chart;
      delete [] locPst;
    }
  return 0;
}