Ejemplo n.º 1
0
int main(int argc, char *argv[]) {
    ECArgs args(argc, argv);
    params.init(args);

    ECString path(args.arg(0));
    generalInit(path);

    // we don't use sentenceCount since it treeLogProb may parse the
    // sentence multiple times
    int index = 0;
    while (true) {
        if (!cin) {
            break;
        }
        InputTree correct;
        cin >> correct;
        int len = correct.length();
        if (len == 0) {
            break;
        }
        if (len > params.maxSentLen) {
            continue;
        }
        double logProb;
        try {
            logProb = treeLogProb(&correct);
        } catch (ParserError) {
            logProb = 0;
        }
        cout << index << "\t" << logProb << endl;
        index++;
    }
    return 0;
}
Ejemplo n.º 2
0
int
main(int argc, char *argv[])
{
  ECArgs args( argc, argv );
  assert(args.nargs() == 1);
  ECString path(args.arg(0));
  cerr << "At start of pHsgt" << endl;

  for(int n = 0 ; n < MAXNUMNTS ; n++)
    numTerm[n] = 0;

  Term::init( path );
  readHeadInfo(path);

  int sentenceCount = 0;

  ECString s1lex("^^");
  ECString s1nm("S1");
  int s1Int = Term::get(s1nm)->toInt();
	
  UnitRules ur;
  ur.init();
  while(cin)
    {
      //if(sentenceCount > 4000) break;
      if(sentenceCount%10000 == 0) cerr << sentenceCount << endl;
      InputTree  parse;
      cin >> parse;
      //cerr << parse << endl;
      if(!cin) break;
      if(parse.length() == 0) break;
       EcSPairs wtList;
       parse.make(wtList); 
       InputTree* par;
       par = &parse;

      addWwData(par);
      incrWordData(s1Int, s1lex);
      ur.gatherData(par);
      sentenceCount++;
    }
  ECString resultsString(path);
  resultsString += "pSgT.txt";
  ofstream     resultsStream(resultsString.c_str());
  assert(resultsStream);

  int numWords = 0;
  resultsStream << "       \n";  //leave space for number of words;
  resultsStream.precision(3);
  ECString lastWord;
  int wordFreq = 0;
  WordMap::iterator wmi = wordMap.begin();
  resultsStream << wordMap.size() << "\n\n";
  for( ; wmi != wordMap.end() ; wmi++)
    {
      ECString w = (*wmi).first;
      resultsStream << w << "\t";
      PosD& posd = (*wmi).second;
      PosD::iterator pdi = posd.begin();
      int count = 0;
      for( ; pdi != posd.end(); pdi++)
	{
	  int posInt = (*pdi).first;
	  int c = (*pdi).second;
	  count += c;
	  float p = (float)c/(float)numTerm[posInt];
	  resultsStream << posInt << " " << p << " ";
	}
      resultsStream << "| " << count << "\n";
    }
  ur.setData(path);
  return 1;
}
Ejemplo n.º 3
0
int
main(int argc, char *argv[])
{
   struct rlimit 	core_limits;
   core_limits.rlim_cur = 0;
   core_limits.rlim_max = 0;
   setrlimit( RLIMIT_CORE, &core_limits );

   ECArgs args( argc, argv );
   assert(args.nargs() == 2);
   conditionedType = args.arg(0);
   cerr << "start trainRs: " << conditionedType << endl;

   ECString  path( args.arg( 1 ) );
   if(args.isset('L')) Feature::setLM();

   Term::init(path);
   readHeadInfo(path);

   Pst pst(path);
   if(Feature::isLM) ClassRule::readCRules(path);

   addSubFeatureFns();
   Feature::init(path, conditionedType); 

   whichInt = Feature::whichInt;
   int ceFunInt = Feature::conditionedFeatureInt[Feature::whichInt];
   Feature::conditionedEvent
     = SubFeature::Funs[ceFunInt];

   Feat::Usage = PARSE;
   ECString ftstr(path);
   ftstr += conditionedType;
   ftstr += ".g";
   ifstream fts(ftstr.c_str());
   if(!fts)
     {
       cerr << "Could not find " << ftstr << endl;
       assert(fts);
     }
   tRoot = new FeatureTree(fts); //puts it in root;

   cout.precision(3);
   cerr.precision(3);

   lamInit();

   InputTree* trainingData[1001];
   int usedCount = 0;
   sentenceCount = 0;
   for( ;  ; sentenceCount++)
     {
       if(sentenceCount%10000 == 1)
	 {
	   // cerr << conditionedType << ".tr "
	   //<< sentenceCount << endl;
	 }
       if(usedCount >= 1000) break;
       InputTree*     correct = new InputTree;  
       cin >> (*correct);
       if(correct->length() == 0) break;
       if(!cin) break;
       EcSPairs wtList;
       correct->make(wtList); 
       InputTree* par;
       par = correct;
       trainingData[usedCount++] = par;
     }
   if(Feature::isLM) pickLogBases(trainingData,sentenceCount);
   procGSwitch = true;
   for(pass = 0 ; pass < 10 ; pass++)
     {
       if(pass%2 == 1) cout << "Pass " << pass << endl;
       goThroughSents(trainingData, sentenceCount);
       updateLambdas();
       //printLambdas(cout);
       zeroData();
     }
   ECString resS(path);
   resS += conditionedType;
   resS += ".lambdas";
   ofstream res(resS.c_str());
   res.precision(3);
   printLambdas(res);
   printLambdas(cout);
   cout << "Total params = " << FeatureTree::totParams << endl;
   cout << "Done: " << (int)sbrk(0) << endl;
}
Ejemplo n.º 4
0
int
main(int argc, char *argv[])
{
   struct rlimit 	core_limits;
   core_limits.rlim_cur = 0;
   core_limits.rlim_max = 0;
   setrlimit( RLIMIT_CORE, &core_limits );

   ECArgs args( argc, argv );
   assert(args.nargs() == 2);
   if(args.isset('N')) numGram = atoi(args.value('N').c_str());
   Feature::setLM();
   if(args.isset('L')) Term::Language = args.value('L');
   string  path( args.arg( 1 ) );
   if(Term::Language == "Ch") readHeadInfoCh(path);
   else readHeadInfo(path);

   string  conditionedType( args.arg(0) );
   cerr << "start kn3Counts " <<  conditionedType << endl;
   int minCount = 1;
   if(args.isset('m')) minCount = atoi(args.value('m').c_str());
   Feat::Usage = KNCOUNTS;
   FeatureTree::minCount = minCount;

   Term::init(path);
   readHeadInfo(path);
   Pst pst(path);
   addSubFeatureFns();

   Feature::assignCalc(conditionedType);
       
   FeatureTree::root() = new FeatureTree();
   Feature::init(path, conditionedType);
   int wI = Feature::whichInt;
   int ceFunInt = Feature::conditionedFeatureInt[wI];

   Feature::conditionedEvent
     = SubFeature::Funs[ceFunInt];
   string trainingString( path );

   int sentenceCount = 0;
   for( ; ; sentenceCount++)
     {
       if(sentenceCount%10000 == 1)
	 {
	   cerr << "rCounts "
	     << sentenceCount << endl;
	 }
       InputTree     correct;  
       cin >> correct;
       //if(sentenceCount > 1000) break;
       if(correct.length() == 0) break;
       //cerr <<sentenceCount << correct << endl;
       EcSPairs wtList;
       correct.make(wtList); 
       InputTree* par;
       int strt = 0;
       par = &correct;

       makeSent(par);
       curS = par;
       gatherFfCounts(par, 0);
       if(wI == TTCALC || wI == WWCALC)
	 {
	   list<InputTree*> dummy2;
	   InputTree stopInputTree(par->finish(),par->finish(),
				   wI==TTCALC ? "" : "^^",
				   "STOP","",
				   dummy2,NULL,NULL);
	   stopInputTree.headTree() = &stopInputTree;
	   TreeHist treeh(&stopInputTree,0);
	   treeh.hpos = 0;
	   callProcG(&treeh);
	 }
     }
   finalProbComputation();
   string resS(path);
   resS += conditionedType;
   resS += ".g";
   ofstream res(resS.c_str());
   assert(res);
   FTreeMap& fts = FeatureTree::root()->subtree;
   FTreeMap::iterator fti = fts.begin();
   for( ; fti != fts.end() ; fti++)
     {
       int asVal = (*fti).first;
       (*fti).second->printFTree(asVal, res);
     }
   res.close();
   cout << "Tot words: " << totWords << endl;
   cout << "Total params for " << conditionedType << " = "
	<< FeatureTree::totParams << endl;
}
Ejemplo n.º 5
0
int
main(int argc, char *argv[])
{
  ECArgs args( argc, argv );
  assert(args.nargs() == 1);
  ECString path(args.arg(0));
  cerr << "At start of pSfgt" << endl;

  for(int n = 0 ; n < 140 ; n++)
    numTerm[n] = 0;

  ECString resultsString(path);
  resultsString += "endings.txt";

  Term::init( path );
  if(args.isset('L')) Term::Language = args.value('L');
  readHeadInfo(path);
  Pst pst(path); //???;

  int sentenceCount = 0;
  int wordCount = 0;
  int processedCount = 0;

  /*int i, j;
  for(i = 0 ; i < 60 ; i++)
    for(j = 0 ; j < 30 ; j++)
      data[i][j] = 0;
  */
  int i = 0;
  while(cin)
    {
      if(i++%5000 == 1) cerr << i << endl;
      InputTree  parse;
      cin >> parse;
      if(!cin) break;
      if(parse.length() == 0 && cin) continue;
      if(parse.length()==0 ||!cin) break;
      addWwData(&parse);
      processedCount++;
      wordCount += parse.length();
    }
  ofstream     resultsStream(resultsString.c_str());
  assert(resultsStream);
  /*int  totNt[30];
  for(i = 0 ; i < 30 ; i++) totNt[i] = 0;
  for(i = 0 ; i <= Term::lastTagInt() ; i++)
    {
      for(j = 0 ; j < (Term::lastNTInt() - Term::lastTagInt()) ; j++)
	totNt[j] += data[i][j];
    }
    */
  resultsStream << numEndings << "\n";

  for(i = 0 ; i < 140 ; i++)
    {
      endMap::iterator emi = endData[i].begin();
      for( ; emi != endData[i].end() ; emi++)
	{
	  ECString ending = (*emi).first;
	  int cnt = (*emi).second;
	  resultsStream << i << "\t" << ending << "\t"
			<< (float) cnt / (float) numTerm[i]
			<< endl;
	    //<< "\n";

	}
    }
  cout<<"totol sentence:"<<processedCount<<endl;
  cout<<"total suffix:"<<numEndings<<endl;

  return 0;
}
Ejemplo n.º 6
0
int
main(int argc, char *argv[])
{
  ECArgs args( argc, argv );
  assert(args.nargs() == 1);
  ECString path(args.arg(0));
  cerr << "At start of pTgNt" << endl;

  for(int n = 0 ; n < MAXNUMTS ; n++)
    numTerm[n] = 0;

  ECString resultsString(path);
  resultsString += "endings.txt";

  Term::init( path );  
  if(args.isset('L')) Term::Language = args.value('L');
  readHeadInfo(path);
  Pst pst(path);

  int sentenceCount = 0;
  int wordCount = 0;
  int processedCount = 0;

  int i, j;
  for(i = 0 ; i < MAXNUMTS ; i++)
    for(j = 0 ; j < MAXNUMNTS ; j++)
      data[i][j] = 0;

  i = 0;
  while(cin)
    {
      if(i%10000 == 0) cerr << i << endl;
      //if(i > 1000) break;
      InputTree  parse;
      cin >> parse;
      if(!cin) break;
      if(parse.length() == 0) break;
      const Term* resTerm = addWwData(&parse);
      processedCount++;
      wordCount += parse.length();
      i++;
    }
  ofstream     resultsStream(resultsString.c_str());
  assert(resultsStream);
  int  totNt[MAXNUMTS];
  for(i = 0 ; i < MAXNUMTS ; i++) totNt[i] = 0;
  for(i = 0 ; i <= Term::lastTagInt() ; i++)
    {
      for(j = 0 ; j < (Term::lastNTInt() - Term::lastTagInt()) ; j++)
	totNt[j] += data[i][j];
    }
  resultsStream << numEndings << "\n";
  for(i = 0 ; i < MAXNUMTS ; i++)
    {
      endMap::iterator emi = endData[i].begin();
      for( ; emi != endData[i].end() ; emi++)
	{
	  ECString ending = (*emi).first;
	  int cnt = (*emi).second;
	  resultsStream << i << "\t" << ending << "\t"
			<< (float) cnt / (float) numTerm[i]
			<< endl;
	    //<< "\n";
	}
    }
  return 0;
}
Ejemplo n.º 7
0
int
main(int argc, char *argv[])
{
  ECArgs args( argc, argv );
  ECString path(args.arg(0));
  cerr << "At start of pUgT" << endl;

  Term::init( path );  
  if(args.isset('L')) Term::Language = args.value('L');
  readHeadInfo(path);
  Pst pst(path);

  int sentenceCount = 0;

  int i, j;
  for(i = 0 ; i < MAXNUMTS ; i++)
    {
      posCounts[i] = 0;
      posCapCounts[i] = 0;
      posDenoms[i] = 0;
      posUCounts[i] = 0;
      posDashCounts[i] = 0;
    }
  for(i = 0 ; i < MAXNUMTS ; i++) totCounts[i] = 0;

  i = 0;
  for( ; ; )
    {
      if(i++%10000 == 1) cerr << i << endl;
      //if(i > 1000) break;
      InputTree  parse;
      cin >> parse;
      //cerr << parse << endl;
      if(parse.length() == 0) break;
      if(!cin) break;
      curSent = &parse;
      addWwData(&parse);
      sentenceCount++;
    }

  ECString resultsString(path);
  resultsString += "pUgT.txt";
  ofstream     resultsStream(resultsString.c_str());
  assert(resultsStream);
  /* we print out p(unknown|tag)    p(Capital|tag)   p(hasDash|tag, unknown)
     note for Capital the denom is different because we ignore the first
     two words of the sentence */
  int nm = Term::lastTagInt()+1;
  for(i = 0 ; i < nm ; i++)
    {
      resultsStream << i << "\t";
      float pugt = 0;
      float pudenom = (float)posDenoms[i];
      if(pudenom > 0) pugt = (float)posUCounts[i]/pudenom;
      resultsStream << pugt << "\t";
      if(posCounts[i] == 0) resultsStream << 0 << "\t";
      else
	resultsStream << (float) posCapCounts[i]/ (float)posCounts[i] << "\t";
      if(posUCounts[i] == 0) resultsStream << 0;
      else resultsStream << (float)posDashCounts[i]/posUCounts[i] ;
      resultsStream << endl;
    }
  ECString resultsString2(path);
  resultsString2 += "nttCounts.txt";
  ofstream     resultsStream2(resultsString2.c_str());
  assert(resultsStream2);
  for(i = 0 ; i <= Term::lastNTInt() ; i++)
    {
      resultsStream2 << i << "\t";
      resultsStream2 << totCounts[i] << "\n";
    }
  return 0;
}
Ejemplo n.º 8
0
/* the function called by each thread is "mainLoop" */
void*
mainLoop(void* arg)
{
  loopArg *loopA = (loopArg*)arg;
  istream* testSStream = loopA->inpt;
  ostream* pstatStream = loopA->outpt;
  int id = loopA->id;
  double log600 = log2(600.0);
  PrintStack printStack;
  for( ;  ; )
    {
      InputTree     correct;  
      InputTree*    cuse;

      /* first lock to read in the material */
      pthread_mutex_lock(&readlock);
      if( !*testSStream ) {
	pthread_mutex_unlock(&readlock);
	break;
      }
      *testSStream >> correct;
      if( !*testSStream ){
	pthread_mutex_unlock(&readlock);
	break;
      }
      totWords += correct.length()+1;
      int locCount = sentenceCount++;
      list<ECString>  wtList;
      correct.make(wtList);
      SentRep sr( wtList );  // used in precision calc

      ExtPos extPos;
      if(params.extPosIfstream)
	extPos.read(params.extPosIfstream,sr);
      pthread_mutex_unlock(&readlock);

      cuse = &correct;
      int len = correct.length();
      if(len > params.maxSentLen) continue;
      //cerr << "Len = " << len << endl;
      /*
	if( !params.field().in(sentenceCount) )
	{
	sentenceCount++;
	continue;
	}
	if(sentenceCount < -1)
	{
	sentenceCount++;
	continue;
	}
	sentenceCount++;
      */
      vector<ECString> poslist;
      correct.makePosList(poslist);
      ScoreTree sc;
      sc.setEquivInts(poslist);
      MeChart*	chart = new MeChart( sr,extPos,id );
       
      chart->parse( );
      Item* topS = chart->topS();
      if(!topS)
	{
	  cerr << "Parse failed" << endl;
	  cerr << correct << endl;
	  error(" could not parse "); 
	  delete chart;
	  continue;
	}
       
      // compute the outside probabilities on the items so that we can
      // skip doing detailed computations on the really bad ones 

      chart->set_Alphas();

      Bst& bst = chart->findMapParse();
      if( bst.empty()) error( "mapProbs did not return answer");
      float bestF = -1;
      int i;
      int numVersions = 0;
      Link diffs(0);
      //cerr << "Need num diff: " << Bchart::Nth << endl;
      printStruct printS;
      printS.sentenceCount = locCount;
      printS.numDiff = 0;
      for(numVersions = 0 ; ; numVersions++)
	{
	  short pos = 0;
	  Val* val = bst.next(numVersions);
	  if(!val)
	    {
	      //cerr << "Breaking" << endl;
	      break;
	    }
	  InputTree*  mapparse = inputTreeFromBsts(val,pos,sr);
	  bool isU;
	  int dummy = 0;
	  diffs.is_unique(mapparse, isU, dummy);
	  // cerr << "V " << isU << " " << numVersions << *mapparse << endl;
	  if(isU)
	    {
	      printS.probs.push_back(val->prob());
	      printS.trees.push_back(mapparse);
	      printS.numDiff++;
	    }
	  else
	    {
	      delete mapparse;
	    }
	  if(printS.numDiff >= Bchart::Nth) break;
	  if(numVersions > 20000) break;
	}

      ParseStats* locPst = new ParseStats[Bchart::Nth];
      ParseStats bestPs;
      for(i = 0 ; i <printS.numDiff ; i++)
	{
	  InputTree *mapparse = printS.trees[i];
	  assert(mapparse);
	  sc.trips.clear();
	  ParseStats pSt;
	  sc.recordGold(cuse,pSt);
	  sc.precisionRecall(mapparse,pSt);
	  float newF = pSt.fMeasure();
	  cerr << printS.sentenceCount << "\t" << newF << endl;
	  if(newF > bestF)
	    {
	      bestF = newF;
	      bestPs = pSt;
	    }
	  if(histPoints[i])
	    {
	      locPst[i] += bestPs;
	    }
	}
      if(printS.numDiff < Bchart::Nth)
	{
	  for(i = printS.numDiff ; i < Bchart::Nth ; i++)
	    {
	      if(histPoints[i]) locPst[i] += bestPs;
	    }
	}

      pthread_mutex_lock(&scorelock);
      for(i = 0 ; i < Bchart::Nth ; i++) totPst[i]+=locPst[i];
      pthread_mutex_unlock(&scorelock);

      int numPrinted;

      /* put the sentence with which we just finished at the end of the printStack*/
      printStack.push_back(printS);
      PrintStack::iterator psi = printStack.begin();
      /* now look at each item from the front of the print stack
	 to see if it should be printed now */
      pthread_mutex_lock(&writelock);
      for( numPrinted =0; psi != printStack.end(); numPrinted++ )
	{
	  printStruct& pstr=(*psi);
	  if(pstr.sentenceCount != printCount) break;
	  *pstatStream << pstr.sentenceCount << "\t" << pstr.numDiff << "\n";
	  printCount++;
	  for(i = 0 ; i < pstr.numDiff ; i++)
	    {
	      InputTree*  mapparse = pstr.trees[i];
	      assert(mapparse);
	      double logP =log2(pstr.probs[i]);
	      logP -= (sr.length()*log600);
	      *pstatStream <<  logP << "\n";
	      if(Bchart::prettyPrint) *pstatStream << *mapparse << "\n\n";
	      else
		{
		  mapparse->printproper(*pstatStream);
		  *pstatStream << "\n";
		}
	      delete mapparse;
	    }
	  *pstatStream << endl;
	  psi++;
	}
      pthread_mutex_unlock(&writelock);
      for(i = 0 ; i < numPrinted ; i++) printStack.pop_front();
      if(Feature::isLM)
	{
	  double lgram = log2(bst.sum());
	  lgram -= (sr.length()*log600);
	  double pgram = pow(2,lgram);
	  double iptri = chart->triGram();;
	  double ltri = (log2(iptri)-sr.length()*log600);
	  double ptri = pow(2.0,ltri);
	  double pcomb1 = (0.667 * pgram)+(0.333 * ptri);
	  double lcom1 = log2(pcomb1);
	  totGram -= lgram;
	  totTri -= ltri;
	  totMix -= lcom1;
	  if(locCount%10 == 9)
	    {
	      cerr << locCount << "\t";
	      cerr << pow(2.0,totGram/(double)totWords);
	      cerr <<"\t" <<  pow(2.0,totTri/(double)totWords);
	      cerr << "\t" << pow(2.0,totMix/(double)(totWords));
	      cerr << endl;
	    }
	}
      if(locCount%50 == 1)
	{
	  cerr << sentenceCount << "\t";
	  for(int i = 0 ; i < Bchart::Nth ; i++)
	    if(histPoints[i])
	      {
		cerr << i << " " << totPst[i].fMeasure() << "\t";
	      }
	  cerr << endl;
	}

      delete chart;
      delete [] locPst;
    }
  return 0;
}