vector< boost::tuple<string, string> > Tagger::do_sentence_tagging_map(string line)
{
	vector< boost::tuple<string, string> > tags;

	PERFORM_TOKENIZATION = true; 

	vector<Token> vt;
	tokenize(line, vt, PERFORM_TOKENIZATION);

	if (vt.size() > 990)
	{
		cerr << "warning: the sentence is too long. it has been truncated." << endl;
      		while (vt.size() > 990) vt.pop_back();
    	}

	// convert parantheses
	vector<string> org_strs;
	for (vector<Token>::iterator i = vt.begin(); i != vt.end(); i++)
	{
	       	org_strs.push_back(i->str);
      		i->str = paren_converter.Ptb2Pos(i->str);
      		i->prd = "?";
    	}

    	// tag the words
	vector< map<string, double> > tagp0, tagp1;

	crf_decode_lookahead(vt, crfm, tagp0);
	if (false)
	{
	        assert(0); exit(1);
    	}
	else
	{

      		for (vector<Token>::const_iterator i = vt.begin(); i != vt.end(); i++)
		{
			map<string, double> dummy;
			tagp1.push_back(dummy);
	       	}
    	}

    	// merge the outputs (simple interpolation of probabilities)
    	vector< map<string, double> > tagp; // merged

    	for (size_t i = 0; i < vt.size(); i++)
	{
        	const map<string, double> & crf = tagp0[i];
        	const map<string, double> & ef  = tagp1[i];
        	map<string, double> m, m2; // merged

      		double sum = 0;
      		for (map<string, double>::const_iterator j = crf.begin(); j != crf.end(); j++)
		{
			m.insert(pair<string, double>(j->first, j->second));
			sum += j->second;
	        }

      		for (map<string, double>::const_iterator j = ef.begin(); j != ef.end(); j++)
		{
			sum += j->second;
			if (m.find(j->first) == m.end())
			{
	        		m.insert(pair<string, double>(j->first, j->second));
			}
			else
			{
	  			m[j->first] += j->second;
			}
      		}

      		const double th = PROB_OUTPUT_THRESHOLD * sum;
      		for (map<string, double>::iterator j = m.begin(); j != m.end(); j++)
		{
			if (j->second >= th) m2.insert(*j);
      		}

      		double maxp = -1;
	        string maxtag;
      		for (map<string, double>::iterator j = m2.begin(); j != m2.end(); j++)
		{
			const double p = j->second;

			if (p > maxp) { maxp = p; maxtag = j->first; }
	        }
      		tagp.push_back(m2);
      		vt[i].prd = maxtag;
    	}

    	for (size_t i = 0; i < vt.size(); i++)
	{
		const string s = org_strs[i];
      		const string p = vt[i].prd;

		string term = "";

		int dot_pos = s.find(".");

		if(  dot_pos!= std::string::npos )
		{
			term = s.substr(0,dot_pos);
		}
		else
		{
			term = s;
		}

		tags.push_back( boost::tuple<string, string>( term, p) );
    	}

	return tags;
}
vector<string> Tagger::do_tagging(string ifilename)
{  
	vector<string> nn_vector;

	PERFORM_TOKENIZATION = true; 

	istream *is(&std::cin);	
	ifstream ifile;
	if (ifilename != "")
	{
		ifile.open(ifilename.c_str());
    
		if (!ifile) { cerr << "error: cannot open " << ifilename << endl; exit(1); }
		is = &ifile;
	}

	string line;
	int nlines = 0;
	while (getline(*is, line))
	{
		nlines++;
		vector<Token> vt;
		tokenize(line, vt, PERFORM_TOKENIZATION);

		if (vt.size() > 990)
		{
			cerr << "warning: the sentence is too long. it has been truncated." << endl;
      			while (vt.size() > 990) vt.pop_back();
    		}

		// convert parantheses
		vector<string> org_strs;
		for (vector<Token>::iterator i = vt.begin(); i != vt.end(); i++)
		{
	        	org_strs.push_back(i->str);
      			i->str = paren_converter.Ptb2Pos(i->str);
      			i->prd = "?";
    		}

    		// tag the words
		vector< map<string, double> > tagp0, tagp1;

		crf_decode_lookahead(vt, crfm, tagp0);
		if (false)
		{

		        assert(0); exit(1);
    		}
		else
		{

      			for (vector<Token>::const_iterator i = vt.begin(); i != vt.end(); i++)
			{
				map<string, double> dummy;
				tagp1.push_back(dummy);
	        	}
    		}

    	// merge the outputs (simple interpolation of probabilities)
    	vector< map<string, double> > tagp; // merged

    	for (size_t i = 0; i < vt.size(); i++)
	{
        	const map<string, double> & crf = tagp0[i];
        	const map<string, double> & ef  = tagp1[i];
        	map<string, double> m, m2; // merged

      		double sum = 0;
      		for (map<string, double>::const_iterator j = crf.begin(); j != crf.end(); j++)
		{
			m.insert(pair<string, double>(j->first, j->second));
			sum += j->second;
	        }

      		for (map<string, double>::const_iterator j = ef.begin(); j != ef.end(); j++)
		{
			sum += j->second;
			if (m.find(j->first) == m.end())
			{
	        		m.insert(pair<string, double>(j->first, j->second));
			}
			else
			{
	  			m[j->first] += j->second;
			}
      		}

      		const double th = PROB_OUTPUT_THRESHOLD * sum;
      		for (map<string, double>::iterator j = m.begin(); j != m.end(); j++)
		{
			if (j->second >= th) m2.insert(*j);
      		}

      		double maxp = -1;
	        string maxtag;
      		for (map<string, double>::iterator j = m2.begin(); j != m2.end(); j++)
		{
			const double p = j->second;

			if (p > maxp) { maxp = p; maxtag = j->first; }
	        }
      		tagp.push_back(m2);
      		vt[i].prd = maxtag;
    	}

    	for (size_t i = 0; i < vt.size(); i++)
	{
		const string s = org_strs[i];
      		const string p = vt[i].prd;

		string term = "";

		if( (p == "NN") || (p=="JJ") || (p=="JJR") || (p=="JJS") || (p=="NNS") || (p=="NNP") || (p=="NNPS") )
		{
			int dot_pos = s.find(".");

			if(  dot_pos!= std::string::npos )
			{
				term = s.substr(0,dot_pos);
			}
			else
			{
				term = s;
			}

			nn_vector.push_back(term);
		}

    	}

  }

  return nn_vector;
}
Ejemplo n.º 3
0
int main(int argc, char** argv)
{
  string WORDNET_DIR = "";
  
  string ifilename;

  for (int i = 1; i < argc; i++) {
    string v = argv[i];
    if ( (v == "-m" || v == "--model") && i < argc-1) {
      MODEL_DIR = argv[i+1];
      i++;
      continue;
    }
    //    if ( (v == "-wn" || v == "--wordnet") && i < argc-1) {
    //      WORDNET_DIR = argv[i+1];
    //      i++;
    //      continue;
    //    }
    if (v.substr(0, 8) == "--model=") {
      MODEL_DIR = v.substr(8);
      continue;
    }
    if (v == "-t" || v == "--tokenize") { 
      PERFORM_TOKENIZATION = true; 
      continue; 
    }
    if (v == "-s" || v == "--standoff") { 
      STANDOFF = true;
      continue; 
    }
    if (v == "-u" || v == "--uima") {
      UIMA = true;
      continue;
    }
    if (v == "-e" || v == "--enju") {
      ENJU = true;
      continue;
    }
    if ( (v == "-n" || v == "--nbest") && i < argc-1) {
      NBEST = atoi(argv[i+1]);
      i++;
      continue;
    }
    if (v.substr(0, 8) == "--nbest=") {
      NBEST = atoi(v.substr(8).c_str());
      continue;
    }
    if (v == "-") {
      ifilename = "";
      continue;
    }
    if (v == "-h" || v == "--help")  print_help();
    if (v == "--version")            print_version();

    if (v[0] == '-') {
      cerr << "error: unknown option " << v << endl;
      cerr << "Try `stepp --help' for more information." << endl;
      exit(1);
    }
    ifilename = v;
  }

  if (NBEST) {
    cerr << "error: n-best output is currently not supported" << endl;
    exit(1);
  }

  istream *is(&std::cin);
  ifstream ifile;
  if (ifilename != "") {
    ifile.open(ifilename.c_str());
    if (!ifile) { cerr << "error: cannot open " << ifilename << endl; exit(1); }
    is = &ifile;
  }

  if (MODEL_DIR[MODEL_DIR.size()-1] != '/')  MODEL_DIR += "/";

  CRF_Model crfm;

  if (!ENJU) {
    cerr << "loading the models from the directory \"" << MODEL_DIR << "\" ...";
  }
  if (!crfm.load_from_file(MODEL_DIR + "model.la", ENJU ? false : true)) exit(1);
  if (!ENJU) {
    cerr << "done" << endl;
  }

  //  crfm.save_to_file("test");

  //  push_stop_watch();

  string line;
  int nlines = 0;
  while (getline(*is, line)) {
    nlines++;
    vector<Token> vt;
    tokenize(line, vt, PERFORM_TOKENIZATION);

    if (vt.size() > 990) {
      cerr << "warning: the sentence is too long. it has been truncated." << endl;
      while (vt.size() > 990) vt.pop_back();
    }

    // convert parantheses
    vector<string> org_strs;
    for (vector<Token>::iterator i = vt.begin(); i != vt.end(); i++) {
      org_strs.push_back(i->str);
      i->str = paren_converter.Ptb2Pos(i->str);
      i->prd = "?";
    }

    if (STANDOFF) cout << line << endl;
    if (vt.size() == 0) { cout << endl; continue; }

    // tag the words
    vector< map<string, double> > tagp0, tagp1;
    //    crf_decode_forward_backward(vt, crfm, tagp0);
    crf_decode_lookahead(vt, crfm, tagp0);
    if (false) {
      //      ef_decode_beam(vt, vme, tagp1);
      assert(0); exit(1);
    } else {
      for (vector<Token>::const_iterator i = vt.begin(); i != vt.end(); i++) {
	map<string, double> dummy;
	tagp1.push_back(dummy);
      }
    }

    // merge the outputs (simple interpolation of probabilities)
    vector< map<string, double> > tagp; // merged
    //if(tagp0.size() > 0) {
    for (size_t i = 0; i < vt.size(); i++) {
      const map<string, double> & crf = tagp0[i];
      const map<string, double> & ef  = tagp1[i];
      map<string, double> m, m2; // merged

      double sum = 0;
      for (map<string, double>::const_iterator j = crf.begin(); j != crf.end(); j++) {
	//	cout << j->first << ":" << j->second << " ";
	m.insert(pair<string, double>(j->first, j->second));
	sum += j->second;
      }
      //      cout << endl;

      for (map<string, double>::const_iterator j = ef.begin(); j != ef.end(); j++) {
	//	cout << j->first << ":" << j->second << " ";
	sum += j->second;
	if (m.find(j->first) == m.end()) {
	  m.insert(pair<string, double>(j->first, j->second));
	} else {
	  m[j->first] += j->second;
	}
      }
      //      cout << endl;

      const double th = PROB_OUTPUT_THRESHOLD * sum;
      for (map<string, double>::iterator j = m.begin(); j != m.end(); j++) {
	if (j->second >= th) m2.insert(*j);
      }
      double maxp = -1;
      string maxtag;
      for (map<string, double>::iterator j = m2.begin(); j != m2.end(); j++) {
	const double p = j->second;
	if (p > maxp) { maxp = p; maxtag = j->first; }
	//	cout << j->first << ":" << j->second << " ";
      }
      //      cout << endl;
      tagp.push_back(m2);
      vt[i].prd = maxtag;
    }
    //}
    // print the resutls
    for (size_t i = 0; i < vt.size(); i++) {
      const string s = org_strs[i];
      const string p = vt[i].prd;
      if (STANDOFF || OUTPUT_TAG_PROBS || UIMA || ENJU) {
	if (STANDOFF || UIMA || ENJU) {
	  cout << vt[i].begin << "\t" << vt[i].end;
	  if (!UIMA && !ENJU){
	    cout << "\t";
	  }
	}
	if (!UIMA && !ENJU){
	  cout << s;
	}
	if (OUTPUT_TAG_PROBS) {
	  vector<TagProb> tp;
	  double sum = 0;
	  for (map<string, double>::iterator j = tagp[i].begin(); j != tagp[i].end(); j++) {
	    tp.push_back(TagProb(j->first, j->second));
	    sum += j->second;
	  }
	  sort(tp.begin(), tp.end());
	  for (vector<TagProb>::iterator j = tp.begin(); j != tp.end(); j++) {
	    const double p = j->prob / sum; // normalize
	    if (p == 1) cout << resetiosflags(ios::fixed);
	    else        cout << setiosflags(ios::fixed) << setprecision(3);
	    cout << "\t" << j->tag << "\t" << p;
	  }
	} else {
	  cout << "\t" + p;
          if (ENJU) {
            cout << "\t1";
          }
	}
	if (UIMA){
	  cout << "\t0";
	}
	cout << endl;
      } else {
	if (i == 0) cout << s + "/" + p;
	else        cout << " " + s + "/" + p;
      }
    }
    cout << endl;
    crfm.incr_line_counter();
  }

  //  int msec = push_stop_watch();
  //  cerr << "tagging time = " << msec << " msec" << endl;
  //  cerr << 1000.0 * nlines / msec << " lines / sec" << endl;

}