void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
                                      , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule)
{
  // print alignment of words
  for(int ti=startT; ti<=endT; ti++) {
    WordIndex::const_iterator p = indexT.find(ti);
    if (p != indexT.end()) { // does word still exist?
      for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
        int si = m_sentence.alignedToT[ti][i];
        std::string sourceSymbolIndex = IntToString(indexS.find(si)->second);
        std::string targetSymbolIndex = IntToString(p->second);
        rule.alignment      += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
        if (! m_options.onlyDirectFlag)
          rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
      }
    }
  }

  // print alignment of non terminals
  HoleList::const_iterator iterHole;
  for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) {
    const Hole &hole = *iterHole;

    std::string sourceSymbolIndex = IntToString(hole.GetPos(0));
    std::string targetSymbolIndex = IntToString(hole.GetPos(1));
    rule.alignment      += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
    if (!m_options.onlyDirectFlag)
      rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
  }

  rule.alignment.erase(rule.alignment.size()-1);
  if (!m_options.onlyDirectFlag) {
    rule.alignmentInv.erase(rule.alignmentInv.size()-1);
  }
}
Example #2
0
/**
 * Build a model from the events produced by an EventStream over
 * sentenceReader and write it to modelFile.
 *
 * Steps, in order:
 *  1. read every event, counting predicate occurrences and assigning an ID
 *     (in predIndex) to each predicate once its count reaches
 *     config.featureCutoff;
 *  2. turn each event into a case made of the IDs of its retained
 *     predicates plus a bias feature 0, dropping events with no retained
 *     predicate, and delete the event;
 *  3. write the configuration header, the labels and the predicate names;
 *  4. release the indexes, call ap.train(cases, iter), then save the
 *     parameters and the global info to the same stream.
 *
 * @param sentenceReader source of sentences handed to the EventStream
 * @param modelFile      path of the model file, opened truncated in binary mode
 */
void ApParser::train(SentenceReader* sentenceReader, char const* modelFile)
{
  WordIndex       labelIndex;   // class name -> class ID
  vector<string>  labels;       // class names, in order of first appearance

  vector<string>  predLabels;   // retained predicate names, in ID order

  // ---- pass 1: collect events and select predicates ----
  list<Tanl::Classifier::Event*> events;

  WordCounts      predCount;    // occurrences of predicates not yet retained
  int evCount = 0;
  Tanl::Classifier::PID nextPID = 1;    // 0 is left for the bias
  EventStream eventStream(sentenceReader, &info);
  while (eventStream.hasNext()) {
    Tanl::Classifier::Event* event = eventStream.next();
    events.push_back(event);
    ++evCount;                  // counted explicitly, since size() is costly
    // progress: '+' every 10000 events, '.' every other 1000
    if (config.verbose && evCount % 1000 == 0)
      cerr << (evCount % 10000 == 0 ? '+' : '.') << flush;

    vector<string>& features = event->features; // {p1, ... , pn}
    for (unsigned k = 0; k < features.size(); ++k) {
      string& name = features[k];
      if (predIndex.find(name.c_str()) != predIndex.end())
        continue;               // already retained
      // bump the occurrence count of this candidate
      WordCounts::iterator seen = predCount.find(name);
      int occurrences;
      if (seen != predCount.end())
        occurrences = ++seen->second;
      else
        occurrences = predCount[name] = 1;
      if (occurrences < config.featureCutoff)
        continue;               // still below the cutoff
      // promote it: give it an ID and stop counting it
      predLabels.push_back(name);
      predIndex[name.c_str()] = nextPID++;
      predCount.erase(name);
    }
  }
  if (config.verbose)
    cerr << endl;

  // ---- pass 2: build cases, consuming the events ----
  Cases cases;
  cases.reserve(evCount);
  int kept = 0;                 // number of cases actually retained
  Tanl::Classifier::ClassID nextClass = 0;
  while (!events.empty()) {
    Tanl::Classifier::Event* event = events.front();
    events.pop_front();
    cases.push_back(Case());
    // slot `kept` is reused until an event yields at least one feature
    X& active = cases[kept].first;
    vector<string>& features = event->features;
    char const* label = event->className.c_str();
    for (unsigned k = 0; k < features.size(); ++k) {
      WordIndex::const_iterator hit = predIndex.find(features[k].c_str());
      if (hit != predIndex.end())
        active.push_back(hit->second);
    }
    if (active.size()) {
      if (labelIndex.find(label) == labelIndex.end()) {
        labelIndex[label] = nextClass++;
        labels.push_back(label);
      }
      cases[kept].second = labelIndex[label];
      ++kept;
      if (config.verbose && kept % 1000 == 0)
        cerr << (kept % 10000 == 0 ? '+' : '.') << flush;
      active.push_back(0);      // bias
    }
    delete event;
  }
  cases.resize(kept);           // drop the surplus slots
  if (config.verbose)
    cerr << endl;

  int predSize = predLabels.size() + 1;   // + 1 for the bias
  APSV ap(labels.size(), predSize);

  // ---- write the model ----
  ofstream ofs(modelFile, ios::binary | ios::trunc);
  // configuration settings
  config.writeHeader(ofs);
  // labels, preceded by their count
  ofs << labels.size() << endl;
  for (vector<string>::iterator lit = labels.begin(); lit != labels.end(); ++lit)
    ofs << *lit << endl;
  // predicate names, preceded by their count
  ofs << predLabels.size() << endl;
  for (vector<string>::iterator plit = predLabels.begin(); plit != predLabels.end(); ++plit)
    ofs << *plit << endl;

  // release the indexes before training; swapping with an empty
  // container is used because clear() alone is not trusted to deallocate
  predIndex.clear();
  WordIndex().swap(predIndex);
  labelIndex.clear();
  WordIndex().swap(labelIndex);
  // clear memory for infrequent entities
  info.clearRareEntities();
  // perform training
  ap.train(cases, iter);
  // dump parameters
  ap.save(ofs);
  // dump global info
  info.save(ofs);
}