// Append the word- and non-terminal-alignment strings ("src-tgt src-tgt ...")
// for the rule spanning [startT,endT] x [startS,endS] to rule.alignment (and,
// unless onlyDirectFlag is set, the inverse "tgt-src" form to
// rule.alignmentInv). startS/endS are part of the established interface but
// unused here.
void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
                                      , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule)
{
  // print alignment of words
  for(int ti=startT; ti<=endT; ti++) {
    WordIndex::const_iterator p = indexT.find(ti);
    if (p != indexT.end()) { // does word still exist?
      for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
        int si = m_sentence.alignedToT[ti][i];
        // NOTE(review): assumes every aligned source position si is present in
        // indexS — the find() result is dereferenced unchecked. Confirm the
        // caller guarantees this invariant.
        std::string sourceSymbolIndex = IntToString(indexS.find(si)->second);
        std::string targetSymbolIndex = IntToString(p->second);
        rule.alignment      += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
        if (! m_options.onlyDirectFlag)
          rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
      }
    }
  }

  // print alignment of non terminals
  HoleList::const_iterator iterHole;
  for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) {
    const Hole &hole = *iterHole;

    std::string sourceSymbolIndex = IntToString(hole.GetPos(0));
    std::string targetSymbolIndex = IntToString(hole.GetPos(1));
    rule.alignment      += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
    if (!m_options.onlyDirectFlag)
      rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
  }

  // Strip the trailing space. BUGFIX: guard against empty strings — when no
  // word alignments and no holes were emitted, erase(size()-1) becomes
  // erase(std::string::npos), which throws std::out_of_range.
  if (!rule.alignment.empty())
    rule.alignment.erase(rule.alignment.size()-1);
  if (!m_options.onlyDirectFlag && !rule.alignmentInv.empty()) {
    rule.alignmentInv.erase(rule.alignmentInv.size()-1);
  }
}
Beispiel #2
0
void WebCrawler::CrawlWeb(){
	PageDownloader downloader;
	WordIndex index;
	HTMLParser parser(start_url);
	StopWords stop_words;
	PageHistory history;
	PageQueue queue;
	XMLGenerator generator(history, queue);
	string word, page_string;

	//create new page and place in queue and history
	Page* page = new Page(start_url);
	queue.push(page);
	history.push(page);

	stop_words.getWords(stop_file);

	while(!queue.empty()){
		//pop page from queue
		page = queue.pop();
		//Download page
		page_string = downloader.download(page);
		//Parse string returned from downloader
		parser.setNewPageString(page_string);
		//Determine if this is a valid page
		if(!parser.isHTML()){
			history.pop(page);
			continue;
		}
		//Grab the description and set to page
		page->setDescription(parser.getDescription());
		//Go through the html document and index words
		while(!parser.empty()){
			word =parser.getWord();
			if(!stop_words.isStopWord(word)){
				index.push(word, page->getURL());
			}
		}

		//Get links from html, create new page and push on queue and history
		while(parser.existNextLink()){
			page=new Page(parser.getLink());
			queue.push(page);
			history.push(page);
		}

	}

	generator.writeFile();

}
Beispiel #3
0
// Self-test for WordIndex: insert a fixed 30-word list under each of three
// URLs, then verify that the word "a" was recorded for the second URL with
// the expected occurrence count. Returns true iff all TEST checks passed.
bool WordIndex::Test (ostream & os) {
  bool success = true;
  const int kPageCount = 3;
  const int kWordCount = 30;
  string pageUrlStrings[kPageCount] = {
      "http://www.google.com/index.html"
    , "file:///home/file.txt"
    , "http://www.msn.com/sports.html"
  };
  URL pageUrls[kPageCount] = {
      URL(pageUrlStrings[0])
    , URL(pageUrlStrings[1])
    , URL(pageUrlStrings[2])
  };
  Word testWords[kWordCount] = {
      "and", "the", "a", "wood", "couch", "potato", "Henry", "the", "a", "and"
    , "a", "house", "dog", "wood", "couch", "frisbee", "green", "then", "why", "how"
    , "a", "a", "yes", "no", "maybe", "Henry", "the", "frisbee", "green", "couch"
  };

  WordIndex idx;

  // Every word is indexed once per page.
  for (int page = 0; page < kPageCount; page++) {
    for (int w = 0; w < kWordCount; w++) {
      idx.Insert(testWords[w], pageUrls[page]);
    }
  }

  OccurrenceSet occurrences = idx.GetValue("a");

  // "a" must have an occurrence record for the second URL.
  BSTNode<Occurrence>* match = occurrences.Find(Occurrence(pageUrls[1]));
  TEST (NULL != match);

  // The word list contains "a" five times, so the count must be 5.
  Occurrence found = match->GetValue();
  TEST(found.getURL().getFullURL() == pageUrls[1].getFullURL());
  TEST(found.getCount() == 5);

  return success;
}
Beispiel #4
0
void Tests::testWordIndex()
{
   qDebug() << "===== testWordIndex() =====";

   WordIndex<int> index;
   int arbre = 1;
   int arbalete = 2;
   int ar = 2;
   int arbuste = 3;

   index.addItem("arbre", arbre);
   index.addItem("arbalete", arbalete);

   QList<int> result0 = WordIndex<int>::resultToList(index.search("arime"));
   QVERIFY(result0.size() == 0);

   index.addItem("ar", ar);
   index.addItem("arbuste", arbuste);

   qDebug() << index.toStringLog();

   QList<int> result1 = WordIndex<int>::resultToList(index.search("ar"));
   QVERIFY(result1.size() == 1);
   QVERIFY(result1.contains(ar));

   QList<int> result2 = WordIndex<int>::resultToList(index.search("arb"));
   QVERIFY(result2.size() == 3);
   QVERIFY(result2.contains(arbre));
   QVERIFY(result2.contains(arbalete));
   QVERIFY(result2.contains(arbuste));

   QList<int> result3 = WordIndex<int>::resultToList(index.search("arbr"));
   QVERIFY(result3.size() == 1);
   QVERIFY(result3.contains(arbre));

   QList<int> result4 = WordIndex<int>::resultToList(index.search("arbre"));
   QVERIFY(result4.size() == 1);
   QVERIFY(result4.contains(arbre));

   QList<int> result5 = WordIndex<int>::resultToList(index.search("arbres"));
   QVERIFY(result5.size() == 0);

   index.rmItem("arbuste", arbuste);

   QList<int> result6 = WordIndex<int>::resultToList(index.search("arb"));
   QVERIFY(result6.size() == 2);
   QVERIFY(result6.contains(arbre));
   QVERIFY(result6.contains(arbalete));

   QList<int> result7 = WordIndex<int>::resultToList(index.search("arbuste"));
   QVERIFY(result7.size() == 0);

   index.rmItem("arbalete", arbalete);

   QList<int> result8 = WordIndex<int>::resultToList(index.search("arb"));
   QVERIFY(result8.size() == 1);
   QVERIFY(result8.contains(arbre));

   QList<int> result9 = WordIndex<int>::resultToList(index.search("arbalete"));
   QVERIFY(result9.size() == 0);

   index.rmItem("arbre", arbre);

   QList<int> result10 = WordIndex<int>::resultToList(index.search("arb"));
   QVERIFY(result10.size() == 0);
}
Beispiel #5
0
// Train an averaged-perceptron classifier from the events produced by
// sentenceReader, and write the full model to modelFile: config header,
// class labels, retained predicate names, learned parameters, global info.
// Predicates (features) are only retained once they have been seen at least
// config.featureCutoff times; rarer ones are dropped.
void ApParser::train(SentenceReader* sentenceReader, char const* modelFile)
{
  WordIndex		labelIndex;
  vector<string>	labels;

  vector<string>	predLabels;

  // collect events
  list<Tanl::Classifier::Event*>	events;

  WordCounts		predCount; // count predicate occurrences
  int evCount = 0;
  Tanl::Classifier::PID pID = 1;		// leave 0 for bias
  // create inverted index of predicate names
  // used to create vector of pIDs
  EventStream eventStream(sentenceReader, &info);
  while (eventStream.hasNext()) {
    Tanl::Classifier::Event* ev = eventStream.next();
    events.push_back(ev);
    evCount++;		      // count them explicitly, since size() is costly
    // progress ticks: '.' per 1000 events, '+' per 10000
    if (config.verbose) {
      if (evCount % 10000 == 0)
	cerr << '+' << flush;
      else if (evCount % 1000 == 0)
	cerr << '.' << flush;
    }
    vector<string>& ec = ev->features; // ec = {p1, ... , pn}
    for (unsigned j = 0; j < ec.size(); j++) {
      string& pred = ec[j];
      // decide whether to retain it (# occurrences > cutoff)
      if (predIndex.find(pred.c_str()) == predIndex.end()) {
	// not yet among those retained
	WordCounts::iterator wcit = predCount.find(pred);
	// increment # of occurrences
	int count;
	if (wcit == predCount.end())
	  count = predCount[pred] = 1;
	else
	  count = ++wcit->second;
	if (count >= config.featureCutoff) {
	  // promote: assign the next pID and stop counting this predicate
	  predLabels.push_back(pred); // accept it into predLabels
	  predIndex[pred.c_str()] = pID++;
	  predCount.erase(pred);
	}
      }
    }
  }
  if (config.verbose)
    cerr << endl;

  // build cases: map each event to (feature-pID vector, class ID),
  // consuming (and freeing) the event list as we go
  Cases cases;
  cases.reserve(evCount);
  int n = 0;
  Tanl::Classifier::ClassID oID = 0;
  while (!events.empty()) {
    Tanl::Classifier::Event* ev = events.front();
    events.pop_front();
    cases.push_back(Case());
    X& x = cases[n].first;	// features
    // add features
    vector<string>& ec = ev->features; // ec = {p1, ... , pn}
    char const* c = ev->className.c_str();
    for (unsigned j = 0; j < ec.size(); j++) {
      string& pred = ec[j];
      // only retained (above-cutoff) predicates contribute to the case
      WordIndex::const_iterator pit = predIndex.find(pred.c_str());
      if (pit != predIndex.end()) {
	x.push_back(pit->second);
      }
    }
    // a case with no retained features is discarded (slot n is reused)
    if (x.size()) {
      // class IDs are assigned in first-seen order
      if (labelIndex.find(c) == labelIndex.end()) {
	labelIndex[c] = oID++;
	labels.push_back(c);
      }
      cases[n].second = labelIndex[c];
      n++;
      if (config.verbose) {
	if (n % 10000 == 0)
	  cerr << '+' << flush;
	else if (n % 1000 == 0)
	  cerr << '.' << flush;
      }
      x.push_back(0);		// bias
    }
    delete ev;
  }
  cases.resize(n);
  if (config.verbose)
    cerr << endl;

  int predSize = predLabels.size();
  predSize++;			// bias
  APSV ap(labels.size(), predSize);
  
  ofstream ofs(modelFile, ios::binary | ios::trunc);
  // dump configuration settings
  config.writeHeader(ofs);
  // dump labels
  ofs << labels.size() << endl;
  FOR_EACH (vector<string>, labels, pit)
    ofs << *pit << endl;
  // dump predLabels
  ofs << predLabels.size() << endl;
  FOR_EACH (vector<string>, predLabels, pit)
    ofs << *pit << endl;
  // free memory
  predIndex.clear();
  WordIndex().swap(predIndex); // STL map do not deallocate. resize(0) has no effect
  labelIndex.clear();
  WordIndex().swap(labelIndex);
  // clear memory for unfrequent entities
  info.clearRareEntities();
  // perform training
  ap.train(cases, iter);
  // dump parameters
  ap.save(ofs);
  // dump global info
  info.save(ofs);
}
Beispiel #6
0
// compare two WordIndex, used by sort function
bool comp(WordIndex wi1, WordIndex wi2) {
	return wi1.getWord() < wi2.getWord();
}