void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule) { // print alignment of words for(int ti=startT; ti<=endT; ti++) { WordIndex::const_iterator p = indexT.find(ti); if (p != indexT.end()) { // does word still exist? for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) { int si = m_sentence.alignedToT[ti][i]; std::string sourceSymbolIndex = IntToString(indexS.find(si)->second); std::string targetSymbolIndex = IntToString(p->second); rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " "; if (! m_options.onlyDirectFlag) rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " "; } } } // print alignment of non terminals HoleList::const_iterator iterHole; for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) { const Hole &hole = *iterHole; std::string sourceSymbolIndex = IntToString(hole.GetPos(0)); std::string targetSymbolIndex = IntToString(hole.GetPos(1)); rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " "; if (!m_options.onlyDirectFlag) rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " "; } rule.alignment.erase(rule.alignment.size()-1); if (!m_options.onlyDirectFlag) { rule.alignmentInv.erase(rule.alignmentInv.size()-1); } }
// Breadth-first crawl starting at start_url: download each queued page,
// index its non-stop words by URL, enqueue every outgoing link, and finally
// write the accumulated index/history to an XML file via XMLGenerator.
void WebCrawler::CrawlWeb(){
  PageDownloader downloader;
  WordIndex index;
  HTMLParser parser(start_url);
  StopWords stop_words;
  PageHistory history;
  PageQueue queue;
  // generator holds references to history and queue so it can serialize
  // their final contents in writeFile() below
  XMLGenerator generator(history, queue);
  string word, page_string;
  //create new page and place in queue and history
  // NOTE(review): Page objects are allocated with new and never deleted in
  // this function — presumably history (or queue) takes ownership and frees
  // them; confirm, otherwise every crawled page leaks.
  Page* page = new Page(start_url);
  queue.push(page);
  history.push(page);
  // load the stop-word list from the configured file before crawling
  stop_words.getWords(stop_file);
  while(!queue.empty()){
    //pop page from queue
    page = queue.pop();
    //Download page
    page_string = downloader.download(page);
    //Parse string returned from downloader
    parser.setNewPageString(page_string);
    //Determine if this is a valid page
    if(!parser.isHTML()){
      // drop non-HTML pages from the history so they are not reported
      // NOTE(review): the popped Page is not freed here — confirm
      // history.pop() releases it, or this leaks per bad page.
      history.pop(page);
      continue;
    }
    //Grab the description and set to page
    page->setDescription(parser.getDescription());
    //Go through the html document and index words
    while(!parser.empty()){
      word = parser.getWord();
      if(!stop_words.isStopWord(word)){
        index.push(word, page->getURL());
      }
    }
    //Get links from html, create new page and push on queue and history
    // NOTE(review): no visited-check is done here — presumably
    // queue.push()/history.push() deduplicate already-seen URLs; verify,
    // otherwise cyclic links would crawl forever.
    while(parser.existNextLink()){
      page = new Page(parser.getLink());
      queue.push(page);
      history.push(page);
    }
  }
  // emit the XML report of everything crawled
  generator.writeFile();
}
// Self-test for WordIndex: inserts the same 30-word list under each of three
// URLs, then checks that looking up "a" yields an Occurrence for the second
// URL with the expected per-page count.  Returns true iff all TEST checks pass.
bool WordIndex::Test (ostream & os) {
  bool success = true;

  const int PAGES = 3;
  const int WORDS = 30;

  // Three distinct page URLs to index under.
  string urlStrs[PAGES] = {
    "http://www.google.com/index.html"
    , "file:///home/file.txt"
    , "http://www.msn.com/sports.html"
  };
  URL urls[PAGES] = {
    URL(urlStrs[0])
    , URL(urlStrs[1])
    , URL(urlStrs[2])
  };

  // Word list with deliberate repeats; "a" appears exactly 5 times.
  Word words[WORDS] = {
    "and", "the", "a", "wood", "couch", "potato", "Henry", "the", "a", "and"
    , "a", "house", "dog", "wood", "couch", "frisbee", "green", "then", "why", "how"
    , "a", "a", "yes", "no", "maybe", "Henry", "the", "frisbee", "green", "couch"
  };

  // Index every word of the list once per page, in page order.
  WordIndex idx;
  for (int page = 0; page < PAGES; page++)
    for (int w = 0; w < WORDS; w++)
      idx.Insert(words[w], urls[page]);

  // "a" must have an occurrence record for the second URL …
  OccurrenceSet occurrences = idx.GetValue("a");
  BSTNode<Occurrence>* found = occurrences.Find(Occurrence(urls[1]));
  TEST (NULL != found);

  // … whose URL matches and whose count is the 5 insertions made above.
  Occurrence occ = found->GetValue();
  TEST(occ.getURL().getFullURL() == urls[1].getFullURL());
  TEST(occ.getCount() == 5);

  return success;
}
void Tests::testWordIndex() { qDebug() << "===== testWordIndex() ====="; WordIndex<int> index; int arbre = 1; int arbalete = 2; int ar = 2; int arbuste = 3; index.addItem("arbre", arbre); index.addItem("arbalete", arbalete); QList<int> result0 = WordIndex<int>::resultToList(index.search("arime")); QVERIFY(result0.size() == 0); index.addItem("ar", ar); index.addItem("arbuste", arbuste); qDebug() << index.toStringLog(); QList<int> result1 = WordIndex<int>::resultToList(index.search("ar")); QVERIFY(result1.size() == 1); QVERIFY(result1.contains(ar)); QList<int> result2 = WordIndex<int>::resultToList(index.search("arb")); QVERIFY(result2.size() == 3); QVERIFY(result2.contains(arbre)); QVERIFY(result2.contains(arbalete)); QVERIFY(result2.contains(arbuste)); QList<int> result3 = WordIndex<int>::resultToList(index.search("arbr")); QVERIFY(result3.size() == 1); QVERIFY(result3.contains(arbre)); QList<int> result4 = WordIndex<int>::resultToList(index.search("arbre")); QVERIFY(result4.size() == 1); QVERIFY(result4.contains(arbre)); QList<int> result5 = WordIndex<int>::resultToList(index.search("arbres")); QVERIFY(result5.size() == 0); index.rmItem("arbuste", arbuste); QList<int> result6 = WordIndex<int>::resultToList(index.search("arb")); QVERIFY(result6.size() == 2); QVERIFY(result6.contains(arbre)); QVERIFY(result6.contains(arbalete)); QList<int> result7 = WordIndex<int>::resultToList(index.search("arbuste")); QVERIFY(result7.size() == 0); index.rmItem("arbalete", arbalete); QList<int> result8 = WordIndex<int>::resultToList(index.search("arb")); QVERIFY(result8.size() == 1); QVERIFY(result8.contains(arbre)); QList<int> result9 = WordIndex<int>::resultToList(index.search("arbalete")); QVERIFY(result9.size() == 0); index.rmItem("arbre", arbre); QList<int> result10 = WordIndex<int>::resultToList(index.search("arb")); QVERIFY(result10.size() == 0); }
// Train the averaged-perceptron model on the sentences produced by
// sentenceReader and serialize the result to modelFile.
// Two passes over the event stream:
//   1) collect all events, building predIndex (predicate name -> PID) for
//      predicates whose occurrence count reaches config.featureCutoff;
//   2) convert each event into a classifier Case (vector of retained PIDs
//      plus a class id), dropping events with no retained features, then
//      train and dump labels, predicates, parameters and global info.
void ApParser::train(SentenceReader* sentenceReader, char const* modelFile)
{
  WordIndex labelIndex;          // class label -> ClassID
  vector<string> labels;         // ClassID -> label text, dumped to the model
  vector<string> predLabels;     // retained predicate names, in PID order
  // collect events
  list<Tanl::Classifier::Event*> events;
  WordCounts predCount; // count predicate occurrences
  int evCount = 0;
  Tanl::Classifier::PID pID = 1; // leave 0 for bias
  // create inverted index of predicate names
  // used to create vector of pIDs
  EventStream eventStream(sentenceReader, &info);
  while (eventStream.hasNext()) {
    Tanl::Classifier::Event* ev = eventStream.next();
    events.push_back(ev);
    evCount++; // count them explicitly, since size() is costly
    if (config.verbose) {
      // progress ticker: '.' per 1000 events, '+' per 10000
      if (evCount % 10000 == 0)
        cerr << '+' << flush;
      else if (evCount % 1000 == 0)
        cerr << '.' << flush;
    }
    vector<string>& ec = ev->features; // ec = {p1, ... , pn}
    for (unsigned j = 0; j < ec.size(); j++) {
      string& pred = ec[j];
      // decide whether to retain it (# occurrences > cutoff)
      if (predIndex.find(pred.c_str()) == predIndex.end()) {
        // not yet among those retained
        WordCounts::iterator wcit = predCount.find(pred);
        // increment # of occurrences
        int count;
        if (wcit == predCount.end())
          count = predCount[pred] = 1;
        else
          count = ++wcit->second;
        if (count >= config.featureCutoff) {
          predLabels.push_back(pred); // accept it into predLabels
          predIndex[pred.c_str()] = pID++;
          // once retained, the interim count is no longer needed
          predCount.erase(pred);
        }
      }
    }
  }
  if (config.verbose)
    cerr << endl;
  // build cases
  Cases cases;
  cases.reserve(evCount);
  int n = 0;
  Tanl::Classifier::ClassID oID = 0;
  while (!events.empty()) {
    Tanl::Classifier::Event* ev = events.front();
    events.pop_front();
    cases.push_back(Case());
    X& x = cases[n].first; // features
    // add features
    vector<string>& ec = ev->features; // ec = {p1, ... , pn}
    char const* c = ev->className.c_str();
    for (unsigned j = 0; j < ec.size(); j++) {
      string& pred = ec[j];
      // keep only predicates that survived the frequency cutoff
      WordIndex::const_iterator pit = predIndex.find(pred.c_str());
      if (pit != predIndex.end()) {
        x.push_back(pit->second);
      }
    }
    // events with no retained features are skipped (n is not advanced,
    // so the Case pushed above is reused for the next event and the
    // surplus is trimmed by cases.resize(n) below)
    if (x.size()) {
      if (labelIndex.find(c) == labelIndex.end()) {
        // first time this class label is seen: assign the next ClassID
        labelIndex[c] = oID++;
        labels.push_back(c);
      }
      cases[n].second = labelIndex[c];
      n++;
      if (config.verbose) {
        if (n % 10000 == 0)
          cerr << '+' << flush;
        else if (n % 1000 == 0)
          cerr << '.' << flush;
      }
      x.push_back(0); // bias
    }
    delete ev;
  }
  cases.resize(n);
  if (config.verbose)
    cerr << endl;
  int predSize = predLabels.size();
  predSize++; // bias
  APSV ap(labels.size(), predSize);
  ofstream ofs(modelFile, ios::binary | ios::trunc);
  // dump configuration settings
  config.writeHeader(ofs);
  // dump labels
  ofs << labels.size() << endl;
  FOR_EACH (vector<string>, labels, pit)
    ofs << *pit << endl;
  // dump predLabels
  ofs << predLabels.size() << endl;
  FOR_EACH (vector<string>, predLabels, pit)
    ofs << *pit << endl;
  // free memory
  predIndex.clear();
  WordIndex().swap(predIndex); // STL map do not deallocate. resize(0) has no effect
  labelIndex.clear();
  WordIndex().swap(labelIndex);
  // clear memory for unfrequent entities
  info.clearRareEntities();
  // perform training
  ap.train(cases, iter);
  // dump parameters
  ap.save(ofs);
  // dump global info
  info.save(ofs);
}
// compare two WordIndex, used by sort function bool comp(WordIndex wi1, WordIndex wi2) { return wi1.getWord() < wi2.getWord(); }