/*
 * Finds an index entry for `word` in part of speech `pos`.
 *
 * The word is looked up as given first. If getindex() has no entry for it,
 * each candidate form produced by morphstr() is tried in turn (the first call
 * passes `word`, later calls pass NULL to continue the same enumeration) and
 * the first candidate that getindex() accepts wins.
 *
 * Returns the entry found, or NULL when neither the word nor any candidate
 * form has one.
 */
IndexPtr GetValidIndexPointer(char *word, int pos)
{
    IndexPtr entry = getindex(word, pos);
    if (entry != NULL)
        return entry;

    for (char *candidate = morphstr(word, pos);
         candidate != NULL;
         candidate = morphstr(NULL, pos))
    {
        entry = getindex(candidate, pos);
        if (entry != NULL)
            return entry;
    }

    /* Nothing matched: entry is still NULL here. */
    return entry;
}
void morphMe(char *str) { char *temp; if(morphinit()<0) printf("\nError, cant init morphy"); //wninit(); printf("\n%s", morphword(str, NOUN)); while (temp != NULL) { temp=morphstr(str, NOUN); printf("\n%s", temp); } re_morphinit(); }
vector<PopWord> TopTensSearch::parsePage(string buffer, string word) { vector<PopWord> results; set<PopWord*, DereferenceLess> unique; // boost::smatch what; // boost::regex expr(regex); //parse the XML document rapidxml::xml_document<> doc; doc.parse<0>((char*)buffer.c_str()); rapidxml::xml_node<> *node = doc.first_node(); //Filter the titles string exp(".*st "+word+"s?( (?!(in|about|from)).*|$)|.(?!\\w+s)* "+word+"s( (?!.*(in|about|from)).*|$)|.* "+word+"s? ((?!color)|(\\w+))s( (?!.*(in|about|from)).*|$)"); boost::regex e(exp, boost::regex::icase); wninit(); for (rapidxml::xml_node<> *list = node->first_node(); list; list = list->next_sibling()) { //The list has: // * title // * link // * items // printf("Title: %s\n", list->first_node("title")->value()); const string test = list->first_node("title")->value(); //Determine if the title is any good if(boost::regex_match(test, e)) { int count = 0; for (rapidxml::xml_node<> *attr = list->first_node("items")->first_node(); attr && count < 5; attr = attr->next_sibling()) { std::string item = (std::string) attr->value(); boost::regex re(" (in |from |of |- |\\(|:)", boost::regbase::icase); boost::sregex_token_iterator i(item.begin(), item.end(), re, -1); boost::sregex_token_iterator j; std::string found = *i++; if (i!=j) { boost::replace_all(found, " ", "_"); //PopWord* pop = new PopWord(found, test, *i); set<PopWord*>::iterator it = unique.insert(unique.begin(), new PopWord(found, test, *i)); // if (it != unique.end()) // { // cout << (*it)->getCount(); (*it)->incrementCount(); // cout << "\t1 |" << (*it)->getStem() << "|\t" << *i << " " << (*it)->getCount() << endl; // } // else // unique.insert(pop); } else { boost::replace_all(item, " ", "_"); bool inWN = false; for (int j=1; j <= 4; j++) { //If the word is defined for the POS if (is_defined((char*)item.c_str(), j) != 0) { set<PopWord*>::iterator it = unique.insert(unique.begin(), new PopWord(item, test)); (*it)->incrementCount(); inWN=true; // cout << "\t\t2 " << 
item << endl; break; } else { char* morph = morphstr((char*)item.c_str(),j); if (morph != NULL) { set<PopWord*>::iterator it = unique.insert(unique.begin(), new PopWord(item, test)); (*it)->incrementCount(); inWN=true; // cout << "\t\t3 " << item << endl; break; } } } // if(!inWN) // std::cout << "Not used\t\t4 "<<item << std::endl; } } } } //Iterate through the unique set. Check how apparent the concept is in wikipedia int maxCount = 0; set<PopWord*>::iterator it; for (it=unique.begin(); it !=unique.end(); it++) { PopWord ttr = *(*it); // cout << ttr.getWord() << endl; //int count = -1; if (false) { string buffer = requestURL("http://en.wikipedia.org/w/api.php?format=json&action=parse&page="+ttr.getStem()+"&prop=links&redirects"); boost::regex re("[\"\\s(:]" + word + "s?[)\"\\s]", boost::regbase::icase); boost::sregex_token_iterator i(buffer.begin(), buffer.end(), re, -1); boost::sregex_token_iterator j; // cout << "-->\t" << buffer << endl; while (i != j) { // cout << "\t" << *i++ << endl; ttr.incrementCount(); } // cout << count << endl << endl; } // ttr.setCount(count); if (ttr.getCount() > maxCount) maxCount=ttr.getCount(); results.push_back(ttr); } for (int i =0; i < results.size(); i++) { results[i].setCount(results[i].getCount()/maxCount); } return results; }