Пример #1
0
/*
 * Returns the index entry for `word` in part of speech `pos`.
 *
 * The word is looked up as given first. If getindex() finds nothing, each
 * form produced by morphstr() is tried in turn (the first call passes the
 * word, later calls pass NULL to ask for the next form) until one of them
 * has an index entry or morphstr() returns NULL.
 *
 * Returns NULL when neither the word nor any of its forms is found.
 */
IndexPtr GetValidIndexPointer(char *word, int pos)
{
    IndexPtr hit;
    char *form;

    /* Fast path: the word is indexed exactly as written. */
    hit = getindex(word, pos);
    if (hit != NULL)
        return hit;

    /* Otherwise walk the forms morphstr() offers and stop at the first hit. */
    for (form = morphstr(word, pos); form != NULL; form = morphstr(NULL, pos)) {
        hit = getindex(form, pos);
        if (hit != NULL)
            return hit;
    }

    /* Nothing matched; hit is NULL here. */
    return hit;
}
Пример #2
0
/**
 * Prints the noun base forms WordNet's morphology functions find for `str`.
 *
 * Output goes to stdout, each entry preceded by a newline: first the result
 * of morphword(), then every form returned by morphstr().
 *
 * @param str  Word (or collocation) to analyse; passed to the WordNet API
 *             as a non-const char*.
 */
void
morphMe(char *str) {
    // An initialisation failure is reported but, as before, not treated as fatal.
    if(morphinit()<0) printf("\nError, cant init morphy");
    //wninit();

    // morphword() may return NULL; passing NULL to "%s" is undefined behaviour,
    // so only print when a base form was actually found.
    char *base = morphword(str, NOUN);
    if (base != NULL) printf("\n%s", base);

    // The first morphstr() call takes the string; subsequent calls must pass
    // NULL to continue the same search. Re-passing `str` restarts the search
    // each time and the loop never advances. The loop stops on the NULL that
    // signals "no more forms", so NULL is never printed.
    for (char *form = morphstr(str, NOUN); form != NULL; form = morphstr(NULL, NOUN)) {
        printf("\n%s", form);
    }

    re_morphinit();
}
Пример #3
0
vector<PopWord> TopTensSearch::parsePage(string buffer, string word)
{
   vector<PopWord> results;
   set<PopWord*, DereferenceLess> unique;
//   boost::smatch what;
//   boost::regex expr(regex);

   //parse the XML document
   rapidxml::xml_document<> doc;
   doc.parse<0>((char*)buffer.c_str());

   rapidxml::xml_node<> *node = doc.first_node();

   //Filter the titles
   string exp(".*st "+word+"s?( (?!(in|about|from)).*|$)|.(?!\\w+s)* "+word+"s( (?!.*(in|about|from)).*|$)|.* "+word+"s? ((?!color)|(\\w+))s( (?!.*(in|about|from)).*|$)");
   boost::regex e(exp, boost::regex::icase);
   wninit();

   for (rapidxml::xml_node<> *list = node->first_node(); list; list = list->next_sibling())
   {
      //The list has:
      // * title
      // * link
      // * items
//      printf("Title: %s\n", list->first_node("title")->value());
      
      const string test = list->first_node("title")->value();
      //Determine if the title is any good
      if(boost::regex_match(test, e))
      {

         int count = 0;
         for (rapidxml::xml_node<> *attr = list->first_node("items")->first_node(); attr && count < 5; attr = attr->next_sibling())
         {
            std::string item = (std::string) attr->value();
            boost::regex re(" (in |from |of |- |\\(|:)", boost::regbase::icase);
            boost::sregex_token_iterator i(item.begin(), item.end(), re, -1);
            boost::sregex_token_iterator j;
            std::string found = *i++;
            if (i!=j)
            {
               boost::replace_all(found, " ", "_");
               //PopWord* pop = new PopWord(found, test, *i);
               set<PopWord*>::iterator it = unique.insert(unique.begin(), new PopWord(found, test, *i));
//               if (it != unique.end())
//               {
//                  cout << (*it)->getCount();
                  (*it)->incrementCount();
//                  cout << "\t1 |" << (*it)->getStem() << "|\t" << *i << " " << (*it)->getCount() << endl;
//               }
//               else
//                  unique.insert(pop);
            }
            else
            {
               boost::replace_all(item, " ", "_");
               bool inWN = false;

               for (int j=1; j <= 4; j++)
               {
                  //If the word is defined for the POS
                  if (is_defined((char*)item.c_str(), j) != 0)
                  {
                     set<PopWord*>::iterator it = unique.insert(unique.begin(), new PopWord(item, test));
                     (*it)->incrementCount();
                     inWN=true;
//                     cout << "\t\t2 " << item << endl;
                     break;
                  }
                  else 
                  {
                     char* morph = morphstr((char*)item.c_str(),j);
                     if (morph != NULL)
                     {
                        set<PopWord*>::iterator it = unique.insert(unique.begin(), new PopWord(item, test));
                        (*it)->incrementCount();
                        inWN=true;
//                        cout << "\t\t3 " << item << endl;
                        break;
                     }
                  }
               }
//               if(!inWN)
//                  std::cout << "Not used\t\t4 "<<item << std::endl;
            }
         }
      }
   }

   //Iterate through the unique set. Check how apparent the concept is in wikipedia
   int maxCount = 0;
   set<PopWord*>::iterator it;
   for (it=unique.begin(); it !=unique.end(); it++)
   {
      PopWord ttr = *(*it);
//      cout << ttr.getWord() << endl;
      //int count = -1;
      if (false)
      {
      string buffer = requestURL("http://en.wikipedia.org/w/api.php?format=json&action=parse&page="+ttr.getStem()+"&prop=links&redirects");
      boost::regex re("[\"\\s(:]" + word + "s?[)\"\\s]", boost::regbase::icase);
      boost::sregex_token_iterator i(buffer.begin(), buffer.end(), re, -1);
      boost::sregex_token_iterator j;

	      
//      cout << "-->\t" << buffer << endl;
      while (i != j)
      {
//         cout << "\t" << *i++ << endl;
         ttr.incrementCount();
      }
//      cout << count << endl << endl;
      }
     // ttr.setCount(count);
     if (ttr.getCount() > maxCount)
        maxCount=ttr.getCount();
      results.push_back(ttr);
      
   }
   for (int i =0; i < results.size(); i++)
   {
      results[i].setCount(results[i].getCount()/maxCount);
   }
   return results;
}