Beispiel #1
0
/*
 * Produce candidate stems for `words` by rewriting its ending.
 *
 * Every rule whose suffix matches the end of `words` (ignoring case)
 * contributes one candidate: the suffix is chopped off and the rule's
 * replacement is appended.  Candidates are returned in rule order; the
 * list is empty when no rule matches.
 */
QStringList stemWords(const QString &words)
{
    struct Rule {
        const char* suffix;
        const char* stem;
    };
    const Rule rules[] = {
        {"ies",  "y"},
        {"ied",  "y"},
        {"es",  ""},
        {"ting",  "te"},
        {"ing",  ""},
        {"ing",  "e"},
        {"ed",  "e"},
        {"ed",  ""},
        {"id",  "y"},
        {"ices",  "ex"},
        {"ves",  "fe"},
        {"s",  ""},
    };
    const Rule* const rulesEnd = rules + sizeof(rules) / sizeof(rules[0]);

    QStringList candidates;
    for (const Rule* rule = rules; rule != rulesEnd; ++rule) {
        const QString suffix(rule->suffix);
        if (!words.endsWith(suffix, Qt::CaseInsensitive))
            continue;

        // Swap the matched ending for the rule's replacement.
        QString candidate(words);
        candidate.chop(suffix.length());
        candidate.append(QString(rule->stem));
        candidates.append(candidate);
    }
    return candidates;
}
Beispiel #2
0
/* Split `s` on single spaces and normalise each resulting term in place:
 * the term is trimmed with `puncs`, lower-cased and, unless it is empty or
 * a stopword, stemmed. Empty terms and stopwords are cleared to a
 * zero-length sds.
 *
 * `*len` receives the total number of tokens and `*nonstopwords` the number
 * of tokens that survived the stopword check. Returns the token array, or
 * NULL when the split fails. */
static sds *sds_tokenize(sds s, int *len, int *nonstopwords) {
    sds *terms;
    struct stemmer *stmer;
    int idx;

    *nonstopwords = 0;
    terms = sdssplitlen(s, sdslen(s), " ", 1, len);
    if (terms == NULL) return NULL;

    stmer = create_stemmer();
    for (idx = 0; idx < *len; idx++) {
        sds term = sdstrim(terms[idx], puncs);
        int termlen = sdslen(term);
        int last;

        sdstolower(term);
        if (termlen == 0 || rr_stopwords_check(term)) {
            sdsclear(term);
            continue;
        }
        (*nonstopwords)++;

        /* stem() takes and returns a zero-based index of the last char */
        last = stem(stmer, term, termlen - 1);
        if (last < termlen - 1) {
            /* the word got shorter: replace the slot with a trimmed copy */
            sds stemmed = sdsnewlen(term, last + 1);
            sdsfree(term);
            terms[idx] = stemmed;
        }
    }

    free_stemmer(stmer);
    return terms;
}
Beispiel #3
0
/* Python entry point: takes one string argument and returns its stem as a
 * new Python string. Returns NULL with an exception set when the argument
 * cannot be parsed or memory runs out. */
static PyObject* jellyfish_porter_stem(PyObject *self, PyObject *args)
{
    const char *input;
    char *buffer;
    struct stemmer *state;
    PyObject *py_result;
    int last;

    if (!PyArg_ParseTuple(args, "s", &input))
        return NULL;

    state = create_stemmer();
    if (state == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    /* stem() works in place, so operate on a private copy of the input */
    buffer = strdup(input);
    if (buffer == NULL) {
        free_stemmer(state);
        PyErr_NoMemory();
        return NULL;
    }

    /* stem() returns the index of the last character of the stem */
    last = stem(state, buffer, strlen(buffer) - 1);
    buffer[last + 1] = '\0';

    py_result = Py_BuildValue("s", buffer);

    free(buffer);
    free_stemmer(state);

    return py_result;
}
/*------------------------------------------------------------------
 *  gethostbyname()
 *------------------------------------------------------------------*/
RexxRoutine2(int, SockGetHostByName, CSTRING, name, RexxObjectPtr, stemSource)
{
    StemManager stem(context);

    if (!stem.resolveStem(stemSource))
    {
        return 0;
    }
    struct hostent *pHostEnt;

    /*---------------------------------------------------------------
     * call function
     *---------------------------------------------------------------*/
    pHostEnt = gethostbyname(name);
    // set the errno information
    cleanup(context);

    if (!pHostEnt)
    {
        return 0;
    }
    else
    {
        hostEntToStem(context, pHostEnt, stem);
        return 1;
    }
}
Beispiel #5
0
/* Copy `f` to stdout, replacing every run of letters by its lower-cased
 * stem. Non-letter characters are echoed unchanged. Words are collected in
 * the buffer `s`, which is grown by INC bytes whenever it fills up.
 * NOTE(review): the realloc() result is not checked - confirm whether an
 * allocation failure needs handling here. */
void stemfile(struct stemmer * z, FILE * f)
{
	int ch;

	while ((ch = getc(f)) != EOF) {
		int i;

		if (!LETTER(ch)) {
			putchar(ch);
			continue;
		}

		/* gather the whole word, forcing lower case */
		i = 0;
		do {
			if (i == i_max) {
				i_max += INC;
				s = realloc(s, i_max + 1);
			}
			s[i++] = tolower(ch);
			ch = getc(f);
		} while (LETTER(ch));
		ungetc(ch, f);   /* hand the terminating character back */

		/* stem() returns the index of the last character of the stem;
		   use it to zero-terminate the string in s */
		s[stem(z, s, i - 1) + 1] = 0;
		printf("%s", s);
	}
}
Beispiel #6
0
void
file_chooser::on_single_file_toggled ()
{
  std::string name (get_current_name ());
  smatch m;

  if (regex_match (name, m, filename_re))
    {
      if (!single_file_.get_active ()) return;

      set_current_name (m.str (1) + m.str (5));
    }
  else
    {
      if (single_file_.get_active ()) return;

      fs::path path (get_current_name ());
      fs::path stem (path.stem ());
      fs::path ext  (path.extension ());

      path = stem;
      path = path.native () + default_pattern_;
      path.replace_extension (ext);

      set_current_name (path.string ());
    }
}
/*------------------------------------------------------------------------------
 * accept()
 *
 * Accepts a connection on `sock` and, when a stem argument is supplied,
 * writes the peer address to that stem.  Returns the accept() result, or 0
 * if a supplied stem cannot be resolved.
 *
 * @remarks  The sockAddrToStem() function calls both htons() and inet_ntoa().
 *           On Windows, one or both of those functions set errno back to 0.
 *           This prevents the Rexx programmer from ever seeing the errno if
 *           accept fails.  Because of this, we call cleanup() immediately after
 *           the accept call in the belief that the Rexx programmer is more
 *           interested in the result of accept().
 *
 *           The address structure is value-initialized: if accept() fails it
 *           is never filled in, yet it is still handed to sockAddrToStem(),
 *           which would otherwise read indeterminate stack contents.
* ----------------------------------------------------------------------------*/
RexxRoutine2(int, SockAccept, int, sock, OPTIONAL_RexxObjectPtr, stemSource)
{
    sockaddr_in  addr = sockaddr_in();
    socklen_t    nameLen = sizeof(addr);

    int rc = accept(sock, (struct sockaddr *)&addr, &nameLen);

    // set the errno variables
    cleanup(context);

    /*---------------------------------------------------------------
     * set addr, if asked for
     *---------------------------------------------------------------*/
    if (stemSource != NULLOBJECT)
    {
        StemManager stem(context);

        if (!stem.resolveStem(stemSource))
        {
            return 0;
        }
        sockAddrToStem(context, &addr, stem);
    }

    return rc;
}
/*------------------------------------------------------------------
 * connect()
 *
 * Builds a socket address from the stem argument and connects `sock`
 * to it.  Returns the connect() result, or 0 if the stem cannot be
 * resolved.
 *------------------------------------------------------------------*/
RexxRoutine2(int, SockConnect, int, sock, RexxObjectPtr, stemSource)
{
    StemManager stem(context);
    if (!stem.resolveStem(stemSource))
    {
        return 0;
    }

    // get the address out of the stem
    sockaddr_in  addr;
    stemToSockAddr(context, stem, &addr);

    const int rc = connect(sock, (struct sockaddr *)&addr, sizeof(addr));

    // set the errno information
    cleanup(context);

    return rc;
}
Beispiel #9
0
/* OCaml stub: stems the string v_str using the stemmer state referenced by
 * v_stem and returns the stem as a fresh OCaml string.  The state's buffer
 * is reused between calls and grown when the input does not fit. */
CAMLprim value caml_stemmer_porter2_stem(value v_stem, value v_str)
{
  CAMLparam2(v_stem,v_str);
  CAMLlocal1(v_res);

  stemmer_t* state = (stemmer_t*)v_stem;
  size_t last, copied, srclen = caml_string_length(v_str);

  /* make room for the word plus its trailing zero */
  if(srclen >= state->len){
    state->len = srclen + 1;
    state->buf = realloc(state->buf, state->len);
  }

  char *word = state->buf;
  char *src = String_val(v_str);

  /* copy up to the first zero byte, never past the buffer */
  for(copied = 0; src[copied] && (copied < state->len); copied++){
    word[copied] = src[copied];
  }
  if(0 == copied) CAMLreturn(caml_copy_string(""));

  // short words are not lowercased by stem()
  if (copied < 3){
    word[0] = tolower(word[0]);
    word[1] = tolower(word[1]);
  }

  /* stem() reports the index of the last character of the stem */
  last = stem(state->st, word, copied-1);
  word[last+1] = '\0';

  v_res = caml_copy_string(word);
  CAMLreturn(v_res);
}
Beispiel #10
0
/**
 * Reload all avatars.
 */
void AvatarGallery::Reload()
{
	avatars.clear();

	if (!fs::exists(path)) {
		HR_LOG(info) << "Avatar directory does not exist: " <<
			(const char*)Str::PU(path);
		return;
	}

	if (!fs::is_directory(path)) {
		HR_LOG(warning) << "Avatar directory is not a directory: " <<
			(const char*)Str::PU(path);
		return;
	}

	OS::dirIter_t dend;
	for (OS::dirIter_t iter{ path }; iter != dend; ++iter) {
		auto filename = iter->path().filename();
		auto avatarName = filename.stem().string();

		if (filename.extension() != ".png") {
			HR_LOG(debug) << "Ignoring non-avatar: " << filename;
			continue;
		}

		HR_LOG(debug) << "Found avatar: " << avatarName << ": " << filename;
		avatars.emplace(
			avatarName,
			std::make_shared<Display::MediaRes<Display::Texture>>(
				Str::UP("avatars") / filename));
	}
}
/*------------------------------------------------------------------
 *  getsockname()
 *
 *  Resolves the stem argument, calls getsockname() on `sock` and
 *  writes the resulting address to the stem.  Returns the
 *  getsockname() result, or 0 if the stem cannot be resolved.
 *
 *  The address structure is value-initialized: when getsockname()
 *  fails it is not filled in, yet it is still passed to
 *  sockAddrToStem(), which would otherwise read indeterminate
 *  stack contents.
 *------------------------------------------------------------------*/
RexxRoutine2(int, SockGetSockName, int, sock, RexxObjectPtr, stemSource)
{
    StemManager stem(context);

    if (!stem.resolveStem(stemSource))
    {
        return 0;
    }
    sockaddr_in  addr = sockaddr_in();
    socklen_t    nameLen = sizeof(addr);

    /*---------------------------------------------------------------
     * call function
     *---------------------------------------------------------------*/
    int rc = getsockname(sock,(struct sockaddr *)&addr,&nameLen);
    // set the errno information
    cleanup(context);

    /*---------------------------------------------------------------
     * write address to stem
     *---------------------------------------------------------------*/
    sockAddrToStem(context, &addr, stem);

    /*---------------------------------------------------------------
     * set return code
     *---------------------------------------------------------------*/
    return rc;
}
Beispiel #12
0
// Stems `word` and reports the result at lexeme level: for every lemma
// derived from the stems it emits the lemma followed by one lexeme
// annotation per lexeme tag.  The normalization of `word` is emitted once,
// just before the first lemma found while foundLemma_ is still false.
void Morfologik::stemsOnLexemeLevel_(const std::string & word,
                                     LemmatizerOutputIterator & outputIterator) {

    typedef std::multimap<std::string, std::vector<std::string> > StemMap;
    typedef std::set<std::string> LemmaSet;

    StemMap stems = stem(word);
    LemmaSet lemmas = getLemmasFromStems_(stems);

    DEBUG("found stems of word [" << word << "] on lexeme level: ["
        << boost::algorithm::join(lemmas, ", ") << "]");

    LemmaSet::iterator lemma = lemmas.begin();
    while (lemma != lemmas.end()) {
        if (!foundLemma_) {
            outputIterator.addNormalization(word);
        }
        outputIterator.addLemma(*lemma);
        foundLemma_ = true;

        std::vector<std::string> lexemeTags = getLexemeTagsFromStems_(stems, *lemma);
        for (std::vector<std::string>::size_type t = 0; t < lexemeTags.size(); ++t) {
            AnnotationItem lexItem = createLexemeAnnotation_(*lemma, lexemeTags[t]);
            outputIterator.addLexeme(lexItem);
        }

        ++lemma;
    }
}
Beispiel #13
0
/* Stem a copy of `input` and print the result followed by a newline.
 * The copy is stored in the file-level `value` pointer. */
static void
evaluate(const char *input) {
  int last;

  value = strdup(input);

  /* stem() yields the index of the last character of the stem */
  last = stem(value, 0, strlen(value) - 1);
  value[last + 1] = 0;

  printf("%s\n", value);
}
Beispiel #14
0
/* Applies the stemming algorithm to the word passed as parameter and
   returns the resulting string.  The member `s` is used as the working
   buffer.
*/
std::string Stemmer::stemPalabra(std::string w) {
	s = w;

	/* convert to lower case */
	for (std::string::iterator it = s.begin(); it != s.end(); ++it) {
		*it = ::tolower(*it);
	}

	/* stem() returns the index of the last character to keep */
	const int ultimoIndice = stem( s , 0 , s.size()-1 );
	s.resize(ultimoIndice+1);
	return s;
}
Beispiel #15
0
static DWORD pollDiscDrives(void)
{
    /* Try to use SetThreadErrorMode(), which showed up in Windows 7. */
    HANDLE lib = LoadLibraryA("kernel32.dll");
    fnSTEM stem = NULL;
    char drive[4] = { 'x', ':', '\\', '\0' };
    DWORD oldErrorMode = 0;
    DWORD drives = 0;
    DWORD i;

    if (lib)
        stem = (fnSTEM) GetProcAddress(lib, "SetThreadErrorMode");

    if (stem)
        stem(SEM_FAILCRITICALERRORS, &oldErrorMode);
    else
        oldErrorMode = SetErrorMode(SEM_FAILCRITICALERRORS);

    /* Do detection. This may block if a disc is spinning up. */
    for (i = 'A'; i <= 'Z'; i++)
    {
        DWORD tmp = 0;
        drive[0] = (char) i;
        if (GetDriveTypeA(drive) != DRIVE_CDROM)
            continue;

        /* If this function succeeds, there's media in the drive */
        if (GetVolumeInformationA(drive, NULL, 0, NULL, NULL, &tmp, NULL, 0))
            drives |= (1 << (i - 'A'));
    } /* for */

    if (stem)
        stem(oldErrorMode, NULL);
    else
        SetErrorMode(oldErrorMode);

    if (lib)
        FreeLibrary(lib);

    return drives;
} /* pollDiscDrives */
/**
 * Search for a term.
 *
 * Builds a length-prefixed key from query->termbuf (first byte = length,
 * followed by the term and its terminating zero), stems it when the
 * collection is stemmed, and iterates the collection's tree with
 * ys_find_docs.
 *
 * NOTE(review): nothing here bounds the term length against the key
 * buffer - presumably termbuf is limited by the caller; confirm.
 */
static void
ys_find_terms(ys_collection_t *collection, ys_query_t *query)
{
	ys_uchar_t key[YS_MAXKEYSIZE+1];
	const char *term = query->termbuf;

	/* printf("searching term %s\n", query->termbuf); */
	key[0] = strlen(term);
	memcpy(key+1, term, key[0]+1);

	if (collection->stemmed) {
		stem(key);
	}

	ys_btree_iterate( collection->tree, key, ys_find_docs, collection );
}
Beispiel #17
0
std::string Stemmer::stem(std::string str) {

	int length = str.length();
	char word[length + 1];
	strcpy(word, str.c_str());

	word[stem( word, 0, length ) + 1] = '\0';

	std::string str_stem;
	str_stem.assign(word);

	return str_stem;
}
Beispiel #18
0
void
file_chooser::on_file_type_changed ()
{
  Glib::RefPtr< Gtk::TreeSelection > s (file_type_.get_selection ());
  if (!s) return;

  Gtk::TreeModel::iterator it (s->get_selected ());
  if (!it) return;

  Gtk::TreeModel::Row r (*it);
  extension_list      l (r[column->exts]);

  if (l.empty ())
    {
      expander_.set_label (_("File Type"));
    }
  else
    {
      expander_.set_label ((format (_("File type: %1%"))
                            % r.get_value (column->text)).str ());

      if (!count (l.begin (), l.end (), get_current_extension ()))
        set_current_extension (l.front ());
    }

  if (!single_image_mode_)
    {
      single_file_.set_sensitive (supports_multi_image (get_current_name ()));
      if (!supports_multi_image (get_current_name ()))
        {
          if (!regex_match (get_current_name (), filename_re))
            {
              fs::path path (get_current_name ());
              fs::path stem (path.stem ());
              fs::path ext  (path.extension ());

              path = stem;
              path = path.native () + default_pattern_;
              path.replace_extension (ext);

              set_current_name (path.string ());
            }
        }
      single_file_.set_active (requests_single_file (get_current_name ()));
    }
}
Beispiel #19
0
/* Tokenise and stem a file: every run of letters is lower-cased, stemmed
 * and printed; every other character is echoed unchanged. Words are
 * collected in `value`, which is grown through increaseValue(). */
static void
stemFile(FILE *file) {
  int character;
  int index;

  while ((character = getc(file)) != EOF) {
    if (!IS_LETTER(character)) {
      putchar(character);
      continue;
    }

    /* Collect the whole word in lower case. */
    index = 0;

    do {
      if (index == indexMax) {
        increaseValue();
      }

      value[index++] = tolower(character);
      character = getc(file);
    } while (IS_LETTER(character));

    /* Hand the character that ended the word back to the stream. */
    ungetc(character, file);

    /* Call the stemmer and use its result to
     * zero-terminate the string in `value`. */
    value[stem(value, 0, index - 1) + 1] = 0;

    printf("%s", value);
  }
}
Beispiel #20
0
// Reads one word from `i`: consumes characters until end of input or until
// the next character is "non text", lower-cases the word, then returns it
// after remove_duplicates(), strip_specials() and stem().
//
// The loop tests the value returned by peek() instead of eof().  eof() only
// becomes true after a read has already hit the end, so the end-of-file
// marker itself could be appended to the word as a stray character; and on
// a stream that has failed without reaching eof the old loop never
// terminated.
string 
get_word( istream & i )
{
	string word;
	
	for ( int next = i.peek();
	      next != istream::traits_type::eof() && !is_non_text(next);
	      next = i.peek() )
	{
		word += static_cast<char>( i.get() );
	}
	
	// lowercase
	transform(
		word.begin(), word.end(),
		word.begin(),
		tolower
	);
	
	return stem( strip_specials( remove_duplicates(word) ) );
}
Beispiel #21
0
// Stems `word` and reports the result at form level: for every lemma it
// emits the lemma, then for every stem entry of that lemma one lexeme
// annotation (built from the entry's first tag) followed by one form
// annotation per form-attribute set of each tag.  The normalization of
// `word` is emitted once, before the first lemma found while foundLemma_
// is still false.
//
// Entries with an empty tag list are skipped (the first tag is required to
// build the lexeme), the equal_range() lookup is done once per lemma, and
// tag vectors are accessed by reference rather than copied.
void Morfologik::stemsOnFormLevel_(const std::string & word,
    LemmatizerOutputIterator & outputIterator) {

    typedef std::multimap<std::string, std::vector<std::string> > StemMap;

    StemMap stems = stem(word);

    std::set<std::string> lemmas = getLemmasFromStems_(stems);
    std::set<std::string>::iterator lem;

    DEBUG("found stems of word [" << word << "] on form level: ["
        << boost::algorithm::join(lemmas, ", ") << "]");

    for (lem = lemmas.begin(); lem != lemmas.end(); ++lem) {
        if (!foundLemma_) outputIterator.addNormalization(word);
        outputIterator.addLemma(*lem);
        foundLemma_ = true;

        // All stem entries that belong to this lemma.
        std::pair<StemMap::iterator, StemMap::iterator> range =
            stems.equal_range(*lem);

        for (StemMap::iterator lex = range.first; lex != range.second; ++lex) {
            std::vector<std::string> & tags = lex->second;
            if (tags.empty()) continue;

            AnnotationItem lexItm = createLexemeAnnotation_(*lem, tags.front());
            outputIterator.addLexeme(lexItm);

            DEBUG("tags: [" << boost::algorithm::join(tags, ", ") << "]");

            std::vector<std::string>::iterator tag;
            for (tag = tags.begin(); tag != tags.end(); ++tag) {

                std::vector<std::map<std::string, std::string> > forms =
                    tagsParser_.getFormAttributes(*tag);
                std::vector<std::map<std::string, std::string> >::iterator frm;

                for (frm = forms.begin(); frm != forms.end(); ++frm) {
                    AnnotationItem frmItm = createFormAnnotation_(lexItm, word, *frm);
                    outputIterator.addForm(frmItm);
                }
            }

        }
    }
}
Beispiel #22
0
/* Copy `f` to stdout with every run of letters replaced by its
   lower-cased stem; all other characters pass through unchanged.
   Words are gathered in the buffer s, grown through increase_s(). */
static void stemfile(FILE * f)
{
   int ch;

   while ((ch = getc(f)) != EOF)
   {
      int i;

      if (!LETTER(ch))
      {
         putchar(ch);
         continue;
      }

      i = 0;
      do
      {
         if (i == i_max) increase_s();

         s[i++] = tolower(ch); /* forces lower case */
         ch = getc(f);
      } while (LETTER(ch));
      ungetc(ch,f); /* give back the character that ended the word */

      /* call the stemmer and use its result to zero-terminate the
         string in s */
      s[stem(s,0,i-1)+1] = 0;
      printf("%s",s);
   }
}
Beispiel #23
0
/* Reads words with getword(), and for every alphabetic word that is not a
 * stopword adds its stem (apostrophes removed) to a tree together with the
 * unstemmed form and the current line number. Newline tokens advance the
 * line counter. The tree is printed at the end. */
int main(void)  {

    struct tnode *root            = NULL;
    char   word[MAXWORDSIZE]      = "";
    char   unstemmed[MAXWORDSIZE] = "";
    int    line                   = 1;

    while (getword(word, MAXWORDSIZE) != EOF) {
        if (!isalpha(word[0])) {
            if (word[0] == '\n')
                line++;
            continue;
        }

        lowerstr(word);
        if (!isnotstopword(word))
            continue;

        /* remember the word as it was before stemming */
        strncpy(unstemmed, word, MAXWORDSIZE);
        squeezechar(word, '\'');

        /* stem() returns the index of the last character of the stem */
        word[stem(word, 0, strlen(word) - 1) + 1] = '\0';
        root = treeadd(root, word, unstemmed, line);
    }

    treeprint(root);

    return 0;
}
Beispiel #24
0
//! [3]
void QtLogo::buildGeometry(int divisions, qreal scale)
{
    qreal cw = cross_width * scale;
    qreal bt = bar_thickness * scale;
    qreal ld = logo_depth * scale;
    qreal th = tee_height *scale;

    RectPrism cross(geom, cw, bt, ld);
    RectPrism stem(geom, bt, th, ld);

    QVector3D z(0.0, 0.0, 1.0);
    cross.rotate(45.0, z);
    stem.rotate(45.0, z);

    qreal stem_downshift = (th + bt) / 2.0;
    stem.translate(QVector3D(0.0, -stem_downshift, 0.0));

    RectTorus body(geom, 0.20, 0.30, 0.1, divisions);

    parts << stem.parts << cross.parts << body.parts;

    geom->finalize();
}
Beispiel #25
0
/* Reads one word per line from stdin, lower-cases and stems it, and prints
 * the stem on its own line. Lines for which stem() reports a negative
 * result are skipped.
 *
 * The trailing newline is removed only when it is actually present, so a
 * final line without a terminating newline keeps its last character. */
int
main(int argc, char *argv[])
{
	static char *line;
	static size_t llen;
	ssize_t nrd;

	/* just read the words from stdin */
	while ((nrd = getline(&line, &llen, stdin)) > 0) {
		size_t wlen = (size_t)nrd;
		ssize_t s;

		/* getline() keeps the delimiter, but the last line may lack one */
		if (line[wlen - 1U] == '\n') {
			wlen--;
		}

		/* lower them; tolower() needs an unsigned char value */
		for (char *lp = line; lp < line + wlen; lp++) {
			*lp = (char)tolower((unsigned char)*lp);
		}
		if ((s = stem(line, wlen)) < 0) {
			continue;
		}
		line[s + 1U] = '\0';
		puts(line);
	}
	free(line);
	return 0;
}
/*------------------------------------------------------------------
 * gethostbyaddr()
 *------------------------------------------------------------------*/
RexxRoutine3(int, SockGetHostByAddr, CSTRING, addrArg, RexxObjectPtr, stemSource, OPTIONAL_int, domain)
{
    struct hostent *pHostEnt;
    in_addr addr;

    StemManager stem(context);

    if (!stem.resolveStem(stemSource))
    {
        return 0;
    }

    addr.s_addr = inet_addr(addrArg);

    if (argumentOmitted(3))
    {
        domain = AF_INET;
    }

    /*---------------------------------------------------------------
     * call function
     *---------------------------------------------------------------*/
    pHostEnt = gethostbyaddr((char*)&addr, sizeof(addr), domain);
    // set the errno information
    cleanup(context);

    if (!pHostEnt)
    {
        return 0;
    }
    else
    {
        hostEntToStem(context, pHostEnt, stem);
        return 1;
    }
}
/*------------------------------------------------------------------
 * bind()
 *
 * Builds a socket address from the stem argument and binds `sock`
 * to it.  Returns the bind() result, or 0 if the stem cannot be
 * resolved.
 *------------------------------------------------------------------*/
RexxRoutine2(int, SockBind, int, sock, RexxObjectPtr, stemSource)
{
    StemManager stem(context);
    if (!stem.resolveStem(stemSource))
    {
        return 0;
    }

    // get the address out of the stem
    sockaddr_in  addr;
    stemToSockAddr(context, stem, &addr);

    const int rc = bind(sock, (struct sockaddr *)&addr, sizeof(addr));

    // make sure the errno variables are set
    cleanup(context);

    return rc;
}
/* Copy `f` to stdout with every run of letters replaced by its stem; all
   other characters are echoed unchanged. Upper-case letters are forced to
   lower case before stemming. Words are gathered in the buffer s, which is
   grown through increase_s() when it is full. */
void stemfile(FILE * f)
{  while(TRUE)
   {  int ch = getc(f);
      if (ch == EOF) return;
      if (LETTER(ch))
      {  int i = 0;
         while(TRUE)
         {  if (i == i_max) increase_s();

            if UC(ch) ch = FORCELC(ch);
            /* forces lower case. Remove this line to make the program work
               exactly like the Muscat stemtext command. */

            s[i] = ch; i++;
            ch = getc(f);
            if (!LETTER(ch)) { ungetc(ch,f); break; }
         }
         s[stem(s,0,i-1)+1] = 0;
         /* the previous line calls the stemmer and uses its result to
            zero-terminate the string in s */
         printf("%s",s);
      }
      else putchar(ch);
   }
}
/*
** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
** zOut is at least big enough to hold nIn bytes.  Write the actual
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
**
** Any upper-case characters in the US-ASCII character set ([A-Z])
** are converted to lower case.  Upper-case UTF characters are
** unchanged.
**
** Words that are longer than about 20 bytes are stemmed by retaining
** a few bytes from the beginning and the end of the word.  If the
** word contains digits, 3 bytes are taken from the beginning and
** 3 bytes from the end.  For long words without digits, 10 bytes
** are taken from each end.  US-ASCII case folding still applies.
** 
** If the input word contains no digits but does contain characters
** not in [a-zA-Z] then no stemming is attempted and this routine just
** copies the input into the output with US-ASCII case folding.
**
** Stemming never increases the length of the word.  So there is
** no chance of overflowing the zOut buffer.
**
** Implementation note: the word is copied into a local buffer in
** REVERSE order, so that the end of the word sits at z[0].  The suffix
** patterns passed to stem() below are therefore spelled backwards
** (e.g. "gni" for "ing"); the replacement text appears to be given in
** forward order.  Words shorter than 3 bytes, too long for the local
** buffer, or containing a byte outside [a-zA-Z] are handed to
** copy_stemmer() instead.
*/
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i, j, c;
  char zReverse[28];   /* holds the lower-cased word, stored back to front */
  char *z, *z2;
  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }
  /* Fill zReverse from index sizeof(zReverse)-6 downwards, so the first
  ** input byte ends up at the highest index used and the last input byte
  ** at the lowest. */
  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
    c = zIn[i];
    if( c>='A' && c<='Z' ){
      zReverse[j] = c + 'a' - 'A';
    }else if( c>='a' && c<='z' ){
      zReverse[j] = c;
    }else{
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  /* Zero the five bytes after the reversed word so it is terminated. */
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);
  /* j stopped one slot below the last byte written; z[0] is therefore the
  ** final character of the input word. */
  z = &zReverse[j+1];


  /* Step 1a */
  if( z[0]=='s' ){
    if(
     !stem(&z, "sess", "ss", 0) &&
     !stem(&z, "sei", "i", 0)  &&
     !stem(&z, "ss", "ss", 0)
    ){
      /* None of the rules applied: drop the trailing 's'. */
      z++;
    }
  }

  /* Step 1b */  
  z2 = z;   /* remembered so we can tell below whether a suffix was removed */
  if( stem(&z, "dee", "ee", m_gt_0) ){
    /* Do nothing.  The work was all in the test */
  }else if( 
     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
      && z!=z2
  ){
     if( stem(&z, "ta", "ate", 0) ||
         stem(&z, "lb", "ble", 0) ||
         stem(&z, "zi", "ize", 0) ){
       /* Do nothing.  The work was all in the test */
     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
       z++;
     }else if( m_eq_1(z) && star_oh(z) ){
       /* Put an 'e' back on the end of the word. */
       *(--z) = 'e';
     }
  }

  /* Step 1c */
  if( z[0]=='y' && hasVowel(z+1) ){
    z[0] = 'i';
  }

  /* Step 2 */
  /* Dispatch on the second-to-last character of the word (z[1]). */
  switch( z[1] ){
   case 'a':
     stem(&z, "lanoita", "ate", m_gt_0) ||
     stem(&z, "lanoit", "tion", m_gt_0);
     break;
   case 'c':
     stem(&z, "icne", "ence", m_gt_0) ||
     stem(&z, "icna", "ance", m_gt_0);
     break;
   case 'e':
     stem(&z, "rezi", "ize", m_gt_0);
     break;
   case 'g':
     stem(&z, "igol", "log", m_gt_0);
     break;
   case 'l':
     stem(&z, "ilb", "ble", m_gt_0) ||
     stem(&z, "illa", "al", m_gt_0) ||
     stem(&z, "iltne", "ent", m_gt_0) ||
     stem(&z, "ile", "e", m_gt_0) ||
     stem(&z, "ilsuo", "ous", m_gt_0);
     break;
   case 'o':
     stem(&z, "noitazi", "ize", m_gt_0) ||
     stem(&z, "noita", "ate", m_gt_0) ||
     stem(&z, "rota", "ate", m_gt_0);
     break;
   case 's':
     stem(&z, "msila", "al", m_gt_0) ||
     stem(&z, "ssenevi", "ive", m_gt_0) ||
     stem(&z, "ssenluf", "ful", m_gt_0) ||
     stem(&z, "ssensuo", "ous", m_gt_0);
     break;
   case 't':
     stem(&z, "itila", "al", m_gt_0) ||
     stem(&z, "itivi", "ive", m_gt_0) ||
     stem(&z, "itilib", "ble", m_gt_0);
     break;
  }

  /* Step 3 */
  /* Dispatch on the last character of the word (z[0]). */
  switch( z[0] ){
   case 'e':
     stem(&z, "etaci", "ic", m_gt_0) ||
     stem(&z, "evita", "", m_gt_0)   ||
     stem(&z, "ezila", "al", m_gt_0);
     break;
   case 'i':
     stem(&z, "itici", "ic", m_gt_0);
     break;
   case 'l':
     stem(&z, "laci", "ic", m_gt_0) ||
     stem(&z, "luf", "", m_gt_0);
     break;
   case 's':
     stem(&z, "ssen", "", m_gt_0);
     break;
  }

  /* Step 4 */
  /* Again keyed on the second-to-last character.  Advancing z by N drops
  ** the last N characters of the word. */
  switch( z[1] ){
   case 'a':
     if( z[0]=='l' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'c':
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
       z += 4;
     }
     break;
   case 'e':
     if( z[0]=='r' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'i':
     if( z[0]=='c' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'l':
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'n':
     if( z[0]=='t' ){
       if( z[2]=='a' ){
         if( m_gt_1(z+3) ){
           z += 3;
         }
       }else if( z[2]=='e' ){
         stem(&z, "tneme", "", m_gt_1) ||
         stem(&z, "tnem", "", m_gt_1) ||
         stem(&z, "tne", "", m_gt_1);
       }
     }
     break;
   case 'o':
     if( z[0]=='u' ){
       if( m_gt_1(z+2) ){
         z += 2;
       }
     }else if( z[3]=='s' || z[3]=='t' ){
       stem(&z, "noi", "", m_gt_1);
     }
     break;
   case 's':
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 't':
     stem(&z, "eta", "", m_gt_1) ||
     stem(&z, "iti", "", m_gt_1);
     break;
   case 'u':
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 'v':
   case 'z':
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
  }

  /* Step 5a */
  if( z[0]=='e' ){
    if( m_gt_1(z+1) ){
      z++;
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
      z++;
    }
  }

  /* Step 5b */
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = strlen(z);
  zOut[i] = 0;
  while( *z ){
    zOut[--i] = *(z++);
  }
}
Beispiel #30
0
// Splits `s` into tokens and returns one <hash, <start, len>> entry per token.
//
// A token is a maximal run of letters or a maximal run of digits; letters
// are upper-cased and every digit is recorded as '0'.  For each token the
// hash is chosen as follows:
//   - phash(token), replaced by the mapped value if `fixedstem` has an
//     entry for it (such tokens are not stemmed);
//   - phname, if the resulting hash is in `names`;
//   - otherwise, for tokens without a fixedstem entry, stem(token),
//     replaced by the mapped value if `replaced` has an entry for it.
vector<pair<long long, pair<int, int> > > splitWords(const wstring &s, vector<pair<long long, long long> > &fixedstem, vector<pair<long long, long long> > &replaced, set<long long> &names)
{
    vector<pair<long long, pair<int, int> > > tokens;
    wstring word;
    int prevKind = 0;  // 1 - letter, 2 - digit

    // A trailing space guarantees that the last token gets flushed.
    const wstring padded = s + L' ';

    for(int pos=0; pos<(int)padded.size(); pos++)
    {
        const wchar_t ch = towupper(padded[pos]);

        // Still inside a run of the same kind: just extend the token.
        if(isLetter(ch) && prevKind != 2)
        {
            word.push_back(ch);
            prevKind = 1;
            continue;
        }
        if(isDigit(ch) && prevKind != 1)
        {
            word.push_back('0');
            prevKind = 2;
            continue;
        }

        // Boundary reached: emit the token collected so far, if any.
        if(word.length())
        {
            const int len = (int)word.length();
            const int start = pos - len;

            bool needsStem = true;
            long long hash = phash(word);
            for(auto &fixed : fixedstem)
            {
                if(fixed.first == hash)
                {
                    hash = fixed.second;
                    needsStem = false;
                    break;
                }
            }

            if(names.count(hash))
            {
                tokens.push_back({phname, {start, len}});
            }
            else
            {
                long long stemmed = 0;
                if(needsStem)
                {
                    stemmed = stem(word);
                    for(auto &repl : replaced)
                    {
                        if(stemmed == repl.first)
                        {
                            stemmed = repl.second;
                            break;
                        }
                    }
                }
                tokens.push_back({needsStem ? stemmed : hash, {start, len}});
            }
        }

        // Start over; the boundary character may itself open a new token.
        word.clear();
        prevKind = 0;
        if(isLetter(ch))
        {
            word.push_back(ch);
            prevKind = 1;
        }
        else if(isDigit(ch))
        {
            word.push_back('0');
            prevKind = 2;
        }
    }
    return tokens;
}