/**
 * Build candidate stems for a word by swapping known suffixes.
 *
 * Every table entry whose suffix matches the end of @p words
 * (case-insensitively) contributes one candidate: the input with that suffix
 * chopped off and the entry's replacement appended.  Candidates are returned
 * in table order; the list is empty when no suffix matches.
 */
QStringList stemWords(const QString &words)
{
    struct SuffixRule {
        const char *suffix;
        const char *stem;
    };
    SuffixRule rules[] = {
        {"ies", "y"},
        {"ied", "y"},
        {"es", ""},
        {"ting", "te"},
        {"ing", ""},
        {"ing", "e"},
        {"ed", "e"},
        {"ed", ""},
        {"id", "y"},
        {"ices", "ex"},
        {"ves", "fe"},
        {"s", ""},
    };
    const size_t ruleCount = sizeof(rules) / sizeof(rules[0]);

    QStringList candidates;
    for (size_t idx = 0; idx < ruleCount; ++idx) {
        QString ending(rules[idx].suffix);
        if (!words.endsWith(ending, Qt::CaseInsensitive))
            continue;

        // Replace the matched ending with the rule's stem.
        QString replacement(rules[idx].stem);
        QString candidate(words);
        candidate.chop(ending.length());
        candidate.append(replacement);
        candidates << candidate;
    }
    return candidates;
}
/* Split `s` on single spaces and normalise every token in place:
 * punctuation (`puncs`) is trimmed, the token is lower-cased, empty tokens and
 * stopwords are cleared to zero-length sds, and the remaining tokens are
 * stemmed.
 *
 * `*len` receives the total number of tokens and `*nonstopwords` the number
 * of tokens that were kept.  Returns NULL when the split fails. */
static sds *sds_tokenize(sds s, int *len, int *nonstopwords) {
    sds *tokens;
    struct stemmer *stemmer;
    int idx;

    *nonstopwords = 0;
    tokens = sdssplitlen(s, sdslen(s), " ", 1, len);
    if (tokens == NULL) return NULL;

    stemmer = create_stemmer();
    for (idx = 0; idx < *len; idx++) {
        sds token = sdstrim(tokens[idx], puncs);
        int token_len = sdslen(token);
        int last;

        sdstolower(token);
        if (token_len == 0 || rr_stopwords_check(token)) {
            sdsclear(token);
            continue;
        }
        *nonstopwords += 1;

        /* stem() takes and returns zero-based indexes of the last char */
        last = stem(stemmer, token, token_len - 1);
        if (last < token_len - 1) {
            /* the stem is shorter: replace the token with a trimmed copy */
            sds shortened = sdsnewlen(token, last + 1);
            sdsfree(token);
            tokens[idx] = shortened;
        }
    }
    free_stemmer(stemmer);
    return tokens;
}
/* Python entry point: stem a single string argument with the Porter stemmer
 * and return the stem as a new Python string.  Returns NULL with an exception
 * set when argument parsing or an allocation fails. */
static PyObject* jellyfish_porter_stem(PyObject *self, PyObject *args)
{
    const char *input;
    char *buffer;
    struct stemmer *stemmer;
    PyObject *stemmed;
    int last;

    if (!PyArg_ParseTuple(args, "s", &input)) {
        return NULL;
    }

    stemmer = create_stemmer();
    if (stemmer == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    /* stem() works in place, so operate on a private copy of the input */
    buffer = strdup(input);
    if (buffer == NULL) {
        free_stemmer(stemmer);
        PyErr_NoMemory();
        return NULL;
    }

    /* stem() returns the index of the last character of the stem */
    last = stem(stemmer, buffer, strlen(buffer) - 1);
    buffer[last + 1] = '\0';
    stemmed = Py_BuildValue("s", buffer);

    free(buffer);
    free_stemmer(stemmer);
    return stemmed;
}
/*------------------------------------------------------------------ * gethostbyname() *------------------------------------------------------------------*/ RexxRoutine2(int, SockGetHostByName, CSTRING, name, RexxObjectPtr, stemSource) { StemManager stem(context); if (!stem.resolveStem(stemSource)) { return 0; } struct hostent *pHostEnt; /*--------------------------------------------------------------- * call function *---------------------------------------------------------------*/ pHostEnt = gethostbyname(name); // set the errno information cleanup(context); if (!pHostEnt) { return 0; } else { hostEntToStem(context, pHostEnt, stem); return 1; } }
/* Copy the stream f to stdout, replacing every run of LETTER characters with
   its lower-cased stem.  All other characters are echoed unchanged.  The word
   being read is collected in the buffer s, which grows by INC whenever it is
   full. */
void stemfile(struct stemmer * z, FILE * f)
{
   int ch;

   while ((ch = getc(f)) != EOF)
   {
      int len = 0;

      if (!LETTER(ch))
      {
         putchar(ch);
         continue;
      }

      /* collect one word; ch always holds the next unread character */
      do
      {
         if (len == i_max)
         {
            i_max += INC;
            s = realloc(s, i_max + 1);
         }
         ch = tolower(ch); /* forces lower case */
         s[len] = ch;
         len++;
         ch = getc(f);
      } while (LETTER(ch));
      ungetc(ch, f);

      /* stem() returns the index of the last character of the stem; use it
         to zero-terminate the string in s */
      s[stem(z, s, len - 1) + 1] = 0;
      printf("%s",s);
   }
}
void file_chooser::on_single_file_toggled () { std::string name (get_current_name ()); smatch m; if (regex_match (name, m, filename_re)) { if (!single_file_.get_active ()) return; set_current_name (m.str (1) + m.str (5)); } else { if (single_file_.get_active ()) return; fs::path path (get_current_name ()); fs::path stem (path.stem ()); fs::path ext (path.extension ()); path = stem; path = path.native () + default_pattern_; path.replace_extension (ext); set_current_name (path.string ()); } }
/*------------------------------------------------------------------------------ * accept() * * @remarks The sockAddrToStem() function calls both htons() and inet_ntoa(). * On Windows, one or both, of those functions sets errno back to 0. * This prevents the Rexx programmer from ever seeing the errno if * accept fails. Because of this, we call cleanup() immediately after * the accept call in the belief that the Rexx programmer is more * interested in the result of accept(). * ----------------------------------------------------------------------------*/ RexxRoutine2(int, SockAccept, int, sock, OPTIONAL_RexxObjectPtr, stemSource) { sockaddr_in addr; socklen_t nameLen; nameLen = sizeof(addr); int rc = accept(sock, (struct sockaddr *)&addr, &nameLen); // set the errno variables cleanup(context); /*--------------------------------------------------------------- * set addr, if asked for *---------------------------------------------------------------*/ if (stemSource != NULLOBJECT) { StemManager stem(context); if (!stem.resolveStem(stemSource)) { return 0; } sockAddrToStem(context, &addr, stem); } return rc; }
/*------------------------------------------------------------------ * connect() *------------------------------------------------------------------*/ RexxRoutine2(int, SockConnect, int, sock, RexxObjectPtr, stemSource) { StemManager stem(context); if (!stem.resolveStem(stemSource)) { return 0; } sockaddr_in addr; /*--------------------------------------------------------------- * get addr *---------------------------------------------------------------*/ stemToSockAddr(context, stem, &addr); /*--------------------------------------------------------------- * call function *---------------------------------------------------------------*/ int rc = connect(sock,(struct sockaddr *)&addr, sizeof(addr)); // set the errno information cleanup(context); return rc; }
/* OCaml stub: stem the string v_str with the stemmer state held in v_stem and
 * return the stem as a fresh OCaml string.
 *
 * The input is copied (up to its first NUL byte) into the stemmer's reusable
 * buffer, which is grown when it is too small.  An empty input yields an
 * empty string. */
CAMLprim value caml_stemmer_porter2_stem(value v_stem, value v_str)
{
  CAMLparam2(v_stem,v_str);
  CAMLlocal1(v_res);
  stemmer_t* val = (stemmer_t*)v_stem;
  size_t i, len = caml_string_length(v_str);

  if(len >= val->len){
    val->len = len + 1; // to put trailing zero securely
    /* NOTE(review): the realloc() result is not checked; a failed allocation
     * would leave val->buf NULL.  Left as is because the error-reporting
     * convention of this module is not visible here. */
    val->buf = realloc(val->buf, val->len);
  }

  len = 0;
  char *word = val->buf, *d = val->buf, *s = String_val(v_str);
  while(*s && (len < val->len)){
    *d = *s;
    d++;
    s++;
    len++;
  }

  if(0 == len)
    CAMLreturn(caml_copy_string(""));

  // short words are not lowercased by stem()
  if (len < 3){
    /* Only touch the bytes that were actually copied (a one-letter word has
     * no word[1]), and go through unsigned char because tolower() is
     * undefined for negative values. */
    for(i = 0; i < len; i++){
      word[i] = tolower((unsigned char)word[i]);
    }
  }

  /* stem() returns the index of the last character of the stem */
  i = stem(val->st, word, len-1);
  word[i+1] = '\0';

  v_res = caml_copy_string(word);
  CAMLreturn(v_res);
}
/** * Reload all avatars. */ void AvatarGallery::Reload() { avatars.clear(); if (!fs::exists(path)) { HR_LOG(info) << "Avatar directory does not exist: " << (const char*)Str::PU(path); return; } if (!fs::is_directory(path)) { HR_LOG(warning) << "Avatar directory is not a directory: " << (const char*)Str::PU(path); return; } OS::dirIter_t dend; for (OS::dirIter_t iter{ path }; iter != dend; ++iter) { auto filename = iter->path().filename(); auto avatarName = filename.stem().string(); if (filename.extension() != ".png") { HR_LOG(debug) << "Ignoring non-avatar: " << filename; continue; } HR_LOG(debug) << "Found avatar: " << avatarName << ": " << filename; avatars.emplace( avatarName, std::make_shared<Display::MediaRes<Display::Texture>>( Str::UP("avatars") / filename)); } }
/*------------------------------------------------------------------ * getsockname() *------------------------------------------------------------------*/ RexxRoutine2(int, SockGetSockName, int, sock, RexxObjectPtr, stemSource) { StemManager stem(context); if (!stem.resolveStem(stemSource)) { return 0; } sockaddr_in addr; socklen_t nameLen; /*--------------------------------------------------------------- * call function *---------------------------------------------------------------*/ nameLen = sizeof(addr); int rc = getsockname(sock,(struct sockaddr *)&addr,&nameLen); // set the errno information cleanup(context); /*--------------------------------------------------------------- * write address to stem *---------------------------------------------------------------*/ sockAddrToStem(context, &addr, stem); /*--------------------------------------------------------------- * set return code *---------------------------------------------------------------*/ return rc; }
void Morfologik::stemsOnLexemeLevel_(const std::string & word, LemmatizerOutputIterator & outputIterator) { std::multimap<std::string, std::vector<std::string> > stems = stem(word); std::set<std::string> lemmas = getLemmasFromStems_(stems); std::set<std::string>::iterator lem; DEBUG("found stems of word [" << word << "] on lexeme level: [" << boost::algorithm::join(lemmas, ", ") << "]"); for (lem = lemmas.begin(); lem != lemmas.end(); ++lem) { if (!foundLemma_) outputIterator.addNormalization(word); outputIterator.addLemma(*lem); foundLemma_ = true; std::vector<std::string> lexemeTags = getLexemeTagsFromStems_(stems, *lem); std::vector<std::string>::iterator lxt; for (lxt = lexemeTags.begin(); lxt != lexemeTags.end(); ++lxt) { AnnotationItem lexItem = createLexemeAnnotation_(*lem, *lxt); outputIterator.addLexeme(lexItem); } } }
/* Stem a single word given as an argument and print the stem on its own
 * line.  The word is duplicated into `value` because the stemmer works in
 * place. */
static void evaluate(const char *input) {
  int last;

  value = strdup(input);

  /* stem() returns the index of the last character of the stem. */
  last = stem(value, 0, strlen(value) - 1);
  value[last + 1] = 0;

  printf("%s\n", value);
}
/* Applies the stemming algorithm to the word passed as parameter.
   Returns the resulting string. */
std::string Stemmer::stemPalabra(std::string w)
{
	s = w;

	/* convert the working copy to lower case */
	for (std::string::iterator it = s.begin(); it != s.end(); ++it)
	{
		*it = ::tolower(*it);
	}

	/* stem() yields the index of the last character to keep */
	int lastIndex = stem( s , 0 , s.size()-1 );
	s.resize(lastIndex+1);

	return s;
}
static DWORD pollDiscDrives(void) { /* Try to use SetThreadErrorMode(), which showed up in Windows 7. */ HANDLE lib = LoadLibraryA("kernel32.dll"); fnSTEM stem = NULL; char drive[4] = { 'x', ':', '\\', '\0' }; DWORD oldErrorMode = 0; DWORD drives = 0; DWORD i; if (lib) stem = (fnSTEM) GetProcAddress(lib, "SetThreadErrorMode"); if (stem) stem(SEM_FAILCRITICALERRORS, &oldErrorMode); else oldErrorMode = SetErrorMode(SEM_FAILCRITICALERRORS); /* Do detection. This may block if a disc is spinning up. */ for (i = 'A'; i <= 'Z'; i++) { DWORD tmp = 0; drive[0] = (char) i; if (GetDriveTypeA(drive) != DRIVE_CDROM) continue; /* If this function succeeds, there's media in the drive */ if (GetVolumeInformationA(drive, NULL, 0, NULL, NULL, &tmp, NULL, 0)) drives |= (1 << (i - 'A')); } /* for */ if (stem) stem(oldErrorMode, NULL); else SetErrorMode(oldErrorMode); if (lib) FreeLibrary(lib); return drives; } /* pollDiscDrives */
/**
 * Search for a term.
 *
 * Builds a length-prefixed, NUL-terminated key from query->termbuf (stemming
 * it when the collection is stemmed) and iterates the collection's tree from
 * that key, calling ys_find_docs for the matches.
 *
 * The key buffer holds one length byte, the term and a terminating NUL, so a
 * term longer than YS_MAXKEYSIZE-1 bytes (or longer than the length byte can
 * express) is truncated instead of overflowing the buffer.
 */
static void
ys_find_terms(ys_collection_t *collection, ys_query_t *query)
{
	ys_uchar_t key[YS_MAXKEYSIZE+1];
	const char *cp = query->termbuf;
	size_t len = strlen(cp);

	/* Room needed: 1 (length byte) + len + 1 (NUL) <= YS_MAXKEYSIZE+1. */
	if (len > (size_t)(YS_MAXKEYSIZE - 1))
		len = (size_t)(YS_MAXKEYSIZE - 1);
	/* The length must also be representable in the prefix byte. */
	if (len > (size_t)((ys_uchar_t)~0))
		len = (size_t)((ys_uchar_t)~0);

	/* printf("searching term %s\n", query->termbuf); */
	key[0] = (ys_uchar_t)len;
	memcpy(key+1, cp, len);
	key[len+1] = 0;

	if (collection->stemmed)
		stem(key);
	ys_btree_iterate( collection->tree, key, ys_find_docs, collection );
}
std::string Stemmer::stem(std::string str) { int length = str.length(); char word[length + 1]; strcpy(word, str.c_str()); word[stem( word, 0, length ) + 1] = '\0'; std::string str_stem; str_stem.assign(word); return str_stem; }
void file_chooser::on_file_type_changed () { Glib::RefPtr< Gtk::TreeSelection > s (file_type_.get_selection ()); if (!s) return; Gtk::TreeModel::iterator it (s->get_selected ()); if (!it) return; Gtk::TreeModel::Row r (*it); extension_list l (r[column->exts]); if (l.empty ()) { expander_.set_label (_("File Type")); } else { expander_.set_label ((format (_("File type: %1%")) % r.get_value (column->text)).str ()); if (!count (l.begin (), l.end (), get_current_extension ())) set_current_extension (l.front ()); } if (!single_image_mode_) { single_file_.set_sensitive (supports_multi_image (get_current_name ())); if (!supports_multi_image (get_current_name ())) { if (!regex_match (get_current_name (), filename_re)) { fs::path path (get_current_name ()); fs::path stem (path.stem ()); fs::path ext (path.extension ()); path = stem; path = path.native () + default_pattern_; path.replace_extension (ext); set_current_name (path.string ()); } } single_file_.set_active (requests_single_file (get_current_name ())); } }
/* Tokenise and stem a file */ static void stemFile(FILE *file) { int character; int index; while (TRUE) { character = getc(file); if (character == EOF) { return; } if (IS_LETTER(character)) { index = 0; while (TRUE) { if (index == indexMax) { increaseValue(); } character = tolower(character); value[index] = character; index++; character = getc(file); if (!IS_LETTER(character)) { ungetc(character, file); break; } } value[stem(value, 0, index - 1) + 1] = 0; /* The previous line calls the stemmer and * uses its result to zero-terminate the * string in `value`. */ printf("%s", value); } else { putchar(character); } } }
// Read one word from the stream: consume characters up to (but not
// including) the next non-text character, lower-case them, and return the
// result of removing duplicates, stripping specials and stemming.
//
// Reading also stops when peek() reports end-of-file or a stream error.
// eof() alone only becomes true after a read has already hit the end, so
// without this check the end-of-file marker itself could be appended to the
// word, and a failed stream that is not at EOF would never leave the loop.
string get_word( istream & i )
{
    string word;

    while ( !i.eof() ) {
        const istream::int_type next = i.peek();
        if ( next == istream::traits_type::eof() || is_non_text(next) ) {
            break;
        }
        word += static_cast<char>( i.get() );
    }

    // lowercase
    transform( word.begin(), word.end(), word.begin(), tolower );

    return stem( strip_specials( remove_duplicates(word) ) );
}
void Morfologik::stemsOnFormLevel_(const std::string & word, LemmatizerOutputIterator & outputIterator) { std::multimap<std::string, std::vector<std::string> > stems = stem(word); std::set<std::string> lemmas = getLemmasFromStems_(stems); std::set<std::string>::iterator lem; DEBUG("found stems of word [" << word << "] on form level: [" << boost::algorithm::join(lemmas, ", ") << "]"); for (lem = lemmas.begin(); lem != lemmas.end(); ++lem) { if (!foundLemma_) outputIterator.addNormalization(word); outputIterator.addLemma(*lem); foundLemma_ = true; std::multimap<std::string, std::vector<std::string> >::iterator lex; for (lex = stems.equal_range(*lem).first; lex != stems.equal_range(*lem).second; ++lex) { std::vector<std::string> tags = lex->second; std::vector<std::string>::iterator tag = tags.begin(); AnnotationItem lexItm = createLexemeAnnotation_(*lem, *tag); outputIterator.addLexeme(lexItm); DEBUG("tags: [" << boost::algorithm::join(tags, ", ") << "]"); for (tag = tags.begin(); tag != tags.end(); ++tag) { std::vector<std::map<std::string, std::string> > forms = tagsParser_.getFormAttributes(*tag); std::vector<std::map<std::string, std::string> >::iterator frm; for (frm = forms.begin(); frm != forms.end(); ++frm) { AnnotationItem frmItm = createFormAnnotation_(lexItm, word, *frm); outputIterator.addForm(frmItm); } } } } }
/* Copy the stream f to stdout, replacing each run of LETTER characters by its
   lower-cased stem; all other characters pass through unchanged.  The word
   being read is held in the buffer s, which is enlarged with increase_s()
   whenever it is full. */
static void stemfile(FILE * f)
{
   int ch;

   while ((ch = getc(f)) != EOF)
   {
      int len = 0;

      if (!LETTER(ch))
      {
         putchar(ch);
         continue;
      }

      /* read one word; ch always holds the next unread character */
      do
      {
         if (len == i_max) increase_s();

         ch = tolower(ch); /* forces lower case */

         s[len] = ch;
         len++;
         ch = getc(f);
      } while (LETTER(ch));
      ungetc(ch,f);

      /* call the stemmer and use its result to zero-terminate the string
         in s */
      s[stem(s,0,len-1)+1] = 0;
      printf("%s",s);
   }
}
/* Read words from the input and build a tree of the stemmed, lower-cased
 * words that are not stopwords, each recorded together with its unstemmed
 * form and the line it occurred on; then print the tree. */
int main(void)
{
	struct tnode *root = NULL;
	char word[MAXWORDSIZE] = "";
	char unstemmed[MAXWORDSIZE] = "";
	int line = 1;

	while (getword(word, MAXWORDSIZE) != EOF) {
		if (!isalpha(word[0])) {
			/* newlines only advance the line counter */
			if (word[0] == '\n')
				line++;
			continue;
		}

		lowerstr(word);
		if (!isnotstopword(word))
			continue;

		/* keep the original spelling before the word is altered */
		strncpy(unstemmed, word, MAXWORDSIZE);
		squeezechar(word, '\'');

		/* stem() returns the index of the last character of the stem */
		word[stem(word, 0, strlen(word) - 1) + 1] = '\0';

		root = treeadd(root, word, unstemmed, line);
	}

	treeprint(root);
	return 0;
}
//! [3] void QtLogo::buildGeometry(int divisions, qreal scale) { qreal cw = cross_width * scale; qreal bt = bar_thickness * scale; qreal ld = logo_depth * scale; qreal th = tee_height *scale; RectPrism cross(geom, cw, bt, ld); RectPrism stem(geom, bt, th, ld); QVector3D z(0.0, 0.0, 1.0); cross.rotate(45.0, z); stem.rotate(45.0, z); qreal stem_downshift = (th + bt) / 2.0; stem.translate(QVector3D(0.0, -stem_downshift, 0.0)); RectTorus body(geom, 0.20, 0.30, 0.1, divisions); parts << stem.parts << cross.parts << body.parts; geom->finalize(); }
/* Read one word per line from stdin, lower-case it, stem it and print the
 * stem on its own line.  Lines for which stem() reports a negative result
 * are skipped.
 *
 * The trailing newline is stripped only when it is actually present, so the
 * last character of a final line without a newline is no longer dropped. */
int
main(int argc, char *argv[])
{
	static char *line;
	static size_t llen;
	ssize_t nrd;

	/* just read the words from stdin */
	while ((nrd = getline(&line, &llen, stdin)) > 0) {
		ssize_t len = nrd;
		ssize_t s;

		/* getline() keeps the delimiter, but the last line may lack one */
		if (line[len - 1] == '\n') {
			len--;
		}

		/* lower them; tolower() needs an unsigned char value */
		for (char *lp = line; lp < line + len; lp++) {
			*lp = (char)tolower((unsigned char)*lp);
		}

		if ((s = stem(line, len)) < 0) {
			continue;
		}
		/* stem() returns the index of the last character of the stem */
		line[s + 1U] = '\0';
		puts(line);
	}

	free(line);
	return 0;
}
/*------------------------------------------------------------------ * gethostbyaddr() *------------------------------------------------------------------*/ RexxRoutine3(int, SockGetHostByAddr, CSTRING, addrArg, RexxObjectPtr, stemSource, OPTIONAL_int, domain) { struct hostent *pHostEnt; in_addr addr; StemManager stem(context); if (!stem.resolveStem(stemSource)) { return 0; } addr.s_addr = inet_addr(addrArg); if (argumentOmitted(3)) { domain = AF_INET; } /*--------------------------------------------------------------- * call function *---------------------------------------------------------------*/ pHostEnt = gethostbyaddr((char*)&addr, sizeof(addr), domain); // set the errno information cleanup(context); if (!pHostEnt) { return 0; } else { hostEntToStem(context, pHostEnt, stem); return 1; } }
/*------------------------------------------------------------------ * bind() *------------------------------------------------------------------*/ RexxRoutine2(int, SockBind, int, sock, RexxObjectPtr, stemSource) { StemManager stem(context); if (!stem.resolveStem(stemSource)) { return 0; } sockaddr_in addr; /*--------------------------------------------------------------- * get addr *---------------------------------------------------------------*/ stemToSockAddr(context, stem, &addr); /*--------------------------------------------------------------- * call function *---------------------------------------------------------------*/ int rc = bind(sock, (struct sockaddr *)&addr, sizeof(addr)); // make sure the errno variables are set cleanup(context); return rc; }
/* Copy the stream f to stdout, replacing each run of LETTER characters by its
   stem.  Characters for which LETTER is false are echoed unchanged.  The
   word being read is collected in the buffer s, which is enlarged with
   increase_s() whenever i reaches i_max.

   NOTE(review): the closing brace of this function is not visible in this
   excerpt; only the body of the outer loop is shown. */
void stemfile(FILE * f)
{  while(TRUE)
   {  int ch = getc(f);
      if (ch == EOF) return;
      if (LETTER(ch))
      {  int i = 0;
         /* collect one word; ch always holds the next unread character */
         while(TRUE)
         {  if (i == i_max) increase_s();

            if UC(ch) ch = FORCELC(ch);
            /* forces lower case. Remove this line to make the program work
               exactly like the Muscat stemtext command. */

            s[i] = ch; i++;
            ch = getc(f);
            /* push back the character that ended the word so the outer loop
               sees it again */
            if (!LETTER(ch)) { ungetc(ch,f); break; }
         }
         s[stem(s,0,i-1)+1] = 0;
         /* the previous line calls the stemmer and uses its result to
            zero-terminate the string in s */
         printf("%s",s);
      }
      else putchar(ch);
   }
/*
** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
** zOut is at least big enough to hold nIn bytes.  Write the actual
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
**
** Any upper-case characters in the US-ASCII character set ([A-Z])
** are converted to lower case.  Upper-case UTF characters are
** unchanged.
**
** Words that are longer than about 20 bytes are stemmed by retaining
** a few bytes from the beginning and the end of the word.  If the
** word contains digits, 3 bytes are taken from the beginning and
** 3 bytes from the end.  For long words without digits, 10 bytes
** are taken from each end.  US-ASCII case folding still applies.
** (That handling is delegated to copy_stemmer(), which is not part of
** this routine.)
**
** If the input word contains no digits but does contain characters not
** in [a-zA-Z] then no stemming is attempted and this routine just
** copies the input into the output with US-ASCII case folding.
**
** Stemming never increases the length of the word.  So there is
** no chance of overflowing the zOut buffer.
**
** Implementation note: the word is copied into zReverse[] in reverse
** order, so z[0] is the last letter of the word and every suffix
** pattern handed to stem() is spelled backwards (for example "gni"
** is "ing").
*/
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
  int i, j, c;
  char zReverse[28];
  char *z, *z2;
  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
    /* The word is too big or too small for the porter stemmer.
    ** Fallback to the copy stemmer */
    copy_stemmer(zIn, nIn, zOut, pnOut);
    return;
  }
  /* Copy the word backwards into zReverse[], folding [A-Z] to lower case.
  ** j counts down from near the end of the buffer as i counts up. */
  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
    c = zIn[i];
    if( c>='A' && c<='Z' ){
      zReverse[j] = c + 'a' - 'A';
    }else if( c>='a' && c<='z' ){
      zReverse[j] = c;
    }else{
      /* The use of a character not in [a-zA-Z] means that we fallback
      ** to the copy stemmer */
      copy_stemmer(zIn, nIn, zOut, pnOut);
      return;
    }
  }
  /* Zero the last five bytes of the buffer, which follow the reversed
  ** word, and point z at the first byte of the reversed word (the last
  ** letter of the input). */
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);
  z = &zReverse[j+1];


  /* Step 1a */
  if( z[0]=='s' ){
    if(
     !stem(&z, "sess", "ss", 0) &&
     !stem(&z, "sei", "i", 0)  &&
     !stem(&z, "ss", "ss", 0)
    ){
      /* A plain trailing 's' is dropped by stepping past it */
      z++;
    }
  }

  /* Step 1b */
  z2 = z;
  if( stem(&z, "dee", "ee", m_gt_0) ){
    /* Do nothing.  The work was all in the test */
  }else if(
     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
      && z!=z2
  ){
     if( stem(&z, "ta", "ate", 0) ||
         stem(&z, "lb", "ble", 0) ||
         stem(&z, "zi", "ize", 0) ){
       /* Do nothing.  The work was all in the test */
     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
       z++;
     }else if( m_eq_1(z) && star_oh(z) ){
       /* Put an 'e' in front of z[0], i.e. at the end of the word */
       *(--z) = 'e';
     }
  }

  /* Step 1c */
  if( z[0]=='y' && hasVowel(z+1) ){
    z[0] = 'i';
  }

  /* Step 2 -- dispatch on the second-to-last letter of the word */
  switch( z[1] ){
   case 'a':
     stem(&z, "lanoita", "ate", m_gt_0) ||
     stem(&z, "lanoit", "tion", m_gt_0);
     break;
   case 'c':
     stem(&z, "icne", "ence", m_gt_0) ||
     stem(&z, "icna", "ance", m_gt_0);
     break;
   case 'e':
     stem(&z, "rezi", "ize", m_gt_0);
     break;
   case 'g':
     stem(&z, "igol", "log", m_gt_0);
     break;
   case 'l':
     stem(&z, "ilb", "ble", m_gt_0) ||
     stem(&z, "illa", "al", m_gt_0) ||
     stem(&z, "iltne", "ent", m_gt_0) ||
     stem(&z, "ile", "e", m_gt_0) ||
     stem(&z, "ilsuo", "ous", m_gt_0);
     break;
   case 'o':
     stem(&z, "noitazi", "ize", m_gt_0) ||
     stem(&z, "noita", "ate", m_gt_0) ||
     stem(&z, "rota", "ate", m_gt_0);
     break;
   case 's':
     stem(&z, "msila", "al", m_gt_0) ||
     stem(&z, "ssenevi", "ive", m_gt_0) ||
     stem(&z, "ssenluf", "ful", m_gt_0) ||
     stem(&z, "ssensuo", "ous", m_gt_0);
     break;
   case 't':
     stem(&z, "itila", "al", m_gt_0) ||
     stem(&z, "itivi", "ive", m_gt_0) ||
     stem(&z, "itilib", "ble", m_gt_0);
     break;
  }

  /* Step 3 -- dispatch on the last letter of the word */
  switch( z[0] ){
   case 'e':
     stem(&z, "etaci", "ic", m_gt_0) ||
     stem(&z, "evita", "", m_gt_0)   ||
     stem(&z, "ezila", "al", m_gt_0);
     break;
   case 'i':
     stem(&z, "itici", "ic", m_gt_0);
     break;
   case 'l':
     stem(&z, "laci", "ic", m_gt_0) ||
     stem(&z, "luf", "", m_gt_0);
     break;
   case 's':
     stem(&z, "ssen", "", m_gt_0);
     break;
  }

  /* Step 4 -- dispatch on the second-to-last letter of the word; a matched
  ** suffix is removed by advancing z past it */
  switch( z[1] ){
   case 'a':
     if( z[0]=='l' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'c':
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
       z += 4;
     }
     break;
   case 'e':
     if( z[0]=='r' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'i':
     if( z[0]=='c' && m_gt_1(z+2) ){
       z += 2;
     }
     break;
   case 'l':
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
       z += 4;
     }
     break;
   case 'n':
     if( z[0]=='t' ){
       if( z[2]=='a' ){
         if( m_gt_1(z+3) ){
           z += 3;
         }
       }else if( z[2]=='e' ){
         stem(&z, "tneme", "", m_gt_1) ||
         stem(&z, "tnem", "", m_gt_1) ||
         stem(&z, "tne", "", m_gt_1);
       }
     }
     break;
   case 'o':
     if( z[0]=='u' ){
       if( m_gt_1(z+2) ){
         z += 2;
       }
     }else if( z[3]=='s' || z[3]=='t' ){
       stem(&z, "noi", "", m_gt_1);
     }
     break;
   case 's':
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 't':
     stem(&z, "eta", "", m_gt_1) ||
     stem(&z, "iti", "", m_gt_1);
     break;
   case 'u':
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
   case 'v':
   case 'z':
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
       z += 3;
     }
     break;
  }

  /* Step 5a */
  if( z[0]=='e' ){
    if( m_gt_1(z+1) ){
      z++;
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
      z++;
    }
  }

  /* Step 5b */
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
    z++;
  }

  /* z[] is now the stemmed word in reverse order.  Flip it back
  ** around into forward order and return.
  */
  *pnOut = i = strlen(z);
  zOut[i] = 0;
  while( *z ){
    zOut[--i] = *(z++);
  }
}
// <hash, <start, len> > vector<pair<long long, pair<int, int> > > splitWords(const wstring &s, vector<pair<long long, long long> > &fixedstem, vector<pair<long long, long long> > &replaced, set<long long> &names) { vector<pair<long long, pair<int, int> > > ans; wstring word; int prevKind = 0; // 1 - letter, 2 - digit wstring S = s + L' '; for(int j=0; j<(int)S.size(); j++) { wchar_t i = towupper(S[j]); if(isLetter(i) && prevKind != 2) { word.push_back(i); prevKind = 1; } else if(isDigit(i) && prevKind != 1) { word.push_back('0'); prevKind = 2; } else { if(word.length()) { bool st = 1; long long pw = phash(word); for(auto &t : fixedstem) { if(t.first == pw) { pw = t.second; // wcerr << pw << L" proc\n"; st = 0; break; } } if(names.count(pw)) { ans.push_back({phname, {j-word.length(), word.length()}}); } else { long long std = 0; if(st) { std = stem(word); for(auto &t : replaced) { if(std == t.first) { std = t.second; break; } } } ans.push_back({st ? std : pw, {j-word.length(), word.length()}}); } } word.clear(); prevKind = 0; if(isLetter(i)) { word.push_back(i); prevKind = 1; } else if(isDigit(i)) { word.push_back('0'); prevKind = 2; } } } return ans; }