stem(const char *lang, const char *enc) { m_stem = sb_stemmer_new(lang, enc); if (!m_stem) { m_stem = sb_stemmer_new("eng", enc); if (!m_stem) throw std::bad_alloc(); } }
/// Constructs a wrapper whose stemmer is created for szLanguage, or for
/// "english" when szLanguage is null. The encoding argument handed to
/// sb_stemmer_new is NULL (UTF-8).
CSnowballWrapper::CSnowballWrapper(const tchar* szLanguage)
    : m_pStemmerBuffer(NULL)
    , m_nBufferSize(0)
    , m_pIrregularWords(NULL)
{
    // NULL encoding: using utf-8
    m_pStemmer = sb_stemmer_new(szLanguage ? szLanguage : "english", NULL);
}
/// Constructs a wrapper whose stemmer is created for szLanguage (or for
/// "english" when szLanguage is null) with the character encoding szEnc.
CSnowballWrapper::CSnowballWrapper(const tchar* szLanguage,const tchar* szEnc)
    : m_pStemmerBuffer(NULL)
    , m_nBufferSize(0)
    , m_pIrregularWords(NULL)
{
    m_pStemmer = sb_stemmer_new(szLanguage ? szLanguage : "english", szEnc);
}
/// Constructs a wrapper with an optional irregular-words setting.
///
/// When szIrregularWords is non-null, an empty string-to-string map is
/// allocated and the given text is stored in m_sIrregularWords. The stemmer
/// is created for szLanguage (or "english" when szLanguage is null) with the
/// character encoding szEnc.
CSnowballWrapper::CSnowballWrapper(const tchar* szLanguage,const tchar* szIrregularWords,const tchar* szEnc)
    : m_pStemmerBuffer(NULL)
    , m_nBufferSize(0)
    , m_pIrregularWords(NULL)
{
    if (szIrregularWords != NULL)
    {
        m_pIrregularWords = new map<string,string>();
        m_sIrregularWords = szIrregularWords;
    }

    m_pStemmer = sb_stemmer_new(szLanguage ? szLanguage : "english", szEnc);
}
Datum stem_token_arr(PG_FUNCTION_ARGS) { if (PG_ARGISNULL(0)) { PG_RETURN_NULL(); } /* Prepare elements to receive input text[] */ ArrayType *arr = PG_GETARG_ARRAYTYPE_P(0); Datum *dtum; bool *nulls; int ndim; /* Deconstruct input text[] */ deconstruct_array(arr, TEXTOID, -1, false, 'i', &dtum, &nulls, &ndim); /* Prepare stemmer */ struct sb_stemmer *stemmer = sb_stemmer_new( "english" /* language */, NULL /* language encoding NULL for UTF-8 */); Assert(stemmer); /* Call stemming code */ text **result = (text **) palloc(ndim * sizeof(text * )); for(int i=0; i< ndim; i++) { text *token = dtum[i] == 0 ? NULL : DatumGetTextP(dtum[i]); char *empty; if(token == NULL) { empty = (char *)palloc(sizeof(char)); empty[0] = '\0'; } result[i] = (token == NULL ? cstring_to_text(empty) : cstring_to_text(stem_token_text(stemmer, token))); } ArrayType *res = construct_array((Datum*)result, ndim, TEXTOID, -1, false, 'i'); sb_stemmer_delete(stemmer); PG_RETURN_ARRAYTYPE_P(res); }
/// Default constructor: creates the stemmer for "english" and passes a NULL
/// encoding to sb_stemmer_new (UTF-8).
CSnowballWrapper::CSnowballWrapper()
    : m_pStemmerBuffer(NULL)
    , m_nBufferSize(0)
    , m_pIrregularWords(NULL)
{
    const char* const defaultLanguage = "english";
    const char* const defaultEncoding = NULL; // using utf-8

    m_pStemmer = sb_stemmer_new(defaultLanguage, defaultEncoding);
}
/// Builds a token filter that stems with the Snowball algorithm called
/// `name`.
///
/// The name is converted to UTF-8 and the stemmer is requested with the
/// "UTF_8" encoding. IllegalArgumentException is thrown when no stemmer is
/// returned for that name.
SnowballFilter::SnowballFilter(const TokenStreamPtr& input, const String& name)
    : TokenFilter(input)
{
    stemmer = sb_stemmer_new(StringUtils::toUTF8(name).c_str(), "UTF_8");
    if (!stemmer)
    {
        const String message(L"language not available for stemming:" + name);
        boost::throw_exception(IllegalArgumentException(message));
    }

    termAtt = addAttribute<TermAttribute>();
    utf8Result = newLucene<UTF8Result>();
}
/// Creates a wrapper holding a Snowball stemmer for `countryCode`.
///
/// The sentinel code "UNKNOWN" falls back to the "en" stemmer; every other
/// code is passed to Snowball unchanged. The wrapper always records the
/// original `countryCode`. The stemmer handle may be null when Snowball does
/// not know the requested code.
///
/// Fix: the previous `if (strcmp(cCode, "UNKNOWN"))` test was inverted
/// (strcmp returns non-zero when the strings differ), so every known code was
/// replaced by "en" while "UNKNOWN" itself was handed to Snowball.
SbStemmerWrapper SbStemmerWrapper::create(string countryCode) {
    const bool isUnknown = (countryCode == "UNKNOWN");
    const char *algorithm = isUnknown ? "en" : countryCode.c_str();

    auto stemmer = sb_stemmer_new(algorithm, "UTF_8");
    return SbStemmerWrapper(countryCode, stemmer);
}
Datum stem_token(PG_FUNCTION_ARGS) { if (PG_ARGISNULL(0)) { PG_RETURN_NULL(); } text * org_token = PG_GETARG_TEXT_P(0); struct sb_stemmer *stemmer = sb_stemmer_new( "english" /* language */, NULL /* language encoding NULL for UTF-8 */); Assert(stemmer); text *stemmed = cstring_to_text(stem_token_text(stemmer, org_token)); sb_stemmer_delete(stemmer); PG_RETURN_TEXT_P(stemmed); }
Stemmer *__newSnowballStemmer(const char *language) { struct sb_stemmer *sb = sb_stemmer_new(language, NULL); // No stemmer available for this language if (!sb) { return NULL; } Stemmer *ret = malloc(sizeof(Stemmer)); ret->ctx = sb; ret->Stem = __sbstemmer_Stem; ret->Free = __sbstemmer_Free; return ret; }
/*
 * Token filter callback: stems the data of current_token with the "english"
 * Snowball algorithm and stores the result as the data of next_token.
 *
 * Does nothing unless the context encoding is UTF-8. Any stemmer already
 * held by the filter is deleted and a new one is created on every call.
 * Errors (stemmer creation failure, NULL stem result) are reported through
 * GRN_PLUGIN_ERROR and leave next_token untouched.
 */
static void
stem_filter(grn_ctx *ctx,
            grn_token *current_token,
            grn_token *next_token,
            void *user_data)
{
  grn_stem_token_filter *filter = user_data;
  grn_obj *token_text;
  const sb_symbol *stem;
  /* TODO: Detect algorithm from the current token. */
  const char *algorithm = "english";
  const char *encoding = "UTF_8";

  if (GRN_CTX_GET_ENCODING(ctx) != GRN_ENC_UTF8) {
    return;
  }

  token_text = grn_token_get_data(ctx, current_token);

  /* Replace whatever stemmer was left over from the previous call. */
  if (filter->stemmer) {
    sb_stemmer_delete(filter->stemmer);
  }
  filter->stemmer = sb_stemmer_new(algorithm, encoding);
  if (!filter->stemmer) {
    GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
                     "[token-filter][stem] "
                     "failed to create stemmer: "
                     "algorithm=<%s>, encoding=<%s>",
                     algorithm, encoding);
    return;
  }

  stem = sb_stemmer_stem(filter->stemmer,
                         GRN_TEXT_VALUE(token_text),
                         GRN_TEXT_LEN(token_text));
  if (!stem) {
    GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
                     "[token-filter][stem] "
                     "failed to allocate memory for stemmed word: <%.*s>",
                     (int)GRN_TEXT_LEN(token_text),
                     GRN_TEXT_VALUE(token_text));
    return;
  }

  grn_token_set_data(ctx, next_token, stem,
                     sb_stemmer_length(filter->stemmer));
}
/*
 * Creates the Snowball stemmer for sp->lang->name using the "UTF_8"
 * encoding and stores it in sp->stemmer.
 *
 * Returns 0 on success. On failure an error message is stored in *error_r,
 * the filter is destroyed via fts_filter_stemmer_snowball_destroy(), and -1
 * is returned.
 */
static int
fts_filter_stemmer_snowball_create_stemmer(struct fts_filter_stemmer_snowball *sp,
					   const char **error_r)
{
	sp->stemmer = sb_stemmer_new(sp->lang->name, "UTF_8");
	if (sp->stemmer != NULL)
		return 0;

	/* Build the message before the filter is torn down. */
	*error_r = t_strdup_printf(
		"Creating a Snowball stemmer for language '%s' failed.",
		sp->lang->name);
	fts_filter_stemmer_snowball_destroy(&sp->filter);
	return -1;
}
/*
 * Initialises `lang` for German: sets the code "deu" and the full name
 * "german", allocates the language-specific state, installs the parse
 * callback, creates the Snowball stemmer and loads the stopwords.
 *
 * Returns IPLUS1_SUCCESS, or IPLUS1_FAIL when the state cannot be allocated
 * or the stemmer cannot be created.
 *
 * NOTE(review): on stemmer failure the allocated state and full_lang are not
 * released here - confirm whether the caller cleans up.
 */
int init(iplus1_lang_t* lang)
{
    iplus1_german_t* state;

    strcpy(lang->lang, "deu");
    lang->full_lang = strdup("german");

    state = (iplus1_german_t*)malloc(sizeof(iplus1_german_t));
    lang->param = state;
    if (state == NULL) {
        return IPLUS1_FAIL;
    }

    lang->parse = parse;

    state->stemmer = sb_stemmer_new("deu", "UTF_8");
    if (state->stemmer == NULL) {
        fprintf(stderr, "could not find german stemmer\n");
        return IPLUS1_FAIL;
    }

    load_stopwords(state);
    return IPLUS1_SUCCESS;
}
/*
 * Initialises `lang` for Portuguese: sets the code "por" and the full name,
 * allocates the language-specific state, installs the parse callback,
 * creates the Snowball stemmer and loads the stopwords.
 *
 * Returns IPLUS1_SUCCESS, or IPLUS1_FAIL when the state cannot be allocated
 * or the stemmer cannot be created.
 *
 * NOTE(review): on stemmer failure the allocated state and full_lang are not
 * released here - confirm whether the caller cleans up.
 */
int init(iplus1_lang_t* lang)
{
    iplus1_portugese_t* state;

    strcpy(lang->lang, "por");
    lang->full_lang = strdup("portugese");

    state = (iplus1_portugese_t*)malloc(sizeof(iplus1_portugese_t));
    lang->param = state;
    if (state == NULL) {
        return IPLUS1_FAIL;
    }

    lang->parse = parse;

    state->stemmer = sb_stemmer_new("por", "UTF_8");
    if (state->stemmer == NULL) {
        fprintf(stderr, "could not find portugese stemmer\n");
        return IPLUS1_FAIL;
    }

    load_stopwords(state);
    return IPLUS1_SUCCESS;
}
/*
 * Initialises a SnowballStemmer for `language`.
 *
 * Stores a clone of `language` in the instance, then asks Snowball for a
 * "UTF_8" stemmer identified by the first two code points of `language`,
 * lower-cased. Throws ERR when Snowball returns no stemmer.
 */
SnowballStemmer*
SnowStemmer_init(SnowballStemmer *self, String *language) {
    char code[3];
    int  pos;

    Analyzer_init((Analyzer*)self);
    SnowballStemmerIVARS *const ivars = SnowStemmer_IVARS(self);
    ivars->language = Str_Clone(language);

    // Get a Snowball stemmer.  Be case-insensitive.
    for (pos = 0; pos < 2; pos++) {
        code[pos] = tolower(Str_Code_Point_At(language, pos));
    }
    code[2] = '\0';

    ivars->snowstemmer = sb_stemmer_new(code, "UTF_8");
    if (ivars->snowstemmer == NULL) {
        THROW(ERR, "Can't find a Snowball stemmer for %o", language);
    }

    return self;
}
/*
 * Initialises `lang` for English: sets the code "eng" and the full name
 * "english", allocates the language-specific state, installs the parse
 * callback, creates the Snowball stemmer and loads the stopwords.
 *
 * Returns IPLUS1_SUCCESS, or IPLUS1_FAIL when the state cannot be allocated
 * or the stemmer cannot be created.
 *
 * NOTE(review): on stemmer failure the allocated state and full_lang are not
 * released here - confirm whether the caller cleans up.
 */
int init(iplus1_lang_t* lang)
{
    iplus1_english_t* state;

    strcpy(lang->lang, "eng");
    lang->full_lang = strdup("english");

    state = (iplus1_english_t*)malloc(sizeof(iplus1_english_t));
    lang->param = state;
    if (state == NULL) {
        return IPLUS1_FAIL;
    }

    lang->parse = parse;

    state->stemmer = sb_stemmer_new("eng", "UTF_8");
    if (state->stemmer == NULL) {
        fprintf(stderr, "could not find english stemmer\n");
        return IPLUS1_FAIL;
    }

    load_stopwords(state);
    return IPLUS1_SUCCESS;
}
/*
 * fuzzy_snowball(word, language = default)
 *
 * Returns the Snowball stem of `word` as a new string carrying the encoding
 * of `word`, or nil when no stemmer exists for `language`. When `language`
 * is nil, fuzzy_default_language is used.
 *
 * Raises ArgumentError when `word` is not a String and NoMemoryError when
 * the stemmer cannot produce a result (sb_stemmer_stem returned NULL).
 */
VALUE fuzzy_snowball(int argc, VALUE * argv, VALUE self) {
    VALUE word, language, result = Qnil;

    rb_scan_args(argc, argv, "11", &word, &language);
    if (NIL_P(language))
        language = fuzzy_default_language;

    if (TYPE(word) != T_STRING)
        rb_raise(rb_eArgError, "invalid word, expect string");

    struct sb_stemmer *stemmer = sb_stemmer_new(CSTRING(language), "UTF_8");
    if (!stemmer)
        return result;

    const sb_symbol *stem = sb_stemmer_stem(stemmer, (const sb_symbol *)RSTRING_PTR(word), RSTRING_LEN(word));
    if (!stem) {
        /* Previously a NULL stem was passed straight to rb_enc_str_new. */
        sb_stemmer_delete(stemmer);
        rb_raise(rb_eNoMemError, "snowball stemmer ran out of memory");
    }

    uint32_t stem_len = sb_stemmer_length(stemmer);
    /* The stem buffer belongs to the stemmer, so copy it before deleting. */
    result = rb_enc_str_new((const char *)stem, stem_len, rb_enc_get(word));
    sb_stemmer_delete(stemmer);

    return result;
}
/*
 * Looks up (or creates and caches) the Snowball stemmer for the language
 * named by the atom in `t` and stores it in *stemmer.
 *
 * Returns TRUE on success. Otherwise returns the result of type_error()
 * when `t` is not an atom, resource_error("memory") when stemmer creation
 * failed with ENOMEM, or domain_error("snowball_algorithm", t) for any other
 * creation failure. A newly cached language atom is registered with
 * PL_register_atom().
 *
 * errno is cleared before sb_stemmer_new() so that a stale ENOMEM left by an
 * earlier call cannot turn an unknown-algorithm failure into a memory error.
 */
static int
get_lang_stemmer(term_t t, struct sb_stemmer **stemmer)
{ stem_cache *cache = get_cache();
  atom_t lang;
  int i;

  if ( !PL_get_atom(t, &lang) )
    return type_error("atom", t);

  /* Already cached? */
  for(i=0; i<CACHE_SIZE; i++)
  { if ( cache->stemmers[i].language == lang )
    { *stemmer = cache->stemmers[i].stemmer;
      return TRUE;
    }
  }

  /* Create the stemmer in the first free slot */
  for(i=0; i<CACHE_SIZE; i++)
  { struct sb_stemmer *st;

    if ( cache->stemmers[i].stemmer )
      continue;

    errno = 0;
    if ( !(st = sb_stemmer_new(PL_atom_chars(lang), NULL)) )
    { if ( errno == ENOMEM )
	return resource_error("memory");
      else
	return domain_error("snowball_algorithm", t);
    }

    cache->stemmers[i].language = lang;
    cache->stemmers[i].stemmer  = st;
    PL_register_atom(cache->stemmers[i].language);
    *stemmer = cache->stemmers[i].stemmer;
    return TRUE;
  }

  assert(0);				/* TBD: clean cache */
  return FALSE;
}
/// Sets up the helper state: a 512-entry permutation table (the 256 entries
/// of `p` repeated twice), the CRC table, a random generator seeded with 4,
/// the Perlin noise parameters and, except on Q_OS_MAC builds, an "en"
/// Snowball stemmer using the "UTF_8" encoding.
Utils::Utils()
{
    const int kPermutationEntries = 512;
    const size_t kPermutationBytes = kPermutationEntries * sizeof(int);

    permutationTable_ = static_cast<int *>(malloc(kPermutationBytes));
    memset(permutationTable_, 0, kPermutationBytes);
    for (int slot = 0; slot != kPermutationEntries; ++slot)
    {
        // Wrap the index so the second half mirrors the first.
        permutationTable_[slot] = p[slot & 255];
    }

    // NOTE(review): this clears 256 bytes - confirm that matches the size of
    // crcTab_ (crc32Initialize() is called right after).
    memset(crcTab_, 0, 256);
    crc32Initialize();

    randomGenerator_ = new CRandomMersenne(4);

    // Frequencies tried previously: 20, 0.4, 500. Persistence 0.02 was also
    // tried and is currently not set.
    perlin_.SetFrequency(800);
    perlin_.SetOctaveCount(20);

#ifndef Q_OS_MAC
    stemmer_ = sb_stemmer_new("en", "UTF_8");
#endif
}
/*
 * Command-line driver.
 *
 * Options:
 *   -i <file>   input file (default: stdin)
 *   -o <file>   output file (default: stdout)
 *   -l <lang>   stemming language (default: "english")
 *   -c <enc>    character encoding (default: NULL)
 *   -p          set the `pretty` flag
 *   -h          call usage(0)
 *
 * Exits with status 1 on a missing option argument, on a file that cannot be
 * opened and when no stemmer is available; unknown options and stray
 * parameters are reported and passed to usage(1). Returns 0 on success.
 */
int main(int argc, char * argv[])
{
    const char * in_path = 0;
    const char * out_path = 0;
    const char * language = "english";
    const char * charenc = NULL;
    FILE * f_in;
    FILE * f_out;
    struct sb_stemmer * stemmer;
    int next = 1;

    pretty = 0;
    progname = argv[0];

    while (next < argc) {
        const char * opt = argv[next++];
        const char ** value_slot = 0;   /* set for options taking a value */

        if (opt[0] != '-') {
            fprintf(stderr, "unexpected parameter %s\n", opt);
            usage(1);
            continue;
        }

        if (strcmp(opt, "-o") == 0) {
            value_slot = &out_path;
        } else if (strcmp(opt, "-i") == 0) {
            value_slot = &in_path;
        } else if (strcmp(opt, "-l") == 0) {
            value_slot = &language;
        } else if (strcmp(opt, "-c") == 0) {
            value_slot = &charenc;
        }

        if (value_slot != 0) {
            if (next >= argc) {
                fprintf(stderr, "%s requires an argument\n", opt);
                exit(1);
            }
            *value_slot = argv[next++];
        } else if (strcmp(opt, "-p") == 0) {
            pretty = 1;
        } else if (strcmp(opt, "-h") == 0) {
            usage(0);
        } else {
            fprintf(stderr, "option %s unknown\n", opt);
            usage(1);
        }
    }

    /* prepare the files */
    f_in = stdin;
    if (in_path != 0) {
        f_in = fopen(in_path, "r");
    }
    if (f_in == 0) {
        fprintf(stderr, "file %s not found\n", in_path);
        exit(1);
    }

    f_out = stdout;
    if (out_path != 0) {
        f_out = fopen(out_path, "w");
    }
    if (f_out == 0) {
        fprintf(stderr, "file %s cannot be opened\n", out_path);
        exit(1);
    }

    /* do the stemming process: */
    stemmer = sb_stemmer_new(language, charenc);
    if (stemmer == 0) {
        if (charenc != NULL) {
            fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
        } else {
            fprintf(stderr, "language `%s' not available for stemming\n", language);
        }
        exit(1);
    }

    stem_file(stemmer, f_in, f_out);
    sb_stemmer_delete(stemmer);

    /* only close streams this function opened itself */
    if (in_path != 0) (void) fclose(f_in);
    if (out_path != 0) (void) fclose(f_out);

    return 0;
}
void Snowball(sLONG_PTR *pResult, PackagePtr pParams) { C_TEXT Param1; ARRAY_TEXT Param2; ARRAY_TEXT Param3; C_LONGINT Param4; Param1.fromParamAtIndex(pParams, 1); Param4.fromParamAtIndex(pParams, 4); CUTF8String t; Param1.copyUTF8String(&t); std::string str((const char *)t.c_str()); std::vector<std::string>words = split(str); struct sb_stemmer *stemmer; char *language = (char *)"english"; Snowball_Language lang = (Snowball_Language)Param4.getIntValue(); switch (lang) { case Snowball_Danish: language = (char *)"danish"; break; case Snowball_Dutch: language = (char *)"dutch"; break; case Snowball_English: language = (char *)"english"; break; case Snowball_Finnish: language = (char *)"finnish"; break; case Snowball_French: language = (char *)"french"; break; case Snowball_German: language = (char *)"german"; break; case Snowball_Hungarian: language = (char *)"hungarian"; break; case Snowball_Italian: language = (char *)"italian"; break; case Snowball_Norwegian: language = (char *)"norwegian"; break; case Snowball_Portuguese: language = (char *)"portuguese"; break; case Snowball_Romanian: language = (char *)"romanian"; break; case Snowball_Russian: language = (char *)"russian"; break; case Snowball_Spanish: language = (char *)"spanish"; break; case Snowball_Swedish: language = (char *)"swedish"; break; case Snowball_Turkish: language = (char *)"turkish"; break; default: break; } char *charenc = NULL;//UTF-8 stemmer = sb_stemmer_new(language, charenc); if (stemmer) { Param2.setSize(1); Param3.setSize(1); for(std::vector<std::string>::iterator it = words.begin(); it != words.end(); ++it) { std::string word = *it; sb_symbol * symbol = (sb_symbol *)word.c_str(); int size = word.length(); const sb_symbol *stemmed = sb_stemmer_stem(stemmer, symbol, size); if (stemmed) { CUTF8String w((const uint8_t *)symbol); CUTF8String s((const uint8_t *)stemmed); Param2.appendUTF8String(&w); Param3.appendUTF8String(&s); } } sb_stemmer_delete(stemmer); } Param2.toParamAtIndex(pParams, 2); 
Param3.toParamAtIndex(pParams, 3); }
/**
 * Returns the Snowball stem of \c word for the given \c locale.
 *
 * The stemmer created for the most recently used locale is kept in
 * function-local statics and reused until a different locale is requested.
 * NOTE(review): no locking is visible around those statics - confirm callers
 * use this from one thread only.
 *
 * \param word   word to stem; converted to UTF-8 before stemming
 * \param locale Snowball algorithm name / language code
 * \return the stem decoded from UTF-8, or \c word unchanged when no stemmer
 *         exists for \c locale or the stemmer fails to produce a result.
 *
 * Fixes over the previous implementation: the input is no longer copied into
 * a malloc'ed buffer that was never freed (and was used after a failed
 * realloc); a NULL stem result is no longer dereferenced; the result is
 * decoded as UTF-8 rather than one QChar per byte, which corrupted stems
 * containing non-ASCII characters.
 */
QString RStemmer::stem(const QString& word, const QString& locale) {
    static QString prevLocale = "";
    static struct sb_stemmer* stemmer = NULL;

    // keep always the last used stemmer in memory:
    if (locale != prevLocale) {
        if (stemmer != NULL) {
            sb_stemmer_delete(stemmer);
            stemmer = NULL;
        }
        stemmer = sb_stemmer_new(locale.toUtf8().constData(), "UTF_8");
        prevLocale = locale;
    }

    if (stemmer == NULL) {
        qWarning() << "No stemmer found for locale: " << locale;
        return word;
    }

    // The UTF-8 bytes of the word can be handed to Snowball directly.
    const QByteArray utf8Word = word.toUtf8();
    const sb_symbol* sbStemmed = sb_stemmer_stem(
        stemmer,
        reinterpret_cast<const sb_symbol*>(utf8Word.constData()),
        utf8Word.size());
    if (sbStemmed == NULL) {
        qWarning() << "RStemmer::stem: Memory allocation error.";
        return word;
    }

    // The buffer is owned by the stemmer; fromUtf8 copies it.
    return QString::fromUtf8(reinterpret_cast<const char*>(sbStemmed),
                             sb_stemmer_length(stemmer));
}
/// Creates a stemmer for `language` using the "UTF_8" encoding.
/// The special name "none" leaves the stemmer handle NULL.
Stemmer::Stemmer( const string& language ) {
    const bool stemmingDisabled = ( language == "none" );

    _stemmer = stemmingDisabled
               ? NULL
               : sb_stemmer_new( language.c_str(), "UTF_8" );
}
/*
 * NOTE(review): this looks like machine-generated C (the embedded comments
 * point at lines of Stemmer.pyx); changes probably belong in the .pyx source.
 *
 * The code below contains, in order:
 *
 * 1. The tail of the function whose traceback name is "Stemmer.algorithms"
 *    (its beginning is not visible here): it returns __pyx_v_py_algs with an
 *    extra reference; the error label __pyx_L1 drops the temporaries, adds a
 *    traceback entry and returns 0. Both paths release __pyx_v_py_algs and
 *    __pyx_v_aliases.
 *
 * 2. __pyx_f_7Stemmer_version: accepts no arguments and returns __pyx_k2p
 *    with an extra reference (0 if argument parsing fails).
 *
 * 3. The start of __pyx_f_7Stemmer_7Stemmer___init__: parses the arguments
 *    "algorithm" (required object) and "maxCacheSize" (optional int,
 *    defaulting to __pyx_d2), takes references on self and algorithm, and
 *    converts algorithm to a C string with PyString_AsString (jumping to the
 *    error label on failure).
 */
/* "/home/richard/private/Working/snowball/pystemmer/src/Stemmer.pyx":68 */ Py_INCREF(__pyx_v_py_algs); __pyx_r = __pyx_v_py_algs; goto __pyx_L0; __pyx_r = Py_None; Py_INCREF(Py_None); goto __pyx_L0; __pyx_L1:; Py_XDECREF(__pyx_1); Py_XDECREF(__pyx_3); Py_XDECREF(__pyx_4); __Pyx_AddTraceback("Stemmer.algorithms"); __pyx_r = 0; __pyx_L0:; Py_DECREF(__pyx_v_py_algs); Py_DECREF(__pyx_v_aliases); return __pyx_r; } static PyObject *__pyx_f_7Stemmer_version(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static char __pyx_doc_7Stemmer_version[] = "Get the version string of the stemming module.\n\n This version number is for the Stemmer module as a whole (not for an\n individual stemming algorithm).\n\n "; static PyObject *__pyx_f_7Stemmer_version(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_r; static char *__pyx_argnames[] = {0}; if (!PyArg_ParseTupleAndKeywords(__pyx_args, __pyx_kwds, "", __pyx_argnames)) return 0; Py_INCREF(__pyx_k2p); __pyx_r = __pyx_k2p; goto __pyx_L0; __pyx_r = Py_None; Py_INCREF(Py_None); __pyx_L0:; return __pyx_r; } static int __pyx_f_7Stemmer_7Stemmer___init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static int __pyx_f_7Stemmer_7Stemmer___init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_algorithm = 0; int __pyx_v_maxCacheSize; int __pyx_r; char *__pyx_1; int __pyx_2; PyObject *__pyx_3 = 0; PyObject *__pyx_4 = 0; static char *__pyx_argnames[] = {"algorithm","maxCacheSize",0}; __pyx_v_maxCacheSize = __pyx_d2; if (!PyArg_ParseTupleAndKeywords(__pyx_args, __pyx_kwds, "O|i", __pyx_argnames, &__pyx_v_algorithm, &__pyx_v_maxCacheSize)) return -1; Py_INCREF(__pyx_v_self); Py_INCREF(__pyx_v_algorithm); /* "/home/richard/private/Working/snowball/pystemmer/src/Stemmer.pyx":118 */ __pyx_1 = PyString_AsString(__pyx_v_algorithm); if (!__pyx_1) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 118; goto __pyx_L1;} 
/*
 * Remainder of __init__: creates the Snowball stemmer for the algorithm name
 * (second argument to sb_stemmer_new is __pyx_k3) and stores it in
 * self->cobj. If the result is NULL, a KeyError is raised whose argument is
 * __pyx_k4p % algorithm. Otherwise max_cache_size is set from maxCacheSize,
 * counter is replaced by the integer 0 and cache by a new dict.
 * Returns 0 on success; on any failure the error label adds a
 * "Stemmer.Stemmer.__init__" traceback entry and the function returns -1.
 * The references taken on self and algorithm are released on both paths.
 */
((struct __pyx_obj_7Stemmer_Stemmer *)__pyx_v_self)->cobj = sb_stemmer_new(__pyx_1,__pyx_k3); /* "/home/richard/private/Working/snowball/pystemmer/src/Stemmer.pyx":119 */ __pyx_2 = (((struct __pyx_obj_7Stemmer_Stemmer *)__pyx_v_self)->cobj == NULL); if (__pyx_2) { __pyx_3 = PyNumber_Remainder(__pyx_k4p, __pyx_v_algorithm); if (!__pyx_3) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 120; goto __pyx_L1;} __pyx_4 = PyTuple_New(1); if (!__pyx_4) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 120; goto __pyx_L1;} PyTuple_SET_ITEM(__pyx_4, 0, __pyx_3); __pyx_3 = 0; __pyx_3 = PyObject_CallObject(PyExc_KeyError, __pyx_4); if (!__pyx_3) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 120; goto __pyx_L1;} Py_DECREF(__pyx_4); __pyx_4 = 0; __Pyx_Raise(__pyx_3, 0, 0); Py_DECREF(__pyx_3); __pyx_3 = 0; {__pyx_filename = __pyx_f[0]; __pyx_lineno = 120; goto __pyx_L1;} goto __pyx_L2; } __pyx_L2:; /* "/home/richard/private/Working/snowball/pystemmer/src/Stemmer.pyx":121 */ ((struct __pyx_obj_7Stemmer_Stemmer *)__pyx_v_self)->max_cache_size = __pyx_v_maxCacheSize; /* "/home/richard/private/Working/snowball/pystemmer/src/Stemmer.pyx":122 */ __pyx_4 = PyInt_FromLong(0); if (!__pyx_4) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 122; goto __pyx_L1;} Py_DECREF(((struct __pyx_obj_7Stemmer_Stemmer *)__pyx_v_self)->counter); ((struct __pyx_obj_7Stemmer_Stemmer *)__pyx_v_self)->counter = __pyx_4; __pyx_4 = 0; /* "/home/richard/private/Working/snowball/pystemmer/src/Stemmer.pyx":123 */ __pyx_3 = PyDict_New(); if (!__pyx_3) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 123; goto __pyx_L1;} Py_DECREF(((struct __pyx_obj_7Stemmer_Stemmer *)__pyx_v_self)->cache); ((struct __pyx_obj_7Stemmer_Stemmer *)__pyx_v_self)->cache = __pyx_3; __pyx_3 = 0; __pyx_r = 0; goto __pyx_L0; __pyx_L1:; Py_XDECREF(__pyx_3); Py_XDECREF(__pyx_4); __Pyx_AddTraceback("Stemmer.Stemmer.__init__"); __pyx_r = -1; __pyx_L0:; Py_DECREF(__pyx_v_self); Py_DECREF(__pyx_v_algorithm); return __pyx_r; }
/// Creates the Greek and the English Snowball stemmers. Both are requested
/// with a NULL encoding argument.
void Words::initiliazeStemmers()
{
    const char* const encoding = NULL;

    stemmerGreek   = sb_stemmer_new("greek", encoding);
    stemmerEnglish = sb_stemmer_new("english", encoding);
}
/// Creates a stemmer for the language named by `language.str()` using the
/// "UTF_8" encoding. The special name "none" leaves the stemmer handle NULL.
Stemmer::Stemmer( const FTSLanguage language ) {
    const bool stemmingDisabled = ( language.str() == "none" );

    _stemmer = stemmingDisabled
               ? NULL
               : sb_stemmer_new( language.str().c_str(), "UTF_8" );
}