Inversion*
StandardTokenizer_transform(StandardTokenizer *self, Inversion *inversion) {
    // Collect the re-tokenized output in a fresh Inversion.
    Inversion *retval = Inversion_new(NULL);

    // Run the standard tokenizer over each incoming token's text.
    for (Token *tok = Inversion_Next(inversion);
         tok != NULL;
         tok = Inversion_Next(inversion)
    ) {
        StandardTokenizer_Tokenize_Str(self, tok->text, tok->len, retval);
    }

    return retval;
}
Inversion*
WhitespaceTokenizer_Transform_IMP(WhitespaceTokenizer *self,
                                  Inversion *inversion) {
    // Accumulate the whitespace-split tokens in a new Inversion.
    Inversion *result = Inversion_new(NULL);

    for (Token *tok = Inversion_Next(inversion);
         tok != NULL;
         tok = Inversion_Next(inversion)
    ) {
        WhitespaceTokenizer_Tokenize_Str(self, Token_Get_Text(tok),
                                         Token_Get_Len(tok), result);
    }

    return result;
}
Inversion*
RegexTokenizer_Transform_IMP(RegexTokenizer *self, Inversion *inversion) {
    // New Inversion to receive the regex-split tokens.
    Inversion *out = Inversion_new(NULL);
    Token     *tok;

    while ((tok = Inversion_Next(inversion)) != NULL) {
        TokenIVARS *const tok_ivars = Token_IVARS(tok);
        RegexTokenizer_Tokenize_Utf8(self, tok_ivars->text, tok_ivars->len,
                                     out);
    }

    return out;
}
Inversion*
Stopalizer_transform(Stopalizer *self, Inversion *inversion) {
    Inversion  *kept     = Inversion_new(NULL);
    Hash *const stoplist = self->stoplist;

    // Copy through only those tokens whose text is absent from the
    // stoplist; each surviving token gets an extra refcount.
    for (Token *tok = Inversion_Next(inversion);
         tok != NULL;
         tok = Inversion_Next(inversion)
    ) {
        if (Hash_Fetch_Str(stoplist, tok->text, tok->len) == NULL) {
            Inversion_Append(kept, (Token*)INCREF(tok));
        }
    }

    return kept;
}
Inversion*
SnowStop_Transform_IMP(SnowballStopFilter *self, Inversion *inversion) {
    SnowballStopFilterIVARS *const ivars    = SnowStop_IVARS(self);
    Hash *const                    stoplist = ivars->stoplist;
    Inversion                     *kept     = Inversion_new(NULL);

    // Forward every token that is not in the stoplist, taking a new
    // reference for the output Inversion.
    for (Token *tok = Inversion_Next(inversion);
         tok != NULL;
         tok = Inversion_Next(inversion)
    ) {
        TokenIVARS *const tok_ivars = Token_IVARS(tok);
        if (Hash_Fetch_Utf8(stoplist, tok_ivars->text, tok_ivars->len)
            == NULL
        ) {
            Inversion_Append(kept, (Token*)INCREF(tok));
        }
    }

    return kept;
}
// Normalize each token's text in place via utf8proc (decompose, then
// reencode according to self->options) and return the same Inversion,
// reset and with an extra refcount.  A scratch codepoint buffer starts
// on the stack and is grown on the heap only when a token needs more
// room; the grown buffer is reused for the remaining tokens.
Inversion*
Normalizer_transform(Normalizer *self, Inversion *inversion) {
    // allocate additional space because utf8proc_reencode adds a
    // terminating null char
    int32_t static_buffer[INITIAL_BUFSIZE + 1];
    int32_t *buffer = static_buffer;
    ssize_t bufsize = INITIAL_BUFSIZE;
    Token *token;

    while (NULL != (token = Inversion_Next(inversion))) {
        // First pass: decompose into the scratch buffer.  A positive
        // return larger than bufsize means "needed this many codepoints"
        // without having written them all.
        ssize_t len = utf8proc_decompose((uint8_t*)token->text, token->len,
                                         buffer, bufsize, self->options);
        if (len > bufsize) {
            // buffer too small, (re)allocate
            if (buffer != static_buffer) {
                FREEMEM(buffer);
            }
            // allocate additional INITIAL_BUFSIZE items
            bufsize = len + INITIAL_BUFSIZE;
            // +1 item for the terminator utf8proc_reencode appends.
            buffer = (int32_t*)MALLOCATE((bufsize + 1) * sizeof(int32_t));
            // Second pass: now guaranteed to fit.
            len = utf8proc_decompose((uint8_t*)token->text, token->len,
                                     buffer, bufsize, self->options);
        }
        if (len < 0) {
            // Decompose error: skip this token, leaving its text as-is.
            continue;
        }

        // Reencode in place within `buffer`; on success `len` becomes the
        // byte length of the UTF-8 result (NUL-terminated by utf8proc).
        len = utf8proc_reencode(buffer, len, self->options);
        if (len >= 0) {
            if (len > (ssize_t)token->len) {
                // Grow the token's own buffer for the longer result + NUL.
                FREEMEM(token->text);
                token->text = (char*)MALLOCATE(len + 1);
            }
            memcpy(token->text, buffer, len + 1);
            token->len = len;
        }
    }

    if (buffer != static_buffer) {
        FREEMEM(buffer);
    }

    // Rewind so downstream analyzers can iterate from the start; caller
    // owns the returned reference.
    Inversion_Reset(inversion);
    return (Inversion*)INCREF(inversion);
}
// Stem each token's text in place using the wrapped Snowball stemmer,
// then return the same Inversion, reset and with an extra refcount.
Inversion*
SnowStemmer_Transform_IMP(SnowballStemmer *self, Inversion *inversion) {
    Token *token;
    SnowballStemmerIVARS *const ivars = SnowStemmer_IVARS(self);
    struct sb_stemmer *const snowstemmer
        = (struct sb_stemmer*)ivars->snowstemmer;

    while (NULL != (token = Inversion_Next(inversion))) {
        TokenIVARS *const token_ivars = Token_IVARS(token);
        const sb_symbol *stemmed_text
            = sb_stemmer_stem(snowstemmer, (sb_symbol*)token_ivars->text,
                              token_ivars->len);
        // sb_stemmer_stem() returns NULL on out-of-memory; skip the
        // token (leaving its text unchanged) rather than dereference it.
        if (stemmed_text == NULL) {
            continue;
        }
        size_t len = sb_stemmer_length(snowstemmer);
        if (len > token_ivars->len) {
            // Grow the token's buffer to fit the longer stem plus NUL.
            FREEMEM(token_ivars->text);
            token_ivars->text = (char*)MALLOCATE(len + 1);
        }
        // Copy len + 1 bytes: the stem plus its trailing NUL terminator
        // (libstemmer's result buffer is NUL-terminated — the original
        // code relied on this same assumption).
        memcpy(token_ivars->text, stemmed_text, len + 1);
        token_ivars->len = len;
    }

    // Rewind for downstream consumers; caller owns the new reference.
    Inversion_Reset(inversion);
    return (Inversion*)INCREF(inversion);
}