/*
 * Tokenize one word whose first character is an ALetter, Numeric or
 * Katakana character.
 *
 * `state` is the word break property of the first character of the word,
 * which `iter` must point at on entry.  The iterator is advanced over every
 * character that belongs to the word, and a single Token spanning the word
 * is appended to `inversion`.
 *
 * Returns the word break property of the character the iterator stopped at,
 * or -1 if the text ended directly after the first character.
 */
static int
S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
             int state, Inversion *inversion) {
    const lucy_StringIter word_start = *iter;
    lucy_StringIter word_end;
    Token *word;
    int wb = -1;

    // The first character always belongs to the word.
    S_iter_advance(text, iter);
    word_end = *iter;

    while (iter->byte_pos < len) {
        int extends_word;

        wb = S_wb_lookup(text + iter->byte_pos);

        if (wb == WB_ALetter || wb == WB_Numeric) {
            // Letters and digits never directly follow Katakana.
            extends_word = (state != WB_Katakana);
        }
        else if (wb == WB_Katakana) {
            // Katakana never directly follows letters or digits.
            extends_word = (state != WB_ALetter && state != WB_Numeric);
        }
        else if (wb == WB_ExtendNumLet) {
            extends_word = 1;
        }
        else if (wb == WB_Extend_Format) {
            // Extend and format characters are transparent: the state
            // stays what it was before them.
            wb = state;
            extends_word = 1;
        }
        else if (wb == WB_MidNumLet || wb == WB_MidLetter
                 || wb == WB_MidNum) {
            // A "mid" character only continues the word if it is
            // compatible with the current state and the character found
            // by S_skip_extend_format() has the same property as the
            // current state.
            extends_word = 0;
            if ((state == WB_ALetter && wb != WB_MidNum)
                || (state == WB_Numeric && wb != WB_MidLetter)
               ) {
                wb = S_skip_extend_format(text, len, iter);
                extends_word = (wb == state);
            }
        }
        else {
            extends_word = 0;
        }

        if (!extends_word) {
            break;
        }

        state = wb;
        S_iter_advance(text, iter);
        word_end = *iter;
    }

    word = Token_new(text + word_start.byte_pos,
                     word_end.byte_pos - word_start.byte_pos,
                     word_start.char_pos, word_end.char_pos, 1.0f, 1);
    Inversion_Append(inversion, word);

    return wb;
}
/*
 * Tokenize a word made of exactly one codepoint plus any extend or format
 * characters that follow it.  This handles Alphabetic characters lacking
 * the ALetter word break property: ideographs, Hiragana, and "complex
 * content".
 *
 * The iterator is moved by S_skip_extend_format(); the Token appended to
 * `inversion` covers everything between the old and the new iterator
 * position.  Returns the word break property reported by
 * S_skip_extend_format() for the character the iterator ends up on.
 */
static int
S_parse_single(const char *text, size_t len, lucy_StringIter *iter,
               Inversion *inversion) {
    // Remember where the word begins before the iterator moves.
    const lucy_StringIter origin = *iter;
    const int next_wb = S_skip_extend_format(text, len, iter);
    Token *single;

    single = Token_new(text + origin.byte_pos,
                       iter->byte_pos - origin.byte_pos,
                       origin.char_pos, iter->char_pos, 1.0f, 1);
    Inversion_Append(inversion, single);

    return next_wb;
}
/*
 * Split `text` (`len` bytes of UTF-8) on whitespace and append one Token to
 * `inversion` for every maximal run of non-whitespace characters.
 *
 * Each token records its byte range within `text` and its start/end offsets
 * counted in code points.  Whitespace is whatever isspace() reports for code
 * points up to 0xFF; code points above that are never treated as whitespace.
 *
 * NOTE(review): the input is assumed to be valid UTF-8 -- the decoder and
 * the StrHelp_UTF8_COUNT lookup are not given the remaining length; confirm
 * that callers validate the text first.
 */
void
WhitespaceTokenizer_Tokenize_Str_IMP(WhitespaceTokenizer *self,
                                     const char *text, size_t len,
                                     Inversion *inversion) {
    size_t byte_pos       = 0;
    size_t char_pos       = 0;
    size_t start_byte_pos = 0;
    size_t start_char_pos = 0;
    int    prev_ws        = 1;

    while (byte_pos < len) {
        uint32_t cp = StrHelp_decode_utf8_char(text + byte_pos);
        // isspace() has undefined behavior for arguments that are neither
        // EOF nor representable as unsigned char, so only consult it for
        // code points in that range.
        int ws = (cp <= 0xFF) ? isspace((int)cp) : 0;

        if (prev_ws && !ws) {
            // Whitespace -> word transition: a new token starts here.
            start_byte_pos = byte_pos;
            start_char_pos = char_pos;
        }
        else if (!prev_ws && ws) {
            // Word -> whitespace transition: emit the finished token.
            Token *token = Token_new(text + start_byte_pos,
                                     byte_pos - start_byte_pos,
                                     start_char_pos, char_pos, 1.0f, 1);
            Inversion_Append(inversion, token);
        }

        prev_ws = ws;
        byte_pos += StrHelp_UTF8_COUNT[(uint8_t)text[byte_pos]];
        char_pos += 1;
    }

    if (!prev_ws) {
        // The text ended inside a word: emit the trailing token.
        Token *token = Token_new(text + start_byte_pos,
                                 byte_pos - start_byte_pos,
                                 start_char_pos, char_pos, 1.0f, 1);
        Inversion_Append(inversion, token);
    }
}
/*
 * Build a new Inversion containing every token of `inversion` whose text is
 * not found in the Stopalizer's stoplist.
 *
 * Tokens that are kept are shared with the input: their reference count is
 * incremented before they are appended.  `inversion` is consumed through
 * Inversion_Next() until it returns NULL.
 */
Inversion*
Stopalizer_transform(Stopalizer *self, Inversion *inversion) {
    Inversion *filtered = Inversion_new(NULL);
    Hash *const stoplist = self->stoplist;
    Token *token;

    for (token = Inversion_Next(inversion);
         token != NULL;
         token = Inversion_Next(inversion)
        ) {
        if (Hash_Fetch_Str(stoplist, token->text, token->len)) {
            // Stopword: leave it out of the result.
            continue;
        }
        Inversion_Append(filtered, (Token*)INCREF(token));
    }

    return filtered;
}
/*
 * Build a new Inversion containing every token of `inversion` whose text is
 * not a key in the filter's stoplist.
 *
 * Tokens that are kept are shared with the input: their reference count is
 * incremented before they are appended.  `inversion` is consumed through
 * Inversion_Next() until it returns NULL.
 */
Inversion*
SnowStop_Transform_IMP(SnowballStopFilter *self, Inversion *inversion) {
    Inversion *filtered = Inversion_new(NULL);
    SnowballStopFilterIVARS *const ivars = SnowStop_IVARS(self);
    Hash *const stoplist = ivars->stoplist;
    Token *token;

    for (token = Inversion_Next(inversion);
         token != NULL;
         token = Inversion_Next(inversion)
        ) {
        TokenIVARS *const token_ivars = Token_IVARS(token);

        if (Hash_Fetch_Utf8(stoplist, token_ivars->text, token_ivars->len)) {
            // Stopword: leave it out of the result.
            continue;
        }
        Inversion_Append(filtered, (Token*)INCREF(token));
    }

    return filtered;
}
/*
 * Create an empty Inversion with room for 16 tokens.
 *
 * If `seed_token` is non-NULL, its reference count is incremented and it is
 * appended as the first token; pass NULL to start with no tokens.
 */
Inversion*
Inversion_new(Token *seed_token) {
    Inversion *self = (Inversion*)Class_Make_Obj(INVERSION);
    InversionIVARS *const ivars = Inversion_IVARS(self);

    // Token storage: zero-filled array, nothing stored yet.
    ivars->cap    = 16;
    ivars->size   = 0;
    ivars->tokens = (Token**)CALLOCATE(ivars->cap, sizeof(Token*));

    // Iteration and inversion bookkeeping all start out cleared.
    ivars->cur                 = 0;
    ivars->inverted            = false;
    ivars->cluster_counts      = NULL;
    ivars->cluster_counts_size = 0;

    if (seed_token != NULL) {
        Token *seed = (Token*)INCREF(seed_token);
        Inversion_Append(self, seed);
    }

    return self;
}
/*
 * Tokenize one word whose first character is an ALetter, Numeric, Katakana,
 * or ExtendNumLet character.
 *
 * `state` is the word break property of the first character of the word,
 * which `iter` must point at on entry.  The iterator is advanced over every
 * character that belongs to the word, and a single Token spanning the word
 * is appended to `inversion`.  The "Rule WBn" comments name the word
 * boundary rule each branch implements.
 *
 * Returns the word break property of the character the iterator stopped at,
 * or -1 if the text ended directly after the first character.
 */
static int
S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
             int state, Inversion *inversion) {
    const lucy_StringIter word_start = *iter;
    lucy_StringIter word_end;
    Token *word;
    int wb = -1;

    // The first character always belongs to the word.
    S_iter_advance(text, iter);
    word_end = *iter;

    while (iter->byte_pos < len) {
        // Set to nonzero when the current character continues the word.
        int joins = 0;

        wb = S_wb_lookup(text + iter->byte_pos);

        switch (wb) {
            case WB_ALetter:
            case WB_Hebrew_Letter:
            case WB_Numeric:
                // Rules WB5, WB8, WB9, WB10, and WB13b.
                joins = (state != WB_Katakana);
                break;

            case WB_Katakana:
                // Rules WB13 and WB13b.
                joins = (state == WB_Katakana || state == WB_ExtendNumLet);
                break;

            case WB_ExtendNumLet:
                // Rule WB13a.
                joins = 1;
                break;

            case WB_Extend_Format:
                // Rule WB4. Keep state.
                wb = state;
                joins = 1;
                break;

            case WB_Single_Quote:
            case WB_MidNumLet:
            case WB_MidLetter:
            case WB_MidNum:
                if (state == WB_ALetter || state == WB_Hebrew_Letter) {
                    if (wb != WB_MidNum) {
                        if (state == WB_Hebrew_Letter
                            && wb == WB_Single_Quote
                           ) {
                            // Rule WB7a. The quote belongs to the word
                            // even if no letter follows it.
                            ++word_end.byte_pos;
                            ++word_end.char_pos;
                        }
                        wb = S_skip_extend_format(text, len, iter);
                        // Rules WB6 and WB7.
                        joins = (wb == WB_ALetter || wb == WB_Hebrew_Letter);
                    }
                }
                else if (state == WB_Numeric) {
                    if (wb != WB_MidLetter) {
                        wb = S_skip_extend_format(text, len, iter);
                        // Rules WB11 and WB12.
                        joins = (wb == WB_Numeric);
                    }
                }
                break;

            case WB_Double_Quote:
                if (state == WB_Hebrew_Letter) {
                    wb = S_skip_extend_format(text, len, iter);
                    // Rules WB7b and WB7c.
                    joins = (wb == WB_Hebrew_Letter);
                }
                break;

            default:
                break;
        }

        if (!joins) {
            break;
        }

        state = wb;
        S_iter_advance(text, iter);
        word_end = *iter;
    }

    word = Token_new(text + word_start.byte_pos,
                     word_end.byte_pos - word_start.byte_pos,
                     word_start.char_pos, word_end.char_pos, 1.0f, 1);
    Inversion_Append(inversion, word);

    return wb;
}