/*
 * Split `len` bytes of `text` into tokens and append them to `inversion`.
 *
 * Throws ERR ("Invalid UTF-8 sequence") before doing any work if one of the
 * last three bytes is a lead byte that would need more continuation bytes
 * than remain in the buffer.
 *
 * Characters whose word break property lies in the range
 * WB_ASingle..WB_ExtendNumLet start a token; everything else is skipped.
 * `self` is not read by this function.
 */
void
StandardTokenizer_tokenize_str(StandardTokenizer *self, const char *text,
                               size_t len, Inversion *inversion) {
    /*
     * A byte >= 0xC0 opens a multi-byte sequence, >= 0xE0 one of at least
     * three bytes, >= 0xF0 one of four bytes. Each test is guarded by a
     * length check so no byte before the start of the buffer is read.
     */
    int truncated_tail = 0;
    if (len >= 1 && (uint8_t)text[len - 1] >= 0xC0) {
        truncated_tail = 1;
    }
    else if (len >= 2 && (uint8_t)text[len - 2] >= 0xE0) {
        truncated_tail = 1;
    }
    else if (len >= 3 && (uint8_t)text[len - 3] >= 0xF0) {
        truncated_tail = 1;
    }
    if (truncated_tail) {
        THROW(ERR, "Invalid UTF-8 sequence");
    }

    lucy_StringIter pos = { 0, 0 };

    while (pos.byte_pos < len) {
        int prop = S_wb_lookup(text + pos.byte_pos);

        /*
         * The parse helpers return the property of the character they
         * stopped at, which may itself start another token; keep going
         * until a non-token character or the end of the text is reached.
         */
        for (;;) {
            if (prop < WB_ASingle || prop > WB_ExtendNumLet) { break; }

            prop = (prop == WB_ASingle)
                   ? S_parse_single(text, len, &pos, inversion)
                   : S_parse_word(text, len, &pos, prop, inversion);

            if (pos.byte_pos >= len) { return; }
        }

        /* Current character does not belong to any token: step over it. */
        S_iter_advance(text, &pos);
    }
}
/*
 * Advance the iterator by one character, then keep advancing while the
 * character under the iterator has the WB_Extend_Format property.
 *
 * Returns the word break property of the character the iterator stops on.
 * If the end of the text (`len`) is reached, the most recently looked-up
 * property is returned instead, or -1 if no lookup took place at all.
 */
static int
S_skip_extend_format(const char *text, size_t len, lucy_StringIter *iter) {
    int prop = -1;

    for (;;) {
        S_iter_advance(text, iter);

        /* Ran off the end: report whatever was seen last. */
        if (iter->byte_pos >= len) {
            return prop;
        }

        prop = S_wb_lookup(text + iter->byte_pos);
        if (prop != WB_Extend_Format) {
            return prop;
        }
    }
}
/*
 * Parse a word starting with an ALetter, Numeric or Katakana character.
 * Advances the iterator and returns the word break property of the current
 * character.
 *
 * text, len  - buffer being tokenized and its length in bytes.
 * iter       - positioned on the first character of the word on entry; on
 *              return it sits on the character that ended the word, or at
 *              or past `len`.
 * state      - word break property of the first character.
 * inversion  - receives exactly one new token covering the word.
 *
 * The return value is -1 if the word's first character is the last one in
 * the text. The caller visible in this file checks iter->byte_pos against
 * `len` before using the returned property.
 *
 * NOTE(review): another definition with the same name and signature appears
 * later in this file (it additionally handles Hebrew_Letter and quote
 * classes). Two static definitions of one name cannot coexist in a single
 * translation unit -- confirm which one is meant to be kept.
 */
static int
S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
             int state, Inversion *inversion) {
    int wb = -1;
    lucy_StringIter start = *iter;
    S_iter_advance(text, iter);
    /* `end` trails `iter`: it only moves once a character is accepted. */
    lucy_StringIter end = *iter;

    while (iter->byte_pos < len) {
        wb = S_wb_lookup(text + iter->byte_pos);

        switch (wb) {
            case WB_ALetter:
            case WB_Numeric:
                /* Letters and digits never continue a Katakana run. */
                if (state == WB_Katakana) { goto word_break; }
                break;
            case WB_Katakana:
                /* Katakana never continues a letter or digit run. */
                if (state == WB_ALetter || state == WB_Numeric) {
                    goto word_break;
                }
                break;
            case WB_ExtendNumLet:
                break;
            case WB_Extend_Format:
                /* Keep state. */
                wb = state;
                break;
            case WB_MidNumLet:
            case WB_MidLetter:
            case WB_MidNum:
                /*
                 * A "mid" character joins the word only when the next
                 * character (ignoring Extend/Format) has the same property
                 * as the one before it. `end` is not moved here, so if the
                 * look-ahead fails the mid character is left out of the
                 * token even though `iter` has already stepped past it.
                 */
                if ((state == WB_ALetter && wb != WB_MidNum)
                    || (state == WB_Numeric && wb != WB_MidLetter)) {
                    wb = S_skip_extend_format(text, len, iter);
                    if (wb == state) { break; }
                }
                goto word_break;
            default:
                goto word_break;
        }

        state = wb;
        S_iter_advance(text, iter);
        end = *iter;
    }

    /* Declared ahead of the label: a label must be followed by a statement. */
    Token *token;
word_break:
    token = Token_new(text + start.byte_pos, end.byte_pos - start.byte_pos,
                      start.char_pos, end.char_pos, 1.0f, 1);
    Inversion_Append(inversion, token);

    return wb;
}
/*
 * Parse a word starting with an ALetter, Numeric, Katakana, or ExtendNumLet
 * character. Advances the iterator and returns the word break property of the
 * current character.
 *
 * text, len  - buffer being tokenized and its length in bytes.
 * iter       - positioned on the first character of the word on entry; on
 *              return it sits on the character that ended the word, or at
 *              or past `len`.
 * state      - word break property of the first character. Hebrew_Letter is
 *              also handled as a state below.
 * inversion  - receives exactly one new token covering the word.
 *
 * The return value is -1 if the word's first character is the last one in
 * the text. The caller visible in this file checks iter->byte_pos against
 * `len` before using the returned property.
 */
static int
S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
             int state, Inversion *inversion) {
    int wb = -1;
    lucy_StringIter start = *iter;
    S_iter_advance(text, iter);
    /* `end` trails `iter`: it only moves once a character is accepted. */
    lucy_StringIter end = *iter;

    while (iter->byte_pos < len) {
        wb = S_wb_lookup(text + iter->byte_pos);

        switch (wb) {
            case WB_ALetter:
            case WB_Hebrew_Letter:
            case WB_Numeric:
                if (state == WB_Katakana) { goto word_break; }
                /* Rules WB5, WB8, WB9, WB10, and WB13b. */
                break;
            case WB_Katakana:
                if (state != WB_Katakana && state != WB_ExtendNumLet) {
                    goto word_break;
                }
                /* Rules WB13 and WB13b. */
                break;
            case WB_ExtendNumLet:
                /* Rule WB13a. */
                break;
            case WB_Extend_Format:
                /* Rule WB4. Keep state. */
                wb = state;
                break;
            case WB_Single_Quote:
            case WB_MidNumLet:
            case WB_MidLetter:
            case WB_MidNum:
                /*
                 * A "mid" character joins the word only when the look-ahead
                 * (ignoring Extend/Format) finds a compatible character.
                 * If it does not, `iter` has still moved past the mid
                 * character while `end` has not.
                 */
                if (state == WB_ALetter) {
                    if (wb == WB_MidNum) { goto word_break; }
                    wb = S_skip_extend_format(text, len, iter);
                    if (wb == WB_ALetter || wb == WB_Hebrew_Letter) {
                        /* Rules WB6 and WB7. */
                        state = wb;
                        break;
                    }
                }
                else if (state == WB_Hebrew_Letter) {
                    if (wb == WB_MidNum) { goto word_break; }
                    if (wb == WB_Single_Quote) {
                        /*
                         * Rule WB7a. The quote stays in the token even if
                         * the look-ahead below fails. It is counted as one
                         * byte and one character -- assumes every
                         * Single_Quote character is single-byte; TODO
                         * confirm against the property table.
                         */
                        ++end.byte_pos;
                        ++end.char_pos;
                    }
                    wb = S_skip_extend_format(text, len, iter);
                    if (wb == WB_ALetter || wb == WB_Hebrew_Letter) {
                        /* Rules WB6 and WB7. */
                        state = wb;
                        break;
                    }
                }
                else if (state == WB_Numeric) {
                    if (wb == WB_MidLetter) { goto word_break; }
                    wb = S_skip_extend_format(text, len, iter);
                    if (wb == state) {
                        /* Rules WB11 and WB12. */
                        break;
                    }
                }
                goto word_break;
            case WB_Double_Quote:
                if (state == WB_Hebrew_Letter) {
                    wb = S_skip_extend_format(text, len, iter);
                    if (wb == state) {
                        /* Rules WB7b and WB7c. */
                        break;
                    }
                }
                goto word_break;
            default:
                goto word_break;
        }

        state = wb;
        S_iter_advance(text, iter);
        end = *iter;
    }

    /* Declared ahead of the label: a label must be followed by a statement. */
    Token *token;
word_break:
    token = Token_new(text + start.byte_pos, end.byte_pos - start.byte_pos,
                      start.char_pos, end.char_pos, 1.0f, 1);
    Inversion_Append(inversion, token);

    return wb;
}