// Reads the next sentence from the current text and optionally reports its
// token forms and their ranges.
//
// forms  - if non-null, cleared and then filled by the single-argument
//          next_sentence overload. If null, a scratch `cache` is borrowed from
//          `caches` (or freshly allocated when the pool is empty) and its
//          `forms` vector is used instead.
// tokens - if non-null, cleared and then given one entry per form, built from
//          the value of `chars` at the form's start and the number of `chars`
//          increments taken to cross the form.
//
// Returns false without touching `chars` when there is no text or the text is
// exhausted; otherwise returns whatever the single-argument overload returned.
//
// NOTE(review): `chars` is bumped once per utf8_advance call, so it presumably
// counts Unicode code points consumed so far - confirm against utf8_advance.
bool utf8_tokenizer::next_sentence(vector<string_piece>* forms, vector<token_range>* tokens) {
  if (forms) forms->clear();
  if (tokens) tokens->clear();

  // Nothing to read: either no text was set or all of it has been consumed.
  if (!text || text == text_end) return false;

  // The caller did not ask for forms, but they are still needed below to walk
  // the text, so borrow a scratch buffer from the pool.
  cache* borrowed = nullptr;
  if (!forms) {
    borrowed = caches.pop();
    if (!borrowed) borrowed = new cache();
    forms = &borrowed->forms;
    forms->clear();
  }

  // Remember where this sentence started; `text` is expected to move forward
  // inside the overload called below.
  const char* cursor = text;
  const bool found = next_sentence(*forms);

  for (auto&& form : *forms) {
    const char* form_begin = form.str;
    const char* form_end = form.str + form.len;

    // Step over whatever lies between the previous form and this one.
    while (cursor < form_begin) {
      utf8_advance(cursor, form_begin);
      chars++;
    }

    // Step over the form itself, remembering the counter at its start.
    size_t first_char = chars;
    while (cursor < form_end) {
      utf8_advance(cursor, form_end);
      chars++;
    }

    if (tokens) tokens->emplace_back(first_char, chars - first_char);
  }

  // Account for any text after the last form, up to the new read position.
  while (cursor < text) {
    utf8_advance(cursor, text);
    chars++;
  }

  // Hand the scratch buffer back to the pool for reuse.
  if (borrowed) caches.push(borrowed);
  return found;
}
/* Produces the next sentence of the text attached to this sentencizer.
 *
 * imp - the sentencizer, passed through its `struct mascara` base handle.
 * tks - out parameter: set to the sentence's token array, or to NULL when
 *       next_sentence() yields no more text.
 *
 * Returns the number of tokens in the sentence (0 when there is none).
 *
 * The sentence span found by next_sentence() is re-tokenized with a local
 * tokenizer. Each token is first offered to sentencizer_reattach_period(),
 * except the one starting at the span's last period; a token is appended to
 * the sentence only when that call is skipped or returns false.
 */
local size_t sentencizer_next(struct mascara *imp, struct mr_token **tks)
{
   struct sentencizer *self = (struct sentencizer *)imp;
   struct sentence *out = &self->sent;

   assert(self->str && "text no set");
   sentence_clear(out);

   size_t span_len;
   const unsigned char *final_period;
   const unsigned char *span = next_sentence(self, &span_len, &final_period);
   if (!span) {
      *tks = NULL;
      return 0;
   }

   /* Token offsets are reported relative to the whole text, so shift them by
    * the distance of this span from the start of the text.
    */
   size_t base_offset = self->offset_incr + span - self->str;

   struct tokenizer inner;
   tokenizer_init(&inner, self->vtab);
   tokenizer_set_text(&inner.base, span, span_len, base_offset);

   struct mr_token *tok;
   while (tokenizer_next(&inner.base, &tok)) {
      /* Tokens other than the final period are offered for reattachment
       * first; when that succeeds there is nothing to append.
       */
      if (tok->str != (const char *)final_period
          && sentencizer_reattach_period(out, tok))
         continue;

      sentence_add(out, tok);

      /* Sentence buffer is full: record the position just past the last
       * token added (presumably where the next call resumes - confirm
       * against next_sentence()) and stop here.
       */
      if (out->len == MR_MAX_SENTENCE_LEN) {
         self->p = (const unsigned char *)tok->str + tok->len;
         break;
      }
   }

   *tks = out->tokens;
   return out->len;
}