// Reads the next sentence from the current text and reports it as forms
// and/or token ranges.
//
// forms  - optional output; cleared on entry, then filled by the
//          single-argument next_sentence overload.
// tokens - optional output; cleared on entry, then receives one
//          (start, length) entry per form. Positions are counted in
//          utf8_advance steps (presumably Unicode characters -- confirm
//          against utf8_advance) using the running `chars` counter.
//
// Returns false right away when no text is set or the text is exhausted;
// otherwise returns whatever the single-argument overload returned.
bool utf8_tokenizer::next_sentence(vector<string_piece>* forms, vector<token_range>* tokens) {
  if (forms) forms->clear();
  if (tokens) tokens->clear();
  if (!text || text == text_end) return false;

  // The forms are needed to compute token ranges even when the caller did not
  // ask for them, so borrow a scratch vector from the cache pool in that case.
  cache* borrowed = nullptr;
  if (!forms) {
    borrowed = caches.pop();
    if (!borrowed) borrowed = new cache();
    forms = &borrowed->forms;
    forms->clear();
  }

  // Remember where this sentence starts; `text` is expected to have moved
  // forward once the overload below returns.
  const char* cursor = text;
  const bool found = next_sentence(*forms);

  for (auto&& form : *forms) {
    const char* form_end = form.str + form.len;

    // Step over whatever lies between the previous form and this one.
    while (cursor < form.str) {
      utf8_advance(cursor, form.str);
      chars++;
    }

    // Step over the form itself, remembering where it began.
    const size_t first_char = chars;
    while (cursor < form_end) {
      utf8_advance(cursor, form_end);
      chars++;
    }

    if (tokens) tokens->emplace_back(first_char, chars - first_char);
  }

  // Account for any trailing text consumed after the last form.
  while (cursor < text) {
    utf8_advance(cursor, text);
    chars++;
  }

  // Hand the scratch vector back to the pool for reuse.
  if (borrowed) caches.push(borrowed);
  return found;
}
Beispiel #2
0
/* Fetches the next sentence from the sentencizer and tokenizes it.
 *
 * imp - the sentencizer, passed through its generic `struct mascara` handle.
 * tks - output: set to the sentence's token array, or to NULL when
 *       next_sentence() reports that nothing is left.
 *
 * Returns the number of tokens stored in the sentence (0 when nothing is
 * left). The token array belongs to the sentencizer's embedded sentence
 * object, which is cleared at the start of every call.
 */
local size_t sentencizer_next(struct mascara *imp, struct mr_token **tks)
{
   struct sentencizer *szr = (struct sentencizer *)imp;
   struct sentence *sent = &szr->sent;

   assert(szr->str && "text no set");
   sentence_clear(sent);

   size_t len;
   const unsigned char *last_period;
   const unsigned char *str = next_sentence(szr, &len, &last_period);
   if (!str) {
      *tks = NULL;
      return 0;
   }

   /* Take the pointer difference first and only then add the integer offset.
    * Writing `offset_incr + str - szr->str` would parse as
    * `(offset_incr + str) - szr->str` and form an intermediate pointer that
    * may lie outside the text buffer, which is undefined behaviour.
    */
   size_t offset_incr = szr->offset_incr + (size_t)(str - szr->str);

   struct tokenizer tkr;
   tokenizer_init(&tkr, szr->vtab);
   tokenizer_set_text(&tkr.base, str, len, offset_incr);

   struct mr_token *tk;
   while (tokenizer_next(&tkr.base, &tk)) {
      /* The token starting at last_period is always added as is. Any other
       * token is first offered to sentencizer_reattach_period() and is only
       * added when that call returns zero (presumably a non-zero return means
       * the token was merged into the previous one -- confirm against the
       * helper).
       */
      if (tk->str == (const char *)last_period ||
         !sentencizer_reattach_period(sent, tk)) {
         sentence_add(sent, tk);
         if (sent->len == MR_MAX_SENTENCE_LEN) {
            /* Sentence is full: move the read position to just past this
             * token (presumably so the next call resumes there) and stop.
             */
            szr->p = (const unsigned char *)tk->str + tk->len;
            break;
         }
      }
   }
   *tks = sent->tokens;
   return sent->len;
}