// Reads the next sentence and optionally reports its forms and token ranges.
//
// forms  - if non-null, cleared and then filled by next_sentence(vector&).
//          If null, a scratch cache object is taken from `caches` (or newly
//          allocated when the pool yields none) and handed back afterwards.
// tokens - if non-null, cleared and then filled with one
//          (start, length) entry per form, both measured by counting
//          utf8_advance steps rather than bytes.
//
// Returns false right away when there is no text or the text is exhausted;
// otherwise returns whatever next_sentence(vector&) returned. The running
// `chars` counter is advanced past everything consumed by this call.
bool utf8_tokenizer::next_sentence(vector<string_piece>* forms, vector<token_range>* tokens) {
  // Caller-provided outputs are reset even when there is nothing to read.
  if (forms) forms->clear();
  if (tokens) tokens->clear();
  if (!text || text == text_end) return false;

  // Without a caller buffer, borrow one from the cache pool.
  cache* borrowed = nullptr;
  if (!forms) {
    borrowed = caches.pop();
    if (!borrowed) borrowed = new cache();
    forms = &borrowed->forms;
    forms->clear();
  }

  // Remember where this sentence started; the tokenizer call moves `text`.
  const char* cursor = text;
  bool found = next_sentence(*forms);

  for (auto&& form : *forms) {
    // Skip whatever lies between the previous form and this one.
    while (cursor < form.str) {
      utf8_advance(cursor, form.str);
      chars++;
    }

    // Walk over the form itself, counting its steps.
    size_t form_first_char = chars;
    while (cursor < form.str + form.len) {
      utf8_advance(cursor, form.str + form.len);
      chars++;
    }

    if (tokens) tokens->emplace_back(form_first_char, chars - form_first_char);
  }

  // Account for anything consumed after the last form.
  while (cursor < text) {
    utf8_advance(cursor, text);
    chars++;
  }

  if (borrowed) caches.push(borrowed);
  return found;
}
// Reports whether some tail of `string` matches `suffix` according to
// stringEqualsI_utf8 (presumably a case-insensitive UTF-8 comparison).
//
// Each position reached by utf8_advance is tried in turn, starting at the
// beginning of `string`. The terminator position itself is never compared,
// so an empty `string` always yields false.
//
// Returns true on the first matching tail; false when the end of `string`
// is reached or utf8_advance reports that it cannot move further.
bool string_has_suffix_i( const char * string, const char * suffix ) {
    while (*string != 0) {
        if (stringEqualsI_utf8( string, suffix )) {
            return true;
        }
        if (!utf8_advance(string)) {
            break;
        }
    }
    return false;
}
// Cuts the string after `length_in_chars` utf8_advance steps and adds
// `append` to the result.
//
// length_in_chars - number of steps to keep, counted with utf8_advance.
// append          - text passed to add_string() after truncation.
//
// Returns true if the countdown reached zero (the string was truncated at
// that point and `append` was added); false if utf8_advance stopped first,
// in which case the string is left untouched.
//
// NOTE(review): when the string holds exactly `length_in_chars` characters
// the countdown also reaches zero, so `append` appears to be added even
// though nothing is cut off - confirm this is intended.
bool string_base::limit_length(t_size length_in_chars,const char * append) {
    const char * const start = get_ptr();
    const char * cursor = start;

    // Step forward until the budget is spent or no further step is possible.
    for (; length_in_chars != 0; --length_in_chars) {
        if (!utf8_advance(cursor)) break;
    }

    // Budget left over: the string ended before the limit was reached.
    if (length_in_chars != 0) return false;

    truncate(cursor-start);
    add_string(append);
    return true;
}