/// the non-empty non-space spans in s[0..len) inline void addTokenSpans(TokenSpans &spans, char const* s, Position len) { Position i = 0; for (;;) { if (i == len) return; if (s[i] != ' ') break; ++i; } TokenSpan span; span.first = span.second = i; for (;;) { assert(span.second < len && s[span.second] != ' '); if (++span.second == len) { spans.push_back(span); return; } if (s[span.second] == ' ') { spans.push_back(span); for (;;) { if (s[++span.second] != ' ') break; if (span.second == len) return; } span.first = span.second; } } }
inline void spansToTokens(std::string const& str, TokenSpans const& spans, Tokens &tokens) { if (str.empty()) return; char const* s = &str[0]; unsigned i = 0, n = spans.size(); tokens.resize(n); for (; i < n; ++i) { TokenSpan const& span = spans[i]; assert(span.first < str.size()); assert(span.second <= str.size()); tokens[i].assign(s + span.first, s + span.second); } }
/// Record one token given as a string: append `span` to *spans_ when a span
/// collector is attached (spans_ is non-null), then append `word` to tokens_.
void operator()(std::string const& word, TokenSpan span) const {
  if (spans_) {
    spans_->push_back(span);
  }
  tokens_.push_back(word);
}
/// Record one token consisting of the single character `c` found at `pos`:
/// append the one-position span [pos, pos + 1) to *spans_ when a span
/// collector is attached, then append Util::utf8s(c) to tokens_.
/// NOTE(review): Util::utf8s presumably yields the UTF-8 encoding of `c` as a
/// string -- confirm against its declaration.
void operator()(Unicode c, Position pos) const {
  if (spans_) {
    spans_->push_back(TokenSpan(pos, pos + 1));
  }
  tokens_.push_back(Util::utf8s(c));
}
/// Record one token given as a slice: append `span` to *spans_ when a span
/// collector is attached, then append a std::string built from the range
/// [word.first, word.second) to tokens_.
void operator()(Slice const& word, TokenSpan span) const {
  if (spans_) {
    spans_->push_back(span);
  }
  tokens_.push_back(std::string(word.first, word.second));
}