// Scan ***, **, or * and return number scanned, or 0. // Advances position. static int scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close) { int numdelims = 0; int before_char_pos; int32_t after_char = 0; int32_t before_char = 0; int len; bool left_flanking, right_flanking; if (subj->pos == 0) { before_char = 10; } else { before_char_pos = subj->pos - 1; // walk back to the beginning of the UTF_8 sequence: while (peek_at(subj, before_char_pos) >> 6 == 2 && before_char_pos > 0) { before_char_pos -= 1; } len = utf8proc_iterate(subj->input.data + before_char_pos, subj->pos - before_char_pos, &before_char); if (len == -1) { before_char = 10; } } while (peek_char(subj) == c) { numdelims++; advance(subj); } len = utf8proc_iterate(subj->input.data + subj->pos, subj->input.len - subj->pos, &after_char); if (len == -1) { after_char = 10; } left_flanking = numdelims > 0 && !utf8proc_is_space(after_char) && !(utf8proc_is_punctuation(after_char) && !utf8proc_is_space(before_char) && !utf8proc_is_punctuation(before_char)); right_flanking = numdelims > 0 && !utf8proc_is_space(before_char) && !(utf8proc_is_punctuation(before_char) && !utf8proc_is_space(after_char) && !utf8proc_is_punctuation(after_char)); if (c == '_') { *can_open = left_flanking && !right_flanking; *can_close = right_flanking && !left_flanking; } else { *can_open = left_flanking; *can_close = right_flanking; } return numdelims; }
/// Splits `str` into lines at '\n' (and at the terminating NUL) and typesets
/// each line into its own TextView, tracking the widest line in m_width.
/// A non-zero `wrap` first runs the text through LineWrap and then typesets
/// the wrapped result.
void MultiLineTextView::typesetGlyphs(const std::string& str, const zeus::CColor& defaultColor, unsigned wrap) {
  if (wrap != 0) {
    typesetGlyphs(LineWrap(str, wrap), defaultColor);
    return;
  }

  m_width = 0;
  m_lines.clear();

  // The byte budget includes the terminating NUL so the last line is emitted
  // when the terminator itself is decoded.
  const size_t totalBytes = str.size() + 1;
  const utf8proc_uint8_t* const textBegin = reinterpret_cast<const utf8proc_uint8_t*>(str.data());

  // Pass 1: report undecodable input and count line terminators so the line
  // container can be sized up front.
  size_t lineCount = 0;
  {
    const utf8proc_uint8_t* cursor = textBegin;
    for (size_t remaining = totalBytes; remaining != 0;) {
      utf8proc_int32_t codePoint;
      const utf8proc_ssize_t consumed = utf8proc_iterate(cursor, -1, &codePoint);
      if (consumed < 0)
        Log.report(logvisor::Fatal, "invalid UTF-8 char");
      if (codePoint == '\n' || codePoint == '\0')
        ++lineCount;
      remaining -= consumed;
      cursor += consumed;
    }
  }
  m_lines.reserve(lineCount);

  // Pass 2: emit one TextView per terminated line.
  const utf8proc_uint8_t* lineBegin = textBegin;
  const utf8proc_uint8_t* cursor = textBegin;
  for (size_t remaining = totalBytes; remaining != 0;) {
    utf8proc_int32_t codePoint;
    const utf8proc_ssize_t consumed = utf8proc_iterate(cursor, -1, &codePoint);
    if (codePoint == '\n' || codePoint == '\0') {
      m_lines.emplace_back(new TextView(m_viewSystem, *this, m_fontAtlas, m_align, m_lineCapacity));
      m_lines.back()->typesetGlyphs(std::string(reinterpret_cast<const char*>(lineBegin), cursor - lineBegin),
                                    defaultColor);
      m_width = std::max(m_width, m_lines.back()->nominalWidth());
      // Both terminators are a single byte, so the next line starts one past.
      lineBegin = cursor + 1;
    }
    remaining -= consumed;
    cursor += consumed;
  }

  updateSize();
}
/*
 * Decodes the first rune of `s` into `*r` without modifying the slice.
 *
 * Returns SSLICE_END when the slice is empty, SSLICE_OK on success, and
 * otherwise the SSlice status corresponding to the utf8proc error code
 * (SSLICE_ERROR_UNKNOWN for anything unrecognised, including a zero-byte
 * read).  `*r` is written only on success.
 */
SSliceStatus sslice_get_first_rune(SSlice *s, rune *r) {
    rune decoded;
    ssize_t consumed;

    if (sslice_empty(s)) {
        return SSLICE_END;
    }

    consumed = utf8proc_iterate((const unsigned char *)s->data, s->len, &decoded);

    if (consumed >= 1) {
        *r = decoded;
        return SSLICE_OK;
    }

    /* Translate the utf8proc failure into the matching slice status. */
    if (consumed == UTF8PROC_ERROR_NOMEM) {
        return SSLICE_MEMORY_EXHAUSTED;
    }
    if (consumed == UTF8PROC_ERROR_OVERFLOW) {
        return SSLICE_OVERFLOW;
    }
    if (consumed == UTF8PROC_ERROR_INVALIDUTF8) {
        return SSLICE_INVALID_UTF8;
    }
    if (consumed == UTF8PROC_ERROR_NOTASSIGNED) {
        return SSLICE_NOT_ASSIGNED;
    }
    if (consumed == UTF8PROC_ERROR_INVALIDOPTS) {
        return SSLICE_INVALID_OPTS;
    }
    return SSLICE_ERROR_UNKNOWN;
}
string_script_t get_string_script(char *str, size_t len) { int32_t ch; script_t last_script = SCRIPT_UNKNOWN; script_t script = SCRIPT_UNKNOWN; uint8_t *ptr = (uint8_t *)str; size_t script_len = 0; size_t idx = 0; bool is_ascii = true; while (idx < len) { ssize_t char_len = utf8proc_iterate(ptr, len, &ch); if (ch == 0) break; script = get_char_script((uint32_t)ch); if (script == SCRIPT_COMMON && last_script != SCRIPT_UNKNOWN) { script = last_script; } if (last_script != script && last_script != SCRIPT_UNKNOWN && last_script != SCRIPT_COMMON) { if (script_len < len) { while (true) { char_len = utf8proc_iterate_reversed((const uint8_t *)str, idx, &ch); if (ch == 0) break; script = get_char_script((uint32_t)ch); if (script != SCRIPT_COMMON) { break; } script_len -= char_len; ptr -= char_len; idx -= char_len; } } break; } is_ascii = is_ascii && ch < MAX_ASCII; ptr += char_len; idx += char_len; script_len += char_len; if (script != SCRIPT_UNKNOWN) { last_script = script; } } return (string_script_t) {last_script, script_len, is_ascii}; }
/*
 * Moves the string stream's cursor back by the stored octet count and decodes
 * the code point found there into ustrm->point.
 *
 * Returns STRM_END if the cursor would fall before the start of the string,
 * WRONG_ENCODING if decoding fails, and OK otherwise.  The cursor is moved
 * even when STRM_END is returned.
 */
inline int sustream_prev_char(struct ustream *ustrm)
{
    struct sustream *src = (struct sustream *)(ustrm->stream);

    src->cur -= src->octcnt;

    if (src->cur >= src->str) {
        /* At most 4 bytes are offered to the decoder. */
        src->octcnt = utf8proc_iterate(src->cur, 4, &ustrm->point);
        return (src->octcnt < 0) ? WRONG_ENCODING : OK;
    }

    return STRM_END;
}
unsigned int utf8_glypheme_length(const char *p) { int32_t cp; int len = utf8proc_iterate((uint8_t *) p, -1, &cp); if (len == UTF8PROC_ERROR_INVALIDUTF8) return 1; // Check for combining characters const char *p2 = p + len; while (*p2) { int comb_len = utf8proc_iterate((uint8_t *) p2, -1, &cp); if (comb_len == UTF8PROC_ERROR_INVALIDUTF8) return 1; const utf8proc_property_t* prop = utf8proc_get_property(cp); if (!prop->combining_class) break; len += comb_len; p2 += comb_len; } return len; }
int utf8proc_mbtowc(wchar_t *pwc, const char *s, size_t n) { utf8proc_ssize_t slen; if (s == NULL) return (0); /* * *pwc == -1 indicates invalid codepoint * slen < 0 indicates an error */ slen = utf8proc_iterate(s, n, pwc); if (*pwc == (wchar_t)-1 || slen < 0) return (-1); return (slen); }
/*
 * Checks that utf8proc_iterate() decodes every byte of a plain ASCII string
 * as a one-byte sequence whose code point equals the byte itself.
 */
static void test_utf8proc_iterate ( void )
{
  char           sample[] = "The quick brown.fox";
  uint8_t       *bytes = (uint8_t*)(&sample[0]);
  const size_t   sample_len = strlen ( sample );
  size_t         offset = 0;

  while ( offset < sample_len ) {
    int32_t decoded;
    ssize_t consumed = utf8proc_iterate (
      bytes + offset,
      sample_len - offset,
      &decoded
    );

    rtems_test_assert ( consumed == 1 );
    rtems_test_assert ( (uint8_t)decoded == bytes[offset] );

    ++offset;
  }
}
/*
 * Points slice `s` at the NUL-terminated string `cs` (no copy is made).
 *
 * When `validate` is true the whole string is first decoded rune by rune;
 * on the first decoding failure the matching SSlice error status is returned
 * and `s` is left untouched.  Returns SSLICE_OK once the slice is assigned.
 */
SSliceStatus sslice_base_assign(SSlice *s, char *cs, bool validate) {
    const size_t total = strlen(cs);

    if (validate) {
        const unsigned char *cursor = (const unsigned char *)cs;
        size_t remaining = total;

        while (remaining > 0) {
            int32_t cp;
            ssize_t consumed = utf8proc_iterate(cursor, remaining, &cp);

            if (consumed >= 1) {
                cursor += consumed;
                remaining -= consumed;
                continue;
            }

            /* Translate the utf8proc failure into the matching status. */
            if (consumed == UTF8PROC_ERROR_NOMEM) {
                return SSLICE_MEMORY_EXHAUSTED;
            }
            if (consumed == UTF8PROC_ERROR_OVERFLOW) {
                return SSLICE_OVERFLOW;
            }
            if (consumed == UTF8PROC_ERROR_INVALIDUTF8) {
                return SSLICE_INVALID_UTF8;
            }
            if (consumed == UTF8PROC_ERROR_NOTASSIGNED) {
                return SSLICE_NOT_ASSIGNED;
            }
            if (consumed == UTF8PROC_ERROR_INVALIDOPTS) {
                return SSLICE_INVALID_OPTS;
            }
            return SSLICE_ERROR_UNKNOWN;
        }
    }

    s->data = cs;
    s->len = total;

    return SSLICE_OK;
}
/*
 * Drops the first rune from the front of slice `s`.
 *
 * Returns SSLICE_OK when runes remain after the step.  Returns SSLICE_END
 * when the slice was already empty, or when the step consumed the last rune
 * (the slice is then cleared).  A decoding failure leaves the slice
 * unchanged and returns the matching SSlice error status.
 */
SSliceStatus sslice_advance_rune(SSlice *s) {
    rune discarded;
    ssize_t consumed;

    if (sslice_empty(s)) {
        return SSLICE_END;
    }

    consumed = utf8proc_iterate((const unsigned char *)s->data, s->len, &discarded);

    if (consumed < 1) {
        if (consumed == UTF8PROC_ERROR_NOMEM) {
            return SSLICE_MEMORY_EXHAUSTED;
        }
        if (consumed == UTF8PROC_ERROR_OVERFLOW) {
            return SSLICE_OVERFLOW;
        }
        if (consumed == UTF8PROC_ERROR_INVALIDUTF8) {
            return SSLICE_INVALID_UTF8;
        }
        if (consumed == UTF8PROC_ERROR_NOTASSIGNED) {
            return SSLICE_NOT_ASSIGNED;
        }
        if (consumed == UTF8PROC_ERROR_INVALIDOPTS) {
            return SSLICE_INVALID_OPTS;
        }
        return SSLICE_ERROR_UNKNOWN;
    }

    /* The rune was the last one: nothing is left to point at. */
    if (consumed >= (ssize_t)s->len) {
        sslice_clear(s);
        return SSLICE_END;
    }

    s->data += consumed;
    s->len -= consumed;

    return SSLICE_OK;
}
string_tree_t *regex_string_tree(char *regex, size_t len) { uint8_t *char_ptr = (uint8_t *)regex; bool in_set = false; bool in_brackets = false; int32_t codepoint; int32_t last_codepoint = 0; ssize_t char_len; size_t bracket_start; size_t bracket_len; char temp_char[MAX_UTF8_CHAR_SIZE]; ssize_t temp_char_len; string_tree_t *tree = string_tree_new(); if (len == 0) { // Single token with zero-length string_tree_add_string_len(tree, regex, len); string_tree_finalize_token(tree); return tree; } uint32_array *char_set = uint32_array_new(); size_t idx = 0; int i, j; bool add_to_index = false; while (idx < len) { char_len = utf8proc_iterate(char_ptr, len, &codepoint); if (char_len <= 0) { uint32_array_destroy(char_set); string_tree_destroy(tree); return NULL; } if (!(utf8proc_codepoint_valid(codepoint))) { idx += char_len; char_ptr += char_len; continue; } add_to_index = true; if (codepoint == LSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { log_debug("begin set\n"); in_set = true; codepoint = BEGIN_SET_CODEPOINT; uint32_array_clear(char_set); } else if (codepoint == RSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_set) { log_debug("end set"); for (j = 0; j < char_set->n; j++) { temp_char_len = utf8proc_encode_char(char_set->a[j], (uint8_t *)temp_char); log_debug("Adding string %.*s\n", (int)temp_char_len, temp_char); string_tree_add_string_len(tree, temp_char, temp_char_len); } string_tree_finalize_token(tree); uint32_array_clear(char_set); // Add a special codepoint to the sequence to distinguish from an escaped square bracket codepoint = END_SET_CODEPOINT; in_set = false; } else if (codepoint == LCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { in_brackets = true; bracket_start = idx + char_len; bracket_len = 0; add_to_index = false; } else if (codepoint == RCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_brackets) { log_debug("Adding bracketed string: %.*s\n", (int) bracket_len, regex + bracket_start); 
string_tree_add_string_len(tree, regex + bracket_start, bracket_len); in_brackets = false; } else if ((codepoint == LPAREN_CODEPOINT || codepoint == RPAREN_CODEPOINT) && last_codepoint != BACKSLASH_CODEPOINT) { log_debug("group\n"); add_to_index = false; } else if (in_set) { log_debug("in set\n"); // Queue node, we'll add them to the trie uint32_array_push(char_set, codepoint); add_to_index = false; } else if (in_brackets) { add_to_index = false; bracket_len += char_len; } else if (codepoint == BACKSLASH_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { add_to_index = false; } log_debug("codepoint = %d\n", codepoint); if (add_to_index) { temp_char_len = utf8proc_encode_char(codepoint, (uint8_t *)temp_char); log_debug("char = %.*s\n", (int)temp_char_len, temp_char); string_tree_add_string_len(tree, temp_char, temp_char_len); string_tree_finalize_token(tree); } idx += char_len; char_ptr += char_len; } uint32_array_destroy(char_set); return tree; }
group_capture_array *parse_groups(char *regex, size_t len) { uint8_t *char_ptr = (uint8_t *)regex; char last_ch = '\0'; bool in_group = false; bool in_set = false; int32_t codepoint, last_codepoint = 0; ssize_t char_len; char temp_char[MAX_UTF8_CHAR_SIZE]; ssize_t temp_char_len; if (len == 0) { return NULL; } group_capture_array *groups = group_capture_array_new_size(1); size_t idx = 0; size_t pos = 0; size_t group_start = 0; size_t chars_in_group = 0; while (idx < len) { char_len = utf8proc_iterate(char_ptr, len, &codepoint); if (char_len <= 0) { log_error("char %s had len=%zd\n", char_ptr, char_len); return NULL; } if (!(utf8proc_codepoint_valid(codepoint))) { idx += char_len; char_ptr += char_len; pos++; continue; } if (codepoint == LSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { log_debug("begin set\n"); in_set = true; } else if (codepoint == RSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { log_debug("end set"); pos++; in_set = false; } else if (codepoint == LPAREN_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { log_debug("begin group\n"); in_group = true; group_start = pos; } else if (codepoint == RPAREN_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { log_debug("close group\n"); in_group = false; group_capture_array_push(groups, (group_capture_t){group_start, pos - group_start}); } else if (!in_set) { log_debug("other char\n"); pos++; } idx += char_len; char_ptr += char_len; } return groups; }
// Lua binding: measures `text` with `font` and cuts it at the first line
// break, applying the white-space mode given as a string.
//
// Lua arguments (exactly 5):
//   1. text        string to lay out
//   2. font        light userdata, used as an ex_font_t*
//   3. whitespace  "pre-wrap", "pre-line", "normal", "nowrap" or "pre"
//                  (matched by prefix; anything else leaves all flags off)
//   4. maxWidth    width limit compared against the accumulated advance
//   5. flag        when equal to 1, leading collapsible white-space is
//                  dropped and the first word is never wrapped
//
// Lua results (always 4):
//   text1      bytes copied for the first line, or nil if none were copied
//   text2      the rest of the input after a line break, or nil
//   width      accumulated advance (cur_x) when the scan stopped
//   linebreak  whether the scan stopped at a line break
static int __lua_font_wrap_text ( lua_State *_l ) {
    ex_font_t *font;
    const char *text, *whitespace;
    int maxWidth;
    const char *str, *nextstr, *word_start;
    char *newtext, *newtext_p, *last_newtext;
    int ch, next_ch, len, newlen, cpylen;
    uint ft_index, prev_ft_index;
    int cur_x, word_start_x;
    ex_glyph_t *glyph;
    bool linebreak, beginningOfLine, trimWhitespace, skipcpy;
    bool wrapword, collapseSpace, collapseLinebreak;

    // get lua arguments
    ex_lua_check_nargs(_l,5);
    text = luaL_checkstring(_l,1);
    luaL_checktype( _l, 2, LUA_TLIGHTUSERDATA );
    font = lua_touserdata(_l,2);
    whitespace = luaL_checkstring(_l,3);
    maxWidth = luaL_checkint(_l,4);
    trimWhitespace = beginningOfLine = (luaL_checkint(_l,5) == 1);

    // Output scratch buffer sized to the input: bytes are only ever copied
    // from the input (never expanded), one span per outer iteration.
    len = strlen(text);
    str = nextstr = word_start = text;
    newtext_p = newtext = last_newtext = ex_malloc( len * sizeof(char) );
    prev_ft_index = -1;

    // get wrapMode
    // Map the white-space mode onto three flags: whether words wrap at
    // maxWidth, whether runs of spaces collapse, and whether line breaks are
    // treated as spaces.
    wrapword = false;
    collapseSpace = collapseLinebreak = false;
    if ( !strncmp( whitespace, "pre-wrap", 8 ) ) {
        wrapword = true;
        collapseSpace = false;
        collapseLinebreak = false;
    }
    else if ( !strncmp( whitespace, "pre-line", 8 ) ) {
        wrapword = true;
        collapseSpace = true;
        collapseLinebreak = false;
    }
    else if ( !strncmp( whitespace, "normal", 6 ) ) {
        wrapword = true;
        collapseSpace = true;
        collapseLinebreak = true;
    }
    else if ( !strncmp( whitespace, "nowrap", 6 ) ) {
        wrapword = false;
        collapseSpace = true;
        collapseLinebreak = true;
    }
    else if ( !strncmp( whitespace, "pre", 3 ) ) {
        wrapword = false;
        collapseSpace = false;
        collapseLinebreak = false;
    }

    // process text
    // Invariant at the top of each iteration: nextstr == str. The decode
    // below then moves nextstr to the character after `ch`.
    cur_x = word_start_x = 0;
    linebreak = false;
    while ( *str ) {
        skipcpy = false;
        // NOTE(review): the return value of utf8proc_iterate is not checked
        // here or in the two inner loops; a negative (error) result would
        // move the cursor backwards. Confirm inputs are always valid UTF-8.
        nextstr += utf8proc_iterate ((const uint8_t *)str, -1, &ch);

        // if this is line-break
        if ( ch == '\n' || ch == '\r' ) {
            if ( collapseLinebreak ) {
                ch = ' '; // turn it to space
            }
            else {
                linebreak = true;
            }
        }

        // if this is space
        if ( ch == ' ' || ch == '\t' || ch == '\f' ) {
            if ( collapseSpace ) {
                // Look ahead and swallow every following white-space
                // character (and line break, when those collapse too), so
                // that str ends up on the last character of the run.
                const char * nextnextstr = nextstr;
                while ( *nextnextstr ) {
                    nextstr = nextnextstr;
                    nextnextstr += utf8proc_iterate ((const uint8_t *)nextnextstr, -1, &next_ch);

                    // if next_ch is white-space, then collapse this char
                    if ( next_ch == ' ' || next_ch == '\t' || next_ch == '\f' ) {
                        str = nextstr;
                        continue;
                    }

                    // if next_ch is line-break and collapseLinebreak is true, then collapse this char
                    if ( next_ch == '\n' || next_ch == '\r' ) {
                        if ( collapseLinebreak ) {
                            str = nextstr;
                            continue;
                        }
                    }

                    // first non-collapsible character: stop looking ahead
                    break;
                }

                // skip first-time collapse
                // (leading white-space is dropped entirely, once)
                if ( trimWhitespace ) {
                    trimWhitespace = false;
                    str = nextstr;
                    continue;
                }

                // yes, must turn it to space to make sure only one space
                ch = ' ';
            }
        }

        // Any character that reaches this point ends the leading-trim phase.
        trimWhitespace = false;

        // process word-break, word-wrap
        if ( wrapword ) {
            // Remember where this word started so it can be pushed to the
            // next line as a whole if it overflows.
            word_start = str;
            word_start_x = cur_x;

            // if this character can break
            if ( nextstr == NULL || __can_word_break (ch) ) {
                // advanced character
                ft_index = ex_font_get_index ( font, ch );
                glyph = ex_font_get_glyph ( font, ft_index );
                cur_x += ex_font_get_kerning( font, prev_ft_index, ft_index );
                cur_x += glyph->advance_x;
                prev_ft_index = ft_index;

                // check if the word exceed content width
                if ( cur_x > maxWidth ) {
                    if ( !beginningOfLine ) {
                        linebreak = true;

                        // skip copy the white-space if it is at the end of the wrap
                        if ( ch == ' ' || ch == '\t' || ch == '\f' ) {
                            skipcpy = true;
                        }
                        else {
                            // rewind: the character goes to the next line
                            nextstr = word_start;
                            cur_x = word_start_x;
                        }
                    }
                }
                beginningOfLine = false;
            }
            else {
                // advanced current character
                ft_index = ex_font_get_index ( font, ch );
                glyph = ex_font_get_glyph ( font, ft_index );
                cur_x += ex_font_get_kerning( font, prev_ft_index, ft_index );
                cur_x += glyph->advance_x;
                prev_ft_index = ft_index;

                // Measure the rest of the word: keep consuming characters
                // until one that allows a break, or until the word overflows.
                const char * nextnextstr = nextstr;
                while ( *nextnextstr ) {
                    nextstr = nextnextstr;
                    nextnextstr += utf8proc_iterate ((const uint8_t *)nextnextstr, -1, &next_ch);

                    // if this character can break
                    if ( __can_word_break (next_ch) ) {
                        break;
                    }

                    // advanced character
                    ft_index = ex_font_get_index ( font, next_ch );
                    glyph = ex_font_get_glyph ( font, ft_index );
                    cur_x += ex_font_get_kerning( font, prev_ft_index, ft_index );
                    cur_x += glyph->advance_x;
                    prev_ft_index = ft_index;

                    // TODO: process word-break
                    // check if the word exceed content width
                    if ( cur_x > maxWidth ) {
                        if ( !beginningOfLine ) {
                            // rewind to the start of the word and break the
                            // line before it; nothing of the word is copied
                            linebreak = true;
                            nextstr = word_start;
                            cur_x = word_start_x;
                            skipcpy = true;
                            break;
                        }
                    }
                }
            }
        }
        else {
            // advanced character
            ft_index = ex_font_get_index ( font, ch );
            glyph = ex_font_get_glyph ( font, ft_index );
            cur_x += ex_font_get_kerning( font, prev_ft_index, ft_index );
            cur_x += glyph->advance_x;
            prev_ft_index = ft_index;
        }

        // copy character to newtext_p
        // (the original input bytes in [str, nextstr) are copied, not `ch`)
        if ( !skipcpy ) {
            cpylen = nextstr - str;
            if ( cpylen > 0 ) {
                strncpy( newtext_p, str, cpylen);
                newtext_p += cpylen;
            }
        }

        // step
        str = nextstr;
        if ( linebreak ) {
            break;
        }
    }

    // text1
    newlen = newtext_p-newtext;
    if ( newlen > 0 ) {
        lua_pushlstring(_l, newtext, newlen);
    }
    else {
        lua_pushnil(_l);
    }

    // text2
    if ( linebreak && *str ) {
        lua_pushstring(_l, str );
    }
    else {
        lua_pushnil(_l);
    }
    lua_pushinteger(_l,cur_x); // width
    lua_pushboolean(_l,linebreak); // line-break

    // release the scratch buffer now that text1 has been pushed
    ex_free(newtext);
    return 4; // text1(can be nil), text2(can be nil), width of text1, linebreak
}
// Append `source` to the renderer's output buffer.
//
// Pending line breaks (renderer->need_cr) are emitted first.  The text is then
// written one code point at a time: a space becomes a potential break point
// when wrapping is active, a line feed starts a new line, and everything else
// goes through either the literal writer or the renderer's escaping callback,
// depending on `escape`.  If a line grows past renderer->width it is broken at
// the last recorded break point.  Rendering stops silently at the first byte
// sequence that cannot be decoded.
static void S_out(cmark_renderer *renderer, const char *source, bool wrap,
                  cmark_escaping escape) {
  const int total = cmark_strbuf_safe_strlen(source);
  cmark_chunk carried = cmark_chunk_literal("");
  int scan = renderer->buffer->size - 1;
  int pos;
  int nbytes;
  int32_t cp;
  unsigned char following;

  wrap = wrap && !renderer->no_wrap;

  // Inside a tight list item at most one pending line break is honoured.
  if (renderer->in_tight_list_item && renderer->need_cr > 1) {
    renderer->need_cr = 1;
  }

  // Flush pending line breaks.  Each pending break is first matched against
  // a newline already at the tail of the buffer (scanning backwards) before
  // a new one is written.
  for (; renderer->need_cr; renderer->need_cr -= 1) {
    if (scan >= 0 && renderer->buffer->ptr[scan] != '\n') {
      cmark_strbuf_putc(renderer->buffer, '\n');
      if (renderer->need_cr > 1) {
        cmark_strbuf_put(renderer->buffer, renderer->prefix->ptr,
                         renderer->prefix->size);
      }
    } else {
      scan -= 1;
    }
    renderer->column = 0;
    renderer->begin_line = true;
  }

  for (pos = 0; pos < total; pos += nbytes) {
    if (renderer->begin_line) {
      cmark_strbuf_put(renderer->buffer, renderer->prefix->ptr,
                       renderer->prefix->size);
      // note: this assumes prefix is ascii:
      renderer->column = renderer->prefix->size;
    }

    nbytes = utf8proc_iterate((const uint8_t *)source + pos, total - pos, &cp);
    if (nbytes == -1) {
      // error condition: return without rendering rest of string
      return;
    }
    following = source[pos + nbytes];

    if (cp == 10) {
      cmark_strbuf_putc(renderer->buffer, '\n');
      renderer->column = 0;
      renderer->begin_line = true;
      renderer->last_breakable = 0;
    } else if (cp == 32 && wrap) {
      // A space at the very start of a line is dropped.
      if (!renderer->begin_line) {
        cmark_strbuf_putc(renderer->buffer, ' ');
        renderer->column += 1;
        renderer->begin_line = false;
        renderer->last_breakable = renderer->buffer->size - 1;
        // skip following spaces
        for (; source[pos + 1] == ' '; pos++) {
        }
      }
    } else {
      if (escape == LITERAL) {
        cmark_render_code_point(renderer, cp);
      } else {
        (renderer->outc)(renderer, escape, cp, following);
      }
      renderer->begin_line = false;
    }

    // If adding the character went beyond width, look for an
    // earlier place where the line could be broken:
    if (renderer->width > 0 && renderer->column > renderer->width) {
      if (!renderer->begin_line && renderer->last_breakable > 0) {
        // copy from last_breakable to remainder
        cmark_chunk_set_cstr(&carried, (char *)renderer->buffer->ptr +
                                           renderer->last_breakable + 1);
        // truncate at last_breakable
        cmark_strbuf_truncate(renderer->buffer, renderer->last_breakable);
        // add newline, prefix, and remainder
        cmark_strbuf_putc(renderer->buffer, '\n');
        cmark_strbuf_put(renderer->buffer, renderer->prefix->ptr,
                         renderer->prefix->size);
        cmark_strbuf_put(renderer->buffer, carried.data, carried.len);
        renderer->column = renderer->prefix->size + carried.len;
        cmark_chunk_free(&carried);
        renderer->last_breakable = 0;
        renderer->begin_line = false;
      }
    }
  }
}
/*
 * Scans the NUL-terminated string `text` for phrases stored in the trie,
 * starting every match attempt at node `start_node_id`.
 *
 * Each phrase found is appended to *phrases as {start, length, data}; the
 * array is created on first use, so *phrases may be NULL on entry and stays
 * NULL if nothing matched.
 *
 * Returns false when `text` is NULL, when a byte sequence cannot be decoded,
 * or when a decoded code point is not valid; returns true once the
 * terminating NUL has been processed.
 */
bool trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id, phrase_array **phrases) {
    if (text == NULL) return false;

    ssize_t len, remaining;
    int32_t unich = 0;
    unsigned char ch = '\0';

    // ptr: current byte; fail_ptr: where to resume when a longer match fails
    const uint8_t *ptr = (const uint8_t *)text;
    const uint8_t *fail_ptr = ptr;

    uint32_t node_id = start_node_id;
    trie_node_t node = trie_get_node(self, node_id), last_node = node;
    uint32_t next_id;

    bool match = false;
    // index: offset of the current character; phrase_* describe the
    // candidate phrase; data is the payload of the last matched phrase
    uint32_t index = 0;
    uint32_t phrase_len = 0;
    uint32_t phrase_start = 0;
    uint32_t data;

    trie_search_state_t state = SEARCH_STATE_BEGIN, last_state = SEARCH_STATE_BEGIN;

    // Cleared whenever the loop body has already repositioned `index` itself
    bool advance_index = true;

    while(1) {
        // Decode one code point; its bytes are then fed to the trie one at a
        // time by the for-loop below.
        len = utf8proc_iterate(ptr, -1, &unich);
        remaining = len;
        if (len <= 0) return false;
        if (!(utf8proc_codepoint_valid(unich))) return false;

        int cat = utf8proc_category(unich);
        bool is_letter = utf8_is_letter(cat);

        // If we're in the middle of a word and the first letter was not a match, skip the word
        if (is_letter && state == SEARCH_STATE_NO_MATCH) {
            log_debug("skipping\n");
            ptr += len;
            index += len;
            last_state = state;
            continue;
        }

        // Match in the middle of a word
        if (is_letter && last_state == SEARCH_STATE_MATCH) {
            log_debug("last_state == SEARCH_STATE_MATCH && is_letter\n");
            // Only set match to false so we don't callback
            match = false;
        }

        // One iteration per byte of the current character. The increment
        // clause also rolls node/state forward, so it is skipped by `break`.
        for (int i=0; remaining > 0; remaining--, i++, ptr++, last_node=node, last_state=state, node_id=next_id) {
            ch = (unsigned char) *ptr;
            log_debug("char=%c\n", ch);

            next_id = trie_get_transition_index(self, node, *ptr);
            node = trie_get_node(self, next_id);

            if (node.check != node_id) {
                // No transition for this byte: emit any pending match and
                // restart from the start node.
                state = is_letter ? SEARCH_STATE_NO_MATCH : SEARCH_STATE_BEGIN;
                if (match) {
                    log_debug("match is true and state==SEARCH_STATE_NO_MATCH\n");
                    if (*phrases == NULL) {
                        *phrases = phrase_array_new_size(1);
                    }
                    phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
                    index = phrase_start + phrase_len;
                    advance_index = false;
                    // Set the text back to the end of the last phrase
                    ptr = (const uint8_t *)text + index;
                    len = utf8proc_iterate(ptr, -1, &unich);
                    log_debug("ptr=%s\n", ptr);
                } else {
                    // Skip the remaining bytes of this character
                    ptr += remaining;
                    log_debug("done with char, now at %s\n", ptr);
                }
                fail_ptr = ptr;
                node_id = start_node_id;
                last_node = node = trie_get_node(self, node_id);
                phrase_start = phrase_len = 0;
                last_state = state;
                match = false;
                break;
            } else {
                log_debug("node.check == node_id\n");
                state = SEARCH_STATE_PARTIAL_MATCH;
                // First matching byte after a miss: a new candidate phrase
                // starts at the current character.
                if (last_state == SEARCH_STATE_NO_MATCH || last_state == SEARCH_STATE_BEGIN) {
                    log_debug("phrase_start=%u\n", index);
                    phrase_start = index;
                    fail_ptr = ptr + remaining;
                }

                // A negative base marks a node whose remaining key bytes are
                // stored in the tail array; compare the tail against the
                // text directly.
                if (node.base < 0) {
                    int32_t data_index = -1*node.base;
                    trie_data_node_t data_node = self->data->a[data_index];
                    unsigned char *current_tail = self->tail->a + data_node.tail;

                    size_t tail_len = strlen((char *)current_tail);
                    char *query_tail = (char *)(*ptr ? ptr + 1 : ptr);
                    size_t query_tail_len = strlen((char *)query_tail);
                    log_debug("next node tail: %s\n", current_tail);
                    log_debug("query node tail: %s\n", query_tail);

                    if (tail_len <= query_tail_len && strncmp((char *)current_tail, query_tail, tail_len) == 0) {
                        state = SEARCH_STATE_MATCH;
                        log_debug("Tail matches\n");
                        last_state = state;
                        data = data_node.data;
                        log_debug("%u, %d, %zu\n", index, phrase_len, tail_len);
                        // Jump over the matched tail
                        ptr += tail_len;
                        index += tail_len;
                        advance_index = false;
                        phrase_len = index + 1 - phrase_start;
                        match = true;
                    } else if (match) {
                        // The longer key did not match; fall back to the
                        // shorter phrase already found and resume after it.
                        log_debug("match is true and longer phrase tail did not match\n");
                        log_debug("phrase_start=%d, phrase_len=%d\n", phrase_start, phrase_len);
                        if (*phrases == NULL) {
                            *phrases = phrase_array_new_size(1);
                        }
                        phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
                        ptr = fail_ptr;
                        match = false;
                        index = phrase_start + phrase_len;
                        advance_index = false;
                    }
                }

                // A '\0' transition out of this node means a stored key ends
                // exactly here; remember it as the current best match.
                if (ch != '\0') {
                    trie_node_t terminal_node = trie_get_transition(self, node, '\0');
                    if (terminal_node.check == next_id) {
                        log_debug("Transition to NUL byte matched\n");
                        state = SEARCH_STATE_MATCH;
                        match = true;
                        phrase_len = index + (uint32_t)len - phrase_start;
                        if (terminal_node.base < 0) {
                            int32_t data_index = -1*terminal_node.base;
                            trie_data_node_t data_node = self->data->a[data_index];
                            data = data_node.data;
                        }
                        log_debug("Got match with len=%d\n", phrase_len);
                        fail_ptr = ptr;
                    }
                }
            }
        }

        // End of input: flush a match that was still pending
        if (unich == 0) {
            if (last_state == SEARCH_STATE_MATCH) {
                log_debug("Found match at the end\n");
                if (*phrases == NULL) {
                    *phrases = phrase_array_new_size(1);
                }
                phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
            }
            break;
        }

        if (advance_index) index += len;
        advance_index = true;
        log_debug("index now %u\n", index);
    } // while
    return true;
}
/*
 * Looks for a key stored in the trie that is a prefix of `word`, walking the
 * trie from node `start_node_id` and consuming at most `len` bytes through
 * the character loop.
 *
 * A hyphen, or a separator other than ' ', that has no transition of its own
 * is retried as a plain space; a hyphen that still has no transition is
 * skipped altogether.
 *
 * Returns {start, length, data} of the match: immediately when a node's tail
 * matches the text, otherwise the last key recorded along the way whose end
 * was reached.  Returns NULL_PHRASE when nothing matched or when a tail
 * comparison fails.
 */
phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, uint32_t start_node_id) {
    log_debug("Call to trie_search_prefixes_from_index\n");
    uint32_t node_id = start_node_id, last_node_id = node_id;
    trie_node_t node = trie_get_node(self, node_id), last_node = node;
    log_debug("last_node_id = %d\n", last_node_id);

    uint32_t value = 0, phrase_start = 0, phrase_len = 0;

    uint8_t *ptr = (uint8_t *)word;

    ssize_t char_len = 0;

    uint32_t idx = 0;

    size_t separator_char_len = 0;

    int32_t codepoint = 0;

    bool first_char = true;

    trie_data_node_t data_node;
    trie_node_t terminal_node;

    while (idx < len) {
        /* Bound the decoder by the bytes that remain rather than the total
         * length, so a sequence truncated at `len` is reported as invalid
         * instead of being decoded from bytes beyond the limit. */
        char_len = utf8proc_iterate(ptr, len - idx, &codepoint);
        log_debug("char_len = %zu, char=%d\n", char_len, codepoint);
        if (char_len <= 0) break;
        if (!(utf8proc_codepoint_valid(codepoint))) break;

        bool is_hyphen = utf8_is_hyphen(codepoint);

        int cat = utf8proc_category(codepoint);
        bool is_space = utf8_is_separator(cat);

        uint8_t *char_ptr = ptr;
        size_t i = 0;

        bool skip_char = false;
        bool break_out = false;

        /* Feed the bytes of the current character to the trie one by one. */
        for (i = 0; i < char_len; i++) {
            node_id = trie_get_transition_index(self, last_node, *char_ptr);
            node = trie_get_node(self, node_id);
            log_debug("At idx=%zu, char=%.*s\n", i, (int)char_len, char_ptr);

            if (node.check != last_node_id) {
                log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id);

                if (is_hyphen || (is_space && *ptr != ' ')) {
                    log_debug("Got hyphen or other separator, trying space instead\n");
                    node_id = trie_get_transition_index(self, last_node, ' ');
                    node = trie_get_node(self, node_id);
                }

                if (is_hyphen && node.check != last_node_id) {
                    /* Hyphen with no space transition either: step over it
                     * and carry on from the same trie node. */
                    log_debug("No space transition\n");
                    ptr += char_len;
                    idx += char_len;
                    separator_char_len = char_len;
                    node_id = last_node_id;
                    node = trie_get_node(self, node_id);
                    skip_char = true;
                    break;
                } else if (node.check != last_node_id) {
                    break_out = true;
                    log_debug("Breaking\n");
                    break;
                }
                /* NOTE(review): when the space fallback succeeds, this break
                 * leaves last_node/last_node_id pointing at the node before
                 * the separator. Confirm that is intended. */
                break;
            }

            if (first_char) {
                phrase_start = idx;
                first_char = false;
            }

            /* A negative base marks a node whose remaining key bytes live in
             * the tail array; finish the comparison against the tail. */
            if (node.base < 0) {
                log_debug("Searching tail\n");
                data_node = trie_get_data_node(self, node);
                uint32_t current_tail_pos = data_node.tail;

                unsigned char *current_tail = self->tail->a + current_tail_pos;
                log_debug("comparing tail: %s vs %s\n", current_tail, char_ptr + 1);
                size_t current_tail_len = strlen((char *)current_tail);

                size_t match_len = i + 1;
                size_t offset = i + 1;
                size_t tail_pos = 0;
                log_debug("offset=%zu\n", offset);

                /* The rest of a multi-byte character must match the start of
                 * the tail before whole characters are compared. */
                if (char_len > 1) {
                    log_debug("char_len = %zu\n", char_len);
                    log_debug("Doing strncmp: (%zu) %s vs %s\n", char_len - offset, current_tail, char_ptr + 1);

                    if (strncmp((char *)ptr + offset, (char *)current_tail, char_len - offset) == 0) {
                        match_len += char_len - offset;
                        tail_pos = char_len - offset;
                        log_debug("in char match_len = %zu\n", match_len);
                    } else {
                        return NULL_PHRASE;
                    }
                }

                size_t tail_match_len = utf8_common_prefix_len((char *)ptr + char_len, (char *)current_tail + tail_pos, current_tail_len - tail_pos);
                match_len += tail_match_len;
                log_debug("match_len=%zu\n", match_len);

                /* Only a tail matched in full counts as a hit. */
                if (tail_match_len == current_tail_len - tail_pos) {
                    if (first_char) phrase_start = idx;
                    phrase_len = (uint32_t)(idx + match_len) - phrase_start;

                    log_debug("tail match! phrase_len=%u\n", phrase_len);
                    value = data_node.data;
                    return (phrase_t){phrase_start, phrase_len, value};
                } else {
                    return NULL_PHRASE;
                }

            } else if (node.check == last_node_id) {
                /* A '\0' transition out of this node means a stored key ends
                 * here; remember it in case nothing longer matches. */
                terminal_node = trie_get_transition(self, node, '\0');
                log_debug("Trying link from %d to terminal node\n", last_node_id);

                if (terminal_node.check == node_id) {
                    log_debug("Transition to NUL byte matched\n");
                    if (terminal_node.base < 0) {
                        phrase_len = (uint32_t)(idx + char_len) - phrase_start;
                        data_node = trie_get_data_node(self, terminal_node);
                        value = data_node.data;
                    }
                    log_debug("Got match with len=%d\n", phrase_len);
                }
            }

            last_node = node;
            last_node_id = node_id;
            log_debug("last_node_id = %d\n", last_node_id);
            char_ptr++;
        }

        if (break_out) {
            break;
        } else if (skip_char) {
            /* idx/ptr were already advanced past the skipped hyphen. */
            continue;
        }

        log_debug("Incrementing index\n");

        idx += char_len;
        ptr += char_len;
    }

    log_debug("exited while loop\n");

    if (phrase_len == 0) return NULL_PHRASE;

    return (phrase_t) {phrase_start, phrase_len, value};
}