string_script_t get_string_script(char *str, size_t len) { int32_t ch; script_t last_script = SCRIPT_UNKNOWN; script_t script = SCRIPT_UNKNOWN; uint8_t *ptr = (uint8_t *)str; size_t script_len = 0; size_t idx = 0; bool is_ascii = true; while (idx < len) { ssize_t char_len = utf8proc_iterate(ptr, len, &ch); if (ch == 0) break; script = get_char_script((uint32_t)ch); if (script == SCRIPT_COMMON && last_script != SCRIPT_UNKNOWN) { script = last_script; } if (last_script != script && last_script != SCRIPT_UNKNOWN && last_script != SCRIPT_COMMON) { if (script_len < len) { while (true) { char_len = utf8proc_iterate_reversed((const uint8_t *)str, idx, &ch); if (ch == 0) break; script = get_char_script((uint32_t)ch); if (script != SCRIPT_COMMON) { break; } script_len -= char_len; ptr -= char_len; idx -= char_len; } } break; } is_ascii = is_ascii && ch < MAX_ASCII; ptr += char_len; idx += char_len; script_len += char_len; if (script != SCRIPT_UNKNOWN) { last_script = script; } } return (string_script_t) {last_script, script_len, is_ascii}; }
/*
 * Search the trie for a stored key that matches a suffix of `word`.
 *
 * The first `len` bytes of `word` are decoded as UTF-8 from the end towards
 * the beginning, one code point per iteration. The bytes of each code point
 * are fed to the trie in their normal left-to-right order, starting from the
 * node `start_node_id`. Every time a complete key is recognised the match is
 * recorded, so a later (longer) match replaces an earlier one.
 *
 * Returns a phrase_t initialised, in order, with: the byte offset in `word`
 * where the recorded match begins, its length in bytes (it always extends to
 * the end of `word`), and the value stored for the key. All three are 0 when
 * nothing matched. NULL_PHRASE is returned if a code point cannot be decoded
 * or is not a valid code point.
 */
phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, size_t len, uint32_t start_node_id) {
    uint32_t last_node_id = start_node_id;
    uint32_t node_id = last_node_id;
    trie_node_t node = trie_get_node(self, last_node_id);

    uint32_t value = 0, phrase_start = 0, phrase_len = 0;

    ssize_t char_len;

    int32_t unich = 0;

    /* Byte offset of the start of the most recently decoded character. */
    ssize_t index = len;
    const uint8_t *ptr = (const uint8_t *)word;
    const uint8_t *char_ptr;

    /* Once a node with a negative base is reached, the rest of the key is
     * compared against a NUL-terminated string in self->tail. */
    bool in_tail = false;
    unsigned char *current_tail = (unsigned char *)"";
    size_t tail_remaining = 0;

    uint32_t tail_value = 0;

    while (index > 0) {
        char_len = utf8proc_iterate_reversed(ptr, index, &unich);
        if (char_len <= 0) return NULL_PHRASE;
        if (!(utf8proc_codepoint_valid(unich))) return NULL_PHRASE;

        index -= char_len;
        char_ptr = ptr + index;

        if (in_tail && tail_remaining >= (size_t)char_len && strncmp((char *)current_tail, (char *)char_ptr, char_len) == 0) {
            tail_remaining -= char_len;
            current_tail += char_len;

            log_debug("tail matched at char %.*s (len=%zd)\n", (int)char_len, char_ptr, char_len);
            log_debug("tail_remaining = %zu\n", tail_remaining);

            if (tail_remaining == 0) {
                log_debug("tail match! tail_value=%u\n",tail_value);
                /* Record the match only once the whole tail is consumed, so
                 * an abandoned partial tail match cannot leave phrase_start
                 * out of step with phrase_len and value. */
                phrase_start = (uint32_t)index;
                phrase_len = (uint32_t)(len - index);
                value = tail_value;
                index = 0;
                break;
            }
            continue;
        } else if (in_tail) {
            /* Tail mismatch: keep whatever complete match was found earlier. */
            break;
        }

        for (int i = 0; i < char_len; i++, char_ptr++, last_node_id = node_id) {
            log_debug("char=%c\n", (unsigned char)*char_ptr);
            node_id = trie_get_transition_index(self, node, *char_ptr);
            node = trie_get_node(self, node_id);

            if (node.check != last_node_id) {
                /* No transition on this byte from the previous node. */
                log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id);
                index = 0;
                break;
            } else if (node.base < 0) {
                log_debug("Searching tail\n");

                uint32_t data_index = -1*node.base;
                trie_data_node_t data_node = self->data->a[data_index];
                uint32_t current_tail_pos = data_node.tail;

                tail_value = data_node.data;

                current_tail = self->tail->a + current_tail_pos;

                tail_remaining = strlen((char *)current_tail);
                log_debug("tail_remaining=%zu\n", tail_remaining);
                in_tail = true;

                /* Bytes of the current character that follow the one which
                 * led into the tail must match the start of the tail. */
                size_t remaining_char_len = char_len - i - 1;
                log_debug("remaining_char_len = %zu\n", remaining_char_len);

                if (remaining_char_len > 0 && strncmp((char *)char_ptr + 1, (char *)current_tail, remaining_char_len) == 0) {
                    log_debug("tail string comparison successful\n");
                    tail_remaining -= remaining_char_len;
                    current_tail += remaining_char_len;
                } else if (remaining_char_len > 0) {
                    log_debug("tail comparison unsuccessful, %s vs %s\n", char_ptr, current_tail);
                    index = 0;
                    break;
                }

                if (tail_remaining == 0) {
                    phrase_start = (uint32_t)index;
                    phrase_len = (uint32_t)(len - index);
                    log_debug("phrase_start = %d, phrase_len=%d\n", phrase_start, phrase_len);
                    value = tail_value;
                    index = 0;
                }
                break;
            } else if (i == char_len - 1) {
                /* At a character boundary: a '\0' transition out of this node
                 * means the bytes consumed so far form a complete key. */
                trie_node_t terminal_node = trie_get_transition(self, node, '\0');
                if (terminal_node.check == node_id) {
                    int32_t data_index = -1 * terminal_node.base;
                    trie_data_node_t data_node = self->data->a[data_index];
                    value = data_node.data;
                    phrase_start = (uint32_t)index;
                    phrase_len = (uint32_t)(len - index);
                }
            }
        }
    }

    return (phrase_t) {phrase_start, phrase_len, value};
}