Exemple #1
0
/**
 * Find the script of the leading run of same-script characters in a UTF-8
 * buffer.
 *
 * Characters are decoded from the start of `str`. Characters classified as
 * SCRIPT_COMMON adopt the script of the run they follow. Scanning stops at
 * the first character whose script differs from the current run, at a NUL
 * code point, at invalid or truncated UTF-8, or at the end of the buffer.
 * When the run ends because of a script change, trailing characters whose
 * own script is SCRIPT_COMMON are removed from the run again.
 *
 * @param str  UTF-8 bytes to scan (not modified by this function).
 * @param len  Number of bytes available in `str`.
 * @return     A string_script_t initialised, in order, with: the script of
 *             the run (SCRIPT_UNKNOWN if none was determined), the byte
 *             length of the run, and whether every code point consumed by
 *             the forward scan was below MAX_ASCII.
 */
string_script_t get_string_script(char *str, size_t len) {
    int32_t ch = 0;
    script_t last_script = SCRIPT_UNKNOWN;
    script_t script = SCRIPT_UNKNOWN;

    uint8_t *ptr = (uint8_t *)str;

    size_t script_len = 0;
    size_t idx = 0;

    bool is_ascii = true;

    while (idx < len) {
        // Only the bytes from idx onward are still inside the buffer, so
        // bound the decoder by the remaining length rather than by len.
        ssize_t char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);

        // A non-positive length signals a decoding error; advancing by it
        // would corrupt the unsigned byte counters below.
        if (char_len <= 0 || ch == 0) break;

        script = get_char_script((uint32_t)ch);

        // Common characters inherit the script of the run they follow.
        if (script == SCRIPT_COMMON && last_script != SCRIPT_UNKNOWN) {
            script = last_script;
        }

        if (last_script != script && last_script != SCRIPT_UNKNOWN && last_script != SCRIPT_COMMON) {
            if (script_len < len) {
                // Script changed: walk backward from the current position and
                // drop trailing characters whose own script is SCRIPT_COMMON
                // so they are not counted as part of this run.
                while (idx > 0) {
                    char_len = utf8proc_iterate_reversed((const uint8_t *)str, idx, &ch);
                    if (char_len <= 0 || ch == 0) break;

                    script = get_char_script((uint32_t)ch);
                    if (script != SCRIPT_COMMON) {
                        break;
                    }

                    script_len -= char_len;
                    ptr -= char_len;
                    idx -= char_len;
                }
            }

            break;
        }

        is_ascii = is_ascii && ch < MAX_ASCII;

        ptr += char_len;
        idx += char_len;
        script_len += char_len;

        if (script != SCRIPT_UNKNOWN) {
            last_script = script;
        }

    }

    return (string_script_t) {last_script, script_len, is_ascii};
}
Exemple #2
0
/**
 * Search the trie for a suffix of `word`, starting from a given trie node.
 *
 * The word is consumed one UTF-8 character at a time from its end toward its
 * beginning; within each character the bytes are fed to the trie in forward
 * order. Each time a terminal ('\0') transition exists after a whole
 * character, the match is recorded, so the longest matching suffix wins.
 * When a node with a negative base is reached, the rest of the key is
 * compared against the NUL-terminated tail string stored in self->tail.
 *
 * @param self           Trie to search.
 * @param word           UTF-8 bytes to match against.
 * @param len            Number of bytes in `word`.
 * @param start_node_id  Id of the node at which the walk begins.
 * @return               A phrase_t initialised, in order, with: the byte
 *                       offset in `word` where the matched suffix starts, its
 *                       byte length, and the stored value. All three are 0
 *                       when nothing matched. NULL_PHRASE is returned when the
 *                       input is not valid UTF-8.
 */
phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, size_t len, uint32_t start_node_id) {
    uint32_t last_node_id = start_node_id;
    trie_node_t last_node = trie_get_node(self, last_node_id);
    uint32_t node_id = last_node_id;
    trie_node_t node = last_node;

    uint32_t value = 0, phrase_start = 0, phrase_len = 0;

    ssize_t char_len;

    int32_t unich = 0;

    ssize_t index = len;
    const uint8_t *ptr = (const uint8_t *)word;
    const uint8_t *char_ptr;

    bool in_tail = false;
    unsigned char *current_tail = (unsigned char *)"";
    size_t tail_remaining = 0;

    uint32_t tail_value = 0;

    while(index > 0) {
        // Decode the character that ends just before byte offset `index`.
        char_len = utf8proc_iterate_reversed(ptr, index, &unich);

        if (char_len <= 0) return NULL_PHRASE;
        if (!(utf8proc_codepoint_valid(unich))) return NULL_PHRASE;

        index -= char_len;
        char_ptr = ptr + index;

        // char_len is known to be positive here, so the cast is lossless.
        if (in_tail && tail_remaining >= (size_t)char_len && strncmp((char *)current_tail, (char *)char_ptr, char_len) == 0) {
            tail_remaining -= char_len;
            current_tail += char_len;

            log_debug("tail matched at char %.*s (len=%zd)\n", (int)char_len, char_ptr, char_len);
            log_debug("tail_remaining = %zu\n", tail_remaining);

            if (tail_remaining == 0) {
                log_debug("tail match! tail_value=%u\n",tail_value);
                // Record the match only once the whole tail is consumed, so
                // a partial tail match that later fails cannot clobber the
                // start offset of an earlier, shorter match.
                phrase_start = (uint32_t)index;
                phrase_len = (uint32_t)(len - index);
                value = tail_value;
                index = 0;
                break;
            }
            continue;
        } else if (in_tail) {
            // Tail mismatch: keep whatever match was recorded earlier.
            break;
        }

        // Feed the bytes of this character to the trie in forward order.
        for (int i=0; i < char_len; i++, char_ptr++, last_node = node, last_node_id = node_id) {
            log_debug("char=%c\n", (unsigned char)*char_ptr);

            node_id = trie_get_transition_index(self, node, *char_ptr);
            node = trie_get_node(self, node_id);
    
            if (node.check != last_node_id) {
                // No such transition: stop the whole search.
                log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id);
                index = 0;
                break;
            } else if (node.base < 0) {
                log_debug("Searching tail\n");

                // A negative base is the negated index into self->data.
                uint32_t data_index = -1*node.base;
                trie_data_node_t data_node = self->data->a[data_index];
                uint32_t current_tail_pos = data_node.tail;

                tail_value = data_node.data;

                current_tail = self->tail->a + current_tail_pos;

                tail_remaining = strlen((char *)current_tail);
                log_debug("tail_remaining=%zu\n", tail_remaining);
                in_tail = true;

                // Bytes of the current character not yet consumed by the
                // trie must match the beginning of the tail.
                size_t remaining_char_len = char_len - i - 1;
                log_debug("remaining_char_len = %zu\n", remaining_char_len);

                if (remaining_char_len > 0 && strncmp((char *)char_ptr + 1, (char *)current_tail, remaining_char_len) == 0) {
                    log_debug("tail string comparison successful\n");
                    tail_remaining -= remaining_char_len;
                    current_tail += remaining_char_len;
                } else if (remaining_char_len > 0) {
                    log_debug("tail comparison unsuccessful, %s vs %s\n", char_ptr, current_tail);
                    index = 0;
                    break;
                }

                if (tail_remaining == 0) {
                    phrase_start = (uint32_t)index;
                    phrase_len = (uint32_t)(len - index);
                    log_debug("phrase_start = %d, phrase_len=%d\n", phrase_start, phrase_len);
                    value = tail_value;
                    index = 0;
                }
                break;
            } else if (i == char_len - 1) {
                // A whole character has been consumed: check whether a key
                // ends at this node and, if so, record it as the best match.
                trie_node_t terminal_node = trie_get_transition(self, node, '\0');
                if (terminal_node.check == node_id) {
                    int32_t data_index = -1 * terminal_node.base;
                    trie_data_node_t data_node = self->data->a[data_index];
                    value = data_node.data;
                    phrase_start = (uint32_t)index;
                    phrase_len = (uint32_t)(len - index);
                }
            }

        }

    }

    return (phrase_t) {phrase_start, phrase_len, value};
}