Exemplo n.º 1
0
Arquivo: case.c Projeto: 0x09/hfsfuse
/*
 * Sanity test for utf8proc's case mappings.
 *
 * For every value from 0 through 0x110000 it checks that utf8proc_tolower()
 * and utf8proc_toupper() return either the input itself or a value accepted
 * by utf8proc_codepoint_valid(), and then compares both results with the C
 * library's towlower()/towupper().
 *
 * A disagreement where the C library does map the character counts as a
 * failure; a disagreement where the C library leaves the character unchanged
 * is only counted as utf8proc being more up to date than the OS tables.
 */
int main(int argc, char **argv)
{
     int error = 0, newer_than_os = 0;
     utf8proc_int32_t c;

     /* the command line is not used */
     (void) argc;
     (void) argv;

     for (c = 0; c <= 0x110000; ++c) {
          utf8proc_int32_t l = utf8proc_tolower(c);
          utf8proc_int32_t u = utf8proc_toupper(c);
          wint_t os_lower, os_upper;

          check(l == c || utf8proc_codepoint_valid(l), "invalid tolower");
          check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");

          /* a 16-bit wint_t cannot represent values of 0x10000 and above */
          if (sizeof(wint_t) <= 2 && c >= (1<<16))
               continue;

          os_lower = towlower(c);
          os_upper = towupper(c);

          /* OS tables may be stale, but a mapping they do provide is
             expected to agree with utf8proc */
          if (os_lower != l) {
               if (os_lower != c) {
                    fprintf(stderr, "MISMATCH %x != towlower(%x) == %x\n",
                            l, c, os_lower);
                    ++error;
               } else {
                    /* the OS has no mapping here, utf8proc does */
                    ++newer_than_os;
               }
          }

          if (os_upper != u) {
               if (os_upper != c) {
                    fprintf(stderr, "MISMATCH %x != towupper(%x) == %x\n",
                            u, c, os_upper);
                    ++error;
               } else {
                    /* the OS has no mapping here, utf8proc does */
                    ++newer_than_os;
               }
          }
     }

     check(!error, "utf8proc case conversion FAILED %d tests.", error);
     printf("More up-to-date than OS unicode tables for %d tests.\n", newer_than_os);
     printf("utf8proc case conversion tests SUCCEEDED.\n");
     return 0;
}
Exemplo n.º 2
0
/*
 * wctomb()-style wrapper around utf8proc.
 *
 * Returns 0 when `s` is NULL, -1 when utf8proc_codepoint_valid() rejects
 * `wc`, and otherwise the value returned by utf8proc_encode_char() after
 * it has written the encoding of `wc` into `s`.
 */
int
utf8proc_wctomb(char *s, wchar_t wc)
{
	if (s != NULL) {
		if (utf8proc_codepoint_valid(wc))
			return (utf8proc_encode_char(wc, s));
		return (-1);
	}
	return (0);
}
Exemplo n.º 3
0
/*
 * Ruby binding: turns the integer code point in `code_param` into a Ruby
 * string made of the bytes utf8proc_encode_char() produces for it.
 * Raises ArgumentError when utf8proc_codepoint_valid() rejects the value.
 */
static VALUE utf8proc_ruby_char(VALUE self, VALUE code_param) {
  char buffer[4];
  int uc = NUM2INT(code_param);
  ssize_t result;

  if (!utf8proc_codepoint_valid(uc)) {
    rb_raise(rb_eArgError, "Invalid Unicode code point");
  }

  result = utf8proc_encode_char(uc, buffer);
  return rb_str_new(buffer, result);
}
/*
 * Builds a string_tree_t from a simplified regex by decoding `regex`
 * (`len` bytes of UTF-8) one code point at a time.
 *
 * Handling per code point, as implemented below:
 *   - LSQUARE_CODEPOINT opens a set and RSQUARE_CODEPOINT closes it. Code
 *     points seen inside the set are queued; when the set closes each one is
 *     added to the tree as its own string and the token is finalized. The
 *     opening/closing code points themselves are replaced by
 *     BEGIN_SET_CODEPOINT / END_SET_CODEPOINT before being indexed.
 *   - LCURLY_CODEPOINT ... RCURLY_CODEPOINT: the raw bytes in between are
 *     added to the tree as one string when the closing code point is seen.
 *   - LPAREN_CODEPOINT, RPAREN_CODEPOINT and BACKSLASH_CODEPOINT are not
 *     indexed.
 *   - Any other code point is encoded and added as a single-string token.
 *   - Code points rejected by utf8proc_codepoint_valid() are skipped.
 *
 * Returns the tree, or NULL (after destroying the partial tree) when
 * utf8proc_iterate() fails. A zero-length regex yields a tree containing a
 * single zero-length token.
 */
string_tree_t *regex_string_tree(char *regex, size_t len) {
    uint8_t *char_ptr = (uint8_t *)regex;
    bool in_set = false;
    bool in_brackets = false;

    int32_t codepoint;
    /* NOTE(review): last_codepoint is never reassigned in this function, so
       every "last_codepoint != BACKSLASH_CODEPOINT" test below compares
       against 0 and backslash escapes are effectively not honoured. Left
       unchanged because the intended escape semantics are unclear from
       here - confirm before changing. */
    int32_t last_codepoint = 0;
    ssize_t char_len;

    /* Only meaningful while in_brackets is true; zeroed so they are never
       read uninitialized. */
    size_t bracket_start = 0;
    size_t bracket_len = 0;

    char temp_char[MAX_UTF8_CHAR_SIZE];
    ssize_t temp_char_len;

    string_tree_t *tree = string_tree_new();

    if (len == 0) {
        // Single token with zero-length
        string_tree_add_string_len(tree, regex, len);
        string_tree_finalize_token(tree);
        return tree;
    }

    uint32_array *char_set = uint32_array_new();

    size_t idx = 0;
    size_t j;

    bool add_to_index = false;

    while (idx < len) {
        /* Bound the decoder by the bytes that are actually left; passing the
           full length here would let it read past the end of the buffer on a
           truncated trailing sequence. */
        char_len = utf8proc_iterate(char_ptr, (ssize_t)(len - idx), &codepoint);
        if (char_len <= 0) {
            uint32_array_destroy(char_set);
            string_tree_destroy(tree);
            return NULL;
        }

        if (!(utf8proc_codepoint_valid(codepoint))) {
            idx += char_len;
            char_ptr += char_len;
            continue;
        }

        add_to_index = true;

        if (codepoint == LSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("begin set\n");
            in_set = true;
            codepoint = BEGIN_SET_CODEPOINT;
            uint32_array_clear(char_set);
        } else if (codepoint == RSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_set) {
            log_debug("end set");

            /* Every queued member of the set becomes one alternative of the
               current token. */
            for (j = 0; j < char_set->n; j++) {
                temp_char_len = utf8proc_encode_char(char_set->a[j], (uint8_t *)temp_char);
                log_debug("Adding string %.*s\n", (int)temp_char_len, temp_char);
                string_tree_add_string_len(tree, temp_char, temp_char_len);
            }
            string_tree_finalize_token(tree);

            uint32_array_clear(char_set);
            // Add a special codepoint to the sequence to distinguish from an escaped square bracket
            codepoint = END_SET_CODEPOINT;
            in_set = false;
        } else if (codepoint == LCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            in_brackets = true;
            /* The bracketed string starts right after the opening brace. */
            bracket_start = idx + char_len;
            bracket_len = 0;
            add_to_index = false;
        } else if (codepoint == RCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_brackets) {
            log_debug("Adding bracketed string: %.*s\n", (int) bracket_len, regex + bracket_start);
            string_tree_add_string_len(tree, regex + bracket_start, bracket_len);
            in_brackets = false;
        } else if ((codepoint == LPAREN_CODEPOINT || codepoint == RPAREN_CODEPOINT) && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("group\n");
            add_to_index = false;
        } else if (in_set) {
            log_debug("in set\n");
            // Queue node, we'll add them to the trie
            uint32_array_push(char_set, codepoint);
            add_to_index = false;
        } else if (in_brackets) {
            add_to_index = false;
            bracket_len += char_len;
        } else if (codepoint == BACKSLASH_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            add_to_index = false;
        }

        log_debug("codepoint = %d\n", codepoint);

        if (add_to_index) {
            temp_char_len = utf8proc_encode_char(codepoint, (uint8_t *)temp_char);
            log_debug("char = %.*s\n", (int)temp_char_len, temp_char);
            string_tree_add_string_len(tree, temp_char, temp_char_len);
            string_tree_finalize_token(tree);
        }

        idx += char_len;
        char_ptr += char_len;
    }

    uint32_array_destroy(char_set);

    return tree;
}
/*
 * Scans `regex` (`len` bytes of UTF-8) and records its parenthesised groups.
 *
 * A running position `pos` is advanced once for every code point outside a
 * set that is not a parenthesis or LSQUARE_CODEPOINT, once for every closing
 * RSQUARE_CODEPOINT, and once for every code point rejected by
 * utf8proc_codepoint_valid(). When RPAREN_CODEPOINT is seen, a
 * group_capture_t built from {group_start, pos - group_start} is pushed,
 * where group_start is the value of `pos` at the matching LPAREN_CODEPOINT.
 *
 * Returns the array of captures, or NULL when `len` is 0 or when
 * utf8proc_iterate() fails.
 */
group_capture_array *parse_groups(char *regex, size_t len) {
    uint8_t *char_ptr = (uint8_t *)regex;
    bool in_set = false;

    /* NOTE(review): last_codepoint is never reassigned in this function, so
       the backslash-escape tests below always compare against 0. Left as is
       because the intended escape semantics are unclear - confirm. */
    int32_t codepoint, last_codepoint = 0;
    ssize_t char_len;

    if (len == 0) {
        return NULL;
    }

    group_capture_array *groups = group_capture_array_new_size(1);

    size_t idx = 0;

    size_t pos = 0;
    size_t group_start = 0;

    while (idx < len) {
        /* Bound the decoder by the bytes that are actually left; passing the
           full length here would let it read past the end of the buffer on a
           truncated trailing sequence. */
        char_len = utf8proc_iterate(char_ptr, (ssize_t)(len - idx), &codepoint);
        if (char_len <= 0) {
            log_error("char %s had len=%zd\n", char_ptr, char_len);
            /* NOTE(review): `groups` is not released on this path; no
               destroy routine for group_capture_array is visible from
               here - release it once the right call is confirmed. */
            return NULL;
        }

        if (!(utf8proc_codepoint_valid(codepoint))) {
            idx += char_len;
            char_ptr += char_len;
            pos++;
            continue;
        }

        if (codepoint == LSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("begin set\n");
            in_set = true;
        } else if (codepoint == RSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("end set");
            /* A whole set advances the position by one. */
            pos++;
            in_set = false;
        } else if (codepoint == LPAREN_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("begin group\n");
            group_start = pos;
        } else if (codepoint == RPAREN_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) {
            log_debug("close group\n");
            group_capture_array_push(groups, (group_capture_t){group_start, pos - group_start});
        } else if (!in_set) {
            log_debug("other char\n");
            pos++;
        }

        idx += char_len;
        char_ptr += char_len;

    }

    return groups;

}
Exemplo n.º 6
0
/*
 * Scans the NUL-terminated UTF-8 string `text` for phrases stored in the
 * trie, following transitions from `start_node_id`. Each hit is appended to
 * `*phrases` as a phrase_t built from {phrase_start, phrase_len, data};
 * `*phrases` is allocated on the first hit if it is still NULL. Offsets and
 * lengths are tracked in bytes from the start of `text`.
 *
 * Returns false when `text` is NULL, or when utf8proc_iterate() fails or
 * yields a code point rejected by utf8proc_codepoint_valid() (phrases pushed
 * before that point stay in `*phrases`). Returns true once the terminating
 * NUL has been processed.
 *
 * NOTE(review): `data` has no initializer and is assigned only in the
 * tail-match branch and when terminal_node.base < 0 - confirm it can never
 * be pushed unset.
 */
bool trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id, phrase_array **phrases) {
    if (text == NULL) return false;

    ssize_t len, remaining;
    int32_t unich = 0;
    unsigned char ch = '\0';

    const uint8_t *ptr = (const uint8_t *)text;
    /* Position to resume from when a longer candidate fails after a match. */
    const uint8_t *fail_ptr = ptr;

    uint32_t node_id = start_node_id;
    trie_node_t node = trie_get_node(self, node_id), last_node = node;
    uint32_t next_id;

    bool match = false;
    uint32_t index = 0;
    uint32_t phrase_len = 0;
    uint32_t phrase_start = 0;
    uint32_t data;

    trie_search_state_t state = SEARCH_STATE_BEGIN, last_state = SEARCH_STATE_BEGIN;

    /* Cleared whenever a branch has already repositioned `index` itself. */
    bool advance_index = true;

    while(1) {
        /* A negative length tells utf8proc_iterate to rely on the NUL terminator. */
        len = utf8proc_iterate(ptr, -1, &unich);
        remaining = len;
        if (len <= 0) return false;
        if (!(utf8proc_codepoint_valid(unich))) return false;

        int cat = utf8proc_category(unich);
        bool is_letter = utf8_is_letter(cat);

        // If we're in the middle of a word and the first letter was not a match, skip the word
        if (is_letter && state == SEARCH_STATE_NO_MATCH) { 
            log_debug("skipping\n");
            ptr += len;
            index += len;
            last_state = state;
            continue; 
        }

        // Match in the middle of a word
        if (is_letter && last_state == SEARCH_STATE_MATCH) {
            log_debug("last_state == SEARCH_STATE_MATCH && is_letter\n");
            // Only set match to false so we don't callback
            match = false;
        }

        /* Feed the bytes of the current character through the trie one at a time. */
        for (int i=0; remaining > 0; remaining--, i++, ptr++, last_node=node, last_state=state, node_id=next_id) {
            ch = (unsigned char) *ptr;
            log_debug("char=%c\n", ch);

            next_id = trie_get_transition_index(self, node, *ptr);
            node = trie_get_node(self, next_id);

            if (node.check != node_id) {
                /* No transition for this byte: emit any pending match, then restart from the start node. */
                state = is_letter ? SEARCH_STATE_NO_MATCH : SEARCH_STATE_BEGIN;
                if (match) {
                    log_debug("match is true and state==SEARCH_STATE_NO_MATCH\n");
                    if (*phrases == NULL) {
                        *phrases = phrase_array_new_size(1);
                    }
                    phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
                    index = phrase_start + phrase_len;
                    advance_index = false;
                    // Set the text back to the end of the last phrase
                    ptr = (const uint8_t *)text + index;
                    len = utf8proc_iterate(ptr, -1, &unich);
                    log_debug("ptr=%s\n", ptr);
                } else {
                    /* Skip the rest of the bytes of this character. */
                    ptr += remaining;
                    log_debug("done with char, now at %s\n", ptr);
                }
                fail_ptr = ptr;
                node_id = start_node_id;
                last_node = node = trie_get_node(self, node_id);
                phrase_start = phrase_len = 0;
                last_state = state;
                match = false;
                break;
            } else {
                log_debug("node.check == node_id\n");
                state = SEARCH_STATE_PARTIAL_MATCH;
                if (last_state == SEARCH_STATE_NO_MATCH || last_state == SEARCH_STATE_BEGIN) {
                    /* First byte of a new candidate phrase. */
                    log_debug("phrase_start=%u\n", index);
                    phrase_start = index;
                    fail_ptr = ptr + remaining;
                }

                if (node.base < 0) {
                    /* A negative base is used as an index into the data array, whose entry points at a stored tail string. */
                    int32_t data_index = -1*node.base;
                    trie_data_node_t data_node = self->data->a[data_index];
                    unsigned char *current_tail = self->tail->a + data_node.tail;

                    size_t tail_len = strlen((char *)current_tail);
                    char *query_tail = (char *)(*ptr ? ptr + 1 : ptr);
                    size_t query_tail_len = strlen((char *)query_tail);
                    log_debug("next node tail: %s\n", current_tail);
                    log_debug("query node tail: %s\n", query_tail);

                    if (tail_len <= query_tail_len && strncmp((char *)current_tail, query_tail, tail_len) == 0) {
                        state = SEARCH_STATE_MATCH;
                        log_debug("Tail matches\n");
                        last_state = state;
                        data = data_node.data;
                        log_debug("%u, %d, %zu\n", index, phrase_len, tail_len);
                        /* Jump over the matched tail. */
                        ptr += tail_len;
                        index += tail_len;
                        advance_index = false;
                        phrase_len = index + 1 - phrase_start;
                        match = true;
                    } else if (match) {
                        /* The longer candidate failed: emit the earlier match and resume from fail_ptr. */
                        log_debug("match is true and longer phrase tail did not match\n");
                        log_debug("phrase_start=%d, phrase_len=%d\n", phrase_start, phrase_len);
                        if (*phrases == NULL) {
                            *phrases = phrase_array_new_size(1);
                        }
                        phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
                        ptr = fail_ptr;
                        match = false;
                        index = phrase_start + phrase_len;
                        advance_index = false;
                    }

                } 

                if (ch != '\0') {
                    /* A '\0' transition out of this node marks the end of a stored phrase. */
                    trie_node_t terminal_node = trie_get_transition(self, node, '\0');
                    if (terminal_node.check == next_id) {
                        log_debug("Transition to NUL byte matched\n");
                        state = SEARCH_STATE_MATCH;
                        match = true;
                        phrase_len = index + (uint32_t)len - phrase_start;
                        if (terminal_node.base < 0) {
                            int32_t data_index = -1*terminal_node.base;
                            trie_data_node_t data_node = self->data->a[data_index];
                            data = data_node.data;
                        }
                        log_debug("Got match with len=%d\n", phrase_len);
                        fail_ptr = ptr;
                    }
                }
            }

        }

        /* End of text: flush a match that is still pending, then stop. */
        if (unich == 0) {
            if (last_state == SEARCH_STATE_MATCH) {
                log_debug("Found match at the end\n");
                if (*phrases == NULL) {
                    *phrases = phrase_array_new_size(1);
                }
                phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
            }
            break;
        }

        if (advance_index) index += len;

        advance_index = true;
        log_debug("index now %u\n", index);
    } // while

    return true;
}
Exemplo n.º 7
0
/*
 * Walks `word` (`len` bytes of UTF-8) forwards through the trie starting at
 * `start_node_id` and reports a stored phrase that is a prefix of `word`.
 *
 * - Each time a '\0' transition carrying data is found after a complete
 *   character, phrase_len/value are updated, so the last such hit is kept.
 * - When a node with a negative base is reached, the remainder is compared
 *   against the stored tail and the function returns immediately: the phrase
 *   on a full tail match, NULL_PHRASE otherwise.
 * - A hyphen (utf8_is_hyphen) or a non-' ' separator (utf8_is_separator)
 *   with no transition of its own is retried as ' '; a hyphen that still has
 *   no transition is skipped.
 *
 * Returns {phrase_start, phrase_len, value} with byte offsets into `word`,
 * or NULL_PHRASE when nothing matched.
 *
 * NOTE(review): the tail comparisons read from `word` using strncmp and
 * utf8_common_prefix_len without consulting `len`; presumably `word` is
 * NUL-terminated - confirm against callers.
 */
phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, uint32_t start_node_id) {
    log_debug("Call to trie_search_prefixes_from_index\n");
    uint32_t node_id = start_node_id, last_node_id = node_id;
    trie_node_t node = trie_get_node(self, node_id), last_node = node;

    log_debug("last_node_id = %d\n", last_node_id);

    uint32_t value = 0, phrase_start = 0, phrase_len = 0;

    uint8_t *ptr = (uint8_t *)word;

    ssize_t char_len = 0;

    uint32_t idx = 0;

    int32_t codepoint = 0;

    bool first_char = true;

    trie_data_node_t data_node;
    trie_node_t terminal_node;

    while (idx < len) {
        /* Bound the decoder by the bytes that are actually left; passing the
           full length here would let it read past `len` on a truncated
           trailing sequence. */
        char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &codepoint);
        log_debug("char_len = %zd, char=%d\n", char_len, codepoint);
        if (char_len <= 0) break;
        if (!(utf8proc_codepoint_valid(codepoint))) break;

        bool is_hyphen = utf8_is_hyphen(codepoint);

        int cat = utf8proc_category(codepoint);
        bool is_space = utf8_is_separator(cat);

        uint8_t *char_ptr = ptr;
        size_t i = 0;

        bool skip_char = false;
        bool break_out = false;

        /* Feed the bytes of the current character through the trie. */
        for (i = 0; i < (size_t)char_len; i++) {
            node_id = trie_get_transition_index(self, last_node, *char_ptr);
            node = trie_get_node(self, node_id);
            log_debug("At idx=%zu, char=%.*s\n", i, (int)char_len, char_ptr);

            if (node.check != last_node_id) {
                log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id);

                if (is_hyphen || (is_space && *ptr != ' ')) {
                    log_debug("Got hyphen or other separator, trying space instead\n");
                    node_id = trie_get_transition_index(self, last_node, ' ');
                    node = trie_get_node(self, node_id);
                }

                if (is_hyphen && node.check != last_node_id) {
                    /* Hyphen with no space transition either: step over it
                       and stay on the same trie node. */
                    log_debug("No space transition\n");
                    ptr += char_len;
                    idx += char_len;
                    node_id = last_node_id;
                    node = trie_get_node(self, node_id);
                    skip_char = true;
                    break;
                } else if (node.check != last_node_id) {
                    break_out = true;
                    log_debug("Breaking\n");
                    break;
                }
                /* NOTE(review): when the ' ' fallback succeeds we leave the
                   byte loop here without updating last_node/last_node_id,
                   so the space transition is not carried into the next
                   character - confirm this is intended. */
                break;
            }

            if (first_char) {
                phrase_start = idx;
                first_char = false;
            }

            if (node.base < 0) {
                log_debug("Searching tail\n");

                data_node = trie_get_data_node(self, node);
                uint32_t current_tail_pos = data_node.tail;

                unsigned char *current_tail = self->tail->a + current_tail_pos;

                log_debug("comparing tail: %s vs %s\n", current_tail, char_ptr + 1);
                size_t current_tail_len = strlen((char *)current_tail);

                size_t match_len = i + 1;
                size_t offset = i + 1;
                size_t tail_pos = 0;
                log_debug("offset=%zu\n", offset);

                if (char_len > 1) {
                    /* The rest of this multi-byte character must match the
                       start of the tail. */
                    log_debug("char_len = %zd\n", char_len);
                    log_debug("Doing strncmp: (%zu) %s vs %s\n", char_len - offset, current_tail, char_ptr + 1);

                    if (strncmp((char *)ptr + offset, (char *)current_tail, char_len - offset) == 0) {
                        match_len += char_len - offset;
                        tail_pos = char_len - offset;
                        log_debug("in char match_len = %zu\n", match_len);
                    } else {
                        return NULL_PHRASE;
                    }
                }

                size_t tail_match_len = utf8_common_prefix_len((char *)ptr + char_len, (char *)current_tail + tail_pos, current_tail_len - tail_pos);
                match_len += tail_match_len;
                log_debug("match_len=%zu\n", match_len);

                /* Only a complete tail match counts. */
                if (tail_match_len == current_tail_len - tail_pos) {
                    if (first_char) phrase_start = idx;
                    phrase_len = (uint32_t)(idx + match_len) - phrase_start;

                    log_debug("tail match! phrase_len=%u\n", phrase_len);
                    value = data_node.data;
                    return (phrase_t){phrase_start, phrase_len, value};
                } else {
                    return NULL_PHRASE;
                }

            } else if (node.check == last_node_id) {
                /* A '\0' transition marks the end of a stored phrase. */
                terminal_node = trie_get_transition(self, node, '\0');
                log_debug("Trying link from %d to terminal node\n", last_node_id);

                if (terminal_node.check == node_id) {
                    log_debug("Transition to NUL byte matched\n");
                    if (terminal_node.base < 0) {
                        phrase_len = (uint32_t)(idx + char_len) - phrase_start;
                        data_node = trie_get_data_node(self, terminal_node);
                        value = data_node.data;
                    }
                    log_debug("Got match with len=%d\n", phrase_len);
                }
            }

            last_node = node;
            last_node_id = node_id;
            log_debug("last_node_id = %d\n", last_node_id);
            char_ptr++;
        }


        if (break_out) {
            break;
        } else if (skip_char) {
            /* idx/ptr were already advanced past the skipped hyphen. */
            continue;
        }

        log_debug("Incrementing index\n");

        idx += char_len;
        ptr += char_len;
    }

    log_debug("exited while loop\n");

    if (phrase_len == 0) return NULL_PHRASE;

    return (phrase_t) {phrase_start, phrase_len, value};
}
Exemplo n.º 8
0
/*
 * Walks `word` (`len` bytes) from its end towards its start, one UTF-8
 * character at a time, following trie transitions from `start_node_id`.
 * The bytes of each character are fed to the trie in forward order; the
 * characters themselves are visited last to first.
 *
 * Returns {phrase_start, phrase_len, value}, where phrase_start is the byte
 * offset in `word` at which the matched suffix begins and phrase_len is
 * `len - phrase_start`. All three are zero when nothing matched. Returns
 * NULL_PHRASE when utf8proc_iterate_reversed() fails or yields a code point
 * rejected by utf8proc_codepoint_valid().
 */
phrase_t trie_search_suffixes_from_index(trie_t *self, char *word, size_t len, uint32_t start_node_id) {
    uint32_t last_node_id = start_node_id;
    trie_node_t last_node = trie_get_node(self, last_node_id);
    uint32_t node_id = last_node_id;
    trie_node_t node = last_node;

    uint32_t value = 0, phrase_start = 0, phrase_len = 0;

    ssize_t char_len;

    int32_t unich = 0;

    /* Byte offset just past the character to be decoded next; moves towards 0. */
    ssize_t index = len;
    const uint8_t *ptr = (const uint8_t *)word;
    const uint8_t *char_ptr;

    /* Once a tail node is reached, the rest is matched against the stored tail string rather than via transitions. */
    bool in_tail = false;
    unsigned char *current_tail = (unsigned char *)"";
    size_t tail_remaining = 0;

    uint32_t tail_value = 0;

    while(index > 0) {
        /* presumably decodes the character that ends at byte offset `index` - helper not visible here */
        char_len = utf8proc_iterate_reversed(ptr, index, &unich);

        if (char_len <= 0) return NULL_PHRASE;
        if (!(utf8proc_codepoint_valid(unich))) return NULL_PHRASE;

        index -= char_len;
        char_ptr = ptr + index;

        if (in_tail && tail_remaining >= char_len && strncmp((char *)current_tail, (char *)char_ptr, char_len) == 0) {
            /* This character matches the next piece of the tail: consume it. */
            tail_remaining -= char_len;
            current_tail += char_len;
            phrase_start = (uint32_t)index;

            log_debug("tail matched at char %.*s (len=%zd)\n", (int)char_len, char_ptr, char_len);
            log_debug("tail_remaining = %zu\n", tail_remaining);

            if (tail_remaining == 0) {
                log_debug("tail match! tail_value=%u\n",tail_value);
                phrase_len = (uint32_t)(len - index);
                value = tail_value;
                index = 0;
                break;
            }
            continue;
        } else if (in_tail) {
            /* Tail mismatch: stop and return whatever was recorded earlier. */
            break;
        }

        for (int i=0; i < char_len; i++, char_ptr++, last_node = node, last_node_id = node_id) {
            log_debug("char=%c\n", (unsigned char)*char_ptr);

            node_id = trie_get_transition_index(self, node, *char_ptr);
            node = trie_get_node(self, node_id);
    
            if (node.check != last_node_id) {
                /* No transition: setting index to 0 also ends the outer loop. */
                log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id);
                index = 0;
                break;
            } else if (node.base < 0) {
                log_debug("Searching tail\n");

                /* A negative base is used as an index into the data array, whose entry points at a stored tail string. */
                uint32_t data_index = -1*node.base;
                trie_data_node_t data_node = self->data->a[data_index];
                uint32_t current_tail_pos = data_node.tail;

                tail_value = data_node.data;

                current_tail = self->tail->a + current_tail_pos;

                tail_remaining = strlen((char *)current_tail);
                log_debug("tail_remaining=%zu\n", tail_remaining);
                in_tail = true;

                /* Bytes of the current character not yet consumed must match the start of the tail. */
                size_t remaining_char_len = char_len - i - 1;
                log_debug("remaining_char_len = %zu\n", remaining_char_len);

                if (remaining_char_len > 0 && strncmp((char *)char_ptr + 1, (char *)current_tail, remaining_char_len) == 0) {
                    log_debug("tail string comparison successful\n");
                    tail_remaining -= remaining_char_len;
                    current_tail += remaining_char_len;
                } else if (remaining_char_len > 0) {
                    log_debug("tail comparison unsuccessful, %s vs %s\n", char_ptr, current_tail);
                    index = 0;
                    break;
                }

                if (tail_remaining == 0) {
                    phrase_start = (uint32_t)index;
                    phrase_len = (uint32_t)(len - index);
                    log_debug("phrase_start = %d, phrase_len=%d\n", phrase_start, phrase_len);
                    value = tail_value;
                    index = 0;
                }
                break;
            } else if (i == char_len - 1) {
                /* Last byte of the character: a '\0' transition here marks the end of a stored phrase. */
                trie_node_t terminal_node = trie_get_transition(self, node, '\0');
                if (terminal_node.check == node_id) {
                    /* NOTE(review): terminal_node.base is negated and used as an index without first checking that it is negative - confirm it always is here. */
                    int32_t data_index = -1 * terminal_node.base;
                    trie_data_node_t data_node = self->data->a[data_index];
                    value = data_node.data;
                    phrase_start = (uint32_t)index;
                    phrase_len = (uint32_t)(len - index);
                }
            }

        }

    }

    return (phrase_t) {phrase_start, phrase_len, value};
}