/*
 * Decide whether code point `wc` may begin an identifier.
 *
 * Returns 1 for ASCII letters and '_'. Any other code point below U+00A1 or
 * above U+10FFFF yields 0. Everything in between is delegated to
 * is_wc_cat_id_start() together with the code point's utf8proc category.
 */
JL_DLLEXPORT int jl_id_start_char(uint32_t wc)
{
    const int ascii_upper = (wc >= 'A' && wc <= 'Z');
    const int ascii_lower = (wc >= 'a' && wc <= 'z');

    if (ascii_upper || ascii_lower || wc == '_') {
        return 1;
    }

    /* Rejected without consulting the Unicode category tables. */
    if (wc < 0xA1 || wc > 0x10ffff) {
        return 0;
    }

    return is_wc_cat_id_start(wc, utf8proc_category((utf8proc_int32_t)wc));
}
/*
 * Column width of `wc`.
 *
 * Code points in the private use category (UTF8PROC_CATEGORY_CO) always
 * report width 1: that is where powerline and similar glyphs are stored and
 * their width is "ambiguous". Every other code point reports whatever
 * utf8proc_charwidth() returns.
 */
int utf8proc_wcwidth(wchar_t wc)
{
    if (utf8proc_category(wc) == UTF8PROC_CATEGORY_CO) {
        return 1;
    }
    return utf8proc_charwidth(wc);
}
// chars that can follow an operator (e.g. +) and be parsed as part of the operator int jl_op_suffix_char(uint32_t wc) { static htable_t jl_opsuffs; if (!jl_opsuffs.size) { // initialize hash table of suffixes size_t i, opsuffs_len = sizeof(opsuffs) / (sizeof(uint32_t)); htable_t *h = htable_new(&jl_opsuffs, opsuffs_len); assert(sizeof(uint32_t) <= sizeof(void*)); for (i = 0; i < opsuffs_len; ++i) wcharhash_put_r(h, (void*)((uintptr_t)opsuffs[i]), NULL, NULL); } if (wc < 0xA1 || wc > 0x10ffff) return 0; utf8proc_category_t cat = utf8proc_category((utf8proc_int32_t) wc); if (cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_MC || cat == UTF8PROC_CATEGORY_ME) return 1; // use hash table of other allowed characters: primes and sub/superscripts return HT_NOTFOUND != wcharhash_get_r(&jl_opsuffs, (void*)((uintptr_t)wc), NULL); }
JL_DLLEXPORT int jl_id_char(uint32_t wc) { if ((wc >= 'A' && wc <= 'Z') || (wc >= 'a' && wc <= 'z') || wc == '_' || (wc >= '0' && wc <= '9') || wc == '!') return 1; if (wc < 0xA1 || wc > 0x10ffff) return 0; utf8proc_category_t cat = utf8proc_category((utf8proc_int32_t) wc); if (is_wc_cat_id_start(wc, cat)) return 1; if (cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_MC || cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_PC || cat == UTF8PROC_CATEGORY_SK || cat == UTF8PROC_CATEGORY_ME || cat == UTF8PROC_CATEGORY_NO || // primes (single, double, triple, their reverses, and quadruple) (wc >= 0x2032 && wc <= 0x2037) || (wc == 0x2057)) return 1; return 0; }
// chars that we will never allow to be part of a valid non-operator identifier static int never_id_char(uint32_t wc) { utf8proc_category_t cat = utf8proc_category((utf8proc_int32_t) wc); return ( // spaces and control characters: (cat >= UTF8PROC_CATEGORY_ZS && cat <= UTF8PROC_CATEGORY_CS) || // ASCII and Latin1 non-connector punctuation (wc < 0xff && cat >= UTF8PROC_CATEGORY_PD && cat <= UTF8PROC_CATEGORY_PO) || wc == '`' || // mathematical brackets (wc >= 0x27e6 && wc <= 0x27ef) || // angle, corner, and lenticular brackets (wc >= 0x3008 && wc <= 0x3011) || // tortoise shell, square, and more lenticular brackets (wc >= 0x3014 && wc <= 0x301b) || // fullwidth parens (wc == 0xff08 || wc == 0xff09) || // fullwidth square brackets (wc == 0xff3b || wc == 0xff3d)); }
/*
 * Scan `text` for phrases stored in the trie, starting every lookup at
 * `start_node_id`, and append each phrase found to `*phrases`.
 *
 * Parameters:
 *   self          - trie to search.
 *   text          - NUL-terminated UTF-8 input (decoded with a length of -1,
 *                   and measured with strlen() during tail comparison).
 *   start_node_id - node id every match attempt restarts from.
 *   phrases       - in/out: when `*phrases` is NULL it is created with
 *                   phrase_array_new_size(1) on the first hit; hits are
 *                   appended as {phrase_start, phrase_len, data}.
 *
 * Returns false when `text` is NULL or when a character fails to decode or
 * is not a valid code point; returns true once the terminating NUL has been
 * processed. Phrases already pushed before a decoding failure stay in
 * `*phrases`.
 *
 * NOTE(review): phrase_start / phrase_len are derived from the byte counts
 * returned by utf8proc_iterate, so they look like byte offsets into `text` -
 * confirm against phrase_t's definition.
 * NOTE(review): the base/check/tail fields look like a double-array trie with
 * a compressed tail - confirm against the trie definition.
 */
bool trie_search_from_index(trie_t *self, char *text, uint32_t start_node_id, phrase_array **phrases) {
    if (text == NULL) return false;

    ssize_t len, remaining;
    int32_t unich = 0;
    unsigned char ch = '\0';

    const uint8_t *ptr = (const uint8_t *)text;
    const uint8_t *fail_ptr = ptr;

    uint32_t node_id = start_node_id;
    trie_node_t node = trie_get_node(self, node_id), last_node = node;
    uint32_t next_id;

    bool match = false;
    uint32_t index = 0;
    uint32_t phrase_len = 0;
    uint32_t phrase_start = 0;
    // Initialized so that a match recorded through a NUL transition whose
    // node has base >= 0 (the only path that sets `match` without assigning
    // `data`) never pushes an indeterminate value.
    uint32_t data = 0;

    trie_search_state_t state = SEARCH_STATE_BEGIN, last_state = SEARCH_STATE_BEGIN;

    // Cleared whenever `index` has already been repositioned by hand during
    // the current iteration, so the end-of-iteration advance is skipped.
    bool advance_index = true;

    while(1) {
        len = utf8proc_iterate(ptr, -1, &unich);
        remaining = len;
        if (len <= 0) return false;
        if (!(utf8proc_codepoint_valid(unich))) return false;

        int cat = utf8proc_category(unich);
        bool is_letter = utf8_is_letter(cat);

        // If we're in the middle of a word and the first letter was not a match, skip the word
        if (is_letter && state == SEARCH_STATE_NO_MATCH) {
            log_debug("skipping\n");
            ptr += len;
            index += len;
            last_state = state;
            continue;
        }

        // Match in the middle of a word
        if (is_letter && last_state == SEARCH_STATE_MATCH) {
            log_debug("last_state == SEARCH_STATE_MATCH && is_letter\n");
            // Only set match to false so we don't callback
            match = false;
        }

        // Walk the trie one byte at a time over the bytes of this character.
        for (int i=0; remaining > 0; remaining--, i++, ptr++, last_node=node, last_state=state, node_id=next_id) {
            ch = (unsigned char) *ptr;
            log_debug("char=%c\n", ch);

            next_id = trie_get_transition_index(self, node, *ptr);
            node = trie_get_node(self, next_id);

            if (node.check != node_id) {
                // No transition for this byte from the current node.
                state = is_letter ? SEARCH_STATE_NO_MATCH : SEARCH_STATE_BEGIN;
                if (match) {
                    log_debug("match is true and state==SEARCH_STATE_NO_MATCH\n");
                    if (*phrases == NULL) {
                        *phrases = phrase_array_new_size(1);
                    }
                    phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
                    index = phrase_start + phrase_len;
                    advance_index = false;
                    // Set the text back to the end of the last phrase
                    ptr = (const uint8_t *)text + index;
                    len = utf8proc_iterate(ptr, -1, &unich);
                    log_debug("ptr=%s\n", ptr);
                } else {
                    ptr += remaining;
                    log_debug("done with char, now at %s\n", ptr);
                }
                // Restart matching from the start node.
                fail_ptr = ptr;
                node_id = start_node_id;
                last_node = node = trie_get_node(self, node_id);
                phrase_start = phrase_len = 0;
                last_state = state;
                match = false;
                break;
            } else {
                log_debug("node.check == node_id\n");
                state = SEARCH_STATE_PARTIAL_MATCH;

                // First matching byte after a miss/start: a new candidate phrase begins here.
                if (last_state == SEARCH_STATE_NO_MATCH || last_state == SEARCH_STATE_BEGIN) {
                    log_debug("phrase_start=%u\n", index);
                    phrase_start = index;
                    fail_ptr = ptr + remaining;
                }

                if (node.base < 0) {
                    // Negative base: -base indexes the data array, whose entry
                    // points at the remaining suffix stored in the tail buffer.
                    int32_t data_index = -1*node.base;
                    trie_data_node_t data_node = self->data->a[data_index];
                    unsigned char *current_tail = self->tail->a + data_node.tail;

                    size_t tail_len = strlen((char *)current_tail);
                    char *query_tail = (char *)(*ptr ? ptr + 1 : ptr);
                    size_t query_tail_len = strlen((char *)query_tail);
                    log_debug("next node tail: %s\n", current_tail);
                    log_debug("query node tail: %s\n", query_tail);

                    if (tail_len <= query_tail_len && strncmp((char *)current_tail, query_tail, tail_len) == 0) {
                        state = SEARCH_STATE_MATCH;
                        log_debug("Tail matches\n");
                        last_state = state;
                        data = data_node.data;
                        log_debug("%u, %d, %zu\n", index, phrase_len, tail_len);
                        ptr += tail_len;
                        index += tail_len;
                        advance_index = false;
                        phrase_len = index + 1 - phrase_start;
                        match = true;
                    } else if (match) {
                        log_debug("match is true and longer phrase tail did not match\n");
                        log_debug("phrase_start=%d, phrase_len=%d\n", phrase_start, phrase_len);
                        if (*phrases == NULL) {
                            *phrases = phrase_array_new_size(1);
                        }
                        phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
                        ptr = fail_ptr;
                        match = false;
                        index = phrase_start + phrase_len;
                        advance_index = false;
                    }
                }

                if (ch != '\0') {
                    // A '\0' transition out of this node marks a phrase ending here.
                    trie_node_t terminal_node = trie_get_transition(self, node, '\0');
                    if (terminal_node.check == next_id) {
                        log_debug("Transition to NUL byte matched\n");
                        state = SEARCH_STATE_MATCH;
                        match = true;
                        phrase_len = index + (uint32_t)len - phrase_start;
                        if (terminal_node.base < 0) {
                            int32_t data_index = -1*terminal_node.base;
                            trie_data_node_t data_node = self->data->a[data_index];
                            data = data_node.data;
                        }
                        log_debug("Got match with len=%d\n", phrase_len);
                        fail_ptr = ptr;
                    }
                }
            }
        }

        // The terminating NUL has been consumed: flush a pending match and stop.
        if (unich == 0) {
            if (last_state == SEARCH_STATE_MATCH) {
                log_debug("Found match at the end\n");
                if (*phrases == NULL) {
                    *phrases = phrase_array_new_size(1);
                }
                phrase_array_push(*phrases, (phrase_t){phrase_start, phrase_len, data});
            }
            break;
        }

        if (advance_index) index += len;
        advance_index = true;
        log_debug("index now %u\n", index);
    } // while

    return true;
}
/*
 * Find a phrase stored in the trie that is a prefix of `word`, walking from
 * `start_node_id`.
 *
 * Parameters:
 *   self          - trie to search.
 *   word          - UTF-8 input.
 *   len           - number of bytes of `word` to walk.
 *   start_node_id - node id the walk starts from.
 *
 * Returns {phrase_start, phrase_len, value} for a match, or NULL_PHRASE when
 * nothing matched (or when a compressed tail was reached and did not match).
 * When a tail node is reached and matches, that phrase is returned at once;
 * otherwise the last NUL-transition match recorded during the walk is
 * returned.
 *
 * A hyphen, or a separator-category character other than ' ', that has no
 * transition of its own is retried as a ' ' transition; a hyphen with no ' '
 * transition either is skipped without moving in the trie.
 *
 * NOTE(review): the tail comparison (strncmp / utf8_common_prefix_len) reads
 * from `word` without consulting `len`, so `word` presumably has to be
 * NUL-terminated - confirm with callers.
 */
phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, uint32_t start_node_id) {
    log_debug("Call to trie_search_prefixes_from_index\n");
    uint32_t node_id = start_node_id, last_node_id = node_id;
    trie_node_t node = trie_get_node(self, node_id), last_node = node;
    log_debug("last_node_id = %d\n", last_node_id);

    uint32_t value = 0, phrase_start = 0, phrase_len = 0;

    uint8_t *ptr = (uint8_t *)word;

    ssize_t char_len = 0;

    uint32_t idx = 0;

    size_t separator_char_len = 0;

    int32_t codepoint = 0;

    bool first_char = true;

    trie_data_node_t data_node;
    trie_node_t terminal_node;

    while (idx < len) {
        // `ptr` is already `idx` bytes into `word`, so only `len - idx` bytes
        // remain; passing the full `len` would let the decoder read past the
        // caller-supplied end of the buffer.
        char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &codepoint);
        log_debug("char_len = %zu, char=%d\n", char_len, codepoint);
        if (char_len <= 0) break;
        if (!(utf8proc_codepoint_valid(codepoint))) break;

        bool is_hyphen = utf8_is_hyphen(codepoint);

        int cat = utf8proc_category(codepoint);
        bool is_space = utf8_is_separator(cat);

        uint8_t *char_ptr = ptr;
        size_t i = 0;

        bool skip_char = false;
        bool break_out = false;

        // Walk the trie one byte at a time over the bytes of this character.
        for (i = 0; i < char_len; i++) {
            node_id = trie_get_transition_index(self, last_node, *char_ptr);
            node = trie_get_node(self, node_id);
            log_debug("At idx=%zu, char=%.*s\n", i, (int)char_len, char_ptr);

            if (node.check != last_node_id) {
                log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id);

                if (is_hyphen || (is_space && *ptr != ' ')) {
                    log_debug("Got hyphen or other separator, trying space instead\n");
                    node_id = trie_get_transition_index(self, last_node, ' ');
                    node = trie_get_node(self, node_id);
                }

                if (is_hyphen && node.check != last_node_id) {
                    // Hyphen with no ' ' transition: step over it and stay on the same node.
                    log_debug("No space transition\n");
                    ptr += char_len;
                    idx += char_len;
                    separator_char_len = char_len;
                    node_id = last_node_id;
                    node = trie_get_node(self, node_id);
                    skip_char = true;
                    break;
                } else if (node.check != last_node_id) {
                    break_out = true;
                    log_debug("Breaking\n");
                    break;
                }
                break;
            }

            if (first_char) {
                phrase_start = idx;
                first_char = false;
            }

            if (node.base < 0) {
                // Negative base: the rest of the key lives in the tail buffer.
                log_debug("Searching tail\n");

                data_node = trie_get_data_node(self, node);
                uint32_t current_tail_pos = data_node.tail;

                unsigned char *current_tail = self->tail->a + current_tail_pos;

                log_debug("comparing tail: %s vs %s\n", current_tail, char_ptr + 1);
                size_t current_tail_len = strlen((char *)current_tail);

                size_t match_len = i + 1;
                size_t offset = i + 1;
                size_t tail_pos = 0;
                log_debug("offset=%zu\n", offset);

                // Multi-byte character: its remaining bytes must match the start of the tail.
                if (char_len > 1) {
                    log_debug("char_len = %zu\n", char_len);
                    log_debug("Doing strncmp: (%zu) %s vs %s\n", char_len - offset, current_tail, char_ptr + 1);

                    if (strncmp((char *)ptr + offset, (char *)current_tail, char_len - offset) == 0) {
                        match_len += char_len - offset;
                        tail_pos = char_len - offset;
                        log_debug("in char match_len = %zu\n", match_len);
                    } else {
                        return NULL_PHRASE;
                    }
                }

                size_t tail_match_len = utf8_common_prefix_len((char *)ptr + char_len, (char *)current_tail + tail_pos, current_tail_len - tail_pos);
                match_len += tail_match_len;
                log_debug("match_len=%zu\n", match_len);

                // Accept only when the whole remaining tail was matched.
                if (tail_match_len == current_tail_len - tail_pos) {
                    if (first_char) phrase_start = idx;
                    phrase_len = (uint32_t)(idx + match_len) - phrase_start;

                    log_debug("tail match! phrase_len=%u\n", phrase_len);
                    value = data_node.data;
                    return (phrase_t){phrase_start, phrase_len, value};
                } else {
                    return NULL_PHRASE;
                }

            } else if (node.check == last_node_id) {
                // A '\0' transition out of this node marks a phrase ending here;
                // remember it and keep walking in case more of `word` matches.
                terminal_node = trie_get_transition(self, node, '\0');
                log_debug("Trying link from %d to terminal node\n", last_node_id);

                if (terminal_node.check == node_id) {
                    log_debug("Transition to NUL byte matched\n");
                    if (terminal_node.base < 0) {
                        phrase_len = (uint32_t)(idx + char_len) - phrase_start;
                        data_node = trie_get_data_node(self, terminal_node);
                        value = data_node.data;
                    }
                    log_debug("Got match with len=%d\n", phrase_len);
                }
            }

            last_node = node;
            last_node_id = node_id;
            log_debug("last_node_id = %d\n", last_node_id);
            char_ptr++;
        }

        if (break_out) {
            break;
        } else if (skip_char) {
            // idx/ptr were already advanced past the skipped hyphen.
            continue;
        }

        log_debug("Incrementing index\n");

        idx += char_len;
        ptr += char_len;
    }

    log_debug("exited while loop\n");

    if (phrase_len == 0) return NULL_PHRASE;

    return (phrase_t) {phrase_start, phrase_len, value};
}