/*
 * Read the token that ends just before context->iter, moving the
 * context iterator backwards while doing so.
 *
 * DATA is a smie_gtk_source_buffer_context_t.  Returns NULL when the
 * iterator is already at the start of the buffer; otherwise returns the
 * text obtained from gtk_text_iter_get_slice() for the token found.
 *
 * On return context->iter rests on the character where the backward scan
 * stopped (the first character that does not belong to the token, or the
 * start of the buffer).
 */
static gchar *
smie_gtk_source_buffer_backward_token (gpointer data)
{
  smie_gtk_source_buffer_context_t *context = data;
  GtkTextIter *cursor = &context->iter;
  GtkTextIter token_first, token_last;

  if (gtk_text_iter_is_start (cursor))
    return NULL;

  /* Step onto the previous character, then keep moving back for as long
     as we are inside a comment or on whitespace.  */
  gtk_text_iter_backward_char (cursor);
  for (;;)
    {
      if (gtk_text_iter_is_start (cursor))
        break;
      if (!gtk_source_buffer_iter_has_context_class (context->buffer,
                                                     cursor, "comment")
          && !g_unichar_isspace (gtk_text_iter_get_char (cursor)))
        break;
      gtk_text_iter_backward_char (cursor);
    }

  /* The cursor now sits on the last character of the token.  */
  gtk_text_iter_assign (&token_last, cursor);

  if (gtk_source_buffer_iter_has_context_class (context->buffer,
                                                cursor, "string"))
    {
      /* String literal: go back until we leave the "string" class.  */
      for (;;)
        {
          if (gtk_text_iter_is_start (cursor))
            break;
          if (!gtk_source_buffer_iter_has_context_class (context->buffer,
                                                         cursor, "string"))
            break;
          gtk_text_iter_backward_char (cursor);
        }
    }
  else if (g_unichar_ispunct (gtk_text_iter_get_char (cursor)))
    {
      /* Run of punctuation characters.  */
      for (;;)
        {
          if (gtk_text_iter_is_start (cursor))
            break;
          if (!g_unichar_ispunct (gtk_text_iter_get_char (cursor)))
            break;
          gtk_text_iter_backward_char (cursor);
        }
    }
  else
    {
      /* Ordinary token: stop at a comment, a string, punctuation or
         whitespace.  */
      for (;;)
        {
          gunichar ch;

          if (gtk_text_iter_is_start (cursor))
            break;
          if (gtk_source_buffer_iter_has_context_class (context->buffer,
                                                        cursor, "comment"))
            break;
          if (gtk_source_buffer_iter_has_context_class (context->buffer,
                                                        cursor, "string"))
            break;
          ch = gtk_text_iter_get_char (cursor);
          if (g_unichar_ispunct (ch) || g_unichar_isspace (ch))
            break;
          gtk_text_iter_backward_char (cursor);
        }
    }

  /* Unless the scan ran into the start of the buffer, the cursor is one
     character before the token, so the slice starts one step forward.  */
  gtk_text_iter_assign (&token_first, cursor);
  if (!gtk_text_iter_is_start (&token_first))
    gtk_text_iter_forward_char (&token_first);

  /* The slice end is exclusive: move past the last token character.  */
  gtk_text_iter_forward_char (&token_last);

  return gtk_text_iter_get_slice (&token_first, &token_last);
}
/*
 *  call-seq:
 *    utf8_titleize(string)
 *
 *  Returns a title case string: the first alphabetic character following
 *  the start of the string, a whitespace character or a punctuation
 *  character is converted to title case. Returns nil when the input cannot
 *  be converted from UTF-8.
 *
 *    Glib.utf8_titleize('привет всем') #=> Привет Всем
 */
static VALUE utf8_titleize(VALUE self, VALUE string)
{
  VALUE titleized;
  gchar *utf8;
  long byte_len, char_count;
  gunichar *ucs4, *cursor, *stop;
  gboolean at_word_start = TRUE;

  Check_Type(string, T_STRING);

  byte_len = RSTRING_LEN(string);
  ucs4 = g_utf8_to_ucs4(StringValuePtr(string), byte_len, NULL, &char_count, NULL);
  if (!ucs4) {
    /* The input could not be converted to UCS-4. */
    return Qnil;
  }

  stop = ucs4 + char_count;
  for (cursor = ucs4; cursor < stop; cursor++) {
    /* Keep the original character: the separator test below must look at
     * the character as it was before any title-casing. */
    const gunichar original = *cursor;

    if (at_word_start && g_unichar_isalpha(original)) {
      *cursor = g_unichar_totitle(original);
      at_word_start = FALSE;
    }

    if (g_unichar_isspace(original) || g_unichar_ispunct(original)) {
      at_word_start = TRUE;
    }
  }

  utf8 = g_ucs4_to_utf8(ucs4, -1, NULL, NULL, NULL);
  titleized = rb_str_new2(utf8);

  g_free(ucs4);
  g_free(utf8);

  return titleized;
}
void TextEdit::setFlags(int new_flags, bool revalidate) { if (new_flags == flags) return; flags = new_flags; if (flags && revalidate) { bool valid = true; const char *p = getTextStart(); while (p < bufend - 1) { gunichar uc = g_utf8_get_char(p); if ((flags & FLAG_ALPHABETIC) && !g_unichar_isalpha(uc)) { valid = false; break; } if ((flags & FLAG_NUMERIC) && !g_unichar_isdigit(uc)) { valid = false; break; } if ((flags & FLAG_NOSPACE) && g_unichar_isspace(uc)) { valid = false; break; } if ((flags & FLAG_NOPUNCTUATION) && g_unichar_ispunct(uc)) { valid = false; break; } p = nextChar(p); } if (!valid) clear(); } }
/*
 * Tell whether the first UTF-8 character of TEXT is punctuation.
 *
 * The character is decoded with g_utf8_get_char_validated() and the decoded
 * value is handed straight to g_unichar_ispunct(), whose result is returned.
 */
static int
str_utf8_ispunct (const char *text)
{
    return g_unichar_ispunct (g_utf8_get_char_validated (text, -1));
}
/*
 * Decide how the character at *cur is escaped and, when it needs special
 * treatment, append its replacement to gstr.
 *
 * @param gstr        output string that receives the escaped text
 * @param cur         in/out pointer to the current position in the input;
 *                    advanced by one character when a ".." range operator
 *                    is consumed
 * @param uc          the Unicode character decoded at *cur
 * @param query_esc   when TRUE, the characters : ( ) * & " are copied
 *                    through unchanged, and ".." may be kept (see below)
 * @param range_field when TRUE (and query_esc is TRUE), a ".." that is not
 *                    followed by a third '.' is copied through as-is
 *
 * @return TRUE when something was appended to gstr for this character,
 *         FALSE when the caller still has to handle the character itself.
 */
static gboolean
handle_esc_maybe (GString *gstr, char **cur, gunichar uc,
		  gboolean query_esc, gboolean range_field)
{
	char kar;

	kar = (*cur)[0];

	if (query_esc) {
		switch (kar) {
		case ':':
		case '(':
		case ')':
		case '*':
		case '&':
		case '"':
			g_string_append_c (gstr, kar);
			return TRUE;
		case '.':
			if (!range_field)
				break;
			if ((*cur)[1] == '.' && (*cur)[2] != '.') {
				g_string_append (gstr, "..");
				/* Skip the first dot here.
				 * NOTE(review): presumably the caller steps
				 * over the second one -- confirm. */
				*cur = g_utf8_next_char (*cur);
				return TRUE;
			}
			/* A lone '.' (or "...") gets the generic handling
			 * below. */
			break;
		default:
			break;
		}
	}

	/* <ctype.h> functions require a value representable as unsigned char
	 * (or EOF); bytes of multi-byte UTF-8 sequences are negative where
	 * plain char is signed, so convert before calling isblank(). */
	if (g_unichar_ispunct (uc) || isblank ((unsigned char)kar)) {
		g_string_append_c (gstr, '_');
		return TRUE;
	}

	return FALSE;
}
/*
 * Extract the next token from `buf`, scanning forward from `*cur`.
 *
 * buf             - text to tokenize; FALSE is returned when it is NULL
 * cur             - in/out scan position; a NULL `*cur` means "start at
 *                   buf->begin"; updated to the position where scanning
 *                   stopped (or past an exception) when a token is returned
 * token           - output; `begin`/`len` describe the token found
 * exceptions      - optional in/out list of `struct process_exception`; the
 *                   head element is consulted and the list is advanced when
 *                   that exception is consumed
 * is_utf          - must be TRUE (asserted)
 * rl              - optional output, receives the `processed` counter
 * check_signature - enables the '_' / '-' signature line detection
 *
 * Returns TRUE when `token` has been filled in. Returns FALSE when `buf` is
 * NULL, when `*cur` is at or past the end of `buf`, when the character at
 * the scan position would extend past the remaining bytes, or when a
 * signature line is detected.
 */
static gboolean
rspamd_tokenizer_get_word (rspamd_ftok_t * buf,
		gchar const **cur, rspamd_ftok_t * token,
		GList **exceptions, gboolean is_utf, gsize *rl,
		gboolean check_signature)
{
	gsize remain, pos, siglen = 0;
	const gchar *p, *next_p, *sig = NULL;
	gunichar uc;
	guint processed = 0;
	struct process_exception *ex = NULL;
	enum {
		skip_delimiters = 0,
		feed_token,
		skip_exception,
		process_signature
	} state = skip_delimiters;

	if (buf == NULL) {
		return FALSE;
	}

	/* Only the first pending exception is considered during this call */
	if (exceptions != NULL && *exceptions != NULL) {
		ex = (*exceptions)->data;
	}

	g_assert (is_utf);
	g_assert (cur != NULL);

	if (*cur == NULL) {
		*cur = buf->begin;
	}

	/* A zero length marks "no explicit token yet"; see set_token below */
	token->len = 0;

	pos = *cur - buf->begin;
	if (pos >= buf->len) {
		return FALSE;
	}

	remain = buf->len - pos;
	p = *cur;
	token->begin = p;

	/*
	 * State machine over UTF-8 characters. `p` and `remain` are advanced only
	 * at the bottom of the loop, so every `continue` below re-runs the switch
	 * on the same character in the newly selected state.
	 */
	while (remain > 0) {
		uc = g_utf8_get_char (p);
		next_p = g_utf8_next_char (p);

		/* The character would run past the end of the buffer */
		if (next_p - p > (gint)remain) {
			return FALSE;
		}

		switch (state) {
		case skip_delimiters:
			if (ex != NULL && p - buf->begin == (gint)ex->pos) {
				/* At an exception: emit the placeholder literal as token */
				token->begin = "!!EX!!";
				token->len = sizeof ("!!EX!!") - 1;
				processed = token->len;
				state = skip_exception;
				continue;
			}
			else if (g_unichar_isgraph (uc)) {
				if (!g_unichar_ispunct (uc)) {
					/* First character of a word */
					state = feed_token;
					token->begin = p;
					continue;
				}
				else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
					/* Possible signature separator; `pos` is the offset
					 * this call started from, so this is skipped when the
					 * call began at the start of the buffer */
					sig = p;
					siglen = remain;
					state = process_signature;
					continue;
				}
			}
			break;
		case feed_token:
			/* The word ends at an exception, a non-graphical character or
			 * punctuation */
			if (ex != NULL && p - buf->begin == (gint)ex->pos) {
				goto set_token;
			}
			else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) {
				goto set_token;
			}
			processed ++;
			break;
		case skip_exception:
			/* Move the caller's cursor past the exception and drop it from
			 * the list; token->len is already non-zero here, so set_token
			 * will not overwrite *cur */
			*cur = p + ex->len;
			*exceptions = g_list_next (*exceptions);
			goto set_token;
			break;
		case process_signature:
			if (*p == '\r' || *p == '\n') {
				/* Only ' ', '-' and '_' were seen up to the line end */
				msg_debug ("signature found: %*s", (gint)siglen, sig);
				return FALSE;
			}
			else if (*p != ' ' && *p != '-' && *p != '_') {
				/* Not a signature after all: resume normal scanning */
				state = skip_delimiters;
				continue;
			}
			break;
		}

		remain -= next_p - p;
		p = next_p;
	}

	/* Reached by goto, or by falling out of the loop at the end of input */
set_token:
	if (rl) {
		*rl = processed;
	}

	if (token->len == 0) {
		/* No placeholder was set: the token spans from token->begin to the
		 * position where scanning stopped */
		token->len = p - token->begin;
		g_assert (token->len > 0);
		*cur = p;
	}

	return TRUE;
}
/* Returns TRUE when `c` separates words, i.e. it is a whitespace or a
 * punctuation character; FALSE otherwise. */
static gboolean is_word_sep(gunichar c)
{
    if (g_unichar_isspace(c))
        return TRUE;

    return g_unichar_ispunct(c) ? TRUE : FALSE;
}
/*
 * Compare two UTF-8 strings according to `options`.
 *
 * s1, s2  - strings to compare; NULL and "" are treated alike and sort
 *           before any non-empty string
 * options - bit flags (DONNA_SORT_*) selecting dot handling, skipping of
 *           spaces/punctuation, natural (numeric) ordering and case
 *           insensitivity
 *
 * Returns -1 if s1 sorts first, 1 if s2 sorts first, 0 if they compare
 * equal. When the main comparison finds no difference, a recorded fallback
 * (case-sensitive difference, or number of leading zeros) is returned
 * instead.
 */
gint
donna_strcmp (const gchar *s1, const gchar *s2, DonnaSortOptions options)
{
    gboolean is_string = TRUE;
    gint res_fb = 0; /* fallback */
    gint res_cs = 0; /* case-sensitive */
    gint res = 0;

    /* if at least one string is NULL or empty, we have a result */
    if (!s1 || *s1 == '\0')
    {
        if (s2 && *s2 != '\0')
            return -1;
        else
            return 0;
    }
    else if (!s2 || *s2 == '\0')
        return 1;

    if (options & DONNA_SORT_DOT_FIRST)
    {
        if (*s1 == '.')
        {
            if (*s2 != '.')
                /* only s1 is dotted, it comes first */
                return -1;
            else
            {
                /* both are dotted, skip the dot */
                ++s1;
                ++s2;
            }
        }
        else if (*s2 == '.')
            /* only s2 is dotted, it comes first */
            return 1;
    }
    else if (options & DONNA_SORT_DOT_MIXED)
    {
        /* a leading dot is ignored on either side */
        if (*s1 == '.')
            ++s1;
        if (*s2 == '.')
            ++s2;
    }

    /* each iteration either compares one character from each string (string
     * mode) or one whole run of digits from each string (number mode) */
    for (;;)
    {
        gunichar c1, c2;

        /* is at least one string over? */
        if (!*s1)
        {
            if (!*s2)
                res = 0;
            else
                /* shorter first */
                res = -1;
            goto done;
        }
        else if (!*s2)
        {
            /* shorter first */
            res = 1;
            goto done;
        }

        c1 = g_utf8_get_char (s1);
        c2 = g_utf8_get_char (s2);

        if (is_string)
        {
            if (options & DONNA_SORT_IGNORE_SPUNCT)
            {
                /* skip spaces & punctuation; c is set to 0 at the end of the
                 * string, which ends the loop */
                while (g_unichar_isspace (c1) || g_unichar_ispunct (c1))
                {
                    s1 = g_utf8_next_char (s1);
                    c1 = (*s1) ? g_utf8_get_char (s1) : 0;
                }
                while (g_unichar_isspace (c2) || g_unichar_ispunct (c2))
                {
                    s2 = g_utf8_next_char (s2);
                    c2 = (*s2) ? g_utf8_get_char (s2) : 0;
                }
                /* did we reach the end of a string? if so, let the
                 * end-of-string check at the top of the loop decide */
                if (!*s1 || !*s2)
                    continue;
            }

            /* is at least one string a number? */
            if (g_unichar_isdigit (c1))
            {
                if (g_unichar_isdigit (c2))
                {
                    if (options & DONNA_SORT_NATURAL_ORDER)
                    {
                        /* switch to number comparison */
                        is_string = FALSE;
                        continue;
                    }
                    /* without natural order, digits are compared like any
                     * other character below */
                }
                else
                {
                    /* number first */
                    res = -1;
                    goto done;
                }
            }
            else if (g_unichar_isdigit (c2))
            {
                /* number first */
                res = 1;
                goto done;
            }

            /* compare chars */
            if (c1 > c2)
                res_cs = 1;
            else if (c1 < c2)
                res_cs = -1;

            if (options & DONNA_SORT_CASE_INSENSITIVE)
            {
                /* compare upper chars */
                c1 = g_unichar_toupper (c1);
                c2 = g_unichar_toupper (c2);

                if (c1 > c2)
                {
                    res = 1;
                    goto done;
                }
                else if (c1 < c2)
                {
                    res = -1;
                    goto done;
                }
                else if (res_fb == 0)
                    /* set the case-sensitive result in case strings end up
                     * being the same otherwise */
                    res_fb = res_cs;
            }
            /* do we have a res_cs yet? */
            else if (res_cs != 0)
            {
                res = res_cs;
                goto done;
            }

            /* next chars */
            s1 = g_utf8_next_char (s1);
            s2 = g_utf8_next_char (s2);
        }
        /* mode number */
        else
        {
            unsigned long n1, n2;

            /* leading zeros are only counted while no fallback has been
             * recorded yet */
            if (res_fb == 0)
            {
                /* count number of leading zeros */
                for (n1 = 0; *s1 == '0'; ++n1, ++s1)
                    ;
                for (n2 = 0; *s2 == '0'; ++n2, ++s2)
                    ;

                /* try to set a fallback to put less leading zeros first */
                if (n1 > n2)
                    res_fb = 1;
                else if (n1 < n2)
                    res_fb = -1;

                /* zeros were skipped: reload the current character */
                if (n1 > 0)
                    c1 = g_utf8_get_char (s1);
                if (n2 > 0)
                    c2 = g_utf8_get_char (s2);
            }

            /* accumulate the digit run of s1 (no overflow check is done on
             * the unsigned long accumulator) */
            n1 = 0;
            while (g_unichar_isdigit (c1))
            {
                int d;

                d = g_unichar_digit_value (c1);
                n1 *= 10;
                n1 += (unsigned long) d;
                s1 = g_utf8_next_char (s1);
                if (*s1)
                    c1 = g_utf8_get_char (s1);
                else
                    break;
            }

            /* same for s2 */
            n2 = 0;
            while (g_unichar_isdigit (c2))
            {
                int d;

                d = g_unichar_digit_value (c2);
                n2 *= 10;
                n2 += (unsigned long) d;
                s2 = g_utf8_next_char (s2);
                if (*s2)
                    c2 = g_utf8_get_char (s2);
                else
                    break;
            }

            if (n1 > n2)
            {
                res = 1;
                goto done;
            }
            else if (n1 < n2)
            {
                res = -1;
                goto done;
            }

            /* back to string comparison */
            is_string = TRUE;
        }
    }

done:
    /* no difference found: use the fallback, if any */
    return (res != 0) ? res : res_fb;
}
/* Returns TRUE when `c` separates words: any whitespace or punctuation
 * character except the apostrophe, which is treated as part of a word. */
static gboolean is_word_sep(gunichar c)
{
    if (c == (gunichar)'\'')
        return FALSE;

    if (g_unichar_isspace(c))
        return TRUE;

    return g_unichar_ispunct(c) ? TRUE : FALSE;
}
//! //! @brief Analyzes a sentence for misspellings, positions, and stem forms of words //! GList* lw_morphologyengine_hunspell_analyze (LwMorphologyEngine *engine, const gchar *TEXT, gboolean include_spellcheck) { //Sanity checks if (engine == NULL) return NULL; if (engine->hunspell == NULL) return NULL; if (TEXT == NULL) return NULL; //Declations gint start_offset = 0, end_offset = 0; GMatchInfo *match_info = NULL; GList *list = NULL; gchar *word = NULL; LwMorphology *morphology = NULL; //Initializations gchar *shortened = lw_regex_remove_parenthesis (TEXT); //Body lw_regex_get_contiguous (shortened, &match_info); while (g_match_info_matches (match_info)) { word = g_match_info_fetch (match_info, 0); if (word != NULL && !g_unichar_ispunct (g_utf8_get_char (word)) && !lw_util_string_has_japanese (word)) { g_match_info_fetch_pos (match_info, 0, &start_offset, &end_offset); gchar *normalized = NULL, *stem = NULL, *canonical = NULL, *spellcheck = NULL; //Generate the forms normalized = lw_util_normalize_string (word, TRUE, FALSE); stem = lw_morphologyengine_hunspell_stem (engine, word); if (stem != NULL) canonical = lw_util_normalize_string (stem, TRUE, FALSE); //You don't want to case fold before hunspell works if (include_spellcheck) spellcheck = lw_morphologyengine_hunspell_spellcheck (engine, word); //Cleanup identicals if (normalized != NULL) { if (strcmp(normalized, word) == 0) { g_free (normalized); normalized = NULL; } } if (stem != NULL) { if (strcmp(stem, word) == 0) { g_free (stem); stem = NULL; } if (canonical != NULL && strcmp(stem, canonical) == 0) { g_free (canonical); canonical = NULL; } //Canonical is built on stem } morphology = lw_morphology_new ( word, normalized, stem, canonical, spellcheck, NULL, start_offset, end_offset ); if (morphology != NULL) { list = g_list_append (list, morphology); morphology = NULL; } word = NULL; } else if (word != NULL) { g_free (word); word = NULL; } g_match_info_next (match_info, NULL); } errored: if (match_info != 
NULL) g_match_info_free (match_info); match_info = NULL; if (shortened != NULL) g_free(shortened); shortened = NULL; if (morphology != NULL) lw_morphology_free (morphology); morphology = NULL; if (word != NULL) g_free (word); word = NULL; return list; }