/*
 * Checks the leading word of @line: walks character by character until the
 * end of the string or until a byte accepted by g_unichar_isspace() is met,
 * and returns FALSE as soon as isKanjiChar() rejects a character.  Returns
 * TRUE otherwise (including when nothing was examined).
 *
 * Note that the whitespace test is applied to the raw byte at the cursor,
 * not to the decoded code point.
 */
gboolean
is_kanji_only(gchar *line)
{
	const gchar *stop = line + strlen(line);
	gchar *cursor;

	for (cursor = line; cursor != stop; cursor = g_utf8_next_char(cursor)) {
		/* first space ends the word */
		if (g_unichar_isspace(*cursor))
			break;
		if (!isKanjiChar(g_utf8_get_char(cursor)))
			return FALSE;
	}
	return TRUE;
}
/**
 * g_utf8_strncpy:
 * @dest: buffer that receives the copied characters
 * @src: UTF-8 encoded string
 * @n: number of characters (not bytes) to copy
 *
 * Character-counting counterpart of strncpy(): copies at most @n UTF-8
 * characters from @src into @dest and always writes a terminating NUL
 * after them. @src must be valid UTF-8 (run g_utf8_validate() on
 * untrusted text first); @dest must be large enough for the copied
 * bytes plus the terminator.
 *
 * Return value: @dest
 **/
gchar *
g_utf8_strncpy (gchar *dest, const gchar *src, gsize n)
{
  const gchar *stop;
  gsize remaining;

  /* Locate the byte position just after the n-th character (or the NUL). */
  for (stop = src, remaining = n; remaining != 0 && *stop != '\0'; remaining--)
    stop = g_utf8_next_char (stop);

  strncpy (dest, src, stop - src);
  dest[stop - src] = 0;

  return dest;
}
/*
 * Counts the words in @str: the start of the string counts as one word and
 * every ' ' found by g_utf8_strchr() after the previously counted position
 * counts as one more.  A NULL @str yields 0; an empty string yields 1.
 */
static gint
e_name_western_str_count_words (const gchar *str)
{
	gint word_count;
	const gchar *p;

	word_count = 0;

	for (p = str; p != NULL; p = g_utf8_strchr (p, -1, ' ')) {
		word_count++;

		/*
		 * Move past the character just counted so the next search
		 * does not find it again.  Never step over the terminating
		 * NUL (only possible for an empty string): doing so would
		 * make g_utf8_strchr() scan memory beyond the string.
		 */
		if (*p != '\0')
			p = g_utf8_next_char (p);
	}

	return word_count;
}
/* Utility function that skips characters in tool->cheked until tool->ident is
 * greater than or equal to to_ident, or the end of the text is reached.
 * Combining marks add nothing to ident, wide characters add 2, everything
 * else adds 1.  Any combining marks directly following the stop position are
 * skipped as well.  Always returns 1. */
static int
utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
{
    for (; to_ident > tool->ident && tool->cheked[0] != '\0';
         tool->cheked = g_utf8_next_char (tool->cheked))
    {
        gunichar ch = g_utf8_get_char (tool->cheked);

        if (str_unichar_iscombiningmark (ch))
            continue;

        tool->ident += g_unichar_iswide (ch) ? 2 : 1;
    }

    /* swallow trailing combining marks belonging to the last character */
    while (str_unichar_iscombiningmark (g_utf8_get_char (tool->cheked)))
        tool->cheked = g_utf8_next_char (tool->cheked);

    return 1;
}
/* skip forward until we hit a character in @s, CRLF, or \0. leave *p pointing at the character that causes us to stop */ static void skip_until (char **p, char *s) { char *lp; lp = *p; while (*lp != '\r' && *lp != '\0') { gboolean s_matches = FALSE; char *ls; for (ls = s; *ls; ls = g_utf8_next_char (ls)) { if (g_utf8_get_char (ls) == g_utf8_get_char (lp)) { s_matches = TRUE; break; } } if (s_matches) break; lp = g_utf8_next_char (lp); } *p = lp; }
/**
 * g_utf8_pointer_to_offset:
 * @str: a UTF-8 encoded string
 * @pos: a pointer to a position within @str
 *
 * Translates a byte pointer inside @str into a character offset by
 * counting the characters between @str and @pos. When @pos does not lie
 * after @str the result is 0.
 *
 * Return value: the resulting character offset
 **/
glong
g_utf8_pointer_to_offset (const gchar *str, const gchar *pos)
{
  const gchar *cursor;
  glong count = 0;

  for (cursor = str; cursor < pos; cursor = g_utf8_next_char (cursor))
    count++;

  return count;
}
/*
 * Normalizes the text in [start, end), replaces every Turkish dotless "ı"
 * and dotted "İ" with a plain 'i', case-folds the result and appends the
 * newly allocated folded string to @array.
 *
 * If the text cannot be normalized (g_utf8_normalize() returns NULL for
 * input that is not valid UTF-8) nothing is added to @array.
 */
static void
desktop_file_index_text_index_add_folded (GPtrArray   *array,
                                          const gchar *start,
                                          const gchar *end)
{
  gchar *normal;

  normal = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL_COMPOSE);

  /* Invalid UTF-8: there is nothing sensible to index, and strstr() below
   * must not be handed a NULL pointer.
   */
  if (normal == NULL)
    return;

  /* TODO: Invent time machine.  Converse with Mustafa Ataturk... */
  if (strstr (normal, "ı") || strstr (normal, "İ"))
    {
      gchar *s = normal;
      GString *tmp;

      tmp = g_string_new (NULL);

      while (*s)
        {
          gchar *i, *I, *e;

          i = strstr (s, "ı");
          I = strstr (s, "İ");

          /* e is the earliest remaining occurrence of either letter */
          if (!i && !I)
            break;
          else if (i && !I)
            e = i;
          else if (I && !i)
            e = I;
          else if (i < I)
            e = i;
          else
            e = I;

          /* copy the text before it, substitute 'i', continue after it */
          g_string_append_len (tmp, s, e - s);
          g_string_append_c (tmp, 'i');
          s = g_utf8_next_char (e);
        }

      /* whatever follows the last replaced letter */
      g_string_append (tmp, s);
      g_free (normal);
      normal = g_string_free (tmp, FALSE);
    }

  g_ptr_array_add (array, g_utf8_casefold (normal, -1));
  g_free (normal);
}
/*
 * Fills @t from the JSON tweet object @obj.
 *
 * The text is taken from the "extended_tweet" sub-object when present
 * (otherwise from @obj itself), preferring "full_text" over "text".  When a
 * "display_text_range" array exists, its first element is the number of
 * leading characters that are dropped from the stored text; that number is
 * also stored in t->display_range_start.  The id, creation time (as a unix
 * timestamp) and author are read from @obj.
 */
void
cb_mini_tweet_parse (CbMiniTweet *t,
                     JsonObject  *obj)
{
  GDateTime *created;
  JsonObject *extended_object;
  const char *tweet_text;

  if (json_object_has_member (obj, "extended_tweet"))
    extended_object = json_object_get_object_member (obj, "extended_tweet");
  else
    extended_object = obj;

  created = cb_utils_parse_date (json_object_get_string_member (obj, "created_at"));

  t->id = json_object_get_int_member (obj, "id");

  if (json_object_has_member (extended_object, "full_text"))
    tweet_text = json_object_get_string_member (extended_object, "full_text");
  else
    tweet_text = json_object_get_string_member (extended_object, "text");

  if (json_object_has_member (extended_object, "display_text_range"))
    {
      /* We only remove the prefix */
      guint start = (guint)json_array_get_int_element (
                              json_object_get_array_member (extended_object, "display_text_range"),
                              0);
      guint i;
      const char *p = tweet_text;

      /* Skip ahead, but never beyond the end of the text: the range comes
       * from the server and may be larger than the text really is. */
      for (i = 0; i < start && p != NULL && *p != '\0'; i ++)
        p = g_utf8_next_char (p);

      t->text = g_strdup (p);
      t->display_range_start = start;
    }
  else
    {
      t->text = g_strdup (tweet_text);
      t->display_range_start = 0;
    }

  t->created_at = g_date_time_to_unix (created);
  cb_user_identity_parse (&t->author, json_object_get_object_member (obj, "user"));

  g_date_time_unref (created);
}
/// @todo should g_unichar_iszerowidth be used? int onscreen_width(const char *start, const char *end) { int width = 0; if (!start) return 0; if (!end) end = start + strlen(start); while (start < end) { width += onscreen_width(g_utf8_get_char(start)); start = g_utf8_next_char(start); } return width; }
/* Returns TRUE when @text is NULL, empty, or consists solely of characters
 * for which g_unichar_isspace() is true. */
static gboolean
utf8_only_spaces (const char *text)
{
	const char *cursor = text;

	if (cursor == NULL)
		return TRUE;

	while (*cursor != 0) {
		if (! g_unichar_isspace (g_utf8_get_char (cursor)))
			return FALSE;
		cursor = g_utf8_next_char (cursor);
	}

	return TRUE;
}
/* Returns TRUE if @str contains at least one character for which
 * g_unichar_isupper() is true; FALSE for NULL or empty strings. */
static gboolean
str_has_uppercase (const char *str)
{
	const char *cursor;

	if (str == NULL)
		return FALSE;

	for (cursor = str; *cursor != '\0'; cursor = g_utf8_next_char (cursor)) {
		if (g_unichar_isupper (g_utf8_get_char (cursor)))
			return TRUE;
	}

	return FALSE;
}
/* Benchmark body: inside the GRIND_LOOP_BEGIN/GRIND_LOOP_END region, decodes
 * every character of @str with g_utf8_get_char_validated() and accumulates
 * the results so the work is observable.  @len is not used here. */
static int
grind_get_char_validated (const char *str, gsize len)
{
  gunichar acc = 0;

  GRIND_LOOP_BEGIN
    {
      const char *cursor;

      for (cursor = str; *cursor; cursor = g_utf8_next_char (cursor))
        acc += g_utf8_get_char_validated (cursor, -1);
    }
  GRIND_LOOP_END;

  return acc;
}
/* Inserts @str into @entry at the current cursor position, converting it to
   the entry's internal unichar representation (UTF-8, Big5 or one byte per
   character depending on entry->utf8 and term_type), then moves the cursor
   past the inserted text and redraws. For UTF-8 entries only the leading
   valid part of @str (as reported by g_utf8_validate()) is inserted. */
void gui_entry_insert_text(GUI_ENTRY_REC *entry, const char *str)
{
	const char *cursor;
	unichar saved;
	int idx, count;

	g_return_if_fail(entry != NULL);
	g_return_if_fail(str != NULL);

	gui_entry_redraw_from(entry, entry->pos);

	/* how many unichar cells the new text occupies */
	if (entry->utf8) {
		g_utf8_validate(str, -1, &cursor);
		count = g_utf8_pointer_to_offset(str, cursor);
	} else if (term_type == TERM_TYPE_BIG5) {
		count = strlen_big5((const unsigned char *)str);
	} else {
		count = strlen(str);
	}
	entry_text_grow(entry, count);

	/* shift the tail (including its terminator) to open a gap */
	g_memmove(entry->text + entry->pos + count, entry->text + entry->pos,
		  (entry->text_len-entry->pos + 1) * sizeof(unichar));

	/* fill the gap */
	if (entry->utf8) {
		cursor = str;
		for (idx = 0; idx < count; idx++) {
			entry->text[entry->pos + idx] = g_utf8_get_char(cursor);
			cursor = g_utf8_next_char(cursor);
		}
	} else if (term_type == TERM_TYPE_BIG5) {
		/* keep the cell just after the gap intact across the
		   conversion and restore it afterwards */
		saved = entry->text[entry->pos + count];
		big5_to_unichars(str, entry->text + entry->pos);
		entry->text[entry->pos + count] = saved;
	} else {
		for (idx = 0; idx < count; idx++)
			entry->text[entry->pos + idx] = str[idx];
	}

	entry->text_len += count;
	entry->pos += count;

	gui_entry_fix_cursor(entry);
	gui_entry_draw(entry);
}
/*
 * Determines name->first and idxs->first_idx.
 *
 * With a prefix present, the first name starts at the first non-space
 * character after the prefix; if only white space (or nothing) follows the
 * prefix, nothing is assigned.  Without a prefix, the first name is taken
 * from the very beginning of the full name.  A candidate that
 * e_name_western_is_complex_last_beginning() accepts is discarded again.
 */
static void
e_name_western_extract_first (ENameWestern *name,
                              ENameWesternIdxs *idxs)
{
	if (idxs->prefix_idx == -1) {
		/* No prefix: the first name is probably the first string. */
		idxs->first_idx = 0;
	} else {
		gchar *cursor;

		/* Start right after the prefix and skip past white space. */
		cursor = name->full + idxs->prefix_idx + strlen (name->prefix);
		while (*cursor != '\0' &&
		       g_unichar_isspace (g_utf8_get_char (cursor)))
			cursor = g_utf8_next_char (cursor);

		if (*cursor == '\0')
			return;

		idxs->first_idx = cursor - name->full;
	}

	name->first = e_name_western_get_words_at_idx (
		name->full, idxs->first_idx, 1);

	if (name->first == NULL)
		return;

	/*
	 * Make sure the beginning of a compound last name was not
	 * mistaken for the first name.
	 */
	if (e_name_western_is_complex_last_beginning (name->first)) {
		g_free (name->first);
		name->first = NULL;
		idxs->first_idx = -1;
	}
}
void convert(const char *filename) { struct stat stats; if (stat (filename, &stats) == -1) { printf("File not find.\n"); return; } FILE *mbfile; mbfile = fopen(filename, "r"); char *buffer = (char *)malloc (stats.st_size + 1); size_t fread_size; fread_size = fread (buffer, 1, stats.st_size, mbfile); if (fread_size != (size_t)stats.st_size) { g_print("fread error!\n"); } fclose (mbfile); buffer[stats.st_size] = '\0'; std::string newfilename = filename; newfilename += ".babylon"; FILE *babylonfile = fopen(newfilename.c_str(), "w"); char *p = strstr(buffer, "[Text]\r\n"); p += sizeof("[Text]\r\n") -1; char *p1, *p2; std::string hanzi, codes, synonyms; while (true) { p1 = strstr(p, "\r\n"); if (!p1) break; *p1 = '\0'; p2 = g_utf8_next_char(p); hanzi.assign(p, p2 -p); codes.assign(p2, p1 - p2); synonyms.clear(); int len = p1 - p2; for (int i = 0; i <len; i++) { if (codes[i] == ' ') synonyms += '|'; else synonyms += codes[i]; } fprintf(babylonfile, "%s|%s\n%s\n\n", hanzi.c_str(), synonyms.c_str(), codes.c_str()); p = p1 + 2; } fclose(babylonfile); free(buffer); printf("Write %s\n", newfilename.c_str()); }
/*
 * Unescapes @text with g_strcompress() (which also decodes octal character
 * representations) and returns a newly allocated string in which every
 * character that isvalidcharacter() rejects is re-escaped with
 * g_strescape().  Characters are treated as UTF-8 sequences when the LANG
 * environment variable ends in "UTF-8", otherwise as single bytes.
 *
 * The caller owns the returned string and must g_free() it.
 */
gchar* unescape_octal_values(gchar *text)
{
	GString *value = g_string_new("");

	/* LANG may be unset; g_str_has_suffix() must not be given NULL */
	const gchar *lang = getenv("LANG");
	gboolean utf8 = lang != NULL && g_str_has_suffix(lang, "UTF-8");

	gchar *unescaped = g_strcompress(text);

	gchar *pos = unescaped;
	while (*pos)
	{
		if (isvalidcharacter(pos, utf8))
		{
			if (utf8)
			{
				/* valid utf8 character, copy to output
				 and move to the next character */
				gchar *next = g_utf8_next_char(pos);
				g_string_append_len(value, pos, next - pos);
				pos = next;
			}
			else
			{
				g_string_append_len(value, pos++, 1);
			}
		}
		else
		{
			/* not a valid character, convert it to its octal representation
			 and append to the result string */
			gchar *invalid = g_strndup(pos, 1);
			gchar *escaped = g_strescape(invalid, NULL);

			g_string_append(value, escaped);

			g_free(escaped);
			g_free(invalid);

			pos += 1;
		}
	}

	/* g_strcompress() returned a fresh copy that is ours to release */
	g_free(unescaped);

	return g_string_free (value, FALSE);
}
/* Skips a line folding starting at @str and returns the position after it,
 * or a position derived as described below when no complete folding is found.
 *
 * Quoted-printable mode, @str at '=': an '=' followed by a newline
 * character is swallowed together with that newline; a second, different
 * newline character (CRLF / LFCR pair) is swallowed too, plus one following
 * space or tab.  An '=' not followed by a newline is left alone.
 *
 * Otherwise, @str at a newline character: a folding is a newline (or a
 * CRLF / LFCR pair) directly followed by a space or tab; the returned
 * pointer is just past that whitespace.  A newline without the following
 * whitespace is not skipped and @str is returned unchanged.
 */
static char*
skip_newline (char *str, gboolean quoted_printable)
{
	char *first;
	char *second;
	char *third;

	if (quoted_printable && *str == '=') {
		first = g_utf8_next_char (str);
		if (*first != '\r' && *first != '\n')
			return str;

		/* swallow equal and newline */
		second = g_utf8_next_char (first);
		if ((*second != '\r' && *second != '\n') || *second == *first)
			return second;

		/* swallow second newline */
		third = g_utf8_next_char (second);
		if (*third == ' ' || *third == '\t')
			return g_utf8_next_char (third); /* swallow whitespace */

		return third;
	}

	if (*str != '\r' && *str != '\n')
		return str;

	first = g_utf8_next_char (str);

	if ((*first == '\n' || *first == '\r') && *str != *first) {
		/* two-character newline: folding only if whitespace follows */
		second = g_utf8_next_char (first);
		if (*second == ' ' || *second == '\t')
			return g_utf8_next_char (second);
		return str;
	}

	/* single newline directly followed by whitespace is a folding */
	if (*first == ' ' || *first == '\t')
		return g_utf8_next_char (first);

	return str;
}
gboolean empathy_spell_check (const gchar *word) { gint enchant_result = 1; const gchar *p; gboolean digit; gunichar c; gint len; GList *l; g_return_val_if_fail (word != NULL, FALSE); spell_setup_languages (); if (!languages) { DEBUG ("No languages to check against"); return TRUE; } /* Ignore certain cases like numbers, etc. */ for (p = word, digit = TRUE; *p && digit; p = g_utf8_next_char (p)) { c = g_utf8_get_char (p); digit = g_unichar_isdigit (c); } if (digit) { /* We don't spell check digits. */ DEBUG ("Not spell checking word:'%s', it is all digits", word); return TRUE; } len = strlen (word); for (l = languages; l; l = l->next) { SpellLanguage *lang; lang = l->data; enchant_result = enchant_dict_check (lang->speller, word, len); if (enchant_result == 0) { break; } } return (enchant_result == 0); }
/** * This function escapes an unsanitized input (e.g. that can contain binary * characters, and produces an escaped format that can be deescaped in need, * which is guaranteed to be utf8 clean. The major difference between * "binary" and "text" form is that the receiver is able to cope with \xXX * sequences that can incorporate invalid utf8 sequences when decoded. With * "text" format, we never embed anything that would become not valid utf8 * when decoded. * * Here are the rules that the routine follows: * - well-known control characters are escaped (0x0a as \n and so on) * - other control characters as per control_format * - backslash is escaped as \\ * - any additional characters (only ASCII is supported) as \<char> * - invalid utf8 sequences are converted as per invalid_format * - utf8 characters are reproduced as is */ static gsize _append_escaped_utf8_character(GString *escaped_output, const gchar **raw, gssize raw_len, const gchar *unsafe_chars, const gchar *control_format, const gchar *invalid_format) { const gchar *char_ptr = *raw; gunichar uchar = g_utf8_get_char_validated(char_ptr, raw_len); switch (uchar) { case (gunichar) -1: case (gunichar) -2: g_string_append_printf(escaped_output, invalid_format, *(guint8 *) char_ptr); (*raw)++; return 1; break; case '\b': g_string_append(escaped_output, "\\b"); break; case '\f': g_string_append(escaped_output, "\\f"); break; case '\n': g_string_append(escaped_output, "\\n"); break; case '\r': g_string_append(escaped_output, "\\r"); break; case '\t': g_string_append(escaped_output, "\\t"); break; case '\\': g_string_append(escaped_output, "\\\\"); break; default: if (uchar < 32) g_string_append_printf(escaped_output, control_format, uchar); else if (_is_character_unsafe(uchar, unsafe_chars)) g_string_append_printf(escaped_output, "\\%c", (gchar) uchar); else _append_unichar(escaped_output, uchar); break; } *raw = g_utf8_next_char(char_ptr); return *raw - char_ptr; }
/* Returns TRUE when each of the @text_len bytes starting at @text is ASCII
 * white space according to g_ascii_isspace() (trivially TRUE for a length
 * of zero). */
static gboolean
all_whitespace (const char *text,
                gint        text_len)
{
  const char *stop = text + text_len;
  const char *cursor;

  for (cursor = text; cursor != stop; cursor = g_utf8_next_char (cursor))
    {
      if (! g_ascii_isspace (*cursor))
        return FALSE;
    }

  return TRUE;
}
/* Works like g_utf8_offset_to_pointer(), with one difference: each character
 * passed over consumes the length of its canonical decomposition from
 * @offset rather than 1. This makes it suitable when @offset was computed on
 * the normalized form of @str but a pointer into @str itself is required.
 */
static const gchar *
pointer_from_offset_skipping_decomp (const gchar *str, gint offset)
{
	const gchar *cursor;

	for (cursor = str; offset > 0; cursor = g_utf8_next_char (cursor))
	{
		gsize decomp_len;
		gunichar *decomp;

		/* only the length of the decomposition is of interest */
		decomp = g_unicode_canonical_decomposition (g_utf8_get_char (cursor),
							    &decomp_len);
		g_free (decomp);

		offset -= decomp_len;
	}

	return cursor;
}
/*
 * Handles the insertion of @text at character position *@virt_pos of the
 * entry's current text: builds the would-be text, un-formats it, checks its
 * validity and, if valid, re-formats it and replaces the editable's content
 * starting at @offset.  *@virt_pos is advanced by @text_length and further
 * adjusted by the un-format/re-format helpers.
 *
 * Nothing is changed when *@virt_pos lies beyond the end of the current
 * text or when the resulting text is not valid.
 */
static void
gdaui_numeric_entry_assume_insert (GdauiEntry *entry, const gchar *text, gint text_length,
				   gint *virt_pos, gint offset)
{
	GdauiNumericEntry *fentry;
	gchar *otext, *ptr, *ntext;
	gint i;
	GString *string;
	gint olen, nlen;
	gboolean ends_with_sep;

	fentry = (GdauiNumericEntry*) entry;
	otext = gdaui_entry_get_text (GDAUI_ENTRY (entry));
	olen = strlen (otext);

	/* locate the byte position of character index *virt_pos */
	for (ptr = otext, i = 0; (i < *virt_pos) && *ptr; ptr = g_utf8_next_char (ptr), i++);
	if (i != *virt_pos) {
		/* position is past the end of the text; don't leak the copy */
		g_free (otext);
		return;
	}

	/* text before the insertion point + inserted text + text after it */
	string = g_string_new ("");
	g_string_append_len (string, otext, ptr - otext);
	g_string_append (string, text);
	g_string_append (string, ptr);
	g_free (otext);

	/*g_print ("RAW: [%s]", string->str);*/
	*virt_pos += text_length;
	text_unformat (fentry, string->str, virt_pos);
	/*g_print ("SANITIZED: [%s]", string->str);*/

	if (!test_text_validity (fentry, string->str)) {
		g_string_free (string, TRUE);
		/*g_print ("ERROR!\n");*/
		return;
	}

	/* an empty insertion has no last character to compare */
	ends_with_sep = (text_length > 0) &&
		(text[text_length-1] == fentry->priv->decimal_sep);
	ntext = text_reformat (fentry, string->str, virt_pos, ends_with_sep);
	g_string_free (string, TRUE);
	/*g_print ("NEW: [%s]\n", ntext);*/

	i = offset;
	nlen = strlen (ntext);
	gtk_editable_delete_text ((GtkEditable*) entry, offset, olen + offset);
	gtk_editable_insert_text ((GtkEditable*) entry, ntext, nlen, &i);
	g_free (ntext);
}
/* Repairs @text in place: every byte at which no valid UTF-8 character can
 * be decoded is overwritten with '?'; valid characters are left untouched. */
static void
str_utf8_fix_string (char *text)
{
    while (*text != '\0')
    {
        gunichar decoded = g_utf8_get_char_validated (text, -1);

        if (decoded == (gunichar) (-1) || decoded == (gunichar) (-2))
        {
            /* invalid or incomplete sequence: patch one byte and retry */
            *text++ = '?';
            continue;
        }

        text = g_utf8_next_char (text);
    }
}
/* * Find out how many columns an UTF-8 string occupies on the screen. */ int nmc_string_screen_width (const char *start, const char *end) { int width = 0; const char *p = start; if (end == NULL) end = start + strlen (start); while (p < end) { width += g_unichar_iswide (g_utf8_get_char (p)) ? 2 : g_unichar_iszerowidth (g_utf8_get_char (p)) ? 0 : 1; p = g_utf8_next_char (p); } /* Subtract color escape sequences as they don't occupy space. */ return width - nmc_count_color_escape_chars (start, NULL); }
/*
 * html_print_encoded:
 *
 * @output: the stream
 * @str: the string (NULL is ignored)
 *
 * Writes @str to @output character by character: a double quote is written
 * twice, '\n' becomes "<br />\n", '\r' becomes "<br />\r" (a directly
 * following '\n' is passed through as part of the same line break), and all
 * remaining characters are written out as they are.
 *
 * NOTE(review): '<', '>' and '&' are written through unchanged here, which
 * does not match the "encoded" in the name -- confirm whether entity
 * escaping was intended.
 */
static void
html_print_encoded (GsfOutput *output, char const *str)
{
	if (str == NULL)
		return;

	for (; *str != '\0' ; str = g_utf8_next_char (str)) {
		char const *replacement = NULL;
		gunichar c;

		switch (*str) {
		case '"':
			replacement = "\"\"";
			break;
		case '<':
			replacement = "<";
			break;
		case '>':
			replacement = ">";
			break;
		case '&':
			replacement = "&";
			break;
		case '\n':
			replacement = "<br />\n";
			break;
		default:
			break;
		}

		if (replacement != NULL) {
			gsf_output_puts (output, replacement);
			continue;
		}

		if (*str == '\r') {
			gsf_output_puts (output, "<br />\r");
			/* treat "\r\n" as one line break */
			if (str[1] == '\n') {
				gsf_output_puts (output, "\n");
				str++;
			}
			continue;
		}

		c = g_utf8_get_char (str);
		if (((c >= 0x20) && (c < 0x80)) ||
		    (c == '\n') || (c == '\r') || (c == '\t')) {
			/* single-byte character: copy the byte directly */
			gsf_output_write (output, 1, (guint8 *)str);
		} else {
			/* everything else is re-encoded as UTF-8 and written */
			gchar *encoded = g_ucs4_to_utf8 (&c, 1, NULL, NULL, NULL);
			gsf_output_puts (output, encoded);
			g_free (encoded);
		}
	}
}
/*
 * Inserts the text @str into the character tree rooted at @tree, creating
 * one child level per character as needed, and stores a new reference to
 * @smiley on the node reached by the last character.
 *
 * An empty @str is ignored: there is no character to insert, and stepping
 * over its terminating NUL would read beyond the string.
 */
static void
smiley_manager_tree_insert (SmileyManagerTree *tree,
			    GdkPixbuf         *smiley,
			    const gchar       *str)
{
	SmileyManagerTree *node = tree;

	if (*str == '\0') {
		return;
	}

	/* descend one level per character */
	while (*str != '\0') {
		node = smiley_manager_tree_find_or_insert_child (node, g_utf8_get_char (str));
		str = g_utf8_next_char (str);
	}

	node->pixbuf = g_object_ref (smiley);
}
/*
 * Returns a newly allocated, processed copy of @str: the text is normalized
 * (after being passed through mu_str_utf8ify() if it is not valid UTF-8),
 * marks are dropped, and characters are lower-cased unless @str was
 * recognized as a range field.  With @xapian_esc, each character is first
 * offered to handle_esc_maybe(), which takes care of it when it returns
 * TRUE.  With @query_esc, a message-id field is delegated entirely to
 * mu_str_process_msgid().
 *
 * Returns NULL when the text cannot be normalized.  The caller frees the
 * result.
 */
static char*
process_str (const char *str, gboolean xapian_esc, gboolean query_esc)
{
	GString *gstr;
	char *norm, *cur;
	gboolean is_field, is_range_field;

	norm = g_utf8_normalize (str, -1, G_NORMALIZE_ALL);
	if (G_UNLIKELY(!norm)) {  /* not valid utf8? */
		char *u8;
		u8 = mu_str_utf8ify (str);
		norm = g_utf8_normalize (u8, -1, G_NORMALIZE_ALL);
		g_free (u8);
	}

	if (!norm)
		return NULL;

	/* msg-id needs some special care in queries */
	if (query_esc && is_msgid_field (str)) {
		/* the normalized copy is not used on this path */
		g_free (norm);
		return mu_str_process_msgid (str, TRUE);
	}

	check_for_field (str, &is_field, &is_range_field);
	gstr = g_string_sized_new (strlen (norm));

	for (cur = norm; cur && *cur; cur = g_utf8_next_char (cur)) {

		gunichar uc;
		uc = g_utf8_get_char (cur);

		/* handle_esc_maybe() may advance cur itself */
		if (xapian_esc)
			if (handle_esc_maybe (gstr, &cur, uc, query_esc,
					      is_range_field))
				continue;

		if (g_unichar_ismark(uc))
			continue;

		if (!is_range_field)
			uc = g_unichar_tolower (uc);

		g_string_append_unichar (gstr, uc);
	}

	g_free (norm);
	return g_string_free (gstr, FALSE);
}
/* Check if an entire UTF-8 string is printable.
 *
 * Returns TRUE when the first @length bytes of @str are valid UTF-8 and
 * every character in them satisfies g_unichar_isprint().  Only those
 * @length bytes are examined, so @str does not have to be NUL-terminated.
 */
gboolean
isprint_utf8_string(const gchar *str, guint length)
{
	const char *c;
	const char *end = str + length;

	if (!g_utf8_validate (str, length, NULL)) {
		return FALSE;
	}

	/* Stay inside the validated range instead of scanning for a NUL:
	 * successful validation with an explicit length guarantees whole
	 * characters up to 'end', not a terminator. */
	for (c = str; c < end; c = g_utf8_next_char(c)) {
		if (!g_unichar_isprint(g_utf8_get_char(c))) {
			return FALSE;
		}
	}

	return TRUE;
}
/* Returns the number of columns attributed to @string: 2 for every
 * character that g_unichar_iswide_cjk() reports as wide, 8 for a tab and 1
 * for anything else. */
static guint
utf8_n_spaces (const gchar *string)
{
    const gchar *cursor = string;
    guint total = 0;

    while (cursor[0])
    {
        guint step = 1;

        if (g_unichar_iswide_cjk(g_utf8_get_char(cursor)))
            step = 2;
        else if (cursor[0] == '\t')
            step = 8;

        total += step;
        cursor = g_utf8_next_char(cursor);
    }

    return total;
}
/*
 * Strips leading separator characters (as decided by
 * gsc_utils_is_separator()) from @word and returns a newly allocated copy
 * of the remainder.  Returns NULL when @word is empty or contains nothing
 * but separators.
 */
gchar*
gsc_utils_clear_word (const gchar* word)
{
	const gchar *cursor = word;
	int remaining = g_utf8_strlen(word, -1);

	while (remaining-- > 0)
	{
		if (!gsc_utils_is_separator(g_utf8_get_char(cursor)))
			return g_strdup(cursor);

		cursor = g_utf8_next_char(cursor);
	}

	return NULL;
}