/**
 * Upcase the first letter of the word.
 *
 * The first multi-byte character of `from` is decoded with mbrtowc(),
 * converted with towupper(), re-encoded with wctomb_check(), and written
 * to `to`, followed by the remainder of `from`.  `to` and `from` may be
 * the same pointer, in which case the conversion is done in place.
 *
 * @param to    Destination buffer; may be identical to `from`.
 * @param from  NUL-terminated multi-byte string to convert.
 * @param usize Treated as the capacity of `to` in bytes; it is what gets
 *              handed to safe_strcpy() as the size limit.
 *
 * Failures are reported through prt_error().  When `to` is a separate
 * buffer it is first filled with a (bounded) copy of `from`, so it does
 * not hold garbage after a failure.  An in-place conversion that would
 * make the string longer is refused.
 *
 * XXX FIXME This works 'most of the time', but is not technically correct.
 * This is because towlower() and towupper() are locale dependent, and also
 * because the byte-counts might not match up, e.g. German ß and SS.
 * The correct long-term fix is to use ICU or glib g_utf8_strup(), etc.
 */
void upcase_utf8_str(char *to, const char * from, size_t usize)
{
	wchar_t c;
	size_t nbh;             /* byte length of the original first character */
	int nbl;                /* byte length of the converted first character */
	char conv[MB_LEN_MAX];  /* converted first character, not NUL-terminated */
	mbstate_t mbs;

	/* Make sure it doesn't contain garbage in case of an error */
	if ((to != from) && (usize > 0)) safe_strcpy(to, from, usize);

	memset(&mbs, 0, sizeof(mbs));
	nbh = mbrtowc (&c, from, MB_CUR_MAX, &mbs);
	/* mbrtowc() signals failure with (size_t)-1 (invalid sequence) or
	 * (size_t)-2 (incomplete sequence); test for them explicitly instead
	 * of relying on a size_t -> int conversion. */
	if ((nbh == (size_t)-1) || (nbh == (size_t)-2))
	{
		prt_error("Error: Invalid multi-byte string!");
		return;
	}

	/* Empty string: there is no first letter.  Without this check an
	 * in-place call would be rejected below, because the terminating
	 * NUL decodes from 0 bytes but encodes to 1. */
	if (nbh == 0) return;

	c = towupper(c);
	nbl = wctomb_check(conv, c);
	/* NOTE(review): wctomb_check() is not visible here; link_advance()
	 * treats a negative result as a conversion failure, so do the same
	 * rather than advancing `to` by a negative amount. */
	if (nbl < 0)
	{
		prt_error("Error: can't upcase multi-byte string!");
		return;
	}

	if (to == from)
	{
		/* Check for error on an in-place copy: the string cannot grow. */
		if (nbh < (size_t)nbl)
		{
			prt_error("Error: can't upcase multi-byte string!");
			return;
		}

		/* Upcase */
		memcpy(to, conv, (size_t)nbl);

		/* If the character got shorter, close the gap.  Source and
		 * destination overlap, hence memmove(); the string only
		 * shrinks, so no size limit is needed. */
		if ((size_t)nbl < nbh)
			memmove(to + nbl, from + nbh, strlen(from + nbh) + 1);
		return;
	}

	/* The converted character plus a terminator must fit, otherwise
	 * usize - nbl below would wrap around. */
	if ((size_t)nbl >= usize)
	{
		prt_error("Error: can't upcase multi-byte string!");
		return;
	}

	/* Upcase, then append the rest of the word. */
	memcpy(to, conv, (size_t)nbl);
	safe_strcpy(to + nbl, from + nbh, usize - (size_t)nbl);
}
/**
 * Reads in one word from the file, allocates space for it,
 * and returns it.
 *
 * Leading white space is skipped; the word then runs up to the next
 * white-space character or the end of input.  Wide characters are read
 * with fgetwc() and stored in multi-byte form.
 *
 * @param dict  Dictionary whose string_set receives the word.
 * @param fp    Stream to read from.
 * @return The pointer returned by string_set_add() for the word, or
 *         NULL if fgetwc() reports WEOF before any word character.
 *
 * A word of MAX_WORD bytes or more, or a character that cannot be
 * converted, is a fatal error: a message is printed with prt_error()
 * and the process exits.
 */
static const char * get_a_word(Dictionary dict, FILE * fp)
{
	char word[MAX_WORD+4]; /* allow for 4-byte wide chars */
	wint_t c;
	mbstate_t mbss;
	int j;
	int nb;

	/* Skip leading white space. */
	do {
		c = fgetwc(fp);
	} while ((c != WEOF) && iswspace(c));
	if (c == WEOF) return NULL;

	memset(&mbss, 0, sizeof(mbss));
	for (j=0; (j <= MAX_WORD-1) && (!iswspace(c)) && (c != WEOF);)
	{
		nb = wctomb_check(&word[j], c, &mbss);
		/* NOTE(review): wctomb_check() is not visible here;
		 * link_advance() treats a negative result as a conversion
		 * failure.  Adding it to j unchecked would move the write
		 * position backwards, possibly in front of word[]. */
		if (nb < 0) {
			word[j] = '\0';
			prt_error("Fatal Error: The dictionary contains a word with a "
			          "character that cannot be converted to a multi-byte "
			          "string. The word so far was: %s", word);
			exit(1);
		}
		j += nb;
		c = fgetwc(fp);
	}

	if (j >= MAX_WORD) {
		/* Cut the word off so that it can be printed. */
		word[MAX_WORD] = 0x0;
		prt_error("Fatal Error: The dictionary contains a word that "
		          "is too long. The word was: %s", word);
		exit(1);
	}
	word[j] = '\0';
	return string_set_add(word, dict->string_set);
}
/**
 * Downcase the first letter of the word.
 *
 * The first multi-byte character of `from` is decoded with mbrtowc(),
 * converted with towlower_l() in the given locale, re-encoded with
 * wctomb_check(), and written to `to`, followed by the remainder of
 * `from`.  `to` and `from` may be the same pointer, in which case the
 * conversion is done in place.
 *
 * @param to     Destination buffer; may be identical to `from`.
 * @param from   NUL-terminated multi-byte string to convert.
 * @param usize  Treated as the capacity of `to` in bytes; it is what
 *               gets handed to safe_strcpy() as the size limit.
 * @param locale Locale passed to towlower_l().
 *
 * Failures are reported through prt_error().  When `to` is a separate
 * buffer it is first filled with a (bounded) copy of `from`, so it does
 * not hold garbage after a failure.  An in-place conversion that would
 * make the string longer is refused.
 *
 * XXX FIXME This works 'most of the time', but is not technically correct.
 * This is because towlower() and towupper() are locale dependent, and also
 * because the byte-counts might not match up, e.g. German ß and SS.
 * The correct long-term fix is to use ICU or glib g_utf8_strup(), etc.
 */
void downcase_utf8_str(char *to, const char * from, size_t usize, locale_t locale)
{
	wchar_t c;
	size_t nbh;             /* byte length of the original first character */
	int nbl;                /* byte length of the converted first character */
	char low[MB_LEN_MAX];   /* converted first character, not NUL-terminated */
	mbstate_t mbs;

	/* Make sure it doesn't contain garbage in case of an error.
	 * Bounded by usize, unlike a plain strcpy(). */
	if ((to != from) && (usize > 0)) safe_strcpy(to, from, usize);

	memset(&mbs, 0, sizeof(mbs));
	nbh = mbrtowc (&c, from, MB_CUR_MAX, &mbs);
	/* mbrtowc() signals failure with (size_t)-1 (invalid sequence) or
	 * (size_t)-2 (incomplete sequence); test for them explicitly instead
	 * of relying on a size_t -> int conversion. */
	if ((nbh == (size_t)-1) || (nbh == (size_t)-2))
	{
		prt_error("Error: Invalid UTF-8 string!\n");
		return;
	}

	/* Empty string: there is no first letter.  Without this check an
	 * in-place call would be rejected below, because the terminating
	 * NUL decodes from 0 bytes but encodes to 1. */
	if (nbh == 0) return;

	c = towlower_l(c, locale);
	nbl = wctomb_check(low, c);
	/* NOTE(review): wctomb_check() is not visible here; link_advance()
	 * treats a negative result as a conversion failure, so do the same
	 * rather than advancing `to` by a negative amount. */
	if (nbl < 0)
	{
		prt_error("Error: can't downcase UTF-8 string!\n");
		return;
	}

	if (to == from)
	{
		/* Check for error on an in-place copy: the string cannot grow. */
		if (nbh < (size_t)nbl)
		{
			prt_error("Error: can't downcase UTF-8 string!\n");
			return;
		}

		/* Downcase */
		memcpy(to, low, (size_t)nbl);

		/* If the character got shorter, close the gap.  Source and
		 * destination overlap, hence memmove(); the string only
		 * shrinks, so no size limit is needed. */
		if ((size_t)nbl < nbh)
			memmove(to + nbl, from + nbh, strlen(from + nbh) + 1);
		return;
	}

	/* The converted character plus a terminator must fit, otherwise
	 * usize - nbl below would wrap around. */
	if ((size_t)nbl >= usize)
	{
		prt_error("Error: can't downcase UTF-8 string!\n");
		return;
	}

	/* Downcase, then append the rest of the word. */
	memcpy(to, low, (size_t)nbl);
	safe_strcpy(to + nbl, from + nbh, usize - (size_t)nbl);
}
/**
 * Read the next token from the dictionary input into dict->token.
 *
 * A character left pending in dict->already_got_it by an earlier call
 * is delivered first, as a token of its own.  Otherwise leading white
 * space is skipped and characters are accumulated, in multi-byte form,
 * until white space, a character accepted by is_special(), or a zero
 * character ends the token.  A double quote switches to quote mode,
 * which lasts until the closing quote; white space inside the quotes
 * is an error.
 *
 * Return 1 if a token was read, else return 0 (after reporting the
 * problem through dict_error() or dict_error2()).
 */
static int link_advance(Dictionary dict)
{
	wchar_t wc;
	int len = 0;            /* bytes stored in dict->token so far */
	int nbytes;
	int in_quotes = FALSE;

	dict->is_special = FALSE;

	/* Hand out the character that was held back last time. */
	if (dict->already_got_it != '\0')
	{
		dict->is_special = is_special(dict->already_got_it, &dict->mbss);
		if (dict->already_got_it != WEOF)
		{
			/* specials are one byte */
			dict->token[0] = dict->already_got_it;
			dict->token[1] = '\0';
		}
		else
		{
			dict->token[0] = '\0';
		}
		dict->already_got_it = '\0';
		return 1;
	}

	/* Skip leading white space. */
	wc = get_character(dict, FALSE);
	while (iswspace(wc))
	{
		wc = get_character(dict, FALSE);
	}

	/* Every pass that does not return fetches the next character. */
	for (;; wc = get_character(dict, in_quotes))
	{
		/* 3 for multi-byte tokens */
		if (len > MAX_TOKEN_LENGTH-3)
		{
			dict_error(dict, "Token too long");
			return 0;
		}

		if (in_quotes)
		{
			if (wc == '\"')
			{
				/* Closing quote ends the token. */
				dict->token[len] = '\0';
				return 1;
			}
			if (iswspace(wc))
			{
				dict_error(dict, "White space inside of token");
				return 0;
			}

			/* Although we read wide chars, we store UTF8 internally, always. */
			nbytes = wcrtomb(&dict->token[len], wc, &dict->mbss);
			if (nbytes < 0) goto locale_failure;
			len += nbytes;
			continue;
		}

		if (is_special(wc, &dict->mbss))
		{
			if (len == 0)
			{
				/* special toks are one char always */
				dict->token[0] = wc;
				dict->token[1] = '\0';
				dict->is_special = TRUE;
			}
			else
			{
				/* Finish the current token; keep the special for next time. */
				dict->token[len] = '\0';
				dict->already_got_it = wc;
			}
			return 1;
		}

		if (wc == 0x0)
		{
			dict->token[len] = '\0';
			if (len != 0) dict->already_got_it = wc;
			return 1;
		}

		if (iswspace(wc))
		{
			dict->token[len] = '\0';
			return 1;
		}

		if (wc == '\"')
		{
			in_quotes = TRUE;
			continue;
		}

		/* store UTF8 internally, always. */
		nbytes = wctomb_check(&dict->token[len], wc, &dict->mbss);
		if (nbytes < 0) goto locale_failure;
		len += nbytes;
	}

locale_failure:
	/* The character could not be converted to a multi-byte string. */
#ifndef _WIN32
	dict_error2(dict, "Unable to read UTF8 string in current locale",
	            nl_langinfo(CODESET));
	fprintf (stderr, "\tTry setting the locale with \"export LANG=en_US.UTF-8\"\n");
#else
	dict_error(dict, "Unable to read UTF8 string in current locale");
#endif
	return 0;
}