/*
 * Identifies the charset of the current locale.
 *
 * MinGW / native Windows, Mac OS X and Android builds obtain the name from
 * locale_charset(); every other platform asks nl_langinfo (CODESET).
 *
 * Returns the pointer handed back by whichever of those two calls is used.
 */
GAIAAUX_DECLARE const char *
gaiaGetLocaleCharset ()
{
#if defined(__MINGW32__) || defined(_WIN32) \
    || defined(__APPLE__) || defined(__ANDROID__)
    /* Windows, Mac OsX and Android all go through locale_charset() */
    return locale_charset ();
#else
    /* any other platform: nl_langinfo knows the codeset */
    return nl_langinfo (CODESET);
#endif
}
/*
 * Convert the NUL-terminated string `original` to UTF-8 and store the result,
 * NUL-terminated, in `utf_8`.
 *
 * original: text to convert (iconv takes it through a non-const pointer).
 * utf_8:    destination buffer.  At most 512 converted bytes are produced and
 *           a terminator is written after them, so the buffer must be able to
 *           hold 513 bytes.
 * code:     name of the source encoding; when NULL, the value returned by
 *           locale_charset() is used instead.
 *
 * Failures are reported with a message on stdout.  If the conversion
 * descriptor cannot be opened, `utf_8` becomes the empty string.  If the
 * conversion itself fails, `utf_8` holds whatever was converted before the
 * error, still NUL-terminated.
 */
void BLI_string_to_utf8(char *original, char *utf_8, const char *code)
{
	size_t inbytesleft = strlen(original);
	size_t outbytesleft = 512;
	size_t rv = 0;
	iconv_t cd;

	if (NULL == code) {
		code = locale_charset();
	}

	cd = iconv_open("UTF-8", code);
	if (cd == (iconv_t)(-1)) {
		printf("iconv_open Error");
		*utf_8 = '\0';
		return;
	}

	rv = iconv(cd, &original, &inbytesleft, &utf_8, &outbytesleft);
	if (rv == (size_t) -1) {
		/* Do not return early: the descriptor still has to be released and
		 * the output still has to be terminated. */
		printf("iconv Error\n");
	}

	/* iconv() advanced utf_8 past the bytes it wrote, so this terminates the
	 * converted (possibly partial) text. */
	*utf_8 = '\0';
	iconv_close(cd);
}
static const char *default_charset(void) { # if defined HAVE_LIBCHARSET_H && defined HAVE_LOCALE_CHARSET return locale_charset(); # elif defined HAVE_LANGINFO_H && defined HAVE_NL_LANGINFO return nl_langinfo(CODESET); # else return ""; /* Works with (at the very least) gnu iconv... */ # endif }
/*
 * Produce a UTF-8 version of `s`, interpreting it in the locale's charset.
 *
 * The source charset comes from locale_charset(); "US-ASCII" is used when
 * that call yields a null pointer.  If utf8_from_charset() reports a negative
 * status, a strdup() copy of `s` is returned unchanged.
 *
 * Returns the string produced by utf8_from_charset(), or the strdup() copy.
 */
char *aacenc_to_utf8(const char *s)
{
    const char *source_charset = locale_charset();
    char *converted;

    if (source_charset == 0)
        source_charset = "US-ASCII";

    if (utf8_from_charset(source_charset, s, &converted) >= 0)
        return converted;

    /* conversion failed: hand back the input bytes as they are */
    return strdup(s);
}
/*
 * Determine the name of the local encoding.
 *
 * Lookup order:
 *   1. the environment variable named by IDN_LOCALCS_ENV;
 *   2. (ENABLE_MDNKIT_COMPAT only) the variable named by MDN_LOCALCS_ENV;
 *   3. on WIN32, "CP<n>" built from GetACP() in a static buffer;
 *   4. otherwise locale_charset() (HAVE_LIBCHARSET), then
 *      nl_langinfo(CODESET), then setlocale(LC_CTYPE, NULL) (HAVE_SETLOCALE),
 *      then the LC_ALL, LC_CTYPE and LANG environment variables.
 *
 * Returns the first value found; the final fallback may yield NULL.
 */
const char *
idn_localencoding_name(void)
{
	const char *name;

	TRACE(("idn_localencoding_name()\n"));

	/* Explicit override through the environment. */
	name = getenv(IDN_LOCALCS_ENV);
	if (name != NULL) {
		TRACE(("local encoding=\"%-.30s\"\n",
		       name == NULL ? "<null>" : name));
		return (name);
	}

#ifdef ENABLE_MDNKIT_COMPAT
	name = getenv(MDN_LOCALCS_ENV);
	if (name != NULL) {
		TRACE(("local encoding=\"%-.30s\"\n",
		       name == NULL ? "<null>" : name));
		return (name);
	}
#endif

#ifdef WIN32
	{
		static char cp_str[40];	/* enough */

		/* The active code page number is the encoding name. */
		(void)sprintf(cp_str, "CP%u", GetACP());
		TRACE(("local encoding(codepage)=\"%-.30s\"\n", cp_str));
		return (cp_str);
	}
#else /* WIN32 */

#ifdef HAVE_LIBCHARSET
	name = locale_charset();
	TRACE(("local encoding=\"%-.30s\"\n",
	       name == NULL ? "<null>" : name));
	return (name);
#endif

#if defined(HAVE_NL_LANGINFO) && defined(CODESET)
	name = nl_langinfo(CODESET);
	if (name != NULL) {
		TRACE(("local encoding=\"%-.30s\"\n",
		       name == NULL ? "<null>" : name));
		return (name);
	}
#endif

	/* Last resort: the first non-NULL value among the locale name and the
	 * usual locale environment variables. */
	name = NULL;
#ifdef HAVE_SETLOCALE
	name = setlocale(LC_CTYPE, NULL);
#endif
	if (name == NULL)
		name = getenv("LC_ALL");
	if (name == NULL)
		name = getenv("LC_CTYPE");
	if (name == NULL)
		name = getenv("LANG");

	TRACE(("local encoding=\"%-.30s\"\n",
	       name == NULL ? "<null>" : name));
	return (name);
#endif /* WIN32 */
}
/* Resolves ENCODING, which must be in one of the forms described at the top
   of encoding-guesser.h, to a concrete encoding name.

   A null ENCODING, "auto", "auto,locale" and "locale" (compared without
   regard to case) all resolve to locale_charset ().  "auto,NAME" resolves to
   NAME, i.e. a suffix of ENCODING.  Anything else is returned as is.

   The result is therefore ENCODING itself, a suffix of it, or whatever
   string locale_charset () returns. */
const char *
encoding_guess_parse_encoding (const char *encoding)
{
  int wants_locale = (encoding == NULL
                      || c_strcasecmp (encoding, "auto") == 0
                      || c_strcasecmp (encoding, "auto,locale") == 0
                      || c_strcasecmp (encoding, "locale") == 0);

  if (wants_locale)
    return locale_charset ();

  /* "auto,NAME": skip the five-character prefix. */
  if (c_strncasecmp (encoding, "auto,", 5) == 0)
    return encoding + 5;

  return encoding;
}
/* Test of wcwidth(): checks ASCII widths in the initial locale, then, if a
   genuine UTF-8 locale can be selected, repeats that check and verifies a
   number of non-spacing, format control, zero width and CJK characters.  */
int
main ()
{
  wchar_t ch;

  /* Test width of ASCII characters.  */
  for (ch = 0x20; ch < 0x7F; ch++)
    ASSERT (wcwidth (ch) == 1);

  /* Switch to an UTF-8 locale; nothing more to test if it is unavailable.  */
  if (setlocale (LC_ALL, "fr_FR.UTF-8") == NULL)
    return 0;

  /* Check whether it's really an UTF-8 locale.
     On OpenBSD 4.0, the setlocale call succeeds only for the LC_CTYPE
     category and therefore returns "C/fr_FR.UTF-8/C/C/C/C", but the
     LC_CTYPE category is effectively set to an ASCII LC_CTYPE category;
     in particular, locale_charset() returns "ASCII".  */
  if (strcmp (locale_charset (), "UTF-8") != 0)
    return 0;

  /* Test width of ASCII characters.  */
  for (ch = 0x20; ch < 0x7F; ch++)
    ASSERT (wcwidth (ch) == 1);

  /* Test width of some non-spacing characters.  */
  ASSERT (wcwidth (0x0301) == 0);
  ASSERT (wcwidth (0x05B0) == 0);

  /* Test width of some format control characters.  */
  ASSERT (wcwidth (0x200E) <= 0);
  ASSERT (wcwidth (0x2060) <= 0);
#if 0 /* wchar_t may be only 16 bits.  */
  ASSERT (wcwidth (0xE0001) <= 0);
  ASSERT (wcwidth (0xE0044) <= 0);
#endif

  /* Test width of some zero width characters.  */
  ASSERT (wcwidth (0x200B) == 0);
  ASSERT (wcwidth (0xFEFF) <= 0);

  /* Test width of some CJK characters.  */
  ASSERT (wcwidth (0x3000) == 2);
  ASSERT (wcwidth (0xB250) == 2);
  ASSERT (wcwidth (0xFF1A) == 2);
#if 0 /* wchar_t may be only 16 bits.  */
  ASSERT (wcwidth (0x20369) == 2);
  ASSERT (wcwidth (0x2F876) == 2);
#endif

  return 0;
}
/*************************************************************************** Return the local encoding (dependent on the system). ***************************************************************************/ const char *get_local_encoding(void) { #ifdef HAVE_ICONV fc_assert_ret_val(is_init, NULL); return local_encoding; #else # ifdef HAVE_LIBCHARSET return locale_charset(); # else # ifdef HAVE_LANGINFO_CODESET return nl_langinfo(CODESET); # else return ""; # endif # endif #endif }
const char *knh_getSystemEncoding(void) { const char *enc = knh_getenv("KONOHAENC"); if(enc != NULL) { return enc; } #if defined(K_OSENCODING) return K_OSENCODING; #elif defined(HAVE_LOCALCHARSET_H) return (char*)locale_charset(); #else // char *enc = knh_getenv("LC_CTYPE"); // if(enc != NULL) { // return (char*)enc; // } return K_ENCODING; #endif }
/// Returns the name of the character set of the user's locale.
///
/// Calls setlocale(LC_CTYPE, "") first, so the process' LC_CTYPE category is
/// changed as a side effect.  The name is then taken from, depending on the
/// build: "CP" plus GetACP() (MinGW / MSVC), nl_langinfo(CODESET) (Solaris,
/// wrapped as "ISO<n>-US" when the codeset parses as an integer),
/// nl_langinfo(CODESET), or locale_charset().  When none of those is
/// configured the result is an empty string.
std::string
get_local_charset() {
  setlocale(LC_CTYPE, "");

#if defined(COMP_MINGW) || defined(COMP_MSC)
  std::string charset;
  charset = "CP" + to_string(GetACP());
  return charset;

#elif defined(SYS_SOLARIS)
  std::string charset;
  int number;

  charset = nl_langinfo(CODESET);
  // A purely numeric codeset is turned into an ISO name.
  if (parse_int(charset, number))
    charset = std::string("ISO") + charset + std::string("-US");
  return charset;

#elif HAVE_NL_LANGINFO
  std::string charset;
  charset = nl_langinfo(CODESET);
  return charset;

#elif HAVE_LOCALE_CHARSET
  std::string charset;
  charset = locale_charset();
  return charset;

#else
  return std::string();
#endif
}
/* Replacement for wcwidth().

   In a locale whose charset is "UTF-8" the width comes from uc_width (),
   on the assumption that a wide character is the same as a Unicode
   character there.  In any other locale the system's wcwidth () is used
   when available; without it the result is 0 for the null wide character,
   1 for characters iswprint () accepts and -1 for the rest.  */
int
rpl_wcwidth (wchar_t wc)
{
  const char *charset = locale_charset ();

  if (!STREQ (charset, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0))
    {
      /* Not a UTF-8 locale: fall back to the system's wcwidth function.  */
#if HAVE_WCWIDTH
      return wcwidth (wc);
#else
      if (wc == 0)
        return 0;
      return iswprint (wc) ? 1 : -1;
#endif
    }

  /* UTF-8 locale: use the Unicode aware width function.  */
  return uc_width (wc, charset);
}
/*
 * Returns the name of the system's default codeset.
 *
 * The name is taken from locale_charset() or, failing that build option,
 * from nl_langinfo (CODESET).  If no source is compiled in, or the source
 * yields a null or empty string, "ISO-8859-1" is returned.  Every step is
 * reported through debug_printf; on WIN32 the system and user default LCIDs
 * are logged as well.
 */
const char*
default_codeset (void)
{
    const char* codeset = 0;

#if defined HAVE_LOCALE_CHARSET
    debug_printf ("Using locale_charset() to get system codeset.\n");
    codeset = locale_charset ();
#elif defined HAVE_LANGINFO_CODESET
    debug_printf ("Using nl_langinfo() to get system codeset.\n");
    codeset = nl_langinfo (CODESET);
#else
    debug_printf ("No way to get system codeset.\n");
#endif

    if (codeset && codeset[0]) {
        debug_printf ("Found default codeset %s\n", codeset);
    } else {
        debug_printf ("No default codeset, using ISO-8859-1.\n");
        codeset = "ISO-8859-1";
    }

#if defined (WIN32)
    {
        /* This is just for debugging */
        LCID system_lcid;
        LCID user_lcid;

        system_lcid = GetSystemDefaultLCID ();
        debug_printf ("SystemDefaultLCID: %04x\n", system_lcid);
        user_lcid = GetUserDefaultLCID ();
        debug_printf ("UserDefaultLCID: %04x\n", user_lcid);
    }
#endif

#if defined HAVE_ICONV
    debug_printf ("Have iconv.\n");
#else
    debug_printf ("No iconv.\n");
#endif

    return codeset;
}
/* Reads text from standard input and echoes it to standard output with a
   newline inserted before every position that ulc_width_linebreaks () marks
   as UC_BREAK_POSSIBLE for the width given as the single command line
   argument.  The text is interpreted in the locale's charset.

   Exits with status 1 unless exactly one argument is given; aborts if a
   break marker other than possible / mandatory / prohibited is seen.  */
int
main (int argc, char * argv[])
{
  setlocale (LC_CTYPE, "");

  if (argc != 2)
    return 1;

  {
    /* Insert line breaks for a given width.  */
    int width = atoi (argv[1]);
    char *input = read_file (stdin);
    int length = strlen (input);
    char *breaks = malloc (length);
    int pos;

    ulc_width_linebreaks (input, length, width, 0, 0, NULL,
                          locale_charset (), breaks);

    for (pos = 0; pos < length; pos++)
      {
        int kind = breaks[pos];

        if (kind == UC_BREAK_POSSIBLE)
          putc ('\n', stdout);
        else if (kind != UC_BREAK_MANDATORY && kind != UC_BREAK_PROHIBITED)
          abort ();

        putc (input[pos], stdout);
      }

    free (breaks);
  }

  return 0;
}
/* Return the name of the current character encoding.

   With HAVE_LOCALE_CHARSET this is simply locale_charset ().  Otherwise the
   name is carved out of the LC_CTYPE locale variable: the text after the
   first '.', with any '@modifier' cut off by writing a NUL into the string
   returned by get_locale_var ().  An unset or empty variable yields "", and
   a value without '.' is returned whole. */
static char *
curencoding ()
{
#if defined (HAVE_LOCALE_CHARSET)
  return (char *)locale_charset ();
#else
  char *locale;
  char *codeset;
  char *modifier;

  locale = get_locale_var ("LC_CTYPE");
  if (locale == 0 || *locale == 0)
    return "";

  codeset = strchr (locale, '.');
  if (codeset == 0)
    return locale;

  /* Truncate in place at the modifier, if there is one. */
  modifier = strchr (codeset, '@');
  if (modifier)
    *modifier = '\0';

  return codeset + 1;
#endif
}
/* Reads text from standard input and echoes it to standard output with a
   '|' inserted before every position that ulc_possible_linebreaks () marks
   as UC_BREAK_POSSIBLE.  The text is interpreted in the locale's charset.

   Exits with status 1 if any command line argument is given; aborts if a
   break marker other than possible / mandatory / prohibited is seen.  */
int
main (int argc, char * argv[])
{
  setlocale (LC_CTYPE, "");

  if (argc != 1)
    return 1;

  {
    /* Display all the break opportunities in the input string.  */
    char *input = read_file (stdin);
    int length = strlen (input);
    char *breaks = malloc (length);
    int pos;

    ulc_possible_linebreaks (input, length, locale_charset (), breaks);

    for (pos = 0; pos < length; pos++)
      {
        int kind = breaks[pos];

        if (kind == UC_BREAK_POSSIBLE)
          putc ('|', stdout);
        else if (kind != UC_BREAK_MANDATORY && kind != UC_BREAK_PROHIBITED)
          abort ();

        putc (input[pos], stdout);
      }

    free (breaks);
  }

  return 0;
}
/* Case-folds and normalizes the N bytes at S, which are in the locale's
   encoding, and returns the result as UTF-8.

   S is first converted to UTF-8 with u8_conv_from_encoding (), using a 2048
   byte stack buffer when the conversion result fits.  The converted text is
   then passed to u8_casefold () together with ISO639_LANGUAGE, NF, RESULTBUF
   and LENGTHP, and that call's result is returned.

   Returns NULL if either step fails; errno is then the value set by the
   failing step (it is preserved across the release of the temporary
   conversion buffer).  */
static uint8_t *
ulc_u8_casefold (const char *s, size_t n, const char *iso639_language,
                 uninorm_t nf,
                 uint8_t *resultbuf, size_t *lengthp)
{
  uint8_t stackbuf[2048 / sizeof (uint8_t)];
  size_t utf8_length = sizeof (stackbuf) / sizeof (uint8_t);
  uint8_t *utf8;
  uint8_t *folded;

  /* Convert the string to UTF-8.  */
  utf8 = u8_conv_from_encoding (locale_charset (), iconveh_error, s, n, NULL,
                                stackbuf, &utf8_length);
  if (utf8 == NULL)
    /* errno is set here.  */
    return NULL;

  /* Case-fold and normalize.  */
  folded = u8_casefold (utf8, utf8_length, iso639_language, nf,
                        resultbuf, lengthp);

  /* Release the conversion result if it did not fit the stack buffer.  */
  if (utf8 != stackbuf)
    {
      if (folded != NULL)
        free (utf8);
      else
        {
          /* Keep the errno of the failed u8_casefold () call.  */
          int saved_errno = errno;
          free (utf8);
          errno = saved_errno;
        }
    }

  return folded;
}
/*
 * Convert a string in the system's local encoding to UTF-8.
 *
 * __WIN32__: the ANSI code page string is widened with MultiByteToWideChar
 *            and re-encoded with WideCharToMultiByte(CP_UTF8); the result is
 *            a buffer obtained from kmem_alloc.
 * __UNIX__:  the string is passed to enc_conv() with the charset reported by
 *            locale_charset(), which is looked up once and then cached.
 *
 * Returns knil when acp is null.  No code is compiled for other platforms.
 */
char *acp_to_utf8(char *acp)
{
#if defined(__WIN32__)
	WCHAR *wide;
	char *utf8;
	int wideLen;
	int utf8Len;

	if (!acp) {
		return knil;
	}

	/* Each API is called twice: once to size the buffer, once to fill it. */
	wideLen = MultiByteToWideChar(CP_ACP, 0, acp, -1, NULL, 0);
	wide = kmem_alloc(wideLen * sizeof(WCHAR));
	MultiByteToWideChar(CP_ACP, 0, acp, -1, wide, wideLen);

	utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
	utf8 = kmem_alloc(utf8Len * sizeof(char));
	WideCharToMultiByte(CP_UTF8, 0, wide, -1, utf8, utf8Len, NULL, NULL);

	kmem_free(wide);
	return utf8;

#elif defined(__UNIX__)
	kchar *enc_conv(const kchar *in, const kchar *enc_from, const kchar *enc_to);

	/* locale charset, fetched on first use */
	static const kchar *cachedCharset = knil;

	if (!acp) {
		return knil;
	}

	if (!cachedCharset) {
		cachedCharset = locale_charset();
	}
	return enc_conv(acp, cachedCharset, "UTF-8");
#endif
}
/* Replacement for mbrtowc(), built on top of mbtowc().

   PWC  receives the converted wide character, unless NULL.
   S    points to the next bytes of the multibyte sequence; a NULL S is
        handled as S = "", N = 1, PWC = NULL.
   N    is the number of bytes available at S.
   PS   is the conversion state; when NULL, internal_state is used.

   The state is treated as a small byte array: byte 0 holds the number of
   bytes of an incomplete character saved by earlier calls (0 to 3) and
   bytes 1..3 hold those bytes.

   Return value:
     - on success, the mbtowc() result minus the number of previously saved
       bytes (the state is reset to "nothing saved");
     - (size_t)(-2) if N is 0 or the bytes seen so far form an incomplete
       character (they are saved in the state);
     - (size_t)(-1) with errno = EINVAL if the state holds an unexpected
       count, or with errno = EILSEQ if the bytes are invalid.  */
size_t
mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
  char *pstate = (char *)ps;

  if (s == NULL)
    {
      pwc = NULL;
      s = "";
      n = 1;
    }

  if (n == 0)
    return (size_t)(-2);

  /* Here n > 0.  */

  if (pstate == NULL)
    pstate = internal_state;

  {
    size_t nstate = pstate[0];
    char buf[4];
    const char *p;
    size_t m;

    /* Set P/M to the bytes to examine: either S itself, or BUF holding the
       saved bytes followed by as many new bytes as fit (at most 4 total).  */
    switch (nstate)
      {
      case 0:
        p = s;
        m = n;
        break;
      case 3:
        buf[2] = pstate[3];
        /*FALLTHROUGH*/
      case 2:
        buf[1] = pstate[2];
        /*FALLTHROUGH*/
      case 1:
        buf[0] = pstate[1];
        p = buf;
        m = nstate;
        buf[m++] = s[0];
        if (n >= 2 && m < 4)
          {
            buf[m++] = s[1];
            if (n >= 3 && m < 4)
              buf[m++] = s[2];
          }
        break;
      default:
        errno = EINVAL;
        return (size_t)(-1);
      }

    /* Here m > 0.  */

# if __GLIBC__ || defined __UCLIBC__
    /* Work around bug <http://sourceware.org/bugzilla/show_bug.cgi?id=9674> */
    mbtowc (NULL, NULL, 0);
# endif
    {
      int res = mbtowc (pwc, p, m);

      if (res >= 0)
        {
          /* Sanity checks: a zero result must coincide with the null wide
             character, and the character must extend beyond the bytes that
             were already saved.  */
          if (pwc != NULL && ((*pwc == 0) != (res == 0)))
            abort ();
          if (nstate >= (res > 0 ? res : 1))
            abort ();
          res -= nstate;
          pstate[0] = 0;
          return res;
        }

      /* mbtowc does not distinguish between invalid and incomplete multibyte
         sequences.  But mbrtowc needs to make this distinction.
         There are two possible approaches:
           - Use iconv() and its return value.
           - Use built-in knowledge about the possible encodings.
         Given the low quality of implementation of iconv() on the systems
         that lack mbrtowc(), we use the second approach.
         The possible encodings are:
           - 8-bit encodings,
           - EUC-JP, EUC-KR, GB2312, EUC-TW, BIG5, GB18030, SJIS,
           - UTF-8.
         Use specialized code for each.  */
      if (m >= 4 || m >= MB_CUR_MAX)
        goto invalid;
      /* Here MB_CUR_MAX > 1 and 0 < m < 4.  */
      {
        const char *encoding = locale_charset ();

        if (STREQ_OPT (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0))
          {
            /* Cf. unistr/u8-mblen.c.  */
            unsigned char c = (unsigned char) p[0];

            if (c >= 0xc2)
              {
                if (c < 0xe0)
                  {
                    if (m == 1)
                      goto incomplete;
                  }
                else if (c < 0xf0)
                  {
                    if (m == 1)
                      goto incomplete;
                    if (m == 2)
                      {
                        unsigned char c2 = (unsigned char) p[1];

                        if ((c2 ^ 0x80) < 0x40
                            && (c >= 0xe1 || c2 >= 0xa0)
                            && (c != 0xed || c2 < 0xa0))
                          goto incomplete;
                      }
                  }
                else if (c <= 0xf4)
                  {
                    if (m == 1)
                      goto incomplete;
                    else /* m == 2 || m == 3 */
                      {
                        unsigned char c2 = (unsigned char) p[1];

                        if ((c2 ^ 0x80) < 0x40
                            && (c >= 0xf1 || c2 >= 0x90)
                            && (c < 0xf4 || (c == 0xf4 && c2 < 0x90)))
                          {
                            if (m == 2)
                              goto incomplete;
                            else /* m == 3 */
                              {
                                unsigned char c3 = (unsigned char) p[2];

                                if ((c3 ^ 0x80) < 0x40)
                                  goto incomplete;
                              }
                          }
                      }
                  }
              }
            goto invalid;
          }

        /* As a reference for this code, you can use the GNU libiconv
           implementation.  Look for uses of the RET_TOOFEW macro.  */

        if (STREQ_OPT (encoding,
                       "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
          {
            if (m == 1)
              {
                unsigned char c = (unsigned char) p[0];

                if ((c >= 0xa1 && c < 0xff) || c == 0x8e || c == 0x8f)
                  goto incomplete;
              }
            if (m == 2)
              {
                unsigned char c = (unsigned char) p[0];

                if (c == 0x8f)
                  {
                    unsigned char c2 = (unsigned char) p[1];

                    if (c2 >= 0xa1 && c2 < 0xff)
                      goto incomplete;
                  }
              }
            goto invalid;
          }
        if (STREQ_OPT (encoding,
                       "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
            || STREQ_OPT (encoding,
                          "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
            || STREQ_OPT (encoding,
                          "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
          {
            if (m == 1)
              {
                unsigned char c = (unsigned char) p[0];

                if (c >= 0xa1 && c < 0xff)
                  goto incomplete;
              }
            goto invalid;
          }
        if (STREQ_OPT (encoding,
                       "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
          {
            if (m == 1)
              {
                unsigned char c = (unsigned char) p[0];

                if ((c >= 0xa1 && c < 0xff) || c == 0x8e)
                  goto incomplete;
              }
            else /* m == 2 || m == 3 */
              {
                unsigned char c = (unsigned char) p[0];

                if (c == 0x8e)
                  goto incomplete;
              }
            goto invalid;
          }
        if (STREQ_OPT (encoding,
                       "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
          {
            if (m == 1)
              {
                unsigned char c = (unsigned char) p[0];

                if ((c >= 0x90 && c <= 0xe3) || (c >= 0xf8 && c <= 0xfe))
                  goto incomplete;
              }
            else /* m == 2 || m == 3 */
              {
                unsigned char c = (unsigned char) p[0];

                if (c >= 0x90 && c <= 0xe3)
                  {
                    unsigned char c2 = (unsigned char) p[1];

                    if (c2 >= 0x30 && c2 <= 0x39)
                      {
                        if (m == 2)
                          goto incomplete;
                        else /* m == 3 */
                          {
                            unsigned char c3 = (unsigned char) p[2];

                            if (c3 >= 0x81 && c3 <= 0xfe)
                              goto incomplete;
                          }
                      }
                  }
              }
            goto invalid;
          }
        if (STREQ_OPT (encoding, "SJIS", 'S', 'J', 'I', 'S', 0, 0, 0, 0, 0))
          {
            if (m == 1)
              {
                unsigned char c = (unsigned char) p[0];

                if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)
                    || (c >= 0xf0 && c <= 0xf9))
                  goto incomplete;
              }
            goto invalid;
          }

        /* An unknown multibyte encoding.  */
        goto incomplete;
      }

     incomplete:
      /* Append the newly consumed bytes of S to the saved bytes in the
         state and record the new count.  */
      {
        size_t k = nstate;
        /* Here 0 <= k < m < 4.  */
        pstate[++k] = s[0];
        if (k < m)
          {
            pstate[++k] = s[1];
            if (k < m)
              pstate[++k] = s[2];
          }
        if (k != m)
          abort ();
      }
      pstate[0] = m;
      return (size_t)(-2);

     invalid:
      errno = EILSEQ;
      /* The conversion state is undefined, says POSIX.  */
      return (size_t)(-1);
    }
  }
}
/*!
  \brief Print module usage description in XML format.

  Writes a <task> document to stdout describing the module held in the
  parser state `st`: label, description, keywords, one <parameter> element
  per option (label, description, key description items, gisprompt
  attributes, default, allowed values, GUI section and dependency), one
  <flag> element per flag, and the standard "overwrite" (only when
  G__uses_new_gisprompt() is true), "verbose" and "quiet" flags.

  The encoding named in the XML declaration is nl_langinfo(CODESET) or, on
  MinGW with NLS, locale_charset(); "UTF-8" is used when neither source is
  compiled in or the source yields nothing.  When HAVE_ICONV_H is defined the
  detected name is stored in src_enc and the declaration always says "UTF-8".
 */
void G__usage_xml(void)
{
    struct Option *opt;
    struct Flag *flag;
    const char *type;
    char *s, *top;
    int i;
    /* Must start out NULL: on builds with neither langinfo nor MinGW NLS no
       branch below assigns it, and the emptiness check would otherwise read
       an indeterminate pointer. */
    const char *encoding = NULL;
    int new_prompt = 0;

    new_prompt = G__uses_new_gisprompt();

    /* gettext converts strings to encoding returned by nl_langinfo(CODESET) */

#if defined(HAVE_LANGINFO_H)
    encoding = nl_langinfo(CODESET);
#elif defined(__MINGW32__) && defined(USE_NLS)
    encoding = locale_charset();
#endif

    if (!encoding || strlen(encoding) == 0)
	encoding = "UTF-8";

#ifdef HAVE_ICONV_H
    src_enc = encoding;
    encoding = "UTF-8";
#endif

    if (!st->pgm_name)		/* v.dave && r.michael */
	st->pgm_name = G_program_name();
    if (!st->pgm_name)
	st->pgm_name = "??";

    fprintf(stdout, "<?xml version=\"1.0\" encoding=\"%s\"?>\n", encoding);
    fprintf(stdout, "<!DOCTYPE task SYSTEM \"grass-interface.dtd\">\n");

    fprintf(stdout, "<task name=\"%s\">\n", st->pgm_name);

    if (st->module_info.label) {
	fprintf(stdout, "\t<label>\n\t\t");
	print_escaped_for_xml(stdout, st->module_info.label);
	fprintf(stdout, "\n\t</label>\n");
    }

    if (st->module_info.description) {
	fprintf(stdout, "\t<description>\n\t\t");
	print_escaped_for_xml(stdout, st->module_info.description);
	fprintf(stdout, "\n\t</description>\n");
    }

    if (st->module_info.keywords) {
	fprintf(stdout, "\t<keywords>\n\t\t");
	G__print_keywords(stdout, print_escaped_for_xml);
	fprintf(stdout, "\n\t</keywords>\n");
    }

	/***** Don't use parameter-groups for now.  We'll reimplement this later 
	 ***** when we have a concept of several mutually exclusive option
	 ***** groups
	if (st->n_opts || st->n_flags)
		fprintf(stdout, "\t<parameter-group>\n");
	 *****
	 *****
	 *****/

    /* One <parameter> element per option. */
    if (st->n_opts) {
	opt = &st->first_option;
	while (opt != NULL) {
	    /* TODO: make this a enumeration type? */
	    switch (opt->type) {
	    case TYPE_INTEGER:
		type = "integer";
		break;
	    case TYPE_DOUBLE:
		type = "float";
		break;
	    case TYPE_STRING:
		type = "string";
		break;
	    default:
		type = "string";
		break;
	    }
	    fprintf(stdout, "\t<parameter "
		    "name=\"%s\" "
		    "type=\"%s\" "
		    "required=\"%s\" "
		    "multiple=\"%s\">\n",
		    opt->key,
		    type,
		    opt->required == YES ? "yes" : "no",
		    opt->multiple == YES ? "yes" : "no");

	    if (opt->label) {
		fprintf(stdout, "\t\t<label>\n\t\t\t");
		print_escaped_for_xml(stdout, opt->label);
		fprintf(stdout, "\n\t\t</label>\n");
	    }

	    if (opt->description) {
		fprintf(stdout, "\t\t<description>\n\t\t\t");
		print_escaped_for_xml(stdout, opt->description);
		fprintf(stdout, "\n\t\t</description>\n");
	    }

	    if (opt->key_desc) {
		/* strtok() modifies its argument, so split a private copy
		   of the comma separated list. */
		fprintf(stdout, "\t\t<keydesc>\n");
		top = G_calloc(strlen(opt->key_desc) + 1, 1);
		strcpy(top, opt->key_desc);
		s = strtok(top, ",");
		for (i = 1; s != NULL; i++) {
		    fprintf(stdout, "\t\t\t<item order=\"%d\">", i);
		    print_escaped_for_xml(stdout, s);
		    fprintf(stdout, "</item>\n");
		    s = strtok(NULL, ",");
		}
		fprintf(stdout, "\t\t</keydesc>\n");
		G_free(top);
	    }

	    if (opt->gisprompt) {
		/* The first three comma separated fields become the
		   attributes named in atts[], in that order. */
		const char *atts[] = { "age", "element", "prompt", NULL };
		top = G_calloc(strlen(opt->gisprompt) + 1, 1);
		strcpy(top, opt->gisprompt);
		s = strtok(top, ",");
		fprintf(stdout, "\t\t<gisprompt ");
		for (i = 0; s != NULL && atts[i] != NULL; i++) {
		    fprintf(stdout, "%s=\"%s\" ", atts[i], s);
		    s = strtok(NULL, ",");
		}
		fprintf(stdout, "/>\n");
		G_free(top);
	    }

	    if (opt->def) {
		fprintf(stdout, "\t\t<default>\n\t\t\t");
		print_escaped_for_xml(stdout, opt->def);
		fprintf(stdout, "\n\t\t</default>\n");
	    }

	    if (opt->options) {
		/* TODO:
		 * add something like
		 *       <range min="xxx" max="xxx"/>
		 * to <values> */
		i = 0;
		fprintf(stdout, "\t\t<values>\n");
		while (opt->opts[i]) {
		    fprintf(stdout, "\t\t\t<value>\n");
		    fprintf(stdout, "\t\t\t\t<name>");
		    print_escaped_for_xml(stdout, opt->opts[i]);
		    fprintf(stdout, "</name>\n");
		    if (opt->descs && opt->opts[i] && opt->descs[i]) {
			fprintf(stdout, "\t\t\t\t<description>");
			print_escaped_for_xml(stdout, opt->descs[i]);
			fprintf(stdout, "</description>\n");
		    }
		    fprintf(stdout, "\t\t\t</value>\n");
		    i++;
		}
		fprintf(stdout, "\t\t</values>\n");
	    }

	    if (opt->guisection) {
		fprintf(stdout, "\t\t<guisection>\n\t\t\t");
		print_escaped_for_xml(stdout, opt->guisection);
		fprintf(stdout, "\n\t\t</guisection>\n");
	    }

	    if (opt->guidependency) {
		fprintf(stdout, "\t\t<guidependency>\n\t\t\t");
		print_escaped_for_xml(stdout, opt->guidependency);
		fprintf(stdout, "\n\t\t</guidependency>\n");
	    }
	    /* TODO:
	     * - key_desc?
	     * - there surely are some more. which ones?
	     */

	    opt = opt->next_opt;
	    fprintf(stdout, "\t</parameter>\n");
	}
    }

    /* One <flag> element per flag. */
    if (st->n_flags) {
	flag = &st->first_flag;
	while (flag != NULL) {
	    fprintf(stdout, "\t<flag name=\"%c\">\n", flag->key);

	    if (flag->label) {
		fprintf(stdout, "\t\t<label>\n\t\t\t");
		print_escaped_for_xml(stdout, flag->label);
		fprintf(stdout, "\n\t\t</label>\n");
	    }

	    if (flag->suppress_required)
		fprintf(stdout, "\t\t<suppress_required/>\n");

	    if (flag->description) {
		fprintf(stdout, "\t\t<description>\n\t\t\t");
		print_escaped_for_xml(stdout, flag->description);
		fprintf(stdout, "\n\t\t</description>\n");
	    }

	    if (flag->guisection) {
		fprintf(stdout, " \t\t<guisection>\n\t\t\t");
		print_escaped_for_xml(stdout, flag->guisection);
		fprintf(stdout, "\n\t\t</guisection>\n");
	    }

	    flag = flag->next_flag;
	    fprintf(stdout, "\t</flag>\n");
	}
    }

	/***** Don't use parameter-groups for now.  We'll reimplement this later 
	 ***** when we have a concept of several mutually exclusive option
	 ***** groups
	if (st->n_opts || st->n_flags)
		fprintf(stdout, "\t</parameter-group>\n");
	 *****
	 *****
	 *****/

    if (new_prompt) {
	/* overwrite */
	fprintf(stdout, "\t<flag name=\"%s\">\n", "overwrite");
	fprintf(stdout, "\t\t<description>\n\t\t\t");
	print_escaped_for_xml(stdout,
			      _("Allow output files to overwrite existing files"));
	fprintf(stdout, "\n\t\t</description>\n");
	fprintf(stdout, "\t</flag>\n");
    }

    /* verbose */
    fprintf(stdout, "\t<flag name=\"%s\">\n", "verbose");
    fprintf(stdout, "\t\t<description>\n\t\t\t");
    print_escaped_for_xml(stdout, _("Verbose module output"));
    fprintf(stdout, "\n\t\t</description>\n");
    fprintf(stdout, "\t</flag>\n");

    /* quiet */
    fprintf(stdout, "\t<flag name=\"%s\">\n", "quiet");
    fprintf(stdout, "\t\t<description>\n\t\t\t");
    print_escaped_for_xml(stdout, _("Quiet module output"));
    fprintf(stdout, "\n\t\t</description>\n");
    fprintf(stdout, "\t</flag>\n");

    fprintf(stdout, "</task>\n");
}
/* Set up character set conversion for a loaded message catalog.

   DOMAIN_FILE and DOMAINBINDING are passed to _nl_find_msg to look up the
   catalog's header entry (the translation of "").  If that entry contains a
   "charset=" field, a converter from that charset to the output charset is
   opened and stored in DOMAIN->conv; otherwise DOMAIN->conv keeps its
   preinitialized value of -1.  DOMAIN->codeset_cntr and DOMAIN->conv_tab are
   reset as well.

   The output charset is, in order of precedence: DOMAINBINDING->codeset,
   the OUTPUT_CHARSET environment variable, the locale's charset.

   Returns the header entry found by _nl_find_msg, or NULL if there is none.

   NOTE(review): no return type is visible on the definition line in this
   excerpt; confirm against the full file.  */
internal_function
_nl_init_domain_conv (struct loaded_l10nfile *domain_file,
		      struct loaded_domain *domain,
		      struct binding *domainbinding)
{
  /* Find out about the character set the file is encoded with.
     This can be found (in textual form) in the entry "".  If this
     entry does not exist or if this does not contain the `charset='
     information, we will assume the charset matches the one the
     current locale and we don't have to perform any conversion.  */
  char *nullentry;
  size_t nullentrylen;

  /* Preinitialize fields, to avoid recursion during _nl_find_msg.  */
  domain->codeset_cntr =
    (domainbinding != NULL ? domainbinding->codeset_cntr : 0);
#ifdef _LIBC
  domain->conv = (__gconv_t) -1;
#else
# if HAVE_ICONV
  domain->conv = (iconv_t) -1;
# endif
#endif
  domain->conv_tab = NULL;

  /* Get the header entry.  */
  nullentry = _nl_find_msg (domain_file, domainbinding, "", &nullentrylen);

  if (nullentry != NULL)
    {
#if defined _LIBC || HAVE_ICONV
      const char *charsetstr;

      charsetstr = strstr (nullentry, "charset=");
      if (charsetstr != NULL)
	{
	  size_t len;
	  char *charset;
	  const char *outcharset;

	  /* Copy the charset name, which ends at the first blank, tab or
	     newline, into a NUL-terminated temporary.  */
	  charsetstr += strlen ("charset=");
	  len = strcspn (charsetstr, " \t\n");

	  charset = (char *) alloca (len + 1);
# if defined _LIBC || HAVE_MEMPCPY
	  *((char *) mempcpy (charset, charsetstr, len)) = '\0';
# else
	  memcpy (charset, charsetstr, len);
	  charset[len] = '\0';
# endif

	  /* The output charset should normally be determined by the
	     locale.  But sometimes the locale is not used or not correctly
	     set up, so we provide a possibility for the user to override
	     this.  Moreover, the value specified through
	     bind_textdomain_codeset overrides both.  */
	  if (domainbinding != NULL && domainbinding->codeset != NULL)
	    outcharset = domainbinding->codeset;
	  else
	    {
	      outcharset = getenv ("OUTPUT_CHARSET");
	      if (outcharset == NULL || outcharset[0] == '\0')
		{
# ifdef _LIBC
		  outcharset = _NL_CURRENT (LC_CTYPE, CODESET);
# else
# if HAVE_ICONV
		  extern const char *locale_charset (void);
		  outcharset = locale_charset ();
# endif
# endif
		}
	    }

# ifdef _LIBC
	  /* We always want to use transliteration.  */
	  outcharset = norm_add_slashes (outcharset, "TRANSLIT");
	  charset = norm_add_slashes (charset, NULL);
	  if (__gconv_open (outcharset, charset, &domain->conv,
			    GCONV_AVOID_NOCONV)
	      != __GCONV_OK)
	    domain->conv = (__gconv_t) -1;
# else
# if HAVE_ICONV
	  /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5,
	     we want to use transliteration.  */
# if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 \
       || _LIBICONV_VERSION >= 0x0105
	  /* Append "//TRANSLIT" unless the name already carries a '/'
	     suffix.  */
	  if (strchr (outcharset, '/') == NULL)
	    {
	      char *tmp;

	      len = strlen (outcharset);
	      tmp = (char *) alloca (len + 10 + 1);
	      memcpy (tmp, outcharset, len);
	      memcpy (tmp + len, "//TRANSLIT", 10 + 1);
	      outcharset = tmp;

	      domain->conv = iconv_open (outcharset, charset);

	      freea (outcharset);
	    }
	  else
# endif
	    domain->conv = iconv_open (outcharset, charset);
# endif
# endif

	  freea (charset);
	}
#endif /* _LIBC || HAVE_ICONV */
    }

  return nullentry;
}
/* Converts the UTF-32 string STRING to the locale's encoding.

   This forwards to u32_strconv_to_encoding () with the charset reported by
   locale_charset () and the iconveh_question_mark error handler, and returns
   that call's result.  */
char *
u32_strconv_to_locale (const uint32_t *string)
{
  return u32_strconv_to_encoding (string, locale_charset (),
                                  iconveh_question_mark);
}
/* Returns the canonical name of the encoding NAME.

   NAME is upper-cased, trailing "//TRANSLIT" and "//IGNORE" suffixes are
   removed, and the result is looked up in the alias tables.  An empty name,
   or an alias that stands for the locale's encoding, is replaced by
   locale_charset () and the lookup is repeated.  An alias that stands for
   the wchar_t encoding is mapped to a concrete encoding according to
   sizeof (wchar_t) on the platforms covered by the conditionals below.

   If NAME contains a non-ASCII byte, is too long for the lookup buffer, or
   is not found in either alias table, NAME itself is returned.  */
const char * iconv_canonicalize (const char * name)
{
  const char* code;
  char buf[MAX_WORD_LENGTH+10+1];
  const char* cp;
  char* bp;
  const struct alias * ap;
  unsigned int count;
  unsigned int index;
  const char* pool;

  /* Before calling aliases_lookup, convert the input string to upper case,
   * and check whether it's entirely ASCII (we call gperf with option "-7"
   * to achieve a smaller table) and non-empty. If it's not entirely ASCII,
   * or if it's too long, it is not a valid encoding name.
   */
  for (code = name;;) {
    /* Search code in the table. */
    /* Copy CODE into BUF in upper case; BP ends up at the terminating NUL. */
    for (cp = code, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
      unsigned char c = * (unsigned char *) cp;
      if (c >= 0x80)
        goto invalid;
      if (c >= 'a' && c <= 'z')
        c -= 'a'-'A';
      *bp = c;
      if (c == '\0')
        break;
      if (--count == 0)
        goto invalid;
    }
    /* Strip any number of trailing //TRANSLIT and //IGNORE suffixes. */
    for (;;) {
      if (bp-buf >= 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
        bp -= 10;
        *bp = '\0';
        continue;
      }
      if (bp-buf >= 8 && memcmp(bp-8,"//IGNORE",8)==0) {
        bp -= 8;
        *bp = '\0';
        continue;
      }
      break;
    }
    /* An empty name is looked up again as the locale's encoding. */
    if (buf[0] == '\0') {
      code = locale_charset();
      /* Avoid an endless loop that could occur when using an older version
         of localcharset.c. */
      if (code[0] == '\0')
        goto invalid;
      continue;
    }
    /* POOL tracks which string pool the matching alias table refers to. */
    pool = stringpool;
    ap = aliases_lookup(buf, (unsigned int)(bp-buf));
    if (ap == NULL) {
      pool = stringpool2;
      ap = aliases2_lookup(buf);
      if (ap == NULL)
        goto invalid;
    }
    if (ap->encoding_index == ei_local_char) {
      code = locale_charset();
      /* Avoid an endless loop that could occur when using an older version
         of localcharset.c. */
      if (code[0] == '\0')
        goto invalid;
      continue;
    }
    if (ap->encoding_index == ei_local_wchar_t) {
      /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
         This is also the case on native Woe32 systems and Cygwin >= 1.7, where
         we know that it is UTF-16.  */
#if ((defined _WIN32 || defined __WIN32__) && !defined __CYGWIN__) || (defined __CYGWIN__ && CYGWIN_VERSION_DLL_MAJOR >= 1007)
      if (sizeof(wchar_t) == 4) {
        index = ei_ucs4internal;
        break;
      }
      if (sizeof(wchar_t) == 2) {
# if WORDS_LITTLEENDIAN
        index = ei_utf16le;
# else
        index = ei_utf16be;
# endif
        break;
      }
#elif __STDC_ISO_10646__
      if (sizeof(wchar_t) == 4) {
        index = ei_ucs4internal;
        break;
      }
      if (sizeof(wchar_t) == 2) {
        index = ei_ucs2internal;
        break;
      }
      if (sizeof(wchar_t) == 1) {
        index = ei_iso8859_1;
        break;
      }
#endif
    }
    /* Ordinary alias (or wchar_t alias with no special case above). */
    index = ap->encoding_index;
    break;
  }
  return all_canonical[index] + pool;
 invalid:
  return name;
}
/* Prints the charset of the user's locale, followed by a newline, to
   standard output and exits successfully. */
int
main ()
{
  const char *charset;

  /* Adopt the locale from the environment before asking for its charset. */
  setlocale(LC_ALL, "");
  charset = locale_charset();

  printf("%s\n", charset);
  return 0;
}
/* Test driver for the quotearg functions.

   Runs compare_strings () over the expected-result tables: once per quoting
   style (results_g), once per quoting-flag combination (flag_results) and
   once per custom quote pair (custom_results).  Finally, if zerosize_ptr ()
   provides a page boundary, it quotes a string placed directly before that
   boundary to check that quotearg_buffer () does not read past the
   terminating NUL.

   The set_quoting_flags () assertions check the value returned by each call
   against the flags set by the preceding call, so the order of the
   statements matters.  Returns 0.  */
int
main (int argc _GL_UNUSED, char *argv[])
{
  int i;
  /* True when the locale is unibyte and the first byte of LQ is not
     printable in it.  */
  bool ascii_only = MB_CUR_MAX == 1 && !isprint ((unsigned char) LQ[0]);

  set_program_name (argv[0]);

  /* This part of the program is hard-wired to the C locale since it
     does not call setlocale.  However, according to POSIX, the use of
     8-bit bytes in a character context in the C locale gives
     unspecified results (that is, the C locale charset is allowed to
     be unibyte with 8-bit bytes rejected [ASCII], unibyte with 8-bit
     bytes being characters [often ISO-8859-1], or multibyte [often
     UTF-8]).  We assume that the latter two cases will be
     indistinguishable in this test - that is, the LQ and RQ sequences
     will pass through unchanged in either type of charset.  So when
     testing for quoting of str7, use the ascii_only flag to decide
     what to expect for the 8-bit data being quoted.  */
  ASSERT (!isprint ('\033'));
  for (i = literal_quoting_style; i <= clocale_quoting_style; i++)
    {
      set_quoting_style (NULL, (enum quoting_style) i);
      /* The two locale-dependent styles are only compared when the charset
         is reported as ASCII.  */
      if (!(i == locale_quoting_style || i == clocale_quoting_style)
          || (strcmp (locale_charset (), "ASCII") == 0
              || strcmp (locale_charset (), "ANSI_X3.4-1968") == 0))
        {
          compare_strings (use_quotearg_buffer, &results_g[i].group1,
                           ascii_only);
          compare_strings (use_quotearg, &results_g[i].group2,
                           ascii_only);
          if (i == c_quoting_style)
            compare_strings (use_quote_double_quotes, &results_g[i].group2,
                             ascii_only);
          compare_strings (use_quotearg_colon, &results_g[i].group3,
                           ascii_only);
        }
    }

  /* Flag tests: literal style with QA_ELIDE_NULL_BYTES.  */
  set_quoting_style (NULL, literal_quoting_style);
  ASSERT (set_quoting_flags (NULL, QA_ELIDE_NULL_BYTES) == 0);
  compare_strings (use_quotearg_buffer, &flag_results[0].group1, ascii_only);
  compare_strings (use_quotearg, &flag_results[0].group2, ascii_only);
  compare_strings (use_quotearg_colon, &flag_results[0].group3, ascii_only);

  /* Flag tests: C style with QA_ELIDE_OUTER_QUOTES.  */
  set_quoting_style (NULL, c_quoting_style);
  ASSERT (set_quoting_flags (NULL, QA_ELIDE_OUTER_QUOTES)
          == QA_ELIDE_NULL_BYTES);
  compare_strings (use_quotearg_buffer, &flag_results[1].group1, ascii_only);
  compare_strings (use_quotearg, &flag_results[1].group2, ascii_only);
  compare_strings (use_quote_double_quotes, &flag_results[1].group2,
                   ascii_only);
  compare_strings (use_quotearg_colon, &flag_results[1].group3, ascii_only);

  /* Flag tests: C style with QA_SPLIT_TRIGRAPHS.  */
  ASSERT (set_quoting_flags (NULL, QA_SPLIT_TRIGRAPHS)
          == QA_ELIDE_OUTER_QUOTES);
  compare_strings (use_quotearg_buffer, &flag_results[2].group1, ascii_only);
  compare_strings (use_quotearg, &flag_results[2].group2, ascii_only);
  compare_strings (use_quote_double_quotes, &flag_results[2].group2,
                   ascii_only);
  compare_strings (use_quotearg_colon, &flag_results[2].group3, ascii_only);

  ASSERT (set_quoting_flags (NULL, 0) == QA_SPLIT_TRIGRAPHS);

  /* Custom quoting: one pass per left/right quote pair.  */
  for (i = 0; i < sizeof custom_quotes / sizeof *custom_quotes; ++i)
    {
      set_custom_quoting (NULL,
                          custom_quotes[i][0], custom_quotes[i][1]);
      compare_strings (use_quotearg_buffer, &custom_results[i].group1,
                       ascii_only);
      compare_strings (use_quotearg, &custom_results[i].group2, ascii_only);
      compare_strings (use_quotearg_colon, &custom_results[i].group3,
                       ascii_only);
    }

  {
    /* Trigger the bug whereby quotearg_buffer would read beyond the NUL
       that defines the end of the string being quoted.  Use an input
       string whose NUL is the last byte before an unreadable page.  */
    char *z = zerosize_ptr ();

    if (z)
      {
        size_t q_len = 1024;
        char *q = malloc (q_len + 1);
        char buf[10];
        memset (q, 'Q', q_len);
        q[q_len] = 0;

        /* Z points to the boundary between a readable/writable page
           and one that is neither readable nor writable.  Position
           our string so its NUL is at the end of the writable one.  */
        char const *str = "____";
        size_t s_len = strlen (str);
        z -= s_len + 1;
        memcpy (z, str, s_len + 1);

        set_custom_quoting (NULL, q, q);
        /* Whether this actually triggers a SEGV depends on the
           implementation of memcmp: whether it compares only
           byte-at-a-time, and from left to right (no SEGV) or
           some other way.  */
        size_t n = quotearg_buffer (buf, sizeof buf, z, SIZE_MAX, NULL);
        /* The reported length covers the string plus both quotes even
           though BUF is far smaller; BUF holds the start of the left
           quote.  */
        ASSERT (n == s_len + 2 * q_len);
        ASSERT (memcmp (buf, q, sizeof buf) == 0);
        free (q);
      }
  }

  quotearg_free ();

  return 0;
}
void G__wps_print_process_description(void) { struct Option *opt; struct Flag *flag; char *type; char *s, *top; const char *value = NULL; int i; char *encoding; int new_prompt = 0; int store = 1; int status = 1; const char *identifier = NULL; const char *title = NULL; const char *abstract = NULL; const char **keywords = NULL; int data_type, is_input, is_output; int num_raster_inputs = 0, num_raster_outputs = 0; int num_vector_inputs = 0, num_vector_outputs = 0; int num_strds_inputs = 0, num_strds_outputs = 0; int num_stvds_inputs = 0, num_stvds_outputs = 0; int min = 0, max = 0; int num_keywords = 0; int found_output = 0; int is_tuple; /* Checks the key_descr for comma separated values */ int num_tuples; /* Counts the "," in key_descr */ new_prompt = G__uses_new_gisprompt(); /* gettext converts strings to encoding returned by nl_langinfo(CODESET) */ #if defined(HAVE_LANGINFO_H) encoding = nl_langinfo(CODESET); if (!encoding || strlen(encoding) == 0) { encoding = "UTF-8"; } #elif defined(__MINGW32__) && defined(USE_NLS) encoding = locale_charset(); if (!encoding || strlen(encoding) == 0) { encoding = "UTF-8"; } #else encoding = "UTF-8"; #endif if (!st->pgm_name) st->pgm_name = G_program_name(); if (!st->pgm_name) st->pgm_name = "??"; /* the identifier of the process is the module name */ identifier = st->pgm_name; if (st->module_info.description) { title = st->module_info.description; abstract = st->module_info.description; } if (st->module_info.keywords) { keywords = st->module_info.keywords; num_keywords = st->n_keys; } wps_print_process_descriptions_begin(); /* store and status are supported as default. The WPS server should change this if necessary */ wps_print_process_description_begin(store, status, identifier, title, abstract, keywords, num_keywords); wps_print_data_inputs_begin(); /* Print the bounding box element with all the coordinate reference systems, which are supported by grass*/ /* Currently Disabled! 
A list of all proj4 supported EPSG coordinate reference systems must be implemented*/ if(1 == 0) wps_print_bounding_box_data(); /* We parse only the inputs at the beginning */ if (st->n_opts) { opt = &st->first_option; while (opt != NULL) { identifier = NULL; title = NULL; abstract = NULL; keywords = NULL; num_keywords = 0; value = NULL; is_input = 1; is_output = 0; is_tuple = 0; num_tuples = 0; data_type = TYPE_OTHER; /* Check the gisprompt */ if (opt->gisprompt) { const char *atts[] = { "age", "element", "prompt", NULL }; top = G_calloc(strlen(opt->gisprompt) + 1, 1); strcpy(top, opt->gisprompt); s = strtok(top, ","); for (i = 0; s != NULL && atts[i] != NULL; i++) { char *token = G_store(s); /* we print only input parameter, sort out the output parameter */ if(strcmp(token, "new") == 0) { is_input = 0; is_output = 1; } if(strcmp(token, "raster") == 0) { data_type = TYPE_RASTER; /* Count the raster inputs and outputs for default option creation */ if(is_input == 1) num_raster_inputs++; if(is_output == 1) num_raster_outputs++; } if(strcmp(token, "vector") == 0) { data_type = TYPE_VECTOR; if(is_input == 1) num_vector_inputs++; if(is_output == 1) num_vector_outputs++; } /* Modules may have different types of space time datasets as inputs */ if(strcmp(token, "stds") == 0) { data_type = TYPE_STDS; } if(strcmp(token, "strds") == 0) { data_type = TYPE_STRDS; if(is_input == 1) num_strds_inputs++; if(is_output == 1) num_strds_outputs++; } if(strcmp(token, "stvds") == 0) { data_type = TYPE_STVDS; if(is_input == 1) num_stvds_inputs++; if(is_output == 1) num_stvds_outputs++; } if(strcmp(token, "file") == 0) { data_type = TYPE_PLAIN_TEXT; } s = strtok(NULL, ","); G_free(token); } G_free(top); } /* Check the key description */ if (opt->key_desc) { top = G_calloc(strlen(opt->key_desc) + 1, 1); strcpy(top, opt->key_desc); s = strtok(top, ","); /* Count comma's */ for (i = 0; s != NULL; i++) { num_tuples++; s = strtok(NULL, ","); } if(num_tuples > 1) is_tuple = 1; G_free(top); } 
/* We have an input option */ if(is_input == 1) { switch (opt->type) { case TYPE_INTEGER: type = "integer"; break; case TYPE_DOUBLE: type = "float"; break; case TYPE_STRING: type = "string"; break; default: type = "string"; break; } identifier = opt->key; if(opt->required == YES) { if(is_tuple) min = num_tuples; else min = 1; } else { min = 0; } if(opt->multiple == YES) { max = 1024; } else { if(is_tuple) max = num_tuples; else max = 1; } if(opt->label) { title = opt->label; } if (opt->description) { if(!opt->label) title = opt->description; else abstract = opt->description; } if (opt->def) { value = opt->def; } if (opt->options) { /* TODO: * add something like * <range min="xxx" max="xxx"/> * to <values> */ i = 0; while (opt->opts[i]) { i++; } keywords = opt->opts; num_keywords = i; } if(data_type == TYPE_RASTER || data_type == TYPE_VECTOR || data_type == TYPE_STRDS || data_type == TYPE_STVDS || data_type == TYPE_STDS || data_type == TYPE_PLAIN_TEXT) { /* 2048 is the maximum size of the map in mega bytes */ wps_print_complex_input(min, max, identifier, title, abstract, 2048, data_type); } else { /* The keyword array is missused for options, type means the type of the value (integer, float ... 
)*/ wps_print_literal_input_output(WPS_INPUT, min, max, identifier, title, abstract, type, 0, keywords, num_keywords, value, TYPE_OTHER); } } opt = opt->next_opt; } } /* Flags are always input options and can be false or true (boolean) */ if (st->n_flags) { flag = &st->first_flag; while (flag != NULL) { /* The identifier is the flag "-x" */ char* ident = (char*)G_calloc(3, sizeof(char)); ident[0] = '-'; ident[1] = flag->key; ident[2] = '\0'; title = NULL; abstract = NULL; if (flag->description) { title = flag->description; abstract = flag->description; } const char *val[] = {"true","false"}; wps_print_literal_input_output(WPS_INPUT, 0, 1, ident, title, NULL, "boolean", 0, val, 2, "false", TYPE_OTHER); flag = flag->next_flag; } } /* We have two default options, which define the resolution of the created mapset */ if(num_raster_inputs > 0 || num_raster_outputs > 0 || num_strds_inputs > 0 || num_strds_outputs > 0) { wps_print_literal_input_output(WPS_INPUT, 0, 1, "grass_resolution_ns", "Resolution of the mapset in north-south direction in meters or degrees", "This parameter defines the north-south resolution of the mapset in meter or degrees, which should be used to process the input and output raster data. To enable this setting, you need to specify north-south and east-west resolution.", "float", 1, NULL, 0, NULL, TYPE_OTHER); wps_print_literal_input_output(WPS_INPUT, 0, 1, "grass_resolution_ew", "Resolution of the mapset in east-west direction in meters or degrees", "This parameter defines the east-west resolution of the mapset in meters or degrees, which should be used to process the input and output raster data. 
To enable this setting, you need to specify north-south and east-west resolution.", "float", 1, NULL, 0, NULL, TYPE_OTHER); } /* In case multi band raster maps should be imported, the band number must be provided */ if(num_raster_inputs > 0) wps_print_literal_input_output(WPS_INPUT, 0, 1, "grass_band_number", "Band to select for processing (default is all bands)", "This parameter defines band number of the input raster files which should be processed. As default all bands are processed and used as single and multiple inputs for raster modules.", "integer", 0, NULL, 0, NULL, TYPE_OTHER); /* End of inputs */ wps_print_data_inputs_end(); /* Start of the outputs */ wps_print_process_outputs_begin(); found_output = 0; /*parse the output. only raster maps, vector maps, space time raster and vector datasets plus stdout are supported */ if (st->n_opts) { opt = &st->first_option; while (opt != NULL) { identifier = NULL; title = NULL; abstract = NULL; value = NULL; is_output = 0; data_type = TYPE_OTHER; if (opt->gisprompt) { const char *atts[] = { "age", "element", "prompt", NULL }; top = G_calloc(strlen(opt->gisprompt) + 1, 1); strcpy(top, opt->gisprompt); s = strtok(top, ","); for (i = 0; s != NULL && atts[i] != NULL; i++) { char *token = G_store(s); /* we print only the output parameter */ if(strcmp(token, "new") == 0) is_output = 1; if(strcmp(token, "raster") == 0) { data_type = TYPE_RASTER; } if(strcmp(token, "vector") == 0) { data_type = TYPE_VECTOR; } if(strcmp(token, "stds") == 0) { data_type = TYPE_STDS; } if(strcmp(token, "strds") == 0) { data_type = TYPE_STRDS; } if(strcmp(token, "stvds") == 0) { data_type = TYPE_STVDS; } if(strcmp(token, "file") == 0) { data_type = TYPE_PLAIN_TEXT; } s = strtok(NULL, ","); G_free(token); } G_free(top); } /* Only single module output is supported!! 
*/ if(is_output == 1) { if(opt->multiple == YES) G_warning(_("Multiple outputs are not supported by WPS 1.0.0")); identifier = opt->key; if(opt->label) { title = opt->label; } if (opt->description) { if(!opt->label) title = opt->description; else abstract = opt->description; } if(data_type == TYPE_RASTER || data_type == TYPE_VECTOR || data_type == TYPE_STRDS || data_type == TYPE_STVDS || data_type == TYPE_STDS || data_type == TYPE_PLAIN_TEXT) { wps_print_complex_output(identifier, title, abstract, data_type); found_output = 1; } } opt = opt->next_opt; } /* we assume the computatuon output on stdout, if no raster/vector output was found*/ if(found_output == 0) wps_print_complex_output("stdout", "Module output on stdout", "The output of the module written to stdout", TYPE_PLAIN_TEXT); } wps_print_process_outputs_end(); wps_print_process_description_end(); wps_print_process_descriptions_end(); }
int main (void) { int result = 0; static struct re_pattern_buffer regex; unsigned char folded_chars[UCHAR_MAX + 1]; int i; const char *s; struct re_registers regs; #if HAVE_DECL_ALARM /* Some builds of glibc go into an infinite loop on this test. */ int alarm_value = 2; signal (SIGALRM, SIG_DFL); alarm (alarm_value); #endif if (setlocale (LC_ALL, "en_US.UTF-8")) { { /* http://sourceware.org/ml/libc-hacker/2006-09/msg00008.html This test needs valgrind to catch the bug on Debian GNU/Linux 3.1 x86, but it might catch the bug better on other platforms and it shouldn't hurt to try the test here. */ static char const pat[] = "insert into"; static char const data[] = "\xFF\0\x12\xA2\xAA\xC4\xB1,K\x12\xC4\xB1*\xACK"; re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | RE_ICASE); memset (®ex, 0, sizeof regex); s = re_compile_pattern (pat, sizeof pat - 1, ®ex); if (s) result |= 1; else if (re_search (®ex, data, sizeof data - 1, 0, sizeof data - 1, ®s) != -1) result |= 1; } /* Check whether it's really a UTF-8 locale. On mingw, the setlocale call succeeds but returns "English_United States.1252", with locale_charset() returning "CP1252". */ if (strcmp (locale_charset (), "UTF-8") == 0) { /* This test is from glibc bug 15078. The test case is from Andreas Schwab in <http://www.sourceware.org/ml/libc-alpha/2013-01/msg00967.html>. */ static char const pat[] = "[^x]x"; static char const data[] = /* <U1000><U103B><U103D><U1014><U103A><U102F><U1015><U103A> */ "\xe1\x80\x80" "\xe1\x80\xbb" "\xe1\x80\xbd" "\xe1\x80\x94" "\xe1\x80\xba" "\xe1\x80\xaf" "\xe1\x80\x95" "\xe1\x80\xba" "x"; re_set_syntax (0); memset (®ex, 0, sizeof regex); s = re_compile_pattern (pat, sizeof pat - 1, ®ex); if (s) result |= 1; else { i = re_search (®ex, data, sizeof data - 1, 0, sizeof data - 1, 0); if (i != 0 && i != 21) result |= 1; } } if (! setlocale (LC_ALL, "C")) return 1; } /* This test is from glibc bug 3957, reported by Andrew Mackey. 
*/ re_set_syntax (RE_SYNTAX_EGREP | RE_HAT_LISTS_NOT_NEWLINE); memset (®ex, 0, sizeof regex); s = re_compile_pattern ("a[^x]b", 6, ®ex); if (s) result |= 2; /* This should fail, but succeeds for glibc-2.5. */ else if (re_search (®ex, "a\nb", 3, 0, 3, ®s) != -1) result |= 2; /* This regular expression is from Spencer ere test number 75 in grep-2.3. */ re_set_syntax (RE_SYNTAX_POSIX_EGREP); memset (®ex, 0, sizeof regex); for (i = 0; i <= UCHAR_MAX; i++) folded_chars[i] = i; regex.translate = folded_chars; s = re_compile_pattern ("a[[:@:>@:]]b\n", 11, ®ex); /* This should fail with _Invalid character class name_ error. */ if (!s) result |= 4; /* Ensure that [b-a] is diagnosed as invalid, when using RE_NO_EMPTY_RANGES. */ re_set_syntax (RE_SYNTAX_POSIX_EGREP | RE_NO_EMPTY_RANGES); memset (®ex, 0, sizeof regex); s = re_compile_pattern ("a[b-a]", 6, ®ex); if (s == 0) result |= 8; /* This should succeed, but does not for glibc-2.1.3. */ memset (®ex, 0, sizeof regex); s = re_compile_pattern ("{1", 2, ®ex); if (s) result |= 8; /* The following example is derived from a problem report against gawk from Jorge Stolfi <*****@*****.**>. */ memset (®ex, 0, sizeof regex); s = re_compile_pattern ("[an\371]*n", 7, ®ex); if (s) result |= 8; /* This should match, but does not for glibc-2.2.1. */ else if (re_match (®ex, "an", 2, 0, ®s) != 2) result |= 8; memset (®ex, 0, sizeof regex); s = re_compile_pattern ("x", 1, ®ex); if (s) result |= 8; /* glibc-2.2.93 does not work with a negative RANGE argument. */ else if (re_search (®ex, "wxy", 3, 2, -2, ®s) != 1) result |= 8; /* The version of regex.c in older versions of gnulib ignored RE_ICASE. Detect that problem too. 
*/ re_set_syntax (RE_SYNTAX_EMACS | RE_ICASE); memset (®ex, 0, sizeof regex); s = re_compile_pattern ("x", 1, ®ex); if (s) result |= 16; else if (re_search (®ex, "WXY", 3, 0, 3, ®s) < 0) result |= 16; /* Catch a bug reported by Vin Shelton in http://lists.gnu.org/archive/html/bug-coreutils/2007-06/msg00089.html */ re_set_syntax (RE_SYNTAX_POSIX_BASIC & ~RE_CONTEXT_INVALID_DUP & ~RE_NO_EMPTY_RANGES); memset (®ex, 0, sizeof regex); s = re_compile_pattern ("[[:alnum:]_-]\\\\+$", 16, ®ex); if (s) result |= 32; /* REG_STARTEND was added to glibc on 2004-01-15. Reject older versions. */ if (! REG_STARTEND) result |= 64; #if 0 /* It would be nice to reject hosts whose regoff_t values are too narrow (including glibc on hosts with 64-bit ptrdiff_t and 32-bit int), but we should wait until glibc implements this feature. Otherwise, support for equivalence classes and multibyte collation symbols would always be broken except when compiling --without-included-regex. */ if (sizeof (regoff_t) < sizeof (ptrdiff_t) || sizeof (regoff_t) < sizeof (ssize_t)) result |= 64; #endif return result; }
static void init_gettext_charset(const char *domain)
{
	/*
	   Tell gettext which encoding the user wants for the messages of
	   `domain`, without leaving LC_CTYPE set from the environment for
	   the rest of the program.

	   Why LC_CTYPE is not simply left enabled:

	   - It works around a vsnprintf bug in the GNU C Library [1],
	     which triggered a "your vsnprintf is broken" error on Git's
	     own repository when inspecting v0.99.6~1 under a UTF-8
	     locale.  That commit contains an ISO-8859-1 encoded author
	     name, which the locale-aware vsnprintf(3) refuses to
	     interpolate because the data encoding and the locale
	     mismatch.

	   - Even without that bug, enabling LC_CTYPE would require
	     auditing all the code that uses C functions whose semantics
	     are modified by LC_CTYPE.

	   Why setting only LC_MESSAGES is not enough:

	   The encoding of our PO files is declared [2], so gettext tries
	   to recode the messages to the user's locale.  Without LC_CTYPE
	   it does not know the user's encoding and emits something like
	   this on 'git init' under the Icelandic locale:

	       Bj? til t?ma Git lind ? /hlagh/.git/

	   i.e. all the non-US-ASCII characters are turned into question
	   marks.

	   The trick:

	   LC_CTYPE is set from the environment only while the charset is
	   looked up and handed to bind_textdomain_codeset, and is then
	   put back to "C".  That suffices to tell gettext what encoding
	   it should emit in, so it now says:

	       Bjó til tóma Git lind í /hlagh/.git/

	   and the equivalent ISO-8859-1 string is emitted under an
	   ISO-8859-1 locale.  We get the advantage of setting LC_CTYPE
	   (talking to the user in their language/encoding) without the
	   major drawback (changed semantics for C functions we rely on).

	   Known limitation:

	   Foreign functions using other message catalogs that do not use
	   this trick still have the problem, e.g. if we have to call
	   perror(3):

	   #include <stdio.h>
	   #include <locale.h>
	   #include <errno.h>

	   int main(void)
	   {
		   setlocale(LC_MESSAGES, "");
		   setlocale(LC_CTYPE, "C");
		   errno = ENODEV;
		   perror("test");
		   return 0;
	   }

	   Running that gives a message with question marks:

	   $ LANGUAGE= LANG=de_DE.utf8 ./test
	   test: Kein passendes Ger?t gefunden

	   In the long term we should probably see about getting that
	   vsnprintf bug in glibc fixed, and audit our code so it won't
	   fall apart under a non-C locale.  Then we could simply set
	   LC_CTYPE from the environment, which would make things like the
	   external perror(3) messages work.

	   See t/t0203-gettext-setlocale-sanity.sh's "gettext.c" tests for
	   regression tests.

	   1. http://sourceware.org/bugzilla/show_bug.cgi?id=6530
	   2. E.g. "Content-Type: text/plain; charset=UTF-8\n" in po/is.po
	*/

	/* Briefly adopt the user's LC_CTYPE so the charset lookup sees it. */
	setlocale(LC_CTYPE, "");
	bind_textdomain_codeset(domain, locale_charset());
	/* Back to the "C" ctype for the rest of the program. */
	setlocale(LC_CTYPE, "C");
}
/*
 * Allocate a conversion descriptor for converting from encoding `fromcode`
 * to encoding `tocode`.
 *
 * Both names are upper-cased and looked up with aliases_lookup() and then
 * aliases2_lookup().  A "//TRANSLIT" suffix is stripped from either name;
 * only on `tocode` does it switch transliteration on.
 *
 * Returns the new descriptor cast to iconv_t.  On failure returns
 * (iconv_t)(-1) with errno set to EINVAL (unknown, non-ASCII or too long
 * encoding name, or no usable locale charset) or ENOMEM (malloc failed).
 */
iconv_t iconv_open (const char* tocode, const char* fromcode)
{
  struct conv_struct * cd;
  char buf[MAX_WORD_LENGTH+10+1];  /* upper-cased copy of the name being looked up */
  const char* cp;
  char* bp;
  const struct alias * ap;
  unsigned int count;
  unsigned int from_index;
  int from_wchar;
  unsigned int to_index;
  int to_wchar;
  int transliterate = 0;

  /* Before calling aliases_lookup, convert the input string to upper case,
   * and check whether it's entirely ASCII (we call gperf with option "-7"
   * to achieve a smaller table) and non-empty. If it's not entirely ASCII,
   * or if it's too long, it is not a valid encoding name.
   */
  /* Resolve tocode.  The loop is re-entered via `continue` whenever the
     name turns out to be a placeholder that is replaced by the result of
     locale_charset(). */
  for (to_wchar = 0;;) {
    /* Search tocode in the table. */
    for (cp = tocode, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
      unsigned char c = * (unsigned char *) cp;
      if (c >= 0x80)
        goto invalid;
      if (c >= 'a' && c <= 'z')
        c -= 'a'-'A';
      *bp = c;
      if (c == '\0')
        break;
      /* count bounds the copy to the size of buf */
      if (--count == 0)
        goto invalid;
    }
    /* Strip a trailing "//TRANSLIT" and remember that it was requested. */
    if (bp-buf > 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
      bp -= 10;
      *bp = '\0';
      transliterate = 1;
    }
    ap = aliases_lookup(buf,bp-buf);
    if (ap == NULL) {
      ap = aliases2_lookup(buf);
      if (ap == NULL)
        goto invalid;
    }
    if (ap->encoding_index == ei_local_char) {
      /* Placeholder for the locale's encoding: look that name up instead. */
      tocode = locale_charset();
      if (tocode != NULL)
        continue;
      goto invalid;
    }
    if (ap->encoding_index == ei_local_wchar_t) {
#if __STDC_ISO_10646__
      /* Pick the target encoding from the width of wchar_t. */
      if (sizeof(wchar_t) == 4) {
        to_index = ei_ucs4internal;
        break;
      }
      if (sizeof(wchar_t) == 2) {
        to_index = ei_ucs2internal;
        break;
      }
      if (sizeof(wchar_t) == 1) {
        to_index = ei_iso8859_1;
        break;
      }
#endif
#if HAVE_MBRTOWC
      /* Otherwise resolve the locale encoding and flag the wchar_t side
         so that the wchar_to loop functions are selected below. */
      to_wchar = 1;
      tocode = locale_charset();
      if (tocode != NULL)
        continue;
#endif
      goto invalid;
    }
    to_index = ap->encoding_index;
    break;
  }
  /* Resolve fromcode the same way. */
  for (from_wchar = 0;;) {
    /* Search fromcode in the table. */
    for (cp = fromcode, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
      unsigned char c = * (unsigned char *) cp;
      if (c >= 0x80)
        goto invalid;
      if (c >= 'a' && c <= 'z')
        c -= 'a'-'A';
      *bp = c;
      if (c == '\0')
        break;
      if (--count == 0)
        goto invalid;
    }
    /* A "//TRANSLIT" suffix is accepted on fromcode but has no effect. */
    if (bp-buf > 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
      bp -= 10;
      *bp = '\0';
    }
    ap = aliases_lookup(buf,bp-buf);
    if (ap == NULL) {
      ap = aliases2_lookup(buf);
      if (ap == NULL)
        goto invalid;
    }
    if (ap->encoding_index == ei_local_char) {
      fromcode = locale_charset();
      if (fromcode != NULL)
        continue;
      goto invalid;
    }
    if (ap->encoding_index == ei_local_wchar_t) {
#if __STDC_ISO_10646__
      if (sizeof(wchar_t) == 4) {
        from_index = ei_ucs4internal;
        break;
      }
      if (sizeof(wchar_t) == 2) {
        from_index = ei_ucs2internal;
        break;
      }
      if (sizeof(wchar_t) == 1) {
        from_index = ei_iso8859_1;
        break;
      }
#endif
#if HAVE_WCRTOMB
      from_wchar = 1;
      fromcode = locale_charset();
      if (fromcode != NULL)
        continue;
#endif
      goto invalid;
    }
    from_index = ap->encoding_index;
    break;
  }
  /* The larger wchar_conv_struct is allocated when exactly one side goes
     through wchar_t; its extra state is initialized further down. */
  cd = (struct conv_struct *) malloc(from_wchar != to_wchar
                                     ? sizeof(struct wchar_conv_struct)
                                     : sizeof(struct conv_struct));
  if (cd == NULL) {
    errno = ENOMEM;
    return (iconv_t)(-1);
  }
  cd->iindex = from_index;
  cd->ifuncs = all_encodings[from_index].ifuncs;
  cd->oindex = to_index;
  cd->ofuncs = all_encodings[to_index].ofuncs;
  cd->oflags = all_encodings[to_index].oflags;
  /* Initialize the loop functions. */
  /* One of four loop implementations is chosen from the (to_wchar,
     from_wchar) pair; the wchar variants exist only when the matching
     HAVE_MBRTOWC / HAVE_WCRTOMB macro is set. */
#if HAVE_MBRTOWC
  if (to_wchar) {
#if HAVE_WCRTOMB
    if (from_wchar) {
      cd->lfuncs.loop_convert = wchar_id_loop_convert;
      cd->lfuncs.loop_reset = wchar_id_loop_reset;
    } else
#endif
    {
      cd->lfuncs.loop_convert = wchar_to_loop_convert;
      cd->lfuncs.loop_reset = wchar_to_loop_reset;
    }
  } else
#endif
  {
#if HAVE_WCRTOMB
    if (from_wchar) {
      cd->lfuncs.loop_convert = wchar_from_loop_convert;
      cd->lfuncs.loop_reset = wchar_from_loop_reset;
    } else
#endif
    {
      cd->lfuncs.loop_convert = unicode_loop_convert;
      cd->lfuncs.loop_reset = unicode_loop_reset;
    }
  }
  /* Initialize the states. */
  memset(&cd->istate,'\0',sizeof(state_t));
  memset(&cd->ostate,'\0',sizeof(state_t));
  /* Initialize the operation flags. */
  cd->transliterate = transliterate;
  /* Initialize additional fields. */
  if (from_wchar != to_wchar) {
    struct wchar_conv_struct * wcd = (struct wchar_conv_struct *) cd;
    memset(&wcd->state,'\0',sizeof(mbstate_t));
  }
  /* Done. */
  return (iconv_t)cd;
invalid:
  /* Shared exit for every rejected encoding name. */
  errno = EINVAL;
  return (iconv_t)(-1);
}
/* Determine the word break points in the N-byte string S, which is in the
   locale encoding, and store the result in P[0..N-1].

   A string that is already UTF-8 is passed straight to u8_wordbreaks.
   Any other encoding is converted to UTF-8 first; the breaks found in the
   converted text are then mapped back onto the bytes of S.  If the
   conversion (or an allocation) fails, P is filled with zeros, i.e. no
   word breaks are reported -- except that, when C_CTYPE_ASCII is set, a
   pure ASCII string is still analyzed directly.

   Nothing is done when N is 0.  */
void
ulc_wordbreaks (const char *s, size_t n, char *p)
{
  const char *charset;
  size_t *offset_map;

  if (n == 0)
    return;

  charset = locale_charset ();
  if (is_utf8_encoding (charset))
    {
      u8_wordbreaks ((const uint8_t *) s, n, p);
      return;
    }

  /* offset_map[i] receives the offset in the converted string that
     corresponds to byte i of S.  */
  offset_map = (size_t *) malloc (n * sizeof (size_t));
  if (offset_map != NULL)
    {
      size_t utf8_len;
      uint8_t *utf8 =
        u8_conv_from_encoding (charset, iconveh_question_mark,
                               s, n, offset_map, NULL, &utf8_len);

      if (utf8 != NULL)
        {
          char *utf8_breaks = (char *) (utf8_len > 0 ? malloc (utf8_len) : NULL);

          if (utf8_len == 0 || utf8_breaks != NULL)
            {
              size_t pos;

              /* Word breaks of the UTF-8 text ...  */
              u8_wordbreaks (utf8, utf8_len, utf8_breaks);

              /* ... carried back to the original byte positions.  Bytes
                 without a counterpart keep the value 0.  */
              memset (p, 0, n);
              for (pos = 0; pos < n; pos++)
                {
                  size_t mapped = offset_map[pos];

                  if (mapped != (size_t)(-1))
                    p[pos] = utf8_breaks[mapped];
                }

              free (utf8_breaks);
              free (utf8);
              free (offset_map);
              return;
            }

          free (utf8);
        }

      free (offset_map);
    }

  /* Impossible to convert.  */
#if C_CTYPE_ASCII
  if (is_all_ascii (s, n))
    {
      /* ASCII is a subset of UTF-8.  */
      u8_wordbreaks ((const uint8_t *) s, n, p);
      return;
    }
#endif

  /* We have a non-ASCII string and cannot convert it.
     Don't produce any word breaks.  */
  memset (p, 0, n);
}
/* Convert the NUL-terminated STRING from the encoding reported by
   locale_charset () by delegating to u16_strconv_from_encoding with the
   iconveh_question_mark handler, and return that function's result
   unchanged.  */
uint16_t *
u16_strconv_from_locale (const char *string)
{
  return u16_strconv_from_encoding (string, locale_charset (),
                                    iconveh_question_mark);
}