/*
 * p_isalpha
 *		Test whether the character at the parser's current position is
 *		alphabetic.
 *
 * If prs->usewide is set, the element of prs->wstr at index
 * state->poschar is examined; otherwise the byte at prs->str +
 * state->posbyte is passed to isalpha().
 */
static int
p_isalpha(TParser *prs)
{
	Assert(prs->state);

	/* Narrow case: classify the current byte directly. */
	if (!prs->usewide)
		return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));

	if (lc_ctype_is_c())
	{
		unsigned int wc = prs->wstr[prs->state->poschar];

		/*
		 * With the C locale, any non-ASCII symbol of a multibyte encoding
		 * is treated as an alpha character.
		 */
		return (wc > 0x7f) ? 1 : isalpha(0xff & wc);
	}

	return iswalpha((wint_t) prs->wstr[prs->state->poschar]);
}
/*
 * pg_set_regex_collation: set collation for these functions to obey
 *
 * Invoked when compilation or execution of a regexp begins.  Regexp
 * operations do not need to be re-entrant, so the outcome is simply kept
 * in static variables (pg_regex_strategy, pg_regex_locale and
 * pg_regex_collation).
 *
 * Raises ERRCODE_INDETERMINATE_COLLATION if "collation" is not a valid OID.
 */
void
pg_set_regex_collation(Oid collation)
{
	if (lc_ctype_is_c(collation))
	{
		/* C/POSIX collations take this path whatever the database encoding */
		pg_regex_strategy = PG_REGEX_LOCALE_C;
		pg_regex_locale = 0;
		pg_regex_collation = C_COLLATION_OID;
		return;
	}

	if (collation == DEFAULT_COLLATION_OID)
		pg_regex_locale = 0;
	else if (!OidIsValid(collation))
	{
		/*
		 * Usually this means the parser was unable to resolve a conflict
		 * between implicit collations, so phrase the error accordingly.
		 */
		ereport(ERROR,
				(errcode(ERRCODE_INDETERMINATE_COLLATION),
				 errmsg("could not determine which collation to use for regular expression"),
				 errhint("Use the COLLATE clause to set the collation explicitly.")));
	}
	else
	{
		/*
		 * NB: pg_newlocale_from_collation will fail if not HAVE_LOCALE_T;
		 * hence the combination "pg_regex_locale != 0 without
		 * HAVE_LOCALE_T" needs no consideration below.
		 */
		pg_regex_locale = pg_newlocale_from_collation(collation);
	}

	/* Pick wide or single-byte handling, with or without an explicit locale */
#ifdef USE_WIDE_UPPER_LOWER
	if (GetDatabaseEncoding() == PG_UTF8)
		pg_regex_strategy = pg_regex_locale ?
			PG_REGEX_LOCALE_WIDE_L : PG_REGEX_LOCALE_WIDE;
	else
#endif   /* USE_WIDE_UPPER_LOWER */
		pg_regex_strategy = pg_regex_locale ?
			PG_REGEX_LOCALE_1BYTE_L : PG_REGEX_LOCALE_1BYTE;

	pg_regex_collation = collation;
}
/*
 * _t_isprint
 *		Report whether the character starting at ptr is printable.
 *
 * Under a C ctype locale the first byte is classified with isprint().
 * Otherwise one character is converted to wchar_t with char2wchar() and
 * classified with iswprint().
 *
 * Returns nonzero if printable, zero if not.  Zero is also returned when
 * the multibyte-to-wide conversion reports failure.
 */
int
_t_isprint(const char *ptr)
{
	wchar_t		character[2];

	if (lc_ctype_is_c())
		return isprint(TOUCHAR(ptr));

	/*
	 * A failed conversion (mbstowcs-style (size_t) -1 result) stores
	 * nothing in "character"; classifying it anyway would read an
	 * uninitialized value.  Treat such input as not printable.
	 */
	if (char2wchar(character, ptr, 1) == (size_t) -1)
		return 0;

	return iswprint((wint_t) character[0]);
}
/*
 * t_isprint
 *		Report whether the (possibly multibyte) character at ptr is
 *		printable.
 *
 * A character that pg_mblen() reports as one byte long, or any character
 * under a C ctype locale, is classified with isprint().  Anything else is
 * converted to wchar_t first and classified with iswprint().
 */
int
t_isprint(const char *ptr)
{
	wchar_t		wide[2];
	int			nbytes = pg_mblen(ptr);

	if (nbytes != 1 && !lc_ctype_is_c())
	{
		/* room for one wide character plus its terminator */
		char2wchar(wide, 2, ptr, nbytes);
		return iswprint((wint_t) wide[0]);
	}

	return isprint(TOUCHAR(ptr));
}
/*
 * t_isprint
 *		Report whether the (possibly multibyte) character at ptr is
 *		printable.
 *
 * A character that pg_mblen() reports as one byte long, or any character
 * when the collation's ctype is C, is classified with isprint().
 * Anything else is converted to wchar_t first and classified with
 * iswprint().
 *
 * The collation and locale are currently fixed placeholders (see TODOs).
 */
int
t_isprint(const char *ptr)
{
	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
	pg_locale_t mylocale = 0;	/* TODO */
	wchar_t		wide[2];
	int			nbytes = pg_mblen(ptr);

	if (nbytes != 1 && !lc_ctype_is_c(collation))
	{
		/* room for one wide character plus its terminator */
		char2wchar(wide, 2, ptr, nbytes, mylocale);
		return iswprint((wint_t) wide[0]);
	}

	return isprint(TOUCHAR(ptr));
}
size_t char2wchar(wchar_t *to, const char *from, size_t len) { if (len == 0) return 0; #ifdef WIN32 if (GetDatabaseEncoding() == PG_UTF8) { int r; r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len); if (!r) { pg_verifymbstr(from, strlen(from), false); ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("invalid multibyte character for locale"), errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); } Assert(r <= len); return r; } else #endif /* WIN32 */ if ( lc_ctype_is_c() ) { /* * pg_mb2wchar_with_len always adds trailing '\0', so * 'to' should be allocated with sufficient space */ return pg_mb2wchar_with_len(from, (pg_wchar *)to, len); } return mbstowcs(to, from, len); }
/*
 * wchar2char --- convert wide characters to multibyte format
 *
 * The API mirrors the standard wcstombs() function: tolen is the maximum
 * number of bytes to store at *to, and *from must be zero-terminated.
 * The output is zero-terminated only if there is room for the terminator.
 *
 * Returns 0 when tolen is 0, and (size_t) -1 when the Windows conversion
 * fails.
 */
size_t
wchar2char(char *to, const wchar_t *from, size_t tolen)
{
	size_t		nbytes;

	if (tolen == 0)
		return 0;

#ifdef WIN32

	/*
	 * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
	 * for some reason mbstowcs and wcstombs won't do this for us, so the
	 * Win32 conversion API is used instead.
	 */
	if (GetDatabaseEncoding() == PG_UTF8)
	{
		nbytes = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
									 NULL, NULL);

		/* A zero return is failure */
		if (nbytes == 0)
			return (size_t) -1;

		Assert(nbytes <= tolen);

		/* Microsoft counts the zero terminator in the result */
		return nbytes - 1;
	}
#endif   /* WIN32 */

	Assert(!lc_ctype_is_c());
	nbytes = wcstombs(to, from, tolen);

	return nbytes;
}
/*
 * lowerstr_with_len --- fold string to lower case
 *
 * The input string does not have to be null-terminated; at most "len"
 * bytes of it are examined.
 *
 * The result is a freshly palloc'd, null-terminated string.
 */
char *
lowerstr_with_len(const char *str, int len)
{
	char	   *out;

#ifdef USE_WIDE_UPPER_LOWER
	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
	pg_locale_t mylocale = 0;	/* TODO */
#endif

	if (len == 0)
		return pstrdup("");

#ifdef USE_WIDE_UPPER_LOWER

	/*
	 * The wide-character path is taken only when the encoding's maximum
	 * character length exceeds 1 and ctype is not C.  Some operating
	 * systems fail with multi-byte encodings and a C locale, and a C locale
	 * needs no multibyte processing anyway.  From
	 * backend/utils/adt/oracle_compat.c Teodor
	 */
	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
	{
		wchar_t    *wbuf;
		wchar_t    *wcur;
		int			nconv;

		/*
		 * Worst-case sizing: len is a byte count, which is >= the number of
		 * characters; one more wchar_t holds the terminating zero that
		 * wchar2char requires.
		 */
		wbuf = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));

		nconv = char2wchar(wbuf, len + 1, str, len, mylocale);
		Assert(nconv <= len);

		for (wcur = wbuf; *wcur; wcur++)
			*wcur = towlower((wint_t) *wcur);

		/*
		 * Size the result for the worst case, plus the trailing '\0'
		 */
		len = pg_database_encoding_max_length() * nconv + 1;
		out = (char *) palloc(len);

		nconv = wchar2char(out, wbuf, len, mylocale);

		pfree(wbuf);

		if (nconv < 0)
			ereport(ERROR,
					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
					 errmsg("conversion from wchar_t to server encoding failed: %m")));
		Assert(nconv < len);
	}
	else
#endif   /* USE_WIDE_UPPER_LOWER */
	{
		const char *src;
		const char *stop = str + len;
		char	   *dst;

		out = (char *) palloc(sizeof(char) * (len + 1));
		dst = out;

		/* byte-at-a-time folding; stop at len bytes or an embedded '\0' */
		for (src = str; src < stop && *src; src++)
			*dst++ = tolower(TOUCHAR(src));

		*dst = '\0';
	}

	return out;
}
/* * char2wchar --- convert multibyte characters to wide characters * * This has almost the API of mbstowcs(), except that *from need not be * null-terminated; instead, the number of input bytes is specified as * fromlen. Also, we ereport() rather than returning -1 for invalid * input encoding. tolen is the maximum number of wchar_t's to store at *to. * The output will be zero-terminated iff there is room. */ size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen) { size_t result; if (tolen == 0) return 0; #ifdef WIN32 /* See WIN32 "Unicode" comment above */ if (GetDatabaseEncoding() == PG_UTF8) { /* Win32 API does not work for zero-length input */ if (fromlen == 0) result = 0; else { result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1); /* A zero return is failure */ if (result == 0) result = -1; } if (result != -1) { Assert(result < tolen); /* Append trailing null wchar (MultiByteToWideChar() does not) */ to[result] = 0; } } else #endif /* WIN32 */ { /* mbstowcs requires ending '\0' */ char *str = pnstrdup(from, fromlen); Assert(!lc_ctype_is_c()); result = mbstowcs(to, str, tolen); pfree(str); } if (result == -1) { /* * Invalid multibyte character encountered. We try to give a useful * error message by letting pg_verifymbstr check the string. But it's * possible that the string is OK to us, and not OK to mbstowcs --- * this suggests that the LC_CTYPE locale is different from the * database encoding. Give a generic error message if verifymbstr * can't find anything wrong. */ pg_verifymbstr(from, fromlen, false); /* might not return */ /* but if it does ... */ ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("invalid multibyte character for locale"), errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); } return result; }
/*
 * Generic_Text_IC_like
 *		Case-insensitive match of "str" against the pattern "pat" under
 *		the given collation.
 *
 * Returns whatever the selected matcher (UTF8_MatchText, MB_MatchText or
 * SB_IMatchText) returns.  Raises ERRCODE_INDETERMINATE_COLLATION in the
 * single-byte case when the collation OID is invalid.
 */
static inline int
Generic_Text_IC_like(text *str, text *pat, Oid collation)
{
	char	   *s;
	char	   *p;
	int			slen;
	int			plen;
	pg_locale_t locale = 0;
	bool		locale_is_c;

	/*
	 * For efficiency, the single-byte case does not run lower() over the
	 * pattern and text; SB_lower_char is applied per character instead.
	 * With a multi-byte encoding there is not much choice :-(
	 */
	if (pg_database_encoding_max_length() > 1)
	{
		/* lower's result is never packed, so OK to use old macros here */
		pat = DatumGetTextP(DirectFunctionCall1Coll(lower, collation,
													PointerGetDatum(pat)));
		p = VARDATA(pat);
		plen = (VARSIZE(pat) - VARHDRSZ);

		str = DatumGetTextP(DirectFunctionCall1Coll(lower, collation,
													PointerGetDatum(str)));
		s = VARDATA(str);
		slen = (VARSIZE(str) - VARHDRSZ);

		return (GetDatabaseEncoding() == PG_UTF8)
			? UTF8_MatchText(s, slen, p, plen, 0, true)
			: MB_MatchText(s, slen, p, plen, 0, true);
	}

	/*
	 * Single-byte encoding: set up the locale information SB_lower_char
	 * needs.  This should match the methods used in str_tolower().
	 */
	locale_is_c = false;
	if (lc_ctype_is_c(collation))
		locale_is_c = true;

	if (!locale_is_c && collation != DEFAULT_COLLATION_OID)
	{
		if (!OidIsValid(collation))
		{
			/*
			 * Usually this means the parser was unable to resolve a
			 * conflict between implicit collations, so phrase the error
			 * accordingly.
			 */
			ereport(ERROR,
					(errcode(ERRCODE_INDETERMINATE_COLLATION),
					 errmsg("could not determine which collation to use for ILIKE"),
					 errhint("Use the COLLATE clause to set the collation explicitly.")));
		}
		locale = pg_newlocale_from_collation(collation);
	}

	p = VARDATA_ANY(pat);
	plen = VARSIZE_ANY_EXHDR(pat);
	s = VARDATA_ANY(str);
	slen = VARSIZE_ANY_EXHDR(str);

	return SB_IMatchText(s, slen, p, plen, locale, locale_is_c);
}
/*
 * lowerstr --- fold a null-terminated string to lower case
 *
 * The result is a freshly palloc'd, null-terminated string; the input is
 * left untouched.
 */
char *
lowerstr(char *str)
{
	char	   *out;
	int			len = strlen(str);

	if (len == 0)
		return pstrdup("");

#ifdef TS_USE_WIDE

	/*
	 * The wide-character path is taken only when the encoding's maximum
	 * character length exceeds 1 and ctype is not C.  Some operating
	 * systems fail with multi-byte encodings and a C locale, and a C locale
	 * needs no multibyte processing anyway.  From
	 * backend/utils/adt/oracle_compat.c Teodor
	 */
	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
	{
		wchar_t    *wbuf;
		wchar_t    *wcur;
		int			nconv;

		/*
		 * Worst-case sizing: len is a byte count, which is >= the number of
		 * characters; one more wchar_t holds the terminating zero, because
		 * wchar2char (really wcstombs) wants a zero-terminated string.
		 */
		wbuf = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));

		/*
		 * str is expected to be a C string, so the result is the number of
		 * converted characters.
		 */
		nconv = char2wchar(wbuf, str, len);
		if (nconv < 0)
			ereport(ERROR,
					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
					 errmsg("translation failed from server encoding to wchar_t")));

		Assert(nconv <= len);
		wbuf[nconv] = 0;

		for (wcur = wbuf; *wcur; wcur++)
			*wcur = towlower((wint_t) *wcur);

		/*
		 * Size the result for the worst case, plus the trailing '\0'
		 */
		len = sizeof(char) * pg_database_encoding_max_length() * (nconv + 1);
		out = (char *) palloc(len);

		/*
		 * From here on nconv is a byte count, which is always >= the number
		 * of characters.
		 */
		nconv = wchar2char(out, wbuf, len);

		pfree(wbuf);

		if (nconv < 0)
			ereport(ERROR,
					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
					 errmsg("translation failed from wchar_t to server encoding %d", errno)));

		Assert(nconv <= len);
		out[nconv] = '\0';
	}
	else
#endif
	{
		char	   *src;
		char	   *dst;

		out = (char *) palloc(sizeof(char) * (len + 1));
		dst = out;

		/* plain byte-at-a-time folding */
		for (src = str; *src; src++)
			*dst++ = tolower(*(unsigned char *) src);

		*dst = '\0';
	}

	return out;
}