/* Decode a byte string from the current locale encoding into a newly
   allocated, null-terminated wide character string.

   arg:  null-terminated multibyte string to decode.
   size: accepted for signature compatibility only; it is never read or
         written by this implementation.

   Returns the decoded string, allocated with PyMem_RawMalloc() (release it
   with PyMem_RawFree()), or NULL when:
     - the input cannot be decoded by mbstowcs(),
     - the required buffer size would overflow,
     - memory allocation fails, or
     - the decoded text contains a surrogate code point (this function does
       no surrogateescape fallback). */
static wchar_t*
Py_DecodeLocale(const char* arg, size_t* size)
{
    size_t argsize;
    size_t count;
    wchar_t *res;
    wchar_t *tmp;

    (void)size;

    /* Number of wide characters needed, not counting the terminator. */
    argsize = mbstowcs(NULL, arg, 0);
    if (argsize == (size_t)-1) {
        return NULL;
    }
    if (argsize == PY_SSIZE_T_MAX) {
        return NULL;
    }
    /* Reserve one extra element for the terminating null character.
       Without it mbstowcs() fills the buffer exactly and writes no
       terminator, so the scan below would run off the allocation. */
    argsize += 1;
    if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        return NULL;
    }

    res = (wchar_t *)PyMem_RawMalloc(argsize * sizeof(wchar_t));
    if (!res) {
        return NULL;
    }

    count = mbstowcs(res, arg, argsize);
    /* count < argsize guarantees that mbstowcs() stored the terminator. */
    if (count != (size_t)-1 && count < argsize) {
        /* Only use the result if it contains no surrogate characters. */
        for (tmp = res; *tmp != 0 && !Py_UNICODE_IS_SURROGATE(*tmp); tmp++) {
        }
        if (*tmp == 0) {
            return res;
        }
    }

    PyMem_RawFree(res);
    return NULL;
}
/* Decode a byte string from the locale encoding with the surrogateescape
   error handler (undecodable bytes are decoded as characters in range
   U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
   character, escape the bytes using the surrogateescape error handler
   instead of decoding them.

   Use _Py_wchar2char() to encode the character string back to a byte
   string.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory) and write the number of written wide
   characters excluding the null character into *size if size is not NULL,
   or NULL on error (decoding or memory allocation error). If size is not
   NULL, *size is set to (size_t)-1 on memory error and (size_t)-2 on
   decoding error.

   Conversion errors should never happen, unless there is a bug in the C
   library.

   Strategy on non-Apple builds:
     1. If force_ascii is set (non-Windows only), delegate to
        decode_ascii_surrogateescape().
     2. Try a plain mbstowcs() conversion and accept it when the result
        contains no surrogate characters.
     3. Otherwise decode with mbrtowc(), escaping undecodable bytes and
        decoded surrogates as 0xdc00 + byte; without mbrtowc(), fall back
        to decode_ascii_surrogateescape(). */
wchar_t*
_Py_char2wchar(const char* arg, size_t *size)
{
#ifdef __APPLE__
    /* Apple builds always decode through the UTF-8 surrogateescape helper,
       regardless of the locale. */
    wchar_t *wstr;
    wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
    if (size != NULL) {
        if (wstr != NULL)
            *size = wcslen(wstr);
        else
            /* Any failure of the helper is reported as a memory error. */
            *size = (size_t)-1;
    }
    return wstr;
#else
    wchar_t *res;
    size_t argsize;
    size_t count;
#ifdef HAVE_MBRTOWC
    unsigned char *in;
    wchar_t *out;
    mbstate_t mbs;
#endif

#ifndef MS_WINDOWS
    /* NOTE(review): -1 appears to mean "not yet determined"; the flag is
       defined outside this function. */
    if (force_ascii == -1)
        force_ascii = check_force_ascii();

    if (force_ascii) {
        /* force ASCII encoding to workaround mbstowcs() issue */
        res = decode_ascii_surrogateescape(arg, size);
        if (res == NULL)
            goto oom;
        return res;
    }
#endif

#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    argsize = strlen(arg);
#else
    argsize = mbstowcs(NULL, arg, 0);
#endif
    if (argsize != (size_t)-1) {
        if (argsize == PY_SSIZE_T_MAX)
            goto oom;
        /* Room for the terminating null character. */
        argsize += 1;
        /* Guard the multiplication below against overflow. */
        if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
            goto oom;
        res = (wchar_t *)PyMem_RawMalloc(argsize*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
                ;
            if (*tmp == 0) {
                if (size != NULL)
                    *size = count;
                return res;
            }
        }
        /* Either the conversion failed or it produced a surrogate:
           discard the buffer and take the escaping path below. */
        PyMem_RawFree(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
        goto oom;
    res = (wchar_t*)PyMem_RawMalloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    /* From here on argsize counts the input bytes still to be consumed
       (including the terminating null byte). */
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            PyMem_RawFree(res);
            if (size != NULL)
                *size = (size_t)-2;
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (Py_UNICODE_IS_SURROGATE(*out)) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            /* One escaped wide character per input byte; this overwrites
               the surrogate that mbrtowc() stored at *out. */
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
    if (size != NULL)
        *size = out - res;
#else   /* HAVE_MBRTOWC */
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes > 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    res = decode_ascii_surrogateescape(arg, size);
    if (res == NULL)
        goto oom;
#endif   /* HAVE_MBRTOWC */
    return res;
oom:
    /* Shared failure exit: report a memory error through *size. */
    if (size != NULL)
        *size = (size_t)-1;
    return NULL;
#endif   /* __APPLE__ */
}
/* Decode a byte string from the current locale encoding.

   arg:             null-terminated byte string to decode.
   wstr:            on success, receives the decoded wide character string,
                    allocated with PyMem_RawMalloc().
   wlen:            if not NULL: on success, receives the number of wide
                    characters written (excluding the null character); on a
                    decoding error raised by the mbrtowc() path, receives
                    the byte offset into arg at which decoding stopped.
   reason:          if not NULL, set to "decoding error" when the mbrtowc()
                    path fails to decode.
   surrogateescape: if non-zero, undecodable bytes and decoded surrogate
                    characters are escaped as 0xdc00 + byte; if zero, they
                    are reported as a decoding error.

   Return 0 on success, -1 on memory allocation failure or size overflow,
   -2 on decoding error. When built without HAVE_MBRTOWC, the fallback is
   delegated to decode_ascii() and its return value is passed through. */
static int
decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
                      const char **reason, int surrogateescape)
{
    wchar_t *res;
    size_t argsize;
    size_t count;
#ifdef HAVE_MBRTOWC
    unsigned char *in;
    wchar_t *out;
    mbstate_t mbs;
#endif

#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    argsize = strlen(arg);
#else
    argsize = mbstowcs(NULL, arg, 0);
#endif
    if (argsize != (size_t)-1) {
        /* Guard (argsize + 1) * sizeof(wchar_t) against overflow. */
        if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
            return -1;
        }
        /* +1 leaves room for the terminating null character. */
        res = (wchar_t *)PyMem_RawMalloc((argsize + 1) * sizeof(wchar_t));
        if (!res) {
            return -1;
        }

        count = mbstowcs(res, arg, argsize + 1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
                ;
            if (*tmp == 0) {
                if (wlen != NULL) {
                    *wlen = count;
                }
                *wstr = res;
                return 0;
            }
        }
        /* Either the conversion failed or it produced a surrogate:
           discard the buffer and take the per-character path below. */
        PyMem_RawFree(res);
    }

    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        return -1;
    }
    res = (wchar_t*)PyMem_RawMalloc(argsize * sizeof(wchar_t));
    if (!res) {
        return -1;
    }

    /* From here on argsize counts the input bytes still to be consumed
       (including the terminating null byte). */
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0) {
            /* Reached end of string; null char stored. */
            break;
        }

        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            goto decode_error;
        }

        if (converted == (size_t)-1) {
            if (!surrogateescape) {
                goto decode_error;
            }

            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }

        if (Py_UNICODE_IS_SURROGATE(*out)) {
            if (!surrogateescape) {
                goto decode_error;
            }

            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            /* One escaped wide character per input byte; this overwrites
               the surrogate that mbrtowc() stored at *out. */
            while (converted--) {
                *out++ = 0xdc00 + *in++;
            }
            continue;
        }

        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
    if (wlen != NULL) {
        *wlen = out - res;
    }
    *wstr = res;
    return 0;

decode_error:
    /* Release the partial result and report where decoding stopped. */
    PyMem_RawFree(res);
    if (wlen) {
        *wlen = in - (unsigned char*)arg;
    }
    if (reason) {
        *reason = "decoding error";
    }
    return -2;
#else   /* HAVE_MBRTOWC */
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes > 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
#endif   /* HAVE_MBRTOWC */
}
/* Build a copy of the Unicode object `o` in which every code point that
   cannot be represented in UCS-2 is replaced with U+FFFD.

   Each replacement issues a UnicodeWarning through PyErr_WarnEx(). If
   issuing the warning fails (e.g. warnings are configured as errors),
   translation stops and NULL is returned with the exception left set.

   When wchar_t is UCS-4 (WCHAR_T_UCS4 defined), any value above 0xFFFF is
   replaced. Otherwise the wide string is UTF-16: a surrogate pair is
   collapsed into one U+FFFD, and an unpaired surrogate is replaced on its
   own, without consuming the code unit that follows it.

   Returns the object produced by PyUnicode_FromWideChar(), or NULL if an
   error occurred. */
static PyObject* translate_to_ucs2(PyObject* o)
{
    PyObject* translated = NULL;
    Py_ssize_t len;
    wchar_t* unicode;

    assert(PyUnicode_Check(o));

#if PY_MAJOR_VERSION < 3
    len = PyUnicode_GetSize(o);
    do
    {
        unicode = tds_mem_malloc((size_t)len * sizeof(wchar_t));
        if (!unicode)
        {
            PyErr_NoMemory();
            break;
        }
        if (-1 == PyUnicode_AsWideChar((PyUnicodeObject*)o, unicode, len))
        {
            break;
        }
    }
    while (0);
#else /* if PY_MAJOR_VERSION < 3 */
    unicode = PyUnicode_AsWideCharString(o, &len);
#endif /* else if PY_MAJOR_VERSION < 3 */

    /* Failures above are detected through the pending Python exception. */
    if (!PyErr_Occurred())
    {
        /* Translate in place: ixdst trails ixsrc whenever a surrogate pair
           is collapsed into a single replacement character. */
        Py_ssize_t ixsrc, ixdst = 0;
        for (ixsrc = 0; ixsrc < len; ++ixsrc, ++ixdst)
        {
#if defined(WCHAR_T_UCS4)
            if (0xFFFF < unicode[ixsrc])
#else /* if defined(WCHAR_T_UCS4) */
            if (Py_UNICODE_IS_SURROGATE(unicode[ixsrc]))
#endif /* else if defined(WCHAR_T_UCS4) */
            {
                static const char s_fmt[] = "Unicode codepoint U+%08X is not representable in UCS-2; replaced with U+FFFD";
                /* "%08X" expands to 8 characters for a 32-bit value, so the
                   extra 8 bytes always suffice. */
                char buffer[ARRAYSIZE(s_fmt) + 8 /* for codepoint chars */];
                uint32_t codepoint = (uint32_t)unicode[ixsrc];
#if !defined(WCHAR_T_UCS4)
                /* Join only a well-formed pair: a high surrogate followed,
                   within bounds, by a low surrogate. Python strings may
                   hold unpaired surrogates; those are reported and replaced
                   individually so that the following code unit is neither
                   swallowed nor read past the end of the buffer. */
                if ((0xD800 <= codepoint) && (codepoint <= 0xDBFF) &&
                    ((ixsrc + 1) < len) &&
                    (0xDC00 <= unicode[ixsrc + 1]) && (unicode[ixsrc + 1] <= 0xDFFF))
                {
                    codepoint = Py_UNICODE_JOIN_SURROGATES(unicode[ixsrc], unicode[ixsrc + 1]);
                    ++ixsrc;
                }
#endif /* if !defined(WCHAR_T_UCS4) */
                (void)sprintf(buffer, s_fmt, codepoint);
                if (0 != PyErr_WarnEx(PyExc_UnicodeWarning, buffer, 1))
                {
                    break;
                }
                unicode[ixdst] = 0xFFFD; /* unicode replacement character */
            }
#if !defined(WCHAR_T_UCS4)
            else
            {
                /* Compact the string over any collapsed pairs. */
                unicode[ixdst] = unicode[ixsrc];
            }
#endif /* if !defined(WCHAR_T_UCS4) */
        }

        if (!PyErr_Occurred())
        {
            translated = PyUnicode_FromWideChar(unicode, ixdst);
        }
    }

    /* Release the buffer with the deallocator matching its allocator. */
#if PY_MAJOR_VERSION < 3
    tds_mem_free(unicode);
#else /* if PY_MAJOR_VERSION < 3 */
    PyMem_Free(unicode);
#endif /* else if PY_MAJOR_VERSION < 3 */

    return translated;
}