static int encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos, const char **reason, int raw_malloc, int current_locale, int surrogateescape) { if (current_locale) { #ifdef __ANDROID__ return _Py_EncodeUTF8Ex(text, str, error_pos, reason, raw_malloc, surrogateescape); #else return encode_current_locale(text, str, error_pos, reason, raw_malloc, surrogateescape); #endif } #if defined(__APPLE__) || defined(__ANDROID__) return _Py_EncodeUTF8Ex(text, str, error_pos, reason, raw_malloc, surrogateescape); #else /* __APPLE__ */ if (Py_UTF8Mode == 1) { return _Py_EncodeUTF8Ex(text, str, error_pos, reason, raw_malloc, surrogateescape); } #ifdef USE_FORCE_ASCII if (force_ascii == -1) { force_ascii = check_force_ascii(); } if (force_ascii) { return encode_ascii(text, str, error_pos, reason, raw_malloc, surrogateescape); } #endif return encode_current_locale(text, str, error_pos, reason, raw_malloc, surrogateescape); #endif /* __APPLE__ or __ANDROID__ */ }
/* Decode a byte string from the locale encoding. Use the strict error handler if 'surrogateescape' is zero. Use the surrogateescape error handler if 'surrogateescape' is non-zero: undecodable bytes are decoded as characters in range U+DC80..U+DCFF. If a byte sequence can be decoded as a surrogate character, escape the bytes using the surrogateescape error handler instead of decoding them. On success, return 0 and write the newly allocated wide character string into *wstr (use PyMem_RawFree() to free the memory). If wlen is not NULL, write the number of wide characters excluding the null character into *wlen. On memory allocation failure, return -1. On decoding error, return -2. If wlen is not NULL, write the start of invalid byte sequence in the input string into *wlen. If reason is not NULL, write the decoding error message into *reason. Use the Py_EncodeLocaleEx() function to encode the character string back to a byte string. */ int _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen, const char **reason, int current_locale, int surrogateescape) { if (current_locale) { #ifdef __ANDROID__ return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, surrogateescape); #else return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); #endif } #if defined(__APPLE__) || defined(__ANDROID__) return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, surrogateescape); #else if (Py_UTF8Mode == 1) { return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason, surrogateescape); } #ifdef USE_FORCE_ASCII if (force_ascii == -1) { force_ascii = check_force_ascii(); } if (force_ascii) { /* force ASCII encoding to workaround mbstowcs() issue */ return decode_ascii(arg, wstr, wlen, reason, surrogateescape); } #endif return decode_current_locale(arg, wstr, wlen, reason, surrogateescape); #endif /* __APPLE__ or __ANDROID__ */ }
/* Encode a (wide) character string to the locale encoding with the surrogateescape error handler (characters in range U+DC80..U+DCFF are converted to bytes 0x80..0xFF). This function is the reverse of _Py_char2wchar(). Return a pointer to a newly allocated byte string (use PyMem_Free() to free the memory), or NULL on encoding or memory allocation error. If error_pos is not NULL: *error_pos is the index of the invalid character on encoding error, or (size_t)-1 otherwise. */ char* _Py_wchar2char(const wchar_t *text, size_t *error_pos) { #ifdef __APPLE__ Py_ssize_t len; PyObject *unicode, *bytes = NULL; char *cpath; unicode = PyUnicode_FromWideChar(text, wcslen(text)); if (unicode == NULL) return NULL; bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape"); Py_DECREF(unicode); if (bytes == NULL) { PyErr_Clear(); if (error_pos != NULL) *error_pos = (size_t)-1; return NULL; } len = PyBytes_GET_SIZE(bytes); cpath = PyMem_Malloc(len+1); if (cpath == NULL) { PyErr_Clear(); Py_DECREF(bytes); if (error_pos != NULL) *error_pos = (size_t)-1; return NULL; } memcpy(cpath, PyBytes_AsString(bytes), len + 1); Py_DECREF(bytes); return cpath; #else /* __APPLE__ */ const size_t len = wcslen(text); char *result = NULL, *bytes = NULL; size_t i, size, converted; wchar_t c, buf[2]; #ifndef MS_WINDOWS if (force_ascii == -1) force_ascii = check_force_ascii(); if (force_ascii) return encode_ascii_surrogateescape(text, error_pos); #endif /* The function works in two steps: 1. compute the length of the output buffer in bytes (size) 2. 
outputs the bytes */ size = 0; buf[1] = 0; while (1) { for (i=0; i < len; i++) { c = text[i]; if (c >= 0xdc80 && c <= 0xdcff) { /* UTF-8b surrogate */ if (bytes != NULL) { *bytes++ = c - 0xdc00; size--; } else size++; continue; } else { buf[0] = c; if (bytes != NULL) converted = wcstombs(bytes, buf, size); else converted = wcstombs(NULL, buf, 0); if (converted == (size_t)-1) { if (result != NULL) PyMem_Free(result); if (error_pos != NULL) *error_pos = i; return NULL; } if (bytes != NULL) { bytes += converted; size -= converted; } else size += converted; } } if (result != NULL) { *bytes = '\0'; break; } size += 1; /* nul byte at the end */ result = PyMem_Malloc(size); if (result == NULL) { if (error_pos != NULL) *error_pos = (size_t)-1; return NULL; } bytes = result; } return result; #endif /* __APPLE__ */ }
/* Decode a byte string from the locale encoding with the
   surrogateescape error handler (undecodable bytes are decoded as characters
   in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
   character, escape the bytes using the surrogateescape error handler instead
   of decoding them.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory) and write the number of written wide
   characters excluding the null character into *size if size is not NULL, or
   NULL on error (decoding or memory allocation error). If size is not NULL,
   *size is set to (size_t)-1 on memory error and (size_t)-2 on decoding
   error.  In the __APPLE__ branch every failure is reported as (size_t)-1.

   Conversion errors should never happen, unless there is a bug in the C
   library. */
wchar_t*
_Py_char2wchar(const char* arg, size_t *size)
{
#ifdef __APPLE__
    wchar_t *wstr;
    /* This branch always decodes as UTF-8 with surrogateescape; the locale
       is not consulted. */
    wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
    if (size != NULL) {
        if (wstr != NULL)
            *size = wcslen(wstr);
        else
            *size = (size_t)-1;
    }
    return wstr;
#else
    wchar_t *res;
    size_t argsize;
    size_t count;
#ifdef HAVE_MBRTOWC
    /* State for the byte-by-byte fallback loop below: 'in' is the read
       cursor in arg, 'out' the write cursor in res, 'mbs' the mbrtowc()
       conversion state. */
    unsigned char *in;
    wchar_t *out;
    mbstate_t mbs;
#endif

#ifndef MS_WINDOWS
    /* force_ascii is computed lazily: -1 means "not checked yet". */
    if (force_ascii == -1)
        force_ascii = check_force_ascii();

    if (force_ascii) {
        /* force ASCII encoding to workaround mbstowcs() issue */
        res = decode_ascii_surrogateescape(arg, size);
        if (res == NULL)
            goto oom;
        return res;
    }
#endif

    /* First attempt: convert the whole string in one mbstowcs() call.  The
       result is used only if the conversion succeeds and yields no surrogate
       character. */
#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    argsize = strlen(arg);
#else
    argsize = mbstowcs(NULL, arg, 0);
#endif
    if (argsize != (size_t)-1) {
        /* Guard the "+ 1" below and the multiplication by sizeof(wchar_t)
           against exceeding PY_SSIZE_T_MAX. */
        if (argsize == PY_SSIZE_T_MAX)
            goto oom;
        argsize += 1;   /* room for the terminating null character */
        if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
            goto oom;
        res = (wchar_t *)PyMem_RawMalloc(argsize*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
                ;
            /* The scan reached the terminator: no surrogate was found. */
            if (*tmp == 0) {
                if (size != NULL)
                    *size = count;
                return res;
            }
        }
        /* Conversion failed or produced a surrogate: discard this buffer
           and fall through to the escaping path. */
        PyMem_RawFree(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;  /* from here on: input bytes left, including
                                   the terminating null byte */
    if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
        goto oom;
    res = (wchar_t*)PyMem_RawMalloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);    /* start in the initial shift state */
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            PyMem_RawFree(res);
            if (size != NULL)
                *size = (size_t)-2;
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (Py_UNICODE_IS_SURROGATE(*out)) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            /* argsize is updated first because the loop below consumes
               'converted' down to zero. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
    /* 'out' was not advanced past the terminating null character, so the
       difference is the length excluding it. */
    if (size != NULL)
        *size = out - res;
#else   /* HAVE_MBRTOWC */
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes > 128). This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset.
       NOTE(review): "> 128" presumably means every byte >= 0x80; confirm
       against decode_ascii_surrogateescape(). */
    res = decode_ascii_surrogateescape(arg, size);
    if (res == NULL)
        goto oom;
#endif   /* HAVE_MBRTOWC */
    return res;
oom:
    /* Shared exit for allocation failures and size overflows. */
    if (size != NULL)
        *size = (size_t)-1;
    return NULL;
#endif   /* __APPLE__ */
}