C++ (Cpp) Py_UNICODE_IS_SURROGATEの例

コード例 #1

0

ファイルを表示

ファイル: demangle.cpp プロジェクト: P4N74/demangler

static wchar_t* Py_DecodeLocale(const char* arg, size_t*) {
    size_t argsize = mbstowcs(NULL, arg, 0);
    if (argsize == (size_t)-1) {
        return NULL;
    }
    if (argsize == PY_SSIZE_T_MAX) {
        return NULL;
    }
    if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t)) {
        return NULL;
    }
    wchar_t *res = (wchar_t *)PyMem_RawMalloc(argsize*sizeof(wchar_t));
    if (!res) {
        return NULL;
    }
    size_t count = mbstowcs(res, arg, argsize);
    if (count != (size_t)-1) {
        wchar_t *tmp;
        for (tmp = res; *tmp != 0 && !Py_UNICODE_IS_SURROGATE(*tmp); tmp++) {
        }
        if (*tmp == 0) {
            return res;
        }
    }
    PyMem_RawFree(res);
    return NULL;
}

コード例 #2

0

ファイルを表示

ファイル: fileutils.c プロジェクト: CIBC-Internal/python

/* Decode a byte string from the locale encoding with the
   surrogateescape error handler (undecodable bytes are decoded as characters
   in range U+DC80..U+DCFF). If a byte sequence can be decoded as a surrogate
   character, escape the bytes using the surrogateescape error handler instead
   of decoding them.

   Use _Py_wchar2char() to encode the character string back to a byte string.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory) and write the number of written wide
   characters excluding the null character into *size if size is not NULL, or
   NULL on error (decoding or memory allocation error). If size is not NULL,
   *size is set to (size_t)-1 on memory error and (size_t)-2 on decoding
   error.

   Conversion errors should never happen, unless there is a bug in the C
   library. */
wchar_t*
_Py_char2wchar(const char* arg, size_t *size)
{
#ifdef __APPLE__
    wchar_t *wstr;
    wstr = _Py_DecodeUTF8_surrogateescape(arg, strlen(arg));
    if (size != NULL) {
        if (wstr != NULL)
            *size = wcslen(wstr);
        else
            *size = (size_t)-1;
    }
    return wstr;
#else
    wchar_t *res;
    size_t argsize;
    size_t count;
#ifdef HAVE_MBRTOWC
    unsigned char *in;
    wchar_t *out;
    mbstate_t mbs;
#endif

#ifndef MS_WINDOWS
    if (force_ascii == -1)
        force_ascii = check_force_ascii();

    if (force_ascii) {
        /* force ASCII encoding to workaround mbstowcs() issue */
        res = decode_ascii_surrogateescape(arg, size);
        if (res == NULL)
            goto oom;
        return res;
    }
#endif

#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    argsize = strlen(arg);
#else
    argsize = mbstowcs(NULL, arg, 0);
#endif
    if (argsize != (size_t)-1) {
        if (argsize == PY_SSIZE_T_MAX)
            goto oom;
        argsize += 1;
        if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
            goto oom;
        res = (wchar_t *)PyMem_RawMalloc(argsize*sizeof(wchar_t));
        if (!res)
            goto oom;
        count = mbstowcs(res, arg, argsize);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
                ;
            if (*tmp == 0) {
                if (size != NULL)
                    *size = count;
                return res;
            }
        }
        PyMem_RawFree(res);
    }
    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtwoc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    if (argsize > PY_SSIZE_T_MAX/sizeof(wchar_t))
        goto oom;
    res = (wchar_t*)PyMem_RawMalloc(argsize*sizeof(wchar_t));
    if (!res)
        goto oom;
    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0)
            /* Reached end of string; null char stored. */
            break;
        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            PyMem_RawFree(res);
            if (size != NULL)
                *size = (size_t)-2;
            return NULL;
        }
        if (converted == (size_t)-1) {
            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }
        if (Py_UNICODE_IS_SURROGATE(*out)) {
            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--)
                *out++ = 0xdc00 + *in++;
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
    if (size != NULL)
        *size = out - res;
#else   /* HAVE_MBRTOWC */
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes > 128. This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    res = decode_ascii_surrogateescape(arg, size);
    if (res == NULL)
        goto oom;
#endif   /* HAVE_MBRTOWC */
    return res;
oom:
    if (size != NULL)
        *size = (size_t)-1;
    return NULL;
#endif   /* __APPLE__ */
}

コード例 #3

0

ファイルを表示

ファイル: fileutils.c プロジェクト: Apoorvadabhere/cpython

static int
decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
                      const char **reason, int surrogateescape)
{
    wchar_t *res;
    size_t argsize;
    size_t count;
#ifdef HAVE_MBRTOWC
    unsigned char *in;
    wchar_t *out;
    mbstate_t mbs;
#endif

#ifdef HAVE_BROKEN_MBSTOWCS
    /* Some platforms have a broken implementation of
     * mbstowcs which does not count the characters that
     * would result from conversion.  Use an upper bound.
     */
    argsize = strlen(arg);
#else
    argsize = mbstowcs(NULL, arg, 0);
#endif
    if (argsize != (size_t)-1) {
        if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
            return -1;
        }
        res = (wchar_t *)PyMem_RawMalloc((argsize + 1) * sizeof(wchar_t));
        if (!res) {
            return -1;
        }

        count = mbstowcs(res, arg, argsize + 1);
        if (count != (size_t)-1) {
            wchar_t *tmp;
            /* Only use the result if it contains no
               surrogate characters. */
            for (tmp = res; *tmp != 0 &&
                         !Py_UNICODE_IS_SURROGATE(*tmp); tmp++)
                ;
            if (*tmp == 0) {
                if (wlen != NULL) {
                    *wlen = count;
                }
                *wstr = res;
                return 0;
            }
        }
        PyMem_RawFree(res);
    }

    /* Conversion failed. Fall back to escaping with surrogateescape. */
#ifdef HAVE_MBRTOWC
    /* Try conversion with mbrtwoc (C99), and escape non-decodable bytes. */

    /* Overallocate; as multi-byte characters are in the argument, the
       actual output could use less memory. */
    argsize = strlen(arg) + 1;
    if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        return -1;
    }
    res = (wchar_t*)PyMem_RawMalloc(argsize * sizeof(wchar_t));
    if (!res) {
        return -1;
    }

    in = (unsigned char*)arg;
    out = res;
    memset(&mbs, 0, sizeof mbs);
    while (argsize) {
        size_t converted = mbrtowc(out, (char*)in, argsize, &mbs);
        if (converted == 0) {
            /* Reached end of string; null char stored. */
            break;
        }

        if (converted == (size_t)-2) {
            /* Incomplete character. This should never happen,
               since we provide everything that we have -
               unless there is a bug in the C library, or I
               misunderstood how mbrtowc works. */
            goto decode_error;
        }

        if (converted == (size_t)-1) {
            if (!surrogateescape) {
                goto decode_error;
            }

            /* Conversion error. Escape as UTF-8b, and start over
               in the initial shift state. */
            *out++ = 0xdc00 + *in++;
            argsize--;
            memset(&mbs, 0, sizeof mbs);
            continue;
        }

        if (Py_UNICODE_IS_SURROGATE(*out)) {
            if (!surrogateescape) {
                goto decode_error;
            }

            /* Surrogate character.  Escape the original
               byte sequence with surrogateescape. */
            argsize -= converted;
            while (converted--) {
                *out++ = 0xdc00 + *in++;
            }
            continue;
        }
        /* successfully converted some bytes */
        in += converted;
        argsize -= converted;
        out++;
    }
    if (wlen != NULL) {
        *wlen = out - res;
    }
    *wstr = res;
    return 0;

decode_error:
    PyMem_RawFree(res);
    if (wlen) {
        *wlen = in - (unsigned char*)arg;
    }
    if (reason) {
        *reason = "decoding error";
    }
    return -2;
#else   /* HAVE_MBRTOWC */
    /* Cannot use C locale for escaping; manually escape as if charset
       is ASCII (i.e. escape all bytes > 128. This will still roundtrip
       correctly in the locale's charset, which must be an ASCII superset. */
    return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
#endif   /* HAVE_MBRTOWC */
}

コード例 #4

0

ファイルを表示

ファイル: type.c プロジェクト: zillow/ctds

static PyObject* translate_to_ucs2(PyObject* o)
{
    PyObject* translated = NULL;
    Py_ssize_t len;
    wchar_t* unicode;

    assert(PyUnicode_Check(o));
#if PY_MAJOR_VERSION < 3
    len = PyUnicode_GetSize(o);
    do
    {
        unicode = tds_mem_malloc((size_t)len * sizeof(wchar_t));
        if (!unicode)
        {
            PyErr_NoMemory();
            break;
        }
        if (-1 == PyUnicode_AsWideChar((PyUnicodeObject*)o, unicode, len))
        {
            break;
        }
    }
    while (0);
#else /* if PY_MAJOR_VERSION < 3 */
    unicode = PyUnicode_AsWideCharString(o, &len);
#endif /* else if PY_MAJOR_VERSION < 3 */

    if (!PyErr_Occurred())
    {
        Py_ssize_t ixsrc, ixdst = 0;
        for (ixsrc = 0; ixsrc < len; ++ixsrc, ++ixdst)
        {
#if defined(WCHAR_T_UCS4)
            if (0xFFFF < unicode[ixsrc])
#else /* if defined(WCHAR_T_UCS4) */
            if (Py_UNICODE_IS_SURROGATE(unicode[ixsrc]))
#endif /* else if defined(WCHAR_T_UCS4) */
            {
                static const char s_fmt[] =
                    "Unicode codepoint U+%08X is not representable in UCS-2; replaced with U+FFFD";
                char buffer[ARRAYSIZE(s_fmt) + 8 /* for codepoint chars */];
#if defined(WCHAR_T_UCS4)
                uint32_t codepoint = (uint32_t)unicode[ixsrc];
#else /* if defined(WCHAR_T_UCS4) */
                uint32_t codepoint;
                assert(((ixsrc + 1) < len) && Py_UNICODE_IS_SURROGATE(unicode[ixsrc + 1]));
                codepoint = Py_UNICODE_JOIN_SURROGATES(unicode[ixsrc], unicode[ixsrc + 1]);
                ++ixsrc;
#endif /* else if defined(WCHAR_T_UCS4) */
                (void)sprintf(buffer, s_fmt, codepoint);
                if (0 != PyErr_WarnEx(PyExc_UnicodeWarning, buffer, 1))
                {
                    break;
                }

                unicode[ixdst] = 0xFFFD; /* unicode replacement character */
            }
#if !defined(WCHAR_T_UCS4)
            else
            {
                unicode[ixdst] = unicode[ixsrc];
            }
#endif /* if !defined(WCHAR_T_UCS4) */
        }

        if (!PyErr_Occurred())
        {
            translated = PyUnicode_FromWideChar(unicode, ixdst);
        }
    }
#if PY_MAJOR_VERSION < 3
    tds_mem_free(unicode);
#else /* if PY_MAJOR_VERSION < 3 */
    PyMem_Free(unicode);
#endif /* else if PY_MAJOR_VERSION < 3 */

    return translated;
}