예제 #1
0
/**
 * Decode a single code point from a UTF-16 string.
 *
 * Params:
 *      s       = array of UTF-16 code units
 *      len     = number of code units in s
 *      pidx    = in: index of the code unit to start decoding at (must be < len);
 *                out: index just past the decoded sequence, or, on error,
 *                the starting index + 1
 *      presult = out: the decoded code point, or, on error, the raw code
 *                unit found at the starting index
 * Returns:
 *      NULL on success, otherwise a message describing the malformed input.
 */
const char *utf_decodeWchar(unsigned short *s, size_t len, size_t *pidx, dchar_t *presult)
{
    const char *msg;
    size_t i = *pidx;

    // Check the index before touching the buffer, so that an out-of-range
    // index trips the assertion instead of being dereferenced first.
    // (i is unsigned, so only the upper bound needs testing.)
    assert(i < len);
    unsigned u = s[i];

    if (u & ~0x7F)
    {
        if (u >= 0xD800 && u <= 0xDBFF)
        {
            // High surrogate: a low surrogate must follow
            unsigned u2;

            if (i + 1 == len)
            {
                msg = "surrogate UTF-16 high value past end of string";
                goto Lerr;
            }
            u2 = s[i + 1];
            if (u2 < 0xDC00 || u2 > 0xDFFF)
            {
                msg = "surrogate UTF-16 low value out of range";
                goto Lerr;
            }
            // 0xD7C0 == 0xD800 - (0x10000 >> 10), so this both strips the
            // surrogate tag and adds the 0x10000 supplementary-plane offset
            u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
            i += 2;
        }
        else if (u >= 0xDC00 && u <= 0xDFFF)
        {
            // Low surrogate with no preceding high surrogate
            msg = "unpaired surrogate UTF-16 value";
            goto Lerr;
        }
        else if (u == 0xFFFE || u == 0xFFFF)
        {
            msg = "illegal UTF-16 value";
            goto Lerr;
        }
        else
            i++;
    }
    else
    {
        // 7-bit value: one code unit, used as is
        i++;
    }

    assert(utf_isValidDchar(u));
    *pidx = i;
    *presult = (dchar_t)u;
    return NULL;

  Lerr:
    // i still holds the starting index here: hand back the offending code
    // unit and step over it so that the caller's loop makes progress
    *presult = (dchar_t)s[i];
    *pidx = i + 1;
    return msg;
}
예제 #2
0
파일: cast.c 프로젝트: smunix/ldc
/**
 * Cast this string literal to type t.
 *
 * Depending on the source and target types, the result is one of:
 *  - an ErrorExp, for an uncommitted literal cast to void*;
 *  - a StringExp (this object or a copy of it) whose type, and where needed
 *    string data, len and sz, have been adjusted;
 *  - a CastExp wrapping the literal, when the literal cannot simply be
 *    re-typed or transcoded;
 *  - whatever Expression::castTo() returns, for a cast to a delegate type.
 */
Expression *StringExp::castTo(Scope *sc, Type *t)
{
    /* This follows copy-on-write; any changes to 'this'
     * will result in a copy.
     * The this->string member is considered immutable.
     */
    StringExp *se;
    Type *tb;
    int copied = 0;     // set once 'se' refers to a copy rather than to 'this'

    //printf("StringExp::castTo(t = %s), '%s' committed = %d\n", t->toChars(), toChars(), committed);

    if (!committed && t->ty == Tpointer && t->nextOf()->ty == Tvoid)
    {
        error("cannot convert string literal to void*");
        return new ErrorExp();
    }

    // An uncommitted literal is copied, and the copy is marked as committed
    se = this;
    if (!committed)
    {   se = (StringExp *)copy();
        se->committed = 1;
        copied = 1;
    }

    // Already of the requested type
    if (type == t)
    {
        return se;
    }

    tb = t->toBasetype();
    //printf("\ttype = %s\n", type->toChars());
    // Delegate target from a non-delegate source: defer to the generic cast
    if (tb->ty == Tdelegate && type->toBasetype()->ty != Tdelegate)
        return Expression::castTo(sc, t);

    // Same base type: only the type of the (copied) literal changes
    Type *typeb = type->toBasetype();
    if (typeb == tb)
    {
        if (!copied)
        {   se = (StringExp *)copy();
            copied = 1;
        }
        se->type = t;
        return se;
    }

    // Target is not a static array, dynamic array or pointer: wrap in a CastExp
    if (tb->ty != Tsarray && tb->ty != Tarray && tb->ty != Tpointer)
    {   if (!copied)
        {   se = (StringExp *)copy();
            copied = 1;
        }
        goto Lcast;
    }
    // Likewise when the source is none of those
    if (typeb->ty != Tsarray && typeb->ty != Tarray && typeb->ty != Tpointer)
    {   if (!copied)
        {   se = (StringExp *)copy();
            copied = 1;
        }
        goto Lcast;
    }

    // Element sizes match: the string data is kept as is
    if (typeb->nextOf()->size() == tb->nextOf()->size())
    {
        if (!copied)
        {   se = (StringExp *)copy();
            copied = 1;
        }
        if (tb->ty == Tsarray)
            goto L2;    // handle possible change in static array dimension
        se->type = t;
        return se;
    }

    // Element sizes differ. The unqualified 'committed' is this object's own
    // flag (the flag set above belongs to the copy): a literal that was
    // already committed is not transcoded, it is wrapped in a CastExp.
    if (committed)
        goto Lcast;

// Pack a (from, to) pair of element type codes into a single switch value
#define X(tf,tt)        ((tf) * 256 + (tt))
    {
    OutBuffer buffer;       // receives the transcoded string
    size_t newlen = 0;      // length of the transcoded string, excluding the terminating 0
    int tfty = typeb->nextOf()->toBasetype()->ty;
    int ttty = tb->nextOf()->toBasetype()->ty;
    switch (X(tfty, ttty))
    {
        // Same character type: nothing to transcode
        case X(Tchar, Tchar):
        case X(Twchar,Twchar):
        case X(Tdchar,Tdchar):
            break;

        // UTF-8 -> UTF-16; code points that fail to decode are reported and skipped
        case X(Tchar, Twchar):
            for (size_t u = 0; u < len;)
            {   unsigned c;
                const char *p = utf_decodeChar((unsigned char *)se->string, len, &u, &c);
                if (p)
                    error("%s", p);
                else
                    buffer.writeUTF16(c);
            }
            newlen = buffer.offset / 2;
            buffer.writeUTF16(0);
            goto L1;

        // UTF-8 -> UTF-32
        // NOTE(review): there is no 'else' here, so after a decode error the
        // value left in c is still written and counted.
        case X(Tchar, Tdchar):
            for (size_t u = 0; u < len;)
            {   unsigned c;
                const char *p = utf_decodeChar((unsigned char *)se->string, len, &u, &c);
                if (p)
                    error("%s", p);
                buffer.write4(c);
                newlen++;
            }
            buffer.write4(0);
            goto L1;

        // UTF-16 -> UTF-8; code points that fail to decode are reported and skipped
        case X(Twchar,Tchar):
            for (size_t u = 0; u < len;)
            {   unsigned c;
                const char *p = utf_decodeWchar((unsigned short *)se->string, len, &u, &c);
                if (p)
                    error("%s", p);
                else
                    buffer.writeUTF8(c);
            }
            newlen = buffer.offset;
            buffer.writeUTF8(0);
            goto L1;

        // UTF-16 -> UTF-32
        // NOTE(review): as for Tchar -> Tdchar, c is written even after an error.
        case X(Twchar,Tdchar):
            for (size_t u = 0; u < len;)
            {   unsigned c;
                const char *p = utf_decodeWchar((unsigned short *)se->string, len, &u, &c);
                if (p)
                    error("%s", p);
                buffer.write4(c);
                newlen++;
            }
            buffer.write4(0);
            goto L1;

        // UTF-32 -> UTF-8; invalid code points are reported and skipped
        // NOTE(review): the newlen++ in the loop has no effect, newlen is
        // reassigned from buffer.offset right after the loop.
        case X(Tdchar,Tchar):
            for (size_t u = 0; u < len; u++)
            {
                unsigned c = ((unsigned *)se->string)[u];
                if (!utf_isValidDchar(c))
                    error("invalid UCS-32 char \\U%08x", c);
                else
                    buffer.writeUTF8(c);
                newlen++;
            }
            newlen = buffer.offset;
            buffer.writeUTF8(0);
            goto L1;

        // UTF-32 -> UTF-16; invalid code points are reported and skipped
        // NOTE(review): here too the newlen++ in the loop is overwritten below.
        case X(Tdchar,Twchar):
            for (size_t u = 0; u < len; u++)
            {
                unsigned c = ((unsigned *)se->string)[u];
                if (!utf_isValidDchar(c))
                    error("invalid UCS-32 char \\U%08x", c);
                else
                    buffer.writeUTF16(c);
                newlen++;
            }
            newlen = buffer.offset / 2;
            buffer.writeUTF16(0);
            goto L1;

        // Common tail of the transcoding cases: install the new string data
        // and its length and element size in the (copied) literal
        L1:
            if (!copied)
            {   se = (StringExp *)copy();
                copied = 1;
            }
            se->string = buffer.extractData();
            se->len = newlen;
            se->sz = tb->nextOf()->size();
            break;

        // Any other combination of element types: wrap in a CastExp
        default:
            assert(typeb->nextOf()->size() != tb->nextOf()->size());
            goto Lcast;
    }
    }
#undef X
L2:
    // From here on 'se' is modified in place, so it must not be 'this'
    assert(copied);

    // See if need to truncate or extend the literal
    if (tb->ty == Tsarray)
    {
        int dim2 = ((TypeSArray *)tb)->dim->toInteger();

        //printf("dim from = %d, to = %d\n", se->len, dim2);

        // Changing dimensions
        if (dim2 != se->len)
        {
            // Copy when changing the string literal
            unsigned newsz = se->sz;
            void *s;
            int d;      // number of elements carried over from the old string

            d = (dim2 < se->len) ? dim2 : se->len;
            // dim2 elements plus one for the terminating 0
            s = (unsigned char *)mem.malloc((dim2 + 1) * newsz);
            memcpy(s, se->string, d * newsz);
            // Extend with 0, add terminating 0
            memset((char *)s + d * newsz, 0, (dim2 + 1 - d) * newsz);
            se->string = s;
            se->len = dim2;
        }
    }
    se->type = t;
    return se;

Lcast:
    // The literal cannot simply be re-typed: wrap it in an explicit cast
    Expression *e = new CastExp(loc, se, t);
    e->type = t;        // so semantic() won't be run on e
    return e;
}
예제 #3
0
파일: mangle.c 프로젝트: NativeAPI/dmd
    /* Mangle a string literal.
     *
     * Output written to buf: a tag character chosen from the element size
     * ('a' for 1, 'w' for 2, 'd' for 4), the number of UTF-8 bytes in
     * decimal followed by '_', then each UTF-8 byte as two lowercase
     * hexadecimal digits. Literals with 2- or 4-byte elements are converted
     * to UTF-8 first; elements that cannot be converted are reported through
     * e->error() and left out.
     */
    void visit(StringExp *e)
    {
        const char *hexDigits = "0123456789abcdef";

        char tag;
        OutBuffer utf8;         // transcoded data for 2- and 4-byte literals
        utf8_t *bytes;
        size_t nbytes;

        switch (e->sz)
        {
            case 1:
                // Already UTF-8: use the literal's own storage
                tag = 'a';
                bytes = (utf8_t *)e->string;
                nbytes = e->len;
                break;

            case 2:
            {
                tag = 'w';
                size_t idx = 0;
                while (idx < e->len)
                {
                    unsigned c;
                    // utf_decodeWchar advances idx itself
                    const char *err = utf_decodeWchar((unsigned short *)e->string, e->len, &idx, &c);
                    if (!err)
                        utf8.writeUTF8(c);
                    else
                        e->error("%s", err);
                }
                bytes = (utf8_t *)utf8.data;
                nbytes = utf8.offset;
                break;
            }

            case 4:
            {
                tag = 'd';
                size_t idx = 0;
                while (idx < e->len)
                {
                    unsigned c = ((unsigned *)e->string)[idx];
                    if (utf_isValidDchar(c))
                        utf8.writeUTF8(c);
                    else
                        e->error("invalid UCS-32 char \\U%08x", c);
                    ++idx;
                }
                bytes = (utf8_t *)utf8.data;
                nbytes = utf8.offset;
                break;
            }

            default:
                assert(0);
        }

        // Room for the tag, the length prefix (budgeted at 11 bytes) and
        // two hex digits per byte
        buf->reserve(1 + 11 + 2 * nbytes);
        buf->writeByte(tag);
        buf->printf("%d_", (int)nbytes);

        // The hex digits are stored directly into the reserved space; the
        // buffer offset is advanced once all of them are in place
        utf8_t *out = (utf8_t *)buf->data + buf->offset;
        for (size_t k = 0; k < nbytes; ++k)
        {
            unsigned hi = bytes[k] >> 4 & 0xF;
            unsigned lo = bytes[k] & 0xF;
            out[2 * k]     = (utf8_t)hexDigits[hi];
            out[2 * k + 1] = (utf8_t)hexDigits[lo];
        }
        buf->offset += 2 * nbytes;
    }
예제 #4
0
/**
 * Decode a single code point from a UTF-8 string.
 *
 * Params:
 *      s       = array of UTF-8 bytes
 *      len     = number of bytes in s
 *      pidx    = in: index of the byte to start decoding at (must be < len);
 *                out: index just past the decoded sequence, or, on error,
 *                the starting index + 1
 *      presult = out: the decoded code point, or, on error, the raw byte
 *                found at the starting index
 * Returns:
 *      NULL on success, otherwise "invalid UTF-8 sequence".
 */
const char *utf_decodeChar(unsigned char *s, size_t len, size_t *pidx, dchar_t *presult)
{
    dchar_t V;
    size_t i = *pidx;

    // Check the index before touching the buffer, so that an out-of-range
    // index trips the assertion instead of being dereferenced first.
    // (i is unsigned, so only the upper bound needs testing.)
    assert(i < len);
    unsigned char u = s[i];

    //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len);

    if (u & 0x80)
    {   unsigned n;
        unsigned char u2;

        /* The following encodings are valid, except for the 5 and 6 byte
         * combinations:
         *      0xxxxxxx
         *      110xxxxx 10xxxxxx
         *      1110xxxx 10xxxxxx 10xxxxxx
         *      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         *      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         *      1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         */
        // Count the leading 1 bits of the first byte: that is the sequence length
        for (n = 1; ; n++)
        {
            if (n > 4)
                goto Lerr;              // only do the first 4 of 6 encodings
            if (((u << n) & 0x80) == 0)
            {
                if (n == 1)
                    goto Lerr;          // 10xxxxxx is a trailing byte, not a first byte
                break;
            }
        }

        // Pick off (7 - n) significant bits of B from first byte of octet
        V = (dchar_t)(u & ((1 << (7 - n)) - 1));

        if (i + (n - 1) >= len)
            goto Lerr;                  // off end of string

        /* The following combinations are overlong, and illegal:
         *      1100000x (10xxxxxx)
         *      11100000 100xxxxx (10xxxxxx)
         *      11110000 1000xxxx (10xxxxxx 10xxxxxx)
         * The 5 and 6 byte forms need no test here: first bytes 111110xx
         * and 1111110x were already rejected by the n > 4 check above.
         */
        u2 = s[i + 1];                  // in range: n >= 2 and i + (n - 1) < len
        if ((u & 0xFE) == 0xC0 ||
            (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
            (u == 0xF0 && (u2 & 0xF0) == 0x80))
            goto Lerr;                  // overlong combination

        // Append the 6 payload bits of each trailing byte
        for (unsigned j = 1; j != n; j++)
        {
            u = s[i + j];
            if ((u & 0xC0) != 0x80)
                goto Lerr;                      // trailing bytes are 10xxxxxx
            V = (V << 6) | (u & 0x3F);
        }
        if (!utf_isValidDchar(V))
            goto Lerr;
        i += n;
    }
    else
    {
        // 7-bit value: a single byte, used as is
        V = (dchar_t) u;
        i++;
    }

    assert(utf_isValidDchar(V));
    *pidx = i;
    *presult = V;
    return NULL;

  Lerr:
    // i still holds the starting index here: hand back the offending byte
    // and step over it so that the caller's loop makes progress
    *presult = (dchar_t) s[i];
    *pidx = i + 1;
    return "invalid UTF-8 sequence";
}