/********************************************
 * Decode one code point from the UTF-16 buffer s[0 .. len], starting at
 * index *pidx.
 *
 * Params:
 *      s        array of UTF-16 code units
 *      len      number of code units in s
 *      pidx     in: index of the code unit to start at (must be < len);
 *               out: index just past what was consumed
 *      presult  receives the decoded code point, or on error the raw
 *               code unit found at the starting index
 * Returns:
 *      NULL on success, otherwise a pointer to a static error message.
 *      On error exactly one code unit is consumed.
 */
const char *utf_decodeWchar(unsigned short *s, size_t len, size_t *pidx, dchar_t *presult)
{
    const char *msg;
    size_t i = *pidx;

    // Validate the index before reading from the buffer; i is unsigned,
    // so only the upper bound needs checking.
    assert(i < len);
    unsigned u = s[i];

    if (u & ~0x7F)
    {
        if (u >= 0xD800 && u <= 0xDBFF)
        {
            // High surrogate: it must be followed by a low surrogate
            unsigned u2;

            if (i + 1 == len)
            {
                msg = "surrogate UTF-16 high value past end of string";
                goto Lerr;
            }
            u2 = s[i + 1];
            if (u2 < 0xDC00 || u2 > 0xDFFF)
            {
                msg = "surrogate UTF-16 low value out of range";
                goto Lerr;
            }
            // (u - 0xD7C0) == (u - 0xD800) + 0x40, which folds the
            // 0x10000 offset of the supplementary planes into the shift
            u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
            i += 2;
        }
        else if (u >= 0xDC00 && u <= 0xDFFF)
        {
            // Low surrogate without a preceding high surrogate
            msg = "unpaired surrogate UTF-16 value";
            goto Lerr;
        }
        else if (u == 0xFFFE || u == 0xFFFF)
        {
            msg = "illegal UTF-16 value";
            goto Lerr;
        }
        else
            i++;
    }
    else
    {
        // 7 bit value, stands for itself
        i++;
    }

    assert(utf_isValidDchar(u));
    *pidx = i;
    *presult = (dchar_t)u;
    return NULL;

Lerr:
    // Hand back the offending code unit and step over it so that the
    // caller's loop keeps making progress
    *presult = (dchar_t)s[i];
    *pidx = i + 1;
    return msg;
}
/********************************************
 * Cast this string literal to type t.
 *
 * Depending on the source and target types this either retypes the
 * literal, transcodes its contents between UTF-8/UTF-16/UTF-32,
 * truncates or zero-extends it to a static array dimension, or wraps it
 * in a CastExp.
 *
 * Params:
 *      sc  scope, only forwarded to Expression::castTo() for the
 *          delegate case
 *      t   target type
 * Returns:
 *      a StringExp (possibly a modified copy of this), a CastExp whose
 *      type is already set to t, or an ErrorExp when an uncommitted
 *      literal is cast to void*.
 */
Expression *StringExp::castTo(Scope *sc, Type *t)
{
    /* This follows copy-on-write; any changes to 'this'
     * will result in a copy.
     * The this->string member is considered immutable.
     */
    StringExp *se;
    Type *tb;
    int copied = 0;     // set once 'se' refers to a copy rather than 'this'

    //printf("StringExp::castTo(t = %s), '%s' committed = %d\n", t->toChars(), toChars(), committed);

    if (!committed && t->ty == Tpointer && t->nextOf()->ty == Tvoid)
    {
        error("cannot convert string literal to void*");
        return new ErrorExp();
    }

    se = this;
    if (!committed)
    {
        se = (StringExp *)copy();
        se->committed = 1;
        copied = 1;
    }

    if (type == t)
    {
        return se;
    }

    tb = t->toBasetype();
    //printf("\ttype = %s\n", type->toChars());
    if (tb->ty == Tdelegate && type->toBasetype()->ty != Tdelegate)
        return Expression::castTo(sc, t);

    Type *typeb = type->toBasetype();
    if (typeb == tb)
    {
        // Same base type: only the type needs replacing
        if (!copied)
        {
            se = (StringExp *)copy();
            copied = 1;
        }
        se->type = t;
        return se;
    }

    // Either side not being an array/pointer type is left to a CastExp
    if (tb->ty != Tsarray && tb->ty != Tarray && tb->ty != Tpointer)
    {
        if (!copied)
        {
            se = (StringExp *)copy();
            copied = 1;
        }
        goto Lcast;
    }
    if (typeb->ty != Tsarray && typeb->ty != Tarray && typeb->ty != Tpointer)
    {
        if (!copied)
        {
            se = (StringExp *)copy();
            copied = 1;
        }
        goto Lcast;
    }

    if (typeb->nextOf()->size() == tb->nextOf()->size())
    {
        // Element sizes match, so the string data can be kept as it is
        if (!copied)
        {
            se = (StringExp *)copy();
            copied = 1;
        }
        if (tb->ty == Tsarray)
            goto L2;    // handle possible change in static array dimension
        se->type = t;
        return se;
    }

    // A committed literal is not transcoded
    if (committed)
        goto Lcast;

    // Combine the source and target element type codes into one switch key
#define X(tf,tt)        ((tf) * 256 + (tt))
    {
        OutBuffer buffer;
        size_t newlen = 0;
        int tfty = typeb->nextOf()->toBasetype()->ty;
        int ttty = tb->nextOf()->toBasetype()->ty;

        switch (X(tfty, ttty))
        {
            case X(Tchar, Tchar):
            case X(Twchar,Twchar):
            case X(Tdchar,Tdchar):
                break;

            case X(Tchar, Twchar):
                for (size_t u = 0; u < len;)
                {
                    unsigned c;
                    const char *p = utf_decodeChar((unsigned char *)se->string, len, &u, &c);
                    if (p)
                        error("%s", p);
                    else
                        buffer.writeUTF16(c);
                }
                newlen = buffer.offset / 2;
                buffer.writeUTF16(0);
                goto L1;

            case X(Tchar, Tdchar):
                for (size_t u = 0; u < len;)
                {
                    unsigned c;
                    const char *p = utf_decodeChar((unsigned char *)se->string, len, &u, &c);
                    if (p)
                        error("%s", p);
                    // c is written even after a reported error
                    buffer.write4(c);
                    newlen++;
                }
                buffer.write4(0);
                goto L1;

            case X(Twchar,Tchar):
                for (size_t u = 0; u < len;)
                {
                    unsigned c;
                    const char *p = utf_decodeWchar((unsigned short *)se->string, len, &u, &c);
                    if (p)
                        error("%s", p);
                    else
                        buffer.writeUTF8(c);
                }
                newlen = buffer.offset;
                buffer.writeUTF8(0);
                goto L1;

            case X(Twchar,Tdchar):
                for (size_t u = 0; u < len;)
                {
                    unsigned c;
                    const char *p = utf_decodeWchar((unsigned short *)se->string, len, &u, &c);
                    if (p)
                        error("%s", p);
                    // c is written even after a reported error
                    buffer.write4(c);
                    newlen++;
                }
                buffer.write4(0);
                goto L1;

            case X(Tdchar,Tchar):
                for (size_t u = 0; u < len; u++)
                {
                    unsigned c = ((unsigned *)se->string)[u];
                    if (!utf_isValidDchar(c))
                        error("invalid UCS-32 char \\U%08x", c);
                    else
                        buffer.writeUTF8(c);
                }
                // The length is taken from the bytes actually written
                newlen = buffer.offset;
                buffer.writeUTF8(0);
                goto L1;

            case X(Tdchar,Twchar):
                for (size_t u = 0; u < len; u++)
                {
                    unsigned c = ((unsigned *)se->string)[u];
                    if (!utf_isValidDchar(c))
                        error("invalid UCS-32 char \\U%08x", c);
                    else
                        buffer.writeUTF16(c);
                }
                // The length is taken from the code units actually written
                newlen = buffer.offset / 2;
                buffer.writeUTF16(0);
                goto L1;

            L1:
                // Install the transcoded data in the (copied) literal
                if (!copied)
                {
                    se = (StringExp *)copy();
                    copied = 1;
                }
                se->string = buffer.extractData();
                se->len = newlen;
                se->sz = tb->nextOf()->size();
                break;

            default:
                assert(typeb->nextOf()->size() != tb->nextOf()->size());
                goto Lcast;
        }
    }
#undef X

L2:
    assert(copied);

    // See if need to truncate or extend the literal
    if (tb->ty == Tsarray)
    {
        // Kept in size_t so that the comparisons with se->len and the
        // byte counts below are not computed in int/unsigned
        size_t dim2 = (size_t)((TypeSArray *)tb)->dim->toInteger();

        //printf("dim from = %d, to = %d\n", (int)se->len, (int)dim2);

        // Changing dimensions
        if (dim2 != se->len)
        {
            // Copy when changing the string literal
            size_t newsz = se->sz;
            size_t d = (dim2 < se->len) ? dim2 : se->len;   // elements to keep
            void *s = (unsigned char *)mem.malloc((dim2 + 1) * newsz);

            memcpy(s, se->string, d * newsz);
            // Extend with 0, add terminating 0
            memset((char *)s + d * newsz, 0, (dim2 + 1 - d) * newsz);
            se->string = s;
            se->len = dim2;
        }
    }
    se->type = t;
    return se;

Lcast:
    Expression *e = new CastExp(loc, se, t);
    e->type = t;        // so semantic() won't be run on e
    return e;
}
/* Write the mangled form of a string literal to buf:
 * a kind character ('a', 'w' or 'd' for element sizes 1, 2 and 4),
 * the UTF-8 byte count in decimal, '_', then every UTF-8 byte as two
 * lowercase hex digits.
 */
void visit(StringExp *e)
{
    static const char hexDigits[] = "0123456789abcdef";

    OutBuffer utf8;     // holds the transcoded text for the 2 and 4 byte cases
    utf8_t *bytes;
    size_t nbytes;
    char kind;

    /* Write string in UTF-8 format
     */
    switch (e->sz)
    {
        case 1:
            kind = 'a';
            bytes = (utf8_t *)e->string;
            nbytes = e->len;
            break;

        case 2:
        {
            kind = 'w';
            size_t idx = 0;
            while (idx < e->len)
            {
                unsigned ch;
                const char *err = utf_decodeWchar((unsigned short *)e->string, e->len, &idx, &ch);
                if (err)
                    e->error("%s", err);
                else
                    utf8.writeUTF8(ch);
            }
            bytes = (utf8_t *)utf8.data;
            nbytes = utf8.offset;
            break;
        }

        case 4:
        {
            kind = 'd';
            size_t idx = 0;
            while (idx < e->len)
            {
                unsigned ch = ((unsigned *)e->string)[idx];
                if (!utf_isValidDchar(ch))
                    e->error("invalid UCS-32 char \\U%08x", ch);
                else
                    utf8.writeUTF8(ch);
                idx++;
            }
            bytes = (utf8_t *)utf8.data;
            nbytes = utf8.offset;
            break;
        }

        default:
            assert(0);
    }

    // Room for the kind character, the "<count>_" prefix and two hex
    // digits per byte
    buf->reserve(1 + 11 + 2 * nbytes);
    buf->writeByte(kind);
    buf->printf("%d_", (int)nbytes);    // nbytes <= 11

    // The hex digits are stored straight into the reserved space, and the
    // write offset is advanced once afterwards
    utf8_t *out = (utf8_t *)buf->data + buf->offset;
    for (size_t k = 0; k < nbytes; ++k)
    {
        utf8_t b = bytes[k];
        out[2 * k]     = (utf8_t)hexDigits[(b >> 4) & 0xF];
        out[2 * k + 1] = (utf8_t)hexDigits[b & 0xF];
    }
    buf->offset += 2 * nbytes;
}
const char *utf_decodeChar(unsigned char *s, size_t len, size_t *pidx, dchar_t *presult) { dchar_t V; size_t i = *pidx; unsigned char u = s[i]; //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len); assert(i >= 0 && i < len); if (u & 0x80) { unsigned n; unsigned char u2; /* The following encodings are valid, except for the 5 and 6 byte * combinations: * 0xxxxxxx * 110xxxxx 10xxxxxx * 1110xxxx 10xxxxxx 10xxxxxx * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ for (n = 1; ; n++) { if (n > 4) goto Lerr; // only do the first 4 of 6 encodings if (((u << n) & 0x80) == 0) { if (n == 1) goto Lerr; break; } } // Pick off (7 - n) significant bits of B from first byte of octet V = (dchar_t)(u & ((1 << (7 - n)) - 1)); if (i + (n - 1) >= len) goto Lerr; // off end of string /* The following combinations are overlong, and illegal: * 1100000x (10xxxxxx) * 11100000 100xxxxx (10xxxxxx) * 11110000 1000xxxx (10xxxxxx 10xxxxxx) * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) */ u2 = s[i + 1]; if ((u & 0xFE) == 0xC0 || (u == 0xE0 && (u2 & 0xE0) == 0x80) || (u == 0xF0 && (u2 & 0xF0) == 0x80) || (u == 0xF8 && (u2 & 0xF8) == 0x80) || (u == 0xFC && (u2 & 0xFC) == 0x80)) goto Lerr; // overlong combination for (unsigned j = 1; j != n; j++) { u = s[i + j]; if ((u & 0xC0) != 0x80) goto Lerr; // trailing bytes are 10xxxxxx V = (V << 6) | (u & 0x3F); } if (!utf_isValidDchar(V)) goto Lerr; i += n; } else { V = (dchar_t) u; i++; } assert(utf_isValidDchar(V)); *pidx = i; *presult = V; return NULL; Lerr: *presult = (dchar_t) s[i]; *pidx = i + 1; return "invalid UTF-8 sequence"; }