void Module::parse() #endif { char *srcname; unsigned char *buf; unsigned buflen; unsigned le; unsigned bom; //printf("Module::parse()\n"); srcname = srcfile->name->toChars(); //printf("Module::parse(srcname = '%s')\n", srcname); buf = srcfile->buffer; buflen = srcfile->len; if (buflen >= 2) { /* Convert all non-UTF-8 formats to UTF-8. * BOM : http://www.unicode.org/faq/utf_bom.html * 00 00 FE FF UTF-32BE, big-endian * FF FE 00 00 UTF-32LE, little-endian * FE FF UTF-16BE, big-endian * FF FE UTF-16LE, little-endian * EF BB BF UTF-8 */ bom = 1; // assume there's a BOM if (buf[0] == 0xFF && buf[1] == 0xFE) { if (buflen >= 4 && buf[2] == 0 && buf[3] == 0) { // UTF-32LE le = 1; Lutf32: OutBuffer dbuf; unsigned *pu = (unsigned *)(buf); unsigned *pumax = &pu[buflen / 4]; if (buflen & 3) { error("odd length of UTF-32 char source %u", buflen); fatal(); } dbuf.reserve(buflen / 4); for (pu += bom; pu < pumax; pu++) { unsigned u; u = le ? readlongLE(pu) : readlongBE(pu); if (u & ~0x7F) { if (u > 0x10FFFF) { error("UTF-32 value %08x greater than 0x10FFFF", u); fatal(); } dbuf.writeUTF8(u); } else dbuf.writeByte(u); } dbuf.writeByte(0); // add 0 as sentinel for scanner buflen = dbuf.offset - 1; // don't include sentinel in count buf = (unsigned char *) dbuf.extractData(); } else { // UTF-16LE (X86) // Convert it to UTF-8 le = 1; Lutf16: OutBuffer dbuf; unsigned short *pu = (unsigned short *)(buf); unsigned short *pumax = &pu[buflen / 2]; if (buflen & 1) { error("odd length of UTF-16 char source %u", buflen); fatal(); } dbuf.reserve(buflen / 2); for (pu += bom; pu < pumax; pu++) { unsigned u; u = le ? readwordLE(pu) : readwordBE(pu); if (u & ~0x7F) { if (u >= 0xD800 && u <= 0xDBFF) { unsigned u2; if (++pu > pumax) { error("surrogate UTF-16 high value %04x at EOF", u); fatal(); } u2 = le ? readwordLE(pu) : readwordBE(pu); if (u2 < 0xDC00 || u2 > 0xDFFF) { error("surrogate UTF-16 low value %04x out of range", u2); fatal(); } u = (u - 0xD7C0) << 10; u |= (u2 - 0xDC00); } else if (u >= 0xDC00 && u <= 0xDFFF) { error("unpaired surrogate UTF-16 value %04x", u); fatal(); } else if (u == 0xFFFE || u == 0xFFFF) { error("illegal UTF-16 value %04x", u); fatal(); } dbuf.writeUTF8(u); } else dbuf.writeByte(u); } dbuf.writeByte(0); // add 0 as sentinel for scanner buflen = dbuf.offset - 1; // don't include sentinel in count buf = (unsigned char *) dbuf.extractData(); } } else if (buf[0] == 0xFE && buf[1] == 0xFF) { // UTF-16BE le = 0; goto Lutf16; } else if (buflen >= 4 && buf[0] == 0 && buf[1] == 0 && buf[2] == 0xFE && buf[3] == 0xFF) { // UTF-32BE le = 0; goto Lutf32; } else if (buflen >= 3 && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) { // UTF-8 buf += 3; buflen -= 3; } else { /* There is no BOM. Make use of Arcane Jill's insight that * the first char of D source must be ASCII to * figure out the encoding. */ bom = 0; if (buflen >= 4) { if (buf[1] == 0 && buf[2] == 0 && buf[3] == 0) { // UTF-32LE le = 1; goto Lutf32; } else if (buf[0] == 0 && buf[1] == 0 && buf[2] == 0) { // UTF-32BE le = 0; goto Lutf32; } } if (buflen >= 2) { if (buf[1] == 0) { // UTF-16LE le = 1; goto Lutf16; } else if (buf[0] == 0) { // UTF-16BE le = 0; goto Lutf16; } } // It's UTF-8 if (buf[0] >= 0x80) { error("source file must start with BOM or ASCII character, not \\x%02X", buf[0]); fatal(); } } } #ifdef IN_GCC // dump utf-8 encoded source if (dump_source) { // %% srcname could contain a path ... d_gcc_dump_source(srcname, "utf-8", buf, buflen); } #endif /* If it starts with the string "Ddoc", then it's a documentation * source file. */ if (buflen >= 4 && memcmp(buf, "Ddoc", 4) == 0) { comment = buf + 4; isDocFile = 1; return; } if (isHtml) { OutBuffer *dbuf = new OutBuffer(); Html h(srcname, buf, buflen); h.extractCode(dbuf); buf = dbuf->data; buflen = dbuf->offset; #ifdef IN_GCC // dump extracted source if (dump_source) d_gcc_dump_source(srcname, "d.utf-8", buf, buflen); #endif } #if IN_LLVM Parser p(this, buf, buflen, gen_docs); #else Parser p(this, buf, buflen, docfile != NULL); #endif p.nextToken(); members = p.parseModule(); md = p.md; numlines = p.loc.linnum; DsymbolTable *dst; if (md) { this->ident = md->id; dst = Package::resolve(md->packages, &this->parent, NULL); } else { dst = modules; /* Check to see if module name is a valid identifier */ if (!Lexer::isValidIdentifier(this->ident->toChars())) error("has non-identifier characters in filename, use module declaration instead"); } // Update global list of modules if (!dst->insert(this)) { Dsymbol *prev = dst->lookup(ident); assert(prev); Module *mprev = prev->isModule(); if (mprev) error(loc, "from file %s conflicts with another module %s from file %s", srcname, mprev->toChars(), mprev->srcfile->toChars()); else { Package *pkg = prev->isPackage(); assert(pkg); error(loc, "from file %s conflicts with package name %s", srcname, pkg->toChars()); } } else { amodules.push(this); } }
Expression *StringExp::castTo(Scope *sc, Type *t) { /* This follows copy-on-write; any changes to 'this' * will result in a copy. * The this->string member is considered immutable. */ StringExp *se; Type *tb; int copied = 0; //printf("StringExp::castTo(t = %s), '%s' committed = %d\n", t->toChars(), toChars(), committed); if (!committed && t->ty == Tpointer && t->nextOf()->ty == Tvoid) { error("cannot convert string literal to void*"); return new ErrorExp(); } se = this; if (!committed) { se = (StringExp *)copy(); se->committed = 1; copied = 1; } if (type == t) { return se; } tb = t->toBasetype(); //printf("\ttype = %s\n", type->toChars()); if (tb->ty == Tdelegate && type->toBasetype()->ty != Tdelegate) return Expression::castTo(sc, t); Type *typeb = type->toBasetype(); if (typeb == tb) { if (!copied) { se = (StringExp *)copy(); copied = 1; } se->type = t; return se; } if (tb->ty != Tsarray && tb->ty != Tarray && tb->ty != Tpointer) { if (!copied) { se = (StringExp *)copy(); copied = 1; } goto Lcast; } if (typeb->ty != Tsarray && typeb->ty != Tarray && typeb->ty != Tpointer) { if (!copied) { se = (StringExp *)copy(); copied = 1; } goto Lcast; } if (typeb->nextOf()->size() == tb->nextOf()->size()) { if (!copied) { se = (StringExp *)copy(); copied = 1; } if (tb->ty == Tsarray) goto L2; // handle possible change in static array dimension se->type = t; return se; } if (committed) goto Lcast; #define X(tf,tt) ((tf) * 256 + (tt)) { OutBuffer buffer; size_t newlen = 0; int tfty = typeb->nextOf()->toBasetype()->ty; int ttty = tb->nextOf()->toBasetype()->ty; switch (X(tfty, ttty)) { case X(Tchar, Tchar): case X(Twchar,Twchar): case X(Tdchar,Tdchar): break; case X(Tchar, Twchar): for (size_t u = 0; u < len;) { unsigned c; const char *p = utf_decodeChar((unsigned char *)se->string, len, &u, &c); if (p) error("%s", p); else buffer.writeUTF16(c); } newlen = buffer.offset / 2; buffer.writeUTF16(0); goto L1; case X(Tchar, Tdchar): for (size_t u = 0; u < len;) { unsigned c; const char *p = utf_decodeChar((unsigned char *)se->string, len, &u, &c); if (p) error("%s", p); buffer.write4(c); newlen++; } buffer.write4(0); goto L1; case X(Twchar,Tchar): for (size_t u = 0; u < len;) { unsigned c; const char *p = utf_decodeWchar((unsigned short *)se->string, len, &u, &c); if (p) error("%s", p); else buffer.writeUTF8(c); } newlen = buffer.offset; buffer.writeUTF8(0); goto L1; case X(Twchar,Tdchar): for (size_t u = 0; u < len;) { unsigned c; const char *p = utf_decodeWchar((unsigned short *)se->string, len, &u, &c); if (p) error("%s", p); buffer.write4(c); newlen++; } buffer.write4(0); goto L1; case X(Tdchar,Tchar): for (size_t u = 0; u < len; u++) { unsigned c = ((unsigned *)se->string)[u]; if (!utf_isValidDchar(c)) error("invalid UCS-32 char \\U%08x", c); else buffer.writeUTF8(c); newlen++; } newlen = buffer.offset; buffer.writeUTF8(0); goto L1; case X(Tdchar,Twchar): for (size_t u = 0; u < len; u++) { unsigned c = ((unsigned *)se->string)[u]; if (!utf_isValidDchar(c)) error("invalid UCS-32 char \\U%08x", c); else buffer.writeUTF16(c); newlen++; } newlen = buffer.offset / 2; buffer.writeUTF16(0); goto L1; L1: if (!copied) { se = (StringExp *)copy(); copied = 1; } se->string = buffer.extractData(); se->len = newlen; se->sz = tb->nextOf()->size(); break; default: assert(typeb->nextOf()->size() != tb->nextOf()->size()); goto Lcast; } } #undef X L2: assert(copied); // See if need to truncate or extend the literal if (tb->ty == Tsarray) { int dim2 = ((TypeSArray *)tb)->dim->toInteger(); //printf("dim from = %d, to = %d\n", se->len, dim2); // Changing dimensions if (dim2 != se->len) { // Copy when changing the string literal unsigned newsz = se->sz; void *s; int d; d = (dim2 < se->len) ? dim2 : se->len; s = (unsigned char *)mem.malloc((dim2 + 1) * newsz); memcpy(s, se->string, d * newsz); // Extend with 0, add terminating 0 memset((char *)s + d * newsz, 0, (dim2 + 1 - d) * newsz); se->string = s; se->len = dim2; } } se->type = t; return se; Lcast: Expression *e = new CastExp(loc, se, t); e->type = t; // so semantic() won't be run on e return e; }
void Module::parse() { //printf("Module::parse()\n"); char *srcname = srcfile->name->toChars(); //printf("Module::parse(srcname = '%s')\n", srcname); utf8_t *buf = (utf8_t *)srcfile->buffer; size_t buflen = srcfile->len; if (buflen >= 2) { /* Convert all non-UTF-8 formats to UTF-8. * BOM : http://www.unicode.org/faq/utf_bom.html * 00 00 FE FF UTF-32BE, big-endian * FF FE 00 00 UTF-32LE, little-endian * FE FF UTF-16BE, big-endian * FF FE UTF-16LE, little-endian * EF BB BF UTF-8 */ unsigned le; unsigned bom = 1; // assume there's a BOM if (buf[0] == 0xFF && buf[1] == 0xFE) { if (buflen >= 4 && buf[2] == 0 && buf[3] == 0) { // UTF-32LE le = 1; Lutf32: OutBuffer dbuf; unsigned *pu = (unsigned *)(buf); unsigned *pumax = &pu[buflen / 4]; if (buflen & 3) { error("odd length of UTF-32 char source %u", buflen); fatal(); } dbuf.reserve(buflen / 4); for (pu += bom; pu < pumax; pu++) { unsigned u; u = le ? readlongLE(pu) : readlongBE(pu); if (u & ~0x7F) { if (u > 0x10FFFF) { error("UTF-32 value %08x greater than 0x10FFFF", u); fatal(); } dbuf.writeUTF8(u); } else dbuf.writeByte(u); } dbuf.writeByte(0); // add 0 as sentinel for scanner buflen = dbuf.offset - 1; // don't include sentinel in count buf = (utf8_t *) dbuf.extractData(); } else { // UTF-16LE (X86) // Convert it to UTF-8 le = 1; Lutf16: OutBuffer dbuf; unsigned short *pu = (unsigned short *)(buf); unsigned short *pumax = &pu[buflen / 2]; if (buflen & 1) { error("odd length of UTF-16 char source %u", buflen); fatal(); } dbuf.reserve(buflen / 2); for (pu += bom; pu < pumax; pu++) { unsigned u; u = le ? readwordLE(pu) : readwordBE(pu); if (u & ~0x7F) { if (u >= 0xD800 && u <= 0xDBFF) { unsigned u2; if (++pu > pumax) { error("surrogate UTF-16 high value %04x at EOF", u); fatal(); } u2 = le ? readwordLE(pu) : readwordBE(pu); if (u2 < 0xDC00 || u2 > 0xDFFF) { error("surrogate UTF-16 low value %04x out of range", u2); fatal(); } u = (u - 0xD7C0) << 10; u |= (u2 - 0xDC00); } else if (u >= 0xDC00 && u <= 0xDFFF) { error("unpaired surrogate UTF-16 value %04x", u); fatal(); } else if (u == 0xFFFE || u == 0xFFFF) { error("illegal UTF-16 value %04x", u); fatal(); } dbuf.writeUTF8(u); } else dbuf.writeByte(u); } dbuf.writeByte(0); // add 0 as sentinel for scanner buflen = dbuf.offset - 1; // don't include sentinel in count buf = (utf8_t *) dbuf.extractData(); } } else if (buf[0] == 0xFE && buf[1] == 0xFF) { // UTF-16BE le = 0; goto Lutf16; } else if (buflen >= 4 && buf[0] == 0 && buf[1] == 0 && buf[2] == 0xFE && buf[3] == 0xFF) { // UTF-32BE le = 0; goto Lutf32; } else if (buflen >= 3 && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) { // UTF-8 buf += 3; buflen -= 3; } else { /* There is no BOM. Make use of Arcane Jill's insight that * the first char of D source must be ASCII to * figure out the encoding. */ bom = 0; if (buflen >= 4) { if (buf[1] == 0 && buf[2] == 0 && buf[3] == 0) { // UTF-32LE le = 1; goto Lutf32; } else if (buf[0] == 0 && buf[1] == 0 && buf[2] == 0) { // UTF-32BE le = 0; goto Lutf32; } } if (buflen >= 2) { if (buf[1] == 0) { // UTF-16LE le = 1; goto Lutf16; } else if (buf[0] == 0) { // UTF-16BE le = 0; goto Lutf16; } } // It's UTF-8 if (buf[0] >= 0x80) { error("source file must start with BOM or ASCII character, not \\x%02X", buf[0]); fatal(); } } } /* If it starts with the string "Ddoc", then it's a documentation * source file. */ if (buflen >= 4 && memcmp(buf, "Ddoc", 4) == 0) { comment = buf + 4; isDocFile = 1; if (!docfile) setDocfile(); return; } { Parser p(this, buf, buflen, docfile != NULL); p.nextToken(); members = p.parseModule(); md = p.md; numlines = p.scanloc.linnum; } if (srcfile->ref == 0) ::free(srcfile->buffer); srcfile->buffer = NULL; srcfile->len = 0; /* The symbol table into which the module is to be inserted. */ DsymbolTable *dst; if (md) { /* A ModuleDeclaration, md, was provided. * The ModuleDeclaration sets the packages this module appears in, and * the name of this module. */ this->ident = md->id; this->safe = md->safe; Package *ppack = NULL; dst = Package::resolve(md->packages, &this->parent, &ppack); assert(dst); Module *m = ppack ? ppack->isModule() : NULL; if (m && strcmp(m->srcfile->name->name(), "package.d") != 0) { ::error(md->loc, "package name '%s' conflicts with usage as a module name in file %s", ppack->toPrettyChars(), m->srcfile->toChars()); } } else { /* The name of the module is set to the source file name. * There are no packages. */ dst = modules; // and so this module goes into global module symbol table /* Check to see if module name is a valid identifier */ if (!Lexer::isValidIdentifier(this->ident->toChars())) error("has non-identifier characters in filename, use module declaration instead"); } // Insert module into the symbol table Dsymbol *s = this; bool isPackageMod = strcmp(srcfile->name->name(), "package.d") == 0; if (isPackageMod) { /* If the source tree is as follows: * pkg/ * +- package.d * +- common.d * the 'pkg' will be incorporated to the internal package tree in two ways: * import pkg; * and: * import pkg.common; * * If both are used in one compilation, 'pkg' as a module (== pkg/package.d) * and a package name 'pkg' will conflict each other. * * To avoid the conflict: * 1. If preceding package name insertion had occurred by Package::resolve, * later package.d loading will change Package::isPkgMod to PKGmodule and set Package::mod. * 2. Otherwise, 'package.d' wrapped by 'Package' is inserted to the internal tree in here. */ Package *p = new Package(ident); p->parent = this->parent; p->isPkgMod = PKGmodule; p->mod = this; p->symtab = new DsymbolTable(); s = p; } if (!dst->insert(s)) { /* It conflicts with a name that is already in the symbol table. * Figure out what went wrong, and issue error message. */ Dsymbol *prev = dst->lookup(ident); assert(prev); if (Module *mprev = prev->isModule()) { if (strcmp(srcname, mprev->srcfile->toChars()) == 0) error(loc, "from file %s must be imported as module '%s'", srcname, toPrettyChars()); else error(loc, "from file %s conflicts with another module %s from file %s", srcname, mprev->toChars(), mprev->srcfile->toChars()); } else if (Package *pkg = prev->isPackage()) { if (pkg->isPkgMod == PKGunknown && isPackageMod) { /* If the previous inserted Package is not yet determined as package.d, * link it to the actual module. */ pkg->isPkgMod = PKGmodule; pkg->mod = this; } else error(pkg->loc, "from file %s conflicts with package name %s", srcname, pkg->toChars()); } else assert(global.errors); } else { // Add to global array of all modules amodules.push(this); } }
void visit(StringExp *e) { char m; OutBuffer tmp; utf8_t *q; size_t qlen; /* Write string in UTF-8 format */ switch (e->sz) { case 1: m = 'a'; q = (utf8_t *)e->string; qlen = e->len; break; case 2: m = 'w'; for (size_t u = 0; u < e->len; ) { unsigned c; const char *p = utf_decodeWchar((unsigned short *)e->string, e->len, &u, &c); if (p) e->error("%s", p); else tmp.writeUTF8(c); } q = (utf8_t *)tmp.data; qlen = tmp.offset; break; case 4: m = 'd'; for (size_t u = 0; u < e->len; u++) { unsigned c = ((unsigned *)e->string)[u]; if (!utf_isValidDchar(c)) e->error("invalid UCS-32 char \\U%08x", c); else tmp.writeUTF8(c); } q = (utf8_t *)tmp.data; qlen = tmp.offset; break; default: assert(0); } buf->reserve(1 + 11 + 2 * qlen); buf->writeByte(m); buf->printf("%d_", (int)qlen); // nbytes <= 11 for (utf8_t *p = (utf8_t *)buf->data + buf->offset, *pend = p + 2 * qlen; p < pend; p += 2, ++q) { utf8_t hi = *q >> 4 & 0xF; p[0] = (utf8_t)(hi < 10 ? hi + '0' : hi - 10 + 'a'); utf8_t lo = *q & 0xF; p[1] = (utf8_t)(lo < 10 ? lo + '0' : lo - 10 + 'a'); } buf->offset += 2 * qlen; }