void Module::parse() #endif { char *srcname; unsigned char *buf; unsigned buflen; unsigned le; unsigned bom; //printf("Module::parse()\n"); srcname = srcfile->name->toChars(); //printf("Module::parse(srcname = '%s')\n", srcname); buf = srcfile->buffer; buflen = srcfile->len; if (buflen >= 2) { /* Convert all non-UTF-8 formats to UTF-8. * BOM : http://www.unicode.org/faq/utf_bom.html * 00 00 FE FF UTF-32BE, big-endian * FF FE 00 00 UTF-32LE, little-endian * FE FF UTF-16BE, big-endian * FF FE UTF-16LE, little-endian * EF BB BF UTF-8 */ bom = 1; // assume there's a BOM if (buf[0] == 0xFF && buf[1] == 0xFE) { if (buflen >= 4 && buf[2] == 0 && buf[3] == 0) { // UTF-32LE le = 1; Lutf32: OutBuffer dbuf; unsigned *pu = (unsigned *)(buf); unsigned *pumax = &pu[buflen / 4]; if (buflen & 3) { error("odd length of UTF-32 char source %u", buflen); fatal(); } dbuf.reserve(buflen / 4); for (pu += bom; pu < pumax; pu++) { unsigned u; u = le ? readlongLE(pu) : readlongBE(pu); if (u & ~0x7F) { if (u > 0x10FFFF) { error("UTF-32 value %08x greater than 0x10FFFF", u); fatal(); } dbuf.writeUTF8(u); } else dbuf.writeByte(u); } dbuf.writeByte(0); // add 0 as sentinel for scanner buflen = dbuf.offset - 1; // don't include sentinel in count buf = (unsigned char *) dbuf.extractData(); } else { // UTF-16LE (X86) // Convert it to UTF-8 le = 1; Lutf16: OutBuffer dbuf; unsigned short *pu = (unsigned short *)(buf); unsigned short *pumax = &pu[buflen / 2]; if (buflen & 1) { error("odd length of UTF-16 char source %u", buflen); fatal(); } dbuf.reserve(buflen / 2); for (pu += bom; pu < pumax; pu++) { unsigned u; u = le ? readwordLE(pu) : readwordBE(pu); if (u & ~0x7F) { if (u >= 0xD800 && u <= 0xDBFF) { unsigned u2; if (++pu > pumax) { error("surrogate UTF-16 high value %04x at EOF", u); fatal(); } u2 = le ? readwordLE(pu) : readwordBE(pu); if (u2 < 0xDC00 || u2 > 0xDFFF) { error("surrogate UTF-16 low value %04x out of range", u2); fatal(); } u = (u - 0xD7C0) << 10; u |= (u2 - 0xDC00); } else if (u >= 0xDC00 && u <= 0xDFFF) { error("unpaired surrogate UTF-16 value %04x", u); fatal(); } else if (u == 0xFFFE || u == 0xFFFF) { error("illegal UTF-16 value %04x", u); fatal(); } dbuf.writeUTF8(u); } else dbuf.writeByte(u); } dbuf.writeByte(0); // add 0 as sentinel for scanner buflen = dbuf.offset - 1; // don't include sentinel in count buf = (unsigned char *) dbuf.extractData(); } } else if (buf[0] == 0xFE && buf[1] == 0xFF) { // UTF-16BE le = 0; goto Lutf16; } else if (buflen >= 4 && buf[0] == 0 && buf[1] == 0 && buf[2] == 0xFE && buf[3] == 0xFF) { // UTF-32BE le = 0; goto Lutf32; } else if (buflen >= 3 && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) { // UTF-8 buf += 3; buflen -= 3; } else { /* There is no BOM. Make use of Arcane Jill's insight that * the first char of D source must be ASCII to * figure out the encoding. */ bom = 0; if (buflen >= 4) { if (buf[1] == 0 && buf[2] == 0 && buf[3] == 0) { // UTF-32LE le = 1; goto Lutf32; } else if (buf[0] == 0 && buf[1] == 0 && buf[2] == 0) { // UTF-32BE le = 0; goto Lutf32; } } if (buflen >= 2) { if (buf[1] == 0) { // UTF-16LE le = 1; goto Lutf16; } else if (buf[0] == 0) { // UTF-16BE le = 0; goto Lutf16; } } // It's UTF-8 if (buf[0] >= 0x80) { error("source file must start with BOM or ASCII character, not \\x%02X", buf[0]); fatal(); } } } #ifdef IN_GCC // dump utf-8 encoded source if (dump_source) { // %% srcname could contain a path ... d_gcc_dump_source(srcname, "utf-8", buf, buflen); } #endif /* If it starts with the string "Ddoc", then it's a documentation * source file. */ if (buflen >= 4 && memcmp(buf, "Ddoc", 4) == 0) { comment = buf + 4; isDocFile = 1; return; } if (isHtml) { OutBuffer *dbuf = new OutBuffer(); Html h(srcname, buf, buflen); h.extractCode(dbuf); buf = dbuf->data; buflen = dbuf->offset; #ifdef IN_GCC // dump extracted source if (dump_source) d_gcc_dump_source(srcname, "d.utf-8", buf, buflen); #endif } #if IN_LLVM Parser p(this, buf, buflen, gen_docs); #else Parser p(this, buf, buflen, docfile != NULL); #endif p.nextToken(); members = p.parseModule(); md = p.md; numlines = p.loc.linnum; DsymbolTable *dst; if (md) { this->ident = md->id; dst = Package::resolve(md->packages, &this->parent, NULL); } else { dst = modules; /* Check to see if module name is a valid identifier */ if (!Lexer::isValidIdentifier(this->ident->toChars())) error("has non-identifier characters in filename, use module declaration instead"); } // Update global list of modules if (!dst->insert(this)) { Dsymbol *prev = dst->lookup(ident); assert(prev); Module *mprev = prev->isModule(); if (mprev) error(loc, "from file %s conflicts with another module %s from file %s", srcname, mprev->toChars(), mprev->srcfile->toChars()); else { Package *pkg = prev->isPackage(); assert(pkg); error(loc, "from file %s conflicts with package name %s", srcname, pkg->toChars()); } } else { amodules.push(this); } }
void Module::parse() { //printf("Module::parse()\n"); char *srcname = srcfile->name->toChars(); //printf("Module::parse(srcname = '%s')\n", srcname); utf8_t *buf = (utf8_t *)srcfile->buffer; size_t buflen = srcfile->len; if (buflen >= 2) { /* Convert all non-UTF-8 formats to UTF-8. * BOM : http://www.unicode.org/faq/utf_bom.html * 00 00 FE FF UTF-32BE, big-endian * FF FE 00 00 UTF-32LE, little-endian * FE FF UTF-16BE, big-endian * FF FE UTF-16LE, little-endian * EF BB BF UTF-8 */ unsigned le; unsigned bom = 1; // assume there's a BOM if (buf[0] == 0xFF && buf[1] == 0xFE) { if (buflen >= 4 && buf[2] == 0 && buf[3] == 0) { // UTF-32LE le = 1; Lutf32: OutBuffer dbuf; unsigned *pu = (unsigned *)(buf); unsigned *pumax = &pu[buflen / 4]; if (buflen & 3) { error("odd length of UTF-32 char source %u", buflen); fatal(); } dbuf.reserve(buflen / 4); for (pu += bom; pu < pumax; pu++) { unsigned u; u = le ? readlongLE(pu) : readlongBE(pu); if (u & ~0x7F) { if (u > 0x10FFFF) { error("UTF-32 value %08x greater than 0x10FFFF", u); fatal(); } dbuf.writeUTF8(u); } else dbuf.writeByte(u); } dbuf.writeByte(0); // add 0 as sentinel for scanner buflen = dbuf.offset - 1; // don't include sentinel in count buf = (utf8_t *) dbuf.extractData(); } else { // UTF-16LE (X86) // Convert it to UTF-8 le = 1; Lutf16: OutBuffer dbuf; unsigned short *pu = (unsigned short *)(buf); unsigned short *pumax = &pu[buflen / 2]; if (buflen & 1) { error("odd length of UTF-16 char source %u", buflen); fatal(); } dbuf.reserve(buflen / 2); for (pu += bom; pu < pumax; pu++) { unsigned u; u = le ? readwordLE(pu) : readwordBE(pu); if (u & ~0x7F) { if (u >= 0xD800 && u <= 0xDBFF) { unsigned u2; if (++pu > pumax) { error("surrogate UTF-16 high value %04x at EOF", u); fatal(); } u2 = le ? readwordLE(pu) : readwordBE(pu); if (u2 < 0xDC00 || u2 > 0xDFFF) { error("surrogate UTF-16 low value %04x out of range", u2); fatal(); } u = (u - 0xD7C0) << 10; u |= (u2 - 0xDC00); } else if (u >= 0xDC00 && u <= 0xDFFF) { error("unpaired surrogate UTF-16 value %04x", u); fatal(); } else if (u == 0xFFFE || u == 0xFFFF) { error("illegal UTF-16 value %04x", u); fatal(); } dbuf.writeUTF8(u); } else dbuf.writeByte(u); } dbuf.writeByte(0); // add 0 as sentinel for scanner buflen = dbuf.offset - 1; // don't include sentinel in count buf = (utf8_t *) dbuf.extractData(); } } else if (buf[0] == 0xFE && buf[1] == 0xFF) { // UTF-16BE le = 0; goto Lutf16; } else if (buflen >= 4 && buf[0] == 0 && buf[1] == 0 && buf[2] == 0xFE && buf[3] == 0xFF) { // UTF-32BE le = 0; goto Lutf32; } else if (buflen >= 3 && buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) { // UTF-8 buf += 3; buflen -= 3; } else { /* There is no BOM. Make use of Arcane Jill's insight that * the first char of D source must be ASCII to * figure out the encoding. */ bom = 0; if (buflen >= 4) { if (buf[1] == 0 && buf[2] == 0 && buf[3] == 0) { // UTF-32LE le = 1; goto Lutf32; } else if (buf[0] == 0 && buf[1] == 0 && buf[2] == 0) { // UTF-32BE le = 0; goto Lutf32; } } if (buflen >= 2) { if (buf[1] == 0) { // UTF-16LE le = 1; goto Lutf16; } else if (buf[0] == 0) { // UTF-16BE le = 0; goto Lutf16; } } // It's UTF-8 if (buf[0] >= 0x80) { error("source file must start with BOM or ASCII character, not \\x%02X", buf[0]); fatal(); } } } /* If it starts with the string "Ddoc", then it's a documentation * source file. */ if (buflen >= 4 && memcmp(buf, "Ddoc", 4) == 0) { comment = buf + 4; isDocFile = 1; if (!docfile) setDocfile(); return; } { Parser p(this, buf, buflen, docfile != NULL); p.nextToken(); members = p.parseModule(); md = p.md; numlines = p.scanloc.linnum; } if (srcfile->ref == 0) ::free(srcfile->buffer); srcfile->buffer = NULL; srcfile->len = 0; /* The symbol table into which the module is to be inserted. */ DsymbolTable *dst; if (md) { /* A ModuleDeclaration, md, was provided. * The ModuleDeclaration sets the packages this module appears in, and * the name of this module. */ this->ident = md->id; this->safe = md->safe; Package *ppack = NULL; dst = Package::resolve(md->packages, &this->parent, &ppack); assert(dst); Module *m = ppack ? ppack->isModule() : NULL; if (m && strcmp(m->srcfile->name->name(), "package.d") != 0) { ::error(md->loc, "package name '%s' conflicts with usage as a module name in file %s", ppack->toPrettyChars(), m->srcfile->toChars()); } } else { /* The name of the module is set to the source file name. * There are no packages. */ dst = modules; // and so this module goes into global module symbol table /* Check to see if module name is a valid identifier */ if (!Lexer::isValidIdentifier(this->ident->toChars())) error("has non-identifier characters in filename, use module declaration instead"); } // Insert module into the symbol table Dsymbol *s = this; bool isPackageMod = strcmp(srcfile->name->name(), "package.d") == 0; if (isPackageMod) { /* If the source tree is as follows: * pkg/ * +- package.d * +- common.d * the 'pkg' will be incorporated to the internal package tree in two ways: * import pkg; * and: * import pkg.common; * * If both are used in one compilation, 'pkg' as a module (== pkg/package.d) * and a package name 'pkg' will conflict each other. * * To avoid the conflict: * 1. If preceding package name insertion had occurred by Package::resolve, * later package.d loading will change Package::isPkgMod to PKGmodule and set Package::mod. * 2. Otherwise, 'package.d' wrapped by 'Package' is inserted to the internal tree in here. */ Package *p = new Package(ident); p->parent = this->parent; p->isPkgMod = PKGmodule; p->mod = this; p->symtab = new DsymbolTable(); s = p; } if (!dst->insert(s)) { /* It conflicts with a name that is already in the symbol table. * Figure out what went wrong, and issue error message. */ Dsymbol *prev = dst->lookup(ident); assert(prev); if (Module *mprev = prev->isModule()) { if (strcmp(srcname, mprev->srcfile->toChars()) == 0) error(loc, "from file %s must be imported as module '%s'", srcname, toPrettyChars()); else error(loc, "from file %s conflicts with another module %s from file %s", srcname, mprev->toChars(), mprev->srcfile->toChars()); } else if (Package *pkg = prev->isPackage()) { if (pkg->isPkgMod == PKGunknown && isPackageMod) { /* If the previous inserted Package is not yet determined as package.d, * link it to the actual module. */ pkg->isPkgMod = PKGmodule; pkg->mod = this; } else error(pkg->loc, "from file %s conflicts with package name %s", srcname, pkg->toChars()); } else assert(global.errors); } else { // Add to global array of all modules amodules.push(this); } }