/////////////////////////////////////////////////////////////////////////// // // Parsing Interface // Token *tokenizer_next( Tokenizer *tokenizer ) { wchar_t c, next; Token *token = NULL; next = ss_peek( tokenizer->ss_ ); while ( next != WEOF && !token ) { // Whitespace if ( isWhitespaceStart( tokenizer->ss_, 0 ) ) { token = parseWhitespace( tokenizer ); } // Strings else if ( isStringStart( tokenizer->ss_, 0 ) ) { token = parseString( tokenizer ); } // Comments else if ( isCommentStart( tokenizer->ss_, 0 ) ) { token = parseComment( tokenizer ); } // URL else if ( isUrlStart( tokenizer->ss_, 0 ) ) { token = parseUrl( tokenizer ); } // SGML Comments else if ( isSGMLCommentStart( tokenizer->ss_, 0 ) ) { token = parseSGMLComment( tokenizer ); } // Identifier else if ( isIdentifierStart( tokenizer->ss_, 0 ) ) { token = parseIdentifier( tokenizer ); } // @keyword else if ( isAtkeywordStart( tokenizer->ss_, 0 ) ) { token = parseAtkeyword( tokenizer ); } // #keyword else if ( isHashkeywordStart( tokenizer->ss_, 0 ) ) { token = parseHashkeyword( tokenizer ); } // Number else if ( isNumberStart( tokenizer->ss_, 0 ) ) { token = parseNumber( tokenizer ); } // Operators & Delims (everything else) else { token = parseEverythingElse( tokenizer ); } } if ( token ) { return token; } else { return NULL; } }
/**
 * Parse a slash-star comment starting at the stream's current position.
 *
 * The stream must be positioned on the opening delimiter (asserted via
 * isCommentStart).  Characters are consumed up to and including the closing
 * delimiter.  A backslash causes the character after it to be consumed
 * without being examined, so an escaped `*` cannot begin the closing
 * delimiter.
 *
 * If end-of-file is reached before the closing delimiter, the token still
 * covers everything consumed so far and an error is registered through
 * tokenizer_error().
 *
 * @param tokenizer  Tokenizer whose `ss_` stream is read.
 * @return A COMMENT token built by token_new() from the consumed text.
 */
Token *parseComment( Tokenizer *tokenizer ) {
    StatefulString *ss = tokenizer->ss_;
    assert( isCommentStart( ss, 0 ) );

    int start, length;
    int unterminated;
    StatefulStringPosition pos1, pos2;
    Token *token;
    TokenType type = COMMENT;

    start  = ss->next_index;
    pos1   = ss->next_position;
    length = 2;                             // Accounts for the opening `/*`

    ss_getchar( ss ); ss_getchar( ss );     // Throw away `/*`

    while ( ss_peek( ss ) != WEOF &&
            ( ss_peek( ss ) != L'*' || ss_peekx( ss, 1 ) != L'/' ) ) {
        length++;
        // Only swallow (and count) the escaped character when there really
        // is one; a backslash sitting right before end-of-file must not
        // push `length` past the end of the input.
        if ( ss_getchar( ss ) == L'\\' && ss_peek( ss ) != WEOF ) {
            ss_getchar( ss );
            length++;
        }
    }

    unterminated = ( ss_peek( ss ) == WEOF );
    if ( !unterminated ) {
        ss_getchar( ss ); ss_getchar( ss ); // Throw away `*/`
        length += 2;
    }

    // Return the token.
    pos2  = ss->next_position;
    token = token_new( ss_substr( ss, start, length ), length, type, pos1, pos2 );

    if ( unterminated ) {
        // The message buffer is only needed on this path, so it is allocated
        // here rather than up front.  It is handed to tokenizer_error() and
        // not freed locally.
        // NOTE(review): presumably tokenizer_error() takes ownership of the
        // buffer -- confirm against its implementation.
        wchar_t *error = (wchar_t *) malloc( 201 * sizeof( wchar_t ) );
        if ( error ) {
            swprintf( error, 200, L"Encountered end-of-file while parsing a comment.  Probably a forgotten `*/`." );
            tokenizer_error( tokenizer, error, token );
        }
        // If the allocation failed the token is still returned; there is no
        // memory left to carry the diagnostic.
    }

    return token;
}
/**********************************************
 * Scan the HTML text starting at p and write the extracted code to buf.
 *
 * Comments, CDATA sections and tags are handed to their own scanners.
 * Ordinary characters are copied only while inCode is non-zero.  Newlines
 * are copied unconditionally and counted in linnum.  The scan stops at a
 * 0 or 0x1A byte, after which a terminating 0 byte is appended to buf.
 *
 * Params:
 *      buf = output buffer; also stored in dbuf for the other routines
 */
void Html::extractCode(OutBuffer *buf)
{
    dbuf = buf;                 // save for other routines
    buf->reserve(end - p);
    inCode = 0;

    // Note: strings are not recognized outside of tags, so quote characters
    // receive no special treatment in this loop.
    for (bool atEnd = false; !atEnd; )
    {
        const int ch = *p;
        bool ordinary = false;  // set when *p is to be handled as plain text

        if (ch == 0 || ch == 0x1a)
        {
            atEnd = true;       // end of file
        }
        else if (ch == '<')
        {
            if (p[1] == '!' && isCommentStart())
                scanComment();              // Comments start with <!--
            else if (p[1] == '!' && isCDATAStart())
                scanCDATA();
            else if (p[1] == '/' && istagstart(*skipWhite(p + 2)))
                skipTag();
            else if (istagstart(*skipWhite(p + 1)))
                skipTag();
            else
                ordinary = true;            // a '<' that opens nothing
        }
        else if (ch == '&')
        {
            if (inCode)
            {
                // Translate character entity into ascii for D parser
                int entity = charEntity();
                buf->writeUTF8(entity);
            }
            else
            {
                p++;
            }
        }
        else if (ch == '\n' || (ch == '\r' && p[1] != '\n'))
        {
            // A lone '\r' or any '\n' ends a line.  Always extract new
            // lines, so that D lexer counts the lines right.  (The '\r' of
            // a "\r\n" pair is treated as plain text below.)
            linnum++;
            buf->writeByte(*p);
            p++;
        }
        else
        {
            ordinary = true;
        }

        if (ordinary)
        {
            if (inCode)
                buf->writeByte(*p);
            p++;
        }
    }

    buf->writeByte(0);          // ending sentinel
}
/* Small FSM. States indicate what the machine is looking for *next*, * so eg _cfgKEYSTART means "looking for the token that indicates the start * of a key" */ struct configFile *_cfgParseConfigFile (struct configFile *cfg) { char *currentSectionString="DEFAULT"; char *currentStringStart=NULL; char *currentKey=NULL; unsigned int filePos=0, state=_cfgKEYSTART; hash_table *tempHash; /* Create the default section. */ tempHash=hashConstructTable (31); hashInsert (currentSectionString, tempHash, cfg->sections); while (filePos < cfg->bbdgSize) { switch (state) { case _cfgKEYSTART: if (cfg->bbdg[filePos]=='[') { filePos++; currentStringStart=(char *) &(cfg->bbdg[filePos]); state=_cfgSECTIONEND; break; } if (isCommentStart(cfg->bbdg[filePos])) { filePos++; state=_cfgCOMMENTEND; break; } if ( !isspace (cfg->bbdg[filePos]) ) { currentStringStart=(char *) &(cfg->bbdg[filePos]); state=_cfgKEYEND; } else { filePos ++; } break; case _cfgCOMMENTEND: if (cfg->bbdg[filePos]=='\n') { state=_cfgKEYSTART; } filePos++; break; case _cfgSECTIONEND: if (cfg->bbdg[filePos]==']') { cfg->bbdg[filePos]='\0'; currentSectionString=currentStringStart; state=_cfgKEYSTART; } filePos++; break; case _cfgKEYEND: if (isspace (cfg->bbdg[filePos]) || isKeyValSep(cfg->bbdg[filePos])) { if (isKeyValSep(cfg->bbdg[filePos])) { cfg->bbdg[filePos]='\0'; } else { cfg->bbdg[filePos]='\0'; filePos++; } currentKey=currentStringStart; state=_cfgCOLON; } else { //Do this in search routine instead (with strcasecmp) //cfg->bbdg[filePos] = tolower(cfg->bbdg[filePos]); filePos++; } break; case _cfgCOLON: if (isKeyValSep(cfg->bbdg[filePos]) || cfg->bbdg[filePos]=='\0') { state=_cfgVALSTART; } filePos++; break; case _cfgVALSTART: if (!myisblank(cfg->bbdg[filePos])) { currentStringStart=(char *) &(cfg->bbdg[filePos]); state=_cfgVALEND; } else { filePos ++; } break; case _cfgVALEND: if (cfg->bbdg[filePos]=='\n' || isCommentStart(cfg->bbdg[filePos])) { /* First see if the current section exists. 
*/ tempHash=hashLookup (currentSectionString, cfg->sections); if (tempHash==NULL) { tempHash=hashConstructTable (31); hashInsert (currentSectionString, tempHash, cfg->sections); } /* Now stick it in the table. */ if (isCommentStart(cfg->bbdg[filePos])) { cfg->bbdg[filePos]='\0'; hashInsert (currentKey, currentStringStart, tempHash); state=_cfgCOMMENTEND; } else { cfg->bbdg[filePos]='\0'; hashInsert (currentKey, currentStringStart, tempHash); state=_cfgKEYSTART; } } filePos++; break; } } return cfg; }
void Html::skipTag() { enum TagState // what parsing state we're in { TStagstart, // start of tag name TStag, // in a tag name TSrest, // following tag name }; enum TagState state = TStagstart; int inot; unsigned char *tagstart = NULL; int taglen = 0; p++; inot = 0; if (*p == '/') { inot = 1; p++; } while (1) { switch (*p) { case '>': // found end of tag p++; break; case '"': case '\'': state = TSrest; skipString(); continue; case '<': if (p[1] == '!' && isCommentStart()) { // Comments start with <!-- scanComment(); } else if (p[1] == '/' && istagstart(*skipWhite(p + 2))) { error("nested tag"); skipTag(); } else if (istagstart(*skipWhite(p + 1))) { error("nested tag"); skipTag(); } // Treat comments as if they were whitespace state = TSrest; continue; case 0: case 0x1a: error("end of file before end of tag"); break; // end of file case '\r': if (p[1] == '\n') goto Ldefault; case '\n': linnum++; // Always extract new lines, so that code lexer counts the // lines right. dbuf->writeByte(*p); state = TSrest; // end of tag p++; continue; case ' ': case '\t': case '\f': case '\v': if (state == TStagstart) { p++; continue; } default: Ldefault: switch (state) { case TStagstart: // start of tag name assert(istagstart(*p)); state = TStag; tagstart = p; taglen = 0; break; case TStag: if (istag(*p)) { // Continuing tag name taglen++; } else { // End of tag name state = TSrest; } break; case TSrest: break; } p++; continue; } break; } // See if we parsed a <code> or </code> tag if (taglen && memicmp((char *) tagstart, (char *) "CODE", taglen) == 0 && *(p - 2) != '/') // ignore "<code />" (XHTML) { if (inot) { inCode--; if (inCode < 0) inCode = 0; // ignore extra </code>'s } else inCode++; } }