// Tells whether inChar may appear inside an identifier.
//
// A character qualifies when IsIdentifierStart() accepts it and it is not one of
// the delimiter characters CHAR_SOLIDUS, CHAR_LESS_THAN_SIGN,
// CHAR_GREATER_THAN_SIGN, CHAR_AMPERSAND or CHAR_EQUALS_SIGN.
bool HTMLLexer::IsIdentifierCharacter( UniChar inChar )
{
	// Anything that cannot start an identifier cannot continue one either.
	if (!IsIdentifierStart( inChar ))
		return false;

	bool isDelimiter = inChar == CHAR_SOLIDUS
					|| inChar == CHAR_LESS_THAN_SIGN
					|| inChar == CHAR_GREATER_THAN_SIGN
					|| inChar == CHAR_AMPERSAND
					|| inChar == CHAR_EQUALS_SIGN;

	return !isDelimiter;
}
bool HTMLLexer::AdvanceOneToken( int &outToken, TokenList *outTokens ) { outToken = -1; if (!fLexerInput) return false; // There can be any combination of newlines and whitespaces preceeding semantic tokens, // so we're going to loop until we don't find either. while (fLexerInput->HasMoreChars()) { bool consumedWhitespace = false; bool consumedNewLine = false; if (!SkipWhitespaces( consumedWhitespace, outTokens )) return false; // HTML also treats newlines as a whitespace while (fLexerInput->HasMoreChars() && IsLineEnding( fLexerInput->PeekAtNextChar() )) { // Eat the line ending ConsumeLineEnding( fLexerInput->MoveToNextChar() ); consumedNewLine = true; } // If we're done consuming newlines and whitespaces, then we're done with this loop if (!consumedWhitespace && !consumedNewLine) break; } if (!fLexerInput->HasMoreChars()) return false; // Take a peek at what sort of token we're about to deal with. UniChar uChar = fLexerInput->PeekAtNextChar(); sLONG stringType; sLONG stringValue; if( (outToken = ConsumePossiblePunctuation(uChar, outTokens)) != 0 ) { } else if (IsStringStart( uChar, stringType, stringValue )) { VString vstrQuoted; if (!ConsumeString( &vstrQuoted, outTokens, stringType, stringValue )) { return false; } outToken = stringValue; fLastTokenText = vstrQuoted; } else if (IsIdentifierStart( uChar )) { // The base class assumes we've consumed the first character already for this call. We should // rectify this some day, as it's very confusing. fLexerInput->MoveToNextChar(); VString *vstrNAME = ConsumeIdentifier(); if (!vstrNAME) { return false; } outToken = HTMLLexemes::TEXT; if (outTokens) outTokens->push_back( new HTMLLexerToken( ILexerToken::TT_NAME, fLexerInput->GetCurrentPosition() - vstrNAME->GetLength(), vstrNAME->GetLength(), *vstrNAME, outToken ) ); fLastTokenText = *vstrNAME; delete vstrNAME; } else { return false; } SetLastToken( outToken ); return true; }
char* Preprocessor::ParseLexem( char* start, char* end, Lexem& out ) { if( start == end ) return start; char current_char = *start; if( IsTrivial( current_char ) ) { out.Value += current_char; out.Type = TrivialTypes[Trivials.find_first_of( current_char )]; return ++start; } if( IsIdentifierStart( current_char ) ) return ParseIdentifier( start, end, out ); if( current_char == '#' ) { out.Value = "#"; ++start; if( *start == '#' ) { out.Value = "##"; out.Type = Lexem::IGNORE; return ( ++start ); } while( start != end && ( *start == ' ' || *start == '\t' ) ) ++start; if( start != end && IsIdentifierStart( *start ) ) start = ParseIdentifier( start, end, out ); out.Type = Lexem::PREPROCESSOR; return start; } if( IsNumber( current_char ) ) return ParseNumber( start, end, out ); if( current_char == '\"' ) return ParseStringLiteral( start, end, '\"', out ); if( current_char == '\'' ) return ParseStringLiteral( start, end, '\'', out ); // Todo: set optional ParseCharacterLiteral? if( current_char == '/' ) { // Need to see if it's a comment. ++start; if( start == end ) return start; if( *start == '*' ) return ParseBlockComment( start, end, out ); if( *start == '/' ) return ParseLineComment( start, end, out ); // Not a comment - let default code catch it as MISC --start; } if( current_char == '\\' ) { out.Type = Lexem::BACKSLASH; return ++start; } out.Value = std::string( 1, current_char ); out.Type = Lexem::IGNORE; return ++start; }