/*
 * Scan the next token out of the tokenizer's buffer.
 *
 * The function is a character-driven state machine keyed on s->tokState.
 * It consumes characters from s->buffer_position up to s->buffer_end and
 * returns as soon as a complete token has been recognised.  The token type
 * is stored in s->tokType and is also the return value; s->token_start is
 * updated to point at the first character of the token.
 *
 * getChar(ch) / unGetChar(ch) are defined elsewhere in the project;
 * presumably they read one character and advance s->buffer_position, and
 * step s->buffer_position back by one character, respectively.
 *
 * Returns:
 *   - a token type (WHITESPACE_TOKEN, PCDATA_TOKEN, OPEN_ELEMENT_TOKEN, ...,
 *     or the raw character itself for '/', '=' and unknown characters
 *     inside a tag);
 *   - when the buffer is exhausted before a token completes: EOF_TOKEN if
 *     the state is T_IN_PCDATA, PARTIAL_TOKEN otherwise.
 */
XMLCToken xmlcTokenizerNextToken(XMLCTokenizer *s)
{
    XMLCCharacter ch;
    /* mark remembers where the current token may start; it is moved forward
       past whitespace while inside a tag. */
    XMLCCharacter *mark = s->token_start = s->buffer_position;
    XMLCUInt i;

    while (s->buffer_position < s->buffer_end) {
        getChar(ch);

        /* Line accounting happens only here: states that do their own
           getChar aren't allowed to eat these. */
        if (ch == '\n' || ch == '\r') {
            s->line_number++;
            s->line_start = s->buffer_position;
        }

        /* printf("char '%c' props %x s->tokState: %@", ch, characterPropertiesTable[ch], [_tokenStateNameStrings objectAtIndex:s->tokState]); */

        switch (s->tokState) {
        /* FIXME: do I need this state? */
        case T_IN_START_PCDATA:
            s->token_start = mark;
            s->tokState = T_IN_PCDATA;
            /* fall through */

        case T_IN_PCDATA:
            if (ch == '<') { /* ^<Marker */
                /* Leave the '<' for the T_IN_TAG state to re-read. */
                unGetChar(ch);
                s->tokState = T_IN_TAG;
                /* Only emit a token if some text was actually collected. */
                if (s->buffer_position > s->token_start) {
                    if (s->isWhiteSpaceToken)
                        return s->tokType = WHITESPACE_TOKEN;
                    else
                        return s->tokType = PCDATA_TOKEN;
                }
            }
            if (!isWhiteSpace(ch))
                s->isWhiteSpaceToken = 0;
            break;

        case T_IN_TAG: /* ^<Marker */
            if (ch == '<') {
                getChar(ch);
                if (ch == '?') { /* <? */
                    s->token_start = mark;
                    s->tokState = T_IN_COMMAND;
                } else if (ch == '/') {
                    return s->tokType = OPEN_SLASH_ELEMENT_TOKEN; /* </ */
                } else if (ch == '!') {
                    s->token_start = mark;
                    s->tokState = T_IN_DECLARATION;
                    getChar(ch);
                    if (ch == '-') {
                        getChar(ch);
                        if (ch == '-') { /* <!-- */
                            s->token_start = mark;
                            s->tokState = T_IN_COMMENT;
                        }
                    } else if (ch == '[') {
                        /* Tentatively a CDATA section; verify the keyword
                           "CDATA[" character by character and drop back to a
                           plain declaration on the first mismatch. */
                        s->tokState = T_IN_CDATA;
                        for (i = 0; i < 6 && s->tokState == T_IN_CDATA; i++) {
                            getChar(ch);
                            /* Was `!ch == ...`, which compares the logical
                               negation of ch and so never detected a
                               mismatch. */
                            if (ch != "CDATA["[i])
                                s->tokState = T_IN_DECLARATION;
                        }
                        if (s->tokState == T_IN_CDATA)
                            s->token_start = mark;
                    } else { /* <! */
                        unGetChar(ch);
                        s->token_start = mark;
                        s->tokState = T_IN_DECLARATION;
                    }
                }
                /* None of the special openers matched: plain element start. */
                if (s->tokState == T_IN_TAG) { /* < */
                    unGetChar(ch);
                    return s->tokType = OPEN_ELEMENT_TOKEN;
                }
            } else if (ch == '>') {
                s->token_start = s->buffer_position;
                s->isWhiteSpaceToken = 1;
                s->tokState = T_IN_PCDATA;
                return s->tokType = CLOSE_ELEMENT_TOKEN;
            } else if (ch == '/') {
                getChar(ch);
                if (ch == '>') {
                    s->token_start = mark;
                    s->tokState = T_IN_PCDATA;
                    return s->tokType = SLASH_CLOSE_ELEMENT_TOKEN;
                } else {
                    unGetChar(ch);
                    return s->tokType = '/'; /* FIXME: is this an error, the parser should complain */
                }
            } else if (ch == '=') {
                s->token_start = mark;
                return s->tokType = '=';
            } else if (ch == '"') {
                s->token_start = mark;
                s->end_quote_char = '"';
                s->tokState = T_IN_QUOTE_STRING;
            } else if (ch == 0x0027) { /*# APOSTROPHE */
                s->token_start = mark;
                s->end_quote_char = 0x0027;
                s->tokState = T_IN_QUOTE_STRING;
            } else if (isNameStartChar(ch)) {
                /* FIXME: whitespace tokens inside tags aren't recognized */
                /* need to handle whitespace tokens in tags, could eliminate s->buffer_position-1 and use _mark */
                s->token_start = s->buffer_position-1;
                s->tokState = T_IN_NAME_STRING;
            } else if (!isWhiteSpace(ch)) {
                /* don't know what this is, return it as itself, let the parser deal with it */
                return s->tokType = ch;
            } else {
                /* Whitespace inside a tag: skip it by advancing the mark. */
                mark = s->buffer_position;
            }
            break;

        case T_IN_DECLARATION: /* scan until > FIXME: allow everything? */
            if (ch == '>') {
                s->tokState = T_IN_START_PCDATA;
                return s->tokType = DECLARATION_TOKEN;
            }
            /* Do not fall into the comment scanner: it would consume the
               characters following a '-' (possibly the closing '>') and
               could report a declaration as COMMENT_TOKEN. */
            break;

        case T_IN_COMMENT: /* scan until --> allow everything */
            if (ch == '-') {
                getChar(ch);
                if (ch == '-') {
                    getChar(ch);
                    if (ch == '>') {
                        s->tokState = T_IN_START_PCDATA;
                        return s->tokType = COMMENT_TOKEN;
                    }
                }
            }
            break;

        case T_IN_COMMAND: /* scan until ?> FIXME: allow everything? */
            if (ch == '?') {
                getChar(ch);
                if (ch == '>') {
                    s->tokState = T_IN_START_PCDATA;
                    return s->tokType = COMMAND_TOKEN;
                }
            }
            break;

        case T_IN_CDATA: /* scan until ]]> allow everything */
            if (ch == ']') {
                getChar(ch);
                if (ch == ']') {
                    getChar(ch);
                    if (ch == '>') {
                        s->tokState = T_IN_START_PCDATA;
                        return s->tokType = CDATA_TOKEN;
                    }
                }
            }
            break;

        case T_IN_NAME_STRING: /* <M^arker */
            if (!(isNameChar(ch))) {
                /* The terminating character belongs to the next token. */
                unGetChar(ch);
                s->tokState = T_IN_TAG;
                return s->tokType = NAME_TOKEN;
            }
            break;

        case T_IN_QUOTE_STRING: /* FIXME: handle "e; type things (and whitespace eating?) */
            if (ch == s->end_quote_char) {
                s->tokState = T_IN_TAG;
                return s->tokType = QUOTE_STRING_TOKEN;
            }
            break;
        }
    }

    /* Buffer exhausted without completing a token. */
    return s->tokType = s->tokState != T_IN_PCDATA ? PARTIAL_TOKEN : EOF_TOKEN;
}
// Parser lexer, called by yylex() int NetaParser::lex() { if (neta_ == NULL) { printf("Lexer called when no target NETA structure set.\n"); return 0; } int n; bool done; static QString token; char c; token.clear(); // Skip over whitespace while ((c = getChar()) == ' ' || c == '\t' || c == '\r' || c == '\n' ); if (c == 0) return 0; // Set this point as the start of our new token (for error reporting) tokenStart_ = stringPos_-1; /* * Integer number */ /* * Number Detection - Either '-', '.' or a digit begins a number */ bool nextCharIsPossibleDigit = (isdigit(peekChar()) || (peekChar() == '.')); if ((c == '.') || isdigit(c) || ((c == '-') && nextCharIsPossibleDigit)) { // Default to integer, unless first char is '.' bool integer = (c != '.'); bool hasexp = false; token += c; done = false; do { c = getChar(); if (isdigit(c)) token += c; else if (c == '.') { integer = false; token += '.'; } else if ((c == 'e') || (c == 'E')) { // Check for previous exponential in number if (hasexp) { Messenger::print("Error: Number has two exponentiations (e/E)."); return 0; } token += 'E'; hasexp = true; } else if ((c == '-') || (c == '+')) { // We allow '-' or '+' only as part of an exponentiation, so if it is not preceeded by 'E' we stop parsing if ((!token.isEmpty()) && (!token.endsWith("E"))) { unGetChar(); done = true; } else token += c; } else { unGetChar(); done = true; } } while (!done); // We now have the number as a text token... if (!hasexp) { if (integer) NetaParser_lval.intConst = token.toInt(); else NetaParser_lval.doubleConst = token.toDouble(); } else { // Exponentiations are always returned as a double integer = false; NetaParser_lval.doubleConst = token.toDouble(); } if (integer) Messenger::print(Messenger::Parse, "NETA : found an integer constant [%s] [%i]", qPrintable(token), NetaParser_lval.intConst); else Messenger::print(Messenger::Parse, "NETA : found a floating-point constant [%s] [%e]", qPrintable(token), NetaParser_lval.doubleConst); return (integer ? 
INTCONST : DOUBLECONST); } /* * Alphanumeric-token : Element symbol, NETA keyword, value, expander, or geometry */ if (isalpha (c)) { do { token += c; c = getChar(); } while (isalnum(c) || (c == '_')); unGetChar(); Messenger::print(Messenger::Typing, "NETA : found an alpha token [%s]...", qPrintable(token)); // Element Symbol (or 'Any') if (token == "Any") { NetaParser_lval.intConst = 0; Messenger::print(Messenger::Typing, "NETA : ...which is the any element symbol (Any)"); return ELEMENT; } for (n=0; n<ElementMap::nElements(); ++n) if (token == ElementMap::symbol(n)) break; if (n < ElementMap::nElements()) { NetaParser_lval.intConst = n; Messenger::print(Messenger::Typing, "NETA : ...which is a an element symbol (%i)",n); return ELEMENT; } // Is this a NETA keyword? Neta::NetaKeyword nk = Neta::netaKeyword(token, false); if (nk != Neta::nNetaKeywords) { Messenger::print(Messenger::Typing, "NETA : ...which is a keyword (->NETAKEY)"); NetaParser_lval.netaKey = nk; return NETAKEY; } // Is this a NETA geometry? Atom::AtomGeometry ag = Atom::atomGeometry(token, false); if (ag != Atom::nAtomGeometries) { Messenger::print(Messenger::Typing, "NETA : ...which is a geometry (->NETAGEOMETRYTYPE)"); NetaParser_lval.atomGeom = ag; return NETAGEOMETRYTYPE; } // Is this a NETA value? Neta::NetaValue nv = Neta::netaValue(token, false); if (nv == Neta::RepeatValue) { Messenger::print(Messenger::Typing, "NETA : ...which is a repeat value (->NETAREPEAT)"); NetaParser_lval.netaVal = nv; return NETAREPEAT; } else if (nv != Neta::nNetaValues) { Messenger::print(Messenger::Typing, "NETA : ...which is a value (->NETAVAL)"); NetaParser_lval.netaVal = nv; return NETAVAL; } // Is this a NETA expander? 
Neta::NetaExpander ne = Neta::netaExpander(token, false); if (ne != Neta::nNetaExpanders) { Messenger::print(Messenger::Typing, "NETA : ...which is an expander (->NETAEXP)"); if (ne == Neta::RingExpander) return NETARING; else if (ne == Neta::ChainExpander) return NETACHAIN; else if (ne == Neta::GeometryExpander) return NETAGEOMETRY; else if (ne == Neta::PathExpander) return NETAPATH; return 0; } // Is it a bond type? Bond::BondType bt = Bond::bondType(token, false); if (bt != Bond::nBondTypes) { Messenger::print(Messenger::Typing, "NETA : ...which is a bond type (->INTCONST, %i)", bt); NetaParser_lval.intConst = bt; return INTCONST; } // If we get to here then we have found an unrecognised alphanumeric token Messenger::print(Messenger::Typing, "NETA : ...which is unrecognised (->TOKEN)"); lastUnknownToken_ = token; return TOKEN; } /* We have found a symbolic character (or a pair) that corresponds to an operator */ // Return immediately in the case of brackets, commas etc. if ((c == '(') || (c == ')') || (c == ',') || (c == '[') || (c == ']') || (c == '-') || (c == '~') || (c == '&') || ( c == '!') || (c == '$') || (c == '=') || (c == '?') || (c == ':')) { Messenger::print(Messenger::Typing, "NETA : found symbol [%c]",c); return c; } token += c; // Similarly, if the next character is a bracket or double quotes, return immediately char c2 = peekChar(); if ((c2 == '(') || (c2 == ')') || (c2 == ';') || (c2 == '{') || (c2 == '}') || (c2 == '"')) return c; // If it is 'punctuation', add this second character to our operator and search for it if (ispunct(c2)) { c = getChar(); token += c; if (!NetaParser::quiet()) Messenger::print(Messenger::Typing, "NETA : found symbol [%s]", qPrintable(token)); NetaSymbolToken st = (NetaSymbolToken) enumSearch("", nNetaSymbolTokens, NetaSymbolTokenKeywords, token, !NetaParser::quiet()); if (st != nNetaSymbolTokens) return NetaSymbolTokenValues[st]; else if (!NetaParser::quiet()) Messenger::print("Error: Unrecognised symbol found in 
input (%s).", qPrintable(token)); } else { // Make sure that this is a known symbol if ((c == '$') || (c == '%') || (c == '&') || (c == '@') || (c == '?') || (c == ':')) { if (!NetaParser::quiet()) Messenger::print("Error: Unrecognised symbol found in input (%c).", c); } else return c; } return 0; }