Example #1
0
/*
 * Scan forward from s->buffer_position and return the type of the next token.
 *
 * The tokenizer is a state machine whose state lives in s->tokState, so it
 * carries over from one call to the next.  On every return the token type is
 * also stored in s->tokType, and s->token_start points at the first character
 * of the token.
 *
 * If the buffer is exhausted before a token is completed the function returns
 * EOF_TOKEN when the state is T_IN_PCDATA and PARTIAL_TOKEN in every other
 * state.
 *
 * getChar()/unGetChar() are defined elsewhere; presumably they read the
 * character at s->buffer_position and advance/retreat that pointer -- confirm.
 * NOTE(review): the getChar() calls made inside the individual states are not
 * guarded by the s->buffer_end check of the main loop -- confirm that the
 * macro itself (or a sentinel in the buffer) protects against over-reads.
 */
XMLCToken xmlcTokenizerNextToken(XMLCTokenizer *s)
{
    XMLCCharacter ch;
    XMLCCharacter *mark = s->token_start = s->buffer_position;
    XMLCUInt i;


    while (s->buffer_position < s->buffer_end) {

        getChar(ch);

        /* states that do their own getChar aren't allowed to eat these */
        /* NOTE(review): '\r' and '\n' are counted separately, so a CRLF pair
           bumps line_number twice. */
        if (ch == '\n' || ch == '\r') {
            s->line_number++;
            s->line_start = s->buffer_position;
        }

        /*   printf("char '%c' props %x s->tokState: %@",  ch, characterPropertiesTable[ch], [_tokenStateNameStrings objectAtIndex:s->tokState]); */
        switch (s->tokState) {

        /* FIXME: do I need this state? */
        case T_IN_START_PCDATA:
            s->token_start = mark;
            s->tokState = T_IN_PCDATA;
        /* fall through */

        case T_IN_PCDATA:
            if (ch == '<') { /* ^<Marker */
                /* leave the '<' in the buffer for the T_IN_TAG state */
                unGetChar(ch);
                s->tokState = T_IN_TAG;
                /* only emit a token if some text preceded the '<' */
                if (s->buffer_position > s->token_start) {
                    if(s->isWhiteSpaceToken)
                        return s->tokType = WHITESPACE_TOKEN;
                    else
                        return s->tokType = PCDATA_TOKEN;
                }
            }
            if(!isWhiteSpace(ch))
                s->isWhiteSpaceToken = 0;
            break;

        case T_IN_TAG:   /* ^<Marker */
            if (ch == '<') {
                getChar(ch);
                if (ch == '?') {  /* <? */
                    s->token_start = mark;
                    s->tokState = T_IN_COMMAND;
                } else if (ch == '/') {
                    return s->tokType = OPEN_SLASH_ELEMENT_TOKEN;  /* </ */
                } else if (ch == '!') {
                    s->token_start = mark;
                    s->tokState = T_IN_DECLARATION;
                    getChar(ch);
                    if (ch =='-') {
                        getChar(ch);
                        if (ch == '-') { /* <!-- */
                            s->token_start = mark;
                            s->tokState = T_IN_COMMENT;
                        }
                    } else if (ch == '[') {
                        /* tentatively a CDATA section; demoted to a plain
                           declaration as soon as the keyword stops matching */
                        s->tokState = T_IN_CDATA;
                        for(i=0; i<6 && s->tokState == T_IN_CDATA; i++) {
                            getChar(ch);
                            /* compare the character itself: the previous
                               `!ch == ...` negated ch first and therefore
                               never detected a mismatch */
                            if (ch != "CDATA["[i])
                                s->tokState = T_IN_DECLARATION;
                        }
                        if (s->tokState == T_IN_CDATA)
                            s->token_start = mark;
                    } else { /* <! */
                        unGetChar(ch);
                        s->token_start = mark;
                        s->tokState = T_IN_DECLARATION;
                    }
                }
                /* state untouched: a plain '<' followed by something else */
                if (s->tokState == T_IN_TAG) { /* < */
                    unGetChar(ch);
                    return s->tokType = OPEN_ELEMENT_TOKEN;
                }

            } else if (ch == '>') {
                s->token_start = s->buffer_position;
                s->isWhiteSpaceToken = 1;
                s->tokState = T_IN_PCDATA;
                return s->tokType =  CLOSE_ELEMENT_TOKEN;
            } else if (ch == '/') {
                getChar(ch);
                if (ch =='>') {
                    s->token_start = mark;
                    s->tokState = T_IN_PCDATA;
                    return s->tokType = SLASH_CLOSE_ELEMENT_TOKEN;
                } else {
                    unGetChar(ch);
                    return s->tokType = '/'; /* FIXME: is this an error, the parser should complain */
                }
            } else if (ch == '=') {
                s->token_start = mark;
                return s->tokType = '=';
            } else if (ch == '"') {
                s->token_start = mark;
                s->end_quote_char = '"';
                s->tokState = T_IN_QUOTE_STRING;
            } else if (ch == 0x0027) { /*#	APOSTROPHE */
                s->token_start = mark;
                s->end_quote_char = 0x0027;
                s->tokState = T_IN_QUOTE_STRING;
            } else if (isNameStartChar(ch)) {
                /* FIXME: whitespace tokens inside tags aren't recognized */
                /* need to handle whitespace tokens in tags, could eliminate s->buffer_position-1 and use _mark */
                s->token_start = s->buffer_position-1;
                s->tokState = T_IN_NAME_STRING;
            } else if (!isWhiteSpace(ch)) {
                /* don't know what this is, return it as itself, let the parser deal with it */
                return s->tokType = ch;
            } else {
                /* whitespace inside a tag: skip it and move the mark along */
                mark = s->buffer_position;
            }
            break;

        case T_IN_DECLARATION: /* scan until >  FIXME: allow everything? */
            if (ch == '>') {
                s->tokState = T_IN_START_PCDATA;
                return s->tokType = DECLARATION_TOKEN;
            }
            /* must not fall into the comment scanner: that would swallow the
               '>' of a declaration ending in "->" and would report "-->"
               inside a declaration as COMMENT_TOKEN */
            break;

        case T_IN_COMMENT: /* scan until --> allow everything */
            /* NOTE(review): the look-ahead characters are consumed even when
               they do not complete the terminator, so e.g. "--->" is missed;
               the same applies to "??>" and "]]]>" in the states below. */
            if (ch == '-') {
                getChar(ch);
                if (ch == '-') {
                    getChar(ch);
                    if (ch == '>') {
                        s->tokState = T_IN_START_PCDATA;
                        return s->tokType = COMMENT_TOKEN;
                    }
                }
            }
            break;

        case T_IN_COMMAND: /* scan until ?>  FIXME: allow everything? */
            if (ch == '?') {
                getChar(ch);
                if (ch == '>') {
                    s->tokState = T_IN_START_PCDATA;
                    return s->tokType = COMMAND_TOKEN;
                }
            }
            break;

        case T_IN_CDATA: /* scan until ]]> allow everything */
            if (ch == ']') {
                getChar(ch);
                if (ch == ']') {
                    getChar(ch);
                    if (ch == '>') {
                        s->tokState = T_IN_START_PCDATA;
                        return s->tokType = CDATA_TOKEN;
                    }
                }
            }
            break;

        case T_IN_NAME_STRING:   /* <M^arker */
            if (!(isNameChar(ch))) {
                /* the terminating character belongs to the next token */
                unGetChar(ch);
                s->tokState = T_IN_TAG;
                return s->tokType = NAME_TOKEN;
            }
            break;

        case T_IN_QUOTE_STRING:   /* FIXME: handle &quote; type things (and whitespace eating?) */
            if (ch == s->end_quote_char) {
                s->tokState = T_IN_TAG;
                return s->tokType = QUOTE_STRING_TOKEN;
            }
            break;
        }
    }
    /* buffer exhausted: plain text means end of input, anything else is an
       unfinished token */
    return s->tokType = s->tokState != T_IN_PCDATA ? PARTIAL_TOKEN : EOF_TOKEN;
}
Example #2
0
// Parser lexer, called by yylex()
//
// Reads characters through getChar()/peekChar()/unGetChar() and returns the
// code of the next token, storing any associated value in NetaParser_lval:
//   - numbers               -> INTCONST (intConst) or DOUBLECONST (doubleConst)
//   - alphanumeric words    -> ELEMENT, NETAKEY, NETAGEOMETRYTYPE, NETAREPEAT,
//                              NETAVAL, NETARING/NETACHAIN/NETAGEOMETRY/NETAPATH,
//                              INTCONST (bond type) or TOKEN (unrecognised)
//   - symbols               -> the character itself, or the value looked up in
//                              NetaSymbolTokenValues for a two-character symbol
// Returns 0 when no target structure is set, when getChar() yields 0, or on a
// lexing error.
//
// All <cctype> classification calls go through unsigned char: passing a
// negative plain char (any non-ASCII byte where char is signed) to them is
// undefined behaviour.
int NetaParser::lex()
{
	if (neta_ == NULL)
	{
		printf("Lexer called when no target NETA structure set.\n");
		return 0;
	}

	int n;
	bool done;
	static QString token;
	char c;
	token.clear();

	// Skip over whitespace
	while ((c = getChar()) == ' ' || c == '\t' || c == '\r' || c == '\n' );

	// A zero character is treated as "nothing left to lex"
	if (c == 0) return 0;

	// Set this point as the start of our new token (for error reporting)
	tokenStart_ = stringPos_-1;

	/*
	 * Number Detection - Either '-', '.' or a digit begins a number
	 * (a leading '-' only counts when a digit or '.' follows it)
	 */
	bool nextCharIsPossibleDigit = (isdigit(static_cast<unsigned char>(peekChar())) || (peekChar() == '.'));
	if ((c == '.') || isdigit(static_cast<unsigned char>(c)) || ((c == '-') && nextCharIsPossibleDigit))
	{
		// Default to integer, unless first char is '.'
		bool integer = (c != '.');
		bool hasexp = false;
		token += c;
		done = false;
		do
		{
			c = getChar();
			if (isdigit(static_cast<unsigned char>(c))) token += c;
			else if (c == '.')
			{
				integer = false;
				token += '.';
			}
			else if ((c == 'e') || (c == 'E'))
			{
				// Check for previous exponential in number
				if (hasexp)
				{
					Messenger::print("Error: Number has two exponentiations (e/E).");
					return 0;
				}
				// The exponent marker is always stored as upper case, which
				// the sign check below relies on
				token += 'E';
				hasexp = true;
			}
			else if ((c == '-') || (c == '+'))
			{
				// We allow '-' or '+' only as part of an exponentiation, so if it is not preceded by 'E' we stop parsing
				if ((!token.isEmpty()) && (!token.endsWith("E")))
				{
					unGetChar();
					done = true;
				}
				else token += c;
			}
			else
			{
				// Anything else ends the number and belongs to the next token
				unGetChar();
				done = true;
			}
		} while (!done);

		// We now have the number as a text token...
		if (!hasexp)
		{
			if (integer) NetaParser_lval.intConst = token.toInt();
			else NetaParser_lval.doubleConst = token.toDouble();
		}
		else
		{
			// Exponentiations are always returned as a double
			integer = false;
			NetaParser_lval.doubleConst = token.toDouble();
		}
		if (integer) Messenger::print(Messenger::Parse, "NETA : found an integer constant [%s] [%i]", qPrintable(token), NetaParser_lval.intConst);
		else Messenger::print(Messenger::Parse, "NETA : found a floating-point constant [%s] [%e]", qPrintable(token), NetaParser_lval.doubleConst);
		return (integer ? INTCONST : DOUBLECONST);
	}

	/*
	 * Alphanumeric-token : Element symbol, NETA keyword, value, expander, or geometry
	 */
	if (isalpha(static_cast<unsigned char>(c)))
	{
		// Collect letters, digits and underscores; the first character that
		// does not fit is pushed back
		do
		{
			token += c;
			c = getChar();
		}
		while (isalnum(static_cast<unsigned char>(c)) || (c == '_'));
		unGetChar();
		Messenger::print(Messenger::Typing, "NETA : found an alpha token [%s]...", qPrintable(token));

		// Element Symbol (or 'Any')
		if (token == "Any")
		{
			NetaParser_lval.intConst = 0;
			Messenger::print(Messenger::Typing, "NETA : ...which is the any element symbol (Any)");
			return ELEMENT;
		}
		for (n=0; n<ElementMap::nElements(); ++n) if (token == ElementMap::symbol(n)) break;
		if (n < ElementMap::nElements())
		{
			NetaParser_lval.intConst = n;
			Messenger::print(Messenger::Typing, "NETA : ...which is a an element symbol (%i)",n);
			return ELEMENT;
		}

		// Is this a NETA keyword?
		Neta::NetaKeyword nk = Neta::netaKeyword(token, false);
		if (nk != Neta::nNetaKeywords)
		{
			Messenger::print(Messenger::Typing, "NETA : ...which is a keyword (->NETAKEY)");
			NetaParser_lval.netaKey = nk;
			return NETAKEY;
		}

		// Is this a NETA geometry?
		Atom::AtomGeometry ag = Atom::atomGeometry(token, false);
		if (ag != Atom::nAtomGeometries)
		{
			Messenger::print(Messenger::Typing, "NETA : ...which is a geometry (->NETAGEOMETRYTYPE)");
			NetaParser_lval.atomGeom = ag;
			return NETAGEOMETRYTYPE;
		}

		// Is this a NETA value?
		Neta::NetaValue nv = Neta::netaValue(token, false);
		if (nv == Neta::RepeatValue)
		{
			Messenger::print(Messenger::Typing, "NETA : ...which is a repeat value (->NETAREPEAT)");
			NetaParser_lval.netaVal = nv;
			return NETAREPEAT;
		}
		else if (nv != Neta::nNetaValues)
		{
			Messenger::print(Messenger::Typing, "NETA : ...which is a value (->NETAVAL)");
			NetaParser_lval.netaVal = nv;
			return NETAVAL;
		}

		// Is this a NETA expander?
		Neta::NetaExpander ne = Neta::netaExpander(token, false);
		if (ne != Neta::nNetaExpanders)
		{
			Messenger::print(Messenger::Typing, "NETA : ...which is an expander (->NETAEXP)");
			if (ne == Neta::RingExpander) return NETARING;
			else if (ne == Neta::ChainExpander) return NETACHAIN;
			else if (ne == Neta::GeometryExpander) return NETAGEOMETRY;
			else if (ne == Neta::PathExpander) return NETAPATH;
			// An expander with no dedicated token code
			return 0;
		}

		// Is it a bond type?
		Bond::BondType bt = Bond::bondType(token, false);
		if (bt != Bond::nBondTypes)
		{
			Messenger::print(Messenger::Typing, "NETA : ...which is a bond type (->INTCONST, %i)", bt);
			NetaParser_lval.intConst = bt;
			return INTCONST;
		}

		// If we get to here then we have found an unrecognised alphanumeric token
		Messenger::print(Messenger::Typing, "NETA : ...which is unrecognised (->TOKEN)");
		lastUnknownToken_ = token;
		return TOKEN;
	}

	/* We have found a symbolic character (or a pair) that corresponds to an operator */
	// Return immediately in the case of brackets, commas etc.
	if ((c == '(') || (c == ')') || (c == ',') || (c == '[') || (c == ']') || (c == '-') || (c == '~') || (c == '&') || ( c == '!') || (c == '$') || (c == '=') || (c == '?') || (c == ':'))
	{
		Messenger::print(Messenger::Typing, "NETA : found symbol [%c]",c);
		return c;
	}
	token += c;

	// Similarly, if the next character is a bracket or double quotes, return immediately
	char c2 = peekChar();
	if ((c2 == '(') || (c2 == ')') || (c2 == ';') || (c2 == '{') || (c2 == '}') || (c2 == '"')) return c;

	// If it is 'punctuation', add this second character to our operator and search for it
	if (ispunct(static_cast<unsigned char>(c2)))
	{
		c = getChar();
		token += c;
		if (!NetaParser::quiet()) Messenger::print(Messenger::Typing, "NETA : found symbol [%s]", qPrintable(token));
		NetaSymbolToken st = (NetaSymbolToken) enumSearch("", nNetaSymbolTokens, NetaSymbolTokenKeywords, token, !NetaParser::quiet());
		if (st != nNetaSymbolTokens) return NetaSymbolTokenValues[st];
		else if (!NetaParser::quiet()) Messenger::print("Error: Unrecognised symbol found in input (%s).", qPrintable(token));
	}
	else
	{
		// Make sure that this is a known symbol
		// ('$', '&', '?' and ':' were already returned by the single-symbol
		// check above, so only '%' and '@' can match here)
		if ((c == '$') || (c == '%') || (c == '&') || (c == '@') || (c == '?') || (c == ':'))
		{
			if (!NetaParser::quiet()) Messenger::print("Error: Unrecognised symbol found in input (%c).", c);
		}
		else return c;
	}
	return 0;
}