//--------------------------------
	/**
	 * Tells whether @a c is acceptable as a non-initial character of a name.
	 *
	 * A character qualifies when isNameStartChar() accepts it, or when it is
	 * '-', '.', an ASCII digit, U+00B7, or lies in one of the ranges
	 * U+0300..U+036F and U+203F..U+2040.
	 *
	 * @param c  Character to classify.
	 * @return   true if @a c belongs to one of the sets above, false otherwise.
	 */
	bool StringUtils::isNameChar( wchar_t c )
	{
		// Anything that may start a name may also continue one.
		if ( isNameStartChar( c ) )
			return true;

		// Individual extra characters.
		if ( c == '-' || c == '.' || c == 0xB7 )
			return true;

		// Extra ranges.
		const bool isAsciiDigit   = ( '0' <= c && c <= '9' );
		const bool inRange0300    = ( 0x0300 <= c && c <= 0x036F );
		const bool inRange203F    = ( 0x203F <= c && c <= 0x2040 );

		return isAsciiDigit || inRange0300 || inRange203F;
	}
Example #2
0
/*
 * Scan the tokenizer's buffer and return the next token.
 *
 * Characters are read from s->buffer_position while it is below
 * s->buffer_end and are dispatched on s->tokState.  The state is kept in the
 * tokenizer, so a later call resumes in whatever state this call left behind.
 * The token type is stored in s->tokType and also returned.  s->token_start
 * is set to the position at entry and is moved by the individual states.
 *
 * Inside a tag, '=', a lone '/', and any non-whitespace character that no
 * other rule handles are returned as their own character value, leaving the
 * parser to deal with them.
 *
 * If the buffer is exhausted before a token is completed, the result is
 * EOF_TOKEN when the state is T_IN_PCDATA and PARTIAL_TOKEN in every other
 * state.
 *
 * NOTE(review): the nested getChar() calls inside the states are not checked
 * against s->buffer_end in this function; presumably the macro or the caller
 * guarantees enough look-ahead -- confirm.
 * NOTE(review): '\n' and '\r' each bump s->line_number, so a CR LF pair is
 * counted as two lines.
 */
XMLCToken xmlcTokenizerNextToken(XMLCTokenizer *s)
{
    XMLCCharacter ch;
    XMLCCharacter *mark = s->token_start = s->buffer_position;
    XMLCUInt i;


    while (s->buffer_position < s->buffer_end) {

        getChar(ch);

        /* states that do their own getChar aren't allowed to eat these */
        if (ch == '\n' || ch == '\r') {
            s->line_number++;
            s->line_start = s->buffer_position;
        }

        /*   printf("char '%c' props %x s->tokState: %@",  ch, characterPropertiesTable[ch], [_tokenStateNameStrings objectAtIndex:s->tokState]); */
        switch (s->tokState) {

        /* FIXME: do I need this state? */
        case T_IN_START_PCDATA:
            /* Anchor the text token at the entry position, then scan as PCDATA. */
            s->token_start = mark;
            s->tokState = T_IN_PCDATA;
        /* fall through */

        case T_IN_PCDATA:
            if (ch == '<') { /* ^<Marker */
                /* Leave the '<' for the T_IN_TAG state to read again. */
                unGetChar(ch);
                s->tokState = T_IN_TAG;
                /* Only emit a text token if something precedes the '<'. */
                if (s->buffer_position > s->token_start) {
                    if(s->isWhiteSpaceToken)
                        return s->tokType = WHITESPACE_TOKEN;
                    else
                        return s->tokType = PCDATA_TOKEN;
                }
            }
            if(!isWhiteSpace(ch))
                s->isWhiteSpaceToken = 0;
            break;

        case T_IN_TAG:   /* ^<Marker */
            if (ch == '<') {
                getChar(ch);
                if (ch == '?') {  /* <? */
                    s->token_start = mark;
                    s->tokState = T_IN_COMMAND;
                } else if (ch == '/') {
                    return s->tokType = OPEN_SLASH_ELEMENT_TOKEN;  /* </ */
                } else if (ch == '!') {
                    s->token_start = mark;
                    s->tokState = T_IN_DECLARATION;
                    getChar(ch);
                    if (ch =='-') {
                        getChar(ch);
                        if (ch == '-') { /* <!-- */
                            s->token_start = mark;
                            s->tokState = T_IN_COMMENT;
                        }
                    } else if (ch == '[') {
                        /*
                         * Tentatively assume "<![CDATA[" and verify the six
                         * marker characters; the first mismatch demotes the
                         * construct to an ordinary declaration.
                         */
                        s->tokState = T_IN_CDATA;
                        for(i=0; i<6 && s->tokState == T_IN_CDATA; i++) {
                            getChar(ch);
                            /* '!=' is required here: '!ch == x' would negate ch first. */
                            if (ch != "CDATA["[i])
                                s->tokState = T_IN_DECLARATION;
                        }
                        if (s->tokState == T_IN_CDATA)
                            s->token_start = mark;
                    } else { /* <! */
                        unGetChar(ch);
                        s->token_start = mark;
                        s->tokState = T_IN_DECLARATION;
                    }
                }
                /* State unchanged: a plain '<' opening an element. */
                if (s->tokState == T_IN_TAG) { /* < */
                    unGetChar(ch);
                    return s->tokType = OPEN_ELEMENT_TOKEN;
                }

            } else if (ch == '>') {
                /* Text following the tag starts here and is whitespace until proven otherwise. */
                s->token_start = s->buffer_position;
                s->isWhiteSpaceToken = 1;
                s->tokState = T_IN_PCDATA;
                return s->tokType =  CLOSE_ELEMENT_TOKEN;
            } else if (ch == '/') {
                getChar(ch);
                if (ch =='>') {
                    s->token_start = mark;
                    s->tokState = T_IN_PCDATA;
                    return s->tokType = SLASH_CLOSE_ELEMENT_TOKEN;
                } else {
                    unGetChar(ch);
                    return s->tokType = '/'; /* FIXME: is this an error, the parser should complain */
                }
            } else if (ch == '=') {
                s->token_start = mark;
                return s->tokType = '=';
            } else if (ch == '"') {
                s->token_start = mark;
                s->end_quote_char = '"';
                s->tokState = T_IN_QUOTE_STRING;
            } else if (ch == 0x0027) { /*#	APOSTROPHE */
                s->token_start = mark;
                s->end_quote_char = 0x0027;
                s->tokState = T_IN_QUOTE_STRING;
            } else if (isNameStartChar(ch)) {
                /* FIXME: whitespace tokens inside tags aren't recognized */
                /* need to handle whitespace tokens in tags, could eliminate s->buffer_position-1 and use _mark */
                s->token_start = s->buffer_position-1;
                s->tokState = T_IN_NAME_STRING;
            } else if (!isWhiteSpace(ch)) {
                /* don't know what this is, return it as itself, let the parser deal with it */
                return s->tokType = ch;
            } else {
                /* Skip whitespace inside the tag: the next token starts after it. */
                mark = s->buffer_position;
            }
            break;

        case T_IN_DECLARATION: /* scan until >  FIXME: allow everything? */
            if (ch == '>') {
                s->tokState = T_IN_START_PCDATA;
                return s->tokType = DECLARATION_TOKEN;
            }
            /*
             * Must not fall into the comment scanner: it would consume the
             * characters after a '-' (including a terminating '>') and could
             * end the declaration as a COMMENT_TOKEN.
             */
            break;

        case T_IN_COMMENT: /* scan until --> allow everything */
            if (ch == '-') {
                getChar(ch);
                if (ch == '-') {
                    getChar(ch);
                    if (ch == '>') {
                        s->tokState = T_IN_START_PCDATA;
                        return s->tokType = COMMENT_TOKEN;
                    }
                }
            }
            break;

        case T_IN_COMMAND: /* scan until ?>  FIXME: allow everything? */
            if (ch == '?') {
                getChar(ch);
                if (ch == '>') {
                    s->tokState = T_IN_START_PCDATA;
                    return s->tokType = COMMAND_TOKEN;
                }
            }
            break;

        case T_IN_CDATA: /* scan until ]]> allow everything */
            if (ch == ']') {
                getChar(ch);
                if (ch == ']') {
                    getChar(ch);
                    if (ch == '>') {
                        s->tokState = T_IN_START_PCDATA;
                        return s->tokType = CDATA_TOKEN;
                    }
                }
            }
            break;

        case T_IN_NAME_STRING:   /* <M^arker */
            if (!(isNameChar(ch))) {
                /* The terminating character belongs to the next token. */
                unGetChar(ch);
                s->tokState = T_IN_TAG;
                return s->tokType = NAME_TOKEN;
            }
            break;

        case T_IN_QUOTE_STRING:   /* FIXME: handle &quote; type things (and whitespace eating?) */
            /* Ends on the same quote character that opened the string. */
            if (ch == s->end_quote_char) {
                s->tokState = T_IN_TAG;
                return s->tokType = QUOTE_STRING_TOKEN;
            }
            break;
        }
    }
    /* Buffer exhausted: only plain text may legitimately end here. */
    return s->tokType = s->tokState != T_IN_PCDATA ? PARTIAL_TOKEN : EOF_TOKEN;
}