//-------------------------------- bool StringUtils::isNameChar( wchar_t c ) { return isNameStartChar( c ) || ( c == '-' ) || ( c == '.' ) || ( c >= '0' && c <= '9' ) || ( c == 0xB7 ) || ( c >= 0x0300 && c <= 0x036F ) || ( c >= 0x203F && c <= 0x2040 ); }
/*
 * Scan the tokenizer's buffer from s->buffer_position towards s->buffer_end
 * and return the type of the next token found.
 *
 * The scanner is a state machine driven by s->tokState, which persists between
 * calls.  On every return the token type is also stored in s->tokType, and
 * s->token_start points at the position the scanner recorded as the start of
 * the token.  s->line_number and s->line_start are updated for each '\n' or
 * '\r' read at the top of the main loop.
 *
 * Returns:
 *   - a token type (WHITESPACE_TOKEN, PCDATA_TOKEN, NAME_TOKEN, ...) or, for
 *     '=' , a lone '/' and unrecognised characters inside a tag, the character
 *     itself;
 *   - EOF_TOKEN when the buffer is exhausted while in state T_IN_PCDATA;
 *   - PARTIAL_TOKEN when the buffer is exhausted in any other state.
 *
 * getChar()/unGetChar() are defined outside this function; they are presumably
 * macros that read from / step back s->buffer_position -- confirm against
 * their definition.
 */
XMLCToken xmlcTokenizerNextToken(XMLCTokenizer *s)
{
    XMLCCharacter ch;
    XMLCCharacter *mark = s->token_start = s->buffer_position;
    XMLCUInt i;

    while (s->buffer_position < s->buffer_end) {
        getChar(ch);

        /* states that do their own getChar aren't allowed to eat these */
        if (ch == '\n' || ch == '\r') {
            s->line_number++;
            s->line_start = s->buffer_position;
        }

        /* printf("char '%c' props %x s->tokState: %@", ch, characterPropertiesTable[ch], [_tokenStateNameStrings objectAtIndex:s->tokState]); */

        switch (s->tokState) {
        /* FIXME: do I need this state? */
        case T_IN_START_PCDATA:
            s->token_start = mark;
            s->tokState = T_IN_PCDATA;
            /* fall through */

        case T_IN_PCDATA:
            if (ch == '<') { /* ^<Marker */
                /* Leave the '<' in the buffer for the T_IN_TAG state. */
                unGetChar(ch);
                s->tokState = T_IN_TAG;
                /* Only emit a token if some text preceded the '<'. */
                if (s->buffer_position > s->token_start) {
                    if (s->isWhiteSpaceToken)
                        return s->tokType = WHITESPACE_TOKEN;
                    else
                        return s->tokType = PCDATA_TOKEN;
                }
            }
            if (!isWhiteSpace(ch))
                s->isWhiteSpaceToken = 0;
            break;

        case T_IN_TAG: /* ^<Marker */
            if (ch == '<') {
                getChar(ch);
                if (ch == '?') { /* <? */
                    s->token_start = mark;
                    s->tokState = T_IN_COMMAND;
                } else if (ch == '/') {
                    return s->tokType = OPEN_SLASH_ELEMENT_TOKEN; /* </ */
                } else if (ch == '!') {
                    s->token_start = mark;
                    s->tokState = T_IN_DECLARATION;
                    getChar(ch);
                    if (ch == '-') {
                        getChar(ch);
                        if (ch == '-') { /* <!-- */
                            s->token_start = mark;
                            s->tokState = T_IN_COMMENT;
                        }
                    } else if (ch == '[') {
                        /*
                         * Tentatively assume "<![CDATA[" and demote to a plain
                         * declaration at the first character that does not
                         * match.  The comparison must be "ch != literal": the
                         * previous "!ch == literal" negated ch first and so
                         * never detected a mismatch.
                         */
                        s->tokState = T_IN_CDATA;
                        for (i = 0; i < 6 && s->tokState == T_IN_CDATA; i++) {
                            getChar(ch);
                            if (ch != "CDATA["[i])
                                s->tokState = T_IN_DECLARATION;
                        }
                        if (s->tokState == T_IN_CDATA)
                            s->token_start = mark;
                    } else { /* <! */
                        unGetChar(ch);
                        s->token_start = mark;
                        s->tokState = T_IN_DECLARATION;
                    }
                }
                /* State unchanged: a plain '<' followed by something else. */
                if (s->tokState == T_IN_TAG) { /* < */
                    unGetChar(ch);
                    return s->tokType = OPEN_ELEMENT_TOKEN;
                }
            } else if (ch == '>') {
                s->token_start = s->buffer_position;
                s->isWhiteSpaceToken = 1;
                s->tokState = T_IN_PCDATA;
                return s->tokType = CLOSE_ELEMENT_TOKEN;
            } else if (ch == '/') {
                getChar(ch);
                if (ch == '>') {
                    s->token_start = mark;
                    s->tokState = T_IN_PCDATA;
                    return s->tokType = SLASH_CLOSE_ELEMENT_TOKEN;
                } else {
                    unGetChar(ch);
                    return s->tokType = '/'; /* FIXME: is this an error, the parser should complain */
                }
            } else if (ch == '=') {
                s->token_start = mark;
                return s->tokType = '=';
            } else if (ch == '"') {
                s->token_start = mark;
                s->end_quote_char = '"';
                s->tokState = T_IN_QUOTE_STRING;
            } else if (ch == 0x0027) { /*# APOSTROPHE */
                s->token_start = mark;
                s->end_quote_char = 0x0027;
                s->tokState = T_IN_QUOTE_STRING;
            } else if (isNameStartChar(ch)) {
                /* FIXME: whitespace tokens inside tags aren't recognized */
                /* need to handle whitespace tokens in tags, could eliminate s->buffer_position-1 and use _mark */
                s->token_start = s->buffer_position - 1;
                s->tokState = T_IN_NAME_STRING;
            } else if (!isWhiteSpace(ch)) {
                /* don't know what this is, return it as itself, let the parser deal with it */
                return s->tokType = ch;
            } else {
                /* Skip whitespace inside the tag: the next token starts after it. */
                mark = s->buffer_position;
            }
            break;

        case T_IN_DECLARATION: /* scan until > FIXME: allow everything? */
            if (ch == '>') {
                s->tokState = T_IN_START_PCDATA;
                return s->tokType = DECLARATION_TOKEN;
            }
            /*
             * Must not fall into T_IN_COMMENT: its '-' look-ahead would
             * swallow the characters after a '-' (including the closing '>')
             * and could report a declaration as a COMMENT_TOKEN.
             */
            break;

        case T_IN_COMMENT: /* scan until --> allow everything */
            if (ch == '-') {
                getChar(ch);
                if (ch == '-') {
                    getChar(ch);
                    if (ch == '>') {
                        s->tokState = T_IN_START_PCDATA;
                        return s->tokType = COMMENT_TOKEN;
                    }
                }
            }
            break;

        case T_IN_COMMAND: /* scan until ?> FIXME: allow everything? */
            if (ch == '?') {
                getChar(ch);
                if (ch == '>') {
                    s->tokState = T_IN_START_PCDATA;
                    return s->tokType = COMMAND_TOKEN;
                }
            }
            break;

        case T_IN_CDATA: /* scan until ]]> allow everything */
            if (ch == ']') {
                getChar(ch);
                if (ch == ']') {
                    getChar(ch);
                    if (ch == '>') {
                        s->tokState = T_IN_START_PCDATA;
                        return s->tokType = CDATA_TOKEN;
                    }
                }
            }
            break;

        case T_IN_NAME_STRING: /* <M^arker */
            if (!(isNameChar(ch))) {
                /* The terminating character belongs to the next token. */
                unGetChar(ch);
                s->tokState = T_IN_TAG;
                return s->tokType = NAME_TOKEN;
            }
            break;

        case T_IN_QUOTE_STRING:
            /* FIXME: handle &quot; type things (and whitespace eating?) */
            if (ch == s->end_quote_char) {
                s->tokState = T_IN_TAG;
                return s->tokType = QUOTE_STRING_TOKEN;
            }
            break;
        }
    }

    /* Buffer exhausted: anything other than plain PCDATA is an unfinished token. */
    return s->tokType = s->tokState != T_IN_PCDATA ? PARTIAL_TOKEN : EOF_TOKEN;
}