/**
 * Checks for the next tag and dispatches to the matching handler.
 *
 * The result of skip_space() must be '<'; otherwise an error is reported
 * and the position is put into STATE_ERROR. The symbol that follows the
 * '<' selects the handler: '!' -> handle_decl(), '?' -> handle_pi(),
 * '/' -> handle_endtag(). Any other symbol is pushed back with
 * ungetsymbol() and handle_starttag() takes over. EOF right after '<' is
 * reported as an error.
 *
 * @param ppos input stream position; must be non-NULL and in STATE_BEFORE
 */
static void proc_before(PPOS *ppos)
{
    int sym;

    assert(ppos != NULL);
    assert(ppos->state == STATE_BEFORE);

    /* Guard: a tag has to open with '<'. */
    sym = skip_space(ppos);
    if (sym != '<') {
        xml_error(ppos, "Expecting '<'");
        ppos->state = STATE_ERROR;
        return;
    }

    sym = getsymbol(ppos);
    if (sym == EOF) {
        xml_error(ppos, "Unexpected EOF");
        ppos->state = STATE_ERROR;
    } else if (sym == '!') {
        handle_decl(ppos);
    } else if (sym == '?') {
        handle_pi(ppos);
    } else if (sym == '/') {
        handle_endtag(ppos);
    } else {
        /* Part of the tag name: give it back before parsing the start tag. */
        ungetsymbol(ppos, sym);
        handle_starttag(ppos);
    }
}
/**
 * Incrementally parses a start tag ("<name attr=value ...>").
 *
 * The function is resumable: when the available data [curp, endp) runs out
 * before the tag is complete it records how far it got in
 * parser->state.offset, keeps its mode in the start-tag state, and returns
 * NULL. A later call with the same *state_data picks up at that offset.
 *
 * Modes, processed in order inside one loop iteration (a mode may fall
 * through into the next one without re-entering the loop):
 *   TAG      -> scan the tag name
 *   BEGNAME  -> skip whitespace, start a new attribute
 *   GETNAME  -> scan the attribute name
 *   GETEQUAL -> after whitespace following a name, look for '='
 *   BEGVALUE -> skip whitespace, detect optional opening quote
 *   GETVALUE -> scan the (quoted or bare) value
 *   SUCK     -> malformed input: skip ahead to the next '<' or '>'
 *
 * While parsing, attr->name.str and attr->val.str do not hold real
 * pointers; they hold the offset of the text relative to curp, encoded as
 * (char *)NULL + offset (see the comment at the top of the file that the
 * original "dragons" remark refers to).
 * NOTE(review): this encoding relies on null-pointer arithmetic and an
 * (int) cast of a pointer; it is kept unchanged here because the decoding
 * side (scroll_attribute / handle_starttag) is not visible in this block.
 *
 * @param parser      parser instance; supplies the start-tag state storage,
 *                    the resume offset and the attribute allocator
 * @param state_data  in/out: NULL on the first call for a tag; set to the
 *                    parser's start-tag state while the tag is in progress
 *                    and reset to NULL once the tag has been handled
 * @param curp        start of the tag; *curp must be '<' followed by a letter
 * @param endp        one past the last available byte (at least 3 bytes
 *                    must be available)
 * @param baddata     not used by this function
 *
 * @return NULL if more data is needed; otherwise the position at which the
 *         caller should continue: the terminating '<' itself, or the byte
 *         after the terminating '>'.
 */
char *ekhtml_parse_starttag(ekhtml_parser_t *parser, void **state_data,
                            char *curp, char *endp, int *baddata)
{
    ekhtml_starttag_state *startstate = *state_data;
    int *offset = &parser->state.offset;
    char *workp;

    /* Cast to unsigned char: passing a negative char to isalpha() is
       undefined behaviour. */
    assert(*curp == '<' && isalpha((unsigned char)*(curp + 1)));
    assert(endp - curp >= 3);

    if(startstate == NULL){
        /* First time the tag is called */
        startstate = &parser->startstate;
        /* Start scanning just past '<' and the first letter of the name */
        startstate->tagend = sizeof("<F") - 1;
        startstate->mode = EKHTML_STMODE_TAG;
        startstate->attrs = NULL;
        startstate->curattr = NULL;
        startstate->quote = '\0';
        *state_data = startstate;
        *offset = startstate->tagend;
    }

    /* Resume where the previous call stopped */
    workp = curp + *offset;

    if(startstate->mode == EKHTML_STMODE_TAG){
        /* Find the end of the tag name */
        workp = ekhtml_find_notcharsmap(workp, endp - workp, EKCMap_CharMap,
                                        EKHTML_CHAR_TAGNAME);
        *offset = workp - curp;
        if(workp == endp)
            return NULL;

        startstate->tagend = *offset;
        startstate->mode = EKHTML_STMODE_BEGNAME;
    }

    while(workp != endp){
        /* Main state processing loop */
        if(startstate->mode == EKHTML_STMODE_BEGNAME){
            ekhtml_attr_t *attr;

            workp = ekhtml_find_notcharsmap(workp, endp - workp,
                                            EKCMap_CharMap,
                                            EKHTML_CHAR_WHITESPACE);
            if(workp == endp)
                break;

            if(!(EKCMap_CharMap[(unsigned char)*workp] &
                 EKHTML_CHAR_BEGATTRNAME))
            {
                /* Bad attrname character */
                startstate->mode = EKHTML_STMODE_SUCK;
            } else {
                assert(startstate->curattr == NULL);
                /* Valid attribute name, allocate space for it */
                attr = ekhtml_parser_attr_new(parser);
                /* Offset from curp, stored in pointer form */
                attr->name.str = (char *)NULL + (workp - curp);
                attr->name.len = 0;  /* Will get assigned later */
                attr->val.str = NULL;
                attr->val.len = 0;
                attr->isBoolean = 1;
                attr->next = NULL;
                startstate->mode = EKHTML_STMODE_GETNAME;
                startstate->curattr = attr;
            }
        }

        if(startstate->mode == EKHTML_STMODE_GETNAME){
            workp = ekhtml_find_notcharsmap(workp, endp - workp,
                                            EKCMap_CharMap,
                                            EKHTML_CHAR_ATTRNAME);
            if(workp == endp)
                break;

            /* There be dragons here -- watch out -- see comment @ top
               of file. name.str holds an offset, not an address. */
            startstate->curattr->name.len =
                workp - (curp + (int)startstate->curattr->name.str);
            if(*workp == '='){
                startstate->mode = EKHTML_STMODE_BEGVALUE;
                workp++;  /* Skip the equals sign */
            } else {
                if(!(EKCMap_CharMap[(unsigned char)*workp] &
                     EKHTML_CHAR_WHITESPACE))
                {
                    /* Found something we weren't expecting.  Use the
                       current attribute as a boolean value and skip the
                       rest of the tag */
                    scroll_attribute(startstate);
                    startstate->mode = EKHTML_STMODE_SUCK;
                } else
                    startstate->mode = EKHTML_STMODE_GETEQUAL;
            }
        }

        if(startstate->mode == EKHTML_STMODE_GETEQUAL){
            workp = ekhtml_find_notcharsmap(workp, endp - workp,
                                            EKCMap_CharMap,
                                            EKHTML_CHAR_WHITESPACE);
            if(workp == endp)
                break;

            if(*workp != '='){
                /* Unexpected value.  Either the previous attribute was
                   really only a boolean and a new name starts here, or
                   the input is malformed and the rest gets skipped */
                scroll_attribute(startstate);
                if(EKCMap_CharMap[(unsigned char)*workp] &
                   EKHTML_CHAR_BEGATTRNAME)
                {
                    startstate->mode = EKHTML_STMODE_BEGNAME;
                    continue;
                } else {
                    startstate->mode = EKHTML_STMODE_SUCK;
                }
            } else {
                startstate->mode = EKHTML_STMODE_BEGVALUE;
                workp++;  /* Skip the equals sign */
            }
        }

        if(startstate->mode == EKHTML_STMODE_BEGVALUE){
            workp = ekhtml_find_notcharsmap(workp, endp - workp,
                                            EKCMap_CharMap,
                                            EKHTML_CHAR_WHITESPACE);
            if(workp == endp)
                break;

            startstate->curattr->isBoolean = 0;
            /* Offset from curp, stored in pointer form */
            startstate->curattr->val.str = (char *)NULL + (workp - curp);
            startstate->quote = '\0';
            if(*workp == '"' || *workp == '\''){
                startstate->curattr->val.str++;  /* Skip the quote */
                startstate->mode = EKHTML_STMODE_GETVALUE;
                startstate->quote = *workp;
                workp++;
            } else if(!(EKCMap_CharMap[(unsigned char)*workp] &
                        EKHTML_CHAR_ATTRVALUE))
            {
                /* Bad value .. */
                startstate->curattr->val.len = 0;
                scroll_attribute(startstate);
                startstate->mode = EKHTML_STMODE_SUCK;
            } else {
                /* Valid value */
                startstate->mode = EKHTML_STMODE_GETVALUE;
            }
        }

        if(startstate->mode == EKHTML_STMODE_GETVALUE){
            if(startstate->quote){
                /* Quoted value: ends at the matching quote; a '<' or '>'
                   also stops the scan and is handled below */
                for(;workp != endp && *workp != '>' && *workp != '<';
                    workp++)
                {
                    if(*workp == startstate->quote){
                        startstate->curattr->val.len =
                            workp -
                            (curp + (int)startstate->curattr->val.str);
                        scroll_attribute(startstate);
                        startstate->mode = EKHTML_STMODE_BEGNAME;
                        workp++;  /* Skip the quote */
                        break;
                    }
                }
                /* If the closing quote was found above, the mode is now
                   BEGNAME and the main loop must be restarted */
                if(startstate->mode == EKHTML_STMODE_BEGNAME)
                    continue;
            } else
                workp = ekhtml_find_notcharsmap(workp, endp - workp,
                                                EKCMap_CharMap,
                                                EKHTML_CHAR_ATTRVALUE);
            if(workp == endp)
                break;

            startstate->curattr->val.len =
                workp - (curp + (int)startstate->curattr->val.str);
            scroll_attribute(startstate);

            if(*workp == '>' || *workp == '<') {
                /* End of tag: report it and reset the state */
                *offset = workp - curp;
                handle_starttag(parser, curp, startstate);
                release_attributes(parser, startstate);
                *state_data = NULL;
                if(*workp == '<')
                    return workp;      /* Leave '<' for the next parse */
                else
                    return workp + 1;  /* Consume the '>' */
            } else {
                startstate->mode = EKHTML_STMODE_BEGNAME;
                continue;
            }
        }

        if(startstate->mode == EKHTML_STMODE_SUCK){
            /* This mode is here in case someone puts a bad character in
               an attribute name.  Skip until what looks like the end of
               the tag */
            for(;workp != endp && *workp != '<' && *workp != '>'; workp++)
                ;
            if(workp == endp)
                break;

            *offset = workp - curp;
            handle_starttag(parser, curp, startstate);
            release_attributes(parser, startstate);
            *state_data = NULL;
            if(*workp == '<')
                return workp;      /* Leave '<' for the next parse */
            else
                return workp + 1;  /* Consume the '>' */
        }
    }

    /* Out of data: remember the position and ask for more */
    *offset = workp - curp;
    return NULL;
}
/*
 * Lexer event sink: translates a lexer event into a parser event.
 *
 * Start and end tags are delegated to handle_starttag() / handle_endtag().
 * Text, comment, marked-section, declaration and PI events are copied into
 * a parser event and passed to self->cb; before that, if self->inempty is
 * set, close_empty() is called first. Any other event type (including
 * ESCAPE) is not expected here: AssertionError is set and last_error
 * becomes TDI_PARSER_ERR_ENV.
 *
 * Returns 0 on success, -1 on failure.
 */
static int
lexer_callback(tdi_lexer_event *event_, void *self_)
{
    tdi_soup_parser *self = self_;
    tdi_parser_event event;
    const int kind = (int)event_->type;

    /* Tags have dedicated handlers. */
    if (kind == TDI_LEXER_EVENT_STARTTAG)
        return handle_starttag(self, &event, event_);
    if (kind == TDI_LEXER_EVENT_ENDTAG)
        return handle_endtag(self, &event, event_);

    /* Sort out everything that is not a plain pass-through event. */
    switch (kind) {
    case TDI_LEXER_EVENT_TEXT:
    case TDI_LEXER_EVENT_COMMENT:
    case TDI_LEXER_EVENT_MSECTION:
    case TDI_LEXER_EVENT_DECL:
    case TDI_LEXER_EVENT_PI:
        break;

    default:
        /* Should not happen */
        PyErr_SetNone(PyExc_AssertionError);
        self->last_error = TDI_PARSER_ERR_ENV;
        return -1;
    }

    /* close_empty() runs before the new event is delivered. */
    if (self->inempty) {
        if (close_empty(self) == -1)
            return -1;
    }

    /* Copy the payload into the parser event. */
    switch (kind) {
    case TDI_LEXER_EVENT_TEXT:
        event.type = TDI_PARSER_EVENT_TEXT;
        event.info.text.data = event_->info.text.data;
        break;

    case TDI_LEXER_EVENT_COMMENT:
        event.type = TDI_PARSER_EVENT_COMMENT;
        event.info.comment.data = event_->info.comment.data;
        break;

    case TDI_LEXER_EVENT_MSECTION:
        event.type = TDI_PARSER_EVENT_MSECTION;
        event.info.msection.data = event_->info.msection.data;
        event.info.msection.name = event_->info.msection.name;
        event.info.msection.value = event_->info.msection.value;
        break;

    case TDI_LEXER_EVENT_DECL:
        event.type = TDI_PARSER_EVENT_DECL;
        event.info.decl.data = event_->info.decl.data;
        event.info.decl.name = event_->info.decl.name;
        event.info.decl.value = event_->info.decl.value;
        break;

    default: /* TDI_LEXER_EVENT_PI -- the only remaining possibility */
        event.type = TDI_PARSER_EVENT_PI;
        event.info.pi.data = event_->info.pi.data;
        break;
    }

    /* A non-zero callback result is reported as failure. */
    if (self->cb(&event, self->cb_ctx))
        return -1;
    return 0;
}