/*
 * Parses a processing instruction. Entered after the "<?" sequence has been
 * consumed, so pi->s is positioned at the start of the target name.
 *
 * The target name is terminated in place, then name="value" pairs are
 * collected into a local, zero-filled attribute array until a '?' is seen.
 * The character following the '?' must be '>'. raise_error is invoked when
 * the instruction is not terminated, when a name is not followed by '=', or
 * when MAX_ATTRS attributes have been collected. If the instruct callback is
 * set it is called with the target and the attribute array.
 */
static void
read_instruction(PInfo pi) {
    struct _Attr attrs[MAX_ATTRS + 1];
    Attr         attr = attrs;
    char        *target;
    char        *name_end;
    char         first;
    char         ch;

    memset(attrs, 0, sizeof(attrs));
    target = read_name_token(pi);
    name_end = pi->s;
    next_non_white(pi);
    /* Capture the next significant character before terminating the name;
     * when no white space follows the name the terminator lands on it. */
    first = *pi->s;
    *name_end = '\0';

    if ('?' == first) {
        /* no attributes, step over the '?' */
        pi->s++;
    } else {
        while ('?' != *pi->s) {
            if ('\0' == *pi->s) {
                raise_error("invalid format, processing instruction not terminated", pi->str, pi->s);
            }
            next_non_white(pi);
            attr->name = read_name_token(pi);
            name_end = pi->s;
            next_non_white(pi);
            /* the position is advanced whether or not the '=' is present */
            ch = *pi->s;
            pi->s++;
            if ('=' != ch) {
                raise_error("invalid format, no attribute value", pi->str, pi->s);
            }
            *name_end = '\0'; /* terminate the attribute name */
            next_non_white(pi);
            attr->value = read_quoted_value(pi);
            attr++;
            if (MAX_ATTRS <= (attr - attrs)) {
                raise_error("too many attributes", pi->str, pi->s);
            }
            next_non_white(pi);
        }
        /* the loop above only ends when pi->s is on a '?', step over it */
        pi->s++;
    }
    ch = *pi->s;
    pi->s++;
    if ('>' != ch) {
        raise_error("invalid format, processing instruction not terminated", pi->str, pi->s);
    }
    if (0 != pi->pcb->instruct) {
        pi->pcb->instruct(pi, target, attrs);
    }
}
/*
 * Parses one element. Entered with pi->s at the start of the element name
 * (the '<' has already been consumed). Nothing is returned; results are
 * delivered through the pi->pcb callbacks.
 *
 * Phase 1 reads the name and any attributes up to "/>" or ">". For "/>" the
 * add_element callback (hasChildren == 0) and the end_element callback are
 * called and the function returns. For ">" add_element is called with
 * hasChildren == 1 and phase 2 starts.
 *
 * Phase 2 reads the element content: comments, CDATA, nested elements (by
 * recursion) and text, until the matching close tag is found, at which point
 * end_element is called. raise_error is invoked for malformed input.
 */
static void
read_element(PInfo pi) {
    struct _Attr attrs[MAX_ATTRS];
    Attr         attr = attrs;
    char        *close_name;
    char        *ename;
    char        *tok_end;
    char        *text_start;
    char        *p;
    char         c;
    char         sep;
    long         elen;
    int          open_tag_done = 0;

    ename = read_name_token(pi);
    tok_end = pi->s;
    elen = tok_end - ename;
    next_non_white(pi);
    /* save the next significant character, terminating the name may overwrite it */
    c = *pi->s;
    *tok_end = '\0';
    if ('/' == c) {
        /* empty element, no attributes and no children */
        pi->s++;
        if ('>' != *pi->s) {
            raise_error("invalid format, element not closed", pi->str, pi->s);
        }
        pi->s++; /* past > */
        attr->name = 0;
        pi->pcb->add_element(pi, ename, attrs, 0);
        pi->pcb->end_element(pi, ename);
        return;
    }
    /* read attributes until the close of the open tag (/ or >) is reached */
    while (!open_tag_done) {
        if ('\0' == c) {
            /* not the first pass, pick up the next significant character */
            next_non_white(pi);
            c = *pi->s;
        }
        switch (c) {
        case '\0':
            raise_error("invalid format, document not terminated", pi->str, pi->s);
            /* no break here, same as before: continues below only if raise_error returns */
        case '/':
            /* element with just attributes */
            pi->s++;
            if ('>' != *pi->s) {
                raise_error("invalid format, element not closed", pi->str, pi->s);
            }
            pi->s++;
            attr->name = 0;
            pi->pcb->add_element(pi, ename, attrs, 0);
            pi->pcb->end_element(pi, ename);
            return;
        case '>':
            /* has either children or a value */
            pi->s++;
            open_tag_done = 1;
            attr->name = 0;
            pi->pcb->add_element(pi, ename, attrs, 1);
            break;
        default:
            /* an attribute name, the attribute is added to this element */
            attr->name = read_name_token(pi);
            tok_end = pi->s;
            next_non_white(pi);
            sep = *pi->s;
            pi->s++;
            if ('=' != sep) {
                raise_error("invalid format, no attribute value", pi->str, pi->s);
            }
            *tok_end = '\0'; /* terminate the attribute name */
            next_non_white(pi);
            attr->value = read_quoted_value(pi);
            if (0 != strchr(attr->value, '&') && 0 != collapse_special((char*)attr->value)) {
                raise_error("invalid format, special character does not end with a semicolon", pi->str, pi->s);
            }
            attr++;
            if (MAX_ATTRS <= (attr - attrs)) {
                raise_error("too many attributes", pi->str, pi->s);
            }
            break;
        }
        c = '\0';
    }

    /* The loop above is only left through the '>' case, so the element has
     * content. Read children until the close tag returns from the function. */
    for (;;) {
        text_start = pi->s;
        next_non_white(pi);
        c = *pi->s;
        pi->s++;
        if ('\0' == c) {
            raise_error("invalid format, document not terminated", pi->str, pi->s);
        }
        if ('<' != c) {
            /* read as TEXT, starting from before any skipped white space */
            pi->s = text_start;
            read_text(pi);
            /* to exit read_text with no errors the next character must be < */
            p = pi->s;
            if ('/' == p[1] && 0 == strncmp(ename, p + 2, elen) && '>' == p[elen + 2]) {
                /* close tag right after the text so treat the text as a value */
                pi->s += elen + 3;
                pi->pcb->end_element(pi, ename);
                return;
            }
            continue;
        }
        switch (*pi->s) {
        case '!':
            /* better be a comment or CDATA */
            pi->s++;
            if ('-' == *pi->s && '-' == *(pi->s + 1)) {
                pi->s += 2;
                read_comment(pi);
            } else if (0 == strncmp("[CDATA[", pi->s, 7)) {
                pi->s += 7;
                read_cdata(pi);
            } else {
                raise_error("invalid format, invalid comment or CDATA format", pi->str, pi->s);
            }
            break;
        case '/':
            /* a close tag, it must name this element */
            pi->s++;
            close_name = read_name_token(pi);
            tok_end = pi->s;
            next_non_white(pi);
            c = *pi->s;
            *tok_end = '\0';
            if (0 != strcmp(close_name, ename)) {
                raise_error("invalid format, elements overlap", pi->str, pi->s);
            }
            if ('>' != c) {
                raise_error("invalid format, element not closed", pi->str, pi->s);
            }
            pi->s++;
            pi->pcb->end_element(pi, ename);
            return;
        case '\0':
            raise_error("invalid format, document not terminated", pi->str, pi->s);
            /* no break here, same as before: continues below only if raise_error returns */
        default:
            /* a child element */
            read_element(pi);
            break;
        }
    }
}
/*
 * Reads attributes until either terminator character is reached and reports
 * each one to the handler.
 *
 * dr     - the SAX drive holding the buffer, handler and callback flags
 * c      - the most recently read character
 * termc  - first character that ends the attribute list
 * term2  - second character that ends the attribute list
 * is_xml - non-zero to treat an attribute named "encoding" (any case) as the
 *          document encoding and store it on the drive
 * eq_req - non-zero if a name without a following '=' must abort the read;
 *          dr->err is set to 1 and the offending character is returned
 *
 * Returns the character that stopped the read: one of the terminators, the
 * offending character when eq_req applies, or '\0' after an error has been
 * reported through ox_sax_drive_error.
 */
static char
read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req) {
    VALUE attr_sym = Qnil;
    VALUE args[2];
    int   pending_encoding = 0;
    int   line;
    int   col;
    char *attr_value;

    /* the buffer is already protected by the caller */
    dr->buf.str = dr->buf.tail;
    if (is_white(c)) {
        c = buf_next_non_white(&dr->buf);
    }
    while (!(termc == c || term2 == c)) {
        buf_backup(&dr->buf);
        if ('\0' == c) {
            ox_sax_drive_error(dr, NO_TERM "attributes not terminated");
            return '\0';
        }
        line = dr->buf.line;
        col = dr->buf.col;
        c = read_name_token(dr);
        if ('\0' == c) {
            ox_sax_drive_error(dr, NO_TERM "error reading token");
            return '\0';
        }
        if (is_xml && 0 == strcasecmp("encoding", dr->buf.str)) {
            pending_encoding = 1;
        }
        if (dr->has.attr || dr->has.attr_value) {
            attr_sym = str2sym(dr, dr->buf.str, 0);
        }
        if (is_white(c)) {
            c = buf_next_non_white(&dr->buf);
        }
        if ('=' == c) {
            /* the reported position is that of the value */
            line = dr->buf.line;
            col = dr->buf.col;
            c = read_quoted_value(dr);
            attr_value = dr->buf.str;
            if (pending_encoding) {
#if HAS_ENCODING_SUPPORT
                dr->encoding = rb_enc_find(dr->buf.str);
#elif HAS_PRIVATE_ENCODING
                dr->encoding = rb_str_new2(dr->buf.str);
#else
                dr->encoding = dr->buf.str;
#endif
                pending_encoding = 0;
            }
        } else if (eq_req) {
            /* let the caller decide what to do with a bare name */
            dr->err = 1;
            return c;
        } else {
            ox_sax_drive_error(dr, WRONG_CHAR "no attribute value");
            attr_value = (char*)"";
        }
        if (dr->has.attr_value) {
            if (dr->has.line) {
                rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
            }
            if (dr->has.column) {
                rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
            }
            args[0] = attr_sym;
            args[1] = dr->value_obj;
            rb_funcall2(dr->handler, ox_attr_value_id, 2, args);
        } else if (dr->has.attr) {
            args[0] = attr_sym;
            ox_sax_collapse_special(dr, dr->buf.str, line, col);
            args[1] = rb_str_new2(attr_value);
#if HAS_ENCODING_SUPPORT
            if (0 != dr->encoding) {
                rb_enc_associate(args[1], dr->encoding);
            }
#elif HAS_PRIVATE_ENCODING
            if (Qnil != dr->encoding) {
                rb_funcall(args[1], ox_force_encoding_id, 1, dr->encoding);
            }
#endif
            if (dr->has.line) {
                rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
            }
            if (dr->has.column) {
                rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
            }
            rb_funcall2(dr->handler, ox_attr_id, 2, args);
        }
        if (is_white(c)) {
            c = buf_next_non_white(&dr->buf);
        }
    }
    dr->buf.str = 0;

    return c;
}
/*
 * Handles an element close tag. Reads the element name and then one more
 * character, which is returned to the caller.
 *
 * - If the name matches the element on top of the stack, that entry is
 *   popped and the end element callback is made.
 * - If the name is nowhere on the stack, an error is reported. Unless the
 *   hints mark the element as empty (in which case the function returns
 *   right after the error), a start element callback is made so the end
 *   callback has a matching start.
 * - If the name is deeper in the stack, hint_try_close is tried first;
 *   failing that an error is reported and every element above the match is
 *   popped and ended before the match itself is ended.
 *
 * Returns '\0' if the name token could not be read.
 */
static char
read_element_end(SaxDrive dr) {
    VALUE  name = Qnil;
    char   c;
    int    line = dr->buf.line;
    int    col = dr->buf.col - 2;
    Nv     top;
    Nv     match;
    Nv     closed;
    Hint   h;
    char  *ename = 0;
    char   msg[256];

    c = read_name_token(dr);
    if ('\0' == c) {
        return '\0';
    }
    /* c should be > and current is one past so read another char */
    c = buf_get(&dr->buf);
    top = stack_peek(&dr->stack);
    if (0 != top && 0 == strcmp(dr->buf.str, top->name)) {
        /* the expected case, close matches the most recent open */
        name = top->val;
        stack_pop(&dr->stack);
        end_element_cb(dr, name, line, col);
        return c;
    }

    /* mismatched start and end */
    match = stack_rev_find(&dr->stack, dr->buf.str);
    if (0 == match) {
        /* not on the stack at all */
        h = ox_hint_find(dr->hints, dr->buf.str);
        if (0 != h && h->empty) {
            /* report the stray close and stop, no callbacks are made */
            name = str2sym(dr, dr->buf.str, &ename);
            snprintf(msg, sizeof(msg) - 1, "%selement '%s' should not have a separate close element", EL_MISMATCH, dr->buf.str);
            ox_sax_drive_error_at(dr, msg, line, col);
            return c;
        }
        /* open the element here so that it can be closed below */
        snprintf(msg, sizeof(msg) - 1, "%selement '%s' closed but not opened", EL_MISMATCH, dr->buf.str);
        ox_sax_drive_error_at(dr, msg, line, col);
        name = str2sym(dr, dr->buf.str, &ename);
        if (dr->has.start_element) {
            VALUE args[1];

            if (dr->has.line) {
                rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
            }
            if (dr->has.column) {
                rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
            }
            args[0] = name;
            rb_funcall2(dr->handler, ox_start_element_id, 1, args);
        }
    } else if (0 != (closed = hint_try_close(dr, dr->buf.str))) {
        name = closed->val;
    } else {
        /* found deeper in the stack so close everything above the match */
        snprintf(msg, sizeof(msg) - 1, "%selement '%s' close does not match '%s' open", EL_MISMATCH, dr->buf.str, top->name);
        ox_sax_drive_error_at(dr, msg, line, col);
        if (dr->has.line) {
            rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
        }
        if (dr->has.column) {
            rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
        }
        top = stack_pop(&dr->stack);
        while (match < top) {
            if (dr->has.end_element) {
                rb_funcall(dr->handler, ox_end_element_id, 1, top->val);
            }
            top = stack_pop(&dr->stack);
        }
        name = top->val;
    }
    end_element_cb(dr, name, line, col);

    return c;
}
/*
 * Handles an element open tag. Entered after the '<' with the buffer
 * positioned to read the element name.
 *
 * When hints are active (they are loaded by ox_hints_html() in smart mode if
 * the first element is "html") the element is checked against them: unknown
 * element types, elements nested in themselves when the hint forbids it, and
 * elements with an unexpected parent are reported with ox_sax_drive_error.
 * A forbidden self nesting also closes the previous element.
 *
 * The start element callback is made, attributes are read with read_attrs,
 * the attrs done callback is made and then the element is either ended
 * immediately (self closed or marked empty by the hints) or pushed on the
 * element stack.
 *
 * Returns '\0' if the name could not be read, the offending character if the
 * tag is not closed with '>', and otherwise the character read after the '>'.
 */
static char
read_element_start(SaxDrive dr) {
    char   *ename = 0;
    VALUE   name = Qnil;
    char    c;
    int     closed;
    int     line = dr->buf.line;
    int     col = dr->buf.col - 1;
    Hint    h = 0;
    int     stackless = 0;

    if ('\0' == (c = read_name_token(dr))) {
        return '\0';
    }
    if (dr->options.smart && 0 == dr->hints && stack_empty(&dr->stack) && 0 == strcasecmp("html", dr->buf.str)) {
        dr->hints = ox_hints_html();
    }
    if (0 != dr->hints) {
        char msg[256];

        hint_clear_empty(dr);
        h = ox_hint_find(dr->hints, dr->buf.str);
        if (0 == h) {
            /* The element name comes straight from the document so the
             * message must be bounded. This was an unbounded sprintf into a
             * 100 byte buffer. */
            snprintf(msg, sizeof(msg) - 1, "%s%s is not a valid element type for a %s document type.", INV_ELEMENT, dr->buf.str, dr->hints->name);
            ox_sax_drive_error(dr, msg);
        } else {
            Nv top_nv = stack_peek(&dr->stack);

            if (h->empty) {
                stackless = 1;
            }
            if (0 != top_nv) {
                if (!h->nest && 0 == strcasecmp(top_nv->name, h->name)) {
                    snprintf(msg, sizeof(msg) - 1, "%s%s can not be nested in a %s document, closing previous.", INV_ELEMENT, dr->buf.str, dr->hints->name);
                    ox_sax_drive_error(dr, msg);
                    stack_pop(&dr->stack);
                    end_element_cb(dr, top_nv->val, line, col);
                    top_nv = stack_peek(&dr->stack);
                }
                /* The pop above may have emptied the stack, in which case
                 * there is no parent left to check against. */
                if (0 != top_nv && 0 != h->parents) {
                    const char **p;
                    int         ok = 0;

                    for (p = h->parents; 0 != *p; p++) {
                        if (0 == strcasecmp(*p, top_nv->name)) {
                            ok = 1;
                            break;
                        }
                    }
                    if (!ok) {
                        snprintf(msg, sizeof(msg) - 1, "%s%s can not be a child of a %s in a %s document.", INV_ELEMENT, h->name, top_nv->name, dr->hints->name);
                        ox_sax_drive_error(dr, msg);
                    }
                }
            }
        }
    }
    name = str2sym(dr, dr->buf.str, &ename);
    if (dr->has.start_element) {
        VALUE args[1];

        if (dr->has.line) {
            rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
        }
        if (dr->has.column) {
            rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
        }
        args[0] = name;
        rb_funcall2(dr->handler, ox_start_element_id, 1, args);
    }
    if ('/' == c) {
        closed = 1;
    } else if ('>' == c) {
        closed = 0;
    } else {
        /* anything else starts the attribute list */
        buf_protect(&dr->buf);
        c = read_attrs(dr, c, '/', '>', 0, 0);
        if (is_white(c)) {
            c = buf_next_non_white(&dr->buf);
        }
        closed = ('/' == c);
    }
    if (dr->has.attrs_done) {
        rb_funcall(dr->handler, ox_attrs_done_id, 0);
    }
    if (closed) {
        /* self closed element, move past the '/' and end it at that position */
        c = buf_next_non_white(&dr->buf);
        line = dr->buf.line;
        col = dr->buf.col - 1;
        end_element_cb(dr, name, line, col);
    } else if (stackless) {
        /* the hints say this element never has content, end it right away */
        end_element_cb(dr, name, line, col);
    } else {
        stack_push(&dr->stack, ename, name, h);
    }
    if ('>' != c) {
        ox_sax_drive_error(dr, WRONG_CHAR "element not closed");
        return c;
    }
    dr->buf.str = 0;

    return buf_get(&dr->buf);
}
/*
 * Handles a processing instruction. Entered after the "<?" sequence.
 *
 * The target name is read and reported through the instruct callback. The
 * raw content is then captured with read_content and the buffer is reset so
 * that the same input can be read again as attributes (an '=' is required
 * after each name). If the attribute read sets dr->err the captured content
 * is reported through the text callback instead and reading resumes after
 * the captured content. Otherwise the instruction must end with a '>'.
 * The end instruct callback is made last.
 *
 * Returns '\0' if the target name could not be read, otherwise the next
 * character to be processed by the caller.
 */
static char
read_instruction(SaxDrive dr) {
    char    content[1024];
    char    c;
    char   *cend;
    VALUE   target = Qnil;
    int     is_xml;
    int     line = dr->buf.line;
    int     col = dr->buf.col - 1;

    /* Start with an empty string so content is a valid C string even if
     * read_content writes nothing. */
    content[0] = '\0';
    buf_protect(&dr->buf);
    if ('\0' == (c = read_name_token(dr))) {
        return c;
    }
    is_xml = (0 == strcmp("xml", dr->buf.str));
    if (dr->has.instruct || dr->has.end_instruct) {
        target = rb_str_new2(dr->buf.str);
    }
    if (dr->has.instruct) {
        VALUE args[1];

        if (dr->has.line) {
            rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
        }
        if (dr->has.column) {
            rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
        }
        args[0] = target;
        rb_funcall2(dr->handler, ox_instruct_id, 1, args);
    }
    buf_protect(&dr->buf);
    line = dr->buf.line;
    col = dr->buf.col;
    read_content(dr, content, sizeof(content) - 1);
    /* NOTE(review): read_content is not visible here. Force termination so
     * that rb_str_new2(content) below can never read past the buffer, even
     * if an over-long instruction filled it without a terminator. */
    content[sizeof(content) - 1] = '\0';
    cend = dr->buf.tail;
    /* rewind so the same input can be read as attributes */
    buf_reset(&dr->buf);
    dr->err = 0;
    c = read_attrs(dr, c, '?', '?', is_xml, 1);
    if (dr->has.attrs_done) {
        rb_funcall(dr->handler, ox_attrs_done_id, 0);
    }
    if (dr->err) {
        /* not a list of attributes, report the raw content as text */
        if (dr->has.text) {
            VALUE args[1];

            if (dr->options.convert_special) {
                ox_sax_collapse_special(dr, content, line, col);
            }
            args[0] = rb_str_new2(content);
#if HAS_ENCODING_SUPPORT
            if (0 != dr->encoding) {
                rb_enc_associate(args[0], dr->encoding);
            }
#elif HAS_PRIVATE_ENCODING
            if (Qnil != dr->encoding) {
                rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding);
            }
#endif
            if (dr->has.line) {
                rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
            }
            if (dr->has.column) {
                rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
            }
            rb_funcall2(dr->handler, ox_text_id, 1, args);
        }
        /* continue from where the content capture stopped */
        dr->buf.tail = cend;
        c = buf_get(&dr->buf);
    } else {
        line = dr->buf.line;
        col = dr->buf.col;
        c = buf_next_non_white(&dr->buf);
        if ('>' == c) {
            c = buf_get(&dr->buf);
        } else {
            /* The nested test for '>' that used to follow this call could
             * never be true in this branch and has been removed. */
            ox_sax_drive_error_at(dr, NO_TERM "instruction not terminated", line, col);
        }
    }
    if (dr->has.end_instruct) {
        VALUE args[1];

        if (dr->has.line) {
            rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(line));
        }
        if (dr->has.column) {
            rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(col));
        }
        args[0] = target;
        rb_funcall2(dr->handler, ox_end_instruct_id, 1, args);
    }
    dr->buf.str = 0;

    return c;
}
/*
 * Top level SAX parse loop. Reads from the drive's buffer until a '\0'
 * character is returned, dispatching on what follows each '<':
 *
 *   <?   processing instruction
 *   <!   comment, DOCTYPE or CDATA
 *   </   element end
 *   else element start
 *
 * Anything that does not start with '<' is read as text. The state variable
 * tracks whether a top level element has been seen and closed so that a
 * DOCTYPE after an element and multiple top level elements can be reported.
 *
 * When the input ends, every element still on the stack is reported as not
 * closed and, if the handler has one, its end element callback is made.
 */
static void
parse(SaxDrive dr) {
    char ch = skipBOM(dr);
    int  state = START_STATE;

    while ('\0' != ch) {
        buf_protect(&dr->buf);
        if (is_white(ch)) {
            ch = buf_next_non_white(&dr->buf);
            if ('\0' == ch) {
                break;
            }
        }
        if ('<' != ch) {
            buf_reset(&dr->buf);
            ch = read_text(dr);
            continue;
        }
        ch = buf_get(&dr->buf);
        if ('?' == ch) {
            /* instructions (xml or otherwise) */
            ch = read_instruction(dr);
        } else if ('!' == ch) {
            /* comment or doctype */
            buf_protect(&dr->buf);
            ch = buf_get(&dr->buf);
            if ('\0' == ch) {
                ox_sax_drive_error(dr, NO_TERM "DOCTYPE or comment not terminated");
                goto finished;
            }
            if ('-' == ch) {
                ch = buf_get(&dr->buf); /* skip first - and get next character */
                if ('-' == ch) {
                    ch = buf_get(&dr->buf); /* skip second - */
                } else {
                    ox_sax_drive_error(dr, INVALID_FORMAT "bad comment format, expected <!--");
                }
                ch = read_comment(dr);
            } else {
                int i;
                int spaced = 0;
                int line = dr->buf.line;
                int col = dr->buf.col;

                if (is_white(ch)) {
                    spaced = 1;
                    ch = buf_next_non_white(&dr->buf);
                }
                /* mark the keyword start then read seven more characters */
                dr->buf.str = dr->buf.tail - 1;
                for (i = 0; i < 7; i++) {
                    ch = buf_get(&dr->buf);
                }
                if (0 == strncmp("DOCTYPE", dr->buf.str, 7)) {
                    if (spaced) {
                        ox_sax_drive_error_at(dr, WRONG_CHAR "<!DOCTYPE can not included spaces", line, col);
                    }
                    if (START_STATE != state) {
                        ox_sax_drive_error(dr, OUT_OF_ORDER "DOCTYPE can not come after an element");
                    }
                    ch = read_doctype(dr);
                } else if (0 == strncasecmp("DOCTYPE", dr->buf.str, 7)) {
                    ox_sax_drive_error(dr, CASE_ERROR "expected DOCTYPE all in caps");
                    if (START_STATE != state) {
                        ox_sax_drive_error(dr, OUT_OF_ORDER "DOCTYPE can not come after an element");
                    }
                    ch = read_doctype(dr);
                } else if (0 == strncmp("[CDATA[", dr->buf.str, 7)) {
                    if (spaced) {
                        ox_sax_drive_error_at(dr, WRONG_CHAR "<![CDATA[ can not included spaces", line, col);
                    }
                    ch = read_cdata(dr);
                } else if (0 == strncasecmp("[CDATA[", dr->buf.str, 7)) {
                    ox_sax_drive_error(dr, CASE_ERROR "expected CDATA all in caps");
                    ch = read_cdata(dr);
                } else {
                    ox_sax_drive_error_at(dr, WRONG_CHAR "DOCTYPE, CDATA, or comment expected", line, col);
                    ch = read_name_token(dr);
                    if ('>' == ch) {
                        ch = buf_get(&dr->buf);
                    }
                }
            }
        } else if ('/' == ch) {
            /* element end */
            ch = read_element_end(dr);
            if (0 == stack_peek(&dr->stack)) {
                state = AFTER_STATE;
            }
        } else if ('\0' == ch) {
            goto finished;
        } else {
            /* element start, put the first name character back */
            buf_backup(&dr->buf);
            if (AFTER_STATE == state) {
                ox_sax_drive_error(dr, OUT_OF_ORDER "multiple top level elements");
            }
            state = BODY_STATE;
            ch = read_element_start(dr);
            if (0 == stack_peek(&dr->stack)) {
                state = AFTER_STATE;
            }
        }
    }
 finished:
    if (dr->stack.head < dr->stack.tail) {
        char msg[256];
        Nv   sp;

        if (dr->has.line) {
            rb_ivar_set(dr->handler, ox_at_line_id, LONG2NUM(dr->buf.line));
        }
        if (dr->has.column) {
            rb_ivar_set(dr->handler, ox_at_column_id, LONG2NUM(dr->buf.col));
        }
        /* walk from the most recently opened element down to the first */
        sp = dr->stack.tail - 1;
        while (dr->stack.head <= sp) {
            snprintf(msg, sizeof(msg) - 1, "%selement '%s' not closed", EL_MISMATCH, sp->name);
            ox_sax_drive_error_at(dr, msg, dr->buf.line, dr->buf.col);
            if (dr->has.end_element) {
                VALUE args[1];

                args[0] = sp->val;
                rb_funcall2(dr->handler, ox_end_element_id, 1, args);
            }
            sp--;
        }
    }
}
/*
 * Parses a processing instruction. Entered after the "<?" sequence has been
 * consumed, so pi->s is positioned at the start of the target name.
 *
 * The raw content following the target is first copied into a local buffer
 * with gather_content. The input is then read as name="value" attributes.
 * If a name is not followed by '=' or MAX_ATTRS attributes are reached, the
 * attribute read is abandoned: parsing resumes one past the position
 * returned by gather_content and the instruct callback receives the raw
 * content. Otherwise the instruction must end with "?>" and the callback
 * receives 0 in place of the content.
 *
 * raise_error is invoked if gather_content returns 0 or the instruction is
 * not terminated.
 */
static void
read_instruction(PInfo pi) {
    char         content[1024];
    struct _Attr attrs[MAX_ATTRS + 1];
    Attr         attr = attrs;
    char        *target;
    char        *name_end;
    char        *content_end;
    char         first;
    char         ch;
    int          attrs_ok = 1;

    content[0] = '\0';
    memset(attrs, 0, sizeof(attrs));
    target = read_name_token(pi);
    name_end = pi->s;
    /* the content has to be gathered before the name is terminated in place */
    content_end = gather_content(pi->s, content, sizeof(content) - 1);
    if (0 == content_end) {
        raise_error("processing instruction content too large or not terminated", pi->str, pi->s);
    }
    next_non_white(pi);
    /* save the next significant character, terminating the name may overwrite it */
    first = *pi->s;
    *name_end = '\0';

    if ('?' == first) {
        /* no attributes, step over the '?' */
        pi->s++;
    } else {
        while ('?' != *pi->s) {
            if ('\0' == *pi->s) {
                raise_error("invalid format, processing instruction not terminated", pi->str, pi->s);
            }
            next_non_white(pi);
            attr->name = read_name_token(pi);
            name_end = pi->s;
            next_non_white(pi);
            /* the position is advanced whether or not the '=' is present */
            ch = *pi->s;
            pi->s++;
            if ('=' != ch) {
                attrs_ok = 0;
                break;
            }
            *name_end = '\0'; /* terminate the attribute name */
            next_non_white(pi);
            attr->value = read_quoted_value(pi);
            attr++;
            if (MAX_ATTRS <= (attr - attrs)) {
                attrs_ok = 0;
                break;
            }
            next_non_white(pi);
        }
        if ('?' == *pi->s) {
            pi->s++;
        }
    }
    if (!attrs_ok) {
        /* give up on attributes and continue after the gathered content */
        pi->s = content_end + 1;
    } else {
        ch = *pi->s;
        pi->s++;
        if ('>' != ch) {
            raise_error("invalid format, processing instruction not terminated", pi->str, pi->s);
        }
    }
    if (0 != pi->pcb->instruct) {
        pi->pcb->instruct(pi, target, attrs, attrs_ok ? 0 : content);
    }
}