/* prepends the passed sprintf-like arguments to ERRSV, which also * gets stringified on the way */ void modperl_errsv_prepend(pTHX_ const char *pat, ...) { SV *sv; va_list args; va_start(args, pat); sv = vnewSVpvf(pat, &args); va_end(args); sv_catsv(sv, ERRSV); sv_copypv(ERRSV, sv); sv_free(sv); }
EXTERN void parse(pTHX_ PSTATE* p_state, SV* chunk, SV* self) { char *s, *beg, *end; U32 utf8 = 0; STRLEN len; if (!chunk) { /* eof */ char empty[1]; if (p_state->buf && SvOK(p_state->buf)) { /* flush it */ s = SvPV(p_state->buf, len); end = s + len; utf8 = SvUTF8(p_state->buf); assert(len); while (s < end) { if (p_state->literal_mode) { if (strEQ(p_state->literal_mode, "plaintext") && !p_state->closing_plaintext) break; p_state->pending_end_tag = p_state->literal_mode; p_state->literal_mode = 0; s = parse_buf(aTHX_ p_state, s, end, utf8, self); continue; } if (!p_state->strict_comment && !p_state->no_dash_dash_comment_end && *s == '<') { p_state->no_dash_dash_comment_end = 1; s = parse_buf(aTHX_ p_state, s, end, utf8, self); continue; } if (!p_state->strict_comment && *s == '<') { /* some kind of unterminated markup. Report rest as as comment */ token_pos_t token; token.beg = s + 1; token.end = end; report_event(p_state, E_COMMENT, s, end, utf8, &token, 1, self); s = end; } break; } if (s < end) { /* report rest as text */ report_event(p_state, E_TEXT, s, end, utf8, 0, 0, self); } SvREFCNT_dec(p_state->buf); p_state->buf = 0; } if (p_state->pend_text && SvOK(p_state->pend_text)) flush_pending_text(p_state, self); if (p_state->ignoring_element) { /* document not balanced */ SvREFCNT_dec(p_state->ignoring_element); p_state->ignoring_element = 0; } report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self); /* reset state */ p_state->offset = 0; if (p_state->line) p_state->line = 1; p_state->column = 0; p_state->literal_mode = 0; p_state->is_cdata = 0; return; } #ifdef UNICODE_HTML_PARSER if (p_state->utf8_mode) sv_utf8_downgrade(chunk, 0); #endif if (p_state->buf && SvOK(p_state->buf)) { sv_catsv(p_state->buf, chunk); beg = SvPV(p_state->buf, len); utf8 = SvUTF8(p_state->buf); } else { beg = SvPV(chunk, len); utf8 = SvUTF8(chunk); if (p_state->offset == 0) { report_event(p_state, E_START_DOCUMENT, beg, beg, 0, 0, 0, self); /* Print warnings if we find unexpected Unicode BOM forms */ #ifdef UNICODE_HTML_PARSER if (DOWARN && p_state->argspec_entity_decode && !p_state->utf8_mode && ( (!utf8 && len >= 3 && strnEQ(beg, "\xEF\xBB\xBF", 3)) || (utf8 && len >= 6 && strnEQ(beg, "\xC3\xAF\xC2\xBB\xC2\xBF", 6)) || (!utf8 && probably_utf8_chunk(aTHX_ beg, len)) ) ) { warn("Parsing of undecoded UTF-8 will give garbage when decoding entities"); } if (DOWARN && utf8 && len >= 2 && strnEQ(beg, "\xFF\xFE", 2)) { warn("Parsing string decoded with wrong endianess"); } #endif if (DOWARN) { if (!utf8 && len >= 4 && (strnEQ(beg, "\x00\x00\xFE\xFF", 4) || strnEQ(beg, "\xFE\xFF\x00\x00", 4)) ) { warn("Parsing of undecoded UTF-32"); } else if (!utf8 && len >= 2 && (strnEQ(beg, "\xFE\xFF", 2) || strnEQ(beg, "\xFF\xFE", 2)) ) { warn("Parsing of undecoded UTF-16"); } } } } if (!len) return; /* nothing to do */ end = beg + len; s = parse_buf(aTHX_ p_state, beg, end, utf8, self); if (s == end || p_state->eof) { if (p_state->buf) { SvOK_off(p_state->buf); } } else { /* need to keep rest in buffer */ if (p_state->buf) { /* chop off some chars at the beginning */ if (SvOK(p_state->buf)) { sv_chop(p_state->buf, s); } else { sv_setpvn(p_state->buf, s, end - s); if (utf8) SvUTF8_on(p_state->buf); else SvUTF8_off(p_state->buf); } } else { p_state->buf = newSVpv(s, end - s); if (utf8) SvUTF8_on(p_state->buf); } } return; }
static void report_event(PSTATE* p_state, event_id_t event, char *beg, char *end, U32 utf8, token_pos_t *tokens, int num_tokens, SV* self ) { struct p_handler *h; dTHX; dSP; AV *array; STRLEN my_na; char *argspec; char *s; #ifdef UNICODE_HTML_PARSER #define CHR_DIST(a,b) (utf8 ? utf8_distance((U8*)(a),(U8*)(b)) : (a) - (b)) #else #define CHR_DIST(a,b) ((a) - (b)) #endif /* capture offsets */ STRLEN offset = p_state->offset; STRLEN line = p_state->line; STRLEN column = p_state->column; #if 0 { /* used for debugging at some point */ char *s = beg; int i; /* print debug output */ switch(event) { case E_DECLARATION: printf("DECLARATION"); break; case E_COMMENT: printf("COMMENT"); break; case E_START: printf("START"); break; case E_END: printf("END"); break; case E_TEXT: printf("TEXT"); break; case E_PROCESS: printf("PROCESS"); break; case E_NONE: printf("NONE"); break; default: printf("EVENT #%d", event); break; } printf(" ["); while (s < end) { if (*s == '\n') { putchar('\\'); putchar('n'); } else putchar(*s); s++; } printf("] %d\n", end - beg); for (i = 0; i < num_tokens; i++) { printf(" token %d: %d %d\n", i, tokens[i].beg - beg, tokens[i].end - tokens[i].beg); } } #endif if (p_state->pending_end_tag && event != E_TEXT && event != E_COMMENT) { token_pos_t t; char dummy; t.beg = p_state->pending_end_tag; t.end = p_state->pending_end_tag + strlen(p_state->pending_end_tag); p_state->pending_end_tag = 0; report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self); SPAGAIN; } /* update offsets */ p_state->offset += CHR_DIST(end, beg); if (line) { char *s = beg; char *nl = NULL; while (s < end) { if (*s == '\n') { p_state->line++; nl = s; } s++; } if (nl) p_state->column = CHR_DIST(end, nl) - 1; else p_state->column += CHR_DIST(end, beg); } if (event == E_NONE) goto IGNORE_EVENT; #ifdef MARKED_SECTION if (p_state->ms == MS_IGNORE) goto IGNORE_EVENT; #endif /* tag filters */ if (p_state->ignore_tags || p_state->report_tags || p_state->ignore_elements) { if (event == E_START || event == E_END) { SV* tagname = p_state->tmp; assert(num_tokens >= 1); sv_setpvn(tagname, tokens[0].beg, tokens[0].end - tokens[0].beg); if (utf8) SvUTF8_on(tagname); else SvUTF8_off(tagname); if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ tagname); if (p_state->ignoring_element) { if (sv_eq(p_state->ignoring_element, tagname)) { if (event == E_START) p_state->ignore_depth++; else if (--p_state->ignore_depth == 0) { SvREFCNT_dec(p_state->ignoring_element); p_state->ignoring_element = 0; } } goto IGNORE_EVENT; } if (p_state->ignore_elements && hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0)) { p_state->ignoring_element = newSVsv(tagname); p_state->ignore_depth = 1; goto IGNORE_EVENT; } if (p_state->ignore_tags && hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0)) { goto IGNORE_EVENT; } if (p_state->report_tags && !hv_fetch_ent(p_state->report_tags, tagname, 0, 0)) { goto IGNORE_EVENT; } } else if (p_state->ignoring_element) { goto IGNORE_EVENT; } } h = &p_state->handlers[event]; if (!h->cb) { /* event = E_DEFAULT; */ h = &p_state->handlers[E_DEFAULT]; if (!h->cb) goto IGNORE_EVENT; } if (SvTYPE(h->cb) != SVt_PVAV && !SvTRUE(h->cb)) { /* FALSE scalar ('' or 0) means IGNORE this event */ return; } if (p_state->unbroken_text && event == E_TEXT) { /* should buffer text */ if (!p_state->pend_text) p_state->pend_text = newSV(256); if (SvOK(p_state->pend_text)) { if (p_state->is_cdata != p_state->pend_text_is_cdata) { flush_pending_text(p_state, self); SPAGAIN; goto INIT_PEND_TEXT; } } else { INIT_PEND_TEXT: p_state->pend_text_offset = offset; p_state->pend_text_line = line; p_state->pend_text_column = column; p_state->pend_text_is_cdata = p_state->is_cdata; sv_setpvn(p_state->pend_text, "", 0); if (!utf8) SvUTF8_off(p_state->pend_text); } #ifdef UNICODE_HTML_PARSER if (utf8 && !SvUTF8(p_state->pend_text)) sv_utf8_upgrade(p_state->pend_text); if (utf8 || !SvUTF8(p_state->pend_text)) { sv_catpvn(p_state->pend_text, beg, end - beg); } else { SV *tmp = newSVpvn(beg, end - beg); sv_utf8_upgrade(tmp); sv_catsv(p_state->pend_text, tmp); SvREFCNT_dec(tmp); } #else sv_catpvn(p_state->pend_text, beg, end - beg); #endif return; } else if (p_state->pend_text && SvOK(p_state->pend_text)) { flush_pending_text(p_state, self); SPAGAIN; } /* At this point we have decided to generate an event callback */ argspec = h->argspec ? SvPV(h->argspec, my_na) : ""; if (SvTYPE(h->cb) == SVt_PVAV) { if (*argspec == ARG_FLAG_FLAT_ARRAY) { argspec++; array = (AV*)h->cb; } else { /* start sub-array for accumulator array */ array = newAV(); } } else { array = 0; if (*argspec == ARG_FLAG_FLAT_ARRAY) argspec++; /* start argument stack for callback */ ENTER; SAVETMPS; PUSHMARK(SP); } for (s = argspec; *s; s++) { SV* arg = 0; int push_arg = 1; enum argcode argcode = (enum argcode)*s; switch( argcode ) { case ARG_SELF: arg = sv_mortalcopy(self); break; case ARG_TOKENS: if (num_tokens >= 1) { AV* av = newAV(); SV* prev_token = &PL_sv_undef; int i; av_extend(av, num_tokens); for (i = 0; i < num_tokens; i++) { if (tokens[i].beg) { prev_token = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); if (utf8) SvUTF8_on(prev_token); av_push(av, prev_token); } else { /* boolean */ av_push(av, p_state->bool_attr_val ? newSVsv(p_state->bool_attr_val) : newSVsv(prev_token)); } } arg = sv_2mortal(newRV_noinc((SV*)av)); } break; case ARG_TOKENPOS: if (num_tokens >= 1 && tokens[0].beg >= beg) { AV* av = newAV(); int i; av_extend(av, num_tokens*2); for (i = 0; i < num_tokens; i++) { if (tokens[i].beg) { av_push(av, newSViv(CHR_DIST(tokens[i].beg, beg))); av_push(av, newSViv(CHR_DIST(tokens[i].end, tokens[i].beg))); } else { /* boolean tag value */ av_push(av, newSViv(0)); av_push(av, newSViv(0)); } } arg = sv_2mortal(newRV_noinc((SV*)av)); } break; case ARG_TOKEN0: case ARG_TAGNAME: /* fall through */ case ARG_TAG: if (num_tokens >= 1) { arg = sv_2mortal(newSVpvn(tokens[0].beg, tokens[0].end - tokens[0].beg)); if (utf8) SvUTF8_on(arg); if (!CASE_SENSITIVE(p_state) && argcode != ARG_TOKEN0) sv_lower(aTHX_ arg); if (argcode == ARG_TAG && event != E_START) { char *e_type = "!##/#?#"; sv_insert(arg, 0, 0, &e_type[event], 1); } } break; case ARG_ATTR: case ARG_ATTRARR: if (event == E_START) { HV* hv; int i; if (argcode == ARG_ATTR) { hv = newHV(); arg = sv_2mortal(newRV_noinc((SV*)hv)); } else { #ifdef __GNUC__ /* gcc -Wall reports this variable as possibly used uninitialized */ hv = 0; #endif push_arg = 0; /* deal with argument pushing here */ } for (i = 1; i < num_tokens; i += 2) { SV* attrname = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); SV* attrval; if (utf8) SvUTF8_on(attrname); if (tokens[i+1].beg) { char *beg = tokens[i+1].beg; STRLEN len = tokens[i+1].end - beg; if (*beg == '"' || *beg == '\'') { assert(len >= 2 && *beg == beg[len-1]); beg++; len -= 2; } attrval = newSVpvn(beg, len); if (utf8) SvUTF8_on(attrval); if (!p_state->attr_encoded) { #ifdef UNICODE_HTML_PARSER if (p_state->utf8_mode) sv_utf8_decode(attrval); #endif decode_entities(aTHX_ attrval, p_state->entity2char, 0); if (p_state->utf8_mode) SvUTF8_off(attrval); } } else { /* boolean */ if (p_state->bool_attr_val) attrval = newSVsv(p_state->bool_attr_val); else attrval = newSVsv(attrname); } if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ attrname); if (argcode == ARG_ATTR) { if (hv_exists_ent(hv, attrname, 0) || !hv_store_ent(hv, attrname, attrval, 0)) { SvREFCNT_dec(attrval); } SvREFCNT_dec(attrname); } else { /* ARG_ATTRARR */ if (array) { av_push(array, attrname); av_push(array, attrval); } else { XPUSHs(sv_2mortal(attrname)); XPUSHs(sv_2mortal(attrval)); } } } } else if (argcode == ARG_ATTRARR) { push_arg = 0; } break; case ARG_ATTRSEQ: /* (v2 compatibility stuff) */ if (event == E_START) { AV* av = newAV(); int i; for (i = 1; i < num_tokens; i += 2) { SV* attrname = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); if (utf8) SvUTF8_on(attrname); if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ attrname); av_push(av, attrname); } arg = sv_2mortal(newRV_noinc((SV*)av)); } break; case ARG_TEXT: arg = sv_2mortal(newSVpvn(beg, end - beg)); if (utf8) SvUTF8_on(arg); break; case ARG_DTEXT: if (event == E_TEXT) { arg = sv_2mortal(newSVpvn(beg, end - beg)); if (utf8) SvUTF8_on(arg); if (!p_state->is_cdata) { #ifdef UNICODE_HTML_PARSER if (p_state->utf8_mode) sv_utf8_decode(arg); #endif decode_entities(aTHX_ arg, p_state->entity2char, 1); if (p_state->utf8_mode) SvUTF8_off(arg); } } break; case ARG_IS_CDATA: if (event == E_TEXT) { arg = boolSV(p_state->is_cdata); } break; case ARG_SKIPPED_TEXT: arg = sv_2mortal(p_state->skipped_text); p_state->skipped_text = newSVpvn("", 0); break; case ARG_OFFSET: arg = sv_2mortal(newSViv(offset)); break; case ARG_OFFSET_END: arg = sv_2mortal(newSViv(offset + CHR_DIST(end, beg))); break; case ARG_LENGTH: arg = sv_2mortal(newSViv(CHR_DIST(end, beg))); break; case ARG_LINE: arg = sv_2mortal(newSViv(line)); break; case ARG_COLUMN: arg = sv_2mortal(newSViv(column)); break; case ARG_EVENT: assert(event >= 0 && event < EVENT_COUNT); arg = sv_2mortal(newSVpv(event_id_str[event], 0)); break; case ARG_LITERAL: { int len = (unsigned char)s[1]; arg = sv_2mortal(newSVpvn(s+2, len)); if (SvUTF8(h->argspec)) SvUTF8_on(arg); s += len + 1; } break; case ARG_UNDEF: arg = sv_mortalcopy(&PL_sv_undef); break; default: arg = sv_2mortal(newSVpvf("Bad argspec %d", *s)); break; } if (push_arg) { if (!arg) arg = sv_mortalcopy(&PL_sv_undef); if (array) { /* have to fix mortality here or add mortality to * XPUSHs after removing it from the switch cases. */ av_push(array, SvREFCNT_inc(arg)); } else { XPUSHs(arg); } } } if (array) { if (array != (AV*)h->cb) av_push((AV*)h->cb, newRV_noinc((SV*)array)); } else { PUTBACK; if ((enum argcode)*argspec == ARG_SELF && !SvROK(h->cb)) { char *method = SvPV(h->cb, my_na); perl_call_method(method, G_DISCARD | G_EVAL | G_VOID); } else { perl_call_sv(h->cb, G_DISCARD | G_EVAL | G_VOID); } if (SvTRUE(ERRSV)) { RETHROW; } FREETMPS; LEAVE; } if (p_state->skipped_text) SvCUR_set(p_state->skipped_text, 0); return; IGNORE_EVENT: if (p_state->skipped_text) { if (event != E_TEXT && p_state->pend_text && SvOK(p_state->pend_text)) flush_pending_text(p_state, self); #ifdef UNICODE_HTML_PARSER if (utf8 && !SvUTF8(p_state->skipped_text)) sv_utf8_upgrade(p_state->skipped_text); if (utf8 || !SvUTF8(p_state->skipped_text)) { #endif sv_catpvn(p_state->skipped_text, beg, end - beg); #ifdef UNICODE_HTML_PARSER } else { SV *tmp = newSVpvn(beg, end - beg); sv_utf8_upgrade(tmp); sv_catsv(p_state->pend_text, tmp); SvREFCNT_dec(tmp); } #endif } #undef CHR_DIST return; }