char * Perl_sv_pvutf8n(pTHX_ SV *sv, STRLEN *lp) { PERL_ARGS_ASSERT_SV_PVUTF8N; sv_utf8_upgrade(sv); return sv_pvn(sv,lp); }
char * Perl_sv_pvutf8(pTHX_ SV *sv) { PERL_ARGS_ASSERT_SV_PVUTF8; sv_utf8_upgrade(sv); return sv_pv(sv); }
void plcb_convert_storage(PLCB_t *object, AV *docav, plcb_DOCVAL *vspec) { SV *pv = SvROK(vspec->value) ? SvRV(vspec->value) : vspec->value; uint32_t fmt = vspec->spec; if (object->cv_customenc) { vspec->need_free = 1; vspec->value = custom_convert(docav, object->cv_customenc, vspec->value, &vspec->flags, CONVERT_OUT); } else if (fmt == PLCB_CF_JSON) { vspec->flags = PLCB_LF_JSON|PLCB_CF_JSON; vspec->need_free = 1; vspec->value = serialize_convert(object->cv_jsonenc, vspec->value, CONVERT_OUT); } else if (fmt == PLCB_CF_STORABLE) { vspec->flags = PLCB_CF_STORABLE | PLCB_LF_STORABLE; vspec->need_free = 1; vspec->value = serialize_convert(object->cv_serialize, vspec->value, CONVERT_OUT); } else if (fmt == PLCB_CF_RAW) { vspec->flags = PLCB_CF_RAW | PLCB_LF_RAW; vspec->need_free = 0; if (!SvPOK(pv)) { die("Raw conversion requires string value!"); } } else if (vspec->spec == PLCB_CF_UTF8) { vspec->flags = PLCB_CF_UTF8 | PLCB_LF_UTF8; vspec->need_free = 0; sv_utf8_upgrade(pv); } else { die("Unrecognized flags used (0x%x) but no custom converted installed!", vspec->spec); } if (!vspec->need_free) { /* Use input as-is */ vspec->value = pv; } /* Assume the resultant value is an SV */ if (SvTYPE(vspec->value) == SVt_PV) { vspec->encoded = SvPVX(vspec->value); vspec->len = SvCUR(vspec->value); } else { vspec->encoded = SvPV(vspec->value, vspec->len); } }
static void report_event(PSTATE* p_state, event_id_t event, char *beg, char *end, U32 utf8, token_pos_t *tokens, int num_tokens, SV* self ) { struct p_handler *h; dTHX; dSP; AV *array; STRLEN my_na; char *argspec; char *s; #ifdef UNICODE_HTML_PARSER #define CHR_DIST(a,b) (utf8 ? utf8_distance((U8*)(a),(U8*)(b)) : (a) - (b)) #else #define CHR_DIST(a,b) ((a) - (b)) #endif /* capture offsets */ STRLEN offset = p_state->offset; STRLEN line = p_state->line; STRLEN column = p_state->column; #if 0 { /* used for debugging at some point */ char *s = beg; int i; /* print debug output */ switch(event) { case E_DECLARATION: printf("DECLARATION"); break; case E_COMMENT: printf("COMMENT"); break; case E_START: printf("START"); break; case E_END: printf("END"); break; case E_TEXT: printf("TEXT"); break; case E_PROCESS: printf("PROCESS"); break; case E_NONE: printf("NONE"); break; default: printf("EVENT #%d", event); break; } printf(" ["); while (s < end) { if (*s == '\n') { putchar('\\'); putchar('n'); } else putchar(*s); s++; } printf("] %d\n", end - beg); for (i = 0; i < num_tokens; i++) { printf(" token %d: %d %d\n", i, tokens[i].beg - beg, tokens[i].end - tokens[i].beg); } } #endif if (p_state->pending_end_tag && event != E_TEXT && event != E_COMMENT) { token_pos_t t; char dummy; t.beg = p_state->pending_end_tag; t.end = p_state->pending_end_tag + strlen(p_state->pending_end_tag); p_state->pending_end_tag = 0; report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self); SPAGAIN; } /* update offsets */ p_state->offset += CHR_DIST(end, beg); if (line) { char *s = beg; char *nl = NULL; while (s < end) { if (*s == '\n') { p_state->line++; nl = s; } s++; } if (nl) p_state->column = CHR_DIST(end, nl) - 1; else p_state->column += CHR_DIST(end, beg); } if (event == E_NONE) goto IGNORE_EVENT; #ifdef MARKED_SECTION if (p_state->ms == MS_IGNORE) goto IGNORE_EVENT; #endif /* tag filters */ if (p_state->ignore_tags || p_state->report_tags || p_state->ignore_elements) { if (event == E_START || event == E_END) { SV* tagname = p_state->tmp; assert(num_tokens >= 1); sv_setpvn(tagname, tokens[0].beg, tokens[0].end - tokens[0].beg); if (utf8) SvUTF8_on(tagname); else SvUTF8_off(tagname); if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ tagname); if (p_state->ignoring_element) { if (sv_eq(p_state->ignoring_element, tagname)) { if (event == E_START) p_state->ignore_depth++; else if (--p_state->ignore_depth == 0) { SvREFCNT_dec(p_state->ignoring_element); p_state->ignoring_element = 0; } } goto IGNORE_EVENT; } if (p_state->ignore_elements && hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0)) { p_state->ignoring_element = newSVsv(tagname); p_state->ignore_depth = 1; goto IGNORE_EVENT; } if (p_state->ignore_tags && hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0)) { goto IGNORE_EVENT; } if (p_state->report_tags && !hv_fetch_ent(p_state->report_tags, tagname, 0, 0)) { goto IGNORE_EVENT; } } else if (p_state->ignoring_element) { goto IGNORE_EVENT; } } h = &p_state->handlers[event]; if (!h->cb) { /* event = E_DEFAULT; */ h = &p_state->handlers[E_DEFAULT]; if (!h->cb) goto IGNORE_EVENT; } if (SvTYPE(h->cb) != SVt_PVAV && !SvTRUE(h->cb)) { /* FALSE scalar ('' or 0) means IGNORE this event */ return; } if (p_state->unbroken_text && event == E_TEXT) { /* should buffer text */ if (!p_state->pend_text) p_state->pend_text = newSV(256); if (SvOK(p_state->pend_text)) { if (p_state->is_cdata != p_state->pend_text_is_cdata) { flush_pending_text(p_state, self); SPAGAIN; goto INIT_PEND_TEXT; } } else { INIT_PEND_TEXT: p_state->pend_text_offset = offset; p_state->pend_text_line = line; p_state->pend_text_column = column; p_state->pend_text_is_cdata = p_state->is_cdata; sv_setpvn(p_state->pend_text, "", 0); if (!utf8) SvUTF8_off(p_state->pend_text); } #ifdef UNICODE_HTML_PARSER if (utf8 && !SvUTF8(p_state->pend_text)) sv_utf8_upgrade(p_state->pend_text); if (utf8 || !SvUTF8(p_state->pend_text)) { sv_catpvn(p_state->pend_text, beg, end - beg); } else { SV *tmp = newSVpvn(beg, end - beg); sv_utf8_upgrade(tmp); sv_catsv(p_state->pend_text, tmp); SvREFCNT_dec(tmp); } #else sv_catpvn(p_state->pend_text, beg, end - beg); #endif return; } else if (p_state->pend_text && SvOK(p_state->pend_text)) { flush_pending_text(p_state, self); SPAGAIN; } /* At this point we have decided to generate an event callback */ argspec = h->argspec ? SvPV(h->argspec, my_na) : ""; if (SvTYPE(h->cb) == SVt_PVAV) { if (*argspec == ARG_FLAG_FLAT_ARRAY) { argspec++; array = (AV*)h->cb; } else { /* start sub-array for accumulator array */ array = newAV(); } } else { array = 0; if (*argspec == ARG_FLAG_FLAT_ARRAY) argspec++; /* start argument stack for callback */ ENTER; SAVETMPS; PUSHMARK(SP); } for (s = argspec; *s; s++) { SV* arg = 0; int push_arg = 1; enum argcode argcode = (enum argcode)*s; switch( argcode ) { case ARG_SELF: arg = sv_mortalcopy(self); break; case ARG_TOKENS: if (num_tokens >= 1) { AV* av = newAV(); SV* prev_token = &PL_sv_undef; int i; av_extend(av, num_tokens); for (i = 0; i < num_tokens; i++) { if (tokens[i].beg) { prev_token = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); if (utf8) SvUTF8_on(prev_token); av_push(av, prev_token); } else { /* boolean */ av_push(av, p_state->bool_attr_val ? newSVsv(p_state->bool_attr_val) : newSVsv(prev_token)); } } arg = sv_2mortal(newRV_noinc((SV*)av)); } break; case ARG_TOKENPOS: if (num_tokens >= 1 && tokens[0].beg >= beg) { AV* av = newAV(); int i; av_extend(av, num_tokens*2); for (i = 0; i < num_tokens; i++) { if (tokens[i].beg) { av_push(av, newSViv(CHR_DIST(tokens[i].beg, beg))); av_push(av, newSViv(CHR_DIST(tokens[i].end, tokens[i].beg))); } else { /* boolean tag value */ av_push(av, newSViv(0)); av_push(av, newSViv(0)); } } arg = sv_2mortal(newRV_noinc((SV*)av)); } break; case ARG_TOKEN0: case ARG_TAGNAME: /* fall through */ case ARG_TAG: if (num_tokens >= 1) { arg = sv_2mortal(newSVpvn(tokens[0].beg, tokens[0].end - tokens[0].beg)); if (utf8) SvUTF8_on(arg); if (!CASE_SENSITIVE(p_state) && argcode != ARG_TOKEN0) sv_lower(aTHX_ arg); if (argcode == ARG_TAG && event != E_START) { char *e_type = "!##/#?#"; sv_insert(arg, 0, 0, &e_type[event], 1); } } break; case ARG_ATTR: case ARG_ATTRARR: if (event == E_START) { HV* hv; int i; if (argcode == ARG_ATTR) { hv = newHV(); arg = sv_2mortal(newRV_noinc((SV*)hv)); } else { #ifdef __GNUC__ /* gcc -Wall reports this variable as possibly used uninitialized */ hv = 0; #endif push_arg = 0; /* deal with argument pushing here */ } for (i = 1; i < num_tokens; i += 2) { SV* attrname = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); SV* attrval; if (utf8) SvUTF8_on(attrname); if (tokens[i+1].beg) { char *beg = tokens[i+1].beg; STRLEN len = tokens[i+1].end - beg; if (*beg == '"' || *beg == '\'') { assert(len >= 2 && *beg == beg[len-1]); beg++; len -= 2; } attrval = newSVpvn(beg, len); if (utf8) SvUTF8_on(attrval); if (!p_state->attr_encoded) { #ifdef UNICODE_HTML_PARSER if (p_state->utf8_mode) sv_utf8_decode(attrval); #endif decode_entities(aTHX_ attrval, p_state->entity2char, 0); if (p_state->utf8_mode) SvUTF8_off(attrval); } } else { /* boolean */ if (p_state->bool_attr_val) attrval = newSVsv(p_state->bool_attr_val); else attrval = newSVsv(attrname); } if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ attrname); if (argcode == ARG_ATTR) { if (hv_exists_ent(hv, attrname, 0) || !hv_store_ent(hv, attrname, attrval, 0)) { SvREFCNT_dec(attrval); } SvREFCNT_dec(attrname); } else { /* ARG_ATTRARR */ if (array) { av_push(array, attrname); av_push(array, attrval); } else { XPUSHs(sv_2mortal(attrname)); XPUSHs(sv_2mortal(attrval)); } } } } else if (argcode == ARG_ATTRARR) { push_arg = 0; } break; case ARG_ATTRSEQ: /* (v2 compatibility stuff) */ if (event == E_START) { AV* av = newAV(); int i; for (i = 1; i < num_tokens; i += 2) { SV* attrname = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); if (utf8) SvUTF8_on(attrname); if (!CASE_SENSITIVE(p_state)) sv_lower(aTHX_ attrname); av_push(av, attrname); } arg = sv_2mortal(newRV_noinc((SV*)av)); } break; case ARG_TEXT: arg = sv_2mortal(newSVpvn(beg, end - beg)); if (utf8) SvUTF8_on(arg); break; case ARG_DTEXT: if (event == E_TEXT) { arg = sv_2mortal(newSVpvn(beg, end - beg)); if (utf8) SvUTF8_on(arg); if (!p_state->is_cdata) { #ifdef UNICODE_HTML_PARSER if (p_state->utf8_mode) sv_utf8_decode(arg); #endif decode_entities(aTHX_ arg, p_state->entity2char, 1); if (p_state->utf8_mode) SvUTF8_off(arg); } } break; case ARG_IS_CDATA: if (event == E_TEXT) { arg = boolSV(p_state->is_cdata); } break; case ARG_SKIPPED_TEXT: arg = sv_2mortal(p_state->skipped_text); p_state->skipped_text = newSVpvn("", 0); break; case ARG_OFFSET: arg = sv_2mortal(newSViv(offset)); break; case ARG_OFFSET_END: arg = sv_2mortal(newSViv(offset + CHR_DIST(end, beg))); break; case ARG_LENGTH: arg = sv_2mortal(newSViv(CHR_DIST(end, beg))); break; case ARG_LINE: arg = sv_2mortal(newSViv(line)); break; case ARG_COLUMN: arg = sv_2mortal(newSViv(column)); break; case ARG_EVENT: assert(event >= 0 && event < EVENT_COUNT); arg = sv_2mortal(newSVpv(event_id_str[event], 0)); break; case ARG_LITERAL: { int len = (unsigned char)s[1]; arg = sv_2mortal(newSVpvn(s+2, len)); if (SvUTF8(h->argspec)) SvUTF8_on(arg); s += len + 1; } break; case ARG_UNDEF: arg = sv_mortalcopy(&PL_sv_undef); break; default: arg = sv_2mortal(newSVpvf("Bad argspec %d", *s)); break; } if (push_arg) { if (!arg) arg = sv_mortalcopy(&PL_sv_undef); if (array) { /* have to fix mortality here or add mortality to * XPUSHs after removing it from the switch cases. */ av_push(array, SvREFCNT_inc(arg)); } else { XPUSHs(arg); } } } if (array) { if (array != (AV*)h->cb) av_push((AV*)h->cb, newRV_noinc((SV*)array)); } else { PUTBACK; if ((enum argcode)*argspec == ARG_SELF && !SvROK(h->cb)) { char *method = SvPV(h->cb, my_na); perl_call_method(method, G_DISCARD | G_EVAL | G_VOID); } else { perl_call_sv(h->cb, G_DISCARD | G_EVAL | G_VOID); } if (SvTRUE(ERRSV)) { RETHROW; } FREETMPS; LEAVE; } if (p_state->skipped_text) SvCUR_set(p_state->skipped_text, 0); return; IGNORE_EVENT: if (p_state->skipped_text) { if (event != E_TEXT && p_state->pend_text && SvOK(p_state->pend_text)) flush_pending_text(p_state, self); #ifdef UNICODE_HTML_PARSER if (utf8 && !SvUTF8(p_state->skipped_text)) sv_utf8_upgrade(p_state->skipped_text); if (utf8 || !SvUTF8(p_state->skipped_text)) { #endif sv_catpvn(p_state->skipped_text, beg, end - beg); #ifdef UNICODE_HTML_PARSER } else { SV *tmp = newSVpvn(beg, end - beg); sv_utf8_upgrade(tmp); sv_catsv(p_state->pend_text, tmp); SvREFCNT_dec(tmp); } #endif } #undef CHR_DIST return; }