Esempio n. 1
0
char *
Perl_sv_pvutf8n(pTHX_ SV *sv, STRLEN *lp)
{
    PERL_ARGS_ASSERT_SV_PVUTF8N;

    sv_utf8_upgrade(sv);
    return sv_pvn(sv,lp);
}
Esempio n. 2
0
char *
Perl_sv_pvutf8(pTHX_ SV *sv)
{
    PERL_ARGS_ASSERT_SV_PVUTF8;

    sv_utf8_upgrade(sv);
    return sv_pv(sv);
}
Esempio n. 3
0
void
plcb_convert_storage(PLCB_t *object, AV *docav, plcb_DOCVAL *vspec)
{
    SV *pv = SvROK(vspec->value) ? SvRV(vspec->value) : vspec->value;
    uint32_t fmt = vspec->spec;

    if (object->cv_customenc) {
        vspec->need_free = 1;
        vspec->value = custom_convert(docav, object->cv_customenc, vspec->value, &vspec->flags, CONVERT_OUT);

    } else if (fmt == PLCB_CF_JSON) {
        vspec->flags = PLCB_LF_JSON|PLCB_CF_JSON;
        vspec->need_free = 1;
        vspec->value = serialize_convert(object->cv_jsonenc, vspec->value, CONVERT_OUT);

    } else if (fmt == PLCB_CF_STORABLE) {
        vspec->flags = PLCB_CF_STORABLE | PLCB_LF_STORABLE;
        vspec->need_free = 1;
        vspec->value = serialize_convert(object->cv_serialize, vspec->value, CONVERT_OUT);

    } else if (fmt == PLCB_CF_RAW) {
        vspec->flags = PLCB_CF_RAW | PLCB_LF_RAW;
        vspec->need_free = 0;
        if (!SvPOK(pv)) {
            die("Raw conversion requires string value!");
        }
    } else if (vspec->spec == PLCB_CF_UTF8) {
        vspec->flags = PLCB_CF_UTF8 | PLCB_LF_UTF8;
        vspec->need_free = 0;
        sv_utf8_upgrade(pv);

    } else {
        die("Unrecognized flags used (0x%x) but no custom converted installed!", vspec->spec);
    }

    if (!vspec->need_free) {
        /* Use input as-is */
        vspec->value = pv;
    }

    /* Assume the resultant value is an SV */
    if (SvTYPE(vspec->value) == SVt_PV) {
        vspec->encoded = SvPVX(vspec->value);
        vspec->len = SvCUR(vspec->value);
    } else {
        vspec->encoded = SvPV(vspec->value, vspec->len);
    }
}
Esempio n. 4
0
static void
report_event(PSTATE* p_state,
	     event_id_t event,
	     char *beg, char *end, U32 utf8,
	     token_pos_t *tokens, int num_tokens,
	     SV* self
	    )
{
    struct p_handler *h;
    dTHX;
    dSP;
    AV *array;
    STRLEN my_na;
    char *argspec;
    char *s;

#ifdef UNICODE_HTML_PARSER
    #define CHR_DIST(a,b) (utf8 ? utf8_distance((U8*)(a),(U8*)(b)) : (a) - (b))
#else
    #define CHR_DIST(a,b) ((a) - (b))
#endif

    /* capture offsets */
    STRLEN offset = p_state->offset;
    STRLEN line = p_state->line;
    STRLEN column = p_state->column;

#if 0
    {  /* used for debugging at some point */
	char *s = beg;
	int i;

	/* print debug output */
	switch(event) {
	case E_DECLARATION: printf("DECLARATION"); break;
	case E_COMMENT:     printf("COMMENT"); break;
	case E_START:       printf("START"); break;
	case E_END:         printf("END"); break;
	case E_TEXT:        printf("TEXT"); break;
	case E_PROCESS:     printf("PROCESS"); break;
	case E_NONE:        printf("NONE"); break;
	default:            printf("EVENT #%d", event); break;
	}

	printf(" [");
	while (s < end) {
	    if (*s == '\n') {
		putchar('\\'); putchar('n');
	    }
	    else
		putchar(*s);
	    s++;
	}
	printf("] %d\n", end - beg);
	for (i = 0; i < num_tokens; i++) {
	    printf("  token %d: %d %d\n",
		   i,
		   tokens[i].beg - beg,
		   tokens[i].end - tokens[i].beg);
	}
    }
#endif

    if (p_state->pending_end_tag && event != E_TEXT && event != E_COMMENT) {
	token_pos_t t;
	char dummy;
	t.beg = p_state->pending_end_tag;
	t.end = p_state->pending_end_tag + strlen(p_state->pending_end_tag);
	p_state->pending_end_tag = 0;
	report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self);
	SPAGAIN;
    }

    /* update offsets */
    p_state->offset += CHR_DIST(end, beg);
    if (line) {
	char *s = beg;
	char *nl = NULL;
	while (s < end) {
	    if (*s == '\n') {
		p_state->line++;
		nl = s;
	    }
	    s++;
	}
	if (nl)
	    p_state->column = CHR_DIST(end, nl) - 1;
	else
	    p_state->column += CHR_DIST(end, beg);
    }

    if (event == E_NONE)
	goto IGNORE_EVENT;
    
#ifdef MARKED_SECTION
    if (p_state->ms == MS_IGNORE)
	goto IGNORE_EVENT;
#endif

    /* tag filters */
    if (p_state->ignore_tags || p_state->report_tags || p_state->ignore_elements) {

	if (event == E_START || event == E_END) {
	    SV* tagname = p_state->tmp;

	    assert(num_tokens >= 1);
	    sv_setpvn(tagname, tokens[0].beg, tokens[0].end - tokens[0].beg);
	    if (utf8)
		SvUTF8_on(tagname);
	    else
		SvUTF8_off(tagname);
	    if (!CASE_SENSITIVE(p_state))
		sv_lower(aTHX_ tagname);

	    if (p_state->ignoring_element) {
		if (sv_eq(p_state->ignoring_element, tagname)) {
		    if (event == E_START)
			p_state->ignore_depth++;
		    else if (--p_state->ignore_depth == 0) {
			SvREFCNT_dec(p_state->ignoring_element);
			p_state->ignoring_element = 0;
		    }
		}
		goto IGNORE_EVENT;
	    }

	    if (p_state->ignore_elements &&
		hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0))
	    {
		p_state->ignoring_element = newSVsv(tagname);
		p_state->ignore_depth = 1;
		goto IGNORE_EVENT;
	    }

	    if (p_state->ignore_tags &&
		hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0))
	    {
		goto IGNORE_EVENT;
	    }
	    if (p_state->report_tags &&
		!hv_fetch_ent(p_state->report_tags, tagname, 0, 0))
	    {
		goto IGNORE_EVENT;
	    }
	}
	else if (p_state->ignoring_element) {
	    goto IGNORE_EVENT;
	}
    }

    h = &p_state->handlers[event];
    if (!h->cb) {
	/* event = E_DEFAULT; */
	h = &p_state->handlers[E_DEFAULT];
	if (!h->cb)
	    goto IGNORE_EVENT;
    }

    if (SvTYPE(h->cb) != SVt_PVAV && !SvTRUE(h->cb)) {
	/* FALSE scalar ('' or 0) means IGNORE this event */
	return;
    }

    if (p_state->unbroken_text && event == E_TEXT) {
	/* should buffer text */
	if (!p_state->pend_text)
	    p_state->pend_text = newSV(256);
	if (SvOK(p_state->pend_text)) {
	    if (p_state->is_cdata != p_state->pend_text_is_cdata) {
		flush_pending_text(p_state, self);
		SPAGAIN;
		goto INIT_PEND_TEXT;
	    }
	}
	else {
	INIT_PEND_TEXT:
	    p_state->pend_text_offset = offset;
	    p_state->pend_text_line = line;
	    p_state->pend_text_column = column;
	    p_state->pend_text_is_cdata = p_state->is_cdata;
	    sv_setpvn(p_state->pend_text, "", 0);
	    if (!utf8)
		SvUTF8_off(p_state->pend_text);
	}
#ifdef UNICODE_HTML_PARSER
	if (utf8 && !SvUTF8(p_state->pend_text))
	    sv_utf8_upgrade(p_state->pend_text);
	if (utf8 || !SvUTF8(p_state->pend_text)) {
	    sv_catpvn(p_state->pend_text, beg, end - beg);
	}
	else {
	    SV *tmp = newSVpvn(beg, end - beg);
	    sv_utf8_upgrade(tmp);
	    sv_catsv(p_state->pend_text, tmp);
	    SvREFCNT_dec(tmp);
	}
#else
	sv_catpvn(p_state->pend_text, beg, end - beg);
#endif
	return;
    }
    else if (p_state->pend_text && SvOK(p_state->pend_text)) {
	flush_pending_text(p_state, self);
	SPAGAIN;
    }

    /* At this point we have decided to generate an event callback */

    argspec = h->argspec ? SvPV(h->argspec, my_na) : "";

    if (SvTYPE(h->cb) == SVt_PVAV) {
	
	if (*argspec == ARG_FLAG_FLAT_ARRAY) {
	    argspec++;
	    array = (AV*)h->cb;
	}
	else {
	    /* start sub-array for accumulator array */
	    array = newAV();
	}
    }
    else {
	array = 0;
	if (*argspec == ARG_FLAG_FLAT_ARRAY)
	    argspec++;

	/* start argument stack for callback */
	ENTER;
	SAVETMPS;
	PUSHMARK(SP);
    }

    for (s = argspec; *s; s++) {
	SV* arg = 0;
	int push_arg = 1;
	enum argcode argcode = (enum argcode)*s;

	switch( argcode ) {

	case ARG_SELF:
	    arg = sv_mortalcopy(self);
	    break;

	case ARG_TOKENS:
	    if (num_tokens >= 1) {
		AV* av = newAV();
		SV* prev_token = &PL_sv_undef;
		int i;
		av_extend(av, num_tokens);
		for (i = 0; i < num_tokens; i++) {
		    if (tokens[i].beg) {
			prev_token = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg);
			if (utf8)
			    SvUTF8_on(prev_token);
			av_push(av, prev_token);
		    }
		    else { /* boolean */
			av_push(av, p_state->bool_attr_val
				? newSVsv(p_state->bool_attr_val)
				: newSVsv(prev_token));
		    }
		}
		arg = sv_2mortal(newRV_noinc((SV*)av));
	    }
	    break;

	case ARG_TOKENPOS:
	    if (num_tokens >= 1 && tokens[0].beg >= beg) {
		AV* av = newAV();
		int i;
		av_extend(av, num_tokens*2);
		for (i = 0; i < num_tokens; i++) {
		    if (tokens[i].beg) {
			av_push(av, newSViv(CHR_DIST(tokens[i].beg, beg)));
			av_push(av, newSViv(CHR_DIST(tokens[i].end, tokens[i].beg)));
		    }
		    else { /* boolean tag value */
			av_push(av, newSViv(0));
			av_push(av, newSViv(0));
		    }
		}
		arg = sv_2mortal(newRV_noinc((SV*)av));
	    }
	    break;

	case ARG_TOKEN0:
	case ARG_TAGNAME:
	    /* fall through */

	case ARG_TAG:
	    if (num_tokens >= 1) {
		arg = sv_2mortal(newSVpvn(tokens[0].beg,
					  tokens[0].end - tokens[0].beg));
		if (utf8)
		    SvUTF8_on(arg);
		if (!CASE_SENSITIVE(p_state) && argcode != ARG_TOKEN0)
		    sv_lower(aTHX_ arg);
		if (argcode == ARG_TAG && event != E_START) {
		    char *e_type = "!##/#?#";
		    sv_insert(arg, 0, 0, &e_type[event], 1);
		}
	    }
	    break;

	case ARG_ATTR:
	case ARG_ATTRARR:
	    if (event == E_START) {
		HV* hv;
		int i;
		if (argcode == ARG_ATTR) {
		    hv = newHV();
		    arg = sv_2mortal(newRV_noinc((SV*)hv));
		}
		else {
#ifdef __GNUC__
		    /* gcc -Wall reports this variable as possibly used uninitialized */
		    hv = 0;
#endif
		    push_arg = 0;  /* deal with argument pushing here */
		}

		for (i = 1; i < num_tokens; i += 2) {
		    SV* attrname = newSVpvn(tokens[i].beg,
					    tokens[i].end-tokens[i].beg);
		    SV* attrval;

		    if (utf8)
			SvUTF8_on(attrname);
		    if (tokens[i+1].beg) {
			char *beg = tokens[i+1].beg;
			STRLEN len = tokens[i+1].end - beg;
			if (*beg == '"' || *beg == '\'') {
			    assert(len >= 2 && *beg == beg[len-1]);
			    beg++; len -= 2;
			}
			attrval = newSVpvn(beg, len);
			if (utf8)
			    SvUTF8_on(attrval);
			if (!p_state->attr_encoded) {
#ifdef UNICODE_HTML_PARSER
			    if (p_state->utf8_mode)
				sv_utf8_decode(attrval);
#endif
			    decode_entities(aTHX_ attrval, p_state->entity2char, 0);
			    if (p_state->utf8_mode)
				SvUTF8_off(attrval);
			}
		    }
		    else { /* boolean */
			if (p_state->bool_attr_val)
			    attrval = newSVsv(p_state->bool_attr_val);
			else
			    attrval = newSVsv(attrname);
		    }

		    if (!CASE_SENSITIVE(p_state))
			sv_lower(aTHX_ attrname);

		    if (argcode == ARG_ATTR) {
			if (hv_exists_ent(hv, attrname, 0) ||
			    !hv_store_ent(hv, attrname, attrval, 0)) {
			    SvREFCNT_dec(attrval);
			}
			SvREFCNT_dec(attrname);
		    }
		    else { /* ARG_ATTRARR */
			if (array) {
			    av_push(array, attrname);
			    av_push(array, attrval);
			}
			else {
			    XPUSHs(sv_2mortal(attrname));
			    XPUSHs(sv_2mortal(attrval));
			}
		    }
		}
	    }
	    else if (argcode == ARG_ATTRARR) {
		push_arg = 0;
	    }
	    break;

	case ARG_ATTRSEQ:       /* (v2 compatibility stuff) */
	    if (event == E_START) {
		AV* av = newAV();
		int i;
		for (i = 1; i < num_tokens; i += 2) {
		    SV* attrname = newSVpvn(tokens[i].beg,
					    tokens[i].end-tokens[i].beg);
		    if (utf8)
			SvUTF8_on(attrname);
		    if (!CASE_SENSITIVE(p_state))
			sv_lower(aTHX_ attrname);
		    av_push(av, attrname);
		}
		arg = sv_2mortal(newRV_noinc((SV*)av));
	    }
	    break;
	
	case ARG_TEXT:
	    arg = sv_2mortal(newSVpvn(beg, end - beg));
	    if (utf8)
		SvUTF8_on(arg);
	    break;

	case ARG_DTEXT:
	    if (event == E_TEXT) {
		arg = sv_2mortal(newSVpvn(beg, end - beg));
		if (utf8)
		    SvUTF8_on(arg);
		if (!p_state->is_cdata) {
#ifdef UNICODE_HTML_PARSER
		    if (p_state->utf8_mode)
			sv_utf8_decode(arg);
#endif
		    decode_entities(aTHX_ arg, p_state->entity2char, 1);
		    if (p_state->utf8_mode)
			SvUTF8_off(arg);
		}
	    }
	    break;
      
	case ARG_IS_CDATA:
	    if (event == E_TEXT) {
		arg = boolSV(p_state->is_cdata);
	    }
	    break;

        case ARG_SKIPPED_TEXT:
	    arg = sv_2mortal(p_state->skipped_text);
	    p_state->skipped_text = newSVpvn("", 0);
            break;

	case ARG_OFFSET:
	    arg = sv_2mortal(newSViv(offset));
	    break;

	case ARG_OFFSET_END:
	    arg = sv_2mortal(newSViv(offset + CHR_DIST(end, beg)));
	    break;

	case ARG_LENGTH:
	    arg = sv_2mortal(newSViv(CHR_DIST(end, beg)));
	    break;

	case ARG_LINE:
	    arg = sv_2mortal(newSViv(line));
	    break;

	case ARG_COLUMN:
	    arg = sv_2mortal(newSViv(column));
	    break;

	case ARG_EVENT:
	    assert(event >= 0 && event < EVENT_COUNT);
	    arg = sv_2mortal(newSVpv(event_id_str[event], 0));
	    break;

	case ARG_LITERAL:
	{
	    int len = (unsigned char)s[1];
	    arg = sv_2mortal(newSVpvn(s+2, len));
	    if (SvUTF8(h->argspec))
		SvUTF8_on(arg);
	    s += len + 1;
	}
	break;

	case ARG_UNDEF:
	    arg = sv_mortalcopy(&PL_sv_undef);
	    break;
      
	default:
	    arg = sv_2mortal(newSVpvf("Bad argspec %d", *s));
	    break;
	}

	if (push_arg) {
	    if (!arg)
		arg = sv_mortalcopy(&PL_sv_undef);

	    if (array) {
		/* have to fix mortality here or add mortality to
		 * XPUSHs after removing it from the switch cases.
		 */
		av_push(array, SvREFCNT_inc(arg));
	    }
	    else {
		XPUSHs(arg);
	    }
	}
    }

    if (array) {
	if (array != (AV*)h->cb)
	    av_push((AV*)h->cb, newRV_noinc((SV*)array));
    }
    else {
	PUTBACK;

	if ((enum argcode)*argspec == ARG_SELF && !SvROK(h->cb)) {
	    char *method = SvPV(h->cb, my_na);
	    perl_call_method(method, G_DISCARD | G_EVAL | G_VOID);
	}
	else {
	    perl_call_sv(h->cb, G_DISCARD | G_EVAL | G_VOID);
	}

	if (SvTRUE(ERRSV)) {
	    RETHROW;
	}

	FREETMPS;
	LEAVE;
    }
    if (p_state->skipped_text)
	SvCUR_set(p_state->skipped_text, 0);
    return;

IGNORE_EVENT:
    if (p_state->skipped_text) {
	if (event != E_TEXT && p_state->pend_text && SvOK(p_state->pend_text))
	    flush_pending_text(p_state, self);
#ifdef UNICODE_HTML_PARSER
	if (utf8 && !SvUTF8(p_state->skipped_text))
	    sv_utf8_upgrade(p_state->skipped_text);
	if (utf8 || !SvUTF8(p_state->skipped_text)) {
#endif
	    sv_catpvn(p_state->skipped_text, beg, end - beg);
#ifdef UNICODE_HTML_PARSER
	}
	else {
	    SV *tmp = newSVpvn(beg, end - beg);
	    sv_utf8_upgrade(tmp);
	    sv_catsv(p_state->pend_text, tmp);
	    SvREFCNT_dec(tmp);
	}
#endif
    }
#undef CHR_DIST    
    return;
}