Ejemplo n.º 1
0
void op_fnp1(mval *src, int delim, int trgpcidx,  mval *dst)
{
	unsigned char	*first, *last, *start, *end;
	unsigned int	*pcoff, *pcoffmax, fnpc_indx, slen;
	int		trgpc, cpcidx, spcidx, mblen, dlmlen;
	boolean_t       valid_char;
	mval		ldst;		/* Local copy since &dst == &src .. move to dst at return */
	fnpc   		*cfnpc;
	delimfmt	ldelim;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(gtm_utf8_mode);
	MV_FORCE_STR(src);
	ldelim.unichar_val = delim;
	if (!UTF8_VALID(ldelim.unibytes_val, ldelim.unibytes_val + SIZEOF(ldelim.unibytes_val), dlmlen) &&
			!badchar_inhibit)
	{ /* The delimiter is a bad character so error out if badchar not inhibited */
		UTF8_BADCHAR(0, ldelim.unibytes_val, ldelim.unibytes_val + SIZEOF(ldelim.unibytes_val), 0, NULL);
	}

	ldst.mvtype = MV_STR;
	start = first = last = (unsigned char *)src->str.addr;
	slen = src->str.len;
	end = start + slen;

	/* Detect annoyance cases and deal with quickly so we don't muck up the
	   logic below trying to handle it properly */
	if (0 >= trgpcidx || 0 == slen)
	{
		ldst.str.addr = (char *)start;
		ldst.str.len  = 0;
		*dst = ldst;
		return;
	}

	/* Test mval for valid cache: index ok, mval addr same, delim same. One additional test
	 * is if the cache entry is byte_oriented, then this cache entry was created by $ZPIECE
	 * (using bytes) and since its results are not same as $PIECE(), we must ignore the cache
	 * and rebuild it for this mval. */
	fnpc_indx = src->fnpc_indx - 1;
	cfnpc = &(TREF(fnpca)).fnpcs[fnpc_indx];
	if (FNPC_MAX > fnpc_indx && cfnpc->last_str.addr == (char *)first &&
	    cfnpc->last_str.len == slen && cfnpc->delim == ldelim.unichar_val &&
	    !cfnpc->byte_oriented) /* cannot use the cache created by an earlier $ZPIECE() */
	{
		/* Have valid cache. See if piece we want already in cache */
		COUNT_EVENT(hit);
		INCR_COUNT(pskip, cfnpc->npcs);

		if (trgpcidx <= cfnpc->npcs)
		{
			/* Piece is totally in cache no scan needed */
			ldst.str.addr = (char *)first + cfnpc->pstart[trgpcidx - 1];
			ldst.str.len = cfnpc->pstart[trgpcidx] - cfnpc->pstart[trgpcidx - 1] - dlmlen;
			assert(ldst.str.len >= 0 && ldst.str.len <= src->str.len);
			*dst = ldst;
			return;
		} else
		{
			/* Not in cache but pick up scan where we left off */
			cpcidx = cfnpc->npcs;
			first = last = start + cfnpc->pstart[cpcidx];	/* First byte of next pc */
			pcoff = &cfnpc->pstart[cpcidx];
			if (pcoff == cfnpc->pcoffmax)
				++pcoff; 		/* No further updates to pstart array */
			++cpcidx;			/* Now past last piece and on to next one */
			COUNT_EVENT(parscan);
		}
	} else
	{
		/* The piece cache index or mval validation was incorrect.
		   Start from the beginning */

		COUNT_EVENT(miss);

		/* Need to steal a new piece cache, get "least recently reused" */
		cfnpc = (TREF(fnpca)).fnpcsteal;	/* Get next element to steal */
		if ((TREF(fnpca)).fnpcmax < cfnpc)
			cfnpc = &(TREF(fnpca)).fnpcs[0];
		(TREF(fnpca)).fnpcsteal = cfnpc + 1;	/* -> next element to steal */

		cfnpc->last_str = src->str;		/* Save validation info */
		cfnpc->delim = ldelim.unichar_val;
		cfnpc->npcs = 0;
		cfnpc->byte_oriented = FALSE;
		src->fnpc_indx = cfnpc->indx + 1;	/* Save where we are putting this element
							   (1 based index in mval so 0 isn't so common) */
		pcoff = &cfnpc->pstart[0];
		cpcidx = 1;				/* current piece index */
	}

	/* Do scan filling in offsets of pieces if they fit in the cache */
	spcidx = cpcidx;				/* Starting value for search */
	pcoffmax = cfnpc->pcoffmax;			/* Local end of array value */
	while (cpcidx <= trgpcidx && last < end)
	{
		/* Once through for each piece we pass, last time through to find length of piece we want */
		first = last;				/* first char of current piece */
		while (last < end)
		{
			valid_char = UTF8_VALID(last, end, mblen);	/* Length of next char */
			if (!valid_char)
			{	/* Next character is not valid unicode. If badchar error is not inhibited,
				   signal it now. If it is inhibited, just treat the character as a single
				   character and continue.
				*/
				if (!badchar_inhibit)
					utf8_badchar(0, last, end, 0, NULL);
				assert(1 == mblen);
			}
			/* Getting mblen first allows us to do quick length compare before the
			   heavier weight memcmp call.
			*/
			assert(0 < mblen);
			if (mblen == dlmlen)
			{
				if (1 == dlmlen)
				{
					if (*last == ldelim.unibytes_val[0])			/* Shortcut - test single byte */
						break;
				} else if (0 == memcmp(last, ldelim.unibytes_val, dlmlen))	/* Longcut - for multibyte chk */
					break;
			}
			last += mblen;  	/* Find delim signaling end of piece */
		}
		last += dlmlen;				/* Bump past delim to first byte of next piece. The length of
							   the delimiter is assumed in the pcoff array and is removed
							   when piece length is calculated so even if we hit the end of
							   the scanned source, we bump the pointer so this extra length
							   is reflected in the pcoff array consistently.
							*/
		++cpcidx;				/* Next piece */
		if (pcoff < pcoffmax)
			*pcoff++ =(unsigned int)(first - start);	/* offset to this piece */
		if (pcoff == pcoffmax)
			*pcoff++ = (unsigned int)(last - start);	/* store start of first piece beyond what is in cache */
	}

	ldst.str.addr = (char *)first;

	/* If we scanned some chars, adjust end pointer and save end of final piece */
	if (spcidx != cpcidx)
	{
		if (pcoff < pcoffmax)
			*pcoff = (unsigned int)(last - start);		/* If not at end of cache, save start of "next" piece */

		last -= dlmlen;				/* Undo bump past last delim (existing or not)
							   of piece for accurate string len */
		/* Update count of pieces in cache */
		cfnpc->npcs = MIN((cfnpc->npcs + cpcidx - spcidx), FNPC_ELEM_MAX);
		assert(cfnpc->npcs <= FNPC_ELEM_MAX);
		assert(cfnpc->npcs > 0);

		/* If we the above loop ended prematurely because we ran out of text, we return null string */
		if (trgpcidx < cpcidx)
			ldst.str.len = INTCAST(last - first);	/* Length of piece we located */
		else
			ldst.str.len = 0;

		INCR_COUNT(pscan, cpcidx - spcidx);	/* Pieces scanned */
	} else
		ldst.str.len  = 0;

	assert(cfnpc->npcs > 0);
	assert(ldst.str.len >= 0 && ldst.str.len <= src->str.len);
	*dst = ldst;
	return;
}
Ejemplo n.º 2
0
/*
 * ----------------------------------------------------------
 * Fast path setpiece when delimiter is one (lit) char replacing
 * a single piece (last is same as first). Unicode flavor.
 *
 * Arguments:
 *	src	- source mval
 *	delim	- delimiter char
 *	expr	- expression string mval
 *	ind	- index in source mval to be set
 *	dst	- destination mval where the result is saved.
 *
 * Return:
 *	none
 * ----------------------------------------------------------
 */
void op_setp1(mval *src, int delim, mval *expr, int ind, mval *dst)
{
	size_t		str_len, delim_cnt;
	int		len, pfx_str_len, sfx_start_offset, sfx_str_len, rep_str_len, pfx_scan_offset;
	int		dlmlen, cpy_cache_lines, mblen;
	unsigned char	*start_sfx, *str_addr, *end_pfx, *end_src, *start_pfx;
	boolean_t	do_scan, delim_last_scan, valid_char;
	mval		dummymval;	/* It's value is not used but is part of the call to op_fnp1() */
	fnpc		*cfnpc, *pfnpc;
	delimfmt	ldelim;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(gtm_utf8_mode);
	do_scan = FALSE;
	cpy_cache_lines = -1;
	ldelim.unichar_val = delim;
        if (!UTF8_VALID(ldelim.unibytes_val, (ldelim.unibytes_val + SIZEOF(ldelim.unibytes_val)), dlmlen)
	    && !badchar_inhibit)
	{	/* The delimiter is a bad character so error out if badchar not inhibited */
		UTF8_BADCHAR(0, ldelim.unibytes_val, ldelim.unibytes_val + SIZEOF(ldelim.unibytes_val), 0, NULL);
	}
	MV_FORCE_STR(expr);	/* Expression to put into piece place */
	if (MV_DEFINED(src))
	{
		/* We have 3 possible scenarios:
		 * 1) The source string is null. Nothing to do but proceed to building output.
		 * 2) If the requested piece is larger than can be cached by op_fnp1, call fnp1
		 *    for the maximum piece possible, use the cache info to "prime the pump" and
		 *    then process the rest of the string ourselves.
		 * 3) If the requested piece can be obtained from the cache, call op_fnp1 to validate
		 *    and rebuild the cache if necessary and then retrieve the necessary info from
		 *    the fnpc cache.
		 */
		MV_FORCE_STR(src);	/* Make sure is string prior to length check */
		if (0 == src->str.len)
		{	/* We have a null source string */
			pfx_str_len = sfx_str_len = sfx_start_offset = 0;
			delim_cnt = (0 < ind) ? (size_t)ind - 1 : 0;
		} else if (FNPC_ELEM_MAX >= ind)
		{	/* 3) Best of all possible cases. The op_fnp1 can do most of our work for us
			 *    and we can preload the cache on the new string to help its subsequent
			 *    uses along as well.
			 */
			SETWON;
			op_fnp1(src, delim, ind, &dummymval);
			SETWOFF;
			cfnpc = &(TREF(fnpca)).fnpcs[src->fnpc_indx - 1];
			assert(cfnpc->last_str.addr == src->str.addr);
			assert(cfnpc->last_str.len == src->str.len);
			assert(cfnpc->delim == delim);
			assert(0 < cfnpc->npcs);
			/* Three more scenarios: #1 piece all in cache, #2 piece would be in cache but ran
			 * out of text or #3 piece is beyond what can be cached
			 */
			if (cfnpc->npcs >= ind)
			{	/* #1 The piece we want is totally within the cache which is good news */
				pfx_str_len = cfnpc->pstart[ind - 1];
				delim_cnt = 0;
				sfx_start_offset = cfnpc->pstart[ind] - dlmlen;				/* Include delimiter */
				rep_str_len = cfnpc->pstart[ind] - cfnpc->pstart[ind - 1] - dlmlen;	/* Replace string length */
				sfx_str_len = src->str.len - pfx_str_len - rep_str_len;
				cpy_cache_lines = ind - 1;
			} else
			{	/* #2 The string was too short so the cache does not contain our string. This means
				 * that the prefix becomes any text that IS in the cache and we set the delim_cnt
				 * to be the number of missing pieces so the delimiters can be put in as part of the
				 * prefix when we build the new string.
				 */
				pfx_str_len = cfnpc->pstart[cfnpc->npcs] - dlmlen;
				delim_cnt = (size_t)(ind - cfnpc->npcs);
				sfx_start_offset = 0;
				sfx_str_len = 0;
				cpy_cache_lines = cfnpc->npcs;
			}
		} else
		{	/* 2) We have a element that would not be able to be in the fnpc cache. Go ahead
			 *    and call op_fnp1 to get cache info up to the maximum and then we will continue
			 *    the scan on our own.
			 */
			SETWON;
			op_fnp1(src, delim, FNPC_ELEM_MAX, &dummymval);
			SETWOFF;
			cfnpc = &(TREF(fnpca)).fnpcs[src->fnpc_indx - 1];
			assert(cfnpc->last_str.addr == src->str.addr);
			assert(cfnpc->last_str.len == src->str.len);
			assert(cfnpc->delim == delim);
			assert(0 < cfnpc->npcs);
			if (FNPC_ELEM_MAX > cfnpc->npcs)
			{	/* We ran out of text so the scan is complete. This is basically the same
				 * as case #2 above.
				 */
				pfx_str_len = cfnpc->pstart[cfnpc->npcs] - dlmlen;
				delim_cnt = (size_t)(ind - cfnpc->npcs);
				sfx_start_offset = 0;
				sfx_str_len = 0;
				cpy_cache_lines = cfnpc->npcs;
			} else
			{	/* We have a case where the piece we want cannot be kept in cache. In the special
				 * case where there is no more text to handle, we don't need to scan further. Otherwise
				 * we prime the pump and continue the scan where the cache left off.
				 */
				if ((pfx_scan_offset = cfnpc->pstart[FNPC_ELEM_MAX]) < src->str.len)	/* Note assignment */
					/* Normal case where we prime the pump */
					do_scan = TRUE;
				else
				{	/* Special case -- no more text to scan */
					pfx_str_len = cfnpc->pstart[FNPC_ELEM_MAX] - dlmlen;
					sfx_start_offset = 0;
					sfx_str_len = 0;
				}
				delim_cnt = (size_t)ind - FNPC_ELEM_MAX;
				cpy_cache_lines = FNPC_ELEM_MAX;
			}
		}
	} else
	{	/* Source is not defined -- treat as a null string */
		pfx_str_len = sfx_str_len = sfx_start_offset = 0;
		delim_cnt = (size_t)ind - 1;
	}
	/* If we have been forced to do our own scan, do that here. Note the variable pfx_scan_offset has been
	 * set to where the scan should begin in the src string and delim_cnt has been set to how many delimiters
	 * still need to be processed.
	 */
	if (do_scan)
	{	/* Scan the line isolating prefix piece, and end of the
		 * piece being replaced
		 */
		COUNT_EVENT(small);
		end_pfx = start_sfx = (unsigned char *)src->str.addr + pfx_scan_offset;
		end_src = (unsigned char *)src->str.addr + src->str.len;
		/* The compiler would unroll this loop this way anyway but we want to
		 * adjust the start_sfx pointer after the loop but only if we have gone
		 * into it at least once.
		 */
		if ((0 < delim_cnt) && (start_sfx < end_src))
		{
			do
			{
				end_pfx = start_sfx;
				delim_last_scan = FALSE;		/* Whether delimiter is last character scanned */
				while (start_sfx < end_src)
				{
					valid_char = UTF8_VALID(start_sfx, end_src, mblen); /* Length of next char */
					if (!valid_char)
					{	/* Next character is not valid unicode. If badchar error is not inhibited,
						 * signal it now. If it is inhibited, just treat the character as a single
						 * character and continue.
						 */
						if (!badchar_inhibit)
							utf8_badchar(0, start_sfx, end_src, 0, NULL);
						assert(1 == mblen);
					}
					/* Getting mblen first allows us to do quick length compare before the
					 * heavier weight memcmp call.
					 */
					assert(0 < mblen);
					if (mblen == dlmlen && 0 == memcmp(start_sfx, ldelim.unibytes_val, dlmlen))
					{
						delim_last_scan = TRUE;
						break;
					}
					/* Increment ptrs by size of found char */
					start_sfx += mblen;
				}
				start_sfx += dlmlen;
				delim_cnt--;
			} while ((0 < delim_cnt) && (start_sfx < end_src));
			/* We have to backup up the suffix start pointer except under the condition
			 * that the last character in the buffer is the last delimiter we were looking
			 * for.
			 */
			if ((0 == delim_cnt) || (start_sfx < end_src) || !delim_last_scan)
				start_sfx -= dlmlen;			/* Back up suffix to include delimiter char */
			/* If we scanned to the end (no text left) and still have delimiters to
			 * find, the entire src text should be part of the prefix
			 */
			if ((start_sfx >= end_src) && (0 < delim_cnt))
			{
				end_pfx = start_sfx;
				if (delim_last_scan)			/* if last char was delim, reduce delim cnt */
					--delim_cnt;
			}
		} else
		{
			/* If not doing any token finding, then this count becomes the number
			 * of tokens to output. Adjust accordingly.
			 */
			if (0 < delim_cnt)
				--delim_cnt;
		}
		INCR_COUNT(small_pcs, (int)((size_t)ind - delim_cnt));
		/* Now having the following situation:
		 * end_pfx	-> end of the prefix piece including delimiter
		 * start_sfx	-> start of suffix piece (with delimiter) or = end_pfx/src->str.addr if none
		 */
		pfx_str_len = (int)(end_pfx - (unsigned char *)src->str.addr);
		if (0 > pfx_str_len)
			pfx_str_len = 0;
		sfx_start_offset = (int)(start_sfx - (unsigned char *)src->str.addr);
		sfx_str_len = src->str.len - sfx_start_offset;
		if (0 > sfx_str_len)
			sfx_str_len = 0;
	}
	/* Calculate total string len. delim_cnt has needed padding delimiters for null fields */
	str_len = (size_t)expr->str.len + (size_t)pfx_str_len + (delim_cnt * (size_t)dlmlen) + (size_t)sfx_str_len;
	if (MAX_STRLEN < str_len)
		rts_error_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_MAXSTRLEN);
	ENSURE_STP_FREE_SPACE((int)str_len);
	str_addr = stringpool.free;
	start_pfx = (unsigned char *)src->str.addr;
	/* copy prefix */
	if (0 < pfx_str_len)
	{
		memcpy(str_addr, src->str.addr, pfx_str_len);
		str_addr += pfx_str_len;
	}
	/* copy delimiters */
	while (0 < delim_cnt--)
	{
		memcpy(str_addr, ldelim.unibytes_val, dlmlen);
		str_addr += dlmlen;
	}
	/* copy expression */
	if (0 < expr->str.len)
	{
		memcpy(str_addr, expr->str.addr, expr->str.len);
		str_addr += expr->str.len;
	}
	/* copy suffix */
	if (0 < sfx_str_len)
	{
		memcpy(str_addr, start_pfx + sfx_start_offset, sfx_str_len);
		str_addr += sfx_str_len;
	}
	assert(IS_AT_END_OF_STRINGPOOL(str_addr, -str_len));
	dst->mvtype = MV_STR;
	dst->str.len = INTCAST(str_addr - stringpool.free);
	dst->str.addr = (char *)stringpool.free;
	stringpool.free = str_addr;
	/* If available, update the cache information for this newly created mval to hopefully
	 * give it a head start on its next usage. Note that we can only copy over the cache info
	 * for the prefix. We cannot include information for the 'expression' except where it starts
	 * because the expression could itself contain delimiters that would be found on a rescan.
	 */
	if (0 < cpy_cache_lines)
	{
		pfnpc = cfnpc;				/* pointer for src mval's cache */
		do
		{
			cfnpc = (TREF(fnpca)).fnpcsteal;	/* Next cache element to steal */
			if ((TREF(fnpca)).fnpcmax < cfnpc)
				cfnpc = &(TREF(fnpca)).fnpcs[0];
			(TREF(fnpca)).fnpcsteal = cfnpc + 1;	/* -> next element to steal */
		} while (cfnpc == pfnpc);		/* Make sure we don't step on ourselves */
		cfnpc->last_str = dst->str;		/* Save validation info */
		cfnpc->delim = delim;
		cfnpc->npcs = cpy_cache_lines;
		dst->fnpc_indx = cfnpc->indx + 1;	/* Save where we are putting this element
							 * (1 based index in mval so 0 isn't so common)
							 */
		memcpy(&cfnpc->pstart[0], &pfnpc->pstart[0], (cfnpc->npcs + 1) * SIZEOF(unsigned int));
	} else
Ejemplo n.º 3
0
/*
 * ----------------------------------------------------------
 * Set version of $extract
 *
 * Arguments:
 *	src	- source mval
 *	expr	- expression string mval to be inserted into source
 *	schar	- starting character index to be replaced
 *	echar	- ending character index to be replaced
 *	dst	- destination mval where the result is saved.
 *
 * Return:
 *	none
 * ----------------------------------------------------------
 */
void op_setextract(mval *src, mval *expr, int schar, int echar, mval *dst)
{
	int		srclen, padlen, pfxlen, sfxoff, sfxlen, dstlen, bytelen, skip, char_len;
	unsigned char	*srcptr, *srcbase, *srctop, *straddr;

	error_def(ERR_MAXSTRLEN);

	padlen = pfxlen = sfxlen = 0;
	MV_FORCE_STR(expr);	/* Expression to put into piece place */
	if (MV_DEFINED(src))
	{
		MV_FORCE_STR(src);	/* Make sure is string prior to length check */
		srclen = src->str.len;
	} else	/* Source is not defined -- treat as a null string */
		srclen = 0;
	schar = MAX(schar, 1);	/* schar starts at 1 at a minimum */

	/* There are four cases in the spec:
	   1) schar > echar or echar < 1 -- glvn and naked indicator are not changed. This is
	                                    handled by generated code in m_set
	   2) echar >= schar-1 > $L(src) -- dst = src_$J("",schar-1-$L(src))_expr
	   3) schar-1 <= $L(src) < echar -- dst = $E(src,1,schar-1)_expr
	   4) All others                 -- dst = $E(src,1,schar-1)_expr_$E(src,echar+1,$L(src))
	*/
	srcbase = (unsigned char *)src->str.addr;
	srctop = srcbase + srclen;
	for (srcptr = srcbase, skip = schar - 1; (skip > 0 && srcptr < srctop); --skip)
	{ /* skip the first schar - 1 characters */
		if (!UTF8_VALID(srcptr, srctop, bytelen) && !badchar_inhibit)
			utf8_badchar(0, srcptr, srctop, 0, NULL);
		srcptr += bytelen;
	}
	pfxlen = (int)(srcptr - srcbase);
	if (skip > 0)
	{ /* Case #2: schar is past the string length. echar test handled in generated code.
	     Should be padded with as many spaces as characters remained to be skipped */
		padlen = skip;
	}
	for (skip = echar - schar + 1; (skip > 0 && srcptr < srctop); --skip)
	{ /* skip up to the character position echar */
		if (!UTF8_VALID(srcptr, srctop, bytelen) && !badchar_inhibit)
			utf8_badchar(0, srcptr, srctop, 0, NULL);
		srcptr += bytelen;
	}
	char_len = 0;
	if (skip <= 0)
	{ /* Case #4: echar is within the string length, suffix to be added */
		sfxoff = (int)(srcptr - srcbase);
		sfxlen = (int)(srctop - srcptr);
		if (!badchar_inhibit && sfxlen > 0)
		{ /* validate the suffix, and we can compute char_len as well */
			for (; (srcptr < srctop); ++char_len)
			{
				if (!UTF8_VALID(srcptr, srctop, bytelen))
					utf8_badchar(0, srcptr, srctop, 0, NULL);
				srcptr += bytelen;
			}
			MV_FORCE_LEN(expr);
			char_len += schar - 1 + expr->str.char_len;
		}
	}

	/* Calculate total string len */
	dstlen = pfxlen + padlen + expr->str.len + sfxlen;
	if (dstlen > MAX_STRLEN)
		rts_error(VARLSTCNT(1) ERR_MAXSTRLEN);
	ENSURE_STP_FREE_SPACE(dstlen);

	srcbase = (unsigned char *)src->str.addr;
	straddr = stringpool.free;

	if (0 < pfxlen)
	{ /* copy prefix */
		memcpy(straddr, srcbase, pfxlen);
		straddr += pfxlen;
	}
	if (0 < padlen)
	{ /* insert padding */
		memset(straddr, ' ', padlen);
		straddr += padlen;
	}
	if (0 < expr->str.len)
	{ /* copy expression */
		memcpy(straddr, expr->str.addr, expr->str.len);
		straddr += expr->str.len;
	}
	if (0 < sfxlen)
	{ /* copy suffix */
		memcpy(straddr, srcbase + sfxoff, sfxlen);
		straddr += sfxlen;
	}
	assert(straddr - stringpool.free == dstlen);
	MV_INIT_STRING(dst, straddr - stringpool.free, (char *)stringpool.free);
	if (0 < char_len)
	{
		dst->mvtype |= MV_UTF_LEN;
		dst->str.char_len = char_len;
	}
	stringpool.free = straddr;
}