Beispiel #1
0
/*
 * Main parse routine for a buffer.
 * It assumes encoding and line numbering are already set up.
 * It can recurse directly (for invocations of user-defined
 * macros, inline equations, and input line traps)
 * and indirectly (for .so file inclusion).
 */
static void
mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start)
{
	const struct tbl_span	*span;
	struct buf	 ln;
	const char	*save_file;
	char		*cp;
	size_t		 pos; /* byte number in the ln buffer */
	enum rofferr	 rr;
	int		 of;
	int		 lnn; /* line number in the real file */
	int		 fd;
	unsigned char	 c;

	memset(&ln, 0, sizeof(ln));

	lnn = curp->line;
	pos = 0;

	while (i < blk.sz) {
		if (0 == pos && '\0' == blk.buf[i])
			break;

		if (start) {
			curp->line = lnn;
			curp->reparse_count = 0;

			if (lnn < 3 &&
			    curp->filenc & MPARSE_UTF8 &&
			    curp->filenc & MPARSE_LATIN1)
				curp->filenc = preconv_cue(&blk, i);
		}

		while (i < blk.sz && (start || blk.buf[i] != '\0')) {

			/*
			 * When finding an unescaped newline character,
			 * leave the character loop to process the line.
			 * Skip a preceding carriage return, if any.
			 */

			if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
			    '\n' == blk.buf[i + 1])
				++i;
			if ('\n' == blk.buf[i]) {
				++i;
				++lnn;
				break;
			}

			/*
			 * Make sure we have space for the worst
			 * case of 11 bytes: "\\[u10ffff]\0"
			 */

			if (pos + 11 > ln.sz)
				resize_buf(&ln, 256);

			/*
			 * Encode 8-bit input.
			 */

			c = blk.buf[i];
			if (c & 0x80) {
				if ( ! (curp->filenc && preconv_encode(
				    &blk, &i, &ln, &pos, &curp->filenc))) {
					mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
					    curp->line, pos, "0x%x", c);
					ln.buf[pos++] = '?';
					i++;
				}
				continue;
			}

			/*
			 * Exclude control characters.
			 */

			if (c == 0x7f || (c < 0x20 && c != 0x09)) {
				mandoc_vmsg(c == 0x00 || c == 0x04 ||
				    c > 0x0a ? MANDOCERR_CHAR_BAD :
				    MANDOCERR_CHAR_UNSUPP,
				    curp, curp->line, pos, "0x%x", c);
				i++;
				if (c != '\r')
					ln.buf[pos++] = '?';
				continue;
			}

			/* Trailing backslash = a plain char. */

			if (blk.buf[i] != '\\' || i + 1 == blk.sz) {
				ln.buf[pos++] = blk.buf[i++];
				continue;
			}

			/*
			 * Found escape and at least one other character.
			 * When it's a newline character, skip it.
			 * When there is a carriage return in between,
			 * skip that one as well.
			 */

			if ('\r' == blk.buf[i + 1] && i + 2 < blk.sz &&
			    '\n' == blk.buf[i + 2])
				++i;
			if ('\n' == blk.buf[i + 1]) {
				i += 2;
				++lnn;
				continue;
			}

			if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) {
				i += 2;
				/* Comment, skip to end of line */
				for (; i < blk.sz; ++i) {
					if ('\n' == blk.buf[i]) {
						++i;
						++lnn;
						break;
					}
				}

				/* Backout trailing whitespaces */
				for (; pos > 0; --pos) {
					if (ln.buf[pos - 1] != ' ')
						break;
					if (pos > 2 && ln.buf[pos - 2] == '\\')
						break;
				}
				break;
			}

			/* Catch escaped bogus characters. */

			c = (unsigned char) blk.buf[i+1];

			if ( ! (isascii(c) &&
			    (isgraph(c) || isblank(c)))) {
				mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
				    curp->line, pos, "0x%x", c);
				i += 2;
				ln.buf[pos++] = '?';
				continue;
			}

			/* Some other escape sequence, copy & cont. */

			ln.buf[pos++] = blk.buf[i++];
			ln.buf[pos++] = blk.buf[i++];
		}

		if (pos >= ln.sz)
			resize_buf(&ln, 256);

		ln.buf[pos] = '\0';

		/*
		 * A significant amount of complexity is contained by
		 * the roff preprocessor.  It's line-oriented but can be
		 * expressed on one line, so we need at times to
		 * readjust our starting point and re-run it.  The roff
		 * preprocessor can also readjust the buffers with new
		 * data, so we pass them in wholesale.
		 */

		of = 0;

		/*
		 * Maintain a lookaside buffer of all parsed lines.  We
		 * only do this if mparse_keep() has been invoked (the
		 * buffer may be accessed with mparse_getkeep()).
		 */

		if (curp->secondary) {
			curp->secondary->buf = mandoc_realloc(
			    curp->secondary->buf,
			    curp->secondary->sz + pos + 2);
			memcpy(curp->secondary->buf +
			    curp->secondary->sz,
			    ln.buf, pos);
			curp->secondary->sz += pos;
			curp->secondary->buf
				[curp->secondary->sz] = '\n';
			curp->secondary->sz++;
			curp->secondary->buf
				[curp->secondary->sz] = '\0';
		}
rerun:
		rr = roff_parseln(curp->roff, curp->line, &ln, &of);

		switch (rr) {
		case ROFF_REPARSE:
			if (REPARSE_LIMIT >= ++curp->reparse_count)
				mparse_buf_r(curp, ln, of, 0);
			else
				mandoc_msg(MANDOCERR_ROFFLOOP, curp,
				    curp->line, pos, NULL);
			pos = 0;
			continue;
		case ROFF_APPEND:
			pos = strlen(ln.buf);
			continue;
		case ROFF_RERUN:
			goto rerun;
		case ROFF_IGN:
			pos = 0;
			continue;
		case ROFF_SO:
			if ( ! (curp->options & MPARSE_SO) &&
			    (i >= blk.sz || blk.buf[i] == '\0')) {
				curp->sodest = mandoc_strdup(ln.buf + of);
				free(ln.buf);
				return;
			}
			/*
			 * We remove `so' clauses from our lookaside
			 * buffer because we're going to descend into
			 * the file recursively.
			 */
			if (curp->secondary)
				curp->secondary->sz -= pos + 1;
			save_file = curp->file;
			if ((fd = mparse_open(curp, ln.buf + of)) != -1) {
				mparse_readfd(curp, fd, ln.buf + of);
				close(fd);
				curp->file = save_file;
			} else {
				curp->file = save_file;
				mandoc_vmsg(MANDOCERR_SO_FAIL,
				    curp, curp->line, pos,
				    ".so %s", ln.buf + of);
				ln.sz = mandoc_asprintf(&cp,
				    ".sp\nSee the file %s.\n.sp",
				    ln.buf + of);
				free(ln.buf);
				ln.buf = cp;
				of = 0;
				mparse_buf_r(curp, ln, of, 0);
			}
			pos = 0;
			continue;
		default:
			break;
		}

		/*
		 * If input parsers have not been allocated, do so now.
		 * We keep these instanced between parsers, but set them
		 * locally per parse routine since we can use different
		 * parsers with each one.
		 */

		if (curp->man == NULL ||
		    curp->man->macroset == MACROSET_NONE)
			choose_parser(curp);

		/*
		 * Lastly, push down into the parsers themselves.
		 * If libroff returns ROFF_TBL, then add it to the
		 * currently open parse.  Since we only get here if
		 * there does exist data (see tbl_data.c), we're
		 * guaranteed that something's been allocated.
		 * Do the same for ROFF_EQN.
		 */

		if (rr == ROFF_TBL)
			while ((span = roff_span(curp->roff)) != NULL)
				roff_addtbl(curp->man, span);
		else if (rr == ROFF_EQN)
			roff_addeqn(curp->man, roff_eqn(curp->roff));
		else if ((curp->man->macroset == MACROSET_MDOC ?
		    mdoc_parseln(curp->man, curp->line, ln.buf, of) :
		    man_parseln(curp->man, curp->line, ln.buf, of)) == 2)
				break;

		/* Temporary buffers typically are not full. */

		if (0 == start && '\0' == blk.buf[i])
			break;

		/* Start the next input line. */

		pos = 0;
	}

	free(ln.buf);
}
Beispiel #2
0
/*
 * Main parse routine for an opened file.  This is called for each
 * opened file and simply loops around the full input file, possibly
 * nesting (i.e., with `so').
 */
static void
mparse_buf_r(struct mparse *curp, struct buf blk, int start)
{
	const struct tbl_span	*span;
	struct buf	 ln;
	enum rofferr	 rr;
	int		 i, of, rc;
	int		 pos; /* byte number in the ln buffer */
	int		 lnn; /* line number in the real file */
	unsigned char	 c;

	memset(&ln, 0, sizeof(struct buf));

	lnn = curp->line; 
	pos = 0; 

	for (i = 0; i < (int)blk.sz; ) {
		if (0 == pos && '\0' == blk.buf[i])
			break;

		if (start) {
			curp->line = lnn;
			curp->reparse_count = 0;
		}

		while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) {

			/*
			 * When finding an unescaped newline character,
			 * leave the character loop to process the line.
			 * Skip a preceding carriage return, if any.
			 */

			if ('\r' == blk.buf[i] && i + 1 < (int)blk.sz &&
			    '\n' == blk.buf[i + 1])
				++i;
			if ('\n' == blk.buf[i]) {
				++i;
				++lnn;
				break;
			}

			/* 
			 * Warn about bogus characters.  If you're using
			 * non-ASCII encoding, you're screwing your
			 * readers.  Since I'd rather this not happen,
			 * I'll be helpful and drop these characters so
			 * we don't display gibberish.  Note to manual
			 * writers: use special characters.
			 */

			c = (unsigned char) blk.buf[i];

			if ( ! (isascii(c) && 
					(isgraph(c) || isblank(c)))) {
				mandoc_msg(MANDOCERR_BADCHAR, curp,
						curp->line, pos, "ignoring byte");
				i++;
				continue;
			}

			/* Trailing backslash = a plain char. */

			if ('\\' != blk.buf[i] || i + 1 == (int)blk.sz) {
				if (pos >= (int)ln.sz)
					resize_buf(&ln, 256);
				ln.buf[pos++] = blk.buf[i++];
				continue;
			}

			/*
			 * Found escape and at least one other character.
			 * When it's a newline character, skip it.
			 * When there is a carriage return in between,
			 * skip that one as well.
			 */

			if ('\r' == blk.buf[i + 1] && i + 2 < (int)blk.sz &&
			    '\n' == blk.buf[i + 2])
				++i;
			if ('\n' == blk.buf[i + 1]) {
				i += 2;
				++lnn;
				continue;
			}

			if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) {
				i += 2;
				/* Comment, skip to end of line */
				for (; i < (int)blk.sz; ++i) {
					if ('\n' == blk.buf[i]) {
						++i;
						++lnn;
						break;
					}
				}

				/* Backout trailing whitespaces */
				for (; pos > 0; --pos) {
					if (ln.buf[pos - 1] != ' ')
						break;
					if (pos > 2 && ln.buf[pos - 2] == '\\')
						break;
				}
				break;
			}

			/* Some other escape sequence, copy & cont. */

			if (pos + 1 >= (int)ln.sz)
				resize_buf(&ln, 256);

			ln.buf[pos++] = blk.buf[i++];
			ln.buf[pos++] = blk.buf[i++];
		}

 		if (pos >= (int)ln.sz)
			resize_buf(&ln, 256);

		ln.buf[pos] = '\0';

		/*
		 * A significant amount of complexity is contained by
		 * the roff preprocessor.  It's line-oriented but can be
		 * expressed on one line, so we need at times to
		 * readjust our starting point and re-run it.  The roff
		 * preprocessor can also readjust the buffers with new
		 * data, so we pass them in wholesale.
		 */

		of = 0;

		/*
		 * Maintain a lookaside buffer of all parsed lines.  We
		 * only do this if mparse_keep() has been invoked (the
		 * buffer may be accessed with mparse_getkeep()).
		 */

		if (curp->secondary) {
			curp->secondary->buf = 
				mandoc_realloc
				(curp->secondary->buf, 
				 curp->secondary->sz + pos + 2);
			memcpy(curp->secondary->buf + 
					curp->secondary->sz, 
					ln.buf, pos);
			curp->secondary->sz += pos;
			curp->secondary->buf
				[curp->secondary->sz] = '\n';
			curp->secondary->sz++;
			curp->secondary->buf
				[curp->secondary->sz] = '\0';
		}
rerun:
		rr = roff_parseln
			(curp->roff, curp->line, 
			 &ln.buf, &ln.sz, of, &of);

		switch (rr) {
		case (ROFF_REPARSE):
			if (REPARSE_LIMIT >= ++curp->reparse_count)
				mparse_buf_r(curp, ln, 0);
			else
				mandoc_msg(MANDOCERR_ROFFLOOP, curp,
					curp->line, pos, NULL);
			pos = 0;
			continue;
		case (ROFF_APPEND):
			pos = (int)strlen(ln.buf);
			continue;
		case (ROFF_RERUN):
			goto rerun;
		case (ROFF_IGN):
			pos = 0;
			continue;
		case (ROFF_ERR):
			assert(MANDOCLEVEL_FATAL <= curp->file_status);
			break;
		case (ROFF_SO):
			/*
			 * We remove `so' clauses from our lookaside
			 * buffer because we're going to descend into
			 * the file recursively.
			 */
			if (curp->secondary) 
				curp->secondary->sz -= pos + 1;
			mparse_readfd_r(curp, -1, ln.buf + of, 1);
			if (MANDOCLEVEL_FATAL <= curp->file_status)
				break;
			pos = 0;
			continue;
		default:
			break;
		}

		/*
		 * If we encounter errors in the recursive parse, make
		 * sure we don't continue parsing.
		 */

		if (MANDOCLEVEL_FATAL <= curp->file_status)
			break;

		/*
		 * If input parsers have not been allocated, do so now.
		 * We keep these instanced between parsers, but set them
		 * locally per parse routine since we can use different
		 * parsers with each one.
		 */

		if ( ! (curp->man || curp->mdoc))
			pset(ln.buf + of, pos - of, curp);

		/* 
		 * Lastly, push down into the parsers themselves.  One
		 * of these will have already been set in the pset()
		 * routine.
		 * If libroff returns ROFF_TBL, then add it to the
		 * currently open parse.  Since we only get here if
		 * there does exist data (see tbl_data.c), we're
		 * guaranteed that something's been allocated.
		 * Do the same for ROFF_EQN.
		 */

		rc = -1;

		if (ROFF_TBL == rr)
			while (NULL != (span = roff_span(curp->roff))) {
				rc = curp->man ?
					man_addspan(curp->man, span) :
					mdoc_addspan(curp->mdoc, span);
				if (0 == rc)
					break;
			}
		else if (ROFF_EQN == rr)
			rc = curp->mdoc ? 
				mdoc_addeqn(curp->mdoc, 
					roff_eqn(curp->roff)) :
				man_addeqn(curp->man,
					roff_eqn(curp->roff));
		else if (curp->man || curp->mdoc)
			rc = curp->man ?
				man_parseln(curp->man, 
					curp->line, ln.buf, of) :
				mdoc_parseln(curp->mdoc, 
					curp->line, ln.buf, of);

		if (0 == rc) {
			assert(MANDOCLEVEL_FATAL <= curp->file_status);
			break;
		}

		/* Temporary buffers typically are not full. */

		if (0 == start && '\0' == blk.buf[i])
			break;

		/* Start the next input line. */

		pos = 0;
	}

	free(ln.buf);
}