static void pdesc(struct mparse *curp, const char *file, int fd) { struct buf blk; int with_mmap; /* * Run for each opened file; may be called more than once for * each full parse sequence if the opened file is nested (i.e., * from `so'). Simply sucks in the whole file and moves into * the parse phase for the file. */ if ( ! read_whole_file(file, fd, &blk, &with_mmap)) { curp->file_status = MANDOCLEVEL_SYSERR; return; } /* Line number is per-file. */ curp->line = 1; mparse_buf_r(curp, blk, 1); if (with_mmap) munmap(blk.buf, blk.sz); else free(blk.buf); }
static void mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file) { struct buf *svprimary; const char *svfile; size_t offset; static int recursion_depth; if (64 < recursion_depth) { mandoc_msg(MANDOCERR_ROFFLOOP, curp, curp->line, 0, NULL); return; } /* Line number is per-file. */ svfile = curp->file; curp->file = file; svprimary = curp->primary; curp->primary = &blk; curp->line = 1; recursion_depth++; /* Skip an UTF-8 byte order mark. */ if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 && (unsigned char)blk.buf[0] == 0xef && (unsigned char)blk.buf[1] == 0xbb && (unsigned char)blk.buf[2] == 0xbf) { offset = 3; curp->filenc &= ~MPARSE_LATIN1; } else offset = 0; mparse_buf_r(curp, blk, offset, 1); if (--recursion_depth == 0) mparse_end(curp); curp->primary = svprimary; curp->file = svfile; }
static void mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file) { const char *svfile; static int recursion_depth; if (64 < recursion_depth) { mandoc_msg(MANDOCERR_ROFFLOOP, curp, curp->line, 0, NULL); return; } /* Line number is per-file. */ svfile = curp->file; curp->file = file; curp->line = 1; recursion_depth++; mparse_buf_r(curp, blk, 1); if (0 == --recursion_depth && MANDOCLEVEL_FATAL > curp->file_status) mparse_end(curp); curp->file = svfile; }
/* * Main parse routine for a buffer. * It assumes encoding and line numbering are already set up. * It can recurse directly (for invocations of user-defined * macros, inline equations, and input line traps) * and indirectly (for .so file inclusion). */ static void mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start) { const struct tbl_span *span; struct buf ln; const char *save_file; char *cp; size_t pos; /* byte number in the ln buffer */ enum rofferr rr; int of; int lnn; /* line number in the real file */ int fd; unsigned char c; memset(&ln, 0, sizeof(ln)); lnn = curp->line; pos = 0; while (i < blk.sz) { if (0 == pos && '\0' == blk.buf[i]) break; if (start) { curp->line = lnn; curp->reparse_count = 0; if (lnn < 3 && curp->filenc & MPARSE_UTF8 && curp->filenc & MPARSE_LATIN1) curp->filenc = preconv_cue(&blk, i); } while (i < blk.sz && (start || blk.buf[i] != '\0')) { /* * When finding an unescaped newline character, * leave the character loop to process the line. * Skip a preceding carriage return, if any. */ if ('\r' == blk.buf[i] && i + 1 < blk.sz && '\n' == blk.buf[i + 1]) ++i; if ('\n' == blk.buf[i]) { ++i; ++lnn; break; } /* * Make sure we have space for the worst * case of 11 bytes: "\\[u10ffff]\0" */ if (pos + 11 > ln.sz) resize_buf(&ln, 256); /* * Encode 8-bit input. */ c = blk.buf[i]; if (c & 0x80) { if ( ! (curp->filenc && preconv_encode( &blk, &i, &ln, &pos, &curp->filenc))) { mandoc_vmsg(MANDOCERR_CHAR_BAD, curp, curp->line, pos, "0x%x", c); ln.buf[pos++] = '?'; i++; } continue; } /* * Exclude control characters. */ if (c == 0x7f || (c < 0x20 && c != 0x09)) { mandoc_vmsg(c == 0x00 || c == 0x04 || c > 0x0a ? MANDOCERR_CHAR_BAD : MANDOCERR_CHAR_UNSUPP, curp, curp->line, pos, "0x%x", c); i++; if (c != '\r') ln.buf[pos++] = '?'; continue; } /* Trailing backslash = a plain char. */ if (blk.buf[i] != '\\' || i + 1 == blk.sz) { ln.buf[pos++] = blk.buf[i++]; continue; } /* * Found escape and at least one other character. * When it's a newline character, skip it. * When there is a carriage return in between, * skip that one as well. */ if ('\r' == blk.buf[i + 1] && i + 2 < blk.sz && '\n' == blk.buf[i + 2]) ++i; if ('\n' == blk.buf[i + 1]) { i += 2; ++lnn; continue; } if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) { i += 2; /* Comment, skip to end of line */ for (; i < blk.sz; ++i) { if ('\n' == blk.buf[i]) { ++i; ++lnn; break; } } /* Backout trailing whitespaces */ for (; pos > 0; --pos) { if (ln.buf[pos - 1] != ' ') break; if (pos > 2 && ln.buf[pos - 2] == '\\') break; } break; } /* Catch escaped bogus characters. */ c = (unsigned char) blk.buf[i+1]; if ( ! (isascii(c) && (isgraph(c) || isblank(c)))) { mandoc_vmsg(MANDOCERR_CHAR_BAD, curp, curp->line, pos, "0x%x", c); i += 2; ln.buf[pos++] = '?'; continue; } /* Some other escape sequence, copy & cont. */ ln.buf[pos++] = blk.buf[i++]; ln.buf[pos++] = blk.buf[i++]; } if (pos >= ln.sz) resize_buf(&ln, 256); ln.buf[pos] = '\0'; /* * A significant amount of complexity is contained by * the roff preprocessor. It's line-oriented but can be * expressed on one line, so we need at times to * readjust our starting point and re-run it. The roff * preprocessor can also readjust the buffers with new * data, so we pass them in wholesale. */ of = 0; /* * Maintain a lookaside buffer of all parsed lines. We * only do this if mparse_keep() has been invoked (the * buffer may be accessed with mparse_getkeep()). */ if (curp->secondary) { curp->secondary->buf = mandoc_realloc( curp->secondary->buf, curp->secondary->sz + pos + 2); memcpy(curp->secondary->buf + curp->secondary->sz, ln.buf, pos); curp->secondary->sz += pos; curp->secondary->buf [curp->secondary->sz] = '\n'; curp->secondary->sz++; curp->secondary->buf [curp->secondary->sz] = '\0'; } rerun: rr = roff_parseln(curp->roff, curp->line, &ln, &of); switch (rr) { case ROFF_REPARSE: if (REPARSE_LIMIT >= ++curp->reparse_count) mparse_buf_r(curp, ln, of, 0); else mandoc_msg(MANDOCERR_ROFFLOOP, curp, curp->line, pos, NULL); pos = 0; continue; case ROFF_APPEND: pos = strlen(ln.buf); continue; case ROFF_RERUN: goto rerun; case ROFF_IGN: pos = 0; continue; case ROFF_SO: if ( ! (curp->options & MPARSE_SO) && (i >= blk.sz || blk.buf[i] == '\0')) { curp->sodest = mandoc_strdup(ln.buf + of); free(ln.buf); return; } /* * We remove `so' clauses from our lookaside * buffer because we're going to descend into * the file recursively. */ if (curp->secondary) curp->secondary->sz -= pos + 1; save_file = curp->file; if ((fd = mparse_open(curp, ln.buf + of)) != -1) { mparse_readfd(curp, fd, ln.buf + of); close(fd); curp->file = save_file; } else { curp->file = save_file; mandoc_vmsg(MANDOCERR_SO_FAIL, curp, curp->line, pos, ".so %s", ln.buf + of); ln.sz = mandoc_asprintf(&cp, ".sp\nSee the file %s.\n.sp", ln.buf + of); free(ln.buf); ln.buf = cp; of = 0; mparse_buf_r(curp, ln, of, 0); } pos = 0; continue; default: break; } /* * If input parsers have not been allocated, do so now. * We keep these instanced between parsers, but set them * locally per parse routine since we can use different * parsers with each one. */ if (curp->man == NULL || curp->man->macroset == MACROSET_NONE) choose_parser(curp); /* * Lastly, push down into the parsers themselves. * If libroff returns ROFF_TBL, then add it to the * currently open parse. Since we only get here if * there does exist data (see tbl_data.c), we're * guaranteed that something's been allocated. * Do the same for ROFF_EQN. */ if (rr == ROFF_TBL) while ((span = roff_span(curp->roff)) != NULL) roff_addtbl(curp->man, span); else if (rr == ROFF_EQN) roff_addeqn(curp->man, roff_eqn(curp->roff)); else if ((curp->man->macroset == MACROSET_MDOC ? mdoc_parseln(curp->man, curp->line, ln.buf, of) : man_parseln(curp->man, curp->line, ln.buf, of)) == 2) break; /* Temporary buffers typically are not full. */ if (0 == start && '\0' == blk.buf[i]) break; /* Start the next input line. */ pos = 0; } free(ln.buf); }
/* * Main parse routine for an opened file. This is called for each * opened file and simply loops around the full input file, possibly * nesting (i.e., with `so'). */ static void mparse_buf_r(struct mparse *curp, struct buf blk, int start) { const struct tbl_span *span; struct buf ln; enum rofferr rr; int i, of, rc; int pos; /* byte number in the ln buffer */ int lnn; /* line number in the real file */ unsigned char c; memset(&ln, 0, sizeof(struct buf)); lnn = curp->line; pos = 0; for (i = 0; i < (int)blk.sz; ) { if (0 == pos && '\0' == blk.buf[i]) break; if (start) { curp->line = lnn; curp->reparse_count = 0; } while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) { /* * When finding an unescaped newline character, * leave the character loop to process the line. * Skip a preceding carriage return, if any. */ if ('\r' == blk.buf[i] && i + 1 < (int)blk.sz && '\n' == blk.buf[i + 1]) ++i; if ('\n' == blk.buf[i]) { ++i; ++lnn; break; } /* * Warn about bogus characters. If you're using * non-ASCII encoding, you're screwing your * readers. Since I'd rather this not happen, * I'll be helpful and drop these characters so * we don't display gibberish. Note to manual * writers: use special characters. */ c = (unsigned char) blk.buf[i]; if ( ! (isascii(c) && (isgraph(c) || isblank(c)))) { mandoc_msg(MANDOCERR_BADCHAR, curp, curp->line, pos, "ignoring byte"); i++; continue; } /* Trailing backslash = a plain char. */ if ('\\' != blk.buf[i] || i + 1 == (int)blk.sz) { if (pos >= (int)ln.sz) resize_buf(&ln, 256); ln.buf[pos++] = blk.buf[i++]; continue; } /* * Found escape and at least one other character. * When it's a newline character, skip it. * When there is a carriage return in between, * skip that one as well. */ if ('\r' == blk.buf[i + 1] && i + 2 < (int)blk.sz && '\n' == blk.buf[i + 2]) ++i; if ('\n' == blk.buf[i + 1]) { i += 2; ++lnn; continue; } if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) { i += 2; /* Comment, skip to end of line */ for (; i < (int)blk.sz; ++i) { if ('\n' == blk.buf[i]) { ++i; ++lnn; break; } } /* Backout trailing whitespaces */ for (; pos > 0; --pos) { if (ln.buf[pos - 1] != ' ') break; if (pos > 2 && ln.buf[pos - 2] == '\\') break; } break; } /* Some other escape sequence, copy & cont. */ if (pos + 1 >= (int)ln.sz) resize_buf(&ln, 256); ln.buf[pos++] = blk.buf[i++]; ln.buf[pos++] = blk.buf[i++]; } if (pos >= (int)ln.sz) resize_buf(&ln, 256); ln.buf[pos] = '\0'; /* * A significant amount of complexity is contained by * the roff preprocessor. It's line-oriented but can be * expressed on one line, so we need at times to * readjust our starting point and re-run it. The roff * preprocessor can also readjust the buffers with new * data, so we pass them in wholesale. */ of = 0; /* * Maintain a lookaside buffer of all parsed lines. We * only do this if mparse_keep() has been invoked (the * buffer may be accessed with mparse_getkeep()). */ if (curp->secondary) { curp->secondary->buf = mandoc_realloc (curp->secondary->buf, curp->secondary->sz + pos + 2); memcpy(curp->secondary->buf + curp->secondary->sz, ln.buf, pos); curp->secondary->sz += pos; curp->secondary->buf [curp->secondary->sz] = '\n'; curp->secondary->sz++; curp->secondary->buf [curp->secondary->sz] = '\0'; } rerun: rr = roff_parseln (curp->roff, curp->line, &ln.buf, &ln.sz, of, &of); switch (rr) { case (ROFF_REPARSE): if (REPARSE_LIMIT >= ++curp->reparse_count) mparse_buf_r(curp, ln, 0); else mandoc_msg(MANDOCERR_ROFFLOOP, curp, curp->line, pos, NULL); pos = 0; continue; case (ROFF_APPEND): pos = (int)strlen(ln.buf); continue; case (ROFF_RERUN): goto rerun; case (ROFF_IGN): pos = 0; continue; case (ROFF_ERR): assert(MANDOCLEVEL_FATAL <= curp->file_status); break; case (ROFF_SO): /* * We remove `so' clauses from our lookaside * buffer because we're going to descend into * the file recursively. */ if (curp->secondary) curp->secondary->sz -= pos + 1; mparse_readfd_r(curp, -1, ln.buf + of, 1); if (MANDOCLEVEL_FATAL <= curp->file_status) break; pos = 0; continue; default: break; } /* * If we encounter errors in the recursive parse, make * sure we don't continue parsing. */ if (MANDOCLEVEL_FATAL <= curp->file_status) break; /* * If input parsers have not been allocated, do so now. * We keep these instanced between parsers, but set them * locally per parse routine since we can use different * parsers with each one. */ if ( ! (curp->man || curp->mdoc)) pset(ln.buf + of, pos - of, curp); /* * Lastly, push down into the parsers themselves. One * of these will have already been set in the pset() * routine. * If libroff returns ROFF_TBL, then add it to the * currently open parse. Since we only get here if * there does exist data (see tbl_data.c), we're * guaranteed that something's been allocated. * Do the same for ROFF_EQN. */ rc = -1; if (ROFF_TBL == rr) while (NULL != (span = roff_span(curp->roff))) { rc = curp->man ? man_addspan(curp->man, span) : mdoc_addspan(curp->mdoc, span); if (0 == rc) break; } else if (ROFF_EQN == rr) rc = curp->mdoc ? mdoc_addeqn(curp->mdoc, roff_eqn(curp->roff)) : man_addeqn(curp->man, roff_eqn(curp->roff)); else if (curp->man || curp->mdoc) rc = curp->man ? man_parseln(curp->man, curp->line, ln.buf, of) : mdoc_parseln(curp->mdoc, curp->line, ln.buf, of); if (0 == rc) { assert(MANDOCLEVEL_FATAL <= curp->file_status); break; } /* Temporary buffers typically are not full. */ if (0 == start && '\0' == blk.buf[i]) break; /* Start the next input line. */ pos = 0; } free(ln.buf); }