cmark_chunk cmark_clean_title(cmark_chunk *title) { cmark_strbuf buf = GH_BUF_INIT; unsigned char first, last; if (title->len == 0) { cmark_chunk result = CMARK_CHUNK_EMPTY; return result; } first = title->data[0]; last = title->data[title->len - 1]; // remove surrounding quotes if any: if ((first == '\'' && last == '\'') || (first == '(' && last == ')') || (first == '"' && last == '"')) { houdini_unescape_html_f(&buf, title->data + 1, title->len - 2); } else { houdini_unescape_html_f(&buf, title->data, title->len); } cmark_strbuf_unescape(&buf); return cmark_chunk_buf_detach(&buf); }
void cmark_consolidate_text_nodes(cmark_node *root) { cmark_iter *iter = cmark_iter_new(root); cmark_strbuf buf = GH_BUF_INIT; cmark_event_type ev_type; cmark_node *cur, *tmp, *next; while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) { cur = cmark_iter_get_node(iter); if (ev_type == CMARK_EVENT_ENTER && cur->type == CMARK_NODE_TEXT && cur->next && cur->next->type == CMARK_NODE_TEXT) { cmark_strbuf_clear(&buf); cmark_strbuf_put(&buf, cur->as.literal.data, cur->as.literal.len); tmp = cur->next; while (tmp && tmp->type == CMARK_NODE_TEXT) { cmark_iter_next(iter); // advance pointer cmark_strbuf_put(&buf, tmp->as.literal.data, tmp->as.literal.len); next = tmp->next; cmark_node_free(tmp); tmp = next; } cmark_chunk_free(&cur->as.literal); cur->as.literal = cmark_chunk_buf_detach(&buf); } } cmark_strbuf_free(&buf); cmark_iter_free(iter); }
// Like make_str, but parses entities. // Returns an inline sequence consisting of str and entity elements. static cmark_node *make_str_with_entities(cmark_chunk *content) { cmark_strbuf unescaped = GH_BUF_INIT; if (houdini_unescape_html(&unescaped, content->data, (size_t)content->len)) { return make_str(cmark_chunk_buf_detach(&unescaped)); } else { return make_str(*content); } }
// Like make_str, but parses entities. static cmark_node *make_str_with_entities(cmark_mem *mem, cmark_chunk *content) { cmark_strbuf unescaped = CMARK_BUF_INIT(mem); if (houdini_unescape_html(&unescaped, content->data, content->len)) { return make_str(mem, cmark_chunk_buf_detach(&unescaped)); } else { return make_str(mem, *content); } }
static cmark_node *www_match(cmark_parser *parser, cmark_node *parent, cmark_inline_parser *inline_parser) { cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser); size_t max_rewind = cmark_inline_parser_get_offset(inline_parser); uint8_t *data = chunk->data + max_rewind; size_t size = chunk->len - max_rewind; int start = cmark_inline_parser_get_column(inline_parser); size_t link_end; if (max_rewind > 0 && strchr("*_~(", data[-1]) == NULL && !cmark_isspace(data[-1])) return 0; if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0) return 0; link_end = check_domain(data, size, 0); if (link_end == 0) return NULL; while (link_end < size && !cmark_isspace(data[link_end])) link_end++; link_end = autolink_delim(data, link_end); if (link_end == 0) return NULL; cmark_inline_parser_set_offset(inline_parser, (int)(max_rewind + link_end)); cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem); cmark_strbuf buf; cmark_strbuf_init(parser->mem, &buf, 10); cmark_strbuf_puts(&buf, "http://"); cmark_strbuf_put(&buf, data, (bufsize_t)link_end); node->as.link.url = cmark_chunk_buf_detach(&buf); cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem); text->as.literal = cmark_chunk_dup(chunk, (bufsize_t)max_rewind, (bufsize_t)link_end); cmark_node_append_child(node, text); node->start_line = text->start_line = node->end_line = text->end_line = cmark_inline_parser_get_line(inline_parser); node->start_column = text->start_column = start - 1; node->end_column = text->end_column = cmark_inline_parser_get_column(inline_parser) - 1; return node; }
// Clean a URL: remove surrounding whitespace, and remove \ that escape // punctuation. cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url) { cmark_strbuf buf = CMARK_BUF_INIT(mem); cmark_chunk_trim(url); if (url->len == 0) { cmark_chunk result = CMARK_CHUNK_EMPTY; return result; } houdini_unescape_html_f(&buf, url->data, url->len); cmark_strbuf_unescape(&buf); return cmark_chunk_buf_detach(&buf); }
// Parse an entity or a regular "&" string. // Assumes the subject has an '&' character at the current position. static cmark_node *handle_entity(subject *subj) { cmark_strbuf ent = CMARK_BUF_INIT(subj->mem); bufsize_t len; advance(subj); len = houdini_unescape_ent(&ent, subj->input.data + subj->pos, subj->input.len - subj->pos); if (len == 0) return make_str(subj->mem, cmark_chunk_literal("&")); subj->pos += len; return make_str(subj->mem, cmark_chunk_buf_detach(&ent)); }
static cmark_chunk cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url, int is_email) { cmark_strbuf buf = CMARK_BUF_INIT(mem); cmark_chunk_trim(url); if (url->len == 0) { cmark_chunk result = CMARK_CHUNK_EMPTY; return result; } if (is_email) cmark_strbuf_puts(&buf, "mailto:"); houdini_unescape_html_f(&buf, url->data, url->len); return cmark_chunk_buf_detach(&buf); }
// Parse backtick code section or raw backticks, return an inline. // Assumes that the subject has a backtick at the current position. static cmark_node* handle_backticks(subject *subj) { cmark_chunk openticks = take_while(subj, isbacktick); int startpos = subj->pos; int endpos = scan_to_closing_backticks(subj, openticks.len); if (endpos == 0) { // not found subj->pos = startpos; // rewind return make_str(openticks); } else { cmark_strbuf buf = GH_BUF_INIT; cmark_strbuf_set(&buf, subj->input.data + startpos, endpos - startpos - openticks.len); cmark_strbuf_trim(&buf); cmark_strbuf_normalize_whitespace(&buf); return make_code(cmark_chunk_buf_detach(&buf)); } }
// Assumes we have a hyphen at the current position. static cmark_node *handle_hyphen(subject *subj, bool smart) { cmark_strbuf buf = CMARK_BUF_INIT(NULL); int startpos = subj->pos; int en_count = 0; int em_count = 0; int numhyphens; int i; advance(subj); if (!smart || peek_char(subj) != '-') { return make_str(subj->mem, cmark_chunk_literal("-")); } while (smart && peek_char(subj) == '-') { advance(subj); } numhyphens = subj->pos - startpos; buf.mem = subj->mem; if (numhyphens % 3 == 0) { // if divisible by 3, use all em dashes em_count = numhyphens / 3; } else if (numhyphens % 2 == 0) { // if divisible by 2, use all en dashes en_count = numhyphens / 2; } else if (numhyphens % 3 == 2) { // use one en dash at end en_count = 1; em_count = (numhyphens - 2) / 3; } else { // use two en dashes at the end en_count = 2; em_count = (numhyphens - 4) / 3; } for (i = em_count; i > 0; i--) { cmark_strbuf_puts(&buf, EMDASH); } for (i = en_count; i > 0; i--) { cmark_strbuf_puts(&buf, ENDASH); } return make_str(subj->mem, cmark_chunk_buf_detach(&buf)); }
// Clean a URL: remove surrounding whitespace and surrounding <>, // and remove \ that escape punctuation. cmark_chunk cmark_clean_url(cmark_chunk *url) { cmark_strbuf buf = GH_BUF_INIT; cmark_chunk_trim(url); if (url->len == 0) { cmark_chunk result = CMARK_CHUNK_EMPTY; return result; } if (url->data[0] == '<' && url->data[url->len - 1] == '>') { houdini_unescape_html_f(&buf, url->data + 1, url->len - 2); } else { houdini_unescape_html_f(&buf, url->data, url->len); } cmark_strbuf_unescape(&buf); return cmark_chunk_buf_detach(&buf); }
static cmark_node* finalize(cmark_parser *parser, cmark_node* b) { int pos; cmark_node* item; cmark_node* subitem; cmark_node* parent; parent = b->parent; assert(b->open); // shouldn't call finalize on closed blocks b->open = false; if (parser->curline->size == 0) { // end of input - line number has not been incremented b->end_line = parser->line_number; b->end_column = parser->last_line_length; } else if (b->type == NODE_DOCUMENT || (b->type == NODE_CODE_BLOCK && b->as.code.fenced) || (b->type == NODE_HEADER && b->as.header.setext)) { b->end_line = parser->line_number; b->end_column = parser->curline->size; if (b->end_column && parser->curline->ptr[b->end_column - 1] == '\n') b->end_column--; if (b->end_column && parser->curline->ptr[b->end_column - 1] == '\r') b->end_column--; } else { b->end_line = parser->line_number - 1; b->end_column = parser->last_line_length; } switch (b->type) { case NODE_PARAGRAPH: while (cmark_strbuf_at(&b->string_content, 0) == '[' && (pos = cmark_parse_reference_inline(&b->string_content, parser->refmap))) { cmark_strbuf_drop(&b->string_content, pos); } if (is_blank(&b->string_content, 0)) { // remove blank node (former reference def) cmark_node_free(b); } break; case NODE_CODE_BLOCK: if (!b->as.code.fenced) { // indented code remove_trailing_blank_lines(&b->string_content); cmark_strbuf_putc(&b->string_content, '\n'); } else { // first line of contents becomes info for (pos = 0; pos < b->string_content.size; ++pos) { if (b->string_content.ptr[pos] == '\r' || b->string_content.ptr[pos] == '\n') break; } assert(pos < b->string_content.size); cmark_strbuf tmp = GH_BUF_INIT; houdini_unescape_html_f( &tmp, b->string_content.ptr, pos ); cmark_strbuf_trim(&tmp); cmark_strbuf_unescape(&tmp); b->as.code.info = cmark_chunk_buf_detach(&tmp); if (b->string_content.ptr[pos] == '\r') pos += 1; if (b->string_content.ptr[pos] == '\n') pos += 1; cmark_strbuf_drop(&b->string_content, pos); } b->as.code.literal = cmark_chunk_buf_detach(&b->string_content); break; case NODE_HTML: b->as.literal = cmark_chunk_buf_detach(&b->string_content); break; case NODE_LIST: // determine tight/loose status b->as.list.tight = true; // tight by default item = b->first_child; while (item) { // check for non-final non-empty list item ending with blank line: if (item->last_line_blank && item->next) { b->as.list.tight = false; break; } // recurse into children of list item, to see if there are // spaces between them: subitem = item->first_child; while (subitem) { if (ends_with_blank_line(subitem) && (item->next || subitem->next)) { b->as.list.tight = false; break; } subitem = subitem->next; } if (!(b->as.list.tight)) { break; } item = item->next; } break; default: break; } return parent; }
static void postprocess_text(cmark_parser *parser, cmark_node *text, int offset) { size_t link_end; uint8_t *data = text->as.literal.data, *at; size_t size = text->as.literal.len; int rewind, max_rewind, nb = 0, np = 0, ns = 0; if (offset < 0 || (size_t)offset >= size) return; data += offset; size -= offset; at = (uint8_t *)memchr(data, '@', size); if (!at) return; max_rewind = (int)(at - data); data += max_rewind; size -= max_rewind; for (rewind = 0; rewind < max_rewind; ++rewind) { uint8_t c = data[-rewind - 1]; if (cmark_isalnum(c)) continue; if (strchr(".+-_", c) != NULL) continue; if (c == '/') ns++; break; } if (rewind == 0 || ns > 0) { postprocess_text(parser, text, max_rewind + 1 + offset); return; } for (link_end = 0; link_end < size; ++link_end) { uint8_t c = data[link_end]; if (cmark_isalnum(c)) continue; if (c == '@') nb++; else if (c == '.' && link_end < size - 1 && cmark_isalnum(data[link_end + 1])) np++; else if (c != '-' && c != '_') break; } if (link_end < 2 || nb != 1 || np == 0 || (!cmark_isalpha(data[link_end - 1]) && data[link_end - 1] != '.')) { postprocess_text(parser, text, max_rewind + 1 + offset); return; } link_end = autolink_delim(data, link_end); if (link_end == 0) { postprocess_text(parser, text, max_rewind + 1 + offset); return; } cmark_chunk_to_cstr(parser->mem, &text->as.literal); cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem); cmark_strbuf buf; cmark_strbuf_init(parser->mem, &buf, 10); cmark_strbuf_puts(&buf, "mailto:"); cmark_strbuf_put(&buf, data - rewind, (bufsize_t)(link_end + rewind)); link_node->as.link.url = cmark_chunk_buf_detach(&buf); cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem); cmark_chunk email = cmark_chunk_dup( &text->as.literal, offset + max_rewind - rewind, (bufsize_t)(link_end + rewind)); cmark_chunk_to_cstr(parser->mem, &email); link_text->as.literal = email; cmark_node_append_child(link_node, link_text); cmark_node_insert_after(text, link_node); cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem); post->as.literal = cmark_chunk_dup(&text->as.literal, (bufsize_t)(offset + max_rewind + link_end), (bufsize_t)(size - link_end)); cmark_chunk_to_cstr(parser->mem, &post->as.literal); cmark_node_insert_after(link_node, post); text->as.literal.len = offset + max_rewind - rewind; text->as.literal.data[text->as.literal.len] = 0; postprocess_text(parser, post, 0); }
static cmark_node* finalize(cmark_parser *parser, cmark_node* b, int line_number) { int firstlinelen; int pos; cmark_node* item; cmark_node* subitem; cmark_node* parent; parent = b->parent; // don't do anything if the cmark_node is already closed if (!b->open) return parent; b->open = false; if (line_number > b->start_line) { b->end_line = line_number - 1; } else { b->end_line = line_number; } switch (b->type) { case NODE_PARAGRAPH: while (cmark_strbuf_at(&b->string_content, 0) == '[' && (pos = cmark_parse_reference_inline(&b->string_content, parser->refmap))) { cmark_strbuf_drop(&b->string_content, pos); } if (is_blank(&b->string_content, 0)) { // remove blank node (former reference def) cmark_node_free(b); } break; case NODE_CODE_BLOCK: if (!b->as.code.fenced) { // indented code remove_trailing_blank_lines(&b->string_content); cmark_strbuf_putc(&b->string_content, '\n'); } else { // first line of contents becomes info firstlinelen = cmark_strbuf_strchr(&b->string_content, '\n', 0); cmark_strbuf tmp = GH_BUF_INIT; houdini_unescape_html_f( &tmp, b->string_content.ptr, firstlinelen ); cmark_strbuf_trim(&tmp); cmark_strbuf_unescape(&tmp); b->as.code.info = cmark_chunk_buf_detach(&tmp); cmark_strbuf_drop(&b->string_content, firstlinelen + 1); } b->as.code.literal = cmark_chunk_buf_detach(&b->string_content); break; case NODE_HTML: b->as.literal = cmark_chunk_buf_detach(&b->string_content); break; case NODE_LIST: // determine tight/loose status b->as.list.tight = true; // tight by default item = b->first_child; while (item) { // check for non-final non-empty list item ending with blank line: if (item->last_line_blank && item->next) { b->as.list.tight = false; break; } // recurse into children of list item, to see if there are // spaces between them: subitem = item->first_child; while (subitem) { if (ends_with_blank_line(subitem) && (item->next || subitem->next)) { b->as.list.tight = false; break; } subitem = subitem->next; } if (!(b->as.list.tight)) { break; } item = item->next; } break; default: break; } return parent; }