// Parse an autolink or HTML tag. // Assumes the subject has a '<' character at the current position. static cmark_node *handle_pointy_brace(subject *subj) { bufsize_t matchlen = 0; cmark_chunk contents; advance(subj); // advance past first < // first try to match a URL autolink matchlen = scan_autolink_uri(&subj->input, subj->pos); if (matchlen > 0) { contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); subj->pos += matchlen; return make_autolink(subj->mem, contents, 0); } // next try to match an email autolink matchlen = scan_autolink_email(&subj->input, subj->pos); if (matchlen > 0) { contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1); subj->pos += matchlen; return make_autolink(subj->mem, contents, 1); } // finally, try to match an html tag matchlen = scan_html_tag(&subj->input, subj->pos); if (matchlen > 0) { contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1); subj->pos += matchlen; return make_raw_html(subj->mem, contents); } // if nothing matches, just return the opening <: return make_str(subj->mem, cmark_chunk_literal("<")); }
// Parse reference. Assumes string begins with '[' character. // Modify refmap if a reference is encountered. // Return 0 if no reference found, otherwise position of subject // after reference is parsed. int cmark_parse_reference_inline(cmark_strbuf *input, cmark_reference_map *refmap) { subject subj; cmark_chunk lab; cmark_chunk url; cmark_chunk title; int matchlen = 0; int beforetitle; subject_from_buf(&subj, input, NULL); // parse label: if (!link_label(&subj, &lab)) return 0; // colon: if (peek_char(&subj) == ':') { advance(&subj); } else { return 0; } // parse link url: spnl(&subj); matchlen = scan_link_url(&subj.input, subj.pos); if (matchlen) { url = cmark_chunk_dup(&subj.input, subj.pos, matchlen); subj.pos += matchlen; } else { return 0; } // parse optional link_title beforetitle = subj.pos; spnl(&subj); matchlen = scan_link_title(&subj.input, subj.pos); if (matchlen) { title = cmark_chunk_dup(&subj.input, subj.pos, matchlen); subj.pos += matchlen; } else { subj.pos = beforetitle; title = cmark_chunk_literal(""); } // parse final spaces and newline: while (peek_char(&subj) == ' ') { advance(&subj); } if (peek_char(&subj) == '\n') { advance(&subj); } else if (peek_char(&subj) != 0) { return 0; } // insert reference into refmap cmark_reference_create(refmap, &lab, &url, &title); return subj.pos; }
// Assumes the subject has a c at the current position. static cmark_node* handle_delim(subject* subj, unsigned char c, bool smart) { int numdelims; cmark_node * inl_text; bool can_open, can_close; cmark_chunk contents; numdelims = scan_delims(subj, c, &can_open, &can_close); if (c == '\'' && smart) { contents = cmark_chunk_literal(RIGHTSINGLEQUOTE); } else if (c == '"' && smart) { contents = cmark_chunk_literal(can_close ? RIGHTDOUBLEQUOTE : LEFTDOUBLEQUOTE); } else { contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims); } inl_text = make_str(contents); if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) { push_delimiter(subj, c, can_open, can_close, inl_text); } return inl_text; }
static cmark_node *www_match(cmark_parser *parser, cmark_node *parent, cmark_inline_parser *inline_parser) { cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser); size_t max_rewind = cmark_inline_parser_get_offset(inline_parser); uint8_t *data = chunk->data + max_rewind; size_t size = chunk->len - max_rewind; int start = cmark_inline_parser_get_column(inline_parser); size_t link_end; if (max_rewind > 0 && strchr("*_~(", data[-1]) == NULL && !cmark_isspace(data[-1])) return 0; if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0) return 0; link_end = check_domain(data, size, 0); if (link_end == 0) return NULL; while (link_end < size && !cmark_isspace(data[link_end])) link_end++; link_end = autolink_delim(data, link_end); if (link_end == 0) return NULL; cmark_inline_parser_set_offset(inline_parser, (int)(max_rewind + link_end)); cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem); cmark_strbuf buf; cmark_strbuf_init(parser->mem, &buf, 10); cmark_strbuf_puts(&buf, "http://"); cmark_strbuf_put(&buf, data, (bufsize_t)link_end); node->as.link.url = cmark_chunk_buf_detach(&buf); cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem); text->as.literal = cmark_chunk_dup(chunk, (bufsize_t)max_rewind, (bufsize_t)link_end); cmark_node_append_child(node, text); node->start_line = text->start_line = node->end_line = text->end_line = cmark_inline_parser_get_line(inline_parser); node->start_column = text->start_column = start - 1; node->end_column = text->end_column = cmark_inline_parser_get_column(inline_parser) - 1; return node; }
// Take characters while a predicate holds, and return a string. static CMARK_INLINE cmark_chunk take_while(subject *subj, int (*f)(int)) { unsigned char c; bufsize_t startpos = subj->pos; bufsize_t len = 0; while ((c = peek_char(subj)) && (*f)(c)) { advance(subj); len++; } return cmark_chunk_dup(&subj->input, startpos, len); }
// Take characters while a predicate holds, and return a string. static inline cmark_chunk take_while(subject* subj, int (*f)(int)) { unsigned char c; int startpos = subj->pos; int len = 0; while ((c = peek_char(subj)) && (*f)(c)) { advance(subj); len++; } return cmark_chunk_dup(&subj->input, startpos, len); }
// Parse backslash-escape or just a backslash, returning an inline. static cmark_node *handle_backslash(subject *subj) { advance(subj); unsigned char nextchar = peek_char(subj); if (cmark_ispunct( nextchar)) { // only ascii symbols and newline can be escaped advance(subj); return make_str(subj->mem, cmark_chunk_dup(&subj->input, subj->pos - 1, 1)); } else if (!is_eof(subj) && skip_line_end(subj)) { return make_linebreak(subj->mem); } else { return make_str(subj->mem, cmark_chunk_literal("\\")); } }
static cmark_node *url_match(cmark_parser *parser, cmark_node *parent, cmark_inline_parser *inline_parser) { size_t link_end, domain_len; int rewind = 0; cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser); int max_rewind = cmark_inline_parser_get_offset(inline_parser); uint8_t *data = chunk->data + max_rewind; size_t size = chunk->len - max_rewind; if (size < 4 || data[1] != '/' || data[2] != '/') return 0; while (rewind < max_rewind && cmark_isalpha(data[-rewind - 1])) rewind++; if (!sd_autolink_issafe(data - rewind, size + rewind)) return 0; link_end = strlen("://"); domain_len = check_domain(data + link_end, size - link_end, 1); if (domain_len == 0) return 0; link_end += domain_len; while (link_end < size && !cmark_isspace(data[link_end])) link_end++; link_end = autolink_delim(data, link_end); if (link_end == 0) return NULL; cmark_inline_parser_set_offset(inline_parser, (int)(max_rewind + link_end)); cmark_node_unput(parent, rewind); cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem); cmark_chunk url = cmark_chunk_dup(chunk, max_rewind - rewind, (bufsize_t)(link_end + rewind)); node->as.link.url = url; cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem); text->as.literal = url; cmark_node_append_child(node, text); return node; }
// Parse strong/emph or a fallback. // Assumes the subject has '_' or '*' at the current position. static cmark_node* handle_strong_emph(subject* subj, unsigned char c) { int numdelims; cmark_node * inl_text; bool can_open, can_close; numdelims = scan_delims(subj, c, &can_open, &can_close); inl_text = make_str(cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims)); if (can_open || can_close) { push_delimiter(subj, c, can_open, can_close, inl_text); } return inl_text; }
// Parse a link label. Returns 1 if successful. // Note: unescaped brackets are not allowed in labels. // The label begins with `[` and ends with the first `]` character // encountered. Backticks in labels do not start code spans. static int link_label(subject* subj, cmark_chunk *raw_label) { bufsize_t startpos = subj->pos; int length = 0; unsigned char c; // advance past [ if (peek_char(subj) == '[') { advance(subj); } else { return 0; } while ((c = peek_char(subj)) && c != '[' && c != ']') { if (c == '\\') { advance(subj); length++; if (cmark_ispunct(peek_char(subj))) { advance(subj); length++; } } else { advance(subj); length++; } if (length > MAX_LINK_LABEL_LENGTH) { goto noMatch; } } if (c == ']') { // match found *raw_label = cmark_chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1)); cmark_chunk_trim(raw_label); advance(subj); // advance past ] return 1; } noMatch: subj->pos = startpos; // rewind return 0; }
// Parse an inline, advancing subject, and add it as a child of parent. // Return 0 if no inline can be parsed, 1 otherwise. static int parse_inline(subject* subj, cmark_node * parent, int options) { cmark_node* new_inl = NULL; cmark_chunk contents; unsigned char c; int endpos; c = peek_char(subj); if (c == 0) { return 0; } switch(c) { case '\n': new_inl = handle_newline(subj); break; case '`': new_inl = handle_backticks(subj); break; case '\\': new_inl = handle_backslash(subj); break; case '&': new_inl = handle_entity(subj); break; case '<': new_inl = handle_pointy_brace(subj); break; case '*': case '_': case '\'': case '"': new_inl = handle_delim(subj, c, options & CMARK_OPT_SMART); break; case '-': new_inl = handle_hyphen(subj, options & CMARK_OPT_SMART); break; case '.': new_inl = handle_period(subj, options & CMARK_OPT_SMART); break; case '[': advance(subj); new_inl = make_str(cmark_chunk_literal("[")); push_delimiter(subj, '[', true, false, new_inl); break; case ']': new_inl = handle_close_bracket(subj, parent); break; case '!': advance(subj); if (peek_char(subj) == '[') { advance(subj); new_inl = make_str(cmark_chunk_literal("![")); push_delimiter(subj, '!', false, true, new_inl); } else { new_inl = make_str(cmark_chunk_literal("!")); } break; default: endpos = subject_find_special_char(subj, options); contents = cmark_chunk_dup(&subj->input, subj->pos, endpos - subj->pos); subj->pos = endpos; // if we're at a newline, strip trailing spaces. if (peek_char(subj) == '\n') { cmark_chunk_rtrim(&contents); } new_inl = make_str(contents); } if (new_inl != NULL) { cmark_node_append_child(parent, new_inl); } return 1; }
// Return a link, an image, or a literal close bracket. static cmark_node* handle_close_bracket(subject* subj, cmark_node *parent) { int initial_pos; int starturl, endurl, starttitle, endtitle, endall; int n; int sps; cmark_reference *ref; bool is_image = false; cmark_chunk url_chunk, title_chunk; unsigned char *url, *title; delimiter *opener; cmark_node *link_text; cmark_node *inl; cmark_chunk raw_label; int found_label; advance(subj); // advance past ] initial_pos = subj->pos; // look through list of delimiters for a [ or ! opener = subj->last_delim; while (opener) { if (opener->delim_char == '[' || opener->delim_char == '!') { break; } opener = opener->previous; } if (opener == NULL) { return make_str(cmark_chunk_literal("]")); } if (!opener->active) { // take delimiter off stack remove_delimiter(subj, opener); return make_str(cmark_chunk_literal("]")); } // If we got here, we matched a potential link/image text. is_image = opener->delim_char == '!'; link_text = opener->inl_text->next; // Now we check to see if it's a link/image. // First, look for an inline link. if (peek_char(subj) == '(' && ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && ((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) { // try to parse an explicit link: starturl = subj->pos + 1 + sps; // after ( endurl = starturl + n; starttitle = endurl + scan_spacechars(&subj->input, endurl); // ensure there are spaces btw url and title endtitle = (starttitle == endurl) ? starttitle : starttitle + scan_link_title(&subj->input, starttitle); endall = endtitle + scan_spacechars(&subj->input, endtitle); if (peek_at(subj, endall) == ')') { subj->pos = endall + 1; url_chunk = cmark_chunk_dup(&subj->input, starturl, endurl - starturl); title_chunk = cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle); url = cmark_clean_url(&url_chunk); title = cmark_clean_title(&title_chunk); cmark_chunk_free(&url_chunk); cmark_chunk_free(&title_chunk); goto match; } else { goto noMatch; } } // Next, look for a following [link label] that matches in refmap. // skip spaces subj->pos = subj->pos + scan_spacechars(&subj->input, subj->pos); raw_label = cmark_chunk_literal(""); found_label = link_label(subj, &raw_label); if (!found_label || raw_label.len == 0) { cmark_chunk_free(&raw_label); raw_label = cmark_chunk_dup(&subj->input, opener->position, initial_pos - opener->position - 1); } if (!found_label) { // If we have a shortcut reference link, back up // to before the spacse we skipped. subj->pos = initial_pos; } ref = cmark_reference_lookup(subj->refmap, &raw_label); cmark_chunk_free(&raw_label); if (ref != NULL) { // found url = bufdup(ref->url); title = bufdup(ref->title); goto match; } else { goto noMatch; } noMatch: // If we fall through to here, it means we didn't match a link: remove_delimiter(subj, opener); // remove this opener from delimiter list subj->pos = initial_pos; return make_str(cmark_chunk_literal("]")); match: inl = opener->inl_text; inl->type = is_image ? NODE_IMAGE : NODE_LINK; cmark_chunk_free(&inl->as.literal); inl->first_child = link_text; process_emphasis(subj, opener->previous); inl->as.link.url = url; inl->as.link.title = title; inl->next = NULL; if (link_text) { cmark_node *tmp; link_text->prev = NULL; for (tmp = link_text; tmp->next != NULL; tmp = tmp->next) { tmp->parent = inl; } tmp->parent = inl; inl->last_child = tmp; } parent->last_child = inl; // process_emphasis will remove this delimiter and all later ones. // Now, if we have a link, we also want to deactivate earlier link // delimiters. (This code can be removed if we decide to allow links // inside links.) if (!is_image) { opener = subj->last_delim; while (opener != NULL) { if (opener->delim_char == '[') { if (!opener->active) { break; } else { opener->active = false; } } opener = opener->previous; } } return NULL; }
static void postprocess_text(cmark_parser *parser, cmark_node *text, int offset) { size_t link_end; uint8_t *data = text->as.literal.data, *at; size_t size = text->as.literal.len; int rewind, max_rewind, nb = 0, np = 0, ns = 0; if (offset < 0 || (size_t)offset >= size) return; data += offset; size -= offset; at = (uint8_t *)memchr(data, '@', size); if (!at) return; max_rewind = (int)(at - data); data += max_rewind; size -= max_rewind; for (rewind = 0; rewind < max_rewind; ++rewind) { uint8_t c = data[-rewind - 1]; if (cmark_isalnum(c)) continue; if (strchr(".+-_", c) != NULL) continue; if (c == '/') ns++; break; } if (rewind == 0 || ns > 0) { postprocess_text(parser, text, max_rewind + 1 + offset); return; } for (link_end = 0; link_end < size; ++link_end) { uint8_t c = data[link_end]; if (cmark_isalnum(c)) continue; if (c == '@') nb++; else if (c == '.' && link_end < size - 1 && cmark_isalnum(data[link_end + 1])) np++; else if (c != '-' && c != '_') break; } if (link_end < 2 || nb != 1 || np == 0 || (!cmark_isalpha(data[link_end - 1]) && data[link_end - 1] != '.')) { postprocess_text(parser, text, max_rewind + 1 + offset); return; } link_end = autolink_delim(data, link_end); if (link_end == 0) { postprocess_text(parser, text, max_rewind + 1 + offset); return; } cmark_chunk_to_cstr(parser->mem, &text->as.literal); cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem); cmark_strbuf buf; cmark_strbuf_init(parser->mem, &buf, 10); cmark_strbuf_puts(&buf, "mailto:"); cmark_strbuf_put(&buf, data - rewind, (bufsize_t)(link_end + rewind)); link_node->as.link.url = cmark_chunk_buf_detach(&buf); cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem); cmark_chunk email = cmark_chunk_dup( &text->as.literal, offset + max_rewind - rewind, (bufsize_t)(link_end + rewind)); cmark_chunk_to_cstr(parser->mem, &email); link_text->as.literal = email; cmark_node_append_child(link_node, link_text); cmark_node_insert_after(text, link_node); cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem); post->as.literal = cmark_chunk_dup(&text->as.literal, (bufsize_t)(offset + max_rewind + link_end), (bufsize_t)(size - link_end)); cmark_chunk_to_cstr(parser->mem, &post->as.literal); cmark_node_insert_after(link_node, post); text->as.literal.len = offset + max_rewind - rewind; text->as.literal.data[text->as.literal.len] = 0; postprocess_text(parser, post, 0); }
// Return a link, an image, or a literal close bracket. static cmark_node *handle_close_bracket(subject *subj) { bufsize_t initial_pos, after_link_text_pos; bufsize_t endurl, starttitle, endtitle, endall; bufsize_t sps, n; cmark_reference *ref = NULL; cmark_chunk url_chunk, title_chunk; cmark_chunk url, title; bracket *opener; cmark_node *inl; cmark_chunk raw_label; int found_label; cmark_node *tmp, *tmpnext; bool is_image; advance(subj); // advance past ] initial_pos = subj->pos; // get last [ or ![ opener = subj->last_bracket; if (opener == NULL) { return make_str(subj->mem, cmark_chunk_literal("]")); } if (!opener->active) { // take delimiter off stack pop_bracket(subj); return make_str(subj->mem, cmark_chunk_literal("]")); } // If we got here, we matched a potential link/image text. // Now we check to see if it's a link/image. is_image = opener->image; after_link_text_pos = subj->pos; // First, look for an inline link. if (peek_char(subj) == '(' && ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) && ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps, &url_chunk)) > -1)) { // try to parse an explicit link: endurl = subj->pos + 1 + sps + n; starttitle = endurl + scan_spacechars(&subj->input, endurl); // ensure there are spaces btw url and title endtitle = (starttitle == endurl) ? starttitle : starttitle + scan_link_title(&subj->input, starttitle); endall = endtitle + scan_spacechars(&subj->input, endtitle); if (peek_at(subj, endall) == ')') { subj->pos = endall + 1; title_chunk = cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle); url = cmark_clean_url(subj->mem, &url_chunk); title = cmark_clean_title(subj->mem, &title_chunk); cmark_chunk_free(subj->mem, &url_chunk); cmark_chunk_free(subj->mem, &title_chunk); goto match; } else { // it could still be a shortcut reference link subj->pos = after_link_text_pos; } } // Next, look for a following [link label] that matches in refmap. // skip spaces raw_label = cmark_chunk_literal(""); found_label = link_label(subj, &raw_label); if (!found_label) { // If we have a shortcut reference link, back up // to before the spacse we skipped. subj->pos = initial_pos; } if ((!found_label || raw_label.len == 0) && !opener->bracket_after) { cmark_chunk_free(subj->mem, &raw_label); raw_label = cmark_chunk_dup(&subj->input, opener->position, initial_pos - opener->position - 1); found_label = true; } if (found_label) { ref = cmark_reference_lookup(subj->refmap, &raw_label); cmark_chunk_free(subj->mem, &raw_label); } if (ref != NULL) { // found url = chunk_clone(subj->mem, &ref->url); title = chunk_clone(subj->mem, &ref->title); goto match; } else { goto noMatch; } noMatch: // If we fall through to here, it means we didn't match a link: pop_bracket(subj); // remove this opener from delimiter list subj->pos = initial_pos; return make_str(subj->mem, cmark_chunk_literal("]")); match: inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK); inl->as.link.url = url; inl->as.link.title = title; cmark_node_insert_before(opener->inl_text, inl); // Add link text: tmp = opener->inl_text->next; while (tmp) { tmpnext = tmp->next; cmark_node_append_child(inl, tmp); tmp = tmpnext; } // Free the bracket [: cmark_node_free(opener->inl_text); process_emphasis(subj, opener->previous_delimiter); pop_bracket(subj); // Now, if we have a link, we also want to deactivate earlier link // delimiters. (This code can be removed if we decide to allow links // inside links.) if (!is_image) { opener = subj->last_bracket; while (opener != NULL) { if (!opener->image) { if (!opener->active) { break; } else { opener->active = false; } } opener = opener->previous; } } return NULL; }
// Parse reference. Assumes string begins with '[' character. // Modify refmap if a reference is encountered. // Return 0 if no reference found, otherwise position of subject // after reference is parsed. bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, cmark_reference_map *refmap) { subject subj; cmark_chunk lab; cmark_chunk url; cmark_chunk title; bufsize_t matchlen = 0; bufsize_t beforetitle; subject_from_buf(mem, &subj, input, NULL); // parse label: if (!link_label(&subj, &lab) || lab.len == 0) return 0; // colon: if (peek_char(&subj) == ':') { advance(&subj); } else { return 0; } // parse link url: spnl(&subj); if ((matchlen = manual_scan_link_url(&subj.input, subj.pos, &url)) > -1 && url.len > 0) { subj.pos += matchlen; } else { return 0; } // parse optional link_title beforetitle = subj.pos; spnl(&subj); matchlen = scan_link_title(&subj.input, subj.pos); if (matchlen) { title = cmark_chunk_dup(&subj.input, subj.pos, matchlen); subj.pos += matchlen; } else { subj.pos = beforetitle; title = cmark_chunk_literal(""); } // parse final spaces and newline: skip_spaces(&subj); if (!skip_line_end(&subj)) { if (matchlen) { // try rewinding before title subj.pos = beforetitle; skip_spaces(&subj); if (!skip_line_end(&subj)) { return 0; } } else { return 0; } } // insert reference into refmap cmark_reference_create(refmap, &lab, &url, &title); return subj.pos; }