Beispiel #1
0
cmark_chunk cmark_clean_title(cmark_chunk *title)
{
	cmark_strbuf buf = GH_BUF_INIT;
	unsigned char first, last;

	if (title->len == 0) {
		cmark_chunk result = CMARK_CHUNK_EMPTY;
		return result;
	}

	first = title->data[0];
	last = title->data[title->len - 1];

	// remove surrounding quotes if any:
	if ((first == '\'' && last == '\'') ||
	    (first == '(' && last == ')') ||
	    (first == '"' && last == '"')) {
		houdini_unescape_html_f(&buf, title->data + 1, title->len - 2);
	} else {
		houdini_unescape_html_f(&buf, title->data, title->len);
	}

	cmark_strbuf_unescape(&buf);
	return cmark_chunk_buf_detach(&buf);
}
Beispiel #2
0
void cmark_consolidate_text_nodes(cmark_node *root) {
  cmark_iter *iter = cmark_iter_new(root);
  cmark_strbuf buf = GH_BUF_INIT;
  cmark_event_type ev_type;
  cmark_node *cur, *tmp, *next;

  while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
    cur = cmark_iter_get_node(iter);
    if (ev_type == CMARK_EVENT_ENTER && cur->type == CMARK_NODE_TEXT &&
        cur->next && cur->next->type == CMARK_NODE_TEXT) {
      cmark_strbuf_clear(&buf);
      cmark_strbuf_put(&buf, cur->as.literal.data, cur->as.literal.len);
      tmp = cur->next;
      while (tmp && tmp->type == CMARK_NODE_TEXT) {
        cmark_iter_next(iter); // advance pointer
        cmark_strbuf_put(&buf, tmp->as.literal.data, tmp->as.literal.len);
        next = tmp->next;
        cmark_node_free(tmp);
        tmp = next;
      }
      cmark_chunk_free(&cur->as.literal);
      cur->as.literal = cmark_chunk_buf_detach(&buf);
    }
  }

  cmark_strbuf_free(&buf);
  cmark_iter_free(iter);
}
Beispiel #3
0
// Like make_str, but parses entities.
// Returns an inline sequence consisting of str and entity elements.
static cmark_node *make_str_with_entities(cmark_chunk *content)
{
	cmark_strbuf unescaped = GH_BUF_INIT;

	if (houdini_unescape_html(&unescaped, content->data, (size_t)content->len)) {
		return make_str(cmark_chunk_buf_detach(&unescaped));
	} else {
		return make_str(*content);
	}
}
Beispiel #4
0
// Like make_str, but parses entities.
static cmark_node *make_str_with_entities(cmark_mem *mem,
                                          cmark_chunk *content) {
  cmark_strbuf unescaped = CMARK_BUF_INIT(mem);

  if (houdini_unescape_html(&unescaped, content->data, content->len)) {
    return make_str(mem, cmark_chunk_buf_detach(&unescaped));
  } else {
    return make_str(mem, *content);
  }
}
static cmark_node *www_match(cmark_parser *parser, cmark_node *parent,
                             cmark_inline_parser *inline_parser) {
  cmark_chunk *chunk = cmark_inline_parser_get_chunk(inline_parser);
  size_t max_rewind = cmark_inline_parser_get_offset(inline_parser);
  uint8_t *data = chunk->data + max_rewind;
  size_t size = chunk->len - max_rewind;
  int start = cmark_inline_parser_get_column(inline_parser);

  size_t link_end;

  if (max_rewind > 0 && strchr("*_~(", data[-1]) == NULL &&
      !cmark_isspace(data[-1]))
    return 0;

  if (size < 4 || memcmp(data, "www.", strlen("www.")) != 0)
    return 0;

  link_end = check_domain(data, size, 0);

  if (link_end == 0)
    return NULL;

  while (link_end < size && !cmark_isspace(data[link_end]))
    link_end++;

  link_end = autolink_delim(data, link_end);

  if (link_end == 0)
    return NULL;

  cmark_inline_parser_set_offset(inline_parser, (int)(max_rewind + link_end));

  cmark_node *node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);

  cmark_strbuf buf;
  cmark_strbuf_init(parser->mem, &buf, 10);
  cmark_strbuf_puts(&buf, "http://");
  cmark_strbuf_put(&buf, data, (bufsize_t)link_end);
  node->as.link.url = cmark_chunk_buf_detach(&buf);

  cmark_node *text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
  text->as.literal =
      cmark_chunk_dup(chunk, (bufsize_t)max_rewind, (bufsize_t)link_end);
  cmark_node_append_child(node, text);

  node->start_line = text->start_line =
    node->end_line = text->end_line =
    cmark_inline_parser_get_line(inline_parser);

  node->start_column = text->start_column = start - 1;
  node->end_column = text->end_column = cmark_inline_parser_get_column(inline_parser) - 1;

  return node;
}
Beispiel #6
0
// Clean a URL: remove surrounding whitespace, and remove \ that escape
// punctuation.
cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url) {
  cmark_strbuf buf = CMARK_BUF_INIT(mem);

  cmark_chunk_trim(url);

  if (url->len == 0) {
    cmark_chunk result = CMARK_CHUNK_EMPTY;
    return result;
  }

  houdini_unescape_html_f(&buf, url->data, url->len);

  cmark_strbuf_unescape(&buf);
  return cmark_chunk_buf_detach(&buf);
}
Beispiel #7
0
// Parse an entity or a regular "&" string.
// Assumes the subject has an '&' character at the current position.
static cmark_node *handle_entity(subject *subj) {
  cmark_strbuf ent = CMARK_BUF_INIT(subj->mem);
  bufsize_t len;

  advance(subj);

  len = houdini_unescape_ent(&ent, subj->input.data + subj->pos,
                             subj->input.len - subj->pos);

  if (len == 0)
    return make_str(subj->mem, cmark_chunk_literal("&"));

  subj->pos += len;
  return make_str(subj->mem, cmark_chunk_buf_detach(&ent));
}
Beispiel #8
0
static cmark_chunk cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url,
                                        int is_email) {
  cmark_strbuf buf = CMARK_BUF_INIT(mem);

  cmark_chunk_trim(url);

  if (url->len == 0) {
    cmark_chunk result = CMARK_CHUNK_EMPTY;
    return result;
  }

  if (is_email)
    cmark_strbuf_puts(&buf, "mailto:");

  houdini_unescape_html_f(&buf, url->data, url->len);
  return cmark_chunk_buf_detach(&buf);
}
Beispiel #9
0
// Parse backtick code section or raw backticks, return an inline.
// Assumes that the subject has a backtick at the current position.
static cmark_node* handle_backticks(subject *subj)
{
	cmark_chunk openticks = take_while(subj, isbacktick);
	int startpos = subj->pos;
	int endpos = scan_to_closing_backticks(subj, openticks.len);

	if (endpos == 0) { // not found
		subj->pos = startpos; // rewind
		return make_str(openticks);
	} else {
		cmark_strbuf buf = GH_BUF_INIT;

		cmark_strbuf_set(&buf, subj->input.data + startpos, endpos - startpos - openticks.len);
		cmark_strbuf_trim(&buf);
		cmark_strbuf_normalize_whitespace(&buf);

		return make_code(cmark_chunk_buf_detach(&buf));
	}
}
Beispiel #10
0
// Assumes we have a hyphen at the current position.
static cmark_node *handle_hyphen(subject *subj, bool smart) {
  cmark_strbuf buf = CMARK_BUF_INIT(NULL);
  int startpos = subj->pos;
  int en_count = 0;
  int em_count = 0;
  int numhyphens;
  int i;

  advance(subj);

  if (!smart || peek_char(subj) != '-') {
    return make_str(subj->mem, cmark_chunk_literal("-"));
  }

  while (smart && peek_char(subj) == '-') {
    advance(subj);
  }

  numhyphens = subj->pos - startpos;
  buf.mem = subj->mem;

  if (numhyphens % 3 == 0) { // if divisible by 3, use all em dashes
    em_count = numhyphens / 3;
  } else if (numhyphens % 2 == 0) { // if divisible by 2, use all en dashes
    en_count = numhyphens / 2;
  } else if (numhyphens % 3 == 2) { // use one en dash at end
    en_count = 1;
    em_count = (numhyphens - 2) / 3;
  } else { // use two en dashes at the end
    en_count = 2;
    em_count = (numhyphens - 4) / 3;
  }

  for (i = em_count; i > 0; i--) {
    cmark_strbuf_puts(&buf, EMDASH);
  }

  for (i = en_count; i > 0; i--) {
    cmark_strbuf_puts(&buf, ENDASH);
  }

  return make_str(subj->mem, cmark_chunk_buf_detach(&buf));
}
Beispiel #11
0
// Clean a URL: remove surrounding whitespace and surrounding <>,
// and remove \ that escape punctuation.
cmark_chunk cmark_clean_url(cmark_chunk *url)
{
	cmark_strbuf buf = GH_BUF_INIT;

	cmark_chunk_trim(url);

	if (url->len == 0) {
		cmark_chunk result = CMARK_CHUNK_EMPTY;
		return result;
	}

	if (url->data[0] == '<' && url->data[url->len - 1] == '>') {
		houdini_unescape_html_f(&buf, url->data + 1, url->len - 2);
	} else {
		houdini_unescape_html_f(&buf, url->data, url->len);
	}

	cmark_strbuf_unescape(&buf);
	return cmark_chunk_buf_detach(&buf);
}
Beispiel #12
0
static cmark_node*
finalize(cmark_parser *parser, cmark_node* b)
{
	int pos;
	cmark_node* item;
	cmark_node* subitem;
	cmark_node* parent;

	parent = b->parent;

	assert(b->open);  // shouldn't call finalize on closed blocks
	b->open = false;

	if (parser->curline->size == 0) {
		// end of input - line number has not been incremented
		b->end_line = parser->line_number;
		b->end_column = parser->last_line_length;
	} else if (b->type == NODE_DOCUMENT ||
	           (b->type == NODE_CODE_BLOCK && b->as.code.fenced) ||
	           (b->type == NODE_HEADER && b->as.header.setext)) {
		b->end_line = parser->line_number;
		b->end_column = parser->curline->size;
		if (b->end_column && parser->curline->ptr[b->end_column - 1] == '\n')
			b->end_column--;
		if (b->end_column && parser->curline->ptr[b->end_column - 1] == '\r')
			b->end_column--;
	} else {
		b->end_line = parser->line_number - 1;
		b->end_column = parser->last_line_length;
	}

	switch (b->type) {
	case NODE_PARAGRAPH:
		while (cmark_strbuf_at(&b->string_content, 0) == '[' &&
		       (pos = cmark_parse_reference_inline(&b->string_content, parser->refmap))) {

			cmark_strbuf_drop(&b->string_content, pos);
		}
		if (is_blank(&b->string_content, 0)) {
			// remove blank node (former reference def)
			cmark_node_free(b);
		}
		break;

	case NODE_CODE_BLOCK:
		if (!b->as.code.fenced) { // indented code
			remove_trailing_blank_lines(&b->string_content);
			cmark_strbuf_putc(&b->string_content, '\n');
		} else {

			// first line of contents becomes info
			for (pos = 0; pos < b->string_content.size; ++pos) {
				if (b->string_content.ptr[pos] == '\r' ||
				    b->string_content.ptr[pos] == '\n')
					break;
			}
			assert(pos < b->string_content.size);

			cmark_strbuf tmp = GH_BUF_INIT;
			houdini_unescape_html_f(
			    &tmp,
			    b->string_content.ptr,
			    pos
			);
			cmark_strbuf_trim(&tmp);
			cmark_strbuf_unescape(&tmp);
			b->as.code.info = cmark_chunk_buf_detach(&tmp);

			if (b->string_content.ptr[pos] == '\r')
				pos += 1;
			if (b->string_content.ptr[pos] == '\n')
				pos += 1;
			cmark_strbuf_drop(&b->string_content, pos);
		}
		b->as.code.literal = cmark_chunk_buf_detach(&b->string_content);
		break;

	case NODE_HTML:
		b->as.literal = cmark_chunk_buf_detach(&b->string_content);
		break;

	case NODE_LIST: // determine tight/loose status
		b->as.list.tight = true; // tight by default
		item = b->first_child;

		while (item) {
			// check for non-final non-empty list item ending with blank line:
			if (item->last_line_blank && item->next) {
				b->as.list.tight = false;
				break;
			}
			// recurse into children of list item, to see if there are
			// spaces between them:
			subitem = item->first_child;
			while (subitem) {
				if (ends_with_blank_line(subitem) &&
				    (item->next || subitem->next)) {
					b->as.list.tight = false;
					break;
				}
				subitem = subitem->next;
			}
			if (!(b->as.list.tight)) {
				break;
			}
			item = item->next;
		}

		break;

	default:
		break;
	}
	return parent;
}
Beispiel #13
0
static void postprocess_text(cmark_parser *parser, cmark_node *text, int offset) {
  size_t link_end;
  uint8_t *data = text->as.literal.data,
    *at;
  size_t size = text->as.literal.len;
  int rewind, max_rewind,
      nb = 0, np = 0, ns = 0;

  if (offset < 0 || (size_t)offset >= size)
    return;

  data += offset;
  size -= offset;

  at = (uint8_t *)memchr(data, '@', size);
  if (!at)
    return;

  max_rewind = (int)(at - data);
  data += max_rewind;
  size -= max_rewind;

  for (rewind = 0; rewind < max_rewind; ++rewind) {
    uint8_t c = data[-rewind - 1];

    if (cmark_isalnum(c))
      continue;

    if (strchr(".+-_", c) != NULL)
      continue;

    if (c == '/')
      ns++;

    break;
  }

  if (rewind == 0 || ns > 0) {
    postprocess_text(parser, text, max_rewind + 1 + offset);
    return;
  }

  for (link_end = 0; link_end < size; ++link_end) {
    uint8_t c = data[link_end];

    if (cmark_isalnum(c))
      continue;

    if (c == '@')
      nb++;
    else if (c == '.' && link_end < size - 1 && cmark_isalnum(data[link_end + 1]))
      np++;
    else if (c != '-' && c != '_')
      break;
  }

  if (link_end < 2 || nb != 1 || np == 0 ||
      (!cmark_isalpha(data[link_end - 1]) && data[link_end - 1] != '.')) {
    postprocess_text(parser, text, max_rewind + 1 + offset);
    return;
  }

  link_end = autolink_delim(data, link_end);

  if (link_end == 0) {
    postprocess_text(parser, text, max_rewind + 1 + offset);
    return;
  }

  cmark_chunk_to_cstr(parser->mem, &text->as.literal);

  cmark_node *link_node = cmark_node_new_with_mem(CMARK_NODE_LINK, parser->mem);
  cmark_strbuf buf;
  cmark_strbuf_init(parser->mem, &buf, 10);
  cmark_strbuf_puts(&buf, "mailto:");
  cmark_strbuf_put(&buf, data - rewind, (bufsize_t)(link_end + rewind));
  link_node->as.link.url = cmark_chunk_buf_detach(&buf);

  cmark_node *link_text = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
  cmark_chunk email = cmark_chunk_dup(
      &text->as.literal,
      offset + max_rewind - rewind,
      (bufsize_t)(link_end + rewind));
  cmark_chunk_to_cstr(parser->mem, &email);
  link_text->as.literal = email;
  cmark_node_append_child(link_node, link_text);

  cmark_node_insert_after(text, link_node);

  cmark_node *post = cmark_node_new_with_mem(CMARK_NODE_TEXT, parser->mem);
  post->as.literal = cmark_chunk_dup(&text->as.literal,
    (bufsize_t)(offset + max_rewind + link_end),
    (bufsize_t)(size - link_end));
  cmark_chunk_to_cstr(parser->mem, &post->as.literal);

  cmark_node_insert_after(link_node, post);

  text->as.literal.len = offset + max_rewind - rewind;
  text->as.literal.data[text->as.literal.len] = 0;

  postprocess_text(parser, post, 0);
}
Beispiel #14
0
static cmark_node*
finalize(cmark_parser *parser, cmark_node* b, int line_number)
{
	int firstlinelen;
	int pos;
	cmark_node* item;
	cmark_node* subitem;
	cmark_node* parent;

	parent = b->parent;

	// don't do anything if the cmark_node is already closed
	if (!b->open)
		return parent;

	b->open = false;
	if (line_number > b->start_line) {
		b->end_line = line_number - 1;
	} else {
		b->end_line = line_number;
	}

	switch (b->type) {
		case NODE_PARAGRAPH:
			while (cmark_strbuf_at(&b->string_content, 0) == '[' &&
					(pos = cmark_parse_reference_inline(&b->string_content, parser->refmap))) {

				cmark_strbuf_drop(&b->string_content, pos);
			}
			if (is_blank(&b->string_content, 0)) {
				// remove blank node (former reference def)
				cmark_node_free(b);
			}
			break;

		case NODE_CODE_BLOCK:
			if (!b->as.code.fenced) { // indented code
				remove_trailing_blank_lines(&b->string_content);
				cmark_strbuf_putc(&b->string_content, '\n');
			} else {

				// first line of contents becomes info
				firstlinelen = cmark_strbuf_strchr(&b->string_content, '\n', 0);

				cmark_strbuf tmp = GH_BUF_INIT;
				houdini_unescape_html_f(
					&tmp,
					b->string_content.ptr,
					firstlinelen
					);
				cmark_strbuf_trim(&tmp);
				cmark_strbuf_unescape(&tmp);
				b->as.code.info = cmark_chunk_buf_detach(&tmp);

				cmark_strbuf_drop(&b->string_content, firstlinelen + 1);
			}
			b->as.code.literal = cmark_chunk_buf_detach(&b->string_content);
			break;

	        case NODE_HTML:
			b->as.literal = cmark_chunk_buf_detach(&b->string_content);
			break;

		case NODE_LIST: // determine tight/loose status
			b->as.list.tight = true; // tight by default
			item = b->first_child;

			while (item) {
				// check for non-final non-empty list item ending with blank line:
				if (item->last_line_blank && item->next) {
					b->as.list.tight = false;
					break;
				}
				// recurse into children of list item, to see if there are
				// spaces between them:
				subitem = item->first_child;
				while (subitem) {
					if (ends_with_blank_line(subitem) &&
							(item->next || subitem->next)) {
						b->as.list.tight = false;
						break;
					}
					subitem = subitem->next;
				}
				if (!(b->as.list.tight)) {
					break;
				}
				item = item->next;
			}

			break;

		default:
			break;
	}
	return parent;
}