Beispiel #1
0
/*
 * SmartyPants callback for a digit: replaces the ASCII fractions "1/2",
 * "1/4" and "3/4" with the matching single-glyph fraction.
 *
 * A fraction is only recognised when the preceding character is a word
 * boundary and the fraction is followed by the end of the input or by a
 * word boundary. "1/4" may additionally be followed by "th" and "3/4" by
 * "ths" (case-insensitive).
 *
 * Returns 2 when a fraction was written to `ob`; otherwise text[0] is copied
 * to `ob` unchanged and 0 is returned. `smrt` is not used here.
 */
static size_t
smartypants_cb__number(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size)
{
	/* All three fractions share the "<digit>/<digit>" shape. */
	if (word_boundary(previous_char) && size >= 3 && text[1] == '/') {
		const uint8_t numerator = text[0];
		const uint8_t denominator = text[2];

		if (numerator == '1' && denominator == '2') {
			if (size == 3 || word_boundary(text[3])) {
				BUFPUTSL(ob, "½");
				return 2;
			}
		} else if (numerator == '1' && denominator == '4') {
			/* Also accept the ordinal form "1/4th". */
			if (size == 3 || word_boundary(text[3]) ||
				(size >= 5 && tolower(text[3]) == 't' && tolower(text[4]) == 'h')) {
				BUFPUTSL(ob, "¼");
				return 2;
			}
		} else if (numerator == '3' && denominator == '4') {
			/* Also accept the plural ordinal form "3/4ths". */
			if (size == 3 || word_boundary(text[3]) ||
				(size >= 6 && tolower(text[3]) == 't' && tolower(text[4]) == 'h' && tolower(text[5]) == 's')) {
				BUFPUTSL(ob, "¾");
				return 2;
			}
		}
	}

	/* Not a fraction: pass the digit through untouched. */
	bufputc(ob, text[0]);
	return 0;
}
Beispiel #2
0
/*
 * Test whether the pattern `prefix` matches the contents of `buf` starting
 * at offset `start`.
 *
 * Pattern syntax, as implemented below:
 *   - a leading '<' requires a word boundary immediately before `start`
 *     (the start of the buffer counts as one);
 *   - a '>' requires a word boundary at that position (the end of the
 *     buffer counts as one) and ends the comparison;
 *   - every other byte must equal the lower-cased buffer byte, so the
 *     pattern is expected to be written in lower case.
 *
 * Returns non-zero when the pattern matches, 0 otherwise.
 */
static int
smartypants_cmpsub(const struct buf *buf, size_t start, const char *prefix)
{
	size_t i;

	if (prefix[0] == '<') {
		/* Nothing precedes offset 0, so it is treated as a boundary,
		 * mirroring how the end of the buffer satisfies '>'. */
		if (start > 0 && !word_boundary(buf->data[start - 1]))
			return 0;

		prefix++;
	}

	for (i = start; i < buf->size; ++i) {
		char c, p;

		/* <ctype.h> functions require an unsigned char value. */
		c = (char)tolower((unsigned char)buf->data[i]);
		p = *prefix++;

		if (p == 0)
			return 1;

		if (p == '>')
			return word_boundary(c);

		if (c != p)
			return 0;
	}

	/* The buffer ended. It is a match if the pattern is exhausted too, or
	 * if all that remains is a '>' anchor (end of input is a boundary). */
	return (*prefix == '\0' || *prefix == '>');
}
Beispiel #3
0
/*
 * Emit an HTML curly-quote entity ("&l?quo;" or "&r?quo;", where ? is
 * `quote`) into `ob` and toggle the open/closed state in `*is_open`.
 *
 * A closing quote (state currently open) is only emitted when `next_char`
 * is a word boundary; an opening quote (state currently closed) only when
 * `previous_char` is one.
 *
 * Returns 1 when an entity was written, 0 when nothing was written and the
 * state was left unchanged.
 */
static int
smartypants_quotes(struct buf *ob, uint8_t previous_char, uint8_t next_char, uint8_t quote, int *is_open)
{
	char ent[8]; /* "&" + side + kind + "quo;" + NUL == 8 bytes */
	const int closing = (*is_open != 0);

	if (closing) {
		if (!word_boundary(next_char))
			return 0;
	} else if (!word_boundary(previous_char)) {
		return 0;
	}

	snprintf(ent, sizeof(ent), "&%c%cquo;", closing ? 'r' : 'l', quote);
	*is_open = !closing;
	bufputs(ob, ent);
	return 1;
}
Beispiel #4
0
/*
 * Emit the HTML curly-quote entity for the quote character at text->data[i]
 * into `ob`.
 *
 * `is_open` says whether a quote of this kind is currently open: if so a
 * right (closing) entity is produced, otherwise a left (opening) one. The
 * entity is single ("s") when the character is an apostrophe and double
 * ("d") otherwise.
 *
 * A closing quote is rejected when it is followed by a non-boundary
 * character; an opening quote is rejected when it is preceded by one. The
 * two ends of the buffer are accepted without a neighbour check.
 *
 * Returns 1 when an entity was written, 0 otherwise.
 */
static int
smartypants_quotes(struct buf *ob, struct buf *text, size_t i, int is_open)
{
	char ent[8]; /* "&" + side + kind + "quo;" + NUL == 8 bytes */
	char side;
	char kind;

	if (is_open) {
		if (i + 1 < text->size && !word_boundary(text->data[i + 1]))
			return 0;
		side = 'r';
	} else {
		if (i > 0 && !word_boundary(text->data[i - 1]))
			return 0;
		side = 'l';
	}

	kind = (text->data[i] == '\'') ? 's' : 'd';

	snprintf(ent, sizeof(ent), "&%c%cquo;", side, kind);
	bufputs(ob, ent);
	return 1;
}
Beispiel #5
0
/*
 * SmartyPants callback for a single quote (apostrophe) at text[0].
 *
 * Handled cases, in order:
 *   1. Two apostrophes in a row are treated as a double quote and passed
 *      to smartypants_quotes() with the in_dquote state.
 *   2. English contractions: 's 't 'm 'd, and 're 'll 've, when followed by
 *      the end of the input or a word boundary, become "&rsquo;".
 *   3. Otherwise the apostrophe is passed to smartypants_quotes() with the
 *      in_squote state.
 *   4. If none of the above produced output, the character is copied as is.
 *
 * `text` holds `size` readable bytes; every access beyond text[0] is guarded
 * by a check on `size`.
 *
 * Returns 1 when the second apostrophe of a pair was consumed as well, and
 * 0 in every other case.
 */
static size_t
smartypants_cb__squote(struct buf *ob, struct smartypants_data *smrt, uint8_t previous_char, const uint8_t *text, size_t size)
{
	if (size >= 2) {
		uint8_t t1 = tolower(text[1]);

		if (t1 == '\'') {
			if (smartypants_quotes(ob, previous_char, size >= 3 ? text[2] : 0, 'd', &smrt->in_dquote))
				return 1;
		}

		/* One-letter contraction: the input ends right after the letter
		 * (size == 2) or the letter is followed by a word boundary. */
		if ((t1 == 's' || t1 == 't' || t1 == 'm' || t1 == 'd') &&
			(size == 2 || word_boundary(text[2]))) {
			BUFPUTSL(ob, "&rsquo;");
			return 0;
		}

		if (size >= 3) {
			uint8_t t2 = tolower(text[2]);

			/* Two-letter contraction: the input ends right after the
			 * letters (size == 3) or they are followed by a boundary. */
			if (((t1 == 'r' && t2 == 'e') ||
				(t1 == 'l' && t2 == 'l') ||
				(t1 == 'v' && t2 == 'e')) &&
				(size == 3 || word_boundary(text[3]))) {
				BUFPUTSL(ob, "&rsquo;");
				return 0;
			}
		}
	}

	/* text[1] only exists when at least two bytes are available. */
	if (smartypants_quotes(ob, previous_char, size > 1 ? text[1] : 0, 's', &smrt->in_squote))
		return 0;

	bufputc(ob, text[0]);
	return 0;
}
Beispiel #6
0
// Test driver: runs the word, character, sentence and line boundary tests
// in that order. Any std::exception escaping a test is reported on stderr
// and ends the program with EXIT_FAILURE.
int main()
{
    try
    {
        std::cout << "Testing word boundary" << std::endl;
        word_boundary();

        std::cout << "Testing character boundary" << std::endl;
        test_boundaries(character, nones, 0, lb::character);

        std::cout << "Testing sentence boundary" << std::endl;
        test_boundaries(sentence1, sentence1a, sentence1b, lb::sentence);

        std::cout << "Testing line boundary" << std::endl;
        test_boundaries(line1, line1a, line1b, lb::line);
    }
    catch (const std::exception &error)
    {
        std::cerr << "Failed " << error.what() << std::endl;
        return EXIT_FAILURE;
    }

    // NOTE(review): FINALIZE is defined elsewhere; presumably it reports the
    // test summary and supplies main's return value -- confirm.
    FINALIZE();
}
Beispiel #7
0
/**
 * Check that the words of a linkage form a path through the sentence
 * Wordgraph and, when applicable, that the morpheme types along that path
 * form a valid combination.
 *
 * The linkage words are visited in order. A queue of Wordgraph_pathpos
 * entries holds every Wordgraph position that is still compatible with the
 * chosen disjuncts seen so far: wp_old is the queue for the current word and
 * wp_new is built from it for the next word. A null word (no chosen
 * disjunct) advances all positions. After the last word, the linkage is
 * accepted only if the new queue contains a word whose morpheme type is
 * MT_INFRASTRUCTURE.
 *
 * When the sentence has no null words and the affix dictionary has a
 * regex_root, a string with one affix-type character per non-empty path
 * word is built and matched against that regex.
 *
 * @param sent  The sentence. Its wordgraph, dict->affix_table, length and
 *              null_count are read; num_valid_linkages is decremented on
 *              failure.
 * @param lkg   The linkage to check. On success lkg->wg_path is set to the
 *              accepted path. On failure it is set to NULL and lkg->lifo is
 *              marked as discarded with a violation message.
 * @param opts  Only opts->verbosity is read, to decide whether the invalid
 *              morpheme combination warning is printed.
 * @return true if the linkage was accepted, false otherwise.
 */
bool sane_linkage_morphism(Sentence sent, Linkage lkg, Parse_Options opts)
{
	Wordgraph_pathpos *wp_new = NULL;
	Wordgraph_pathpos *wp_old = NULL;
	Wordgraph_pathpos *wpp;
	Gword **next; /* next Wordgraph words of the current word */

	size_t i;
	Linkage_info * const lifo = &lkg->lifo;

	bool match_found = true; /* if all the words are null - it's still a match */
	Gword **lwg_path;

	Dictionary afdict = sent->dict->affix_table;       /* for SANEMORPHISM */
	char *const affix_types = alloca(sent->length*2 + 1);   /* affix types */

	/* Start with an empty string; it is only filled in by the morpheme
	 * type check further down. */
	affix_types[0] = '\0';

	/* Populate the path word queue, initializing the path to NULL. */
	for (next = sent->wordgraph->next; *next; next++)
	{
		wordgraph_path_append(&wp_new, /*path*/NULL, /*add_word*/NULL, *next);
	}
	assert(NULL != wp_new, "Path word queue is empty");

	for (i = 0; i < lkg->num_words; i++)
	{
		Disjunct *cdj;            /* chosen disjunct */

		lgdebug(D_SLM, "%p Word %zu: ", lkg, i);

		if (NULL == wp_new)
		{
			lgdebug(+D_SLM, "- No more words in the wordgraph\n");
			match_found = false;
			break;
		}

		/* The queue built for the previous word becomes the current one,
		 * and a fresh queue is started for the next word. */
		if (wp_old != wp_new)
		{
			wordgraph_path_free(wp_old, true);
			wp_old = wp_new;
		}
		wp_new = NULL;
		//wordgraph_pathpos_print(wp_old);

		cdj = lkg->chosen_disjuncts[i];
		/* Handle null words */
		if (NULL == cdj)
		{
			lgdebug(D_SLM, "- Null word\n");
			/* A null word matches any word in the Wordgraph -
			 * so, unconditionally proceed in all paths in parallel. */
			match_found = false;
			for (wpp = wp_old; NULL != wpp->word; wpp++)
			{
				if (NULL == wpp->word->next)
					continue; /* This path encountered the Wordgraph end */

				/* The null words cannot be marked here because wpp->path consists
				 * of pointers to the Wordgraph words, and these words are common to
				 * all the linkages, with potentially different null words in each
				 * of them. However, the position of the null words can be inferred
				 * from the null words in the word array of the Linkage structure.
				 */
				for (next = wpp->word->next; NULL != *next; next++)
				{
					match_found = true;
					wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next);
				}
			}
			continue;
		}

		/* Reached only when a preceding null word could not advance any
		 * path (match_found was cleared in the null-word branch above). */
		if (!match_found)
		{
			const char *e = "Internal error: Too many words in the linkage\n";
			lgdebug(D_SLM, "- %s", e);
			prt_error("Error: %s.", e);
			break;
		}

		assert(MT_EMPTY != cdj->word[0]->morpheme_type); /* already discarded */

		if (debug_level(D_SLM)) print_with_subscript_dot(cdj->string);

		match_found = false;
		/* Proceed in all the paths in which the word is found. */
		for (wpp = wp_old; NULL != wpp->word; wpp++)
		{
			const Gword **wlp; /* disjunct word list */

			for (wlp = cdj->word; *wlp; wlp++)
			{
				if (*wlp == wpp->word)
				{
					match_found = true;
					for (next = wpp->word->next; NULL != *next; next++)
					{
						wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next);
					}
					break;
				}
			}
		}

		if (!match_found)
		{
			/* FIXME? A message can be added here if there are too many words
			 * in the linkage (can happen only if there is an internal error). */
			lgdebug(D_SLM, "- No Wordgraph match\n");
			break;
		}
		lgdebug(D_SLM, "\n");
	}

	if (match_found)
	{
		match_found = false;
		/* Validate that there are no missing words in the linkage. It is so if
		 * the dummy termination word is found in the new pathpos queue. */
		if (NULL != wp_new)
		{
			for (wpp = wp_new; NULL != wpp->word; wpp++)
			{
				if (MT_INFRASTRUCTURE == wpp->word->morpheme_type) {
					match_found = true;
					/* Exit the loop with wpp of the termination word. */
					break;
				}
			}
		}
		if (!match_found)
		    lgdebug(D_SLM, "%p Missing word(s) at the end of the linkage.\n", lkg);
	}

	/* From here on, whenever match_found is true, wpp is the queue entry of
	 * the termination word found above and wpp->path is the accepted path. */

#define DEBUG_morpheme_type 0
	/* Check the morpheme type combination.
	 * If null_count > 0, the morpheme type combination may be invalid
	 * due to null subwords, so skip this check. */
	if (match_found && (0 == sent->null_count) &&
		(NULL != afdict) && (NULL != afdict->regex_root))
	{
		const Gword **w;
		char *affix_types_p = affix_types;

		/* Construct the affix_types string. */
#if DEBUG_morpheme_type
		print_lwg_path(wpp->path);
#endif
		i = 0;
		for (w = wpp->path; *w; w++)
		{
			i++;
			if (MT_EMPTY == (*w)->morpheme_type) continue; /* really a null word */

			/* One affix-type character is appended per remaining path word. */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch-enum"
			switch ((*w)->morpheme_type)
			{
#pragma GCC diagnostic pop
				default:
					/* What to do with the rest? */
					/* For now they fall through and are treated as MT_WORD. */
				case MT_WORD:
					*affix_types_p = AFFIXTYPE_WORD;
					break;
				case MT_PREFIX:
					*affix_types_p = AFFIXTYPE_PREFIX;
					break;
				case MT_STEM:
					*affix_types_p = AFFIXTYPE_STEM;
					break;
				case MT_MIDDLE:
					*affix_types_p = AFFIXTYPE_MIDDLE;
					break;
				case MT_SUFFIX:
					*affix_types_p = AFFIXTYPE_SUFFIX;
					break;
			}

#if DEBUG_morpheme_type
			lgdebug(D_SLM, "Word %zu: %s affixtype=%c\n",
			     i, (*w)->subword,  *affix_types_p);
#endif

			affix_types_p++;
		}
		*affix_types_p = '\0';

#ifdef WORD_BOUNDARIES /* not yet implemented */
		/* NOTE(review): this disabled block appends AFFIXTYPE_END after the
		 * terminator written above without re-terminating the string, and
		 * uses w after the loop has ended - revisit before enabling. */
		{
			const Gword *uw;

			/* If w is an "end subword", return its unsplit word, else NULL. */
			uw = word_boundary(w); /* word_boundary() unimplemented */

			if (NULL != uw)
			{
				*affix_types_p++ = AFFIXTYPE_END;
				lgdebug(D_SLM, "%p End of Gword %s\n", lkg, uw->subword);
			}
		}
#endif

		/* Check if affix_types is valid according to SANEMORPHISM. */
		if (('\0' != affix_types[0]) &&
		    (NULL == match_regex(afdict->regex_root, affix_types)))
		{
			/* Morpheme type combination is invalid */
			match_found = false;
			/* Notify to stdout, so it will be shown along with the result.
			 * XXX We should have a better way to notify. */
			if (0 < opts->verbosity)
				printf("Warning: Invalid morpheme type combination '%s', "
				       "run with !bad and !verbosity=4 to debug\n", affix_types);
		}
	}

	/* Save the accepted path before the queues are released. The second
	 * argument to the wp_new free is false on success - presumably so the
	 * saved path is not freed along with the queue; confirm against
	 * wordgraph_path_free(). */
	if (match_found) lwg_path = (Gword **)wpp->path; /* OK to modify */
	wordgraph_path_free(wp_old, true);
	wordgraph_path_free(wp_new, !match_found);

	if (match_found)
	{
		if ('\0' != affix_types[0])
		{
			lgdebug(D_SLM, "%p Morpheme type combination '%s'\n", lkg, affix_types);
		}
		lgdebug(+D_SLM, "%p SUCCEEDED\n", lkg);
		lkg->wg_path = lwg_path;
		return true;
	}

	/* Oh no ... invalid morpheme combination! */
	sent->num_valid_linkages --;
	lifo->N_violations++;
	lifo->pp_violation_msg = "Invalid morphism construction.";
	lkg->wg_path = NULL;
	lifo->discarded = true;
	lgdebug(D_SLM, "%p FAILED\n", lkg);
	return false;
}