Esempio n. 1
0
/**
 * rdictionary_lookup() -- recursive dictionary lookup
 * Traverse the binary tree rooted at 'dn', searching for the string 's'
 * (the comparison is done by dict_order_wild()).
 * Every node that compares equal to 's' is duplicated, and the duplicate
 * is pushed onto the front of 'llist', chained through its 'right'
 * pointer. A matching idiom word is collected only if 'match_idiom' is
 * non-zero.
 *
 * Returns the (possibly new) head of the list.
 */
static Dict_node * rdictionary_lookup(Dict_node *llist,
                                      Dict_node * dn, const char * s, int match_idiom)
{
	int order;

	if (NULL == dn) return llist;

	order = dict_order_wild(s, dn->string);

	/* Strictly greater: only the right subtree can hold matches. */
	if (order > 0)
		return rdictionary_lookup(llist, dn->right, s, match_idiom);

	/* Strictly smaller: only the left subtree can hold matches. */
	if (order < 0)
		return rdictionary_lookup(llist, dn->left, s, match_idiom);

	/* This node compares equal. Both subtrees may hold further matches;
	 * visit the right one, then this node, then the left one. */
	llist = rdictionary_lookup(llist, dn->right, s, match_idiom);

	if (match_idiom || !is_idiom_word(dn->string))
	{
		Dict_node * copy = dict_node_new();

		*copy = *dn;          /* shallow copy of the whole node */
		copy->right = llist;  /* 'right' now serves as the list link */
		llist = copy;
	}

	return rdictionary_lookup(llist, dn->left, s, match_idiom);
}
Esempio n. 2
0
/*
 * Returns TRUE if every word in the list 'dn' (linked through the
 * 'right' pointers) is an idiom word; an empty list also yields TRUE.
 * This is useful, because under this condition you want to be able to
 * insert the word anyway, as long as it doesn't match exactly.
 */
int only_idiom_words(Dict_node * dn)
{
	Dict_node * node;

	for (node = dn; NULL != node; node = node->right)
	{
		/* A single non-idiom word settles the answer. */
		if (!is_idiom_word(node->string)) return FALSE;
	}
	return TRUE;
}
Esempio n. 3
0
/**
 * insert_list() -
 * 'p' is the head of a list of dict_nodes chained through their 'left'
 * pointers, and 'l' is the number of nodes to take from it (the 'left'
 * pointer of the last one is not required to be NULL).
 * The nodes are inserted into the dictionary: the middle node first,
 * then (recursively) the first half, then the second half.
 *
 * A node is not inserted as a plain word when:
 *  - its string contains an underbar: it is handed to insert_idiom();
 *  - its string is an idiom word: a warning is issued and it is freed;
 *  - abridged_lookup_list() already finds matching words: a warning
 *    listing them is issued and the node is freed.
 *
 * Historical note (from the original author): the middle/left/right
 * order probably began as an attempt to work around a badly unbalanced
 * binary tree. The tree is now rebalanced with the DSW algorithm, which
 * would seem to make the bisected insertion obsolete; however, it was
 * observed to make the DSW balancing run faster than simple insertion
 * does, possibly because the dictionaries are in alphabetical order and
 * the subdivision randomizes the insertion order a bit.
 */
static void insert_list(Dictionary dict, Dict_node * p, int l)
{
	Dict_node * mid;      /* the node handled by this call */
	Dict_node * rest;     /* head of the second half of the list */
	Dict_node * matches;  /* words already in the dictionary, if any */
	int first_half_len;
	int steps;

	if (0 == l) return;

	/* Walk to the middle element. */
	first_half_len = (l-1)/2;
	mid = p;
	steps = first_half_len;
	while (steps-- > 0)
	{
		mid = mid->left;
	}

	/* Detach the middle element from the list. */
	rest = mid->left;
	mid->left = NULL;
	mid->right = NULL;

	if (contains_underbar(mid->string))
	{
		insert_idiom(dict, mid);
	}
	else if (is_idiom_word(mid->string))
	{
		err_ctxt ec;
		ec.sent = NULL;
		err_msg(&ec, Warn, "Warning: Word \"%s\" found near line %d.\n"
		        "\tWords ending \".Ix\" (x a number) are reserved for idioms.\n"
		        "\tThis word will be ignored.\n",
		        mid->string, dict->line_number);
		free_dict_node(mid);
	}
	else
	{
		matches = abridged_lookup_list(dict, mid->string);
		if (NULL == matches)
		{
			/* Not in the dictionary yet: a regular insertion. */
			dict->root = insert_dict(dict, dict->root, mid);
			dict->num_entries++;
		}
		else
		{
			Dict_node * hit = matches;
			err_ctxt ec;
			ec.sent = NULL;
			err_msg(&ec, Warn, "Warning: The word \"%s\" "
			        "found near line %d of %s matches the following words:\n",
			        mid->string, dict->line_number, dict->name);
			while (NULL != hit)
			{
				fprintf(stderr, "\t%s", hit->string);
				hit = hit->right;
			}
			fprintf(stderr, "\n\tThis word will be ignored.\n");
			free_lookup_list(matches);
			free_dict_node(mid);
		}
	}

	/* Recurse on the two halves; 'mid' itself is no longer touched. */
	insert_list(dict, p, first_half_len);
	insert_list(dict, rest, l - first_half_len - 1);
}
Esempio n. 4
0
/**
 * Search the subtree rooted at 'dn' (whose parent is 'p') for a
 * non-idiom dict_node that matches the string 's' according to
 * dict_order_bare().
 *
 * Returns true if such a node is found; in that case '*to_be_deleted'
 * is set to the node and '*parent' to its parent. Otherwise returns
 * false and leaves both untouched.
 *
 * Note: this function is used in only one place: delete_dictionary_words()
 * which is, itself, not currently used ...
 */
static bool find_one_non_idiom_node(Dict_node * p, Dict_node * dn,
                                   const char * s,
                                   Dict_node **parent, Dict_node **to_be_deleted)
{
	int order;

	if (NULL == dn) return false;

	order = dict_order_bare(s, dn);

	/* Not equal to this node: only one of the subtrees is relevant. */
	if (order < 0)
		return find_one_non_idiom_node(dn, dn->left, s, parent, to_be_deleted);
	if (order > 0)
		return find_one_non_idiom_node(dn, dn->right, s, parent, to_be_deleted);

	/* Equal to this node: try the left subtree, then this node itself,
	 * and finally the right subtree. */
	if (find_one_non_idiom_node(dn, dn->left, s, parent, to_be_deleted))
		return true;

	if (!is_idiom_word(dn->string))
	{
		*to_be_deleted = dn;
		*parent = p;
		return true;
	}

	return find_one_non_idiom_node(dn, dn->right, s, parent, to_be_deleted);
}
Esempio n. 5
0
/**
 * compute_chosen_words() -- fill in linkage->word, the words to display.
 *
 * Walks the words of the linkage in order. For each one, the chosen
 * disjunct (linkage->chosen_disjuncts[i]) and the linkage wordgraph path
 * (linkage->wg_path) determine the display string:
 *  - A NULL chosen disjunct marks a null word. Consecutive null words may
 *    be accumulated into a "null block", which is emitted as one string
 *    wrapped in NULLWORD_START/NULLWORD_END (MT_WALL words are not
 *    wrapped).
 *  - For a linked word, an idiom subscript is stripped; otherwise, if
 *    HIDE_MORPHO holds, the morphemes of one unsplit word may be joined
 *    into a single string, with their subscripts separated by
 *    SUBSCRIPT_SEP. Then SUBSCRIPT_MARK is converted to SUBSCRIPT_DOT
 *    and, when applicable, a "[<guess_mark><regex_name>]" indication is
 *    inserted after the base word.
 *
 * After the loop, words that are NULL (or empty and not needed) are
 * dropped, linkage->num_words is updated, and remap_linkages() is called
 * with the old-index -> new-index table. The wordgraph path that matches
 * the displayed words is stored in linkage->wg_path_display.
 *
 * Side effects visible here: linkage->word is assigned a new exalloc()
 * array (a previous value, if any, is not freed here), link_name of
 * morphology links may be set to NULL, and strings are added to
 * sent->string_set.
 *
 * NOTE(review): 'display_morphology' is not referenced by name below;
 * presumably the HIDE_MORPHO and DISPLAY_GUESS_MARKS macros expand to
 * expressions using it and/or 'opts' -- confirm against their definitions.
 */
void compute_chosen_words(Sentence sent, Linkage linkage, Parse_Options opts)
{
	WordIdx i;   /* index of chosen_words */
	WordIdx j;
	Disjunct **cdjp = linkage->chosen_disjuncts;
	/* Scratch arrays, sized by the incoming word count. They are taken from
	 * the stack (alloca), so they vanish when this function returns. */
	const char **chosen_words = alloca(linkage->num_words * sizeof(*chosen_words));
	int *remap = alloca(linkage->num_words * sizeof(*remap));
	bool *show_word = alloca(linkage->num_words * sizeof(*show_word));
	bool display_morphology = opts->display_morphology;

	Gword **lwg_path = linkage->wg_path;
	Gword **n_lwg_path = NULL; /* new Wordgraph path, to match chosen_words */

	Gword **nullblock_start = NULL; /* start of a null block, to be put in [] */
	size_t nbsize = 0;              /* number of word in a null block */
	Gword *sentence_word;

	memset(show_word, 0, linkage->num_words * sizeof(*show_word));

	if (verbosity_level(D_CCW))
		print_lwg_path(lwg_path, "Linkage");

	for (i = 0; i < linkage->num_words; i++)
	{
		Disjunct *cdj = cdjp[i];
		Gword *w;              /* current word */
		const Gword *nw;       /* next word (NULL if none) */
		Gword **wgp;           /* wordgraph_path traversing pointer */

		const char *t = NULL;  /* current word string */
		bool at_nullblock_end; /* current word is at end of a nullblock */
		bool join_alt = false; /* morpheme-join this alternative */
		char *s;
		size_t l;
		size_t m;

		lgdebug(D_CCW, "Loop start, word%zu: cdj %s, path %s\n",
		        i, cdj ? cdj->word_string : "NULL",
		        lwg_path[i] ? lwg_path[i]->subword : "NULL");

		w = lwg_path[i];
		/* NOTE(review): reads one element past the current word; this
		 * presumes lwg_path has a NULL terminator entry -- confirm. */
		nw = lwg_path[i+1];
		wgp = &lwg_path[i];
		sentence_word = wg_get_sentence_word(sent, w);

		/* FIXME If the original word was capitalized in a capitalizable
		 * position, the displayed null word may be its downcase version. */

		if (NULL == cdj) /* a null word (the chosen disjunct was NULL) */
		{
			chosen_words[i] = NULL;
			nbsize++;
			if (NULL == nullblock_start) /* it starts a new null block */
				nullblock_start = wgp;

			/* The block ends at the end of the path, or when the next word
			 * maps to a different sentence word. */
			at_nullblock_end = (NULL == nw) ||
				(wg_get_sentence_word(sent, nw->unsplit_word) != sentence_word);

			/* Accumulate null words in this alternative */
			if (!at_nullblock_end && (NULL == cdjp[i+1]) &&
			    ((w->morpheme_type == MT_PUNC) == (nw->morpheme_type == MT_PUNC)))
			{
				lgdebug(D_CCW, "Skipping word%zu cdjp=NULL#%zu, path %s\n",
				        i, nbsize, w->subword);
				chosen_words[i] = NULL;
				continue;
			}

			if (NULL != nullblock_start)
			{
				/* If we are here, this null word is an end of a null block */
				lgdebug(+D_CCW, "Handling %zu null words at %zu: ", nbsize, i);

				if (1 == nbsize)
				{
					/* Case 1: A single null subword. */
					lgdebug(D_CCW, "A single null subword.\n");
					t = join_null_word(sent, wgp, nbsize);

					gwordlist_append(&n_lwg_path, w);
				}
				else
				{
					lgdebug(D_CCW, "Combining null subwords");
					/* Use alternative_id to check for start of alternative. */
					if (((*nullblock_start)->alternative_id == *nullblock_start)
					    && at_nullblock_end)
					{
						/* Case 2: A null unsplit_word (all-nulls alternative).*/
						lgdebug(D_CCW, " (null alternative)\n");
						t = sentence_word->subword;

						gwordlist_append(&n_lwg_path, sentence_word);
					}
					else
					{
						/* Case 3: Join together >=2 null morphemes. */
						Gword *wgnull;

						lgdebug(D_CCW, " (null partial word)\n");
						/* The block spans the nbsize path entries ending at wgp. */
						wgnull = wordgraph_null_join(sent, wgp-nbsize+1, wgp);
						gwordlist_append(&n_lwg_path, wgnull);
						t = wgnull->subword;
					}
				}

				/* The block has been consumed; reset the accumulation state. */
				nullblock_start = NULL;
				nbsize = 0;
				show_word[i] = true;

				if (MT_WALL != w->morpheme_type)
				{
					/* Put brackets around the null word. */
					l = strlen(t) + 2;
					s = (char *) alloca(l+1);
					s[0] = NULLWORD_START;
					strcpy(&s[1], t);
					s[l-1] = NULLWORD_END;
					s[l] = '\0';
					t = string_set_add(s, sent->string_set);
					lgdebug(D_CCW, " %s\n", t);
					/* Null words have no links, so take care not to drop them. */
				}
			}
		}
		else
		{
			/* This word has a linkage. */

			/* TODO: Suppress "virtual-morphemes", currently the dictcap ones. */
			char *sm;

			t = cdj->word_string;
			/* Print the subscript, as in "dog.n" as opposed to "dog". */

			if (0)
			{
				/* TODO */
			}
			else
			{
				/* Get rid of those ugly ".Ixx" */
				if (is_idiom_word(t))
				{
					s = strdupa(t);
					sm = strrchr(s, SUBSCRIPT_MARK); /* Possible double subscript. */
					UNREACHABLE(NULL == sm); /* We know it has a subscript. */
					*sm = '\0';
					t = string_set_add(s, sent->string_set);
				}
				else if (HIDE_MORPHO)
				{
					/* Concatenate the word morphemes together into one word.
					 * Concatenate their subscripts into one subscript.
					 * Use subscript separator SUBSCRIPT_SEP.
					 * XXX Check whether we can encounter an idiom word here.
					 * FIXME Combining contracted words is not handled yet, because
					 * combining morphemes which have non-LL links to other words is
					 * not yet implemented.
					 * FIXME Move to a separate function. */
					Gword **wgaltp;
					size_t join_len = 0;
					size_t mcnt = 0;

					/* If the alternative contains morpheme subwords, mark it
					 * for joining... */

					/* This scan also counts the morphemes (mcnt) and sums their
					 * string lengths plus one each (join_len). */
					const Gword *unsplit_word = w->unsplit_word;
					for (wgaltp = wgp, j = i; NULL != *wgaltp; wgaltp++, j++)
					{

						if ((*wgaltp)->unsplit_word != unsplit_word) break;
						if (MT_INFRASTRUCTURE ==
						    (*wgaltp)->unsplit_word->morpheme_type) break;

						mcnt++;

						if (NULL == cdjp[j])
						{
							/* ... but not if it contains a null word */
							join_alt = false;
							break;
						}
						join_len += strlen(cdjp[j]->word_string) + 1;
						if ((*wgaltp)->morpheme_type & IS_REG_MORPHEME)
							join_alt = true;
					}

					if (join_alt)
					{
						/* Join it in two steps: 1. Base words. 2. Subscripts.
						 * FIXME? Can be done in one step (more efficient but maybe
						 * less clear).
						 * Put SUBSCRIPT_SEP between the subscripts.
						 * XXX No 1-1 correspondence between the hidden base words
						 * and the subscripts after the join, in case there are base
						 * words with and without subscripts. */

						const char subscript_sep_str[] = { SUBSCRIPT_SEP, '\0'};
						/* NOTE(review): the calloc() result is used without a
						 * NULL check. */
						char *join = calloc(join_len + 1, 1); /* zeroed out */

						join[0] = '\0';

						/* 1. Join base words. (Could just use the unsplit_word.) */
						for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
						{
							add_morpheme_unmarked(sent, join, cdjp[i+m]->word_string,
							                      (*wgaltp)->morpheme_type);
						}

						strcat(join, subscript_mark_str()); /* tentative */

						/* 2. Join subscripts. */
						for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
						{
							/* Cannot NULLify the word - we may have links to it. */
							if (m != mcnt-1) chosen_words[i+m] = "";

							sm =  strchr(cdjp[i+m]->word_string, SUBSCRIPT_MARK);

							if (NULL != sm)
							{
								/* Supposing stem subscript is .=x (x optional) */
								if (MT_STEM == (*wgaltp)->morpheme_type)
								{
									sm += 1 + STEM_MARK_L; /* sm+strlen(".=") */
									if ('\0' == *sm) sm = NULL;
#if 0
									if ((cnt-1) == m)
									{
										/* Support a prefix-stem combination. In that case
										 * we have just nullified the combined word, so we
										 * need to move it to the position of the prefix.
										 * FIXME: May still not be good enough. */
										move_combined_word = i+m-1;

										/* And the later chosen_word assignment should be:
										 * chosen_words[-1 != move_combined_word ?
										 *    move_combined_word : i] = t;
										 */
									}
									else
									{
										move_combined_word = -1;
									}
#endif
								}
							}
							if (NULL != sm)
							{
								strcat(join, sm+1);
								strcat(join, subscript_sep_str);
							}
						}

						/* Remove an extra mark, if any */
						join_len = strlen(join);
						if ((SUBSCRIPT_SEP == join[join_len-1]) ||
							 (SUBSCRIPT_MARK == join[join_len-1]))
							join[join_len-1] = '\0';

						gwordlist_append(&n_lwg_path, sentence_word);
						t = string_set_add(join, sent->string_set);
						free(join);

						/* Advance to the last joined morpheme: the earlier ones
						 * were set to "" above, and the combined string is
						 * stored at this index by the assignment that ends the
						 * loop body. */
						i += mcnt-1;
					}
				}
			}

			if (!join_alt) gwordlist_append(&n_lwg_path, *wgp);

			/*
			 * Add guess marks in [] square brackets, if needed, at the
			 * end of the base word. Convert the badly-printing
			 * SUBSCRIPT_MARK (hex 03 or ^C) into a period.
			 */
			if (t)
			{

				s = strdupa(t);
				sm = strrchr(s, SUBSCRIPT_MARK);
				if (sm) *sm = SUBSCRIPT_DOT;

				if ((!(w->status & WS_GUESS) && (w->status & WS_INDICT))
				    || !DISPLAY_GUESS_MARKS)
				{
					t = string_set_add(s, sent->string_set);
				}
				else
				{
					const char *regex_name = w->regex_name;
					/* 4 = 1(null) + 1(guess_mark) + 2 (sizeof "[]") */
					int baselen = NULL == sm ? strlen(t) : (size_t)(sm-s);
					char guess_mark = 0;

					switch (w->status & WS_GUESS)
					{
						case WS_SPELL:
							guess_mark = GM_SPELL;
							break;
						case WS_RUNON:
							guess_mark = GM_RUNON;
							break;
						case WS_REGEX:
							guess_mark = GM_REGEX;
							break;
						case 0:
							guess_mark = GM_UNKNOWN;
							break;
						default:
							assert(0, "Missing 'case: %2x'", w->status & WS_GUESS);
					}

					/* In the case of display_morphology==0, the guess indication of
					 * the last subword is used as the guess indication of the whole
					 * word.
					 * FIXME? The guess indications of other subwords are ignored in
					 * this mode. This implies that if a first or middle subword has
					 * a guess indication but the last subword doesn't have, no guess
					 * indication would be shown at all. */

					if ((NULL == regex_name) || HIDE_MORPHO) regex_name = "";
					/* Result layout: base word, '[', guess mark, regex name, ']',
					 * then the subscript (if any). */
					s = alloca(strlen(t) + strlen(regex_name) + 4);
					/* strncpy() does not terminate here; the terminator is
					 * written by the strcpy() that follows the two stores. */
					strncpy(s, t, baselen);
					s[baselen] = '[';
					s[baselen + 1] = guess_mark;
					strcpy(s + baselen + 2, regex_name);
					strcat(s, "]");
					/* sm still points into the strdupa() copy made above, where
					 * the mark has already been replaced by SUBSCRIPT_DOT. */
					if (NULL != sm) strcat(s, sm);
					t = string_set_add(s, sent->string_set);
				}
			}
		}

		assert(t != NULL, "Word %zu: NULL", i);
		chosen_words[i] = t;
	}

	/* Conditional test removal of quotation marks and the "capdict" tokens,
	 * to facilitate using diff on sentence batch runs. */
	if (test_enabled("removeZZZ"))
	{
		/* NOTE(review): j is initialized here but not used in this loop. */
		for (i=0, j=0; i<linkage->num_links; i++)
		{
			Link *lnk = &(linkage->link_array[i]);

			if (0 == strcmp("ZZZ", lnk->link_name))
				chosen_words[lnk->rw] = NULL;
		}
	}

	/* If morphology printing is being suppressed, then all links
	 * connecting morphemes will be discarded. */
	if (HIDE_MORPHO)
	{
		/* Discard morphology links. */
		for (i=0; i<linkage->num_links; i++)
		{
			Link * lnk = &linkage->link_array[i];

			if (is_morphology_link(lnk->link_name))
			{
				/* Mark link for discarding. */
				lnk->link_name = NULL;
			}
			else
			{
				/* Mark word for not discarding. */
				show_word[lnk->rw] = true;
				show_word[lnk->lw] = true;
			}
		}
	}

	/* We alloc a little more than needed, but so what... */
	linkage->word = (const char **) exalloc(linkage->num_words*sizeof(char *));

	/* Copy over the chosen words, dropping the discarded words.
	 * However, don't discard existing words (chosen_words[i][0]).
	 * Note that if a word only has morphology links and is not combined with
	 * another word, then it will get displayed with no links at all (e.g.
	 * when explicitly specifying root and suffix for debug: root.= =suf */
	for (i=0, j=0; i<linkage->num_words; ++i)
	{
		if (chosen_words[i] &&
		    (chosen_words[i][0] || (!HIDE_MORPHO || show_word[i])))
		{
			/* NOTE(review): linkage->word was allocated just above and
			 * slot j has not been written yet, so cwtmp holds whatever
			 * exalloc() returned there; chosen_words[i] is not read again
			 * after this loop. The swap looks equivalent to a plain
			 * assignment -- confirm exalloc() semantics before changing. */
			const char *cwtmp = linkage->word[j];
			linkage->word[j] = chosen_words[i];
			chosen_words[i] = cwtmp;
			remap[i] = j;
			j++;
		}
		else
		{
			/* -1 marks a word that is not displayed. */
			remap[i] = -1;
		}
	}
	linkage->num_words = j;

	remap_linkages(linkage, remap); /* Update linkage->link_array / num_links. */

	linkage->wg_path_display = n_lwg_path;

	if (verbosity_level(D_CCW))
		print_lwg_path(n_lwg_path, "Display");
}