Example #1
/**
 * Assumes that the sentence expression lists have been generated.
 */
void prepare_to_parse(Sentence sent, Parse_Options opts)
{
	size_t i;

	build_sentence_disjuncts(sent, opts->disjunct_cost, opts);
	if (verbosity_level(D_PREP))
	{
		prt_error("Debug: After expanding expressions into disjuncts:\n");
		print_disjunct_counts(sent);
	}
	print_time(opts, "Built disjuncts");

	for (i=0; i<sent->length; i++)
	{
		sent->word[i].d = eliminate_duplicate_disjuncts(sent->word[i].d);

		/* Some long Russian sentences can really blow up here. */
		if (resources_exhausted(opts->resources))
			return;
	}
	print_time(opts, "Eliminated duplicate disjuncts");

	if (verbosity_level(D_PREP))
	{
		prt_error("Debug: After expression pruning and duplicate elimination:\n");
		print_disjunct_counts(sent);
	}

	setup_connectors(sent);
}
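All of the examples here gate their diagnostic output behind a verbosity check before calling the debug printer. Below is a minimal standalone sketch of that gating pattern, using a plain level variable and fprintf rather than the library's verbosity_level()/prt_error() machinery; the D_PREP value and the debug_msg macro are invented for illustration.

#include <stdio.h>

static int verbosity = 2;           /* global debug level for this sketch */

#define D_PREP 2                    /* hypothetical debug level for this stage */

/* Print a debug message only when the current verbosity reaches the level. */
#define debug_msg(level, ...) \
    do { if (verbosity >= (level)) fprintf(stderr, __VA_ARGS__); } while (0)

int main(void)
{
    debug_msg(D_PREP, "Debug: After expanding expressions into disjuncts:\n");
    debug_msg(D_PREP + 1, "This one is suppressed at verbosity=2\n");
    return 0;
}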
Example #2
static pp_linkset *read_link_set(pp_knowledge *k,
                                 const char *label, String_set *ss)
{
  /* Read the link set, marked by label in the knowledge file, into a set of
     links whose handle is returned. If the link set is not defined in the
     file, it is taken to be empty: a warning is printed and an empty set is
     returned. */
  int n_strings,i;
  pp_linkset *ls;
  if (!pp_lexer_set_label(k->lt, label))
  {
    if (verbosity_level(+D_PPK))
      prt_error("Warning: File %s: Link set %s not defined: assuming empty\n",
             k->path, label);
    n_strings = 0;
  }
  else
  {
    n_strings = pp_lexer_count_tokens_of_label(k->lt);
    if (-1 == n_strings) return &LINK_SET_ERROR;
  }
  ls = pp_linkset_open(n_strings);
  for (i=0; i<n_strings; i++)
    pp_linkset_add(ls,
                   string_set_add(pp_lexer_get_next_token_of_label(k->lt),ss));
  return ls;
}
Example #3
static bool read_form_a_cycle_rules(pp_knowledge *k, const char *label)
{
  size_t n_commas, n_tokens;
  size_t r, i;
  pp_linkset *lsHandle;
  const char **tokens;
  if (!pp_lexer_set_label(k->lt, label)) {
      k->n_form_a_cycle_rules = 0;
      if (verbosity_level(+D_PPK))
          prt_error("Warning: File %s: Not using any 'form a cycle' rules\n",
                    k->path);
  }
  else {
    n_commas = pp_lexer_count_commas_of_label(k->lt);
    k->n_form_a_cycle_rules = (n_commas + 1)/2;
  }
  k->form_a_cycle_rules=
    (pp_rule*) malloc ((1+k->n_form_a_cycle_rules)*sizeof(pp_rule));
  for (r=0; r<k->n_form_a_cycle_rules; r++)
    {
      /* read link set */
      tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
      if (n_tokens <= 0)
      {
        prt_error("Error: File %s: Syntax error\n", k->path);
        return false;
      }
      lsHandle = pp_linkset_open(n_tokens);
      for (i=0; i<n_tokens; i++)
          pp_linkset_add(lsHandle,string_set_add(tokens[i], k->string_set));
      k->form_a_cycle_rules[r].link_set = lsHandle;

      /* read error message */
      tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
      if (n_tokens > 1)
      {
         prt_error("Error: File %s: Invalid syntax (rule %zu of %s)\n",
                   k->path, r+1,label);
         return false;
      }
      k->form_a_cycle_rules[r].msg = string_set_add(tokens[0], k->string_set);
      k->form_a_cycle_rules[r].use_count = 0;
    }

  /* sentinel entry */
  k->form_a_cycle_rules[k->n_form_a_cycle_rules].msg = 0;
  k->form_a_cycle_rules[k->n_form_a_cycle_rules].use_count = 0;

  return true;
}
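The rule table built above gets one extra slot used as a sentinel entry (msg == NULL), so later code can walk the array without carrying a separate length. A minimal standalone sketch of that convention, with a made-up toy_rule type standing in for the library's pp_rule:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical rule record; stands in for the library's pp_rule. */
typedef struct { const char *msg; int use_count; } toy_rule;

/* Walk a rule array until the sentinel entry (msg == NULL). */
static void print_rules(const toy_rule *rules)
{
    for (const toy_rule *r = rules; r->msg != NULL; r++)
        printf("rule: %s (used %d times)\n", r->msg, r->use_count);
}

int main(void)
{
    size_t n_rules = 2;
    /* One extra slot for the sentinel, as in read_form_a_cycle_rules(). */
    toy_rule *rules = malloc((1 + n_rules) * sizeof(toy_rule));
    rules[0] = (toy_rule){"first rule", 0};
    rules[1] = (toy_rule){"second rule", 0};
    rules[n_rules].msg = NULL;       /* sentinel entry */
    rules[n_rules].use_count = 0;

    print_rules(rules);
    free(rules);
    return 0;
}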
Example #4
static bool read_bounded_rules(pp_knowledge *k, const char *label)
{
  const char **tokens;
  size_t n_commas, n_tokens;
  size_t r;
  if (!pp_lexer_set_label(k->lt, label)) {
      k->n_bounded_rules = 0;
      if (verbosity_level(+D_PPK))
        prt_error("Warning: File %s: Not using any 'bounded' rules\n", k->path);
  }
  else {
    n_commas = pp_lexer_count_commas_of_label(k->lt);
    k->n_bounded_rules = (n_commas + 1)/2;
  }
  k->bounded_rules = (pp_rule*) malloc ((1+k->n_bounded_rules)*sizeof(pp_rule));
  for (r=0; r<k->n_bounded_rules; r++)
    {
      /* read domain */
      tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
      if (n_tokens!=1)
      {
        prt_error("Error: File %s: Invalid syntax: rule %zu of %s\n",
                  k->path, r+1,label);
        return false;
      }
      k->bounded_rules[r].domain = (int) tokens[0][0];

      /* read error message */
      tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
      if (n_tokens!=1)
      {
        prt_error("Error: File %s: Invalid syntax: rule %zu of %s\n",
                  k->path, r+1,label);
        return false;
      }
      k->bounded_rules[r].msg = string_set_add(tokens[0], k->string_set);
      k->bounded_rules[r].use_count = 0;
    }

  /* sentinel entry */
  k->bounded_rules[k->n_bounded_rules].msg = 0;
  k->bounded_rules[k->n_bounded_rules].use_count = 0;

  return true;
}
Example #5
static bool read_contains_rules(pp_knowledge *k, const char *label,
                                pp_rule **rules, size_t *nRules)
{
  /* Read the 'contains_one_rules' or the 'contains_none_rules' into the
     given rules array. */
  size_t n_tokens, i, r;
  int n_commas;
  const char *p;
  const char **tokens;
  if (!pp_lexer_set_label(k->lt, label)) {
      *nRules = 0;
      if (verbosity_level(+D_PPK))
        prt_error("Warning: File %s: Not using any %s rules\n", k->path, label);
  }
  else {
    n_commas = pp_lexer_count_commas_of_label(k->lt);
    if (-1 == n_commas) return false;
    *nRules = (n_commas + 1)/3;
  }
  *rules = (pp_rule*) malloc ((1+*nRules)*sizeof(pp_rule));
  for (r=0; r<*nRules; r++)
    {
      /* first read link */
      tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
      if (n_tokens > 1)
      {
        prt_error("Error: File %s: Invalid syntax in %s (rule %zu)\n",
                  k->path, label, r+1);
        return false;
      }

      (*rules)[r].selector = string_set_add(tokens[0], k->string_set);

      /* read link set */
      tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
      (*rules)[r].link_set = pp_linkset_open(n_tokens);
      (*rules)[r].link_set_size = n_tokens;
      (*rules)[r].link_array = (const char **) malloc((1+n_tokens)*sizeof(const char*));
      for (i=0; i<n_tokens; i++)
      {
        p = string_set_add(tokens[i], k->string_set);
        pp_linkset_add((*rules)[r].link_set, p);
        (*rules)[r].link_array[i] = p;
      }
      (*rules)[r].link_array[i]=0; /* NULL-terminator */

      /* read error message */
      tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
      if (n_tokens > 1)
      {
        prt_error("Error: File %s: Invalid syntax in %s (rule %zu)\n",
                  k->path, label, r+1);
        return false;
      }

      (*rules)[r].msg = string_set_add(tokens[0], k->string_set);
      (*rules)[r].use_count = 0;
    }

  /* sentinel entry */
  (*rules)[*nRules].msg = 0;
  (*rules)[*nRules].use_count = 0;

  return true;
}
Example #6
void * object_open(const char *filename,
                   void * (*opencb)(const char *, const void *),
                   const void * user_data)
{
	/* Dictionary data directory path cache -- per-thread storage. */
	static TLS char *path_found;
	char *completename = NULL;
	void *fp = NULL;
	char *data_dir = NULL;
	const char **path = NULL;

	if (NULL == filename)
	{
		/* Invalidate the dictionary data directory path cache. */
		char *pf = path_found;
		path_found = NULL;
		free(pf);
		return NULL;
	}

	if (NULL == path_found)
	{
		data_dir = dictionary_get_data_dir();
		if (verbosity_level(D_USER_FILES))
		{
			char cwd[MAX_PATH_NAME];
			char *cwdp = getcwd(cwd, sizeof(cwd));
			prt_error("Debug: Current directory: %s\n", NULL == cwdp ? "NULL": cwdp);
			prt_error("Debug: Last-resort data directory: %s\n",
					  data_dir ? data_dir : "NULL");
		}
	}

	/* Look for absolute filename.
	 * Unix: starts with leading slash.
	 * Windows: starts with C:\  except that the drive letter may differ. */
	if ((filename[0] == '/')
#ifdef _WIN32
		|| ((filename[1] == ':')
			 && ((filename[2] == '\\') || (filename[2] == '/')))
		|| (filename[0] == '\\') /* UNC path */
#endif /* _WIN32 */
	   )
	{
		/* opencb() returns NULL if the file does not exist. */
		fp = opencb(filename, user_data);
		lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", filename, NOTFOUND(fp));
	}
	else
	{
		/* A path list in which to search for dictionaries.
		 * path_found, data_dir or DEFAULTPATH may be NULL. */
		const char *dictpath[] =
		{
			path_found,
			".",
			"." DIR_SEPARATOR "data",
			"..",
			".." DIR_SEPARATOR "data",
			data_dir,
			DEFAULTPATH,
		};
		size_t i = sizeof(dictpath)/sizeof(dictpath[0]);

		for (path = dictpath; i-- > 0; path++)
		{
			if (NULL == *path) continue;

			free(completename);
			completename = join_path(*path, filename);
			fp = opencb(completename, user_data);
			lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", completename, NOTFOUND(fp));
			if ((NULL != fp) || (NULL != path_found)) break;
		}
	}

	if (NULL == fp)
	{
		fp = opencb(filename, user_data);
		lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", filename, NOTFOUND(fp));
	}
	else if (NULL == path_found)
	{
		char *pfnd = strdup((NULL != completename) ? completename : filename);
		if ((0 < verbosity) && (dict_file_open == opencb))
			prt_error("Info: Dictionary found at %s\n", pfnd);
		for (size_t i = 0; i < 2; i++)
		{
			char *root = strrchr(pfnd, DIR_SEPARATOR[0]);
			if (NULL != root) *root = '\0';
		}
		path_found = pfnd;
	}

	free(data_dir);
	free(completename);
	return fp;
}
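object_open() walks a fixed list of candidate directories and caches the directory of the first successful open for later lookups. A simplified standalone sketch of that search-and-cache idea, using plain fopen() instead of the opencb callback and an ordinary static variable instead of per-thread storage; the names search_open and dir_found are invented for the sketch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Cache of the directory where a file was last found (not thread-local here). */
static char *dir_found;

static FILE *search_open(const char *filename)
{
    const char *dirs[] = { dir_found, ".", "./data", "..", "../data" };
    FILE *fp = NULL;

    for (size_t i = 0; i < sizeof(dirs)/sizeof(dirs[0]); i++)
    {
        if (dirs[i] == NULL) continue;   /* the cache starts out empty */

        char path[4096];
        snprintf(path, sizeof(path), "%s/%s", dirs[i], filename);
        fp = fopen(path, "r");
        if (fp != NULL)
        {
            /* Remember the directory for subsequent lookups. */
            if (dir_found == NULL)
            {
                dir_found = malloc(strlen(dirs[i]) + 1);
                if (dir_found != NULL) strcpy(dir_found, dirs[i]);
            }
            break;
        }
    }
    return fp;
}

int main(void)
{
    FILE *fp = search_open("example.txt");   /* hypothetical file name */
    printf(fp ? "found\n" : "not found\n");
    if (fp) fclose(fp);
    free(dir_found);
    return 0;
}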
Example #7
/**
 * This fills the linkage array with morphologically-acceptable
 * linkages.
 */
static void process_linkages(Sentence sent, extractor_t* pex,
                             bool overflowed, Parse_Options opts)
{
	if (0 == sent->num_linkages_found) return;
	if (0 == sent->num_linkages_alloced) return; /* Avoid a later crash. */

	/* Pick random linkages if we get more than what was asked for. */
	bool pick_randomly = overflowed ||
	    (sent->num_linkages_found > (int) sent->num_linkages_alloced);

	sent->num_valid_linkages = 0;
	size_t N_invalid_morphism = 0;

	int itry = 0;
	size_t in = 0;
	int maxtries;

	/* In the case of overflow, which will happen for some long
	 * sentences, but is particularly common for the amy/ady random
	 * splitters, we want to find as many morpho-acceptable linkages
	 * as possible, but keep the CPU usage down, as these might be
	 * very rare. This is due to a bug/feature in the interaction
	 * between the word-graph and the parser: valid morph linkages
	 * can be one-in-a-thousand.. or worse.  Search for them, but
	 * don't over-do it.
	 * Note: This problem has recently been alleviated by an
	 * alternatives-compatibility check in the fast matcher - see
	 * alt_connection_possible().
	 */
#define MAX_TRIES 250000

	if (pick_randomly)
	{
		/* Try picking many more linkages, but not more than possible. */
		maxtries = MIN((int) sent->num_linkages_alloced + MAX_TRIES,
		               sent->num_linkages_found);
	}
	else
	{
		maxtries = sent->num_linkages_alloced;
	}

	bool need_init = true;
	for (itry=0; itry<maxtries; itry++)
	{
		Linkage lkg = &sent->lnkages[in];
		Linkage_info * lifo = &lkg->lifo;

		/* Negative values tell extract-links to pick randomly; for
		 * reproducible-rand, the actual value is the rand seed. */
		lifo->index = pick_randomly ? -(itry+1) : itry;

		if (need_init)
		{
			partial_init_linkage(sent, lkg, sent->length);
			need_init = false;
		}
		extract_links(pex, lkg);
		compute_link_names(lkg, sent->string_set);

		if (verbosity_level(+D_PL))
		{
			err_msg(lg_Debug, "chosen_disjuncts before:\n\\");
			print_chosen_disjuncts_words(lkg, /*prt_opt*/true);
		}

		if (sane_linkage_morphism(sent, lkg, opts))
		{
			remove_empty_words(lkg);

			if (verbosity_level(+D_PL))
			{
				err_msg(lg_Debug, "chosen_disjuncts after:\n\\");
				print_chosen_disjuncts_words(lkg, /*prt_opt*/false);
			}

			need_init = true;
			in++;
			if (in >= sent->num_linkages_alloced) break;
		}
		else
		{
			N_invalid_morphism++;
			lkg->num_links = 0;
			lkg->num_words = sent->length;
			// memset(lkg->link_array, 0, lkg->lasz * sizeof(Link));
			memset(lkg->chosen_disjuncts, 0, sent->length * sizeof(Disjunct *));
		}
	}

	/* The last one was alloced, but never actually used. Free it. */
	if (!need_init) free_linkage(&sent->lnkages[in]);

	sent->num_valid_linkages = in;

	/* The remainder of the array is garbage; we never filled it in.
	 * So just pretend that it's shorter than it is */
	sent->num_linkages_alloced = sent->num_valid_linkages;

	lgdebug(D_PARSE, "Info: sane_morphism(): %zu of %d linkages had "
	        "invalid morphology construction\n", N_invalid_morphism,
	        itry + (itry != maxtries));
}
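The loop above keeps rebuilding into the same linkage slot until a candidate passes sane_linkage_morphism(), and only then advances to the next slot. A standalone sketch of that slot-reuse pattern with a toy validity test (even numbers count as "valid"), not the parser's actual data structures:

#include <stdio.h>

#define N_SLOTS 4

int main(void)
{
    int slots[N_SLOTS];
    int in = 0;                      /* next free output slot */
    int candidates[] = {1, 3, 4, 7, 8, 9, 10, 12};
    int n_cand = sizeof(candidates)/sizeof(candidates[0]);

    for (int itry = 0; itry < n_cand; itry++)
    {
        slots[in] = candidates[itry];   /* build into the current slot */

        if (slots[in] % 2 == 0)         /* "sane" candidate: keep it */
        {
            in++;
            if (in >= N_SLOTS) break;   /* output array is full */
        }
        /* otherwise the slot is simply overwritten by the next candidate */
    }

    printf("kept %d of %d candidates\n", in, n_cand);
    return 0;
}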
Example #8
void compute_chosen_words(Sentence sent, Linkage linkage, Parse_Options opts)
{
	WordIdx i;   /* index of chosen_words */
	WordIdx j;
	Disjunct **cdjp = linkage->chosen_disjuncts;
	const char **chosen_words = alloca(linkage->num_words * sizeof(*chosen_words));
	int *remap = alloca(linkage->num_words * sizeof(*remap));
	bool *show_word = alloca(linkage->num_words * sizeof(*show_word));
	bool display_morphology = opts->display_morphology;

	Gword **lwg_path = linkage->wg_path;
	Gword **n_lwg_path = NULL; /* new Wordgraph path, to match chosen_words */

	Gword **nullblock_start = NULL; /* start of a null block, to be put in [] */
	size_t nbsize = 0;              /* number of words in a null block */
	Gword *sentence_word;

	memset(show_word, 0, linkage->num_words * sizeof(*show_word));

	if (verbosity_level(D_CCW))
		print_lwg_path(lwg_path, "Linkage");

	for (i = 0; i < linkage->num_words; i++)
	{
		Disjunct *cdj = cdjp[i];
		Gword *w;              /* current word */
		const Gword *nw;       /* next word (NULL if none) */
		Gword **wgp;           /* wordgraph_path traversing pointer */

		const char *t = NULL;  /* current word string */
		bool at_nullblock_end; /* current word is at end of a nullblock */
		bool join_alt = false; /* morpheme-join this alternative */
		char *s;
		size_t l;
		size_t m;

		lgdebug(D_CCW, "Loop start, word%zu: cdj %s, path %s\n",
		        i, cdj ? cdj->word_string : "NULL",
		        lwg_path[i] ? lwg_path[i]->subword : "NULL");

		w = lwg_path[i];
		nw = lwg_path[i+1];
		wgp = &lwg_path[i];
		sentence_word = wg_get_sentence_word(sent, w);

		/* FIXME If the original word was capitalized in a capitalizable
		 * position, the displayed null word may be its downcase version. */

		if (NULL == cdj) /* a null word (the chosen disjunct was NULL) */
		{
			chosen_words[i] = NULL;
			nbsize++;
			if (NULL == nullblock_start) /* it starts a new null block */
				nullblock_start = wgp;

			at_nullblock_end = (NULL == nw) ||
				(wg_get_sentence_word(sent, nw->unsplit_word) != sentence_word);

			/* Accumulate null words in this alternative */
			if (!at_nullblock_end && (NULL == cdjp[i+1]) &&
			    ((w->morpheme_type == MT_PUNC) == (nw->morpheme_type == MT_PUNC)))
			{
				lgdebug(D_CCW, "Skipping word%zu cdjp=NULL#%zu, path %s\n",
				        i, nbsize, w->subword);
				chosen_words[i] = NULL;
				continue;
			}

			if (NULL != nullblock_start)
			{
				/* If we are here, this null word is an end of a null block */
				lgdebug(+D_CCW, "Handling %zu null words at %zu: ", nbsize, i);

				if (1 == nbsize)
				{
					/* Case 1: A single null subword. */
					lgdebug(D_CCW, "A single null subword.\n");
					t = join_null_word(sent, wgp, nbsize);

					gwordlist_append(&n_lwg_path, w);
				}
				else
				{
					lgdebug(D_CCW, "Combining null subwords");
					/* Use alternative_id to check for start of alternative. */
					if (((*nullblock_start)->alternative_id == *nullblock_start)
					    && at_nullblock_end)
					{
						/* Case 2: A null unsplit_word (all-nulls alternative).*/
						lgdebug(D_CCW, " (null alternative)\n");
						t = sentence_word->subword;

						gwordlist_append(&n_lwg_path, sentence_word);
					}
					else
					{
						/* Case 3: Join together >=2 null morphemes. */
						Gword *wgnull;

						lgdebug(D_CCW, " (null partial word)\n");
						wgnull = wordgraph_null_join(sent, wgp-nbsize+1, wgp);
						gwordlist_append(&n_lwg_path, wgnull);
						t = wgnull->subword;
					}
				}

				nullblock_start = NULL;
				nbsize = 0;
				show_word[i] = true;

				if (MT_WALL != w->morpheme_type)
				{
					/* Put brackets around the null word. */
					l = strlen(t) + 2;
					s = (char *) alloca(l+1);
					s[0] = NULLWORD_START;
					strcpy(&s[1], t);
					s[l-1] = NULLWORD_END;
					s[l] = '\0';
					t = string_set_add(s, sent->string_set);
					lgdebug(D_CCW, " %s\n", t);
					/* Null words have no links, so take care not to drop them. */
				}
			}
		}
		else
		{
			/* This word has a linkage. */

			/* TODO: Suppress "virtual-morphemes", currently the dictcap ones. */
			char *sm;

			t = cdj->word_string;
			/* Print the subscript, as in "dog.n" as opposed to "dog". */

			if (0)
			{
				/* TODO */
			}
			else
			{
				/* Get rid of those ugly ".Ixx" */
				if (is_idiom_word(t))
				{
					s = strdupa(t);
					sm = strrchr(s, SUBSCRIPT_MARK); /* Possible double subscript. */
					UNREACHABLE(NULL == sm); /* We know it has a subscript. */
					*sm = '\0';
					t = string_set_add(s, sent->string_set);
				}
				else if (HIDE_MORPHO)
				{
					/* Concatenate the word morphemes together into one word.
					 * Concatenate their subscripts into one subscript.
					 * Use subscript separator SUBSCRIPT_SEP.
					 * XXX Check whether we can encounter an idiom word here.
					 * FIXME Combining contracted words is not handled yet, because
					 * combining morphemes which have non-LL links to other words is
					 * not yet implemented.
					 * FIXME Move to a separate function. */
					Gword **wgaltp;
					size_t join_len = 0;
					size_t mcnt = 0;

					/* If the alternative contains morpheme subwords, mark it
					 * for joining... */

					const Gword *unsplit_word = w->unsplit_word;
					for (wgaltp = wgp, j = i; NULL != *wgaltp; wgaltp++, j++)
					{

						if ((*wgaltp)->unsplit_word != unsplit_word) break;
						if (MT_INFRASTRUCTURE ==
						    (*wgaltp)->unsplit_word->morpheme_type) break;

						mcnt++;

						if (NULL == cdjp[j])
						{
							/* ... but not if it contains a null word */
							join_alt = false;
							break;
						}
						join_len += strlen(cdjp[j]->word_string) + 1;
						if ((*wgaltp)->morpheme_type & IS_REG_MORPHEME)
							join_alt = true;
					}

					if (join_alt)
					{
						/* Join it in two steps: 1. Base words. 2. Subscripts.
						 * FIXME? Can be done in one step (more efficient but maybe
						 * less clear).
						 * Put SUBSCRIPT_SEP between the subscripts.
						 * XXX No 1-1 correspondence between the hidden base words
						 * and the subscripts after the join, in case there are base
						 * words with and without subscripts. */

						const char subscript_sep_str[] = { SUBSCRIPT_SEP, '\0'};
						char *join = calloc(join_len + 1, 1); /* zeroed out */

						join[0] = '\0';

						/* 1. Join base words. (Could just use the unsplit_word.) */
						for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
						{
							add_morpheme_unmarked(sent, join, cdjp[i+m]->word_string,
							                      (*wgaltp)->morpheme_type);
						}

						strcat(join, subscript_mark_str()); /* tentative */

						/* 2. Join subscripts. */
						for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
						{
							/* Cannot NULLify the word - we may have links to it. */
							if (m != mcnt-1) chosen_words[i+m] = "";

							sm =  strchr(cdjp[i+m]->word_string, SUBSCRIPT_MARK);

							if (NULL != sm)
							{
								/* Supposing stem subscript is .=x (x optional) */
								if (MT_STEM == (*wgaltp)->morpheme_type)
								{
									sm += 1 + STEM_MARK_L; /* sm+strlen(".=") */
									if ('\0' == *sm) sm = NULL;
#if 0
									if ((cnt-1) == m)
									{
										/* Support a prefix-stem combination. In that case
										 * we have just nullified the combined word, so we
										 * need to move it to the position of the prefix.
										 * FIXME: May still not be good enough. */
										move_combined_word = i+m-1;

										/* And the later chosen_word assignment should be:
										 * chosen_words[-1 != move_combined_word ?
										 *    move_combined_word : i] = t;
										 */
									}
									else
									{
										move_combined_word = -1;
									}
#endif
								}
							}
							if (NULL != sm)
							{
								strcat(join, sm+1);
								strcat(join, subscript_sep_str);
							}
						}

						/* Remove an extra mark, if any */
						join_len = strlen(join);
						if ((SUBSCRIPT_SEP == join[join_len-1]) ||
							 (SUBSCRIPT_MARK == join[join_len-1]))
							join[join_len-1] = '\0';

						gwordlist_append(&n_lwg_path, sentence_word);
						t = string_set_add(join, sent->string_set);
						free(join);

						i += mcnt-1;
					}
				}
			}

			if (!join_alt) gwordlist_append(&n_lwg_path, *wgp);

			/*
			 * Add guess marks in [] square brackets, if needed, at the
			 * end of the base word. Convert the badly-printing
			 * SUBSCRIPT_MARK (hex 03 or ^C) into a period.
			 */
			if (t)
			{

				s = strdupa(t);
				sm = strrchr(s, SUBSCRIPT_MARK);
				if (sm) *sm = SUBSCRIPT_DOT;

				if ((!(w->status & WS_GUESS) && (w->status & WS_INDICT))
				    || !DISPLAY_GUESS_MARKS)
				{
					t = string_set_add(s, sent->string_set);
				}
				else
				{
					const char *regex_name = w->regex_name;
					/* 4 = 1(null) + 1(guess_mark) + 2 (sizeof "[]") */
					int baselen = NULL == sm ? strlen(t) : (size_t)(sm-s);
					char guess_mark = 0;

					switch (w->status & WS_GUESS)
					{
						case WS_SPELL:
							guess_mark = GM_SPELL;
							break;
						case WS_RUNON:
							guess_mark = GM_RUNON;
							break;
						case WS_REGEX:
							guess_mark = GM_REGEX;
							break;
						case 0:
							guess_mark = GM_UNKNOWN;
							break;
						default:
							assert(0, "Missing 'case: %2x'", w->status & WS_GUESS);
					}

					/* In the case of display_morphology==0, the guess indication of
					 * the last subword is used as the guess indication of the whole
					 * word.
					 * FIXME? The guess indications of other subwords are ignored in
					 * this mode. This implies that if a first or middle subword has
					 * a guess indication but the last subword doesn't have, no guess
					 * indication would be shown at all. */

					if ((NULL == regex_name) || HIDE_MORPHO) regex_name = "";
					s = alloca(strlen(t) + strlen(regex_name) + 4);
					strncpy(s, t, baselen);
					s[baselen] = '[';
					s[baselen + 1] = guess_mark;
					strcpy(s + baselen + 2, regex_name);
					strcat(s, "]");
					if (NULL != sm) strcat(s, sm);
					t = string_set_add(s, sent->string_set);
				}
			}
		}

		assert(t != NULL, "Word %zu: NULL", i);
		chosen_words[i] = t;
	}

	/* Conditional test removal of quotation marks and the "capdict" tokens,
	 * to facilitate using diff on sentence batch runs. */
	if (test_enabled("removeZZZ"))
	{
		for (i=0, j=0; i<linkage->num_links; i++)
		{
			Link *lnk = &(linkage->link_array[i]);

			if (0 == strcmp("ZZZ", lnk->link_name))
				chosen_words[lnk->rw] = NULL;
		}
	}

	/* If morphology printing is being suppressed, then all links
	 * connecting morphemes will be discarded. */
	if (HIDE_MORPHO)
	{
		/* Discard morphology links. */
		for (i=0; i<linkage->num_links; i++)
		{
			Link * lnk = &linkage->link_array[i];

			if (is_morphology_link(lnk->link_name))
			{
				/* Mark link for discarding. */
				lnk->link_name = NULL;
			}
			else
			{
				/* Mark word for not discarding. */
				show_word[lnk->rw] = true;
				show_word[lnk->lw] = true;
			}
		}
	}

	/* We alloc a little more than needed, but so what... */
	linkage->word = (const char **) exalloc(linkage->num_words*sizeof(char *));

	/* Copy over the chosen words, dropping the discarded words.
	 * However, don't discard existing words (chosen_words[i][0]).
	 * Note that if a word only has morphology links and is not combined with
	 * another word, then it will get displayed with no links at all (e.g.
	 * when explicitly specifying root and suffix for debug: root.= =suf). */
	for (i=0, j=0; i<linkage->num_words; ++i)
	{
		if (chosen_words[i] &&
		    (chosen_words[i][0] || (!HIDE_MORPHO || show_word[i])))
		{
			const char *cwtmp = linkage->word[j];
			linkage->word[j] = chosen_words[i];
			chosen_words[i] = cwtmp;
			remap[i] = j;
			j++;
		}
		else
		{
			remap[i] = -1;
		}
	}
	linkage->num_words = j;

	remap_linkages(linkage, remap); /* Update linkage->link_array / num_links. */

	linkage->wg_path_display = n_lwg_path;

	if (verbosity_level(D_CCW))
		print_lwg_path(n_lwg_path, "Display");
}
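compute_chosen_words() compacts the word array while recording an old-index-to-new-index map in remap[], which remap_linkages() then uses to fix up link endpoints. A standalone sketch of that compact-and-remap idiom on toy arrays, not the Linkage structure:

#include <stdio.h>

int main(void)
{
    const char *words[] = {"this", "", "is", "", "a", "test"};
    int n = 6;
    const char *kept[6];
    int remap[6];
    int j = 0;

    /* Compact: drop empty entries, remember where survivors moved to. */
    for (int i = 0; i < n; i++)
    {
        if (words[i][0] != '\0')
        {
            kept[j] = words[i];
            remap[i] = j;      /* old index i now lives at new index j */
            j++;
        }
        else
        {
            remap[i] = -1;     /* dropped entry */
        }
    }

    for (int k = 0; k < j; k++) printf("%s ", kept[k]);
    printf("\n");

    /* A "link" between old indices 0 and 2 becomes a link between
     * remap[0] and remap[2] in the compacted array. */
    printf("link %d-%d -> %d-%d\n", 0, 2, remap[0], remap[2]);
    printf("%d of %d words kept\n", j, n);
    return 0;
}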
Example #9
void WordTag::insert_connectors(Exp* exp, int& dfs_position,
                                bool& leading_right, bool& leading_left,
                                std::vector<int>& eps_right,
                                std::vector<int>& eps_left,
                                char* var, bool root, double parent_cost,
                                Exp* parent_exp, const X_node *word_xnode)
{
  double cost = parent_cost + exp->cost;

#ifdef DEBUG
  if (0 && verbosity_level(+D_IC)) { // Extreme debug
    printf("Expression type %d for Word%d, var %s:\n", exp->type, _word, var);
    printf("parent_exp: "); print_expression(parent_exp);
    printf("exp: "); print_expression(exp);
  }
#endif

  if (exp->type == CONNECTOR_type) {
    dfs_position++;

    Connector connector;
    connector.multi = exp->multi;
    connector.desc = exp->u.condesc;
    set_connector_length_limit(&connector, _opts);

    switch (exp->dir) {
    case '+':
      _position.push_back(_right_connectors.size());
      _dir.push_back('+');
      _right_connectors.push_back(
           PositionConnector(parent_exp, &connector, '+', _word, dfs_position,
                             exp->cost, cost, leading_right, false,
                             eps_right, eps_left, word_xnode));
      leading_right = false;
      break;
    case '-':
      _position.push_back(_left_connectors.size());
      _dir.push_back('-');
      _left_connectors.push_back(
           PositionConnector(parent_exp, &connector, '-', _word, dfs_position,
                             exp->cost, cost, false, leading_left,
                             eps_right, eps_left, word_xnode));
      leading_left = false;
      break;
    default:
      throw std::string("Unknown connector direction: ") + exp->dir;
    }
  } else if (exp->type == AND_type) {
    if (exp->u.l == NULL) {
      /* zeroary and */
    } else
      if (exp->u.l != NULL && exp->u.l->next == NULL) {
        /* unary and - skip */
        insert_connectors(exp->u.l->e, dfs_position, leading_right,
             leading_left, eps_right, eps_left, var, root, cost, parent_exp, word_xnode);
      } else {
        int i;
        E_list* l;

        char new_var[MAX_VARIABLE_NAME];
        char* last_new_var = new_var;
        char* last_var = var;
        while ((*last_new_var = *last_var)) {
          last_new_var++;
          last_var++;
        }

        for (i = 0, l = exp->u.l; l != NULL; l = l->next, i++) {
          char* s = last_new_var;
          *s++ = 'c';
          fast_sprintf(s, i);

          insert_connectors(l->e, dfs_position, leading_right, leading_left,
                eps_right, eps_left, new_var, false, cost, parent_exp, word_xnode);

#ifdef POWER_PRUNE_CONNECTORS
          if (leading_right) {
            eps_right.push_back(_variables->epsilon(new_var, '+'));
          }
          if (leading_left) {
            eps_left.push_back(_variables->epsilon(new_var, '-'));
          }
#endif
        }
      }
  } else if (exp->type == OR_type) {
    if (exp->u.l != NULL && exp->u.l->next == NULL) {
      /* unary or - skip */
      insert_connectors(exp->u.l->e, dfs_position, leading_right, leading_left,
          eps_right, eps_left, var, root, cost, exp->u.l->e, word_xnode);
    } else {
      int i;
      E_list* l;
      bool ll_true = false;
      bool lr_true = false;

      char new_var[MAX_VARIABLE_NAME];
      char* last_new_var = new_var;
      char* last_var = var;
      while ((*last_new_var = *last_var)) {
        last_new_var++;
        last_var++;
      }

#ifdef DEBUG
      if (0 && verbosity_level(+D_IC)) { // Extreme debug
        printf("Word%d, var %s OR_type:\n", _word, var);
        printf("exp mem: "); prt_exp_mem(exp, 0);
      }
#endif

      for (i = 0, l = exp->u.l; l != NULL; l = l->next, i++) {
        bool lr = leading_right, ll = leading_left;
        std::vector<int> er = eps_right, el = eps_left;

        char* s = last_new_var;
        *s++ = 'd';
        fast_sprintf(s, i);

        lgdebug(+D_IC, "Word%d: var: %s; exp%d=%p; X_node: %s\n",
                _word, var, i, l, word_xnode ? word_xnode->word->subword : "NULL X_node");
        assert(word_xnode != NULL, "NULL X_node for var %s", new_var);
        if (root && parent_exp == NULL && l->e != word_xnode->exp) {
          E_list *we = NULL;

          if (word_xnode->exp->type == OR_type) {
            for (we = word_xnode->exp->u.l; we != NULL; we = we->next) {
              if (l->e == we->e)
                break;
            }
          }
          if (we == NULL && word_xnode->next != NULL) {
            lgdebug(+D_IC, "Next word_xnode for word %d is needed\n", _word);
            word_xnode = word_xnode->next;
          }
        }
        insert_connectors(l->e, dfs_position, lr, ll, er, el, new_var, false, cost, l->e, word_xnode);

        if (lr)
          lr_true = true;
        if (ll)
          ll_true = true;
      }
      leading_right = lr_true;
      leading_left = ll_true;
    }
  }
}
Example #10
/** The return value is the number of disjuncts deleted.
 *  Implementation notes:
 *  Normally, all identical disjunct-jets are memory-shared.
 *  The suffix_id of each connector serves as its reference count
 *  in the power table. Each time a connector that cannot match
 *  is discovered, its reference count is decreased and its
 *  nearest_word field is assigned BAD_WORD. Due to the memory sharing,
 *  each such assignment immediately affects all the identical
 *  disjunct-jets.
 */
static int power_prune(Sentence sent, Parse_Options opts)
{
	power_table pt;
	prune_context pc;
	int N_deleted[2] = {0}; /* [0] counts first deletions, [1] counts dups. */
	int total_deleted = 0;

	power_table_alloc(sent, &pt);
	power_table_init(sent, &pt);

	pc.pt = &pt;
	pc.power_cost = 0;
	pc.null_links = (opts->min_null_count > 0);
	pc.N_changed = 1;  /* forces it always to make at least two passes */
	pc.sent = sent;

	while (1)
	{
		/* left-to-right pass */
		for (WordIdx w = 0; w < sent->length; w++)
		{
			for (Disjunct **dd = &sent->word[w].d; *dd != NULL; /* See: NEXT */)
			{
				Disjunct *d = *dd; /* just for convenience */
				if (d->left == NULL)
				{
					dd = &d->next;  /* NEXT */
					continue;
				}

				bool is_bad = d->left->nearest_word == BAD_WORD;

				if (is_bad || left_connector_list_update(&pc, d->left, w, true) < 0)
				{
					mark_connector_sequence_for_dequeue(d->left, true);
					mark_connector_sequence_for_dequeue(d->right, false);

					/* discard the current disjunct */
					*dd = d->next; /* NEXT - set current disjunct to the next one */
					N_deleted[(int)is_bad]++;
					continue;
				}

				dd = &d->next; /* NEXT */
			}

			clean_table(pt.r_table_size[w], pt.r_table[w]);
		}

		total_deleted += N_deleted[0] + N_deleted[1];
		lgdebug(D_PRUNE, "Debug: l->r pass changed %d and deleted %d (%d+%d)\n",
		        pc.N_changed, N_deleted[0]+N_deleted[1], N_deleted[0], N_deleted[1]);

		if (pc.N_changed == 0 && N_deleted[0] == 0 && N_deleted[1] == 0) break;
		pc.N_changed = N_deleted[0] = N_deleted[1] = 0;

		/* right-to-left pass */
		for (WordIdx w = sent->length-1; w != (WordIdx) -1; w--)
		{
			for (Disjunct **dd = &sent->word[w].d; *dd != NULL; /* See: NEXT */)
			{
				Disjunct *d = *dd; /* just for convenience */
				if (d->right == NULL)
				{
					dd = &d->next;  /* NEXT */
					continue;
				}

				bool is_bad = d->right->nearest_word == BAD_WORD;

				if (is_bad || right_connector_list_update(&pc, d->right, w, true) >= sent->length)
				{
					mark_connector_sequence_for_dequeue(d->right, true);
					mark_connector_sequence_for_dequeue(d->left, false);

					/* Discard the current disjunct. */
					*dd = d->next; /* NEXT - set current disjunct to the next one */
					N_deleted[(int)is_bad]++;
					continue;
				}

				dd = &d->next; /* NEXT */
			}

			clean_table(pt.l_table_size[w], pt.l_table[w]);
		}

		total_deleted += N_deleted[0] + N_deleted[1];
		lgdebug(D_PRUNE, "Debug: r->l pass changed %d and deleted %d (%d+%d)\n",
		        pc.N_changed, N_deleted[0]+N_deleted[1], N_deleted[0], N_deleted[1]);

		if (pc.N_changed == 0 && N_deleted[0] == 0 && N_deleted[1] == 0) break;
		pc.N_changed = N_deleted[0] = N_deleted[1] = 0;
	}
	power_table_delete(&pt);

	lgdebug(D_PRUNE, "Debug: power prune cost: %d\n", pc.power_cost);

	print_time(opts, "power pruned");
	if (verbosity_level(D_PRUNE))
	{
		prt_error("\n\\");
		prt_error("Debug: After power_pruning:\n\\");
		print_disjunct_counts(sent);
	}

#ifdef DEBUG
	for (WordIdx w = 0; w < sent->length; w++)
	{
		for (Disjunct *d = sent->word[w].d; NULL != d; d = d->next)
		{
			for (Connector *c = d->left; NULL != c; c = c->next)
				assert(c->nearest_word != BAD_WORD);
			for (Connector *c = d->right; NULL != c; c = c->next)
				assert(c->nearest_word != BAD_WORD);
		}
	}
#endif

	return total_deleted;
}
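power_prune() deletes disjuncts during traversal by holding a pointer to the link that points at the current node (Disjunct **dd), so unlinking works the same for the head and for interior nodes. A standalone sketch of that pointer-to-pointer idiom on a toy singly linked list:

#include <stdio.h>
#include <stdlib.h>

typedef struct node { int value; struct node *next; } node;

/* Remove every node whose value is odd, in a single pass. */
static void remove_odd(node **head)
{
    for (node **pp = head; *pp != NULL; /* advance inside the loop */)
    {
        node *n = *pp;
        if (n->value % 2 != 0)
        {
            *pp = n->next;   /* unlink: no special case for the head */
            free(n);
            continue;
        }
        pp = &n->next;       /* keep this node, step to its next pointer */
    }
}

int main(void)
{
    node *head = NULL, **tail = &head;
    for (int i = 1; i <= 6; i++)
    {
        node *n = malloc(sizeof *n);
        n->value = i;
        n->next = NULL;
        *tail = n;
        tail = &n->next;
    }

    remove_odd(&head);
    for (node *n = head; n != NULL; n = n->next) printf("%d ", n->value);
    printf("\n");   /* prints: 2 4 6 */
    return 0;
}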
Example #11
static int pp_prune(Sentence sent, Parse_Options opts)
{
	pp_knowledge * knowledge;
	size_t i, w;
	int total_deleted, N_deleted;
	bool change, deleteme;
	multiset_table *cmt;

	if (sent->postprocessor == NULL) return 0;
	if (!opts->perform_pp_prune) return 0;

	knowledge = sent->postprocessor->knowledge;

	cmt = cms_table_new();

	for (w = 0; w < sent->length; w++)
	{
		Disjunct *d;
		for (d = sent->word[w].d; d != NULL; d = d->next)
		{
			char dir;
			d->marked = true;
			for (dir=0; dir < 2; dir++)
			{
				Connector *c;
				for (c = ((dir) ? (d->left) : (d->right)); c != NULL; c = c->next)
				{
					insert_in_cms_table(cmt, connector_string(c));
				}
			}
		}
	}

	total_deleted = 0;
	change = true;
	while (change)
	{
		char dir;

		change = false;
		N_deleted = 0;
		for (w = 0; w < sent->length; w++)
		{
			Disjunct *d;
			for (d = sent->word[w].d; d != NULL; d = d->next)
			{
				if (!d->marked) continue;
				deleteme = false;
				for (i = 0; i < knowledge->n_contains_one_rules; i++)
				{
					pp_rule* rule = &knowledge->contains_one_rules[i]; /* the ith rule */
					const char * selector = rule->selector;  /* selector string for this rule */
					pp_linkset * link_set = rule->link_set;  /* the set of criterion links */

					if (rule->selector_has_wildcard) continue;  /* If it has a * forget it */

					for (dir = 0; dir < 2; dir++)
					{
						Connector *c;
						for (c = ((dir) ? (d->left) : (d->right)); c != NULL; c = c->next)
						{

							if (!post_process_match(selector, connector_string(c))) continue;

							/*
							printf("pp_prune: trigger ok.  selector = %s  c->string = %s\n", selector, c->string);
							*/

							/* We know c matches the trigger link of the rule. */
							/* Now check the criterion links */

							if (!rule_satisfiable(cmt, link_set))
							{
								deleteme = true;
								rule->use_count++;
							}
							if (deleteme) break;
						}
						if (deleteme) break;
					}
					if (deleteme) break;
				}

				if (deleteme)         /* now we delete this disjunct */
				{
					N_deleted++;
					total_deleted++;
					d->marked = false; /* mark for deletion later */
					for (dir=0; dir < 2; dir++)
					{
						Connector *c;
						for (c = ((dir) ? (d->left) : (d->right)); c != NULL; c = c->next)
						{
							change |= delete_from_cms_table(cmt, connector_string(c));
						}
					}
				}
			}
		}

		lgdebug(D_PRUNE, "Debug: pp_prune pass deleted %d\n", N_deleted);
	}
	cms_table_delete(cmt);

	if (total_deleted > 0)
	{
		delete_unmarked_disjuncts(sent);
		if (verbosity_level(D_PRUNE))
		{
			prt_error("\n\\");
			prt_error("Debug: After pp_prune:\n\\");
			print_disjunct_counts(sent);
		}
	}

	print_time(opts, "pp pruning");

	return total_deleted;
}
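pp_prune() never frees disjuncts inside the rule loop; it only clears d->marked and leaves the actual removal to delete_unmarked_disjuncts() once the fixed point is reached. A standalone sketch of that mark-then-sweep split on a toy list, with invented names and a toy marking rule:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

typedef struct item { int value; bool marked; struct item *next; } item;

/* Phase 1: decide what to keep; just flip the mark. */
static void mark_items(item *list)
{
    for (item *it = list; it != NULL; it = it->next)
        it->marked = (it->value % 3 != 0);   /* toy rule: drop multiples of 3 */
}

/* Phase 2: sweep unmarked items out of the list in one pass. */
static void sweep_unmarked(item **head)
{
    for (item **pp = head; *pp != NULL; )
    {
        item *it = *pp;
        if (!it->marked) { *pp = it->next; free(it); }
        else pp = &it->next;
    }
}

int main(void)
{
    item *head = NULL;
    for (int i = 6; i >= 1; i--)
    {
        item *it = malloc(sizeof *it);
        it->value = i; it->marked = true; it->next = head;
        head = it;
    }

    mark_items(head);
    sweep_unmarked(&head);
    for (item *it = head; it != NULL; it = it->next) printf("%d ", it->value);
    printf("\n");   /* prints: 1 2 4 5 */
    return 0;
}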