Example #1
/**
 * This uses link_array.  It post-processes
 * this linkage and fills in the Linkage_info that it returns.  There
 * are no fat links in it.
 */
Linkage_info analyze_thin_linkage(Sentence sent, Parse_Options opts, int analyze_pass)
{
	int i;
	Linkage_info li;
	PP_node * pp;
	Postprocessor * postprocessor;
	Sublinkage *sublinkage;
	Parse_info pi = sent->parse_info;

	build_digraph(pi, word_links);
	memset(&li, 0, sizeof(li));

	sublinkage = x_create_sublinkage(pi);
	postprocessor = sent->dict->postprocessor;

	compute_link_names(sent);
	for (i=0; i<pi->N_links; i++) {
		copy_full_link(&(sublinkage->link[i]), &(pi->link_array[i]));
	}

	if (analyze_pass==PP_FIRST_PASS) {
		post_process_scan_linkage(postprocessor, opts, sent, sublinkage);
		free_sublinkage(sublinkage);
		free_digraph(pi, word_links);
		return li;
	}

	/* The code below can be used to generate the "islands" array.  For this
	 * to work, however, you have to call "build_digraph" first (as in
	 * analyze_fat_linkage) and then "free_digraph".  For some reason this
	 * causes a space leak. */

	pp = post_process(postprocessor, opts, sent, sublinkage, TRUE);

	li.N_violations = 0;
	li.and_cost = 0;
	li.unused_word_cost = unused_word_cost(sent->parse_info);
	li.improper_fat_linkage = FALSE;
	li.inconsistent_domains = FALSE;
	li.disjunct_cost = disjunct_cost(pi);
	li.null_cost = null_cost(pi);
	li.link_cost = link_cost(pi);
	li.andlist = NULL;

	if (pp==NULL) {
		if (postprocessor != NULL) li.N_violations = 1;
	} else if (pp->violation!=NULL) {
		li.N_violations++;
	}

	free_sublinkage(sublinkage);
	free_digraph(pi, word_links);
	return li;
}
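The two-pass protocol above is easiest to see from the caller's side. Below is a minimal sketch of a hypothetical caller (count_pp_violations is not a library function), assuming the internal declarations of analyze_thin_linkage(), Linkage_info, PP_FIRST_PASS and PP_SECOND_PASS are already in scope: the first pass only lets the post-processor scan the linkage so it can prune its rule set, and the later pass returns the actual violation count and costs.

/* Hypothetical caller sketch; assumes the internal headers declaring
 * analyze_thin_linkage(), Linkage_info, PP_FIRST_PASS and PP_SECOND_PASS
 * have been included. */
static int count_pp_violations(Sentence sent, Parse_Options opts)
{
	Linkage_info li;

	/* Pass 1: only feeds the linkage to the post-processor so the
	 * relevant rule set gets pruned; the returned info is discarded. */
	(void) analyze_thin_linkage(sent, opts, PP_FIRST_PASS);

	/* Pass 2: full post-processing; the Linkage_info carries the results. */
	li = analyze_thin_linkage(sent, opts, PP_SECOND_PASS);
	return li.N_violations;
}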
Example #2
void extract_thin_linkage(Sentence sent, Parse_Options opts, Linkage linkage)
{
	int i;
	Sublinkage *sublinkage;
	Parse_info pi = sent->parse_info;

	sublinkage = x_create_sublinkage(pi);
	compute_link_names(sent);
	for (i=0; i<pi->N_links; i++) {
		copy_full_link(&sublinkage->link[i], &(pi->link_array[i]));
	}

	linkage->num_sublinkages = 1;
	linkage->sublinkage = ex_create_sublinkage(pi);

	for (i=0; i<pi->N_links; ++i) {
		linkage->sublinkage->link[i] = excopy_link(sublinkage->link[i]);
	}

	free_sublinkage(sublinkage);
}
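extract_thin_linkage() builds a scratch sublinkage and then deep-copies every link with excopy_link() before the scratch copy is freed. The standalone fragment below (MyLink and my_copy_link are hypothetical, not library types) illustrates why the per-link deep copy matters: each copy owns its own label string, so freeing the scratch structure cannot leave the Linkage holding dangling pointers.

#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for the library's Link record. */
typedef struct { char *name; int l, r; } MyLink;

/* Deep copy: the new record owns a private copy of the label, so the
 * source record and its string can be freed independently. */
static MyLink *my_copy_link(const MyLink *src)
{
	MyLink *dst = (MyLink *) malloc(sizeof(MyLink));
	dst->name = strdup(src->name);
	dst->l = src->l;
	dst->r = src->r;
	return dst;
}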
Example #3
/** The extract_links() call sets the chosen_disjuncts array */
static void compute_chosen_disjuncts(Sentence sent)
{
	size_t in;
	size_t N_linkages_alloced = sent->num_linkages_alloced;
	Parse_info pi = sent->parse_info;

	for (in=0; in < N_linkages_alloced; in++)
	{
		Linkage lkg = &sent->lnkages[in];
		Linkage_info *lifo = &lkg->lifo;

		if (lifo->discarded || lifo->N_violations) continue;

		partial_init_linkage(lkg, pi->N_words);
		extract_links(lkg, pi);
		compute_link_names(lkg, sent->string_set);
		/* Because the empty words are used only in the parsing stage, they
		 * are removed here, along with their links, so that they need not
		 * be considered from now on. */
		remove_empty_words(lkg);
	}
}
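For reference, this is where the per-linkage work above becomes visible to a client of the library. The sketch below uses only the public link-grammar API from link-includes.h (a 5.x-era API is assumed): the labels printed by linkage_get_link_label() are exactly the names computed by compute_link_names() above.

#include <stdio.h>
#include "link-grammar/link-includes.h"

int main(void)
{
	Dictionary dict = dictionary_create_lang("en");
	Parse_Options opts = parse_options_create();
	Sentence sent = sentence_create("The dog chased the cat", dict);

	if (sentence_parse(sent, opts) > 0)
	{
		Linkage lkg = linkage_create(0, sent, opts);
		int i, nlinks = (int) linkage_get_num_links(lkg);
		for (i = 0; i < nlinks; i++)
			printf("link %d: %s\n", i, linkage_get_link_label(lkg, i));
		linkage_delete(lkg);
	}

	sentence_delete(sent);
	parse_options_delete(opts);
	dictionary_delete(dict);
	return 0;
}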
Example #4
/**
 * This procedure mimics analyze_fat_linkage in order to
 * extract the sublinkages and copy them to the Linkage
 * data structure passed in.
 */
void extract_fat_linkage(Sentence sent, Parse_Options opts, Linkage linkage)
{
	int i, j, N_thin_links;
	DIS_node *d_root;
	int num_sublinkages;
	Sublinkage * sublinkage;
	Parse_info pi = sent->parse_info;

	sublinkage = x_create_sublinkage(pi);
	build_digraph(pi, word_links);
	structure_violation = FALSE;
	d_root = build_DIS_CON_tree(pi, word_links);

	if (structure_violation) {
		compute_link_names(sent);
		for (i=0; i<pi->N_links; i++) {
			copy_full_link(&sublinkage->link[i], &(pi->link_array[i]));
		}

		linkage->num_sublinkages = 1;
		linkage->sublinkage = ex_create_sublinkage(pi);

		/* This will have fat links! */
		for (i=0; i<pi->N_links; ++i) {
			linkage->sublinkage->link[i] = excopy_link(sublinkage->link[i]);
		}

		free_sublinkage(sublinkage);
		free_digraph(pi, word_links);
		free_DIS_tree(d_root);
		return;
	}

	/* first get number of sublinkages and allocate space */
	num_sublinkages = 0;
	for (;;) {
		num_sublinkages++;
		if (!advance_DIS(d_root)) break;
	}

	linkage->num_sublinkages = num_sublinkages;
	linkage->sublinkage =
		(Sublinkage *) exalloc(sizeof(Sublinkage)*num_sublinkages);
	for (i=0; i<num_sublinkages; ++i) {
		linkage->sublinkage[i].link = NULL;
		linkage->sublinkage[i].pp_info = NULL;
		linkage->sublinkage[i].violation = NULL;
	}

	/* now fill out the sublinkage arrays */
	compute_link_names(sent);

	num_sublinkages = 0;
	for (;;) {
		for (i=0; i<pi->N_links; i++) {
			patch_array[i].used = patch_array[i].changed = FALSE;
			patch_array[i].newl = pi->link_array[i].l;
			patch_array[i].newr = pi->link_array[i].r;
			copy_full_link(&sublinkage->link[i], &(pi->link_array[i]));
		}
		fill_patch_array_DIS(d_root, NULL, word_links);

		for (i=0; i<pi->N_links; i++) {
			if (patch_array[i].changed || patch_array[i].used) {
				sublinkage->link[i]->l = patch_array[i].newl;
				sublinkage->link[i]->r = patch_array[i].newr;
			} else if ((dfs_root_word[pi->link_array[i].l] != -1) &&
					   (dfs_root_word[pi->link_array[i].r] != -1)) {
				sublinkage->link[i]->l = -1;
			}
		}

		compute_pp_link_array_connectors(sent, sublinkage);
		compute_pp_link_names(sent, sublinkage);

		/* Don't copy the fat links into the linkage */
		N_thin_links = 0;
		for (i=0; i<pi->N_links; ++i) {
			if (sublinkage->link[i]->l == -1) continue;
			N_thin_links++;
		}

		linkage->sublinkage[num_sublinkages].num_links = N_thin_links;
		linkage->sublinkage[num_sublinkages].link =
			(Link *) exalloc(sizeof(Link)*N_thin_links);
		linkage->sublinkage[num_sublinkages].pp_info = NULL;
		linkage->sublinkage[num_sublinkages].violation = NULL;

		for (i=0, j=0; i<pi->N_links; ++i) {
			if (sublinkage->link[i]->l == -1) continue;
			linkage->sublinkage[num_sublinkages].link[j++] =
				excopy_link(sublinkage->link[i]);
		}

		num_sublinkages++;
		if (!advance_DIS(d_root)) break;
	}

	free_sublinkage(sublinkage);
	free_digraph(pi, word_links);
	free_DIS_tree(d_root);
}
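The thin-link copy at the end of the loop above follows a count-then-copy pattern: one pass counts the links that survive the l == -1 fat-link filter, exactly that many slots are allocated, and a second pass copies the survivors. A standalone illustration, with Item as a hypothetical stand-in for the library's Link:

#include <stdlib.h>

typedef struct { int l, r; } Item;

/* Keep only the entries whose left end is not -1 (the fat-link marker
 * used above); returns a freshly allocated, exactly-sized array. */
static Item *keep_thin(const Item *src, int n, int *n_out)
{
	int i, j, n_thin = 0;

	for (i = 0; i < n; i++)
		if (src[i].l != -1) n_thin++;          /* first pass: count */

	Item *dst = (Item *) malloc(sizeof(Item) * n_thin);
	for (i = 0, j = 0; i < n; i++)
		if (src[i].l != -1) dst[j++] = src[i]; /* second pass: copy */

	*n_out = n_thin;
	return dst;
}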
Example #5
/**
 * This uses link_array.  It enumerates and post-processes
 * all the linkages represented by this one.  We know this contains
 * at least one fat link.
 */
Linkage_info analyze_fat_linkage(Sentence sent, Parse_Options opts, int analyze_pass)
{
	int i;
	Linkage_info li;
	DIS_node *d_root;
	PP_node *pp;
	Postprocessor *postprocessor;
	Sublinkage *sublinkage;
	Parse_info pi = sent->parse_info;
	PP_node accum;			   /* for domain ancestry check */
	D_type_list * dtl0, * dtl1;  /* for domain ancestry check */

	sublinkage = x_create_sublinkage(pi);
	postprocessor = sent->dict->postprocessor;
	build_digraph(pi, word_links);
	structure_violation = FALSE;
	d_root = build_DIS_CON_tree(pi, word_links); /* may set structure_violation to TRUE */

	li.N_violations = 0;
	li.improper_fat_linkage = structure_violation;
	li.inconsistent_domains = FALSE;
	li.unused_word_cost = unused_word_cost(sent->parse_info);
	li.disjunct_cost = disjunct_cost(pi);
	li.null_cost = null_cost(pi);
	li.link_cost = link_cost(pi);
	li.and_cost = 0;
	li.andlist = NULL;

	if (structure_violation) {
		li.N_violations++;
		free_sublinkage(sublinkage);
		free_digraph(pi, word_links);
		free_DIS_tree(d_root);
		return li;
	}

	if (analyze_pass==PP_SECOND_PASS) {
		li.andlist = build_andlist(sent, word_links);
		li.and_cost = li.andlist->cost;
	}
	else li.and_cost = 0;

	compute_link_names(sent);

	for (i=0; i<pi->N_links; i++) accum.d_type_array[i] = NULL;

	for (;;) {		/* loop through all the sub linkages */
		for (i=0; i<pi->N_links; i++) {
			patch_array[i].used = patch_array[i].changed = FALSE;
			patch_array[i].newl = pi->link_array[i].l;
			patch_array[i].newr = pi->link_array[i].r;
			copy_full_link(&sublinkage->link[i], &(pi->link_array[i]));
		}
		fill_patch_array_DIS(d_root, NULL, word_links);

		for (i=0; i<pi->N_links; i++) {
			if (patch_array[i].changed || patch_array[i].used) {
				sublinkage->link[i]->l = patch_array[i].newl;
				sublinkage->link[i]->r = patch_array[i].newr;
			}
			else if ((dfs_root_word[pi->link_array[i].l] != -1) &&
					 (dfs_root_word[pi->link_array[i].r] != -1)) {
				sublinkage->link[i]->l = -1;
			}
		}

		compute_pp_link_array_connectors(sent, sublinkage);
		compute_pp_link_names(sent, sublinkage);

		/* 'analyze_pass' logic added ALB 1/97 */
		if (analyze_pass==PP_FIRST_PASS) {
			post_process_scan_linkage(postprocessor, opts, sent, sublinkage);
			if (!advance_DIS(d_root)) break;
			else continue;
		}

		pp = post_process(postprocessor, opts, sent, sublinkage, TRUE);

		if (pp==NULL) {
			if (postprocessor != NULL) li.N_violations = 1;
		}
		else if (pp->violation == NULL)  {
			/* The purpose of this check is to make sure that the domain
			   ancestry of each link is consistent across all of the
			   sublinkages it appears in. */

			for (i=0; i<pi->N_links; i++) {
				if (sublinkage->link[i]->l == -1) continue;
				if (accum.d_type_array[i] == NULL) {
					accum.d_type_array[i] = copy_d_type(pp->d_type_array[i]);
				} else {
					dtl0 = pp->d_type_array[i];
					dtl1 = accum.d_type_array[i];
					while((dtl0 != NULL) && (dtl1 != NULL) && (dtl0->type == dtl1->type)) {
						dtl0 = dtl0->next;
						dtl1 = dtl1->next;
					}
					if ((dtl0 != NULL) || (dtl1 != NULL)) break;
				}
			}
			if (i != pi->N_links) {
				li.N_violations++;
				li.inconsistent_domains = TRUE;
			}
		}
		else { /* pp->violation != NULL */
			li.N_violations++;
		}

		if (!advance_DIS(d_root)) break;
	}

	for (i=0; i<pi->N_links; ++i) {
		free_d_type(accum.d_type_array[i]);
	}

	/* if (display_on && (li.N_violations != 0) &&
	   (verbosity > 3) && should_print_messages)
	   printf("P.P. violation in one part of conjunction.\n"); */
	free_sublinkage(sublinkage);
	free_digraph(pi, word_links);
	free_DIS_tree(d_root);
	return li;
}
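The domain-ancestry check in the middle of the loop above walks two D_type_list chains in lock step and flags an inconsistency if they differ in length or in any element's type. A standalone sketch of that comparison, with TypeNode as a hypothetical stand-in for D_type_list:

#include <stdbool.h>
#include <stddef.h>

typedef struct TypeNode { int type; struct TypeNode *next; } TypeNode;

/* Two type lists are consistent only if they have the same length and
 * the same type at every position. */
static bool same_type_list(const TypeNode *a, const TypeNode *b)
{
	while (a != NULL && b != NULL && a->type == b->type) {
		a = a->next;
		b = b->next;
	}
	return (a == NULL) && (b == NULL);
}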
Example #6
/** This does basic post-processing for all linkages.
 */
static void post_process_linkages(Sentence sent, Parse_Options opts)
{
	size_t in;
	size_t N_linkages_post_processed = 0;
	size_t N_valid_linkages = sent->num_valid_linkages;
	size_t N_linkages_alloced = sent->num_linkages_alloced;
	bool twopass = sent->length >= opts->twopass_length;

	/* (optional) First pass: just visit the linkages */
	/* The purpose of the first pass is to make the post-processing
	 * more efficient: by the time the real work is done in the second
	 * pass, the relevant rule set has (hopefully) already been pruned
	 * by the first pass.
	 */
	if (twopass)
	{
		for (in=0; in < N_linkages_alloced; in++)
		{
			Linkage lkg = &sent->lnkages[in];
			Linkage_info *lifo = &lkg->lifo;
			if (lifo->discarded) continue;

			/* We still need link names, even if there has been a
			 * morphology violation. */
			compute_link_names(lkg, sent->string_set);
			if (lifo->N_violations) continue;

			post_process_scan_linkage(sent->postprocessor, lkg);

			if ((49 == in%50) && resources_exhausted(opts->resources)) break;
		}
	}

	/* Second pass: actually perform post-processing */
	for (in=0; in < N_linkages_alloced; in++)
	{
		PP_node *ppn;
		Linkage lkg = &sent->lnkages[in];
		Linkage_info *lifo = &lkg->lifo;

		if (lifo->discarded) continue; /* Invalid morphism construction */

		/* We need link names, even if the morphology check fails */
		if (!twopass) compute_link_names(lkg, sent->string_set);

		ppn = do_post_process(sent->postprocessor, lkg, twopass);
		post_process_free_data(&sent->postprocessor->pp_data);

		if (NULL != ppn->violation)
		{
			N_valid_linkages--;
			lifo->N_violations++;

			/* Set the message only if it is not already set (e.g. by sane_morphism) */
			if (NULL == lifo->pp_violation_msg)
				lifo->pp_violation_msg = ppn->violation;
		}
		N_linkages_post_processed++;

		linkage_score(lkg, opts);
		if ((9 == in%10) && resources_exhausted(opts->resources)) break;
	}

	/* If the timer expired, then we never finished post-processing.
	 * Mark the remaining linkages as bad, as otherwise strange
	 * results get reported.  At any rate, we need to compute the link
	 * names, as otherwise linkage_create() will crash and burn
	 * trying to touch them. */
	for (; in < N_linkages_alloced; in++)
	{
		Linkage lkg = &sent->lnkages[in];
		Linkage_info *lifo = &lkg->lifo;
		if (lifo->discarded) continue;
		if (!twopass) compute_link_names(lkg, sent->string_set);
		N_valid_linkages--;
		lifo->N_violations++;

		/* Set the message only if it is not already set (e.g. by sane_morphism) */
		if (NULL == lifo->pp_violation_msg)
			lifo->pp_violation_msg = "Timeout during postprocessing";
	}

	print_time(opts, "Postprocessed all linkages");

	if (opts->verbosity > 1)
	{
		err_ctxt ec;
		ec.sent = sent;
		err_msg(&ec, Info, "Info: %zu of %zu linkages with no P.P. violations\n",
		        N_valid_linkages, N_linkages_post_processed);
	}

	sent->num_linkages_post_processed = N_linkages_post_processed;
	sent->num_valid_linkages = N_valid_linkages;
}
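Both loops above poll resources_exhausted() only every 50th (or 10th) linkage rather than on each one, since the exhaustion test itself has a cost. A standalone sketch of that throttled-check pattern, with resources_left() as a hypothetical callback:

#include <stdbool.h>
#include <stddef.h>

#define CHECK_EVERY 50

/* Runs n units of work, consulting the (possibly expensive) budget
 * check only once per CHECK_EVERY iterations; returns the index at
 * which processing stopped. */
static size_t process_with_budget(size_t n, bool (*resources_left)(void))
{
	size_t in;
	for (in = 0; in < n; in++) {
		/* ... per-linkage work would go here ... */
		if ((CHECK_EVERY - 1 == in % CHECK_EVERY) && !resources_left())
			break;
	}
	return in;
}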
Example #7
/**
 * This fills the linkage array with morphologically-acceptable
 * linkages.
 */
static void process_linkages(Sentence sent, extractor_t* pex,
                             bool overflowed, Parse_Options opts)
{
	if (0 == sent->num_linkages_found) return;
	if (0 == sent->num_linkages_alloced) return; /* Avoid a later crash. */

	/* Pick random linkages if we get more than what was asked for. */
	bool pick_randomly = overflowed ||
	    (sent->num_linkages_found > (int) sent->num_linkages_alloced);

	sent->num_valid_linkages = 0;
	size_t N_invalid_morphism = 0;

	int itry = 0;
	size_t in = 0;
	int maxtries;

	/* In the case of overflow, which will happen for some long
	 * sentences, but is particularly common for the amy/ady random
	 * splitters, we want to find as many morpho-acceptable linkages
	 * as possible, but keep the CPU usage down, as these might be
	 * very rare. This is due to a bug/feature in the interaction
	 * between the word-graph and the parser: valid morph linkages
	 * can be one in a thousand... or worse.  Search for them, but
	 * don't overdo it.
	 * Note: This problem has recently been alleviated by an
	 * alternatives-compatibility check in the fast matcher - see
	 * alt_connection_possible().
	 */
#define MAX_TRIES 250000

	if (pick_randomly)
	{
		/* Try picking many more linkages, but not more than possible. */
		maxtries = MIN((int) sent->num_linkages_alloced + MAX_TRIES,
		               sent->num_linkages_found);
	}
	else
	{
		maxtries = sent->num_linkages_alloced;
	}

	bool need_init = true;
	for (itry=0; itry<maxtries; itry++)
	{
		Linkage lkg = &sent->lnkages[in];
		Linkage_info * lifo = &lkg->lifo;

		/* Negative values tell extract-links to pick randomly; for
		 * reproducible-rand, the actual value is the rand seed. */
		lifo->index = pick_randomly ? -(itry+1) : itry;

		if (need_init)
		{
			partial_init_linkage(sent, lkg, sent->length);
			need_init = false;
		}
		extract_links(pex, lkg);
		compute_link_names(lkg, sent->string_set);

		if (verbosity_level(+D_PL))
		{
			err_msg(lg_Debug, "chosen_disjuncts before:\n\\");
			print_chosen_disjuncts_words(lkg, /*prt_opt*/true);
		}

		if (sane_linkage_morphism(sent, lkg, opts))
		{
			remove_empty_words(lkg);

			if (verbosity_level(+D_PL))
			{
				err_msg(lg_Debug, "chosen_disjuncts after:\n\\");
				print_chosen_disjuncts_words(lkg, /*prt_opt*/false);
			}

			need_init = true;
			in++;
			if (in >= sent->num_linkages_alloced) break;
		}
		else
		{
			N_invalid_morphism++;
			lkg->num_links = 0;
			lkg->num_words = sent->length;
			// memset(lkg->link_array, 0, lkg->lasz * sizeof(Link));
			memset(lkg->chosen_disjuncts, 0, sent->length * sizeof(Disjunct *));
		}
	}

	/* The last one was alloced, but never actually used. Free it. */
	if (!need_init) free_linkage(&sent->lnkages[in]);

	sent->num_valid_linkages = in;

	/* The remainder of the array is garbage; we never filled it in.
	 * So just pretend that it's shorter than it is */
	sent->num_linkages_alloced = sent->num_valid_linkages;

	lgdebug(D_PARSE, "Info: sane_morphism(): %zu of %d linkages had "
	        "invalid morphology construction\n", N_invalid_morphism,
	        itry + (itry != maxtries));
}
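The sampling bound and the index encoding used above can be isolated into a small standalone sketch (sampling_bound and encode_index are hypothetical helpers written to mirror the logic in process_linkages): when the parse overflowed, or found more linkages than were allocated, at most MAX_TRIES extra random draws are attempted, and a negative index is the signal telling the extractor to pick randomly.

#include <stdbool.h>

#define MAX_TRIES 250000
#define MIN(a,b) (((a) < (b)) ? (a) : (b))

/* Upper bound on extraction attempts, mirroring the maxtries logic. */
static int sampling_bound(int num_found, int num_alloced, bool overflowed)
{
	bool pick_randomly = overflowed || (num_found > num_alloced);
	if (pick_randomly)
		return MIN(num_alloced + MAX_TRIES, num_found);
	return num_alloced;
}

/* Per-try index: negative values request a random (or reproducibly
 * seeded) draw, non-negative values are direct linkage indices. */
static int encode_index(int itry, bool pick_randomly)
{
	return pick_randomly ? -(itry + 1) : itry;
}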