Пример #1
0
/**
 * Build the constituents for sublinkage `s` from its post-processing
 * domains, then refine them.
 *
 * Stages, in the order performed:
 *  1. For each domain in linkage->sublinkage[s].pp_data, compute the
 *     leftmost/rightmost word reached by the domain's links and add a base
 *     constituent labelled by cons_of_domain(); then add extra constituents
 *     (S, SBAR, WHNP, WHPP, WHADVP) selected by the domain type or by the
 *     name of the constituent's start link.
 *  2. Run the gen_comp() passes (opener, subject-phrase, relative clause,
 *     participle/PP modifier, appositive and S-V inversion cases).
 *  3. Call adjust_subordinate_clauses() and step the left edge of every
 *     'p'-type constituent past a leading comma.
 *  4. Repeat the nesting fix-up until a sweep makes no comma adjustment.
 *  5. Stamp every constituent with linkage->current and clear its aux flag.
 *
 * add_constituent() is used as if it returned the index of the constituent
 * it just filled (the code reads ctxt->constituent[c] right after each
 * call) -- NOTE(review): confirm against its definition.
 *
 * @param ctxt          constituent context; ctxt->constituent[] is written
 *                      starting at index numcon_total.
 * @param linkage       the linkage being processed.
 * @param numcon_total  index of the first constituent slot to use.
 * @param s             index of the sublinkage whose domains are read.
 * @return the constituent count for this sublinkage, as returned by the
 *         last gen_comp() pass.
 */
static int read_constituents_from_domains(con_context_t *ctxt, Linkage linkage,
                                          int numcon_total, int s)
{
	int d, c, leftlimit, l, leftmost, rightmost, w, c2, numcon_subl=0, w2;
	List_o_links * dlink;
	int rootleft, adjustment_made;
	Sublinkage * subl;
	Domain domain;

	/* r_limit is not declared in this function; the assignment is kept
	   because the variable may be read elsewhere in the file. */
	r_limit = linkage->num_words-2; /**PV**/

	subl = &linkage->sublinkage[s];

	/* c is the index of the most recently added constituent. It is bumped
	   by the loop header and stepped back just before the base constituent
	   of each domain is added. */
	for (d=0, c=numcon_total; d<subl->pp_data.N_domains; d++, c++) {
		domain = subl->pp_data.domain_array[d];
		rootleft =  linkage_get_link_lword(linkage, domain.start_link);

		/* Start by assigning both left and right limits to one end of
		   the start link: its left word for the domain types listed
		   here, its right word otherwise. This also handles the case
		   where the domain contains no links. */
		if ((domain.type=='c') ||
			(domain.type=='d') ||
			(domain.type=='e') ||
			(domain.type=='f') ||
			(domain.type=='g') ||
			(domain.type=='u') ||
			(domain.type=='y')) {
			leftlimit = 0;
			leftmost = linkage_get_link_lword(linkage, domain.start_link);
			rightmost = linkage_get_link_lword(linkage, domain.start_link);
		}
		else {
			/* The constituent may not extend left of the word that
			   follows the start link's left word. */
			leftlimit = linkage_get_link_lword(linkage, domain.start_link)+1;
			leftmost = linkage_get_link_rword(linkage, domain.start_link);
			rightmost = linkage_get_link_rword(linkage, domain.start_link);
		}

		/* Widen the span over every link in the domain. */
		for (dlink = domain.lol; dlink!=NULL; dlink=dlink->next) {
			l=dlink->link;

			if ((linkage_get_link_lword(linkage, l) < leftmost) &&
				(linkage_get_link_lword(linkage, l) >= leftlimit))
				leftmost = linkage_get_link_lword(linkage, l);

			if (linkage_get_link_rword(linkage, l) > rightmost)
				rightmost = linkage_get_link_rword(linkage, l);
		}

		/* Base constituent for this domain. */
		c--;
		c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost,
						cons_of_domain(domain.type));

		/* Extra constituents. Each test below looks at the start link of
		   the constituent added most recently, so the order matters. */
		if (domain.type=='z') {
			c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
		}
		if (domain.type=='c') {
			c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
		}
		if ((post_process_match("Ce*", ctxt->constituent[c].start_link)==1) ||
			(post_process_match("Rn", ctxt->constituent[c].start_link)==1)) {
			c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "SBAR");
		}
		if ((post_process_match("R*", ctxt->constituent[c].start_link)==1) ||
			(post_process_match("MX#r", ctxt->constituent[c].start_link)==1)) {
			w=leftmost;
			if (strcmp(linkage->word[w], ",")==0) w++;
			c = add_constituent(ctxt, c, linkage, domain, w, w, "WHNP");
		}
		if (post_process_match("Mj", ctxt->constituent[c].start_link)==1) {
			w=leftmost;
			if (strcmp(linkage->word[w], ",")==0) w++;
			c = add_constituent(ctxt, c, linkage, domain, w, w+1, "WHPP");
			c = add_constituent(ctxt, c, linkage, domain, w+1, w+1, "WHNP");
		}
		if ((post_process_match("Ss#d", ctxt->constituent[c].start_link)==1) ||
			(post_process_match("B#d", ctxt->constituent[c].start_link)==1)) {
			c = add_constituent(ctxt, c, linkage, domain, rootleft, rootleft, "WHNP");
			/* The SBAR runs from rootleft to the right edge of the
			   constituent that preceded the WHNP just added. */
			c = add_constituent(ctxt, c, linkage, domain,
							rootleft, ctxt->constituent[c-1].right, "SBAR");
		}
		if (post_process_match("CP", ctxt->constituent[c].start_link)==1) {
			if (strcmp(linkage->word[leftmost], ",")==0)
				ctxt->constituent[c].left++;
			c = add_constituent(ctxt, c, linkage, domain, 1, linkage->num_words-1, "S");
		}
		if ((post_process_match("MVs", ctxt->constituent[c].start_link)==1) ||
			(domain.type=='f')) {
			w=ctxt->constituent[c].left;
			if (strcmp(linkage->word[w], ",")==0)
				w++;
			if (strcmp(linkage->word[w], "when")==0) {
				c = add_constituent(ctxt, c, linkage, domain, w, w, "WHADVP");
			}
		}
		if (domain.type=='t') {
			c = add_constituent(ctxt, c, linkage, domain, leftmost, rightmost, "S");
		}
		if ((post_process_match("QI", ctxt->constituent[c].start_link)==1) ||
			(post_process_match("Mr", ctxt->constituent[c].start_link)==1) ||
			(post_process_match("MX#d", ctxt->constituent[c].start_link)==1)) {
			/* Initialized so that add_constituent() never receives an
			   indeterminate pointer should the assert below not abort. */
			const char * name = "";

			w = leftmost;
			if (strcmp(linkage->word[w], ",")==0) w++;
			if (ctxt->wordtype[w] == NONE)
				name = "WHADVP";
			else if (ctxt->wordtype[w] == QTYPE)
				name = "WHNP";
			else if (ctxt->wordtype[w] == QDTYPE)
				name = "WHNP";
			else
				assert(0, "Unexpected word type");
			c = add_constituent(ctxt, c, linkage, domain, w, w, name);

			if (ctxt->wordtype[w] == QDTYPE) {
				/* Now find the finite verb to the right, start an S.
				   The scan stops at rightmost, so w2 stays within
				   the span of this domain. */
				for (w2 = w+1; w2 < rightmost; w2++)
				  if ((ctxt->wordtype[w2] == STYPE) || (ctxt->wordtype[w2] == PTYPE)) break;

				/* Adjust the right boundary of previous constituent */
				ctxt->constituent[c].right = w2-1;
				c = add_constituent(ctxt, c, linkage, domain, w2, rightmost, "S");
			  }
		}

		/* Sanity checks on the last constituent added for this domain. */
		if (ctxt->constituent[c].domain_type=='\0') {
			error("Error: no domain type assigned to constituent\n");
		}
		if (ctxt->constituent[c].start_link==NULL) {
			error("Error: no type assigned to constituent\n");
		}
	}

	numcon_subl = c - numcon_total;
	/* numcon_subl = handle_islands(linkage, numcon_total, numcon_subl);  */

	if (verbosity >= 2)
		printf("Constituents added at first stage for subl %d:\n",
			   linkage->current);
	/* print_constituent() is called whatever the verbosity; only the
	   heading above is guarded here. */
	for (c = numcon_total; c < numcon_total + numcon_subl; c++)
	{
		print_constituent(ctxt, linkage, c);
	}

	/* Opener case - generates S around main clause.
	   (This must be done first; the S generated will be needed for
	   later cases.) */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "S", "S", 5);

	/* pp opener case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "S", "S", 6);

	/* participle opener case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "S", "S", "S", 9);

	/* Subject-phrase case; every main VP generates an S */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "S", "NP", 1);

	/* Relative clause case; an SBAR generates a complement NP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "SBAR", "NP", "NP", 3);

	/* Participle modifier case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "VP", "NP", "NP", 8);

	/* PP modifying NP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "PP", "NP", "NP", 8);

	/* Appositive case */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "NP", "NP", 4);

	/* S-V inversion case; an NP generates a complement VP */
	numcon_subl =
		gen_comp(ctxt, linkage, numcon_total, numcon_subl, "NP", "SINV", "VP", 7);

	adjust_subordinate_clauses(ctxt, linkage, numcon_total, numcon_subl);

	/* A 'p'-type constituent must not begin with a comma. */
	for (c=numcon_total; c<numcon_total + numcon_subl; c++) {
		if ((ctxt->constituent[c].domain_type=='p') &&
			(strcmp(linkage->word[ctxt->constituent[c].left], ",")==0)) {
			ctxt->constituent[c].left++;
		}
	}

	/* Make sure the constituents are nested. If two constituents
	 * are not nested: whichever constituent has the furthest left
	 * boundary, shift that boundary rightwards to the left boundary
	 * of the other one.
	 *
	 * The sweep is repeated only after a comma adjustment; the plain
	 * boundary shift in the final branch does not request another sweep.
	 */
	while (1) {
		adjustment_made=0;
		for (c=numcon_total; c<numcon_total + numcon_subl; c++) {
			for (c2=numcon_total; c2<numcon_total + numcon_subl; c2++) {
				if ((ctxt->constituent[c].left < ctxt->constituent[c2].left) &&
					(ctxt->constituent[c].right < ctxt->constituent[c2].right) &&
					(ctxt->constituent[c].right >= ctxt->constituent[c2].left)) {

					/* We've found two overlapping constituents.
					   If one is larger, except the smaller one
					   includes an extra comma, adjust the smaller one
					   to exclude the comma */

					if ((strcmp(linkage->word[ctxt->constituent[c2].right], ",")==0) ||
						(strcmp(linkage->word[ctxt->constituent[c2].right],
								"RIGHT-WALL")==0)) {
						if (verbosity>=2)
							printf("Adjusting %d to fix comma overlap\n", c2);
						adjust_for_right_comma(ctxt, linkage, c2);
						adjustment_made=1;
					}
					else if (strcmp(linkage->word[ctxt->constituent[c].left], ",")==0) {
						if (verbosity>=2)
							printf("Adjusting c %d to fix comma overlap\n", c);
						adjust_for_left_comma(ctxt, linkage, c);
						adjustment_made=1;
					}
					else {
					  if (verbosity>=2) {
						printf("WARNING: the constituents aren't nested! Adjusting them." \
							   "(%d, %d)\n", c, c2);
					  }
					  ctxt->constituent[c].left = ctxt->constituent[c2].left;
					}
				}
			}
		}
		if (adjustment_made==0) break;
	}

	/* Certain words could be labelled as auxiliaries (such as forms of
	   "be" with passives, forms of "have" with past participles, "to"
	   with infinitives). In Treebank I these don't get printed unless
	   they're part of an andlist, in which case they get labeled "X".
	   In Treebank II, however, they seem to be treated just like other
	   verbs, so the "aux" stuff isn't needed: every constituent gets
	   aux = 0. A loop that computed the Treebank I labels used to run
	   here, but its results were overwritten unconditionally by this
	   one, so it has been removed. */
	for (c=numcon_total; c<numcon_total + numcon_subl; c++) {
		ctxt->constituent[c].subl = linkage->current;
		ctxt->constituent[c].aux=0;
	}

	return numcon_subl;
}
Пример #2
0
/**
 * Turn the domains stored in linkage->hpsg_pp_data into constituents.
 *
 * The work is done in four passes:
 *  - one base constituent per domain (label from cons_of_domain()), plus
 *    S / SBAR / WHNP / WHPP / WHADVP constituents chosen from the domain
 *    type and from the name of the current constituent's start link;
 *  - the gen_comp() passes;
 *  - adjust_subordinate_clauses() and removal of a leading comma from
 *    'p'-type constituents;
 *  - a fix-up loop that re-sweeps all pairs for as long as a sweep makes
 *    a comma adjustment.
 *
 * add_constituent() is treated as returning the index of the constituent
 * it just filled -- NOTE(review): confirm against its definition.
 *
 * @param ctxt          constituent context; entries of ctxt->constituent[]
 *                      from index numcon_total onward are written.
 * @param linkage       the linkage whose domains are read.
 * @param numcon_total  index of the first constituent slot to use.
 * @return the constituent count produced by the last gen_comp() pass.
 */
static int read_constituents_from_domains(con_context_t *ctxt, Linkage linkage,
                                          int numcon_total)
{
	size_t dom_idx, verb_pos;
	int c, c2, w, con_end, numcon_subl = 0;
	bool adjustment_made;

	/* Pass 1. `c` always names the constituent added last; the loop
	 * header advances it and the base add_constituent() call is handed
	 * c - 1 to compensate. */
	for (dom_idx = 0, c = numcon_total;
	     dom_idx < linkage->hpsg_pp_data.N_domains;
	     dom_idx++, c++)
	{
		size_t span_left, span_right, left_bound;
		int start_lword;
		List_o_links *node;
		Domain domain = linkage->hpsg_pp_data.domain_array[dom_idx];

		start_lword = linkage_get_link_lword(linkage, domain.start_link);

		/* Seed the span with a single word: the left end of the start
		 * link for the listed domain types, its right end for all
		 * others. A domain without links keeps this one-word span. */
		switch (domain.type)
		{
			case 'c':
			case 'd':
			case 'e':
			case 'f':
			case 'g':
			case 'u':
			case 'y':
				left_bound = 0;
				span_left = span_right =
					linkage_get_link_lword(linkage, domain.start_link);
				break;

			default:
				left_bound =
					linkage_get_link_lword(linkage, domain.start_link) + 1;
				span_left = span_right =
					linkage_get_link_rword(linkage, domain.start_link);
				break;
		}

		/* Stretch the span over each link of the domain, never going
		 * left of left_bound. */
		for (node = domain.lol; node != NULL; node = node->next)
		{
			const size_t lnk = node->link;
			const size_t lnk_left = linkage_get_link_lword(linkage, lnk);
			const size_t lnk_right = linkage_get_link_rword(linkage, lnk);

			if ((lnk_left >= left_bound) && (lnk_left < span_left))
			{
				span_left = lnk_left;
			}
			if (lnk_right > span_right)
			{
				span_right = lnk_right;
			}
		}

		/* The domain's own constituent. */
		c = add_constituent(ctxt, c - 1, linkage, &domain, span_left, span_right,
		                    cons_of_domain(linkage, domain.type));

		/* Everything below adds extras. The tests read the start link
		 * of whichever constituent is current at that point, so their
		 * order must not change. */
		if (domain.type == 'z')
		{
			c = add_constituent(ctxt, c, linkage, &domain,
			                    span_left, span_right, "S");
		}
		if (domain.type == 'c')
		{
			c = add_constituent(ctxt, c, linkage, &domain,
			                    span_left, span_right, "S");
		}

		if ((post_process_match("Ce*", ctxt->constituent[c].start_link) == 1) ||
		    (post_process_match("Rn", ctxt->constituent[c].start_link) == 1))
		{
			c = add_constituent(ctxt, c, linkage, &domain,
			                    span_left, span_right, "SBAR");
		}

		if ((post_process_match("R*", ctxt->constituent[c].start_link) == 1) ||
		    (post_process_match("MX#r", ctxt->constituent[c].start_link) == 1))
		{
			/* One-word WHNP at the span start, skipping a comma. */
			w = span_left;
			if (strcmp(linkage->word[w], ",") == 0)
			{
				w++;
			}
			c = add_constituent(ctxt, c, linkage, &domain, w, w, "WHNP");
		}

		if (post_process_match("Mj", ctxt->constituent[c].start_link) == 1)
		{
			/* Two-word WHPP whose second word is a WHNP. */
			w = span_left;
			if (strcmp(linkage->word[w], ",") == 0)
			{
				w++;
			}
			c = add_constituent(ctxt, c, linkage, &domain, w, w+1, "WHPP");
			c = add_constituent(ctxt, c, linkage, &domain, w+1, w+1, "WHNP");
		}

		if ((post_process_match("Ss#d", ctxt->constituent[c].start_link) == 1) ||
		    (post_process_match("B#d", ctxt->constituent[c].start_link) == 1))
		{
			c = add_constituent(ctxt, c, linkage, &domain,
			                    start_lword, start_lword, "WHNP");
			/* Right edge is taken from the constituent that came
			 * before the WHNP just added. */
			c = add_constituent(ctxt, c, linkage, &domain,
			                    start_lword, ctxt->constituent[c-1].right, "SBAR");
		}

		if (post_process_match("CP", ctxt->constituent[c].start_link) == 1)
		{
			if (strcmp(linkage->word[span_left], ",") == 0)
			{
				ctxt->constituent[c].left++;
			}
			c = add_constituent(ctxt, c, linkage, &domain,
			                    1, linkage->num_words-1, "S");
		}

		if ((post_process_match("MVs", ctxt->constituent[c].start_link) == 1) ||
		    (domain.type == 'f'))
		{
			/* A leading "when" (after an optional comma) is a WHADVP. */
			w = ctxt->constituent[c].left;
			if (strcmp(linkage->word[w], ",") == 0)
			{
				w++;
			}
			if (strcmp(linkage->word[w], "when") == 0)
			{
				c = add_constituent(ctxt, c, linkage, &domain, w, w, "WHADVP");
			}
		}

		if (domain.type == 't')
		{
			c = add_constituent(ctxt, c, linkage, &domain,
			                    span_left, span_right, "S");
		}

		if ((post_process_match("QI", ctxt->constituent[c].start_link) == 1) ||
		    (post_process_match("Mr", ctxt->constituent[c].start_link) == 1) ||
		    (post_process_match("MX#d", ctxt->constituent[c].start_link) == 1))
		{
			const char *label = "";

			w = span_left;
			if (strcmp(linkage->word[w], ",") == 0)
			{
				w++;
			}

			/* Label is picked from the word type of the first word. */
			if (ctxt->wordtype[w] == NONE)
			{
				label = "WHADVP";
			}
			else if ((ctxt->wordtype[w] == QTYPE) ||
			         (ctxt->wordtype[w] == QDTYPE))
			{
				label = "WHNP";
			}
			else
			{
				assert(0, "Unexpected word type");
			}
			c = add_constituent(ctxt, c, linkage, &domain, w, w, label);

			if (ctxt->wordtype[w] == QDTYPE)
			{
				/* Walk right to the first STYPE/PTYPE word, but no
				 * further than the right end of the span. */
				verb_pos = w+1;
				while ((verb_pos < span_right) &&
				       (ctxt->wordtype[verb_pos] != STYPE) &&
				       (ctxt->wordtype[verb_pos] != PTYPE))
				{
					verb_pos++;
				}

				/* The constituent just added now ends right before
				 * that word; an S covers the remainder. */
				ctxt->constituent[c].right = verb_pos - 1;
				c = add_constituent(ctxt, c, linkage, &domain,
				                    verb_pos, span_right, "S");
			}
		}

		/* Report a current constituent that is missing its domain type
		 * or start link. NOTE(review): `ec` is handed to err_msg()
		 * without any field being set -- confirm that is intended. */
		if (ctxt->constituent[c].domain_type == '\0')
		{
			err_ctxt ec;
			err_msg(&ec, Error, "Error: no domain type assigned to constituent\n");
		}
		if (ctxt->constituent[c].start_link == NULL)
		{
			err_ctxt ec;
			err_msg(&ec, Error, "Error: no type assigned to constituent\n");
		}
	}

	numcon_subl = c - numcon_total;

	if (verbosity >= 2)
	{
		printf("Constituents added at first stage:\n");
	}
	/* Only the heading is guarded here; print_constituent() is called
	 * for every constituent regardless of verbosity. */
	for (c = numcon_total; c < numcon_total + numcon_subl; c++)
	{
		print_constituent(ctxt, linkage, c);
	}

	/* Pass 2: gen_comp() cases. The opener case comes first because the
	 * S it generates is needed by the later cases. */
	numcon_subl = gen_comp(ctxt, linkage, numcon_total, numcon_subl,
	                       "SBAR", "S", "S", CASE_OPENER);

	/* PP opener. */
	numcon_subl = gen_comp(ctxt, linkage, numcon_total, numcon_subl,
	                       "PP", "S", "S", CASE_PPOPEN);

	/* Participle opener. */
	numcon_subl = gen_comp(ctxt, linkage, numcon_total, numcon_subl,
	                       "S", "S", "S", CASE_PART_OPEN);

	/* Subject phrase: every main VP generates an S. */
	numcon_subl = gen_comp(ctxt, linkage, numcon_total, numcon_subl,
	                       "VP", "S", "NP", CASE_S);

	/* Relative clause: an SBAR generates a complement NP. */
	numcon_subl = gen_comp(ctxt, linkage, numcon_total, numcon_subl,
	                       "SBAR", "NP", "NP", CASE_REL_CLAUSE);

	/* Participle modifier. */
	numcon_subl = gen_comp(ctxt, linkage, numcon_total, numcon_subl,
	                       "VP", "NP", "NP", CASE_PART_MOD);

	/* PP modifying an NP. */
	numcon_subl = gen_comp(ctxt, linkage, numcon_total, numcon_subl,
	                       "PP", "NP", "NP", CASE_PART_MOD);

	/* Appositive. */
	numcon_subl = gen_comp(ctxt, linkage, numcon_total, numcon_subl,
	                       "NP", "NP", "NP", CASE_APPOS);

	/* S-V inversion: an NP generates a complement VP. */
	numcon_subl = gen_comp(ctxt, linkage, numcon_total, numcon_subl,
	                       "NP", "SINV", "VP", CASE_SVINV);

	/* The count is final from here on, so the end index can be fixed. */
	con_end = numcon_total + numcon_subl;

	/* Pass 3. */
	adjust_subordinate_clauses(ctxt, linkage, numcon_total, numcon_subl);
	for (c = numcon_total; c < con_end; c++)
	{
		if ((ctxt->constituent[c].domain_type == 'p') &&
		    (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0))
		{
			ctxt->constituent[c].left++;
		}
	}

	/* Pass 4: enforce nesting. For a pair (c, c2) where c starts and
	 * ends before c2 yet reaches into it, first try to drop a trailing
	 * comma / RIGHT-WALL from c2, then a leading comma from c; failing
	 * both, pull c's left edge up to c2's. Only the two comma fixes ask
	 * for another sweep. */
	do
	{
		adjustment_made = false;

		for (c = numcon_total; c < con_end; c++)
		{
			for (c2 = numcon_total; c2 < con_end; c2++)
			{
				if ((ctxt->constituent[c].left >= ctxt->constituent[c2].left) ||
				    (ctxt->constituent[c].right >= ctxt->constituent[c2].right) ||
				    (ctxt->constituent[c].right < ctxt->constituent[c2].left))
				{
					continue; /* this pair does not cross */
				}

				if ((strcmp(linkage->word[ctxt->constituent[c2].right], ",") == 0) ||
				    (strcmp(linkage->word[ctxt->constituent[c2].right],
				            "RIGHT-WALL") == 0))
				{
					if (verbosity >= 2)
					{
						printf("Adjusting %d to fix comma overlap\n", c2);
					}
					adjust_for_right_comma(ctxt, linkage, c2);
					adjustment_made = true;
				}
				else if (strcmp(linkage->word[ctxt->constituent[c].left], ",") == 0)
				{
					if (verbosity >= 2)
					{
						printf("Adjusting c %d to fix comma overlap\n", c);
					}
					adjust_for_left_comma(ctxt, linkage, c);
					adjustment_made = true;
				}
				else
				{
					if (verbosity >= 2)
					{
						err_ctxt ec;
						err_msg(&ec, Warn,
						      "Warning: the constituents aren't nested! "
						      "Adjusting them. (%d, %d)\n", c, c2);
					}
					ctxt->constituent[c].left = ctxt->constituent[c2].left;
				}
			}
		}
	} while (adjustment_made);

	assert (con_end < ctxt->conlen, "Too many constituents");
	return numcon_subl;
}