예제 #1
0
/**
 * Insert disjunct d into match table t, in the bucket chosen by hashing
 * connector c (the connector that d's left or right pointer points to).
 *
 * size: bucket count of t.  The bucket index is the hash masked with
 *       (size-1), so size is presumably a power of two -- TODO confirm
 *       against the code that allocates the table.
 * t:    the table itself; the selected bucket is overwritten with the
 *       value returned by the list-insertion helper.
 * d:    the disjunct recorded in the freshly allocated Match_node.
 * c:    the connector whose hash selects the bucket.
 * dir:  1 selects the right-table insertion helper; any other value
 *       (-1 by convention) selects the left-table helper.
 */
static void put_into_match_table(int size, Match_node ** t,
								 Disjunct * d, Connector * c, int dir )
{
	const int bucket = connector_hash(c) & (size-1);
	Match_node * node = (Match_node *) xalloc (sizeof(Match_node));

	/* The node starts out detached; the helper decides where in the
	 * bucket's chain it ends up. */
	node->d = d;
	node->next = NULL;

	t[bucket] = (dir == 1)
		? add_to_right_table_list(node, t[bucket])
		: add_to_left_table_list(node, t[bucket]);
}
예제 #2
0
/**
 * Build and return the list of disjuncts of word w that might match
 * lc, rc, or both.  lw and rw are the words that lc and rc came from.
 *
 * The result is a chain of Match_nodes obtained from get_match_node()
 * and linked through their 'next' fields; no disjunct appears twice.
 * Right-side candidates that are not also left-side candidates come
 * first, followed by all left-side candidates.
 *
 * Duplicates are removed with a quadratic scan.  Every step of that
 * scan is counted in ctxt->match_cost.  The original author notes that
 * in practice match_cost stays below parse_cost, so the quadratic scan
 * was not considered worth optimizing.
 */
Match_node * 
form_match_list(Sentence sent, int w, 
                Connector *lc, int lw, Connector *rc, int rw)
{
	match_context_t *ctxt = sent->match_ctxt;
	Match_node *left_bucket = NULL;
	Match_node *right_bucket = NULL;
	Match_node *left_list = NULL;
	Match_node *right_list = NULL;
	Match_node *kept = NULL;
	Match_node *discard = NULL;
	Match_node *node, *fresh, *probe, *following, *tail;

	/* Pick the hash bucket for each connector that was supplied. */
	if (lc != NULL)
		left_bucket = ctxt->l_table[w][connector_hash(lc) & (ctxt->l_table_size[w]-1)];
	if (rc != NULL)
		right_bucket = ctxt->r_table[w][connector_hash(rc) & (ctxt->r_table_size[w]-1)];

	/* Copy the left bucket, stopping at the first entry whose left
	 * connector word is below lw.  Prepending reverses the order. */
	node = left_bucket;
	while (node != NULL && node->d->left->word >= lw)
	{
		fresh = get_match_node(ctxt);
		fresh->d = node->d;
		fresh->next = left_list;
		left_list = fresh;
		node = node->next;
	}

	/* Likewise for the right bucket, stopping at the first entry whose
	 * right connector word is above rw. */
	node = right_bucket;
	while (node != NULL && node->d->right->word <= rw)
	{
		fresh = get_match_node(ctxt);
		fresh->d = node->d;
		fresh->next = right_list;
		right_list = fresh;
		node = node->next;
	}

	/* Split the right list: nodes whose disjunct already occurs in the
	 * left list go to 'discard', the rest go to 'kept'. */
	for (node = right_list; node != NULL; node = following)
	{
		following = node->next;  /* node->next is overwritten below */
		ctxt->match_cost++;

		probe = left_list;
		while (probe != NULL)
		{
			ctxt->match_cost++;
			if (probe->d == node->d) break;
			probe = probe->next;
		}

		if (probe == NULL)
		{
			/* Not present on the left: keep it. */
			node->next = kept;
			kept = node;
		}
		else
		{
			/* Duplicate of a left entry: hand it back later. */
			node->next = discard;
			discard = node;
		}
	}
	put_match_list(sent, discard);

	/* Append the left list to the surviving right entries. */
	if (kept == NULL) return left_list;

	tail = kept;
	while (tail->next != NULL)
		tail = tail->next;
	tail->next = left_list;
	return kept;
}
예제 #3
0
/**
 * Map connector c to a table slot: connector_hash(c) masked with
 * (CONTABSZ-1).
 *
 * The original author's note states that the hash looks only at the
 * leading upper case letters of the connector string and at the label
 * fields, so that two strings which match (formally) always hash to
 * the same place.  That property belongs to connector_hash(), which is
 * defined elsewhere -- verify there before relying on it.
 */
static inline unsigned int hash_S(Connector * c)
{
	const unsigned int raw = connector_hash(c);

	return raw & (CONTABSZ-1);
}
예제 #4
0
/**
 * Build and return the list of disjuncts coming from word w that might
 * match lc, rc, or both.  lw and rw are the words that lc and rc came
 * from, respectively.
 *
 * The result is a chain of Match_nodes obtained from get_match_node()
 * and linked through their 'next' fields; no disjunct appears twice.
 * When both sides contribute, the right-side candidates that are not
 * also left-side candidates come first, followed by all left-side
 * candidates.
 *
 * Duplicate removal uses one of two strategies:
 *  - a quadratic nested scan, used when either list is short; each
 *    step of it is counted in ctxt->match_cost;
 *  - a sort-then-merge pass, used when the left list has at least 250
 *    entries and the right list at least 9.  This pass does not touch
 *    match_cost.
 *
 * The original author notes that the quadratic scan is normally cheap
 * compared to parsing, but that long sentences with parse overflows
 * can produce match lists hundreds of elements long, at which point
 * the scan was seen to take over 50% of the total run time; the
 * thresholds come from rough measurements.
 */
Match_node *
form_match_list(fast_matcher_t *ctxt, int w,
                Connector *lc, int lw,
                Connector *rc, int rw)
{
	Match_node *lbucket = NULL, *rbucket = NULL;
	Match_node *lhead = NULL, *rhead = NULL;
	Match_node *keep = NULL, *discard = NULL;
	Match_node *cur, *fresh, *probe, *succ, *tail;
	size_t nleft = 0, nright = 0;

	/* Pick the hash bucket for each connector that was supplied. */
	if (lc != NULL)
		lbucket = ctxt->l_table[w][connector_hash(lc) & (ctxt->l_table_size[w]-1)];
	if (rc != NULL)
		rbucket = ctxt->r_table[w][connector_hash(rc) & (ctxt->r_table_size[w]-1)];

	/* Copy the left bucket, stopping at the first entry whose left
	 * connector word is below lw.  Prepending reverses the order. */
	cur = lbucket;
	while (cur != NULL && cur->d->left->word >= lw)
	{
		fresh = get_match_node(ctxt);
		fresh->d = cur->d;
		fresh->next = lhead;
		lhead = fresh;
		nleft++;
		cur = cur->next;
	}

	/* Likewise for the right bucket, stopping at the first entry whose
	 * right connector word is above rw. */
	cur = rbucket;
	while (cur != NULL && cur->d->right->word <= rw)
	{
		fresh = get_match_node(ctxt);
		fresh->d = cur->d;
		fresh->next = rhead;
		rhead = fresh;
		nright++;
		cur = cur->next;
	}

	/* With one side empty there is nothing to de-duplicate. */
	if (rhead == NULL) return lhead;
	if (lhead == NULL) return rhead;

	/* Partition the right list into 'keep' (disjunct absent from the
	 * left list) and 'discard' (disjunct already on the left list). */
	if (nleft >= 250 && nright >= 9)
	{
		/* Long lists: sort pointers to the nodes of each list, then
		 * walk both sorted arrays in step.  The walk compares ->d
		 * addresses, so it relies on addr_compare (defined elsewhere)
		 * ordering entries by ascending ->d -- confirm there. */
		size_t ri, li, k;
		Match_node **rsorted = alloca(nright * sizeof(Match_node*));
		Match_node **lsorted = alloca(nleft * sizeof(Match_node*));

		k = 0;
		for (cur = rhead; cur != NULL; cur = cur->next)
			rsorted[k++] = cur;
		qsort((void *) rsorted, nright, sizeof(Match_node*), addr_compare);

		k = 0;
		for (cur = lhead; cur != NULL; cur = cur->next)
			lsorted[k++] = cur;
		qsort((void *) lsorted, nleft, sizeof(Match_node*), addr_compare);

		/* Only the right nodes are relinked; the left list's own
		 * 'next' chain is left untouched. */
		ri = 0;
		li = 0;
		while (ri < nright && li < nleft)
		{
			if (rsorted[ri]->d < lsorted[li]->d)
			{
				/* No left entry can equal this one any more. */
				rsorted[ri]->next = keep;
				keep = rsorted[ri];
				ri++;
			}
			else if (rsorted[ri]->d == lsorted[li]->d)
			{
				/* Same disjunct on both sides: drop the right copy. */
				rsorted[ri]->next = discard;
				discard = rsorted[ri];
				ri++;
				li++;
			}
			else
			{
				/* Left entry is smaller; move past it. */
				li++;
			}
		}

		/* Left array exhausted: whatever remains on the right is unique. */
		while (ri < nright)
		{
			rsorted[ri]->next = keep;
			keep = rsorted[ri];
			ri++;
		}
	}
	else
	{
		/* Short lists: plain nested scan.  Its cost grows quadratically
		 * and becomes painful once lengths reach a few hundred. */
		for (cur = rhead; cur != NULL; cur = succ)
		{
			succ = cur->next;  /* cur->next is overwritten below */
			ctxt->match_cost++;

			probe = lhead;
			while (probe != NULL)
			{
				ctxt->match_cost++;
				if (probe->d == cur->d) break;
				probe = probe->next;
			}

			if (probe == NULL)
			{
				cur->next = keep;
				keep = cur;
			}
			else
			{
				cur->next = discard;
				discard = cur;
			}
		}
	}

	/* Hand the duplicate right-side nodes back. */
	put_match_list(ctxt, discard);

	/* Append the left list (known to be non-empty at this point) to the
	 * surviving right entries. */
	if (keep == NULL) return lhead;

	tail = keep;
	while (tail->next != NULL)
		tail = tail->next;
	tail->next = lhead;
	return keep;
}