Ejemplo n.º 1
0
/**
 * Build (or fetch from the x-table) the Parse_set describing every way
 * to parse the region between words lw and rw, given the connector
 * lists le (hanging off lw) and re (hanging off rw) and exactly
 * null_count null links to distribute inside the region.
 *
 * Returns NULL if there are no ways to parse (the count table holds 0
 * or -1 for these parameters), or returns a pointer to a set structure
 * representing all the ways to parse.
 *
 * ld and rd are the disjuncts associated with the left and right edge
 * of the region; they are only passed through to make_choice() and to
 * the recursive calls.  islands_ok and pi are likewise passed through
 * unchanged to every recursive call.
 *
 * This code is similar to do_count() in count.c -- for a good reason:
 * the do_count() function did a full parse, but didn't actually
 * allocate any memory structures to hold the parse.  This also does
 * a full parse, but it also allocates and fills out the various
 * parse structures.
 */
static
Parse_set * mk_parse_set(Sentence sent, fast_matcher_t *mchxt,
                         count_context_t * ctxt,
                         Disjunct *ld, Disjunct *rd, int lw, int rw,
                         Connector *le, Connector *re, unsigned int null_count,
                         bool islands_ok, Parse_info pi)
{
    Disjunct * d, * dis;
    int start_word, end_word, w;
    bool Lmatch, Rmatch;
    unsigned int lnull_count, rnull_count;
    int i, j;
    Parse_set *ls[4], *rs[4], *lset, *rset;
    Parse_choice * a_choice;

    Match_node * m, *m1;
    X_table_connector *xt;
    s64 count;

    /* null_count is unsigned, so a "negative" value shows up as a huge one. */
    assert(null_count < 0x7fff, "mk_parse_set() called with null_count < 0.");

    count = table_lookup(ctxt, lw, rw, le, re, null_count);

    /*
      assert(count >= 0, "mk_parse_set() called on params that were not in the table.");
      Actually, we can't assert this, because of the pseudocount technique that's
      used in count().  It's not the case that every call to mk_parse_set() has already
      been put into the table.
     */

    /* 0 means "no parses"; -1 is also treated as "no parses" here
     * (presumably the table's not-found value -- confirm in table_lookup). */
    if ((count == 0) || (count == -1)) return NULL;

    xt = x_table_pointer(lw, rw, le, re, null_count, pi);

    if (xt != NULL) return xt->set;  /* we've already computed it */

    /* Start it out with the empty set of options. */
    /* This entry must be updated before we return. */
    xt = x_table_store(lw, rw, le, re, null_count, pi);

    xt->set->count = count;  /* the count we already computed */
    /* this count is non-zero */

    /* Adjacent words: there is nothing between them, so no choices to add. */
    if (rw == 1 + lw) return xt->set;

    /* No connectors on either side: the words in between can only be
     * accounted for by null links. */
    if ((le == NULL) && (re == NULL))
    {
        if (!islands_ok && (lw != -1)) return xt->set;

        if (null_count == 0) return xt->set;

        /* Consume one null link at word lw+1, then recurse on the rest
         * of the region with null_count-1. */
        w = lw + 1;
        for (dis = sent->word[w].d; dis != NULL; dis = dis->next)
        {
            /* Only disjuncts with no left connectors can start here. */
            if (dis->left == NULL)
            {
                rs[0] = mk_parse_set(sent, mchxt, ctxt, dis, NULL, w, rw, dis->right,
                                     NULL, null_count-1, islands_ok, pi);
                if (rs[0] == NULL) continue;
                a_choice = make_choice(dummy_set(), lw, w, NULL, NULL,
                                       rs[0], w, rw, NULL, NULL,
                                       NULL, NULL, NULL);
                put_choice_in_set(xt->set, a_choice);
            }
        }
        /* Also try word w with no disjunct and no connectors at all. */
        rs[0] = mk_parse_set(sent, mchxt, ctxt, NULL, NULL, w, rw, NULL, NULL,
                             null_count-1, islands_ok, pi);
        if (rs[0] != NULL)
        {
            a_choice = make_choice(dummy_set(), lw, w, NULL, NULL,
                                   rs[0], w, rw, NULL, NULL,
                                   NULL, NULL, NULL);
            put_choice_in_set(xt->set, a_choice);
        }
        return xt->set;
    }

    /* Candidate split words w run over [start_word, end_word). */
    if (le == NULL)
    {
        start_word = lw + 1;
    }
    else
    {
        start_word = le->word;
    }

    if (re == NULL)
    {
        end_word = rw;
    }
    else
    {
        end_word = re->word + 1;
    }

    /* This condition can never be true here. It is included so GCC will be able
     * to optimize the loop over "null_count".  Without this check, GCC thinks this
     * loop may be an infinite loop and it may omit some optimizations. */
    if (UINT_MAX == null_count) return NULL;

    for (w = start_word; w < end_word; w++)
    {
        /* m1 keeps the list head so it can be released after the scan. */
        m1 = m = form_match_list(mchxt, w, le, lw, re, rw);
        for (; m!=NULL; m=m->next)
        {
            d = m->d;
            for (lnull_count = 0; lnull_count <= null_count; lnull_count++)
            {
                rnull_count = null_count-lnull_count;
                /* now lnull_count and rnull_count are the null_counts we're assigning to
                 * those parts respectively */

                /* Now, we determine if (based on table only) we can see that
                   the current range is not parsable. */

                Lmatch = (le != NULL) && (d->left != NULL) && do_match(le, d->left, lw, w);
                Rmatch = (d->right != NULL) && (re != NULL) && do_match(d->right, re, w, rw);
                for (i=0; i<4; i++) {
                    ls[i] = rs[i] = NULL;
                }
                /* For a matched pair, each connector is either consumed
                 * (advance to ->next) or, if it is a multi-connector, may
                 * be kept; that gives up to four sub-regions per side. */
                if (Lmatch)
                {
                    ls[0] = mk_parse_set(sent, mchxt, ctxt, ld, d, lw, w, le->next, d->left->next, lnull_count, islands_ok, pi);
                    if (le->multi) ls[1] = mk_parse_set(sent, mchxt, ctxt, ld, d, lw, w, le, d->left->next, lnull_count, islands_ok, pi);
                    if (d->left->multi) ls[2] = mk_parse_set(sent, mchxt, ctxt, ld, d, lw, w, le->next, d->left, lnull_count, islands_ok, pi);
                    if (le->multi && d->left->multi) ls[3] = mk_parse_set(sent, mchxt, ctxt, ld, d, lw, w, le, d->left, lnull_count, islands_ok, pi);
                }
                if (Rmatch)
                {
                    rs[0] = mk_parse_set(sent, mchxt, ctxt, d, rd, w, rw, d->right->next, re->next, rnull_count, islands_ok, pi);
                    if (d->right->multi) rs[1] = mk_parse_set(sent, mchxt, ctxt, d, rd, w,rw,d->right,re->next, rnull_count, islands_ok, pi);
                    if (re->multi) rs[2] = mk_parse_set(sent, mchxt, ctxt, d, rd, w, rw, d->right->next, re, rnull_count, islands_ok, pi);
                    if (d->right->multi && re->multi) rs[3] = mk_parse_set(sent, mchxt, ctxt, d, rd, w, rw, d->right, re, rnull_count, islands_ok, pi);
                }

                /* Links used on both sides: pair every left set with every right set. */
                for (i=0; i<4; i++)
                {
                    /* this ordering is probably not consistent with that
                     *  needed to use list_links */
                    if (ls[i] == NULL) continue;
                    for (j=0; j<4; j++)
                    {
                        if (rs[j] == NULL) continue;
                        a_choice = make_choice(ls[i], lw, w, le, d->left,
                                               rs[j], w, rw, d->right, re,
                                               ld, d, rd);
                        put_choice_in_set(xt->set, a_choice);
                    }
                }

                if (ls[0] != NULL || ls[1] != NULL || ls[2] != NULL || ls[3] != NULL)
                {
                    /* evaluate using the left match, but not the right */
                    rset = mk_parse_set(sent, mchxt, ctxt, d, rd, w, rw, d->right, re, rnull_count, islands_ok, pi);
                    if (rset != NULL)
                    {
                        for (i=0; i<4; i++)
                        {
                            if (ls[i] == NULL) continue;
                            /* this ordering is probably not consistent with
                             * that needed to use list_links */
                            a_choice = make_choice(ls[i], lw, w, le, d->left,
                                                   rset, w, rw, NULL /* d->right */,
                                                   re,  /* the NULL indicates no link*/
                                                   ld, d, rd);
                            put_choice_in_set(xt->set, a_choice);
                        }
                    }
                }
                /* The right-only case is tried only when there is no left
                 * connector list at all (le == NULL). */
                if ((le == NULL) && (rs[0] != NULL ||
                                     rs[1] != NULL || rs[2] != NULL || rs[3] != NULL))
                {
                    /* evaluate using the right match, but not the left */
                    lset = mk_parse_set(sent, mchxt, ctxt, ld, d, lw, w, le, d->left, lnull_count, islands_ok, pi);

                    if (lset != NULL)
                    {
                        for (i=0; i<4; i++)
                        {
                            if (rs[i] == NULL) continue;
                            /* this ordering is probably not consistent with
                             * that needed to use list_links */
                            a_choice = make_choice(lset, lw, w, NULL /* le */,
                                                   d->left,  /* NULL indicates no link */
                                                   rs[i], w, rw, d->right, re,
                                                   ld, d, rd);
                            put_choice_in_set(xt->set, a_choice);
                        }
                    }
                }
            }
        }
        put_match_list(mchxt, m1);
    }
    /* Reset the set's cursor to its first choice before handing it back. */
    xt->set->current = xt->set->first;
    return xt->set;
}
Ejemplo n.º 2
0
/**
 * Collect the disjuncts of word w that might match lc, rc, or both,
 * and return them as a singly linked list of Match_nodes (chained
 * through their next pointers).  lw and rw are the words from which
 * lc and rc came respectively.
 *
 * Candidates are copied out of the hash bucket selected by each
 * connector, stopping at the first entry whose connector word falls
 * outside [lw, rw].  A disjunct found through both connectors appears
 * only once in the result: the right-hand candidates that duplicate a
 * left-hand candidate are handed back via put_match_list().
 *
 * The duplicate removal is quadratic.  In practice the match_cost is
 * less than the parse_cost (and the loop is tiny), so there's no
 * reason to bother to fix this.
 */
Match_node *
form_match_list(Sentence sent, int w,
                Connector *lc, int lw, Connector *rc, int rw)
{
	match_context_t *ctxt = sent->match_ctxt;
	Match_node *left_bucket = NULL;
	Match_node *right_bucket = NULL;
	Match_node *left_list = NULL;     /* copies of left candidates */
	Match_node *right_list = NULL;    /* copies of right candidates */
	Match_node *unique_right = NULL;  /* right candidates not in left_list */
	Match_node *discard = NULL;       /* right candidates that were duplicates */
	Match_node *src, *node, *next, *tail;

	/* Pick the hash bucket for each connector that is present. */
	if (lc != NULL)
	{
		left_bucket = ctxt->l_table[w][connector_hash(lc) & (ctxt->l_table_size[w]-1)];
	}
	if (rc != NULL)
	{
		right_bucket = ctxt->r_table[w][connector_hash(rc) & (ctxt->r_table_size[w]-1)];
	}

	/* Copy left candidates (prepending, so bucket order is reversed)
	 * until one reaches further left than lw. */
	src = left_bucket;
	while (src != NULL)
	{
		if (src->d->left->word < lw) break;
		node = get_match_node(ctxt);
		node->d = src->d;
		node->next = left_list;
		left_list = node;
		src = src->next;
	}

	/* Same for right candidates, stopping once one reaches beyond rw. */
	src = right_bucket;
	while (src != NULL)
	{
		if (src->d->right->word > rw) break;
		node = get_match_node(ctxt);
		node->d = src->d;
		node->next = right_list;
		right_list = node;
		src = src->next;
	}

	/* Split the right candidates into those already present in the
	 * left list (to be released) and the genuinely new ones. */
	src = right_list;
	while (src != NULL)
	{
		next = src->next;   /* saved: src->next is overwritten below */
		ctxt->match_cost++;
		for (node = left_list; node != NULL; node = node->next)
		{
			ctxt->match_cost++;
			if (src->d == node->d) break;
		}
		if (node == NULL)
		{
			/* not among the left candidates: keep it */
			src->next = unique_right;
			unique_right = src;
		}
		else
		{
			/* duplicate of a left candidate: release it later */
			src->next = discard;
			discard = src;
		}
		src = next;
	}
	put_match_list(sent, discard);

	/* Result is the surviving right candidates followed by the left ones. */
	if (unique_right == NULL) return left_list;
	tail = unique_right;
	while (tail->next != NULL)
	{
		tail = tail->next;
	}
	tail->next = left_list;
	return unique_right;
}
Ejemplo n.º 3
0
/**
 * Mark as useful all disjuncts involved in some way to complete the
 * structure within the current region.  Note that only disjuncts
 * strictly between lw and rw will be marked.  If it so happens that
 * this region itself is not valid, then this fact will be recorded
 * in the table, and nothing else happens.
 *
 * lw and rw are the words bounding the region; le and re are the
 * connector lists hanging off lw and rw respectively (either may be
 * NULL).  The function recurses into every sub-region that
 * region_valid() reports as fillable, setting d->marked on each
 * disjunct that takes part.
 */
static void mark_region(Sentence sent,
                        int lw, int rw, Connector *le, Connector *re)
{

	Disjunct * d;
	int left_valid, right_valid, i;
	int start_word, end_word;
	int w;
	Match_node * m, *m1;
	count_context_t *ctxt = sent->count_ctxt;

	/* region_valid() yields 0 (cannot be filled), 1 (valid, not yet
	 * marked) or 2 (valid and already marked). */
	i = region_valid(sent, lw, rw, le, re);
	if ((i==0) || (i==2)) return;
	/* we only reach this point if it's a valid unmarked region, i=1 */
	/* Record the value 2 for this region so it is not walked again. */
	table_update(ctxt, lw, rw, le, re, 0, 2);

	/* No connectors on either side, null links are in use and the
	 * words are not adjacent: step one word to the right and mark
	 * whatever can start there with no left connectors. */
	if ((le == NULL) && (re == NULL) && (ctxt->null_links) && (rw != 1+lw)) {
		w = lw+1;
		for (d = ctxt->local_sent[w].d; d != NULL; d = d->next) {
			if ((d->left == NULL) && region_valid(sent, w, rw, d->right, NULL)) {
				d->marked = TRUE;
				mark_region(sent, w, rw, d->right, NULL);
			}
		}
		mark_region(sent, w, rw, NULL, NULL);
		return;
	}

	/* Candidate split words w run over [start_word, end_word]. */
	if (le == NULL) {
		start_word = lw+1;
	} else {
		start_word = le->word;
	}
	if (re == NULL) {
		end_word = rw-1;
	} else {
		end_word = re->word;
	}

	for (w=start_word; w < end_word+1; w++) {
		/* m1 keeps the list head so it can be released after the scan. */
		m1 = m = form_match_list(sent, w, le, lw, re, rw);
		for (; m!=NULL; m=m->next) {
			d = m->d;
			/* mark_cost++;*/
			/* left_valid: le links to d->left and at least one of the
			 * resulting left sub-regions (connector consumed, or kept
			 * when it is a multi-connector) can be filled. */
			left_valid = (((le != NULL) && (d->left != NULL) && x_prune_match(ctxt, le, d->left, lw, w)) &&
						  ((region_valid(sent, lw, w, le->next, d->left->next)) ||
						   ((le->multi) && region_valid(sent, lw, w, le, d->left->next)) ||
						   ((d->left->multi) && region_valid(sent, lw, w, le->next, d->left)) ||
						   ((le->multi && d->left->multi) && region_valid(sent, lw, w, le, d->left))));
			/* right_valid: the mirror-image test for d->right and re. */
			right_valid = (((d->right != NULL) && (re != NULL) && x_prune_match(ctxt, d->right, re, w, rw)) &&
						   ((region_valid(sent, w, rw, d->right->next,re->next)) ||
							((d->right->multi) && region_valid(sent, w,rw,d->right,re->next))  ||
							((re->multi) && region_valid(sent, w, rw, d->right->next, re)) ||
							((d->right->multi && re->multi) && region_valid(sent, w, rw, d->right, re))));

			/* The following if statements could be restructured to avoid superfluous calls
			   to mark_region.  It didn't seem a high priority, so I didn't optimize this.
			   */

			/* Link on the left only; the right side must be fillable as is. */
			if (left_valid && region_valid(sent, w, rw, d->right, re)) {
				d->marked = TRUE;
				mark_region(sent, w, rw, d->right, re);
				mark_region(sent, lw, w, le->next, d->left->next);
				if (le->multi) mark_region(sent, lw, w, le, d->left->next);
				if (d->left->multi) mark_region(sent, lw, w, le->next, d->left);
				if (le->multi && d->left->multi) mark_region(sent, lw, w, le, d->left);
			}

			/* Link on the right only; the left side must be fillable as is. */
			if (right_valid && region_valid(sent, lw, w, le, d->left)) {
				d->marked = TRUE;
				mark_region(sent, lw, w, le, d->left);
				mark_region(sent, w, rw, d->right->next,re->next);
				if (d->right->multi) mark_region(sent, w,rw,d->right,re->next);
				if (re->multi) mark_region(sent, w, rw, d->right->next, re);
				if (d->right->multi && re->multi) mark_region(sent, w, rw, d->right, re);
			}

			/* Links on both sides. */
			if (left_valid && right_valid) {
				d->marked = TRUE;
				mark_region(sent, lw, w, le->next, d->left->next);
				if (le->multi) mark_region(sent, lw, w, le, d->left->next);
				if (d->left->multi) mark_region(sent, lw, w, le->next, d->left);
				if (le->multi && d->left->multi) mark_region(sent, lw, w, le, d->left);
				mark_region(sent, w, rw, d->right->next,re->next);
				if (d->right->multi) mark_region(sent, w,rw,d->right,re->next);
				if (re->multi) mark_region(sent, w, rw, d->right->next, re);
				if (d->right->multi && re->multi) mark_region(sent, w, rw, d->right, re);
			}
		}
		put_match_list(sent, m1);
	}
}
Ejemplo n.º 4
0
/*
 * Build (or fetch from the x-table) the Parse_set describing every way
 * to parse the region between words lw and rw, given the connector
 * lists le (on lw) and re (on rw) and a total cost of exactly `cost`
 * to distribute inside the region.
 *
 * ld and rd are the disjuncts at the left and right edge of the region;
 * they and pi are passed through to make_choice(), the x-table helpers
 * and the recursive calls.  The function also reads the file-level
 * names islands_ok and local_sent.
 */
Parse_set * parse_set(Disjunct *ld, Disjunct *rd, int lw, int rw, 
		      Connector *le, Connector *re, int cost, Parse_info * pi) {
    /* returns NULL if there are no ways to parse, or returns a pointer
       to a set structure representing all the ways to parse */

    Disjunct * d, * dis;
    int start_word, end_word, w;
    int lcost, rcost, Lmatch, Rmatch;
    int i, j;
    Parse_set *ls[4], *rs[4], *lset, *rset;
    Parse_choice * a_choice;

    Match_node * m, *m1;
    X_table_connector *xt;
    int count;

    assert(cost >= 0, "parse_set() called with cost < 0.");

    count = table_lookup(lw, rw, le, re, cost);

    /*
      assert(count >= 0, "parse_set() called on params that were not in the table.");
      Actually, we can't assert this, because of the pseudocount technique that's
      used in count().  It's not the case that every call to parse_set() has already
      been put into the table.
     */

    /* 0 means "no parses"; -1 is also treated as "no parses" here
     * (presumably the table's not-found value -- confirm in table_lookup). */
    if ((count == 0) || (count == -1)) return NULL;
    
    xt = x_table_pointer(lw, rw, le, re, cost, pi);

    if (xt == NULL) {
	xt = x_table_store(lw, rw, le, re, cost, empty_set(), pi);
	/* start it out with the empty set of options */
	/* this entry must be updated before we return */
    } else {
	return xt->set;  /* we've already computed it */
    }

    xt->set->count = count;  /* the count we already computed */
    /* this count is non-zero */
    
    /* Adjacent words: nothing lies between them, so no choices to add. */
    if (rw == 1+lw) return xt->set;
    /* No connectors on either side: the words in between can only be
       accounted for by spending cost on skipped words. */
    if ((le == NULL) && (re == NULL)) {
	if (!islands_ok && (lw != -1)) {
	    return xt->set;
	}
	if (cost == 0) {
	    return xt->set;
	} else {
	    /* Spend one unit of cost at word lw+1 and recurse on the rest. */
	    w = lw+1;
	    for (dis = local_sent[w].d; dis != NULL; dis = dis->next) {
		/* Only disjuncts with no left connectors can start here. */
		if (dis->left == NULL) {
		    rs[0] = parse_set(dis, NULL, w, rw, dis->right, NULL, cost-1, pi);
		    if (rs[0] == NULL) continue;
		    a_choice = make_choice(dummy_set(), lw, w, NULL, NULL,
					   rs[0], w, rw, NULL, NULL,
					   NULL, NULL, NULL);
		    put_choice_in_set(xt->set, a_choice);
		}
	    }
	    /* Also try word w with no disjunct and no connectors at all. */
	    rs[0] = parse_set(NULL, NULL, w, rw, NULL, NULL, cost-1, pi); 
	    if (rs[0] != NULL) {
		a_choice = make_choice(dummy_set(), lw, w, NULL, NULL,
				       rs[0], w, rw, NULL, NULL,
				       NULL, NULL, NULL);
		put_choice_in_set(xt->set, a_choice);
	    }
	    return xt->set;
	}
    }
    
    /* Candidate split words w run over [start_word, end_word]. */
    if (le == NULL) {
	start_word = lw+1;
    } else {
	start_word = le->word;

    }

    if (re == NULL) {
	end_word = rw-1;
    } else {
	end_word = re->word;
    }
    
    for (w=start_word; w <= end_word; w++) {
	/* m1 keeps the list head so it can be released after the scan. */
	m1 = m = form_match_list(w, le, lw, re, rw); 
	for (; m!=NULL; m=m->next) {
	    d = m->d;
	    for (lcost = 0; lcost <= cost; lcost++) {
		rcost = cost-lcost;
		/* now lcost and rcost are the costs we're assigning to those parts respectively */

		/* Now, we determine if (based on table only) we can see that
		   the current range is not parsable. */

		Lmatch = (le != NULL) && (d->left != NULL) && match(le, d->left, lw, w);
		Rmatch = (d->right != NULL) && (re != NULL) && match(d->right, re, w, rw);
		for (i=0; i<4; i++) {ls[i] = rs[i] = NULL;}
		/* For a matched pair, each connector is either consumed
		   (advance to ->next) or, if it is a multi-connector, may be
		   kept; that gives up to four sub-regions per side. */
		if (Lmatch) {
		    ls[0] = parse_set(ld, d, lw, w, le->next, d->left->next, lcost, pi);
		    if (le->multi) ls[1] = parse_set(ld, d, lw, w, le, d->left->next, lcost, pi);
		    if (d->left->multi) ls[2] = parse_set(ld, d, lw, w, le->next, d->left, lcost, pi);
		    if (le->multi && d->left->multi) ls[3] = parse_set(ld, d, lw, w, le, d->left, lcost, pi);
		}
		if (Rmatch) {
		    rs[0] = parse_set(d, rd, w, rw, d->right->next, re->next, rcost, pi);
		    if (d->right->multi) rs[1] = parse_set(d, rd, w,rw,d->right,re->next, rcost, pi);
		    if (re->multi) rs[2] = parse_set(d, rd, w, rw, d->right->next, re, rcost, pi);
		    if (d->right->multi && re->multi) rs[3] = parse_set(d, rd, w, rw, d->right, re, rcost, pi);
		}

		/* Links used on both sides: pair every left set with every right set. */
		for (i=0; i<4; i++) {
		    /* this ordering is probably not consistent with that needed to use list_links */
		    if (ls[i] == NULL) continue;
		    for (j=0; j<4; j++) {
			if (rs[j] == NULL) continue;
			a_choice = make_choice(ls[i], lw, w, le, d->left,
					       rs[j], w, rw, d->right, re,
					       ld, d, rd);
			put_choice_in_set(xt->set, a_choice);
		    }
		}
		
		if (ls[0] != NULL || ls[1] != NULL || ls[2] != NULL || ls[3] != NULL) {
		    /* evaluate using the left match, but not the right */
		    rset = parse_set(d, rd, w, rw, d->right, re, rcost, pi);
		    if (rset != NULL) {
			for (i=0; i<4; i++) {
			    if (ls[i] == NULL) continue;
			    /* this ordering is probably not consistent with that needed to use list_links */
			    a_choice = make_choice(ls[i], lw, w, le, d->left,
						   rset, w, rw, NULL /* d->right */, re,  /* the NULL indicates no link*/
						   ld, d, rd);
			    put_choice_in_set(xt->set, a_choice);
			}
		    }
		}
		/* The right-only case is tried only when there is no left
		   connector list at all (le == NULL). */
		if ((le == NULL) && (rs[0] != NULL || rs[1] != NULL || rs[2] != NULL || rs[3] != NULL)) {
		    /* evaluate using the right match, but not the left */
		    lset = parse_set(ld, d, lw, w, le, d->left, lcost, pi);

		    if (lset != NULL) {
			for (i=0; i<4; i++) {
			    if (rs[i] == NULL) continue;
			    /* this ordering is probably not consistent with that needed to use list_links */
			    a_choice = make_choice(lset, lw, w, NULL /* le */, d->left,  /* NULL indicates no link */
						   rs[i], w, rw, d->right, re,
						   ld, d, rd);
			    put_choice_in_set(xt->set, a_choice);
			}
		    }
		}
	    }
	}
	put_match_list(m1);
    }
    /* Reset the set's cursor to its first choice before handing it back. */
    xt->set->current = xt->set->first;
    return xt->set;
}
Ejemplo n.º 5
0
/**
 * Decide whether the region between words lw and rw, with connector
 * lists le (on lw) and re (on rw), can be successfully filled in with
 * links.
 *
 * Returns 0 if it cannot.  Returns 1 if it can and it's not been
 * marked, and returns 2 if it can and it has been marked.  A value
 * already present in the table (>= 0) is returned as is; otherwise the
 * answer computed here (0 or 1) is stored in the table before
 * returning.
 */
static int region_valid(Sentence sent, int lw, int rw, Connector *le, Connector *re)
{
	count_context_t *ctxt = sent->count_ctxt;
	Match_node *head, *node;
	Disjunct *dj;
	int first_w, last_w, word;
	int l_ok, r_ok;
	int valid = 0;
	int cached;

	/* Memoised answer, if any. */
	cached = table_lookup(sent, lw, rw, le, re, 0);
	if (cached >= 0) return cached;

	/* With no connectors on either side, the region is fine whenever
	 * the deletable table says so for (lw, rw). */
	if (le == NULL && re == NULL && ctxt->deletable[lw][rw]) {
		table_store(ctxt, lw, rw, le, re, 0, 1);
		return 1;
	}

	/* Candidate split words run over [first_w, last_w]. */
	first_w = lw+1;
	if (le != NULL) first_w = le->word;
	last_w = rw-1;
	if (re != NULL) last_w = re->word;

	for (word = first_w; word <= last_w; word++) {
		head = form_match_list(sent, word, le, lw, re, rw);
		for (node = head; node != NULL; node = node->next) {
			dj = node->d;

			/* Left link: le must match dj->left, and one of the
			 * resulting left sub-regions (connector consumed, or kept
			 * when it is a multi-connector) must be fillable.  Any
			 * non-zero result of region_valid() counts as true. */
			l_ok = 0;
			if ((le != NULL) && (dj->left != NULL) &&
			    x_prune_match(ctxt, le, dj->left, lw, word)) {
				l_ok = region_valid(sent, lw, word, le->next, dj->left->next) ||
				       (le->multi && region_valid(sent, lw, word, le, dj->left->next)) ||
				       (dj->left->multi && region_valid(sent, lw, word, le->next, dj->left)) ||
				       (le->multi && dj->left->multi && region_valid(sent, lw, word, le, dj->left));
			}
			/* Left link plus an unlinked but fillable right side. */
			if (l_ok && region_valid(sent, word, rw, dj->right, re)) {
				valid = 1;
				break;
			}

			/* Right link: the mirror-image test for dj->right and re. */
			r_ok = 0;
			if ((dj->right != NULL) && (re != NULL) &&
			    x_prune_match(ctxt, dj->right, re, word, rw)) {
				r_ok = region_valid(sent, word, rw, dj->right->next, re->next) ||
				       (dj->right->multi && region_valid(sent, word, rw, dj->right, re->next)) ||
				       (re->multi && region_valid(sent, word, rw, dj->right->next, re)) ||
				       (dj->right->multi && re->multi && region_valid(sent, word, rw, dj->right, re));
			}
			/* Right link together with either a left link or an
			 * unlinked but fillable left side. */
			if (r_ok && (l_ok || region_valid(sent, lw, word, le, dj->left))) {
				valid = 1;
				break;
			}
		}
		put_match_list(sent, head);
		if (valid) break;
	}

	table_store(ctxt, lw, rw, le, re, 0, valid);
	return valid;
}
Ejemplo n.º 6
0
/**
 * Count the number of ways the region between words lw and rw can be
 * linked up, given connector lists le (on lw) and re (on rw), using a
 * total cost of exactly `cost` inside the region.
 *
 * Results are memoised in the count table: an existing entry is
 * returned immediately, otherwise an entry is created up front and its
 * count is filled in before returning.  Returns 0 when cost is
 * negative.
 */
static s64 do_count(Sentence sent, int lw, int rw,
                    Connector *le, Connector *re, int cost)
{
	Disjunct * d;
	s64 total, pseudototal;
	int start_word, end_word, w;
	s64 leftcount, rightcount;
	int lcost, rcost, Lmatch, Rmatch;

	Match_node * m, *m1;
	Table_connector *t;

	count_context_t *ctxt = sent->count_ctxt;

	if (cost < 0) return 0;  /* will we ever call it with cost<0 ? */

	t = find_table_pointer(ctxt, lw, rw, le, re, cost);

	if (t == NULL) {
		/* Create the table entry with a tentative count of 0
		 * (presumably the final argument).
		 * This count must be updated before we return. */
		t = table_store(ctxt, lw, rw, le, re, cost, 0);
	} else {
		return t->count;
	}

	if (rw == 1+lw)
	{
		/* lw and rw are neighboring words */
		/* You can't have a linkage here with cost > 0 */
		if ((le == NULL) && (re == NULL) && (cost == 0))
		{
			t->count = 1;
		}
		else
		{
			t->count = 0;
		}
		return t->count;
	}

	/* No connectors on either side, and the words are not adjacent. */
	if ((le == NULL) && (re == NULL))
	{
		if (!ctxt->islands_ok && (lw != -1))
		{
			/* If we don't allow islands (a set of words linked together
			 * but separate from the rest of the sentence) then the cost
			 * of skipping n words is just n */
			/* The right-hand side is ceil((rw-lw-1) / null_block). */
			if (cost == ((rw-lw-1) + ctxt->null_block-1)/ctxt->null_block)
			{
				/* If null_block=4 then the cost of
				   1,2,3,4 nulls is 1; and 5,6,7,8 is 2 etc. */
				t->count = 1;
			}
			else
			{
				t->count = 0;
			}
			return t->count;
		}
		if (cost == 0)
		{
			/* There is no zero-cost solution in this case. There is
			 * a slight efficiency hack to separate this cost=0 case
			 * out, but not necessary for correctness */
			t->count = 0;
		}
		else
		{
			/* Spend one unit of cost at word lw+1 and count the rest. */
			total = 0;
			w = lw+1;
			for (d = ctxt->local_sent[w].d; d != NULL; d = d->next)
			{
				/* Only disjuncts with no left connectors can start here. */
				if (d->left == NULL)
				{
					total += do_count(sent, w, rw, d->right, NULL, cost-1);
				}
			}
			total += do_count(sent, w, rw, NULL, NULL, cost-1);
			t->count = total;
		}
		return t->count;
	}

	/* Candidate split words w run over [start_word, end_word]. */
	if (le == NULL)
	{
		start_word = lw+1;
	}
	else
	{
		start_word = le->word;
	}

	if (re == NULL)
	{
		end_word = rw-1;
	}
	else
	{
		end_word = re->word;
	}

	total = 0;

	for (w = start_word; w < end_word+1; w++)
	{
		/* m1 keeps the list head so it can be released after the scan. */
		m1 = m = form_match_list(sent, w, le, lw, re, rw);
		for (; m!=NULL; m=m->next)
		{
			d = m->d;
			for (lcost = 0; lcost <= cost; lcost++)
			{
				rcost = cost-lcost;
				/* Now lcost and rcost are the costs we're assigning
				 * to those parts respectively */

				/* Now, we determine if (based on table only) we can see that
				   the current range is not parsable. */
				Lmatch = (le != NULL) && (d->left != NULL) && 
				         do_match(sent, le, d->left, lw, w);
				Rmatch = (d->right != NULL) && (re != NULL) && 
				         do_match(sent, d->right, re, w, rw);

				/* First pass uses pseudocount() only; the real
				 * do_count() recursion below runs only if this pass
				 * yields a non-zero pseudototal. */
				rightcount = leftcount = 0;
				if (Lmatch)
				{
					leftcount = pseudocount(sent, lw, w, le->next, d->left->next, lcost);
					if (le->multi) leftcount += pseudocount(sent, lw, w, le, d->left->next, lcost);
					if (d->left->multi) leftcount += pseudocount(sent, lw, w, le->next, d->left, lcost);
					if (le->multi && d->left->multi) leftcount += pseudocount(sent, lw, w, le, d->left, lcost);
				}

				if (Rmatch)
				{
					rightcount = pseudocount(sent, w, rw, d->right->next, re->next, rcost);
					if (d->right->multi) rightcount += pseudocount(sent, w,rw,d->right,re->next, rcost);
					if (re->multi) rightcount += pseudocount(sent, w, rw, d->right->next, re, rcost);
					if (d->right->multi && re->multi) rightcount += pseudocount(sent, w, rw, d->right, re, rcost);
				}

				/* total number where links are used on both sides */
				pseudototal = leftcount*rightcount;

				if (leftcount > 0) {
					/* evaluate using the left match, but not the right */
					pseudototal += leftcount * pseudocount(sent, w, rw, d->right, re, rcost);
				}
				if ((le == NULL) && (rightcount > 0)) {
					/* evaluate using the right match, but not the left */
					pseudototal += rightcount * pseudocount(sent, lw, w, le, d->left, lcost);
				}

				/* now pseudototal is 0 implies that we know that the true total is 0 */
				if (pseudototal != 0) {
					/* Second pass: the same structure with real counts. */
					rightcount = leftcount = 0;
					if (Lmatch) {
						leftcount = do_count(sent, lw, w, le->next, d->left->next, lcost);
						if (le->multi) leftcount += do_count(sent, lw, w, le, d->left->next, lcost);
						if (d->left->multi) leftcount += do_count(sent, lw, w, le->next, d->left, lcost);
						if (le->multi && d->left->multi) leftcount += do_count(sent, lw, w, le, d->left, lcost);
					}

					if (Rmatch) {
						rightcount = do_count(sent, w, rw, d->right->next, re->next, rcost);
						if (d->right->multi) rightcount += do_count(sent, w,rw,d->right,re->next, rcost);
						if (re->multi) rightcount += do_count(sent, w, rw, d->right->next, re, rcost);
						if (d->right->multi && re->multi) rightcount += do_count(sent, w, rw, d->right, re, rcost);
					}

					total += leftcount*rightcount;  /* total number where links are used on both sides */

					if (leftcount > 0) {
						/* evaluate using the left match, but not the right */
						total += leftcount * do_count(sent, w, rw, d->right, re, rcost);
					}
					if ((le == NULL) && (rightcount > 0)) {
						/* evaluate using the right match, but not the left */
						total += rightcount * do_count(sent, lw, w, le, d->left, lcost);
					}
				}
			}
		}

		put_match_list(sent, m1);
	}
	/* Replace the tentative table value with the real total. */
	t->count = total;
	return total;
}
Ejemplo n.º 7
0
/**
 * Count the ways the region between words lw and rw can be linked up,
 * given connector lists le (on lw) and re (on rw), using exactly
 * null_count null links inside the region.
 *
 * Results are memoised in the count table: an existing entry is
 * returned immediately, otherwise an entry is created up front and its
 * count is filled in before returning.  Counts are carried as
 * Count_bin values and combined with the hist_*() helpers; disjunct
 * costs (d->cost) are passed to those helpers when accumulating.
 * If the running total exceeds INT_MAX it is clamped to INT_MAX and
 * returned early.
 */
static Count_bin do_count(fast_matcher_t *mchxt,
                          count_context_t *ctxt,
                          int lw, int rw,
                          Connector *le, Connector *re,
                          int null_count)
{
	Count_bin zero = hist_zero();
	Count_bin total;
	int start_word, end_word, w;
	Table_connector *t;

	assert (0 <= null_count, "Bad null count");

	t = find_table_pointer(ctxt, lw, rw, le, re, null_count);

	if (t) return t->count;

	/* Create the table entry now; its count is tentative and
	 * must be updated before we return. */
	t = table_store(ctxt, lw, rw, le, re, null_count);

	if (rw == 1+lw)
	{
		/* lw and rw are neighboring words */
		/* You can't have a linkage here with null_count > 0 */
		if ((le == NULL) && (re == NULL) && (null_count == 0))
		{
			t->count = hist_one();
		}
		else
		{
			t->count = zero;
		}
		return t->count;
	}

	/* The left and right connectors are null, but the two words are
	 * NOT next to each-other. */
	if ((le == NULL) && (re == NULL))
	{
		if (!ctxt->islands_ok && (lw != -1))
		{
			/* If we don't allow islands (a set of words linked together
			 * but separate from the rest of the sentence) then the
			 * null_count of skipping n words is just n. */
			if (null_count == (rw-lw-1))
			{
				t->count = hist_one();
			}
			else
			{
				t->count = zero;
			}
			return t->count;
		}
		if (null_count == 0)
		{
			/* There is no solution without nulls in this case. There is
			 * a slight efficiency hack to separate this null_count==0
			 * case out, but not necessary for correctness */
			t->count = zero;
		}
		else
		{
			/* Consume one null link at word lw+1 and count the rest.
			 * Note: this w shadows the function-level w. */
			t->count = zero;
			Disjunct * d;
			int w = lw + 1;
			for (d = ctxt->local_sent[w].d; d != NULL; d = d->next)
			{
				/* Only disjuncts with no left connectors can start here. */
				if (d->left == NULL)
				{
					hist_accumv(&t->count, d->cost,
						do_count(mchxt, ctxt, w, rw, d->right, NULL, null_count-1));
				}
			}
			hist_accumv(&t->count, 0.0,
				do_count(mchxt, ctxt, w, rw, NULL, NULL, null_count-1));
		}
		return t->count;
	}

	/* Candidate split words w run over [start_word, end_word). */
	if (le == NULL)
	{
		start_word = lw+1;
	}
	else
	{
		start_word = le->word;
	}

	if (re == NULL)
	{
		end_word = rw;
	}
	else
	{
		end_word = re->word +1;
	}

	total = zero;

	for (w = start_word; w < end_word; w++)
	{
		Match_node *m, *m1;
		/* m1 keeps the list head so it can be released after the scan. */
		m1 = m = form_match_list(mchxt, w, le, lw, re, rw);
		for (; m != NULL; m = m->next)
		{
			unsigned int lnull_cnt, rnull_cnt;
			Disjunct * d = m->d;
			/* _p1 avoids a gcc warning about unsafe loop opt */
			unsigned int null_count_p1 = null_count + 1;

			for (lnull_cnt = 0; lnull_cnt < null_count_p1; lnull_cnt++)
			{
				bool Lmatch, Rmatch;
				bool leftpcount = false;
				bool rightpcount = false;
				bool pseudototal = false;

				rnull_cnt = null_count - lnull_cnt;
				/* Now lnull_cnt and rnull_cnt are the costs we're assigning
				 * to those parts respectively */

				/* Now, we determine if (based on table only) we can see that
				   the current range is not parsable. */
				Lmatch = (le != NULL) && (d->left != NULL) &&
				         do_match(le, d->left, lw, w);
				Rmatch = (d->right != NULL) && (re != NULL) &&
				         do_match(d->right, re, w, rw);

				/* First, perform pseudocounting as an optimization. If
				 * the pseudocount is zero, then we know that the true
				 * count will be zero, and so skip counting entirely,
				 * in that case.
				 */
				/* Each chain stops at the first pseudocount() that
				 * comes back true; only "zero or not" matters here. */
				if (Lmatch)
				{
					leftpcount = pseudocount(ctxt, lw, w, le->next, d->left->next, lnull_cnt);
					if (!leftpcount && le->multi)
						leftpcount =
							pseudocount(ctxt, lw, w, le, d->left->next, lnull_cnt);
					if (!leftpcount && d->left->multi)
						leftpcount =
							pseudocount(ctxt, lw, w, le->next, d->left, lnull_cnt);
					if (!leftpcount && le->multi && d->left->multi)
						leftpcount =
							pseudocount(ctxt, lw, w, le, d->left, lnull_cnt);
				}

				if (Rmatch)
				{
					rightpcount = pseudocount(ctxt, w, rw, d->right->next, re->next, rnull_cnt);
					if (!rightpcount && d->right->multi)
						rightpcount =
							pseudocount(ctxt, w,rw, d->right, re->next, rnull_cnt);
					if (!rightpcount && re->multi)
						rightpcount =
							pseudocount(ctxt, w, rw, d->right->next, re, rnull_cnt);
					if (!rightpcount && d->right->multi && re->multi)
						rightpcount =
							pseudocount(ctxt, w, rw, d->right, re, rnull_cnt);
				}

				/* Total number where links are used on both sides */
				pseudototal = leftpcount && rightpcount;

				if (!pseudototal && leftpcount) {
					/* Evaluate using the left match, but not the right. */
					pseudototal =
						pseudocount(ctxt, w, rw, d->right, re, rnull_cnt);
				}
				if (!pseudototal && (le == NULL) && rightpcount) {
					/* Evaluate using the right match, but not the left. */
					pseudototal =
						pseudocount(ctxt, lw, w, le, d->left, lnull_cnt);
				}

				/* If pseudototal is zero (false), that implies that
				 * we know that the true total is zero. So we don't
				 * bother counting at all, in that case. */
				if (pseudototal)
				{
					/* Second pass: the same structure with real counts. */
					Count_bin leftcount = zero;
					Count_bin rightcount = zero;
					if (Lmatch) {
						leftcount = do_count(mchxt, ctxt, lw, w, le->next, d->left->next, lnull_cnt);
						if (le->multi)
							hist_accumv(&leftcount, d->cost,
								do_count(mchxt, ctxt, lw, w, le, d->left->next, lnull_cnt));
						if (d->left->multi)
							hist_accumv(&leftcount, d->cost,
								 do_count(mchxt, ctxt, lw, w, le->next, d->left, lnull_cnt));
						if (le->multi && d->left->multi)
							hist_accumv(&leftcount, d->cost,
								do_count(mchxt, ctxt, lw, w, le, d->left, lnull_cnt));
					}

					if (Rmatch) {
						rightcount = do_count(mchxt, ctxt, w, rw, d->right->next, re->next, rnull_cnt);
						if (d->right->multi)
							hist_accumv(&rightcount, d->cost,
								do_count(mchxt, ctxt, w, rw, d->right,re->next, rnull_cnt));
						if (re->multi)
							hist_accumv(&rightcount, d->cost,
								do_count(mchxt, ctxt, w, rw, d->right->next, re, rnull_cnt));
						if (d->right->multi && re->multi)
							hist_accumv(&rightcount, d->cost,
								do_count(mchxt, ctxt, w, rw, d->right, re, rnull_cnt));
					}

					/* Total number where links are used on both sides */
					hist_muladd(&total, &leftcount, 0.0, &rightcount);

					if (0 < hist_total(&leftcount))
					{
						/* Evaluate using the left match, but not the right */
						hist_muladdv(&total, &leftcount, d->cost,
							do_count(mchxt, ctxt, w, rw, d->right, re, rnull_cnt));
					}
					if ((le == NULL) && (0 < hist_total(&rightcount)))
					{
						/* Evaluate using the right match, but not the left */
						hist_muladdv(&total, &rightcount, d->cost,
							do_count(mchxt, ctxt, lw, w, le, d->left, lnull_cnt));
					}

					/* Sigh. Overflows can and do occur, esp for the ANY language. */
					/* Clamp to INT_MAX, record it in the table, release
					 * the match list and stop early. */
					if (INT_MAX < hist_total(&total))
					{
#ifdef PERFORM_COUNT_HISTOGRAMMING
						total.total = INT_MAX;
#else
						total = INT_MAX;
#endif /* PERFORM_COUNT_HISTOGRAMMING */
						t->count = total;
						put_match_list(mchxt, m1);
						return total;
					}
				}
			}
		}
		put_match_list(mchxt, m1);
	}
	/* Replace the tentative table value with the real total. */
	t->count = total;
	return total;
}
Ejemplo n.º 8
0
/* Builds and returns a list of disjuncts on word w that might match lc,
   rc, or both.  lw and rw are the words from which lc and rc came.

   The result is chained through the `next` pointers of freshly obtained
   Match_nodes and holds each disjunct at most once.  Duplicates are
   weeded out with a quadratic scan; per the original author's note the
   match_cost is in practice smaller than the parse_cost (and the loop is
   tiny), so nothing fancier was deemed necessary.
*/
Match_node * form_match_list
      (int w, Connector *lc, int lw, Connector *rc, int rw) {
    /* Heads of the hash buckets selected by each connector (NULL when the
       corresponding connector is absent). */
    Match_node *left_src = (lc != NULL)
	? l_table[w][fast_match_hash(lc) & (l_table_size[w]-1)] : NULL;
    Match_node *right_src = (rc != NULL)
	? r_table[w][fast_match_hash(rc) & (r_table_size[w]-1)] : NULL;
    Match_node *left_list = NULL;   /* copies of left candidates */
    Match_node *right_list = NULL;  /* copies of right candidates */
    Match_node *kept = NULL;        /* right candidates not in left_list */
    Match_node *discard = NULL;     /* right candidates already in left_list */
    Match_node *node, *probe, *tail;

    /* Copy left-bucket entries, stopping at the first one whose left
       connector's word is below lw.  Copies are pushed on the front. */
    while (left_src != NULL && left_src->d->left->word >= lw) {
	node = get_match_node();
	node->d = left_src->d;
	node->next = left_list;
	left_list = node;
	left_src = left_src->next;
    }

    /* Same for the right bucket, stopping at the first entry whose right
       connector's word is above rw. */
    while (right_src != NULL && right_src->d->right->word <= rw) {
	node = get_match_node();
	node->d = right_src->d;
	node->next = right_list;
	right_list = node;
	right_src = right_src->next;
    }

    /* Split the right list: nodes whose disjunct also appears in the left
       list are set aside for release, the others are kept. */
    while (right_list != NULL) {
	node = right_list;
	right_list = node->next;  /* detach before relinking node */
	match_cost++;
	probe = left_list;
	while (probe != NULL) {
	    match_cost++;
	    if (probe->d == node->d) break;
	    probe = probe->next;
	}
	if (probe != NULL) {
	    /* duplicate of something in the left list */
	    node->next = discard;
	    discard = node;
	} else {
	    /* unique to the right list */
	    node->next = kept;
	    kept = node;
	}
    }
    put_match_list(discard);

    /* Append the left list after the surviving right entries. */
    if (kept == NULL) return left_list;
    tail = kept;
    while (tail->next != NULL) {
	tail = tail->next;
    }
    tail->next = left_list;
    return kept;
}
Ejemplo n.º 9
0
/**
 * Forms and returns a list of disjuncts coming from word w, that might
 * match lc or rc or both. The lw and rw are the words from which lc
 * and rc came respectively.
 *
 * The list is returned as a linked list of Match_nodes and contains no
 * duplicates.  For short lists a quadratic algorithm is used to
 * eliminate duplicates; in practice the match_cost is less than the
 * parse_cost (and the loop is tiny).  The number of times through that
 * loop is counted in ctxt->match_cost; per the original note, it is
 * printed at the end when verbosity > 1.
 *
 * The exception: for long sentences that have parse overflows, the
 * match lists can be hundreds of elements long, dominating the total
 * time spent in the algorithm (in excess of 50% of the time).  For
 * that case a sort-and-merge pass is used instead.
 *
 * @param ctxt  Matcher context holding the per-word hash tables and the
 *              match_cost counter.
 * @param w     Index of the word whose disjuncts are examined.
 * @param lc    Connector to match on the left, or NULL.
 * @param lw    Word that lc came from.
 * @param rc    Connector to match on the right, or NULL.
 * @param rw    Word that rc came from.
 * @return Head of the combined match list (right-only entries first,
 *         then the left entries), or NULL if there are no candidates.
 */
Match_node *
form_match_list(fast_matcher_t *ctxt, int w,
                Connector *lc, int lw,
                Connector *rc, int rw)
{
	size_t rlen = 0, llen = 0;
	Match_node *ml, *mr, *mx, *my, *mz, *front, *free_later;

	if (lc != NULL) {
		ml = ctxt->l_table[w][connector_hash(lc) & (ctxt->l_table_size[w]-1)];
	} else {
		ml = NULL;
	}
	if (rc != NULL) {
		mr = ctxt->r_table[w][connector_hash(rc) & (ctxt->r_table_size[w]-1)];
	} else {
		mr = NULL;
	}

	/* Copy the left bucket, stopping at the first entry whose left
	 * connector's word is below lw. */
	front = NULL;
	for (mx = ml; mx != NULL; mx = mx->next)
	{
		if (mx->d->left->word < lw) break;
		my = get_match_node(ctxt);
		my->d = mx->d;
		my->next = front;
		front = my;
		llen++;
	}
	ml = front;   /* ml is now the list of things that could match the left */

	/* Copy the right bucket, stopping at the first entry whose right
	 * connector's word is above rw. */
	front = NULL;
	for (mx = mr; mx != NULL; mx = mx->next)
	{
		if (mx->d->right->word > rw) break;
		my = get_match_node(ctxt);
		my->d = mx->d;
		my->next = front;
		front = my;
		rlen++;
	}
	mr = front;   /* mr is now the list of things that could match the right */

	/* With one side empty there is nothing to de-duplicate.
	 * Past this point both ml and mr are non-NULL. */
	if (mr == NULL) return ml;
	if (ml == NULL) return mr;

	/* Now we want to eliminate duplicates from the lists: every right
	 * node whose disjunct also occurs in the left list is moved to
	 * free_later; the others are collected on front.
	 *
	 * If the left list is reasonably short, then just do a quadratic
	 * search for duplicates. But if the list is long, optimize the
	 * search.  Based on quickie measurements, the optimized version
	 * seems to dominate when 250 < llen and 8 < rlen. Roughly.
	 */
	free_later = NULL;
	front = NULL;
	if (llen < 250 || rlen < 9)
	{
		/* Perform a simple quadratic-time search, viz. two nested loops.
		 * Runtime blows up horribly for lengths over a few hundred. */
		for (mx = mr; mx != NULL; mx = mz)
		{
			/* See if mx is in the left list; keep it if it is not. */
			mz = mx->next;
			ctxt->match_cost++;
			for (my = ml; my != NULL; my = my->next)
			{
				ctxt->match_cost++;
				if (mx->d == my->d) break;
			}
			if (my != NULL) { /* mx was in the left list */
				mx->next = free_later;
				free_later = mx;
			} else {  /* It was not there. */
				mx->next = front;
				front = mx;
			}
		}
	}
	else
	{
		/* Perform an O(N log N) search, by sorting first, and then
		 * doing a linear run through the sorted arrays.
		 * NOTE(review): this relies on addr_compare ordering the nodes
		 * by ascending ->d address -- confirm against its definition.
		 */
		size_t i, j;
		Match_node **mra = alloca(rlen * sizeof(Match_node *));
		Match_node **mla = alloca(llen * sizeof(Match_node *));

		i = 0;
		for (mx = mr; mx != NULL; mx = mx->next) mra[i++] = mx;
		qsort(mra, rlen, sizeof(Match_node *), addr_compare);

		i = 0;
		for (mx = ml; mx != NULL; mx = mx->next) mla[i++] = mx;
		qsort(mla, llen, sizeof(Match_node *), addr_compare);

		/* Walk both arrays side by side.  Every access to mla[j] is
		 * guarded by j < llen, so the left array is never overrun;
		 * once it is exhausted, the remaining right entries are all
		 * kept. */
		j = 0;
		for (i = 0; i < rlen; i++)
		{
			/* Skip left entries that sort below the current right one. */
			while (j < llen && mla[j]->d < mra[i]->d)
				j++;

			if (j < llen && mla[j]->d == mra[i]->d)
			{
				/* Duplicate of a left entry: release it later. */
				mra[i]->next = free_later;
				free_later = mra[i];
				j++;
			}
			else
			{
				/* Only on the right: keep it. */
				mra[i]->next = front;
				front = mra[i];
			}
		}
	}
	mr = front;  /* mr is now the abbreviated right list */
	put_match_list(ctxt, free_later);

	/* Now catenate the two lists: right-only entries, then the left list.
	 * ml is known to be non-NULL here; mr may have become empty if every
	 * right entry was a duplicate. */
	if (mr == NULL) return ml;
	for (mx = mr; mx->next != NULL; mx = mx->next)
	  ;
	mx->next = ml;
	return mr;
}