/**
 * pseudocount() -- cheap, table-only test of whether a parse might exist.
 *
 * Looks up the memoized count for (lw, rw, le, re, null_count) without
 * doing any counting work, so that callers can skip an expensive count
 * whose result is already known to be zero.
 *
 * Returns false if and only if an entry is present in the table and its
 * total is zero.  When there is no entry, nothing is known, so the count
 * has to be assumed possibly non-zero and true is returned.  An entry
 * with a non-zero total also yields true.
 */
static bool pseudocount(count_context_t * ctxt,
                        int lw, int rw, Connector *le, Connector *re,
                        unsigned int null_count)
{
	Count_bin * entry = table_lookup(ctxt, lw, rw, le, re, null_count);

	/* A missing entry means "unknown", which must be treated as "maybe". */
	return (NULL == entry) || (0 != hist_total(entry));
}
/** * returns NULL if there are no ways to parse, or returns a pointer * to a set structure representing all the ways to parse. * * This code is similar to do_count() in count.c -- for a good reason: * the do_count() function did a full parse, but didn't actually * allocate an memory structures to hold the parse. This also does * a full parse, but it also allocates and fills out the various * parse structures. */ static Parse_set * mk_parse_set(Sentence sent, fast_matcher_t *mchxt, count_context_t * ctxt, Disjunct *ld, Disjunct *rd, int lw, int rw, Connector *le, Connector *re, unsigned int null_count, bool islands_ok, Parse_info pi) { int start_word, end_word, w; X_table_connector *xt; Count_bin * count; assert(null_count < 0x7fff, "mk_parse_set() called with null_count < 0."); count = table_lookup(ctxt, lw, rw, le, re, null_count); /* If there's no counter, then there's no way to parse. */ if (NULL == count) return NULL; if (hist_total(count) == 0) return NULL; xt = x_table_pointer(lw, rw, le, re, null_count, pi); /* Perhaps we've already computed it; if so, return it. */ if (xt != NULL) return &xt->set; /* Start it out with the empty set of parse chocies. */ /* This entry must be updated before we return. */ xt = x_table_store(lw, rw, le, re, null_count, pi); /* The count we previously computed; its non-zero. */ xt->set.count = hist_total(count); #define NUM_PARSES 4 // xt->set.cost_cutoff = hist_cost_cutoff(count, NUM_PARSES); // xt->set.cut_count = hist_cut_total(count, NUM_PARSES); #define RECOUNT(X) /* Make it disappear... */ RECOUNT({xt->set.recount = 1;}) /* If the two words are next to each other, the count == 1 */ if (lw + 1 == rw) return &xt->set;
/**
 * chart_parse() -- parse the sentence, trying successive null counts.
 *
 * Despite the name, this has nothing to do with chart parsing.
 *
 * After preparing the sentence, a fast matcher and a count context are
 * allocated and do_parse() is invoked with null counts starting at
 * opts->min_null_count.  For each null count the linkages are selected,
 * checked and post-processed.  The loop ends when:
 *   - the resources are reported as exhausted, or
 *   - at least one valid linkage was found, or
 *   - the (clamped) parse count exceeds PARSE_NUM_OVERFLOW, or
 *   - the null count has reached opts->max_null_count.
 *
 * The matcher and the count context are released before returning on
 * every path that allocated them.
 *
 * @param sent  The sentence to parse; its null_count, num_linkages_found
 *              and parse_info fields are overwritten here.
 * @param opts  Parse options supplying the null-count range, the
 *              verbosity and the resource limits.
 */
static void chart_parse(Sentence sent, Parse_Options opts)
{
	int nl;
	fast_matcher_t * mchxt;
	count_context_t * ctxt;

	/* Build lists of disjuncts */
	prepare_to_parse(sent, opts);
	if (resources_exhausted(opts->resources)) return;

	mchxt = alloc_fast_matcher(sent);
	ctxt = alloc_count_context(sent->length);
	print_time(opts, "Initialized fast matcher");
	if (resources_exhausted(opts->resources))
	{
		free_count_context(ctxt);
		free_fast_matcher(mchxt);
		return;
	}

	/* Drop whatever parse info is currently attached to the sentence
	 * (e.g. from an earlier parse) and start from a fresh one. */
	free_parse_info(sent->parse_info);
	sent->parse_info = parse_info_new(sent->length);

	nl = opts->min_null_count;
	while (true)
	{
		Count_bin hist;
		s64 total;

		if (resources_exhausted(opts->resources)) break;

		sent->null_count = nl;
		hist = do_parse(sent, mchxt, ctxt, sent->null_count, opts);
		total = hist_total(&hist);

		if (opts->verbosity > 1)
		{
			prt_error("Info: Total count with %zu null links: %lld\n",
			          sent->null_count, total);
		}

		/* total is 64-bit, num_linkages_found is 32-bit. Clamp.
		 * A negative total is treated as an overflow as well. */
		total = (total > INT_MAX) ? INT_MAX : total;
		total = (total < 0) ? INT_MAX : total;

		sent->num_linkages_found = (int) total;
		print_time(opts, "Counted parses");

		select_linkages(sent, mchxt, ctxt, opts);
		compute_chosen_disjuncts(sent);
		sane_morphism(sent, opts);
		post_process_linkages(sent, opts);
		if (sent->num_valid_linkages > 0) break;

		/* If we are here, then no valid linkages were found.
		 * If there was a parse overflow, give up now. */
		if (PARSE_NUM_OVERFLOW < total) break;

		/* Loop termination.  Use >= rather than ==, so that a range
		 * with min_null_count > max_null_count stops after the first
		 * attempt instead of incrementing nl without bound. */
		if (nl >= opts->max_null_count) break;

		/* If we are here, we are going round again. Free stuff. */
		free_linkages(sent);
		nl++;
	}
	sort_linkages(sent, opts);

	free_count_context(ctxt);
	free_fast_matcher(mchxt);
}
/**
 * do_count() -- count the linkages of the region between words lw and rw.
 *
 * le is the connector list pending on the lw side and re the one pending
 * on the rw side; null_count is the number of null links the region has
 * to use.  The result is memoized in the count table: if an entry for
 * (lw, rw, le, re, null_count) already exists, its count is returned
 * without any further work.  Otherwise a new entry is stored first and
 * its count is filled in before returning.
 *
 * The function recurses on the sub-regions (lw, w) and (w, rw) for every
 * candidate middle word w and every split of null_count between the two
 * sides.  Before recursing, pseudocount() is consulted so that splits
 * already known (from the table) to have a zero count are skipped.
 *
 * If the running total exceeds INT_MAX it is clamped to INT_MAX, stored
 * in the table, and returned immediately.
 *
 * @param mchxt       Fast matcher used to obtain the match list per word.
 * @param ctxt        Count context holding the memoization table, the
 *                    islands_ok flag and the per-word disjunct lists.
 * @param lw          Index of the left boundary word.
 * @param rw          Index of the right boundary word.
 * @param le          Pending connector list on the left side, or NULL.
 * @param re          Pending connector list on the right side, or NULL.
 * @param null_count  Number of null links to use; asserted to be >= 0.
 * @return The count for this region, as a Count_bin.
 */
static Count_bin do_count(fast_matcher_t *mchxt,
                          count_context_t *ctxt,
                          int lw, int rw,
                          Connector *le, Connector *re,
                          int null_count)
{
	Count_bin zero = hist_zero();
	Count_bin total;
	int start_word, end_word, w;
	Table_connector *t;

	assert (0 <= null_count, "Bad null count");

	/* Memoization: reuse the count if this case was already handled. */
	t = find_table_pointer(ctxt, lw, rw, le, re, null_count);

	if (t) return t->count;

	/* Create the table entry now, before recursing.
	 * Its count must be updated before we return. */
	t = table_store(ctxt, lw, rw, le, re, null_count);

	if (rw == 1+lw)
	{
		/* lw and rw are neighboring words */
		/* You can't have a linkage here with null_count > 0 */
		if ((le == NULL) && (re == NULL) && (null_count == 0))
		{
			t->count = hist_one();
		}
		else
		{
			t->count = zero;
		}
		return t->count;
	}

	/* The left and right connectors are null, but the two words are
	 * NOT next to each-other. */
	if ((le == NULL) && (re == NULL))
	{
		/* NOTE(review): lw == -1 is excluded from the no-islands
		 * shortcut; presumably it marks a position before the first
		 * word -- confirm against the caller. */
		if (!ctxt->islands_ok && (lw != -1))
		{
			/* If we don't allow islands (a set of words linked together
			 * but separate from the rest of the sentence) then the
			 * null_count of skipping n words is just n. */
			if (null_count == (rw-lw-1))
			{
				t->count = hist_one();
			}
			else
			{
				t->count = zero;
			}
			return t->count;
		}
		if (null_count == 0)
		{
			/* There is no solution without nulls in this case. There is
			 * a slight efficiency hack to separate this null_count==0
			 * case out, but not necessary for correctness */
			t->count = zero;
		}
		else
		{
			t->count = zero;
			Disjunct * d;
			int w = lw + 1;
			/* Word w starts an island: try each of its disjuncts that
			 * has no left connectors, spending one null on it. */
			for (d = ctxt->local_sent[w].d; d != NULL; d = d->next)
			{
				if (d->left == NULL)
				{
					hist_accumv(&t->count, d->cost,
						do_count(mchxt, ctxt, w, rw, d->right, NULL, null_count-1));
				}
			}
			/* Also count the case where word w has no connectors
			 * pending at all, again spending one null. */
			hist_accumv(&t->count, 0.0,
				do_count(mchxt, ctxt, w, rw, NULL, NULL, null_count-1));
		}
		return t->count;
	}

	/* Range of candidate middle words: bounded by the word fields of
	 * the pending connectors when those are present. */
	if (le == NULL)
	{
		start_word = lw+1;
	}
	else
	{
		start_word = le->word;
	}

	if (re == NULL)
	{
		end_word = rw;
	}
	else
	{
		end_word = re->word +1;
	}

	total = zero;

	for (w = start_word; w < end_word; w++)
	{
		Match_node *m, *m1;
		/* m1 keeps the head of the list so it can be released later. */
		m1 = m = form_match_list(mchxt, w, le, lw, re, rw);
		for (; m != NULL; m = m->next)
		{
			unsigned int lnull_cnt, rnull_cnt;
			Disjunct * d = m->d;
			/* _p1 avoids a gcc warning about unsafe loop opt */
			unsigned int null_count_p1 = null_count + 1;

			/* Try every split of null_count between the two sides. */
			for (lnull_cnt = 0; lnull_cnt < null_count_p1; lnull_cnt++)
			{
				bool Lmatch, Rmatch;
				bool leftpcount = false;
				bool rightpcount = false;
				bool pseudototal = false;

				rnull_cnt = null_count - lnull_cnt;
				/* Now lnull_cnt and rnull_cnt are the costs we're assigning
				 * to those parts respectively */

				/* Now, we determine if (based on table only) we can see that
				   the current range is not parsable. */
				Lmatch = (le != NULL) && (d->left != NULL) &&
				         do_match(le, d->left, lw, w);
				Rmatch = (d->right != NULL) && (re != NULL) &&
				         do_match(d->right, re, w, rw);

				/* First, perform pseudocounting as an optimization. If
				 * the pseudocount is zero, then we know that the true
				 * count will be zero, and so skip counting entirely,
				 * in that case.
				 */
				if (Lmatch)
				{
					/* The extra lookups cover multi-connectors, which
					 * stay on the list after being matched. */
					leftpcount = pseudocount(ctxt, lw, w, le->next, d->left->next, lnull_cnt);
					if (!leftpcount && le->multi)
						leftpcount =
							pseudocount(ctxt, lw, w, le, d->left->next, lnull_cnt);
					if (!leftpcount && d->left->multi)
						leftpcount =
							pseudocount(ctxt, lw, w, le->next, d->left, lnull_cnt);
					if (!leftpcount && le->multi && d->left->multi)
						leftpcount =
							pseudocount(ctxt, lw, w, le, d->left, lnull_cnt);
				}

				if (Rmatch)
				{
					rightpcount = pseudocount(ctxt, w, rw, d->right->next, re->next, rnull_cnt);
					if (!rightpcount && d->right->multi)
						rightpcount =
							pseudocount(ctxt, w,rw, d->right, re->next, rnull_cnt);
					if (!rightpcount && re->multi)
						rightpcount =
							pseudocount(ctxt, w, rw, d->right->next, re, rnull_cnt);
					if (!rightpcount && d->right->multi && re->multi)
						rightpcount =
							pseudocount(ctxt, w, rw, d->right, re, rnull_cnt);
				}

				/* Total number where links are used on both sides */
				pseudototal = leftpcount && rightpcount;

				if (!pseudototal && leftpcount) {
					/* Evaluate using the left match, but not the right. */
					pseudototal =
						pseudocount(ctxt, w, rw, d->right, re, rnull_cnt);
				}
				if (!pseudototal && (le == NULL) && rightpcount) {
					/* Evaluate using the right match, but not the left. */
					pseudototal =
						pseudocount(ctxt, lw, w, le, d->left, lnull_cnt);
				}

				/* If pseudototal is zero (false), that implies that
				 * we know that the true total is zero. So we don't
				 * bother counting at all, in that case. */
				if (pseudototal)
				{
					Count_bin leftcount = zero;
					Count_bin rightcount = zero;
					if (Lmatch) {
						leftcount = do_count(mchxt, ctxt, lw, w, le->next, d->left->next, lnull_cnt);
						if (le->multi)
							hist_accumv(&leftcount, d->cost,
								do_count(mchxt, ctxt, lw, w, le, d->left->next, lnull_cnt));
						if (d->left->multi)
							hist_accumv(&leftcount, d->cost,
								do_count(mchxt, ctxt, lw, w, le->next, d->left, lnull_cnt));
						if (le->multi && d->left->multi)
							hist_accumv(&leftcount, d->cost,
								do_count(mchxt, ctxt, lw, w, le, d->left, lnull_cnt));
					}

					if (Rmatch) {
						rightcount = do_count(mchxt, ctxt, w, rw, d->right->next, re->next, rnull_cnt);
						if (d->right->multi)
							hist_accumv(&rightcount, d->cost,
								do_count(mchxt, ctxt, w, rw, d->right,re->next, rnull_cnt));
						if (re->multi)
							hist_accumv(&rightcount, d->cost,
								do_count(mchxt, ctxt, w, rw, d->right->next, re, rnull_cnt));
						if (d->right->multi && re->multi)
							hist_accumv(&rightcount, d->cost,
								do_count(mchxt, ctxt, w, rw, d->right, re, rnull_cnt));
					}

					/* Total number where links are used on both sides */
					hist_muladd(&total, &leftcount, 0.0, &rightcount);

					if (0 < hist_total(&leftcount))
					{
						/* Evaluate using the left match, but not the right */
						hist_muladdv(&total, &leftcount, d->cost,
							do_count(mchxt, ctxt, w, rw, d->right, re, rnull_cnt));
					}
					if ((le == NULL) && (0 < hist_total(&rightcount)))
					{
						/* Evaluate using the right match, but not the left */
						hist_muladdv(&total, &rightcount, d->cost,
							do_count(mchxt, ctxt, lw, w, le, d->left, lnull_cnt));
					}

					/* Sigh. Overflows can and do occur, esp for the ANY language. */
					if (INT_MAX < hist_total(&total))
					{
						/* Clamp; Count_bin is either a struct with a
						 * total field or a plain number, depending on
						 * the build configuration. */
#ifdef PERFORM_COUNT_HISTOGRAMMING
						total.total = INT_MAX;
#else
						total = INT_MAX;
#endif /* PERFORM_COUNT_HISTOGRAMMING */
						/* Record the clamped value and release the match
						 * list before the early return. */
						t->count = total;
						put_match_list(mchxt, m1);
						return total;
					}
				}
			}
		}
		put_match_list(mchxt, m1);
	}
	t->count = total;
	return total;
}
/**
 * classic_parse() -- parse the given sentence.
 *
 * Runs the original link-grammar parsing algorithm, looking for a parse
 * with the smallest number of null links in the range given by
 * opts->min_null_count and opts->max_null_count (the upper bound is
 * additionally limited to the sentence length).  do_parse() is called
 * with an increasing null count until valid linkages are found, a parse
 * overflow is detected, or the resources are exhausted.
 *
 * About the disjunct save/restore done here:
 * pp_and_power_prune() is called before do_parse() to remove connectors
 * that cannot possibly connect.  For null_count==0 it prunes more
 * aggressively, in a way that is not appropriate for null_count>0.
 * So when the first attempt is made with null_count==0 and more attempts
 * may follow, a copy of the sentence disjuncts is taken up front.  If no
 * complete parse is found, the copy is put back into the sentence and
 * pp_and_power_prune() is run once more (with opts->min_null_count
 * temporarily set to 1) before continuing with null_count>0.
 */
void classic_parse(Sentence sent, Parse_Options opts)
{
	fast_matcher_t * mchxt = NULL;
	count_context_t * ctxt = NULL;
	bool final_prune_done = false;
	Disjunct **saved_disjuncts = NULL;
	bool starts_at_zero = (0 == opts->min_null_count);
	int max_null_count = MIN((int)sent->length, opts->max_null_count);

	/* Build lists of disjuncts */
	prepare_to_parse(sent, opts);
	if (resources_exhausted(opts->resources)) return;

	if (starts_at_zero && (max_null_count > 0))
	{
		/* A later pass with null_count>0 is possible: keep a copy of
		 * the unpruned disjuncts of every word. */
		saved_disjuncts = alloca(sent->length * sizeof(Disjunct *));
		for (size_t i = 0; i < sent->length; i++)
		{
			saved_disjuncts[i] = disjuncts_dup(sent->word[i].d);
		}
	}

	for (int nl = opts->min_null_count; nl <= max_null_count; nl++)
	{
		Count_bin hist;
		s64 total;

		if (!final_prune_done)
		{
			if (nl != 0)
			{
				final_prune_done = true;

				/* Don't optimize for null_count==0. */
				if (starts_at_zero) opts->min_null_count = 1;

				/* We parsed with null_count==0 before and now move on
				 * to null_count>0: hand the saved disjuncts back to
				 * the sentence, which takes over their ownership. */
				if (saved_disjuncts != NULL)
				{
					free_sentence_disjuncts(sent);
					for (size_t i = 0; i < sent->length; i++)
					{
						sent->word[i].d = saved_disjuncts[i];
					}
					saved_disjuncts = NULL;
				}
			}

			pp_and_power_prune(sent, opts);

			/* Undo the temporary change made above, if any. */
			if (starts_at_zero) opts->min_null_count = 0;
			if (resources_exhausted(opts->resources)) break;

			/* The disjuncts changed: rebuild matcher and count context. */
			free_count_context(ctxt, sent);
			free_fast_matcher(sent, mchxt);
			pack_sentence(sent);
			ctxt = alloc_count_context(sent);
			mchxt = alloc_fast_matcher(sent);
			print_time(opts, "Initialized fast matcher");
		}

		if (resources_exhausted(opts->resources)) break;
		free_linkages(sent);

		sent->null_count = nl;
		hist = do_parse(sent, mchxt, ctxt, sent->null_count, opts);
		total = hist_total(&hist);

		lgdebug(D_PARSE, "Info: Total count with %zu null links: %lld\n",
		        sent->null_count, total);

		/* total is 64-bit, num_linkages_found is 32-bit: clamp it.
		 * A negative total is clamped to INT_MAX too. */
		if ((total > INT_MAX) || (total < 0)) total = INT_MAX;

		sent->num_linkages_found = (int) total;
		print_time(opts, "Counted parses");

		extractor_t * pex = extractor_new(sent->length, sent->rand_state);
		bool overflowed = setup_linkages(sent, pex, mchxt, ctxt, opts);
		process_linkages(sent, pex, overflowed, opts);
		free_extractor(pex);

		post_process_lkgs(sent, opts);

		if (sent->num_valid_linkages > 0) break;

		if ((0 == nl) && (max_null_count > 0) && verbosity > 0)
		{
			prt_error("No complete linkages found.\n");
		}

		/* No valid linkages were found.  If that is because of a
		 * parse overflow, there is no point in going on. */
		if (PARSE_NUM_OVERFLOW < total) break;
	}
	sort_linkages(sent, opts);

	/* The copies are still ours if they were never handed back. */
	if (saved_disjuncts != NULL)
	{
		for (size_t i = 0; i < sent->length; i++)
		{
			free_disjuncts(saved_disjuncts[i]);
		}
	}
	free_count_context(ctxt, sent);
	free_fast_matcher(sent, mchxt);
}