/**
 * Build the parse set for a sentence and (re)allocate its linkage array.
 *
 * Calls build_parse_set() with the sentence's current null count, emits a
 * warning (when verbosity > 1) if the count overflowed, and then sizes
 * sent->lnkages to hold at most opts->linkage_limit linkages.
 *
 * On return:
 *  - if no linkages were found, the linkage counters are zeroed and
 *    sent->lnkages is NULL;
 *  - otherwise sent->num_linkages_alloced is
 *    MIN(num_linkages_found, linkage_limit) and sent->lnkages is a fresh
 *    array of that many entries from linkage_array_new().
 *
 * @param sent   Sentence being parsed; its linkage bookkeeping is updated.
 * @param pex    Extractor handed through to build_parse_set().
 * @param mchxt  Fast matcher handed through to build_parse_set().
 * @param ctxt   Count context handed through to build_parse_set().
 * @param opts   Parse options (verbosity and linkage_limit are read here).
 * @return The overflow flag returned by build_parse_set().
 */
static bool setup_linkages(Sentence sent, extractor_t* pex,
                           fast_matcher_t* mchxt,
                           count_context_t* ctxt,
                           Parse_Options opts)
{
	bool overflowed = build_parse_set(pex, sent, mchxt, ctxt, sent->null_count, opts);
	print_time(opts, "Built parse set");

	if (overflowed && (1 < opts->verbosity))
	{
		err_ctxt ec = { sent };
		err_msgc(&ec, lg_Warn, "Count overflow.\n"
			"Considering a random subset of %zu of an unknown and large number of linkages\n",
			opts->linkage_limit);
	}

	if (sent->num_linkages_found == 0)
	{
		/* We may have been called before (e.g. this might be a panic
		 * parse); release any array left from last time instead of
		 * dropping the pointer. This must happen before the counters
		 * are reset, so that the old array is freed while its own
		 * bookkeeping is still in place. */
		if (sent->lnkages) free_linkages(sent);

		sent->num_linkages_alloced = 0;
		sent->num_linkages_post_processed = 0;
		sent->num_valid_linkages = 0;
		sent->lnkages = NULL;
		return overflowed;
	}

	/* Work out the new counts in locals first: free_linkages() below is
	 * noted to zero sent->num_linkages_found, and the old array should be
	 * released before sent->num_linkages_alloced is overwritten.
	 * NOTE(review): num_linkages_found is taken to be an int, matching
	 * the (int) cast applied to linkage_limit -- confirm against the
	 * Sentence declaration. */
	const int n_found = sent->num_linkages_found;
	const int n_alloc = MIN(n_found, (int) opts->linkage_limit);

	/* Now actually malloc the array in which we will process linkages. */
	/* We may have been called before, e.g. this might be a panic parse,
	 * and the linkages array may still be there from last time.
	 * XXX free_linkages() zeros sent->num_linkages_found. */
	if (sent->lnkages) free_linkages(sent);

	/* Publish the counts only after the stale array is gone. */
	sent->num_linkages_found = n_found;
	sent->num_linkages_alloced = n_alloc;
	sent->lnkages = linkage_array_new(sent->num_linkages_alloced);

	return overflowed;
}
/**
 * Build the parse set for a sentence, allocate its linkage array, and
 * choose which linkage indices will be examined.
 *
 * At most opts->linkage_limit linkages are allocated. The index stored
 * in each sent->lnkages[i].lifo.index is chosen as follows:
 *  - count overflowed: -(i+1); the negative value marks that a random
 *    subset is to be picked later on (in extract_links(), per the
 *    original note);
 *  - everything fits: the identity mapping i;
 *  - more linkages found than allocated: one pseudo-random index drawn
 *    from each of N_linkages_alloced consecutive, roughly equal-sized
 *    blocks of the range [0, N_linkages_found).
 *
 * @param sent   Sentence being parsed; linkage bookkeeping is updated.
 * @param mchxt  Fast matcher handed through to build_parse_set().
 * @param ctxt   Count context handed through to build_parse_set().
 * @param opts   Parse options (verbosity, linkage_limit and
 *               repeatable_rand are read here).
 */
static void select_linkages(Sentence sent, fast_matcher_t* mchxt,
                            count_context_t* ctxt,
                            Parse_Options opts)
{
	size_t in;
	size_t N_linkages_found, N_linkages_alloced;

	bool overflowed = build_parse_set(sent, mchxt, ctxt, sent->null_count, opts);
	print_time(opts, "Built parse set");

	if (overflowed && (1 < opts->verbosity))
	{
		err_ctxt ec;
		ec.sent = sent;
		err_msg(&ec, Warn, "Warning: Count overflow.\n"
			"Considering a random subset of %zu of an unknown and large number of linkages\n",
			opts->linkage_limit);
	}
	N_linkages_found = sent->num_linkages_found;

	if (sent->num_linkages_found == 0)
	{
		/* We may have been called before (e.g. this might be a panic
		 * parse); release any array left from last time instead of
		 * dropping the pointer, and do so before the counters that
		 * describe it are reset. */
		if (sent->lnkages) free_linkages(sent);

		sent->num_linkages_alloced = 0;
		sent->num_linkages_post_processed = 0;
		sent->num_valid_linkages = 0;
		sent->lnkages = NULL;
		return;
	}

	if (N_linkages_found > opts->linkage_limit)
	{
		N_linkages_alloced = opts->linkage_limit;
		if (opts->verbosity > 1)
		{
			err_ctxt ec;
			ec.sent = sent;
			err_msg(&ec, Warn,
				"Warning: Considering a random subset of %zu of %zu linkages\n",
				N_linkages_alloced, N_linkages_found);
		}
	}
	else
	{
		N_linkages_alloced = N_linkages_found;
	}

	/* Now actually malloc the array in which we will process linkages. */
	/* We may have been called before, e.g. this might be a panic parse,
	 * and the linkages array may still be there from last time.
	 * XXX free_linkages() zeros sent->num_linkages_found, so the count
	 * saved above is written back right after. */
	if (sent->lnkages) free_linkages(sent);
	sent->num_linkages_found = N_linkages_found;
	sent->lnkages = linkage_array_new(N_linkages_alloced);

	/* Generate an array of linkage indices to examine */
	if (overflowed)
	{
		/* The negative index means that a random subset of links
		 * will be picked later on, in extract_links().
		 * Convert to a signed type *before* negating: negating the
		 * unsigned size_t directly only yields a negative index via
		 * wrap-around plus an implementation-defined narrowing. */
		for (in = 0; in < N_linkages_alloced; in++)
		{
			sent->lnkages[in].lifo.index = -(int)(in + 1);
		}
	}
	else if (N_linkages_found == N_linkages_alloced)
	{
		for (in = 0; in < N_linkages_alloced; in++)
		{
			sent->lnkages[in].lifo.index = in;
		}
	}
	else
	{
		/* There are more linkages found than we can handle */
		/* Pick a (quasi-)uniformly distributed random subset. */

		/* Width of one block; loop-invariant. It is strictly greater
		 * than 1 on this branch because found > alloced, so every
		 * block below spans at least one index. */
		const double frac =
			((double) N_linkages_found) / ((double) N_linkages_alloced);

		if (opts->repeatable_rand)
			sent->rand_state = N_linkages_found + sent->length;

		for (in = 0; in < N_linkages_alloced; in++)
		{
			/* Truncate straight to size_t rather than via int, so
			 * the bounds match the type they are stored in. */
			size_t block_bottom = (size_t) (((double) in) * frac);
			size_t block_top = (size_t) (((double) (in + 1)) * frac);

			sent->lnkages[in].lifo.index = block_bottom +
				(rand_r(&sent->rand_state) % (block_top - block_bottom));
		}
	}

	sent->num_linkages_alloced = N_linkages_alloced;
	/* Later, we subtract the number of invalid linkages */
	sent->num_valid_linkages = N_linkages_alloced;
}