/**
 * Build and return the k-th post-processed linkage of the sentence.
 *
 * A fresh Linkage object is allocated with exalloc (the object is handed
 * to the library caller, external to the parser itself), its bookkeeping
 * fields are seeded from sent->link_info[k], the links for that linkage
 * are extracted, and the fat/thin extraction plus the optional dictionary
 * post-processing are run on it.
 *
 * Precondition: 0 <= k < sent->num_linkages_post_processed.
 * Ownership of the returned Linkage passes to the caller.
 */
Linkage linkage_create(int k, Sentence sent, Parse_Options opts)
{
	Linkage lkg;

	assert((k < sent->num_linkages_post_processed) && (k >= 0),
	       "index out of range");

	/* Using exalloc since this is external to the parser itself. */
	lkg = (Linkage) exalloc(sizeof(struct Linkage_s));

	lkg->num_words = sent->length;
	lkg->word = (char **) exalloc(lkg->num_words * sizeof(char *));
	lkg->current = 0;
	lkg->num_sublinkages = 0;
	lkg->sublinkage = NULL;
	lkg->unionized = FALSE;
	lkg->sent = sent;
	lkg->opts = opts;
	lkg->info = sent->link_info[k];

	extract_links(sent->link_info[k].index, sent->null_count, sent->parse_info);
	compute_chosen_words(sent, lkg);

	if (set_has_fat_down(sent))
		extract_fat_linkage(sent, opts, lkg);
	else
		extract_thin_linkage(sent, opts, lkg);

	if (sent->dict->postprocessor != NULL)
		linkage_post_process(lkg, sent->dict->postprocessor);

	return lkg;
}
/**
 * Walk the allocated linkages and, for each one that has not been
 * discarded and has no post-processing violations, (re-)initialize it
 * and run extract_links() on it — that call sets the chosen_disjuncts
 * array of the linkage.
 */
static void compute_chosen_disjuncts(Sentence sent)
{
	Parse_info pi = sent->parse_info;
	size_t nlinks = sent->num_linkages_alloced;
	size_t li;

	for (li = 0; li < nlinks; li++)
	{
		Linkage linkage = &sent->lnkages[li];
		Linkage_info *lifo = &linkage->lifo;

		/* Skip linkages that were already ruled out. */
		if (lifo->discarded) continue;
		if (lifo->N_violations) continue;

		partial_init_linkage(linkage, pi->N_words);
		extract_links(linkage, pi);

		/* Because the empty words are used only in the parsing stage,
		 * they are removed here along with their links, so from now on
		 * we will not need to consider them. */
		remove_empty_words(linkage);
	}
}
/**
 * Post-process all linkages found for the sentence.
 *
 * Builds the parse set, selects up to opts->linkage_limit linkages
 * (choosing a random, evenly-spread subset when more were found),
 * analyzes each one in up to two passes, sorts the surviving
 * Linkage_info records by the configured cost model, and stores the
 * results and counts back into `sent`.
 *
 * On count overflow the real number of linkages is unknown, so negative
 * indices are stored; presumably extract_links() treats a negative index
 * as "pick randomly" — TODO confirm against extract_links().
 */
void post_process_linkages(Sentence sent, Parse_Options opts)
{
	int *indices;
	int in, block_bottom, block_top;
	int N_linkages_found, N_linkages_alloced;
	int N_linkages_post_processed, N_valid_linkages;
	int overflowed, only_canonical_allowed;
	double denom;
	Linkage_info *link_info;
	int canonical;

	free_post_processing(sent);

	overflowed = build_parse_set(sent, sent->null_count, opts);
	print_time(opts, "Built parse set");

	if (overflowed)
	{
		/* We know that sent->num_linkages_found is bogus, possibly negative */
		sent->num_linkages_found = opts->linkage_limit;
		if (opts->verbosity > 1)
			fprintf(stdout, "Warning: Count overflow.\n"
				"Considering a random subset of %d of an unknown and large number of linkages\n",
				opts->linkage_limit);
	}
	N_linkages_found = sent->num_linkages_found;

	/* Nothing to do: record the empty result and bail out early. */
	if (sent->num_linkages_found == 0)
	{
		sent->num_linkages_alloced = 0;
		sent->num_linkages_post_processed = 0;
		sent->num_valid_linkages = 0;
		sent->link_info = NULL;
		return;
	}

	/* Cap the number of linkages actually examined at the user limit. */
	if (N_linkages_found > opts->linkage_limit)
	{
		N_linkages_alloced = opts->linkage_limit;
		if (opts->verbosity > 1)
			fprintf(stdout,
				"Warning: Considering a random subset of %d of %d linkages\n",
				N_linkages_alloced, N_linkages_found);
	}
	else N_linkages_alloced = N_linkages_found;

	link_info = (Linkage_info *) xalloc(N_linkages_alloced * sizeof(Linkage_info));
	N_linkages_post_processed = N_valid_linkages = 0;

	/* generate an array of linkage indices to examine */
	indices = (int *) xalloc(N_linkages_alloced * sizeof(int));
	if (overflowed)
	{
		/* Total count unknown; use negative indices (see header note). */
		for (in = 0; in < N_linkages_alloced; in++)
		{
			indices[in] = -(in+1);
		}
	}
	else
	{
		/* Divide [0, N_linkages_found) into N_linkages_alloced equal
		 * blocks and pick one random index per block, so the sample is
		 * spread evenly across all found linkages. */
		my_random_initialize(N_linkages_found + sent->length);
		for (in = 0; in < N_linkages_alloced; in++)
		{
			denom = (double) N_linkages_alloced;
			block_bottom = (int) (((double)in*(double) N_linkages_found)/denom);
			block_top = (int) (((double)(in+1)*(double)N_linkages_found)/denom);
			indices[in] = block_bottom + (my_random() % (block_top-block_bottom));
		}
		my_random_finalize();
	}

	only_canonical_allowed = (!(overflowed || (N_linkages_found > 2*opts->linkage_limit)));
	/* When we're processing only a small subset of the linkages,
	 * don't worry about restricting the set we consider to be
	 * canonical ones.  In the extreme case where we are only
	 * generating 1 in a million linkages, it's very unlikely
	 * that we'll hit two symmetric variants of the same linkage
	 * anyway. */

	/* (optional) first pass: just visit the linkages */
	/* The purpose of these two passes is to make the post-processing
	 * more efficient.  Because (hopefully) by the time you do the
	 * real work in the 2nd pass you've pruned the relevant rule set
	 * in the first pass. */
	if (sent->length >= opts->twopass_length)
	{
		for (in = 0; (in < N_linkages_alloced) &&
			     (!resources_exhausted(opts->resources)); in++)
		{
			extract_links(indices[in], sent->null_count, sent->parse_info);
			if (set_has_fat_down(sent))
			{
				if (only_canonical_allowed && !is_canonical_linkage(sent)) continue;
				analyze_fat_linkage(sent, opts, PP_FIRST_PASS);
			}
			else
			{
				analyze_thin_linkage(sent, opts, PP_FIRST_PASS);
			}
		}
	}

	/* second pass: actually perform post-processing */
	for (in = 0; (in < N_linkages_alloced) &&
		     (!resources_exhausted(opts->resources)); in++)
	{
		extract_links(indices[in], sent->null_count, sent->parse_info);
		if (set_has_fat_down(sent))
		{
			canonical = is_canonical_linkage(sent);
			if (only_canonical_allowed && !canonical) continue;
			link_info[N_linkages_post_processed] =
				analyze_fat_linkage(sent, opts, PP_SECOND_PASS);
			link_info[N_linkages_post_processed].fat = TRUE;
			link_info[N_linkages_post_processed].canonical = canonical;
		}
		else
		{
			link_info[N_linkages_post_processed] =
				analyze_thin_linkage(sent, opts, PP_SECOND_PASS);
			link_info[N_linkages_post_processed].fat = FALSE;
			link_info[N_linkages_post_processed].canonical = TRUE;
		}
		/* A linkage with zero violations is "valid". */
		if (link_info[N_linkages_post_processed].N_violations == 0)
			N_valid_linkages++;
		link_info[N_linkages_post_processed].index = indices[in];
		N_linkages_post_processed++;
	}
	print_time(opts, "Postprocessed all linkages");

	/* Order the results by the user-selected cost model. */
	qsort((void *)link_info, N_linkages_post_processed, sizeof(Linkage_info),
	      (int (*)(const void *, const void *)) opts->cost_model.compare_fn);

	if (!resources_exhausted(opts->resources))
	{
		/* If every linkage was examined (no overflow, no sampling),
		 * at least one must have been canonical. */
		assert(! ((N_linkages_post_processed == 0) &&
			  (N_linkages_found > 0) &&
			  (N_linkages_found < opts->linkage_limit)),
			"None of the linkages is canonical");
	}

	if (opts->verbosity > 1)
	{
		fprintf(stdout, "%d of %d linkages with no P.P. violations\n",
			N_valid_linkages, N_linkages_post_processed);
	}
	print_time(opts, "Sorted all linkages");

	/* Publish the results on the sentence. */
	sent->num_linkages_alloced = N_linkages_alloced;
	sent->num_linkages_post_processed = N_linkages_post_processed;
	sent->num_valid_linkages = N_valid_linkages;
	sent->link_info = link_info;

	xfree(indices, N_linkages_alloced * sizeof(int));
	/*if(N_valid_linkages == 0) free_andlists(sent); */
}
/**
 * This fills the linkage array with morphologically-acceptable
 * linkages.
 *
 * Extracts up to sent->num_linkages_alloced linkages via `pex`,
 * discarding any that fail sane_linkage_morphism().  When more linkages
 * were found than were allocated (or the count overflowed), candidates
 * are picked randomly; in that case extra tries (up to MAX_TRIES beyond
 * the allocated count) are spent hunting for morphologically valid ones.
 * On return, sent->num_valid_linkages and sent->num_linkages_alloced
 * are both set to the number of linkages actually accepted.
 */
static void process_linkages(Sentence sent, extractor_t* pex, bool overflowed,
                             Parse_Options opts)
{
	if (0 == sent->num_linkages_found) return;
	if (0 == sent->num_linkages_alloced) return; /* Avoid a later crash. */

	/* Pick random linkages if we get more than what was asked for. */
	bool pick_randomly = overflowed ||
	    (sent->num_linkages_found > (int) sent->num_linkages_alloced);

	sent->num_valid_linkages = 0;
	size_t N_invalid_morphism = 0;  /* rejected by sane_linkage_morphism() */
	int itry = 0;                   /* extraction attempts made */
	size_t in = 0;                  /* accepted linkages (next free slot) */
	int maxtries;

	/* In the case of overflow, which will happen for some long
	 * sentences, but is particularly common for the amy/ady random
	 * splitters, we want to find as many morpho-acceptable linkages
	 * as possible, but keep the CPU usage down, as these might be
	 * very rare. This is due to a bug/feature in the interaction
	 * between the word-graph and the parser: valid morph linkages
	 * can be one-in-a-thousand.. or worse. Search for them, but
	 * don't over-do it.
	 * Note: This problem has recently been alleviated by an
	 * alternatives-compatibility check in the fast matcher - see
	 * alt_connection_possible().
	 */
#define MAX_TRIES 250000
	if (pick_randomly)
	{
		/* Try picking many more linkages, but not more than possible. */
		maxtries = MIN((int) sent->num_linkages_alloced + MAX_TRIES,
		               sent->num_linkages_found);
	}
	else
	{
		maxtries = sent->num_linkages_alloced;
	}

	/* need_init tracks whether slot `in` still needs partial_init_linkage();
	 * a rejected candidate reuses the already-initialized slot. */
	bool need_init = true;
	for (itry = 0; itry < maxtries; itry++)
	{
		Linkage lkg = &sent->lnkages[in];
		Linkage_info * lifo = &lkg->lifo;

		/* Negative values tell extract-links to pick randomly; for
		 * reproducible-rand, the actual value is the rand seed. */
		lifo->index = pick_randomly ? -(itry+1) : itry;

		if (need_init)
		{
			partial_init_linkage(sent, lkg, sent->length);
			need_init = false;
		}
		extract_links(pex, lkg);
		compute_link_names(lkg, sent->string_set);

		if (verbosity_level(+D_PL))
		{
			err_msg(lg_Debug, "chosen_disjuncts before:\n\\");
			print_chosen_disjuncts_words(lkg, /*prt_opt*/true);
		}

		if (sane_linkage_morphism(sent, lkg, opts))
		{
			/* Accepted: strip parsing-only empty words and advance. */
			remove_empty_words(lkg);

			if (verbosity_level(+D_PL))
			{
				err_msg(lg_Debug, "chosen_disjuncts after:\n\\");
				print_chosen_disjuncts_words(lkg, /*prt_opt*/false);
			}

			need_init = true;
			in++;
			if (in >= sent->num_linkages_alloced) break;
		}
		else
		{
			/* Rejected: wipe the slot's link/disjunct state so it can
			 * be reused by the next attempt. */
			N_invalid_morphism++;
			lkg->num_links = 0;
			lkg->num_words = sent->length;
			// memset(lkg->link_array, 0, lkg->lasz * sizeof(Link));
			memset(lkg->chosen_disjuncts, 0, sent->length * sizeof(Disjunct *));
		}
	}

	/* The last one was alloced, but never actually used. Free it. */
	if (!need_init) free_linkage(&sent->lnkages[in]);

	sent->num_valid_linkages = in;

	/* The remainder of the array is garbage; we never filled it in.
	 * So just pretend that it's shorter than it is */
	sent->num_linkages_alloced = sent->num_valid_linkages;

	/* The second argument counts actual tries; the `(itry != maxtries)`
	 * term adds one when the loop exited via `break` (itry was not
	 * incremented for the final, successful iteration). */
	lgdebug(D_PARSE, "Info: sane_morphism(): %zu of %d linkages had "
	        "invalid morphology construction\n", N_invalid_morphism,
	        itry + (itry != maxtries));
}