/**
 * Drop the empty-word slots from a linkage, in place.
 *
 * The chosen_disjuncts array is compacted so that only the non-empty
 * words remain, and every surviving link is renumbered to the compacted
 * word positions. A link with either end on a removed word is discarded.
 * lkg->num_words and lkg->num_links are updated to the new counts.
 *
 * XXX Should the dict-cap tokens be removed here too? For now they are
 * left in place, for debugging.
 *
 * NOTE(review): print_chosen_disjuncts_words() is called here with one
 * argument, while process_linkages() in this file passes two - confirm
 * which arity matches the current prototype.
 */
static void remove_empty_words(Linkage lkg)
{
	Disjunct **chosen = lkg->chosen_disjuncts;
	/* Old word index -> new word index, or -1 if the word was removed. */
	int *new_pos = alloca(lkg->num_words * sizeof(*new_pos));
	size_t src;
	size_t dst;

	if (4 <= verbosity)
	{
		lgdebug(0, "Info: chosen_disjuncts before removing empty words:\n");
		print_chosen_disjuncts_words(lkg);
	}

	/* Pass 1: squeeze the empty words out of the disjunct array. */
	dst = 0;
	for (src = 0; src < lkg->num_words; src++)
	{
		if ((NULL == chosen[src]) ||
		    (MT_EMPTY != chosen[src]->word[0]->morpheme_type))
		{
			/* A word to keep (slots without a disjunct are kept too). */
			chosen[dst] = chosen[src];
			new_pos[src] = dst;
			dst++;
		}
		else
		{
			new_pos[src] = -1;
		}
	}
	lkg->num_words = dst;
	/* Unused memory not freed - all of it will be freed in free_linkages(). */

	if (4 <= verbosity)
	{
		lgdebug(0, "Info: chosen_disjuncts after removing empty words:\n");
		print_chosen_disjuncts_words(lkg);
	}

	/* Pass 2: squeeze out the links that touched a removed word. */
	dst = 0;
	for (src = 0; src < lkg->num_links; src++)
	{
		const Link *from = &(lkg->link_array[src]);

		if ((-1 == new_pos[from->rw]) || (-1 == new_pos[from->lw])) continue;

		/* Copy the whole link, translating the word numbers.
		 * dst never exceeds src, so writing the destination cannot
		 * clobber a link that has not been read yet. */
		Link *to = &(lkg->link_array[dst]);
		to->lw = new_pos[from->lw];
		to->rw = new_pos[from->rw];
		to->lc = from->lc;
		to->rc = from->rc;
		to->link_name = from->link_name;
		dst++;
	}
	lkg->num_links = dst;
	/* Unused memory not freed - all of it will be freed in free_linkages(). */
}
/**
 * Fill the sentence's linkage array with morphologically-acceptable
 * linkages.
 *
 * Linkages are extracted one at a time with extract_links() into the
 * next free slot of sent->lnkages. A linkage accepted by
 * sane_linkage_morphism() has its empty words removed and keeps its
 * slot; a rejected one has its slot reset, and the same slot is reused
 * for the next attempt.
 *
 * On return, sent->num_valid_linkages holds the number of accepted
 * linkages and sent->num_linkages_alloced is reduced to that same value.
 * Nothing is done if sent->num_linkages_found or
 * sent->num_linkages_alloced is zero.
 *
 * @param sent       Sentence whose lnkages array is filled in.
 * @param pex        Extractor state handed to extract_links().
 * @param overflowed When true, linkages are picked randomly (this also
 *                   happens when more linkages were found than allocated).
 * @param opts       Parse options, forwarded to sane_linkage_morphism().
 */
static void process_linkages(Sentence sent, extractor_t* pex,
                             bool overflowed, Parse_Options opts)
{
	if (0 == sent->num_linkages_found) return;
	if (0 == sent->num_linkages_alloced) return; /* Avoid a later crash. */

	/* Pick random linkages if we get more than what was asked for. */
	bool pick_randomly = overflowed ||
	    (sent->num_linkages_found > (int) sent->num_linkages_alloced);

	sent->num_valid_linkages = 0;
	size_t N_invalid_morphism = 0;

	int itry = 0;
	size_t in = 0;
	int maxtries;

	/* In the case of overflow, which will happen for some long
	 * sentences, but is particularly common for the amy/ady random
	 * splitters, we want to find as many morpho-acceptable linkages
	 * as possible, but keep the CPU usage down, as these might be
	 * very rare. This is due to a bug/feature in the interaction
	 * between the word-graph and the parser: valid morph linkages
	 * can be one-in-a-thousand.. or worse.  Search for them, but
	 * don't over-do it.
	 * Note: This problem has recently been alleviated by an
	 * alternatives-compatibility check in the fast matcher - see
	 * alt_connection_possible().
	 */
#define MAX_TRIES 250000

	if (pick_randomly)
	{
		/* Try picking many more linkages, but not more than possible.
		 * The sum is formed in long long so that a large allocation
		 * count cannot overflow int; it is narrowed back to int only
		 * when it is smaller than num_linkages_found, hence fits. */
		long long try_cap =
			(long long) (int) sent->num_linkages_alloced + MAX_TRIES;

		if (try_cap < sent->num_linkages_found)
			maxtries = (int) try_cap;
		else
			maxtries = sent->num_linkages_found;
	}
	else
	{
		maxtries = sent->num_linkages_alloced;
	}

	/* True when the slot at index "in" has not been initialized yet. */
	bool need_init = true;
	for (itry=0; itry<maxtries; itry++)
	{
		Linkage lkg = &sent->lnkages[in];
		Linkage_info * lifo = &lkg->lifo;

		/* Negative values tell extract-links to pick randomly; for
		 * reproducible-rand, the actual value is the rand seed. */
		lifo->index = pick_randomly ? -(itry+1) : itry;

		if (need_init)
		{
			partial_init_linkage(sent, lkg, sent->length);
			need_init = false;
		}
		extract_links(pex, lkg);
		compute_link_names(lkg, sent->string_set);

		if (verbosity_level(+D_PL))
		{
			err_msg(lg_Debug, "chosen_disjuncts before:\n\\");
			print_chosen_disjuncts_words(lkg, /*prt_opt*/true);
		}

		if (sane_linkage_morphism(sent, lkg, opts))
		{
			remove_empty_words(lkg);

			if (verbosity_level(+D_PL))
			{
				err_msg(lg_Debug, "chosen_disjuncts after:\n\\");
				print_chosen_disjuncts_words(lkg, /*prt_opt*/false);
			}

			/* Slot accepted: advance, and initialize the next one on demand. */
			need_init = true;
			in++;
			if (in >= sent->num_linkages_alloced) break;
		}
		else
		{
			/* Slot rejected: reset it so the next attempt can reuse it. */
			N_invalid_morphism++;
			lkg->num_links = 0;
			lkg->num_words = sent->length;
			// memset(lkg->link_array, 0, lkg->lasz * sizeof(Link));
			memset(lkg->chosen_disjuncts, 0, sent->length * sizeof(Disjunct *));
		}
	}

	/* The last one was alloced, but never actually used. Free it. */
	if (!need_init) free_linkage(&sent->lnkages[in]);

	sent->num_valid_linkages = in;

	/* The remainder of the array is garbage; we never filled it in.
	 * So just pretend that it's shorter than it is */
	sent->num_linkages_alloced = sent->num_valid_linkages;

	/* After an early break itry was not incremented for the final
	 * attempt, so one is added to report the number of tries made. */
	lgdebug(D_PARSE, "Info: sane_morphism(): %zu of %d linkages had "
	        "invalid morphology construction\n",
	        N_invalid_morphism, itry + (itry != maxtries));
}