/**
 * Post-process a linkage that has no fat links and report its costs.
 *
 * The links in the parse info's link_array are copied into a scratch
 * sublinkage, which is then handed to the post-processor.
 *
 * @param sent          Sentence whose parse_info holds the linkage.
 * @param opts          Parse options, passed through to the post-processor.
 * @param analyze_pass  When equal to PP_FIRST_PASS the linkage is only
 *                      scanned, and the returned info is all zeros.
 *                      Otherwise full post-processing is done and the
 *                      cost and violation fields are filled in.
 * @return The Linkage_info for this linkage, returned by value.
 */
Linkage_info analyze_thin_linkage(Sentence sent, Parse_Options opts, int analyze_pass)
{
    Parse_info pi = sent->parse_info;
    Linkage_info info;
    Postprocessor *ppr;
    Sublinkage *scratch;
    PP_node *result;
    int k;

    memset(&info, 0, sizeof(info));
    build_digraph(pi, word_links);

    scratch = x_create_sublinkage(pi);
    ppr = sent->dict->postprocessor;

    compute_link_names(sent);
    for (k = 0; k < pi->N_links; k++) {
        copy_full_link(&(scratch->link[k]), &(pi->link_array[k]));
    }

    if (analyze_pass == PP_FIRST_PASS) {
        /* First pass: let the post-processor look at the linkage;
         * nothing is scored, so the zeroed info is returned. */
        post_process_scan_linkage(ppr, opts, sent, scratch);
    } else {
        result = post_process(ppr, opts, sent, scratch, TRUE);

        info.N_violations = 0;
        info.and_cost = 0;
        info.unused_word_cost = unused_word_cost(sent->parse_info);
        info.improper_fat_linkage = FALSE;
        info.inconsistent_domains = FALSE;
        info.disjunct_cost = disjunct_cost(pi);
        info.null_cost = null_cost(pi);
        info.link_cost = link_cost(pi);
        info.andlist = NULL;

        if (result != NULL) {
            if (result->violation != NULL) {
                info.N_violations++;
            }
        } else if (ppr != NULL) {
            /* A post-processor exists but produced no node:
             * count that as a violation. */
            info.N_violations = 1;
        }
    }

    free_sublinkage(scratch);
    free_digraph(pi, word_links);
    return info;
}
/**
 * Copy the links of a linkage without fat links into `linkage`.
 *
 * The links are first copied from the parse info's link_array into a
 * scratch sublinkage; `linkage` then receives a single sublinkage
 * holding a copy of each of those links.  The scratch sublinkage is
 * freed before returning.  `opts` is not used.
 */
void extract_thin_linkage(Sentence sent, Parse_Options opts, Linkage linkage)
{
    Parse_info pi = sent->parse_info;
    Sublinkage *scratch = x_create_sublinkage(pi);
    int k;

    compute_link_names(sent);

    k = 0;
    while (k < pi->N_links) {
        copy_full_link(&scratch->link[k], &(pi->link_array[k]));
        k++;
    }

    linkage->num_sublinkages = 1;
    linkage->sublinkage = ex_create_sublinkage(pi);

    k = 0;
    while (k < pi->N_links) {
        linkage->sublinkage->link[k] = excopy_link(scratch->link[k]);
        ++k;
    }

    free_sublinkage(scratch);
}
/**
 * For every allocated linkage that is neither discarded nor already
 * carrying a violation: initialize it, extract its links (the
 * extract_links() call is what sets the chosen_disjuncts array),
 * compute the link names, and strip the empty words.
 */
static void compute_chosen_disjuncts(Sentence sent)
{
    Parse_info pi = sent->parse_info;
    size_t total = sent->num_linkages_alloced;
    size_t idx;

    for (idx = 0; idx < total; idx++)
    {
        Linkage lkg = &sent->lnkages[idx];

        if (!lkg->lifo.discarded && !lkg->lifo.N_violations)
        {
            partial_init_linkage(lkg, pi->N_words);
            extract_links(lkg, pi);
            compute_link_names(lkg, sent->string_set);

            /* Empty words are only needed during parsing.  Drop them,
             * together with their links, so that later stages need not
             * consider them. */
            remove_empty_words(lkg);
        }
    }
}
/**
 * Copy the sublinkages encoded by a fat linkage into `linkage`.
 *
 * The enumeration follows the same steps as analyze_fat_linkage(),
 * but each sublinkage is copied out instead of being post-processed.
 * If building the DIS/CON tree flags a structure violation, `linkage`
 * instead receives one sublinkage holding a copy of every link, fat
 * links included.  `opts` is not used.
 */
void extract_fat_linkage(Sentence sent, Parse_Options opts, Linkage linkage)
{
    Parse_info pi = sent->parse_info;
    Sublinkage *work = x_create_sublinkage(pi);
    DIS_node *tree;
    int k, dst, total, slot;

    build_digraph(pi, word_links);
    structure_violation = FALSE;
    tree = build_DIS_CON_tree(pi, word_links); /* may set structure_violation */

    if (structure_violation) {
        compute_link_names(sent);
        for (k = 0; k < pi->N_links; k++) {
            copy_full_link(&work->link[k], &(pi->link_array[k]));
        }

        linkage->num_sublinkages = 1;
        linkage->sublinkage = ex_create_sublinkage(pi);

        /* This will have fat links! */
        for (k = 0; k < pi->N_links; ++k) {
            linkage->sublinkage->link[k] = excopy_link(work->link[k]);
        }
    } else {
        /* Step through the tree once just to count the sublinkages,
         * then allocate room for all of them. */
        total = 0;
        do {
            total++;
        } while (advance_DIS(tree));

        linkage->num_sublinkages = total;
        linkage->sublinkage =
            (Sublinkage *) exalloc(sizeof(Sublinkage)*total);
        for (k = 0; k < total; ++k) {
            linkage->sublinkage[k].link = NULL;
            linkage->sublinkage[k].pp_info = NULL;
            linkage->sublinkage[k].violation = NULL;
        }

        /* Second walk: fill in one sublinkage per step. */
        compute_link_names(sent);
        slot = 0;
        do {
            Sublinkage *out;
            int kept;

            /* Reset the patch array and start from the raw links. */
            for (k = 0; k < pi->N_links; k++) {
                patch_array[k].changed = FALSE;
                patch_array[k].used = FALSE;
                patch_array[k].newl = pi->link_array[k].l;
                patch_array[k].newr = pi->link_array[k].r;
                copy_full_link(&work->link[k], &(pi->link_array[k]));
            }
            fill_patch_array_DIS(tree, NULL, word_links);

            for (k = 0; k < pi->N_links; k++) {
                if (patch_array[k].changed || patch_array[k].used) {
                    work->link[k]->l = patch_array[k].newl;
                    work->link[k]->r = patch_array[k].newr;
                } else if ((dfs_root_word[pi->link_array[k].l] != -1) &&
                           (dfs_root_word[pi->link_array[k].r] != -1)) {
                    /* Mark with l == -1; such links are skipped below. */
                    work->link[k]->l = -1;
                }
            }

            compute_pp_link_array_connectors(sent, work);
            compute_pp_link_names(sent, work);

            /* Don't copy the fat links into the linkage:
             * count the links that survive, then copy only those. */
            kept = 0;
            for (k = 0; k < pi->N_links; ++k) {
                if (work->link[k]->l != -1) kept++;
            }

            out = &linkage->sublinkage[slot];
            out->num_links = kept;
            out->link = (Link *) exalloc(sizeof(Link)*kept);
            out->pp_info = NULL;
            out->violation = NULL;

            for (k = 0, dst = 0; k < pi->N_links; ++k) {
                if (work->link[k]->l != -1) {
                    out->link[dst++] = excopy_link(work->link[k]);
                }
            }

            slot++;
        } while (advance_DIS(tree));
    }

    free_sublinkage(work);
    free_digraph(pi, word_links);
    free_DIS_tree(tree);
}
/**
 * Enumerate and post-process all the sublinkages represented by a
 * linkage that contains at least one fat link.  Uses link_array.
 *
 * @param sent          Sentence whose parse_info holds the linkage.
 * @param opts          Parse options, passed through to the post-processor.
 * @param analyze_pass  PP_FIRST_PASS: each sublinkage is only scanned by
 *                      the post-processor.  PP_SECOND_PASS: the and-list
 *                      is additionally built and its cost recorded.  For
 *                      any pass other than the first, every sublinkage
 *                      is fully post-processed and checked for
 *                      consistent domain ancestry.
 * @return The Linkage_info for this linkage, returned by value.  If
 *         building the DIS/CON tree flags a structure violation, the
 *         function returns early with improper_fat_linkage set and one
 *         violation counted.
 */
Linkage_info analyze_fat_linkage(Sentence sent, Parse_Options opts, int analyze_pass)
{
    int i;
    Linkage_info li;
    DIS_node *d_root;
    PP_node *pp;
    Postprocessor *postprocessor;
    Sublinkage *sublinkage;
    Parse_info pi = sent->parse_info;
    PP_node accum;              /* for domain ancestry check */
    D_type_list *dtl0, *dtl1;   /* for domain ancestry check */

    /* The struct is returned by value, so zero it up front (as
     * analyze_thin_linkage() does); otherwise any member that is not
     * explicitly assigned below would be returned uninitialized. */
    memset(&li, 0, sizeof(li));

    sublinkage = x_create_sublinkage(pi);
    postprocessor = sent->dict->postprocessor;
    build_digraph(pi, word_links);
    structure_violation = FALSE;
    d_root = build_DIS_CON_tree(pi, word_links); /* may set structure_violation to TRUE */

    li.N_violations = 0;
    li.improper_fat_linkage = structure_violation;
    li.inconsistent_domains = FALSE;
    li.unused_word_cost = unused_word_cost(sent->parse_info);
    li.disjunct_cost = disjunct_cost(pi);
    li.null_cost = null_cost(pi);
    li.link_cost = link_cost(pi);
    li.and_cost = 0;
    li.andlist = NULL;

    if (structure_violation) {
        li.N_violations++;
        free_sublinkage(sublinkage);
        free_digraph(pi, word_links);
        free_DIS_tree(d_root);
        return li;
    }

    if (analyze_pass == PP_SECOND_PASS) {
        li.andlist = build_andlist(sent, word_links);
        li.and_cost = li.andlist->cost;
    } else {
        li.and_cost = 0;
    }

    compute_link_names(sent);

    for (i = 0; i < pi->N_links; i++) accum.d_type_array[i] = NULL;

    for (;;) { /* loop through all the sub linkages */
        /* Reset the patch array and start from the raw links. */
        for (i = 0; i < pi->N_links; i++) {
            patch_array[i].used = patch_array[i].changed = FALSE;
            patch_array[i].newl = pi->link_array[i].l;
            patch_array[i].newr = pi->link_array[i].r;
            copy_full_link(&sublinkage->link[i], &(pi->link_array[i]));
        }
        fill_patch_array_DIS(d_root, NULL, word_links);

        for (i = 0; i < pi->N_links; i++) {
            if (patch_array[i].changed || patch_array[i].used) {
                sublinkage->link[i]->l = patch_array[i].newl;
                sublinkage->link[i]->r = patch_array[i].newr;
            } else if ((dfs_root_word[pi->link_array[i].l] != -1) &&
                       (dfs_root_word[pi->link_array[i].r] != -1)) {
                /* Mark with l == -1; such links are skipped in the
                 * domain ancestry check below. */
                sublinkage->link[i]->l = -1;
            }
        }

        compute_pp_link_array_connectors(sent, sublinkage);
        compute_pp_link_names(sent, sublinkage);

        /* 'analyze_pass' logic added ALB 1/97 */
        if (analyze_pass == PP_FIRST_PASS) {
            post_process_scan_linkage(postprocessor, opts, sent, sublinkage);
            if (!advance_DIS(d_root)) break;
            else continue;
        }

        pp = post_process(postprocessor, opts, sent, sublinkage, TRUE);

        if (pp == NULL) {
            if (postprocessor != NULL) li.N_violations = 1;
        } else if (pp->violation == NULL) {
            /* The purpose of this stuff is to make sure the domain
             * ancestry for a link in each of its sentences is
             * consistent.  The first sublinkage that uses a link
             * records its domain type list in accum; later ones must
             * match that list element for element. */
            for (i = 0; i < pi->N_links; i++) {
                if (sublinkage->link[i]->l == -1) continue;
                if (accum.d_type_array[i] == NULL) {
                    accum.d_type_array[i] = copy_d_type(pp->d_type_array[i]);
                } else {
                    dtl0 = pp->d_type_array[i];
                    dtl1 = accum.d_type_array[i];
                    while ((dtl0 != NULL) && (dtl1 != NULL) &&
                           (dtl0->type == dtl1->type)) {
                        dtl0 = dtl0->next;
                        dtl1 = dtl1->next;
                    }
                    /* Lists differ in content or in length. */
                    if ((dtl0 != NULL) || (dtl1 != NULL)) break;
                }
            }
            /* Leaving the loop early means a mismatch was found. */
            if (i != pi->N_links) {
                li.N_violations++;
                li.inconsistent_domains = TRUE;
            }
        } else {
            /* pp->violation != NULL */
            li.N_violations++;
        }

        if (!advance_DIS(d_root)) break;
    }

    for (i = 0; i < pi->N_links; ++i) {
        free_d_type(accum.d_type_array[i]);
    }

    /* if (display_on && (li.N_violations != 0) &&
           (verbosity > 3) && should_print_messages)
           printf("P.P. violation in one part of conjunction.\n"); */

    free_sublinkage(sublinkage);
    free_digraph(pi, word_links);
    free_DIS_tree(d_root);
    return li;
}
/**
 * This does basic post-processing for all linkages.
 *
 * Runs an optional scanning pass (when the sentence length is at least
 * opts->twopass_length) followed by the real post-processing pass over
 * every non-discarded linkage.  Linkages with a post-processing
 * violation get N_violations bumped and a violation message set.
 * If resources run out part-way, the linkages that were not reached
 * are marked as violations with a timeout message.  On return,
 * sent->num_linkages_post_processed and sent->num_valid_linkages are
 * updated.
 */
static void post_process_linkages(Sentence sent, Parse_Options opts)
{
    size_t in;
    size_t N_linkages_post_processed = 0;
    size_t N_valid_linkages = sent->num_valid_linkages;
    size_t N_linkages_alloced = sent->num_linkages_alloced;
    bool twopass = sent->length >= opts->twopass_length;

    /* (optional) First pass: just visit the linkages */
    /* The purpose of the first pass is to make the post-processing
     * more efficient.  Because (hopefully) by the time the real work
     * is done in the 2nd pass, the relevant rule set has been pruned
     * in the first pass.
     */
    if (twopass)
    {
        bool out_of_resources = false;

        for (in=0; in < N_linkages_alloced; in++)
        {
            Linkage lkg = &sent->lnkages[in];
            Linkage_info *lifo = &lkg->lifo;

            if (lifo->discarded) continue;

            /* We still need link names, even if there has been a morfo
             * violation.  They are also needed once resources have run
             * out: in two-pass mode nothing later computes them, so the
             * loop keeps going for the names and only skips the scan. */
            compute_link_names(lkg, sent->string_set);

            if (out_of_resources || lifo->N_violations) continue;

            post_process_scan_linkage(sent->postprocessor, lkg);

            if ((49 == in%50) && resources_exhausted(opts->resources))
                out_of_resources = true;
        }
    }

    /* Second pass: actually perform post-processing */
    for (in=0; in < N_linkages_alloced; in++)
    {
        PP_node *ppn;
        Linkage lkg = &sent->lnkages[in];
        Linkage_info *lifo = &lkg->lifo;

        if (lifo->discarded) continue; /* Invalid morphism construction */

        /* We need link names, even if morfo check fails */
        if (!twopass) compute_link_names(lkg, sent->string_set);

        ppn = do_post_process(sent->postprocessor, lkg, twopass);
        post_process_free_data(&sent->postprocessor->pp_data);

        if (NULL != ppn->violation)
        {
            N_valid_linkages--;
            lifo->N_violations++;

            /* Set the message, only if not set (e.g. by sane_morphism) */
            if (NULL == lifo->pp_violation_msg)
                lifo->pp_violation_msg = ppn->violation;
        }
        N_linkages_post_processed++;

        linkage_score(lkg, opts);

        if ((9 == in%10) && resources_exhausted(opts->resources))
        {
            /* This linkage has been fully post-processed.  Step past it
             * so that the loop below marks only the linkages that were
             * never reached, instead of flagging this one a second time. */
            in++;
            break;
        }
    }

    /* If the timer expired, then we never finished post-processing.
     * Mark the remaining sentences as bad, as otherwise strange
     * results get reported.  At any rate, need to compute the link
     * names, as otherwise linkage_create() will crash and burn
     * trying to touch them. (In two-pass mode the first pass has
     * already computed them.)
     */
    for (; in < N_linkages_alloced; in++)
    {
        Linkage lkg = &sent->lnkages[in];
        Linkage_info *lifo = &lkg->lifo;

        if (lifo->discarded) continue;

        if (!twopass) compute_link_names(lkg, sent->string_set);

        N_valid_linkages--;
        lifo->N_violations++;

        /* Set the message, only if not set (e.g. by sane_morphism) */
        if (NULL == lifo->pp_violation_msg)
            lifo->pp_violation_msg = "Timeout during postprocessing";
    }

    print_time(opts, "Postprocessed all linkages");

    if (opts->verbosity > 1)
    {
        err_ctxt ec;
        ec.sent = sent;
        err_msg(&ec, Info, "Info: %zu of %zu linkages with no P.P. violations\n",
                N_valid_linkages, N_linkages_post_processed);
    }

    sent->num_linkages_post_processed = N_linkages_post_processed;
    sent->num_valid_linkages = N_valid_linkages;
}
/**
 * Fill the sentence's linkage array with morphologically-acceptable
 * linkages.
 *
 * Linkages are extracted one at a time into the next free slot of
 * sent->lnkages.  A linkage that passes sane_linkage_morphism() keeps
 * its slot; one that fails is reset and the slot is reused for the
 * next try.  On return, both sent->num_valid_linkages and
 * sent->num_linkages_alloced equal the number of slots that were kept.
 */
static void process_linkages(Sentence sent, extractor_t* pex,
                             bool overflowed, Parse_Options opts)
{
    /* Nothing to do; the second test also avoids a later crash. */
    if ((0 == sent->num_linkages_found) || (0 == sent->num_linkages_alloced))
        return;

    /* Pick random linkages if we get more than what was asked for. */
    bool pick_randomly = overflowed ||
        (sent->num_linkages_found > (int) sent->num_linkages_alloced);

    sent->num_valid_linkages = 0;

    size_t n_rejected = 0;   /* tries that failed the morphology check */
    size_t n_kept = 0;       /* slots filled so far; also the next slot */
    int itry = 0;
    int maxtries;

    /* In the case of overflow, which will happen for some long
     * sentences, but is particularly common for the amy/ady random
     * splitters, we want to find as many morpho-acceptable linkages
     * as possible, but keep the CPU usage down, as these might be
     * very rare. This is due to a bug/feature in the interaction
     * between the word-graph and the parser: valid morph linkages
     * can be one-in-a-thousand.. or worse.  Search for them, but
     * don't over-do it.
     * Note: This problem has recently been alleviated by an
     * alternatives-compatibility check in the fast matcher - see
     * alt_connection_possible().
     */
#define MAX_TRIES 250000

    if (!pick_randomly)
    {
        maxtries = sent->num_linkages_alloced;
    }
    else
    {
        /* Try picking many more linkages, but not more than possible. */
        maxtries = MIN((int) sent->num_linkages_alloced + MAX_TRIES,
                       sent->num_linkages_found);
    }

    bool slot_needs_init = true;
    for (itry = 0; itry < maxtries; itry++)
    {
        Linkage lkg = &sent->lnkages[n_kept];

        /* Negative values tell extract-links to pick randomly; for
         * reproducible-rand, the actual value is the rand seed. */
        lkg->lifo.index = pick_randomly ? -(itry+1) : itry;

        if (slot_needs_init)
        {
            partial_init_linkage(sent, lkg, sent->length);
            slot_needs_init = false;
        }
        extract_links(pex, lkg);
        compute_link_names(lkg, sent->string_set);

        if (verbosity_level(+D_PL))
        {
            err_msg(lg_Debug, "chosen_disjuncts before:\n\\");
            print_chosen_disjuncts_words(lkg, /*prt_opt*/true);
        }

        if (!sane_linkage_morphism(sent, lkg, opts))
        {
            /* Rejected: reset the slot so the next try can reuse it. */
            n_rejected++;
            lkg->num_links = 0;
            lkg->num_words = sent->length;
            // memset(lkg->link_array, 0, lkg->lasz * sizeof(Link));
            memset(lkg->chosen_disjuncts, 0, sent->length * sizeof(Disjunct *));
            continue;
        }

        remove_empty_words(lkg);

        if (verbosity_level(+D_PL))
        {
            err_msg(lg_Debug, "chosen_disjuncts after:\n\\");
            print_chosen_disjuncts_words(lkg, /*prt_opt*/false);
        }

        /* Accepted: move on to the next slot, if there is one. */
        slot_needs_init = true;
        n_kept++;
        if (n_kept >= sent->num_linkages_alloced) break;
    }

    /* The last one was alloced, but never actually used. Free it. */
    if (!slot_needs_init) free_linkage(&sent->lnkages[n_kept]);

    sent->num_valid_linkages = n_kept;

    /* The remainder of the array is garbage; we never filled it in.
     * So just pretend that it's shorter than it is */
    sent->num_linkages_alloced = sent->num_valid_linkages;

    /* itry was not incremented if the loop was left via break. */
    lgdebug(D_PARSE, "Info: sane_morphism(): %zu of %d linkages had "
            "invalid morphology construction\n", n_rejected,
            itry + (itry != maxtries));
}