/**
 * Expand the sentence's expressions into disjuncts, remove duplicate
 * disjuncts word by word, then set connector length limits and run
 * post-processing/power pruning.
 *
 * Precondition: the sentence expression lists have already been generated.
 *
 * If the resource budget in opts->resources is exhausted while duplicates
 * are being eliminated, the function returns at once; the length-limit and
 * pruning steps are then not performed.
 *
 * @param sent  Sentence whose word[] disjunct lists are built and pruned.
 * @param opts  Parse options (disjunct cost, resources, timing output).
 */
void prepare_to_parse(Sentence sent, Parse_Options opts)
{
	size_t w = 0;

	build_sentence_disjuncts(sent, opts->disjunct_cost);
	if (verbosity > 2)
	{
		printf("After expanding expressions into disjuncts:");
		print_disjunct_counts(sent);
	}
	print_time(opts, "Built disjuncts");

	while (w < sent->length)
	{
		sent->word[w].d = eliminate_duplicate_disjuncts(sent->word[w].d);

		/* Long sentences (Russian ones in particular) can blow up at this
		 * point, so the resource budget is polled after every word. */
		if (resources_exhausted(opts->resources))
		{
			return;
		}
		w++;
	}
	print_time(opts, "Eliminated duplicate disjuncts");

	if (verbosity > 2)
	{
		printf("\nAfter expression pruning and duplicate elimination:\n");
		print_disjunct_counts(sent);
	}

	set_connector_length_limits(sent, opts);
	pp_and_power_prune(sent, opts);
}
/**
 * Build the sentence's disjuncts from its expressions, strip duplicate
 * disjuncts from every word, then record gwords in the connectors and set
 * the connectors up.
 *
 * Precondition: the sentence expression lists have already been generated.
 *
 * The function returns early, skipping everything after duplicate
 * elimination, as soon as resources_exhausted() reports that the budget
 * in opts->resources has run out.
 *
 * @param sent  Sentence whose word[] disjunct lists are built and cleaned.
 * @param opts  Parse options (disjunct cost, resources, timing output).
 */
void prepare_to_parse(Sentence sent, Parse_Options opts)
{
	build_sentence_disjuncts(sent, opts->disjunct_cost, opts);
	if (verbosity_level(5))
	{
		prt_error("Debug: After expanding expressions into disjuncts:\n");
		print_disjunct_counts(sent);
	}
	print_time(opts, "Built disjuncts");

	for (size_t w = 0; w < sent->length; w++)
	{
		sent->word[w].d = eliminate_duplicate_disjuncts(sent->word[w].d);

		/* Some long Russian sentences can really blow up here, hence the
		 * per-word resource check. */
		if (resources_exhausted(opts->resources))
		{
			return;
		}
	}
	print_time(opts, "Eliminated duplicate disjuncts");

	if (verbosity_level(5))
	{
		prt_error("Debug: After expression pruning and duplicate elimination:\n");
		print_disjunct_counts(sent);
	}

	gword_record_in_connector(sent);
	setup_connectors(sent);
}
/**
 * Prune the sentence's disjuncts and build the "and" (conjunction)
 * structures.
 *
 * Precondition: the sentence expression lists have been generated.
 *
 * This variant does not call build_sentence_disjuncts(); it operates on
 * whatever disjunct lists the words already carry.
 *
 * Side effect: sets the file/global flag null_links from
 * opts->min_null_count.
 *
 * @param sent  Sentence whose word[] disjunct lists are pruned in place.
 * @param opts  Parse options (null-count limits, timing output).
 */
void my_prepare_to_parse(Sentence sent, Parse_Options opts)
{
	int w;
	int has_conjunction;

	/* No disjunct construction happens here; only the timing checkpoint
	 * of that stage is kept. */
	print_time(opts, "Built disjuncts");

	w = 0;
	while (w < sent->length)
	{
		sent->word[w].d = eliminate_duplicate_disjuncts(sent->word[w].d);
		w++;
	}
	print_time(opts, "Eliminated duplicate disjuncts");

	if (verbosity > 2)
	{
		printf("\nAfter expression pruning and duplicate elimination:\n");
		print_disjunct_counts(sent);
	}

	null_links = (opts->min_null_count > 0);
	has_conjunction = sentence_contains_conjunction(sent);

	/* These are done here, rather than only in first_prepare_to_parse(),
	 * because the deletable region depends on whether null links are in
	 * use: with null_links everything is deletable. */
	set_connector_length_limits(sent, opts);
	build_deletable(sent, has_conjunction);
	build_effective_dist(sent, has_conjunction);

	if (has_conjunction)
	{
		pp_and_power_prune(sent, GENTLE, opts);

		conjunction_prune(sent, opts);
		if (verbosity > 2)
		{
			printf("\nAfter conjunction pruning:\n");
			print_disjunct_counts(sent);
			print_statistics();
		}
		print_time(opts, "Done conjunction pruning");

		build_conjunction_tables(sent);
		install_fat_connectors(sent);
		install_special_conjunctive_connectors(sent);
		if (verbosity > 2)
		{
			printf("After conjunctions, disjuncts counts:\n");
			print_disjunct_counts(sent);
		}

		/* Must be repeated because of the new fat connectors and
		 * disjuncts. */
		set_connector_length_limits(sent, opts);
		print_time(opts, "Constructed fat disjuncts");

		prune(sent);
		print_time(opts, "Pruned fat disjuncts");

		for (w = 0; w < sent->length; w++)
		{
			sent->word[w].d = eliminate_duplicate_disjuncts(sent->word[w].d);
		}
		if (verbosity > 2)
		{
			printf("After pruning and duplicate elimination:\n");
			print_disjunct_counts(sent);
		}
		print_time(opts, "Eliminated duplicate disjuncts (again)");

		if (verbosity > 2)
		{
			print_AND_statistics(sent);
		}

		power_prune(sent, RUTHLESS, opts);
	}
	else
	{
		pp_and_power_prune(sent, RUTHLESS, opts);
	}

	/* The time spent power pruning used to be printed here; it is now
	 * reported by power_prune itself. */
	print_time(opts, "Initialized fast matcher and hash table");
}
/**
 * Repeatedly sweep the sentence left-to-right and right-to-left, unlinking
 * from each word's list every disjunct whose connectors cannot be matched,
 * until a sweep neither changes nor deletes anything.
 *
 * @param sent  Sentence whose word[].d disjunct lists are pruned in place.
 * @param opts  Parse options; min_null_count decides whether null links
 *              are considered, and the options are passed to print_time().
 * @return The number of disjuncts deleted (summed over all passes).
 *
 * Implementation notes:
 * Normally all the identical disjunct-jets are memory shared.
 * The suffix_id of each connector serves as its reference count
 * in the power table. Each time a connector that cannot match
 * is discovered, its reference count is decreased, and its
 * nearest_word field is assigned BAD_WORD. Due to the memory sharing,
 * each such assignment immediately affects all the identical
 * disjunct-jets.
 *
 * Disjuncts removed here are only unlinked from the list; nothing in this
 * function frees them.
 */
static int power_prune(Sentence sent, Parse_Options opts)
{
	power_table pt;
	prune_context pc;
	int N_deleted[2] = {0}; /* [0] counts first deletions, [1] counts dups. */
	int total_deleted = 0;

	power_table_alloc(sent, &pt);
	power_table_init(sent, &pt);

	pc.pt = &pt;
	pc.power_cost = 0;
	pc.null_links = (opts->min_null_count > 0);
	pc.N_changed = 1; /* forces it always to make at least two passes */
	pc.sent = sent;

	while (1)
	{
		/* left-to-right pass */
		for (WordIdx w = 0; w < sent->length; w++)
		{
			/* dd always points at the link that holds the current disjunct,
			 * so a disjunct can be unlinked by overwriting *dd. The loop
			 * header has no increment; every path advances at a NEXT mark. */
			for (Disjunct **dd = &sent->word[w].d; *dd != NULL; /* See: NEXT */)
			{
				Disjunct *d = *dd; /* just for convenience */
				/* Nothing to check on the left side of this disjunct. */
				if (d->left == NULL)
				{
					dd = &d->next; /* NEXT */
					continue;
				}

				/* The head left connector was already marked unusable
				 * (counted in N_deleted[1]). */
				bool is_bad = d->left->nearest_word == BAD_WORD;

				/* Short-circuit: the update helper is only called when the
				 * connector is not already bad; a negative result means the
				 * disjunct is to be discarded. */
				if (is_bad || left_connector_list_update(&pc, d->left, w, true) < 0)
				{
					mark_connector_sequence_for_dequeue(d->left, true);
					mark_connector_sequence_for_dequeue(d->right, false);

					/* discard the current disjunct */
					*dd = d->next; /* NEXT - set current disjunct to the next one */
					N_deleted[(int)is_bad]++;
					continue;
				}

				dd = &d->next; /* NEXT */
			}

			clean_table(pt.r_table_size[w], pt.r_table[w]);
		}

		total_deleted += N_deleted[0] + N_deleted[1];
		lgdebug(D_PRUNE, "Debug: l->r pass changed %d and deleted %d (%d+%d)\n",
		        pc.N_changed, N_deleted[0]+N_deleted[1], N_deleted[0], N_deleted[1]);

		/* Stop once a pass made no change and deleted nothing.
		 * NOTE(review): N_changed is presumably incremented by the
		 * *_connector_list_update helpers through pc - confirm. */
		if (pc.N_changed == 0 && N_deleted[0] == 0 && N_deleted[1] == 0) break;
		pc.N_changed = N_deleted[0] = N_deleted[1] = 0;

		/* right-to-left pass */
		/* w counts down; the loop ends when the decrement wraps w around
		 * to (WordIdx) -1. */
		for (WordIdx w = sent->length-1; w != (WordIdx) -1; w--)
		{
			for (Disjunct **dd = &sent->word[w].d; *dd != NULL; /* See: NEXT */)
			{
				Disjunct *d = *dd; /* just for convenience */
				/* Nothing to check on the right side of this disjunct. */
				if (d->right == NULL)
				{
					dd = &d->next; /* NEXT */
					continue;
				}

				bool is_bad = d->right->nearest_word == BAD_WORD;

				/* Mirror of the left pass: here a result at or beyond
				 * sent->length means the disjunct is to be discarded. */
				if (is_bad || right_connector_list_update(&pc, d->right, w, true) >= sent->length)
				{
					mark_connector_sequence_for_dequeue(d->right, true);
					mark_connector_sequence_for_dequeue(d->left, false);

					/* Discard the current disjunct. */
					*dd = d->next; /* NEXT - set current disjunct to the next one */
					N_deleted[(int)is_bad]++;
					continue;
				}

				dd = &d->next; /* NEXT */
			}

			clean_table(pt.l_table_size[w], pt.l_table[w]);
		}

		total_deleted += N_deleted[0] + N_deleted[1];
		lgdebug(D_PRUNE, "Debug: r->l pass changed %d and deleted %d (%d+%d)\n",
		        pc.N_changed, N_deleted[0]+N_deleted[1], N_deleted[0], N_deleted[1]);

		if (pc.N_changed == 0 && N_deleted[0] == 0 && N_deleted[1] == 0) break;
		pc.N_changed = N_deleted[0] = N_deleted[1] = 0;
	}

	power_table_delete(&pt);

	lgdebug(D_PRUNE, "Debug: power prune cost: %d\n", pc.power_cost);

	print_time(opts, "power pruned");
	if (verbosity_level(D_PRUNE))
	{
		prt_error("\n\\");
		prt_error("Debug: After power_pruning:\n\\");
		print_disjunct_counts(sent);
	}

	/* Debug builds verify that no surviving disjunct still carries a
	 * connector marked BAD_WORD. */
#ifdef DEBUG
	for (WordIdx w = 0; w < sent->length; w++)
	{
		for (Disjunct *d = sent->word[w].d; NULL != d; d = d->next)
		{
			for (Connector *c = d->left; NULL != c; c = c->next)
				assert(c->nearest_word != BAD_WORD);
			for (Connector *c = d->right; NULL != c; c = c->next)
				assert(c->nearest_word != BAD_WORD);
		}
	}
#endif

	return total_deleted;
}
/**
 * Remove disjuncts that trigger a post-processing "contains one" rule
 * which the connectors remaining in the sentence cannot satisfy.
 *
 * All connector strings of all disjuncts are first entered into a multiset
 * table. Then, for each disjunct still marked, every contains-one rule
 * whose selector has no wildcard is tried: if one of the disjunct's
 * connectors matches the rule's selector and rule_satisfiable() fails for
 * the rule's link set, the disjunct is unmarked, the rule's use_count is
 * bumped, and the disjunct's connector strings are removed from the table.
 * Passes repeat for as long as a removal reports a change in the table.
 * Unmarked disjuncts are physically deleted at the end.
 *
 * @param sent  Sentence whose disjunct lists are pruned in place.
 * @param opts  Parse options; perform_pp_prune enables the pruning.
 * @return The number of disjuncts deleted; 0 when the sentence has no
 *         postprocessor or pp pruning is disabled.
 */
static int pp_prune(Sentence sent, Parse_Options opts)
{
	if (sent->postprocessor == NULL || !opts->perform_pp_prune) return 0;

	pp_knowledge *kb = sent->postprocessor->knowledge;
	multiset_table *seen = cms_table_new();
	int total_deleted = 0;
	bool again;

	/* Mark every disjunct as alive and register its connectors
	 * (right-hand list first, then the left-hand list). */
	for (size_t w = 0; w < sent->length; w++)
	{
		for (Disjunct *d = sent->word[w].d; d != NULL; d = d->next)
		{
			d->marked = true;
			for (int dir = 0; dir < 2; dir++)
			{
				Connector *c = (dir == 0) ? d->right : d->left;
				for (; c != NULL; c = c->next)
				{
					insert_in_cms_table(seen, connector_string(c));
				}
			}
		}
	}

	do
	{
		int pass_deleted = 0;
		again = false;

		for (size_t w = 0; w < sent->length; w++)
		{
			for (Disjunct *d = sent->word[w].d; d != NULL; d = d->next)
			{
				bool doomed = false;

				if (!d->marked) continue;

				/* Each loop below stops as soon as the disjunct is doomed. */
				for (size_t r = 0; !doomed && r < kb->n_contains_one_rules; r++)
				{
					pp_rule *rule = &kb->contains_one_rules[r];

					/* Rules whose selector has a '*' are not handled. */
					if (rule->selector_has_wildcard) continue;

					for (int dir = 0; !doomed && dir < 2; dir++)
					{
						Connector *c = (dir == 0) ? d->right : d->left;
						for (; !doomed && c != NULL; c = c->next)
						{
							if (!post_process_match(rule->selector, connector_string(c)))
							{
								continue;
							}

							/* c matches the rule's trigger link; now check
							 * the criterion links. */
							if (!rule_satisfiable(seen, rule->link_set))
							{
								doomed = true;
								rule->use_count++;
							}
						}
					}
				}

				if (!doomed) continue;

				pass_deleted++;
				total_deleted++;
				d->marked = false; /* physically deleted later */

				/* Every connector is removed from the table, regardless of
				 * what earlier removals returned. */
				for (int dir = 0; dir < 2; dir++)
				{
					Connector *c = (dir == 0) ? d->right : d->left;
					for (; c != NULL; c = c->next)
					{
						if (delete_from_cms_table(seen, connector_string(c)))
						{
							again = true;
						}
					}
				}
			}
		}

		lgdebug(D_PRUNE, "Debug: pp_prune pass deleted %d\n", pass_deleted);
	}
	while (again);

	cms_table_delete(seen);

	if (total_deleted > 0)
	{
		delete_unmarked_disjuncts(sent);
		if (verbosity_level(D_PRUNE))
		{
			prt_error("\n\\");
			prt_error("Debug: After pp_prune:\n\\");
			print_disjunct_counts(sent);
		}
	}

	print_time(opts, "pp pruning");
	return total_deleted;
}