/**
 * Build the per-word disjunct lists from the already-generated
 * expression lists, remove duplicate disjuncts, and set up the
 * connector tables.
 * Assumes that the sentence expression lists have been generated.
 */
void prepare_to_parse(Sentence sent, Parse_Options opts)
{
	build_sentence_disjuncts(sent, opts->disjunct_cost, opts);

	if (verbosity_level(D_PREP))
	{
		prt_error("Debug: After expanding expressions into disjuncts:\n");
		print_disjunct_counts(sent);
	}
	print_time(opts, "Built disjuncts");

	for (size_t w = 0; w < sent->length; w++)
	{
		sent->word[w].d = eliminate_duplicate_disjuncts(sent->word[w].d);

		/* Some long Russian sentences can really blow up, here. */
		if (resources_exhausted(opts->resources)) return;
	}
	print_time(opts, "Eliminated duplicate disjuncts");

	if (verbosity_level(D_PREP))
	{
		prt_error("Debug: After expression pruning and duplicate elimination:\n");
		print_disjunct_counts(sent);
	}

	setup_connectors(sent);
}
/**
 * Read the link set marked by `label` in the knowledge file into a
 * newly-opened pp_linkset, whose handle is returned.  An absent label
 * is not an error: the set is taken to be empty (with a warning).
 * On a lexer failure &LINK_SET_ERROR is returned.
 */
static pp_linkset *read_link_set(pp_knowledge *k, const char *label, String_set *ss)
{
	int count = 0;

	if (pp_lexer_set_label(k->lt, label))
	{
		count = pp_lexer_count_tokens_of_label(k->lt);
		if (-1 == count) return &LINK_SET_ERROR;
	}
	else
	{
		if (verbosity_level(+D_PPK))
			prt_error("Warning: File %s: Link set %s not defined: assuming empty\n",
			          k->path, label);
	}

	pp_linkset *result = pp_linkset_open(count);
	for (int t = 0; t < count; t++)
		pp_linkset_add(result,
		               string_set_add(pp_lexer_get_next_token_of_label(k->lt), ss));

	return result;
}
/**
 * Read the 'form a cycle' rules from the knowledge file into
 * k->form_a_cycle_rules (an array terminated by a sentinel entry whose
 * msg is NULL).  Each rule is a link set followed by a one-token error
 * message.  Returns false on a syntax error in the file.
 */
static bool read_form_a_cycle_rules(pp_knowledge *k, const char *label)
{
	size_t n_commas, n_tokens;
	size_t r, i;
	pp_linkset *lsHandle;
	const char **tokens;

	if (!pp_lexer_set_label(k->lt, label))
	{
		k->n_form_a_cycle_rules = 0;
		if (verbosity_level(+D_PPK))
			prt_error("Warning: File %s: Not using any 'form a cycle' rules\n",
			          k->path);
	}
	else
	{
		n_commas = pp_lexer_count_commas_of_label(k->lt);
		k->n_form_a_cycle_rules = (n_commas + 1)/2;
	}
	k->form_a_cycle_rules =
		(pp_rule*) malloc ((1+k->n_form_a_cycle_rules)*sizeof(pp_rule));

	for (r = 0; r < k->n_form_a_cycle_rules; r++)
	{
		/* read link set */
		tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
		if (n_tokens == 0)
		{
			prt_error("Error: File %s: Syntax error\n", k->path);
			return false;
		}
		lsHandle = pp_linkset_open(n_tokens);
		for (i = 0; i < n_tokens; i++)
			pp_linkset_add(lsHandle, string_set_add(tokens[i], k->string_set));
		k->form_a_cycle_rules[r].link_set = lsHandle;

		/* read error message */
		tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
		/* Exactly one token is expected here.  Testing != 1 (rather than
		 * > 1, as before) also rejects an empty group, which would
		 * otherwise read tokens[0] out of bounds; this matches the check
		 * used by read_bounded_rules(). */
		if (n_tokens != 1)
		{
			prt_error("Error: File %s: Invalid syntax (rule %zu of %s)\n",
			          k->path, r+1, label);
			return false;
		}
		k->form_a_cycle_rules[r].msg = string_set_add(tokens[0], k->string_set);
		k->form_a_cycle_rules[r].use_count = 0;
	}

	/* sentinel entry */
	k->form_a_cycle_rules[k->n_form_a_cycle_rules].msg = 0;
	k->form_a_cycle_rules[k->n_form_a_cycle_rules].use_count = 0;

	return true;
}
/**
 * Read the 'bounded' rules from the knowledge file into
 * k->bounded_rules (terminated by a sentinel entry whose msg is NULL).
 * Each rule consists of a single-token domain and a single-token error
 * message.  Returns false on a syntax error in the file.
 */
static bool read_bounded_rules(pp_knowledge *k, const char *label)
{
	if (!pp_lexer_set_label(k->lt, label))
	{
		k->n_bounded_rules = 0;
		if (verbosity_level(+D_PPK))
			prt_error("Warning: File %s: Not using any 'bounded' rules\n", k->path);
	}
	else
	{
		size_t n_commas = pp_lexer_count_commas_of_label(k->lt);
		k->n_bounded_rules = (n_commas + 1)/2;
	}

	k->bounded_rules = (pp_rule*) malloc ((1+k->n_bounded_rules)*sizeof(pp_rule));

	for (size_t ri = 0; ri < k->n_bounded_rules; ri++)
	{
		pp_rule *rule = &k->bounded_rules[ri];
		const char **tokens;
		size_t n_tokens;

		/* read domain */
		tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
		if (n_tokens != 1)
		{
			prt_error("Error: File %s: Invalid syntax: rule %zu of %s\n",
			          k->path, ri+1, label);
			return false;
		}
		rule->domain = (int) tokens[0][0];

		/* read error message */
		tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
		if (n_tokens != 1)
		{
			prt_error("Error: File %s: Invalid syntax: rule %zu of %s\n",
			          k->path, ri+1, label);
			return false;
		}
		rule->msg = string_set_add(tokens[0], k->string_set);
		rule->use_count = 0;
	}

	/* sentinel entry */
	k->bounded_rules[k->n_bounded_rules].msg = 0;
	k->bounded_rules[k->n_bounded_rules].use_count = 0;

	return true;
}
/**
 * Read the 'contains_one_rules' or the 'contains_none_rules' (selected
 * by `label`) into a freshly malloc'ed array returned via *rules;
 * *nRules receives the rule count.  The array is terminated by a
 * sentinel entry whose msg is NULL.  Each rule is: a one-token selector
 * link, a link set, and a one-token error message.
 * Returns false on a syntax error in the file.
 */
static bool read_contains_rules(pp_knowledge *k, const char *label,
                                pp_rule **rules, size_t *nRules)
{
	size_t n_tokens, i, r;
	int n_commas;
	const char *p;
	const char **tokens;

	if (!pp_lexer_set_label(k->lt, label))
	{
		*nRules = 0;
		if (verbosity_level(+D_PPK))
			prt_error("Warning: File %s: Not using any %s rules\n", k->path, label);
	}
	else
	{
		n_commas = pp_lexer_count_commas_of_label(k->lt);
		if (-1 == n_commas) return false;
		*nRules = (n_commas + 1)/3;
	}
	*rules = (pp_rule*) malloc ((1+*nRules)*sizeof(pp_rule));

	for (r = 0; r < *nRules; r++)
	{
		/* first read link */
		tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
		/* Exactly one token is expected.  Testing != 1 (rather than > 1,
		 * as before) also rejects an empty group, which would otherwise
		 * read tokens[0] out of bounds; this matches the check used by
		 * read_bounded_rules(). */
		if (n_tokens != 1)
		{
			prt_error("Error: File %s: Invalid syntax in %s (rule %zu)\n",
			          k->path, label, r+1);
			return false;
		}
		(*rules)[r].selector = string_set_add(tokens[0], k->string_set);

		/* read link set */
		tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
		(*rules)[r].link_set = pp_linkset_open(n_tokens);
		(*rules)[r].link_set_size = n_tokens;
		(*rules)[r].link_array =
			(const char **) malloc((1+n_tokens)*sizeof(const char*));
		for (i = 0; i < n_tokens; i++)
		{
			p = string_set_add(tokens[i], k->string_set);
			pp_linkset_add((*rules)[r].link_set, p);
			(*rules)[r].link_array[i] = p;
		}
		(*rules)[r].link_array[i] = 0; /* NULL-terminator */

		/* read error message */
		tokens = pp_lexer_get_next_group_of_tokens_of_label(k->lt, &n_tokens);
		/* Same as the selector above: an empty group must be an error. */
		if (n_tokens != 1)
		{
			prt_error("Error: File %s: Invalid syntax in %s (rule %zu)\n",
			          k->path, label, r+1);
			return false;
		}
		(*rules)[r].msg = string_set_add(tokens[0], k->string_set);
		(*rules)[r].use_count = 0;
	}

	/* sentinel entry */
	(*rules)[*nRules].msg = 0;
	(*rules)[*nRules].use_count = 0;

	return true;
}
/**
 * Open `filename` via the caller-supplied `opencb` callback, searching
 * the standard dictionary locations when the name is relative.
 *
 * The directory in which a dictionary was first found is cached in
 * per-thread storage (path_found) and searched first on later calls.
 * Calling with filename==NULL invalidates that cache and returns NULL.
 *
 * @param filename   File to open, absolute or relative; NULL to reset cache.
 * @param opencb     Callback that actually opens the object; returns NULL
 *                   if the file does not exist.
 * @param user_data  Opaque data passed through to opencb.
 * @return Whatever opencb returned (NULL if nothing could be opened).
 */
void * object_open(const char *filename,
                   void * (*opencb)(const char *, const void *),
                   const void * user_data)
{
	/* Dictionary data directory path cache -- per-thread storage. */
	static TLS char *path_found;
	char *completename = NULL;
	void *fp = NULL;
	char *data_dir = NULL;
	const char **path = NULL;

	if (NULL == filename)
	{
		/* Invalidate the dictionary data directory path cache. */
		char *pf = path_found;
		path_found = NULL;
		free(pf);
		return NULL;
	}

	if (NULL == path_found)
	{
		/* No cached location yet; fetch the fallback data directory. */
		data_dir = dictionary_get_data_dir();
		if (verbosity_level(D_USER_FILES))
		{
			char cwd[MAX_PATH_NAME];
			char *cwdp = getcwd(cwd, sizeof(cwd));
			prt_error("Debug: Current directory: %s\n", NULL == cwdp ? "NULL": cwdp);
			prt_error("Debug: Last-resort data directory: %s\n",
			          data_dir ? data_dir : "NULL");
		}
	}

	/* Look for absolute filename.
	 * Unix: starts with leading slash.
	 * Windows: starts with C:\ except that the drive letter may differ. */
	if ((filename[0] == '/')
#ifdef _WIN32
	    || ((filename[1] == ':') &&
	        ((filename[2] == '\\') || (filename[2] == '/')))
	    || (filename[0] == '\\') /* UNC path */
#endif /* _WIN32 */
	   )
	{
		/* opencb() returns NULL if the file does not exist. */
		fp = opencb(filename, user_data);
		lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", filename, NOTFOUND(fp));
	}
	else
	{
		/* A path list in which to search for dictionaries.
		 * path_found, data_dir or DEFAULTPATH may be NULL. */
		const char *dictpath[] =
		{
			path_found,
			".",
			"." DIR_SEPARATOR "data",
			"..",
			".." DIR_SEPARATOR "data",
			data_dir,
			DEFAULTPATH,
		};
		size_t i = sizeof(dictpath)/sizeof(dictpath[0]);

		for (path = dictpath; i-- > 0; path++)
		{
			if (NULL == *path) continue;

			free(completename);
			completename = join_path(*path, filename);
			fp = opencb(completename, user_data);
			lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n",
			        completename, NOTFOUND(fp));
			/* If we have a cached location, only that first entry is
			 * tried: stop after it whether it succeeded or not. */
			if ((NULL != fp) || (NULL != path_found)) break;
		}
	}

	if (NULL == fp)
	{
		/* Last resort: try the name as-is. */
		fp = opencb(filename, user_data);
		lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", filename, NOTFOUND(fp));
	}
	else if (NULL == path_found)
	{
		/* First success: cache the directory two levels above the file
		 * (strip the filename and one subdirectory component). */
		char *pfnd = strdup((NULL != completename) ? completename : filename);
		if ((0 < verbosity) && (dict_file_open == opencb))
			prt_error("Info: Dictionary found at %s\n", pfnd);
		for (size_t i = 0; i < 2; i++)
		{
			char *root = strrchr(pfnd, DIR_SEPARATOR[0]);
			if (NULL != root) *root = '\0';
		}
		path_found = pfnd;
	}

	free(data_dir);
	free(completename);
	return fp;
}
/**
 * This fills the linkage array with morphologically-acceptable
 * linkages.
 *
 * Each slot of sent->lnkages is (re)used as a scratch linkage until a
 * morphologically sane one lands in it (need_init tracks whether the
 * current slot must be re-initialized).  `in` counts accepted linkages,
 * `itry` counts extraction attempts.
 */
static void process_linkages(Sentence sent, extractor_t* pex,
                             bool overflowed, Parse_Options opts)
{
	if (0 == sent->num_linkages_found) return;
	if (0 == sent->num_linkages_alloced) return; /* Avoid a later crash. */

	/* Pick random linkages if we get more than what was asked for. */
	bool pick_randomly = overflowed ||
	    (sent->num_linkages_found > (int) sent->num_linkages_alloced);

	sent->num_valid_linkages = 0;
	size_t N_invalid_morphism = 0;

	int itry = 0;
	size_t in = 0;
	int maxtries;

	/* In the case of overflow, which will happen for some long
	 * sentences, but is particularly common for the amy/ady random
	 * splitters, we want to find as many morpho-acceptable linkages
	 * as possible, but keep the CPU usage down, as these might be
	 * very rare. This is due to a bug/feature in the interaction
	 * between the word-graph and the parser: valid morph linkages
	 * can be one-in-a-thousand.. or worse.  Search for them, but
	 * don't over-do it.
	 * Note: This problem has recently been alleviated by an
	 * alternatives-compatibility check in the fast matcher - see
	 * alt_connection_possible().
	 */
#define MAX_TRIES 250000
	if (pick_randomly)
	{
		/* Try picking many more linkages, but not more than possible. */
		maxtries = MIN((int) sent->num_linkages_alloced + MAX_TRIES,
		               sent->num_linkages_found);
	}
	else
	{
		maxtries = sent->num_linkages_alloced;
	}

	bool need_init = true;
	for (itry=0; itry<maxtries; itry++)
	{
		Linkage lkg = &sent->lnkages[in];
		Linkage_info * lifo = &lkg->lifo;

		/* Negative values tell extract-links to pick randomly; for
		 * reproducible-rand, the actual value is the rand seed. */
		lifo->index = pick_randomly ? -(itry+1) : itry;

		if (need_init)
		{
			partial_init_linkage(sent, lkg, sent->length);
			need_init = false;
		}
		extract_links(pex, lkg);
		compute_link_names(lkg, sent->string_set);

		if (verbosity_level(+D_PL))
		{
			err_msg(lg_Debug, "chosen_disjuncts before:\n\\");
			print_chosen_disjuncts_words(lkg, /*prt_opt*/true);
		}

		if (sane_linkage_morphism(sent, lkg, opts))
		{
			/* Accepted: keep this slot and advance to the next one. */
			remove_empty_words(lkg);

			if (verbosity_level(+D_PL))
			{
				err_msg(lg_Debug, "chosen_disjuncts after:\n\\");
				print_chosen_disjuncts_words(lkg, /*prt_opt*/false);
			}

			need_init = true;
			in++;
			if (in >= sent->num_linkages_alloced) break;
		}
		else
		{
			/* Rejected: wipe the slot so the next try can reuse it. */
			N_invalid_morphism++;
			lkg->num_links = 0;
			lkg->num_words = sent->length;
			// memset(lkg->link_array, 0, lkg->lasz * sizeof(Link));
			memset(lkg->chosen_disjuncts, 0, sent->length * sizeof(Disjunct *));
		}
	}

	/* The last one was alloced, but never actually used. Free it. */
	if (!need_init) free_linkage(&sent->lnkages[in]);

	sent->num_valid_linkages = in;

	/* The remainder of the array is garbage; we never filled it in.
	 * So just pretend that it's shorter than it is */
	sent->num_linkages_alloced = sent->num_valid_linkages;

	lgdebug(D_PARSE, "Info: sane_morphism(): %zu of %d linkages had "
	        "invalid morphology construction\n", N_invalid_morphism,
	        itry + (itry != maxtries));
}
/**
 * Compute the array of displayable word strings for this linkage
 * (linkage->word) from the chosen disjuncts, and build a matching
 * Wordgraph display path (linkage->wg_path_display).
 *
 * Null words (chosen disjunct == NULL) are accumulated into "null
 * blocks" and rendered inside NULLWORD_START/NULLWORD_END brackets;
 * when morphology display is suppressed (HIDE_MORPHO), morpheme
 * subwords of one alternative are joined back into a single word and
 * morphology links are dropped.  Guess marks ("[...]") are appended
 * where the word was not found in the dictionary.
 */
void compute_chosen_words(Sentence sent, Linkage linkage, Parse_Options opts)
{
	WordIdx i;   /* index of chosen_words */
	WordIdx j;
	Disjunct **cdjp = linkage->chosen_disjuncts;
	const char **chosen_words = alloca(linkage->num_words * sizeof(*chosen_words));
	int *remap = alloca(linkage->num_words * sizeof(*remap));
	bool *show_word = alloca(linkage->num_words * sizeof(*show_word));
	bool display_morphology = opts->display_morphology;

	Gword **lwg_path = linkage->wg_path;
	Gword **n_lwg_path = NULL; /* new Wordgraph path, to match chosen_words */

	Gword **nullblock_start = NULL; /* start of a null block, to be put in [] */
	size_t nbsize = 0;              /* number of word in a null block */
	Gword *sentence_word;

	memset(show_word, 0, linkage->num_words * sizeof(*show_word));

	if (verbosity_level(D_CCW))
		print_lwg_path(lwg_path, "Linkage");

	for (i = 0; i < linkage->num_words; i++)
	{
		Disjunct *cdj = cdjp[i];
		Gword *w;              /* current word */
		const Gword *nw;       /* next word (NULL if none) */
		Gword **wgp;           /* wordgraph_path traversing pointer */

		const char *t = NULL;  /* current word string */
		bool at_nullblock_end; /* current word is at end of a nullblock */
		bool join_alt = false; /* morpheme-join this alternative */
		char *s;
		size_t l;
		size_t m;

		lgdebug(D_CCW, "Loop start, word%zu: cdj %s, path %s\n",
		        i, cdj ? cdj->word_string : "NULL",
		        lwg_path[i] ? lwg_path[i]->subword : "NULL");

		w = lwg_path[i];
		nw = lwg_path[i+1];
		wgp = &lwg_path[i];
		sentence_word = wg_get_sentence_word(sent, w);

		/* FIXME If the original word was capitalized in a capitalizable
		 * position, the displayed null word may be its downcase version. */

		if (NULL == cdj) /* a null word (the chosen disjunct was NULL) */
		{
			chosen_words[i] = NULL;
			nbsize++;
			if (NULL == nullblock_start) /* it starts a new null block */
				nullblock_start = wgp;

			at_nullblock_end = (NULL == nw) ||
				(wg_get_sentence_word(sent, nw->unsplit_word) != sentence_word);

			/* Accumulate null words in this alternative */
			if (!at_nullblock_end && (NULL == cdjp[i+1]) &&
			    ((w->morpheme_type == MT_PUNC) == (nw->morpheme_type == MT_PUNC)))
			{
				lgdebug(D_CCW, "Skipping word%zu cdjp=NULL#%zu, path %s\n",
				        i, nbsize, w->subword);
				chosen_words[i] = NULL;
				continue;
			}

			if (NULL != nullblock_start)
			{
				/* If we are here, this null word is an end of a null block */
				lgdebug(+D_CCW, "Handling %zu null words at %zu: ", nbsize, i);

				if (1 == nbsize)
				{
					/* Case 1: A single null subword. */
					lgdebug(D_CCW, "A single null subword.\n");
					t = join_null_word(sent, wgp, nbsize);

					gwordlist_append(&n_lwg_path, w);
				}
				else
				{
					lgdebug(D_CCW, "Combining null subwords");
					/* Use alternative_id to check for start of alternative. */
					if (((*nullblock_start)->alternative_id == *nullblock_start)
					    && at_nullblock_end)
					{
						/* Case 2: A null unsplit_word (all-nulls alternative).*/
						lgdebug(D_CCW, " (null alternative)\n");
						t = sentence_word->subword;

						gwordlist_append(&n_lwg_path, sentence_word);
					}
					else
					{
						/* Case 3: Join together >=2 null morphemes. */
						Gword *wgnull;

						lgdebug(D_CCW, " (null partial word)\n");
						wgnull = wordgraph_null_join(sent, wgp-nbsize+1, wgp);
						gwordlist_append(&n_lwg_path, wgnull);
						t = wgnull->subword;
					}
				}

				nullblock_start = NULL;
				nbsize = 0;
				show_word[i] = true;

				if (MT_WALL != w->morpheme_type)
				{
					/* Put brackets around the null word. */
					l = strlen(t) + 2;
					s = (char *) alloca(l+1);
					s[0] = NULLWORD_START;
					strcpy(&s[1], t);
					s[l-1] = NULLWORD_END;
					s[l] = '\0';
					t = string_set_add(s, sent->string_set);
					lgdebug(D_CCW, " %s\n", t);
					/* Null words have no links, so take care not to drop them. */
				}
			}
		}
		else
		{
			/* This word has a linkage. */

			/* TODO: Suppress "virtual-morphemes", currently the dictcap ones. */
			char *sm;

			t = cdj->word_string;
			/* Print the subscript, as in "dog.n" as opposed to "dog". */
			if (0)
			{ /* TODO */ }
			else
			{
				/* Get rid of those ugly ".Ixx" */
				if (is_idiom_word(t))
				{
					s = strdupa(t);
					sm = strrchr(s, SUBSCRIPT_MARK); /* Possible double subscript. */
					UNREACHABLE(NULL == sm); /* We know it has a subscript. */
					*sm = '\0';
					t = string_set_add(s, sent->string_set);
				}
				else if (HIDE_MORPHO)
				{
					/* Concatenate the word morphemes together into one word.
					 * Concatenate their subscripts into one subscript.
					 * Use subscript separator SUBSCRIPT_SEP.
					 * XXX Check whether we can encounter an idiom word here.
					 * FIXME Combining contracted words is not handled yet, because
					 * combining morphemes which have non-LL links to other words is
					 * not yet implemented.
					 * FIXME Move to a separate function. */
					Gword **wgaltp;
					size_t join_len = 0;
					size_t mcnt = 0;

					/* If the alternative contains morpheme subwords, mark it
					 * for joining... */

					const Gword *unsplit_word = w->unsplit_word;
					for (wgaltp = wgp, j = i; NULL != *wgaltp; wgaltp++, j++)
					{
						if ((*wgaltp)->unsplit_word != unsplit_word) break;
						if (MT_INFRASTRUCTURE ==
						    (*wgaltp)->unsplit_word->morpheme_type) break;
						mcnt++;

						if (NULL == cdjp[j])
						{
							/* ... but not if it contains a null word */
							join_alt = false;
							break;
						}
						join_len += strlen(cdjp[j]->word_string) + 1;
						if ((*wgaltp)->morpheme_type & IS_REG_MORPHEME)
							join_alt = true;
					}

					if (join_alt)
					{
						/* Join it in two steps: 1. Base words. 2. Subscripts.
						 * FIXME? Can be done in one step (more efficient but maybe
						 * less clear).
						 * Put SUBSCRIPT_SEP between the subscripts.
						 * XXX No 1-1 correspondence between the hidden base words
						 * and the subscripts after the join, in case there are base
						 * words with and without subscripts. */

						const char subscript_sep_str[] = { SUBSCRIPT_SEP, '\0'};

						char *join = calloc(join_len + 1, 1); /* zeroed out */

						join[0] = '\0';

						/* 1. Join base words. (Could just use the unsplit_word.) */
						for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
						{
							add_morpheme_unmarked(sent, join, cdjp[i+m]->word_string,
							                      (*wgaltp)->morpheme_type);
						}

						strcat(join, subscript_mark_str()); /* tentative */

						/* 2. Join subscripts. */
						for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
						{
							/* Cannot NULLify the word - we may have links to it. */
							if (m != mcnt-1) chosen_words[i+m] = "";

							sm = strchr(cdjp[i+m]->word_string, SUBSCRIPT_MARK);

							if (NULL != sm)
							{
								/* Supposing stem subscript is .=x (x optional) */
								if (MT_STEM == (*wgaltp)->morpheme_type)
								{
									sm += 1 + STEM_MARK_L; /* sm+strlen(".=") */
									if ('\0' == *sm) sm = NULL;
#if 0
									if ((cnt-1) == m)
									{
										/* Support a prefix-stem combination. In that case
										 * we have just nullified the combined word, so we
										 * need to move it to the position of the prefix.
										 * FIXME: May still not be good enough. */
										move_combined_word = i+m-1;

										/* And the later chosen_word assignment should be:
										 * chosen_words[-1 != move_combined_word ?
										 *    move_combined_word : i] = t; */
									}
									else
									{
										move_combined_word = -1;
									}
#endif
								}
							}
							if (NULL != sm)
							{
								strcat(join, sm+1);
								strcat(join, subscript_sep_str);
							}
						}

						/* Remove an extra mark, if any */
						join_len = strlen(join);
						if ((SUBSCRIPT_SEP == join[join_len-1]) ||
						    (SUBSCRIPT_MARK == join[join_len-1]))
							join[join_len-1] = '\0';

						gwordlist_append(&n_lwg_path, sentence_word);
						t = string_set_add(join, sent->string_set);
						free(join);

						i += mcnt-1;
					}
				}
			}

			if (!join_alt) gwordlist_append(&n_lwg_path, *wgp);

			/*
			 * Add guess marks in [] square brackets, if needed, at the
			 * end of the base word. Convert the badly-printing
			 * SUBSCRIPT_MARK (hex 03 or ^C) into a period.
			 */
			if (t)
			{
				s = strdupa(t);
				sm = strrchr(s, SUBSCRIPT_MARK);
				if (sm) *sm = SUBSCRIPT_DOT;

				if ((!(w->status & WS_GUESS) && (w->status & WS_INDICT))
				    || !DISPLAY_GUESS_MARKS)
				{
					t = string_set_add(s, sent->string_set);
				}
				else
				{
					const char *regex_name = w->regex_name;
					/* 4 = 1(null) + 1(guess_mark) + 2 (sizeof "[]") */
					int baselen = NULL == sm ? strlen(t) : (size_t)(sm-s);
					char guess_mark = 0;

					switch (w->status & WS_GUESS)
					{
						case WS_SPELL:
							guess_mark = GM_SPELL;
							break;
						case WS_RUNON:
							guess_mark = GM_RUNON;
							break;
						case WS_REGEX:
							guess_mark = GM_REGEX;
							break;
						case 0:
							guess_mark = GM_UNKNOWN;
							break;
						default:
							assert(0, "Missing 'case: %2x'", w->status & WS_GUESS);
					}

					/* In the case of display_morphology==0, the guess indication of
					 * the last subword is used as the guess indication of the whole
					 * word.
					 * FIXME? The guess indications of other subwords are ignored in
					 * this mode. This implies that if a first or middle subword has
					 * a guess indication but the last subword doesn't have, no guess
					 * indication would be shown at all. */

					if ((NULL == regex_name) || HIDE_MORPHO) regex_name = "";
					s = alloca(strlen(t) + strlen(regex_name) + 4);
					strncpy(s, t, baselen);
					s[baselen] = '[';
					s[baselen + 1] = guess_mark;
					strcpy(s + baselen + 2, regex_name);
					strcat(s, "]");
					if (NULL != sm) strcat(s, sm);
					t = string_set_add(s, sent->string_set);
				}
			}
		}

		assert(t != NULL, "Word %zu: NULL", i);
		chosen_words[i] = t;
	}

	/* Conditional test removal of quotation marks and the "capdict" tokens,
	 * to facilitate using diff on sentence batch runs. */
	if (test_enabled("removeZZZ"))
	{
		for (i=0, j=0; i<linkage->num_links; i++)
		{
			Link *lnk = &(linkage->link_array[i]);

			if (0 == strcmp("ZZZ", lnk->link_name))
				chosen_words[lnk->rw] = NULL;
		}
	}

	/* If morphology printing is being suppressed, then all links
	 * connecting morphemes will be discarded. */
	if (HIDE_MORPHO)
	{
		/* Discard morphology links. */
		for (i=0; i<linkage->num_links; i++)
		{
			Link * lnk = &linkage->link_array[i];

			if (is_morphology_link(lnk->link_name))
			{
				/* Mark link for discarding. */
				lnk->link_name = NULL;
			}
			else
			{
				/* Mark word for not discarding. */
				show_word[lnk->rw] = true;
				show_word[lnk->lw] = true;
			}
		}
	}

	/* We alloc a little more than needed, but so what... */
	linkage->word = (const char **) exalloc(linkage->num_words*sizeof(char *));

	/* Copy over the chosen words, dropping the discarded words.
	 * However, don't discard existing words (chosen_words[i][0]).
	 * Note that if a word only has morphology links and is not combined with
	 * another word, then it will get displayed with no links at all (e.g.
	 * when explicitly specifying root and suffix for debug: root.= =suf */
	for (i=0, j=0; i<linkage->num_words; ++i)
	{
		if (chosen_words[i] &&
		    (chosen_words[i][0] || (!HIDE_MORPHO || show_word[i])))
		{
			const char *cwtmp = linkage->word[j];
			linkage->word[j] = chosen_words[i];
			chosen_words[i] = cwtmp;
			remap[i] = j;
			j++;
		}
		else
		{
			remap[i] = -1; /* word dropped; links to it get remapped away */
		}
	}
	linkage->num_words = j;

	remap_linkages(linkage, remap); /* Update linkage->link_array / num_links. */

	linkage->wg_path_display = n_lwg_path;

	if (verbosity_level(D_CCW))
		print_lwg_path(n_lwg_path, "Display");
}
/**
 * Recursively walk the expression tree `exp` of this word and record
 * every connector (with its position, direction, cost and leading-edge
 * information) into _left_connectors/_right_connectors.
 *
 * `var` is the variable-name prefix built up along the recursion
 * (extended with 'c<i>' per AND operand and 'd<i>' per OR operand);
 * leading_right/leading_left and eps_right/eps_left are in/out
 * parameters tracking whether a connector can still be the leading one
 * in its direction.  `cost` accumulates parent_cost + exp->cost down
 * the tree.  `word_xnode` tracks which X_node alternative of the word
 * the current subexpression belongs to.
 */
void WordTag::insert_connectors(Exp* exp, int& dfs_position,
                                bool& leading_right, bool& leading_left,
                                std::vector<int>& eps_right,
                                std::vector<int>& eps_left,
                                char* var, bool root, double parent_cost,
                                Exp* parent_exp, const X_node *word_xnode)
{
	double cost = parent_cost + exp->cost;

#ifdef DEBUG
	if (0 && verbosity_level(+D_IC)) { // Extreme debug
		printf("Expression type %d for Word%d, var %s:\n", exp->type, _word, var);
		printf("parent_exp: ");
		print_expression(parent_exp);
		printf("exp: ");
		print_expression(exp);
	}
#endif

	if (exp->type == CONNECTOR_type)
	{
		/* Leaf: record the connector on the appropriate side. */
		dfs_position++;

		Connector connector;
		connector.multi = exp->multi;
		connector.desc = exp->u.condesc;
		set_connector_length_limit(&connector, _opts);

		switch (exp->dir)
		{
		case '+':
			_position.push_back(_right_connectors.size());
			_dir.push_back('+');
			_right_connectors.push_back(
				PositionConnector(parent_exp, &connector, '+', _word,
				                  dfs_position, exp->cost, cost,
				                  leading_right, false,
				                  eps_right, eps_left, word_xnode));
			leading_right = false;
			break;
		case '-':
			_position.push_back(_left_connectors.size());
			_dir.push_back('-');
			_left_connectors.push_back(
				PositionConnector(parent_exp, &connector, '-', _word,
				                  dfs_position, exp->cost, cost,
				                  false, leading_left,
				                  eps_right, eps_left, word_xnode));
			leading_left = false;
			break;
		default:
			throw std::string("Unknown connector direction: ") + exp->dir;
		}
	}
	else if (exp->type == AND_type)
	{
		if (exp->u.l == NULL)
		{
			/* zeroary and */
		}
		else if (exp->u.l != NULL && exp->u.l->next == NULL)
		{
			/* unary and - skip */
			insert_connectors(exp->u.l->e, dfs_position,
			                  leading_right, leading_left,
			                  eps_right, eps_left, var, root, cost,
			                  parent_exp, word_xnode);
		}
		else
		{
			int i;
			E_list* l;

			/* Copy `var` into new_var; last_new_var points at its end so
			 * a per-operand suffix can be appended cheaply. */
			char new_var[MAX_VARIABLE_NAME];
			char* last_new_var = new_var;
			char* last_var = var;
			while ((*last_new_var = *last_var)) {
				last_new_var++;
				last_var++;
			}

			for (i = 0, l = exp->u.l; l != NULL; l = l->next, i++)
			{
				char* s = last_new_var;
				*s++ = 'c';
				fast_sprintf(s, i);

				insert_connectors(l->e, dfs_position,
				                  leading_right, leading_left,
				                  eps_right, eps_left, new_var, false, cost,
				                  parent_exp, word_xnode);

#ifdef POWER_PRUNE_CONNECTORS
				if (leading_right) {
					eps_right.push_back(_variables->epsilon(new_var, '+'));
				}
				if (leading_left) {
					eps_left.push_back(_variables->epsilon(new_var, '-'));
				}
#endif
			}
		}
	}
	else if (exp->type == OR_type)
	{
		if (exp->u.l != NULL && exp->u.l->next == NULL)
		{
			/* unary or - skip */
			insert_connectors(exp->u.l->e, dfs_position,
			                  leading_right, leading_left,
			                  eps_right, eps_left, var, root, cost,
			                  exp->u.l->e, word_xnode);
		}
		else
		{
			int i;
			E_list* l;
			bool ll_true = false;
			bool lr_true = false;

			/* Same prefix-copy trick as in the AND case above. */
			char new_var[MAX_VARIABLE_NAME];
			char* last_new_var = new_var;
			char* last_var = var;
			while ((*last_new_var = *last_var)) {
				last_new_var++;
				last_var++;
			}

#ifdef DEBUG
			if (0 && verbosity_level(+D_IC)) { // Extreme debug
				printf("Word%d, var %s OR_type:\n", _word, var);
				printf("exp mem: ");
				prt_exp_mem(exp, 0);
			}
#endif

			for (i = 0, l = exp->u.l; l != NULL; l = l->next, i++)
			{
				/* Each OR branch starts from copies of the leading flags
				 * and epsilon lists; the branch results are OR'ed below. */
				bool lr = leading_right, ll = leading_left;
				std::vector<int> er = eps_right, el = eps_left;
				char* s = last_new_var;
				*s++ = 'd';
				fast_sprintf(s, i);

				lgdebug(+D_IC, "Word%d: var: %s; exp%d=%p; X_node: %s\n",
				        _word, var, i, l,
				        word_xnode ? word_xnode->word->subword : "NULL X_node");

				assert(word_xnode != NULL, "NULL X_node for var %s", new_var);
				if (root && parent_exp == NULL && l->e != word_xnode->exp)
				{
					/* At the top level, keep word_xnode in sync with the
					 * alternative to which this OR branch belongs. */
					E_list *we = NULL;
					if (word_xnode->exp->type == OR_type)
					{
						for (we = word_xnode->exp->u.l; we != NULL; we = we->next)
						{
							if (l->e == we->e) break;
						}
					}
					if (we == NULL && word_xnode->next != NULL)
					{
						lgdebug(+D_IC, "Next word_xnode for word %d is needed\n",
						        _word);
						word_xnode = word_xnode->next;
					}
				}

				insert_connectors(l->e, dfs_position, lr, ll, er, el,
				                  new_var, false, cost, l->e, word_xnode);

				if (lr) lr_true = true;
				if (ll) ll_true = true;
			}
			leading_right = lr_true;
			leading_left = ll_true;
		}
	}
}
/** The return value is the number of disjuncts deleted.
 * Implementation notes:
 * Normally all the identical disjunct-jets are memory shared.
 * The suffix_id of each connector serves as its reference count
 * in the power table. Each time when a connector that cannot match
 * is discovered, its reference count is decreased, and its
 * nearest_word field is assigned BAD_WORD. Due to the memory sharing,
 * each such an assignment affects immediately all the identical
 * disjunct-jets.
 *
 * Alternating left-to-right and right-to-left passes are run until a
 * pass changes nothing and deletes nothing (N_changed starts at 1 to
 * force at least two passes).
 */
static int power_prune(Sentence sent, Parse_Options opts)
{
	power_table pt;
	prune_context pc;
	int N_deleted[2] = {0}; /* [0] counts first deletions, [1] counts dups. */
	int total_deleted = 0;

	power_table_alloc(sent, &pt);
	power_table_init(sent, &pt);

	pc.pt = &pt;
	pc.power_cost = 0;
	pc.null_links = (opts->min_null_count > 0);
	pc.N_changed = 1; /* forces it always to make at least two passes */
	pc.sent = sent;

	while (1)
	{
		/* left-to-right pass */
		for (WordIdx w = 0; w < sent->length; w++)
		{
			/* Walk the disjunct list via a pointer-to-pointer so the
			 * current node can be unlinked in place. */
			for (Disjunct **dd = &sent->word[w].d; *dd != NULL; /* See: NEXT */)
			{
				Disjunct *d = *dd; /* just for convenience */

				if (d->left == NULL)
				{
					dd = &d->next; /* NEXT */
					continue;
				}

				bool is_bad = d->left->nearest_word == BAD_WORD;
				if (is_bad || left_connector_list_update(&pc, d->left, w, true) < 0)
				{
					mark_connector_sequence_for_dequeue(d->left, true);
					mark_connector_sequence_for_dequeue(d->right, false);

					/* discard the current disjunct */
					*dd = d->next; /* NEXT - set current disjunct to the next one */
					N_deleted[(int)is_bad]++;
					continue;
				}

				dd = &d->next; /* NEXT */
			}

			clean_table(pt.r_table_size[w], pt.r_table[w]);
		}
		total_deleted += N_deleted[0] + N_deleted[1];
		lgdebug(D_PRUNE, "Debug: l->r pass changed %d and deleted %d (%d+%d)\n",
		        pc.N_changed, N_deleted[0]+N_deleted[1],
		        N_deleted[0], N_deleted[1]);

		if (pc.N_changed == 0 && N_deleted[0] == 0 && N_deleted[1] == 0) break;
		pc.N_changed = N_deleted[0] = N_deleted[1] = 0;

		/* right-to-left pass */
		for (WordIdx w = sent->length-1; w != (WordIdx) -1; w--)
		{
			for (Disjunct **dd = &sent->word[w].d; *dd != NULL; /* See: NEXT */)
			{
				Disjunct *d = *dd; /* just for convenience */

				if (d->right == NULL)
				{
					dd = &d->next; /* NEXT */
					continue;
				}

				bool is_bad = d->right->nearest_word == BAD_WORD;
				if (is_bad || right_connector_list_update(&pc, d->right, w, true)
				              >= sent->length)
				{
					mark_connector_sequence_for_dequeue(d->right, true);
					mark_connector_sequence_for_dequeue(d->left, false);

					/* Discard the current disjunct. */
					*dd = d->next; /* NEXT - set current disjunct to the next one */
					N_deleted[(int)is_bad]++;
					continue;
				}

				dd = &d->next; /* NEXT */
			}

			clean_table(pt.l_table_size[w], pt.l_table[w]);
		}
		total_deleted += N_deleted[0] + N_deleted[1];
		lgdebug(D_PRUNE, "Debug: r->l pass changed %d and deleted %d (%d+%d)\n",
		        pc.N_changed, N_deleted[0]+N_deleted[1],
		        N_deleted[0], N_deleted[1]);

		if (pc.N_changed == 0 && N_deleted[0] == 0 && N_deleted[1] == 0) break;
		pc.N_changed = N_deleted[0] = N_deleted[1] = 0;
	}
	power_table_delete(&pt);

	lgdebug(D_PRUNE, "Debug: power prune cost: %d\n", pc.power_cost);

	print_time(opts, "power pruned");
	if (verbosity_level(D_PRUNE))
	{
		prt_error("\n\\");
		prt_error("Debug: After power_pruning:\n\\");
		print_disjunct_counts(sent);
	}

#ifdef DEBUG
	/* Sanity check: no surviving connector may still carry BAD_WORD. */
	for (WordIdx w = 0; w < sent->length; w++)
	{
		for (Disjunct *d = sent->word[w].d; NULL != d; d = d->next)
		{
			for (Connector *c = d->left; NULL != c; c = c->next)
				assert(c->nearest_word != BAD_WORD);
			for (Connector *c = d->right; NULL != c; c = c->next)
				assert(c->nearest_word != BAD_WORD);
		}
	}
#endif

	return total_deleted;
}
/**
 * Prune disjuncts that can never satisfy the post-processing
 * 'contains_one' rules: if a disjunct has a connector matching a
 * rule's trigger (selector) but the rule's criterion link set cannot
 * be satisfied by the remaining connector multiset, the disjunct is
 * unmarked for later deletion.  Iterates to a fixed point, since each
 * deletion removes connectors from the multiset table and may enable
 * further deletions.  Returns the number of disjuncts deleted.
 */
static int pp_prune(Sentence sent, Parse_Options opts)
{
	pp_knowledge * knowledge;
	size_t i, w;
	int total_deleted, N_deleted;
	bool change, deleteme;
	multiset_table *cmt;

	if (sent->postprocessor == NULL) return 0;
	if (!opts->perform_pp_prune) return 0;

	knowledge = sent->postprocessor->knowledge;

	cmt = cms_table_new();

	/* Mark all disjuncts, and count every connector string into the
	 * connector multiset table. */
	for (w = 0; w < sent->length; w++)
	{
		Disjunct *d;
		for (d = sent->word[w].d; d != NULL; d = d->next)
		{
			char dir;
			d->marked = true;
			for (dir=0; dir < 2; dir++)
			{
				Connector *c;
				for (c = ((dir) ? (d->left) : (d->right)); c != NULL; c = c->next)
				{
					insert_in_cms_table(cmt, connector_string(c));
				}
			}
		}
	}

	total_deleted = 0;
	change = true;
	while (change)
	{
		char dir;

		change = false;
		N_deleted = 0;
		for (w = 0; w < sent->length; w++)
		{
			Disjunct *d;
			for (d = sent->word[w].d; d != NULL; d = d->next)
			{
				if (!d->marked) continue;
				deleteme = false;
				for (i = 0; i < knowledge->n_contains_one_rules; i++)
				{
					pp_rule* rule = &knowledge->contains_one_rules[i]; /* the ith rule */
					const char * selector = rule->selector;  /* selector string for this rule */
					pp_linkset * link_set = rule->link_set;  /* the set of criterion links */

					if (rule->selector_has_wildcard) continue; /* If it has a * forget it */

					for (dir = 0; dir < 2; dir++)
					{
						Connector *c;
						for (c = ((dir) ? (d->left) : (d->right)); c != NULL; c = c->next)
						{
							if (!post_process_match(selector, connector_string(c))) continue;

							/*
							printf("pp_prune: trigger ok. selector = %s  c->string = %s\n", selector, c->string);
							*/

							/* We know c matches the trigger link of the rule. */
							/* Now check the criterion links */
							if (!rule_satisfiable(cmt, link_set))
							{
								deleteme = true;
								rule->use_count++;
							}
							if (deleteme) break;
						}
						if (deleteme) break;
					}
					if (deleteme) break;
				}

				if (deleteme)         /* now we delete this disjunct */
				{
					N_deleted++;
					total_deleted++;
					d->marked = false; /* mark for deletion later */
					/* Remove its connectors from the multiset table; if
					 * any count changed, another pass is needed. */
					for (dir=0; dir < 2; dir++)
					{
						Connector *c;
						for (c = ((dir) ? (d->left) : (d->right)); c != NULL; c = c->next)
						{
							change |= delete_from_cms_table(cmt, connector_string(c));
						}
					}
				}
			}
		}

		lgdebug(D_PRUNE, "Debug: pp_prune pass deleted %d\n", N_deleted);
	}
	cms_table_delete(cmt);

	if (total_deleted > 0)
	{
		delete_unmarked_disjuncts(sent);
		if (verbosity_level(D_PRUNE))
		{
			prt_error("\n\\");
			prt_error("Debug: After pp_prune:\n\\");
			print_disjunct_counts(sent);
		}
	}

	print_time(opts, "pp pruning");

	return total_deleted;
}