/**
 * Strip the empty-word slots out of a linkage.
 * XXX Should we remove here also the dict-cap tokens? In any case, for now
 * they are left for debug.
 */
static void remove_empty_words(Linkage lkg)
{
	Disjunct **dj = lkg->chosen_disjuncts;
	int *new_index = alloca(lkg->num_words * sizeof(*new_index));
	size_t r, w; /* read position / write position */

	if (4 <= verbosity)
	{
		lgdebug(0, "Info: chosen_disjuncts before removing empty words:\n");
		print_chosen_disjuncts_words(lkg);
	}

	/* Compact the disjunct array in place, recording for each old word
	 * index its new index (-1 for removed empty words). */
	for (r = 0, w = 0; r < lkg->num_words; r++)
	{
		if ((NULL != dj[r]) && (MT_EMPTY == dj[r]->word[0]->morpheme_type))
		{
			new_index[r] = -1;
		}
		else
		{
			dj[w] = dj[r];
			new_index[r] = w;
			w++;
		}
	}
	lkg->num_words = w;
	/* Unused memory not freed - all of it will be freed in free_linkages(). */

	if (4 <= verbosity)
	{
		lgdebug(0, "Info: chosen_disjuncts after removing empty words:\n");
		print_chosen_disjuncts_words(lkg);
	}

	/* Drop links that touch a removed word; renumber the endpoints of the
	 * surviving ones using the remap table built above. */
	for (r = 0, w = 0; r < lkg->num_links; r++)
	{
		const Link *src = &(lkg->link_array[r]);

		if ((-1 != new_index[src->rw]) && (-1 != new_index[src->lw]))
		{
			Link *dst = &(lkg->link_array[w]);

			/* Copy the entire link contents, thunking the word numbers.
			 * Note that w is always <= r so this is always safe. */
			dst->lw = new_index[src->lw];
			dst->rw = new_index[src->rw];
			dst->lc = src->lc;
			dst->rc = src->rc;
			dst->link_name = src->link_name;
			w++;
		}
	}
	lkg->num_links = w;
	/* Unused memory not freed - all of it will be freed in free_linkages(). */
}
/**
 * Reuse the given memory pool.
 * Reset the pool pointers without freeing its memory.
 * pool_alloc() will then reuse the existing pool blocks before allocating
 * new blocks.
 */
void pool_reuse(Pool_desc *mp)
{
	lgdebug(+D_MEMPOOL, "Used %zu elements (pool '%s' created in %s())\n",
	        mp->curr_elements, mp->name, mp->func);

	/* Rewind both cursors to the head of the block chain. */
	mp->alloc_next = mp->chain;
	mp->ring = mp->chain;
}
/**
 * Delete the given memory pool, releasing all of its memory blocks
 * and the descriptor itself. A NULL argument is a no-op.
 */
void pool_delete(Pool_desc *mp)
{
	if (NULL == mp) return;
	lgdebug(+D_MEMPOOL, "Used %zu elements (pool '%s' created in %s())\n",
	        mp->curr_elements, mp->name, mp->func);

	/* The next-block pointer lives past the data area; its offset depends
	 * on whether the real (aligned) pool allocator is in use. */
#if POOL_ALLOCATOR
	const size_t alloc_size = mp->data_size;
#else
	const size_t alloc_size = mp->element_size;
#endif

	/* Walk the chain, freeing each block after fetching its successor. */
	char *next_block;
	for (char *block = mp->chain; block != NULL; block = next_block)
	{
		next_block = POOL_NEXT_BLOCK(block, alloc_size);
#if POOL_ALLOCATOR
		aligned_free(block);
#else
		free(block);
#endif
	}

	free(mp);
}
/** * Check that the given locale known by the system. * In case we don't have locale_t, actually set the locale * in order to find out if it is fine. This side effect doesn't cause * harm, as the locale would be set up to that value anyway shortly. * @param locale Locale string * @return True if known, false if unknown. */ bool try_locale(const char *locale) { #ifdef HAVE_LOCALE_T locale_t ltmp = newlocale_LC_CTYPE(locale); if ((locale_t)0 == ltmp) return false; freelocale(ltmp); #else lgdebug(D_USER_FILES, "Debug: Setting program's locale \"%s\"", locale); if (NULL == setlocale(LC_CTYPE, locale)) { lgdebug(D_USER_FILES, " failed!\n"); return false; } lgdebug(D_USER_FILES, ".\n"); #endif /* HAVE_LOCALE_T */ return true; }
/**
 * Find the user's default locale.
 * Consult LC_ALL, LC_CTYPE and LANG (in that order); on Windows fall back
 * to win32_getlocale() when none is set. Under Cygwin the POSIX-style
 * locale string is converted to Windows style ("en_US.UTF-8" -> "en-US").
 * @return A newly allocated locale string (caller frees), or NULL.
 */
char * get_default_locale(void)
{
	const char *lc_vars[] = {"LC_ALL", "LC_CTYPE", "LANG", NULL};
	char *ev = NULL;
	const char **evname;
	char *locale = NULL;

	for (evname = lc_vars; NULL != *evname; evname++)
	{
		ev = getenv(*evname);
		if ((NULL != ev) && ('\0' != ev[0])) break;
	}
	if (NULL != *evname)
	{
		locale = ev;
		lgdebug(D_USER_FILES, "Debug: Environment locale \"%s=%s\"\n", *evname, ev);
#ifdef _WIN32
		/* If compiled with MSVC/MinGW, we still support running under Cygwin. */
		const char *ostype = getenv("OSTYPE");
		if ((NULL != ostype) && (0 == strcmp(ostype, "cygwin")))
		{
			/* Convert to Windows style locale */
			locale = strdupa(locale);
			/* BUGFIX: the old unconditional
			 *   locale[strcspn(locale, "_")] = '-';
			 * overwrote the NUL terminator when the locale contained no
			 * '_', leaving the string unterminated for the truncation
			 * below (undefined behavior). Replace only if present. */
			char *usep = strchr(locale, '_');
			if (NULL != usep) *usep = '-';
			locale[strcspn(locale, ".@")] = '\0';
		}
#endif /* _WIN32 */
	}
	else
	{
		lgdebug(D_USER_FILES, "Debug: Environment locale not set\n");
#ifdef _WIN32
		locale = win32_getlocale();
		if (NULL == locale)
			lgdebug(D_USER_FILES, "Debug: Cannot find user default locale\n");
		else
			lgdebug(D_USER_FILES, "Debug: User default locale \"%s\"\n", locale);
		return locale; /* Already strdup'ed */
#endif /* _WIN32 */
	}

	return safe_strdup(locale);
}
/**
 * Create a Dictionary whose lookups are served from a database file.
 * The database is expected at "<lang>/dict.db"; the db_* functions are
 * installed as the lookup/close handlers.
 * @param lang Language name, possibly as a path (only the last path
 *             component is used as the language).
 * @return The newly allocated dictionary. NOTE(review): the result of
 *         object_open() is not checked here — presumably failure is
 *         handled downstream; confirm with callers.
 */
Dictionary dictionary_create_from_db(const char *lang)
{
	char *dbname;
	const char * t;
	Dictionary dict;
	Dict_node *dict_node;

	dict = (Dictionary) xalloc(sizeof(struct Dictionary_s));
	memset(dict, 0, sizeof(struct Dictionary_s));

	/* Language and file-name stuff */
	dict->string_set = string_set_create();
	/* Keep only the last path component as the language name. */
	t = strrchr (lang, '/');
	t = (NULL == t) ? lang : t+1;
	dict->lang = string_set_add(t, dict->string_set);
	lgdebug(D_USER_FILES, "Debug: Language: %s\n", dict->lang);

	/* To disable spell-checking, just set the checker to NULL */
	dict->spell_checker = spellcheck_create(dict->lang);
#if defined HAVE_HUNSPELL || defined HAVE_ASPELL
	if (NULL == dict->spell_checker)
		prt_error("Info: Spell checker disabled.");
#endif
	dict->base_knowledge = NULL;
	dict->hpsg_knowledge = NULL;

	/* The dictionary name is the db path; the interned copy survives
	 * the free() of the joined path below. */
	dbname = join_path (lang, "dict.db");
	dict->name = string_set_add(dbname, dict->string_set);
	free(dbname);

	/* Set up the database */
	dict->db_handle = object_open(dict->name, db_open, NULL);

	/* Install the database-backed lookup handlers. */
	dict->lookup_list = db_lookup_list;
	dict->free_lookup = db_free_llist;
	dict->lookup = db_lookup;
	dict->close = db_close;

	/* Misc remaining common (generic) dict setup work */
	dict->left_wall_defined = boolean_dictionary_lookup(dict, LEFT_WALL_WORD);
	dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD);
	dict->empty_word_defined = boolean_dictionary_lookup(dict, EMPTY_WORD_MARK);

	dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD);
	dict->use_unknown_word = true;

	dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD);
	if (dict_node != NULL)
		dict->unlimited_connector_set = connector_set_create(dict_node->exp);
	free_lookup_list(dict, dict_node);

	return dict;
}
/**
 * Debug-only pass over the constituents: dump each one's fields.
 * The constituent count is returned unchanged.
 */
static int new_style_conjunctions(con_context_t *ctxt, Linkage linkage, int numcon_total)
{
#ifdef DEBUG
	for (int k = 0; k < numcon_total; k++)
	{
		constituent_t *ct = &ctxt->constituent[k];
		lgdebug(3, "ola %d valid=%d %s start=%s lr=%zu %zu\n", k,
		        ct->valid, ct->type, ct->start_link,
		        ct->left, ct->right);
	}
#endif
	return numcon_total;
}
/**
 * Given a word, find its alternative ID.
 * An alternative is identified by a pointer to its first word, which is
 * getting set at the time the alternative is created at
 * issue_word_alternative(). (It could be any unique identifier - for coding
 * convenience it is a pointer.)
 *
 * @param word A subword; must be non-NULL with a non-NULL alternative_id
 *             (both enforced by assert).
 * @return The alternative_id of this alternative.
 */
static Gword *find_alternative(Gword *word)
{
	assert(NULL != word, "find_alternative(NULL)");
	assert(NULL != word->alternative_id, "find_alternative(%s): NULL id",
	       word->subword);

#if 0
	lgdebug(+0, "find_alternative(%s): '%s'\n",
	        word->subword, debug_show_subword(word->alternative_id));
#endif

	return word->alternative_id;
}
/**
 * Check that each part of a candidate word split matches its part-regex.
 * The split is given by the breakpoint list pl over word[0..l).
 * REGPRE is matched only against the prefix part, REGMID against the
 * middle parts, and REGSUF against the suffix part (which cannot be
 * the prefix). A NULL regex always matches.
 * @return true if every part matches, false otherwise.
 */
static bool morpheme_match(Sentence sent, const char *word, int l, p_list pl)
{
	Dictionary afdict = sent->dict->affix_table;
	anysplit_params *as = afdict->anysplit;
	int pos = 0;
	int p;
	Regex_node *re;
	char *prefix_string = alloca(l+1);

	lgdebug(+2, "word=%s: ", word);
	for (p = 0; p < as->nparts; p++)
	{
		/* Extract the current part into a NUL-terminated buffer. */
		strncpy(prefix_string, &word[pos], pl[p]-pos);
		prefix_string[pl[p]-pos] = '\0';

		/* For flexibility, REGPRE is matched only to the prefix part,
		 * REGMID only to the middle suffixes, and REGSUF only to the
		 * suffix part - which cannot be the prefix. */
		if (0 == p) re = as->regpre;
		else if (pl[p] == l) re = as->regsuf;
		else re = as->regmid;

		/* BUGFIX: the old code dereferenced re->name here BEFORE the
		 * NULL check below, crashing when the part's regex is unset. */
		lgdebug(2, "re=%s part%d=%s: ", (NULL != re) ? re->name : "(null)",
		        p, prefix_string);

		/* A NULL regex always matches */
		if ((NULL != re) && (NULL == match_regex(re, prefix_string)))
		{
			lgdebug(2, "No match\n");
			return false;
		}

		pos = pl[p];
		if (pos == l) break;
	}
	lgdebug(2, "Match\n");
	return true;
}
void lgdebug_initialize(char* filename){ // char* logfilename = (char*) xparams_get_param(LGPE_LOG_DIR_VAR,"./"); // char* logfilename = "./log"; char* logfilename = "."; char af[100]; memset(af,0,100); if(logfilename == NULL){ lgdebug(DBG_ERROR,"lgdebug: No log file specified \n"); return; } strcpy(af,logfilename); strcat(af,"/"); strcat(af,filename); //open log file g_logfile = fopen(af, "w"); if(g_logfile == NULL){ lgdebug(DBG_ERROR,"lgdebug: could not open log file %s: %s\n", af, strerror(errno)); return; } }
/**
 * Return the data directory to search for dictionaries.
 * If a custom directory was set, a copy of it is returned; otherwise on
 * Windows the directory of the running executable is used.
 * @return Newly allocated directory path (caller frees), or NULL.
 */
char * dictionary_get_data_dir(void)
{
	char * data_dir = NULL;

	if (custom_data_dir != NULL)
		return safe_strdup(custom_data_dir);

#ifdef _WIN32
	/* Dynamically locate invocation directory of our program.
	 * Non-ASCII characters are not supported (files will not be found). */
	char prog_path[MAX_PATH_NAME];

	if (!GetModuleFileNameA(NULL, prog_path, sizeof(prog_path)))
	{
		prt_error("Warning: GetModuleFileName error %d\n", (int)GetLastError());
	}
	else if (!PathRemoveFileSpecA(prog_path))
	{
		prt_error("Warning: Cannot get directory from program path '%s'!\n",
		          prog_path);
	}
	else
	{
		/* Unconvertible characters are marked as '?' */
		const char *unsupported = (NULL != strchr(prog_path, '?')) ?
			" (containing unsupported character)" : "";

		/* BUGFIX: print the path before the qualifier (the old argument
		 * order printed the qualifier first). Also removed the dead
		 * "NULL == prog_path" test: prog_path is an array and can
		 * never be NULL. */
		lgdebug(D_USER_FILES, "Debug: Directory of executable: %s%s\n",
		        prog_path, unsupported);
		data_dir = safe_strdup(prog_path);
	}
#endif /* _WIN32 */

	return data_dir;
}
/*
 * Reuse the given fake memory pool by freeing its memory.
 * (Fake-allocator variant: each element is an individually malloc'ed
 * block, so "reuse" just releases the whole chain.)
 */
void pool_reuse(Pool_desc *mp)
{
	if (NULL == mp) return;
	lgdebug(+D_MEMPOOL, "Used %zu elements (pool '%s' created in %s())\n",
	        mp->curr_elements, mp->name, mp->func);

	/* Release every chained memory block. */
	char *block = mp->chain;
	while (block != NULL)
	{
		char *successor = POOL_NEXT_BLOCK(block, mp->element_size);
		free(block);
		block = successor;
	}

	mp->chain = NULL;
}
/**
 * Match the string 's' against the chain of regexes starting at 'rn'.
 * @return The name of the first (non-negated) matching regex, or NULL
 *         if nothing matched. A negated regex that matches causes all
 *         consecutive nodes sharing its name to be skipped.
 */
const char *match_regex(const Regex_node *rn, const char *s)
{
	while (rn != NULL)
	{
		int rc;
		bool nomatch;
		bool match;
		regex_t *re = rn->re;

		/* Make sure the regex has been compiled. */
		assert(re);

#if HAVE_PCRE2_H
		rc = pcre2_match(re->re_code, (PCRE2_SPTR)s, PCRE2_ZERO_TERMINATED,
		                 /*startoffset*/0, PCRE2_NO_UTF_CHECK, re->re_md, NULL);
		match = (rc >= 0);
		nomatch = (rc == PCRE2_ERROR_NOMATCH);
#else
		rc = regexec(rn->re, s, 0, NULL, /*eflags*/0);
		match = (rc == 0);
		nomatch = (rc == REG_NOMATCH);
#endif

		if (match)
		{
			/* &"!"[!rn->neg] prints a "!" prefix for negated regexes,
			 * and the empty string otherwise. */
			lgdebug(+D_MRE, "%s%s %s\n", &"!"[!rn->neg], rn->name, s);
			if (!rn->neg)
				return rn->name; /* Match found - return--no multiple matches. */

			/* Negative match - skip this regex name.
			 * Advance over all following nodes that carry the same name;
			 * the rn = rn->next at the bottom then steps past the last. */
			for (const char *nre_name = rn->name; rn->next != NULL; rn = rn->next)
			{
				if (strcmp(nre_name, rn->next->name) != 0) break;
			}
		}
		else if (!nomatch)
		{
			/* We have an error. */
			prt_regerror("Regex matching error", rn, rc, -1);
		}

		rn = rn->next;
	}
	return NULL; /* No matches. */
}
const char *match_regex(const Regex_node *re, const char *s) { int rc; const char *nre_name; while (re != NULL) { if (re->re == NULL) { /* Re not compiled; if this happens, it's likely an * internal error, but nevermind for now. */ continue; } /* Try to match with no extra data (NULL), whole str (0 to strlen(s)), * and default options (second 0). */ /* int rc = pcre_exec(re->re, NULL, s, strlen(s), 0, * 0, ovector, PCRE_OVEC_SIZE); */ rc = regexec((regex_t*) re->re, s, 0, NULL, 0); if (0 == rc) { lgdebug(+D_MRE, "%s%s %s\n", &"!"[!re->neg], re->name, s); if (!re->neg) return re->name; /* Match found - return--no multiple matches. */ /* Negative match - skip this regex name. */ for (nre_name = re->name; re->next != NULL; re = re->next) { if (strcmp(nre_name, re->next->name) != 0) break; } } else if (rc != REG_NOMATCH) { /* We have an error. */ prt_regerror("Regex matching error", re, rc); } re = re->next; } return NULL; /* No matches. */ }
/** * Create a memory pool descriptor. * 1. If required, set the allocation size to a power of 2 of the element size. * 2. Save the given parameters in the pool descriptor, to be used by * pool_alloc(); * 3. Chain the pool descriptor to the given pool_list, so it can be * automatically freed. */ Pool_desc *pool_new(const char *func, const char *name, size_t num_elements, size_t element_size, bool zero_out, bool align, bool exact) { Pool_desc *mp = malloc(sizeof(Pool_desc)); mp->func = func; mp->name = name; if (align) { mp->element_size = align_size(element_size); mp->alignment = MAX(MIN_ALIGNMENT, mp->element_size); mp->alignment = MIN(MAX_ALIGNMENT, mp->alignment); mp->data_size = num_elements * mp->element_size; mp->block_size = ALIGN(mp->data_size + FLDSIZE_NEXT, mp->alignment); } else { mp->element_size = element_size; mp->alignment = MIN_ALIGNMENT; mp->data_size = num_elements * mp->element_size; mp->block_size = mp->data_size + FLDSIZE_NEXT; } mp->zero_out = zero_out; mp->exact = exact; mp->alloc_next = NULL; mp->chain = NULL; mp->ring = NULL; mp->free_list = NULL; mp->curr_elements = 0; mp->num_elements = num_elements; lgdebug(+D_MEMPOOL, "%sElement size %zu, alignment %zu (pool '%s' created in %s())\n", POOL_ALLOCATOR?"":"(Fake pool allocator) ", mp->element_size, mp->alignment, mp->name, mp->func); return mp; }
/**
 * Compare a portion of the tokenized string, starting at word_start with
 * length of numchar, to the dictionary or affix class word that is defined
 * in the capture group whose info is pointed to by cgnump.
 *
 * FIXME: Return int instead of bool, see the comment at E1 below.
 */
static bool is_word(const char *word_start, int numchar, cgnum_t *cgnump)
{
	Dictionary const dict = cgnump->dict;
	const char * const afclass = cgnump->afclass;
	const int lookup_mark_len =
		(NULL != cgnump->lookup_mark) ? strlen(cgnump->lookup_mark) : 0;
	char * const word = alloca(numchar+lookup_mark_len+1);
#ifdef AFFIX_DICTIONARY_TREE
	const Dict_node *dn;
#endif
	const Afdict_class *ac;
	size_t i;

	/* Append/prepend stem/infix marks. */
	if (NULL == cgnump->lookup_mark)
	{
		strncpy(word, word_start, numchar);
		word[numchar] = '\0';
	}
	else
	{
		switch (cgnump->lookup_mark_pos)
		{
		case 'p': /* prepend a mark */
			strcpy(word, cgnump->lookup_mark);
			strncat(word, word_start, numchar);
			word[numchar+lookup_mark_len] = '\0';
			break;
		case 'a': /* append a mark */
			strncpy(word, word_start, numchar);
			strcpy(word+numchar, cgnump->lookup_mark);
			break;
		default: /* unknown mark position - look up the bare word */
			printf("is_word:E3('%x' %s)",
			       cgnump->lookup_mark_pos, cgnump->lookup_mark);
			strncpy(word, word_start, numchar);
			word[numchar] = '\0';
		}
	}

	lgdebug(7, "LOOKUP '%s' in %s: ", word, dict->name);
	if (0 == afclass) return boolean_dictionary_lookup(dict, word);

	/* We don't have for now a tree representation of the affix file,
	 * only lists. */
#ifdef AFFIX_DICTIONARY_TREE
	dn = lookup_list(dict, word);
	printf("WORD %s afclass %s dn %p\n", word, afclass, dn);
	if (NULL == dn) return false;

	for (; NULL != dn; dn = dn->left)
	{
		const char *con = word_only_connector(dn);
		if (NULL == con)
		{
			/* Internal error - nothing else to do for now unless we don't
			 * rerun bool, but return an int so -1 signifies an error. */
			printf("is_word(%s):E1 ", word);
			/* BUGFIX: skip this node; the old code passed the NULL con
			 * to printf("%s") and strcmp() below. */
			continue;
		}
		printf("CON '%s'\n", con);
		if (0 == strcmp(afclass, con)) return true;
	}
#else /* Make it the hard way. */
	ac = afdict_find(dict, afclass, /*notify_err*/false);
	if (NULL == ac)
	{
		/* Internal error - nothing else to do for now unless we don't
		 * rerun bool, but return an int so -1 signifies an error. */
		printf("is_word(%s):E2 ", word);
		/* BUGFIX: bail out; the old code fell through and dereferenced
		 * the NULL ac in the loop below. */
		return false;
	}

	for (i = 0; i < ac->length; i++)
	{
		if (0 == strcmp(ac->string[i], word)) return true;
	}
#endif

	return false;
}
/** The return value is the number of disjuncts deleted.
 * Implementation notes:
 * Normally all the identical disjunct-jets are memory shared.
 * The suffix_id of each connector serves as its reference count
 * in the power table. Each time when a connector that cannot match
 * is discovered, its reference count is decreased, and its
 * nearest_word field is assigned BAD_WORD. Due to the memory sharing,
 * each such an assignment affects immediately all the identical
 * disjunct-jets.
 */
static int power_prune(Sentence sent, Parse_Options opts)
{
	power_table pt;
	prune_context pc;
	int N_deleted[2] = {0}; /* [0] counts first deletions, [1] counts dups. */
	int total_deleted = 0;

	power_table_alloc(sent, &pt);
	power_table_init(sent, &pt);

	pc.pt = &pt;
	pc.power_cost = 0;
	pc.null_links = (opts->min_null_count > 0);
	pc.N_changed = 1; /* forces it always to make at least two passes */
	pc.sent = sent;

	/* Alternate left-to-right and right-to-left passes until a full
	 * iteration changes nothing and deletes nothing. */
	while (1)
	{
		/* left-to-right pass */
		for (WordIdx w = 0; w < sent->length; w++)
		{
			/* Iterate via a pointer-to-pointer so the current disjunct
			 * can be unlinked in place. */
			for (Disjunct **dd = &sent->word[w].d; *dd != NULL; /* See: NEXT */)
			{
				Disjunct *d = *dd; /* just for convenience */
				if (d->left == NULL)
				{
					dd = &d->next; /* NEXT */
					continue;
				}

				bool is_bad = d->left->nearest_word == BAD_WORD;
				if (is_bad || left_connector_list_update(&pc, d->left, w, true) < 0)
				{
					mark_connector_sequence_for_dequeue(d->left, true);
					mark_connector_sequence_for_dequeue(d->right, false);

					/* discard the current disjunct */
					*dd = d->next; /* NEXT - set current disjunct to the next one */
					N_deleted[(int)is_bad]++;
					continue;
				}

				dd = &d->next; /* NEXT */
			}
			clean_table(pt.r_table_size[w], pt.r_table[w]);
		}
		total_deleted += N_deleted[0] + N_deleted[1];
		lgdebug(D_PRUNE, "Debug: l->r pass changed %d and deleted %d (%d+%d)\n",
		        pc.N_changed, N_deleted[0]+N_deleted[1], N_deleted[0], N_deleted[1]);

		if (pc.N_changed == 0 && N_deleted[0] == 0 && N_deleted[1] == 0) break;
		pc.N_changed = N_deleted[0] = N_deleted[1] = 0;

		/* right-to-left pass */
		/* w != (WordIdx)-1 terminates the downward loop on an unsigned
		 * index after word 0 has been processed. */
		for (WordIdx w = sent->length-1; w != (WordIdx) -1; w--)
		{
			for (Disjunct **dd = &sent->word[w].d; *dd != NULL; /* See: NEXT */)
			{
				Disjunct *d = *dd; /* just for convenience */
				if (d->right == NULL)
				{
					dd = &d->next; /* NEXT */
					continue;
				}

				bool is_bad = d->right->nearest_word == BAD_WORD;
				if (is_bad || right_connector_list_update(&pc, d->right, w, true) >= sent->length)
				{
					mark_connector_sequence_for_dequeue(d->right, true);
					mark_connector_sequence_for_dequeue(d->left, false);

					/* Discard the current disjunct. */
					*dd = d->next; /* NEXT - set current disjunct to the next one */
					N_deleted[(int)is_bad]++;
					continue;
				}

				dd = &d->next; /* NEXT */
			}
			clean_table(pt.l_table_size[w], pt.l_table[w]);
		}
		total_deleted += N_deleted[0] + N_deleted[1];
		lgdebug(D_PRUNE, "Debug: r->l pass changed %d and deleted %d (%d+%d)\n",
		        pc.N_changed, N_deleted[0]+N_deleted[1], N_deleted[0], N_deleted[1]);

		if (pc.N_changed == 0 && N_deleted[0] == 0 && N_deleted[1] == 0) break;
		pc.N_changed = N_deleted[0] = N_deleted[1] = 0;
	}
	power_table_delete(&pt);

	lgdebug(D_PRUNE, "Debug: power prune cost: %d\n", pc.power_cost);

	print_time(opts, "power pruned");
	if (verbosity_level(D_PRUNE))
	{
		prt_error("\n\\");
		prt_error("Debug: After power_pruning:\n\\");
		print_disjunct_counts(sent);
	}

#ifdef DEBUG
	/* Sanity check: no surviving disjunct may still carry a BAD_WORD
	 * connector. */
	for (WordIdx w = 0; w < sent->length; w++)
	{
		for (Disjunct *d = sent->word[w].d; NULL != d; d = d->next)
		{
			for (Connector *c = d->left; NULL != c; c = c->next)
				assert(c->nearest_word != BAD_WORD);
			for (Connector *c = d->right; NULL != c; c = c->next)
				assert(c->nearest_word != BAD_WORD);
		}
	}
#endif

	return total_deleted;
}
/**
 * Recursively walk the expression tree 'exp', registering every connector
 * into the per-word left/right connector tables.
 * leading_right/leading_left track (by reference) whether a connector is
 * still "leading" in its direction; eps_right/eps_left accumulate epsilon
 * variable ids for the SAT encoding. 'var' is the variable-name prefix,
 * extended with 'c<i>' for AND branches and 'd<i>' for OR branches.
 * NOTE(review): cost accumulates parent_cost + exp->cost down the tree;
 * the exact cost semantics are defined by PositionConnector - confirm there.
 */
void WordTag::insert_connectors(Exp* exp, int& dfs_position,
                                bool& leading_right, bool& leading_left,
                                std::vector<int>& eps_right,
                                std::vector<int>& eps_left,
                                char* var, bool root, double parent_cost,
                                Exp* parent_exp, const X_node *word_xnode)
{
	double cost = parent_cost + exp->cost;

#ifdef DEBUG
	if (0 && verbosity_level(+D_IC)) // Extreme debug
	{
		printf("Expression type %d for Word%d, var %s:\n", exp->type, _word, var);
		printf("parent_exp: ");
		print_expression(parent_exp);
		printf("exp: ");
		print_expression(exp);
	}
#endif

	if (exp->type == CONNECTOR_type)
	{
		dfs_position++;

		Connector connector;
		connector.multi = exp->multi;
		connector.desc = exp->u.condesc;
		set_connector_length_limit(&connector, _opts);

		switch (exp->dir)
		{
		case '+':
			_position.push_back(_right_connectors.size());
			_dir.push_back('+');
			_right_connectors.push_back(
				PositionConnector(parent_exp, &connector, '+', _word, dfs_position,
				                  exp->cost, cost, leading_right, false,
				                  eps_right, eps_left, word_xnode));
			leading_right = false;
			break;
		case '-':
			_position.push_back(_left_connectors.size());
			_dir.push_back('-');
			_left_connectors.push_back(
				PositionConnector(parent_exp, &connector, '-', _word, dfs_position,
				                  exp->cost, cost, false, leading_left,
				                  eps_right, eps_left, word_xnode));
			leading_left = false;
			break;
		default:
			throw std::string("Unknown connector direction: ") + exp->dir;
		}
	}
	else if (exp->type == AND_type)
	{
		if (exp->u.l == NULL)
		{
			/* zeroary and */
		}
		else if (exp->u.l != NULL && exp->u.l->next == NULL)
		{
			/* unary and - skip */
			insert_connectors(exp->u.l->e, dfs_position, leading_right, leading_left,
			                  eps_right, eps_left, var, root, cost, parent_exp, word_xnode);
		}
		else
		{
			int i;
			E_list* l;
			/* Copy 'var' into new_var; last_new_var is left at its
			 * terminator so a per-branch suffix can be appended. */
			char new_var[MAX_VARIABLE_NAME];
			char* last_new_var = new_var;
			char* last_var = var;
			while ((*last_new_var = *last_var)) {
				last_new_var++;
				last_var++;
			}

			for (i = 0, l = exp->u.l; l != NULL; l = l->next, i++)
			{
				/* Append "c<i>" to the variable name for this branch. */
				char* s = last_new_var;
				*s++ = 'c';
				fast_sprintf(s, i);

				insert_connectors(l->e, dfs_position, leading_right, leading_left,
				                  eps_right, eps_left, new_var, false, cost, parent_exp, word_xnode);

#ifdef POWER_PRUNE_CONNECTORS
				if (leading_right) {
					eps_right.push_back(_variables->epsilon(new_var, '+'));
				}
				if (leading_left) {
					eps_left.push_back(_variables->epsilon(new_var, '-'));
				}
#endif
			}
		}
	}
	else if (exp->type == OR_type)
	{
		if (exp->u.l != NULL && exp->u.l->next == NULL)
		{
			/* unary or - skip */
			insert_connectors(exp->u.l->e, dfs_position, leading_right, leading_left,
			                  eps_right, eps_left, var, root, cost, exp->u.l->e, word_xnode);
		}
		else
		{
			int i;
			E_list* l;
			bool ll_true = false;
			bool lr_true = false;
			/* Copy 'var' into new_var, as in the AND branch above. */
			char new_var[MAX_VARIABLE_NAME];
			char* last_new_var = new_var;
			char* last_var = var;
			while ((*last_new_var = *last_var)) {
				last_new_var++;
				last_var++;
			}

#ifdef DEBUG
			if (0 && verbosity_level(+D_IC)) // Extreme debug
			{
				printf("Word%d, var %s OR_type:\n", _word, var);
				printf("exp mem: ");
				prt_exp_mem(exp, 0);
			}
#endif

			for (i = 0, l = exp->u.l; l != NULL; l = l->next, i++)
			{
				/* Each OR branch gets a private copy of the leading flags
				 * and epsilon lists; they are merged after the recursion. */
				bool lr = leading_right, ll = leading_left;
				std::vector<int> er = eps_right, el = eps_left;
				/* Append "d<i>" to the variable name for this branch. */
				char* s = last_new_var;
				*s++ = 'd';
				fast_sprintf(s, i);

				lgdebug(+D_IC, "Word%d: var: %s; exp%d=%p; X_node: %s\n",
				        _word, var, i, l,
				        word_xnode ? word_xnode->word->subword : "NULL X_node");
				assert(word_xnode != NULL, "NULL X_node for var %s", new_var);

				/* At the root of a word expression, keep word_xnode in
				 * sync with the OR branch being expanded. */
				if (root && parent_exp == NULL && l->e != word_xnode->exp)
				{
					E_list *we = NULL;
					if (word_xnode->exp->type == OR_type)
					{
						for (we = word_xnode->exp->u.l; we != NULL; we = we->next)
						{
							if (l->e == we->e) break;
						}
					}
					if (we == NULL && word_xnode->next != NULL)
					{
						lgdebug(+D_IC, "Next word_xnode for word %d is needed\n", _word);
						word_xnode = word_xnode->next;
					}
				}

				insert_connectors(l->e, dfs_position, lr, ll, er, el,
				                  new_var, false, cost, l->e, word_xnode);

				if (lr) lr_true = true;
				if (ll) ll_true = true;
			}
			/* A disjunction is leading iff any of its branches is. */
			leading_right = lr_true;
			leading_left = ll_true;
		}
	}
}
/**
 * Build a Dictionary from an in-memory dictionary string.
 * Sets up language, lookup handlers (regular vs. affix dictionary),
 * per-dictionary locale, the companion affix table, and the compiled
 * regex file. On any failure, all partial state is freed and NULL is
 * returned.
 * NOTE(review): whether dict->name contains "affix" decides the
 * dictionary flavor - see the FIXME below.
 */
static Dictionary
dictionary_six_str(const char * lang, const char * input,
                   const char * dict_name,
                   const char * pp_name, const char * cons_name,
                   const char * affix_name, const char * regex_name)
{
	const char * t;
	Dictionary dict;
	Dict_node *dict_node;

	dict = (Dictionary) xalloc(sizeof(struct Dictionary_s));
	memset(dict, 0, sizeof(struct Dictionary_s));

	/* Language and file-name stuff */
	dict->string_set = string_set_create();
	/* Keep only the last path component as the language name. */
	t = strrchr (lang, '/');
	t = (NULL == t) ? lang : t+1;
	dict->lang = string_set_add(t, dict->string_set);
	lgdebug(D_USER_FILES, "Debug: Language: %s\n", dict->lang);
	dict->name = string_set_add(dict_name, dict->string_set);

	/*
	 * A special setup per dictionary type. The check here assumes the affix
	 * dictionary name contains "affix". FIXME: For not using this
	 * assumption, the dictionary creating stuff needs a rearrangement.
	 */
	if (0 == strstr(dict->name, "affix"))
	{
		/* Regular dictionary. */
		/* To disable spell-checking, just set the checker to NULL */
		dict->spell_checker = spellcheck_create(dict->lang);
#if defined HAVE_HUNSPELL || defined HAVE_ASPELL
		/* TODO:
		 * 1. Set the spell option to 0, to signify no spell checking is done.
		 * 2. On verbosity >= 1, add a detailed message on the reason. */
		if (NULL == dict->spell_checker)
			prt_error("Info: Spell checker disabled.");
#endif
		dict->insert_entry = insert_list;

		dict->lookup_list = lookup_list;
		dict->free_lookup = free_llist;
		dict->lookup = boolean_lookup;
	}
	else
	{
		/*
		 * Affix dictionary.
		 */
		size_t i;

		dict->insert_entry = load_affix;
		dict->lookup = return_true;

		/* initialize the class table */
		dict->afdict_class =
			malloc(sizeof(*dict->afdict_class) * ARRAY_SIZE(afdict_classname));
		for (i = 0; i < ARRAY_SIZE(afdict_classname); i++)
		{
			dict->afdict_class[i].mem_elems = 0;
			dict->afdict_class[i].length = 0;
			dict->afdict_class[i].string = NULL;
		}
	}
	dict->affix_table = NULL;

	/* Read dictionary from the input string. */
	dict->input = input;
	dict->pin = dict->input;
	if (!read_dictionary(dict))
	{
		dict->pin = NULL;
		dict->input = NULL;
		goto failure;
	}
	dict->pin = NULL;
	dict->input = NULL;

	if (NULL == affix_name)
	{
		/*
		 * The affix table is handled alone in this invocation.
		 * Skip the rest of processing!
		 * FIXME: The dictionary creating stuff needs a rearrangement.
		 */
		return dict;
	}

	/* If we don't have a locale per dictionary, the following
	 * will also set the program's locale. */
	dict->locale = linkgrammar_get_dict_locale(dict);
	set_utf8_program_locale();

#ifdef HAVE_LOCALE_T
	/* We have a locale per dictionary. */
	if (NULL != dict->locale)
		dict->locale_t = newlocale_LC_CTYPE(dict->locale);

	/* If we didn't succeed to set the dictionary locale, the program will
	 * SEGFAULT when it tries to use it with the isw*() functions.
	 * So set it to the current program's locale as a last resort. */
	if (NULL == dict->locale)
	{
		dict->locale = setlocale(LC_CTYPE, NULL);
		dict->locale_t = newlocale_LC_CTYPE(setlocale(LC_CTYPE, NULL));
		prt_error("Warning: Couldn't set dictionary locale! "
		          "Using current program locale %s", dict->locale);
	}
	/* If dict->locale is still not set, there is a bug. */
	assert((locale_t)0 != dict->locale_t, "Dictionary locale is not set.");
#else
	/* We don't have a locale per dictionary - but anyway make sure
	 * dict->locale is consistent with the current program's locale,
	 * and especially that it is not NULL. It still indicates the intended
	 * locale of this dictionary and the locale of the compiled regexs. */
	dict->locale = setlocale(LC_CTYPE, NULL);
#endif /* HAVE_LOCALE_T */

	/* Load the companion affix dictionary. */
	dict->affix_table = dictionary_six(lang, affix_name, NULL, NULL, NULL, NULL);
	if (dict->affix_table == NULL)
	{
		prt_error("Error: Could not open affix file %s", affix_name);
		goto failure;
	}
	if (! afdict_init(dict))
		goto failure;

	/*
	 * Process the regex file.
	 * We have to compile regexs using the dictionary locale,
	 * so make a temporary locale swap.
	 */
	if (read_regex_file(dict, regex_name)) goto failure;

	const char *locale = setlocale(LC_CTYPE, NULL);
	locale = strdupa(locale); /* setlocale() uses static memory. */
	setlocale(LC_CTYPE, dict->locale);
	lgdebug(+D_DICT, "Regexs locale %s\n", setlocale(LC_CTYPE, NULL));

	if (compile_regexs(dict->regex_root, dict))
	{
		/* Restore the program locale before bailing out. */
		locale = setlocale(LC_CTYPE, locale);
		goto failure;
	}
	locale = setlocale(LC_CTYPE, locale);
	assert(NULL != locale, "Cannot restore program locale\n");

#ifdef USE_CORPUS
	dict->corpus = lg_corpus_new();
#endif

	dict->left_wall_defined = boolean_dictionary_lookup(dict, LEFT_WALL_WORD);
	dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD);
	dict->empty_word_defined = boolean_dictionary_lookup(dict, EMPTY_WORD_MARK);

	dict->base_knowledge = pp_knowledge_open(pp_name);
	dict->hpsg_knowledge = pp_knowledge_open(cons_name);

	dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD);
	dict->use_unknown_word = true;

	dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD);
	if (dict_node != NULL)
		dict->unlimited_connector_set = connector_set_create(dict_node->exp);
	free_lookup(dict_node);

	return dict;

failure:
	string_set_delete(dict->string_set);
	if (dict->affix_table) xfree(dict->affix_table, sizeof(struct Dictionary_s));
	xfree(dict, sizeof(struct Dictionary_s));
	return NULL;
}
/**
 * Finish the setup of the affix dictionary after it has been read:
 * reverse the per-class word lists, validate the INFIX_MARK, extract
 * affixes from the main dict, precompile the SANEMORPHISM regex, and
 * sort/concatenate the remaining classes.
 * @return true on success, false on a fatal setup error.
 */
static bool afdict_init(Dictionary dict)
{
	Afdict_class * ac;
	Dictionary afdict = dict->affix_table;

	/* FIXME: read_entry() builds word lists in reverse order (can we
	 * just create the list top-down without breaking anything?). Unless
	 * it is fixed to preserve the order, reverse here the word list for
	 * each affix class. */
	for (ac = afdict->afdict_class;
	     ac < &afdict->afdict_class[ARRAY_SIZE(afdict_classname)]; ac++)
	{
		int i;
		int l = ac->length - 1;
		const char * t;

		/* In-place two-pointer reversal of the string array. */
		for (i = 0; i < l; i++, l--)
		{
			t = ac->string[i];
			ac->string[i] = ac->string[l];
			ac->string[l] = t;
		}
	}

	/* Create the affix lists */
	ac = AFCLASS(afdict, AFDICT_INFIXMARK);
	/* The INFIX_MARK class must contain at most one single-character
	 * entry; otherwise discard it with an error. */
	if ((1 < ac->length) ||
	    ((1 == ac->length) && (1 != strlen(ac->string[0]))))
	{
		prt_error("Error: afdict_init: Invalid value for class %s in file %s"
		          " (should have been one ASCII punctuation - ignored)\n",
		          afdict_classname[AFDICT_INFIXMARK], afdict->name);
		free((void *)ac->string);
		ac->length = 0;
		ac->mem_elems = 0;
		ac->string = NULL;
	}
	/* XXX For now there is a possibility to use predefined SUF and PRE lists.
	 * So if SUF or PRE are defined, don't extract any of them from the dict. */
	if (1 == ac->length)
	{
		if ((0 == AFCLASS(afdict, AFDICT_PRE)->length) &&
		    (0 == AFCLASS(afdict, AFDICT_SUF)->length))
		{
			char last_entry[MAX_WORD+1] = "";
			get_dict_affixes(dict, dict->root, ac->string[0][0], last_entry);
		}
	}
	else
	{
		/* No INFIX_MARK - create a dummy one that always mismatches */
		affix_list_add(afdict, &afdict->afdict_class[AFDICT_INFIXMARK], "");
	}

	/* Debug dump of all non-empty affix classes. */
	if (debug_level(+D_AI))
	{
		size_t l;

		for (ac = afdict->afdict_class;
		     ac < &afdict->afdict_class[ARRAY_SIZE(afdict_classname)]; ac++)
		{
			if (0 == ac->length) continue;
			lgdebug(+0, "Class %s, %zd items:",
			        afdict_classname[ac-afdict->afdict_class], ac->length);
			for (l = 0; l < ac->length; l++)
				lgdebug(0, " '%s'", ac->string[l]);
			lgdebug(0, "\n");
		}
	}
#undef D_AI

	/* Store the SANEMORPHISM regex in the unused (up to now)
	 * regex_root element of the affix dictionary, and precompile it */
	assert(NULL == afdict->regex_root, "SM regex is already assigned");
	ac = AFCLASS(afdict, AFDICT_SANEMORPHISM);
	if (0 != ac->length)
	{
		int rc;

		Regex_node *sm_re = malloc(sizeof(*sm_re));
		dyn_str *rebuf = dyn_str_new();

		/* The regex used to be converted to: ^((original-regex)b)+$
		 * In the initial wordgraph version word boundaries are not supported,
		 * so instead it is converted to: ^(original-regex)+$ */
#ifdef WORD_BOUNDARIES
		dyn_strcat(rebuf, "^((");
#else
		dyn_strcat(rebuf, "^(");
#endif
		dyn_strcat(rebuf, ac->string[0]);
#ifdef WORD_BOUNDARIES
		dyn_strcat(rebuf, ")b)+$");
#else
		dyn_strcat(rebuf, ")+$");
#endif
		sm_re->pattern = strdup(rebuf->str);
		dyn_str_delete(rebuf);

		afdict->regex_root = sm_re;
		sm_re->name = strdup(afdict_classname[AFDICT_SANEMORPHISM]);
		sm_re->re = NULL;
		sm_re->next = NULL;
		sm_re->neg = false;
		rc = compile_regexs(afdict->regex_root, afdict);
		if (rc)
		{
			prt_error("Error: afdict_init: Failed to compile "
			          "regex '%s' in file %s, return code %d\n",
			          afdict_classname[AFDICT_SANEMORPHISM], afdict->name, rc);
			return false;
		}
		lgdebug(+5, "%s regex %s\n",
		        afdict_classname[AFDICT_SANEMORPHISM], sm_re->pattern);
	}

	/* sort the UNITS list */
	/* Longer unit names must get split off before shorter ones.
	 * This prevents single-letter splits from screwing things
	 * up. e.g. split 7gram before 7am before 7m */
	ac = AFCLASS(afdict, AFDICT_UNITS);
	if (0 < ac->length)
	{
		qsort(ac->string, ac->length, sizeof(char *), cmplen);
	}

#ifdef AFDICT_ORDER_NOT_PRESERVED
	/* pre-sort the MPRE list */
	ac = AFCLASS(afdict, AFDICT_MPRE);
	if (0 < ac->length)
	{
		/* Longer subwords have priority over shorter ones,
		 * reverse-sort by length.
		 * XXX mprefix_split() for Hebrew depends on that. */
		qsort(ac->string, ac->length, sizeof(char *), revcmplen);
	}
#endif /* AFDICT_ORDER_NOT_PRESERVED */

	concat_class(afdict, AFDICT_QUOTES);
	concat_class(afdict, AFDICT_BULLETS);

	if (! anysplit_init(afdict)) return false;

	return true;
}
/**
 * Compute the array of displayable word strings for a linkage.
 *
 * Fills linkage->word[] from the chosen disjuncts and the wordgraph path,
 * handling three orthogonal concerns:
 *  1. Null words (chosen disjunct == NULL): consecutive null subwords of the
 *     same sentence word are accumulated into a "null block" and displayed as
 *     one bracketed token.
 *  2. Morphology hiding (HIDE_MORPHO): the morphemes of an alternative are
 *     joined back into a single display word (base words first, then their
 *     subscripts, separated by SUBSCRIPT_SEP).
 *  3. Guess marks: words that were guessed (spell/run-on/regex/unknown) get a
 *     "[x]" annotation appended to the base word.
 * It also builds n_lwg_path, a new wordgraph path matching the display words,
 * stored in linkage->wg_path_display, and finally compacts linkage->word[]
 * (dropping discarded words) and remaps the link array accordingly.
 *
 * NOTE: chosen_words/remap/show_word are alloca'ed scratch arrays sized by
 * the ORIGINAL linkage->num_words; linkage->num_words is reduced at the end.
 */
void compute_chosen_words(Sentence sent, Linkage linkage, Parse_Options opts)
{
	WordIdx i;   /* index of chosen_words */
	WordIdx j;
	Disjunct **cdjp = linkage->chosen_disjuncts;
	/* Scratch arrays; freed automatically on return (alloca). */
	const char **chosen_words = alloca(linkage->num_words * sizeof(*chosen_words));
	int *remap = alloca(linkage->num_words * sizeof(*remap));
	bool *show_word = alloca(linkage->num_words * sizeof(*show_word));
	bool display_morphology = opts->display_morphology;

	Gword **lwg_path = linkage->wg_path;
	Gword **n_lwg_path = NULL; /* new Wordgraph path, to match chosen_words */

	Gword **nullblock_start = NULL; /* start of a null block, to be put in [] */
	size_t nbsize = 0;              /* number of word in a null block */
	Gword *sentence_word;

	memset(show_word, 0, linkage->num_words * sizeof(*show_word));

	if (verbosity_level(D_CCW))
		print_lwg_path(lwg_path, "Linkage");

	for (i = 0; i < linkage->num_words; i++)
	{
		Disjunct *cdj = cdjp[i];
		Gword *w;              /* current word */
		const Gword *nw;       /* next word (NULL if none) */
		Gword **wgp;           /* wordgraph_path traversing pointer */

		const char *t = NULL;  /* current word string */
		bool at_nullblock_end; /* current word is at end of a nullblock */
		bool join_alt = false; /* morpheme-join this alternative */
		char *s;
		size_t l;
		size_t m;

		lgdebug(D_CCW, "Loop start, word%zu: cdj %s, path %s\n",
		        i, cdj ? cdj->word_string : "NULL",
		        lwg_path[i] ? lwg_path[i]->subword : "NULL");

		w = lwg_path[i];
		/* NOTE(review): reading lwg_path[i+1] relies on the path array being
		 * NULL-terminated one past the last word — confirm against the path
		 * builder. */
		nw = lwg_path[i+1];
		wgp = &lwg_path[i];
		sentence_word = wg_get_sentence_word(sent, w);

		/* FIXME If the original word was capitalized in a capitalizable
		 * position, the displayed null word may be its downcase version. */

		if (NULL == cdj) /* a null word (the chosen disjunct was NULL) */
		{
			chosen_words[i] = NULL;
			nbsize++;
			if (NULL == nullblock_start) /* it starts a new null block */
				nullblock_start = wgp;

			/* The null block ends when the next word belongs to a different
			 * sentence word (or there is no next word). */
			at_nullblock_end = (NULL == nw) ||
				(wg_get_sentence_word(sent, nw->unsplit_word) != sentence_word);

			/* Accumulate null words in this alternative */
			if (!at_nullblock_end && (NULL == cdjp[i+1]) &&
			    ((w->morpheme_type == MT_PUNC) == (nw->morpheme_type == MT_PUNC)))
			{
				lgdebug(D_CCW, "Skipping word%zu cdjp=NULL#%zu, path %s\n",
				        i, nbsize, w->subword);
				chosen_words[i] = NULL;
				continue;
			}

			if (NULL != nullblock_start)
			{
				/* If we are here, this null word is an end of a null block */
				lgdebug(+D_CCW, "Handling %zu null words at %zu: ", nbsize, i);

				if (1 == nbsize)
				{
					/* Case 1: A single null subword. */
					lgdebug(D_CCW, "A single null subword.\n");
					t = join_null_word(sent, wgp, nbsize);

					gwordlist_append(&n_lwg_path, w);
				}
				else
				{
					lgdebug(D_CCW, "Combining null subwords");
					/* Use alternative_id to check for start of alternative. */
					if (((*nullblock_start)->alternative_id == *nullblock_start)
					    && at_nullblock_end)
					{
						/* Case 2: A null unsplit_word (all-nulls alternative).*/
						lgdebug(D_CCW, " (null alternative)\n");
						t = sentence_word->subword;

						gwordlist_append(&n_lwg_path, sentence_word);
					}
					else
					{
						/* Case 3: Join together >=2 null morphemes. */
						Gword *wgnull;

						lgdebug(D_CCW, " (null partial word)\n");
						wgnull = wordgraph_null_join(sent, wgp-nbsize+1, wgp);
						gwordlist_append(&n_lwg_path, wgnull);
						t = wgnull->subword;
					}
				}

				nullblock_start = NULL;
				nbsize = 0;
				show_word[i] = true;

				if (MT_WALL != w->morpheme_type)
				{
					/* Put brackets around the null word. */
					l = strlen(t) + 2;
					s = (char *) alloca(l+1);
					s[0] = NULLWORD_START;
					strcpy(&s[1], t);
					s[l-1] = NULLWORD_END;
					s[l] = '\0';
					t = string_set_add(s, sent->string_set);
					lgdebug(D_CCW, " %s\n", t);
					/* Null words have no links, so take care not to drop them. */
				}
			}
		}
		else
		{
			/* This word has a linkage. */

			/* TODO: Suppress "virtual-morphemes", currently the dictcap ones. */
			char *sm;

			t = cdj->word_string;
			/* Print the subscript, as in "dog.n" as opposed to "dog". */
			if (0)
			{
				/* TODO */
			}
			else
			{
				/* Get rid of those ugly ".Ixx" */
				if (is_idiom_word(t))
				{
					s = strdupa(t);
					sm = strrchr(s, SUBSCRIPT_MARK);
					/* Possible double subscript. */
					UNREACHABLE(NULL == sm); /* We know it has a subscript. */
					*sm = '\0';
					t = string_set_add(s, sent->string_set);
				}
				else if (HIDE_MORPHO)
				{
					/* Concatenate the word morphemes together into one word.
					 * Concatenate their subscripts into one subscript.
					 * Use subscript separator SUBSCRIPT_SEP.
					 * XXX Check whether we can encounter an idiom word here.
					 * FIXME Combining contracted words is not handled yet, because
					 * combining morphemes which have non-LL links to other words is
					 * not yet implemented.
					 * FIXME Move to a separate function. */
					Gword **wgaltp;
					size_t join_len = 0;
					size_t mcnt = 0; /* number of morphemes in this alternative */

					/* If the alternative contains morpheme subwords, mark it
					 * for joining... */

					const Gword *unsplit_word = w->unsplit_word;
					for (wgaltp = wgp, j = i; NULL != *wgaltp; wgaltp++, j++)
					{

						if ((*wgaltp)->unsplit_word != unsplit_word) break;
						if (MT_INFRASTRUCTURE ==
						    (*wgaltp)->unsplit_word->morpheme_type) break;
						mcnt++;

						if (NULL == cdjp[j])
						{
							/* ... but not if it contains a null word */
							join_alt = false;
							break;
						}
						join_len += strlen(cdjp[j]->word_string) + 1;
						if ((*wgaltp)->morpheme_type & IS_REG_MORPHEME)
							join_alt = true;
					}

					if (join_alt)
					{
						/* Join it in two steps: 1. Base words. 2. Subscripts.
						 * FIXME? Can be done in one step (more efficient but maybe
						 * less clear).
						 * Put SUBSCRIPT_SEP between the subscripts.
						 * XXX No 1-1 correspondence between the hidden base words
						 * and the subscripts after the join, in case there are base
						 * words with and without subscripts. */
						const char subscript_sep_str[] = { SUBSCRIPT_SEP, '\0'};
						char *join = calloc(join_len + 1, 1); /* zeroed out */

						join[0] = '\0';

						/* 1. Join base words. (Could just use the unsplit_word.) */
						for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
						{
							add_morpheme_unmarked(sent, join, cdjp[i+m]->word_string,
							                      (*wgaltp)->morpheme_type);
						}

						strcat(join, subscript_mark_str()); /* tentative */

						/* 2. Join subscripts. */
						for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
						{
							/* Cannot NULLify the word - we may have links to it. */
							if (m != mcnt-1) chosen_words[i+m] = "";

							sm = strchr(cdjp[i+m]->word_string, SUBSCRIPT_MARK);

							if (NULL != sm)
							{
								/* Supposing stem subscript is .=x (x optional) */
								if (MT_STEM == (*wgaltp)->morpheme_type)
								{
									sm += 1 + STEM_MARK_L; /* sm+strlen(".=") */
									if ('\0' == *sm) sm = NULL;
#if 0
									if ((cnt-1) == m)
									{
										/* Support a prefix-stem combination. In that case
										 * we have just nullified the combined word, so we
										 * need to move it to the position of the prefix.
										 * FIXME: May still not be good enough. */
										move_combined_word = i+m-1;

										/* And the later chosen_word assignment should be:
										 * chosen_words[-1 != move_combined_word ?
										 *    move_combined_word : i] = t; */
									}
									else
									{
										move_combined_word = -1;
									}
#endif
								}
							}
							if (NULL != sm)
							{
								strcat(join, sm+1);
								strcat(join, subscript_sep_str);
							}
						}

						/* Remove an extra mark, if any */
						join_len = strlen(join);
						if ((SUBSCRIPT_SEP == join[join_len-1]) ||
							 (SUBSCRIPT_MARK == join[join_len-1]))
							join[join_len-1] = '\0';

						gwordlist_append(&n_lwg_path, sentence_word);
						t = string_set_add(join, sent->string_set);
						free(join);

						/* Skip over the morphemes we just joined. */
						i += mcnt-1;
					}
				}
			}

			if (!join_alt) gwordlist_append(&n_lwg_path, *wgp);

			/*
			 * Add guess marks in [] square brackets, if needed, at the
			 * end of the base word. Convert the badly-printing
			 * SUBSCRIPT_MARK (hex 03 or ^C) into a period.
			 */
			if (t)
			{

				s = strdupa(t);
				sm = strrchr(s, SUBSCRIPT_MARK);
				if (sm) *sm = SUBSCRIPT_DOT;
				if ((!(w->status & WS_GUESS) && (w->status & WS_INDICT))
				    || !DISPLAY_GUESS_MARKS)
				{
					t = string_set_add(s, sent->string_set);
				}
				else
				{
					const char *regex_name = w->regex_name;
					/* 4 = 1(null) + 1(guess_mark) + 2 (sizeof "[]") */
					int baselen = NULL == sm ? strlen(t) : (size_t)(sm-s);
					char guess_mark = 0;

					switch (w->status & WS_GUESS)
					{
						case WS_SPELL:
							guess_mark = GM_SPELL;
							break;
						case WS_RUNON:
							guess_mark = GM_RUNON;
							break;
						case WS_REGEX:
							guess_mark = GM_REGEX;
							break;
						case 0:
							guess_mark = GM_UNKNOWN;
							break;
						default:
							assert(0, "Missing 'case: %2x'", w->status & WS_GUESS);
					}

					/* In the case of display_morphology==0, the guess indication of
					 * the last subword is used as the guess indication of the whole
					 * word.
					 * FIXME? The guess indications of other subwords are ignored in
					 * this mode. This implies that if a first or middle subword has
					 * a guess indication but the last subword doesn't have, no guess
					 * indication would be shown at all. */

					if ((NULL == regex_name) || HIDE_MORPHO) regex_name = "";
					s = alloca(strlen(t) + strlen(regex_name) + 4);
					strncpy(s, t, baselen);
					s[baselen] = '[';
					s[baselen + 1] = guess_mark;
					strcpy(s + baselen + 2, regex_name);
					strcat(s, "]");
					if (NULL != sm) strcat(s, sm);
					t = string_set_add(s, sent->string_set);
				}
			}
		}

		assert(t != NULL, "Word %zu: NULL", i);
		chosen_words[i] = t;
	}

	/* Conditional test removal of quotation marks and the "capdict" tokens,
	 * to facilitate using diff on sentence batch runs. */
	if (test_enabled("removeZZZ"))
	{
		for (i=0, j=0; i<linkage->num_links; i++)
		{
			Link *lnk = &(linkage->link_array[i]);

			if (0 == strcmp("ZZZ", lnk->link_name))
				chosen_words[lnk->rw] = NULL;
		}
	}

	/* If morphology printing is being suppressed, then all links
	 * connecting morphemes will be discarded. */
	if (HIDE_MORPHO)
	{
		/* Discard morphology links. */
		for (i=0; i<linkage->num_links; i++)
		{
			Link * lnk = &linkage->link_array[i];

			if (is_morphology_link(lnk->link_name))
			{
				/* Mark link for discarding. */
				lnk->link_name = NULL;
			}
			else
			{
				/* Mark word for not discarding. */
				show_word[lnk->rw] = true;
				show_word[lnk->lw] = true;
			}
		}
	}

	/* We alloc a little more than needed, but so what... */
	linkage->word = (const char **) exalloc(linkage->num_words*sizeof(char *));

	/* Copy over the chosen words, dropping the discarded words.
	 * However, don't discard existing words (chosen_words[i][0]).
	 * Note that if a word only has morphology links and is not combined with
	 * another word, then it will get displayed with no links at all (e.g.
	 * when explicitly specifying root and suffix for debug: root.= =suf */
	for (i=0, j=0; i<linkage->num_words; ++i)
	{
		if (chosen_words[i] &&
		    (chosen_words[i][0] || (!HIDE_MORPHO || show_word[i])))
		{
			const char *cwtmp = linkage->word[j];
			linkage->word[j] = chosen_words[i];
			chosen_words[i] = cwtmp;
			remap[i] = j;
			j++;
		}
		else
		{
			remap[i] = -1;
		}
	}
	linkage->num_words = j;

	remap_linkages(linkage, remap); /* Update linkage->link_array / num_links. */

	linkage->wg_path_display = n_lwg_path;

	if (verbosity_level(D_CCW))
		print_lwg_path(n_lwg_path, "Display");
}
/**
 * Validate that the chosen disjuncts of a linkage form a sane path
 * through the sentence's Wordgraph, and that the resulting morpheme-type
 * sequence matches the SANEMORPHISM regex of the affix dictionary.
 *
 * The check advances a frontier of candidate Wordgraph positions
 * (wp_old -> wp_new) word by word, in all paths in parallel:
 *  - A null word (NULL chosen disjunct) matches any Wordgraph word.
 *  - A regular word must appear in the word list of its chosen disjunct.
 * After the word loop, the linkage is complete only if the dummy
 * termination word (MT_INFRASTRUCTURE) is reachable.
 *
 * On success: stores the matched path in lkg->wg_path and returns true.
 * On failure: records the violation in lkg->lifo, decrements
 * sent->num_valid_linkages, and returns false.
 */
bool sane_linkage_morphism(Sentence sent, Linkage lkg, Parse_Options opts)
{
	Wordgraph_pathpos *wp_new = NULL; /* positions reachable after this word */
	Wordgraph_pathpos *wp_old = NULL; /* positions reachable before this word */
	Wordgraph_pathpos *wpp;
	Gword **next; /* next Wordgraph words of the current word */
	size_t i;
	Linkage_info * const lifo = &lkg->lifo;

	bool match_found = true; /* if all the words are null - it's still a match */
	Gword **lwg_path; /* assigned only when match_found (see below) */

	Dictionary afdict = sent->dict->affix_table; /* for SANEMORPHISM */
	/* One affix-type char per word is enough; *2 leaves room for the
	 * (unimplemented) WORD_BOUNDARIES end markers. */
	char *const affix_types = alloca(sent->length*2 + 1); /* affix types */
	affix_types[0] = '\0';

	/* Populate the path word queue, initializing the path to NULL. */
	for (next = sent->wordgraph->next; *next; next++)
	{
		wordgraph_path_append(&wp_new, /*path*/NULL, /*add_word*/NULL, *next);
	}
	assert(NULL != wp_new, "Path word queue is empty");

	for (i = 0; i < lkg->num_words; i++)
	{
		Disjunct *cdj;            /* chosen disjunct */

		lgdebug(D_SLM, "%p Word %zu: ", lkg, i);

		if (NULL == wp_new)
		{
			lgdebug(+D_SLM, "- No more words in the wordgraph\n");
			match_found = false;
			break;
		}

		/* Shift frontier: free the previous one and make the new one current. */
		if (wp_old != wp_new)
		{
			wordgraph_path_free(wp_old, true);
			wp_old = wp_new;
		}
		wp_new = NULL;
		//wordgraph_pathpos_print(wp_old);

		cdj = lkg->chosen_disjuncts[i];
		/* Handle null words */
		if (NULL == cdj)
		{
			lgdebug(D_SLM, "- Null word\n");
			/* A null word matches any word in the Wordgraph -
			 * so, unconditionally proceed in all paths in parallel. */
			match_found = false;
			for (wpp = wp_old; NULL != wpp->word; wpp++)
			{
				if (NULL == wpp->word->next)
					continue; /* This path encountered the Wordgraph end */

				/* The null words cannot be marked here because wpp->path consists
				 * of pointers to the Wordgraph words, and these words are common to
				 * all the linkages, with potentially different null words in each
				 * of them. However, the position of the null words can be inferred
				 * from the null words in the word array of the Linkage structure. */
				for (next = wpp->word->next; NULL != *next; next++)
				{
					match_found = true;
					wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next);
				}
			}
			continue;
		}

		if (!match_found)
		{
			const char *e = "Internal error: Too many words in the linkage\n";
			lgdebug(D_SLM, "- %s", e);
			prt_error("Error: %s.", e);
			break;
		}

		assert(MT_EMPTY != cdj->word[0]->morpheme_type); /* already discarded */

		if (debug_level(D_SLM)) print_with_subscript_dot(cdj->string);

		match_found = false;
		/* Proceed in all the paths in which the word is found. */
		for (wpp = wp_old; NULL != wpp->word; wpp++)
		{
			const Gword **wlp; /* disjunct word list */

			for (wlp = cdj->word; *wlp; wlp++)
			{
				if (*wlp == wpp->word)
				{
					match_found = true;
					for (next = wpp->word->next; NULL != *next; next++)
					{
						wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next);
					}
					break;
				}
			}
		}

		if (!match_found)
		{
			/* FIXME? A message can be added here if there are too many words
			 * in the linkage (can happen only if there is an internal error). */
			lgdebug(D_SLM, "- No Wordgraph match\n");
			break;
		}
		lgdebug(D_SLM, "\n");
	}

	if (match_found)
	{
		match_found = false;
		/* Validate that there are no missing words in the linkage. It is so if
		 * the dummy termination word is found in the new pathpos queue. */
		if (NULL != wp_new)
		{
			for (wpp = wp_new; NULL != wpp->word; wpp++)
			{
				if (MT_INFRASTRUCTURE == wpp->word->morpheme_type)
				{
					match_found = true;
					/* Exit the loop with with wpp of the termination word. */
					break;
				}
			}
		}
		if (!match_found)
			lgdebug(D_SLM, "%p Missing word(s) at the end of the linkage.\n", lkg);
	}

#define DEBUG_morpheme_type 0
	/* Check the morpheme type combination.
	 * If null_count > 0, the morpheme type combination may be invalid
	 * due to null subwords, so skip this check. */
	if (match_found && (0 == sent->null_count) &&
		(NULL != afdict) && (NULL != afdict->regex_root))
	{
		const Gword **w;
		char *affix_types_p = affix_types;

		/* Construct the affix_types string. */
#if DEBUG_morpheme_type
		print_lwg_path(wpp->path);
#endif
		i = 0;
		for (w = wpp->path; *w; w++)
		{
			i++;

			if (MT_EMPTY == (*w)->morpheme_type) continue; /* really a null word */

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch-enum"
			switch ((*w)->morpheme_type)
			{
#pragma GCC diagnostic pop
				default:
					/* What to do with the rest? */
				case MT_WORD:
					*affix_types_p = AFFIXTYPE_WORD;
					break;
				case MT_PREFIX:
					*affix_types_p = AFFIXTYPE_PREFIX;
					break;
				case MT_STEM:
					*affix_types_p = AFFIXTYPE_STEM;
					break;
				case MT_MIDDLE:
					*affix_types_p = AFFIXTYPE_MIDDLE;
					break;
				case MT_SUFFIX:
					*affix_types_p = AFFIXTYPE_SUFFIX;
					break;
			}

#if DEBUG_morpheme_type
			lgdebug(D_SLM, "Word %zu: %s affixtype=%c\n",
			     i, (*w)->subword,  *affix_types_p);
#endif

			affix_types_p++;
		}
		*affix_types_p = '\0';

#ifdef WORD_BOUNDARIES /* not yet implemented */
		{
			const Gword *uw;

			/* If w is an "end subword", return its unsplit word, else NULL. */
			uw = word_boundary(w); /* word_boundary() unimplemented */

			if (NULL != uw)
			{
				*affix_types_p++ = AFFIXTYPE_END;
				lgdebug(D_SLM, "%p End of Gword %s\n", lkg, uw->subword);
			}
		}
#endif

		/* Check if affix_types is valid according to SANEMORPHISM. */
		if (('\0' != affix_types[0]) &&
		    (NULL == match_regex(afdict->regex_root, affix_types)))
		{
			/* Morpheme type combination is invalid */
			match_found = false;
			/* Notify to stdout, so it will be shown along with the result.
			 * XXX We should have a better way to notify. */
			if (0 < opts->verbosity)
				printf("Warning: Invalid morpheme type combination '%s', "
				       "run with !bad and !verbosity=4 to debug\n", affix_types);
		}
	}

	/* wpp still points at the termination-word entry found above. */
	if (match_found) lwg_path = (Gword **)wpp->path; /* OK to modify */

	wordgraph_path_free(wp_old, true);
	/* Keep the matched path alive (it is now owned by lkg->wg_path). */
	wordgraph_path_free(wp_new, !match_found);

	if (match_found)
	{
		if ('\0' != affix_types[0])
		{
			lgdebug(D_SLM, "%p Morpheme type combination '%s'\n",
			        lkg, affix_types);
		}
		lgdebug(+D_SLM, "%p SUCCEEDED\n", lkg);
		lkg->wg_path = lwg_path;
		return true;
	}

	/* Oh no ... invalid morpheme combination! */
	sent->num_valid_linkages --;
	lifo->N_violations++;
	lifo->pp_violation_msg = "Invalid morphism construction.";
	lkg->wg_path = NULL;
	lifo->discarded = true;
	lgdebug(D_SLM, "%p FAILED\n", lkg);
	return false;
}
/**
 * Split randomly.
 * Return true on success.
 * Return false when:
 * - disabled (i.e. when doing regular language processing).
 * - an error occurs (the behavior then is undefined).
 *   Such an error has not been observed yet.
 *
 * Generates random split alternatives of `word` into up to as->nparts
 * morphemes, using the precomputed split table as->scl[word-length].
 * Splits are sampled (up to as->altsmin issued) unless the number of
 * possible splits is at most as->altsmax, in which case all are issued.
 * Selected splits are registered via add_alternative(); their selection
 * state persists in as->scl[l] across calls for same-length words.
 */
bool anysplit(Sentence sent, const char *word)
{
	Dictionary afdict = sent->dict->affix_table;
	anysplit_params *as;
	Afdict_class * stemsubscr;
	size_t stemsubscr_len;

	size_t l = strlen(word);
	p_list pl;                /* split-point list of the current sample */
	size_t pos;
	int p;
	int sample_point;
	size_t nsplits;           /* total number of possible splits */
	size_t rndtried = 0;      /* number of distinct samples tried */
	size_t rndissued = 0;     /* number of samples that matched and got issued */
	size_t i;
	unsigned int seed = 0;    /* fixed seed - sampling is deterministic */
	char *prefix_string = alloca(l+2+1); /* word + ".=" + NUL */
	char *suffix_string = alloca(l+1);   /* word + NUL */
	bool use_sampling = true;
	/* NOTE(review): afdict is used here before the NULL check below —
	 * presumably INFIX_MARK() itself tolerates NULL; confirm its definition. */
	const char infix_mark = INFIX_MARK(afdict);

	if (NULL == afdict) return false;
	as = afdict->anysplit;

	if ((NULL == as) || (0 == as->nparts)) return false; /* Anysplit disabled */

	if (0 == l)
	{
		prt_error("Warning: anysplit(): word length 0\n");
		return false;
	}

	stemsubscr = AFCLASS(afdict, AFDICT_STEMSUBSCR);
	stemsubscr_len = (NULL == stemsubscr->string[0]) ? 0 :
		strlen(stemsubscr->string[0]);

	/* Don't split morphemes again. If INFIXMARK and/or SUBSCRMARK are
	 * not defined in the affix file, then morphemes may get split again unless
	 * restricted by REGPRE/REGMID/REGSUF. */
	if (word[0] == infix_mark) return true;
	if ((l > stemsubscr_len) &&
	    (0 == strcmp(word+l-stemsubscr_len, stemsubscr->string[0])))
		return true;

	// seed = time(NULL)+(unsigned int)(long)&seed;

#if DEBUG_ANYSPLIT
	gw = word;
#endif

	nsplits = split(l, as->nparts, &as->scl[l]);
	if (0 == nsplits)
	{
		prt_error("Warning: anysplit(): split() failed (shouldn't happen)\n");
		return false;
	}

	if (as->altsmax >= nsplits)
	{
		/* Issue everything */
		sample_point = -1; /* pre-increment start; the loop walks 0..nsplits-1 */
		use_sampling = false;
	}

	lgdebug(+2, "Start%s sampling: word=%s, nsplits=%zu, maxsplits=%d, "
	        "as->altsmin=%zu, as->altsmax=%zu\n", use_sampling ? "" : " no",
	        word, nsplits, as->nparts, as->altsmin, as->altsmax);

	/* Sample until altsmin splits were issued (or all splits were tried). */
	while (rndtried < nsplits && (!use_sampling || (rndissued < as->altsmin)))
	{
		if (use_sampling)
		{
			sample_point = rng_uniform(&seed, nsplits);

			if (sample_point < 0) /* Cannot happen with rand_r() */
			{
				prt_error("Error: rng: %s\n", strerror(errno));
				return false;
			}
		}
		else
		{
			sample_point++;
		}

		lgdebug(2, "Sample: %d ", sample_point);
		if (as->scl[l].p_tried[sample_point])
		{
			lgdebug(4, "(repeated)\n");
			continue;
		}
		lgdebug(4, "(new)");
		rndtried++;
		as->scl[l].p_tried[sample_point] = true;
		/* Issue the sample only if its morphemes match REGPRE/REGMID/REGSUF. */
		if (morpheme_match(sent, word, l, &as->scl[l].sp[sample_point*as->nparts]))
		{
			as->scl[l].p_selected[sample_point] = true;
			rndissued++;
		}
		else
		{
			lgdebug(2, "\n");
		}
	}

	lgdebug(2, "Results: word '%s' (length=%zu): %zu/%zu:\n",
	        word, l, rndissued, nsplits);

	/* Build the actual alternatives from the selected split-point lists. */
	for (i = 0; i < nsplits; i++)
	{
		const char **suffixes = NULL;
		int num_suffixes = 0;

		if (!as->scl[l].p_selected[i]) continue;

		pl = &as->scl[l].sp[i*as->nparts];
		pos = 0;
		for (p = 0; p < as->nparts; p++)
		{
			if (pl[0] == (int)l)  /* This is the whole word */
			{
				strncpy(prefix_string, &word[pos], pl[p]-pos);
				prefix_string[pl[p]-pos] = '\0';
			}
			else if (0 == pos)   /* The first but not the only morpheme */
			{
				strncpy(prefix_string, &word[pos], pl[p]-pos);
				prefix_string[pl[p]-pos] = '\0';
				if (0 != stemsubscr->length)
					strcat(prefix_string, stemsubscr->string[0]);
			}
			else  /* 2nd and on morphemes */
			{
				strncpy(suffix_string, &word[pos], pl[p]-pos);
				suffix_string[pl[p]-pos] = '\0';
				altappend(sent, &suffixes, suffix_string);
				num_suffixes++;
			}

			pos = pl[p];
			if (pos == l) break;
		}

		/* Here a leading INFIX_MARK is added to the suffixes if needed. */
		add_alternative(sent, 0,NULL, 1,(const char **)&prefix_string,
		                num_suffixes,suffixes);
		free(suffixes);
	}

	return true;
}
/**
 * This fills the linkage array with morphologically-acceptable
 * linkages.
 *
 * For each candidate linkage: extract the links, check it with
 * sane_linkage_morphism(), and keep it only if the check passes.
 * A rejected linkage's slot (and its partially-initialized state) is
 * reused for the next try; `need_init` tracks whether the current slot
 * still needs partial_init_linkage(). When more linkages were found
 * than allocated (or the count overflowed), candidates are extracted
 * with random sampling, bounded by MAX_TRIES extra attempts.
 *
 * On return, sent->num_valid_linkages == sent->num_linkages_alloced ==
 * the number of accepted linkages.
 */
static void process_linkages(Sentence sent, extractor_t* pex, bool overflowed,
                             Parse_Options opts)
{
	if (0 == sent->num_linkages_found) return;
	if (0 == sent->num_linkages_alloced) return; /* Avoid a later crash. */

	/* Pick random linkages if we get more than what was asked for. */
	bool pick_randomly = overflowed ||
	    (sent->num_linkages_found > (int) sent->num_linkages_alloced);

	sent->num_valid_linkages = 0;
	size_t N_invalid_morphism = 0;

	int itry = 0;
	size_t in = 0;      /* index of the next linkage slot to fill */
	int maxtries;

	/* In the case of overflow, which will happen for some long
	 * sentences, but is particularly common for the amy/ady random
	 * splitters, we want to find as many morpho-acceptable linkages
	 * as possible, but keep the CPU usage down, as these might be
	 * very rare. This is due to a bug/feature in the interaction
	 * between the word-graph and the parser: valid morph linkages
	 * can be one-in-a-thousand.. or worse. Search for them, but
	 * don't over-do it.
	 * Note: This problem has recently been alleviated by an
	 * alternatives-compatibility check in the fast matcher - see
	 * alt_connection_possible().
	 */
#define MAX_TRIES 250000
	if (pick_randomly)
	{
		/* Try picking many more linkages, but not more than possible. */
		maxtries = MIN((int) sent->num_linkages_alloced + MAX_TRIES,
		               sent->num_linkages_found);
	}
	else
	{
		maxtries = sent->num_linkages_alloced;
	}

	bool need_init = true;
	for (itry=0; itry<maxtries; itry++)
	{
		Linkage lkg = &sent->lnkages[in];
		Linkage_info * lifo = &lkg->lifo;

		/* Negative values tell extract-links to pick randomly; for
		 * reproducible-rand, the actual value is the rand seed. */
		lifo->index = pick_randomly ? -(itry+1) : itry;

		if (need_init)
		{
			partial_init_linkage(sent, lkg, sent->length);
			need_init = false;
		}
		extract_links(pex, lkg);
		compute_link_names(lkg, sent->string_set);

		if (verbosity_level(+D_PL))
		{
			err_msg(lg_Debug, "chosen_disjuncts before:\n\\");
			print_chosen_disjuncts_words(lkg, /*prt_opt*/true);
		}

		if (sane_linkage_morphism(sent, lkg, opts))
		{
			remove_empty_words(lkg);

			if (verbosity_level(+D_PL))
			{
				err_msg(lg_Debug, "chosen_disjuncts after:\n\\");
				print_chosen_disjuncts_words(lkg, /*prt_opt*/false);
			}

			/* Accepted: advance to a fresh slot. */
			need_init = true;
			in++;
			if (in >= sent->num_linkages_alloced) break;
		}
		else
		{
			/* Rejected: clear the slot state so it can be reused as-is. */
			N_invalid_morphism++;
			lkg->num_links = 0;
			lkg->num_words = sent->length;
			// memset(lkg->link_array, 0, lkg->lasz * sizeof(Link));
			memset(lkg->chosen_disjuncts, 0, sent->length * sizeof(Disjunct *));
		}
	}

	/* The last one was alloced, but never actually used. Free it. */
	if (!need_init) free_linkage(&sent->lnkages[in]);

	sent->num_valid_linkages = in;

	/* The remainder of the array is garbage; we never filled it in.
	 * So just pretend that it's shorter than it is */
	sent->num_linkages_alloced = sent->num_valid_linkages;

	lgdebug(D_PARSE, "Info: sane_morphism(): %zu of %d linkages had "
	        "invalid morphology construction\n", N_invalid_morphism,
	        itry + (itry != maxtries));
}
/* Was main() of the test program...
 *
 * Debug/test driver: converts the tokenizer input pattern `inpat` into a
 * backtracking PCRE pattern (capture groups become "(?:...)(?C)" with
 * dict-lookup callouts), compiles it, and runs it over `str` with
 * pcre_exec(), letting the callouts enumerate all tokenizations.
 * Returns 0 on success, a nonzero diagnostic code on any failure.
 *
 * NOTE(review): `flags` is accepted but never read in this body.
 * NOTE(review): the early-return error paths leak `pat`, the first
 * compiled `pcre` and `callout_data.cgnum[]`; the first `pcre` from the
 * syntax check is also leaked when overwritten by the second
 * pcre_compile(). Tolerable in a test driver, but worth fixing with a
 * goto-cleanup if this code is ever promoted.
 */
static int regex_split(const char *inpat, int flags, const char *str, Dictionary dict)
{
	const char *p;
	dyn_str *pat;
	int plevel;  /* paren level */
	int cglevel; /* capture group level */
	int nplevel; /* paren level within named capture group */
	int icgnum;  /* capture group number*/
	int options;
	const char *errptr;
	int erroffset;
	pcre *pcre;
	const char * const prog = "regex_tokenizer_test";
	int rc;
	pcre_extra *extra = NULL;
#define OVCNT 15
	int ovector[OVCNT];
	callout_data_t callout_data;

#if 0
	const char **wordlist;
#endif
	bool word_compare_flag = true;
#ifdef notdef
	dyn_str *wordalts;
#endif
	const char *group_name = NULL;
	char *word_classname;
	char c0[2] = "\0\0"; /* 1-char string buffer for appending to pat */

	/* FIXME: validate we use PCRE version 2 at least. */

	/* Find the number of capturing groups in the input pattern. */
	icgnum = 0;
	for (p = inpat; '\0' != *p; p++)
	{
		/* Count as capture groups only (string) or (?<name>). Especially, avoid
		 * counting (?<=...) (positive look behind) and (?(condition)...) (the
		 * (condition) part).
		 * FIXME: support () inside [].
		 * FIXME: support \. */
		if ((*p == '(') && (*p != '*') &&
		    ((p[1] != '?') || ((p[2] == '<') && (p[3] != '='))) &&
		    ((p-inpat < 2) || (p[-2] != '(') || (p[-1] != '?')))
		{
			icgnum++;
		}
	}
	if (0 == icgnum)
	{
		printf("%s: pattern must include at least one () group (was: %s)\n",
		       prog, inpat);
		return 9;
	}
#if 0
	if (p[-1] != '$')
	{
		/* FIXME: add $ if needed */
		printf("%s: pattern must end with $ (was: %s)\n", prog, inpat);
		return 9;
	}
#endif

	/* Regex syntax check of the pattern.
	 * FIXME: Add support for "(?J)" */
	options = PCRE_UTF8;
	pcre = pcre_compile(inpat, options, &errptr, &erroffset, NULL);
	if (NULL == pcre)
	{
		printf("%s: pcre_compile: Error in pattern '%s' at offset %d: %s\n",
		       prog, inpat, erroffset, errptr);
		return 2;
	}

	callout_data.wordlist = NULL;
	callout_data.cgnum = NULL;

	if (word_compare_flag)
	{
		int i;

#if 0
		callout_data.wordlist = malloc(sizeof(*callout_data.wordlist)*icgnum);
#endif
		/* Per-capture-group lookup info, filled while scanning group names. */
		callout_data.cgnum = malloc(sizeof(*callout_data.cgnum)*icgnum);
		//printf("ALLOCATED callout_data.cgnum %ld for %d groups\n",
		//sizeof(*callout_data.wordlist)*cgnum, icgnum);
		for (i = 0; i < icgnum; i++)
		{
#if 0
			callout_data.wordlist[i] = NULL;
#endif
			callout_data.cgnum[i] = NULL;
		}
	}

	/* Build the pattern that finds all possible matches. */
	pat = dyn_str_new();
	plevel = 0;
	cglevel = 0;
	icgnum = -1; /* First capture group (plevel==1) is icgnum==0. */

	/* Convert the input regex to the tokenizer regex.
	 * cglevel counts named capture groups
	 * plevel counts all groups
	 *
	 * FIXME: Add support for:
	 * (?x) - comment mode.
	 * (?i) - ignore case.
	 * \ - backslash for ()<>?* .
	 * [] - () inside it
	 * FIXME: Add "(?: ... )" over the result pattern. */

	//dyn_strcat(pat, "(?J)");

	for (p = inpat; '\0' != *p; p++)
	{
		char *re = NULL; /* a regex from the 4.0.regex file */

		switch (*p)
		{
			const char *c;

			case '(':
				if (cglevel > 0)
				{
					printf("Error at position %ld: Tokenizer capture groups cannot have nested groups\n", p-inpat);
				}
				plevel++;
				/* Skip non-capturing constructs: (*...), (?...) that is not a
				 * named group, and the (condition) part of (?(condition)...). */
				if ((p[1] == '*') ||
				    ((p[1] == '?') && ((p[2] != '<') || (p[3] == '='))) ||
				    ((p-inpat > 1) && (p[-2] == '(') && (p[-1] == '?')))
				{
					break;
				}
				cglevel++;
				if (cglevel > 1)
				{
					printf("Error at position %ld: Tokenizer aregex cannot have capture group level > 1\n", p-inpat);
					free(callout_data.cgnum);
					return 199;
				}
				icgnum++;
				/* Capture groups become non-capturing in the rebuilt pattern;
				 * the callout does the capturing instead. */
				dyn_strcat(pat, "(?:");
				group_name = NULL;
				break;

			case ')':
				plevel--;
				if (cglevel > 0)
				{
					cglevel--;
					/* Add the dict lookup and capturing callback. */
					dyn_strcat(pat, ")(?C)");
				}
				group_name = NULL;
				break;

			case '<':
				/* Remember it as a potential start of a named group. */
				if ((p-2 >= inpat) && (p[-2] == '(') && (p[-1] == '?') &&
				    (p[1] != '='))
				{
					group_name = p + 1;
				}
				else
					group_name = NULL;
				break;

			case '>':
				if (NULL != group_name)
				{
					/* Check if this is actually a group name */
					for (c = group_name; c < p; c++)
					{
						/* FIXME: 'a' and 'p' are part of a hack for lookup_mark.
						 * FIXME: 'r' is part of a hack for regex names that match affix
						 * class names. The fix is not to use matching names. */
						if ((*c > 'Z' || *c < 'A') && *c != 'a' && *c != 'p' && *c != 'r')
							break;
					}
					if (c == p)
					{
						word_classname = malloc(p-group_name+1);
						strncpy(word_classname, group_name, p-group_name);
						word_classname[p-group_name] = '\0';
					}
					else
					{
						printf("%s: Invalid class name in group name found at '%s'\n",
						       prog, group_name-4);
						word_classname = NULL;
					}
				}
				else
				{
					word_classname = NULL;
				}

				if (!word_classname)
				{
					group_name = NULL;
					break;
				}

				dyn_strcat(pat, ">");
				lgdebug(6, "Found word-class %s\n", word_classname);

#if 0
				wordlist = readwords(word_classname);
				if (NULL == wordlist)
				{
					printf("i%s: Invalid class name %s in group name\n", prog, word_classname);
					return 100;
				}

				if (!word_compare_flag)
				{
					printf("Invocation without -w is not supported\n");
					return 103;
				}
#endif

				if (word_compare_flag)
				{
					char *t;
					const char *lookup_mark = NULL;

#if 0
					callout_data.wordlist[icgnum] = wordlist;
					printf("WORDLIST %p at cgnum %d\n", wordlist, icgnum);
#endif
					/* Allocate per group info */
					callout_data.cgnum[icgnum] = malloc(sizeof(*(callout_data.cgnum)[0]));
					callout_data.cgnum[icgnum]->name = NULL;
					//printf("ALLOCATED cgnum[%d]=%p\n", icgnum,
					//callout_data.cgnum[icgnum]);

					/* A hack for testing: Handle WORDpX or WORDaX.
					 * The above a/p marks mean append/prepend X to word before making
					 * the lookup.
					 * FIXME: Find another way to specify that, maybe in the affix file
					 * or in a tokenizer definition file. */
					t = strpbrk(word_classname, "pa");
					if (NULL != t)
					{
						Afdict_class *ac;

						callout_data.cgnum[icgnum]->lookup_mark_pos = *t;
						*t = '\0';
						ac = afdict_find(dict->affix_table, t+1, /*notify_err*/false);
						if (NULL == ac)
						{
							printf("%s: Unknown afclass '%s'\n", prog, t+1);
							return 253;
						}
						/* Check if the requested affix class is defined and is not an
						 * empty string (like the default INFIXMARK). */
						if (0 == ac->length || '\0' == ac->string[0][0])
						{
							printf("%s: No value for afclass '%s'\n", prog, t+1);
							return 252;
						}
						lookup_mark = ac->string[0]; /* FIXME: support more than one value. */
					}
					callout_data.cgnum[icgnum]->lookup_mark = lookup_mark;
					callout_data.cgnum[icgnum]->name = word_classname;

					if (0 == strcmp(word_classname, "DICTWORD"))
					{
						/* Assign data for looking up a word in the main dict. */
						callout_data.cgnum[icgnum]->dict = dict;
						callout_data.cgnum[icgnum]->afclass = NULL;
					}
					else if (afdict_find(dict->affix_table, word_classname, /*notify_err*/false))
					{
						callout_data.cgnum[icgnum]->dict = dict->affix_table;
						callout_data.cgnum[icgnum]->afclass = word_classname;
					}
					else
					{
						if ('r' == word_classname[0]) word_classname++;
						re = get_regex_by_name(dict, word_classname);
						if (re)
						{
							lgdebug(6, "Regex %s with modified groups: '%s'\n",
							        word_classname, re);
							callout_data.cgnum[icgnum]->dict = NULL;
							/* FIXME: No need to allocate callout_data.cgnum[icgnum] in this
							 * case. */
						}
						else
						{
							printf("%s: Unknown word classname '%s'\n", prog, word_classname);
							return 254;
						}
					}
					/* TODO: Assign flags, e.g. for emitting the words with stem/infix marks. */
				}
				else
				{
#if 0
					wordalts = make_wordalts(wordlist);
					dyn_strcat(pat, wordalts->str);
					dyn_str_delete(wordalts);
					free(wordlist);
#else
					printf("%s: Invocation without -w is not supported\n", prog);
					return 103;
#endif
				}

				/* Default match for dictionary lookup is ".*".
				 * Allow replacing it by something else.
				 * E.g: .{2,}|a */
				if (')' == p[1])
				{
					if (NULL == re)
					{
						dyn_strcat(pat, ".*");
					}
					else
					{
						dyn_strcat(pat, re);
						free(re);
						re = NULL;
					}
				}
				else
				{
					/* Copy the group body verbatim up to its closing paren. */
					nplevel = 1;
					/* FIXME: Add support for:
					 * (?x) - comment mode.
					 * \ - backslash for ()<>?* .
					 * [] - () inside it */
					for (; p[1] != '\0' && nplevel > 0; p++)
					{
						switch (p[1])
						{
							case '(':
								if (('?' != p[2]) && ('*' != p[2]) &&
								    ((p[-1] != '(') || (p[0] != '?')))
								{
									printf("%s: Capture_group %d: Nested capture group is not supported\n",
									       prog, icgnum+1);
									return 250;
								}
								nplevel++;
								break;
							case ')':
								nplevel--;
								if (0 == nplevel) continue; /* we are done */
								break;
						}

						c0[0] = p[1];
						dyn_strcat(pat, c0);
					}
					p--;
				}

				word_classname = NULL;
				group_name = NULL;
				continue;
		}
		/* Default: copy the current input character to the output pattern. */
		c0[0] = *p;
		dyn_strcat(pat, c0);
	}

	/* Add '$' at the end if needed. */
	if ('$' != pat->str[pat->end-1])
		dyn_strcat(pat, "$");

	/* Add the backtracking callback. */
	dyn_strcat(pat, "(?C1)");

	printf("Modified pattern: %s", pat->str);
	lgdebug(2, " (len %zu/%zu)", pat->end, pat->len);
	printf("\n");

	pcre_callout = callout;

	callout_data.function = 1;
	callout_data.subp_i = 0;
	callout_data.subp[0].s = 0;
	callout_data.subp[0].e = SUBP0END_DEBUG_SIGNATURE;
	callout_data.subp_ovfl = false;
	callout_data.capture_last = 0;
	callout_data.pattern = pat->str;
	callout_data.alt_counter = 0;

	options = PCRE_UTF8;
	pcre = pcre_compile(pat->str, options, &errptr, &erroffset, NULL);
	if (NULL == pcre)
	{
		printf("%s: Internal error: pcre_compile: Error in pattern '%s' at offset %d: %s\n",
		       prog, pat->str, erroffset, errptr);
		return 99;
	}

	/* TODO: Check if using JIT may optimize out some needed callouts. */
	options = 0; //PCRE_STUDY_JIT_COMPILE;
	extra = pcre_study(pcre, options, &errptr);
	if (NULL == extra)
	{
		if (NULL != errptr)
		{
			printf("%s: pcre_study: Error for pattern '%s': %s\n", prog, pat->str, errptr);
			return 3;
		}
		/* pcre_study() may legitimately return NULL with no error;
		 * fabricate an empty pcre_extra so callout_data can be attached. */
		extra = malloc(sizeof(*extra));
		memset(extra, 0, sizeof(*extra));
	}
	else
	{
		/* For some reason JIT is sometimes done even though it was not requested.
		 * But the callouts are still invoked as expected in such cases. */
		lgdebug(6, "%s: pcre_study: JIT %ld\n", prog, extra->flags & PCRE_STUDY_JIT_COMPILE);
	}

#if 0
	extra->match_limit = 10000;
	extra->match_limit_recursion = 10000;
	extra->flags |= PCRE_EXTRA_MATCH_LIMIT|PCRE_EXTRA_MATCH_LIMIT_RECURSION;
#endif

	extra->callout_data = (void *)&callout_data;
	extra->flags |= PCRE_EXTRA_CALLOUT_DATA;

#if 0
	printf("CGNUM %d\n", icgnum);
	if (NULL != callout_data.cgnum)
	{
		int i;

		for (i = 0; i <= icgnum; i++)
		{
			printf("callout_data.cgnum[%d] %p\n", i, callout_data.cgnum[i]);
		}
	}
	else printf("CGNUM %p\n", callout_data.cgnum);
#endif

	options = PCRE_ANCHORED; /* XXX Maybe PCRE_NO_START_OPTIMIZE is needed too */
	rc = pcre_exec(pcre, extra, str, strlen(str), 0, options, ovector, OVCNT);
	/* The callouts force backtracking through ALL tokenizations, so the
	 * final result is expected to be "no match". */
	if (rc < 0)
	{
		if (PCRE_ERROR_NOMATCH == rc)
		{
			lgdebug(2, "No match (must always happen)\n");
		}
		else
		{
			printf("%s: pcre_exec: Error %d\n", prog, rc);
		}
	}
	else
	{
		printf("Internal error: Unexpected match, rc=%d\n", rc);
	}

	if (0 == rc)
	{
		rc = OVCNT/3;
		printf("ovector only has room for %d captured substrings\n", rc - 1);
	}

	printov(str, (ov_t *)ovector, rc, NULL, /*is_pcreov*/true);

	if (verbosity > 6)
	{
		if (0 != callout_data.subp_i)
		{
			printf("Callout stack:\n");
			printov(str, callout_data.subp, callout_data.subp_i, &callout_data,
			        /*is_pcreov*/false);
		}
	}

	/* Free everything. */
	dyn_str_delete(pat); /* note - callback_data uses parts of pat */
	pcre_free_study(extra); /* safe even if malloc'ed */
	/* NOTE(review): presumably pcre_free(pcre) is the canonical call here;
	 * pcre_free defaults to free, so this is equivalent in practice. */
	free(pcre);

	if (NULL != callout_data.cgnum)
	{
		int i;

		for (i = 0; i <= icgnum; i++)
		{
			if (callout_data.cgnum[i])
			{
				/* FIXME: Free also word_classname. */
				free(callout_data.cgnum[i]);
			}
		}
		free(callout_data.cgnum);
	}

#if 0
	if (NULL != callout_data.wordlist)
	{
		int i;

		for (i = 0; i < icgnum; i++)
		{
			free(callout_data.wordlist[i]);
		}
		free(callout_data.wordlist);
	}
#endif

	return 0;
}
/**
 * Prune disjuncts that cannot survive post-processing ("contains one" rules).
 *
 * A cms table (multiset of all connector names in the sentence) is built
 * first.  Then, for each disjunct, each non-wildcard rule is checked: if one
 * of the disjunct's connectors matches the rule's trigger (selector) but the
 * rule's criterion links cannot be satisfied per the cms table, the disjunct
 * can never appear in a valid linkage and is marked for deletion.
 *
 * Deleting a disjunct removes its connector names from the cms table
 * (delete_from_cms_table() reports when a name disappears entirely), which
 * may make further rules unsatisfiable — so the scan repeats until a pass
 * changes nothing (fixed point, tracked by the "change" flag).
 *
 * @param sent  Sentence whose per-word disjunct lists are pruned in place.
 * @param opts  Checked for perform_pp_prune; also used by print_time().
 * @return Total number of disjuncts deleted (0 if pruning is disabled or
 *         there is no postprocessor).
 */
static int pp_prune(Sentence sent, Parse_Options opts)
{
	pp_knowledge * knowledge;
	size_t i, w;
	int total_deleted, N_deleted;
	bool change, deleteme;
	multiset_table *cmt;

	if (sent->postprocessor == NULL) return 0;
	if (!opts->perform_pp_prune) return 0;

	knowledge = sent->postprocessor->knowledge;
	cmt = cms_table_new();

	/* Initial pass: mark every disjunct as kept and record every connector
	 * name (both directions) in the cms table. */
	for (w = 0; w < sent->length; w++)
	{
		Disjunct *d;
		for (d = sent->word[w].d; d != NULL; d = d->next)
		{
			char dir;
			d->marked = true;
			for (dir=0; dir < 2; dir++)
			{
				Connector *c;
				/* dir==0 scans d->right, dir==1 scans d->left. */
				for (c = ((dir) ? (d->left) : (d->right)); c != NULL; c = c->next)
				{
					insert_in_cms_table(cmt, connector_string(c));
				}
			}
		}
	}

	total_deleted = 0;
	change = true;
	while (change)
	{
		char dir;
		change = false;
		N_deleted = 0;
		for (w = 0; w < sent->length; w++)
		{
			Disjunct *d;
			for (d = sent->word[w].d; d != NULL; d = d->next)
			{
				/* Already deleted in a previous pass. */
				if (!d->marked) continue;
				deleteme = false;
				for (i = 0; i < knowledge->n_contains_one_rules; i++)
				{
					pp_rule* rule = &knowledge->contains_one_rules[i]; /* the ith rule */
					const char * selector = rule->selector;  /* selector string for this rule */
					pp_linkset * link_set = rule->link_set;  /* the set of criterion links */

					if (rule->selector_has_wildcard) continue;  /* If it has a * forget it */

					for (dir = 0; dir < 2; dir++)
					{
						Connector *c;
						for (c = ((dir) ? (d->left) : (d->right)); c != NULL; c = c->next)
						{
							if (!post_process_match(selector, connector_string(c))) continue;

							/* printf("pp_prune: trigger ok.  selector = %s  c->string = %s\n", selector, c->string); */

							/* We know c matches the trigger link of the rule. */
							/* Now check the criterion links */
							if (!rule_satisfiable(cmt, link_set))
							{
								deleteme = true;
								rule->use_count++;
							}
							if (deleteme) break;
						}
						if (deleteme) break;
					}
					if (deleteme) break;
				}

				if (deleteme)         /* now we delete this disjunct */
				{
					N_deleted++;
					total_deleted++;
					d->marked = false; /* mark for deletion later */
					/* Remove this disjunct's connector names from the cms
					 * table; "change" becomes true if any name count changed
					 * in a way that requires another pruning pass. */
					for (dir=0; dir < 2; dir++)
					{
						Connector *c;
						for (c = ((dir) ? (d->left) : (d->right)); c != NULL; c = c->next)
						{
							change |= delete_from_cms_table(cmt, connector_string(c));
						}
					}
				}
			}
		}

		lgdebug(D_PRUNE, "Debug: pp_prune pass deleted %d\n", N_deleted);
	}
	cms_table_delete(cmt);

	if (total_deleted > 0)
	{
		/* Physically remove the disjuncts that were unmarked above. */
		delete_unmarked_disjuncts(sent);
		if (verbosity_level(D_PRUNE))
		{
			prt_error("\n\\");
			prt_error("Debug: After pp_prune:\n\\");
			print_disjunct_counts(sent);
		}
	}

	print_time(opts, "pp pruning");

	return total_deleted;
}
/**
 * classic_parse() -- parse the given sentence.
 * Perform parsing, using the original link-grammar parsing algorithm
 * given in the original link-grammar papers.
 *
 * Do the parse with the minimum number of null-links within the range
 * specified by opts->min_null_count and opts->max_null_count.
 *
 * To that end, call do_parse() with an increasing null_count, from
 * opts->min_null_count up to (including) opts->max_null_count, until a
 * parse is found.
 *
 * A note about the disjuncts save/restore that is done here:
 * To increase the parsing speed, before invoking do_parse(),
 * pp_and_power_prune() is invoked to remove connectors which have no
 * possibility to connect. It includes a significant optimization when
 * null_count==0 that makes a more aggressive removal, but this
 * optimization is not appropriate when null_count>0.
 *
 * So in case this optimization has been done and a complete parse (i.e.
 * a parse when null_count==0) is not found, we are left with sentence
 * disjuncts which are not appropriate to continue do_parse() tries with
 * null_count>0. To solve that, we need to restore the original
 * disjuncts of the sentence and call pp_and_power_prune() once again.
 */
void classic_parse(Sentence sent, Parse_Options opts)
{
	fast_matcher_t * mchxt = NULL;
	count_context_t * ctxt = NULL;
	/* Pruning is redone (at most once) when moving from null_count==0
	 * to null_count>0; this flag ensures it happens only once. */
	bool pp_and_power_prune_done = false;
	/* Saved per-word disjunct lists; NULL once consumed (or never made). */
	Disjunct **disjuncts_copy = NULL;
	bool is_null_count_0 = (0 == opts->min_null_count);
	int max_null_count = MIN((int)sent->length, opts->max_null_count);

	/* Build lists of disjuncts */
	prepare_to_parse(sent, opts);
	if (resources_exhausted(opts->resources)) return;

	if (is_null_count_0 && (0 < max_null_count))
	{
		/* Save the disjuncts in case we need to parse with null_count>0. */
		disjuncts_copy = alloca(sent->length * sizeof(Disjunct *));
		for (size_t i = 0; i < sent->length; i++)
			disjuncts_copy[i] = disjuncts_dup(sent->word[i].d);
	}

	/* Try each null count in turn until valid linkages are found. */
	for (int nl = opts->min_null_count; nl <= max_null_count; nl++)
	{
		Count_bin hist;
		s64 total;

		if (!pp_and_power_prune_done)
		{
			if (0 != nl)
			{
				pp_and_power_prune_done = true;
				if (is_null_count_0)
					opts->min_null_count = 1; /* Don't optimize for null_count==0. */

				/* We are parsing now with null_count>0, when previously we
				 * parsed with null_count==0. Restore the save disjuncts. */
				if (NULL != disjuncts_copy)
				{
					free_sentence_disjuncts(sent);
					for (size_t i = 0; i < sent->length; i++)
						sent->word[i].d = disjuncts_copy[i];
					disjuncts_copy = NULL;
				}
			}
			pp_and_power_prune(sent, opts);
			/* Restore the caller's option value modified above. */
			if (is_null_count_0) opts->min_null_count = 0;
			if (resources_exhausted(opts->resources)) break;

			/* Rebuild the matcher/counter state for the (re)pruned sentence.
			 * Freeing NULL-initialized ctxt/mchxt on the first pass is
			 * presumably safe — the frees are unconditional here. */
			free_count_context(ctxt, sent);
			free_fast_matcher(sent, mchxt);

			pack_sentence(sent);
			ctxt = alloc_count_context(sent);
			mchxt = alloc_fast_matcher(sent);
			print_time(opts, "Initialized fast matcher");
		}

		if (resources_exhausted(opts->resources)) break;
		free_linkages(sent);

		sent->null_count = nl;
		hist = do_parse(sent, mchxt, ctxt, sent->null_count, opts);
		total = hist_total(&hist);

		lgdebug(D_PARSE, "Info: Total count with %zu null links:   %lld\n",
		        sent->null_count, total);

		/* total is 64-bit, num_linkages_found is 32-bit. Clamp */
		total = (total > INT_MAX) ? INT_MAX : total;
		total = (total < 0) ? INT_MAX : total;

		sent->num_linkages_found = (int) total;
		print_time(opts, "Counted parses");

		/* Extract, then post-process, the linkages for this null count. */
		extractor_t * pex = extractor_new(sent->length, sent->rand_state);
		bool ovfl = setup_linkages(sent, pex, mchxt, ctxt, opts);
		process_linkages(sent, pex, ovfl, opts);
		free_extractor(pex);

		post_process_lkgs(sent, opts);

		if (sent->num_valid_linkages > 0) break;
		if ((0 == nl) && (0 < max_null_count) && verbosity > 0)
			prt_error("No complete linkages found.\n");

		/* If we are here, then no valid linkages were found.
		 * If there was a parse overflow, give up now. */
		if (PARSE_NUM_OVERFLOW < total) break;
		//if (sent->num_linkages_found > 0 && nl>0) printf("NUM_LINKAGES_FOUND %d\n", sent->num_linkages_found);
	}
	sort_linkages(sent, opts);

	/* If the saved disjuncts were never spliced back into the sentence,
	 * they are still owned here and must be freed. */
	if (NULL != disjuncts_copy)
	{
		for (size_t i = 0; i < sent->length; i++)
			free_disjuncts(disjuncts_copy[i]);
	}
	free_count_context(ctxt, sent);
	free_fast_matcher(sent, mchxt);
}
void * object_open(const char *filename, void * (*opencb)(const char *, const void *), const void * user_data) { /* Dictionary data directory path cache -- per-thread storage. */ static TLS char *path_found; char *completename = NULL; void *fp = NULL; char *data_dir = NULL; const char **path = NULL; if (NULL == filename) { /* Invalidate the dictionary data directory path cache. */ char *pf = path_found; path_found = NULL; free(pf); return NULL; } if (NULL == path_found) { data_dir = dictionary_get_data_dir(); if (verbosity_level(D_USER_FILES)) { char cwd[MAX_PATH_NAME]; char *cwdp = getcwd(cwd, sizeof(cwd)); prt_error("Debug: Current directory: %s\n", NULL == cwdp ? "NULL": cwdp); prt_error("Debug: Last-resort data directory: %s\n", data_dir ? data_dir : "NULL"); } } /* Look for absolute filename. * Unix: starts with leading slash. * Windows: starts with C:\ except that the drive letter may differ. */ if ((filename[0] == '/') #ifdef _WIN32 || ((filename[1] == ':') && ((filename[2] == '\\') || (filename[2] == '/'))) || (filename[0] == '\\') /* UNC path */ #endif /* _WIN32 */ ) { /* opencb() returns NULL if the file does not exist. */ fp = opencb(filename, user_data); lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", filename, NOTFOUND(fp)); } else { /* A path list in which to search for dictionaries. * path_found, data_dir or DEFAULTPATH may be NULL. */ const char *dictpath[] = { path_found, ".", "." DIR_SEPARATOR "data", "..", ".." 
DIR_SEPARATOR "data", data_dir, DEFAULTPATH, }; size_t i = sizeof(dictpath)/sizeof(dictpath[0]); for (path = dictpath; i-- > 0; path++) { if (NULL == *path) continue; free(completename); completename = join_path(*path, filename); fp = opencb(completename, user_data); lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", completename, NOTFOUND(fp)); if ((NULL != fp) || (NULL != path_found)) break; } } if (NULL == fp) { fp = opencb(filename, user_data); lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", filename, NOTFOUND(fp)); } else if (NULL == path_found) { char *pfnd = strdup((NULL != completename) ? completename : filename); if ((0 < verbosity) && (dict_file_open == opencb)) prt_error("Info: Dictionary found at %s\n", pfnd); for (size_t i = 0; i < 2; i++) { char *root = strrchr(pfnd, DIR_SEPARATOR[0]); if (NULL != root) *root = '\0'; } path_found = pfnd; } free(data_dir); free(completename); return fp; }
/**
 * PCRE callout handler (installed via pcre_callout / extra->callout_data).
 *
 * Dispatches on cb->callout_number:
 * - CALLBACK_REP: records the current capture into cd->subp, the
 *   sub-pattern stack (growing it on a contiguous new capture, rewinding
 *   it on backtrack), then optionally validates the captured substring
 *   against a dictionary/affix class via is_word(). Returns 0 to let the
 *   match continue, 1 to reject this capture, or PCRE_ERROR_CALLOUT on
 *   stack overflow or a non-contiguous ("hole") capture.
 * - CALLBACK_END: prints the completed alternative and returns 1 to force
 *   a backtrack so the next alternative is found.
 *
 * @param cb PCRE callout block; cb->callout_data is our callout_data_t.
 */
static int callout(pcre_callout_block *cb)
{
	callout_data_t *cd = cb->callout_data;
	/* Offset-vector slot of the most recent capture group.
	 * NOTE(review): computed even when capture_last is -1; it is only
	 * dereferenced under the capture_last > 0 checks below. */
	ov_t *cb_ov = (ov_t *)&cb->offset_vector[2*cb->capture_last];

#if 0
	const char **wordlist = NULL;
#endif
	cgnum_t *pcgnum = NULL;
	const char *openp;
	const char *endname;
	bool subp_updated = false;

	/* cgnum[] is indexed by capture-group number - 1. */
	if ((NULL != cd->cgnum) && (-1 != cb->capture_last))
	{
		pcgnum = cd->cgnum[cb->capture_last-1];
	}
	lgdebug(6, "Callout %d: capture_last %d cgnum %p\n",
	        cb->callout_number, cb->capture_last, pcgnum);

	if (verbosity >= 6)
		printov(cb->subject, (ov_t *)cb->offset_vector, cb->capture_top, cd, /*is_pcreov*/true);

	switch(cb->callout_number)
	{
	case CALLBACK_REP:
		if (cb->capture_last > 0)
		{
			int subp_i = cd->subp_i;
			ov_t *subp = &cd->subp[subp_i];

			lgdebug(2, "Current capture %d: s=%d, e=%d\n",
			        cb->capture_last, cb_ov->s, cb_ov->e);

			assert(cb_ov->s>=0 && cb_ov->e>=0, "Bad start/end in capture group %d: s=%d e=%d",
			       cb->capture_last, cb_ov->s, cb_ov->e);

			if (verbosity >= 6)
			{
				printf("INITIAL subp:\n");
				if (cd->subp_ovfl) printf("OVERFLOW\n"); /* shouldn't happen */
				printov(cb->subject, cd->subp, cd->subp_i+1, cd, /*is_pcreov*/false);
			}

			/* Record all the captures into the subp (sub-pattern) vector.
			 * If we capture a continuation to another capture then it is a new
			 * capture. Else we update a previous position in subp. There should be
			 * no gaps between the capture strings.
			 * FIXME: Handled null matches properly. Need to use cd->capture_level
			 * to remember at which level a null match has been captured.
			 * FIXME: Move after the word lookup (efficiency).
			 * FIXME: Increment subp instead of cd->subp_i (cosmetic fix). */
			if (cb_ov->s > subp->s)
			{
				/* New capture starts after the current top of the stack. */
				if (cb_ov->s == subp->e)
				{
					/* Contiguous continuation: push a new sub-pattern. */
					cd->subp_i++;
					if (cd->subp_i == MAX_SUBP)
					{
						cd->subp_ovfl = true;
						return PCRE_ERROR_CALLOUT;
					}
					lgdebug(2, "OV start gt, update next sub-pattern %d\n", cd->subp_i);
					cd->subp[cd->subp_i] = *cb_ov;
					subp_updated = true;
				}
				else
				{
					/* A gap between captures is a protocol violation. */
					printf("Capture group %d (s=%d e=%d) makes a hole (subp_i %d: s=%d e=%d)\n",
					       cb->capture_last, subp->s, subp->e, subp_i, cb_ov->s, cb_ov->e);
					return PCRE_ERROR_CALLOUT;
				}
			}
			else
			{
				/* A backtrack occurred. */
				/* Pop stack entries until one starting at the same offset is
				 * found, and overwrite it with the new capture. */
				for (subp_i = cd->subp_i; subp_i >= 0; subp_i--)
				{
					subp = &cd->subp[subp_i];
					lgdebug(2, "Checking recorded sub-pattern %d: s=%d e=%d: ",
					        subp_i, subp->s, subp->e);
					if (cb_ov->s == subp->s)
					{
						lgdebug(2, "OV start eq, update sub-pattern %d\n", subp_i);
						*subp = *cb_ov;
						cd->subp_i = subp_i;
						subp_updated = true;
						break;
					}
					lgdebug(2, "Backtrack handling\n");
				}
			}
			assert(subp_i >= 0, "Recorded sub-pattern index");
			assert(subp_updated);
			/* Remember which capture group produced this stack slot. */
			cd->capture_level[cd->subp_i] = cb->capture_last;

			if (verbosity >= 6)
			{
				printf("AFTER: subp:\n");
				if (cd->subp_ovfl) printf("OVERFLOW\n"); /* shouldn't happen */
				printov(cb->subject, cd->subp, cd->subp_i+1, cd, /*is_pcreov*/false);
			}

			/* Make a dictionary lookup for NAME in capture groups (?<NAME>x)
			 * (x is a constraint for the initial pattern-match comparison done by
			 * PCRE). */

			// if (pcgnum && * cd->is_constant) printf("is_constant\n");

			/* If we have a cgnum structure with a dict, check if the string to be
			 * matched is in the dict or belongs to the given affix class.
			 * A NULL cgnum->dict means this is a regex from the regex file. */
			if (pcgnum && pcgnum->dict) /* && !cd->is_constant */
			{
				int numchar = cb_ov->e - cb_ov->s;

				/* Debug: Sanity check. */
				/* Scan the pattern text backward from the callout position to
				 * verify we are inside a named group "(?<NAME>...". */
				assert(numchar>=0, "numchar=%d", numchar);
				endname = NULL;
				for (openp = &cd->pattern[cb->pattern_position-5]; *openp; openp--)
				{
					if (*openp == '>') endname = openp;
					if (*openp == '(' && openp[1] == '?' && openp[2] == '<' && openp[3] != '=') break;
				}
				if (NULL != openp && *openp == '(' && NULL != endname && strncmp(openp, "(?<", 3) == 0 && endname > openp)
					; /* Everything is OK. */
				else
				{
					assert(0, "Error: Not in a named group!");
				}
				lgdebug(6, "GROUP NAME %.*s, cgnum %d, ptr %p, numchar %d\n",
				        (int)(endname - openp - 3), openp+3, cb->capture_last-1, pcgnum, numchar);
				/* End of debug sanity check. */

				lgdebug(2, "Try match '%.*s': ", numchar, cb->subject+cb_ov->s);
#if 0
				if (0 == numchar)
				{
					lgdebug(2, "Zero match denied\n");
					return 1;
				}
#endif
				/* Reject this capture if the substring is not a word. */
				if (!is_word(cb->subject+cb_ov->s, numchar, pcgnum))
				{
					lgdebug(2, "NO MATCH\n");
					return 1;
				}
				lgdebug(6, "MATCH\n");
			}
		}

#if 0
		if (verbosity >= 6)
		{
			printf("DEBUG subp:\n");
			if (cd->subp_ovfl) printf("OVERFLOW\n"); /* shouldn't happen */
			printov(cb->subject, cd->subp, cd->subp_i+1, cd);
		}
#endif
		// cd->is_constant = false;
		return 0; /* continue to match the rest of the regex */
		break;

#if 0
	case CALLBACK_CONSTANT_START:
		// cd->is_constant = true;
		return 0;
		break;

	case CALLBACK_CONSTANT_END:
		// cd->is_constant = false;
		return 0;
		break;
#endif

	case CALLBACK_END:
		cd->alt_counter++;
		printf("Alternative %d:\n", cd->alt_counter);
		/* See the comment for SUBP0END_DEBUG_SIGNATURE. */
		assert(cd->subp[0].e>=0, "subp[0].e is %d!", cd->subp[0].e);
		printov(cb->subject, cd->subp, cd->subp_i+1, cd, /*is_pcreov*/false);

		/* Remove the last sub-pattern, in case it is a null string (no need to
		 * check, it can be removed anyway since if it is not a null string it is
		 * going to be replaced on the next match). Else the next match, which
		 * will be without this null string, we emit it again as the last
		 * sub-pattern component. FIXME: It doesn't always help. */
		if (cd->subp_i > 0)
		{
			cd->capture_level[cd->subp_i] = -3; /* mark as invalid, for debug */
			cd->subp_i--;
		}

		// cd->is_constant = false;
		return 1; /* signify a backtrack in order to find the next alternative */
		break;

	default:
		assert("Callout: Unreached" && 0);
	}

	return 0; /* Really unreached. */
	/*
	printf("Callout %d, data test %d\n"
	       "version %d\n"
	       "subject '%s\n"
	       "subject_length %d\n"
	       "start_match %d\n"
	       "current_position %d\n"
	       "capture_top %d\n"
	       "capture_last %d\n"
	       "pattern_position %d\n"
	       "next_item_length %d\n",
	       cb->callout_number, ((callout_data *)cb->callout_data)->test,
	       cb->version, cb->subject, cb->subject_length, cb->start_match,
	       cb->current_position, cb->capture_top, cb->capture_last,
	       cb->pattern_position, cb->next_item_length);

	return 0;
	*/
}
/**
 * Prune connectors (and hence disjuncts) that cannot survive
 * post-processing ("contains one" rules), using the cms table.
 *
 * First every live connector is inserted into a cms table — either via
 * the jet-sharing table when it exists (skipping entries whose refcount
 * dropped to 0) or by walking the per-word disjunct lists.  Then each
 * non-wildcard rule is checked once against the cms buckets hashing to
 * its selector: a connector matching the rule's trigger whose criterion
 * links cannot be satisfied gets its nearest_word set to BAD_WORD.
 * Finally mark_bad_connectors() marks the disjuncts carrying such
 * connectors for deletion.
 *
 * @param sent  Sentence whose connectors/disjuncts are pruned in place.
 * @param opts  Checked for perform_pp_prune; also used by print_time().
 * @return Number of disjuncts marked as deleted (0 if pruning disabled
 *         or no postprocessor).
 */
static int pp_prune(Sentence sent, Parse_Options opts)
{
	pp_knowledge *knowledge;
	multiset_table *cmt;

	if (sent->postprocessor == NULL) return 0;
	if (!opts->perform_pp_prune) return 0;

	knowledge = sent->postprocessor->knowledge;
	cmt = cms_table_new();

	/* Populate the cms table with every live connector. */
	jet_sharing_t *js = &sent->jet_sharing;
	if (js->table[0] != NULL)
	{
		for (int dir = 0; dir < 2; dir++)
		{
			for (unsigned int id = 1; id < js->entries[dir] + 1; id++)
			{
				for (Connector *c = js->table[dir][id].c; NULL != c; c = c->next)
				{
					if (0 == c->refcount) continue; /* no longer referenced */
					insert_in_cms_table(cmt, c);
				}
			}
		}
	}
	else
	{
		for (WordIdx w = 0; w < sent->length; w++)
		{
			for (Disjunct *d = sent->word[w].d; d != NULL; d = d->next)
			{
				for (int dir = 0; dir < 2; dir++)
				{
					Connector *first_c = (dir) ? (d->left) : (d->right);
					for (Connector *c = first_c; c != NULL; c = c->next)
					{
						insert_in_cms_table(cmt, c);
					}
				}
			}
		}
	}

	int D_deleted = 0;       /* Number of deleted disjuncts */
	int Cname_deleted = 0;   /* Number of deleted connector names */

	/* Since the cms table is unchanged, after applying a rule once we
	 * know if it will be TRUE or FALSE if we need to apply it again.
	 * Values (stored in an unsigned byte): 0xFF (from memset -1):
	 * Undecided yet; 0: Rule unsatisfiable; 1: Rule satisfiable. */
	uint8_t *rule_ok = alloca(knowledge->n_contains_one_rules * sizeof(*rule_ok));
	memset(rule_ok, -1, knowledge->n_contains_one_rules * sizeof(*rule_ok));

	for (size_t i = 0; i < knowledge->n_contains_one_rules; i++)
	{
		/* Defensive only: each index i is visited exactly once, and
		 * rule_ok[i] is written only within its own iteration, so this
		 * guard never triggers. */
		if (rule_ok[i] == 1) continue;

		pp_rule* rule = &knowledge->contains_one_rules[i]; /* The ith rule */
		const char *selector = rule->selector;  /* Selector string for this rule */
		pp_linkset *link_set = rule->link_set;  /* The set of criterion links */
		unsigned int hash = cms_hash(selector);

		if (rule->selector_has_wildcard)
		{
			rule_ok[i] = 1;
			continue;  /* If it has a * forget it */
		}

		/* Only connectors hashing like the selector can match it. */
		for (Cms *cms = cmt->cms_table[hash]; cms != NULL; cms = cms->next)
		{
			Connector *c = cms->c;

			if (!post_process_match(selector, connector_string(c))) continue;

			ppdebug("Rule %zu: Selector %s, Connector %s\n", i, selector, connector_string(c));
			/* We know c matches the trigger link of the rule. */
			/* Now check the criterion links. Once the rule is known
			 * unsatisfiable (rule_ok[i] == 0), skip re-evaluating it. */
			if ((rule_ok[i] == 0) || !rule_satisfiable(cmt, link_set))
			{
				rule_ok[i] = 0;
				ppdebug("DELETE %s refcount %d\n", connector_string(c), c->refcount);
				c->nearest_word = BAD_WORD; /* poison this trigger connector */
				Cname_deleted++;
				rule->use_count++;
			}
			else
			{
				rule_ok[i] = 1;
				break;
			}
		}
	}

	/* Iterate over all connectors and mark the bad trigger connectors.
	 * If the marked connector is not the shallow one, note that the
	 * shallow one on the same disjunct cannot be marked too (this could
	 * facilitate faster detection by power_prune()) because this would be
	 * wrongly reflected through the cms table. */
	if (js->table[0] != NULL)
	{
		for (int dir = 0; dir < 2; dir++)
		{
			for (unsigned int id = 1; id < js->entries[dir] + 1; id++)
			{
				for (Connector *c = js->table[dir][id].c; NULL != c; c = c->next)
				{
					if (0 == c->refcount) continue;
					if (mark_bad_connectors(cmt, c))
					{
						D_deleted++;
						break;
					}
				}
			}
		}
	}
	else
	{
		for (WordIdx w = 0; w < sent->length; w++)
		{
			for (Disjunct *d = sent->word[w].d; d != NULL; d = d->next)
			{
				for (int dir = 0; dir < 2; dir++)
				{
					Connector *first_c = (dir) ? (d->left) : (d->right);
					for (Connector *c = first_c; c != NULL; c = c->next)
					{
						if (mark_bad_connectors(cmt, c))
						{
							D_deleted++;
							break;
						}
					}
				}
			}
		}
	}

	lgdebug(+D_PRUNE, "Deleted %d (%d connector names)\n", D_deleted, Cname_deleted);

	cms_table_delete(cmt);
	print_time(opts, "pp pruning");

	return D_deleted;
}