Dictionary dictionary_create_from_db(const char *lang) { char *dbname; const char * t; Dictionary dict; Dict_node *dict_node; dict = (Dictionary) xalloc(sizeof(struct Dictionary_s)); memset(dict, 0, sizeof(struct Dictionary_s)); dict->version = NULL; dict->num_entries = 0; dict->affix_table = NULL; dict->regex_root = NULL; /* Language and file-name stuff */ dict->string_set = string_set_create(); dict->lang = lang; t = strrchr (lang, '/'); if (t) dict->lang = string_set_add(t+1, dict->string_set); /* To disable spell-checking, just set the checker to NULL */ dict->spell_checker = spellcheck_create(dict->lang); dict->base_knowledge = NULL; dict->hpsg_knowledge = NULL; dbname = join_path (lang, "dict.db"); dict->name = string_set_add(dbname, dict->string_set); free(dbname); /* Set up the database */ dict->db_handle = object_open(dict->name, db_open, NULL); dict->lookup_list = db_lookup_list; dict->free_lookup = db_free_llist; dict->lookup = db_lookup; dict->close = db_close; /* Misc remaining common (generic) dict setup work */ dict->left_wall_defined = boolean_dictionary_lookup(dict, LEFT_WALL_WORD); dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD); dict->empty_word_defined = boolean_dictionary_lookup(dict, EMPTY_WORD_MARK); dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD); dict->use_unknown_word = true; dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD); if (dict_node != NULL) { dict->unlimited_connector_set = connector_set_create(dict_node->exp); } else { dict->unlimited_connector_set = NULL; } free_lookup_list(dict, dict_node); return dict; }
/**
 * Tell whether `word` is known to the dictionary.
 *
 * The word counts as known when it is found directly, or when a regex
 * from dict->regex_root matches it and the name of that regex is itself
 * found in the dictionary.
 */
bool find_word_in_dict(const Dictionary dict, const char * word)
{
	const char *matched_name;

	/* Direct hit: no need to consult the regexes. */
	if (boolean_dictionary_lookup (dict, word))
		return true;

	matched_name = match_regex(dict->regex_root, word);
	return (NULL != matched_name) &&
	       boolean_dictionary_lookup(dict, matched_name);
}
Dictionary dictionary_create_from_db(const char *lang) { char *dbname; const char * t; Dictionary dict; Dict_node *dict_node; dict = (Dictionary) xalloc(sizeof(struct Dictionary_s)); memset(dict, 0, sizeof(struct Dictionary_s)); /* Language and file-name stuff */ dict->string_set = string_set_create(); t = strrchr (lang, '/'); t = (NULL == t) ? lang : t+1; dict->lang = string_set_add(t, dict->string_set); lgdebug(D_USER_FILES, "Debug: Language: %s\n", dict->lang); /* To disable spell-checking, just set the checker to NULL */ dict->spell_checker = spellcheck_create(dict->lang); #if defined HAVE_HUNSPELL || defined HAVE_ASPELL if (NULL == dict->spell_checker) prt_error("Info: Spell checker disabled."); #endif dict->base_knowledge = NULL; dict->hpsg_knowledge = NULL; dbname = join_path (lang, "dict.db"); dict->name = string_set_add(dbname, dict->string_set); free(dbname); /* Set up the database */ dict->db_handle = object_open(dict->name, db_open, NULL); dict->lookup_list = db_lookup_list; dict->free_lookup = db_free_llist; dict->lookup = db_lookup; dict->close = db_close; /* Misc remaining common (generic) dict setup work */ dict->left_wall_defined = boolean_dictionary_lookup(dict, LEFT_WALL_WORD); dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD); dict->empty_word_defined = boolean_dictionary_lookup(dict, EMPTY_WORD_MARK); dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD); dict->use_unknown_word = true; dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD); if (dict_node != NULL) dict->unlimited_connector_set = connector_set_create(dict_node->exp); free_lookup_list(dict, dict_node); return dict; }
/**
 * Compile every not-yet-compiled regex in the list starting at `re`.
 *
 * @param re    Head of the regex list; nodes whose `re` field is already
 *              non-NULL are assumed to be compiled and are skipped.
 * @param dict  If non-NULL, each newly compiled regex name is looked up
 *              in this dictionary and an error is printed when missing
 *              (this does not make the function fail).
 * @return      0 on success, else the regcomp() error code of the first
 *              failure (REG_ESPACE if the regex_t could not be allocated).
 */
int compile_regexs(Regex_node *re, Dictionary dict)
{
	regex_t *preg;
	int rc;

	while (re != NULL)
	{
		/* If re->re non-null, assume compiled already. */
		if (re->re == NULL)
		{
			preg = malloc(sizeof(regex_t));
			if (NULL == preg)
			{
				/* Leave re->re NULL so the node still reads as "not compiled". */
				prt_error("Error: Out of memory while compiling regex %s\n",
				          re->name);
				return REG_ESPACE;
			}
			re->re = preg;

			rc = regcomp(preg, re->pattern, REG_EXTENDED);
			if (rc)
			{
				/* re->re is deliberately left attached, as before; the
				 * code that releases the list is not visible here. */
				prt_regerror("Failed to compile regex", re, rc);
				return rc;
			}

			/* Check that the regex name is defined in the dictionary. */
			if ((NULL != dict) && !boolean_dictionary_lookup(dict, re->name))
			{
				/* TODO: better error handling. Maybe remove the regex? */
				prt_error("Error: Regex name %s not found in dictionary!\n",
				          re->name);
			}
		}
		re = re->next;
	}
	return 0;
}
/**
 * Check that every word of the sentence is either in the dictionary or
 * covered by one of the word classes the dictionary defines (capitalized,
 * plural capitalized, hyphenated, number, -ing, -s, -ed, -ly).
 *
 * The sentence is not modified.  Words that fail the check are collected
 * into one message that is reported through lperror(NOTINDICT, ...).
 * Returns TRUE if all words passed, FALSE otherwise.
 */
int sentence_in_dictionary(Sentence sent)
{
	Dictionary dict = sent->dict;
	char msg[1024];
	int all_found = TRUE;
	int idx;

	for (idx = 0; idx < sent->length; idx++)
	{
		char *tok = sent->word[idx].string;

		/* Each guard is one way the word can be accepted. */
		if (boolean_dictionary_lookup(dict, tok)) continue;
		if (is_utf8_upper(tok) && dict->capitalized_word_defined) continue;
		if (is_utf8_upper(tok) && is_s_word(tok) &&
		    dict->pl_capitalized_word_defined) continue;
		if (ishyphenated(tok) && dict->hyphenated_word_defined) continue;
		if (is_number(tok) && dict->number_word_defined) continue;
		if (is_ing_word(tok) && dict->ing_word_defined) continue;
		if (is_s_word(tok) && dict->s_word_defined) continue;
		if (is_ed_word(tok) && dict->ed_word_defined) continue;
		if (is_ly_word(tok) && dict->ly_word_defined) continue;

		/* First unknown word: start the message. */
		if (all_found)
		{
			safe_strcpy(msg, "The following words are not in the dictionary:",
			            sizeof(msg));
			all_found = FALSE;
		}
		safe_strcat(msg, " \"", sizeof(msg));
		safe_strcat(msg, sent->word[idx].string, sizeof(msg));
		safe_strcat(msg, "\"", sizeof(msg));
	}

	if (!all_found)
	{
		lperror(NOTINDICT, "\n%s\n", msg);
	}
	return all_found;
}
/**
 * Check whether `word`, with its first character converted to lower
 * case, is in the dictionary.
 *
 * @param dict  Dictionary to search.
 * @param word  Multi-byte string; it is modified in place for the
 *              duration of the lookup and restored before returning.
 * @return      The lookup result for the down-cased word, or FALSE if the
 *              word does not start with an upper-case character, its first
 *              character cannot be decoded, or the lower-case form has a
 *              different byte length (so it cannot be swapped in place).
 */
static int downcase_is_in_dict(Dictionary dict, char * word)
{
	int i, rc;
	char low[MB_LEN_MAX];
	char save[MB_LEN_MAX];
	wchar_t c;
	int nbl, nbh;

	if (!is_utf8_upper(word)) return FALSE;

	/* A result <= 0 means an invalid sequence or an empty string; `c`
	 * would then be unset and must not be passed to towlower(). */
	nbh = mbtowc (&c, word, MB_CUR_MAX);
	if (nbh <= 0)
	{
		fprintf(stderr, "Error: invalid multi-byte string!\n");
		return FALSE;
	}

	c = towlower(c);
	nbl = wctomb(low, c);

	/* Also catches wctomb() failure (-1), since nbh is positive here. */
	if (nbh != nbl)
	{
		fprintf(stderr, "Error: can't downcase multi-byte string!\n");
		return FALSE;
	}

	/* Downcase */
	for (i=0; i<nbl; i++) { save[i] = word[i]; word[i] = low[i]; }

	/* Look it up, then restore old value */
	rc = boolean_dictionary_lookup(dict, word);
	for (i=0; i<nbh; i++) { word[i] = save[i]; }

	return rc;
}
/**
 * Compile every not-yet-compiled regex in the list starting at `rn`.
 *
 * @param rn    Head of the regex list; nodes whose `re` field is already
 *              non-NULL are assumed to be compiled and are skipped.
 * @param dict  If non-NULL, each newly compiled regex name is looked up
 *              in this dictionary and an error is printed when missing
 *              (this does not make the function fail).
 * @return      0 on success, else a non-zero error code.  On failure the
 *              failing node's `re` field is left NULL and everything that
 *              was allocated for it has been released.
 */
int compile_regexs(Regex_node *rn, Dictionary dict)
{
	while (rn != NULL)
	{
		/* If rn->re non-null, assume compiled already. */
		if(rn->re == NULL)
		{
			int rc;
			regex_t *re = malloc(sizeof(regex_t));
#if HAVE_PCRE2_H
			PCRE2_SIZE erroffset;
#else
			const int erroffset = -1;
#endif

			if (NULL == re)
			{
				prt_error("Error: Out of memory while compiling regex %s\n",
				          rn->name);
				return -1;
			}
			rn->re = re;

#if HAVE_PCRE2_H
			re->re_code =
				pcre2_compile((PCRE2_SPTR)rn->pattern, PCRE2_ZERO_TERMINATED,
				              PCRE2_UTF|PCRE2_UCP, &rc, &erroffset, NULL);
			if (NULL != re->re_code)
			{
				rc = 0;
				re->re_md = pcre2_match_data_create(0, NULL);
				if (NULL == re->re_md)
				{
					/* Don't leave a half-initialized regex on the node. */
					pcre2_code_free(re->re_code);
					free(re);
					rn->re = NULL;
					return -1; /* Unhandled for now. */
				}
			}
#else
			/* REG_ENHANCED is needed for macOS to support \w etc. */
#ifndef REG_ENHANCED
#define REG_ENHANCED 0
#endif
			rc = regcomp(re, rn->pattern, REG_NOSUB|REG_EXTENDED|REG_ENHANCED);
#endif

			if (rc)
			{
				/* Report while rn->re is still attached, then release the
				 * storage, which would otherwise leak when rn->re is reset. */
				prt_regerror("Failed to compile regex", rn, rc ,erroffset);
				free(re);
				rn->re = NULL;
				return rc;
			}

			/* Check that the regex name is defined in the dictionary. */
			if ((NULL != dict) && !boolean_dictionary_lookup(dict, rn->name))
			{
				/* TODO: better error handling. Maybe remove the regex? */
				prt_error("Error: Regex name %s not found in dictionary!\n",
				          rn->name);
			}
		}
		rn = rn->next;
	}
	return 0;
}
/**
 * Give word `i` of the sentence the expressions of the dictionary entry
 * `type`, labelling them with the original string `s` followed by "[!]"
 * to mark the word as guessed.
 *
 * For words recognized by is_s_word(), every expression gets the label
 * "s[!]" plus the subscript (text after the first '.') of the
 * expression's own string, if it has one.  Otherwise only the first
 * expression is relabelled, with a subscript chosen from the suffix test
 * that matches (".v" for -ed, ".g" for -ing, ".e" for -ly, none
 * otherwise).
 *
 * @return TRUE on success; FALSE (after reporting through lperror) if
 *         `type` is not in the dictionary.
 */
static int guessed_string(Sentence sent, int i, const char * s, const char * type)
{
	X_node * e;
	const char *t;
	char str[MAX_WORD+1];

	if (!boolean_dictionary_lookup(sent->dict, type))
	{
		lperror(BUILDEXPR, ".\n To process this sentence your dictionary "
		        "needs the word \"%s\".\n", type);
		return FALSE;
	}

	sent->word[i].x = build_word_expressions(sent, type);
	e = sent->word[i].x;

	if(is_s_word(s))
	{
		for (; e != NULL; e = e->next)
		{
			t = strchr(e->string, '.');
			if (t != NULL)
			{
				snprintf(str, sizeof(str), "%.50s[!].%.5s", s, t+1);
			}
			else
			{
				snprintf(str, sizeof(str), "%.50s[!]", s);
			}
			/* string_set_add() returns its own copy (the old code freed
			 * its buffer right after the call), so the stack buffer can
			 * be handed over directly. */
			e->string = string_set_add(str, sent->string_set);
		}
	}
	else
	{
		if(is_ed_word(s))
		{
			snprintf(str, sizeof(str), "%.50s[!].v", s);
		}
		else if(is_ing_word(s))
		{
			snprintf(str, sizeof(str), "%.50s[!].g", s);
		}
		else if(is_ly_word(s))
		{
			snprintf(str, sizeof(str), "%.50s[!].e", s);
		}
		else
		{
			snprintf(str, sizeof(str), "%.50s[!]", s);
		}

		/* The expression list may be empty; don't dereference NULL. */
		if (e != NULL)
		{
			e->string = string_set_add(str, sent->string_set);
		}
	}
	return TRUE;
}
/**
 * Give word `i` of the sentence the expressions of the dictionary entry
 * `s`, while keeping the word's own string as the label of each of them.
 *
 * Returns TRUE on success.  If `s` is not in the dictionary, reports the
 * problem through lperror(BUILDEXPR, ...) and returns FALSE.
 */
static int special_string(Sentence sent, int i, const char * s)
{
	X_node *node;

	if (!boolean_dictionary_lookup(sent->dict, s))
	{
		lperror(BUILDEXPR, ".\n To process this sentence your dictionary "
		        "needs the word \"%s\".\n", s);
		return FALSE;
	}

	sent->word[i].x = build_word_expressions(sent, s);

	/* Relabel every expression with the word as it appears in the sentence. */
	node = sent->word[i].x;
	while (node != NULL)
	{
		node->string = sent->word[i].string;
		node = node->next;
	}
	return TRUE;
}
/**
 * Create a dictionary whose entries are read from the string `input`.
 *
 * The function is used for two kinds of dictionaries.  If `dict_name`
 * contains "affix", the affix-specific callbacks are installed and the
 * class table is allocated; otherwise the ordinary lookup callbacks and
 * the spell checker are set up.  When `affix_name` is NULL the function
 * returns right after the entries are read, skipping the locale, affix,
 * regex and post-processing set-up.
 *
 * Returns the new dictionary, or NULL if reading the entries, opening
 * the affix dictionary, afdict_init(), reading the regex file or
 * compiling the regexs fails.
 *
 * NOTE(review): the failure path releases only the string set, the
 * affix_table struct and the dict struct itself; anything else allocated
 * before the failure (e.g. afdict_class) appears not to be freed here.
 */
static Dictionary dictionary_six_str(const char * lang, const char * input, const char * dict_name, const char * pp_name, const char * cons_name, const char * affix_name, const char * regex_name)
{
	const char * t;
	Dictionary dict;
	Dict_node *dict_node;

	dict = (Dictionary) xalloc(sizeof(struct Dictionary_s));
	memset(dict, 0, sizeof(struct Dictionary_s));

	/* Language and file-name stuff */
	dict->string_set = string_set_create();
	/* The language name is what follows the last '/' of lang, if any. */
	t = strrchr (lang, '/');
	t = (NULL == t) ? lang : t+1;
	dict->lang = string_set_add(t, dict->string_set);
	lgdebug(D_USER_FILES, "Debug: Language: %s\n", dict->lang);
	dict->name = string_set_add(dict_name, dict->string_set);

	/*
	 * A special setup per dictionary type. The check here assumes the affix
	 * dictionary name contains "affix". FIXME: For not using this
	 * assumption, the dictionary creating stuff needs a rearrangement.
	 */
	if (0 == strstr(dict->name, "affix"))
	{
		/* To disable spell-checking, just set the checker to NULL */
		dict->spell_checker = spellcheck_create(dict->lang);
#if defined HAVE_HUNSPELL || defined HAVE_ASPELL
		/* TODO:
		 * 1. Set the spell option to 0, to signify no spell checking is done.
		 * 2. On verbosity >= 1, add a detailed message on the reason. */
		if (NULL == dict->spell_checker)
			prt_error("Info: Spell checker disabled.");
#endif
		dict->insert_entry = insert_list;
		dict->lookup_list = lookup_list;
		dict->free_lookup = free_llist;
		dict->lookup = boolean_lookup;
	}
	else
	{
		/*
		 * Affix dictionary.
		 */
		size_t i;
		dict->insert_entry = load_affix;
		dict->lookup = return_true;

		/* initialize the class table */
		/* NOTE(review): the malloc() result is used without a NULL check. */
		dict->afdict_class = malloc(sizeof(*dict->afdict_class) * ARRAY_SIZE(afdict_classname));
		for (i = 0; i < ARRAY_SIZE(afdict_classname); i++)
		{
			dict->afdict_class[i].mem_elems = 0;
			dict->afdict_class[i].length = 0;
			dict->afdict_class[i].string = NULL;
		}
	}
	dict->affix_table = NULL;

	/* Read dictionary from the input string.
	 * The input pointers are cleared again once reading is done. */
	dict->input = input;
	dict->pin = dict->input;
	if (!read_dictionary(dict))
	{
		dict->pin = NULL;
		dict->input = NULL;
		goto failure;
	}
	dict->pin = NULL;
	dict->input = NULL;

	if (NULL == affix_name)
	{
		/*
		 * The affix table is handled alone in this invocation.
		 * Skip the rest of processing!
		 * FIXME: The dictionary creating stuff needs a rearrangement.
		 */
		return dict;
	}

	/* If we don't have a locale per dictionary, the following
	 * will also set the program's locale. */
	dict->locale = linkgrammar_get_dict_locale(dict);
	set_utf8_program_locale();

#ifdef HAVE_LOCALE_T
	/* We have a locale per dictionary. */
	if (NULL != dict->locale)
		dict->locale_t = newlocale_LC_CTYPE(dict->locale);

	/* If we didn't succeed to set the dictionary locale, the program will
	 * SEGFAULT when it tries to use it with the isw*() functions.
	 * So set it to the current program's locale as a last resort. */
	if (NULL == dict->locale)
	{
		dict->locale = setlocale(LC_CTYPE, NULL);
		dict->locale_t = newlocale_LC_CTYPE(setlocale(LC_CTYPE, NULL));
		prt_error("Warning: Couldn't set dictionary locale! "
		          "Using current program locale %s", dict->locale);
	}
	/* If dict->locale is still not set, there is a bug. */
	assert((locale_t)0 != dict->locale_t, "Dictionary locale is not set.");
#else
	/* We don't have a locale per dictionary - but anyway make sure
	 * dict->locale is consistent with the current program's locale,
	 * and especially that it is not NULL. It still indicates the intended
	 * locale of this dictionary and the locale of the compiled regexs. */
	dict->locale = setlocale(LC_CTYPE, NULL);
#endif /* HAVE_LOCALE_T */

	/* The affix dictionary is created with a NULL affix_name of its own. */
	dict->affix_table = dictionary_six(lang, affix_name, NULL, NULL, NULL, NULL);
	if (dict->affix_table == NULL)
	{
		prt_error("Error: Could not open affix file %s", affix_name);
		goto failure;
	}
	if (! afdict_init(dict))
		goto failure;

	/*
	 * Process the regex file.
	 * We have to compile regexs using the dictionary locale,
	 * so make a temporary locale swap.
	 */
	if (read_regex_file(dict, regex_name)) goto failure;

	const char *locale = setlocale(LC_CTYPE, NULL);
	locale = strdupa(locale); /* setlocale() uses static memory. */
	setlocale(LC_CTYPE, dict->locale);
	lgdebug(+D_DICT, "Regexs locale %s\n", setlocale(LC_CTYPE, NULL));

	if (compile_regexs(dict->regex_root, dict))
	{
		/* Restore the program locale before bailing out. */
		locale = setlocale(LC_CTYPE, locale);
		goto failure;
	}
	locale = setlocale(LC_CTYPE, locale);
	assert(NULL != locale, "Cannot restore program locale\n");

#ifdef USE_CORPUS
	dict->corpus = lg_corpus_new();
#endif

	/* Record which of the special words this dictionary defines. */
	dict->left_wall_defined = boolean_dictionary_lookup(dict, LEFT_WALL_WORD);
	dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD);

	dict->empty_word_defined = boolean_dictionary_lookup(dict, EMPTY_WORD_MARK);

	dict->base_knowledge = pp_knowledge_open(pp_name);
	dict->hpsg_knowledge = pp_knowledge_open(cons_name);

	dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD);
	dict->use_unknown_word = true;

	/* unlimited_connector_set stays NULL (from the memset) if the word is absent. */
	dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD);
	if (dict_node != NULL)
		dict->unlimited_connector_set = connector_set_create(dict_node->exp);

	free_lookup(dict_node);

	return dict;

failure:
	string_set_delete(dict->string_set);
	if (dict->affix_table) xfree(dict->affix_table, sizeof(struct Dictionary_s));
	xfree(dict, sizeof(struct Dictionary_s));
	return NULL;
}
/* The following function is dictionary_create with an extra parameter
   called "path". If this is non-null, then the path used to find the
   file is taken from that path. Otherwise the path is taken from the
   dict_name. This is only needed because an affix_file is opened by a
   recursive call to this function.

   Returns the new dictionary, or NULL if the dictionary file cannot be
   opened or read. If affix_name is given and the affix dictionary cannot
   be created, lperrmsg is printed to stderr and the process exits with
   status -1.

   NOTE(review): the struct is allocated with xalloc() and not zeroed
   here, so only the fields assigned below are known to be initialized.
   NOTE(review): lperror(NODICT, dict_name) passes dict_name where other
   lperror calls in this code pass a format string - confirm this is
   intended. */
static Dictionary internal_dictionary_create(char * dict_name, char * pp_name, char * cons_name, char * affix_name, char * path)
{
	Dictionary dict;
	static int rand_table_inited=FALSE;
	Dict_node *dict_node;
	char * dictionary_path_name;

	dict = (Dictionary) xalloc(sizeof(struct Dictionary_s));

	/* The random table is initialized once, on the first call. */
	if (!rand_table_inited) {
		init_randtable();
		rand_table_inited=TRUE;
	}

	dict->string_set = string_set_create();
	dict->name = string_set_add(dict_name, dict->string_set);
	dict->num_entries = 0;
	dict->is_special = FALSE;
	dict->already_got_it = '\0';
	dict->line_number = 1;
	dict->root = NULL;
	dict->word_file_header = NULL;
	dict->exp_list = NULL;
	dict->affix_table = NULL;

	/* *DS* remove this
	if (pp_name != NULL) {
		dict->post_process_filename = string_set_add(pp_name, dict->string_set);
	} else {
		dict->post_process_filename = NULL;
	}
	*/

	/* An explicit path takes precedence over the dictionary's own name. */
	if (path != NULL) dictionary_path_name = path; else dictionary_path_name = dict_name;

	if (!open_dictionary(dictionary_path_name, dict)) {
		lperror(NODICT, dict_name);
		string_set_delete(dict->string_set);
		xfree(dict, sizeof(struct Dictionary_s));
		return NULL;
	}

	if (!read_dictionary(dict)) {
		string_set_delete(dict->string_set);
		xfree(dict, sizeof(struct Dictionary_s));
		return NULL;
	}

	dict->left_wall_defined = boolean_dictionary_lookup(dict, LEFT_WALL_WORD);
	dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD);
	dict->postprocessor = post_process_open(dict->name, pp_name);
	dict->constituent_pp = post_process_open(dict->name, cons_name);

	dict->affix_table = NULL;
	if (affix_name != NULL) {
		/* Recursive call: the affix file is located relative to dict_name,
		   and has no post-processing or affix file of its own. */
		dict->affix_table = internal_dictionary_create(affix_name, NULL, NULL, NULL, dict_name);
		if (dict->affix_table == NULL) {
			fprintf(stderr, "%s\n", lperrmsg);
			exit(-1);
		}
	}

	/* Record which of the special words this dictionary defines. */
	dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD);
	dict->use_unknown_word = TRUE;
	dict->capitalized_word_defined = boolean_dictionary_lookup(dict, PROPER_WORD);
	dict->pl_capitalized_word_defined = boolean_dictionary_lookup(dict, PL_PROPER_WORD);
	dict->hyphenated_word_defined = boolean_dictionary_lookup(dict, HYPHENATED_WORD);
	dict->number_word_defined = boolean_dictionary_lookup(dict, NUMBER_WORD);
	dict->ing_word_defined = boolean_dictionary_lookup(dict, ING_WORD);
	dict->s_word_defined = boolean_dictionary_lookup(dict, S_WORD);
	dict->ed_word_defined = boolean_dictionary_lookup(dict, ED_WORD);
	dict->ly_word_defined = boolean_dictionary_lookup(dict, LY_WORD);
	dict->max_cost = 1000;

	/* Build the connector sets from the expressions of the corresponding
	   special words, when the dictionary defines them. */
	if ((dict_node = dictionary_lookup(dict, ANDABLE_CONNECTORS_WORD)) != NULL) {
		dict->andable_connector_set = connector_set_create(dict_node->exp);
	} else {
		dict->andable_connector_set = NULL;
	}

	if ((dict_node = dictionary_lookup(dict, UNLIMITED_CONNECTORS_WORD)) != NULL) {
		dict->unlimited_connector_set = connector_set_create(dict_node->exp);
	} else {
		dict->unlimited_connector_set = NULL;
	}

	free_lookup_list();
	return dict;
}
/**
 * Compare a portion of the tokenized string, starting at word_start with
 * length of numchar, to the dictionary or affix class word that is defined
 * in the capture group whose info is pointed to by cgnump.
 *
 * If the capture group has a lookup mark, it is prepended ('p') or
 * appended ('a') to the word before the lookup.  When no affix class is
 * given the word is looked up in the dictionary; otherwise it is searched
 * for in that affix class.
 *
 * @param word_start  Start of the candidate word (need not be terminated
 *                    at numchar).
 * @param numchar     Number of bytes of word_start to use.
 * @param cgnump      Capture-group info (dictionary, affix class, mark).
 * @return            true if the word was found.  Internal errors (E1/E2)
 *                    are printed and treated as "not found".
 *
 * FIXME: Return int instead of bool, so -1 can signify an internal error.
 */
static bool is_word(const char *word_start, int numchar, cgnum_t *cgnump)
{
	Dictionary const dict = cgnump->dict;
	const char * const afclass = cgnump->afclass;
	const int lookup_mark_len =
		(NULL != cgnump->lookup_mark) ? strlen(cgnump->lookup_mark) : 0;
	char * const word = alloca(numchar+lookup_mark_len+1);
#ifdef AFFIX_DICTIONARY_TREE
	const Dict_node *dn;
#else
	const Afdict_class *ac;
	size_t i;
#endif

	/* Append/prepend stem/infix marks. */
	if (NULL == cgnump->lookup_mark)
	{
		strncpy(word, word_start, numchar);
		word[numchar] = '\0';
	}
	else
	{
		switch (cgnump->lookup_mark_pos)
		{
		case 'p': /* prepend a mark */
			strcpy(word, cgnump->lookup_mark);
			strncat(word, word_start, numchar);
			word[numchar+lookup_mark_len] = '\0';
			break;
		case 'a': /* append a mark */
			strncpy(word, word_start, numchar);
			strcpy(word+numchar, cgnump->lookup_mark);
			break;
		default:
			/* Unknown position: report it and look up the bare word. */
			printf("is_word:E3('%x' %s)", cgnump->lookup_mark_pos, cgnump->lookup_mark);
			strncpy(word, word_start, numchar);
			word[numchar] = '\0';
			break;
		}
	}

	lgdebug(7, "LOOKUP '%s' in %s: ", word, dict->name);
	if (NULL == afclass) return boolean_dictionary_lookup(dict, word);

	/* We don't have for now a tree representation of the affix file, only lists */
#ifdef AFFIX_DICTIONARY_TREE
	dn = lookup_list(dict, word);
	printf("WORD %s afclass %s dn %p\n", word, afclass, dn);
	if (NULL == dn) return false;

	for (; NULL != dn; dn = dn->left)
	{
		const char *con = word_only_connector(dn);
		if (NULL == con)
		{
			/* Internal error - nothing else to do for now unless we don't
			 * return bool, but return an int so -1 signifies an error.
			 * Skip the entry: a NULL connector must not reach strcmp(). */
			printf("is_word(%s):E1 ", word);
			continue;
		}
		printf("CON '%s'\n", con);
		if (0 == strcmp(afclass, con)) return true;
	}
#else
	/* Make it the hard way. */
	ac = afdict_find(dict, afclass, /*notify_err*/false);
	if (NULL == ac)
	{
		/* Internal error - nothing else to do for now unless we don't
		 * return bool, but return an int so -1 signifies an error.
		 * There is no class to search, so report "not found". */
		printf("is_word(%s):E2 ", word);
		return false;
	}

	for (i = 0; i < ac->length; i++)
	{
		if (0 == strcmp(ac->string[i], word)) return true;
	}
#endif

	return false;
}
/** * Read dictionary entries from a wide-character string "input". * All other parts are read from files. */ static Dictionary dictionary_six_str(const char * lang, const char * input, const char * dict_name, const char * pp_name, const char * cons_name, const char * affix_name, const char * regex_name) { const char * t; Dictionary dict; Dict_node *dict_node; dict = (Dictionary) xalloc(sizeof(struct Dictionary_s)); memset(dict, 0, sizeof(struct Dictionary_s)); dict->num_entries = 0; dict->is_special = false; dict->already_got_it = '\0'; dict->line_number = 0; dict->root = NULL; dict->regex_root = NULL; dict->word_file_header = NULL; dict->exp_list = NULL; dict->affix_table = NULL; dict->recursive_error = false; dict->version = NULL; #ifdef HAVE_SQLITE dict->db_handle = NULL; #endif #ifdef USE_ANYSPLIT dict->anysplit = NULL; #endif /* Language and file-name stuff */ dict->string_set = string_set_create(); dict->lang = lang; t = strrchr (lang, '/'); if (t) dict->lang = string_set_add(t+1, dict->string_set); dict->name = string_set_add(dict_name, dict->string_set); /* * A special setup per dictionary type. The check here assumes the affix * dictionary name contains "affix". FIXME: For not using this * assumption, the dictionary creating stuff needs a rearrangement. */ if (0 == strstr(dict->name, "affix")) { /* To disable spell-checking, just set the checker to NULL */ dict->spell_checker = spellcheck_create(dict->lang); dict->insert_entry = insert_list; dict->lookup_list = lookup_list; dict->free_lookup = free_llist; dict->lookup = boolean_lookup; } else { /* * Affix dictionary. 
*/ size_t i; dict->insert_entry = load_affix; dict->lookup = return_true; /* initialize the class table */ dict->afdict_class = malloc(sizeof(*dict->afdict_class) * NUMELEMS(afdict_classname)); for (i = 0; i < NUMELEMS(afdict_classname); i++) { dict->afdict_class[i].mem_elems = 0; dict->afdict_class[i].length = 0; dict->afdict_class[i].string = NULL; } } dict->affix_table = NULL; /* Read dictionary from the input string. */ dict->input = input; dict->pin = dict->input; if (!read_dictionary(dict)) { dict->pin = NULL; dict->input = NULL; goto failure; } dict->pin = NULL; dict->input = NULL; if (NULL == affix_name) { /* * The affix table is handled alone in this invocation. * Skip the rest of processing! * FIXME: The dictionary creating stuff needs a rearrangement. */ return dict; } dict->affix_table = dictionary_six(lang, affix_name, NULL, NULL, NULL, NULL); if (dict->affix_table == NULL) { prt_error("Error: Could not open affix file %s", affix_name); goto failure; } if (! afdict_init(dict)) goto failure; if (read_regex_file(dict, regex_name)) goto failure; if (compile_regexs(dict->regex_root, dict)) goto failure; #ifdef USE_CORPUS dict->corpus = lg_corpus_new(); #endif dict->left_wall_defined = boolean_dictionary_lookup(dict, LEFT_WALL_WORD); dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD); dict->empty_word_defined = boolean_dictionary_lookup(dict, EMPTY_WORD_MARK); dict->base_knowledge = pp_knowledge_open(pp_name); dict->hpsg_knowledge = pp_knowledge_open(cons_name); dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD); dict->use_unknown_word = true; dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD); if (dict_node != NULL) { dict->unlimited_connector_set = connector_set_create(dict_node->exp); } else { dict->unlimited_connector_set = NULL; } free_lookup(dict_node); return dict; failure: string_set_delete(dict->string_set); if (dict->affix_table) xfree(dict->affix_table, sizeof(struct 
Dictionary_s)); xfree(dict, sizeof(struct Dictionary_s)); return NULL; }
/**
 * Corrects case of first word, fills in other proper nouns, and
 * builds the expression lists for the resulting words.
 *
 * Algorithm:
 * Apply the following step to all words w:
 * if w is in the dictionary, use it.
 * else if w is upper case use PROPER_WORD disjuncts for w.
 * else if it's hyphenated, use HYPHENATED_WORD
 * else if it's a number, use NUMBER_WORD.
 *
 * Now, we correct the first word, w.
 * if w is upper case, let w' be the lower case version of w.
 * if both w and w' are in the dict, concatenate these disjuncts.
 * else if w' is in dict, use disjuncts of w'
 * else leave the disjuncts alone
 *
 * Returns TRUE on success, or FALSE as soon as special_string() or
 * guessed_string() fails for some word.
 *
 * NOTE(review): post_quote is not declared in this function; presumably
 * it is a file-level array indexed by word position - confirm.
 */
int build_sentence_expressions(Sentence sent)
{
	int i, first_word; /* the index of the first word after the wall */
	char *s, *u, temp_word[MAX_WORD+1];
	X_node * e;
	Dictionary dict = sent->dict;

	if (dict->left_wall_defined) {
		first_word = 1;
	} else {
		first_word = 0;
	}

	/* the following loop treats all words the same
	   (nothing special for 1st word) */
	for (i=0; i<sent->length; i++) {
		s = sent->word[i].string;
		if (boolean_dictionary_lookup(sent->dict, s)) {
			sent->word[i].x = build_word_expressions(sent, s);
		} else if (is_utf8_upper(s) && is_s_word(s) && dict->pl_capitalized_word_defined) {
			if (!special_string(sent, i, PL_PROPER_WORD)) return FALSE;
		} else if (is_utf8_upper(s) && dict->capitalized_word_defined) {
			if (!special_string(sent, i, PROPER_WORD)) return FALSE;
		} else if (is_number(s) && dict->number_word_defined) {
			/* we know it's a plural number, or 1 */
			/* if the string is 1, we'll only be here if 1's not in the dictionary */
			if (!special_string(sent, i, NUMBER_WORD)) return FALSE;
		} else if (ishyphenated(s) && dict->hyphenated_word_defined) {
			/* singular hyphenated */
			if (!special_string(sent, i, HYPHENATED_WORD)) return FALSE;
		}
		/* XXX
		 * The following does some morphology-guessing for words
		 * that are not in the dictionary. This should be replaced by
		 * a generic morphology-guesser for languages that aren't english.
		 * XXX
		 */
		else if (is_ing_word(s) && dict->ing_word_defined) {
			if (!guessed_string(sent, i, s, ING_WORD)) return FALSE;
		} else if (is_s_word(s) && dict->s_word_defined) {
			if (!guessed_string(sent, i, s, S_WORD)) return FALSE;
		} else if (is_ed_word(s) && dict->ed_word_defined) {
			if (!guessed_string(sent, i, s, ED_WORD)) return FALSE;
		} else if (is_ly_word(s) && dict->ly_word_defined) {
			if (!guessed_string(sent, i, s, LY_WORD)) return FALSE;
		} else if (dict->unknown_word_defined && dict->use_unknown_word) {
			handle_unknown_word(sent, i, s);
		} else {
			/* The reason I can assert this is that the word
			 * should have been looked up already if we get here.
			 */
			assert(FALSE, "I should have found that word.");
		}
	}

	/* Under certain cases--if it's the first word of the sentence,
	 * or if it follows a colon or a quotation mark--a word that's
	 * capitalized has to be looked up as an uncapitalized word
	 * (as well as a capitalized word).
	 */
	for (i=0; i<sent->length; i++) {
		/* Skip every word that is not in one of those positions. */
		if (! (i==first_word ||
		       (i>0 && strcmp(":", sent->word[i-1].string)==0) ||
		       post_quote[i]==1) ) continue;
		s = sent->word[i].string;
		if (is_utf8_upper(s)) {
			downcase_utf8_str(temp_word, s, MAX_WORD);
			u = string_set_add(temp_word, sent->string_set);

			/* If the lower-case version is in the dictionary... */
			if (boolean_dictionary_lookup(sent->dict, u)) {
				/* Then check if the upper-case version is there.
				 * If it is, the disjuncts for the upper-case version
				 * have been put there already. So add on the disjuncts
				 * for the lower-case version. */
				if (boolean_dictionary_lookup(sent->dict, s)) {
					e = build_word_expressions(sent, u);
					sent->word[i].x = catenate_X_nodes(sent->word[i].x, e);
				} else {
					/* If the upper-case version isn't there,
					 * replace the u.c. disjuncts with l.c. ones.
					 * Note that this overwrites the word's string in
					 * the sentence with the lower-case form. */
					safe_strcpy(s,u, MAX_WORD);
					e = build_word_expressions(sent, s);
					free_X_nodes(sent->word[i].x);
					sent->word[i].x = e;
				}
			}
		}
	}

	return TRUE;
}
/**
 * Split one blank-free token into left punctuation, an optional prefix,
 * the word itself, an optional suffix and right punctuation, calling
 * issue_sentence_word() for each resulting piece in left-to-right order.
 *
 * The strippable strings are collected from the affix dictionary (if the
 * sentence's dictionary has one) by connector name: "LPUNC", "RPUNC",
 * "PRE" and "SUF".
 *
 * Returns TRUE on success, FALSE as soon as issue_sentence_word() fails.
 *
 * NOTE(review): every early "return FALSE" below skips the xfree() calls
 * at the end of the function, so the four arrays allocated from the affix
 * table are not released on those paths.
 */
static int separate_word(Sentence sent, char *w, char *wend, int is_first_word, int quote_found)
{
	/* w points to a string, wend points to the char one after the end. The
	 * "word" w contains no blanks. This function splits up the word if
	 * necessary, and calls "issue_sentence_word()" on each of the resulting
	 * parts. The process is described above. returns TRUE of OK, FALSE if
	 * too many punctuation marks */
	int i, j, k, l, len;
	int r_strippable=0, l_strippable=0;
	int s_strippable=0, p_strippable=0;
	int n_r_stripped, s_stripped;
	int word_is_in_dict, s_ok;
	int r_stripped[MAX_STRIP]; /* these were stripped from the right */
	const char ** strip_left=NULL;
	const char ** strip_right=NULL;
	const char ** prefix=NULL;
	const char ** suffix=NULL;
	char word[MAX_WORD+1];
	char newword[MAX_WORD+1];
	Dict_node * dn, * dn2, * start_dn;
	const char * rpunc_con = "RPUNC";
	const char * lpunc_con = "LPUNC";
	const char * suf_con = "SUF";
	const char * pre_con = "PRE";

	if (sent->dict->affix_table!=NULL)
	{
		/* First pass over the affix entries: count each kind so the
		 * arrays can be sized. */
		start_dn = list_whole_dictionary(sent->dict->affix_table->root, NULL);
		for (dn = start_dn; dn != NULL; dn = dn->right)
		{
			if (word_has_connector(dn, rpunc_con, 0)) r_strippable++;
			if (word_has_connector(dn, lpunc_con, 0)) l_strippable++;
			if (word_has_connector(dn, suf_con, 0)) s_strippable++;
			if (word_has_connector(dn, pre_con, 0)) p_strippable++;
		}
		strip_right = (const char **) xalloc(r_strippable * sizeof(char *));
		strip_left = (const char **) xalloc(l_strippable * sizeof(char *));
		suffix = (const char **) xalloc(s_strippable * sizeof(char *));
		prefix = (const char **) xalloc(p_strippable * sizeof(char *));

		/* Second pass: fill the arrays and free the list nodes as we go. */
		i=0;
		j=0;
		k=0;
		l=0;
		dn = start_dn;
		while (dn != NULL)
		{
			if(word_has_connector(dn, rpunc_con, 0)) {
				strip_right[i] = dn->string;
				i++;
			}
			if(word_has_connector(dn, lpunc_con, 0)) {
				strip_left[j] = dn->string;
				j++;
			}
			if(word_has_connector(dn, suf_con, 0)) {
				suffix[k] = dn->string;
				k++;
			}
			if(word_has_connector(dn, pre_con, 0)) {
				prefix[l] = dn->string;
				l++;
			}
			dn2 = dn->right;
			dn->right = NULL;
			xfree(dn, sizeof(Dict_node));
			dn = dn2;
		}
	}

	/* Strip from the left for as long as some left-strippable string
	 * matches the start of w; each stripped string is issued right away. */
	for (;;) {
		for (i=0; i<l_strippable; i++) {
			if (strncmp(w, strip_left[i], strlen(strip_left[i])) == 0) {
				if (!issue_sentence_word(sent, strip_left[i])) return FALSE;
				w += strlen(strip_left[i]);
				break;
			}
		}
		if (i==l_strippable) break;
	}

	/* Now w points to the string starting just to the right of
	 * any left-stripped characters.
	 * r_stripped[] is an array of numbers, indicating the index
	 * numbers (in the strip_right array) of any strings stripped off;
	 * r_stripped[0] is the number of the first string stripped off, etc.
	 * When it breaks out of this loop, n_r_stripped will be the number
	 * of strings stripped off.
	 */
	for (n_r_stripped = 0; n_r_stripped < MAX_STRIP; n_r_stripped++)
	{
		strncpy(word, w, MIN(wend-w, MAX_WORD));
		word[MIN(wend-w, MAX_WORD)] = '\0';
		if (wend == w) break; /* it will work without this */

		/* Stop stripping once what remains is a known word. */
		if (boolean_dictionary_lookup(sent->dict, word) || is_initials_word(word)) break;

		/* This could happen if it's a word after a colon, also! */
		if (is_first_word && downcase_is_in_dict (sent->dict, word)) break;

		for (i=0; i < r_strippable; i++)
		{
			len = strlen(strip_right[i]);

			/* the remaining w is too short for a possible match */
			if ((wend-w) < len) continue;
			if (strncmp(wend-len, strip_right[i], len) == 0) {
				r_stripped[n_r_stripped] = i;
				wend -= len;
				break;
			}
		}
		if (i == r_strippable) break;
	}

	/* Now we strip off suffixes...w points to the remaining word,
	 * "wend" to the end of the word. */

	s_stripped = -1;
	strncpy(word, w, MIN(wend-w, MAX_WORD));
	word[MIN(wend-w, MAX_WORD)] = '\0';
	word_is_in_dict=0;

	if (boolean_dictionary_lookup(sent->dict, word))
		word_is_in_dict = 1;
	else if (is_initials_word(word))
		word_is_in_dict = 1;
	else if (is_first_word && downcase_is_in_dict (sent->dict,word))
		word_is_in_dict = 1;

	if(word_is_in_dict==0)
	{
		j=0;
		for (i=0; i < s_strippable+1; i++)
		{
			s_ok = 0;
			/* Go through once for each suffix; then go through one
			 * final time for the no-suffix case */
			if(i < s_strippable)
			{
				len = strlen(suffix[i]);

				/* the remaining w is too short for a possible match */
				if ((wend-w) < len) continue;
				if (strncmp(wend-len, suffix[i], len) == 0) s_ok=1;
			}
			else
				len=0;

			if(s_ok==1 || i==s_strippable)
			{
				strncpy(newword, w, MIN((wend-len)-w, MAX_WORD));
				newword[MIN((wend-len)-w, MAX_WORD)] = '\0';

				/* Check if the remainder is in the dictionary;
				 * for the no-suffix case, it won't be */
				if (boolean_dictionary_lookup(sent->dict, newword))
				{
					if(verbosity>1)
						if(i< s_strippable)
							printf("Splitting word into two: %s-%s\n", newword, suffix[i]);
					s_stripped = i;
					wend -= len;
					strncpy(word, w, MIN(wend-w, MAX_WORD));
					word[MIN(wend-w, MAX_WORD)] = '\0';
					break;
				}

				/* If the remainder isn't in the dictionary,
				 * try stripping off prefixes */
				else
				{
					for (j=0; j<p_strippable; j++)
					{
						if (strncmp(w, prefix[j], strlen(prefix[j])) == 0)
						{
							strncpy(newword, w+strlen(prefix[j]), MIN((wend-len)-(w+strlen(prefix[j])), MAX_WORD));
							newword[MIN((wend-len)-(w+strlen(prefix[j])), MAX_WORD)]='\0';
							if(boolean_dictionary_lookup(sent->dict, newword))
							{
								if(verbosity>1)
									if(i < s_strippable)
										printf("Splitting word into three: %s-%s-%s\n", prefix[j], newword, suffix[i]);
								/* The prefix is issued here; the word and the
								 * suffix are issued after the loops. */
								if (!issue_sentence_word(sent, prefix[j])) return FALSE;
								if(i < s_strippable) s_stripped = i;
								wend -= len;
								w += strlen(prefix[j]);
								strncpy(word, w, MIN(wend-w, MAX_WORD));
								word[MIN(wend-w, MAX_WORD)] = '\0';
								break;
							}
						}
					}
				}
				/* j stopped short of p_strippable only if a prefix split
				 * succeeded above; then the suffix loop is done too. */
				if(j!=p_strippable) break;
			}
		}
	}

	/* word is now what remains after all the stripping has been done */

	/*
	if (n_stripped == MAX_STRIP) {
		lperror(SEPARATE,
			".\n\"%s\" is followed by too many punctuation marks.\n", word);
		return FALSE;
	} */

	/* Record that the word about to be issued follows a quotation mark. */
	if (quote_found==1) post_quote[sent->length]=1;

	if (!issue_sentence_word(sent, word)) return FALSE;

	if(s_stripped != -1) {
		if (!issue_sentence_word(sent, suffix[s_stripped])) return FALSE;
	}

	/* Right-stripped strings were collected from the outside in, so they
	 * are issued in reverse to restore sentence order. */
	for (i=n_r_stripped-1; i>=0; i--) {

		/* Revert fix r22566, which had a commit message:
		 * "Fix Bug 9756, crash when grammar checking Word document."
		 * This fix added the line:
		 * if (r_stripped[i] > strlen(*strip_right)) continue;
		 * However, the addition of this line will break
		 * the parsing of "Doogie's mother bit her."
		 *
		 * The fix is incorrect, because a NULL has been inserted into strip_right,
		 * making it very short (length 2). Meanwhile, the offset to the 's
		 * is 9 chars (greater than 2!) The string at strip_right[r_stripped[i]]
		 * is pointing at the 's.
		 *
		 * Thus, I'm reverting this fix for now; whatever the problem is,
		 * it needs to be handled in some other way.
		 */
		if (!issue_sentence_word(sent, strip_right[r_stripped[i]])) return FALSE;
	}

	if(sent->dict->affix_table!=NULL)
	{
		xfree(strip_right, r_strippable * sizeof(char *));
		xfree(strip_left, l_strippable * sizeof(char *));
		xfree(suffix, s_strippable * sizeof(char *));
		xfree(prefix, p_strippable * sizeof(char *));
	}
	return TRUE;
}