Esempio n. 1
0
/**
 * Create a dictionary whose lookups are served by the db_* callbacks.
 *
 * The database file name is formed by joining `lang` with "dict.db".
 * The language name stored in the dictionary is the part of `lang`
 * that follows its last '/', or all of `lang` if it contains no '/'.
 *
 * @param lang  Path naming the language (must not be NULL).
 * @return The newly allocated dictionary.
 */
Dictionary dictionary_create_from_db(const char *lang)
{
	char *dbname;
	const char * t;
	Dictionary dict;
	Dict_node *dict_node;

	dict = (Dictionary) xalloc(sizeof(struct Dictionary_s));
	memset(dict, 0, sizeof(struct Dictionary_s));

	dict->version = NULL;
	dict->num_entries = 0;
	dict->affix_table = NULL;
	dict->regex_root = NULL;

	/* Language and file-name stuff */
	dict->string_set = string_set_create();

	/* Always store a string-set copy of the language name, so that
	 * dict->lang never points into the caller's buffer (which the
	 * caller is free to release or overwrite after we return). */
	t = strrchr (lang, '/');
	t = (NULL == t) ? lang : t+1;
	dict->lang = string_set_add(t, dict->string_set);

	/* To disable spell-checking, just set the checker to NULL */
	dict->spell_checker = spellcheck_create(dict->lang);
	dict->base_knowledge = NULL;
	dict->hpsg_knowledge = NULL;

	/* The string set keeps its own copy, so the joined path can go. */
	dbname = join_path (lang, "dict.db");
	dict->name = string_set_add(dbname, dict->string_set);
	free(dbname);

	/* Set up the database */
	dict->db_handle = object_open(dict->name, db_open, NULL);

	dict->lookup_list = db_lookup_list;
	dict->free_lookup = db_free_llist;
	dict->lookup = db_lookup;
	dict->close = db_close;

	/* Misc remaining common (generic) dict setup work */
	dict->left_wall_defined  = boolean_dictionary_lookup(dict, LEFT_WALL_WORD);
	dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD);

	dict->empty_word_defined = boolean_dictionary_lookup(dict, EMPTY_WORD_MARK);

	dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD);
	dict->use_unknown_word = true;

	dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD);
	if (dict_node != NULL) {
		dict->unlimited_connector_set = connector_set_create(dict_node->exp);
	} else {
		dict->unlimited_connector_set = NULL;
	}
	free_lookup_list(dict, dict_node);

	return dict;
}
Esempio n. 2
0
/**
 * Return true if the word itself is in the dictionary. Otherwise,
 * match the word against the dictionary's regexes; if one matches,
 * return whether the name of that regex is in the dictionary.
 */
bool find_word_in_dict(const Dictionary dict, const char * word)
{
	const char * matched_name;

	if (!boolean_dictionary_lookup (dict, word))
	{
		/* Not a literal entry: fall back to the regex list. */
		matched_name = match_regex(dict->regex_root, word);
		if (matched_name == NULL) return false;

		return boolean_dictionary_lookup(dict, matched_name);
	}

	return true;
}
Esempio n. 3
0
Dictionary dictionary_create_from_db(const char *lang)
{
	char *dbname;
	const char * t;
	Dictionary dict;
	Dict_node *dict_node;

	dict = (Dictionary) xalloc(sizeof(struct Dictionary_s));
	memset(dict, 0, sizeof(struct Dictionary_s));

	/* Language and file-name stuff */
	dict->string_set = string_set_create();
	t = strrchr (lang, '/');
	t = (NULL == t) ? lang : t+1;
	dict->lang = string_set_add(t, dict->string_set);
	lgdebug(D_USER_FILES, "Debug: Language: %s\n", dict->lang);

	/* To disable spell-checking, just set the checker to NULL */
	dict->spell_checker = spellcheck_create(dict->lang);
#if defined HAVE_HUNSPELL || defined HAVE_ASPELL
	if (NULL == dict->spell_checker)
		prt_error("Info: Spell checker disabled.");
#endif
	dict->base_knowledge = NULL;
	dict->hpsg_knowledge = NULL;

	dbname = join_path (lang, "dict.db");
	dict->name = string_set_add(dbname, dict->string_set);
	free(dbname);

	/* Set up the database */
	dict->db_handle = object_open(dict->name, db_open, NULL);

	dict->lookup_list = db_lookup_list;
	dict->free_lookup = db_free_llist;
	dict->lookup = db_lookup;
	dict->close = db_close;

	/* Misc remaining common (generic) dict setup work */
	dict->left_wall_defined  = boolean_dictionary_lookup(dict, LEFT_WALL_WORD);
	dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD);

	dict->empty_word_defined = boolean_dictionary_lookup(dict, EMPTY_WORD_MARK);

	dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD);
	dict->use_unknown_word = true;

	dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD);
	if (dict_node != NULL)
		dict->unlimited_connector_set = connector_set_create(dict_node->exp);

	free_lookup_list(dict, dict_node);

	return dict;
}
Esempio n. 4
0
/**
 * Compiles all the given regexs. Returns 0 on success,
 * else an error code.
 *
 * Nodes whose `re` field is already non-NULL are assumed to be compiled
 * and are skipped. Compilation stops at the first failure, and the
 * regcomp() error code (or REG_ESPACE if memory for the compiled regex
 * could not be allocated) is returned.
 *
 * If `dict` is non-NULL, a regex whose name is not found in the
 * dictionary is reported with prt_error(), but this is not treated as
 * a failure.
 */
int compile_regexs(Regex_node *re, Dictionary dict)
{
    regex_t *preg;
    int rc;

    while (re != NULL)
    {
        /* If re->re non-null, assume compiled already. */
        if(re->re == NULL)
        {
            preg = (regex_t *) malloc (sizeof(regex_t));
            if (preg == NULL)
            {
                /* Never hand a NULL regex_t to regcomp(); re->re stays
                 * NULL so that the node is not mistaken for compiled. */
                prt_error("Error: Out of memory while compiling regex %s\n",
                          re->name);
                return REG_ESPACE;
            }
            re->re = preg;

            /* Compile as a POSIX extended regular expression. */
            rc = regcomp(preg, re->pattern, REG_EXTENDED);
            if (rc)
            {
                /* NOTE(review): re->re is left pointing at the failed
                 * regex_t, as before; confirm how the cleanup code
                 * treats such a node. */
                prt_regerror("Failed to compile regex", re, rc);
                return rc;
            }

            /* Check that the regex name is defined in the dictionary. */
            if ((NULL != dict) && !boolean_dictionary_lookup(dict, re->name))
            {
                /* TODO: better error handing. Maybe remove the regex? */
                prt_error("Error: Regex name %s not found in dictionary!\n",
                          re->name);
            }
        }
        re = re->next;
    }
    return 0;
}
Esempio n. 5
0
/**
 * Check each word of the sentence against the dictionary. A word is
 * accepted if it is in the dictionary, or if it falls into one of the
 * guessable categories (capitalized, plural capitalized, hyphenated,
 * number, -ing, -s, -ed, -ly) for which the dictionary defines the
 * corresponding special word.
 *
 * All rejected words are collected into a single error message which
 * is reported through lperror(). The sentence is not modified.
 * Returns TRUE if every word was accepted, FALSE otherwise.
 */
int sentence_in_dictionary(Sentence sent)
{
	Dictionary dict = sent->dict;
	char msg[1024];
	int all_found = TRUE;
	int idx;

	for (idx = 0; idx < sent->length; idx++)
	{
		char * wd = sent->word[idx].string;

		/* Any one of these is enough to accept the word. */
		if (boolean_dictionary_lookup(dict, wd)) continue;
		if (is_utf8_upper(wd) && dict->capitalized_word_defined) continue;
		if (is_utf8_upper(wd) && is_s_word(wd) && dict->pl_capitalized_word_defined) continue;
		if (ishyphenated(wd) && dict->hyphenated_word_defined) continue;
		if (is_number(wd) && dict->number_word_defined) continue;
		if (is_ing_word(wd) && dict->ing_word_defined) continue;
		if (is_s_word(wd) && dict->s_word_defined) continue;
		if (is_ed_word(wd) && dict->ed_word_defined) continue;
		if (is_ly_word(wd) && dict->ly_word_defined) continue;

		/* First rejected word: start the message. */
		if (all_found)
		{
			safe_strcpy(msg, "The following words are not in the dictionary:", sizeof(msg));
			all_found = FALSE;
		}
		safe_strcat(msg, " \"", sizeof(msg));
		safe_strcat(msg, wd, sizeof(msg));
		safe_strcat(msg, "\"", sizeof(msg));
	}

	if (!all_found)
	{
		lperror(NOTINDICT, "\n%s\n", msg);
	}
	return all_found;
}
Esempio n. 6
0
/**
 * Check whether `word`, with its first character converted to lower
 * case, is in the dictionary.
 *
 * The first (possibly multi-byte) character of `word` is temporarily
 * overwritten in place with its lower-case form for the lookup, and is
 * restored before returning.
 *
 * Returns FALSE if the word is not upper-case according to
 * is_utf8_upper(), if its first character cannot be converted, or if
 * the lower-case form does not occupy the same number of bytes.
 * Otherwise returns the result of the dictionary lookup.
 */
static int downcase_is_in_dict(Dictionary dict, char * word)
{
	int i, rc;
	char low[MB_LEN_MAX];
	char save[MB_LEN_MAX];
	wchar_t c;
	int nbl, nbh;

	if (!is_utf8_upper(word)) return FALSE;

	/* mbtowc() returns -1 for an invalid sequence (leaving c unset)
	 * and 0 for an empty string; wctomb() returns -1 if the character
	 * cannot be represented. None of these may reach the in-place
	 * swap below, and "-1 == -1" must not count as a length match. */
	nbh = mbtowc (&c, word, 4);
	nbl = -1;
	if (nbh > 0)
	{
		c = towlower(c);
		nbl = wctomb(low, c);
	}
	if (nbh <= 0 || nbh != nbl)
	{
		fprintf(stderr, "Error: can't downcase multi-byte string!\n");
		return FALSE;
	}

	/* Downcase */
	for (i=0; i<nbl; i++) { save[i] = word[i]; word[i] = low[i]; }

	/* Look it up, then restore old value */
	rc = boolean_dictionary_lookup(dict, word);
	for (i=0; i<nbh; i++) { word[i] = save[i]; }

	return rc;
}
Esempio n. 7
0
/**
 * Compile all the given regexs.
 * Return 0 on success, else an error code.
 *
 * Nodes whose `re` field is already non-NULL are assumed to be compiled
 * and are skipped. Compilation stops at the first failure; the failing
 * node is left with `re` set to NULL and its partially set up regex_t
 * is released. -1 is returned if memory cannot be allocated.
 *
 * If `dict` is non-NULL, a regex whose name is not found in the
 * dictionary is reported with prt_error(), but this is not treated as
 * a failure.
 */
int compile_regexs(Regex_node *rn, Dictionary dict)
{
	while (rn != NULL)
	{
		/* If rn->re non-null, assume compiled already. */
		if(rn->re == NULL)
		{
			int rc;
			regex_t *re;
#if HAVE_PCRE2_H
			PCRE2_SIZE erroffset;
#else
			const int erroffset = -1;
#endif

			re = rn->re = malloc(sizeof(regex_t));
			/* Out of memory: rn->re stays NULL, nothing to compile into. */
			if (NULL == re) return -1;

#if HAVE_PCRE2_H
			re->re_code =
				pcre2_compile((PCRE2_SPTR)rn->pattern, PCRE2_ZERO_TERMINATED,
				              PCRE2_UTF|PCRE2_UCP, &rc, &erroffset, NULL);
			if (NULL != re->re_code)
			{
				rc = 0;
				re->re_md = pcre2_match_data_create(0, NULL);
				/* NOTE(review): on this path rn->re stays set and the
				 * compiled code is not released - confirm the caller's
				 * cleanup copes with that. */
				if (NULL == re->re_md) return -1; /* Unhandled for now. */
			}
#else
			/* REG_ENHANCED is needed for macOS to support \w etc. */
#ifndef REG_ENHANCED
#define REG_ENHANCED 0
#endif
			rc = regcomp(re, rn->pattern, REG_NOSUB|REG_EXTENDED|REG_ENHANCED);
#endif

			if (rc)
			{
				/* Report while rn->re is still set, then release the
				 * regex_t so that it is not leaked when the node is
				 * marked as not compiled. */
				prt_regerror("Failed to compile regex", rn, rc ,erroffset);
				free(re);
				rn->re = NULL;
				return rc;
			}

			/* Check that the regex name is defined in the dictionary. */
			if ((NULL != dict) && !boolean_dictionary_lookup(dict, rn->name))
			{
				/* TODO: better error handing. Maybe remove the regex? */
				prt_error("Error: Regex name %s not found in dictionary!\n",
				       rn->name);
			}
		}
		rn = rn->next;
	}
	return 0;
}
Esempio n. 8
0
/**
 * Give word `i` of the sentence the expressions of the dictionary
 * entry `type`, labelled with a guessed form of the actual word `s`.
 *
 * The label is "<s>[!]" (at most 50 characters of `s` are used),
 * followed by a subscript:
 *  - if `s` is an s-word, every expression gets the subscript (at most
 *    5 characters) found after the first '.' of its own string, if any;
 *  - otherwise only the first expression is relabelled, with ".v" for
 *    an ed-word, ".g" for an ing-word, ".e" for an ly-word, or with no
 *    subscript.
 *
 * Returns TRUE on success. If `type` is not in the dictionary, an
 * error is reported through lperror() and FALSE is returned.
 */
static int guessed_string(Sentence sent, int i, const char * s, const char * type) {
	X_node * e;
	char *t, *u;
	char str[MAX_WORD+1];

	if (!boolean_dictionary_lookup(sent->dict, type)) {
		lperror(BUILDEXPR, ".\n To process this sentence your dictionary "
				"needs the word \"%s\".\n", type);
		return FALSE;
	}

	sent->word[i].x = build_word_expressions(sent, type);
	e = sent->word[i].x;

	if (is_s_word(s)) {
		for (; e != NULL; e = e->next) {
			/* snprintf() keeps the label inside str whatever the
			 * value of MAX_WORD is. */
			t = strchr(e->string, '.');
			if (t != NULL) {
				snprintf(str, sizeof(str), "%.50s[!].%.5s", s, t+1);
			} else {
				snprintf(str, sizeof(str), "%.50s[!]", s);
			}
			/* The string set stores its own copy, so the stack
			 * buffer can be handed to it directly. */
			u = string_set_add(str, sent->string_set);
			e->string = u;
		}
	} else {
		if (is_ed_word(s)) {
			snprintf(str, sizeof(str), "%.50s[!].v", s);
		} else if (is_ing_word(s)) {
			snprintf(str, sizeof(str), "%.50s[!].g", s);
		} else if (is_ly_word(s)) {
			snprintf(str, sizeof(str), "%.50s[!].e", s);
		} else {
			snprintf(str, sizeof(str), "%.50s[!]", s);
		}

		u = string_set_add(str, sent->string_set);
		/* Only the first expression is relabelled here; guard against
		 * an empty expression list. */
		if (e != NULL) e->string = u;
	}
	return TRUE;
}
Esempio n. 9
0
/**
 * Give word `i` of the sentence the expressions of the dictionary
 * entry `s`, with every expression labelled by the sentence word's own
 * string. Returns TRUE on success; if `s` is not in the dictionary, an
 * error is reported through lperror() and FALSE is returned.
 */
static int special_string(Sentence sent, int i, const char * s) {
	X_node * node;

	if (!boolean_dictionary_lookup(sent->dict, s)) {
		lperror(BUILDEXPR, ".\n To process this sentence your dictionary "
				"needs the word \"%s\".\n", s);
		return FALSE;
	}

	sent->word[i].x = build_word_expressions(sent, s);

	/* Relabel each expression with the word as it appears in the sentence. */
	node = sent->word[i].x;
	while (node != NULL) {
		node->string = sent->word[i].string;
		node = node->next;
	}
	return TRUE;
}
Esempio n. 10
0
/**
 * Create a dictionary by reading its entries from the string `input`.
 *
 * The function serves two purposes, distinguished by `affix_name`:
 *  - When `affix_name` is NULL, only the entries are read and the
 *    dictionary is returned right away (this is how the affix table
 *    itself is built, through the dictionary_six() call below).
 *  - Otherwise the locale is set up, the affix table is created from
 *    `affix_name`, the regex file `regex_name` is read and compiled,
 *    the post-processing knowledge is opened from `pp_name` and
 *    `cons_name`, and the special-word flags are filled in.
 *
 * Whether the dictionary being created is an affix dictionary is
 * decided by looking for "affix" in `dict_name`.
 *
 * Returns the new dictionary, or NULL on failure.
 */
static Dictionary
dictionary_six_str(const char * lang,
                   const char * input,
                   const char * dict_name,
                   const char * pp_name, const char * cons_name,
                   const char * affix_name, const char * regex_name)
{
	const char * t;
	Dictionary dict;
	Dict_node *dict_node;

	/* Zero-fill, so every field not set below starts out as 0/NULL. */
	dict = (Dictionary) xalloc(sizeof(struct Dictionary_s));
	memset(dict, 0, sizeof(struct Dictionary_s));

	/* Language and file-name stuff */
	dict->string_set = string_set_create();
	/* The language name is whatever follows the last '/' of lang. */
	t = strrchr (lang, '/');
	t = (NULL == t) ? lang : t+1;
	dict->lang = string_set_add(t, dict->string_set);
	lgdebug(D_USER_FILES, "Debug: Language: %s\n", dict->lang);
	dict->name = string_set_add(dict_name, dict->string_set);

	/*
	 * A special setup per dictionary type. The check here assumes the affix
	 * dictionary name contains "affix". FIXME: For not using this
	 * assumption, the dictionary creating stuff needs a rearrangement.
	 */
	if (0 == strstr(dict->name, "affix"))
	{
		/* To disable spell-checking, just set the checker to NULL */
		dict->spell_checker = spellcheck_create(dict->lang);
#if defined HAVE_HUNSPELL || defined HAVE_ASPELL
		/* TODO:
		 * 1. Set the spell option to 0, to signify no spell checking is done.
		 * 2. On verbosity >= 1, add a detailed message on the reason. */
		if (NULL == dict->spell_checker)
			prt_error("Info: Spell checker disabled.");
#endif
		dict->insert_entry = insert_list;

		dict->lookup_list = lookup_list;
		dict->free_lookup = free_llist;
		dict->lookup = boolean_lookup;
	}
	else
	{
		/*
		 * Affix dictionary.
		 */
		size_t i;

		dict->insert_entry = load_affix;
		dict->lookup = return_true;

		/* initialize the class table */
		dict->afdict_class =
		   malloc(sizeof(*dict->afdict_class) * ARRAY_SIZE(afdict_classname));
		for (i = 0; i < ARRAY_SIZE(afdict_classname); i++)
		{
			dict->afdict_class[i].mem_elems = 0;
			dict->afdict_class[i].length = 0;
			dict->afdict_class[i].string = NULL;
		}
	}
	dict->affix_table = NULL;

	/* Read dictionary from the input string. */
	dict->input = input;
	dict->pin = dict->input;
	if (!read_dictionary(dict))
	{
		dict->pin = NULL;
		dict->input = NULL;
		goto failure;
	}
	/* The input pointers are cleared once reading is done, so the
	 * dictionary does not keep referring to the caller's string. */
	dict->pin = NULL;
	dict->input = NULL;

	if (NULL == affix_name)
	{
		/*
		 * The affix table is handled alone in this invocation.
		 * Skip the rest of processing!
		 * FIXME: The dictionary creating stuff needs a rearrangement.
		 */
		return dict;
	}

	/* If we don't have a locale per dictionary, the following
	 * will also set the program's locale. */
	dict->locale = linkgrammar_get_dict_locale(dict);
	set_utf8_program_locale();

#ifdef HAVE_LOCALE_T
	/* We have a locale per dictionary. */
	if (NULL != dict->locale)
		dict->locale_t = newlocale_LC_CTYPE(dict->locale);

	/* If we didn't succeed to set the dictionary locale, the program will
	 * SEGFAULT when it tries to use it with the isw*() functions.
	 * So set it to the current program's locale as a last resort. */
	if (NULL == dict->locale)
	{
		dict->locale = setlocale(LC_CTYPE, NULL);
		dict->locale_t = newlocale_LC_CTYPE(setlocale(LC_CTYPE, NULL));
		prt_error("Warning: Couldn't set dictionary locale! "
		          "Using current program locale %s", dict->locale);
	}
	/* If dict->locale is still not set, there is a bug. */
	assert((locale_t)0 != dict->locale_t, "Dictionary locale is not set.");
#else
	/* We don't have a locale per dictionary - but anyway make sure
	 * dict->locale is consistent with the current program's locale,
	 * and especially that it is not NULL. It still indicates the intended
	 * locale of this dictionary and the locale of the compiled regexs. */
	dict->locale = setlocale(LC_CTYPE, NULL);
#endif /* HAVE_LOCALE_T */

	/* The affix table is built with the remaining name arguments set
	 * to NULL. */
	dict->affix_table = dictionary_six(lang, affix_name, NULL, NULL, NULL, NULL);
	if (dict->affix_table == NULL)
	{
		prt_error("Error: Could not open affix file %s", affix_name);
		goto failure;
	}
	if (! afdict_init(dict))
		goto failure;

	/*
	 * Process the regex file.
	 * We have to compile regexs using the dictionary locale,
	 * so make a temporary locale swap.
	 */
	if (read_regex_file(dict, regex_name)) goto failure;

	/* Remember the program locale so it can be restored afterwards. */
	const char *locale = setlocale(LC_CTYPE, NULL);
	locale = strdupa(locale); /* setlocale() uses static memory. */
	setlocale(LC_CTYPE, dict->locale);
	lgdebug(+D_DICT, "Regexs locale %s\n", setlocale(LC_CTYPE, NULL));

	if (compile_regexs(dict->regex_root, dict))
	{
		/* Restore the program locale before bailing out. */
		locale = setlocale(LC_CTYPE, locale);
		goto failure;
	}
	locale = setlocale(LC_CTYPE, locale);
	assert(NULL != locale, "Cannot restore program locale\n");

#ifdef USE_CORPUS
	dict->corpus = lg_corpus_new();
#endif

	/* Record which of the special words this dictionary defines. */
	dict->left_wall_defined  = boolean_dictionary_lookup(dict, LEFT_WALL_WORD);
	dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD);

	dict->empty_word_defined = boolean_dictionary_lookup(dict, EMPTY_WORD_MARK);

	dict->base_knowledge  = pp_knowledge_open(pp_name);
	dict->hpsg_knowledge  = pp_knowledge_open(cons_name);

	dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD);
	dict->use_unknown_word = true;

	/* unlimited_connector_set stays NULL (from the memset above) when
	 * the dictionary does not define UNLIMITED_CONNECTORS_WORD. */
	dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD);
	if (dict_node != NULL)
		dict->unlimited_connector_set = connector_set_create(dict_node->exp);

	free_lookup(dict_node);

	return dict;

failure:
	/* NOTE(review): only the string set and the two Dictionary_s structs
	 * are released here; whatever else was allocated above (class table,
	 * entries that were read, spell checker, ...) is not freed in this
	 * function - confirm whether that is handled elsewhere. */
	string_set_delete(dict->string_set);
	if (dict->affix_table) xfree(dict->affix_table, sizeof(struct Dictionary_s));
	xfree(dict, sizeof(struct Dictionary_s));
	return NULL;
}
Esempio n. 11
0
File: api.c Progetto: mclumd/Alfred
/* The following function is dictionary_create with an extra parameter called "path".
   If this is non-null, then the path used to find the file is taken from that path.
   Otherwise the path is taken from the dict_name.  This is only needed because
   an affix_file is opened by a recursive call to this function.

   dict_name   name of the dictionary file; also stored as the dictionary's name.
   pp_name     passed to post_process_open() to create the postprocessor.
   cons_name   passed to post_process_open() to create the constituent postprocessor.
   affix_name  if non-NULL, name of the affix file, loaded into dict->affix_table.
   path        see above; may be NULL.

   Returns the new dictionary, or NULL if the dictionary file could not be opened
   or read. If the affix file is given but cannot be loaded, the error message is
   printed and the program exits.
 */
static Dictionary internal_dictionary_create(char * dict_name, char * pp_name, char * cons_name, char * affix_name, char * path) {
    Dictionary dict;
    static int rand_table_inited=FALSE;
    Dict_node *dict_node;
    char * dictionary_path_name;

    dict = (Dictionary) xalloc(sizeof(struct Dictionary_s));

    /* The random table is initialized once, on the first call. */
    if (!rand_table_inited) {
        init_randtable();
        rand_table_inited=TRUE;
    }

    dict->string_set = string_set_create();
    dict->name = string_set_add(dict_name, dict->string_set);
    dict->num_entries = 0;
    dict->is_special = FALSE;
    dict->already_got_it = '\0';
    dict->line_number = 1;
    dict->root = NULL;
    dict->word_file_header = NULL;
    dict->exp_list = NULL;
    dict->affix_table = NULL;

    if (path != NULL) dictionary_path_name = path; else dictionary_path_name = dict_name;

    if (!open_dictionary(dictionary_path_name, dict)) {
        /* lperror() takes a printf-style format: never pass the file
         * name itself as the format, it may contain '%'. */
        lperror(NODICT, "%s", dict_name);
        string_set_delete(dict->string_set);
        xfree(dict, sizeof(struct Dictionary_s));
        return NULL;
    }

    if (!read_dictionary(dict)) {
        string_set_delete(dict->string_set);
        xfree(dict, sizeof(struct Dictionary_s));
        return NULL;
    }

    dict->left_wall_defined  = boolean_dictionary_lookup(dict, LEFT_WALL_WORD);
    dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD);
    dict->postprocessor      = post_process_open(dict->name, pp_name);
    dict->constituent_pp     = post_process_open(dict->name, cons_name);

    /* The affix file is located relative to this dictionary, hence
     * dict_name is passed as the path of the recursive call. */
    dict->affix_table = NULL;
    if (affix_name != NULL) {
        dict->affix_table = internal_dictionary_create(affix_name, NULL, NULL, NULL, dict_name);
        if (dict->affix_table == NULL) {
            fprintf(stderr, "%s\n", lperrmsg);
            exit(-1);
        }
    }

    /* Record which of the special words this dictionary defines. */
    dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD);
    dict->use_unknown_word = TRUE;
    dict->capitalized_word_defined = boolean_dictionary_lookup(dict, PROPER_WORD);
    dict->pl_capitalized_word_defined = boolean_dictionary_lookup(dict, PL_PROPER_WORD);
    dict->hyphenated_word_defined = boolean_dictionary_lookup(dict, HYPHENATED_WORD);
    dict->number_word_defined = boolean_dictionary_lookup(dict, NUMBER_WORD);
    dict->ing_word_defined = boolean_dictionary_lookup(dict, ING_WORD);
    dict->s_word_defined = boolean_dictionary_lookup(dict, S_WORD);
    dict->ed_word_defined = boolean_dictionary_lookup(dict, ED_WORD);
    dict->ly_word_defined = boolean_dictionary_lookup(dict, LY_WORD);
    dict->max_cost = 1000;

    if ((dict_node = dictionary_lookup(dict, ANDABLE_CONNECTORS_WORD)) != NULL) {
        dict->andable_connector_set = connector_set_create(dict_node->exp);
    } else {
        dict->andable_connector_set = NULL;
    }

    if ((dict_node = dictionary_lookup(dict, UNLIMITED_CONNECTORS_WORD)) != NULL) {
        dict->unlimited_connector_set = connector_set_create(dict_node->exp);
    } else {
        dict->unlimited_connector_set = NULL;
    }

    free_lookup_list();
    return dict;
}
Esempio n. 12
0
/**
 * Compare a portion of the tokenized string, starting at word_start with
 * length of numchar, to the dictionary or affix class word that is defined
 * in the capture group whose info is pointed to by cgnump.
 *
 * If cgnump->lookup_mark is set, it is prepended ('p') or appended ('a')
 * to the word before the lookup, according to cgnump->lookup_mark_pos.
 *
 * If cgnump->afclass is not set, the result is that of a dictionary
 * lookup. Otherwise the word is searched for in the given affix class.
 *
 * Returns true if the word is found, else false. An internal error (the
 * affix class cannot be found) is reported on stdout and also yields
 * false.
 *
 * FIXME: Return int instead of bool, see the comments at E1/E2 below.
 */
static bool is_word(const char *word_start, int numchar, cgnum_t *cgnump)
{
	Dictionary const dict = cgnump->dict;
	const char * const afclass = cgnump->afclass;
	const int lookup_mark_len =
		(NULL != cgnump->lookup_mark) ? strlen(cgnump->lookup_mark) : 0;
	/* Room for the word, the optional mark and the terminating NUL. */
	char * const word = alloca(numchar+lookup_mark_len+1);
#ifdef AFFIX_DICTIONARY_TREE
	const Dict_node *dn;
#endif
	const Afdict_class *ac;
	size_t i;

	/* Append/prepend stem/infix marks. */
	if (NULL == cgnump->lookup_mark)
	{
		strncpy(word, word_start, numchar);
		word[numchar] = '\0';
	}
	else
	{
		switch (cgnump->lookup_mark_pos)
		{
		case 'p': /* prepend a mark */
			strcpy(word, cgnump->lookup_mark);
			strncat(word, word_start, numchar);
			word[numchar+lookup_mark_len] = '\0';
			break;
		case 'a': /* append a mark */
			strncpy(word, word_start, numchar);
			/* This strcpy() also supplies the terminating NUL. */
			strcpy(word+numchar, cgnump->lookup_mark);
			break;
		default:
			/* Unknown position: report it and use the bare word. */
			printf("is_word:E3('%x' %s)", cgnump->lookup_mark_pos, cgnump->lookup_mark);
			strncpy(word, word_start, numchar);
			word[numchar] = '\0';
		}
	}

	lgdebug(7, "LOOKUP '%s' in %s: ", word, dict->name);
	if (0 == afclass) return boolean_dictionary_lookup(dict, word);

	/* We don't have for now a tree representation of the affix file, only lists */
#ifdef AFFIX_DICTIONARY_TREE
	dn = lookup_list(dict, word);
	printf("WORD %s afclass %s dn %p\n", word, afclass, dn);
	if (NULL == dn) return false;

	for (; NULL != dn; dn = dn->left)
	{
		const char *con = word_only_connector(dn);
		if (NULL == con)
		{
			/* Internal error - nothing else to do for now unless we don't
			 * return bool, but return an int so -1 signifies an error.
			 * Skip the entry: a NULL connector must reach neither
			 * printf("%s") nor strcmp(). */
			printf("is_word(%s):E1 ", word);
			continue;
		}
		printf("CON '%s'\n", con);
		if (0 == strcmp(afclass, con)) return true;
	}
#else
	/* Make it the hard way. */
	ac = afdict_find(dict, afclass, /*notify_err*/false);
	if (NULL == ac)
	{
		/* Internal error - nothing else to do for now unless we don't
		 * return bool, but return an int so -1 signifies an error.
		 * There is no class to search, so the word is "not found". */
		printf("is_word(%s):E2 ", word);
		return false;
	}

	for (i = 0; i < ac->length; i++)
	{
		if (0 == strcmp(ac->string[i], word)) return true;
	}
#endif

	return false;
}
Esempio n. 13
0
/**
 * Read dictionary entries from a wide-character string "input".
 * All other parts are read from files.
 */
static Dictionary
dictionary_six_str(const char * lang,
                   const char * input,
                   const char * dict_name,
                   const char * pp_name, const char * cons_name,
                   const char * affix_name, const char * regex_name)
{
	const char * t;
	Dictionary dict;
	Dict_node *dict_node;

	dict = (Dictionary) xalloc(sizeof(struct Dictionary_s));
	memset(dict, 0, sizeof(struct Dictionary_s));

	dict->num_entries = 0;
	dict->is_special = false;
	dict->already_got_it = '\0';
	dict->line_number = 0;
	dict->root = NULL;
	dict->regex_root = NULL;
	dict->word_file_header = NULL;
	dict->exp_list = NULL;
	dict->affix_table = NULL;
	dict->recursive_error = false;
	dict->version = NULL;
#ifdef HAVE_SQLITE
	dict->db_handle = NULL;
#endif
#ifdef USE_ANYSPLIT
	dict->anysplit = NULL;
#endif

	/* Language and file-name stuff */
	dict->string_set = string_set_create();
	dict->lang = lang;
	t = strrchr (lang, '/');
	if (t) dict->lang = string_set_add(t+1, dict->string_set);
	dict->name = string_set_add(dict_name, dict->string_set);

	/*
	 * A special setup per dictionary type. The check here assumes the affix
	 * dictionary name contains "affix". FIXME: For not using this
	 * assumption, the dictionary creating stuff needs a rearrangement.
	 */
	if (0 == strstr(dict->name, "affix"))
	{
		/* To disable spell-checking, just set the checker to NULL */
		dict->spell_checker = spellcheck_create(dict->lang);
		dict->insert_entry = insert_list;

		dict->lookup_list = lookup_list;
		dict->free_lookup = free_llist;
		dict->lookup = boolean_lookup;
	}
	else
	{
		/*
		 * Affix dictionary.
		 */
		size_t i;

		dict->insert_entry = load_affix;
		dict->lookup = return_true;

		/* initialize the class table */
		dict->afdict_class =
		   malloc(sizeof(*dict->afdict_class) * NUMELEMS(afdict_classname));
		for (i = 0; i < NUMELEMS(afdict_classname); i++)
		{
			dict->afdict_class[i].mem_elems = 0;
			dict->afdict_class[i].length = 0;
			dict->afdict_class[i].string = NULL;
		}
	}
	dict->affix_table = NULL;

	/* Read dictionary from the input string. */
	dict->input = input;
	dict->pin = dict->input;
	if (!read_dictionary(dict))
	{
		dict->pin = NULL;
		dict->input = NULL;
		goto failure;
	}
	dict->pin = NULL;
	dict->input = NULL;

	if (NULL == affix_name)
	{
		/*
		 * The affix table is handled alone in this invocation.
		 * Skip the rest of processing!
		 * FIXME: The dictionary creating stuff needs a rearrangement.
		 */
		return dict;
	}

	dict->affix_table = dictionary_six(lang, affix_name, NULL, NULL, NULL, NULL);
	if (dict->affix_table == NULL)
	{
		prt_error("Error: Could not open affix file %s", affix_name);
		goto failure;
	}
	if (! afdict_init(dict))
		goto failure;

	if (read_regex_file(dict, regex_name)) goto failure;
	if (compile_regexs(dict->regex_root, dict)) goto failure;

#ifdef USE_CORPUS
	dict->corpus = lg_corpus_new();
#endif

	dict->left_wall_defined  = boolean_dictionary_lookup(dict, LEFT_WALL_WORD);
	dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD);

	dict->empty_word_defined = boolean_dictionary_lookup(dict, EMPTY_WORD_MARK);

	dict->base_knowledge  = pp_knowledge_open(pp_name);
	dict->hpsg_knowledge  = pp_knowledge_open(cons_name);

	dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD);
	dict->use_unknown_word = true;

	dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD);
	if (dict_node != NULL) {
		dict->unlimited_connector_set = connector_set_create(dict_node->exp);
	} else {
		dict->unlimited_connector_set = NULL;
	}
	free_lookup(dict_node);

	return dict;

failure:
	string_set_delete(dict->string_set);
	if (dict->affix_table) xfree(dict->affix_table, sizeof(struct Dictionary_s));
	xfree(dict, sizeof(struct Dictionary_s));
	return NULL;
}
Esempio n. 14
0
/**
 * Corrects case of first word, fills in other proper nouns, and
 * builds the expression lists for the resulting words.
 *
 * Algorithm:
 * Apply the following step to all words w (each fallback is used only
 * if the dictionary defines the corresponding special word):
 * if w is in the dictionary, use it.
 * else if w is upper case and an s-word, use PL_PROPER_WORD disjuncts.
 * else if w is upper case, use PROPER_WORD disjuncts for w.
 * else if it's a number, use NUMBER_WORD.
 * else if it's hyphenated, use HYPHENATED_WORD.
 * else if it's an -ing, -s, -ed or -ly word, use the guessed
 *      ING_WORD, S_WORD, ED_WORD or LY_WORD disjuncts.
 * else if unknown words are defined and enabled, handle w as an
 *      unknown word.
 *
 * Now, we correct the first word, w (and likewise any word that
 * follows a colon or is flagged in post_quote[]).
 * if w is upper case, let w' be the lower case version of w.
 * if both w and w' are in the dict, concatenate these disjuncts.
 * else if w' is in dict, use disjuncts of w'
 * else leave the disjuncts alone
 *
 * Returns TRUE on success, FALSE if one of the special words could
 * not be set up.
 */
int build_sentence_expressions(Sentence sent)
{
	int i, first_word;  /* the index of the first word after the wall */
	char *s, *u, temp_word[MAX_WORD+1];
	X_node * e;
	Dictionary dict = sent->dict;

	/* If the dictionary defines a left wall, the first word is at
	 * index 1 rather than 0. */
	if (dict->left_wall_defined) {
		first_word = 1;
	} else {
		first_word = 0;
	}

	/* the following loop treats all words the same
	   (nothing special for 1st word) */
	for (i=0; i<sent->length; i++)
	{
		s = sent->word[i].string;
		if (boolean_dictionary_lookup(sent->dict, s))
		{
			sent->word[i].x = build_word_expressions(sent, s);
		}
		else if (is_utf8_upper(s) && is_s_word(s) && dict->pl_capitalized_word_defined) 
		{
			/* This test must come before the plain capitalized one,
			 * which would otherwise also match these words. */
			if (!special_string(sent, i, PL_PROPER_WORD)) return FALSE;
		}
		else if (is_utf8_upper(s) && dict->capitalized_word_defined)
		{
			if (!special_string(sent, i, PROPER_WORD)) return FALSE;
		}
		else if (is_number(s) && dict->number_word_defined)
		{
			/* we know it's a plural number, or 1 */
			/* if the string is 1, we'll only be here if 1's not in the dictionary */
			if (!special_string(sent, i, NUMBER_WORD)) return FALSE;
		}
		else if (ishyphenated(s) && dict->hyphenated_word_defined)
		{
			/* singular hyphenated */
			if (!special_string(sent, i, HYPHENATED_WORD)) return FALSE;
		} 
		/* XXX
		 * The following does some morphology-guessing for words
		 * that are not in the dictionary. This should be replaced by
		 * a generic morphology-guesser for languages that aren't English.
		 * XXX
		 */
		else if (is_ing_word(s) && dict->ing_word_defined) 
		{
			if (!guessed_string(sent, i, s, ING_WORD)) return FALSE;
		}
		else if (is_s_word(s) && dict->s_word_defined)
		{
			if (!guessed_string(sent, i, s, S_WORD)) return FALSE;
		}
		else if (is_ed_word(s) && dict->ed_word_defined)
		{
			if (!guessed_string(sent, i, s, ED_WORD)) return FALSE;
		}
		else if (is_ly_word(s) && dict->ly_word_defined)
		{
			if (!guessed_string(sent, i, s, LY_WORD)) return FALSE;
		}
		else if (dict->unknown_word_defined && dict->use_unknown_word)
		{
			handle_unknown_word(sent, i, s);
		}
		else 
		{
			/* The reason I can assert this is that the word
			 * should have been looked up already if we get here.
			 */
			assert(FALSE, "I should have found that word.");
		}
	}

	/* Under certain cases--if it's the first word of the sentence,
	 * or if it follows a colon or a quotation mark--a word that's 
	 * capitalized has to be looked up as an uncapitalized word
	 * (as well as a capitalized word).
	 */
	for (i=0; i<sent->length; i++)
	{
		/* Skip every word that is not in one of those positions. */
		if (! (i==first_word || (i>0 && strcmp(":", sent->word[i-1].string)==0) || post_quote[i]==1) ) continue;
		s = sent->word[i].string;

		if (is_utf8_upper(s))
		{
			downcase_utf8_str(temp_word, s, MAX_WORD);
			u = string_set_add(temp_word, sent->string_set);

			/* If the lower-case version is in the dictionary... */
			if (boolean_dictionary_lookup(sent->dict, u))
			{
				/* Then check if the upper-case version is there. 
				 * If it is, the disjuncts for the upper-case version 
				 * have been put there already. So add on the disjuncts
				 * for the lower-case version. */
				if (boolean_dictionary_lookup(sent->dict, s))
				{
					e = build_word_expressions(sent, u);
					sent->word[i].x =
						catenate_X_nodes(sent->word[i].x, e);
				} 
				else
				{
					/* If the upper-case version isn't there,
					 * replace the u.c. disjuncts with l.c. ones.
					 * Note that this also overwrites the word stored
					 * in the sentence with its lower-case form.
					 */
					safe_strcpy(s,u, MAX_WORD);
					e = build_word_expressions(sent, s);
					free_X_nodes(sent->word[i].x);
					sent->word[i].x = e;
				}
			}
		}
	}

	return TRUE;
}
Esempio n. 15
0
static int separate_word(Sentence sent, char *w, char *wend, int is_first_word, int quote_found)
{
	/* w points to a string, wend points to the char one after the end.  The
	 * "word" w contains no blanks.  This function splits up the word if
	 * necessary, and calls "issue_sentence_word()" on each of the resulting
	 * parts.  The process is described above.  returns TRUE of OK, FALSE if
	 * too many punctuation marks */
	int i, j, k, l, len;
	int r_strippable=0, l_strippable=0;
	int s_strippable=0, p_strippable=0;
	int  n_r_stripped, s_stripped;
	int word_is_in_dict, s_ok;
	int r_stripped[MAX_STRIP];  /* these were stripped from the right */
	const char ** strip_left=NULL;
	const char ** strip_right=NULL;
	const char ** prefix=NULL;
	const char ** suffix=NULL;
	char word[MAX_WORD+1];
	char newword[MAX_WORD+1];
	Dict_node * dn, * dn2, * start_dn;
	const char * rpunc_con = "RPUNC";
	const char * lpunc_con = "LPUNC";
	const char * suf_con = "SUF";
	const char * pre_con = "PRE";

	if (sent->dict->affix_table!=NULL)
	{
		start_dn = list_whole_dictionary(sent->dict->affix_table->root, NULL);
		for (dn = start_dn; dn != NULL; dn = dn->right)
		{
			if (word_has_connector(dn, rpunc_con, 0)) r_strippable++;
			if (word_has_connector(dn, lpunc_con, 0)) l_strippable++;
			if (word_has_connector(dn, suf_con, 0)) s_strippable++;
			if (word_has_connector(dn, pre_con, 0)) p_strippable++;
	  	}
		strip_right = (const char **) xalloc(r_strippable * sizeof(char *));
		strip_left = (const char **) xalloc(l_strippable * sizeof(char *));
		suffix = (const char **) xalloc(s_strippable * sizeof(char *));
		prefix = (const char **) xalloc(p_strippable * sizeof(char *));

		i=0;
		j=0;
		k=0;
		l=0;
		dn = start_dn;
		while (dn != NULL)
		{
			if(word_has_connector(dn, rpunc_con, 0)) {
				strip_right[i] = dn->string;
				i++;
			}
			if(word_has_connector(dn, lpunc_con, 0)) {
				strip_left[j] = dn->string;
				j++;
			}
			if(word_has_connector(dn, suf_con, 0)) {
				suffix[k] = dn->string;
				k++;
			}
			if(word_has_connector(dn, pre_con, 0)) {
				prefix[l] = dn->string;
				l++;
			}
			dn2 = dn->right;
			dn->right = NULL;
			xfree(dn, sizeof(Dict_node));
			dn = dn2;
		}
	}

	for (;;) {
		for (i=0; i<l_strippable; i++) {
			if (strncmp(w, strip_left[i], strlen(strip_left[i])) == 0) {
				if (!issue_sentence_word(sent, strip_left[i])) return FALSE;
				w += strlen(strip_left[i]);
				break;
			}
		}
		if (i==l_strippable) break;
	}

	/* Now w points to the string starting just to the right of
	 * any left-stripped characters.
	 * stripped[] is an array of numbers, indicating the index
	 * numbers (in the strip_right array) of any strings stripped off;
	 * stripped[0] is the number of the first string stripped off, etc.
	 * When it breaks out of this loop, n_stripped will be the number
	 * of strings stripped off.
	 */
	for (n_r_stripped = 0; n_r_stripped < MAX_STRIP; n_r_stripped++) 
	{
		strncpy(word, w, MIN(wend-w, MAX_WORD));
		word[MIN(wend-w, MAX_WORD)] = '\0';
		if (wend == w) break;  /* it will work without this */

		if (boolean_dictionary_lookup(sent->dict, word) || is_initials_word(word)) break;

		/* This could happen if it's a word after a colon, also! */
		if (is_first_word && downcase_is_in_dict (sent->dict, word)) break;

		for (i=0; i < r_strippable; i++)
		{
			len = strlen(strip_right[i]);

			/* the remaining w is too short for a possible match */
			if ((wend-w) < len) continue;
			if (strncmp(wend-len, strip_right[i], len) == 0) {
				r_stripped[n_r_stripped] = i;
				wend -= len;
				break;
			}
		}
		if (i == r_strippable) break;
	}

	/* Now we strip off suffixes...w points to the remaining word, 
	 * "wend" to the end of the word. */

	s_stripped = -1;
	strncpy(word, w, MIN(wend-w, MAX_WORD));
	word[MIN(wend-w, MAX_WORD)] = '\0';
	word_is_in_dict=0;

	if (boolean_dictionary_lookup(sent->dict, word))
		word_is_in_dict = 1;
	else if (is_initials_word(word))
		word_is_in_dict = 1;
	else if (is_first_word && downcase_is_in_dict (sent->dict,word))
		word_is_in_dict = 1;

	if(word_is_in_dict==0)
	{
	  j=0;
	  for (i=0; i < s_strippable+1; i++) {
		s_ok = 0;
		/* Go through once for each suffix; then go through one 
		 * final time for the no-suffix case */
		if(i < s_strippable) {
		  len = strlen(suffix[i]);

		  /* the remaining w is too short for a possible match */
		  if ((wend-w) < len) continue;

		  if (strncmp(wend-len, suffix[i], len) == 0) s_ok=1;
				  }
		else len=0;

		if(s_ok==1 || i==s_strippable)
		{
			strncpy(newword, w, MIN((wend-len)-w, MAX_WORD));
			newword[MIN((wend-len)-w, MAX_WORD)] = '\0';

			/* Check if the remainder is in the dictionary;
			 * for the no-suffix case, it won't be */
			if (boolean_dictionary_lookup(sent->dict, newword)) {
				if(verbosity>1) if(i< s_strippable) printf("Splitting word into two: %s-%s\n", newword, suffix[i]);
				s_stripped = i;
				wend -= len;
				strncpy(word, w, MIN(wend-w, MAX_WORD));
				word[MIN(wend-w, MAX_WORD)] = '\0';
				break;
			}

			/* If the remainder isn't in the dictionary, 
			 * try stripping off prefixes */
		  else {
			for (j=0; j<p_strippable; j++) {
			  if (strncmp(w, prefix[j], strlen(prefix[j])) == 0) {
				strncpy(newword, w+strlen(prefix[j]), MIN((wend-len)-(w+strlen(prefix[j])), MAX_WORD));
				newword[MIN((wend-len)-(w+strlen(prefix[j])), MAX_WORD)]='\0';
				if(boolean_dictionary_lookup(sent->dict, newword)) {
				  if(verbosity>1) if(i < s_strippable) printf("Splitting word into three: %s-%s-%s\n", prefix[j], newword, suffix[i]);
				  if (!issue_sentence_word(sent, prefix[j])) return FALSE;
				  if(i < s_strippable) s_stripped = i;
				  wend -= len;
				  w += strlen(prefix[j]);
				  strncpy(word, w, MIN(wend-w, MAX_WORD));
				  word[MIN(wend-w, MAX_WORD)] = '\0';
				  break;
				}
			  }
			}
		  }
		  if(j!=p_strippable) break;
		}
	  }
	}

	/* word is now what remains after all the stripping has been done */

	/*
	if (n_stripped == MAX_STRIP) {
		lperror(SEPARATE,
				".\n\"%s\" is followed by too many punctuation marks.\n", word);
		return FALSE;
	} */

	if (quote_found==1) post_quote[sent->length]=1;

	if (!issue_sentence_word(sent, word)) return FALSE;

	if(s_stripped != -1) {
	  if (!issue_sentence_word(sent, suffix[s_stripped])) return FALSE;
	}

	for (i=n_r_stripped-1; i>=0; i--) {

		/* Revert fix r22566, which had a commit message:
		 * "Fix Bug 9756, crash when grammar checking Word document."
		 * This fix added the line:
		 *    if (r_stripped[i] > strlen(*strip_right)) continue;
		 * However, the addition of this line will break
		 * the parsing of "Doogie's mother bit her."
		 *
		 * The fix is incorrect, because a NULL has been inserted into strip_right,
		 * making it very short (length 2). Meanwhile, the offset to the 's 
		 * is 9 chars (greater than 2!)  The string at strip_right[r_stripped[i]]
		 * is pointing at the 's.
		 *
		 * Thus, I'm reverting this fix for now; whatever the problem is,
		 * it needs to be handled in some other way.
		 */
		if (!issue_sentence_word(sent, strip_right[r_stripped[i]])) return FALSE;
	}

	if(sent->dict->affix_table!=NULL) {
	  xfree(strip_right, r_strippable * sizeof(char *));
	  xfree(strip_left, l_strippable * sizeof(char *));
	  xfree(suffix, s_strippable * sizeof(char *));
	  xfree(prefix, p_strippable * sizeof(char *));
	}
	return TRUE;
}