コード例 #1
0
ファイル: dictionary.c プロジェクト: eugeneai/link-grammar
/**
 * Add the words of one affix-file entry to the affix dictionary.
 *
 * Walks the node list that starts at dn and is linked through the "left"
 * field.  For every node, a string derived from the node's word is added
 * to the affix class that afdict_find() returns for the node's connector.
 * Each node is released with xfree() once it has been handled, so the
 * caller must not use the list after this call.
 *
 * If word_only_connector() returns NULL for a node (reported to the user
 * as the word having more than one connector), a warning is issued and
 * processing stops.  The offending node and every node after it are
 * released without being added to the affix dictionary.
 *
 * @param afdict  The affix dictionary that receives the strings.
 * @param dn      Head of the node list; consumed by this function.
 * @param l       Unused.
 */
static void load_affix(Dictionary afdict, Dict_node *dn, int l)
{
	Dict_node * dnx = NULL;
	for (; NULL != dn; dn = dnx)
	{
		char *string;
		const char *con = word_only_connector(dn);
		if (NULL == con)
		{
			/* ??? should we support here more than one class? */
			prt_error("Warning: Word \"%s\" found near line %d of %s.\n"
			          "\tWord has more than one connector.\n"
			          "\tThis word will be ignored.",
			          dn->string, afdict->line_number, afdict->name);

			/* This function owns the list: release the current node and
			 * everything after it so that bailing out does not leak. */
			while (NULL != dn)
			{
				dnx = dn->left;
				xfree((char *)dn, sizeof(Dict_node));
				dn = dnx;
			}
			return;
		}

		/* The affix files serve a dual purpose: they indicate both
		 * what a unit is, connector-wise, and what is strippable, as
		 * a string.  When the unit is an 'idiom' (i.e. two words,
		 * e.g. base_pair or degrees_C) then only the first word can
		 * be stripped away from a run-on expression (e.g. "86degrees C")
		 */
		if (contains_underbar(dn->string))
		{
			char *p;
			string = strdup(dn->string);
			/* Cut the copy at the first underbar found after the first
			 * character, keeping only the leading word of the idiom. */
			p = string+1;
			while (*p != '_' && *p != '\0') p++;
			*p = '\0';
		}
		else
		{
			string = deinflect(dn->string);
		}

		affix_list_add(afdict, afdict_find(afdict, con,
		               /*notify_err*/true), string);
		/* The temporary string is freed right after the call, so it is
		 * not needed here once affix_list_add() has returned. */
		free(string);

		/* Fetch the successor before the node itself is released. */
		dnx = dn->left;
		xfree((char *)dn, sizeof(Dict_node));
	}
}
コード例 #2
0
ファイル: regex-tokenizer.c プロジェクト: linas/link-grammar
/**
 * Compare a portion of the tokenized string, starting at word_start with
 * length of numchar, to the dictionary or affix class word that is defined
 * in the capture group whose info is pointed to by cgnump.
 *
 * If the capture group has a lookup mark, it is prepended ('p') or
 * appended ('a') to the extracted word before the lookup, according to
 * cgnump->lookup_mark_pos.
 *
 * @param word_start  Start of the candidate word inside the string.
 * @param numchar     Number of chars of word_start to use.
 * @param cgnump      Capture-group info: dictionary, optional affix class
 *                    name and optional lookup mark.
 * @return true if the word is found: in the dictionary when no affix class
 *         is given, else among the words of that affix class.  false if it
 *         is not found, and also on the internal errors E1/E2.
 *
 * FIXME: Return int instead of bool, see the comment at E1 below.
 */
static bool is_word(const char *word_start, int numchar, cgnum_t *cgnump)
{
	Dictionary const dict = cgnump->dict;
	const char * const afclass = cgnump->afclass;
	const int lookup_mark_len =
		(NULL != cgnump->lookup_mark) ? strlen(cgnump->lookup_mark) : 0;
	/* Room for the word, the optional mark and the terminating NUL. */
	char * const word = alloca(numchar+lookup_mark_len+1);

	/* Append/prepend stem/infix marks. */
	if (NULL == cgnump->lookup_mark)
	{
		strncpy(word, word_start, numchar);
		word[numchar] = '\0';
	}
	else
	{
		switch (cgnump->lookup_mark_pos)
		{
		case 'p': /* prepend a mark */
			strcpy(word, cgnump->lookup_mark);
			strncat(word, word_start, numchar);
			word[numchar+lookup_mark_len] = '\0';
			break;
		case 'a': /* append a mark */
			strncpy(word, word_start, numchar);
			/* This copy also supplies the terminating NUL. */
			strcpy(word+numchar, cgnump->lookup_mark);
			break;
		default:
			/* Unknown mark position: report it and look up the bare word. */
			printf("is_word:E3('%x' %s)", cgnump->lookup_mark_pos, cgnump->lookup_mark);
			strncpy(word, word_start, numchar);
			word[numchar] = '\0';
			break;
		}
	}

	lgdebug(7, "LOOKUP '%s' in %s: ", word, dict->name);
	if (NULL == afclass) return boolean_dictionary_lookup(dict, word);

	/* We don't have for now a tree representation of the affix file, only lists */
#ifdef AFFIX_DICTIONARY_TREE
	const Dict_node *dn = lookup_list(dict, word);
	printf("WORD %s afclass %s dn %p\n", word, afclass, (const void *)dn);
	if (NULL == dn) return false;

	for (; NULL != dn; dn = dn->left)
	{
		const char *con = word_only_connector(dn);
		if (NULL == con)
		{
			/* Internal error - nothing else to do for now unless we don't
			 * return bool, but return an int so -1 signifies an error.
			 * Skip this node: a NULL connector must not reach the "%s"
			 * conversion or strcmp() below. */
			printf("is_word(%s):E1 ", word);
			continue;
		}
		printf("CON '%s'\n", con);
		if (0 == strcmp(afclass, con)) return true;
	}
#else
	/* Make it the hard way. */
	const Afdict_class *ac = afdict_find(dict, afclass, /*notify_err*/false);
	if (NULL == ac)
	{
		/* Internal error - nothing else to do for now unless we don't
		 * return bool, but return an int so -1 signifies an error.
		 * There is no class to search, so report "not found" instead of
		 * dereferencing the NULL pointer. */
		printf("is_word(%s):E2 ", word);
		return false;
	}

	for (size_t i = 0; i < ac->length; i++)
	{
		if (0 == strcmp(ac->string[i], word)) return true;
	}
#endif

	return false;
}