Esempio n. 1
0
/**
 * Merge all the definitions of an affix class into a single string.
 *
 * A class whose members are individual characters may be given in the
 * affix file as several short definitions instead of one long string.
 * For example, instead of writing all the quote characters as one token:
 *   «»《》【】『』`„: QUOTES+;
 * they may be separated by spaces:
 *   «» 《》 【】 『』 ` „: QUOTES+;
 * or even spread over several entries:
 *   «» : QUOTES+;
 *   《》 : QUOTES+;
 *
 * The definitions are joined in list order and the result replaces the
 * first entry of the class (the class length is left as it was).
 * A class with zero or one definition is left untouched.
 *
 * @param afdict The affix dictionary holding the class.
 * @param classno The affix class to process.
 */
static void concat_class(Dictionary afdict, int classno)
{
	Afdict_class * const cls = AFCLASS(afdict, classno);
	dyn_str * joined;
	size_t n = 0;

	/* Nothing to join when there is at most one definition. */
	if (cls->length <= 1) return;

	joined = dyn_str_new();
	while (n < cls->length)
	{
		dyn_strcat(joined, cls->string[n]);
		n++;
	}

	/* Keep the joined text in the dictionary's string set; the
	 * temporary buffer can then be released. */
	cls->string[0] = string_set_add(joined->str, afdict->string_set);
	dyn_str_delete(joined);
}
Esempio n. 2
0
/**
 * Convert the utf8 characters of an affix class to one wide-char string.
 *
 * The reason for doing this is kind-of dorky: it lets the caller find,
 * character-by-character, whether a given character is a quotation mark
 * or a bullet.  This works only because the quotation marks and bullets
 * are exactly one (wide) character in length.  It would be nicer not to
 * do this wide-char conversion at all, since wide-chars are
 * badly-behaved in crazy locales, and on MS Windows.
 *
 * All the strings of the class are concatenated and converted.  On
 * success the string array of the class is replaced by a heap-allocated,
 * NUL-terminated wchar_t buffer (stored through ac->string), and
 * ac->mem_elems holds its size in bytes.  ac->length is not modified.
 * On failure the class is left unchanged.
 *
 * @param afdict The affix dictionary holding the class.
 * @param classno The affix class to convert.
 * @return true on success or if the class is empty; false on an invalid
 * multibyte sequence or if memory cannot be allocated.
 */
static bool afdict_to_wide(Dictionary afdict, int classno)
{
	Afdict_class * ac;
	wchar_t * wqs;
	mbstate_t mbs;
	size_t i;
	size_t w;        /* mbsrtowcs() returns size_t, (size_t)-1 on error */
	size_t nbytes;
	dyn_str * qs;
	const char *pqs;

	ac = AFCLASS(afdict, classno);
	if (0 == ac->length) return true;

	qs = dyn_str_new();
	for (i = 0; i < ac->length; i++)
		dyn_strcat(qs, ac->string[i]);

	/* First pass: only count the wide chars needed. */
	pqs = qs->str;
	memset(&mbs, 0, sizeof(mbs));
	w = mbsrtowcs(NULL, &pqs, 0, &mbs);
	if ((size_t)-1 == w)
	{
		prt_error("Error: Affix dictionary: %s: "
		          "Invalid utf8 character\n", afdict_classname[classno]);
		dyn_str_delete(qs); /* Don't leak the temporary buffer. */
		return false;
	}

	nbytes = sizeof(*wqs) * (w+1);
	wqs = malloc(nbytes);
	if (NULL == wqs)
	{
		prt_error("Error: Affix dictionary: %s: "
		          "Out of memory\n", afdict_classname[classno]);
		dyn_str_delete(qs);
		return false;
	}

	/* Second pass: do the conversion, starting from the initial
	 * shift state again. */
	pqs = qs->str;
	memset(&mbs, 0, sizeof(mbs));
	(void)mbsrtowcs(wqs, &pqs, w, &mbs);
	wqs[w] = L'\0';

	dyn_str_delete(qs);

	/* Store the wide char version at the AFCLASS entry. The old array of
	 * string pointers is not needed any more, so release it instead of
	 * just overwriting the pointer. */
	free((void *)ac->string);
	ac->mem_elems = nbytes; /* bytes here, but we don't care */
	ac->string = (void *)wqs;

	return true;
}
Esempio n. 3
0
/**
 * Finish the setup of the affix table attached to a dictionary.
 *
 * The steps below are done in this order:
 * - Reverse the word list of every affix class.
 * - Validate the INFIXMARK class; if it holds exactly one entry and
 *   neither PRE nor SUF has entries, call get_dict_affixes() on the
 *   dictionary root. If there is no valid INFIXMARK, add an empty one.
 * - Optionally print all the non-empty classes (debug only).
 * - Build and compile the SANEMORPHISM regex into afdict->regex_root.
 * - Sort the UNITS list with cmplen (and, only if
 *   AFDICT_ORDER_NOT_PRESERVED is defined, the MPRE list with revcmplen).
 * - Concatenate the QUOTES and the BULLETS definitions.
 * - Initialize anysplit.
 *
 * @param dict The dictionary whose affix_table is to be set up.
 * @return false if the SANEMORPHISM regex fails to compile or if
 * anysplit_init() fails; true otherwise.
 */
static bool afdict_init(Dictionary dict)
{
	Afdict_class * ac;
	Dictionary afdict = dict->affix_table;

	/* FIXME: read_entry() builds word lists in reverse order (can we
	 * just create the list top-down without breaking anything?). Unless
	 * it is fixed to preserve the order, reverse here the word list for
	 * each affix class. */
	for (ac = afdict->afdict_class;
		  ac < &afdict->afdict_class[ARRAY_SIZE(afdict_classname)]; ac++)
	{
		int i;
		int l = ac->length - 1;
		const char * t;

		/* In-place reversal: swap the two ends, moving inward. */
		for (i = 0;  i < l; i++, l--)
		{
			t = ac->string[i];
			ac->string[i] = ac->string[l];
			ac->string[l] = t;
		}
	}

	/* Create the affix lists */
	ac = AFCLASS(afdict, AFDICT_INFIXMARK);
	/* INFIXMARK is accepted only if it has exactly one entry that is
	 * exactly one byte long. Anything else is reported and discarded. */
	if ((1 < ac->length) || ((1 == ac->length) && (1 != strlen(ac->string[0]))))
	{
		prt_error("Error: afdict_init: Invalid value for class %s in file %s"
		          " (should have been one ASCII punctuation - ignored)\n",
		          afdict_classname[AFDICT_INFIXMARK], afdict->name);
		free((void *)ac->string);
		ac->length = 0;
		ac->mem_elems = 0;
		ac->string = NULL;
	}
	/* XXX For now there is a possibility to use predefined SUF and PRE lists.
	 * So if SUF or PRE are defined, don't extract any of them from the dict. */
	if (1 == ac->length)
	{
		if ((0 == AFCLASS(afdict, AFDICT_PRE)->length) &&
		    (0 == AFCLASS(afdict, AFDICT_SUF)->length))
		{
			/* Scratch buffer handed to get_dict_affixes(); it starts empty. */
			char last_entry[MAX_WORD+1] = "";
			get_dict_affixes(dict, dict->root, ac->string[0][0], last_entry);
		}
	}
	else
	{
		/* No INFIX_MARK - create a dummy one that always mismatches */
		affix_list_add(afdict, &afdict->afdict_class[AFDICT_INFIXMARK], "");
	}

	/* Debug dump of every non-empty class and its entries. */
	if (debug_level(+D_AI))
	{
		size_t l;

		for (ac = afdict->afdict_class;
		     ac < &afdict->afdict_class[ARRAY_SIZE(afdict_classname)]; ac++)
		{
				if (0 == ac->length) continue;
				lgdebug(+0, "Class %s, %zd items:",
				        afdict_classname[ac-afdict->afdict_class], ac->length);
				for (l = 0; l < ac->length; l++)
					lgdebug(0, " '%s'", ac->string[l]);
				lgdebug(0, "\n");
		}
	}
/* NOTE(review): D_AI is not defined in the visible part of the file;
 * presumably it is #define'd before this function - confirm. */
#undef D_AI

	/* Store the SANEMORPHISM regex in the unused (up to now)
	 * regex_root element of the affix dictionary, and precompile it */
	assert(NULL == afdict->regex_root, "SM regex is already assigned");
	ac = AFCLASS(afdict, AFDICT_SANEMORPHISM);
	if (0 != ac->length)
	{
		int rc;

		Regex_node *sm_re = malloc(sizeof(*sm_re));
		dyn_str *rebuf = dyn_str_new();

		/* The regex used to be converted to: ^((original-regex)b)+$
		 * In the initial wordgraph version word boundaries are not supported,
		 * so instead it is converted to: ^(original-regex)+$ */
		/* Only the first SANEMORPHISM entry is used as the original regex. */
#ifdef WORD_BOUNDARIES
		dyn_strcat(rebuf, "^((");
#else
		dyn_strcat(rebuf, "^(");
#endif
		dyn_strcat(rebuf, ac->string[0]);
#ifdef WORD_BOUNDARIES
		dyn_strcat(rebuf, ")b)+$");
#else
		dyn_strcat(rebuf, ")+$");
#endif
		/* The node keeps its own copy of the pattern, so the temporary
		 * buffer can be deleted right away. */
		sm_re->pattern = strdup(rebuf->str);
		dyn_str_delete(rebuf);

		/* The node is attached to the affix dictionary before it is
		 * compiled, so it stays attached even if the compilation fails. */
		afdict->regex_root = sm_re;
		sm_re->name = strdup(afdict_classname[AFDICT_SANEMORPHISM]);
		sm_re->re = NULL;
		sm_re->next = NULL;
		sm_re->neg = false;
		rc = compile_regexs(afdict->regex_root, afdict);
		if (rc) {
			prt_error("Error: afdict_init: Failed to compile "
			          "regex '%s' in file %s, return code %d\n",
			          afdict_classname[AFDICT_SANEMORPHISM], afdict->name, rc);
			return false;
		}
		lgdebug(+5, "%s regex %s\n",
		        afdict_classname[AFDICT_SANEMORPHISM], sm_re->pattern);
	}

	/* sort the UNITS list */
	/* Longer unit names must get split off before shorter ones.
	 * This prevents single-letter splits from screwing things
	 * up. e.g. split 7gram before 7am before 7m
	 */
	ac = AFCLASS(afdict, AFDICT_UNITS);
	if (0 < ac->length)
	{
		qsort(ac->string, ac->length, sizeof(char *), cmplen);
	}

#ifdef AFDICT_ORDER_NOT_PRESERVED
	/* pre-sort the MPRE list */
	ac = AFCLASS(afdict, AFDICT_MPRE);
	if (0 < ac->length)
	{
		/* Longer subwords have priority over shorter ones,
		 * reverse-sort by length.
		 * XXX mprefix_split() for Hebrew depends on that. */
		qsort(ac->string, ac->length, sizeof(char *), revcmplen);
	}
#endif /* AFDICT_ORDER_NOT_PRESERVED */

	/* Join the multiple QUOTES / BULLETS definitions into their
	 * first entry. */
	concat_class(afdict, AFDICT_QUOTES);
	concat_class(afdict, AFDICT_BULLETS);

	if (! anysplit_init(afdict)) return false;

	return true;
}
Esempio n. 4
0
/**
 * Split randomly.
 *
 * The possible splits of the word into up to as->nparts parts are
 * obtained from split(). If their number does not exceed as->altsmax,
 * all of them are tried in order; otherwise random ones are sampled
 * until as->altsmin of them have been accepted by morpheme_match() or
 * all of them have been tried. Every accepted split is then issued
 * through add_alternative(): its first part as a prefix (with the stem
 * subscript appended if the split has more than one part and
 * STEMSUBSCR is defined), and its other parts as suffixes.
 *
 * @param sent The sentence being tokenized.
 * @param word The word to split.
 * @return true on success, and also (without doing anything) if the
 * word starts with the infix mark or ends with the stem subscript.
 * Return false when:
 * - disabled (i.e. when doing regular language processing).
 * - the word is empty, or too long for the split cache.
 * - an error occurs (the behavior then is undefined).
 *   Such an error has not been observed yet.
 */
bool anysplit(Sentence sent, const char *word)
{
	Dictionary afdict = sent->dict->affix_table;
	anysplit_params *as;
	Afdict_class * stemsubscr;
	const char *subscr;     /* The stem subscript, NULL if not defined */
	size_t stemsubscr_len;

	size_t l = strlen(word);
	p_list pl;
	size_t pos;
	int p;
	int sample_point = -1;
	size_t nsplits;
	size_t rndtried = 0;
	size_t rndissued = 0;
	size_t i;
	unsigned int seed = 0;  /* Constant; time-based seeding is not used */
	char *prefix_string;
	char *suffix_string;
	bool use_sampling = true;
	char infix_mark;

	if (NULL == afdict) return false;
	as = afdict->anysplit;

	if ((NULL == as) || (0 == as->nparts)) return false; /* Anysplit disabled */

	if (0 == l)
	{
		prt_error("Warning: anysplit(): word length 0\n");
		return false;
	}

	/* The split cache is a fixed-size array indexed by the word length. */
	if (l >= sizeof(as->scl)/sizeof(as->scl[0]))
	{
		prt_error("Warning: anysplit(): word too long to split "
		          "(length %zu)\n", l);
		return false;
	}

	/* The affix table is accessed only after it is known to exist. */
	infix_mark = INFIX_MARK(afdict);

	/* STEMSUBSCR may be undefined, so its string list must not be
	 * accessed unless it has an entry. */
	stemsubscr = AFCLASS(afdict, AFDICT_STEMSUBSCR);
	subscr = (0 != stemsubscr->length) ? stemsubscr->string[0] : NULL;
	stemsubscr_len = (NULL == subscr) ? 0 : strlen(subscr);

	/* Don't split morphemes again. If INFIXMARK and/or SUBSCRMARK are
	 * not defined in the affix file, then morphemes may get split again unless
	 * restricted by REGPRE/REGMID/REGSUF. */
	if (word[0] == infix_mark) return true;
	if ((NULL != subscr) && (l > stemsubscr_len) &&
	    (0 == strcmp(word+l-stemsubscr_len, subscr)))
		return true;

	/* The prefix holds a proper part of the word plus the stem subscript,
	 * whatever its length is (never less room than word + ".=" + NUL). */
	prefix_string = alloca(l+stemsubscr_len+2+1);
	suffix_string = alloca(l+1);   /* word + NUL */

#if DEBUG_ANYSPLIT
	gw = word;
#endif

	nsplits = split(l, as->nparts, &as->scl[l]);
	if (0 == nsplits)
	{
		prt_error("Warning: anysplit(): split() failed (shouldn't happen)\n");
		return false;
	}

	if (as->altsmax >= nsplits)
	{
		/* Issue everything */
		sample_point = -1;
		use_sampling = false;
	}

	lgdebug(+2, "Start%s sampling: word=%s, nsplits=%zu, maxsplits=%d, "
	        "as->altsmin=%zu, as->altsmax=%zu\n", use_sampling ? "" : " no",
	        word, nsplits, as->nparts, as->altsmin, as->altsmax);

	/* Mark the splits to issue in p_selected[]. */
	while (rndtried < nsplits && (!use_sampling || (rndissued < as->altsmin)))
	{
		if (use_sampling)
		{
			sample_point = rng_uniform(&seed, nsplits);

			if (sample_point < 0) /* Cannot happen with rand_r() */
			{
				prt_error("Error: rng: %s\n", strerror(errno));
				return false;
			}
		}
		else
		{
			sample_point++;
		}

		lgdebug(2, "Sample: %d ", sample_point);
		if (as->scl[l].p_tried[sample_point])
		{
			lgdebug(4, "(repeated)\n");
			continue;
		}
		lgdebug(4, "(new)");
		rndtried++;
		as->scl[l].p_tried[sample_point] = true;
		if (morpheme_match(sent, word, l, &as->scl[l].sp[sample_point*as->nparts]))
		{
			as->scl[l].p_selected[sample_point] = true;
			rndissued++;
		}
		else
		{
			lgdebug(2, "\n");
		}
	}

	lgdebug(2, "Results: word '%s' (length=%zu): %zu/%zu:\n", word, l, rndissued, nsplits);

	/* Issue an alternative for each of the selected splits. */
	for (i = 0; i < nsplits; i++)
	{
		const char **suffixes = NULL;
		int num_suffixes = 0;

		if (!as->scl[l].p_selected[i]) continue;

		/* pl[p] is the end position (in the word) of part number p. */
		pl = &as->scl[l].sp[i*as->nparts];
		pos = 0;
		for (p = 0; p < as->nparts; p++)
		{
			if (pl[0] == (int)l)  /* This is the whole word */
			{
				strncpy(prefix_string, &word[pos], pl[p]-pos);
				prefix_string[pl[p]-pos] = '\0';
			}
			else
			if (0 == pos)   /* The first but not the only morpheme */
			{
				strncpy(prefix_string, &word[pos], pl[p]-pos);
				prefix_string[pl[p]-pos] = '\0';

				if (NULL != subscr)
					strcat(prefix_string, subscr);
			}
			else           /* 2nd and on morphemes */
			{
				strncpy(suffix_string, &word[pos], pl[p]-pos);
				suffix_string[pl[p]-pos] = '\0';
				altappend(sent, &suffixes, suffix_string);
				num_suffixes++;
			}

			pos = pl[p];
			if (pos == l) break;
		}

		/* Here a leading INFIX_MARK is added to the suffixes if needed. */
		add_alternative(sent,
		   0,NULL, 1,(const char **)&prefix_string, num_suffixes,suffixes);
		free(suffixes);
	}

	return true;
}
Esempio n. 5
0
/**
 * Initialize the anysplit parameter and cache structure.
 *
 * Anysplit is set up from these affix classes:
 * - REGPARTS: one value, the maximum number of parts (0 disables anysplit).
 * - REGALTS: two positive values, assigned to altsmin and altsmax.
 * - REGPRE, REGMID, REGSUF: regex lists, built by regbuild() and compiled.
 *
 * If REGPARTS is not defined, anysplit stays disabled and nothing is
 * allocated. Otherwise the structure is attached to afdict->anysplit
 * with nparts set to 0 (i.e. disabled), and the parsed values are
 * stored in it only after all of them have been validated, so a
 * failure never leaves an apparently enabled structure behind.
 *
 * @param afdict The affix dictionary.
 * @return true on success or if anysplit is disabled; false on an
 * invalid definition, a regex compilation failure or memory exhaustion.
 */
bool anysplit_init(Dictionary afdict)
{
	anysplit_params *as;
	size_t i;
	int nparts;
	int altsmin;
	int altsmax;

	Afdict_class *regpre = AFCLASS(afdict, AFDICT_REGPRE);
	Afdict_class *regmid = AFCLASS(afdict, AFDICT_REGMID);
	Afdict_class *regsuf = AFCLASS(afdict, AFDICT_REGSUF);

	Afdict_class *regalts = AFCLASS(afdict, AFDICT_REGALTS);
	Afdict_class *regparts = AFCLASS(afdict, AFDICT_REGPARTS);

	if (0 == regparts->length)
	{
		/* FIXME: Early assignment of verbosity by -v=x argument. */
		if (verbosity > 1)
			prt_error("Warning: File %s: Anysplit disabled (%s not defined)",
		             afdict->name, afdict_classname[AFDICT_REGPARTS]);
		return true;
	}
	if (1 != regparts->length)
	{
		prt_error("Error: File %s: Must have %s defined with one value",
		          afdict->name, afdict_classname[AFDICT_REGPARTS]);
		return false;
	}

	as = malloc(sizeof(*as));
	if (NULL == as)
	{
		prt_error("Error: File %s: Anysplit: Out of memory\n", afdict->name);
		return false;
	}
	for (i = 0; i < NUMELEMS(as->scl); i++) as->scl[i].sp = NULL;

	/* anysplit() checks nparts in order to find if anysplit is enabled,
	 * so it must have a defined value as soon as the structure is
	 * reachable from the dictionary. */
	as->nparts = 0;
	as->altsmin = 0;
	as->altsmax = 0;
	afdict->anysplit = as;

	as->regpre = regbuild(regpre->string, regpre->length, AFDICT_REGPRE);
	as->regmid = regbuild(regmid->string, regmid->length, AFDICT_REGMID);
	as->regsuf = regbuild(regsuf->string, regsuf->length, AFDICT_REGSUF);

	if (compile_regexs(as->regpre, NULL) != 0) return false;
	if (compile_regexs(as->regmid, NULL) != 0) return false;
	if (compile_regexs(as->regsuf, NULL) != 0) return false;

	nparts = atoi(regparts->string[0]);
	if (nparts < 0)
	{
		prt_error("Error: File %s: Value of %s must be a non-negative number",
		          afdict->name, afdict_classname[AFDICT_REGPARTS]);
		return false;
	}
	if (0 == nparts)
	{
		prt_error("Warning: File %s: Anysplit disabled (0: %s)\n",
		          afdict->name, afdict_classname[AFDICT_REGPARTS]);
		return true;
	}

	if (2 != regalts->length)
	{
		prt_error("Error: File %s: Must have %s defined with 2 values",
		          afdict->name, afdict_classname[AFDICT_REGALTS]);
		return false;
	}
	/* Validate as signed values before storing them. */
	altsmin = atoi(regalts->string[0]);
	altsmax = atoi(regalts->string[1]);
	if ((altsmin <= 0) || (altsmax <= 0))
	{
		prt_error("Error: File %s: Value of %s must be 2 positive numbers",
		          afdict->name, afdict_classname[AFDICT_REGALTS]);
		return false;
	}

	/* Everything is valid - enable anysplit. */
	as->nparts = nparts;
	as->altsmin = altsmin;
	as->altsmax = altsmax;

	return true;
}