示例#1
0
/**
 * Append an unmarked (i.e. without INFIXMARK) morpheme to join_buff.
 * join_buff is a zeroed-out buffer which has enough room for morpheme to be
 * added + terminating NUL.
 *
 * The subscript - everything from the last SUBSCRIPT_MARK on - is never
 * copied. What else gets stripped depends on the morpheme type:
 * - MT_PREFIX: a trailing infix mark (right before the subscript), if present.
 * - MT_SUFFIX: a leading infix mark, if present.
 * - MT_MIDDLE: INFIX_MARK_L bytes at each end; the bytes themselves are not
 *   checked to be infix marks. XXX: MT_MIDDLE is not in use yet.
 * - Otherwise the stem is copied as is.
 *
 * Note that MT_PREFIX or MT_SUFFIX can be without an INFIX_MARK, in case
 * INFIX_MARK is not defined.
 *
 * A stem which is too short to contain the mark(s) that would be stripped
 * is copied as is, instead of indexing before the start of wm or passing a
 * wrapped-around (huge) count to strncat().
 *
 * FIXME Combining contracted words is not handled yet, because combining
 * morphemes which have non-LL links to other words is not yet implemented.
 *
 * @param sent      Sentence whose dictionary affix table supplies the mark.
 * @param join_buff Destination; must already hold a NUL-terminated string.
 * @param wm        The morpheme, possibly marked and/or subscripted.
 * @param mt        The type of the morpheme.
 */
static void add_morpheme_unmarked(Sentence sent, char *join_buff,
                                  const char *wm, Morpheme_type mt)
{
	const char infix_mark = INFIX_MARK(sent->dict->affix_table);
	const size_t mark_len = INFIX_MARK_L;
	const char *sm = strrchr(wm, SUBSCRIPT_MARK);
	size_t stem_len;

	/* No subscript: the stem extends up to the terminating NUL. */
	if (NULL == sm) sm = wm + strlen(wm);
	stem_len = (size_t)(sm - wm);

	if ((MT_PREFIX == mt) && (stem_len >= mark_len) &&
	    (infix_mark == wm[stem_len - mark_len]))
	{
		/* Drop the trailing infix mark. */
		strncat(join_buff, wm, stem_len - mark_len);
	}
	else if ((MT_SUFFIX == mt) && (stem_len >= mark_len) &&
	         (infix_mark == wm[0]))
	{
		/* Drop the leading infix mark. */
		strncat(join_buff, wm + mark_len, stem_len - mark_len);
	}
	else if ((MT_MIDDLE == mt) && (stem_len >= 2*mark_len))
	{
		/* Drop one mark length at each end. */
		strncat(join_buff, wm + mark_len, stem_len - 2*mark_len);
	}
	else
	{
		strncat(join_buff, wm, stem_len);
	}
}
示例#2
0
文件: anysplit.c 项目: lagleki/jorne
/**
 * Split randomly.
 *
 * The possible splits of a word of this length are obtained from split().
 * If their number does not exceed as->altsmax, all of them are tried in
 * order. Otherwise splits are picked at random (rng_uniform()) until
 * as->altsmin of them have passed morpheme_match() or all of them have been
 * tried. Each selected split is then issued through add_alternative(), with
 * the first morpheme (plus the stem subscript, if one is defined) as the
 * stem and the remaining morphemes as suffixes.
 *
 * Return true on success. This includes the case in which the word is
 * already a morpheme (it starts with the infix mark or ends with the stem
 * subscript) and hence is not split again.
 * Return false when:
 * - disabled (i.e. when doing regular language processing).
 * - an error occurs (the behavior then is undefined).
 *   Such an error has not been observed yet.
 *
 * @param sent The sentence; its dictionary affix table holds the parameters.
 * @param word The word to split.
 */
bool anysplit(Sentence sent, const char *word)
{
	Dictionary afdict = sent->dict->affix_table;
	anysplit_params *as;
	Afdict_class * stemsubscr;
	size_t stemsubscr_len;

	size_t l = strlen(word);
	p_list pl;
	size_t pos;
	int p;
	int sample_point = -1;
	size_t nsplits;
	size_t rndtried = 0;
	size_t rndissued = 0;
	size_t i;
	unsigned int seed = 0;
	char *prefix_string;
	char *suffix_string;
	bool use_sampling = true;
	char infix_mark;

	if (NULL == afdict) return false;
	/* Only consult the affix table once it is known to exist. */
	infix_mark = INFIX_MARK(afdict);
	as = afdict->anysplit;

	if ((NULL == as) || (0 == as->nparts)) return false; /* Anysplit disabled */

	if (0 == l)
	{
		prt_error("Warning: anysplit(): word length 0\n");
		return false;
	}

	stemsubscr = AFCLASS(afdict, AFDICT_STEMSUBSCR);
	stemsubscr_len = (NULL == stemsubscr->string[0]) ? 0 :
		strlen(stemsubscr->string[0]);

	/* Don't split morphemes again. If INFIXMARK and/or SUBSCRMARK are
	 * not defined in the affix file, then morphemes may get split again unless
	 * restricted by REGPRE/REGMID/REGSUF.
	 * The stem subscript is compared only if it is actually defined, so a
	 * NULL string is never handed to strcmp(). */
	if (word[0] == infix_mark) return true;
	if ((stemsubscr_len > 0) && (l > stemsubscr_len) &&
	    (0 == strcmp(word+l-stemsubscr_len, stemsubscr->string[0])))
		return true;

	/* The stem buffer must have room for the stem subscript, whatever its
	 * length is, since it is appended to the first morpheme below. */
	prefix_string = alloca(l+stemsubscr_len+1); /* word + subscript + NUL */
	suffix_string = alloca(l+1);                /* word + NUL */

	// seed = time(NULL)+(unsigned int)(long)&seed;

#if DEBUG_ANYSPLIT
	gw = word;
#endif

	nsplits = split(l, as->nparts, &as->scl[l]);
	if (0 == nsplits)
	{
		prt_error("Warning: anysplit(): split() failed (shouldn't happen)\n");
		return false;
	}

	if (as->altsmax >= nsplits)
	{
		/* Issue everything: sample_point starts at -1 and is stepped
		 * sequentially through all the splits. */
		sample_point = -1;
		use_sampling = false;
	}

	lgdebug(+2, "Start%s sampling: word=%s, nsplits=%zu, maxsplits=%d, "
	        "as->altsmin=%zu, as->altsmax=%zu\n", use_sampling ? "" : " no",
	        word, nsplits, as->nparts, as->altsmin, as->altsmax);

	while (rndtried < nsplits && (!use_sampling || (rndissued < as->altsmin)))
	{
		if (use_sampling)
		{
			sample_point = rng_uniform(&seed, nsplits);

			if (sample_point < 0) /* Cannot happen with rand_r() */
			{
				prt_error("Error: rng: %s\n", strerror(errno));
				return false;
			}
		}
		else
		{
			sample_point++;
		}

		lgdebug(2, "Sample: %d ", sample_point);
		if (as->scl[l].p_tried[sample_point])
		{
			lgdebug(4, "(repeated)\n");
			continue;
		}
		lgdebug(4, "(new)");
		rndtried++;
		as->scl[l].p_tried[sample_point] = true;
		if (morpheme_match(sent, word, l, &as->scl[l].sp[sample_point*as->nparts]))
		{
			as->scl[l].p_selected[sample_point] = true;
			rndissued++;
		}
		else
		{
			lgdebug(2, "\n");
		}
	}

	lgdebug(2, "Results: word '%s' (length=%zu): %zu/%zu:\n", word, l, rndissued, nsplits);

	/* Issue an alternative for each of the selected splits. */
	for (i = 0; i < nsplits; i++)
	{
		const char **suffixes = NULL;
		int num_suffixes = 0;

		if (!as->scl[l].p_selected[i]) continue;

		/* pl[p] is the end position (in word) of the p'th morpheme. */
		pl = &as->scl[l].sp[i*as->nparts];
		pos = 0;
		for (p = 0; p < as->nparts; p++)
		{
			if (pl[0] == (int)l)  /* This is the whole word */
			{
				strncpy(prefix_string, &word[pos], pl[p]-pos);
				prefix_string[pl[p]-pos] = '\0';
			}
			else
			if (0 == pos)   /* The first but not the only morpheme */
			{
				strncpy(prefix_string, &word[pos], pl[p]-pos);
				prefix_string[pl[p]-pos] = '\0';

				if (0 != stemsubscr->length)
				    strcat(prefix_string, stemsubscr->string[0]);
			}
			else           /* 2nd and on morphemes */
			{
				strncpy(suffix_string, &word[pos], pl[p]-pos);
				suffix_string[pl[p]-pos] = '\0';
				altappend(sent, &suffixes, suffix_string);
				num_suffixes++;
			}

			pos = pl[p];
			if (pos == l) break;
		}

		/* Here a leading INFIX_MARK is added to the suffixes if needed. */
		add_alternative(sent,
		   0,NULL, 1,(const char **)&prefix_string, num_suffixes,suffixes);
		free(suffixes);
	}

	return true;
}