コード例 #1
0
ファイル: utilities.c プロジェクト: jbicha/link-grammar
/**
 * Upcase the first letter of the word.
 * XXX FIXME This works 'most of the time', but is not technically correct.
 * This is because towlower() and towupper() are locale dependent, and also
 * because the byte-counts might not match up, e.g. German ß and SS.
 * The correct long-term fix is to use ICU or glib g_utf8_strup(), etc.
 */
void upcase_utf8_str(char *to, const char * from, size_t usize)
{
	wchar_t c;
	int i, nbl, nbh;
	char low[MB_LEN_MAX];
	mbstate_t mbs;

	memset(&mbs, 0, sizeof(mbs));
	nbh = mbrtowc (&c, from, MB_CUR_MAX, &mbs);
	if (nbh < 0)
	{
		prt_error("Error: Invalid multi-byte string!");
		return;
	}
	c = towupper(c);
	nbl = wctomb_check(low, c);

	/* Check for error on an in-place copy */
	if ((nbh < nbl) && (to == from))
	{
		/* I'm to lazy to fix this */
		prt_error("Error: can't upcase multi-byte string!");
		return;
	}

	/* Upcase */
	for (i=0; i<nbl; i++) { to[i] = low[i]; }

	if ((nbh == nbl) && (to == from)) return;

	from += nbh;
	to += nbl;
	safe_strcpy(to, from, usize-nbl);
}
コード例 #2
0
ファイル: word-file.c プロジェクト: luyi326/linguo
/**
 * Reads in one word from the file, allocates space for it,
 * and returns it.
 */
static const char * get_a_word(Dictionary dict, FILE * fp)
{
	char word[MAX_WORD+4]; /* allow for 4-byte wide chars */
	const char * s;
	wint_t c;
	mbstate_t mbss;
	int j;

	do {
		c = fgetwc(fp);
	} while ((c != WEOF) && iswspace(c));
	if (c == WEOF) return NULL;

	memset(&mbss, 0, sizeof(mbss));
	for (j=0; (j <= MAX_WORD-1) && (!iswspace(c)) && (c != WEOF);)
	{
		j += wctomb_check(&word[j], c, &mbss);
		c = fgetwc(fp);
	}

	if (j >= MAX_WORD) {
		word[MAX_WORD] = 0x0;
		prt_error("Fatal Error: The dictionary contains a word that "
		          "is too long. The word was: %s", word);
		exit(1);
	}
	word[j] = '\0';
	s = string_set_add(word, dict->string_set);
	return s;
}
コード例 #3
0
ファイル: utilities.c プロジェクト: ampli/link-grammar
/**
 * Downcase the first letter of the word.
 * XXX FIXME This works 'most of the time', but is not technically correct.
 * This is because towlower() and towupper() are locale dependent, and also
 * because the byte-counts might not match up, e.g. German ß and SS.
 * The correct long-term fix is to use ICU or glib g_utf8_strup(), etc.
 */
void downcase_utf8_str(char *to, const char * from, size_t usize, locale_t locale)
{
	wchar_t c;
	int i, nbl, nbh;
	char low[MB_LEN_MAX];
	mbstate_t mbs;

	/* Make sure it doesn't contain garbage in case of an error */
	if (to != from) strcpy(to, from);

	memset(&mbs, 0, sizeof(mbs));
	nbh = mbrtowc (&c, from, MB_CUR_MAX, &mbs);
	if (nbh < 0)
	{
		prt_error("Error: Invalid UTF-8 string!\n");
		return;
	}
	c = towlower_l(c, locale);
	nbl = wctomb_check(low, c);

	/* Check for error on an in-place copy */
	if ((nbh < nbl) && (to == from))
	{
		/* I'm to lazy to fix this */
		prt_error("Error: can't downcase UTF-8 string!\n");
		return;
	}

	/* Downcase */
	for (i=0; i<nbl; i++) { to[i] = low[i]; }

	if ((nbh == nbl) && (to == from)) return;

	from += nbh;
	to += nbl;
	safe_strcpy(to, from, usize-nbl);
}
コード例 #4
0
ファイル: read-dict.c プロジェクト: dyne/AutOrg
/**
 * This reads the next token from the input into token.
 * Return 1 if a character was read, else return 0 (and print a warning).
 */
static int link_advance(Dictionary dict)
{
	wchar_t c;
	int nr, i;
	int quote_mode;

	dict->is_special = FALSE;

	if (dict->already_got_it != '\0')
	{
		dict->is_special = is_special(dict->already_got_it, &dict->mbss);
		if (dict->already_got_it == WEOF) {
			dict->token[0] = '\0';
		} else {
			dict->token[0] = dict->already_got_it; /* specials are one byte */
			dict->token[1] = '\0';
		}
		dict->already_got_it = '\0';
		return 1;
	}

	do { c = get_character(dict, FALSE); } while (iswspace(c));

	quote_mode = FALSE;

	i = 0;
	for (;;)
	{
		if (i > MAX_TOKEN_LENGTH-3) {  /* 3 for multi-byte tokens */
			dict_error(dict, "Token too long");
			return 0;
		}
		if (quote_mode) {
			if (c == '\"') {
				quote_mode = FALSE;
				dict->token[i] = '\0';
				return 1;
			}
			if (iswspace(c)) {
				dict_error(dict, "White space inside of token");
				return 0;
			}

			/* Although we read wide chars, we store UTF8 internally, always. */
			nr = wcrtomb(&dict->token[i], c, &dict->mbss);
			if (nr < 0) {
#ifndef _WIN32
				dict_error2(dict, "Unable to read UTF8 string in current locale",
				         nl_langinfo(CODESET));
				fprintf (stderr, "\tTry setting the locale with \"export LANG=en_US.UTF-8\"\n");
#else
				dict_error(dict, "Unable to read UTF8 string in current locale");
#endif
				return 0;
			}
			i += nr;
		} else {
			if (is_special(c, &dict->mbss))
			{
				if (i == 0)
				{
					dict->token[0] = c;  /* special toks are one char always */
					dict->token[1] = '\0';
					dict->is_special = TRUE;
					return 1;
				}
				dict->token[i] = '\0';
				dict->already_got_it = c;
				return 1;
			}
			if (c == 0x0) {
				if (i == 0) {
					dict->token[0] = '\0';
					return 1;
				}
				dict->token[i] = '\0';
				dict->already_got_it = c;
				return 1;
			}
			if (iswspace(c)) {
				dict->token[i] = '\0';
				return 1;
			}
			if (c == '\"') {
				quote_mode = TRUE;
			} else {
				/* store UTF8 internally, always. */
				nr = wctomb_check(&dict->token[i], c, &dict->mbss);
				if (nr < 0) {
#ifndef _WIN32
					dict_error2(dict, "Unable to read UTF8 string in current locale",
					         nl_langinfo(CODESET));
					fprintf (stderr, "\tTry setting the locale with \"export LANG=en_US.UTF-8\"\n");
#else
					dict_error(dict, "Unable to read UTF8 string in current locale");
#endif
					return 0;
				}
				i += nr;
			}
		}
		c = get_character(dict, quote_mode);
	}
	return 1;
}