C++ (Cpp) is_utf8_upper 예제들

예제 #1

0

파일 보기

파일: tokenize.c 프로젝트: arv100kri/linkparser

/**
 * Check every word of the sentence against the dictionary.
 *
 * A word is accepted when boolean_dictionary_lookup() finds it, or when
 * it passes one of the word-class tests (capitalized, capitalized
 * plural, hyphenated, number, -ing, -s, -ed, -ly) and the dictionary
 * has the matching *_defined flag set.
 *
 * Every word that is not accepted is appended, in quotes, to a single
 * message that is reported once through lperror(NOTINDICT, ...).
 * The sentence is only read, never modified.
 *
 * Returns TRUE if all words were accepted, FALSE otherwise.
 */
int sentence_in_dictionary(Sentence sent)
{
	Dictionary dict = sent->dict;
	char msg[1024];
	int all_known = TRUE;
	int idx;

	for (idx = 0; idx < sent->length; idx++)
	{
		char * word = sent->word[idx].string;

		/* Each test below accepts the word; they are tried in order
		 * and the first one that succeeds ends the checks. */
		if (boolean_dictionary_lookup(dict, word)) continue;
		if (is_utf8_upper(word) && dict->capitalized_word_defined) continue;
		if (is_utf8_upper(word) && is_s_word(word) && dict->pl_capitalized_word_defined) continue;
		if (ishyphenated(word) && dict->hyphenated_word_defined) continue;
		if (is_number(word) && dict->number_word_defined) continue;
		if (is_ing_word(word) && dict->ing_word_defined) continue;
		if (is_s_word(word) && dict->s_word_defined) continue;
		if (is_ed_word(word) && dict->ed_word_defined) continue;
		if (is_ly_word(word) && dict->ly_word_defined) continue;

		/* Unknown word: start the message on the first miss ... */
		if (all_known)
		{
			safe_strcpy(msg, "The following words are not in the dictionary:", sizeof(msg));
			all_known = FALSE;
		}

		/* ... and append the quoted word to it. */
		safe_strcat(msg, " \"", sizeof(msg));
		safe_strcat(msg, sent->word[idx].string, sizeof(msg));
		safe_strcat(msg, "\"", sizeof(msg));
	}

	if (!all_known)
	{
		lperror(NOTINDICT, "\n%s\n", msg);
	}
	return all_known;
}

예제 #2

0

파일 보기

파일: prune.c 프로젝트: eugeneai/link-grammar

/**
 * Hash a connector, starting from the seed value i.
 *
 * Only the connector's label and the leading upper-case characters of
 * its string take part in the hash.  This ensures that if two
 * connectors match, then they must hash to the same place.
 *
 * The mixing is done in unsigned arithmetic: the hash is expected to
 * wrap around, and wrap-around (as well as shifting a value with the
 * top bit set) is undefined behaviour for signed int.  The low-order
 * bits of the result are the same as those of the signed computation
 * whenever that computation was well defined.
 *
 * Returns the hash value converted back to int; callers are expected
 * to reduce it to their table size.
 */
static int conn_hash(Connector * c, int i)
{
	const char * s = c->string;
	unsigned int h = (unsigned int) i;
	int nb;

	h = h + (h<<1) + randtable[(c->label + h) & (RTSIZE-1)];

	/* is_utf8_upper() is used here as "byte length of the upper-case
	 * character at s, or 0 if there is none"; only the first byte of
	 * each such character is mixed in before skipping over it. */
	for (nb = is_utf8_upper(s); nb != 0; nb = is_utf8_upper(s))
	{
		h = h + (h<<1) + randtable[(*s + h) & (RTSIZE-1)];
		s += nb;
	}
	return (int) h;
}

예제 #3

0

파일 보기

파일: tokenize.c 프로젝트: arv100kri/linkparser

/**
 * Test whether the word, with its first character converted to lower
 * case, is in the dictionary.
 *
 * The first (possibly multi-byte) character of word is temporarily
 * overwritten in place with its lower-case form for the lookup and
 * restored afterwards, so word is unchanged on return.  This is only
 * possible when the upper- and lower-case forms occupy the same number
 * of bytes; otherwise an error is printed to stderr and FALSE is
 * returned.
 *
 * Returns FALSE if the word does not start with an upper-case
 * character (per is_utf8_upper()) or if the character cannot be
 * converted; otherwise returns the result of
 * boolean_dictionary_lookup() on the downcased word.
 */
static int downcase_is_in_dict(Dictionary dict, char * word)
{
	int i, rc;
	char low[MB_LEN_MAX];
	char save[MB_LEN_MAX];
	wchar_t c;
	int nbl, nbh;

	if (!is_utf8_upper(word)) return FALSE;

	/* Decode the first character.  mbtowc() returns -1 for an invalid
	 * sequence and 0 for an empty string; in both cases c holds nothing
	 * usable, so stop before it is read. */
	nbh = mbtowc (&c, word, MB_LEN_MAX);
	if (nbh <= 0)
	{
		fprintf(stderr, "Error: can't downcase multi-byte string!\n");
		return FALSE;
	}

	c = towlower(c);
	nbl = wctomb(low, c);

	/* Also catches wctomb() failure (-1), since nbh is positive here. */
	if (nbh != nbl)
	{
		fprintf(stderr, "Error: can't downcase multi-byte string!\n");
		return FALSE;
	}

	/* Downcase */
	for (i=0; i<nbl; i++) { save[i] = word[i]; word[i] = low[i]; }

	/* Look it up, then restore old value */
	rc = boolean_dictionary_lookup(dict, word);
	for (i=0; i<nbh; i++) { word[i] = save[i]; }

	return rc;
}

예제 #4

0

파일 보기

파일: tokenize.c 프로젝트: arv100kri/linkparser

/**
 * Append the string s to the sentence as its next word.
 *
 * An empty string is silently ignored and reported as success.
 * Returns FALSE, after reporting through lperror(SEPARATE, ...), if
 * the word is longer than MAX_WORD or the sentence already holds
 * MAX_SENTENCE words.  Returns TRUE otherwise.
 */
static int issue_sentence_word(Sentence sent, const char * s)
{
	int slot;

	if (*s == '\0') return TRUE;

	if (strlen(s) > MAX_WORD)
	{
		lperror(SEPARATE,
				". The word \"%s\" is too long.\n"
				"A word can have a maximum of %d characters.\n", s, MAX_WORD);
		return FALSE;
	}

	if (sent->length == MAX_SENTENCE)
	{
		lperror(SEPARATE, ". The sentence has too many words.\n");
		return FALSE;
	}

	slot = sent->length;
	strcpy(sent->word[slot].string, s);

	/* Remember whether the word started with an upper-case character.
	 * The first character may be made lower-case later, but the
	 * original capitalization may still be wanted. */
	sent->word[slot].firstupper = is_utf8_upper(s) ? 1 : 0;

	sent->length = slot + 1;
	return TRUE;
}

예제 #5

0

파일 보기

파일: tokenize.c 프로젝트: arv100kri/linkparser

/**
 * This is rather esoteric and not terribly important.
 *
 * Returns TRUE if the word is made up entirely of pairs of one
 * upper-case character (as recognized by is_utf8_upper()) followed by
 * a period, e.g. "J.R.".  Returns FALSE as soon as a pair is broken.
 * Note that the empty string contains no broken pair and therefore
 * also yields TRUE.
 */
static int is_initials_word(const char * word)
{
	const char * p = word;

	while (*p != '\0')
	{
		/* Byte length of the upper-case character at p, 0 if none. */
		int nb = is_utf8_upper(p);

		if (nb == 0) return FALSE;
		if (p[nb] != '.') return FALSE;

		/* Step over the character and its period. */
		p += nb + 1;
	}
	return TRUE;
}

예제 #6

0

파일 보기

파일: tokenize.c 프로젝트: arv100kri/linkparser

/**
 * Builds the expression lists for all words of the sentence, and then
 * corrects the case of words in positions where capitalization may be
 * incidental (first word, after a colon, after a quote).
 *
 * Pass 1, applied to every word w; the first rule that applies wins:
 *   - w is in the dictionary: use its expressions;
 *   - w is upper case and looks plural: use PL_PROPER_WORD;
 *   - w is upper case: use PROPER_WORD;
 *   - w is a number: use NUMBER_WORD;
 *   - w is hyphenated: use HYPHENATED_WORD;
 *   - w looks like an -ing / -s / -ed / -ly word: guess accordingly;
 *   - unknown words are enabled: treat w as an unknown word.
 * Each class rule only applies when the dictionary defines that class.
 *
 * Pass 2, applied to words in the special positions only:
 *   if w is upper case, let w' be the lower-case version of w;
 *   if both w and w' are in the dictionary, concatenate their
 *   disjuncts; else if only w' is in the dictionary, use the disjuncts
 *   of w' (and replace w by w' in the sentence); else leave the
 *   disjuncts alone.
 *
 * Returns FALSE if special_string() or guessed_string() fails,
 * TRUE otherwise.
 */
int build_sentence_expressions(Sentence sent)
{
	Dictionary dict = sent->dict;
	/* Index of the first word after the wall. */
	int first_word = dict->left_wall_defined ? 1 : 0;
	char temp_word[MAX_WORD+1];
	int i;

	/* Pass 1: treat all words the same (nothing special for the
	 * first word). */
	for (i = 0; i < sent->length; i++)
	{
		char * s = sent->word[i].string;

		if (boolean_dictionary_lookup(sent->dict, s))
		{
			sent->word[i].x = build_word_expressions(sent, s);
			continue;
		}

		if (is_utf8_upper(s) && is_s_word(s) && dict->pl_capitalized_word_defined)
		{
			if (!special_string(sent, i, PL_PROPER_WORD)) return FALSE;
			continue;
		}

		if (is_utf8_upper(s) && dict->capitalized_word_defined)
		{
			if (!special_string(sent, i, PROPER_WORD)) return FALSE;
			continue;
		}

		if (is_number(s) && dict->number_word_defined)
		{
			/* We know it's a plural number, or 1.  If the string
			 * is 1, we'll only be here if 1 is not in the
			 * dictionary. */
			if (!special_string(sent, i, NUMBER_WORD)) return FALSE;
			continue;
		}

		if (ishyphenated(s) && dict->hyphenated_word_defined)
		{
			/* singular hyphenated */
			if (!special_string(sent, i, HYPHENATED_WORD)) return FALSE;
			continue;
		}

		/* XXX
		 * The following does some morphology-guessing for words
		 * that are not in the dictionary. This should be replaced
		 * by a generic morphology-guesser for languages that
		 * aren't English.
		 * XXX
		 */
		if (is_ing_word(s) && dict->ing_word_defined)
		{
			if (!guessed_string(sent, i, s, ING_WORD)) return FALSE;
			continue;
		}

		if (is_s_word(s) && dict->s_word_defined)
		{
			if (!guessed_string(sent, i, s, S_WORD)) return FALSE;
			continue;
		}

		if (is_ed_word(s) && dict->ed_word_defined)
		{
			if (!guessed_string(sent, i, s, ED_WORD)) return FALSE;
			continue;
		}

		if (is_ly_word(s) && dict->ly_word_defined)
		{
			if (!guessed_string(sent, i, s, LY_WORD)) return FALSE;
			continue;
		}

		if (dict->unknown_word_defined && dict->use_unknown_word)
		{
			handle_unknown_word(sent, i, s);
			continue;
		}

		/* This can be asserted because the word should have been
		 * looked up already if we get here. */
		assert(FALSE, "I should have found that word.");
	}

	/* Pass 2: under certain conditions -- the word is the first word
	 * of the sentence, or it follows a colon or a quotation mark --
	 * a capitalized word has to be looked up as an uncapitalized
	 * word (as well as a capitalized word).
	 */
	for (i = 0; i < sent->length; i++)
	{
		char * s = sent->word[i].string;
		char * u;
		X_node * e;
		int special_position =
			(i == first_word) ||
			(i > 0 && strcmp(":", sent->word[i-1].string) == 0) ||
			(post_quote[i] == 1);

		if (!special_position) continue;
		if (!is_utf8_upper(s)) continue;

		downcase_utf8_str(temp_word, s, MAX_WORD);
		u = string_set_add(temp_word, sent->string_set);

		/* Nothing to do unless the lower-case version is in the
		 * dictionary. */
		if (!boolean_dictionary_lookup(sent->dict, u)) continue;

		if (boolean_dictionary_lookup(sent->dict, s))
		{
			/* The upper-case version is there too, so its
			 * disjuncts have been put in place already.  Add
			 * on the disjuncts for the lower-case version. */
			e = build_word_expressions(sent, u);
			sent->word[i].x = catenate_X_nodes(sent->word[i].x, e);
		}
		else
		{
			/* The upper-case version isn't there: replace the
			 * upper-case disjuncts with the lower-case ones. */
			safe_strcpy(s, u, MAX_WORD);
			e = build_word_expressions(sent, s);
			free_X_nodes(sent->word[i].x);
			sent->word[i].x = e;
		}
	}

	return TRUE;
}