Example #1
0
Dict_node * list_whole_dictionary(Dict_node *root, Dict_node *dn) {
    /* Flatten the binary tree rooted at "root" into a singly-linked list
     * threaded through the "right" pointers.  Each list node is a fresh
     * xalloc'ed copy of a tree node; the tree itself is left untouched.
     * The resulting order is: copy of root, then the flattened right
     * subtree, then the flattened left subtree, then the tail "dn".
     * The caller owns the returned nodes (freed one by one elsewhere). */
    Dict_node *copy;

    if (root == NULL) return dn;

    copy = (Dict_node *) xalloc(sizeof(Dict_node));
    *copy = *root;
    copy->right = list_whole_dictionary(root->right,
                                        list_whole_dictionary(root->left, dn));
    return copy;
}
Example #2
0
static int separate_word(Sentence sent, char *w, char *wend, int is_first_word, int quote_found)
{
	/* w points to a string, wend points to the char one after the end.  The
	 * "word" w contains no blanks.  This function splits up the word if
	 * necessary, and calls "issue_sentence_word()" on each of the resulting
	 * parts.  The process is described above.  returns TRUE of OK, FALSE if
	 * too many punctuation marks */
	int i, j, k, l, len;
	int r_strippable=0, l_strippable=0;   /* counts of RPUNC / LPUNC affix entries */
	int s_strippable=0, p_strippable=0;   /* counts of SUF / PRE affix entries */
	int  n_r_stripped, s_stripped;
	int word_is_in_dict, s_ok;
	int r_stripped[MAX_STRIP];  /* these were stripped from the right */
	const char ** strip_left=NULL;
	const char ** strip_right=NULL;
	const char ** prefix=NULL;
	const char ** suffix=NULL;
	char word[MAX_WORD+1];
	char newword[MAX_WORD+1];
	Dict_node * dn, * dn2, * start_dn;
	const char * rpunc_con = "RPUNC";
	const char * lpunc_con = "LPUNC";
	const char * suf_con = "SUF";
	const char * pre_con = "PRE";

	/* Build the four strip/affix string tables from the affix dictionary,
	 * if one is loaded.  Pass 1 counts entries per connector class; pass 2
	 * fills the arrays with the dictionary strings and frees the flattened
	 * list node by node (the strings themselves are owned by the affix
	 * dictionary, only the list nodes are freed).  When there is no affix
	 * table, all four counts stay 0 and every stripping loop below is a
	 * no-op.
	 * NOTE(review): when a count is 0 this calls xalloc(0); presumably
	 * xalloc tolerates a zero size — confirm. */
	if (sent->dict->affix_table!=NULL)
	{
		start_dn = list_whole_dictionary(sent->dict->affix_table->root, NULL);
		for (dn = start_dn; dn != NULL; dn = dn->right)
		{
			if (word_has_connector(dn, rpunc_con, 0)) r_strippable++;
			if (word_has_connector(dn, lpunc_con, 0)) l_strippable++;
			if (word_has_connector(dn, suf_con, 0)) s_strippable++;
			if (word_has_connector(dn, pre_con, 0)) p_strippable++;
	  	}
		strip_right = (const char **) xalloc(r_strippable * sizeof(char *));
		strip_left = (const char **) xalloc(l_strippable * sizeof(char *));
		suffix = (const char **) xalloc(s_strippable * sizeof(char *));
		prefix = (const char **) xalloc(p_strippable * sizeof(char *));

		i=0;
		j=0;
		k=0;
		l=0;
		dn = start_dn;
		while (dn != NULL)
		{
			if(word_has_connector(dn, rpunc_con, 0)) {
				strip_right[i] = dn->string;
				i++;
			}
			if(word_has_connector(dn, lpunc_con, 0)) {
				strip_left[j] = dn->string;
				j++;
			}
			if(word_has_connector(dn, suf_con, 0)) {
				suffix[k] = dn->string;
				k++;
			}
			if(word_has_connector(dn, pre_con, 0)) {
				prefix[l] = dn->string;
				l++;
			}
			/* Free the list node we just consumed; detach it first so the
			 * rest of the list is not freed along with it. */
			dn2 = dn->right;
			dn->right = NULL;
			xfree(dn, sizeof(Dict_node));
			dn = dn2;
		}
	}

	/* Repeatedly strip leading punctuation: each matching strip_left[]
	 * entry is issued as its own sentence word and w advances past it.
	 * The outer loop restarts the scan after every successful strip and
	 * ends once no entry matches the front of w.
	 * NOTE(review): strncmp here is bounded by the affix string length,
	 * not by wend, so it can examine characters at/past wend — presumably
	 * the underlying sentence buffer is NUL-terminated; confirm. */
	for (;;) {
		for (i=0; i<l_strippable; i++) {
			if (strncmp(w, strip_left[i], strlen(strip_left[i])) == 0) {
				if (!issue_sentence_word(sent, strip_left[i])) return FALSE;
				w += strlen(strip_left[i]);
				break;
			}
		}
		if (i==l_strippable) break;
	}

	/* Now w points to the string starting just to the right of
	 * any left-stripped characters.
	 * stripped[] is an array of numbers, indicating the index
	 * numbers (in the strip_right array) of any strings stripped off;
	 * stripped[0] is the number of the first string stripped off, etc.
	 * When it breaks out of this loop, n_stripped will be the number
	 * of strings stripped off.
	 */
	for (n_r_stripped = 0; n_r_stripped < MAX_STRIP; n_r_stripped++) 
	{
		/* Copy the current remainder [w, wend) into word, clamped to
		 * MAX_WORD; the explicit '\0' covers strncpy's non-terminating
		 * behavior at the clamp. */
		strncpy(word, w, MIN(wend-w, MAX_WORD));
		word[MIN(wend-w, MAX_WORD)] = '\0';
		if (wend == w) break;  /* it will work without this */

		/* Stop right-stripping as soon as the remainder is a known word
		 * (or looks like initials, e.g. "A.B."). */
		if (boolean_dictionary_lookup(sent->dict, word) || is_initials_word(word)) break;

		/* This could happen if it's a word after a colon, also! */
		if (is_first_word && downcase_is_in_dict (sent->dict, word)) break;

		for (i=0; i < r_strippable; i++)
		{
			len = strlen(strip_right[i]);

			/* the remaining w is too short for a possible match */
			if ((wend-w) < len) continue;
			if (strncmp(wend-len, strip_right[i], len) == 0) {
				/* Record which entry was stripped; they are re-issued in
				 * reverse order at the end of this function. */
				r_stripped[n_r_stripped] = i;
				wend -= len;
				break;
			}
		}
		if (i == r_strippable) break;
	}

	/* Now we strip off suffixes...w points to the remaining word, 
	 * "wend" to the end of the word. */

	s_stripped = -1;
	strncpy(word, w, MIN(wend-w, MAX_WORD));
	word[MIN(wend-w, MAX_WORD)] = '\0';
	word_is_in_dict=0;

	/* The remainder may already be acceptable as-is; only attempt
	 * suffix/prefix splitting when it is not. */
	if (boolean_dictionary_lookup(sent->dict, word))
		word_is_in_dict = 1;
	else if (is_initials_word(word))
		word_is_in_dict = 1;
	else if (is_first_word && downcase_is_in_dict (sent->dict,word))
		word_is_in_dict = 1;

	if(word_is_in_dict==0)
	{
	  j=0;
	  for (i=0; i < s_strippable+1; i++) {
		s_ok = 0;
		/* Go through once for each suffix; then go through one 
		 * final time for the no-suffix case */
		if(i < s_strippable) {
		  len = strlen(suffix[i]);

		  /* the remaining w is too short for a possible match */
		  if ((wend-w) < len) continue;

		  if (strncmp(wend-len, suffix[i], len) == 0) s_ok=1;
				  }
		else len=0;

		if(s_ok==1 || i==s_strippable)
		{
			/* Candidate stem: the word minus the trial suffix. */
			strncpy(newword, w, MIN((wend-len)-w, MAX_WORD));
			newword[MIN((wend-len)-w, MAX_WORD)] = '\0';

			/* Check if the remainder is in the dictionary;
			 * for the no-suffix case, it won't be */
			if (boolean_dictionary_lookup(sent->dict, newword)) {
				if(verbosity>1) if(i< s_strippable) printf("Splitting word into two: %s-%s\n", newword, suffix[i]);
				s_stripped = i;
				wend -= len;
				strncpy(word, w, MIN(wend-w, MAX_WORD));
				word[MIN(wend-w, MAX_WORD)] = '\0';
				break;
			}

			/* If the remainder isn't in the dictionary, 
			 * try stripping off prefixes */
		  else {
			for (j=0; j<p_strippable; j++) {
			  if (strncmp(w, prefix[j], strlen(prefix[j])) == 0) {
				/* Candidate stem: word minus this prefix and the trial
				 * suffix (len==0 on the no-suffix pass). */
				strncpy(newword, w+strlen(prefix[j]), MIN((wend-len)-(w+strlen(prefix[j])), MAX_WORD));
				newword[MIN((wend-len)-(w+strlen(prefix[j])), MAX_WORD)]='\0';
				if(boolean_dictionary_lookup(sent->dict, newword)) {
				  if(verbosity>1) if(i < s_strippable) printf("Splitting word into three: %s-%s-%s\n", prefix[j], newword, suffix[i]);
				  if (!issue_sentence_word(sent, prefix[j])) return FALSE;
				  if(i < s_strippable) s_stripped = i;
				  wend -= len;
				  w += strlen(prefix[j]);
				  strncpy(word, w, MIN(wend-w, MAX_WORD));
				  word[MIN(wend-w, MAX_WORD)] = '\0';
				  break;
				}
			  }
			}
		  }
		  /* j < p_strippable here means the prefix loop broke early on a
		   * successful split, so stop trying further suffixes.  (When the
		   * prefix loop never ran, j equals 0 from the initialization
		   * above, which only equals p_strippable when there are no
		   * prefixes at all — in that case we keep scanning suffixes.) */
		  if(j!=p_strippable) break;
		}
	  }
	}

	/* word is now what remains after all the stripping has been done */

	/*
	if (n_stripped == MAX_STRIP) {
		lperror(SEPARATE,
				".\n\"%s\" is followed by too many punctuation marks.\n", word);
		return FALSE;
	} */

	/* NOTE(review): post_quote appears to be a file-scope array indexed by
	 * the current word position (sent->length); marks the next issued word
	 * as following a quote.  Bounds are presumably guaranteed by the
	 * caller — confirm. */
	if (quote_found==1) post_quote[sent->length]=1;

	/* Issue the central remainder, then the suffix (if one was split off),
	 * then the right-stripped punctuation in reverse (i.e. original) order. */
	if (!issue_sentence_word(sent, word)) return FALSE;

	if(s_stripped != -1) {
	  if (!issue_sentence_word(sent, suffix[s_stripped])) return FALSE;
	}

	for (i=n_r_stripped-1; i>=0; i--) {

		/* Revert fix r22566, which had a commit message:
		 * "Fix Bug 9756, crash when grammar checking Word document."
		 * This fix added the line:
		 *    if (r_stripped[i] > strlen(*strip_right)) continue;
		 * However, the addition of this line will break
		 * the parsing of "Doogie's mother bit her."
		 *
		 * The fix is incorrect, because a NULL has been inserted into strip_right,
		 * making it very short (length 2). Meanwhile, the offset to the 's 
		 * is 9 chars (greater than 2!)  The string at strip_right[r_stripped[i]]
		 * is pointing at the 's.
		 *
		 * Thus, I'm reverting this fix for now; whatever the problem is,
		 * it needs to be handled in some other way.
		 */
		if (!issue_sentence_word(sent, strip_right[r_stripped[i]])) return FALSE;
	}

	/* Release the affix tables built at the top; sizes must match the
	 * xalloc calls for this pooled allocator. */
	if(sent->dict->affix_table!=NULL) {
	  xfree(strip_right, r_strippable * sizeof(char *));
	  xfree(strip_left, l_strippable * sizeof(char *));
	  xfree(suffix, s_strippable * sizeof(char *));
	  xfree(prefix, p_strippable * sizeof(char *));
	}
	return TRUE;
}