/**
 * Flatten the binary tree rooted at `root` into a singly linked list.
 *
 * Every node of the tree is shallow-copied into a freshly xalloc'ed
 * Dict_node, and the copies are chained together through their `right`
 * pointers. The list is laid out as: the copy of `root`, then the copies
 * of everything in the right subtree, then the copies of everything in
 * the left subtree, and finally `dn` (the list passed in by the caller).
 *
 * The `left` pointer of each copy still holds whatever the original node
 * had; only `right` is rewritten. The original tree is not modified.
 *
 * Each node of the returned list (up to, but not including, `dn`) was
 * obtained from xalloc(sizeof(Dict_node)) and must be released by the
 * caller.
 *
 * @param root  tree to flatten; may be NULL
 * @param dn    list to append after the flattened nodes; may be NULL
 * @return head of the resulting list, or `dn` when `root` is NULL
 */
Dict_node * list_whole_dictionary(Dict_node *root, Dict_node *dn)
{
    Dict_node *copy;
    Dict_node *left_list;

    /* An empty subtree contributes nothing: hand back the tail as is. */
    if (NULL == root)
    {
        return dn;
    }

    copy = (Dict_node *) xalloc(sizeof(Dict_node));
    *copy = *root;

    /* The left subtree is flattened first, directly in front of dn ... */
    left_list = list_whole_dictionary(root->left, dn);

    /* ... and the right subtree is placed in front of that. */
    copy->right = list_whole_dictionary(root->right, left_list);

    return copy;
}
static int separate_word(Sentence sent, char *w, char *wend, int is_first_word, int quote_found) { /* w points to a string, wend points to the char one after the end. The * "word" w contains no blanks. This function splits up the word if * necessary, and calls "issue_sentence_word()" on each of the resulting * parts. The process is described above. returns TRUE of OK, FALSE if * too many punctuation marks */ int i, j, k, l, len; int r_strippable=0, l_strippable=0; int s_strippable=0, p_strippable=0; int n_r_stripped, s_stripped; int word_is_in_dict, s_ok; int r_stripped[MAX_STRIP]; /* these were stripped from the right */ const char ** strip_left=NULL; const char ** strip_right=NULL; const char ** prefix=NULL; const char ** suffix=NULL; char word[MAX_WORD+1]; char newword[MAX_WORD+1]; Dict_node * dn, * dn2, * start_dn; const char * rpunc_con = "RPUNC"; const char * lpunc_con = "LPUNC"; const char * suf_con = "SUF"; const char * pre_con = "PRE"; if (sent->dict->affix_table!=NULL) { start_dn = list_whole_dictionary(sent->dict->affix_table->root, NULL); for (dn = start_dn; dn != NULL; dn = dn->right) { if (word_has_connector(dn, rpunc_con, 0)) r_strippable++; if (word_has_connector(dn, lpunc_con, 0)) l_strippable++; if (word_has_connector(dn, suf_con, 0)) s_strippable++; if (word_has_connector(dn, pre_con, 0)) p_strippable++; } strip_right = (const char **) xalloc(r_strippable * sizeof(char *)); strip_left = (const char **) xalloc(l_strippable * sizeof(char *)); suffix = (const char **) xalloc(s_strippable * sizeof(char *)); prefix = (const char **) xalloc(p_strippable * sizeof(char *)); i=0; j=0; k=0; l=0; dn = start_dn; while (dn != NULL) { if(word_has_connector(dn, rpunc_con, 0)) { strip_right[i] = dn->string; i++; } if(word_has_connector(dn, lpunc_con, 0)) { strip_left[j] = dn->string; j++; } if(word_has_connector(dn, suf_con, 0)) { suffix[k] = dn->string; k++; } if(word_has_connector(dn, pre_con, 0)) { prefix[l] = dn->string; l++; } dn2 = dn->right; dn->right = 
NULL; xfree(dn, sizeof(Dict_node)); dn = dn2; } } for (;;) { for (i=0; i<l_strippable; i++) { if (strncmp(w, strip_left[i], strlen(strip_left[i])) == 0) { if (!issue_sentence_word(sent, strip_left[i])) return FALSE; w += strlen(strip_left[i]); break; } } if (i==l_strippable) break; } /* Now w points to the string starting just to the right of * any left-stripped characters. * stripped[] is an array of numbers, indicating the index * numbers (in the strip_right array) of any strings stripped off; * stripped[0] is the number of the first string stripped off, etc. * When it breaks out of this loop, n_stripped will be the number * of strings stripped off. */ for (n_r_stripped = 0; n_r_stripped < MAX_STRIP; n_r_stripped++) { strncpy(word, w, MIN(wend-w, MAX_WORD)); word[MIN(wend-w, MAX_WORD)] = '\0'; if (wend == w) break; /* it will work without this */ if (boolean_dictionary_lookup(sent->dict, word) || is_initials_word(word)) break; /* This could happen if it's a word after a colon, also! */ if (is_first_word && downcase_is_in_dict (sent->dict, word)) break; for (i=0; i < r_strippable; i++) { len = strlen(strip_right[i]); /* the remaining w is too short for a possible match */ if ((wend-w) < len) continue; if (strncmp(wend-len, strip_right[i], len) == 0) { r_stripped[n_r_stripped] = i; wend -= len; break; } } if (i == r_strippable) break; } /* Now we strip off suffixes...w points to the remaining word, * "wend" to the end of the word. 
*/ s_stripped = -1; strncpy(word, w, MIN(wend-w, MAX_WORD)); word[MIN(wend-w, MAX_WORD)] = '\0'; word_is_in_dict=0; if (boolean_dictionary_lookup(sent->dict, word)) word_is_in_dict = 1; else if (is_initials_word(word)) word_is_in_dict = 1; else if (is_first_word && downcase_is_in_dict (sent->dict,word)) word_is_in_dict = 1; if(word_is_in_dict==0) { j=0; for (i=0; i < s_strippable+1; i++) { s_ok = 0; /* Go through once for each suffix; then go through one * final time for the no-suffix case */ if(i < s_strippable) { len = strlen(suffix[i]); /* the remaining w is too short for a possible match */ if ((wend-w) < len) continue; if (strncmp(wend-len, suffix[i], len) == 0) s_ok=1; } else len=0; if(s_ok==1 || i==s_strippable) { strncpy(newword, w, MIN((wend-len)-w, MAX_WORD)); newword[MIN((wend-len)-w, MAX_WORD)] = '\0'; /* Check if the remainder is in the dictionary; * for the no-suffix case, it won't be */ if (boolean_dictionary_lookup(sent->dict, newword)) { if(verbosity>1) if(i< s_strippable) printf("Splitting word into two: %s-%s\n", newword, suffix[i]); s_stripped = i; wend -= len; strncpy(word, w, MIN(wend-w, MAX_WORD)); word[MIN(wend-w, MAX_WORD)] = '\0'; break; } /* If the remainder isn't in the dictionary, * try stripping off prefixes */ else { for (j=0; j<p_strippable; j++) { if (strncmp(w, prefix[j], strlen(prefix[j])) == 0) { strncpy(newword, w+strlen(prefix[j]), MIN((wend-len)-(w+strlen(prefix[j])), MAX_WORD)); newword[MIN((wend-len)-(w+strlen(prefix[j])), MAX_WORD)]='\0'; if(boolean_dictionary_lookup(sent->dict, newword)) { if(verbosity>1) if(i < s_strippable) printf("Splitting word into three: %s-%s-%s\n", prefix[j], newword, suffix[i]); if (!issue_sentence_word(sent, prefix[j])) return FALSE; if(i < s_strippable) s_stripped = i; wend -= len; w += strlen(prefix[j]); strncpy(word, w, MIN(wend-w, MAX_WORD)); word[MIN(wend-w, MAX_WORD)] = '\0'; break; } } } } if(j!=p_strippable) break; } } } /* word is now what remains after all the stripping has been done */ 
/* if (n_stripped == MAX_STRIP) { lperror(SEPARATE, ".\n\"%s\" is followed by too many punctuation marks.\n", word); return FALSE; } */ if (quote_found==1) post_quote[sent->length]=1; if (!issue_sentence_word(sent, word)) return FALSE; if(s_stripped != -1) { if (!issue_sentence_word(sent, suffix[s_stripped])) return FALSE; } for (i=n_r_stripped-1; i>=0; i--) { /* Revert fix r22566, which had a commit message: * "Fix Bug 9756, crash when grammar checking Word document." * This fix added the line: * if (r_stripped[i] > strlen(*strip_right)) continue; * However, the addition of this line will break * the parsing of "Doogie's mother bit her." * * The fix is incorrect, because a NULL has been inserted into strip_right, * making it very short (length 2). Meanwhile, the offset to the 's * is 9 chars (greater than 2!) The string at strip_right[r_stripped[i]] * is pointing at the 's. * * Thus, I'm reverting this fix for now; whatever the problem is, * it needs to be handled in some other way. */ if (!issue_sentence_word(sent, strip_right[r_stripped[i]])) return FALSE; } if(sent->dict->affix_table!=NULL) { xfree(strip_right, r_strippable * sizeof(char *)); xfree(strip_left, l_strippable * sizeof(char *)); xfree(suffix, s_strippable * sizeof(char *)); xfree(prefix, p_strippable * sizeof(char *)); } return TRUE; }