/**
 * Checks every word of the sentence against the dictionary.
 *
 * A word is accepted if it is found by boolean_dictionary_lookup(), or
 * if it falls into one of the special/guessed categories (capitalized,
 * plural capitalized, hyphenated, number, -ing, -s, -ed, -ly) and the
 * dictionary has the corresponding category word defined.
 *
 * All rejected words are collected into a single message, which is
 * reported once through lperror(NOTINDICT, ...).  The sentence itself
 * is not modified.
 *
 * Returns TRUE if every word was accepted, FALSE otherwise.
 */
int sentence_in_dictionary(Sentence sent)
{
	Dictionary dict = sent->dict;
	char temp[1024];   /* error message; only valid once a word was rejected */
	int ok_so_far = TRUE;
	char * s;
	int w;

	for (w = 0; w < sent->length; w++)
	{
		s = sent->word[w].string;

		/* Any one of these is enough to accept the word. */
		if (boolean_dictionary_lookup(dict, s) ||
		    (is_utf8_upper(s) && dict->capitalized_word_defined) ||
		    (is_utf8_upper(s) && is_s_word(s) && dict->pl_capitalized_word_defined) ||
		    (ishyphenated(s) && dict->hyphenated_word_defined) ||
		    (is_number(s) && dict->number_word_defined) ||
		    (is_ing_word(s) && dict->ing_word_defined) ||
		    (is_s_word(s) && dict->s_word_defined) ||
		    (is_ed_word(s) && dict->ed_word_defined) ||
		    (is_ly_word(s) && dict->ly_word_defined))
		{
			continue;
		}

		/* First rejected word: start the message with its heading. */
		if (ok_so_far)
		{
			safe_strcpy(temp, "The following words are not in the dictionary:", sizeof(temp));
			ok_so_far = FALSE;
		}

		/* Append the offending word, quoted. */
		safe_strcat(temp, " \"", sizeof(temp));
		safe_strcat(temp, s, sizeof(temp));
		safe_strcat(temp, "\"", sizeof(temp));
	}

	if (!ok_so_far)
	{
		lperror(NOTINDICT, "\n%s\n", temp);
	}
	return ok_so_far;
}
/**
 * Hash function for a connector, parameterized by a seed value i.
 *
 * Only the connector's label and the leading upper-case characters of
 * its string contribute to the hash.  The intent is that two connectors
 * that match always hash to the same place.
 *
 * is_utf8_upper() is used here as yielding the byte length of the
 * upper-case character at s, or 0 when there is none; only the first
 * byte of each such character is mixed into the hash.
 */
static int conn_hash(Connector * c, int i)
{
	const char * s = c->string;
	int nb;

	/* Mix in the label. */
	i = i + (i<<1) + randtable[(c->label + i) & (RTSIZE-1)];

	/* Mix in each leading upper-case character, stopping at the
	 * first character that is not upper case. */
	while ((nb = is_utf8_upper(s)) != 0)
	{
		i = i + (i<<1) + randtable[(*s + i) & (RTSIZE-1)];
		s += nb;
	}
	return i;
}
/**
 * Returns the result of looking up, in the dictionary, the given word
 * with its first character converted to lower case.
 *
 * Returns FALSE without any lookup if the word does not start with an
 * upper-case character, if that character cannot be decoded, or if its
 * lower-case form does not occupy the same number of bytes (in which
 * case it cannot be substituted in place).
 *
 * The word buffer is modified temporarily: the first character is
 * overwritten with its lower-case form for the duration of the lookup
 * and the original bytes are restored before returning.
 */
static int downcase_is_in_dict(Dictionary dict, char * word)
{
	int i, rc;
	char low[MB_LEN_MAX];
	char save[MB_LEN_MAX];
	wchar_t c;
	int nbl, nbh;

	if (!is_utf8_upper(word)) return FALSE;

	/* Decode the first character.  On failure mbtowc() returns -1 and
	 * does not store anything in c, so c must not be used after that. */
	nbh = mbtowc(&c, word, MB_LEN_MAX);
	if (nbh <= 0)
	{
		fprintf(stderr, "Error: can't downcase multi-byte string!\n");
		return FALSE;
	}

	c = towlower(c);
	nbl = wctomb(low, c);

	/* The in-place swap below only works when both forms have the
	 * same encoded length; this also rejects a wctomb() failure (-1). */
	if (nbh != nbl)
	{
		fprintf(stderr, "Error: can't downcase multi-byte string!\n");
		return FALSE;
	}

	/* Downcase, remembering the original bytes. */
	for (i=0; i<nbl; i++)
	{
		save[i] = word[i];
		word[i] = low[i];
	}

	/* Look it up, then restore old value */
	rc = boolean_dictionary_lookup(dict, word);

	for (i=0; i<nbh; i++)
	{
		word[i] = save[i];
	}

	return rc;
}
/**
 * Appends the string s to the sentence as its next word.
 *
 * An empty string is skipped and treated as success.
 * Returns FALSE, after reporting through lperror(SEPARATE, ...), if
 * strlen(s) exceeds MAX_WORD or if the sentence already holds
 * MAX_SENTENCE words.  Returns TRUE otherwise.
 */
static int issue_sentence_word(Sentence sent, const char * s)
{
	/* Nothing to issue. */
	if (*s == '\0')
	{
		return TRUE;
	}

	if (strlen(s) > MAX_WORD)
	{
		lperror(SEPARATE,
		        ". The word \"%s\" is too long.\n"
		        "A word can have a maximum of %d characters.\n",
		        s, MAX_WORD);
		return FALSE;
	}

	if (sent->length == MAX_SENTENCE)
	{
		lperror(SEPARATE, ". The sentence has too many words.\n");
		return FALSE;
	}

	/* The length check above bounds this copy. */
	strcpy(sent->word[sent->length].string, s);

	/* Remember whether the word started out capitalized: the first
	 * character may be lower-cased later on, and the original case
	 * may still be wanted. */
	sent->word[sent->length].firstupper = is_utf8_upper(s) ? 1 : 0;

	sent->length++;
	return TRUE;
}
/**
 * This is rather esoteric and not terribly important.
 * Returns TRUE if the word consists solely of repetitions of an
 * upper-case character followed by a period, i.e. it looks like
 * /([A-Z]\.)+/ ; returns FALSE at the first deviation.
 *
 * Note that an empty string also yields TRUE, since there is nothing
 * in it that breaks the pattern.
 */
static int is_initials_word(const char * word)
{
	const char * p = word;

	while (*p != '\0')
	{
		/* Byte length of the upper-case character at p, 0 if none. */
		int nb = is_utf8_upper(p);
		if (nb == 0) return FALSE;
		p += nb;

		/* Each upper-case character must be followed by a period. */
		if (*p != '.') return FALSE;
		p++;
	}
	return TRUE;
}
/**
 * Builds the expression lists for all the words of the sentence, and
 * then corrects the case of capitalized words where appropriate.
 *
 * Pass 1 treats every word w alike; the first test that succeeds wins:
 *    - w is in the dictionary: use its expressions.
 *    - w is capitalized and an s-word: use PL_PROPER_WORD.
 *    - w is capitalized: use PROPER_WORD.
 *    - w is a number: use NUMBER_WORD.
 *    - w is hyphenated: use HYPHENATED_WORD.
 *    - w looks like an -ing, -s, -ed or -ly word: guess ING_WORD,
 *      S_WORD, ED_WORD or LY_WORD respectively.
 *    - unknown words are enabled: handle w as an unknown word.
 *   Each special/guessed case applies only if the dictionary defines
 *   the corresponding word.  If nothing applies, an assertion fails.
 *
 * Pass 2 looks at each word w that is the first word after the wall,
 * or follows a ":", or is flagged in post_quote[].  If w is capitalized,
 * let w' be its lower-case version:
 *    - if both w and w' are in the dictionary, append the expressions
 *      of w' to those of w;
 *    - else if w' is in the dictionary, replace the word by w' and use
 *      the expressions of w';
 *    - else leave the expressions alone.
 *
 * Returns FALSE as soon as special_string() or guessed_string() fails,
 * TRUE otherwise.
 */
int build_sentence_expressions(Sentence sent)
{
	int i;
	int first_word;  /* the index of the first word after the wall */
	char *s, *u;
	char temp_word[MAX_WORD+1];
	X_node * e;
	Dictionary dict = sent->dict;

	first_word = dict->left_wall_defined ? 1 : 0;

	/* Pass 1: all words are treated the same
	 * (nothing special for the first word). */
	for (i = 0; i < sent->length; i++)
	{
		s = sent->word[i].string;

		if (boolean_dictionary_lookup(sent->dict, s))
		{
			sent->word[i].x = build_word_expressions(sent, s);
		}
		else if (is_utf8_upper(s) && is_s_word(s) && dict->pl_capitalized_word_defined)
		{
			if (!special_string(sent, i, PL_PROPER_WORD)) return FALSE;
		}
		else if (is_utf8_upper(s) && dict->capitalized_word_defined)
		{
			if (!special_string(sent, i, PROPER_WORD)) return FALSE;
		}
		else if (is_number(s) && dict->number_word_defined)
		{
			/* We know it's a plural number, or 1.  If the string
			 * is 1, we only get here if 1 is not in the dictionary. */
			if (!special_string(sent, i, NUMBER_WORD)) return FALSE;
		}
		else if (ishyphenated(s) && dict->hyphenated_word_defined)
		{
			/* Singular hyphenated. */
			if (!special_string(sent, i, HYPHENATED_WORD)) return FALSE;
		}
		/* XXX
		 * The remaining guesses do some morphology-guessing for words
		 * that are not in the dictionary.  This should be replaced by
		 * a generic morphology-guesser for languages that aren't
		 * English.
		 * XXX */
		else if (is_ing_word(s) && dict->ing_word_defined)
		{
			if (!guessed_string(sent, i, s, ING_WORD)) return FALSE;
		}
		else if (is_s_word(s) && dict->s_word_defined)
		{
			if (!guessed_string(sent, i, s, S_WORD)) return FALSE;
		}
		else if (is_ed_word(s) && dict->ed_word_defined)
		{
			if (!guessed_string(sent, i, s, ED_WORD)) return FALSE;
		}
		else if (is_ly_word(s) && dict->ly_word_defined)
		{
			if (!guessed_string(sent, i, s, LY_WORD)) return FALSE;
		}
		else if (dict->unknown_word_defined && dict->use_unknown_word)
		{
			handle_unknown_word(sent, i, s);
		}
		else
		{
			/* The reason this can be asserted is that the word
			 * should have been looked up already if we get here. */
			assert(FALSE, "I should have found that word.");
		}
	}

	/* Pass 2: under certain cases -- if it's the first word of the
	 * sentence, or if it follows a colon or a quotation mark -- a word
	 * that's capitalized has to be looked up as an uncapitalized word
	 * (as well as a capitalized word). */
	for (i = 0; i < sent->length; i++)
	{
		int wants_lc_lookup =
			(i == first_word) ||
			(i > 0 && strcmp(":", sent->word[i-1].string) == 0) ||
			(post_quote[i] == 1);

		if (!wants_lc_lookup) continue;

		s = sent->word[i].string;
		if (!is_utf8_upper(s)) continue;

		downcase_utf8_str(temp_word, s, MAX_WORD);
		u = string_set_add(temp_word, sent->string_set);

		/* Nothing to do unless the lower-case version is in the
		 * dictionary. */
		if (!boolean_dictionary_lookup(sent->dict, u)) continue;

		if (boolean_dictionary_lookup(sent->dict, s))
		{
			/* The upper-case version is there too, so its
			 * expressions were already put in place by pass 1.
			 * Add on the ones for the lower-case version. */
			e = build_word_expressions(sent, u);
			sent->word[i].x = catenate_X_nodes(sent->word[i].x, e);
		}
		else
		{
			/* The upper-case version isn't there: replace the
			 * upper-case expressions with the lower-case ones. */
			safe_strcpy(s, u, MAX_WORD);
			e = build_word_expressions(sent, s);
			free_X_nodes(sent->word[i].x);
			sent->word[i].x = e;
		}
	}

	return TRUE;
}