static char* SpellCheck(unsigned int i) { // on entry we will have passed over words which are KnownWord (including bases) or isInitialWord (all initials) // wordstarts from 1 ... wordCount is the incoming sentence words (original). We are processing the ith word here. char* word = wordStarts[i]; if (!*word) return NULL; if (!stricmp(word,loginID) || !stricmp(word,computerID)) return word; // dont change his/our name ever size_t len = strlen(word); if (len > 2 && word[len-2] == '\'') return word; // dont do anything with ' words // test for run togetherness like "talkabout fingers" int breakAt = SplitWord(word); if (breakAt > 0)// we found a split, insert 2nd word into word stream { ++wordCount; memmove(wordStarts+i+1,wordStarts+i,sizeof(char*) * (wordCount-i)); // open up a slot for a new word wordStarts[i+1] = reuseAllocation(wordStarts[i+1],wordStarts[i]+breakAt); // set this to the second word (shared from within 1st word) return FindWord(wordStarts[i],breakAt,PRIMARY_CASE_ALLOWED)->word; // 1st word gets replaced, we added valid word after } // now imagine partial runtogetherness, like "talkab out fingers" if (i < wordCount) { char tmp[MAX_WORD_SIZE]; strcpy(tmp,word); strcat(tmp,wordStarts[i+1]); breakAt = SplitWord(tmp); if (breakAt > 0) // replace words with the dual pair { wordStarts[i+1] = reuseAllocation(wordStarts[i+1],StoreWord(tmp+breakAt)->word); // set this to the second word (shared from within 1st word) return FindWord(tmp,breakAt,PRIMARY_CASE_ALLOWED)->word; // 1st word gets replaced, we added valid word after } } // remove any nondigit characters repeated more than once. Dont do this earlier, we want substitutions to have a chance at it first. ammmmmmazing static char word1[MAX_WORD_SIZE]; char* ptr = word-1; char* ptr1 = word1; while (*++ptr) { *ptr1 = *ptr; while (ptr[1] == *ptr1 && ptr[2] == *ptr1 && (*ptr1 < '0' || *ptr1 > '9')) ++ptr; // skip double repeats ++ptr1; } *ptr1 = 0; if (FindCanonical(word1,0,true) && !IsUpperCase(*word1)) return word1; // this is a different form of a canonical word so its ok // now use word spell checker char* d = SpellFix(word,i,PART_OF_SPEECH); return (d) ? d : NULL; }
bool SpellCheckSentence() { WORDP D,E; fixedSpell = false; bool lowercase = false; int language = ENGLISH; char* lang = GetUserVariable((char*)"$cs_language"); if (lang && !stricmp(lang,(char*)"spanish")) language = SPANISH; // check for all uppercase for (int i = FindOOBEnd(1) + 1; i <= wordCount; ++i) // skip start of sentence { char* word = wordStarts[i]; size_t len = strlen(word); for (int j = 0; j < (int)len; ++j) { if (IsLowerCase(word[j])) { lowercase = true; i = j = 1000; } } } if (!lowercase && wordCount > 2) // must have several words in uppercase { for (int i = FindOOBEnd(1); i <= wordCount; ++i) { char* word = wordStarts[i]; MakeLowerCase(word); } } int startWord = FindOOBEnd(1); for (int i = startWord; i <= wordCount; ++i) { char* word = wordStarts[i]; if (!word || !word[1] || *word == '"' ) continue; // illegal or single char or quoted thingy size_t len = strlen(word); // dont spell check uppercase not at start or joined word if (IsUpperCase(word[0]) && (i != startWord || strchr(word,'_')) && tokenControl & NO_PROPER_SPELLCHECK) continue; // dont spell check email or other things with @ or . in them if (strchr(word,'@') || strchr(word,'.') || strchr(word,'$')) continue; // dont spell check names of json objects or arrays if (!strnicmp(word,"ja-",3) || !strnicmp(word,"jo-",3)) continue; char* known = ProbableKnownWord(word); if (known && !strcmp(known,word)) continue; // we know it if (known && strcmp(known,word)) { char* tokens[2]; if (!IsUpperCase(*known)) // revised the word to lower case (avoid to upper case like "fields" to "Fields" { WORDP D = FindWord(known,0,LOWERCASE_LOOKUP); if (D) { tokens[1] = D->word; ReplaceWords(i,1,1,tokens); fixedSpell = true; continue; } } else // is uppercase a concept member? then revise upwards { WORDP D = FindWord(known,0,UPPERCASE_LOOKUP); if (IsConceptMember(D)) { tokens[1] = D->word; ReplaceWords(i,1,1,tokens); fixedSpell = true; continue; } } } char* p = word -1; unsigned char c; char* hyphen = 0; while ((c = *++p) != 0) { ++len; if (c == '-') hyphen = p; // note is hyphenated - use trailing } if (len == 0 || GetTemperatureLetter(word)) continue; // bad ignore utf word or llegal length - also no composite words if (c && c != '@' && c != '.') // illegal word character { if (IsDigit(word[0]) || len == 1){;} // probable numeric? // accidental junk on end of word we do know immedately? else if (i > 1 && !IsAlphaUTF8OrDigit(wordStarts[i][len-1]) ) { WORDP entry,canonical; char word[MAX_WORD_SIZE]; strcpy(word,wordStarts[i]); word[len-1] = 0; uint64 sysflags = 0; uint64 cansysflags = 0; WORDP revise; GetPosData(i,word,revise,entry,canonical,sysflags,cansysflags,true,true); // dont create a non-existent word if (entry && entry->properties & PART_OF_SPEECH) { wordStarts[i] = reuseAllocation(wordStarts[i],entry->word); fixedSpell = true; continue; // not a legal word character, leave it alone } } } // see if we know the other case if (!(tokenControl & (ONLY_LOWERCASE|STRICT_CASING)) || (i == startSentence && !(tokenControl & ONLY_LOWERCASE))) { WORDP E = FindWord(word,0,SECONDARY_CASE_ALLOWED); bool useAlternateCase = false; if (E && E->systemFlags & PATTERN_WORD) useAlternateCase = true; if (E && E->properties & (PART_OF_SPEECH|FOREIGN_WORD)) { // if the word we find is UPPER case, and this might be a lower case noun plural, don't change case. size_t len = strlen(word); if (word[len-1] == 's' ) { WORDP F = FindWord(word,len-1); if (!F || !(F->properties & (PART_OF_SPEECH|FOREIGN_WORD))) useAlternateCase = true; else continue; } else useAlternateCase = true; } else if (E) // does it have a member concept fact { if (IsConceptMember(E)) { useAlternateCase = true; break; } } if (useAlternateCase) { char* tokens[2]; tokens[1] = E->word; ReplaceWords(i,1,1,tokens); fixedSpell = true; continue; } } // merge with next token? char join[MAX_WORD_SIZE * 3]; if (i != wordCount && *wordStarts[i+1] != '"' ) { // direct merge as a single word strcpy(join,word); strcat(join,wordStarts[i+1]); WORDP D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); strcpy(join,word); if (!D || !(D->properties & PART_OF_SPEECH) ) // merge these two, except "going to" or wordnet composites of normal words // merge as a compound word { strcat(join,(char*)"_"); strcat(join,wordStarts[i+1]); D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); } if (D && D->properties & PART_OF_SPEECH && !(D->properties & AUX_VERB)) // merge these two, except "going to" or wordnet composites of normal words { WORDP P1 = FindWord(word,0,LOWERCASE_LOOKUP); WORDP P2 = FindWord(wordStarts[i+1],0,LOWERCASE_LOOKUP); if (!P1 || !P2 || !(P1->properties & PART_OF_SPEECH) || !(P2->properties & PART_OF_SPEECH)) { char* tokens[2]; tokens[1] = D->word; ReplaceWords(i,2,1,tokens); fixedSpell = true; continue; } } } // break apart slashed pair like eat/feed char* slash = strchr(word,'/'); if (slash && slash != word && slash[1]) // break apart word/word { if ((wordCount + 2 ) >= REAL_SENTENCE_LIMIT) continue; // no room *slash = 0; D = StoreWord(word); *slash = '/'; E = StoreWord(slash+1); char* tokens[4]; tokens[1] = D->word; tokens[2] = "/"; tokens[3] = E->word; ReplaceWords(i,1,3,tokens); fixedSpell = true; --i; continue; } // see if hypenated word should be separate or joined (ignore obvious adjective suffix) if (hyphen && !stricmp(hyphen,(char*)"-like")) { StoreWord(word,ADJECTIVE_NORMAL|ADJECTIVE); // accept it as a word continue; } else if (hyphen && (hyphen-word) > 1) { char test[MAX_WORD_SIZE]; char first[MAX_WORD_SIZE]; // test for split *hyphen = 0; strcpy(test,hyphen+1); strcpy(first,word); *hyphen = '-'; WORDP E = FindWord(test,0,LOWERCASE_LOOKUP); WORDP D = FindWord(first,0,LOWERCASE_LOOKUP); if (*first == 0) { wordStarts[i] = AllocateString(wordStarts[i] + 1); // -pieces want to lose the leading hypen (2-pieces) fixedSpell = true; } else if (D && E) // 1st word gets replaced, we added another word after { if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue; // no room char* tokens[3]; tokens[1] = D->word; tokens[2] = E->word; ReplaceWords(i,1,2,tokens); fixedSpell = true; --i; } else if (!stricmp(test,(char*)"old") || !stricmp(test,(char*)"olds")) // break apart 5-year-old { if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue; // no room D = StoreWord(first); E = StoreWord(test); char* tokens[3]; tokens[1] = D->word; tokens[2] = E->word; ReplaceWords(i,1,2,tokens); fixedSpell = true; --i; } else // remove hyphen entirely? { strcpy(test,first); strcat(test,hyphen+1); D = FindWord(test,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); if (D) { wordStarts[i] = D->word; fixedSpell = true; --i; } } continue; // ignore hypenated errors that we couldnt solve, because no one mistypes a hypen } // leave uppercase in first position if not adjusted yet... but check for lower case spell error if (IsUpperCase(word[0]) && tokenControl & NO_PROPER_SPELLCHECK) { char lower[MAX_WORD_SIZE]; MakeLowerCopy(lower,word); WORDP D = FindWord(lower,0,LOWERCASE_LOOKUP); if (!D && i == startWord) { char* okword = SpellFix(lower,i,PART_OF_SPEECH,language); if (okword) { char* tokens[2]; WORDP E = StoreWord(okword); tokens[1] = E->word; ReplaceWords(i,1,1,tokens); fixedSpell = true; } } continue; } if (*word != '\'' && (!FindCanonical(word, i,true) || IsUpperCase(word[0]))) // dont check quoted or findable words unless they are capitalized { word = SpellCheck(i,language); // dont spell check proper names to improper, if word before or after is lower case originally if (word && i != 1 && originalCapState[i] && !IsUpperCase(*word)) { if (!originalCapState[i-1]) return false; else if (i != wordCount && !originalCapState[i+1]) return false; } if (word && !*word) // performed substitution on prior word, restart this one { fixedSpell = true; --i; continue; } if (word) { char* tokens[2]; tokens[1] = word; ReplaceWords(i,1,1,tokens); fixedSpell = true; continue; } } } return fixedSpell; }