bool SpellCheckSentence() { WORDP D,E; fixedSpell = false; bool lowercase = false; int language = ENGLISH; char* lang = GetUserVariable((char*)"$cs_language"); if (lang && !stricmp(lang,(char*)"spanish")) language = SPANISH; // check for all uppercase for (int i = FindOOBEnd(1) + 1; i <= wordCount; ++i) // skip start of sentence { char* word = wordStarts[i]; size_t len = strlen(word); for (int j = 0; j < (int)len; ++j) { if (IsLowerCase(word[j])) { lowercase = true; i = j = 1000; } } } if (!lowercase && wordCount > 2) // must have several words in uppercase { for (int i = FindOOBEnd(1); i <= wordCount; ++i) { char* word = wordStarts[i]; MakeLowerCase(word); } } int startWord = FindOOBEnd(1); for (int i = startWord; i <= wordCount; ++i) { char* word = wordStarts[i]; if (!word || !word[1] || *word == '"' ) continue; // illegal or single char or quoted thingy size_t len = strlen(word); // dont spell check uppercase not at start or joined word if (IsUpperCase(word[0]) && (i != startWord || strchr(word,'_')) && tokenControl & NO_PROPER_SPELLCHECK) continue; // dont spell check email or other things with @ or . in them if (strchr(word,'@') || strchr(word,'.') || strchr(word,'$')) continue; // dont spell check names of json objects or arrays if (!strnicmp(word,"ja-",3) || !strnicmp(word,"jo-",3)) continue; char* known = ProbableKnownWord(word); if (known && !strcmp(known,word)) continue; // we know it if (known && strcmp(known,word)) { char* tokens[2]; if (!IsUpperCase(*known)) // revised the word to lower case (avoid to upper case like "fields" to "Fields" { WORDP D = FindWord(known,0,LOWERCASE_LOOKUP); if (D) { tokens[1] = D->word; ReplaceWords(i,1,1,tokens); fixedSpell = true; continue; } } else // is uppercase a concept member? then revise upwards { WORDP D = FindWord(known,0,UPPERCASE_LOOKUP); if (IsConceptMember(D)) { tokens[1] = D->word; ReplaceWords(i,1,1,tokens); fixedSpell = true; continue; } } } char* p = word -1; unsigned char c; char* hyphen = 0; while ((c = *++p) != 0) { ++len; if (c == '-') hyphen = p; // note is hyphenated - use trailing } if (len == 0 || GetTemperatureLetter(word)) continue; // bad ignore utf word or llegal length - also no composite words if (c && c != '@' && c != '.') // illegal word character { if (IsDigit(word[0]) || len == 1){;} // probable numeric? // accidental junk on end of word we do know immedately? else if (i > 1 && !IsAlphaUTF8OrDigit(wordStarts[i][len-1]) ) { WORDP entry,canonical; char word[MAX_WORD_SIZE]; strcpy(word,wordStarts[i]); word[len-1] = 0; uint64 sysflags = 0; uint64 cansysflags = 0; WORDP revise; GetPosData(i,word,revise,entry,canonical,sysflags,cansysflags,true,true); // dont create a non-existent word if (entry && entry->properties & PART_OF_SPEECH) { wordStarts[i] = reuseAllocation(wordStarts[i],entry->word); fixedSpell = true; continue; // not a legal word character, leave it alone } } } // see if we know the other case if (!(tokenControl & (ONLY_LOWERCASE|STRICT_CASING)) || (i == startSentence && !(tokenControl & ONLY_LOWERCASE))) { WORDP E = FindWord(word,0,SECONDARY_CASE_ALLOWED); bool useAlternateCase = false; if (E && E->systemFlags & PATTERN_WORD) useAlternateCase = true; if (E && E->properties & (PART_OF_SPEECH|FOREIGN_WORD)) { // if the word we find is UPPER case, and this might be a lower case noun plural, don't change case. size_t len = strlen(word); if (word[len-1] == 's' ) { WORDP F = FindWord(word,len-1); if (!F || !(F->properties & (PART_OF_SPEECH|FOREIGN_WORD))) useAlternateCase = true; else continue; } else useAlternateCase = true; } else if (E) // does it have a member concept fact { if (IsConceptMember(E)) { useAlternateCase = true; break; } } if (useAlternateCase) { char* tokens[2]; tokens[1] = E->word; ReplaceWords(i,1,1,tokens); fixedSpell = true; continue; } } // merge with next token? char join[MAX_WORD_SIZE * 3]; if (i != wordCount && *wordStarts[i+1] != '"' ) { // direct merge as a single word strcpy(join,word); strcat(join,wordStarts[i+1]); WORDP D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); strcpy(join,word); if (!D || !(D->properties & PART_OF_SPEECH) ) // merge these two, except "going to" or wordnet composites of normal words // merge as a compound word { strcat(join,(char*)"_"); strcat(join,wordStarts[i+1]); D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); } if (D && D->properties & PART_OF_SPEECH && !(D->properties & AUX_VERB)) // merge these two, except "going to" or wordnet composites of normal words { WORDP P1 = FindWord(word,0,LOWERCASE_LOOKUP); WORDP P2 = FindWord(wordStarts[i+1],0,LOWERCASE_LOOKUP); if (!P1 || !P2 || !(P1->properties & PART_OF_SPEECH) || !(P2->properties & PART_OF_SPEECH)) { char* tokens[2]; tokens[1] = D->word; ReplaceWords(i,2,1,tokens); fixedSpell = true; continue; } } } // break apart slashed pair like eat/feed char* slash = strchr(word,'/'); if (slash && slash != word && slash[1]) // break apart word/word { if ((wordCount + 2 ) >= REAL_SENTENCE_LIMIT) continue; // no room *slash = 0; D = StoreWord(word); *slash = '/'; E = StoreWord(slash+1); char* tokens[4]; tokens[1] = D->word; tokens[2] = "/"; tokens[3] = E->word; ReplaceWords(i,1,3,tokens); fixedSpell = true; --i; continue; } // see if hypenated word should be separate or joined (ignore obvious adjective suffix) if (hyphen && !stricmp(hyphen,(char*)"-like")) { StoreWord(word,ADJECTIVE_NORMAL|ADJECTIVE); // accept it as a word continue; } else if (hyphen && (hyphen-word) > 1) { char test[MAX_WORD_SIZE]; char first[MAX_WORD_SIZE]; // test for split *hyphen = 0; strcpy(test,hyphen+1); strcpy(first,word); *hyphen = '-'; WORDP E = FindWord(test,0,LOWERCASE_LOOKUP); WORDP D = FindWord(first,0,LOWERCASE_LOOKUP); if (*first == 0) { wordStarts[i] = AllocateString(wordStarts[i] + 1); // -pieces want to lose the leading hypen (2-pieces) fixedSpell = true; } else if (D && E) // 1st word gets replaced, we added another word after { if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue; // no room char* tokens[3]; tokens[1] = D->word; tokens[2] = E->word; ReplaceWords(i,1,2,tokens); fixedSpell = true; --i; } else if (!stricmp(test,(char*)"old") || !stricmp(test,(char*)"olds")) // break apart 5-year-old { if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue; // no room D = StoreWord(first); E = StoreWord(test); char* tokens[3]; tokens[1] = D->word; tokens[2] = E->word; ReplaceWords(i,1,2,tokens); fixedSpell = true; --i; } else // remove hyphen entirely? { strcpy(test,first); strcat(test,hyphen+1); D = FindWord(test,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); if (D) { wordStarts[i] = D->word; fixedSpell = true; --i; } } continue; // ignore hypenated errors that we couldnt solve, because no one mistypes a hypen } // leave uppercase in first position if not adjusted yet... but check for lower case spell error if (IsUpperCase(word[0]) && tokenControl & NO_PROPER_SPELLCHECK) { char lower[MAX_WORD_SIZE]; MakeLowerCopy(lower,word); WORDP D = FindWord(lower,0,LOWERCASE_LOOKUP); if (!D && i == startWord) { char* okword = SpellFix(lower,i,PART_OF_SPEECH,language); if (okword) { char* tokens[2]; WORDP E = StoreWord(okword); tokens[1] = E->word; ReplaceWords(i,1,1,tokens); fixedSpell = true; } } continue; } if (*word != '\'' && (!FindCanonical(word, i,true) || IsUpperCase(word[0]))) // dont check quoted or findable words unless they are capitalized { word = SpellCheck(i,language); // dont spell check proper names to improper, if word before or after is lower case originally if (word && i != 1 && originalCapState[i] && !IsUpperCase(*word)) { if (!originalCapState[i-1]) return false; else if (i != wordCount && !originalCapState[i+1]) return false; } if (word && !*word) // performed substitution on prior word, restart this one { fixedSpell = true; --i; continue; } if (word) { char* tokens[2]; tokens[1] = word; ReplaceWords(i,1,1,tokens); fixedSpell = true; continue; } } } return fixedSpell; }
bool SpellCheckSentence() { WORDP D,E; fixedSpell = false; bool lowercase = false; // check for all uppercase (capslock) for (int i = FindOOBEnd(1); i <= wordCount; ++i) // skip start of sentence { char* word = wordStarts[i]; if (!word[1]) continue; // autoconversion of letters to lower case should be ignored (eg A) if (!stricmp(word, "the")) continue; size_t len = strlen(word); for (int j = 0; j < (int)len; ++j) { if (IsLowerCase(word[j])) { lowercase = true; i = j = len+1000; // len might be BIG (oob data) so make sure beyond it) } } } if (!lowercase && wordCount > 2) // must have multiple words all in uppercase { for (int i = FindOOBEnd(1); i <= wordCount; ++i) { char* word = wordStarts[i]; char myword[MAX_WORD_SIZE]; MakeLowerCopy(myword,word); if (strcmp(word, myword)) { char* tokens[2]; tokens[1] = myword; ReplaceWords("caplocWord", i, 1, 1, tokens); originalCapState[i] = false; } } } int startWord = FindOOBEnd(1); for (int i = startWord; i <= wordCount; ++i) { char* word = wordStarts[i]; char* tokens[2]; // change any \ to / char newword[MAX_WORD_SIZE]; bool altered = false; if (strlen(word) < MAX_WORD_SIZE) { strcpy(newword, word); char* at = newword; while ((at = strchr(at,'\\'))) { *at = '/'; altered = true; } if (altered) word = wordStarts[i] = StoreWord(newword, AS_IS)->word; } if (*word == '\'' && !word[1] && i != startWord && IsDigit(*wordStarts[i - 1]) && !stricmp(language, "english")) // fails if not digit bug { tokens[1] = (char*)"foot"; ReplaceWords("' as feet", i, 1, 1, tokens); fixedSpell = true; continue; } if (*word == '"' && !word[1] && i != startWord && IsDigit(*wordStarts[i - 1]) && !stricmp(language, "english")) // fails if not digit bug { tokens[1] = (char*)"inch"; ReplaceWords("' as feet", i, 1, 1, tokens); fixedSpell = true; continue; } if (!word || !word[1] || *word == '"' ) continue; // illegal or single char or quoted thingy size_t len = strlen(word); // dont spell check uppercase not at start or joined word if (IsUpperCase(word[0]) && (i != startWord || strchr(word,'_')) && tokenControl & NO_PROPER_SPELLCHECK) continue; // dont spell check email or other things with @ or . in them if (strchr(word,'@') || strchr(word, '&') || strchr(word,'.') || strchr(word,'$')) continue; // dont spell check names of json objects or arrays if (!strnicmp(word,"ja-",3) || !strnicmp(word,"jo-",3)) continue; // dont spell check web addresses if (!strnicmp(word,"http",4) || !strnicmp(word,"www",3)) continue; // nor fractions if (IsFraction(word)) continue; // fraction? // joined number words like 100dollars char* at = word - 1; while (IsDigit(*++at) || *at == numberPeriod); if (IsDigit(*word) && strlen(at) > 3 && ProbableKnownWord(at)) { char first[MAX_WORD_SIZE]; strncpy(first, word, (at - word)); first[at - word] = 0; char* tokens[3]; tokens[1] = first; tokens[2] = at; ReplaceWords("joined number word", i, 1, 2, tokens); continue; } // nor model numbers if (IsModelNumber(word)) { WORDP X = FindWord(word, 0, UPPERCASE_LOOKUP); if (IsConceptMember(X) && !strcmp(word,X->word)) { char* tokens[2]; tokens[1] = X->word; ReplaceWords("KnownUpperModelNumber", i, 1, 1, tokens); fixedSpell = true; } continue; } char* number; if (GetCurrency((unsigned char*)word, number)) continue; // currency if (!stricmp(word, (char*)"am") && i != startWord && (IsDigit(*wordStarts[i-1]) || IsNumber(wordStarts[i-1]) ==REAL_NUMBER) && !stricmp(language,"english")) // fails if not digit bug { char* tokens[2]; tokens[1] = (char*)"a.m."; ReplaceWords("am as time", i, 1, 1, tokens); fixedSpell = true; continue; } char* known = ProbableKnownWord(word); if (known && !strcmp(known,word)) continue; // we know it if (known && strcmp(known,word)) { WORDP D = FindWord(known); char* tokens[2]; if ((!D || !(D->internalBits & UPPERCASE_HASH)) && !IsUpperCase(*known)) // revised the word to lower case (avoid to upper case like "fields" to "Fields" { WORDP X = FindWord(known,0,LOWERCASE_LOOKUP); if (X) { tokens[1] = X->word; ReplaceWords("KnownWord",i,1,1,tokens); fixedSpell = true; continue; } } else // is uppercase a concept member? then revise upwards { WORDP X = FindWord(known,0,UPPERCASE_LOOKUP); if (IsConceptMember(X) || stricmp(language,"english")) // all german nouns are uppercase { tokens[1] = X->word; ReplaceWords("KnownUpper",i,1,1,tokens); fixedSpell = true; continue; } } } char* p = word -1; unsigned char c; char* hyphen = 0; while ((c = *++p) != 0) { ++len; if (c == '-') hyphen = p; // note is hyphenated - use trailing } if (len == 0 || GetTemperatureLetter(word)) continue; // bad ignore utf word or llegal length - also no composite words if (c && c != '@' && c != '.') // illegal word character { if (IsDigit(word[0]) || len == 1){;} // probable numeric? // accidental junk on end of word we do know immedately? else if (i > 1 && !IsAlphaUTF8OrDigit(wordStarts[i][len-1]) ) { WORDP entry,canonical; char word[MAX_WORD_SIZE]; strcpy(word,wordStarts[i]); word[len-1] = 0; uint64 sysflags = 0; uint64 cansysflags = 0; WORDP revise; GetPosData(i,word,revise,entry,canonical,sysflags,cansysflags,true,true); // dont create a non-existent word if (entry && entry->properties & PART_OF_SPEECH) { wordStarts[i] = entry->word; fixedSpell = true; continue; // not a legal word character, leave it alone } } } // see if we know the other case if (!(tokenControl & (ONLY_LOWERCASE|STRICT_CASING)) || (i == startSentence && !(tokenControl & ONLY_LOWERCASE))) { WORDP E = FindWord(word,0,SECONDARY_CASE_ALLOWED); bool useAlternateCase = false; if (E && E->systemFlags & PATTERN_WORD) useAlternateCase = true; if (E && E->properties & (PART_OF_SPEECH|FOREIGN_WORD)) { // if the word we find is UPPER case, and this might be a lower case noun plural, don't change case. size_t len = strlen(word); if (word[len-1] == 's' ) { WORDP F = FindWord(word,len-1); if (!F || !(F->properties & (PART_OF_SPEECH|FOREIGN_WORD))) useAlternateCase = true; else continue; } else useAlternateCase = true; } else if (E) // does it have a member concept fact { if (IsConceptMember(E)) { useAlternateCase = true; break; } } if (useAlternateCase) { char* tokens[2]; tokens[1] = E->word; ReplaceWords("Alternatecase",i,1,1,tokens); fixedSpell = true; continue; } } // merge with next token? char join[MAX_WORD_SIZE * 3]; if (i != wordCount && *wordStarts[i+1] != '"' ) { // direct merge as a single word strcpy(join,word); strcat(join,wordStarts[i+1]); WORDP D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); strcpy(join,word); // if (!D || !(D->properties & PART_OF_SPEECH) ) // merge these two, except "going to" or wordnet composites of normal words // merge as a compound word // { // strcat(join,(char*)"_"); // strcat(join,wordStarts[i+1]); // D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); // } DONT CREATE _ words, let sequence handle it if (D && D->properties & PART_OF_SPEECH && !(D->properties & AUX_VERB)) // merge these two, except "going to" or wordnet composites of normal words { WORDP P1 = FindWord(word,0,LOWERCASE_LOOKUP); WORDP P2 = FindWord(wordStarts[i+1],0,LOWERCASE_LOOKUP); if (!P1 || !P2 || !(P1->properties & PART_OF_SPEECH) || !(P2->properties & PART_OF_SPEECH)) { char* tokens[2]; tokens[1] = D->word; ReplaceWords("merge",i,2,1,tokens); fixedSpell = true; continue; } } } // break apart slashed pair like eat/feed char* slash = strchr(word,'/'); if (slash && !slash[1] && len < MAX_WORD_SIZE) // remove trailing slash { strcpy(newword, word); newword[slash - word] = 0; word = wordStarts[i] = StoreWord(newword, AS_IS)->word; } if (slash && slash != word && slash[1]) // break apart word/word { if ((wordCount + 2 ) >= REAL_SENTENCE_LIMIT) continue; // no room *slash = 0; D = StoreWord(word); *slash = '/'; E = StoreWord(slash+1); char* tokens[4]; tokens[1] = D->word; tokens[2] = "/"; tokens[3] = E->word; ReplaceWords("Split",i,1,3,tokens); fixedSpell = true; --i; continue; } // see if hypenated word should be separate or joined (ignore obvious adjective suffix) if (hyphen && !stricmp(hyphen,(char*)"-like")) { StoreWord(word,ADJECTIVE_NORMAL|ADJECTIVE); // accept it as a word continue; } else if (hyphen && (hyphen-word) > 1 && !IsPlaceNumber(word)) // dont break up fifty-second { char test[MAX_WORD_SIZE]; char first[MAX_WORD_SIZE]; // test for split *hyphen = 0; strcpy(test,hyphen+1); strcpy(first,word); *hyphen = '-'; WORDP E = FindWord(test,0,LOWERCASE_LOOKUP); WORDP D = FindWord(first,0,LOWERCASE_LOOKUP); if (*first == 0) { wordStarts[i] = AllocateHeap(wordStarts[i] + 1); // -pieces want to lose the leading hypen (2-pieces) fixedSpell = true; } else if (D && E) // 1st word gets replaced, we added another word after { if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue; // no room char* tokens[3]; tokens[1] = D->word; tokens[2] = E->word; ReplaceWords("Pair",i,1,2,tokens); fixedSpell = true; --i; } else if (!stricmp(test,(char*)"old") || !stricmp(test,(char*)"olds")) // break apart 5-year-old { if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue; // no room D = StoreWord(first); E = StoreWord(test); char* tokens[3]; tokens[1] = D->word; tokens[2] = E->word; ReplaceWords("Break old",i,1,2,tokens); fixedSpell = true; --i; } else // remove hyphen entirely? { strcpy(test,first); strcat(test,hyphen+1); D = FindWord(test,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); if (D) { wordStarts[i] = D->word; fixedSpell = true; --i; } } continue; // ignore hypenated errors that we couldnt solve, because no one mistypes a hypen } // see if number in front of unit split like 10mg if (IsDigit(*word)) { char* at = word; while (*++at && IsDigit(*at)) {;} WORDP E = FindWord(at); if (E && strlen(at) > 2 && *at != 'm') // number in front of known word ( but must be longer than 2 char, 5th) but allow mg { char token1[MAX_WORD_SIZE]; int len = at - word; strncpy(token1,word,len); token1[len] = 0; D = StoreWord(token1); char* tokens[4]; tokens[1] = D->word; tokens[2] = E->word; ReplaceWords("Split",i,1,2,tokens); fixedSpell = true; continue; } } // leave uppercase in first position if not adjusted yet... but check for lower case spell error if (IsUpperCase(word[0]) && tokenControl & NO_PROPER_SPELLCHECK) { char lower[MAX_WORD_SIZE]; MakeLowerCopy(lower,word); WORDP D = FindWord(lower,0,LOWERCASE_LOOKUP); if (!D && i == startWord) { char* okword = SpellFix(lower,i,PART_OF_SPEECH); if (okword) { char* tokens[2]; WORDP E = StoreWord(okword); tokens[1] = E->word; ReplaceWords("Spell",i,1,1,tokens); fixedSpell = true; } } continue; } // see if smooshed word pair size_t len1 = strlen(word); int j; if (!IsDigit(*word)) { for (j = 1; j <= len1 - 1; ++j) { WORDP X1 = FindWord(word, j); // any case WORDP X2 = FindWord(word + j, len1 - i); // any case if (X1 && X2 && (X1->word[1] || X1->word[0] == 'i' || X1->word[0] == 'I' || X1->word[0] == 'a')) { char* tokens[3]; tokens[1] = X1->word; tokens[2] = X2->word; ReplaceWords("Split", i, 1, 2, tokens); fixedSpell = true; break; } } if (j != len1) continue; } if (*word != '\'' && (!FindCanonical(word, i,true) || IsUpperCase(word[0]))) // dont check quoted or findable words unless they are capitalized { word = SpellCheck(i); // dont spell check proper names to improper, if word before or after is lower case originally if (word && i != 1 && originalCapState[i] && !IsUpperCase(*word)) { if (!originalCapState[i-1]) continue; else if (i != wordCount && !originalCapState[i+1]) continue; } if (word && !*word) // performed substitution on prior word, restart this one { fixedSpell = true; --i; continue; } if (word) { char* tokens[2]; tokens[1] = word; ReplaceWords("Spell",i,1,1,tokens); fixedSpell = true; continue; } } } return fixedSpell; }