static bool FindPartialInSentenceTest(char* test, int start,int originalstart,bool reverse, int& actualStart, int& actualEnd) { if (!test || !*test) return false; if (reverse) { for ( int i = originalstart-1; i >= 1; --i) // can this be found in sentence backwards { char word[MAX_WORD_SIZE]; MakeLowerCopy(word,wordStarts[i]); if (unmarked[i] || !MatchesPattern(word,test)) continue; // if universally unmarked, skip it. Or if they dont match // we have a match of a word actualStart = i; actualEnd = i; return true; } } else { for (int i = start+1; i <= wordCount; ++i) // can this be found in sentence { char word[MAX_WORD_SIZE]; MakeLowerCopy(word,wordStarts[i]); if (unmarked[i] || !MatchesPattern(word,test)) continue; // if universally unmarked, skip it. Or if they dont match // we have a match of a word actualStart = i; actualEnd = i; return true; } } return false; }
void Login(char* caller,char* usee,char* ip) // select the participants { if (*usee) strcpy(computerID,usee); if (!*computerID) ReadComputerID(); // we are defaulting the chatee // for topic access validation computerIDwSpace[0] = ' '; MakeLowerCopy(computerIDwSpace+1,computerID); strcat(computerIDwSpace," "); if ( *caller == '.' && ip) // make unique with IP adddress at end { char name[MAX_WORD_SIZE]; strcpy(name,caller); sprintf(caller,"%s%s",ip,name); } // alternate to .user for non-unique internet systems, just make them guest with ip at end if (!stricmp(caller,"guest") && ip) { char word[MAX_WORD_SIZE]; strcpy(word,ip); char* ptr; while ((ptr = strchr(word,'.'))) *ptr = '_'; // purify for file system sprintf(caller,"guest%s",ip); } // prepare for chat PartialLogin(caller,ip); }
char* SpellFix(char* originalWord,int start,uint64 posflags,int language) { size_t len = strlen(originalWord); if (len >= 100 || len == 0) return NULL; if (IsDigit(*originalWord)) return NULL; // number-based words and numbers must be treated elsewhere char letterLow = GetLowercaseData(*originalWord); char letterHigh = GetUppercaseData(*originalWord); bool hasUnderscore = (strchr(originalWord,'_')) ? true : false; bool isUpper = IsUpperCase(originalWord[0]); if (IsUpperCase(originalWord[1])) isUpper = false; // not if all caps if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"Spell: %s\r\n",originalWord); char word[MAX_WORD_SIZE]; MakeLowerCopy(word,originalWord); // mark positions of the letters and make lower case char base[257]; memset(base,0,257); char* ptr = word - 1; char c; int position = 0; while ((c = *++ptr) && position < 255) { base[position++ + 1] = GetLowercaseData(c); } // Priority is to a word that looks like what the user typed, because the user probably would have noticed if it didnt and changed it. So add/delete has priority over tranform WORDP choices[4000]; WORDP bestGuess[4000]; unsigned int index = 0; unsigned int bestGuessindex = 0; int min = 30; unsigned char realWordLetterCounts[LETTERMAX]; memset(realWordLetterCounts,0,LETTERMAX); for (int i = 0; i < (int)len; ++i) ++realWordLetterCounts[(unsigned char)letterIndexData[(unsigned char)word[i]]]; // compute number of each kind of character uint64 pos = PART_OF_SPEECH; // all pos allowed WORDP D; if (posflags == PART_OF_SPEECH && start < wordCount) // see if we can restrict word based on next word { D = FindWord(wordStarts[start+1],0,PRIMARY_CASE_ALLOWED); uint64 flags = (D) ? D->properties : (-1); // if we dont know the word, it could be anything if (flags & PREPOSITION) pos &= -1 ^ (PREPOSITION|NOUN); // prep cannot be preceeded by noun or prep if (!(flags & (PREPOSITION|VERB|CONJUNCTION|ADVERB)) && flags & DETERMINER) pos &= -1 ^ (DETERMINER|ADJECTIVE|NOUN|ADJECTIVE_NUMBER|NOUN_NUMBER); // determiner cannot be preceeded by noun determiner adjective if (!(flags & (PREPOSITION|VERB|CONJUNCTION|DETERMINER|ADVERB)) && flags & ADJECTIVE) pos &= -1 ^ (NOUN); if (!(flags & (PREPOSITION|NOUN|CONJUNCTION|DETERMINER|ADVERB|ADJECTIVE)) && flags & VERB) pos &= -1 ^ (VERB); // we know all helper verbs we might be if (D && *D->word == '\'' && D->word[1] == 's' ) pos &= NOUN; // we can only be a noun if possessive - contracted 's should already be removed by now } if (posflags == PART_OF_SPEECH && start > 1) { D = FindWord(wordStarts[start-1],0,PRIMARY_CASE_ALLOWED); uint64 flags = (D) ? D->properties : (-1); // if we dont know the word, it could be anything if (flags & DETERMINER) pos &= -1 ^ (VERB|CONJUNCTION|PREPOSITION|DETERMINER); } posflags &= pos; // if pos types are known and restricted and dont match static int range[] = {0,-1,1,-2,2}; for (unsigned int i = 0; i < 5; ++i) { if (language == ENGLISH && i >= 3) break; // only allow +-2 for spanish MEANING offset = lengthLists[len + range[i]]; if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"\r\n Begin offset %d\r\n",i); while (offset) { D = Meaning2Word(offset); offset = D->spellNode; if (PART_OF_SPEECH == posflags && D->systemFlags & PATTERN_WORD){;} // legal generic match else if (!(D->properties & posflags)) continue; // wrong kind of word if (*D->word != letterLow && *D->word != letterHigh && language == ENGLISH) continue; // we assume no one misspells starting letter char* under = strchr(D->word,'_'); // SPELLING lists have no underscore or space words in them if (hasUnderscore && !under) continue; // require keep any underscore if (!hasUnderscore && under) continue; // require not have any underscore if (isUpper && !(D->internalBits & UPPERCASE_HASH) && start != 1) continue; // dont spell check to lower a word in upper int val = EditDistance(D, D->length, len, base+1,min,realWordLetterCounts,language); if (val <= min) // as good or better { if (val < min) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Better: %s against %s value: %d\r\n",D->word,originalWord,val); index = 0; min = val; } else if ( val == min && trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Equal: %s against %s value: %d\r\n",D->word,originalWord,val); if (!(D->internalBits & BEEN_HERE)) { choices[index++] = D; if (index > 3998) break; AddInternalFlag(D,BEEN_HERE); } } } } // try endings ing, s, etc if (start) // no stem spell if COMING from a stem spell attempt (start == 0) { char* stem = StemSpell(word,start); if (stem) { WORDP D = FindWord(stem,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); if (D) { for (unsigned int j = 0; j < index; ++j) { if (choices[j] == D) // already in our list { D = NULL; break; } } } if (D) choices[index++] = D; } } if (!index) return NULL; // take our guesses, and pick the most common (earliest learned or most frequently used) word uint64 commonmin = 0; bestGuess[0] = NULL; for (unsigned int j = 0; j < index; ++j) RemoveInternalFlag(choices[j],BEEN_HERE); if (index == 1) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Single best spell: %s\r\n",choices[0]->word); return choices[0]->word; // pick the one } for (unsigned int j = 0; j < index; ++j) { uint64 common = choices[j]->systemFlags & COMMONNESS; if (common < commonmin) continue; if (choices[j]->internalBits & UPPERCASE_HASH && index > 1) continue; // ignore proper names for spell better when some other choice exists if (common > commonmin) { commonmin = common; bestGuessindex = 0; } bestGuess[bestGuessindex++] = choices[j]; } if (bestGuessindex) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Pick spell: %s\r\n",bestGuess[0]->word); return bestGuess[0]->word; } return NULL; }
static int EditDistance(WORDP D, unsigned int size, unsigned int inputLen, char* inputSet, int min, unsigned char realWordLetterCounts[LETTERMAX], int language) {// dictword has no underscores, inputSet is already lower case char dictw[MAX_WORD_SIZE]; MakeLowerCopy(dictw,D->word); char* dictinfo = dictw; char* dictstart = dictinfo; char* inputstart = inputSet; int val = 0; // a difference in length will manifest as a difference in letter count // how many changes (change a letter, transpose adj letters, insert letter, drop letter) if (size != inputLen) { val += (size < inputLen) ? 5 : 2; // real word is shorter than what they typed, not so likely as longer if (size < 7) val += 3; } if (val > min) return 60; // fast abort // match off how many letter counts are correct between the two, need to be close enough to bother with unsigned char dictWordLetterSet[LETTERMAX]; memset(dictWordLetterSet,0,LETTERMAX); for (unsigned int i = 0; i < size; ++i) { int index = letterIndexData[(unsigned char)dictinfo[i]]; ++dictWordLetterSet[index]; // computer number of each kind of letter } unsigned int count = 0; for (unsigned int i = 0; i < LETTERMAX; ++i) // count how many letters are the same in both words { if (dictWordLetterSet[i]) // revised word has these many { int diff = dictWordLetterSet[i] - realWordLetterCounts[i]; // how many of ours does real have? if (diff < 0) count += dictWordLetterSet[i]; // he has more than we have, he gets credit for ours he does have else count += dictWordLetterSet[i] - diff; // he has <= what we have, count them } } unsigned int countVariation = size - ((size > 7) ? 3 : 2); // since size >= 2, this is always >= 0 if (count < countVariation && language == ENGLISH) return 60; // need most letters be in common if (count == size && language == ENGLISH) // same letters (though he may have excess) -- how many transposes { unsigned int bad = 0; for (unsigned int i = 0; i < size; ++i) if (dictinfo[i] != inputSet[i]) ++bad; if (size != inputLen){;} else if (bad <= 2) return val + 3; // 1 transpose else if (bad <= 4) return val + 9; // 2 transpose else return val + 38; // many transpose } // now look at specific letter errors char* dictend = dictinfo+size; char* inputend = inputSet+inputLen; count = 0; while (ALWAYS) { ++count; if (*dictinfo == *inputSet) // match { if (inputSet == inputend && dictinfo == dictend) break; // ended ++inputSet; ++dictinfo; continue; } if (inputSet == inputend || dictinfo == dictend) // one ending, other has to catch up by adding a letter { if (inputSet == inputend) ++dictinfo; else ++inputSet; val += 6; continue; } // letter match failed // can we change an accented letter forward to another similar letter without accent if (*dictinfo == 0xc3) { bool accent = false; if (*inputSet == 'a' && (dictinfo[1] >= 0xa0 && dictinfo[1] <= 0xa5 )) accent = true; else if (*inputSet == 'e' && (dictinfo[1] >= 0xa8 && dictinfo[1] <= 0xab )) accent = true; else if (*inputSet == 'i' && (dictinfo[1] >= 0xac && dictinfo[1] <= 0xaf )) accent = true; else if (*inputSet == 'o' && (dictinfo[1] >= 0xb2 && dictinfo[1] <= 0xb6 )) accent = true; else if (*inputSet == 'u' && (dictinfo[1] >= 0xb9 && dictinfo[1] <= 0xbc )) accent = true; if (accent) { ++dictinfo; ++dictinfo; // double unicode ++inputSet; continue; } } // first and last letter errors are rare, more likely to get them right if (dictinfo == dictstart && *dictstart != *inputstart && language == ENGLISH) val += 6; // costs a lot to change first letter, odds are he types that right if (dictinfo[1] == 0 && inputSet[1] == 0 && *dictinfo != *inputSet) val += 6; // costs more to change last letter, odds are he types that right or sees its wrong // try to resynch series and reduce cost of a transposition of adj letters if (*dictinfo == inputSet[1] && dictinfo[1] == *inputSet) // transpose { if (dictinfo[2] == inputSet[2]) // they match after, so transpose is pretty likely { val += 4; if (dictinfo[2]) // not at end, skip the letter in synch for speed { ++dictinfo; ++inputSet; } } else val += 8; // transposed maybe good, assume it is dictinfo += 2; inputSet += 2; } else if (*dictinfo == inputSet[1]) // current dict letter matches matches his next input letter, so maybe his input inserted a char here and need to delete it { char* prior = inputSet-1; // potential extraneous letter if (*prior == *inputSet) val += 5; // low cost for dropping an excess repeated letter - start of word is prepadded with 0 for prior char else if (*inputSet == '-') val += 3; // very low cost for removing a hypen else if (inputSet+1 == inputend && *inputSet == 's') val += 30; // losing a trailing s is almost not acceptable else val += 9; // high cost removing an extra letter, but not as much as having to change it ++inputSet; } else if (dictinfo[1] == *inputSet) // next dict leter matches current input letter, so maybe his input deleted a char here and needs to insert it { char* prior = (dictinfo == dictstart) ? (char*)" " : (dictinfo-1); if (*dictinfo == *prior && !IsVowel(*dictinfo )) val += 5; else if (IsVowel(*dictinfo )) val += 1; // low cost for missing a vowel ( already charged for short input), might be a texting abbreviation else val += 9; // high cost for deleting a character, but not as much as changing it ++dictinfo; } else // this has no valid neighbors. alter it to be the correct, but charge for multiple occurences { if (count == 1 && *dictinfo != *inputSet && language == ENGLISH) val += 30; //costs a lot to change the first letter, odds are he types that right or sees its wrong // 2 in a row are bad, check for a substituted vowel sound bool swap = false; int oldval = val; if (dictinfo[1] != inputSet[1]) // do multicharacter transformations { if (language == SPANISH) // ch-x | qu-k | c-k | do-o | b-v | bue-w | vue-w | z-s | s-c | h- | y-i | y-ll | m-n 1st is valid { if (*inputSet == 'c' && *dictinfo == 'k') { dictinfo += 1; inputSet += 1; continue; } if (*inputSet == 'b' && *dictinfo == 'v') { dictinfo += 1; inputSet += 1; continue; } if (*inputSet == 'v' && *dictinfo == 'b') { dictinfo += 1; inputSet += 1; continue; } if (*inputSet == 'z' && *dictinfo == 's') { dictinfo += 1; inputSet += 1; continue; } if (*inputSet == 's' && *dictinfo == 'c') { dictinfo += 1; inputSet += 1; continue; } if (*inputSet == 'y' && *dictinfo == 'i') { dictinfo += 1; inputSet += 1; continue; } if (*inputSet == 'm' && *dictinfo == 'n') { dictinfo += 1; inputSet += 1; continue; } if (*inputSet == 'n' && *dictinfo == 'm') { dictinfo += 1; inputSet += 1; continue; } if (*dictinfo == 'h') { dictinfo += 1; continue; } if (*inputSet == 'x' && !strncmp(dictinfo,(char*)"ch",2)) { dictinfo += 2; inputSet += 1; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; } if (*inputSet == 'k' && !strncmp(dictinfo,(char*)"qu",2)) { dictinfo += 2; inputSet += 1; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; } if (*inputSet == 'o' && !strncmp(dictinfo,(char*)"do",2) && !inputSet[1] && !dictinfo[2]) // at end { dictinfo += 2; inputSet += 1; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; } if (*inputSet == 'w' && !strncmp(dictinfo,(char*)"bue",3)) { dictinfo += 3; inputSet += 1; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; } if (*inputSet == 'w' && !strncmp(dictinfo,(char*)"vue",3)) { dictinfo += 3; inputSet += 1; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; } if (!strncmp(inputSet,(char*)"ll",2) && *dictinfo == 'y') { inputSet += 2; dictinfo += 1; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; } if (*inputSet == 'y' && *dictinfo == 'l' && dictinfo[1] == 'l') { inputSet += 1; dictinfo += 2; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; } } if (*inputSet == 't' && !strncmp(dictinfo,(char*)"ght",3)) { dictinfo += 3; inputSet += 1; val += 5; } else if (!strncmp(inputSet,(char*)"ci",2) && !strncmp(dictinfo,(char*)"cki",3)) { dictinfo += 3; inputSet += 2; val += 5; } else if (*(dictinfo-1) == 'a' && !strcmp(dictinfo,(char*)"ir") && !strcmp(inputSet,(char*)"re")) // prepair prepare as terminal sound { dictinfo += 2; inputSet += 2; val += 3; } else if (!strncmp(inputSet,(char*)"ous",3) && !strncmp(dictinfo,(char*)"eous",4)) { dictinfo += 4; inputSet += 3; val += 5; } else if (!strncmp(inputSet,(char*)"of",2) && !strncmp(dictinfo,(char*)"oph",3)) { dictinfo += 3; inputSet += 2; val += 5; } else if (*dictinfo == 'x' && !strncmp(inputSet,(char*)"cks",3)) { dictinfo += 1; inputSet += 3; val += 5; } else if (*inputSet == 'k' && !strncmp(dictinfo,(char*)"qu",2)) { dictinfo += 2; inputSet += 1; val += 5; } if (oldval != val){;} // swallowed a multiple letter sound change else if (!strncmp(dictinfo,(char*)"able",4) && !strncmp(inputSet,(char*)"ible",4)) swap = true; else if (!strncmp(dictinfo,(char*)"ible",4) && !strncmp(inputSet,(char*)"able",4)) swap = true; else if (*dictinfo == 'a' && dictinfo[1] == 'y' && *inputSet == 'e' && inputSet[1] == 'i') swap = true; else if (*dictinfo == 'e' && dictinfo[1] == 'a' && *inputSet == 'e' && inputSet[1] == 'e') swap = true; else if (*dictinfo == 'e' && dictinfo[1] == 'e' && *inputSet == 'e' && inputSet[1] == 'a') swap = true; else if (*dictinfo == 'e' && dictinfo[1] == 'e' && *inputSet == 'i' && inputSet[1] == 'e') swap = true; else if (*dictinfo == 'e' && dictinfo[1] == 'i' && *inputSet == 'a' && inputSet[1] == 'y') swap = true; else if (*dictinfo == 'e' && dictinfo[1] == 'u' && *inputSet == 'o' && inputSet[1] == 'o') swap = true; else if (*dictinfo == 'e' && dictinfo[1] == 'u' && *inputSet == 'o' && inputSet[1] == 'u') swap = true; else if (*dictinfo == 'i' && dictinfo[1] == 'e' && *inputSet == 'e' && inputSet[1] == 'e') swap = true; else if (*dictinfo == 'o' && dictinfo[1] == 'o' && *inputSet == 'e' && inputSet[1] == 'u') swap = true; else if (*dictinfo == 'o' && dictinfo[1] == 'o' && *inputSet == 'o' && inputSet[1] == 'u') swap = true; else if (*dictinfo == 'o' && dictinfo[1] == 'o' && *inputSet == 'u' && inputSet[1] == 'i') swap = true; else if (*dictinfo == 'o' && dictinfo[1] == 'u' && *inputSet == 'e' && inputSet[1] == 'u') swap = true; else if (*dictinfo == 'o' && dictinfo[1] == 'u' && *inputSet == 'o' && inputSet[1] == 'o') swap = true; else if (*dictinfo == 'u' && dictinfo[1] == 'i' && *inputSet == 'o' && inputSet[1] == 'o') swap = true; if (swap) { dictinfo += 2; inputSet += 2; val += 5; } } // can we change a letter to another similar letter if (oldval == val) { bool convert = false; if (*dictinfo == 'i' && *inputSet== 'y' && count > 1) convert = true;// but not as first letter else if ((*dictinfo == 's' && *inputSet == 'z') || (*dictinfo == 'z' && *inputSet == 's')) convert = true; else if (*dictinfo == 'y' && *inputSet == 'i' && count > 1) convert = true; // but not as first letter else if (*dictinfo == '/' && *inputSet == '-') convert = true; else if (inputSet+1 == inputend && *inputSet == 's') val += 30; // changing a trailing s is almost not acceptable if (convert) val += 5; // low cost for exchange of similar letter, but dont do it often else val += 12; // changing a letter is expensive, since it destroys the visual image ++dictinfo; ++inputSet; } } if (val > min) return val; // too costly, ignore it } return val; }
bool SpellCheckSentence() { WORDP D,E; fixedSpell = false; bool lowercase = false; int language = ENGLISH; char* lang = GetUserVariable((char*)"$cs_language"); if (lang && !stricmp(lang,(char*)"spanish")) language = SPANISH; // check for all uppercase for (int i = FindOOBEnd(1) + 1; i <= wordCount; ++i) // skip start of sentence { char* word = wordStarts[i]; size_t len = strlen(word); for (int j = 0; j < (int)len; ++j) { if (IsLowerCase(word[j])) { lowercase = true; i = j = 1000; } } } if (!lowercase && wordCount > 2) // must have several words in uppercase { for (int i = FindOOBEnd(1); i <= wordCount; ++i) { char* word = wordStarts[i]; MakeLowerCase(word); } } int startWord = FindOOBEnd(1); for (int i = startWord; i <= wordCount; ++i) { char* word = wordStarts[i]; if (!word || !word[1] || *word == '"' ) continue; // illegal or single char or quoted thingy size_t len = strlen(word); // dont spell check uppercase not at start or joined word if (IsUpperCase(word[0]) && (i != startWord || strchr(word,'_')) && tokenControl & NO_PROPER_SPELLCHECK) continue; // dont spell check email or other things with @ or . in them if (strchr(word,'@') || strchr(word,'.') || strchr(word,'$')) continue; // dont spell check names of json objects or arrays if (!strnicmp(word,"ja-",3) || !strnicmp(word,"jo-",3)) continue; char* known = ProbableKnownWord(word); if (known && !strcmp(known,word)) continue; // we know it if (known && strcmp(known,word)) { char* tokens[2]; if (!IsUpperCase(*known)) // revised the word to lower case (avoid to upper case like "fields" to "Fields" { WORDP D = FindWord(known,0,LOWERCASE_LOOKUP); if (D) { tokens[1] = D->word; ReplaceWords(i,1,1,tokens); fixedSpell = true; continue; } } else // is uppercase a concept member? then revise upwards { WORDP D = FindWord(known,0,UPPERCASE_LOOKUP); if (IsConceptMember(D)) { tokens[1] = D->word; ReplaceWords(i,1,1,tokens); fixedSpell = true; continue; } } } char* p = word -1; unsigned char c; char* hyphen = 0; while ((c = *++p) != 0) { ++len; if (c == '-') hyphen = p; // note is hyphenated - use trailing } if (len == 0 || GetTemperatureLetter(word)) continue; // bad ignore utf word or llegal length - also no composite words if (c && c != '@' && c != '.') // illegal word character { if (IsDigit(word[0]) || len == 1){;} // probable numeric? // accidental junk on end of word we do know immedately? else if (i > 1 && !IsAlphaUTF8OrDigit(wordStarts[i][len-1]) ) { WORDP entry,canonical; char word[MAX_WORD_SIZE]; strcpy(word,wordStarts[i]); word[len-1] = 0; uint64 sysflags = 0; uint64 cansysflags = 0; WORDP revise; GetPosData(i,word,revise,entry,canonical,sysflags,cansysflags,true,true); // dont create a non-existent word if (entry && entry->properties & PART_OF_SPEECH) { wordStarts[i] = reuseAllocation(wordStarts[i],entry->word); fixedSpell = true; continue; // not a legal word character, leave it alone } } } // see if we know the other case if (!(tokenControl & (ONLY_LOWERCASE|STRICT_CASING)) || (i == startSentence && !(tokenControl & ONLY_LOWERCASE))) { WORDP E = FindWord(word,0,SECONDARY_CASE_ALLOWED); bool useAlternateCase = false; if (E && E->systemFlags & PATTERN_WORD) useAlternateCase = true; if (E && E->properties & (PART_OF_SPEECH|FOREIGN_WORD)) { // if the word we find is UPPER case, and this might be a lower case noun plural, don't change case. size_t len = strlen(word); if (word[len-1] == 's' ) { WORDP F = FindWord(word,len-1); if (!F || !(F->properties & (PART_OF_SPEECH|FOREIGN_WORD))) useAlternateCase = true; else continue; } else useAlternateCase = true; } else if (E) // does it have a member concept fact { if (IsConceptMember(E)) { useAlternateCase = true; break; } } if (useAlternateCase) { char* tokens[2]; tokens[1] = E->word; ReplaceWords(i,1,1,tokens); fixedSpell = true; continue; } } // merge with next token? char join[MAX_WORD_SIZE * 3]; if (i != wordCount && *wordStarts[i+1] != '"' ) { // direct merge as a single word strcpy(join,word); strcat(join,wordStarts[i+1]); WORDP D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); strcpy(join,word); if (!D || !(D->properties & PART_OF_SPEECH) ) // merge these two, except "going to" or wordnet composites of normal words // merge as a compound word { strcat(join,(char*)"_"); strcat(join,wordStarts[i+1]); D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); } if (D && D->properties & PART_OF_SPEECH && !(D->properties & AUX_VERB)) // merge these two, except "going to" or wordnet composites of normal words { WORDP P1 = FindWord(word,0,LOWERCASE_LOOKUP); WORDP P2 = FindWord(wordStarts[i+1],0,LOWERCASE_LOOKUP); if (!P1 || !P2 || !(P1->properties & PART_OF_SPEECH) || !(P2->properties & PART_OF_SPEECH)) { char* tokens[2]; tokens[1] = D->word; ReplaceWords(i,2,1,tokens); fixedSpell = true; continue; } } } // break apart slashed pair like eat/feed char* slash = strchr(word,'/'); if (slash && slash != word && slash[1]) // break apart word/word { if ((wordCount + 2 ) >= REAL_SENTENCE_LIMIT) continue; // no room *slash = 0; D = StoreWord(word); *slash = '/'; E = StoreWord(slash+1); char* tokens[4]; tokens[1] = D->word; tokens[2] = "/"; tokens[3] = E->word; ReplaceWords(i,1,3,tokens); fixedSpell = true; --i; continue; } // see if hypenated word should be separate or joined (ignore obvious adjective suffix) if (hyphen && !stricmp(hyphen,(char*)"-like")) { StoreWord(word,ADJECTIVE_NORMAL|ADJECTIVE); // accept it as a word continue; } else if (hyphen && (hyphen-word) > 1) { char test[MAX_WORD_SIZE]; char first[MAX_WORD_SIZE]; // test for split *hyphen = 0; strcpy(test,hyphen+1); strcpy(first,word); *hyphen = '-'; WORDP E = FindWord(test,0,LOWERCASE_LOOKUP); WORDP D = FindWord(first,0,LOWERCASE_LOOKUP); if (*first == 0) { wordStarts[i] = AllocateString(wordStarts[i] + 1); // -pieces want to lose the leading hypen (2-pieces) fixedSpell = true; } else if (D && E) // 1st word gets replaced, we added another word after { if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue; // no room char* tokens[3]; tokens[1] = D->word; tokens[2] = E->word; ReplaceWords(i,1,2,tokens); fixedSpell = true; --i; } else if (!stricmp(test,(char*)"old") || !stricmp(test,(char*)"olds")) // break apart 5-year-old { if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue; // no room D = StoreWord(first); E = StoreWord(test); char* tokens[3]; tokens[1] = D->word; tokens[2] = E->word; ReplaceWords(i,1,2,tokens); fixedSpell = true; --i; } else // remove hyphen entirely? { strcpy(test,first); strcat(test,hyphen+1); D = FindWord(test,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); if (D) { wordStarts[i] = D->word; fixedSpell = true; --i; } } continue; // ignore hypenated errors that we couldnt solve, because no one mistypes a hypen } // leave uppercase in first position if not adjusted yet... but check for lower case spell error if (IsUpperCase(word[0]) && tokenControl & NO_PROPER_SPELLCHECK) { char lower[MAX_WORD_SIZE]; MakeLowerCopy(lower,word); WORDP D = FindWord(lower,0,LOWERCASE_LOOKUP); if (!D && i == startWord) { char* okword = SpellFix(lower,i,PART_OF_SPEECH,language); if (okword) { char* tokens[2]; WORDP E = StoreWord(okword); tokens[1] = E->word; ReplaceWords(i,1,1,tokens); fixedSpell = true; } } continue; } if (*word != '\'' && (!FindCanonical(word, i,true) || IsUpperCase(word[0]))) // dont check quoted or findable words unless they are capitalized { word = SpellCheck(i,language); // dont spell check proper names to improper, if word before or after is lower case originally if (word && i != 1 && originalCapState[i] && !IsUpperCase(*word)) { if (!originalCapState[i-1]) return false; else if (i != wordCount && !originalCapState[i+1]) return false; } if (word && !*word) // performed substitution on prior word, restart this one { fixedSpell = true; --i; continue; } if (word) { char* tokens[2]; tokens[1] = word; ReplaceWords(i,1,1,tokens); fixedSpell = true; continue; } } } return fixedSpell; }
char* ProbableKnownWord(char* word) { if (strchr(word,' ') || strchr(word,'_')) return word; // not user input, is synthesized size_t len = strlen(word); // do we know the word as is? WORDP D = FindWord(word,0,PRIMARY_CASE_ALLOWED); if (D) { if (D->properties & FOREIGN_WORD || *D->word == '~' || D->systemFlags & PATTERN_WORD) return D->word; // we know this word clearly or its a concept set ref emotion if (D->properties & PART_OF_SPEECH && !IS_NEW_WORD(D)) return D->word; // old word we know if (IsConceptMember(D)) return D->word; // are there facts using this word? -- issue with facts because on seeing input second time, having made facts of original, we see original // if (GetSubjectNondeadHead(D) || GetObjectNondeadHead(D) || GetVerbNondeadHead(D)) return D->word; } char lower[MAX_WORD_SIZE]; MakeLowerCopy(lower,word); // do we know the word in lower case? D = FindWord(word,0,LOWERCASE_LOOKUP); if (D) // direct recognition { if (D->properties & FOREIGN_WORD || *D->word == '~' || D->systemFlags & PATTERN_WORD) return D->word; // we know this word clearly or its a concept set ref emotion if (D->properties & PART_OF_SPEECH && !IS_NEW_WORD(D)) return D->word; // old word we know if (IsConceptMember(D)) return D->word; // are there facts using this word? // if (GetSubjectNondeadHead(D) || GetObjectNondeadHead(D) || GetVerbNondeadHead(D)) return D->word; } // do we know the word in upper case? char upper[MAX_WORD_SIZE]; MakeLowerCopy(upper,word); upper[0] = GetUppercaseData(upper[0]); D = FindWord(upper,0,UPPERCASE_LOOKUP); if (D) // direct recognition { if (D->properties & FOREIGN_WORD || *D->word == '~' || D->systemFlags & PATTERN_WORD) return D->word; // we know this word clearly or its a concept set ref emotion if (D->properties & PART_OF_SPEECH && !IS_NEW_WORD(D)) return D->word; // old word we know if (IsConceptMember(D)) return D->word; // are there facts using this word? // if (GetSubjectNondeadHead(D) || GetObjectNondeadHead(D) || GetVerbNondeadHead(D)) return D->word; } // interpolate to lower case words uint64 expectedBase = 0; if (ProbableAdjective(word,len,expectedBase) && expectedBase) return word; expectedBase = 0; if (ProbableAdverb(word,len,expectedBase) && expectedBase) return word; // is it a verb form char* verb = GetInfinitive(lower,true); // no new verbs if (verb) { WORDP D = StoreWord(lower,0); // verb form recognized return D->word; } // is it simple plural of a noun? if (word[len-1] == 's') { WORDP E = FindWord(lower,len-1,LOWERCASE_LOOKUP); if (E && E->properties & NOUN) { E = StoreWord(word,NOUN|NOUN_PLURAL); return E->word; } E = FindWord(lower,len-1,UPPERCASE_LOOKUP); if (E && E->properties & NOUN) { *word = toUppercaseData[*word]; E = StoreWord(word,NOUN|NOUN_PROPER_PLURAL); return E->word; } } return NULL; }
static int EditDistance(WORDINFO& dictWordData, WORDINFO& realWordData,int min) {// dictword has no underscores, inputSet is already lower case char dictw[MAX_WORD_SIZE]; MakeLowerCopy(dictw, dictWordData.word); char* dictinfo = dictw; char* realinfo = realWordData.word; char* dictstart = dictinfo; char* realstart = realWordData.word; int val = 0; // a difference in length will manifest as a difference in letter count // look at specific letter errors char priorCharDict[10]; char priorCharReal[10]; *priorCharDict = *priorCharReal = 0; char currentCharReal[10]; char currentCharDict[10]; *currentCharReal = *currentCharDict = 0; char nextCharReal[10]; char nextCharDict[10]; char next1CharReal[10]; char next1CharDict[10]; char* resumeReal2; char* resumeDict2; char* resumeReal; char* resumeDict; char* resumeReal1; char* resumeDict1; char baseCharReal; char baseCharDict; while (ALWAYS) { if (val > min) return 1000; // no good strcpy(priorCharReal, currentCharReal); strcpy(priorCharDict, currentCharDict); resumeReal = IsUTF8((char*)realinfo, currentCharReal); resumeDict = IsUTF8((char*)dictinfo, currentCharDict); if (!*currentCharReal && !*currentCharDict) break; //both at end if (!*currentCharReal || !*currentCharDict) // one ending, other has to catch up by adding a letter { val += 16; // add a letter if (*priorCharReal == *currentCharDict) val -= 10; // doubling letter at end dictinfo = resumeDict; realinfo = resumeReal; continue; } // punctuation in a word is bad tokenization, dont spell check it away if (*currentCharReal == '?' || *currentCharReal == '!' || *currentCharReal == '(' || *currentCharReal == ')' || *currentCharReal == '[' || *currentCharReal == ']' || *currentCharReal == '{' || *currentCharReal == '}') return 200; // dont mess with this resumeReal1 = IsUTF8((char*)resumeReal, nextCharReal); resumeDict1 = IsUTF8((char*)resumeDict, nextCharDict); resumeReal2 = IsUTF8((char*)resumeReal1, next1CharReal); // 2 char ahead resumeDict2 = IsUTF8((char*)resumeDict1, next1CharDict); baseCharReal = UnaccentedChar(currentCharReal); baseCharDict = UnaccentedChar(currentCharDict); if (!stricmp(currentCharReal, currentCharDict)) // match chars { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (baseCharReal && baseCharReal == baseCharDict) { dictinfo = resumeDict; realinfo = resumeReal; val += 1; // minimal charge but separate forms of delivre continue; } // treat german double s and ss equivalent if (!stricmp(language, "german")) { if (*currentCharReal == 0xc3 && currentCharReal[1] == 0x9f && *currentCharDict == 's' && *nextCharDict == 's') { dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharDict == 0xc3 && currentCharDict[1] == 0x9f && *currentCharReal == 's' && *nextCharReal == 's') { dictinfo = resumeDict; realinfo = resumeReal1; continue; } } // spanish alternative spellings if (!stricmp(language, "spanish")) // ch-x | qu-k | c-k | do-o | b-v | bue-w | vue-w | z-s | s-c | h- | y-i | y-ll | m-n 1st is valid { if (*currentCharReal == 'c' && *currentCharDict == 'k') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'b' && *currentCharDict == 'v') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'v' && *currentCharDict == 'b') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'z' && *currentCharDict == 's') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 's' && *currentCharDict == 'c') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'y' && *currentCharDict == 'i') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'm' && *currentCharDict == 'n') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'n' && *currentCharDict == 'm') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharDict == 'h') { dictinfo = resumeDict; continue; } if (*currentCharReal == 'x' && *currentCharDict == 'c' && *nextCharDict == 'h') { dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharReal == 'k' && *currentCharDict == 'q' && *nextCharDict == 'u') { dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharReal == 'o' && *currentCharDict == 'd' && *nextCharDict == 'o') { dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharReal == 'w' && *currentCharDict == 'b' && *nextCharDict == 'u' && *next1CharDict == 'e') { dictinfo = resumeDict2; realinfo = resumeReal; continue; } if (*currentCharReal == 'w' && *currentCharDict == 'v' && *nextCharDict == 'u' && *next1CharDict == 'e') { dictinfo = resumeDict2; realinfo = resumeReal; continue; } if (*currentCharReal == 'l' && *nextCharReal == 'l' && *currentCharDict == 'y') { dictinfo = resumeDict; realinfo = resumeReal1; continue; } if (*currentCharReal == 'y' && *currentCharDict == 'l' && *nextCharDict == 'l') { dictinfo = resumeDict1; realinfo = resumeReal; continue; } } // french common bad spellings if (!stricmp(language, "french")) { if (*currentCharReal == 'a' && SameUTF(currentCharDict,"â")) { val += 1; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'e' && SameUTF(currentCharDict,"ê")) { val += 10; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 0xc3 && currentCharReal[1] == 0xa8 && SameUTF(currentCharDict,"ê")) { val += 5; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'i' && SameUTF(currentCharDict,"î")) { val += 1; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'o' && SameUTF(currentCharDict, "ô")) { val += 1; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'u' && SameUTF(currentCharDict, "û")) { val += 5; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'y' && *currentCharDict == 'l' && *nextCharDict == 'l') { val += 10; dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharReal == 'k' && *currentCharDict == 'q' && *nextCharDict == 'u') { val += 10; dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharReal == 'f' && *currentCharDict == 'p' && *nextCharDict == 'h') { val += 5; dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharReal == 's' && *currentCharDict == 'c') { val += 10; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 's' && SameUTF(currentCharDict, "ç")) { val += 5; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'c' && SameUTF(currentCharDict, "ç")) { val += 5; dictinfo = resumeDict; realinfo = resumeReal; continue; } } // probable transposition since swapping syncs up if (!strcmp(currentCharReal, nextCharDict) && !strcmp(nextCharReal, currentCharDict)) { val += 16; // more expensive if next is not correct after transposition dictinfo = resumeDict2; // skip ahead 2 realinfo = resumeReal2; continue; } // probable mistyped letter since next matches up if (!strcmp(nextCharReal, nextCharDict)) { val += 16; // more expensive if 2nd next is not correct after transposition if (*currentCharReal == 's' && *currentCharDict == 'z') val -= 5; else if (*currentCharReal == 'z' && *currentCharDict == 's') val -= 5; else if (IsVowel(*currentCharReal) && IsVowel(*currentCharDict)) val -= 6; // low cost for switching a vowel dictinfo = resumeDict; realinfo = resumeReal; continue; } // probable excess letter by user since next matches up to current if (!strcmp(nextCharReal, currentCharDict)) { val += 16; // only delete 1 letter if (*priorCharDict == *currentCharReal) val -= 14; // low cost for dropping an excess repeated letter (wherre->where not wherry) else if (*currentCharReal == '-') val -= 10; // very low cost for removing a hypen dictinfo = resumeDict; // skip over letter we match momentarily realinfo = resumeReal1; // move on past our junk letter and match letter continue; } // probable missing letter by user since current matches up to nextdict if (!strcmp(nextCharDict, currentCharReal)) { val += 16; // only add 1 letter // better to add repeated letter than to drop a letter if (*currentCharDict == *priorCharReal) val -= 6; // low cost for adding a repeated letter else if (*currentCharDict == 'e' && *nextCharDict == 'o') val -= 10; // yoman->yeoman dictinfo = resumeDict1; // skip over letter we match momentarily realinfo = resumeReal; // move on past our junk letter and match letter continue; } // complete mismatch with no understanding of why, just fix them and move on dictinfo = resumeDict; // skip over letter we match momentarily realinfo = resumeReal; // move on past our junk letter and match letter val += 16; } return val; }
bool SpellCheckSentence() { WORDP D,E; fixedSpell = false; bool lowercase = false; // check for all uppercase (capslock) for (int i = FindOOBEnd(1); i <= wordCount; ++i) // skip start of sentence { char* word = wordStarts[i]; if (!word[1]) continue; // autoconversion of letters to lower case should be ignored (eg A) if (!stricmp(word, "the")) continue; size_t len = strlen(word); for (int j = 0; j < (int)len; ++j) { if (IsLowerCase(word[j])) { lowercase = true; i = j = len+1000; // len might be BIG (oob data) so make sure beyond it) } } } if (!lowercase && wordCount > 2) // must have multiple words all in uppercase { for (int i = FindOOBEnd(1); i <= wordCount; ++i) { char* word = wordStarts[i]; char myword[MAX_WORD_SIZE]; MakeLowerCopy(myword,word); if (strcmp(word, myword)) { char* tokens[2]; tokens[1] = myword; ReplaceWords("caplocWord", i, 1, 1, tokens); originalCapState[i] = false; } } } int startWord = FindOOBEnd(1); for (int i = startWord; i <= wordCount; ++i) { char* word = wordStarts[i]; char* tokens[2]; // change any \ to / char newword[MAX_WORD_SIZE]; bool altered = false; if (strlen(word) < MAX_WORD_SIZE) { strcpy(newword, word); char* at = newword; while ((at = strchr(at,'\\'))) { *at = '/'; altered = true; } if (altered) word = wordStarts[i] = StoreWord(newword, AS_IS)->word; } if (*word == '\'' && !word[1] && i != startWord && IsDigit(*wordStarts[i - 1]) && !stricmp(language, "english")) // fails if not digit bug { tokens[1] = (char*)"foot"; ReplaceWords("' as feet", i, 1, 1, tokens); fixedSpell = true; continue; } if (*word == '"' && !word[1] && i != startWord && IsDigit(*wordStarts[i - 1]) && !stricmp(language, "english")) // fails if not digit bug { tokens[1] = (char*)"inch"; ReplaceWords("' as feet", i, 1, 1, tokens); fixedSpell = true; continue; } if (!word || !word[1] || *word == '"' ) continue; // illegal or single char or quoted thingy size_t len = strlen(word); // dont spell check uppercase not at start or joined word if (IsUpperCase(word[0]) && (i != startWord || strchr(word,'_')) && tokenControl & NO_PROPER_SPELLCHECK) continue; // dont spell check email or other things with @ or . in them if (strchr(word,'@') || strchr(word, '&') || strchr(word,'.') || strchr(word,'$')) continue; // dont spell check names of json objects or arrays if (!strnicmp(word,"ja-",3) || !strnicmp(word,"jo-",3)) continue; // dont spell check web addresses if (!strnicmp(word,"http",4) || !strnicmp(word,"www",3)) continue; // nor fractions if (IsFraction(word)) continue; // fraction? // joined number words like 100dollars char* at = word - 1; while (IsDigit(*++at) || *at == numberPeriod); if (IsDigit(*word) && strlen(at) > 3 && ProbableKnownWord(at)) { char first[MAX_WORD_SIZE]; strncpy(first, word, (at - word)); first[at - word] = 0; char* tokens[3]; tokens[1] = first; tokens[2] = at; ReplaceWords("joined number word", i, 1, 2, tokens); continue; } // nor model numbers if (IsModelNumber(word)) { WORDP X = FindWord(word, 0, UPPERCASE_LOOKUP); if (IsConceptMember(X) && !strcmp(word,X->word)) { char* tokens[2]; tokens[1] = X->word; ReplaceWords("KnownUpperModelNumber", i, 1, 1, tokens); fixedSpell = true; } continue; } char* number; if (GetCurrency((unsigned char*)word, number)) continue; // currency if (!stricmp(word, (char*)"am") && i != startWord && (IsDigit(*wordStarts[i-1]) || IsNumber(wordStarts[i-1]) ==REAL_NUMBER) && !stricmp(language,"english")) // fails if not digit bug { char* tokens[2]; tokens[1] = (char*)"a.m."; ReplaceWords("am as time", i, 1, 1, tokens); fixedSpell = true; continue; } char* known = ProbableKnownWord(word); if (known && !strcmp(known,word)) continue; // we know it if (known && strcmp(known,word)) { WORDP D = FindWord(known); char* tokens[2]; if ((!D || !(D->internalBits & UPPERCASE_HASH)) && !IsUpperCase(*known)) // revised the word to lower case (avoid to upper case like "fields" to "Fields" { WORDP X = FindWord(known,0,LOWERCASE_LOOKUP); if (X) { tokens[1] = X->word; ReplaceWords("KnownWord",i,1,1,tokens); fixedSpell = true; continue; } } else // is uppercase a concept member? then revise upwards { WORDP X = FindWord(known,0,UPPERCASE_LOOKUP); if (IsConceptMember(X) || stricmp(language,"english")) // all german nouns are uppercase { tokens[1] = X->word; ReplaceWords("KnownUpper",i,1,1,tokens); fixedSpell = true; continue; } } } char* p = word -1; unsigned char c; char* hyphen = 0; while ((c = *++p) != 0) { ++len; if (c == '-') hyphen = p; // note is hyphenated - use trailing } if (len == 0 || GetTemperatureLetter(word)) continue; // bad ignore utf word or llegal length - also no composite words if (c && c != '@' && c != '.') // illegal word character { if (IsDigit(word[0]) || len == 1){;} // probable numeric? // accidental junk on end of word we do know immedately? else if (i > 1 && !IsAlphaUTF8OrDigit(wordStarts[i][len-1]) ) { WORDP entry,canonical; char word[MAX_WORD_SIZE]; strcpy(word,wordStarts[i]); word[len-1] = 0; uint64 sysflags = 0; uint64 cansysflags = 0; WORDP revise; GetPosData(i,word,revise,entry,canonical,sysflags,cansysflags,true,true); // dont create a non-existent word if (entry && entry->properties & PART_OF_SPEECH) { wordStarts[i] = entry->word; fixedSpell = true; continue; // not a legal word character, leave it alone } } } // see if we know the other case if (!(tokenControl & (ONLY_LOWERCASE|STRICT_CASING)) || (i == startSentence && !(tokenControl & ONLY_LOWERCASE))) { WORDP E = FindWord(word,0,SECONDARY_CASE_ALLOWED); bool useAlternateCase = false; if (E && E->systemFlags & PATTERN_WORD) useAlternateCase = true; if (E && E->properties & (PART_OF_SPEECH|FOREIGN_WORD)) { // if the word we find is UPPER case, and this might be a lower case noun plural, don't change case. size_t len = strlen(word); if (word[len-1] == 's' ) { WORDP F = FindWord(word,len-1); if (!F || !(F->properties & (PART_OF_SPEECH|FOREIGN_WORD))) useAlternateCase = true; else continue; } else useAlternateCase = true; } else if (E) // does it have a member concept fact { if (IsConceptMember(E)) { useAlternateCase = true; break; } } if (useAlternateCase) { char* tokens[2]; tokens[1] = E->word; ReplaceWords("Alternatecase",i,1,1,tokens); fixedSpell = true; continue; } } // merge with next token? char join[MAX_WORD_SIZE * 3]; if (i != wordCount && *wordStarts[i+1] != '"' ) { // direct merge as a single word strcpy(join,word); strcat(join,wordStarts[i+1]); WORDP D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); strcpy(join,word); // if (!D || !(D->properties & PART_OF_SPEECH) ) // merge these two, except "going to" or wordnet composites of normal words // merge as a compound word // { // strcat(join,(char*)"_"); // strcat(join,wordStarts[i+1]); // D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); // } DONT CREATE _ words, let sequence handle it if (D && D->properties & PART_OF_SPEECH && !(D->properties & AUX_VERB)) // merge these two, except "going to" or wordnet composites of normal words { WORDP P1 = FindWord(word,0,LOWERCASE_LOOKUP); WORDP P2 = FindWord(wordStarts[i+1],0,LOWERCASE_LOOKUP); if (!P1 || !P2 || !(P1->properties & PART_OF_SPEECH) || !(P2->properties & PART_OF_SPEECH)) { char* tokens[2]; tokens[1] = D->word; ReplaceWords("merge",i,2,1,tokens); fixedSpell = true; continue; } } } // break apart slashed pair like eat/feed char* slash = strchr(word,'/'); if (slash && !slash[1] && len < MAX_WORD_SIZE) // remove trailing slash { strcpy(newword, word); newword[slash - word] = 0; word = wordStarts[i] = StoreWord(newword, AS_IS)->word; } if (slash && slash != word && slash[1]) // break apart word/word { if ((wordCount + 2 ) >= REAL_SENTENCE_LIMIT) continue; // no room *slash = 0; D = StoreWord(word); *slash = '/'; E = StoreWord(slash+1); char* tokens[4]; tokens[1] = D->word; tokens[2] = "/"; tokens[3] = E->word; ReplaceWords("Split",i,1,3,tokens); fixedSpell = true; --i; continue; } // see if hypenated word should be separate or joined (ignore obvious adjective suffix) if (hyphen && !stricmp(hyphen,(char*)"-like")) { StoreWord(word,ADJECTIVE_NORMAL|ADJECTIVE); // accept it as a word continue; } else if (hyphen && (hyphen-word) > 1 && !IsPlaceNumber(word)) // dont break up fifty-second { char test[MAX_WORD_SIZE]; char first[MAX_WORD_SIZE]; // test for split *hyphen = 0; strcpy(test,hyphen+1); strcpy(first,word); *hyphen = '-'; WORDP E = FindWord(test,0,LOWERCASE_LOOKUP); WORDP D = FindWord(first,0,LOWERCASE_LOOKUP); if (*first == 0) { wordStarts[i] = AllocateHeap(wordStarts[i] + 1); // -pieces want to lose the leading hypen (2-pieces) fixedSpell = true; } else if (D && E) // 1st word gets replaced, we added another word after { if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue; // no room char* tokens[3]; tokens[1] = D->word; tokens[2] = E->word; ReplaceWords("Pair",i,1,2,tokens); fixedSpell = true; --i; } else if (!stricmp(test,(char*)"old") || !stricmp(test,(char*)"olds")) // break apart 5-year-old { if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue; // no room D = StoreWord(first); E = StoreWord(test); char* tokens[3]; tokens[1] = D->word; tokens[2] = E->word; ReplaceWords("Break old",i,1,2,tokens); fixedSpell = true; --i; } else // remove hyphen entirely? { strcpy(test,first); strcat(test,hyphen+1); D = FindWord(test,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); if (D) { wordStarts[i] = D->word; fixedSpell = true; --i; } } continue; // ignore hypenated errors that we couldnt solve, because no one mistypes a hypen } // see if number in front of unit split like 10mg if (IsDigit(*word)) { char* at = word; while (*++at && IsDigit(*at)) {;} WORDP E = FindWord(at); if (E && strlen(at) > 2 && *at != 'm') // number in front of known word ( but must be longer than 2 char, 5th) but allow mg { char token1[MAX_WORD_SIZE]; int len = at - word; strncpy(token1,word,len); token1[len] = 0; D = StoreWord(token1); char* tokens[4]; tokens[1] = D->word; tokens[2] = E->word; ReplaceWords("Split",i,1,2,tokens); fixedSpell = true; continue; } } // leave uppercase in first position if not adjusted yet... but check for lower case spell error if (IsUpperCase(word[0]) && tokenControl & NO_PROPER_SPELLCHECK) { char lower[MAX_WORD_SIZE]; MakeLowerCopy(lower,word); WORDP D = FindWord(lower,0,LOWERCASE_LOOKUP); if (!D && i == startWord) { char* okword = SpellFix(lower,i,PART_OF_SPEECH); if (okword) { char* tokens[2]; WORDP E = StoreWord(okword); tokens[1] = E->word; ReplaceWords("Spell",i,1,1,tokens); fixedSpell = true; } } continue; } // see if smooshed word pair size_t len1 = strlen(word); int j; if (!IsDigit(*word)) { for (j = 1; j <= len1 - 1; ++j) { WORDP X1 = FindWord(word, j); // any case WORDP X2 = FindWord(word + j, len1 - i); // any case if (X1 && X2 && (X1->word[1] || X1->word[0] == 'i' || X1->word[0] == 'I' || X1->word[0] == 'a')) { char* tokens[3]; tokens[1] = X1->word; tokens[2] = X2->word; ReplaceWords("Split", i, 1, 2, tokens); fixedSpell = true; break; } } if (j != len1) continue; } if (*word != '\'' && (!FindCanonical(word, i,true) || IsUpperCase(word[0]))) // dont check quoted or findable words unless they are capitalized { word = SpellCheck(i); // dont spell check proper names to improper, if word before or after is lower case originally if (word && i != 1 && originalCapState[i] && !IsUpperCase(*word)) { if (!originalCapState[i-1]) continue; else if (i != wordCount && !originalCapState[i+1]) continue; } if (word && !*word) // performed substitution on prior word, restart this one { fixedSpell = true; --i; continue; } if (word) { char* tokens[2]; tokens[1] = word; ReplaceWords("Spell",i,1,1,tokens); fixedSpell = true; continue; } } } return fixedSpell; }
char* SpellFix(char* originalWord,int start,uint64 posflags) { multichoice = false; char word[MAX_WORD_SIZE]; MakeLowerCopy(word, originalWord); char word1[MAX_WORD_SIZE]; MakeUpperCopy(word1, originalWord); WORDINFO realWordData; ComputeWordData(word, &realWordData); if (realWordData.bytelen >= 100 || realWordData.bytelen == 0) return NULL; if (IsDigit(*originalWord)) return NULL; // number-based words and numbers must be treated elsewhere char letterLow = *word; char letterHigh = *word1; bool hasUnderscore = (strchr(originalWord,'_')) ? true : false; bool isUpper = IsUpperCase(originalWord[0]); if (IsUpperCase(originalWord[1])) isUpper = false; // not if all caps if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"Spell: %s\r\n",originalWord); // Priority is to a word that looks like what the user typed, because the user probably would have noticed if it didnt and changed it. So add/delete has priority over tranform WORDP choices[4000]; WORDP bestGuess[4000]; unsigned int index = 0; unsigned int bestGuessindex = 0; int min = 35; // allow 2 changes as needed uint64 pos = PART_OF_SPEECH; // all pos allowed WORDP D; if (posflags == PART_OF_SPEECH && start < wordCount) // see if we can restrict word based on next word { D = FindWord(wordStarts[start+1],0,PRIMARY_CASE_ALLOWED); uint64 flags = (D) ? D->properties : (-1ull); // if we dont know the word, it could be anything if ((flags & PART_OF_SPEECH) == PREPOSITION) pos &= -1 ^ (PREPOSITION | NOUN); // prep cannot be preceeded by noun or prep if (!(flags & (PREPOSITION | VERB | CONJUNCTION | ADVERB)) && flags & DETERMINER) pos &= -1 ^ (DETERMINER | ADJECTIVE | NOUN | ADJECTIVE_NUMBER | NOUN_NUMBER); // determiner cannot be preceeded by noun determiner adjective if (!(flags & (PREPOSITION | VERB | CONJUNCTION | DETERMINER | ADVERB)) && flags & ADJECTIVE) pos &= -1 ^ (NOUN); if (!(flags & (PREPOSITION | NOUN | CONJUNCTION | DETERMINER | ADVERB | ADJECTIVE)) && flags & VERB) pos &= -1 ^ (VERB); // we know all helper verbs we might be if (D && *D->word == '\'' && D->word[1] == 's' ) pos &= NOUN; // we can only be a noun if possessive - contracted 's should already be removed by now } if (posflags == PART_OF_SPEECH && start > 1) { D = FindWord(wordStarts[start-1],0,PRIMARY_CASE_ALLOWED); uint64 flags = (D) ? D->properties : (-1); // if we dont know the word, it could be anything if (flags & DETERMINER) pos &= -1 ^ (VERB|CONJUNCTION|PREPOSITION|DETERMINER); } posflags &= pos; // if pos types are known and restricted and dont match static int range[] = {0,-1,1,-2,2}; for (unsigned int i = 0; i < 5; ++i) { if (i >= 3) break; MEANING offset = lengthLists[realWordData.charlen + range[i]]; if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"\r\n Begin offset %d\r\n",i); while (offset) { D = Meaning2Word(offset); offset = D->spellNode; if (PART_OF_SPEECH == posflags && D->systemFlags & PATTERN_WORD){;} // legal generic match else if (!(D->properties & posflags)) continue; // wrong kind of word char* under = strchr(D->word,'_'); // SPELLING lists have no underscore or space words in them if (hasUnderscore && !under) continue; // require keep any underscore if (!hasUnderscore && under) continue; // require not have any underscore if (isUpper && !(D->internalBits & UPPERCASE_HASH) && start != 1) continue; // dont spell check to lower a word in upper WORDINFO dictWordData; ComputeWordData(D->word, &dictWordData); int val = EditDistance(dictWordData, realWordData, min); if (val <= min) // as good or better { if (val < min) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Better: %s against %s value: %d\r\n",D->word,originalWord,val); index = 0; min = val; } else if ( val == min && trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Equal: %s against %s value: %d\r\n",D->word,originalWord,val); if (!(D->internalBits & BEEN_HERE)) { choices[index++] = D; if (index > 3998) break; AddInternalFlag(D,BEEN_HERE); } } } } // try endings ing, s, etc if (start && !index && !stricmp(language,"english")) // no stem spell if COMING from a stem spell attempt (start == 0) or we have a good guess already { uint64 flags = 0; char* stem = StemSpell(word,start,flags); if (stem) { WORDP X = StoreWord(stem,flags); if (X) choices[index++] = X; } } if (!index) return NULL; if (index > 1) multichoice = true; // take our guesses, and pick the most common (earliest learned or most frequently used) word uint64 commonmin = 0; bestGuess[0] = NULL; for (unsigned int j = 0; j < index; ++j) RemoveInternalFlag(choices[j],BEEN_HERE); if (index == 1) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Single best spell: %s\r\n",choices[0]->word); return choices[0]->word; // pick the one } for (unsigned int j = 0; j < index; ++j) { uint64 common = choices[j]->systemFlags & COMMONNESS; if (common < commonmin) continue; if (choices[j]->internalBits & UPPERCASE_HASH && index > 1) continue; // ignore proper names for spell better when some other choice exists if (common > commonmin) // this one is more common { commonmin = common; bestGuessindex = 0; } bestGuess[bestGuessindex++] = choices[j]; } if (bestGuessindex) { if (bestGuessindex > 1) multichoice = true; if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Pick spell: %s\r\n",bestGuess[0]->word); return bestGuess[0]->word; } return NULL; }