void ReadComputerID() { strcpy(computerID,"anonymous"); WORDP D = FindWord("defaultbot",0); // do we have a FACT with the default bot in it as verb if (D) { FACT* F = GetVerbHead(D); if (F) { D = Meaning2Word(F->subject); strcpy(computerID,D->word); } } }
char* SpellFix(char* originalWord,int start,uint64 posflags,int language) { size_t len = strlen(originalWord); if (len >= 100 || len == 0) return NULL; if (IsDigit(*originalWord)) return NULL; // number-based words and numbers must be treated elsewhere char letterLow = GetLowercaseData(*originalWord); char letterHigh = GetUppercaseData(*originalWord); bool hasUnderscore = (strchr(originalWord,'_')) ? true : false; bool isUpper = IsUpperCase(originalWord[0]); if (IsUpperCase(originalWord[1])) isUpper = false; // not if all caps if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"Spell: %s\r\n",originalWord); char word[MAX_WORD_SIZE]; MakeLowerCopy(word,originalWord); // mark positions of the letters and make lower case char base[257]; memset(base,0,257); char* ptr = word - 1; char c; int position = 0; while ((c = *++ptr) && position < 255) { base[position++ + 1] = GetLowercaseData(c); } // Priority is to a word that looks like what the user typed, because the user probably would have noticed if it didnt and changed it. So add/delete has priority over tranform WORDP choices[4000]; WORDP bestGuess[4000]; unsigned int index = 0; unsigned int bestGuessindex = 0; int min = 30; unsigned char realWordLetterCounts[LETTERMAX]; memset(realWordLetterCounts,0,LETTERMAX); for (int i = 0; i < (int)len; ++i) ++realWordLetterCounts[(unsigned char)letterIndexData[(unsigned char)word[i]]]; // compute number of each kind of character uint64 pos = PART_OF_SPEECH; // all pos allowed WORDP D; if (posflags == PART_OF_SPEECH && start < wordCount) // see if we can restrict word based on next word { D = FindWord(wordStarts[start+1],0,PRIMARY_CASE_ALLOWED); uint64 flags = (D) ? D->properties : (-1); // if we dont know the word, it could be anything if (flags & PREPOSITION) pos &= -1 ^ (PREPOSITION|NOUN); // prep cannot be preceeded by noun or prep if (!(flags & (PREPOSITION|VERB|CONJUNCTION|ADVERB)) && flags & DETERMINER) pos &= -1 ^ (DETERMINER|ADJECTIVE|NOUN|ADJECTIVE_NUMBER|NOUN_NUMBER); // determiner cannot be preceeded by noun determiner adjective if (!(flags & (PREPOSITION|VERB|CONJUNCTION|DETERMINER|ADVERB)) && flags & ADJECTIVE) pos &= -1 ^ (NOUN); if (!(flags & (PREPOSITION|NOUN|CONJUNCTION|DETERMINER|ADVERB|ADJECTIVE)) && flags & VERB) pos &= -1 ^ (VERB); // we know all helper verbs we might be if (D && *D->word == '\'' && D->word[1] == 's' ) pos &= NOUN; // we can only be a noun if possessive - contracted 's should already be removed by now } if (posflags == PART_OF_SPEECH && start > 1) { D = FindWord(wordStarts[start-1],0,PRIMARY_CASE_ALLOWED); uint64 flags = (D) ? D->properties : (-1); // if we dont know the word, it could be anything if (flags & DETERMINER) pos &= -1 ^ (VERB|CONJUNCTION|PREPOSITION|DETERMINER); } posflags &= pos; // if pos types are known and restricted and dont match static int range[] = {0,-1,1,-2,2}; for (unsigned int i = 0; i < 5; ++i) { if (language == ENGLISH && i >= 3) break; // only allow +-2 for spanish MEANING offset = lengthLists[len + range[i]]; if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"\r\n Begin offset %d\r\n",i); while (offset) { D = Meaning2Word(offset); offset = D->spellNode; if (PART_OF_SPEECH == posflags && D->systemFlags & PATTERN_WORD){;} // legal generic match else if (!(D->properties & posflags)) continue; // wrong kind of word if (*D->word != letterLow && *D->word != letterHigh && language == ENGLISH) continue; // we assume no one misspells starting letter char* under = strchr(D->word,'_'); // SPELLING lists have no underscore or space words in them if (hasUnderscore && !under) continue; // require keep any underscore if (!hasUnderscore && under) continue; // require not have any underscore if (isUpper && !(D->internalBits & UPPERCASE_HASH) && start != 1) continue; // dont spell check to lower a word in upper int val = EditDistance(D, D->length, len, base+1,min,realWordLetterCounts,language); if (val <= min) // as good or better { if (val < min) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Better: %s against %s value: %d\r\n",D->word,originalWord,val); index = 0; min = val; } else if ( val == min && trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Equal: %s against %s value: %d\r\n",D->word,originalWord,val); if (!(D->internalBits & BEEN_HERE)) { choices[index++] = D; if (index > 3998) break; AddInternalFlag(D,BEEN_HERE); } } } } // try endings ing, s, etc if (start) // no stem spell if COMING from a stem spell attempt (start == 0) { char* stem = StemSpell(word,start); if (stem) { WORDP D = FindWord(stem,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); if (D) { for (unsigned int j = 0; j < index; ++j) { if (choices[j] == D) // already in our list { D = NULL; break; } } } if (D) choices[index++] = D; } } if (!index) return NULL; // take our guesses, and pick the most common (earliest learned or most frequently used) word uint64 commonmin = 0; bestGuess[0] = NULL; for (unsigned int j = 0; j < index; ++j) RemoveInternalFlag(choices[j],BEEN_HERE); if (index == 1) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Single best spell: %s\r\n",choices[0]->word); return choices[0]->word; // pick the one } for (unsigned int j = 0; j < index; ++j) { uint64 common = choices[j]->systemFlags & COMMONNESS; if (common < commonmin) continue; if (choices[j]->internalBits & UPPERCASE_HASH && index > 1) continue; // ignore proper names for spell better when some other choice exists if (common > commonmin) { commonmin = common; bestGuessindex = 0; } bestGuess[bestGuessindex++] = choices[j]; } if (bestGuessindex) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Pick spell: %s\r\n",bestGuess[0]->word); return bestGuess[0]->word; } return NULL; }
char* SpellFix(char* originalWord,int start,uint64 posflags) { multichoice = false; char word[MAX_WORD_SIZE]; MakeLowerCopy(word, originalWord); char word1[MAX_WORD_SIZE]; MakeUpperCopy(word1, originalWord); WORDINFO realWordData; ComputeWordData(word, &realWordData); if (realWordData.bytelen >= 100 || realWordData.bytelen == 0) return NULL; if (IsDigit(*originalWord)) return NULL; // number-based words and numbers must be treated elsewhere char letterLow = *word; char letterHigh = *word1; bool hasUnderscore = (strchr(originalWord,'_')) ? true : false; bool isUpper = IsUpperCase(originalWord[0]); if (IsUpperCase(originalWord[1])) isUpper = false; // not if all caps if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"Spell: %s\r\n",originalWord); // Priority is to a word that looks like what the user typed, because the user probably would have noticed if it didnt and changed it. So add/delete has priority over tranform WORDP choices[4000]; WORDP bestGuess[4000]; unsigned int index = 0; unsigned int bestGuessindex = 0; int min = 35; // allow 2 changes as needed uint64 pos = PART_OF_SPEECH; // all pos allowed WORDP D; if (posflags == PART_OF_SPEECH && start < wordCount) // see if we can restrict word based on next word { D = FindWord(wordStarts[start+1],0,PRIMARY_CASE_ALLOWED); uint64 flags = (D) ? D->properties : (-1ull); // if we dont know the word, it could be anything if ((flags & PART_OF_SPEECH) == PREPOSITION) pos &= -1 ^ (PREPOSITION | NOUN); // prep cannot be preceeded by noun or prep if (!(flags & (PREPOSITION | VERB | CONJUNCTION | ADVERB)) && flags & DETERMINER) pos &= -1 ^ (DETERMINER | ADJECTIVE | NOUN | ADJECTIVE_NUMBER | NOUN_NUMBER); // determiner cannot be preceeded by noun determiner adjective if (!(flags & (PREPOSITION | VERB | CONJUNCTION | DETERMINER | ADVERB)) && flags & ADJECTIVE) pos &= -1 ^ (NOUN); if (!(flags & (PREPOSITION | NOUN | CONJUNCTION | DETERMINER | ADVERB | ADJECTIVE)) && flags & VERB) pos &= -1 ^ (VERB); // we know all helper verbs we might be if (D && *D->word == '\'' && D->word[1] == 's' ) pos &= NOUN; // we can only be a noun if possessive - contracted 's should already be removed by now } if (posflags == PART_OF_SPEECH && start > 1) { D = FindWord(wordStarts[start-1],0,PRIMARY_CASE_ALLOWED); uint64 flags = (D) ? D->properties : (-1); // if we dont know the word, it could be anything if (flags & DETERMINER) pos &= -1 ^ (VERB|CONJUNCTION|PREPOSITION|DETERMINER); } posflags &= pos; // if pos types are known and restricted and dont match static int range[] = {0,-1,1,-2,2}; for (unsigned int i = 0; i < 5; ++i) { if (i >= 3) break; MEANING offset = lengthLists[realWordData.charlen + range[i]]; if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"\r\n Begin offset %d\r\n",i); while (offset) { D = Meaning2Word(offset); offset = D->spellNode; if (PART_OF_SPEECH == posflags && D->systemFlags & PATTERN_WORD){;} // legal generic match else if (!(D->properties & posflags)) continue; // wrong kind of word char* under = strchr(D->word,'_'); // SPELLING lists have no underscore or space words in them if (hasUnderscore && !under) continue; // require keep any underscore if (!hasUnderscore && under) continue; // require not have any underscore if (isUpper && !(D->internalBits & UPPERCASE_HASH) && start != 1) continue; // dont spell check to lower a word in upper WORDINFO dictWordData; ComputeWordData(D->word, &dictWordData); int val = EditDistance(dictWordData, realWordData, min); if (val <= min) // as good or better { if (val < min) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Better: %s against %s value: %d\r\n",D->word,originalWord,val); index = 0; min = val; } else if ( val == min && trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Equal: %s against %s value: %d\r\n",D->word,originalWord,val); if (!(D->internalBits & BEEN_HERE)) { choices[index++] = D; if (index > 3998) break; AddInternalFlag(D,BEEN_HERE); } } } } // try endings ing, s, etc if (start && !index && !stricmp(language,"english")) // no stem spell if COMING from a stem spell attempt (start == 0) or we have a good guess already { uint64 flags = 0; char* stem = StemSpell(word,start,flags); if (stem) { WORDP X = StoreWord(stem,flags); if (X) choices[index++] = X; } } if (!index) return NULL; if (index > 1) multichoice = true; // take our guesses, and pick the most common (earliest learned or most frequently used) word uint64 commonmin = 0; bestGuess[0] = NULL; for (unsigned int j = 0; j < index; ++j) RemoveInternalFlag(choices[j],BEEN_HERE); if (index == 1) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Single best spell: %s\r\n",choices[0]->word); return choices[0]->word; // pick the one } for (unsigned int j = 0; j < index; ++j) { uint64 common = choices[j]->systemFlags & COMMONNESS; if (common < commonmin) continue; if (choices[j]->internalBits & UPPERCASE_HASH && index > 1) continue; // ignore proper names for spell better when some other choice exists if (common > commonmin) // this one is more common { commonmin = common; bestGuessindex = 0; } bestGuess[bestGuessindex++] = choices[j]; } if (bestGuessindex) { if (bestGuessindex > 1) multichoice = true; if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Pick spell: %s\r\n",bestGuess[0]->word); return bestGuess[0]->word; } return NULL; }