void BkTree::insert(std::string m_item) { if( !m_root ){ m_root = new Node(m_item, -1); return; } Node *t = m_root; while( true ) { size_t d = EditDistance( t->m_item, m_item ); if( !d ) return; Node *ch = t->m_firstChild; while( ch ) { if( ch->m_distToParent == d ) { t = ch; break; } ch = ch->m_nextSibling; } if( !ch ) { Node *newChild = new Node(m_item, d); newChild->m_nextSibling = t->m_firstChild; t->m_firstChild = newChild; break; } } }
/*
 * Prints every word stored in the BK-tree rooted at `root` whose edit
 * distance to `word` is at most `max` (the tolerance defined elsewhere in
 * this file), one word per line on stdout.
 *
 * root  subtree to search; NULL is accepted and prints nothing.
 * word  query string handed to EditDistance.
 *
 * Only children filed under a distance in [d - max, d + max] can hold a match
 * (triangle inequality), where d is this node's distance to the query. The
 * lower bound is clamped to 1: when d - max drops below 1 the low end of the
 * window is still live, so the children must be searched rather than skipped.
 */
void suggestion(struct bkTree*root,char word[])
{
    if (!root)
        return;

    int d = EditDistance(root->str, word);

    /* Anything inside the tolerance is a suggestion, not only words sitting
       exactly at distance `max`. */
    if (d <= max)
        printf("%s\n", root->str);

    int lo = d - max;
    if (lo < 1)
        lo = 1;
    int hi = d + max;

    /* NOTE(review): assumes childNode[] has at least d + max + 1 slots, as
       the original loop did - confirm against the struct definition. */
    for (int i = lo; i <= hi; i++)
        suggestion(root->childNode[i], word);
}
int BkTree::getWithinDistance(std::string center, size_t k) { if( !m_root ) return 0; int found = 0; std::queue< Node* > q; q.push( m_root ); while( !q.empty() ) { Node *t = q.front(); q.pop(); size_t d = EditDistance( t->m_item, center ); if( d <= k ) { found++; } Node *ch = t->m_firstChild; while( ch ) { if( d - k <= ch->m_distToParent && ch->m_distToParent <= d + k ) q.push(ch); ch = ch->m_nextSibling; } } return found; }
char* SpellFix(char* originalWord,int start,uint64 posflags,int language) { size_t len = strlen(originalWord); if (len >= 100 || len == 0) return NULL; if (IsDigit(*originalWord)) return NULL; // number-based words and numbers must be treated elsewhere char letterLow = GetLowercaseData(*originalWord); char letterHigh = GetUppercaseData(*originalWord); bool hasUnderscore = (strchr(originalWord,'_')) ? true : false; bool isUpper = IsUpperCase(originalWord[0]); if (IsUpperCase(originalWord[1])) isUpper = false; // not if all caps if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"Spell: %s\r\n",originalWord); char word[MAX_WORD_SIZE]; MakeLowerCopy(word,originalWord); // mark positions of the letters and make lower case char base[257]; memset(base,0,257); char* ptr = word - 1; char c; int position = 0; while ((c = *++ptr) && position < 255) { base[position++ + 1] = GetLowercaseData(c); } // Priority is to a word that looks like what the user typed, because the user probably would have noticed if it didnt and changed it. So add/delete has priority over tranform WORDP choices[4000]; WORDP bestGuess[4000]; unsigned int index = 0; unsigned int bestGuessindex = 0; int min = 30; unsigned char realWordLetterCounts[LETTERMAX]; memset(realWordLetterCounts,0,LETTERMAX); for (int i = 0; i < (int)len; ++i) ++realWordLetterCounts[(unsigned char)letterIndexData[(unsigned char)word[i]]]; // compute number of each kind of character uint64 pos = PART_OF_SPEECH; // all pos allowed WORDP D; if (posflags == PART_OF_SPEECH && start < wordCount) // see if we can restrict word based on next word { D = FindWord(wordStarts[start+1],0,PRIMARY_CASE_ALLOWED); uint64 flags = (D) ? 
D->properties : (-1); // if we dont know the word, it could be anything if (flags & PREPOSITION) pos &= -1 ^ (PREPOSITION|NOUN); // prep cannot be preceeded by noun or prep if (!(flags & (PREPOSITION|VERB|CONJUNCTION|ADVERB)) && flags & DETERMINER) pos &= -1 ^ (DETERMINER|ADJECTIVE|NOUN|ADJECTIVE_NUMBER|NOUN_NUMBER); // determiner cannot be preceeded by noun determiner adjective if (!(flags & (PREPOSITION|VERB|CONJUNCTION|DETERMINER|ADVERB)) && flags & ADJECTIVE) pos &= -1 ^ (NOUN); if (!(flags & (PREPOSITION|NOUN|CONJUNCTION|DETERMINER|ADVERB|ADJECTIVE)) && flags & VERB) pos &= -1 ^ (VERB); // we know all helper verbs we might be if (D && *D->word == '\'' && D->word[1] == 's' ) pos &= NOUN; // we can only be a noun if possessive - contracted 's should already be removed by now } if (posflags == PART_OF_SPEECH && start > 1) { D = FindWord(wordStarts[start-1],0,PRIMARY_CASE_ALLOWED); uint64 flags = (D) ? D->properties : (-1); // if we dont know the word, it could be anything if (flags & DETERMINER) pos &= -1 ^ (VERB|CONJUNCTION|PREPOSITION|DETERMINER); } posflags &= pos; // if pos types are known and restricted and dont match static int range[] = {0,-1,1,-2,2}; for (unsigned int i = 0; i < 5; ++i) { if (language == ENGLISH && i >= 3) break; // only allow +-2 for spanish MEANING offset = lengthLists[len + range[i]]; if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"\r\n Begin offset %d\r\n",i); while (offset) { D = Meaning2Word(offset); offset = D->spellNode; if (PART_OF_SPEECH == posflags && D->systemFlags & PATTERN_WORD){;} // legal generic match else if (!(D->properties & posflags)) continue; // wrong kind of word if (*D->word != letterLow && *D->word != letterHigh && language == ENGLISH) continue; // we assume no one misspells starting letter char* under = strchr(D->word,'_'); // SPELLING lists have no underscore or space words in them if (hasUnderscore && !under) continue; // require keep any underscore if (!hasUnderscore && under) continue; // require not 
have any underscore if (isUpper && !(D->internalBits & UPPERCASE_HASH) && start != 1) continue; // dont spell check to lower a word in upper int val = EditDistance(D, D->length, len, base+1,min,realWordLetterCounts,language); if (val <= min) // as good or better { if (val < min) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Better: %s against %s value: %d\r\n",D->word,originalWord,val); index = 0; min = val; } else if ( val == min && trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Equal: %s against %s value: %d\r\n",D->word,originalWord,val); if (!(D->internalBits & BEEN_HERE)) { choices[index++] = D; if (index > 3998) break; AddInternalFlag(D,BEEN_HERE); } } } } // try endings ing, s, etc if (start) // no stem spell if COMING from a stem spell attempt (start == 0) { char* stem = StemSpell(word,start); if (stem) { WORDP D = FindWord(stem,0,(tokenControl & ONLY_LOWERCASE) ? PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP); if (D) { for (unsigned int j = 0; j < index; ++j) { if (choices[j] == D) // already in our list { D = NULL; break; } } } if (D) choices[index++] = D; } } if (!index) return NULL; // take our guesses, and pick the most common (earliest learned or most frequently used) word uint64 commonmin = 0; bestGuess[0] = NULL; for (unsigned int j = 0; j < index; ++j) RemoveInternalFlag(choices[j],BEEN_HERE); if (index == 1) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Single best spell: %s\r\n",choices[0]->word); return choices[0]->word; // pick the one } for (unsigned int j = 0; j < index; ++j) { uint64 common = choices[j]->systemFlags & COMMONNESS; if (common < commonmin) continue; if (choices[j]->internalBits & UPPERCASE_HASH && index > 1) continue; // ignore proper names for spell better when some other choice exists if (common > commonmin) { commonmin = common; bestGuessindex = 0; } bestGuess[bestGuessindex++] = choices[j]; } if (bestGuessindex) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Pick spell: %s\r\n",bestGuess[0]->word); 
return bestGuess[0]->word; } return NULL; }
char* SpellFix(char* originalWord,int start,uint64 posflags) { multichoice = false; char word[MAX_WORD_SIZE]; MakeLowerCopy(word, originalWord); char word1[MAX_WORD_SIZE]; MakeUpperCopy(word1, originalWord); WORDINFO realWordData; ComputeWordData(word, &realWordData); if (realWordData.bytelen >= 100 || realWordData.bytelen == 0) return NULL; if (IsDigit(*originalWord)) return NULL; // number-based words and numbers must be treated elsewhere char letterLow = *word; char letterHigh = *word1; bool hasUnderscore = (strchr(originalWord,'_')) ? true : false; bool isUpper = IsUpperCase(originalWord[0]); if (IsUpperCase(originalWord[1])) isUpper = false; // not if all caps if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"Spell: %s\r\n",originalWord); // Priority is to a word that looks like what the user typed, because the user probably would have noticed if it didnt and changed it. So add/delete has priority over tranform WORDP choices[4000]; WORDP bestGuess[4000]; unsigned int index = 0; unsigned int bestGuessindex = 0; int min = 35; // allow 2 changes as needed uint64 pos = PART_OF_SPEECH; // all pos allowed WORDP D; if (posflags == PART_OF_SPEECH && start < wordCount) // see if we can restrict word based on next word { D = FindWord(wordStarts[start+1],0,PRIMARY_CASE_ALLOWED); uint64 flags = (D) ? 
D->properties : (-1ull); // if we dont know the word, it could be anything if ((flags & PART_OF_SPEECH) == PREPOSITION) pos &= -1 ^ (PREPOSITION | NOUN); // prep cannot be preceeded by noun or prep if (!(flags & (PREPOSITION | VERB | CONJUNCTION | ADVERB)) && flags & DETERMINER) pos &= -1 ^ (DETERMINER | ADJECTIVE | NOUN | ADJECTIVE_NUMBER | NOUN_NUMBER); // determiner cannot be preceeded by noun determiner adjective if (!(flags & (PREPOSITION | VERB | CONJUNCTION | DETERMINER | ADVERB)) && flags & ADJECTIVE) pos &= -1 ^ (NOUN); if (!(flags & (PREPOSITION | NOUN | CONJUNCTION | DETERMINER | ADVERB | ADJECTIVE)) && flags & VERB) pos &= -1 ^ (VERB); // we know all helper verbs we might be if (D && *D->word == '\'' && D->word[1] == 's' ) pos &= NOUN; // we can only be a noun if possessive - contracted 's should already be removed by now } if (posflags == PART_OF_SPEECH && start > 1) { D = FindWord(wordStarts[start-1],0,PRIMARY_CASE_ALLOWED); uint64 flags = (D) ? D->properties : (-1); // if we dont know the word, it could be anything if (flags & DETERMINER) pos &= -1 ^ (VERB|CONJUNCTION|PREPOSITION|DETERMINER); } posflags &= pos; // if pos types are known and restricted and dont match static int range[] = {0,-1,1,-2,2}; for (unsigned int i = 0; i < 5; ++i) { if (i >= 3) break; MEANING offset = lengthLists[realWordData.charlen + range[i]]; if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"\r\n Begin offset %d\r\n",i); while (offset) { D = Meaning2Word(offset); offset = D->spellNode; if (PART_OF_SPEECH == posflags && D->systemFlags & PATTERN_WORD){;} // legal generic match else if (!(D->properties & posflags)) continue; // wrong kind of word char* under = strchr(D->word,'_'); // SPELLING lists have no underscore or space words in them if (hasUnderscore && !under) continue; // require keep any underscore if (!hasUnderscore && under) continue; // require not have any underscore if (isUpper && !(D->internalBits & UPPERCASE_HASH) && start != 1) continue; // dont spell 
check to lower a word in upper WORDINFO dictWordData; ComputeWordData(D->word, &dictWordData); int val = EditDistance(dictWordData, realWordData, min); if (val <= min) // as good or better { if (val < min) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Better: %s against %s value: %d\r\n",D->word,originalWord,val); index = 0; min = val; } else if ( val == min && trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Equal: %s against %s value: %d\r\n",D->word,originalWord,val); if (!(D->internalBits & BEEN_HERE)) { choices[index++] = D; if (index > 3998) break; AddInternalFlag(D,BEEN_HERE); } } } } // try endings ing, s, etc if (start && !index && !stricmp(language,"english")) // no stem spell if COMING from a stem spell attempt (start == 0) or we have a good guess already { uint64 flags = 0; char* stem = StemSpell(word,start,flags); if (stem) { WORDP X = StoreWord(stem,flags); if (X) choices[index++] = X; } } if (!index) return NULL; if (index > 1) multichoice = true; // take our guesses, and pick the most common (earliest learned or most frequently used) word uint64 commonmin = 0; bestGuess[0] = NULL; for (unsigned int j = 0; j < index; ++j) RemoveInternalFlag(choices[j],BEEN_HERE); if (index == 1) { if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Single best spell: %s\r\n",choices[0]->word); return choices[0]->word; // pick the one } for (unsigned int j = 0; j < index; ++j) { uint64 common = choices[j]->systemFlags & COMMONNESS; if (common < commonmin) continue; if (choices[j]->internalBits & UPPERCASE_HASH && index > 1) continue; // ignore proper names for spell better when some other choice exists if (common > commonmin) // this one is more common { commonmin = common; bestGuessindex = 0; } bestGuess[bestGuessindex++] = choices[j]; } if (bestGuessindex) { if (bestGuessindex > 1) multichoice = true; if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)" Pick spell: %s\r\n",bestGuess[0]->word); return bestGuess[0]->word; } return NULL; }