예제 #1
0
static bool FindPartialInSentenceTest(char* test, int start,int originalstart,bool reverse,
	int& actualStart, int& actualEnd)
{
	if (!test || !*test) return false;
	if (reverse)
	{
		for ( int i = originalstart-1; i >= 1; --i) // can this be found in sentence backwards
		{
			char word[MAX_WORD_SIZE];
			MakeLowerCopy(word,wordStarts[i]);
			if (unmarked[i] || !MatchesPattern(word,test)) continue;	// if universally unmarked, skip it. Or if they dont match
			// we have a match of a word
			actualStart = i;
			actualEnd = i;
			return true;
		}
	}
	else 
	{
		for (int i = start+1; i <= wordCount; ++i) // can this be found in sentence
		{
			char word[MAX_WORD_SIZE];
			MakeLowerCopy(word,wordStarts[i]);
			if (unmarked[i] || !MatchesPattern(word,test)) continue;	// if universally unmarked, skip it. Or if they dont match
			// we have a match of a word
			actualStart = i;
			actualEnd = i;
			return true;
		}
	}
	return false;
}
예제 #2
0
void Login(char* caller,char* usee,char* ip) //   select the participants
{
    if (*usee) strcpy(computerID,usee);
	if (!*computerID) ReadComputerID(); //   we are defaulting the chatee

	//   for topic access validation
	computerIDwSpace[0] = ' ';
	MakeLowerCopy(computerIDwSpace+1,computerID);
	strcat(computerIDwSpace," ");

	if ( *caller == '.' && ip) // make unique with IP adddress at end
	{
		char name[MAX_WORD_SIZE];
		strcpy(name,caller);
		sprintf(caller,"%s%s",ip,name);
	}

	// alternate to .user for non-unique internet systems, just make them guest with ip at end
	if (!stricmp(caller,"guest") && ip)
	{
		char word[MAX_WORD_SIZE];
		strcpy(word,ip);
		char* ptr;
		while ((ptr = strchr(word,'.'))) *ptr = '_';	// purify for file system
		sprintf(caller,"guest%s",ip);
	}

    //   prepare for chat
    PartialLogin(caller,ip);
 }
예제 #3
0
char* SpellFix(char* originalWord,int start,uint64 posflags,int language)
{
    size_t len = strlen(originalWord);
	if (len >= 100 || len == 0) return NULL;
	if (IsDigit(*originalWord)) return NULL; // number-based words and numbers must be treated elsewhere
	char letterLow = GetLowercaseData(*originalWord);
	char letterHigh = GetUppercaseData(*originalWord);
	bool hasUnderscore = (strchr(originalWord,'_')) ? true : false;
	bool isUpper = IsUpperCase(originalWord[0]);
	if (IsUpperCase(originalWord[1])) isUpper = false;	// not if all caps
	if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"Spell: %s\r\n",originalWord);

	char word[MAX_WORD_SIZE];
	MakeLowerCopy(word,originalWord);

	// mark positions of the letters and make lower case
    char base[257];
    memset(base,0,257);
    char* ptr = word - 1;
    char c;
    int position = 0;
    while ((c = *++ptr) && position < 255)
    {
        base[position++ + 1] = GetLowercaseData(c);
   }

	//   Priority is to a word that looks like what the user typed, because the user probably would have noticed if it didnt and changed it. So add/delete  has priority over tranform
    WORDP choices[4000];
    WORDP bestGuess[4000];
    unsigned int index = 0;
    unsigned int bestGuessindex = 0;
    int min = 30;
	unsigned char realWordLetterCounts[LETTERMAX];
	memset(realWordLetterCounts,0,LETTERMAX); 
	for (int  i = 0; i < (int)len; ++i)  ++realWordLetterCounts[(unsigned char)letterIndexData[(unsigned char)word[i]]]; // compute number of each kind of character
	
	uint64  pos = PART_OF_SPEECH;  // all pos allowed
    WORDP D;
    if (posflags == PART_OF_SPEECH && start < wordCount) // see if we can restrict word based on next word
    {
        D = FindWord(wordStarts[start+1],0,PRIMARY_CASE_ALLOWED);
        uint64 flags = (D) ? D->properties : (-1); //   if we dont know the word, it could be anything
        if (flags & PREPOSITION) pos &= -1 ^ (PREPOSITION|NOUN);   //   prep cannot be preceeded by noun or prep
        if (!(flags & (PREPOSITION|VERB|CONJUNCTION|ADVERB)) && flags & DETERMINER) pos &= -1 ^ (DETERMINER|ADJECTIVE|NOUN|ADJECTIVE_NUMBER|NOUN_NUMBER); //   determiner cannot be preceeded by noun determiner adjective
        if (!(flags & (PREPOSITION|VERB|CONJUNCTION|DETERMINER|ADVERB)) && flags & ADJECTIVE) pos &= -1 ^ (NOUN); 
        if (!(flags & (PREPOSITION|NOUN|CONJUNCTION|DETERMINER|ADVERB|ADJECTIVE)) && flags & VERB) pos &= -1 ^ (VERB); //   we know all helper verbs we might be
        if (D && *D->word == '\'' && D->word[1] == 's' ) pos &= NOUN;    //   we can only be a noun if possessive - contracted 's should already be removed by now
    }
    if (posflags == PART_OF_SPEECH && start > 1)
    {
        D = FindWord(wordStarts[start-1],0,PRIMARY_CASE_ALLOWED);
        uint64 flags = (D) ? D->properties : (-1); // if we dont know the word, it could be anything
        if (flags & DETERMINER) pos &= -1 ^ (VERB|CONJUNCTION|PREPOSITION|DETERMINER);  
    }
    posflags &= pos; //   if pos types are known and restricted and dont match
	static int range[] = {0,-1,1,-2,2};
	for (unsigned int i = 0; i < 5; ++i)
	{
		if (language == ENGLISH && i >= 3) break;	// only allow +-2 for spanish
		MEANING offset = lengthLists[len + range[i]];
		if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"\r\n  Begin offset %d\r\n",i);
		while (offset)
		{
			D = Meaning2Word(offset);
			offset = D->spellNode;
			if (PART_OF_SPEECH == posflags  && D->systemFlags & PATTERN_WORD){;} // legal generic match
			else if (!(D->properties & posflags)) continue; // wrong kind of word
			if (*D->word != letterLow && *D->word != letterHigh && language == ENGLISH) continue;	// we assume no one misspells starting letter
			char* under = strchr(D->word,'_');
			// SPELLING lists have no underscore or space words in them
			if (hasUnderscore && !under) continue;	 // require keep any underscore
			if (!hasUnderscore && under) continue;	 // require not have any underscore
			if (isUpper && !(D->internalBits & UPPERCASE_HASH) && start != 1) continue;	// dont spell check to lower a word in upper
			int val = EditDistance(D, D->length, len, base+1,min,realWordLetterCounts,language);
			if (val <= min) // as good or better
			{
				if (val < min)
				{
					if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"    Better: %s against %s value: %d\r\n",D->word,originalWord,val);
					index = 0;
					min = val;
				}
				else if ( val == min && trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"    Equal: %s against %s value: %d\r\n",D->word,originalWord,val);

				if (!(D->internalBits & BEEN_HERE)) 
				{
					choices[index++] = D;
					if (index > 3998) break; 
					AddInternalFlag(D,BEEN_HERE);
				}
			}
		}
	}
	// try endings ing, s, etc
	if (start) // no stem spell if COMING from a stem spell attempt (start == 0)
	{
		char* stem = StemSpell(word,start);
		if (stem) 
		{
			WORDP D = FindWord(stem,0,(tokenControl & ONLY_LOWERCASE) ?  PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP);
			if (D) 
			{
				for (unsigned int j = 0; j < index; ++j) 
				{
					if (choices[j] == D) // already in our list
					{
						D = NULL; 
						break;
					}
				}
			}
			if (D) choices[index++] = D;
		}
	}

    if (!index)  return NULL; 

	// take our guesses, and pick the most common (earliest learned or most frequently used) word
    uint64 commonmin = 0;
    bestGuess[0] = NULL;
	for (unsigned int j = 0; j < index; ++j) RemoveInternalFlag(choices[j],BEEN_HERE);
    if (index == 1) 
	{
		if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"    Single best spell: %s\r\n",choices[0]->word);
		return choices[0]->word;	// pick the one
	}
    for (unsigned int j = 0; j < index; ++j) 
    {
        uint64 common = choices[j]->systemFlags & COMMONNESS;
        if (common < commonmin) continue;
		if (choices[j]->internalBits & UPPERCASE_HASH && index > 1) continue;	// ignore proper names for spell better when some other choice exists
        if (common > commonmin)
        {
            commonmin = common;
            bestGuessindex = 0;
        }
        bestGuess[bestGuessindex++] = choices[j];
    }
	if (bestGuessindex) 
	{
		if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"    Pick spell: %s\r\n",bestGuess[0]->word);
		return bestGuess[0]->word; 
	}
	return NULL;
}
예제 #4
0
static int EditDistance(WORDP D, unsigned int size, unsigned int inputLen, char* inputSet, int min,
		unsigned char realWordLetterCounts[LETTERMAX], int language)
{//   dictword has no underscores, inputSet is already lower case
    char dictw[MAX_WORD_SIZE];
    MakeLowerCopy(dictw,D->word);
    char* dictinfo = dictw;
    char* dictstart = dictinfo;
	char* inputstart = inputSet;
    int val = 0; //   a difference in length will manifest as a difference in letter count
    //   how many changes  (change a letter, transpose adj letters, insert letter, drop letter)
    if (size != inputLen) 
	{
		val += (size < inputLen) ? 5 : 2;	// real word is shorter than what they typed, not so likely as longer
		if (size < 7) val += 3;	
	}
    if (val > min) return 60;	// fast abort
	
	// match off how many letter counts are correct between the two, need to be close enough to bother with
	unsigned char dictWordLetterSet[LETTERMAX];
 	memset(dictWordLetterSet,0,LETTERMAX); 
	for (unsigned int  i = 0; i < size; ++i) 
	{
		int index = letterIndexData[(unsigned char)dictinfo[i]];
		++dictWordLetterSet[index]; // computer number of each kind of letter
	}
	unsigned int count = 0;
	for (unsigned int  i = 0; i < LETTERMAX; ++i) // count how many letters are the same in both words
	{
		if (dictWordLetterSet[i]) // revised word has these many
		{
			int diff = dictWordLetterSet[i] - realWordLetterCounts[i]; // how many of ours does real have?
			if (diff < 0) count += dictWordLetterSet[i]; // he has more than we have, he gets credit for ours he does have
			else count += dictWordLetterSet[i] - diff; // he has <= what we have, count them
		}
	}
	unsigned int countVariation = size - ((size > 7) ? 3 : 2); // since size >= 2, this is always >= 0
	if (count < countVariation  && language == ENGLISH)  return 60;	// need most letters be in common
	if (count == size && language == ENGLISH)  // same letters (though he may have excess) --  how many transposes
	{
		unsigned int bad = 0;
		for (unsigned int i = 0; i < size; ++i) if (dictinfo[i] != inputSet[i]) ++bad;
		if (size != inputLen){;}
		else if (bad <= 2) return val + 3; // 1 transpose
		else if (bad <= 4) return val + 9; // 2 transpose
		else return val + 38; // many transpose
    }
	
	// now look at specific letter errors
    char* dictend = dictinfo+size;
    char* inputend = inputSet+inputLen;
	count = 0;
    while (ALWAYS)
    {
		++count;
        if (*dictinfo == *inputSet) // match
        {
            if (inputSet == inputend && dictinfo == dictend) break;    // ended
            ++inputSet;
            ++dictinfo;
            continue;
        }
        if (inputSet == inputend || dictinfo == dictend) // one ending, other has to catch up by adding a letter
        {
            if (inputSet == inputend) ++dictinfo;
            else ++inputSet;
            val += 6;
            continue;
        }

        //   letter match failed
		
        // can we change an accented letter forward to another similar letter without accent
		if (*dictinfo == 0xc3)
		{
			bool accent = false;
			if (*inputSet == 'a' && (dictinfo[1] >= 0xa0 && dictinfo[1] <= 0xa5 )) accent = true;
			else if (*inputSet == 'e' && (dictinfo[1] >= 0xa8 && dictinfo[1] <= 0xab )) accent = true;
			else if (*inputSet == 'i' &&  (dictinfo[1] >= 0xac && dictinfo[1] <= 0xaf )) accent = true;
			else if (*inputSet == 'o' && (dictinfo[1] >= 0xb2 && dictinfo[1] <= 0xb6 )) accent = true;
			else if (*inputSet == 'u' && (dictinfo[1] >= 0xb9 && dictinfo[1] <= 0xbc )) accent = true;
			if (accent)
			{
				++dictinfo;
				++dictinfo; // double unicode
				++inputSet;
				continue;
			}
		}
		  //   first and last letter errors are rare, more likely to get them right
		if (dictinfo == dictstart && *dictstart != *inputstart && language == ENGLISH) val += 6; // costs a lot  to change first letter, odds are he types that right 
		if (dictinfo[1] == 0 &&  inputSet[1] == 0 &&  *dictinfo != *inputSet) val += 6; // costs more to change last letter, odds are he types that right or sees its wrong
  
        //   try to resynch series and reduce cost of a transposition of adj letters  
        if (*dictinfo == inputSet[1] && dictinfo[1] == *inputSet) // transpose 
        {
			if (dictinfo[2] == inputSet[2]) // they match after, so transpose is pretty likely
			{
				val += 4;  
				if (dictinfo[2]) // not at end, skip the letter in synch for speed
				{
					++dictinfo;
					++inputSet;
				}
			}
			else val += 8;  // transposed maybe good, assume it is
   			dictinfo += 2;
			inputSet += 2;
		}
        else if (*dictinfo == inputSet[1]) // current dict letter matches matches his next input letter, so maybe his input inserted a char here and need to delete it 
        {
            char* prior = inputSet-1; // potential extraneous letter
            if (*prior == *inputSet) val += 5; // low cost for dropping an excess repeated letter - start of word is prepadded with 0 for prior char
            else if (*inputSet == '-') val += 3; //   very low cost for removing a hypen 
            else if (inputSet+1 == inputend && *inputSet == 's') val += 30;    // losing a trailing s is almost not acceptable
            else val += 9; //  high cost removing an extra letter, but not as much as having to change it
            ++inputSet;
		}
        else if (dictinfo[1] == *inputSet) // next dict leter matches current input letter, so maybe his input deleted a char here and needs to insert  it
        {
            char* prior = (dictinfo == dictstart) ? (char*)" " : (dictinfo-1);
            if (*dictinfo == *prior  && !IsVowel(*dictinfo )) val += 5; 
            else if (IsVowel(*dictinfo ))  val += 1; //  low cost for missing a vowel ( already charged for short input), might be a texting abbreviation
            else val += 9; // high cost for deleting a character, but not as much as changing it
            ++dictinfo;
       }
       else //   this has no valid neighbors.  alter it to be the correct, but charge for multiple occurences
       {
			if (count == 1 && *dictinfo != *inputSet && language == ENGLISH) val += 30; //costs a lot to change the first letter, odds are he types that right or sees its wrong
			//  2 in a row are bad, check for a substituted vowel sound
			bool swap = false;
			int oldval = val;
			if (dictinfo[1] != inputSet[1]) // do multicharacter transformations
			{
				if (language == SPANISH) // ch-x | qu-k | c-k | do-o | b-v | bue-w | vue-w | z-s | s-c | h- | y-i | y-ll | m-n  1st is valid
				{
					if (*inputSet == 'c' && *dictinfo == 'k') 
					{
						dictinfo += 1;
						inputSet += 1;
						continue;
					}
					if (*inputSet == 'b' && *dictinfo == 'v') 
					{
						dictinfo += 1;
						inputSet += 1;
						continue;
					}
					if (*inputSet == 'v' && *dictinfo == 'b') 
					{
						dictinfo += 1;
						inputSet += 1;
						continue;
					}
					if (*inputSet == 'z' && *dictinfo == 's') 
					{
						dictinfo += 1;
						inputSet += 1;
						continue;
					}
					if (*inputSet == 's' && *dictinfo == 'c') 
					{
						dictinfo += 1;
						inputSet += 1;
						continue;
					}
					if (*inputSet == 'y' && *dictinfo == 'i') 
					{
						dictinfo += 1;
						inputSet += 1;
						continue;
					}
					if (*inputSet == 'm' && *dictinfo == 'n') 
					{
						dictinfo += 1;
						inputSet += 1;
						continue;
					}
					if (*inputSet == 'n' && *dictinfo == 'm') 
					{
						dictinfo += 1;
						inputSet += 1;
						continue;
					}
					if (*dictinfo == 'h') 
					{
						dictinfo += 1;
						continue;
					}
					if (*inputSet == 'x' && !strncmp(dictinfo,(char*)"ch",2)) 
					{
						dictinfo += 2;
						inputSet += 1;
						val -= (size < inputLen) ? 5 : 2;
						if (size < 7) val -= 3;	
						if (val < 0) val = 0;
						continue;
					}
					if (*inputSet == 'k' && !strncmp(dictinfo,(char*)"qu",2)) 
					{
						dictinfo += 2;
						inputSet += 1;
						val -= (size < inputLen) ? 5 : 2;
						if (size < 7) val -= 3;	
						if (val < 0) val = 0;
						continue;
					}
					if (*inputSet == 'o' && !strncmp(dictinfo,(char*)"do",2) && !inputSet[1] && !dictinfo[2]) // at end
					{
						dictinfo += 2;
						inputSet += 1;
						val -= (size < inputLen) ? 5 : 2;
						if (size < 7) val -= 3;	
						if (val < 0) val = 0;
						continue;
					}
					if (*inputSet == 'w' && !strncmp(dictinfo,(char*)"bue",3)) 
					{
						dictinfo += 3;
						inputSet += 1;
						val -= (size < inputLen) ? 5 : 2;
						if (size < 7) val -= 3;	
						if (val < 0) val = 0;
						continue;
					}
					if (*inputSet == 'w' && !strncmp(dictinfo,(char*)"vue",3)) 
					{
						dictinfo += 3;
						inputSet += 1;
						val -= (size < inputLen) ? 5 : 2;
						if (size < 7) val -= 3;	
						if (val < 0) val = 0;
						continue;
					}
					if (!strncmp(inputSet,(char*)"ll",2) && *dictinfo == 'y') 
					{
						inputSet += 2;
						dictinfo += 1;
						val -= (size < inputLen) ? 5 : 2;
						if (size < 7) val -= 3;	
						if (val < 0) val = 0;
						continue;
					}
					if (*inputSet == 'y' && *dictinfo == 'l' && dictinfo[1] == 'l') 
					{
						inputSet += 1;
						dictinfo += 2;
						val -= (size < inputLen) ? 5 : 2;
						if (size < 7) val -= 3;	
						if (val < 0) val = 0;
						continue;
					}
				}

				if (*inputSet == 't' && !strncmp(dictinfo,(char*)"ght",3)) 
				{
                    dictinfo += 3;
                    inputSet += 1;
                    val += 5;  
				}
				else if (!strncmp(inputSet,(char*)"ci",2) && !strncmp(dictinfo,(char*)"cki",3)) 
				{
                    dictinfo += 3;
                    inputSet += 2;
                    val += 5;
				}
				else if (*(dictinfo-1) == 'a' && !strcmp(dictinfo,(char*)"ir") && !strcmp(inputSet,(char*)"re")) // prepair prepare as terminal sound
				{
                    dictinfo += 2;
                    inputSet += 2;
                    val += 3;
				}
				else if (!strncmp(inputSet,(char*)"ous",3) && !strncmp(dictinfo,(char*)"eous",4)) 
				{
                    dictinfo += 4;
                    inputSet += 3;
                    val += 5; 
               }
              else if (!strncmp(inputSet,(char*)"of",2) && !strncmp(dictinfo,(char*)"oph",3)) 
               {
                    dictinfo += 3;
                    inputSet += 2;
                    val += 5; 
               }
			else if (*dictinfo == 'x' && !strncmp(inputSet,(char*)"cks",3)) 
               {
                    dictinfo += 1;
                    inputSet += 3;
                    val += 5; 
               }
               else if (*inputSet == 'k' && !strncmp(dictinfo,(char*)"qu",2)) 
               {
                    dictinfo += 2;
                    inputSet += 1;
                    val += 5;  
               }
			   if (oldval != val){;} // swallowed a multiple letter sound change
               else if (!strncmp(dictinfo,(char*)"able",4) && !strncmp(inputSet,(char*)"ible",4)) swap = true;
               else if (!strncmp(dictinfo,(char*)"ible",4) && !strncmp(inputSet,(char*)"able",4)) swap = true;
               else if (*dictinfo == 'a' && dictinfo[1] == 'y'     && *inputSet == 'e' && inputSet[1] == 'i') swap = true;
               else if (*dictinfo == 'e' && dictinfo[1] == 'a'     && *inputSet == 'e' && inputSet[1] == 'e') swap = true;
               else if (*dictinfo == 'e' && dictinfo[1] == 'e'     && *inputSet == 'e' && inputSet[1] == 'a') swap = true;
               else if (*dictinfo == 'e' && dictinfo[1] == 'e'     && *inputSet == 'i' && inputSet[1] == 'e') swap = true;
               else if (*dictinfo == 'e' && dictinfo[1] == 'i'     && *inputSet == 'a' && inputSet[1] == 'y') swap = true;
               else if (*dictinfo == 'e' && dictinfo[1] == 'u'     && *inputSet == 'o' && inputSet[1] == 'o') swap = true;
               else if (*dictinfo == 'e' && dictinfo[1] == 'u'     && *inputSet == 'o' && inputSet[1] == 'u') swap = true;
               else if (*dictinfo == 'i' && dictinfo[1] == 'e'     && *inputSet == 'e' && inputSet[1] == 'e') swap = true;
               else if (*dictinfo == 'o' && dictinfo[1] == 'o'     && *inputSet == 'e' && inputSet[1] == 'u') swap = true;
               else if (*dictinfo == 'o' && dictinfo[1] == 'o'     && *inputSet == 'o' && inputSet[1] == 'u') swap = true;
               else if (*dictinfo == 'o' && dictinfo[1] == 'o'     && *inputSet == 'u' && inputSet[1] == 'i') swap = true;
               else if (*dictinfo == 'o' && dictinfo[1] == 'u'     && *inputSet == 'e' && inputSet[1] == 'u') swap = true;
               else if (*dictinfo == 'o' && dictinfo[1] == 'u'     && *inputSet == 'o' && inputSet[1] == 'o') swap = true;
               else if (*dictinfo == 'u' && dictinfo[1] == 'i'     && *inputSet == 'o' && inputSet[1] == 'o') swap = true;
               if (swap)
               {
                    dictinfo += 2;
                    inputSet += 2;
                    val += 5; 
               }
            } 

            // can we change a letter to another similar letter
            if (oldval == val) 
            {
				bool convert = false;
                if (*dictinfo == 'i' && *inputSet== 'y' && count > 1) convert = true;//   but not as first letter
                else if ((*dictinfo == 's' && *inputSet == 'z') || (*dictinfo == 'z' && *inputSet == 's')) convert = true;
                else if (*dictinfo == 'y' && *inputSet == 'i' && count > 1) convert = true; //   but not as first letter
                else if (*dictinfo == '/' && *inputSet == '-') convert = true;
                else if (inputSet+1 == inputend && *inputSet == 's') val += 30;    //   changing a trailing s is almost not acceptable
                if (convert) val += 5;	// low cost for exchange of similar letter, but dont do it often
                else val += 12;			// changing a letter is expensive, since it destroys the visual image
                ++dictinfo;
                ++inputSet;
            }
       } 
       if (val > min) return val; // too costly, ignore it
    }
    return val;
}
예제 #5
0
bool SpellCheckSentence()
{
	WORDP D,E;
	fixedSpell = false;
	bool lowercase = false;
	int language = ENGLISH;
	char* lang = GetUserVariable((char*)"$cs_language");
	if (lang && !stricmp(lang,(char*)"spanish")) language = SPANISH;
	
	// check for all uppercase
	for (int i = FindOOBEnd(1) + 1; i <= wordCount; ++i) // skip start of sentence
	{
		char* word = wordStarts[i];
		size_t len = strlen(word);
		for (int j = 0; j < (int)len; ++j) 
		{
			if (IsLowerCase(word[j])) 
			{
				lowercase = true;
				i = j = 1000;
			}
		}
	}
	if (!lowercase && wordCount > 2) // must have several words in uppercase
	{
		for (int i = FindOOBEnd(1); i <= wordCount; ++i)
		{
			char* word = wordStarts[i];
			MakeLowerCase(word);
		}
	}

	int startWord = FindOOBEnd(1);
	for (int i = startWord; i <= wordCount; ++i)
	{
		char* word = wordStarts[i];
		if (!word || !word[1] || *word == '"' ) continue; // illegal or single char or quoted thingy 
		size_t len = strlen(word);

		// dont spell check uppercase not at start or joined word
		if (IsUpperCase(word[0]) && (i != startWord || strchr(word,'_')) && tokenControl & NO_PROPER_SPELLCHECK) continue; 
		//  dont  spell check email or other things with @ or . in them
		if (strchr(word,'@') || strchr(word,'.') || strchr(word,'$')) continue;

		// dont spell check names of json objects or arrays
		if (!strnicmp(word,"ja-",3) || !strnicmp(word,"jo-",3)) continue;

		char* known = ProbableKnownWord(word);
		if (known && !strcmp(known,word)) continue;	 // we know it
		if (known && strcmp(known,word)) 
		{
			char* tokens[2];
			if (!IsUpperCase(*known)) // revised the word to lower case (avoid to upper case like "fields" to "Fields"
			{
				WORDP D = FindWord(known,0,LOWERCASE_LOOKUP);
				if (D) 
				{
					tokens[1] = D->word;
					ReplaceWords(i,1,1,tokens);
					fixedSpell = true;
					continue;
				}
			}
			else // is uppercase a concept member? then revise upwards
			{
				WORDP D = FindWord(known,0,UPPERCASE_LOOKUP);
				if (IsConceptMember(D))
				{
					tokens[1] = D->word;
					ReplaceWords(i,1,1,tokens);
					fixedSpell = true;		
					continue;
				}
			}
		}

		char* p = word -1;
		unsigned char c;
		char* hyphen = 0;
		while ((c = *++p) != 0)
		{ 
			++len;
			if (c == '-') hyphen = p; // note is hyphenated - use trailing
		}
		if (len == 0 || GetTemperatureLetter(word)) continue;	// bad ignore utf word or llegal length - also no composite words
		if (c && c != '@' && c != '.') // illegal word character
		{
			if (IsDigit(word[0]) || len == 1){;} // probable numeric?
			// accidental junk on end of word we do know immedately?
			else if (i > 1 && !IsAlphaUTF8OrDigit(wordStarts[i][len-1]) )
			{
				WORDP entry,canonical;
				char word[MAX_WORD_SIZE];
				strcpy(word,wordStarts[i]);
				word[len-1] = 0;
				uint64 sysflags = 0;
				uint64 cansysflags = 0;
				WORDP revise;
				GetPosData(i,word,revise,entry,canonical,sysflags,cansysflags,true,true); // dont create a non-existent word
				if (entry && entry->properties & PART_OF_SPEECH)
				{
					wordStarts[i] = reuseAllocation(wordStarts[i],entry->word);
					fixedSpell = true;
					continue;	// not a legal word character, leave it alone
				}
			}
		}

		// see if we know the other case
		if (!(tokenControl & (ONLY_LOWERCASE|STRICT_CASING)) || (i == startSentence && !(tokenControl & ONLY_LOWERCASE)))
		{
			WORDP E = FindWord(word,0,SECONDARY_CASE_ALLOWED);
			bool useAlternateCase = false;
			if (E && E->systemFlags & PATTERN_WORD) useAlternateCase = true;
			if (E && E->properties & (PART_OF_SPEECH|FOREIGN_WORD))
			{
				// if the word we find is UPPER case, and this might be a lower case noun plural, don't change case.
				size_t len = strlen(word);
				if (word[len-1] == 's' ) 
				{
					WORDP F = FindWord(word,len-1);
					if (!F || !(F->properties & (PART_OF_SPEECH|FOREIGN_WORD))) useAlternateCase = true;
					else continue;
				}
				else useAlternateCase = true;
			}
			else if (E) // does it have a member concept fact
			{
				if (IsConceptMember(E)) 
				{
					useAlternateCase = true;
					break;
				}
			}
			if (useAlternateCase)
			{
				char* tokens[2];
				tokens[1] = E->word;
				ReplaceWords(i,1,1,tokens);
				fixedSpell = true;
				continue;	
			}
		}
		
		// merge with next token?
		char join[MAX_WORD_SIZE * 3];
		if (i != wordCount && *wordStarts[i+1] != '"' )
		{
			// direct merge as a single word
			strcpy(join,word);
			strcat(join,wordStarts[i+1]);
			WORDP D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ?  PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP);

			strcpy(join,word);
			if (!D || !(D->properties & PART_OF_SPEECH) ) // merge these two, except "going to" or wordnet composites of normal words  // merge as a compound word
			{
				strcat(join,(char*)"_");
				strcat(join,wordStarts[i+1]);
				D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ?  PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP);
			}

			if (D && D->properties & PART_OF_SPEECH && !(D->properties & AUX_VERB)) // merge these two, except "going to" or wordnet composites of normal words
			{
				WORDP P1 = FindWord(word,0,LOWERCASE_LOOKUP);
				WORDP P2 = FindWord(wordStarts[i+1],0,LOWERCASE_LOOKUP);
				if (!P1 || !P2 || !(P1->properties & PART_OF_SPEECH) || !(P2->properties & PART_OF_SPEECH)) 
				{
					char* tokens[2];
					tokens[1] = D->word;
					ReplaceWords(i,2,1,tokens);
					fixedSpell = true;
					continue;
				}
			}
		}   

		// break apart slashed pair like eat/feed
		char* slash = strchr(word,'/');
		if (slash && slash != word && slash[1]) //   break apart word/word
		{
			if ((wordCount + 2 ) >= REAL_SENTENCE_LIMIT) continue;	// no room
			*slash = 0;
			D = StoreWord(word);
			*slash = '/';
			E = StoreWord(slash+1);
			char* tokens[4];
			tokens[1] = D->word;
			tokens[2] = "/";
			tokens[3] = E->word;
			ReplaceWords(i,1,3,tokens);
			fixedSpell = true;
			--i;
			continue;
		}

		// see if hypenated word should be separate or joined (ignore obvious adjective suffix)
		if (hyphen &&  !stricmp(hyphen,(char*)"-like"))
		{
			StoreWord(word,ADJECTIVE_NORMAL|ADJECTIVE); // accept it as a word
			continue;
		}
		else if (hyphen && (hyphen-word) > 1)
		{
			char test[MAX_WORD_SIZE];
			char first[MAX_WORD_SIZE];

			// test for split
			*hyphen = 0;
			strcpy(test,hyphen+1);
			strcpy(first,word);
			*hyphen = '-';

			WORDP E = FindWord(test,0,LOWERCASE_LOOKUP);
			WORDP D = FindWord(first,0,LOWERCASE_LOOKUP);
			if (*first == 0) 
			{
				wordStarts[i] = AllocateString(wordStarts[i] + 1); // -pieces  want to lose the leading hypen  (2-pieces)
				fixedSpell = true;
			}
			else if (D && E) //   1st word gets replaced, we added another word after
			{
				if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue;	// no room
				char* tokens[3];
				tokens[1] = D->word;
				tokens[2] = E->word;
				ReplaceWords(i,1,2,tokens);
				fixedSpell = true;
				--i;
			}
			else if (!stricmp(test,(char*)"old") || !stricmp(test,(char*)"olds")) //   break apart 5-year-old
			{
				if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue;	// no room
				D = StoreWord(first);
				E = StoreWord(test);
				char* tokens[3];
				tokens[1] = D->word;
				tokens[2] = E->word;
				ReplaceWords(i,1,2,tokens);
				fixedSpell = true;
				--i;
			}
			else // remove hyphen entirely?
			{
				strcpy(test,first);
				strcat(test,hyphen+1);
				D = FindWord(test,0,(tokenControl & ONLY_LOWERCASE) ?  PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP);
				if (D) 
				{
					wordStarts[i] = D->word;
					fixedSpell = true;
					--i;
				}
			}
			continue; // ignore hypenated errors that we couldnt solve, because no one mistypes a hypen
		}
		
		// leave uppercase in first position if not adjusted yet... but check for lower case spell error
		if (IsUpperCase(word[0])  && tokenControl & NO_PROPER_SPELLCHECK) 
		{
			char lower[MAX_WORD_SIZE];
			MakeLowerCopy(lower,word);
			WORDP D = FindWord(lower,0,LOWERCASE_LOOKUP);
			if (!D && i == startWord)
			{
				char* okword = SpellFix(lower,i,PART_OF_SPEECH,language); 
				if (okword)
				{
					char* tokens[2];
					WORDP E = StoreWord(okword);
					tokens[1] = E->word;
					ReplaceWords(i,1,1,tokens);
					fixedSpell = true;
				}
			}
			continue; 
		}

		if (*word != '\'' && (!FindCanonical(word, i,true) || IsUpperCase(word[0]))) // dont check quoted or findable words unless they are capitalized
		{
			word = SpellCheck(i,language);

			// dont spell check proper names to improper, if word before or after is lower case originally
			if (word && i != 1 && originalCapState[i] && !IsUpperCase(*word))
			{
				if (!originalCapState[i-1]) return false;
				else if (i != wordCount && !originalCapState[i+1]) return false;
			}

			if (word && !*word) // performed substitution on prior word, restart this one
			{
				fixedSpell = true;
				--i;
				continue;
			}
			if (word) 
			{
				char* tokens[2];
				tokens[1] = word;
				ReplaceWords(i,1,1,tokens);
				fixedSpell = true;
				continue;
			}
		}
    }
	return fixedSpell;
}
예제 #6
0
char* ProbableKnownWord(char* word)
{
	if (strchr(word,' ') || strchr(word,'_')) return word; // not user input, is synthesized
	size_t len = strlen(word);

	// do we know the word as is?
	WORDP D = FindWord(word,0,PRIMARY_CASE_ALLOWED);
	if (D) 
	{
		if (D->properties & FOREIGN_WORD || *D->word == '~' || D->systemFlags & PATTERN_WORD) return D->word;	// we know this word clearly or its a concept set ref emotion
		if (D->properties & PART_OF_SPEECH && !IS_NEW_WORD(D)) return D->word; // old word we know
		if (IsConceptMember(D)) return D->word;
		// are there facts using this word? -- issue with facts because on seeing input second time, having made facts of original, we see original
//		if (GetSubjectNondeadHead(D) || GetObjectNondeadHead(D) || GetVerbNondeadHead(D)) return D->word;
	}
	
	char lower[MAX_WORD_SIZE];
	MakeLowerCopy(lower,word);

	// do we know the word in lower case?
	D = FindWord(word,0,LOWERCASE_LOOKUP);
	if (D) // direct recognition
	{
		if (D->properties & FOREIGN_WORD || *D->word == '~' || D->systemFlags & PATTERN_WORD) return D->word;	// we know this word clearly or its a concept set ref emotion
		if (D->properties & PART_OF_SPEECH && !IS_NEW_WORD(D)) return D->word; // old word we know
		if (IsConceptMember(D)) return D->word;

		// are there facts using this word?
//		if (GetSubjectNondeadHead(D) || GetObjectNondeadHead(D) || GetVerbNondeadHead(D)) return D->word;
	}

	// do we know the word in upper case?
	char upper[MAX_WORD_SIZE];
	MakeLowerCopy(upper,word);
	upper[0] = GetUppercaseData(upper[0]);
	D = FindWord(upper,0,UPPERCASE_LOOKUP);
	if (D) // direct recognition
	{
		if (D->properties & FOREIGN_WORD || *D->word == '~' || D->systemFlags & PATTERN_WORD) return D->word;	// we know this word clearly or its a concept set ref emotion
		if (D->properties & PART_OF_SPEECH && !IS_NEW_WORD(D)) return D->word; // old word we know
		if (IsConceptMember(D)) return D->word;

	// are there facts using this word?
//		if (GetSubjectNondeadHead(D) || GetObjectNondeadHead(D) || GetVerbNondeadHead(D)) return D->word;
	}

	// interpolate to lower case words 
	uint64 expectedBase = 0;
	if (ProbableAdjective(word,len,expectedBase) && expectedBase) return word;
	expectedBase = 0;
	if (ProbableAdverb(word,len,expectedBase) && expectedBase) return word;
	// is it a verb form
	char* verb = GetInfinitive(lower,true); // no new verbs
	if (verb) 
	{
		WORDP D =  StoreWord(lower,0); // verb form recognized
		return D->word;
	}
	
	// is it simple plural of a noun?
	if (word[len-1] == 's') 
	{
		WORDP E = FindWord(lower,len-1,LOWERCASE_LOOKUP);
		if (E && E->properties & NOUN) 
		{
			E = StoreWord(word,NOUN|NOUN_PLURAL);
			return E->word;	
		}
		E = FindWord(lower,len-1,UPPERCASE_LOOKUP);
		if (E && E->properties & NOUN) 
		{
			*word = toUppercaseData[*word];
			E = StoreWord(word,NOUN|NOUN_PROPER_PLURAL);
			return E->word;	
		}
	}

	return NULL;
}
예제 #7
0
static int EditDistance(WORDINFO& dictWordData, WORDINFO& realWordData,int min)
{//   dictword has no underscores, inputSet is already lower case
    char dictw[MAX_WORD_SIZE];
    MakeLowerCopy(dictw, dictWordData.word);
    char* dictinfo = dictw;
    char* realinfo = realWordData.word;
    char* dictstart = dictinfo;
	char* realstart = realWordData.word;
    int val = 0; //   a difference in length will manifest as a difference in letter count
	//  look at specific letter errors
    char priorCharDict[10];
    char priorCharReal[10];
    *priorCharDict = *priorCharReal = 0;
    char currentCharReal[10];
    char currentCharDict[10];
    *currentCharReal = *currentCharDict = 0;
    char nextCharReal[10];
    char nextCharDict[10];
    char next1CharReal[10];
    char next1CharDict[10];
    char* resumeReal2;
    char* resumeDict2;
    char* resumeReal;
    char* resumeDict;
    char* resumeReal1;
    char* resumeDict1;
	char baseCharReal;
	char baseCharDict;
    while (ALWAYS)
    {
        if (val > min) return 1000; // no good
        strcpy(priorCharReal, currentCharReal);
        strcpy(priorCharDict, currentCharDict);

        resumeReal = IsUTF8((char*)realinfo, currentCharReal);
        resumeDict = IsUTF8((char*)dictinfo, currentCharDict);
        if (!*currentCharReal && !*currentCharDict) break; //both at end
        if (!*currentCharReal || !*currentCharDict) // one ending, other has to catch up by adding a letter
        {
            val += 16; // add a letter
            if (*priorCharReal == *currentCharDict) val -= 10; // doubling letter at end
            dictinfo = resumeDict;
            realinfo = resumeReal;
            continue;
        }

		// punctuation in a word is bad tokenization, dont spell check it away
		if (*currentCharReal == '?' || *currentCharReal == '!' || *currentCharReal == '('
			|| *currentCharReal == ')' ||  *currentCharReal == '[' || *currentCharReal == ']'
			|| *currentCharReal == '{' || *currentCharReal == '}') return 200; // dont mess with this

        resumeReal1 = IsUTF8((char*)resumeReal, nextCharReal);
        resumeDict1 = IsUTF8((char*)resumeDict, nextCharDict);
        resumeReal2 = IsUTF8((char*)resumeReal1, next1CharReal); // 2 char ahead
        resumeDict2 = IsUTF8((char*)resumeDict1, next1CharDict);
		baseCharReal = UnaccentedChar(currentCharReal);
		baseCharDict = UnaccentedChar(currentCharDict);
		if (!stricmp(currentCharReal, currentCharDict)) // match chars
		{
			dictinfo = resumeDict;
			realinfo = resumeReal;
			continue;
		}
		if (baseCharReal && baseCharReal == baseCharDict)
		{
			dictinfo = resumeDict;
			realinfo = resumeReal;
			val += 1; // minimal charge but separate forms of delivre
			continue;
		}
        // treat german double s and ss equivalent
        if (!stricmp(language, "german"))
        {
            if (*currentCharReal == 0xc3 && currentCharReal[1] == 0x9f && *currentCharDict == 's' && *nextCharDict == 's')
            {
                dictinfo = resumeDict1;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharDict == 0xc3 && currentCharDict[1] == 0x9f && *currentCharReal == 's'  && *nextCharReal == 's')
            {
                dictinfo = resumeDict;
                realinfo = resumeReal1;
                continue;
            }
        }
        // spanish alternative spellings
        if (!stricmp(language, "spanish")) // ch-x | qu-k | c-k | do-o | b-v | bue-w | vue-w | z-s | s-c | h- | y-i | y-ll | m-n  1st is valid
        {
            if (*currentCharReal == 'c' && *currentCharDict == 'k')
            {
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'b' && *currentCharDict == 'v')
            {
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'v' && *currentCharDict == 'b')
            {
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'z' && *currentCharDict == 's')
            {
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 's' && *currentCharDict == 'c')
            {
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'y' && *currentCharDict == 'i')
            {
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'm' && *currentCharDict == 'n')
            {
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'n' && *currentCharDict == 'm')
            {
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharDict == 'h')
            {
                dictinfo = resumeDict;
                continue;
            }
            if (*currentCharReal == 'x' && *currentCharDict == 'c' && *nextCharDict == 'h')
            {
                dictinfo = resumeDict1;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'k' && *currentCharDict == 'q' && *nextCharDict == 'u')
            {
                dictinfo = resumeDict1;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'o' && *currentCharDict == 'd' && *nextCharDict == 'o')
            {
                dictinfo = resumeDict1;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'w' && *currentCharDict == 'b' && *nextCharDict == 'u'  && *next1CharDict == 'e')
            {
                dictinfo = resumeDict2;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'w' && *currentCharDict == 'v' && *nextCharDict == 'u'  && *next1CharDict == 'e')
            {
                dictinfo = resumeDict2;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'l' && *nextCharReal == 'l' && *currentCharDict == 'y')
            {
                dictinfo = resumeDict;
                realinfo = resumeReal1;
                continue;
            }
            if (*currentCharReal == 'y' && *currentCharDict == 'l' && *nextCharDict == 'l')
            {
                dictinfo = resumeDict1;
                realinfo = resumeReal;
                continue;
            }
        }
        // french common bad spellings
        if (!stricmp(language, "french"))
        {
            if (*currentCharReal == 'a' && SameUTF(currentCharDict,"â"))
            {
            		val += 1;
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'e' && SameUTF(currentCharDict,"ê"))
            {
				val += 10;
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 0xc3 && currentCharReal[1] == 0xa8 && SameUTF(currentCharDict,"ê"))
            {
            	val += 5;
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'i' && SameUTF(currentCharDict,"î"))
            {
				val += 1;
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'o' && SameUTF(currentCharDict, "ô"))
            {
					val += 1;
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'u' && SameUTF(currentCharDict, "û"))
            {
			 	val += 5;
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'y' && *currentCharDict == 'l' && *nextCharDict == 'l')
            {
            	val += 10;
                dictinfo = resumeDict1;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'k' && *currentCharDict == 'q' && *nextCharDict == 'u')
            {
            	val += 10;
                dictinfo = resumeDict1;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'f' && *currentCharDict == 'p' && *nextCharDict == 'h')
            {
            	val += 5;
                dictinfo = resumeDict1;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 's' && *currentCharDict == 'c')
            {
            	val += 10;
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 's' && SameUTF(currentCharDict, "ç"))
            {
            		val += 5;
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
            if (*currentCharReal == 'c' && SameUTF(currentCharDict, "ç"))
            {
				val += 5;
                dictinfo = resumeDict;
                realinfo = resumeReal;
                continue;
            }
        }
        // probable transposition since swapping syncs up
        if (!strcmp(currentCharReal, nextCharDict) && !strcmp(nextCharReal, currentCharDict))
        {
            val += 16; // more expensive if next is not correct after transposition
            dictinfo = resumeDict2; // skip ahead 2
            realinfo = resumeReal2;
            continue;
        }

        // probable mistyped letter since next matches up
        if (!strcmp(nextCharReal, nextCharDict))
        {
            val += 16; // more expensive if 2nd next is not correct after transposition
            if (*currentCharReal == 's' && *currentCharDict == 'z') val -= 5;
            else if (*currentCharReal == 'z' && *currentCharDict == 's') val -= 5;
            else if (IsVowel(*currentCharReal) && IsVowel(*currentCharDict))  val -= 6; //  low cost for switching a vowel 

            dictinfo = resumeDict; 
            realinfo = resumeReal;
            continue;
        }
        // probable excess letter by user since next matches up to current
        if (!strcmp(nextCharReal, currentCharDict))
        {
            val += 16;  // only delete 1 letter

            if (*priorCharDict == *currentCharReal) val -= 14; // low cost for dropping an excess repeated letter (wherre->where not wherry)
            else if (*currentCharReal == '-') val -= 10; //   very low cost for removing a hypen 

            dictinfo = resumeDict; // skip over letter we match momentarily
            realinfo = resumeReal1; // move on past our junk letter and match letter
            continue;
        }
        // probable missing letter by user since current matches up to nextdict
        if (!strcmp(nextCharDict, currentCharReal))
        {
            val += 16; // only add 1 letter
            // better to add repeated letter than to drop a letter
            if (*currentCharDict == *priorCharReal) val -= 6; // low cost for adding a repeated letter
            else if (*currentCharDict == 'e' && *nextCharDict == 'o') val -= 10; // yoman->yeoman
            dictinfo = resumeDict1; // skip over letter we match momentarily
            realinfo = resumeReal; // move on past our junk letter and match letter
            continue;
        }
    
        // complete mismatch with no understanding of why, just fix them and move on
        dictinfo = resumeDict; // skip over letter we match momentarily
        realinfo = resumeReal; // move on past our junk letter and match letter
        val += 16;
    }
    return val;
}
예제 #8
0
bool SpellCheckSentence()
{
	WORDP D,E;
	fixedSpell = false;
	bool lowercase = false;
	
	// check for all uppercase (capslock)
	for (int i = FindOOBEnd(1); i <= wordCount; ++i) // skip start of sentence
	{
		char* word = wordStarts[i];
		if (!word[1]) continue; // autoconversion of letters to lower case should be ignored (eg A)
		if (!stricmp(word, "the")) continue;
		size_t len = strlen(word);
		for (int j = 0; j < (int)len; ++j) 
		{
			if (IsLowerCase(word[j])) 
			{
				lowercase = true;
				i = j = len+1000; // len might be BIG (oob data) so make sure beyond it)
			}
		}
	}

	if (!lowercase && wordCount > 2) // must have multiple words all in uppercase
	{
		for (int i = FindOOBEnd(1); i <= wordCount; ++i)
		{
			char* word = wordStarts[i];
			char myword[MAX_WORD_SIZE];
			MakeLowerCopy(myword,word);
			if (strcmp(word, myword))
			{
				char* tokens[2];
				tokens[1] = myword;
				ReplaceWords("caplocWord", i, 1, 1, tokens);
				originalCapState[i] = false;
			}
		}
	}

	int startWord = FindOOBEnd(1);
	for (int i = startWord; i <= wordCount; ++i)
	{
		char* word = wordStarts[i];
		char* tokens[2];

		// change any \ to /
		char newword[MAX_WORD_SIZE];
		bool altered = false;
		if (strlen(word) < MAX_WORD_SIZE)
		{
			strcpy(newword, word);
			char* at = newword;
			while ((at = strchr(at,'\\')))
			{
				*at = '/';
				altered = true;
			}
			if (altered) word = wordStarts[i] = StoreWord(newword, AS_IS)->word;
		}

		if (*word == '\'' && !word[1] && i != startWord && IsDigit(*wordStarts[i - 1]) && !stricmp(language, "english")) // fails if not digit bug
		{
			tokens[1] = (char*)"foot";
			ReplaceWords("' as feet", i, 1, 1, tokens);
			fixedSpell = true;
			continue;
		}
		if (*word == '"' && !word[1] && i != startWord && IsDigit(*wordStarts[i - 1]) && !stricmp(language, "english")) // fails if not digit bug
		{
			tokens[1] = (char*)"inch";
			ReplaceWords("' as feet", i, 1, 1, tokens);
			fixedSpell = true;
			continue;
		}
		if (!word || !word[1] || *word == '"' ) continue; // illegal or single char or quoted thingy 
		size_t len = strlen(word);

		// dont spell check uppercase not at start or joined word
		if (IsUpperCase(word[0]) && (i != startWord || strchr(word,'_')) && tokenControl & NO_PROPER_SPELLCHECK) continue; 
		//  dont  spell check email or other things with @ or . in them
		if (strchr(word,'@') || strchr(word, '&')  || strchr(word,'.') || strchr(word,'$')) continue;

		// dont spell check names of json objects or arrays
		if (!strnicmp(word,"ja-",3) || !strnicmp(word,"jo-",3)) continue;

		// dont spell check web addresses
		if (!strnicmp(word,"http",4) || !strnicmp(word,"www",3)) continue;

		// nor fractions
		if (IsFraction(word))  continue; // fraction?

		// joined number words  like 100dollars
		char* at = word - 1;
		while (IsDigit(*++at) || *at == numberPeriod);
		if (IsDigit(*word) && strlen(at) > 3 && ProbableKnownWord(at))
		{
			char first[MAX_WORD_SIZE];
			strncpy(first, word, (at - word));
			first[at - word] = 0;
			char* tokens[3];
			tokens[1] = first;
			tokens[2] = at;
			ReplaceWords("joined number word", i, 1, 2, tokens);
			continue;
		}

		// nor model numbers
		if (IsModelNumber(word))
		{
			WORDP X = FindWord(word, 0, UPPERCASE_LOOKUP);
			if (IsConceptMember(X) && !strcmp(word,X->word))
			{
				char* tokens[2];
				tokens[1] = X->word;
				ReplaceWords("KnownUpperModelNumber", i, 1, 1, tokens);
				fixedSpell = true;
			}
			continue;
		}

		char* number;
		if (GetCurrency((unsigned char*)word, number)) continue; // currency

		if (!stricmp(word, (char*)"am") && i != startWord && 
			(IsDigit(*wordStarts[i-1]) || IsNumber(wordStarts[i-1]) ==REAL_NUMBER) && !stricmp(language,"english")) // fails if not digit bug
		{
			char* tokens[2];
			tokens[1] = (char*)"a.m.";
			ReplaceWords("am as time", i, 1, 1, tokens);
			fixedSpell = true;
			continue;
		}

		char* known = ProbableKnownWord(word);
		if (known && !strcmp(known,word)) continue;	 // we know it
		if (known && strcmp(known,word)) 
		{
			WORDP D = FindWord(known);
			char* tokens[2];
			if ((!D || !(D->internalBits & UPPERCASE_HASH)) && !IsUpperCase(*known)) // revised the word to lower case (avoid to upper case like "fields" to "Fields"
			{
				WORDP X = FindWord(known,0,LOWERCASE_LOOKUP);
				if (X) 
				{
					tokens[1] = X->word;
					ReplaceWords("KnownWord",i,1,1,tokens);
					fixedSpell = true;
					continue;
				}
			}
			else // is uppercase a concept member? then revise upwards
			{
				WORDP X = FindWord(known,0,UPPERCASE_LOOKUP);
				if (IsConceptMember(X) || stricmp(language,"english")) // all german nouns are uppercase
				{
					tokens[1] = X->word;
					ReplaceWords("KnownUpper",i,1,1,tokens);
					fixedSpell = true;		
					continue;
				}
			}
		}

		char* p = word -1;
		unsigned char c;
		char* hyphen = 0;
		while ((c = *++p) != 0)
		{ 
			++len;
			if (c == '-') hyphen = p; // note is hyphenated - use trailing
		}
		if (len == 0 || GetTemperatureLetter(word)) continue;	// bad ignore utf word or llegal length - also no composite words
		if (c && c != '@' && c != '.') // illegal word character
		{
			if (IsDigit(word[0]) || len == 1){;} // probable numeric?
			// accidental junk on end of word we do know immedately?
			else if (i > 1 && !IsAlphaUTF8OrDigit(wordStarts[i][len-1]) )
			{
				WORDP entry,canonical;
				char word[MAX_WORD_SIZE];
				strcpy(word,wordStarts[i]);
				word[len-1] = 0;
				uint64 sysflags = 0;
				uint64 cansysflags = 0;
				WORDP revise;
				GetPosData(i,word,revise,entry,canonical,sysflags,cansysflags,true,true); // dont create a non-existent word
				if (entry && entry->properties & PART_OF_SPEECH)
				{
					wordStarts[i] = entry->word;
					fixedSpell = true;
					continue;	// not a legal word character, leave it alone
				}
			}
		}

		// see if we know the other case
		if (!(tokenControl & (ONLY_LOWERCASE|STRICT_CASING)) || (i == startSentence && !(tokenControl & ONLY_LOWERCASE)))
		{
			WORDP E = FindWord(word,0,SECONDARY_CASE_ALLOWED);
			bool useAlternateCase = false;
			if (E && E->systemFlags & PATTERN_WORD) useAlternateCase = true;
			if (E && E->properties & (PART_OF_SPEECH|FOREIGN_WORD))
			{
				// if the word we find is UPPER case, and this might be a lower case noun plural, don't change case.
				size_t len = strlen(word);
				if (word[len-1] == 's' ) 
				{
					WORDP F = FindWord(word,len-1);
					if (!F || !(F->properties & (PART_OF_SPEECH|FOREIGN_WORD))) useAlternateCase = true;
					else continue;
				}
				else useAlternateCase = true;
			}
			else if (E) // does it have a member concept fact
			{
				if (IsConceptMember(E)) 
				{
					useAlternateCase = true;
					break;
				}
			}
			if (useAlternateCase)
			{
				char* tokens[2];
				tokens[1] = E->word;
				ReplaceWords("Alternatecase",i,1,1,tokens);
				fixedSpell = true;
				continue;	
			}
		}
		
		// merge with next token?
		char join[MAX_WORD_SIZE * 3];
		if (i != wordCount && *wordStarts[i+1] != '"' )
		{
			// direct merge as a single word
			strcpy(join,word);
			strcat(join,wordStarts[i+1]);
			WORDP D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ?  PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP);

			strcpy(join,word);
//			if (!D || !(D->properties & PART_OF_SPEECH) ) // merge these two, except "going to" or wordnet composites of normal words  // merge as a compound word
//			{
//				strcat(join,(char*)"_");
//				strcat(join,wordStarts[i+1]);
//				D = FindWord(join,0,(tokenControl & ONLY_LOWERCASE) ?  PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP);
//			}    DONT CREATE _ words, let sequence handle it

			if (D && D->properties & PART_OF_SPEECH && !(D->properties & AUX_VERB)) // merge these two, except "going to" or wordnet composites of normal words
			{
				WORDP P1 = FindWord(word,0,LOWERCASE_LOOKUP);
				WORDP P2 = FindWord(wordStarts[i+1],0,LOWERCASE_LOOKUP);
				if (!P1 || !P2 || !(P1->properties & PART_OF_SPEECH) || !(P2->properties & PART_OF_SPEECH)) 
				{
					char* tokens[2];
					tokens[1] = D->word;
					ReplaceWords("merge",i,2,1,tokens);
					fixedSpell = true;
					continue;
				}
			}
		}   

		// break apart slashed pair like eat/feed
		char* slash = strchr(word,'/');
		if (slash && !slash[1] && len < MAX_WORD_SIZE) // remove trailing slash
		{
			strcpy(newword, word);
			newword[slash - word] = 0;
			word = wordStarts[i] = StoreWord(newword, AS_IS)->word;
		}
		if (slash && slash != word && slash[1]) //   break apart word/word
		{
			if ((wordCount + 2 ) >= REAL_SENTENCE_LIMIT) continue;	// no room
			*slash = 0;
			D = StoreWord(word);
			*slash = '/';
			E = StoreWord(slash+1);
			char* tokens[4];
			tokens[1] = D->word;
			tokens[2] = "/";
			tokens[3] = E->word;
			ReplaceWords("Split",i,1,3,tokens);
			fixedSpell = true;
			--i;
			continue;
		}

		// see if hypenated word should be separate or joined (ignore obvious adjective suffix)
		if (hyphen &&  !stricmp(hyphen,(char*)"-like"))
		{
			StoreWord(word,ADJECTIVE_NORMAL|ADJECTIVE); // accept it as a word
			continue;
		}
		else if (hyphen && (hyphen-word) > 1 && !IsPlaceNumber(word)) // dont break up fifty-second
		{
			char test[MAX_WORD_SIZE];
			char first[MAX_WORD_SIZE];

			// test for split
			*hyphen = 0;
			strcpy(test,hyphen+1);
			strcpy(first,word);
			*hyphen = '-';

			WORDP E = FindWord(test,0,LOWERCASE_LOOKUP);
			WORDP D = FindWord(first,0,LOWERCASE_LOOKUP);
			if (*first == 0) 
			{
				wordStarts[i] = AllocateHeap(wordStarts[i] + 1); // -pieces  want to lose the leading hypen  (2-pieces)
				fixedSpell = true;
			}
			else if (D && E) //   1st word gets replaced, we added another word after
			{
				if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue;	// no room
				char* tokens[3];
				tokens[1] = D->word;
				tokens[2] = E->word;
				ReplaceWords("Pair",i,1,2,tokens);
				fixedSpell = true;
				--i;
			}
			else if (!stricmp(test,(char*)"old") || !stricmp(test,(char*)"olds")) //   break apart 5-year-old
			{
				if ((wordCount + 1 ) >= REAL_SENTENCE_LIMIT) continue;	// no room
				D = StoreWord(first);
				E = StoreWord(test);
				char* tokens[3];
				tokens[1] = D->word;
				tokens[2] = E->word;
				ReplaceWords("Break old",i,1,2,tokens);
				fixedSpell = true;
				--i;
			}
			else // remove hyphen entirely?
			{
				strcpy(test,first);
				strcat(test,hyphen+1);
				D = FindWord(test,0,(tokenControl & ONLY_LOWERCASE) ?  PRIMARY_CASE_ALLOWED : STANDARD_LOOKUP);
				if (D) 
				{
					wordStarts[i] = D->word;
					fixedSpell = true;
					--i;
				}
			}
			continue; // ignore hypenated errors that we couldnt solve, because no one mistypes a hypen
		}

		// see if number in front of unit split like 10mg
		if (IsDigit(*word))
		{
			char* at = word;
			while (*++at && IsDigit(*at)) {;}
			WORDP E = FindWord(at);
			if (E && strlen(at) > 2 && *at != 'm') // number in front of known word ( but must be longer than 2 char, 5th) but allow mg
			{
				char token1[MAX_WORD_SIZE];
				int len = at - word;
				strncpy(token1,word,len);
				token1[len] = 0;
				D = StoreWord(token1);
				char* tokens[4];
				tokens[1] = D->word;
				tokens[2] = E->word;
				ReplaceWords("Split",i,1,2,tokens);
				fixedSpell = true;
				continue;
			}
		}
		
		// leave uppercase in first position if not adjusted yet... but check for lower case spell error
		if (IsUpperCase(word[0])  && tokenControl & NO_PROPER_SPELLCHECK) 
		{
			char lower[MAX_WORD_SIZE];
			MakeLowerCopy(lower,word);
			WORDP D = FindWord(lower,0,LOWERCASE_LOOKUP);
			if (!D && i == startWord)
			{
				char* okword = SpellFix(lower,i,PART_OF_SPEECH); 
				if (okword)
				{
					char* tokens[2];
					WORDP E = StoreWord(okword);
					tokens[1] = E->word;
					ReplaceWords("Spell",i,1,1,tokens);
					fixedSpell = true;
				}
			}
			continue; 
		}

		// see if smooshed word pair
		size_t len1 = strlen(word);
		int j;
		if (!IsDigit(*word))
		{
			for (j = 1; j <= len1 - 1; ++j)
			{
				WORDP X1 = FindWord(word, j);  // any case
				WORDP X2 = FindWord(word + j, len1 - i); // any case

				if (X1 && X2 && (X1->word[1] || X1->word[0] == 'i' || X1->word[0] == 'I' || X1->word[0] == 'a'))
				{
					char* tokens[3];
					tokens[1] = X1->word;
					tokens[2] = X2->word;
					ReplaceWords("Split", i, 1, 2, tokens);
					fixedSpell = true;
					break;
				}
			}
			if (j != len1) continue;
		}

		if (*word != '\'' && (!FindCanonical(word, i,true) || IsUpperCase(word[0]))) // dont check quoted or findable words unless they are capitalized
		{
			word = SpellCheck(i);

			// dont spell check proper names to improper, if word before or after is lower case originally
			if (word && i != 1 && originalCapState[i] && !IsUpperCase(*word))
			{
				if (!originalCapState[i-1]) continue;
				else if (i != wordCount && !originalCapState[i+1]) continue;
			}

			if (word && !*word) // performed substitution on prior word, restart this one
			{
				fixedSpell = true;
				--i;
				continue;
			}
			if (word) 
			{
				char* tokens[2];
				tokens[1] = word;
				ReplaceWords("Spell",i,1,1,tokens);
				fixedSpell = true;
				continue;
			}
		}
    }
	return fixedSpell;
}
예제 #9
0
char* SpellFix(char* originalWord,int start,uint64 posflags)
{
    multichoice = false;
    char word[MAX_WORD_SIZE];
    MakeLowerCopy(word, originalWord);
	char word1[MAX_WORD_SIZE];
	MakeUpperCopy(word1, originalWord);
	WORDINFO realWordData;
    ComputeWordData(word, &realWordData);
	if (realWordData.bytelen >= 100 || realWordData.bytelen == 0) return NULL;
	if (IsDigit(*originalWord)) return NULL; // number-based words and numbers must be treated elsewhere
    char letterLow = *word;
	char letterHigh = *word1;
	bool hasUnderscore = (strchr(originalWord,'_')) ? true : false;
	bool isUpper = IsUpperCase(originalWord[0]);
	if (IsUpperCase(originalWord[1])) isUpper = false;	// not if all caps
	if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"Spell: %s\r\n",originalWord);

	//   Priority is to a word that looks like what the user typed, because the user probably would have noticed if it didnt and changed it. So add/delete  has priority over tranform
    WORDP choices[4000];
    WORDP bestGuess[4000];
    unsigned int index = 0;
    unsigned int bestGuessindex = 0;
    int min = 35; // allow 2 changes as needed
      
	uint64  pos = PART_OF_SPEECH;  // all pos allowed
    WORDP D;
    if (posflags == PART_OF_SPEECH && start < wordCount) // see if we can restrict word based on next word
    {
        D = FindWord(wordStarts[start+1],0,PRIMARY_CASE_ALLOWED);
		uint64 flags = (D) ? D->properties : (-1ull); //   if we dont know the word, it could be anything
		if ((flags & PART_OF_SPEECH) == PREPOSITION) pos &= -1 ^ (PREPOSITION | NOUN);   //   prep cannot be preceeded by noun or prep
		if (!(flags & (PREPOSITION | VERB | CONJUNCTION | ADVERB)) && flags & DETERMINER) pos &= -1 ^ (DETERMINER | ADJECTIVE | NOUN | ADJECTIVE_NUMBER | NOUN_NUMBER); //   determiner cannot be preceeded by noun determiner adjective
		if (!(flags & (PREPOSITION | VERB | CONJUNCTION | DETERMINER | ADVERB)) && flags & ADJECTIVE) pos &= -1 ^ (NOUN);
		if (!(flags & (PREPOSITION | NOUN | CONJUNCTION | DETERMINER | ADVERB | ADJECTIVE)) && flags & VERB) pos &= -1 ^ (VERB); //   we know all helper verbs we might be
		if (D && *D->word == '\'' && D->word[1] == 's' ) pos &= NOUN;    //   we can only be a noun if possessive - contracted 's should already be removed by now
    }
    if (posflags == PART_OF_SPEECH && start > 1)
    {
        D = FindWord(wordStarts[start-1],0,PRIMARY_CASE_ALLOWED);
        uint64 flags = (D) ? D->properties : (-1); // if we dont know the word, it could be anything
        if (flags & DETERMINER) pos &= -1 ^ (VERB|CONJUNCTION|PREPOSITION|DETERMINER);  
    }
    posflags &= pos; //   if pos types are known and restricted and dont match
	static int range[] = {0,-1,1,-2,2};
	for (unsigned int i = 0; i < 5; ++i)
	{
		if (i >= 3) break;
		MEANING offset = lengthLists[realWordData.charlen + range[i]];
		if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"\r\n  Begin offset %d\r\n",i);
		while (offset)
		{
			D = Meaning2Word(offset);
			offset = D->spellNode;
			if (PART_OF_SPEECH == posflags  && D->systemFlags & PATTERN_WORD){;} // legal generic match
			else if (!(D->properties & posflags)) continue; // wrong kind of word
			char* under = strchr(D->word,'_');
			// SPELLING lists have no underscore or space words in them
			if (hasUnderscore && !under) continue;	 // require keep any underscore
			if (!hasUnderscore && under) continue;	 // require not have any underscore
			if (isUpper && !(D->internalBits & UPPERCASE_HASH) && start != 1) continue;	// dont spell check to lower a word in upper
            WORDINFO dictWordData;
            ComputeWordData(D->word, &dictWordData);
            int val = EditDistance(dictWordData, realWordData, min);
			if (val <= min) // as good or better
			{
				if (val < min)
				{
					if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"    Better: %s against %s value: %d\r\n",D->word,originalWord,val);
					index = 0;
					min = val;
				}
				else if ( val == min && trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"    Equal: %s against %s value: %d\r\n",D->word,originalWord,val);

				if (!(D->internalBits & BEEN_HERE)) 
				{
					choices[index++] = D;
					if (index > 3998) break; 
					AddInternalFlag(D,BEEN_HERE);
				}
			}
		}
	}
	// try endings ing, s, etc
	if (start && !index && !stricmp(language,"english")) // no stem spell if COMING from a stem spell attempt (start == 0) or we have a good guess already
	{
        uint64 flags = 0;
		char* stem = StemSpell(word,start,flags);
		if (stem) 
		{
            WORDP X = StoreWord(stem,flags); 
			if (X) choices[index++] = X;
		}
	}

    if (!index)  return NULL; 
    if (index > 1) multichoice = true;

	// take our guesses, and pick the most common (earliest learned or most frequently used) word
    uint64 commonmin = 0;
    bestGuess[0] = NULL;
	for (unsigned int j = 0; j < index; ++j) RemoveInternalFlag(choices[j],BEEN_HERE);
    if (index == 1) 
	{
		if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"    Single best spell: %s\r\n",choices[0]->word);
		return choices[0]->word;	// pick the one
	}
    for (unsigned int j = 0; j < index; ++j) 
    {
        uint64 common = choices[j]->systemFlags & COMMONNESS;
        if (common < commonmin) continue;
		if (choices[j]->internalBits & UPPERCASE_HASH && index > 1) continue;	// ignore proper names for spell better when some other choice exists
        if (common > commonmin) // this one is more common
        {
            commonmin = common;
            bestGuessindex = 0;
        }
        bestGuess[bestGuessindex++] = choices[j];
    }
	if (bestGuessindex) 
	{
        if (bestGuessindex > 1) multichoice = true;
		if (trace == TRACE_SPELLING) Log(STDTRACELOG,(char*)"    Pick spell: %s\r\n",bestGuess[0]->word);
		return bestGuess[0]->word; 
	}
	return NULL;
}