/*
 * WordSize
 *
 * Counts how many times a vowel is immediately followed by a non-vowel
 * while scanning the word left to right.
 *
 * The scan is a three-phase automaton:
 *   phase 0 - nothing read yet
 *   phase 1 - the previous character was classified as a vowel
 *   phase 2 - the previous character was classified as a non-vowel
 * A 'y' that follows a non-vowel is classified as a vowel; elsewhere the
 * classification is whatever IsVowel() reports.
 *
 * word: NUL-terminated word to measure (not modified).
 * Returns the number of vowel -> non-vowel transitions.
 */
int WordSize(char *word)   /* in: word having its WordSize taken */
{
    int transitions = 0;   /* WordSize of the word */
    int phase = 0;         /* current state in machine */

    for ( ; *word != '\0'; word++ ) {
        if ( 2 == phase ) {
            /* after a non-vowel, 'y' also counts as a vowel */
            phase = (IsVowel(*word) || ('y' == *word)) ? 1 : 2;
        } else if ( 1 == phase ) {
            /* leaving a vowel run is what gets counted */
            if ( !IsVowel(*word) ) {
                phase = 2;
                transitions++;
            }
        } else {
            /* first character of the word */
            phase = (IsVowel(*word)) ? 1 : 2;
        }
    }

    return( transitions );
} /* WordSize */
std::string en::GetVerbPreAdd(int VerbNum) { std::ifstream is (DICTIONARY EN_EN_FOLDER "verb_present"); if (GotoLine(is,VerbNum)) return ""; int Data = is.get(); if (Data == '1') { if (GotoSegment(is,2)) return ""; std::string Segment = GetSegment(is); is.close(); return Segment; } else if (Data == '0') { if (GotoSegment(is,11)) return ""; std::string Segment = GetSegment(is); is.close(); return Segment; } else { if (GotoSegment(is,1)) return ""; std::string Segment = GetSegment(is); is.close(); //Now follow all the english spelling rules //http://www.oxforddictionaries.com/words/verb-tenses-adding-ed-and-ing //http://www.grammar.cl/Notes/Spelling_ING.htm int u = Segment.length(); if (Segment[u-1] == 'e' && Segment[u-2] == 'i') { //Turn 'ie' into y Segment = Segment.substr(0,u-2); Segment += "y"; } else if (Segment[u-1] == 'e' && ( Segment[u-2] != 'e' && Segment[u-2] != 'y' && Segment[u-2] != 'o' )) { //Drop final e, if e is not before y,e or o //make -> mak ( + {ed,ing} ) Segment = Segment.substr(0,u-1); } else if (Segment[u-1] == 'c') { //Add a k after a c //picnic -> picnick ( + {ed,ing} ) Segment += "k"; } else if (!IsVowel(Segment[u-3]) && IsVowel(Segment[u-2]) && !IsVowel(Segment[u-1]) && Segment[u-1]!='x' && Segment[u-1]!='y' && Segment[u-1]!='z' && Segment[u-1]!='w') { //Double consonant after consonant-vowel-consonant cluster. //Provided that final consonant is not w,x,y or z //Travel -> Travell ( + {ed,ing} ) Segment += Segment[u-1]; } return Segment; } return ""; }
// Returns false when the string contains three adjacent characters for which
// IsVowel() yields the same value (i.e. three in a row of the same class),
// true otherwise. Strings shorter than three characters always pass.
bool NoThreeCons(char* cString)
{
    if (strlen(cString) < 3)
        return true;

    // Slide a three-character window over the string until its last slot
    // reaches the terminator.
    const char* window = cString;
    while (window[2] != '\0')
    {
        if (IsVowel(window[0]) == IsVowel(window[1]) &&
            IsVowel(window[0]) == IsVowel(window[2]))
        {
            return false;
        }
        ++window;
    }
    return true;
}
// Counts the positions at which a non-vowel directly follows a vowel and
// returns that count.
//
// ing: the word to scan.
// Returns 0 for strings shorter than two characters.
int getSyllables(String ing){
    // The counter previously started uninitialized, so the result was garbage.
    int s = 0;
    // Start at 1: every test looks back at ing[i-1], which does not exist for i == 0.
    for (int i = 1; i < ing.length(); i++) {
        if ((!IsVowel(ing[i])) && IsVowel(ing[i-1])) {
            s++;
        }
    }
    return s;
}
// Returns a copy of s in which the characters accepted by IsVowel() appear in
// reverse order while every other character keeps its position.
string reverseVowels(string s)
{
    int lo = 0;
    int hi = static_cast<int>(s.size()) - 1;

    // Two cursors walk toward each other; each one stops on a vowel, then the
    // pair is exchanged and both move on.
    while (lo < hi)
    {
        if (!IsVowel(s[lo]))
        {
            ++lo;
            continue;
        }
        if (!IsVowel(s[hi]))
        {
            --hi;
            continue;
        }
        swap(s[lo], s[hi]);
        ++lo;
        --hi;
    }
    return s;
}
bool InflectRule::IsApplicable(const char *string) const { for (int i=0; i<nEndings; i++) { const int n = strlen(Ending(i)); const char *ending, *stringEnd; if (Ending(i)[0] == '^') { ending = Ending(i) + 1; stringEnd = string; } else { ending = Ending(i); #if 0 /* jbfix: len - n may be < 0! */ stringEnd = string + strlen(string) - n; #else int index = strlen(string) - n; if(index < 0) index = 0; stringEnd = string + index; #endif } if (!strncmp(stringEnd, ending, n) || (ending[0] == 'V' && IsVowel(stringEnd[0]) && !strncmp(stringEnd+1, ending+1, n-1)) || (ending[0] == 'C' && IsConsonant(stringEnd[0]) && !strncmp(stringEnd+1, ending+1, n-1))) return true; } return false; }
// Returns true as soon as one character of the NUL-terminated string is
// accepted by IsVowel(); false if none is (including for the empty string).
bool ExistVowel(char* cString)
{
    const char* cursor = cString;
    while (*cursor != '\0')
    {
        if (IsVowel(*cursor))
            return true;
        ++cursor;
    }
    return false;
}
void main() { bool bUse[26]={false}; char cString[LEN]; int iLength,i,j; cin.getline(cString,LEN); iLength=strlen(cString); for(i=0;i<iLength;i++) if(isalpha(cString[i])) { if(IsVowel(cString[i]) || bUse[cString[i]-'A'])cString[i]=DELETE; else bUse[cString[i]-'A']=true; } //cout<<cString<<endl; for(i=iLength-1;i>=0;i--) if(cString[i]!=DELETE && (ispunct(cString[i]) || cString[i]==' ')) { //cout<<i<<endl; for(j=i-1;j>=0 && (cString[j]==' ' || cString[j]==DELETE);j--) { //cout<<j<<endl; cString[j]=DELETE; } } //cout<<cString<<endl; for(;iLength>0 && (cString[iLength-1]==DELETE || cString[iLength-1]==' ');iLength--); for(i=0;i<=iLength && (cString[i]==DELETE || cString[i]==' ');i++); //cout<<i<<" "<<iLength<<endl; for(;i<iLength;i++) if(cString[i]!=DELETE) cout<<cString[i]; cout<<endl; }
// Smoke test: prints the value IsVowel() reports for an upper-case 'O'.
int main()
{
    char sample = 'O';

    printf("%d", IsVowel(sample));
    return 0;
}
/*
 * FindFirstVowel
 *
 * Returns the index of the first character of word that IsVowel() accepts,
 * or -1 when the word contains no such character.
 */
static int FindFirstVowel(string word)
{
    int position = 0;
    int found = -1;

    while (found < 0 && position < StringLength(word)) {
        if (IsVowel(IthChar(word, position))) {
            found = position;
        } else {
            position++;
        }
    }
    return (found);
}
/*
 * Prints every upper-case letter 'A'..'Z' that IsVowel() accepts, on one
 * line, prefixed by "The vowels are:".
 *
 * The return type is now spelled out and a status is returned: implicit-int
 * declarations are not valid in C99 or C++, and falling off the end left the
 * exit status unspecified in C89.
 */
int main()
{
    char ch;

    printf("The vowels are:");
    for (ch = 'A'; ch <= 'Z'; ch++) {
        if (IsVowel(ch)) printf(" %c", ch);
    }
    printf("\n");
    return 0;
}
/*
 * ContainsVowel
 *
 * Returns 1 if the word contains a vowel, 0 otherwise (including for the
 * empty string).
 *
 * The first character is a vowel if IsVowel() accepts it; every later
 * character is a vowel if it is one of "aeiouy" (so 'y' always counts
 * after the first position).
 */
int ContainsVowel(char *word)   /* in: buffer with word checked */
{
    /* the emptiness test must come first: word+1 is only valid otherwise */
    return( (*word != '\0')
            && (IsVowel(*word) || (strpbrk(word+1,"aeiouy") != NULL)) );
} /* ContainsVowel */
/*
 * ContainsVowel
 *
 * Returns non-zero if the word contains a vowel: either its first character
 * is accepted by IsVowel(), or any later character is one of "aeiouy".
 * Returns FALSE for the empty string.
 */
int ContainsVowel(char *word)
{
    /* This guard is needed: for an empty string, word + 1 would point past
       the terminator, so the strpbrk() scan below must not be reached. */
    if ( *word == '\0' )
        return FALSE;

    if ( IsVowel(*word) )
        return 1;

    return (strpbrk(word + 1, "aeiouy") != NULL);
} /* ContainsVowel */
/*
 * removeVowelPointer
 *
 * Overwrites, in place, every character of s for which IsVowel() returns
 * exactly 1 with the letter 'T'.
 *
 * s: NUL-terminated, writable string.
 * Returns the number of characters that were overwritten.
 */
int removeVowelPointer(char *s)
{
    int replaced = 0;
    char *cursor;

    for (cursor = s; *cursor != '\0'; ++cursor) {
        if (IsVowel(*cursor) == 1) {
            *cursor = 'T';
            ++replaced;
        }
    }
    return replaced;
}
// Validates the global string Str. It is accepted only if
//   - it is not empty,
//   - it contains at least one character accepted by IsVowel(),
//   - no character is immediately repeated, except 'e' and 'o',
//   - it never has three vowels in a row or three non-vowels in a row.
bool check ()
{
    const int len = strlen ( Str );
    if ( len == 0 )
        return false;

    bool sawVowel = false;
    int vowelRun = 0;       // length of the current run of vowels
    int otherRun = 0;       // length of the current run of non-vowels

    for ( int pos = 0; pos < len; ++pos )
    {
        const char cur = Str [pos];

        if ( IsVowel ( cur ))
        {
            sawVowel = true;
            ++vowelRun;
            otherRun = 0;
        }
        else
        {
            ++otherRun;
            vowelRun = 0;
        }

        // doubled letter other than "ee" / "oo"
        if ( pos > 0 && cur == Str [pos - 1] && cur != 'e' && cur != 'o' )
            return false;

        // three of the same class in a row
        if ( vowelRun >= 3 || otherRun >= 3 )
            return false;
    }
    return sawVowel;
}
int CStringCruncher::GetCharPriority(char c, char prev, char next) { if (::isdigit(c)) return 4; if (('-' == c || '.' == c) && ::isdigit(next)) return 4; // Is it the first character of a word? if (::isalpha(c) && !::isalnum(prev)) return 3; if (IsConsonant(c)) return 2; if (IsVowel(c)) return 1; return 0; }
// Ranks a character; a higher value is returned for characters classified
// first in the list below.
//
//   4 - a digit, or a '-' / '.' that is directly followed by a digit
//   3 - a letter whose predecessor is not alphanumeric (first letter of a word)
//   2 - a consonant (per IsConsonant)
//   1 - a vowel (per IsVowel)
//   0 - anything else
//
// NOTE(review): the narrow <ctype.h> classifiers are applied to TCHAR values;
// if TCHAR is a wide type in this build, values outside the unsigned char
// range are not valid arguments -- confirm.
int CStringCruncherU::GetCharPriority(TCHAR c, TCHAR prev, TCHAR next)
{
    const bool isSignOrPoint = (_T('-') == c) || (_T('.') == c);
    int priority = 0;

    if (::isdigit(c) || (isSignOrPoint && ::isdigit(next)))
    {
        priority = 4;
    }
    else if (::isalpha(c) && !::isalnum(prev))
    {
        // first character of a word
        priority = 3;
    }
    else if (IsConsonant(c))
    {
        priority = 2;
    }
    else if (IsVowel(c))
    {
        priority = 1;
    }
    return priority;
}
// EditDistance: scores how costly it is to turn the typed word (inputSet) into
// the dictionary word D->word. Lower is a better spelling-correction candidate.
//
//   D                    - dictionary entry; a lower-cased copy of D->word is compared
//   size                 - used as the length of the dictionary word
//   inputLen             - used as the length of inputSet
//   inputSet             - the typed word (the pointer is advanced locally while scanning)
//   min                  - cost ceiling; scanning stops once the running cost exceeds it
//   realWordLetterCounts - per-letter counts, indexed like letterIndexData's output
//                          (presumably the counts of the typed word -- confirm against caller)
//   language             - compared against ENGLISH and SPANISH to enable language-specific rules
//
// Returns the accumulated cost, or 60 when one of the early "not close enough" tests fires.
//
// NOTE(review): several comparisons below test plain char values against 0xc3, 0xa0.. etc.
// Those can only succeed where plain char is unsigned -- confirm the build settings.
// NOTE(review): *(inputSet-1) and *(dictinfo-1) are read below; the original comment says the
// input is pre-padded with 0, nothing visible here guarantees that for dictw -- confirm.
static int EditDistance(WORDP D, unsigned int size, unsigned int inputLen, char* inputSet, int min,
			unsigned char realWordLetterCounts[LETTERMAX], int language)
{// dictword has no underscores, inputSet is already lower case
    char dictw[MAX_WORD_SIZE];
    MakeLowerCopy(dictw,D->word);
    char* dictinfo = dictw;
    char* dictstart = dictinfo;
    char* inputstart = inputSet;
    int val = 0; // a difference in length will manifest as a difference in letter count

    // how many changes (change a letter, transpose adj letters, insert letter, drop letter)
    if (size != inputLen)
    {
        val += (size < inputLen) ? 5 : 2; // real word is shorter than what they typed, not so likely as longer
        if (size < 7) val += 3;
    }
    if (val > min) return 60; // fast abort

    // match off how many letter counts are correct between the two, need to be close enough to bother with
    unsigned char dictWordLetterSet[LETTERMAX];
    memset(dictWordLetterSet,0,LETTERMAX);
    for (unsigned int i = 0; i < size; ++i)
    {
        int index = letterIndexData[(unsigned char)dictinfo[i]];
        ++dictWordLetterSet[index]; // compute number of each kind of letter
    }
    unsigned int count = 0;
    for (unsigned int i = 0; i < LETTERMAX; ++i) // count how many letters are the same in both words
    {
        if (dictWordLetterSet[i]) // revised word has these many
        {
            int diff = dictWordLetterSet[i] - realWordLetterCounts[i]; // how many of ours does real have?
            if (diff < 0) count += dictWordLetterSet[i]; // he has more than we have, he gets credit for ours he does have
            else count += dictWordLetterSet[i] - diff; // he has <= what we have, count them
        }
    }
    unsigned int countVariation = size - ((size > 7) ? 3 : 2); // since size >= 2, this is always >= 0
    if (count < countVariation && language == ENGLISH) return 60; // need most letters be in common
    if (count == size && language == ENGLISH) // same letters (though he may have excess) -- how many transposes
    {
        unsigned int bad = 0;
        for (unsigned int i = 0; i < size; ++i) if (dictinfo[i] != inputSet[i]) ++bad;
        if (size != inputLen){;} // lengths differ: fall through to the letter-by-letter scan
        else if (bad <= 2) return val + 3; // 1 transpose
        else if (bad <= 4) return val + 9; // 2 transpose
        else return val + 38; // many transpose
    }

    // now look at specific letter errors
    char* dictend = dictinfo+size;
    char* inputend = inputSet+inputLen;
    count = 0; // reused from here on as the number of scan steps taken
    while (ALWAYS)
    {
        ++count;
        if (*dictinfo == *inputSet) // match
        {
            if (inputSet == inputend && dictinfo == dictend) break; // ended
            ++inputSet;
            ++dictinfo;
            continue;
        }
        if (inputSet == inputend || dictinfo == dictend) // one ending, other has to catch up by adding a letter
        {
            if (inputSet == inputend) ++dictinfo;
            else ++inputSet;
            val += 6;
            continue;
        }

        // letter match failed

        // can we change an accented letter forward to another similar letter without accent
        if (*dictinfo == 0xc3)
        {
            bool accent = false;
            if (*inputSet == 'a' && (dictinfo[1] >= 0xa0 && dictinfo[1] <= 0xa5 )) accent = true;
            else if (*inputSet == 'e' && (dictinfo[1] >= 0xa8 && dictinfo[1] <= 0xab )) accent = true;
            else if (*inputSet == 'i' && (dictinfo[1] >= 0xac && dictinfo[1] <= 0xaf )) accent = true;
            else if (*inputSet == 'o' && (dictinfo[1] >= 0xb2 && dictinfo[1] <= 0xb6 )) accent = true;
            else if (*inputSet == 'u' && (dictinfo[1] >= 0xb9 && dictinfo[1] <= 0xbc )) accent = true;
            if (accent)
            {
                ++dictinfo;
                ++dictinfo; // double unicode
                ++inputSet;
                continue;
            }
        }

        // first and last letter errors are rare, more likely to get them right
        if (dictinfo == dictstart && *dictstart != *inputstart && language == ENGLISH) val += 6; // costs a lot to change first letter, odds are he types that right
        if (dictinfo[1] == 0 && inputSet[1] == 0 && *dictinfo != *inputSet) val += 6; // costs more to change last letter, odds are he types that right or sees its wrong

        // try to resynch series and reduce cost of a transposition of adj letters
        if (*dictinfo == inputSet[1] && dictinfo[1] == *inputSet) // transpose
        {
            if (dictinfo[2] == inputSet[2]) // they match after, so transpose is pretty likely
            {
                val += 4;
                if (dictinfo[2]) // not at end, skip the letter in synch for speed
                {
                    ++dictinfo;
                    ++inputSet;
                }
            }
            else val += 8; // transposed maybe good, assume it is
            dictinfo += 2;
            inputSet += 2;
        }
        else if (*dictinfo == inputSet[1]) // current dict letter matches his next input letter, so maybe his input inserted a char here and need to delete it
        {
            char* prior = inputSet-1; // potential extraneous letter
            if (*prior == *inputSet) val += 5; // low cost for dropping an excess repeated letter - start of word is prepadded with 0 for prior char
            else if (*inputSet == '-') val += 3; // very low cost for removing a hyphen
            else if (inputSet+1 == inputend && *inputSet == 's') val += 30; // losing a trailing s is almost not acceptable
            else val += 9; // high cost removing an extra letter, but not as much as having to change it
            ++inputSet;
        }
        else if (dictinfo[1] == *inputSet) // next dict letter matches current input letter, so maybe his input deleted a char here and needs to insert it
        {
            char* prior = (dictinfo == dictstart) ? (char*)" " : (dictinfo-1);
            if (*dictinfo == *prior && !IsVowel(*dictinfo )) val += 5; // missing half of a doubled consonant
            else if (IsVowel(*dictinfo )) val += 1; // low cost for missing a vowel ( already charged for short input), might be a texting abbreviation
            else val += 9; // high cost for deleting a character, but not as much as changing it
            ++dictinfo;
        }
        else // this has no valid neighbors. alter it to be the correct, but charge for multiple occurrences
        {
            if (count == 1 && *dictinfo != *inputSet && language == ENGLISH) val += 30; //costs a lot to change the first letter, odds are he types that right or sees its wrong

            // 2 in a row are bad, check for a substituted vowel sound
            bool swap = false;
            int oldval = val; // lets the code below detect whether a multi-letter rule already charged
            if (dictinfo[1] != inputSet[1]) // do multicharacter transformations
            {
                if (language == SPANISH) // ch-x | qu-k | c-k | do-o | b-v | bue-w | vue-w | z-s | s-c | h- | y-i | y-ll | m-n 1st is valid
                {
                    // single-letter equivalences are accepted at no cost
                    if (*inputSet == 'c' && *dictinfo == 'k') { dictinfo += 1; inputSet += 1; continue; }
                    if (*inputSet == 'b' && *dictinfo == 'v') { dictinfo += 1; inputSet += 1; continue; }
                    if (*inputSet == 'v' && *dictinfo == 'b') { dictinfo += 1; inputSet += 1; continue; }
                    if (*inputSet == 'z' && *dictinfo == 's') { dictinfo += 1; inputSet += 1; continue; }
                    if (*inputSet == 's' && *dictinfo == 'c') { dictinfo += 1; inputSet += 1; continue; }
                    if (*inputSet == 'y' && *dictinfo == 'i') { dictinfo += 1; inputSet += 1; continue; }
                    if (*inputSet == 'm' && *dictinfo == 'n') { dictinfo += 1; inputSet += 1; continue; }
                    if (*inputSet == 'n' && *dictinfo == 'm') { dictinfo += 1; inputSet += 1; continue; }
                    if (*dictinfo == 'h') { dictinfo += 1; continue; }
                    // length-changing equivalences: undo the length penalty charged at the top (clamped at 0)
                    if (*inputSet == 'x' && !strncmp(dictinfo,(char*)"ch",2)) { dictinfo += 2; inputSet += 1; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; }
                    if (*inputSet == 'k' && !strncmp(dictinfo,(char*)"qu",2)) { dictinfo += 2; inputSet += 1; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; }
                    if (*inputSet == 'o' && !strncmp(dictinfo,(char*)"do",2) && !inputSet[1] && !dictinfo[2]) // at end
                    { dictinfo += 2; inputSet += 1; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; }
                    if (*inputSet == 'w' && !strncmp(dictinfo,(char*)"bue",3)) { dictinfo += 3; inputSet += 1; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; }
                    if (*inputSet == 'w' && !strncmp(dictinfo,(char*)"vue",3)) { dictinfo += 3; inputSet += 1; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; }
                    if (!strncmp(inputSet,(char*)"ll",2) && *dictinfo == 'y') { inputSet += 2; dictinfo += 1; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; }
                    if (*inputSet == 'y' && *dictinfo == 'l' && dictinfo[1] == 'l') { inputSet += 1; dictinfo += 2; val -= (size < inputLen) ? 5 : 2; if (size < 7) val -= 3; if (val < 0) val = 0; continue; }
                }
                // sound-alike letter groups (applied for every language)
                if (*inputSet == 't' && !strncmp(dictinfo,(char*)"ght",3)) { dictinfo += 3; inputSet += 1; val += 5; }
                else if (!strncmp(inputSet,(char*)"ci",2) && !strncmp(dictinfo,(char*)"cki",3)) { dictinfo += 3; inputSet += 2; val += 5; }
                else if (*(dictinfo-1) == 'a' && !strcmp(dictinfo,(char*)"ir") && !strcmp(inputSet,(char*)"re")) // prepair prepare as terminal sound
                { dictinfo += 2; inputSet += 2; val += 3; }
                else if (!strncmp(inputSet,(char*)"ous",3) && !strncmp(dictinfo,(char*)"eous",4)) { dictinfo += 4; inputSet += 3; val += 5; }
                else if (!strncmp(inputSet,(char*)"of",2) && !strncmp(dictinfo,(char*)"oph",3)) { dictinfo += 3; inputSet += 2; val += 5; }
                else if (*dictinfo == 'x' && !strncmp(inputSet,(char*)"cks",3)) { dictinfo += 1; inputSet += 3; val += 5; }
                else if (*inputSet == 'k' && !strncmp(dictinfo,(char*)"qu",2)) { dictinfo += 2; inputSet += 1; val += 5; }

                // two-letter vowel-sound substitutions
                if (oldval != val){;} // swallowed a multiple letter sound change
                else if (!strncmp(dictinfo,(char*)"able",4) && !strncmp(inputSet,(char*)"ible",4)) swap = true;
                else if (!strncmp(dictinfo,(char*)"ible",4) && !strncmp(inputSet,(char*)"able",4)) swap = true;
                else if (*dictinfo == 'a' && dictinfo[1] == 'y' && *inputSet == 'e' && inputSet[1] == 'i') swap = true;
                else if (*dictinfo == 'e' && dictinfo[1] == 'a' && *inputSet == 'e' && inputSet[1] == 'e') swap = true;
                else if (*dictinfo == 'e' && dictinfo[1] == 'e' && *inputSet == 'e' && inputSet[1] == 'a') swap = true;
                else if (*dictinfo == 'e' && dictinfo[1] == 'e' && *inputSet == 'i' && inputSet[1] == 'e') swap = true;
                else if (*dictinfo == 'e' && dictinfo[1] == 'i' && *inputSet == 'a' && inputSet[1] == 'y') swap = true;
                else if (*dictinfo == 'e' && dictinfo[1] == 'u' && *inputSet == 'o' && inputSet[1] == 'o') swap = true;
                else if (*dictinfo == 'e' && dictinfo[1] == 'u' && *inputSet == 'o' && inputSet[1] == 'u') swap = true;
                else if (*dictinfo == 'i' && dictinfo[1] == 'e' && *inputSet == 'e' && inputSet[1] == 'e') swap = true;
                else if (*dictinfo == 'o' && dictinfo[1] == 'o' && *inputSet == 'e' && inputSet[1] == 'u') swap = true;
                else if (*dictinfo == 'o' && dictinfo[1] == 'o' && *inputSet == 'o' && inputSet[1] == 'u') swap = true;
                else if (*dictinfo == 'o' && dictinfo[1] == 'o' && *inputSet == 'u' && inputSet[1] == 'i') swap = true;
                else if (*dictinfo == 'o' && dictinfo[1] == 'u' && *inputSet == 'e' && inputSet[1] == 'u') swap = true;
                else if (*dictinfo == 'o' && dictinfo[1] == 'u' && *inputSet == 'o' && inputSet[1] == 'o') swap = true;
                else if (*dictinfo == 'u' && dictinfo[1] == 'i' && *inputSet == 'o' && inputSet[1] == 'o') swap = true;
                if (swap)
                {
                    dictinfo += 2;
                    inputSet += 2;
                    val += 5;
                }
            }

            // can we change a letter to another similar letter
            if (oldval == val)
            {
                bool convert = false;
                if (*dictinfo == 'i' && *inputSet== 'y' && count > 1) convert = true;// but not as first letter
                else if ((*dictinfo == 's' && *inputSet == 'z') || (*dictinfo == 'z' && *inputSet == 's')) convert = true;
                else if (*dictinfo == 'y' && *inputSet == 'i' && count > 1) convert = true; // but not as first letter
                else if (*dictinfo == '/' && *inputSet == '-') convert = true;
                else if (inputSet+1 == inputend && *inputSet == 's') val += 30; // changing a trailing s is almost not acceptable
                if (convert) val += 5; // low cost for exchange of similar letter, but dont do it often
                else val += 12; // changing a letter is expensive, since it destroys the visual image
                ++dictinfo;
                ++inputSet;
            }
        }
        if (val > min) return val; // too costly, ignore it
    }
    return val;
}
////////////////////////////////////////////////////////////////////////////////
// main deal
//
// Encodes this string with the Double Metaphone rules and hands back the
// resulting keys.
//
//   metaph  - receives the primary key (member `primary`)
//   metaph2 - receives the secondary key (member `secondary`), but only when
//             the `alternate` flag is set at the end of the scan; otherwise
//             it is left untouched
//
// Side effects visible here: the string is upper-cased in place and a " "
// string is appended as padding; the members length, last and alternate are
// overwritten. If the string is empty the function returns before touching
// either output.
//
// `current` is the index of the character being encoded. Each case of the
// switch emits zero or more key characters through MetaphAdd() and advances
// `current` past everything it consumed.
// NOTE(review): AND / OR are presumably macros for && / ||, and IsVowel(n)
// here takes a character index rather than a character -- confirm.
////////////////////////////////////////////////////////////////////////////////
void MString::DoubleMetaphone(CString &metaph, CString &metaph2)
{

        int current = 0;

        length = GetLength();
        if(length < 1)
                return;
        last = length - 1;//zero based index

        alternate = FALSE;

        MakeUpper();

        //pad the original string so that we can index beyond the edge of the world
        Insert(GetLength(), " ");

        //skip these when at start of word
        if(StringAt(0, 2, "GN", "KN", "PN", "WR", "PS", ""))
                current += 1;

        //Initial 'X' is pronounced 'Z' e.g. 'Xavier'
        if(GetAt(0) == 'X')
        {
                MetaphAdd("S"); //'Z' maps to 'S'
                current += 1;
        }

        ///////////main loop//////////////////////////
        // The leading TRUE makes this condition always true, so the key-length
        // tests never stop the loop; it ends only through the break below.
        while(TRUE OR (primary.GetLength() < 4) OR (secondary.GetLength() < 4))
        {
                if(current >= length)
                        break;

                switch(GetAt(current))
                {
                        case 'A':
                        case 'E':
                        case 'I':
                        case 'O':
                        case 'U':
                        case 'Y':
                                if(current == 0)
                                        //all init vowels now map to 'A'
                                        MetaphAdd("A");
                                current +=1;
                                break;

                        case 'B':

                                //"-mb", e.g", "dumb", already skipped over...
                                MetaphAdd("P");

                                if(GetAt(current + 1) == 'B')
                                        current +=2;
                                else
                                        current +=1;
                                break;

                        case 'Ç':
                                MetaphAdd("S");
                                current += 1;
                                break;

                        case 'C':
                                //various germanic
                                if((current > 1)
                                        AND !IsVowel(current - 2)
                                                AND StringAt((current - 1), 3, "ACH", "")
                                                        AND ((GetAt(current + 2) != 'I') AND ((GetAt(current + 2) != 'E')
                                                                        OR StringAt((current - 2), 6, "BACHER", "MACHER", "")) ))
                                {
                                        MetaphAdd("K");
                                        current +=2;
                                        break;
                                }

                                //special case 'caesar'
                                if((current == 0) AND StringAt(current, 6, "CAESAR", ""))
                                {
                                        MetaphAdd("S");
                                        current +=2;
                                        break;
                                }

                                //italian 'chianti'
                                if(StringAt(current, 4, "CHIA", ""))
                                {
                                        MetaphAdd("K");
                                        current +=2;
                                        break;
                                }

                                if(StringAt(current, 2, "CH", ""))
                                {
                                        //find 'michael'
                                        if((current > 0) AND StringAt(current, 4, "CHAE", ""))
                                        {
                                                MetaphAdd("K", "X");
                                                current +=2;
                                                break;
                                        }

                                        //greek roots e.g. 'chemistry', 'chorus'
                                        if((current == 0)
                                                AND (StringAt((current + 1), 5, "HARAC", "HARIS", "")
                                                        OR StringAt((current + 1), 3, "HOR", "HYM", "HIA", "HEM", ""))
                                                AND !StringAt(0, 5, "CHORE", ""))
                                        {
                                                MetaphAdd("K");
                                                current +=2;
                                                break;
                                        }

                                        //germanic, greek, or otherwise 'ch' for 'kh' sound
                                        if((StringAt(0, 4, "VAN ", "VON ", "") OR StringAt(0, 3, "SCH", ""))
                                                // 'architect but not 'arch', 'orchestra', 'orchid'
                                                OR StringAt((current - 2), 6, "ORCHES", "ARCHIT", "ORCHID", "")
                                                OR StringAt((current + 2), 1, "T", "S", "")
                                                OR ((StringAt((current - 1), 1, "A", "O", "U", "E", "") OR (current == 0))
                                                        //e.g., 'wachtler', 'wechsler', but not 'tichner'
                                                        AND StringAt((current + 2), 1, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ", "")))
                                        {
                                                MetaphAdd("K");
                                        }else{
                                                if(current > 0)
                                                {
                                                        if(StringAt(0, 2, "MC", ""))
                                                                //e.g., "McHugh"
                                                                MetaphAdd("K");
                                                        else
                                                                MetaphAdd("X", "K");
                                                }else
                                                        MetaphAdd("X");
                                        }
                                        current +=2;
                                        break;
                                }
                                //e.g, 'czerny'
                                if(StringAt(current, 2, "CZ", "") AND !StringAt((current - 2), 4, "WICZ", ""))
                                {
                                        MetaphAdd("S", "X");
                                        current += 2;
                                        break;
                                }

                                //e.g., 'focaccia'
                                if(StringAt((current + 1), 3, "CIA", ""))
                                {
                                        MetaphAdd("X");
                                        current += 3;
                                        break;
                                }

                                //double 'C', but not if e.g. 'McClellan'
                                // (the inner if/else below binds together; both arms break out of the case)
                                if(StringAt(current, 2, "CC", "") AND !((current == 1) AND (GetAt(0) == 'M')))
                                        //'bellocchio' but not 'bacchus'
                                        if(StringAt((current + 2), 1, "I", "E", "H", "") AND !StringAt((current + 2), 2, "HU", ""))
                                        {
                                                //'accident', 'accede' 'succeed'
                                                if(((current == 1) AND (GetAt(current - 1) == 'A'))
                                                                OR StringAt((current - 1), 5, "UCCEE", "UCCES", ""))
                                                        MetaphAdd("KS");
                                                //'bacci', 'bertucci', other italian
                                                else
                                                        MetaphAdd("X");
                                                current += 3;
                                                break;
                                        }else{//Pierce's rule
                                                MetaphAdd("K");
                                                current += 2;
                                                break;
                                        }

                                if(StringAt(current, 2, "CK", "CG", "CQ", ""))
                                {
                                        MetaphAdd("K");
                                        current += 2;
                                        break;
                                }

                                if(StringAt(current, 2, "CI", "CE", "CY", ""))
                                {
                                        //italian vs. english
                                        if(StringAt(current, 3, "CIO", "CIE", "CIA", ""))
                                                MetaphAdd("S", "X");
                                        else
                                                MetaphAdd("S");
                                        current += 2;
                                        break;
                                }

                                //else
                                MetaphAdd("K");

                                //name sent in 'mac caffrey', 'mac gregor
                                if(StringAt((current + 1), 2, " C", " Q", " G", ""))
                                        current += 3;
                                else
                                        if(StringAt((current + 1), 1, "C", "K", "Q", "")
                                                AND !StringAt((current + 1), 2, "CE", "CI", ""))
                                                current += 2;
                                        else
                                                current += 1;
                                break;

                        case 'D':
                                if(StringAt(current, 2, "DG", ""))
                                        if(StringAt((current + 2), 1, "I", "E", "Y", ""))
                                        {
                                                //e.g. 'edge'
                                                MetaphAdd("J");
                                                current += 3;
                                                break;
                                        }else{
                                                //e.g. 'edgar'
                                                MetaphAdd("TK");
                                                current += 2;
                                                break;
                                        }

                                if(StringAt(current, 2, "DT", "DD", ""))
                                {
                                        MetaphAdd("T");
                                        current += 2;
                                        break;
                                }

                                //else
                                MetaphAdd("T");
                                current += 1;
                                break;

                        case 'F':
                                if(GetAt(current + 1) == 'F')
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd("F");
                                break;

                        case 'G':
                                if(GetAt(current + 1) == 'H')
                                {
                                        if((current > 0) AND !IsVowel(current - 1))
                                        {
                                                MetaphAdd("K");
                                                current += 2;
                                                break;
                                        }

                                        if(current < 3)
                                        {
                                                //'ghislane', ghiradelli
                                                if(current == 0)
                                                {
                                                        if(GetAt(current + 2) == 'I')
                                                                MetaphAdd("J");
                                                        else
                                                                MetaphAdd("K");
                                                        current += 2;
                                                        break;
                                                }
                                        }
                                        //Parker's rule (with some further refinements) - e.g., 'hugh'
                                        if(((current > 1) AND StringAt((current - 2), 1, "B", "H", "D", "") )
                                                //e.g., 'bough'
                                                OR ((current > 2) AND StringAt((current - 3), 1, "B", "H", "D", "") )
                                                //e.g., 'broughton'
                                                OR ((current > 3) AND StringAt((current - 4), 1, "B", "H", "") ) )
                                        {
                                                current += 2;
                                                break;
                                        }else{
                                                //e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough'
                                                if((current > 2)
                                                        AND (GetAt(current - 1) == 'U')
                                                        AND StringAt((current - 3), 1, "C", "G", "L", "R", "T", "") )
                                                {
                                                        MetaphAdd("F");
                                                }else
                                                        if((current > 0) AND GetAt(current - 1) != 'I')
                                                                MetaphAdd("K");

                                                current += 2;
                                                break;
                                        }
                                }

                                if(GetAt(current + 1) == 'N')
                                {
                                        if((current == 1) AND IsVowel(0) AND !SlavoGermanic())
                                        {
                                                MetaphAdd("KN", "N");
                                        }else
                                                //not e.g. 'cagney'
                                                if(!StringAt((current + 2), 2, "EY", "")
                                                                AND (GetAt(current + 1) != 'Y') AND !SlavoGermanic())
                                                {
                                                        MetaphAdd("N", "KN");
                                                }else
                                                        MetaphAdd("KN");
                                        current += 2;
                                        break;
                                }

                                //'tagliaro'
                                if(StringAt((current + 1), 2, "LI", "") AND !SlavoGermanic())
                                {
                                        MetaphAdd("KL", "L");
                                        current += 2;
                                        break;
                                }

                                //-ges-,-gep-,-gel-, -gie- at beginning
                                if((current == 0)
                                        AND ((GetAt(current + 1) == 'Y')
                                                OR StringAt((current + 1), 2, "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER", "")) )
                                {
                                        MetaphAdd("K", "J");
                                        current += 2;
                                        break;
                                }

                                // -ger-, -gy-
                                if((StringAt((current + 1), 2, "ER", "") OR (GetAt(current + 1) == 'Y'))
                                                AND !StringAt(0, 6, "DANGER", "RANGER", "MANGER", "")
                                                        AND !StringAt((current - 1), 1, "E", "I", "")
                                                                AND !StringAt((current - 1), 3, "RGY", "OGY", "") )
                                {
                                        MetaphAdd("K", "J");
                                        current += 2;
                                        break;
                                }

                                // italian e.g, 'biaggi'
                                if(StringAt((current + 1), 1, "E", "I", "Y", "") OR StringAt((current - 1), 4, "AGGI", "OGGI", ""))
                                {
                                        //obvious germanic
                                        if((StringAt(0, 4, "VAN ", "VON ", "") OR StringAt(0, 3, "SCH", ""))
                                                OR StringAt((current + 1), 2, "ET", ""))
                                                MetaphAdd("K");
                                        else
                                                //always soft if french ending
                                                if(StringAt((current + 1), 4, "IER ", ""))
                                                        MetaphAdd("J");
                                                else
                                                        MetaphAdd("J", "K");
                                        current += 2;
                                        break;
                                }

                                if(GetAt(current + 1) == 'G')
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd("K");
                                break;

                        case 'H':
                                //only keep if first & before vowel or btw. 2 vowels
                                if(((current == 0) OR IsVowel(current - 1))
                                        AND IsVowel(current + 1))
                                {
                                        MetaphAdd("H");
                                        current += 2;
                                }else//also takes care of 'HH'
                                        current += 1;
                                break;

                        case 'J':
                                //obvious spanish, 'jose', 'san jacinto'
                                if(StringAt(current, 4, "JOSE", "") OR StringAt(0, 4, "SAN ", "") )
                                {
                                        if(((current == 0) AND (GetAt(current + 4) == ' ')) OR StringAt(0, 4, "SAN ", "") )
                                                MetaphAdd("H");
                                        else
                                        {
                                                MetaphAdd("J", "H");
                                        }
                                        current +=1;
                                        break;
                                }

                                if((current == 0) AND !StringAt(current, 4, "JOSE", ""))
                                        MetaphAdd("J", "A");//Yankelovich/Jankelowicz
                                else
                                        //spanish pron. of e.g. 'bajador'
                                        if(IsVowel(current - 1)
                                                AND !SlavoGermanic()
                                                        AND ((GetAt(current + 1) == 'A') OR (GetAt(current + 1) == 'O')))
                                                MetaphAdd("J", "H");
                                        else
                                                if(current == last)
                                                        MetaphAdd("J", " ");
                                                else
                                                        if(!StringAt((current + 1), 1, "L", "T", "K", "S", "N", "M", "B", "Z", "")
                                                                        AND !StringAt((current - 1), 1, "S", "K", "L", ""))
                                                                MetaphAdd("J");

                                if(GetAt(current + 1) == 'J')//it could happen!
                                        current += 2;
                                else
                                        current += 1;
                                break;

                        case 'K':
                                if(GetAt(current + 1) == 'K')
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd("K");
                                break;

                        case 'L':
                                if(GetAt(current + 1) == 'L')
                                {
                                        //spanish e.g. 'cabrillo', 'gallegos'
                                        if(((current == (length - 3))
                                                AND StringAt((current - 1), 4, "ILLO", "ILLA", "ALLE", ""))
                                                         OR ((StringAt((last - 1), 2, "AS", "OS", "") OR StringAt(last, 1, "A", "O", ""))
                                                                AND StringAt((current - 1), 4, "ALLE", "")) )
                                        {
                                                MetaphAdd("L", " ");
                                                current += 2;
                                                break;
                                        }
                                        current += 2;
                                }else
                                        current += 1;
                                MetaphAdd("L");
                                break;

                        case 'M':
                                if((StringAt((current - 1), 3, "UMB", "")
                                        AND (((current + 1) == last) OR StringAt((current + 2), 2, "ER", "")))
                                                //'dumb','thumb'
                                                OR (GetAt(current + 1) == 'M') )
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd("M");
                                break;

                        case 'N':
                                if(GetAt(current + 1) == 'N')
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd("N");
                                break;

                        case 'Ñ':
                                current += 1;
                                MetaphAdd("N");
                                break;

                        case 'P':
                                if(GetAt(current + 1) == 'H')
                                {
                                        MetaphAdd("F");
                                        current += 2;
                                        break;
                                }

                                //also account for "campbell", "raspberry"
                                if(StringAt((current + 1), 1, "P", "B", ""))
                                        current += 2;
                                else
                                        current += 1;
                                        MetaphAdd("P");
                                break;

                        case 'Q':
                                if(GetAt(current + 1) == 'Q')
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd("K");
                                break;

                        case 'R':
                                //french e.g. 'rogier', but exclude 'hochmeier'
                                if((current == last)
                                        AND !SlavoGermanic()
                                                AND StringAt((current - 2), 2, "IE", "")
                                                        AND !StringAt((current - 4), 2, "ME", "MA", ""))
                                        MetaphAdd("", "R");
                                else
                                        MetaphAdd("R");

                                if(GetAt(current + 1) == 'R')
                                        current += 2;
                                else
                                        current += 1;
                                break;

                        case 'S':
                                //special cases 'island', 'isle', 'carlisle', 'carlysle'
                                if(StringAt((current - 1), 3, "ISL", "YSL", ""))
                                {
                                        current += 1;
                                        break;
                                }

                                //special case 'sugar-'
                                if((current == 0) AND StringAt(current, 5, "SUGAR", ""))
                                {
                                        MetaphAdd("X", "S");
                                        current += 1;
                                        break;
                                }

                                if(StringAt(current, 2, "SH", ""))
                                {
                                        //germanic
                                        if(StringAt((current + 1), 4, "HEIM", "HOEK", "HOLM", "HOLZ", ""))
                                                MetaphAdd("S");
                                        else
                                                MetaphAdd("X");
                                        current += 2;
                                        break;
                                }

                                //italian & armenian
                                if(StringAt(current, 3, "SIO", "SIA", "") OR StringAt(current, 4, "SIAN", ""))
                                {
                                        if(!SlavoGermanic())
                                                MetaphAdd("S", "X");
                                        else
                                                MetaphAdd("S");
                                        current += 3;
                                        break;
                                }

                                //german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
                                //also, -sz- in slavic language altho in hungarian it is pronounced 's'
                                if(((current == 0)
                                                AND StringAt((current + 1), 1, "M", "N", "L", "W", ""))
                                                        OR StringAt((current + 1), 1, "Z", ""))
                                {
                                        MetaphAdd("S", "X");
                                        if(StringAt((current + 1), 1, "Z", ""))
                                                current += 2;
                                        else
                                                current += 1;
                                        break;
                                }

                                if(StringAt(current, 2, "SC", ""))
                                {
                                        //Schlesinger's rule
                                        if(GetAt(current + 2) == 'H')
                                                //dutch origin, e.g. 'school', 'schooner'
                                                if(StringAt((current + 3), 2, "OO", "ER", "EN", "UY", "ED", "EM", ""))
                                                {
                                                        //'schermerhorn', 'schenker'
                                                        if(StringAt((current + 3), 2, "ER", "EN", ""))
                                                        {
                                                                MetaphAdd("X", "SK");
                                                        }else
                                                                MetaphAdd("SK");
                                                        current += 3;
                                                        break;
                                                }else{
                                                        if((current == 0) AND !IsVowel(3) AND (GetAt(3) != 'W'))
                                                                MetaphAdd("X", "S");
                                                        else
                                                                MetaphAdd("X");
                                                        current += 3;
                                                        break;
                                                }

                                        if(StringAt((current + 2), 1, "I", "E", "Y", ""))
                                        {
                                                MetaphAdd("S");
                                                current += 3;
                                                break;
                                        }
                                        //else
                                        MetaphAdd("SK");
                                        current += 3;
                                        break;
                                }

                                //french e.g. 'resnais', 'artois'
                                if((current == last) AND StringAt((current - 2), 2, "AI", "OI", ""))
                                        MetaphAdd("", "S");
                                else
                                        MetaphAdd("S");

                                if(StringAt((current + 1), 1, "S", "Z", ""))
                                        current += 2;
                                else
                                        current += 1;
                                break;

                        case 'T':
                                if(StringAt(current, 4, "TION", ""))
                                {
                                        MetaphAdd("X");
                                        current += 3;
                                        break;
                                }

                                if(StringAt(current, 3, "TIA", "TCH", ""))
                                {
                                        MetaphAdd("X");
                                        current += 3;
                                        break;
                                }

                                if(StringAt(current, 2, "TH", "")
                                        OR StringAt(current, 3, "TTH", ""))
                                {
                                        //special case 'thomas', 'thames' or germanic
                                        if(StringAt((current + 2), 2, "OM", "AM", "")
                                                OR StringAt(0, 4, "VAN ", "VON ", "") OR StringAt(0, 3, "SCH", ""))
                                        {
                                                MetaphAdd("T");
                                        }else{
                                                MetaphAdd("0", "T");
                                        }
                                        current += 2;
                                        break;
                                }

                                if(StringAt((current + 1), 1, "T", "D", ""))
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd("T");
                                break;

                        case 'V':
                                if(GetAt(current + 1) == 'V')
                                        current += 2;
                                else
                                        current += 1;
                                MetaphAdd("F");
                                break;

                        case 'W':
                                //can also be in middle of word
                                if(StringAt(current, 2, "WR", ""))
                                {
                                        MetaphAdd("R");
                                        current += 2;
                                        break;
                                }

                                if((current == 0)
                                        AND (IsVowel(current + 1) OR StringAt(current, 2, "WH", "")))
                                {
                                        //Wasserman should match Vasserman
                                        if(IsVowel(current + 1))
                                                MetaphAdd("A", "F");
                                        else
                                                //need Uomo to match Womo
                                                MetaphAdd("A");
                                }

                                //Arnow should match Arnoff
                                if(((current == last) AND IsVowel(current - 1))
                                        OR StringAt((current - 1), 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY", "")
                                                        OR StringAt(0, 3, "SCH", ""))
                                {
                                        MetaphAdd("", "F");
                                        current +=1;
                                        break;
                                }

                                //polish e.g. 'filipowicz'
                                if(StringAt(current, 4, "WICZ", "WITZ", ""))
                                {
                                        MetaphAdd("TS", "FX");
                                        current +=4;
                                        break;
                                }

                                //else skip it
                                current +=1;
                                break;

                        case 'X':
                                //french e.g. breaux
                                if(!((current == last)
                                        AND (StringAt((current - 3), 3, "IAU", "EAU", "")
                                                        OR StringAt((current - 2), 2, "AU", "OU", ""))) )
                                        MetaphAdd("KS");

                                if(StringAt((current + 1), 1, "C", "X", ""))
                                        current += 2;
                                else
                                        current += 1;
                                break;

                        case 'Z':
                                //chinese pinyin e.g. 'zhao'
                                if(GetAt(current + 1) == 'H')
                                {
                                        MetaphAdd("J");
                                        current += 2;
                                        break;
                                }else
                                        if(StringAt((current + 1), 2, "ZO", "ZI", "ZA", "")
                                                OR (SlavoGermanic() AND ((current > 0) AND GetAt(current - 1) != 'T')))
                                        {
                                                MetaphAdd("S", "TS");
                                        }
                                        else
                                                MetaphAdd("S");

                                if(GetAt(current + 1) == 'Z')
                                        current += 2;
                                else
                                        current += 1;
                                break;

                        default:
                                // any other character produces no code
                                current += 1;
                }
        }

        metaph = primary;
        //only give back 4 char metaph
        //if(metaph.GetLength() > 4)
        //        metaph.SetAt(4,'\0');
        if(alternate)
        {
                metaph2 = secondary;
                //if(metaph2.GetLength() > 4)
                //        metaph2.SetAt(4,'\0');
        }

}
/// Builds the text of a noun phrase.
///
/// The pieces are emitted in this order, each followed by a space when it is
/// not empty: preposition, article, numeral; then the adjectives and the noun
/// itself; then, if requested on the noun, a genitive phrase and a relative
/// clause.
///
/// @param NounObj the noun to render; a noun whose ID is 0 renders as "".
/// @param ObjCase forwarded to GetNoun().
/// @return the assembled phrase.
std::string en::GetNounString(Noun* NounObj, bool ObjCase)
{
    if (NounObj->ID==0) return "";

    //Adjectives followed by the noun. This is assembled first because the
    //article needs to know whether it starts with a vowel ('a' vs 'an').
    std::string NounResult;
    for (int Slot = 0; Slot < 16; ++Slot)
    {
        const std::string Adjective = GetAdjective(NounObj->Adjectives[Slot]);
        if (!Adjective.empty())
        {
            NounResult += Adjective;
            NounResult += " ";
        }
    }
    NounResult += GetNoun(NounObj,ObjCase);

    //Leading words. The numeral is not part of NounResult, which shouldn't
    //matter as a numeral is not expected together with a/an.
    const std::string Article = GetArticle(NounObj,IsVowel(NounResult[0]));
    const std::string Numeral = GetNumeral(NounObj,false);
    const std::string Prepos = GetPrepos(NounObj->PreposNum);

    //Assemble: preposition, article, numeral, then adjectives + noun.
    std::string NounString;
    if (!Prepos.empty())
    {
        NounString += Prepos;
        NounString += " ";
    }
    if (!Article.empty())
    {
        NounString += Article;
        NounString += " ";
    }
    if (!Numeral.empty())
    {
        NounString += Numeral;
        NounString += " ";
    }
    NounString += NounResult;

    //Genitive object, rendered recursively.
    if (NounObj->ShouldUseGenitive)
    {
        NounString += " " + GenitiveMarker + " ";
        NounString += GetNounString(NounObj->GenitiveNoun,true);
    }

    //Relative clause.
    if (NounObj->ShouldUseRelativeClause)
    {
        NounObj->RelativeClause->IsClause = true;

        const int NounType = GetNounType(NounObj->ID);
        const bool IsPerson = (NounType == 'm' || NounType == 'f' ||
                               NounType == 'p' || NounType == 'd');
        if (IsPerson)
        {
            //A clause about a person uses "who"
            NounString += " " + RClausePersonalMarker;
        }
        else if (NounObj->IsRelativeClauseEssential)
        {
            //An essential clause uses "that"
            NounString += " " + RClauseEssentialMarker;
        }
        else
        {
            //A non-essential clause uses "which"
            NounString += " " + RClauseNonEssentialMarker;
        }

        NounString += " " + NounObj->RelativeClause->createSentence();
    }

    return NounString;
}
static int EditDistance(WORDINFO& dictWordData, WORDINFO& realWordData,int min) {// dictword has no underscores, inputSet is already lower case char dictw[MAX_WORD_SIZE]; MakeLowerCopy(dictw, dictWordData.word); char* dictinfo = dictw; char* realinfo = realWordData.word; char* dictstart = dictinfo; char* realstart = realWordData.word; int val = 0; // a difference in length will manifest as a difference in letter count // look at specific letter errors char priorCharDict[10]; char priorCharReal[10]; *priorCharDict = *priorCharReal = 0; char currentCharReal[10]; char currentCharDict[10]; *currentCharReal = *currentCharDict = 0; char nextCharReal[10]; char nextCharDict[10]; char next1CharReal[10]; char next1CharDict[10]; char* resumeReal2; char* resumeDict2; char* resumeReal; char* resumeDict; char* resumeReal1; char* resumeDict1; char baseCharReal; char baseCharDict; while (ALWAYS) { if (val > min) return 1000; // no good strcpy(priorCharReal, currentCharReal); strcpy(priorCharDict, currentCharDict); resumeReal = IsUTF8((char*)realinfo, currentCharReal); resumeDict = IsUTF8((char*)dictinfo, currentCharDict); if (!*currentCharReal && !*currentCharDict) break; //both at end if (!*currentCharReal || !*currentCharDict) // one ending, other has to catch up by adding a letter { val += 16; // add a letter if (*priorCharReal == *currentCharDict) val -= 10; // doubling letter at end dictinfo = resumeDict; realinfo = resumeReal; continue; } // punctuation in a word is bad tokenization, dont spell check it away if (*currentCharReal == '?' || *currentCharReal == '!' 
|| *currentCharReal == '(' || *currentCharReal == ')' || *currentCharReal == '[' || *currentCharReal == ']' || *currentCharReal == '{' || *currentCharReal == '}') return 200; // dont mess with this resumeReal1 = IsUTF8((char*)resumeReal, nextCharReal); resumeDict1 = IsUTF8((char*)resumeDict, nextCharDict); resumeReal2 = IsUTF8((char*)resumeReal1, next1CharReal); // 2 char ahead resumeDict2 = IsUTF8((char*)resumeDict1, next1CharDict); baseCharReal = UnaccentedChar(currentCharReal); baseCharDict = UnaccentedChar(currentCharDict); if (!stricmp(currentCharReal, currentCharDict)) // match chars { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (baseCharReal && baseCharReal == baseCharDict) { dictinfo = resumeDict; realinfo = resumeReal; val += 1; // minimal charge but separate forms of delivre continue; } // treat german double s and ss equivalent if (!stricmp(language, "german")) { if (*currentCharReal == 0xc3 && currentCharReal[1] == 0x9f && *currentCharDict == 's' && *nextCharDict == 's') { dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharDict == 0xc3 && currentCharDict[1] == 0x9f && *currentCharReal == 's' && *nextCharReal == 's') { dictinfo = resumeDict; realinfo = resumeReal1; continue; } } // spanish alternative spellings if (!stricmp(language, "spanish")) // ch-x | qu-k | c-k | do-o | b-v | bue-w | vue-w | z-s | s-c | h- | y-i | y-ll | m-n 1st is valid { if (*currentCharReal == 'c' && *currentCharDict == 'k') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'b' && *currentCharDict == 'v') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'v' && *currentCharDict == 'b') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'z' && *currentCharDict == 's') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 's' && *currentCharDict == 'c') { dictinfo = resumeDict; realinfo = resumeReal; continue; } 
if (*currentCharReal == 'y' && *currentCharDict == 'i') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'm' && *currentCharDict == 'n') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'n' && *currentCharDict == 'm') { dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharDict == 'h') { dictinfo = resumeDict; continue; } if (*currentCharReal == 'x' && *currentCharDict == 'c' && *nextCharDict == 'h') { dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharReal == 'k' && *currentCharDict == 'q' && *nextCharDict == 'u') { dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharReal == 'o' && *currentCharDict == 'd' && *nextCharDict == 'o') { dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharReal == 'w' && *currentCharDict == 'b' && *nextCharDict == 'u' && *next1CharDict == 'e') { dictinfo = resumeDict2; realinfo = resumeReal; continue; } if (*currentCharReal == 'w' && *currentCharDict == 'v' && *nextCharDict == 'u' && *next1CharDict == 'e') { dictinfo = resumeDict2; realinfo = resumeReal; continue; } if (*currentCharReal == 'l' && *nextCharReal == 'l' && *currentCharDict == 'y') { dictinfo = resumeDict; realinfo = resumeReal1; continue; } if (*currentCharReal == 'y' && *currentCharDict == 'l' && *nextCharDict == 'l') { dictinfo = resumeDict1; realinfo = resumeReal; continue; } } // french common bad spellings if (!stricmp(language, "french")) { if (*currentCharReal == 'a' && SameUTF(currentCharDict,"â")) { val += 1; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'e' && SameUTF(currentCharDict,"ê")) { val += 10; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 0xc3 && currentCharReal[1] == 0xa8 && SameUTF(currentCharDict,"ê")) { val += 5; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'i' && 
SameUTF(currentCharDict,"î")) { val += 1; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'o' && SameUTF(currentCharDict, "ô")) { val += 1; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'u' && SameUTF(currentCharDict, "û")) { val += 5; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'y' && *currentCharDict == 'l' && *nextCharDict == 'l') { val += 10; dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharReal == 'k' && *currentCharDict == 'q' && *nextCharDict == 'u') { val += 10; dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharReal == 'f' && *currentCharDict == 'p' && *nextCharDict == 'h') { val += 5; dictinfo = resumeDict1; realinfo = resumeReal; continue; } if (*currentCharReal == 's' && *currentCharDict == 'c') { val += 10; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 's' && SameUTF(currentCharDict, "ç")) { val += 5; dictinfo = resumeDict; realinfo = resumeReal; continue; } if (*currentCharReal == 'c' && SameUTF(currentCharDict, "ç")) { val += 5; dictinfo = resumeDict; realinfo = resumeReal; continue; } } // probable transposition since swapping syncs up if (!strcmp(currentCharReal, nextCharDict) && !strcmp(nextCharReal, currentCharDict)) { val += 16; // more expensive if next is not correct after transposition dictinfo = resumeDict2; // skip ahead 2 realinfo = resumeReal2; continue; } // probable mistyped letter since next matches up if (!strcmp(nextCharReal, nextCharDict)) { val += 16; // more expensive if 2nd next is not correct after transposition if (*currentCharReal == 's' && *currentCharDict == 'z') val -= 5; else if (*currentCharReal == 'z' && *currentCharDict == 's') val -= 5; else if (IsVowel(*currentCharReal) && IsVowel(*currentCharDict)) val -= 6; // low cost for switching a vowel dictinfo = resumeDict; realinfo = resumeReal; continue; } // probable excess letter by 
user since next matches up to current if (!strcmp(nextCharReal, currentCharDict)) { val += 16; // only delete 1 letter if (*priorCharDict == *currentCharReal) val -= 14; // low cost for dropping an excess repeated letter (wherre->where not wherry) else if (*currentCharReal == '-') val -= 10; // very low cost for removing a hypen dictinfo = resumeDict; // skip over letter we match momentarily realinfo = resumeReal1; // move on past our junk letter and match letter continue; } // probable missing letter by user since current matches up to nextdict if (!strcmp(nextCharDict, currentCharReal)) { val += 16; // only add 1 letter // better to add repeated letter than to drop a letter if (*currentCharDict == *priorCharReal) val -= 6; // low cost for adding a repeated letter else if (*currentCharDict == 'e' && *nextCharDict == 'o') val -= 10; // yoman->yeoman dictinfo = resumeDict1; // skip over letter we match momentarily realinfo = resumeReal; // move on past our junk letter and match letter continue; } // complete mismatch with no understanding of why, just fix them and move on dictinfo = resumeDict; // skip over letter we match momentarily realinfo = resumeReal; // move on past our junk letter and match letter val += 16; } return val; }