// add morphological analysis to CFSArray containing the sentence void addAnalysis(CLinguistic& linguistic, CDisambiguator& disambiguator, CFSArray<CFSVar>& words, const bool disambiguate) { //CFSVar &words=Data["words"]; CFSArray<CPTWord> PTWords; for (INTPTR ip=0; ip<words.GetSize(); ip++) { PTWords.AddItem(words[ip]["text"].GetWString()); } // perform analysis and optional disambiguation CFSArray<CMorphInfos> MorphResults=linguistic.AnalyzeSentense(PTWords); if (disambiguate) { MorphResults=disambiguator.Disambiguate(MorphResults); } // collect the analysis results ASSERT(PTWords.GetSize()==MorphResults.GetSize()); for (INTPTR ip=0; ip<words.GetSize(); ip++) { const CFSArray<CMorphInfo> &Analysis=MorphResults[ip].m_MorphInfo; CFSVar VarAnalysis; VarAnalysis.Cast(CFSVar::VAR_ARRAY); for (INTPTR ipRes=0; ipRes<Analysis.GetSize(); ipRes++) { const CMorphInfo &Analysis1=Analysis[ipRes]; CFSVar VarAnalysis1; VarAnalysis1["root"]=Analysis1.m_szRoot; VarAnalysis1["ending"]=Analysis1.m_szEnding; VarAnalysis1["clitic"]=Analysis1.m_szClitic; VarAnalysis1["partofspeech"]=CFSWString(Analysis1.m_cPOS); VarAnalysis1["form"]=Analysis1.m_szForm; VarAnalysis[ipRes]=VarAnalysis1; } words[ip]["analysis"]=VarAnalysis; } }
// First pass of the letter -> phone conversion.
//
// Walks `s` left to right and builds the result string:
//   - ']' emits nothing; the character before it is upper-cased in the
//     result, and up to two characters on either side are upper-cased too
//     when can_palat() accepts them (presumably this marks palatalization --
//     TODO confirm against can_palat). Characters behind the marker are
//     patched in the result, characters ahead of it are patched in `s`.
//   - '<' emits nothing; the character that follows it is upper-cased in `s`.
//   - '?' emits nothing (abnormal stress; what to do with it is decided later).
//   - x -> "ks", y -> "i", w -> "v", z -> "ts", c -> "k".
//   - 'ü' that is preceded by 'ü' and followed by a vowel becomes "i".
//   - everything else is copied unchanged.
//
// `s` is modified in place by the look-ahead steps. Returns the converted string.
//
// All neighbour accesses are bounds-checked, so a marker at the very start or
// end of `s` no longer reads or writes outside the strings.
CFSWString chars_to_phones_part_I(CFSWString &s) {
    CFSWString res;
    for (INTPTR i = 0; i < s.GetLength(); i++) {
        CFSWString c = s.GetAt(i);
        if (c == L']') {
            // The character in front of the marker is always upper-cased.
            if (i >= 1 && res.GetLength() >= 1) {
                CFSWString t = CFSWString(s.GetAt(i - 1)).ToUpper();
                res.SetAt(res.GetLength() - 1, t.GetAt(0));
            }
            // Look back (the original author was not sure this is needed).
            if (i >= 2 && res.GetLength() >= 2) {
                CFSWString t = CFSWString(s.GetAt(i - 2)).ToUpper();
                if (can_palat(t)) {
                    res.SetAt(res.GetLength() - 2, t.GetAt(0));
                    if (i >= 3 && res.GetLength() >= 3) {
                        t = CFSWString(s.GetAt(i - 3)).ToUpper();
                        if (can_palat(t)) {
                            // Third position back. This used to write to
                            // GetLength() - 2 and so replaced the previous
                            // phone with a different letter.
                            res.SetAt(res.GetLength() - 3, t.GetAt(0));
                        }
                    }
                }
            }
            // Look ahead: the following characters have not been emitted yet,
            // so they are patched in the input string.
            if (i + 1 < s.GetLength()) {
                CFSWString t = CFSWString(s.GetAt(i + 1)).ToUpper();
                if (can_palat(t)) {
                    s.SetAt(i + 1, t.GetAt(0));
                    if (i + 2 < s.GetLength()) {
                        t = CFSWString(s.GetAt(i + 2)).ToUpper();
                        if (can_palat(t)) {
                            s.SetAt(i + 2, t.GetAt(0));
                        }
                    }
                }
            }
        }
        else if (c == L'<') {
            if (i + 1 < s.GetLength()) {
                CFSWString t = CFSWString(s.GetAt(i + 1)).ToUpper();
                s.SetAt(i + 1, t.GetAt(0));
            }
        }
        else if (c == L'?') {
            // Abnormal stress. What to do with it is decided later.
        }
        else if (c == L'x') res += L"ks";
        else if (c == L'y') res += L"i";
        else if (c == L'w') res += L"v";
        else if (c == L'z') res += L"ts";
        else if (c == L'c') {
            res += L"k";
        }
        else if (c == L'ü' && i >= 1 && i + 1 < s.GetLength()
                 && is_vowel(s.GetAt(i + 1)) && s.GetAt(i - 1) == L'ü')
            res += L"i";
        else res += c;
    }
    return res;
}
// Runs the whole pipeline for one utterance and returns the label lines
// produced by do_label().
//
// Steps, as visible in this function:
//   1. do_phrases() splits the utterance into phrases (u.phr_vector).
//   2. phrase2words() appends the words of each phrase to one word list.
//   3. The word list is analyzed and disambiguated; only the first analysis
//      of every word is used.
//   4. For every word, root + ending + clitic are joined and lower-cased,
//      do_syls() syllabifies it and do_phones() converts each syllable to
//      phones, while the count and position fields of the utterance, phrase,
//      word, syllable and phone structures are filled in.
//
//   utt         - the utterance text
//   print_label - if true, print_u() is called on the finished structure
//   print_utt   - if true, the utterance text is written to stderr
CFSArray<CFSWString> do_all(CFSWString utt, bool print_label, bool print_utt) {
    CFSArray<CFSWString> res;
    CFSArray<CPTWord> PTW;
    utterance_struct u;
    u.s = utt;
    u.syl_c = 0;
    u.phone_c = 0;
    u.phra_c = do_phrases(u);

    if (print_utt) fprintf(stderr, "%s\n", ccstr(utt));

    for (INTPTR i = 0; i < u.phr_vector.GetSize(); i++) {
        u.phr_vector[i].utt_p = i;
        phrase2words(u.phr_vector[i], PTW);
    }

    CFSArray<CMorphInfos> MRs = Disambiguator.Disambiguate(Linguistic.AnalyzeSentense(PTW));

    // The disambiguator may leave several analyses per word; this is the place
    // where that result is reduced to a single one (the first analysis wins).
    // NOTE(review): assumes every word has at least one analysis -- confirm
    // the analyzer is configured so that this always holds.
    CFSArray<CMorphInfo> words;
    for (INTPTR i = 0; i < MRs.GetSize(); i++)
        words.AddItem(MRs[i].m_MorphInfo[0]);
    u.word_c = words.GetSize();

    word_struct w;
    // Index of the next unconsumed entry of `words`. A cursor replaces the
    // former "take words[0], then RemoveItem(0, 1)" pattern, which shifted the
    // whole array once per word.
    INTPTR next_word = 0;
    INTPTR word_utt_p = 1;   // 1-based running counters across the utterance
    INTPTR syl_utt_p = 1;
    INTPTR phone_utt_p = 1;

    for (INTPTR i = 0; i < u.phr_vector.GetSize(); i++) {
        u.phr_vector[i].utt_p = i + 1;
        INTPTR syl_phr_p = 1;    // 1-based running counters within the phrase
        INTPTR phone_phr_p = 1;
        INTPTR phrase_pho_c = 1;

        for (INTPTR i1 = 0; i1 < u.phr_vector[i].word_c; i1++) {
            if (next_word >= words.GetSize()) {
                // The phrase claims more words than the analyzer returned.
                // Keep the structure consistent with what was actually built
                // instead of reading past the end of `words`.
                u.phr_vector[i].word_c = i1;
                break;
            }

            w.utt_p = word_utt_p++;
            w.phr_p = i1 + 1;
            w.mi = words[next_word++];
            w.mi.m_szRoot += make_char_string(w.mi.m_szEnding) + w.mi.m_szClitic;
            w.mi.m_szRoot = w.mi.m_szRoot.ToLower();
            // Conjunctions and "ei" carry no quantity-degree mark.
            if ((CFSWString(w.mi.m_cPOS) == L"J") || (w.mi.m_szRoot == L"<ei"))
                w.mi.m_szRoot.Replace(L"<", L"", 1);
            do_syls(w);
            u.phr_vector[i].word_vector.AddItem(w);

            // word_vector is not resized inside the loops below, so the
            // reference stays valid.
            word_struct &cur_word = u.phr_vector[i].word_vector[i1];
            INTPTR phone_word_p = 1;
            INTPTR word_pho_c = 1;

            for (INTPTR i2 = 0; i2 < cur_word.syl_vector.GetSize(); i2++) {
                u.syl_c++;
                u.phr_vector[i].syl_c++;
                cur_word.syl_c++;
                cur_word.syl_vector[i2].phr_p = syl_phr_p++;
                cur_word.syl_vector[i2].utt_p = syl_utt_p++;
                INTPTR syl_phone_c = 1;
                do_phones(cur_word.syl_vector[i2]);

                for (INTPTR i3 = 0; i3 < cur_word.syl_vector[i2].phone_vector.GetSize(); i3++) {
                    u.phone_c++;
                    u.phr_vector[i].phone_c = phrase_pho_c++;
                    cur_word.phone_c = word_pho_c++;
                    cur_word.syl_vector[i2].phone_c = syl_phone_c++;

                    phone_struct p = cur_word.syl_vector[i2].phone_vector[i3];
                    p.utt_p = phone_utt_p++;
                    p.phr_p = phone_phr_p++;
                    p.word_p = phone_word_p++;
                    cur_word.syl_vector[i2].phone_vector[i3] = p;
                }
            }
        }
    }

    if (print_label) print_u(u);
    res = do_label(u);
    return res;
}
int CSuggestor::Suggest(const CFSWString &szWord, bool bStartSentence){ m_TimeStart=CFSTime::Now(); m_Items.Cleanup(); m_Cap.SetCap(szWord); if (bStartSentence && m_Cap.GetCapMode()==CFSStrCap<CFSWString>::CAP_LOWER) { m_Cap.SetCapMode(CFSStrCap<CFSWString>::CAP_INITIAL); } CFSWString szWordHigh=szWord.ToUpper(); INTPTR ipWordLength=szWordHigh.GetLength(); CFSWString szTemp; INTPTR i, j; long lLevel=100; SetLevel(lLevel); // Case problems & change list i=SpellWord(szWordHigh, szTemp, &lLevel); if ((i==SPL_NOERROR || i==SPL_CHANGEONCE) && !szTemp.IsEmpty()){ SetLevel(GetLevelGroup(lLevel)); m_Items.AddItem(CSuggestorItem(szTemp, lLevel)); } else SetLevel(5); // Abbrevations // !!! Unimplemented // Quotes /* if (ipWordLength>=2 && (szAllQuot.Find(szWordHigh[0])>=0 || szAllQuot.Find(szWordHigh[ipWordLength-1])>=0)) { szTemp=szWordHigh; int iPos; if (szAllQuot.Find(szTemp[0])>=0){ if (szQuotLeft.Find(szTemp[0])>=0) { } else if ((iPos=szQuotRight.Find(szTemp[0]))>=0) { szTemp[0]=szQuotLeft[iPos]; } else if (szDQuotLeft.Find(szTemp[0])>=0) { } else if ((iPos=szDQuotRight.Find(szTemp[0]))>=0) { szTemp[0]=szDQuotLeft[iPos]; } if (szAllQuot.Find(szTemp[ipWordLength-1])>=0) { szTemp[ipWordLength-1]=(szQuotRight+szDQuotRight)[(szQuotLeft+szDQuotLeft).Find(szTemp[0])]; } else{ if (szQuotRight.Find(szTemp[ipWordLength-1])>=0) { } else if ((iPos=szQuotLeft.Find(szTemp[ipWordLength-1]))>=0) { szTemp[ipWordLength-1]=szQuotRight[iPos]; } else if (szDQuotRight.Find(szTemp[ipWordLength-1])>=0) { } else if ((iPos=szDQuotLeft.Find(szTemp[ipWordLength-1]))>=0) { szTemp[ipWordLength-1]=szDQuotRight[iPos]; } } CheckAndAdd(szTemp); }*/ // Add space for (i=1; i<ipWordLength-1; i++){ static CFSWString szPunktuation=FSWSTR(".:,;!?"); if (szPunktuation.Find(szWord[i])>=0){ long lLevel1, lLevel2; CFSWString szTemp1, szTemp2; if (SpellWord(szWord.Left(i+1), szTemp1, &lLevel1)==SPL_NOERROR && SpellWord(szWord.Mid(i+1), szTemp2, &lLevel2)==SPL_NOERROR) { 
m_Items.AddItem(CSuggestorItem(szWord.Left(i+1)+L' '+szWord.Mid(i+1), FSMAX(lLevel1, lLevel2))); } } } // Delete following blocks: le[nnu][nnu]jaam for (i=2; i<=3; i++){ for (j=0; j<ipWordLength-i-i; j++){ if (memcmp((const FSWCHAR *)szWordHigh+j, (const FSWCHAR *)szWordHigh+j+i, i*sizeof(FSWCHAR))==0){ szTemp=szWordHigh.Left(j)+szWordHigh.Mid(j+i); CheckAndAdd(szTemp); } } } // Change following letters: abb -> aab & aab -> abb for (i=1; i<ipWordLength-1; i++){ if (szWordHigh[i]==szWordHigh[i+1]){ szTemp=szWordHigh; szTemp[i]=szTemp[i-1]; if (FSIsLetterEst(szTemp[i])) CheckAndAdd(szTemp); } else if (szWordHigh[i]==szWordHigh[i-1]){ szTemp=szWordHigh; szTemp[i]=szTemp[i+1]; if (FSIsLetterEst(szTemp[i])) CheckAndAdd(szTemp); } } // Exchange letters: van[na]ema -> van[an]ema szTemp=szWordHigh; for (i=1; i<ipWordLength; i++){ if (szTemp[i]!=szTemp[i-1]){ FSWCHAR ch=szTemp[i]; szTemp[i]=szTemp[i-1]; szTemp[i-1]=ch; CheckAndAdd(szTemp); szTemp[i-1]=szTemp[i]; szTemp[i]=ch; } } // Change blocks for (i=0; i<ipWordLength; i++){ for (j=0; j<(INTPTR)(sizeof(ChangeStrings)/sizeof(__CChangeStrings)); j++){ if (szWordHigh.ContainsAt(i, ChangeStrings[j].m_lpszFrom)){ szTemp=szWordHigh.Left(i)+ChangeStrings[j].m_lpszTo+szWordHigh.Mid(i+FSStrLen(ChangeStrings[j].m_lpszFrom)); CheckAndAdd(szTemp); } } } // Change end blocks for (i=0; i<(INTPTR)(sizeof(ChangeStringsEnd)/sizeof(__CChangeStrings)); i++){ if (szWordHigh.EndsWith(ChangeStringsEnd[i].m_lpszFrom)){ szTemp=szWordHigh.Left(ipWordLength-FSStrLen(ChangeStringsEnd[i].m_lpszFrom))+ChangeStringsEnd[i].m_lpszTo; CheckAndAdd(szTemp); } } // Po~o~sas MultiReplace(szWordHigh, 0); // gi/ki: Kylli[gi]le -> Kyllile[gi] for (i=3; i<=6; i++){ if (i>ipWordLength) break; if (memcmp((const FSWCHAR *)szWordHigh+ipWordLength-i, FSWSTR("GI"), 2*sizeof(FSWCHAR))==0){ szTemp=szWordHigh.Left(ipWordLength-i)+szWordHigh.Mid(ipWordLength-i+2)+FSWSTR("GI"); CheckAndAdd(szTemp); 
szTemp=szWordHigh.Left(ipWordLength-i)+szWordHigh.Mid(ipWordLength-i+2)+FSWSTR("KI"); CheckAndAdd(szTemp); } } // Delete letters: van[n]aema -> vanaema szTemp=szWordHigh.Mid(1); CheckAndAdd(szTemp); for (i=0; i<ipWordLength-1; i++){ if (szTemp[i]!=szWordHigh[i]){ szTemp[i]=szWordHigh[i]; CheckAndAdd(szTemp); } } // Change letters from list for (i=0; i<ipWordLength; i++){ const FSWCHAR *lpszTo=__SuggestChangeLetters(szWordHigh[i]); if (!lpszTo) continue; szTemp=szWordHigh; for (; lpszTo[0]; lpszTo++){ szTemp[i]=lpszTo[0]; CheckAndAdd(szTemp); } } // Insert letters to word body for (i=1; i<ipWordLength; i++){ szTemp=szWordHigh.Left(i)+FSWSTR(' ')+szWordHigh.Mid(i); for (j=0; szInsertLetters[j]; j++){ szTemp[i]=szInsertLetters[j]; CheckAndAdd(szTemp); } } // Insert letters to the beginning szTemp=CFSWString(FSWSTR(" "))+szWordHigh; for (i=0; szInsertLettersBeg[i]; i++){ if (szTemp[1]==szInsertLettersBeg[i]) continue; szTemp[0]=szInsertLettersBeg[i]; CheckAndAdd(szTemp); } // Try apostrophe for names if (szWord[0]!=szWordHigh[0] && szWordHigh.Find('\'')<0){ for (i=0; i<5; i++){ if (i>=ipWordLength) break; szTemp=szWordHigh.Left(ipWordLength-i)+L'\''+szWordHigh.Mid(ipWordLength-i); CheckAndAdd(szTemp); } } Order(); RemoveImmoderate(); RemoveDuplicates(); return 0; }
// First pass of the letter -> phone conversion.
//
// The rules that shift the quantity-degree mark work partly on phonemes and
// partly on letters, so the letter -> phoneme conversion has to be done in
// two parts; this is the first one.
//
// Palatalization relies on the idea that a palatalized phoneme cannot have
// palatalizable but non-palatalized phonemes next to it: when the ']' marker
// is met, the character before it is upper-cased and up to two neighbours on
// either side are upper-cased as well if can_palat() accepts them. Characters
// behind the marker are patched in the result, characters ahead of it are
// patched in `s`.
//
// Other rules:
//   - '<' emits nothing; the character that follows it is upper-cased in `s`.
//   - '?' emits nothing (abnormal stress; what to do with it is decided later).
//   - x -> "ks", y -> "i", w -> "v", z -> "ts", c -> "k".
//   - 'ü' preceded by 'ü' and followed by a vowel becomes "i" (müüa -> müia).
//   - q -> "k"; if a vowel follows, that vowel is replaced by 'v' in `s`.
//   - everything else is copied unchanged.
//
// `s` is modified in place by the look-ahead steps. Returns the converted string.
//
// All neighbour accesses are bounds-checked, so a marker at the very start or
// end of `s` no longer reads or writes outside the strings.
CFSWString chars_to_phones_part_I(CFSWString &s) {
    CFSWString res;
    for (INTPTR i = 0; i < s.GetLength(); i++) {
        CFSWString c = s.GetAt(i);
        if (c == L']') {
            // The character in front of the marker is always upper-cased.
            if (i >= 1 && res.GetLength() >= 1) {
                CFSWString t = CFSWString(s.GetAt(i - 1)).ToUpper();
                res.SetAt(res.GetLength() - 1, t.GetAt(0));
            }
            // Look back (the original author was not sure this is needed).
            if (i >= 2 && res.GetLength() >= 2) {
                CFSWString t = CFSWString(s.GetAt(i - 2)).ToUpper();
                if (can_palat(t)) {
                    res.SetAt(res.GetLength() - 2, t.GetAt(0));
                    if (i >= 3 && res.GetLength() >= 3) {
                        t = CFSWString(s.GetAt(i - 3)).ToUpper();
                        if (can_palat(t)) {
                            // Third position back. This used to write to
                            // GetLength() - 2 and so replaced the previous
                            // phone with a different letter.
                            res.SetAt(res.GetLength() - 3, t.GetAt(0));
                        }
                    }
                }
            }
            // Look ahead: the following characters have not been emitted yet,
            // so they are patched in the input string.
            if (i + 1 < s.GetLength()) {
                CFSWString t = CFSWString(s.GetAt(i + 1)).ToUpper();
                if (can_palat(t)) {
                    s.SetAt(i + 1, t.GetAt(0));
                    if (i + 2 < s.GetLength()) {
                        t = CFSWString(s.GetAt(i + 2)).ToUpper();
                        if (can_palat(t)) {
                            s.SetAt(i + 2, t.GetAt(0));
                        }
                    }
                }
            }
        }
        else if (c == L'<') {
            if (i + 1 < s.GetLength()) {
                CFSWString t = CFSWString(s.GetAt(i + 1)).ToUpper();
                s.SetAt(i + 1, t.GetAt(0));
            }
        }
        else if (c == L'?') {
            // Abnormal stress. What to do with it is decided later.
        }
        else if (c == L'x') res += L"ks";
        else if (c == L'y') res += L"i";
        else if (c == L'w') res += L"v";
        else if (c == L'z') res += L"ts";
        else if (c == L'c') {
            res += L"k";
        }
        else if (c == L'ü' && i >= 1 && i + 1 < s.GetLength()
                 && is_vowel(s.GetAt(i + 1)) && s.GetAt(i - 1) == L'ü')
            res += L"i";
        else if (c == L'q') {
            res += L"k";
            // "qu" + ... style sequences: the vowel after q turns into 'v'.
            if (i + 1 < s.GetLength() && is_vowel(s.GetAt(i + 1))) {
                s.SetAt(i + 1, L'v');
            }
        }
        else res += c;
    }
    return res;
}
void OnValReadEnd(const CFSAString &szKey, CFSVar &Data) { if (szKey.IsEmpty()) { SubKeys("paragraphs", Data); m_Writer.ObjectEnd(); } else if (szKey=="/paragraphs") { m_Writer.ArrayEnd(); m_iCollectData++; } else if (KeyMatch(szKey, "/paragraphs/%d")) { SubKeys("sentences", Data); m_Writer.ObjectEnd(); m_iCollectData--; } else if (KeyMatch(szKey, "/paragraphs/%d/sentences")) { m_Writer.ArrayEnd(); m_iCollectData++; } else if (KeyMatch(szKey, "/paragraphs/%d/sentences/%d")) { if (Data.KeyExist("words")) { CFSVar &Words=Data["words"]; CFSArray<CMorphInfos> WordsAnalysis; for (INTPTR ip=0; ip<Words.GetSize(); ip++) { const CFSVar &Word=Words[ip]; CMorphInfos Analysis; Analysis.m_szWord=Word["text"].GetWString(); const CFSVar &VarAnalysis=Word["analysis"]; for (INTPTR ip2=0; ip2<VarAnalysis.GetSize(); ip2++) { const CFSVar &VarAnalysis1=VarAnalysis[ip2]; CMorphInfo Analysis1; Analysis1.m_szRoot=VarAnalysis1["root"].GetWString(); Analysis1.m_szEnding=VarAnalysis1["ending"].GetWString(); Analysis1.m_szClitic=VarAnalysis1["clitic"].GetWString(); Analysis1.m_cPOS=VarAnalysis1["partofspeech"].GetWString()[0]; Analysis1.m_szForm=VarAnalysis1["form"].GetWString(); Analysis.m_MorphInfo.AddItem(Analysis1); } WordsAnalysis.AddItem(Analysis); } WordsAnalysis=m_Disambiguator.Disambiguate(WordsAnalysis); RT_ASSERT(Words.GetSize()==WordsAnalysis.GetSize()); for (INTPTR ip=0; ip<Words.GetSize(); ip++) { const CMorphInfos &Analysis=WordsAnalysis[ip]; CFSVar VarAnalysis; VarAnalysis.Cast(CFSVar::VAR_ARRAY); for (INTPTR ipRes=0; ipRes<Analysis.m_MorphInfo.GetSize(); ipRes++) { const CMorphInfo &Analysis1=Analysis.m_MorphInfo[ipRes]; CFSVar VarAnalysis1; VarAnalysis1["root"]=Analysis1.m_szRoot; VarAnalysis1["ending"]=Analysis1.m_szEnding; VarAnalysis1["clitic"]=Analysis1.m_szClitic; VarAnalysis1["partofspeech"]=CFSWString(Analysis1.m_cPOS); VarAnalysis1["form"]=Analysis1.m_szForm; VarAnalysis[ipRes]=VarAnalysis1; } Words[ip]["analysis"]=VarAnalysis; } } m_Writer.Val(Data); 
m_iCollectData--; } }