Пример #1
0
bool CMorphDictionary::FindWord(string& InputWordStr, bool cap, bool predict, bool max_predict, vector<CFindWordNode>& results) const
{
	size_t nLen = InputWordStr.length();
	if (	(nLen == 0)
		||	(nLen > GLOBAL_MAX_WORD_LEN))
		return false;

	RmlMakeUpper (InputWordStr, m_Language);

	CResultVector FlexResults;

	bool retval = FlexFind(InputWordStr, FlexResults);
	if( !retval )
		return( false );

	retval = FindBases(InputWordStr, FlexResults, results);
	if( retval )
		return( true );
	
	if( !predict )
		return( false );

	
	if( InputWordStr.length() < m_Predict.m_CountOfLetters )
		return( false );

	retval = PredictByDataBase(InputWordStr, FlexResults, results,cap,max_predict);
	
	return (retval);
}
Пример #2
0
//   CLemmatizer::LemmatizeWord returns true if the word was found in the
// dictionary (GetInnerMorphInfos produced at least one result); if the word
// was only predicted, or nothing was found at all, it returns false.
//   InputWordStr   - the word; it is passed to RmlMakeUpper before the lookup
//   cap            - only forwarded to PredictByDataBase
//   predict        - not used by this function; prediction is controlled by m_bUsePrediction
//   results        - receives the found or predicted annotations
//   bGetLemmaInfos - if true and results is not empty, GetLemmaInfos is called for the results
bool CLemmatizer::LemmatizeWord(string& InputWordStr, const bool cap, const bool predict, vector<CAutomAnnotationInner>& results, bool bGetLemmaInfos) const
{
	RmlMakeUpper (InputWordStr, GetLanguage());

	// NOTE(review): presumably PredictBySuffix sets this to the start of the
	// recognized suffix - confirm against its definition
	size_t WordOffset = 0;

	m_pFormAutomat->GetInnerMorphInfos(InputWordStr, 0, results);

	const bool bResult = !results.empty();

	if (!bResult && m_bUsePrediction)
	{
		PredictBySuffix(InputWordStr, WordOffset, 4, results); // the length of the minimal suffix is 4

		// With WordOffset == 0 there is no character in front of the suffix:
		// InputWordStr[WordOffset-1] would be indexed with size_t(-1), which is
		// an out-of-bounds read. In that case there is no unknown prefix to check.
		if (WordOffset > 0 && InputWordStr[WordOffset-1] != '-') //  and there is no hyphen
		{
			const size_t KnownPostfixLen = InputWordStr.length()-WordOffset;
			const size_t UnknownPrefixLen = WordOffset;
			if (KnownPostfixLen < 6) // if the known part is too short
			{
				// keep the prediction only if the unknown part is accepted by IsPrefix
				if (!IsPrefix(InputWordStr.substr(0, UnknownPrefixLen)))
					results.clear();
			}
		}

		// cancel the prediction made from pronouns, for example "Semykinym":
		// one result with an unknown part of speech discards all results
		for (size_t i=0; i<results.size(); i++)
			if (m_NPSs[results[i].m_ModelNo] == UnknownPartOfSpeech)
			{
				results.clear();
				break;
			}
	}

	if (!results.empty())
	{
		if (bGetLemmaInfos)
			GetLemmaInfos(InputWordStr, WordOffset, results);
	}
	else
		if (m_bUsePrediction)
		{
			// last resort; GetLemmaInfos is not called for these results
			PredictByDataBase(InputWordStr, results,cap);
		}

	return bResult;
}
Пример #3
0
// loading simple list of words from a file, lemmatizing it, and storing
void CMorphDictionary::LoadFrequentRoots(string path)
{
	string load_path = path+"predictroots.txt";
	FILE*fp =  fopen (load_path.c_str(), "r");
	if (!fp) return;
	char buffer[1000];
	while (fgets (buffer, 1000, fp))
	{
		string WordStr = buffer;
		Trim(WordStr);
		if (WordStr.empty()) continue;
		RmlMakeUpper (WordStr, m_Language);
		vector<CFindWordNode> FindResults;
		bool retval = FindWord(WordStr, true, false, false, FindResults);
		if (!retval) continue;
		set<size_t> UsedFlexia;
		for (size_t i=0; i<FindResults.size(); i++)
		{
				CFormInfo P;
				P.Create(	this, 
							FindResults[i].m_nBase,
							FindResults[i].m_LemmaInfo,
							FindResults[i].m_nFlex
						);
				for (size_t j=0; j < P.GetCount(); j++)
				{
					size_t FlexNo = P.GetFlexNoOfForm(j);
					if (UsedFlexia.find(FlexNo) == UsedFlexia.end())
					{
						CPredictEndRoot R;
						R.m_BaseNo = FindResults[i].m_nBase;
						R.m_LemmaInfo = FindResults[i].m_LemmaInfo;
						R.m_EndRoot = P.GetWordForm(j);
						reverse(R.m_EndRoot.begin(),R.m_EndRoot.end());
						
						R.m_FlexNo = FlexNo;
						UsedFlexia.insert(FlexNo);
						m_PredictEndRoots.push_back(R);
					};
				};
			

		};

	};
	fclose(fp);
	sort(m_PredictEndRoots.begin(), m_PredictEndRoots.end());
};
Пример #4
0
// Initializes the German-specific options:
//  1. reads the list "adj_prp.txt" from the syntax directory into *m_pAdjPrp,
//     cuts every entry at the first '+' and at the first space, passes it to
//     RmlMakeUpper and sorts the list;
//  2. loads the grammar "gformats.txt" into m_FormatsGrammar.
// Returns false if the list cannot be read or the grammar cannot be loaded
// (in the latter case an error message is shown).
bool CGerSyntaxOpt :: InitOptionsLanguageSpecific()
{
	//  reading adjectives
	string strFileName = GetSyntaxFilePath()+"adj_prp.txt";
	{
		if (!ReadListFile (strFileName.c_str(),(*m_pAdjPrp)))
			return false;
		// deleting valency information
		for (size_t i=0; i  < m_pAdjPrp->size(); i++)
		{
			string&  s = (*m_pAdjPrp)[i];
			// The position is kept in string::size_type: storing it in an int
			// narrows the value and makes the comparison with string::npos
			// depend on integer wrap-around.
			string::size_type q = s.find('+');
			if (q != string::npos)
				s.erase(q);
			q = s.find(' ');
			if (q != string::npos)
				s.erase(q);
			RmlMakeUpper(s, morphGerman);
		}
		sort(m_pAdjPrp->begin(), m_pAdjPrp->end());
	}


	// reading formats

	strFileName = GetSyntaxFilePath()+"gformats.txt";
	m_FormatsGrammar.m_Language = morphGerman;
	m_FormatsGrammar.m_pGramTab = GetGramTab();
	m_FormatsGrammar.m_SourceGrammarFile = strFileName;
	if (!LoadGrammarForGLR(m_FormatsGrammar, true, false))
	{
		ErrorMessage(  Format("Cannot load %s\n", strFileName.c_str()));
		return false;
	}


	return true;
}
Пример #5
0
// Rebuilds m_SynthLemmaToReplace from a string of the form
//     "lemma1/lemma2;lemma3/lemma4;..."
// The whole string is passed to RmlMakeUpper first. Every ';'-separated item
// must consist of a non-empty first lemma, a '/', and a non-empty second
// lemma; the second lemma ends at the next '/' (if any). Both lemmas are
// trimmed and must not be empty after trimming.
// Returns false on the first malformed item; the pairs that were parsed
// before it stay in m_SynthLemmaToReplace.
bool CRusSemStructure::SetLemmasToReplace(string LemmasToReplace)
{
	m_SynthLemmaToReplace.clear();

	RmlMakeUpper(LemmasToReplace,morphRussian);
	StringTokenizer tok(LemmasToReplace.c_str(), ";");
	while (tok())
	{
		const string OnePair = tok.val();

		// The pair is split with std::string instead of sscanf("%[^/]/%[^/]")
		// into fixed char[255] buffers, which overflowed on long lemmas.
		const string::size_type SlashPos = OnePair.find('/');
		if (SlashPos == string::npos)
			return false;

		string lem1 = OnePair.substr(0, SlashPos);
		Trim(lem1);
		if (lem1.empty()) return false;

		// the second lemma stops at the next '/', exactly like "%[^/]" did
		const string::size_type SecondBegin = SlashPos + 1;
		const string::size_type SecondEnd = OnePair.find('/', SecondBegin);
		string lem2 = (SecondEnd == string::npos)
						? OnePair.substr(SecondBegin)
						: OnePair.substr(SecondBegin, SecondEnd - SecondBegin);
		Trim(lem2);
		if (lem2.empty()) return false;

		m_SynthLemmaToReplace[lem1] = lem2;
	}
	return true;
}
Пример #6
0
bool	CGrammarItem::AddAttribute(string Name, string Value, MorphLanguageEnum Language, string& ErrorStr)
{
    if (Value.length() > 0) 
	    if (Value[0] == '"')
	    {
		    if ( (Value.length()<2) || (Value[Value.length() - 1] != '"'))
		    {
			    ErrorStr = Format("no matching quotation mark for attribute value \"%s\"",Value.c_str());
			    return false;
		    };
		    Value = Value.substr(1, Value.length()-2);
	    };

	if (Name == "root")
	{
		m_bSynMain = true;
		return true;
	};

	if (Name == "type")
	{
		m_TokenType = StringToTokenType(Value);
		if (m_TokenType == OTHER_TOKEN_TYPE)
		{
			ErrorStr = Format("unknown token type:%s ",Value.c_str());
			return false;
		}
	};

	if (Name == "hom")
	{
		if (Value == "yes")
			m_bCanHaveManyHomonyms = true;
		else
			if (Value == "no")
				m_bCanHaveManyHomonyms = false;
			else
			{
				ErrorStr = Format("Bad value for attribute \"hom\" (\"%s\"). It can be \"yes\" or \"no\"",Value.c_str());
				return false;
			};

		if (m_TokenType == OTHER_TOKEN_TYPE)
				m_TokenType = (Language == morphRussian) ? RLE : LLE;
		return true;
	};


	if	(Name == "grm") 
	{
		m_MorphPattern.m_GrmAttribute = Value;
		if (m_TokenType == OTHER_TOKEN_TYPE)
				m_TokenType = (Language == morphRussian) ? RLE : LLE;
		return true;
	};

	if	(Name == "form") 
	{
		m_Token = Value;
		RmlMakeUpper(m_Token, Language);
		m_ItemStrId = Value;

		if ( (m_TokenType == OTHER_TOKEN_TYPE) && !m_Token.empty())
		{
			if (ispunct((BYTE)m_Token[0]))
				m_TokenType = PUNCTUAT;
			else
			if (isdigit((BYTE)m_Token[0]))
				m_TokenType = NUM;
			else
			if (Language == morphRussian)
			{
				if (CheckLanguage(m_Token, Language))
					m_TokenType = RLE;
			}
			else
			{
				if (CheckLanguage(m_Token, Language))
					m_TokenType = LLE;
			}
		};

		return true;
	};

	if (Name == "register")
	{
		if (Value == "AA")
			m_Register = UpUp;
		else
			if (Value == "aa")
				m_Register = LowLow;
			else
			if (Value == "Aa")
				m_Register = UpLow;
			else
			{
				ErrorStr = Format("Bad value for attribute \"register\" (\"%s\"). It can be \"AA\", \"aa\" or \"Aa\"",Value.c_str());
				return false;
			};
		if (m_TokenType == OTHER_TOKEN_TYPE)
				m_TokenType = (Language == morphRussian) ? RLE : LLE;
		return true;
	};

	if (Name == "filename")
	{
		Value = GetPathByFile(CurrentSourceFileName) + Value;
		if (m_TokenType == OTHER_TOKEN_TYPE)
				m_TokenType = (Language == morphRussian) ? RLE : LLE;
	}


	m_Attributes[Name] = Value;

	return true;
};
Пример #7
0
// Sets the word string: m_strWord receives NewValue as is, m_strUpperWord
// receives a copy of NewValue that is passed through RmlMakeUpper for the
// language "langua".
void	CLemWord::SetWordStr (string NewValue, MorphLanguageEnum langua)
{
	m_strUpperWord = NewValue;
	RmlMakeUpper(m_strUpperWord, langua);

	m_strWord = NewValue;
}
Пример #8
0
//----------------------------------------------------------------------------
//	Searches for the paradigms that are listed in a file chosen by the user.
//  Every non-empty line of the file has the format
//      <Lemma>;<TypeGrammems>;<MorphPattern>
//  <TypeGrammems> may be "*"; then the type ancode to compare with stays empty.
//  A paradigm of the lemma is selected if its common ancode equals the type
//  ancode and the first code of its flexia model equals the code of
//  <MorphPattern>. Lines without a newly selected paradigm are reported at
//  the end. Reading stops at the first malformed line.
//----------------------------------------------------------------------------
void CMorphwizardView::OnToolsSelectByFile() 
{
	CFileDialog D(TRUE, "slf", "paradigms.txt");
	if (D.DoModal() != IDOK) return;
	FILE * fp = fopen (D.GetPathName(),"rb");
	if (!fp) 
	{
		AfxMessageBox ("Cannot open file");
		return;
	};

	// The same format hint is used in all messages about a missing field;
	// one of them used to describe a different, space-separated format.
	const std::string FormatHint = "; The format should be <Lemma>;<TypeGrammems>;<MorphPattern>";

	char buf[1000];
	std::string strNotFound;
	found_paradigms.clear();
	while  (fgets(buf, 1000, fp)) 
	{
		std::string Line = buf;
		Trim(Line);
		if (Line.empty()) continue;
		StringTokenizer tok (Line.c_str(), ";");

		// field 1: lemma
		if (!tok())
		{
			std::string mess = std::string("cannot get lemma from ") + buf + FormatHint;
			AfxMessageBox (mess.c_str());
			break;
		};
		std::string Lemma = tok.val();
		Trim(Lemma);

		// field 2: type grammems ("*" leaves TypeAncode empty)
		if (!tok())
		{
			std::string mess = std::string("cannot get type grammem ") + buf + FormatHint;
			AfxMessageBox (mess.c_str());
			break;
		};
		std::string TypeAncode;
		{
			std::string grams = tok.val();
			Trim(grams);
			if (grams != "*")
				if (!GetWizard()->slf2ancode(grams, TypeAncode )) 
				{
					std::string mess = std::string("cannot process type grammems ") + grams;
					AfxMessageBox (mess.c_str());
					break;
				};
		};

		// field 3: morphological pattern
		if (!tok())
		{
			std::string mess = std::string("cannot get morphological pattern ") + buf + FormatHint;
			AfxMessageBox (mess.c_str());
			break;
		};
		std::string FirstCode;
		{
			std::string PosStr = tok.val();
			Trim(PosStr);
			if (!GetWizard()->slf2ancode(PosStr,FirstCode )) 
			{
				std::string mess = std::string("cannot process morph. pattern ") + PosStr;
				AfxMessageBox (mess.c_str());
				break;
			};
		};

		std::vector<lemma_iterator_t> curr_found_paradigms;
		RmlMakeUpper(Lemma, GetWizard()->m_Language);
		bool bFound = false;
		GetWizard()->find_lemm(Lemma.c_str(), true, curr_found_paradigms);
		for(size_t i=0; i<curr_found_paradigms.size(); i++ )
		{
			std::string str_pos = GetWizard()->m_FlexiaModels[curr_found_paradigms[i]->second.m_FlexiaModelNo].get_first_code();

			if (curr_found_paradigms[i]->second.GetCommonAncodeIfCan() == TypeAncode)
				if( FirstCode == str_pos )
					// a paradigm that is already selected is not added twice
					// (and does not count as found for this line)
					if (std::find(found_paradigms.begin(), found_paradigms.end(), curr_found_paradigms[i]) == found_paradigms.end())
					{
						found_paradigms.push_back(curr_found_paradigms[i]);
						bFound = true;
					};
		}
		if (!bFound)
			strNotFound +=  Format("Not found: %s\r\n", Line.c_str());


	};
	fclose(fp);

	FilterFoundParadigms();
	ShowFoundParadigms();	
	if (!strNotFound.empty())
		echo(strNotFound.c_str());
}
Пример #9
0
// Handler of the "Find" command: reads the search string from m_FindWhat and
// fills found_paradigms according to the checked radio button:
//   IDC_RFIND_LEM        - by lemma; a string that starts with a digit is
//                          passed to find_lemm_by_prdno as a number, a string
//                          that starts with "ACCENT=" is passed to
//                          find_lemm_by_accent_model
//   IDC_RFIND_GRM        - by grammems
//   IDC_WORD_FORM        - by word form
//   IDC_FIND_BY_GRAMCODE - by gramcode
//   IDC_FIND_BY_USERNAME - by user name
// Then the filter is applied and the found paradigms are shown.
// A CExpc exception is reported with ErrorMessage.
void CMorphwizardView::OnFind() 
{
	try 
	{
		m_FoundList.DeleteAllItems();
		found_paradigms.clear();
		m_FindWhat.SetFocus();

		CString find_what;
		m_FindWhat.GetWindowText(find_what);
		if (find_what == "") return;
		ChangeHistory((const char*)find_what);

		CWizardProgressMeter meter(*GetWizard());

		if(GetCheckedRadioButton(IDC_RFIND_LEM,IDC_WORD_FORM)==IDC_RFIND_LEM)
		{
			std::string s = find_what;
			RmlMakeUpper(s, GetWizard()->m_Language);
			Trim(s);
			// s can be empty after Trim (the input consisted of blanks only);
			// do not index an empty string, treat it as "not a digit"
			const char ch = s.empty() ? '\0' : s[0];
			if( '0'<=ch && ch<='9' )
			{
				int prdno = atoi(s.c_str());
				GetWizard()->find_lemm_by_prdno(prdno,found_paradigms);
			}
			else
			{
				if (s.substr(0,7) == "ACCENT=")
				{
					int accent_model_no = atoi(s.c_str()+7);
					GetWizard()->find_lemm_by_accent_model(accent_model_no, found_paradigms);
				}
				else
					GetWizard()->find_lemm(s.c_str(), false, found_paradigms);
			};
		}

		// finding by grammems
		if(GetCheckedRadioButton(IDC_RFIND_LEM,IDC_WORD_FORM)==IDC_RFIND_GRM)
			GetWizard()->find_lemm_by_grammem((const char *)find_what, found_paradigms);

		// finding by word form
		if(GetCheckedRadioButton(IDC_RFIND_LEM,IDC_WORD_FORM)==IDC_WORD_FORM)
		{
			std::string s = find_what;
			// RmlMakeUpper cannot be used here because of "*"
			if (GetWizard()->IsGerman())
				GerMakeUpper(s);
			else
				EngRusMakeUpper(s);
			GetWizard()->find_wordforms(s.c_str(), found_paradigms);
		};

		//  finding by gramcode
		if(GetCheckedRadioButton(IDC_RFIND_LEM,IDC_FIND_BY_GRAMCODE)==IDC_FIND_BY_GRAMCODE)
		{
			std::string s = find_what;
			GetWizard()->find_ancodes(s.c_str(), found_paradigms);
		};

		// finding by user name
		if(GetCheckedRadioButton(IDC_RFIND_LEM,IDC_FIND_BY_USERNAME)==IDC_FIND_BY_USERNAME)
		{
			std::string s = find_what;
			GetWizard()->find_lemm_by_user(s.c_str(), found_paradigms);
		};


		// proceed filter string
		FilterFoundParadigms();
		ShowFoundParadigms();
	}
	// caught by reference: no copy of the exception object and no slicing
	catch (CExpc& C)
	{
		ErrorMessage(C.m_strCause);
	}
}