Example #1
0
//--------------------------------------------------------------------------------
// Loads the Russian and English morphology holders and then the binary
// dictionary. The first load that reports failure aborts the sequence by
// throwing CExpc; when all three succeed a confirmation line is written to
// std::cerr.
void init_dicts()
{
	if (!MorphHolderRus.LoadGraphanAndLemmatizer(morphRussian))
		throw CExpc("cannot load Russian morphology\n");
	if (!MorphHolderEng.LoadGraphanAndLemmatizer(morphEnglish))
		throw CExpc("cannot load English morphology\n");

	if (!BinaryDictionary.Load())
		throw CExpc("cannot load binary dictionary\n");

	std::cerr << "dictionaries are loaded" << std::endl;
}
Example #2
0
void get_english_ancode(std::string e, std::string &ec)
{
    vector<CFormInfo> ParadigmCollection;

    if (!MorphHolderEng.m_pLemmatizer->CreateParadigmCollection(true, e, true, false, ParadigmCollection))
        throw CExpc(Format("Cannot lemmatize %s by English lemmatizer" , e.c_str()));

//	assert(count > 0);
    for(int i = 0; i < ParadigmCollection.size(); i++)
    {
        const CFormInfo& Paradigm = ParadigmCollection[i];
        if(!Paradigm.m_bFound) continue;
        std::string ancode = Paradigm.GetAncode(0);
        if (ec=="na" && ( ancode=="nc" || ancode=="ne" || ancode=="ni"))
        {
            ec = ancode;
            return;
        };
        if (ec=="nb" && ( ancode=="nd" || ancode=="ng" || ancode=="nk"))
        {
            ec = ancode;
            return;
        };
    }
}
Example #3
0
// Fills the two translation tables between byte codes (0..255) and alphabet
// positions for the given language.
//
//   pCode2Alphabet[k] receives the byte code of the k-th alphabet member;
//   pAlphabet2Code[c] receives the alphabet position of byte code c, or -1
//                     when c does not belong to the alphabet (the caller must
//                     supply at least 256 entries, one per byte code).
//
// A byte belongs to the alphabet when it is an upper-case letter of the
// language, the hyphen, the annotation character AnnotChar, one of the extra
// per-language characters listed below, or (for morphURL) any letter.
//
// Returns the number of alphabet members. If the alphabet would grow beyond
// MaxAlphabetSize the error is reported through ErrorMessage and CExpc is
// thrown. The limit is checked before each write to pCode2Alphabet, so that
// table is never written at index MaxAlphabetSize or above.
static int  InitAlphabet(MorphLanguageEnum Language, int* pCode2Alphabet, int *pAlphabet2Code, size_t AnnotChar)
{
    assert (!is_upper_alpha(AnnotChar, Language));
    const string AdditionalEnglishChars = "'1234567890";
    const string AdditionalGermanChars = "";
    int AlphabetSize = 0;
    for (size_t i=0; i < 256; i++)
    {
        const bool bInAlphabet =
                    is_upper_alpha((BYTE)i, Language)
                ||  (i == '-')
                ||  (i == AnnotChar)
                ||  (       (Language == morphEnglish)
                        &&  (AdditionalEnglishChars.find(i) != string::npos)
                    )
                ||  (       (Language == morphGerman)
                        &&  (AdditionalGermanChars.find(i) != string::npos)
                    )
                ||  (       (Language == morphURL)
                        &&  is_alpha(i, morphURL)
                    );

        if (!bInAlphabet)
        {
            pAlphabet2Code[i] = -1;
            continue;
        }

        // Refuse the member that would exceed the limit instead of storing it
        // first and checking afterwards.
        if (AlphabetSize >= MaxAlphabetSize)
        {
            string Error = "Error! The  ABC is too large";
            ErrorMessage (Error);
            throw CExpc(Error);
        }

        pCode2Alphabet[AlphabetSize] = i;
        pAlphabet2Code[i] = AlphabetSize;
        AlphabetSize++;
    }

    return AlphabetSize;
}
Example #4
0
// Document (de)serialization entry point.
//
// Storing does nothing. When loading, the user is first asked to log in
// (CExpc is thrown unless the dialog returns IDOK); then the wizard is loaded
// from the archive's file path under that login name and the input language
// is switched to the wizard's language. The boolean result of load_wizard is
// captured but not examined.
void CMorphwizardDoc::Serialize(CArchive& ar)
{
	if (ar.IsStoring())
		return;

	// ask username
	CDlgLogin loginDialog;
	const bool bLoginAccepted = (loginDialog.DoModal() == IDOK);
	if (!bLoginAccepted)
		throw CExpc("No login information");

	CGriIni iniGuard;
	iniGuard.Init();

	CWizardProgressMeter progressMeter(m_Wizard);
	bool bLoaded = GetWizard()->load_wizard(ar.GetFile()->GetFilePath(), loginDialog.m_name);

	SetInputLanguage(GetWizard()->m_Language);

	iniGuard.Exit();
}
int CSentence::CanLinkSimpleSimilar(int CommaWordNo) 
{
	try
	{

		/*
			if comma is at the very  beginning of ath the end, then exit
		*/
		if	(		(CommaWordNo == 0) 
				||	(CommaWordNo + 1 >= m_Words.size() )
			)
			return -1;


		if (GetOpt()->m_Language == morphGerman)
		{
			// we can use CSentence::m_GroupsUnion if Tomita is enabled
			for (size_t i=0; i< m_GroupsUnion.GetGroups().size(); i++)
			{
				const CGroup& group = m_GroupsUnion.GetGroups()[i];
				//  ignore groups which contain only three words and the second word is 
				// a comma (a clause delimiter)
				if (group.size() == 3)
					if (m_Words[group.m_iFirstWord+1].m_bComma)
						continue;

				if	(		(group.m_iFirstWord < CommaWordNo) 
						&&	(group.m_iLastWord > CommaWordNo)
					)
					return group.m_iLastWord;
			};
			return -1;
		};


		const int Radius = (GetOpt()->m_Language == morphGerman)? 10 : 6;
		int StartClauseWordNo = max(0, CommaWordNo - Radius);
		CSentence* pSent = GetOpt()->NewSentence();
		if (!pSent)
			throw CExpc ("Cannot create sentence");
		
		for (int i = StartClauseWordNo; i < min((int)m_Words.size(), CommaWordNo + Radius); i++)
            pSent->m_Words.push_back(m_Words[i]);
		

		CClause C(pSent, 0,  pSent->m_Words.size() - 1);
		pSent->AddClause(C);	
		pSent->m_bShouldUseTwoPotentialRule = false;
		pSent->RunSyntaxInClauses(SimpleSimilarRules);
		
		int Result = -1;
		const CClause& prClause = pSent->m_Clauses[0];
		for (CSVI it = prClause.m_SynVariants.begin(); (Result == -1)&& (it!=prClause.m_SynVariants.end()); it++)
			for (size_t i=0; i< it->m_vectorGroups.GetGroups().size(); i++)
			{
				const CGroup& group = it->m_vectorGroups.GetGroups()[i];
				//  ignore groups which contain only three words and the second word is 
				// a comma (a clause delimiter)
				if (group.size() == 3)
				{
					const CSynUnit& U = it->m_SynUnits[group.m_iFirstWord+1];
					if (pSent->m_Words[U.m_SentPeriod.m_iFirstWord].m_bComma)
						continue;
				};

				if	(		(group.m_iFirstWord+StartClauseWordNo < CommaWordNo) 
						&&	(group.m_iLastWord+StartClauseWordNo > CommaWordNo)
					)
				{
					Result = group.m_iLastWord + StartClauseWordNo;
					break;
				};
			}
		
		delete pSent;

		return Result;
	}
	catch(...)
	{
		OutputErrorString("Failed RunSyntaxInClause(CanLinkSimpleSimilar)");
		return -1;
	}

}
Example #6
0
// Reads the model configuration file FileName, stores its settings in this
// object and prepares the tag set.
//
// The file is read line by line. Everything from the first '#' on a line is
// discarded; the first whitespace-separated token is the field name and the
// second (optional) token is its value. Recognised fields:
//   NgramFile, DictionaryFile, TagsetFile  - paths, passed through BuildRMLPath
//   Language                               - parsed by GetLanguageByString
//   --second-local-coef, --min-bucket-size - integers; a value that atoi
//                                            converts to 0 is rejected
//   reverse-model-config                   - config file of the reverse model
//   --raw-texts, --supress-morph-errors,
//   --reverse-model, --check-only-amb-words - boolean switches
// Unknown fields are ignored.
//
// Lemmatizer / GramTab: used only when USE_TRIGRAM_LEMMATIZER is defined.
//   If GramTab is 0 the morphology is loaded through InitDicts(); otherwise
//   the two pointers are stored as they are.
// bLoadReverseModel: when true and the config names a reverse-model config,
//   a new CTrigramModel is created, initialised from that config and its
//   binary data is read.
//
// Throws CExpc when the file cannot be opened, a value is invalid, NgramFile
// or DictionaryFile is missing, or the morphology / tag set cannot be loaded.
// The config file is closed on every path, including when an exception
// leaves the parsing loop.
void CTrigramModel::InitModelFromConfigAndBuildTagset(string FileName, const CLemmatizer* Lemmatizer, const CAgramtab* GramTab, bool bLoadReverseModel) 
{
	FILE * fp = fopen (FileName.c_str(), "r");
	if (!fp)
		throw CExpc ("cannot read file %s\n", FileName.c_str());
	string TagsetFile;
	string ReverseModelConfig;

	try
	{
		char buffer[1000];
		while (fgets(buffer, (int)sizeof(buffer), fp))
		{
			// cut off the comment part of the line
			char* pComment = strchr (buffer, '#');
			if (pComment)
				*pComment = 0;

			StringTokenizer tok(buffer, "\r\n\t ");
			if (!tok())
				continue;
			string Field = tok.val();
			string Value;
			if (tok())
				Value = tok.val();

			if (Field == "NgramFile")
				m_NgramFile = BuildRMLPath (Value.c_str());
			else if (Field == "DictionaryFile")
				m_DictionaryFile = BuildRMLPath (Value.c_str());
			else if (Field == "TagsetFile")
				TagsetFile = BuildRMLPath (Value.c_str());
			else if (Field == "Language")
			{
				if (!GetLanguageByString(Value, m_Language))
					throw CExpc ("unknown language:%s\n",  Value.c_str());
			}
			else if (Field == "--second-local-coef")
			{
				m_SecondLocalCoef = atoi(Value.c_str());
				if (!m_SecondLocalCoef)
					throw CExpc ("wrong second local coef: %s\n",  Value.c_str());
				fprintf(stderr, "second local coef: %i\n",  m_SecondLocalCoef);
				m_bUseSecondLocalMax = true;
			}
			else if (Field == "--min-bucket-size")
			{
				m_MinBucketSize = atoi(Value.c_str());
				if (!m_MinBucketSize)
					throw CExpc ("wrong min bucket size: %s\n",  Value.c_str());
				fprintf(stderr, "min bucket size: %i\n",  m_MinBucketSize);
			}
			else if (Field == "reverse-model-config")
			{
				ReverseModelConfig = Value;
			}
			else if (Field == "--raw-texts")
			{
				m_bRawTexts = true;
			}
			else if (Field == "--supress-morph-errors")
			{
				m_bQuiet = true;
			}
			else if (Field == "--reverse-model")
			{
				m_bReverseModel = true;
				fprintf (stderr, "reverse model!\n");
			}
			else if (Field == "--check-only-amb-words")
			{
				m_bCheckOnlyAmbiguosWords = true;
				fprintf (stderr, "evaluate precision and recall only for ambiguous words\n");
			}
		}
	}
	catch (...)
	{
		// do not leak the file handle when a config value is rejected
		fclose(fp);
		throw;
	}
	fclose(fp);

	if (		m_NgramFile.empty()
			||	m_DictionaryFile.empty()
		) 
		throw CExpc ("cannot find  NgramFile or DictionaryFile in %s\n", FileName.c_str());

#ifdef  USE_TRIGRAM_LEMMATIZER
	if (GramTab==0)
	{
		if (!InitDicts())
			throw CExpc ("cannot initialize morphology\n");
	}
	else
	{
		m_pLemmatizer = Lemmatizer;
		m_pAgramtab = GramTab;
		// m_Graphan is disabled
	};
	if (!TagsetFile.empty())
	{
		fprintf (stderr, "loading tagset from %s\n", TagsetFile.c_str());
		if (!m_TagSet.ReadTagSet(TagsetFile, m_pAgramtab))
			throw CExpc ("cannot load tagset");
	}
	else
	{
		fprintf (stderr, "building default tagset\n");
		m_TagSet.BuildDefaultTags(m_pAgramtab);
	}
	// %i expects an int, so the container size is converted explicitly
	fprintf (stderr, "tag set file contains %i tags \n", (int)m_TagSet.m_Tags.size());
#endif

	if (bLoadReverseModel && !ReverseModelConfig.empty())
	{
		m_pReverseModel = new CTrigramModel();
		fprintf(stderr, "load reverse model from : %s\n",  ReverseModelConfig.c_str());
		// the nested call passes false so that the reverse model does not
		// try to load a reverse model of its own
#ifdef  USE_TRIGRAM_LEMMATIZER
		m_pReverseModel->InitModelFromConfigAndBuildTagset(ReverseModelConfig, m_pLemmatizer, m_pAgramtab, false);
#else
		m_pReverseModel->InitModelFromConfigAndBuildTagset(ReverseModelConfig, 0, 0, false);
#endif
		m_pReverseModel->ReadBinary();
	}
}
Example #7
0
// Collects the set of tags that the word WordStr may carry.
//
// Sources, in order:
//   1. the model dictionary (exact form first, then the lower-cased form):
//      every tag stored for the found entry in m_LexProbs;
//   2. the annotations registered for the word in the current sentence, or,
//      when there are none and USE_TRIGRAM_LEMMATIZER is defined, the
//      lemmatizer;
//   3. only if the set is still empty, a fallback: for Russian words that
//      atoi converts to a positive number, the registered tags starting with
//      a fixed four-character prefix; for words starting with punctuation or
//      failing CheckLanguage, the "UNK" tag; otherwise up to 200 of the tags
//      in m_TagsOrderedByUnigrams, skipping single-character punctuation tags.
//
// An empty word trips the assert; when asserts are compiled out it yields
// every tag number below m_TagsCount and no dictionary entry.
// Throws CExpc when a required fallback tag is not registered.
CDictionarySearch CTrigramModel::find_word(const string& WordStr) const
{
	CDictionarySearch Result;
	assert (!WordStr.empty());
	if (WordStr.empty())
	{
		Result.m_pFoundWord = 0;
		for (WORD TagNo=0; TagNo < m_TagsCount; TagNo++)
			Result.m_PossibleWordTags.insert(TagNo);
		return Result;
	}

	Result.m_pFoundWord = lookup_word(WordStr);
	if (!Result.m_pFoundWord)
	{
		// the word is not in the dictionary as written, so retry in lower case
		string LowerCased = WordStr;
		RmlMakeLower(LowerCased, m_Language);
		Result.m_pFoundWord = lookup_word(LowerCased);
	}

	if (Result.m_pFoundWord)
	{
		// assign every tag that was seen for this word in the corpus
		for (size_t k=0; k < Result.m_pFoundWord->m_Length; k++)
		{
			int Tag = m_LexProbs[Result.m_pFoundWord->m_StartOffset + k].m_Tag;
			Result.m_PossibleWordTags.insert(Tag);
		}
	}

	// get all possible tags from the morphological dictionary
	map<string, const vector<CXmlMorphAnnot>* >::iterator AnnotIt = m_CurrentSentenceWords2Annots.find(WordStr);
	if (AnnotIt != m_CurrentSentenceWords2Annots.end())
		get_tags_from_annots(*AnnotIt->second, Result.m_PossibleWordTags, WordStr);
#ifdef  USE_TRIGRAM_LEMMATIZER
	else
		get_tags_from_lemmatizer_but_not_preps(WordStr, Result.m_PossibleWordTags);
#endif

	// the fallbacks below apply only when nothing has been found so far
	if (!Result.m_PossibleWordTags.empty())
		return Result;

	if (atoi(WordStr.c_str()) > 0 && (m_Language==morphRussian))
	{
		for (size_t k=0; k < m_RegisteredTags.size(); k++)
		{
			if (m_RegisteredTags[k].length() > 3 && m_RegisteredTags[k].substr(0,4) == "„»—Ћ")
				Result.m_PossibleWordTags.insert(k);
		}

		if (Result.m_PossibleWordTags.empty())
			throw CExpc ("Cannot find „»—Ћ tag");
	}
	else if (ispunct((BYTE)WordStr[0]) || !CheckLanguage(WordStr,m_Language))
	{
		int UnkTag = find_tag("UNK");
		if (UnkTag == UnknownTag)
			throw CExpc ("Cannot find UNK tag");
		Result.m_PossibleWordTags.insert(UnkTag);
	}
	else
	{
		// assign all tags
		if (!m_bQuiet)
			fprintf (stderr, "No information for word %s\n",WordStr.c_str());

		const size_t Limit = min((size_t)200, m_TagsOrderedByUnigrams.size());
		for (size_t k=0; k < Limit; k++)
		{
			WORD TagNo = m_TagsOrderedByUnigrams[k];
			string TagName = m_RegisteredTags[TagNo];
			const bool bSinglePunct = (TagName.length() <= 1) && ispunct((unsigned char)TagName[0]);
			if (!bSinglePunct)
				Result.m_PossibleWordTags.insert(TagNo);
		}
	}

	return Result;
}
// Converts the pointer-based build trie rooted at m_pRoot into two flat
// arrays, m_pNodes and m_pRelations, by a breadth-first traversal.
// Does nothing when there is no root.
// Throws CExpc when more than 0xffffff relations are produced.
void CMorphAutomatBuilder::ConvertBuildRelationsToRelations()
{
	if (!m_pRoot) return;
	// presumably resets every m_NodeId to -1, the "not numbered yet" value
	// tested in the loop below - TODO confirm against SetNodeIdNullRecursive
	m_pRoot->SetNodeIdNullRecursive();
	queue<CTrieNodeBuild*> NodesQueue;
	NodesQueue.push(m_pRoot);
	// the root is always node 0
	m_pRoot->m_NodeId = 0;

	vector<CMorphAutomNode> Nodes;
	vector<CMorphAutomRelation> Relations;

	while (!NodesQueue.empty())
	{
		//  take the next node in breadth-first order
		CTrieNodeBuild* pNode = NodesQueue.front();
		NodesQueue.pop();

		CMorphAutomNode N;
		N.SetFinal(pNode->m_bFinal);
		
		// the outgoing relations of this node are appended to Relations
		// starting at the current end of that vector
		N.SetChildrenStart(Relations.size());
		// the asserts check that the stored values read back unchanged
		assert (N.GetChildrenStart() == Relations.size());
		assert (N.IsFinal() == pNode->m_bFinal);

		Nodes.push_back(N);

		// Every node receives its id at the moment it is queued, so the ids
		// handed out so far are exactly the emitted nodes plus the queued
		// ones; their total is the next free id.
		int CurrentNodeId = Nodes.size() + NodesQueue.size();

		for (size_t i=0; i < MaxAlphabetSize; i++)
		if (pNode->m_Children[i])
		{
			CTrieNodeBuild* Child = pNode->m_Children[i];
			// number and queue the child only the first time it is reached
			if (Child->m_NodeId == -1)
			{
				Child->m_NodeId = CurrentNodeId++;
				NodesQueue.push(Child);
			};

			// adding new relation: alphabet position i is translated through
			// m_Code2Alphabet and paired with the child's node id
			CMorphAutomRelation R;
			R.SetRelationalChar(m_Code2Alphabet[i]);
			R.SetChildNo(Child->m_NodeId);
			assert (R.GetChildNo() == Child->m_NodeId);
			assert (R.GetRelationalChar() == m_Code2Alphabet[i]);

			Relations.push_back(R);
			if (Relations.size() > 0xffffff)
			{
				throw CExpc("Too many children in the automat. It cannot be more than 0xffffff");
			};
		};
	};

	// NOTE(review): Clear() is assumed to release the previous contents before
	// the new arrays are installed - confirm against its definition
	Clear();

	// copy the collected nodes and relations into newly allocated arrays
	m_NodesCount = Nodes.size();
	m_pNodes = new CMorphAutomNode[m_NodesCount];
	copy(Nodes.begin(), Nodes.end(), m_pNodes);

	m_RelationsCount = Relations.size();
	m_pRelations = new CMorphAutomRelation[m_RelationsCount];
	copy(Relations.begin(), Relations.end(), m_pRelations);

};
Example #9
0
// Builds, for every entry of NumeralToNumber, the genitive variant of the
// numeral (for example "dva" => "dvukh"). This genitive variant is used in
// compound words such as "dvukhlampovyj". The results are stored, in the same
// order, in GenitFormsOfCardinal, which is cleared first.
//
// The numbers 0, 1, 100, 1000, 10^6, 10^9, 10^12 and 10^15 receive hard-coded
// forms. For every other entry the cardinal is lemmatized, the paradigm with
// the required part of speech (NOUN when m_bNoun is set, NUMERAL otherwise)
// is selected, and its first word form carrying the genitive grammem is taken.
//
// Returns true on success. Returns false when no suitable paradigm or no
// genitive form is found; debug builds still stop on the assert first, and
// builds with asserts compiled out no longer index out of range.
// GenitFormsOfCardinal is then left holding only the forms built so far.
// Throws CExpc when the gramtab rejects an ancode.
bool BuildGenitFormOfCardinal(const CLemmatizer* piRusLemmatizer, const CRusGramTab* Agramtab) 
{
	GenitFormsOfCardinal.clear();
	for(int i = 0 ; i < NumeralToNumberCount; i++ )
	{
		if (NumeralToNumber[i].m_Number == 0)
		{
			GenitFormsOfCardinal.push_back("Ќ”Ћ№");
			continue;
		};
		if (NumeralToNumber[i].m_Number == 1)
		{
			GenitFormsOfCardinal.push_back("ќƒЌќ");
			continue;
		};
		if (NumeralToNumber[i].m_Number == 100)
		{
			GenitFormsOfCardinal.push_back("—“ќ");
			continue;
		};
		if (NumeralToNumber[i].m_Number == 1000)
		{
			GenitFormsOfCardinal.push_back("“џ—я„≈");
			continue;
		};
		if (NumeralToNumber[i].m_Number == 1000000)
		{
			GenitFormsOfCardinal.push_back("ћ»ЋЋ»ќЌќ");
			continue;
		};
		if (NumeralToNumber[i].m_Number == 1000000000)
		{
			GenitFormsOfCardinal.push_back("ћ»ЋЋ»ј–ƒЌќ");
			continue;
		};
		if (NumeralToNumber[i].m_Number == 1000000000000.0)
		{
			GenitFormsOfCardinal.push_back("“–»ЋЋ»ќЌќ");
			continue;
		};
		if (NumeralToNumber[i].m_Number == 1000000000000000.0)
		{
			GenitFormsOfCardinal.push_back(" ¬јƒ–»ЋЋ»ќЌќ");
			continue;
		};

		vector<CFormInfo> ParadigmCollection;
		string WordForm = NumeralToNumber[i].m_Cardinal;
		piRusLemmatizer->CreateParadigmCollection(true, WordForm, false, false, ParadigmCollection);

		// look for the paradigm with the required part of speech
		size_t ParadigmNo = 0;
		for (; ParadigmNo < ParadigmCollection.size(); ParadigmNo++)
		{
			string AnCode = ParadigmCollection[ParadigmNo].GetAncode(0);
			BYTE POS = Agramtab->GetPartOfSpeech(AnCode.c_str());
			if (NumeralToNumber[i].m_bNoun)
			{
				if (POS == NOUN)
					break;
			}
			else
				if (POS == NUMERAL)
					break;
		};
		assert (ParadigmNo < ParadigmCollection.size());
		if (ParadigmNo >= ParadigmCollection.size())
			return false;
		const CFormInfo& P = ParadigmCollection[ParadigmNo];

		// look for the genitive case
		long FormNo = 0;
		for (; FormNo < P.GetCount(); FormNo++)
		{
			string AnCode = P.GetAncode(FormNo);
			QWORD Grammems;
			if (!Agramtab->GetGrammems(AnCode.c_str(), Grammems))
				throw CExpc ("Bad ancode in  BuildGenitFormOfCardinal");
			if ( (Grammems & _QM(rGenitiv)) > 0 )
				break;
		};
		assert (FormNo < P.GetCount());
		if (FormNo >= P.GetCount())
			return false;
		GenitFormsOfCardinal.push_back(P.GetWordForm(FormNo));
	};
	return true;
}
Example #10
0
// Returns the element at position Index of the synset that curr_synset
// currently points to.
// Throws CExpc when curr_synset is at the end of synonims (no current synset)
// or when Index is not a valid position inside the current synset.
DWORD CSynDictionary::GetId(UINT Index) const
{
	if	(		(curr_synset == synonims.end())
			||	(Index >= curr_synset->second.size())
		)
		throw CExpc ("bad index in synonyms vector");
	return curr_synset->second[Index];
}