예제 #1
0
// Sets up the morphology objects stored in `dic`.
//
// Only morphRussian is accepted: any other value of dic.Language (including
// morphEnglish and morphGerman) raises "assertion error: A1" through throwEx
// and the function returns false.
//
// On the accepted path the function allocates dic.pLemmatizer and loads its
// dictionaries registry, then allocates dic.pAgramtab and loads it from the
// registry. If either load fails, a message is reported through throwEx and
// false is returned; the objects allocated so far stay attached to `dic`.
//
// Returns true only when both loads succeed.
bool InitMorphologySystem(JNIEnv *env, jni_dictionary &dic){
	// Guard: everything except Russian is rejected up front.
	if (dic.Language != morphRussian){
		throwEx(env, strdup("assertion error: A1"));
		return false;
	}
	dic.lang = Russian;

	// Human-readable language name, used only in the error messages below.
	const string languageName = GetStringByLanguage(dic.Language);

	// Step 1: lemmatizer.
	dic.pLemmatizer = new T;
	string loadError;
	const bool bRegistryLoaded = dic.pLemmatizer->LoadDictionariesRegistry(loadError);
	if (!bRegistryLoaded){
		char* message = str_compose("Cannot load %s morphological dictionary. Error details: %s", languageName.c_str(), loadError.c_str());
		throwEx(env, message);
		return false;
	}

	// Step 2: gramtab.
	dic.pAgramtab = new Y;
	if (dic.pAgramtab->LoadFromRegistry())
		return true;

	char* message = str_compose("Cannot load %s gramtab.", languageName.c_str());
	throwEx(env, message);
	return false;
}
bool CMorphDictBuilder::GenPredictIdx(const MorphoWizard& wizard, int PostfixLength, int MinFreq, string path)
{
	
	
	DwordVector ModelFreq(wizard.m_FlexiaModels.size(), 0);
	//  building frequences of flexia models
	for(const_lemma_iterator_t lnMapIt = wizard.m_LemmaToParadigm.begin(); lnMapIt != wizard.m_LemmaToParadigm.end(); lnMapIt++)
		ModelFreq[lnMapIt->second.m_FlexiaModelNo]++;

	bool bSparsedDictionary;
	{
		int Count=0;
		for (size_t ModelNo=0; ModelNo<ModelFreq.size(); ModelNo++)
			if (ModelFreq[ModelNo] >= MinimalFlexiaModelFrequence)
				Count++;
		bSparsedDictionary = 2*Count < ModelFreq.size();
		if (bSparsedDictionary)
			fprintf (stderr, "Flexia models are too sparsed\n");
	};



	string PlugLemma = GetPlugLemmabyLanguage(wizard.m_Language);
	int PlugLemmaInfoNo = -1;

	Flex2WordMap svMapRaw;
	//  going through all words
	for(size_t lin =0; lin < m_LemmaInfos.size(); lin++)
	{
		
		if (!(lin%1000))
			log ( Format("Pick up data...%i            \r", lin) ) ;

		const CLemmaInfo& LemmaInfo = m_LemmaInfos[lin].m_LemmaInfo;
		size_t ModelNo = LemmaInfo.m_FlexiaModelNo;
		const CFlexiaModel& paradigm = m_FlexiaModels[ModelNo];
		string	base = m_Bases[m_LemmaInfos[lin].m_LemmaStrNo].GetString();

		if (base+paradigm.get_first_flex() == PlugLemma)
		{
			PlugLemmaInfoNo = lin;
			continue;
		};
		
		if (!bSparsedDictionary)
			if (ModelFreq[ModelNo] < MinimalFlexiaModelFrequence)
				continue;

		string pos = wizard.get_pos_string(paradigm.get_first_code());
		WORD nps =  GetPredictionPartOfSpeech(pos, wizard.m_Language);
		if (nps == UnknownPartOfSpeech)
			continue;


		const vector <bool>& Infos = m_ModelInfo[ModelNo];
		for (size_t i=0; i<paradigm.m_Flexia.size(); i++)
		if (Infos[i])
		{
			string flexia = paradigm.m_Flexia[i].m_FlexiaStr;
			string wordform = base + flexia;
			if (wordform.length() < PostfixLength) continue;
			string Postfix = wordform.substr(wordform.length() - PostfixLength);
			AddElem(svMapRaw, Postfix, lin, nps, i, ModelFreq, m_LemmaInfos);
		}
		
	}
	if (PlugLemmaInfoNo == -1)
	{
		ErrorMessage (Format("Cannot find a word for the default noun prediction (\"%s\") while  generating %s prediction base",PlugLemma.c_str(), GetStringByLanguage(wizard.m_Language).c_str()));
		return false;
	};

	log("Saving...\n");

	CMorphAutomatBuilder R(wizard.m_Language);
	R.InitTrie();

	// adding crtitical noun
	{
		string s = CriticalNounLetterPack;
		s += AnnotChar;
		s += R.EncodeIntToAlphabet(0); // noun
		s += AnnotChar;
		s += R.EncodeIntToAlphabet(PlugLemmaInfoNo);
		s += AnnotChar;
		s += R.EncodeIntToAlphabet(0);
		R.AddStringDaciuk(s);
	};

	for( Flex2WordMap::const_iterator it=svMapRaw.begin(); it!=svMapRaw.end(); it++ )
	{
		for( int i=0; i<it->second.size(); i++ )
		{
			const CPredictWord& W = it->second[i];
			// checking minimal frequence

			if (W.m_Freq < MinFreq) continue;
			
			string s = it->first;
			reverse(s.begin(), s.end());
			s += AnnotChar;
			s += R.EncodeIntToAlphabet(W.m_nps);
			s += AnnotChar;
			s += R.EncodeIntToAlphabet(W.m_LemmaInfoNo);
			s += AnnotChar;
			s += R.EncodeIntToAlphabet(W.m_ItemNo);
			R.AddStringDaciuk(s);
		}
	};

	R.ConvertBuildRelationsToRelations();
	R.Save(path + PREDICT_BIN_PATH);
	

	svMapRaw.clear();
	return true;
	
}
예제 #3
0
// Loads the automaton from the binary file `AutomatFileName`.
//
// File layout as read here:
//   1. a text line holding the node count, followed by that many raw
//      CMorphAutomNode records;
//   2. a text line holding the relation count, followed by that many raw
//      CMorphAutomRelation records;
//   3. 256 ints which must be byte-identical to m_Alphabet2Code.
//
// Returns true when everything was read and the alphabet matches; the
// children cache is then rebuilt via BuildChildrenCache().
// Returns false when the file cannot be opened (reported via ErrorMessage),
// when any part is missing, truncated or carries an invalid count (silent),
// or when the stored alphabet differs (reported via ErrorMessage).
// The file is closed on every path. Arrays allocated before a failure stay in
// m_pNodes / m_pRelations.
bool CMorphAutomat::Load(string AutomatFileName)
{
    Clear();

    FILE * fp = fopen(AutomatFileName.c_str(), "rb");
    if (!fp)
    {
        ErrorMessage (Format("Cannot open %s", AutomatFileName.c_str()));
        return false;
    }

    char buffer [256];

    // ---- nodes ----
    if (!fgets(buffer, 256, fp))
    {
        fclose(fp);
        return false;
    }
    const int NodesCount = atoi(buffer);
    if (NodesCount <= 0)
    {
        // Zero was already rejected; a negative count must not reach new[].
        m_NodesCount = 0;
        fclose(fp);
        return false;
    }
    m_NodesCount = NodesCount;

    assert (m_pNodes == 0);
    m_pNodes  = new CMorphAutomNode[m_NodesCount];
    assert (m_pNodes != 0);
    if (fread(m_pNodes, sizeof(CMorphAutomNode), m_NodesCount, fp) != static_cast<size_t>(NodesCount))
    {
        fclose(fp);
        return false;
    }

    // ---- relations ----
    if (!fgets(buffer, 256, fp))
    {
        fclose(fp);
        return false;
    }
    const int RelationsCount = atoi(buffer);
    if (RelationsCount < 0)
    {
        // Zero relations stays acceptable, a negative count does not.
        m_RelationsCount = 0;
        fclose(fp);
        return false;
    }
    m_RelationsCount = RelationsCount;

    assert (m_pRelations == 0);
    m_pRelations  = new CMorphAutomRelation[m_RelationsCount];
    assert (m_pRelations != 0);
    if (fread(m_pRelations, sizeof(CMorphAutomRelation), m_RelationsCount, fp) != static_cast<size_t>(RelationsCount))
    {
        fclose(fp);
        return false;
    }

    // ---- alphabet check ----
    {
        int Alphabet2Code[256];
        // A short read would leave part of the buffer uninitialised, so it is
        // treated like any other truncated section.
        if (fread(Alphabet2Code, sizeof(int), 256, fp) != 256)
        {
            fclose(fp);
            return false;
        }
        if (memcmp(Alphabet2Code, m_Alphabet2Code, 256*sizeof(int)) )
        {
            fclose(fp);
            string err = Format("%s alphabet has changed; cannot load morph automat", GetStringByLanguage(m_Language).c_str());
            ErrorMessage(err);
            return false;
        }
    }

    fclose(fp);

    BuildChildrenCache();

    return true;
}