Beispiel #1
0
// Default constructor: the only work done here is a call to CheckLanguage()
// with no arguments.
// NOTE(review): CheckLanguage is declared outside this excerpt; what it checks
// or initialises cannot be told from here.
CWrapEngine::CWrapEngine()
{
	CheckLanguage();
}
Beispiel #2
0
// Applies a single attribute (Name, Value) to this grammar item.
//
// A Value that begins with a double quote must also end with one; the
// enclosing quotes are removed before the value is interpreted.
//
// Recognised attribute names:
//   "root"     - sets m_bSynMain; Value is not examined any further.
//   "type"     - Value is converted with StringToTokenType; on success the
//                value is additionally recorded in m_Attributes.
//   "hom"      - "yes" or "no"; sets m_bCanHaveManyHomonyms.
//   "grm"      - stored in m_MorphPattern.m_GrmAttribute.
//   "form"     - stored upper-cased in m_Token and verbatim in m_ItemStrId.
//   "register" - "AA", "aa" or "Aa"; sets m_Register.
//   "filename" - prefixed with GetPathByFile(CurrentSourceFileName) and then
//                recorded in m_Attributes.
// Any other name is recorded in m_Attributes as is.
//
// "hom", "grm", "register" and "filename" also give the item a default token
// type (RLE for morphRussian, LLE otherwise) if it is still OTHER_TOKEN_TYPE.
//
// Returns true on success. Returns false and fills ErrorStr when the value is
// badly quoted or is not acceptable for the given attribute.
bool	CGrammarItem::AddAttribute(string Name, string Value, MorphLanguageEnum Language, string& ErrorStr)
{
	// Unquote the value, rejecting an opening quote without a closing one.
	if (!Value.empty() && Value[0] == '"')
	{
		const string::size_type ValueLen = Value.length();
		if (ValueLen < 2 || Value[ValueLen - 1] != '"')
		{
			ErrorStr = Format("no matching quotation mark for attribute value \"%s\"",Value.c_str());
			return false;
		}
		Value = Value.substr(1, ValueLen - 2);
	}

	if (Name == "root")
	{
		m_bSynMain = true;
		return true;
	}

	if (Name == "type")
	{
		m_TokenType = StringToTokenType(Value);
		if (m_TokenType == OTHER_TOKEN_TYPE)
		{
			ErrorStr = Format("unknown token type:%s ",Value.c_str());
			return false;
		}
		// No return here: control continues to the end of the function,
		// where the value is also stored in m_Attributes.
	}

	if (Name == "hom")
	{
		// Validate first so that nothing is modified for a bad value.
		const bool bYes = (Value == "yes");
		if (!bYes && Value != "no")
		{
			ErrorStr = Format("Bad value for attribute \"hom\" (\"%s\"). It can be \"yes\" or \"no\"",Value.c_str());
			return false;
		}
		m_bCanHaveManyHomonyms = bYes;

		if (m_TokenType == OTHER_TOKEN_TYPE)
		{
			if (Language == morphRussian)
				m_TokenType = RLE;
			else
				m_TokenType = LLE;
		}
		return true;
	}

	if (Name == "grm")
	{
		m_MorphPattern.m_GrmAttribute = Value;

		if (m_TokenType == OTHER_TOKEN_TYPE)
		{
			if (Language == morphRussian)
				m_TokenType = RLE;
			else
				m_TokenType = LLE;
		}
		return true;
	}

	if (Name == "form")
	{
		m_Token = Value;
		RmlMakeUpper(m_Token, Language);
		m_ItemStrId = Value;

		// Derive the token type from the first character of the form, but
		// only if no type has been assigned yet.
		if (m_TokenType == OTHER_TOKEN_TYPE && !m_Token.empty())
		{
			const BYTE FirstChar = (BYTE)m_Token[0];
			if (ispunct(FirstChar))
			{
				m_TokenType = PUNCTUAT;
			}
			else if (isdigit(FirstChar))
			{
				m_TokenType = NUM;
			}
			else if (CheckLanguage(m_Token, Language))
			{
				// The type stays OTHER_TOKEN_TYPE when CheckLanguage fails.
				if (Language == morphRussian)
					m_TokenType = RLE;
				else
					m_TokenType = LLE;
			}
		}
		return true;
	}

	if (Name == "register")
	{
		// Validate first so that nothing is modified for a bad value.
		const bool bAllUpper = (Value == "AA");
		const bool bAllLower = (Value == "aa");
		const bool bFirstUpper = (Value == "Aa");
		if (!bAllUpper && !bAllLower && !bFirstUpper)
		{
			ErrorStr = Format("Bad value for attribute \"register\" (\"%s\"). It can be \"AA\", \"aa\" or \"Aa\"",Value.c_str());
			return false;
		}

		if (bAllUpper)
			m_Register = UpUp;
		else if (bAllLower)
			m_Register = LowLow;
		else
			m_Register = UpLow;

		if (m_TokenType == OTHER_TOKEN_TYPE)
		{
			if (Language == morphRussian)
				m_TokenType = RLE;
			else
				m_TokenType = LLE;
		}
		return true;
	}

	if (Name == "filename")
	{
		// The stored value is the given name prefixed with the result of
		// GetPathByFile for the current source file.
		Value = GetPathByFile(CurrentSourceFileName) + Value;

		if (m_TokenType == OTHER_TOKEN_TYPE)
		{
			if (Language == morphRussian)
				m_TokenType = RLE;
			else
				m_TokenType = LLE;
		}
	}

	// "type", "filename" and every unrecognised attribute end up here.
	m_Attributes[Name] = Value;
	return true;
}
// Looks WordStr up and returns the dictionary entry found for it (if any)
// together with the set of tags the word may carry.
//
// Tags are gathered from:
//   1. m_LexProbs, for the entry returned by lookup_word (tried with the word
//      as given, then with its lower-cased form);
//   2. the annotations stored for the word in m_CurrentSentenceWords2Annots;
//      when USE_TRIGRAM_LEMMATIZER is defined and no annotations are stored,
//      get_tags_from_lemmatizer_but_not_preps is used instead.
// If that yields nothing, a fallback is chosen:
//   - atoi(word) > 0 and the language is morphRussian: every registered tag
//     that starts with the literal compared below;
//   - the word starts with punctuation or fails CheckLanguage: the "UNK" tag;
//   - otherwise: up to the first 200 entries of m_TagsOrderedByUnigrams,
//     skipping tags that consist of a single punctuation character.
//
// Throws CExpc when the tag required by the first two fallbacks is missing.
CDictionarySearch CTrigramModel::find_word(const string& WordStr) const
{
	CDictionarySearch R;
	assert (!WordStr.empty());
	if (WordStr.empty())
	{
		// Reachable only when the assert above is compiled out: no entry is
		// reported and every tag number below m_TagsCount is allowed.
		R.m_pFoundWord = 0;
		for (WORD TagNo = 0; TagNo < m_TagsCount; TagNo++)
			R.m_PossibleWordTags.insert(TagNo);
		return R;
	}

	R.m_pFoundWord = lookup_word(WordStr);
	if (!R.m_pFoundWord)
	{
		// The word is not in the dictionary as written, so retry with its
		// lower-cased form.
		string LowerWord(WordStr);
		RmlMakeLower(LowerWord, m_Language);
		R.m_pFoundWord = lookup_word(LowerWord);
	}

	if (R.m_pFoundWord)
	{
		// Assign every tag recorded for this entry (the tags seen in the corpus).
		for (size_t LexNo = 0; LexNo < R.m_pFoundWord->m_Length; LexNo++)
		{
			const int Tag = m_LexProbs[R.m_pFoundWord->m_StartOffset + LexNo].m_Tag;
			R.m_PossibleWordTags.insert(Tag);
		}
	}

	// Add all possible tags coming from the morphological dictionary.
	map<string, const vector<CXmlMorphAnnot>* >::const_iterator AnnotIt = m_CurrentSentenceWords2Annots.find(WordStr);
	if (AnnotIt != m_CurrentSentenceWords2Annots.end())
	{
		get_tags_from_annots(*(AnnotIt->second), R.m_PossibleWordTags, WordStr);
	}
#ifdef  USE_TRIGRAM_LEMMATIZER
	else
	{
		get_tags_from_lemmatizer_but_not_preps(WordStr,R.m_PossibleWordTags);
	}
#endif

	// Something was found: no fallback is needed.
	if (!R.m_PossibleWordTags.empty())
		return R;

	if (atoi(WordStr.c_str()) > 0 && m_Language == morphRussian)
	{
		// NOTE(review): the literal below looks like a mis-decoded Cyrillic
		// string (presumably the prefix of a numeral tag); it is kept
		// byte-for-byte - confirm against the file's original encoding.
		for (size_t TagNo = 0; TagNo < m_RegisteredTags.size(); TagNo++)
		{
			const string& TagName = m_RegisteredTags[TagNo];
			if (TagName.length() > 3 && TagName.substr(0,4) == "„»—Ћ")
				R.m_PossibleWordTags.insert(TagNo);
		}

		if (R.m_PossibleWordTags.empty())
			throw CExpc ("Cannot find „»—Ћ tag");
	}
	else if (ispunct((BYTE)WordStr[0]) || !CheckLanguage(WordStr,m_Language))
	{
		const int UnkTag = find_tag("UNK");
		if (UnkTag == UnknownTag)
			throw CExpc ("Cannot find UNK tag");
		R.m_PossibleWordTags.insert(UnkTag);
	}
	else
	{
		if (!m_bQuiet)
			fprintf (stderr, "No information for word %s\n",WordStr.c_str());

		// Nothing is known about the word: allow the leading tags of
		// m_TagsOrderedByUnigrams, capped at 200.
		const size_t TagLimit = min((size_t)200, m_TagsOrderedByUnigrams.size());
		for (size_t OrderNo = 0; OrderNo < TagLimit; OrderNo++)
		{
			const WORD TagNo = m_TagsOrderedByUnigrams[OrderNo];
			const string& TagName = m_RegisteredTags[TagNo];

			// Skip tags that are a single punctuation character.
			if (TagName.length() <= 1 && ispunct((unsigned char)TagName[0]))
				continue;
			R.m_PossibleWordTags.insert(TagNo);
		}
	}

	return R;
}