// Default constructor: its only action is to call CheckLanguage() with no
// arguments.
CWrapEngine::CWrapEngine()
{
    CheckLanguage();
}
bool CGrammarItem::AddAttribute(string Name, string Value, MorphLanguageEnum Language, string& ErrorStr) { if (Value.length() > 0) if (Value[0] == '"') { if ( (Value.length()<2) || (Value[Value.length() - 1] != '"')) { ErrorStr = Format("no matching quotation mark for attribute value \"%s\"",Value.c_str()); return false; }; Value = Value.substr(1, Value.length()-2); }; if (Name == "root") { m_bSynMain = true; return true; }; if (Name == "type") { m_TokenType = StringToTokenType(Value); if (m_TokenType == OTHER_TOKEN_TYPE) { ErrorStr = Format("unknown token type:%s ",Value.c_str()); return false; } }; if (Name == "hom") { if (Value == "yes") m_bCanHaveManyHomonyms = true; else if (Value == "no") m_bCanHaveManyHomonyms = false; else { ErrorStr = Format("Bad value for attribute \"hom\" (\"%s\"). It can be \"yes\" or \"no\"",Value.c_str()); return false; }; if (m_TokenType == OTHER_TOKEN_TYPE) m_TokenType = (Language == morphRussian) ? RLE : LLE; return true; }; if (Name == "grm") { m_MorphPattern.m_GrmAttribute = Value; if (m_TokenType == OTHER_TOKEN_TYPE) m_TokenType = (Language == morphRussian) ? RLE : LLE; return true; }; if (Name == "form") { m_Token = Value; RmlMakeUpper(m_Token, Language); m_ItemStrId = Value; if ( (m_TokenType == OTHER_TOKEN_TYPE) && !m_Token.empty()) { if (ispunct((BYTE)m_Token[0])) m_TokenType = PUNCTUAT; else if (isdigit((BYTE)m_Token[0])) m_TokenType = NUM; else if (Language == morphRussian) { if (CheckLanguage(m_Token, Language)) m_TokenType = RLE; } else { if (CheckLanguage(m_Token, Language)) m_TokenType = LLE; } }; return true; }; if (Name == "register") { if (Value == "AA") m_Register = UpUp; else if (Value == "aa") m_Register = LowLow; else if (Value == "Aa") m_Register = UpLow; else { ErrorStr = Format("Bad value for attribute \"register\" (\"%s\"). It can be \"AA\", \"aa\" or \"Aa\"",Value.c_str()); return false; }; if (m_TokenType == OTHER_TOKEN_TYPE) m_TokenType = (Language == morphRussian) ? 
RLE : LLE; return true; }; if (Name == "filename") { Value = GetPathByFile(CurrentSourceFileName) + Value; if (m_TokenType == OTHER_TOKEN_TYPE) m_TokenType = (Language == morphRussian) ? RLE : LLE; } m_Attributes[Name] = Value; return true; };
// Looks up WordStr and collects the set of tags that may be assigned to it.
//
// Steps, in order:
//   1. lookup_word() on the word as given, then on its lower-cased form if the
//      first lookup fails; every tag listed for the found entry in m_LexProbs
//      is added.
//   2. If the word has an entry in m_CurrentSentenceWords2Annots, tags are
//      added via get_tags_from_annots(); otherwise, when
//      USE_TRIGRAM_LEMMATIZER is defined, via
//      get_tags_from_lemmatizer_but_not_preps().
//   3. If no tag has been found so far, a fallback is chosen:
//        - atoi(word) > 0 and the language is morphRussian: every registered
//          tag starting with the numeral-tag prefix literal;
//        - first character is punctuation, or CheckLanguage() rejects the
//          word: the "UNK" tag;
//        - otherwise: up to the first 200 tags of m_TagsOrderedByUnigrams,
//          skipping tags that are a single punctuation character.
//
// WordStr is expected to be non-empty (asserted). If it is empty and asserts
// are disabled, the result has a null m_pFoundWord and all tag numbers in
// [0, m_TagsCount).
//
// Returns the search result: the found dictionary entry (or 0) and the set of
// possible tags.
// Throws CExpc when a required fallback tag (numeral prefix or "UNK") is not
// registered.
CDictionarySearch CTrigramModel::find_word(const string& WordStr) const
{
    CDictionarySearch R;

    assert (!WordStr.empty());
    if (WordStr.empty())
    {
        //fprintf (stderr, "Empty word!\n");
        R.m_pFoundWord = 0;
        for (WORD i=0; i < m_TagsCount; i++)
            R.m_PossibleWordTags.insert(i);
        return R;
    }

    R.m_pFoundWord = lookup_word(WordStr);
    if (!R.m_pFoundWord)
    {
        // the word is not in the dictionary: retry with its lower-cased form
        string lower = WordStr;
        RmlMakeLower(lower, m_Language);
        R.m_pFoundWord = lookup_word(lower);
    }

    if (R.m_pFoundWord)
    {
        // assign all tags that occurred in the corpus
        for (size_t i=0; i < R.m_pFoundWord->m_Length; i++)
        {
            int Tag = m_LexProbs[R.m_pFoundWord->m_StartOffset + i].m_Tag;
            R.m_PossibleWordTags.insert(Tag);
        }
    }

    // get all possible tags from the morphological dictionary
    // (const_iterator: this method is const and never modifies the map)
    map<string, const vector<CXmlMorphAnnot>* >::const_iterator it = m_CurrentSentenceWords2Annots.find(WordStr);
    if (it != m_CurrentSentenceWords2Annots.end())
        get_tags_from_annots(*it->second,R.m_PossibleWordTags, WordStr);
#ifdef USE_TRIGRAM_LEMMATIZER
    else
        get_tags_from_lemmatizer_but_not_preps(WordStr,R.m_PossibleWordTags);
#endif

    if (R.m_PossibleWordTags.empty())
    {
        if ( atoi(WordStr.c_str()) > 0 && (m_Language==morphRussian) )
        {
            // NOTE(review): this literal looks like mis-decoded non-ASCII text;
            // it is kept byte-for-byte. Confirm that the source-file encoding
            // matches the encoding of the registered tags.
            // The prefix is matched over the literal's own length rather than a
            // hard-coded 4, so the test stays meaningful whatever the literal's
            // byte length is in the build's source encoding.
            const string NumeralTagPrefix = "„»—Ћ";
            for (size_t i=0; i < m_RegisteredTags.size(); i++)
                if (m_RegisteredTags[i].compare(0, NumeralTagPrefix.length(), NumeralTagPrefix) == 0)
                    R.m_PossibleWordTags.insert(i);

            if (R.m_PossibleWordTags.empty())
                throw CExpc ("Cannot find „»—Ћ tag");
        }
        else if ( ispunct((BYTE)WordStr[0]) || !CheckLanguage(WordStr,m_Language) )
        {
            int tag = find_tag("UNK");
            if (tag == UnknownTag)
                throw CExpc ("Cannot find UNK tag");
            R.m_PossibleWordTags.insert(tag);
        }
        else
        {
            // assign all tags
            if (!m_bQuiet)
                fprintf (stderr, "No information for word %s\n",WordStr.c_str());

            // The bound does not change inside the loop, so compute it once.
            const size_t MaxTagCount = min((size_t)200, m_TagsOrderedByUnigrams.size());
            for (size_t i=0; i < MaxTagCount; i++)
            {
                WORD tagno = m_TagsOrderedByUnigrams[i];
                // Bind by const reference: avoids a string copy per iteration,
                // and tag[0] on an empty const string is well defined.
                const string& tag = m_RegisteredTags[tagno];
                if (tag.length() > 1 || !ispunct((unsigned char)tag[0]))
                    R.m_PossibleWordTags.insert(tagno);
            }
        }
    }

    return R;
}