// Looks up the lemma(s) for `word` in the SQLite storage.
// The word is upper-cased and matched against lexemes_n; every lemma joined
// through id_lemma is appended to `lemmas`. When nothing is found, the word
// itself is returned as its own lemma, so the output list is never empty.
// NOTE(review): the SQL text is built by direct %us substitution of the word.
// If `word` can ever contain a quote character this is an injection/robustness
// hazard — a parameterized query would be safer. TODO confirm inputs are
// sanitized upstream.
void LemmatizatorStorage_SQLITE::Lemmatize( const lem::UCString &word, lem::MCollect<lem::UCString> &lemmas )
{
 lemmas.clear();

 lem::MemFormatter mem;
 mem.printf( "SELECT L.lemma"
             " FROM lexemes_n X, lemmas L"
             " WHERE X.lexeme='%us' AND L.id=X.id_lemma", to_upper(word).c_str() );

 lem::Ptr<LS_ResultSet> rs(cnx->Select(lem::to_utf8(mem.string())));

 // Collect every matching lemma.
 while( rs->Fetch() )
  {
   lemmas.push_back( rs->GetUCString(0) );
  }

 // Fallback: an unknown word is its own lemma.
 if( lemmas.empty() )
  {
   lemmas.push_back(word);
  }

 return;
}
// Генерация слов, фонетически близких к заданному word. // Возвращается список вариантов, включая исходное слово, и список их достоверностей. void LexicalAutomat::ProducePhonInv( const lem::UCString &word, int id_language, lem::MCollect<lem::UCString> &res, lem::MCollect<lem::Real1> &rels, LA_RecognitionTrace *trace ) { MCollect<LA_AA_list*> packs; LA_AA_list *list = new LA_AA_list; list->reserve(16); list->push_back(LA_AA_item(word, Real1(100))); // Теперь мутированные варианты. LA_Pack *pack = AlephAuto(id_language, word, 1, trace); for (Container::size_type j = 0; j < pack->size(); j++) { const Solarix::Lexem &ph_lex = *(pack->get(j)); if (res.find(ph_lex) == UNKNOWN) { Real1 r = pack->get(j)->get_Val(); rels.push_back(r); res.push_back(ph_lex); } } return; }
// Keeps only the exported nodes whose names appear in `must_be_exported`,
// re-exporting each survivor under the alias (as_name) given by the matching
// filter entry. An empty filter list means "export nothing".
void SynPatternResult::FilterExportedNodes(const lem::MCollect< ExportNode > & must_be_exported)
{
    if (must_be_exported.empty())
    {
        exported_nodes.clear();
        return;
    }

    lem::MCollect< std::pair<const lem::UCString*, const Word_Form*> > kept;

    for (lem::Container::size_type inode = 0; inode < exported_nodes.size(); ++inode)
    {
        const lem::UCString & node_name = *exported_nodes[inode].first;

        for (lem::Container::size_type ispec = 0; ispec < must_be_exported.size(); ++ispec)
        {
            const ExportNode & spec = must_be_exported[ispec];
            if (spec.node_name == node_name)
            {
                // Found a reference to carry over, possibly under a new name.
                kept.push_back(std::make_pair(&spec.as_name, exported_nodes[inode].second));
                break;
            }
        }
    }

    exported_nodes = kept;
    return;
}
// Lemmatizes a batch of words. On first call the statistical model flag is
// lazily read from the binary stream; when a model is available the whole
// batch goes through LemmatizeViaModel, otherwise each word is lemmatized
// independently via the single-word overload.
void Lemmatizator::Lemmatize( const lem::MCollect<lem::UCString> & words, lem::MCollect<lem::UCString> &lemmas )
{
#if defined LEM_THREADS
 // Serialize access: lazy model loading and stream seeks are not re-entrant.
 lem::Process::CritSecLocker lock(&cs);
#endif

 if( !model_loaded )
  {
   // One-time lazy load: position the stream at the stored model and read
   // the availability flag (and the model itself, if present).
   bin->seekp( model_pos );
   model_loaded = true;
   model_available = bin->read_bool();
   if( model_available )
    {
     LoadModel();
    }
  }

 if( model_available )
  {
   // Context-aware lemmatization of the whole token sequence.
   LemmatizeViaModel( words, lemmas );
  }
 else
  {
   // Fallback: word-by-word lemmatization without context.
   for( lem::Container::size_type i=0; i<words.size(); ++i )
    {
     lem::UCString lemma;
     Lemmatize( words[i], lemma );
     lemmas.push_back( lemma );
    }
  }

 return;
}
// Reads one ngram (a run of integer tokens) from the source file into `terms`.
// An ngram of the given order must contain exactly order+1 numbers; otherwise
// an error is reported at the ngram's start position and E_ParserError is thrown.
void LEMM_Compiler::LoadNGram( lem::Iridium::Macro_Parser & txtfile, Dictionary & dict, lem::MCollect<int> & terms, int order ) const
{
 // Remember where the ngram begins for error reporting.
 const lem::Iridium::BSourceState start = txtfile.tellp();

 // Consume consecutive integer tokens; push the first non-integer back.
 while( !txtfile.eof() )
  {
   lem::Iridium::BethToken tok = txtfile.read();

   if( !lem::is_int(tok.string()) )
    {
     txtfile.seekp(tok);
     break;
    }

   terms.push_back( lem::to_int(tok.string()) );
  }

 if( terms.size() != order+1 )
  {
   dict.GetIO().merr().printf( "%vfDInvalid ngram%vn\n" );
   lem::Iridium::Print_Error( start, txtfile );
   throw lem::E_ParserError();
  }

 return;
}
// Caches the outcome of a knowledge-base check, keyed by the fact id plus the
// first argument wordform (the full argument list is stored in the entry for
// exact matching at lookup time).
void TreeMatchingExperience::AddKBCheckerMatching( int id_facts, const lem::MCollect< const Solarix::Word_Form * > & arg_values, const KB_CheckingResult & res )
{
 LEM_CHECKIT_Z( id_facts!=UNKNOWN );
 LEM_CHECKIT_Z( arg_values.size()>0 );

 TME_KBChecker * entry = new TME_KBChecker( arg_values, res );
 const auto key = std::make_pair( id_facts, arg_values.front() );
 kbid2item.insert( std::make_pair( key, entry ) );
 return;
}
// Applies crop rules (prefix-cutting, then affix-cutting) to the word.
// Every successful rule application appends the cropped variant to `results`
// and its reliability to `rels`, and is optionally reported to `trace`.
// Returns true when at least one rule fired.
bool LA_PreprocessorRules::Crop( const lem::UCString &word, lem::MCollect<lem::UCString> &results, lem::MCollect<lem::Real1> &rels, LA_RecognitionTrace *trace ) const
{
    bool applied = false;

    if (!crop_rules.empty())
    {
        typedef CROP_RULES::const_iterator IT;
        lem::UCString result;

        // Shared body for both rule families (the two loops were previously
        // duplicated verbatim): try every rule in [range.first, range.second).
        auto apply_range = [&](const std::pair<IT, IT> & range)
        {
            for (IT it = range.first; it != range.second; ++it)
            {
                const LA_CropRule *r = it->second;
                if (r->Apply(word, result))
                {
                    applied = true;
                    results.push_back(result);
                    rels.push_back(r->GetRel());

                    if (trace != nullptr)
                    {
                        trace->CropRuleApplied(word, result, r);
                    }
                }
            }
        };

        // First the prefix rules...
        apply_range(prefix_crop_rules.equal_range(LA_CropRule::CalcHash(word.c_str(), true, false)));
        // ...then the affix (suffix) rules.
        apply_range(affix_crop_rules.equal_range(LA_CropRule::CalcHash(word.c_str(), false, true)));
    }

    return applied;
}
// Ищем в справочнике набор тегов, заданный списком tags. При необходимости // вносим в БД новую запись. Возвращается ID найденной или созданной записи. int TagSets::Register( const lem::MCollect< std::pair<int,int> > &tags ) { if( tags.empty() ) { return 0; } #if defined LEM_THREADS lem::Process::CritSecLocker lock(&cs); #endif // Для устранения вариантов записи одного и того же набора тегов отсортируем элементы по id_tag. lem::MCollect< std::pair<int,int> > *sorted_tags = new lem::MCollect< std::pair<int,int> >(tags); std::sort( sorted_tags->begin(), sorted_tags->end(), tags_sorter ); // Такой кортеж есть? const int i = tag_ptr.find(*sorted_tags); if( i==UNKNOWN ) { // Нет. // Поищем в БД. lem::UFString s; if( tags.size()==1 ) { s = lem::format_str( L"%d %d", tags.front().first, tags.front().second ); } else if( tags.size()==2 ) { s = lem::format_str( L"%d %d %d %d", sorted_tags->get(0).first, sorted_tags->get(0).second, sorted_tags->get(1).first, sorted_tags->get(1).second ); } else { for( lem::Container::size_type i=0; i<sorted_tags->size(); ++i ) { if(i>0) s += L' '; s += lem::format_str( L"%d %d", sorted_tags->get(i).first, sorted_tags->get(i).second ); } } const int id = db->AddTagSet(s); id2tags.insert( std::make_pair(id,sorted_tags) ); tag_ptr.push_back( sorted_tags ); tagset_id.push_back(id); return id; } else { delete sorted_tags; return tagset_id[i]; } }
// Returns true when x contains at least one element of y; false for empty y.
// (The previous hand-unrolled special cases for |y|==1 and |y|==2 duplicated
// the general loop without changing behavior — a single loop covers all sizes.)
static bool x_contains_any_of_y( const lem::MCollect<int> &x, const lem::MCollect<int> &y )
{
 for( lem::Container::size_type i=0; i<y.size(); ++i )
  {
   if( x.find(y[i])!=UNKNOWN )
    return true;
  }

 return false;
}
// Removes duplicate results from `results`, keeping the first occurrence of
// each distinct result. Candidates are compared by hash first, then by deep
// equality. Discarded pointers are NOT deleted — ownership stays outside.
void SynPatternResult::SelectUnique_WithoutRemoval(lem::MCollect<const SynPatternResult*> & results)
{
    lem::MCollect<int> seen_hashes;
    lem::MCollect<const SynPatternResult*> seen;

    for (lem::Container::size_type k = 0; k < results.size(); ++k)
    {
        const SynPatternResult * candidate = results[k];
        const int h = candidate->CalcHash();

        bool duplicate = false;
        for (lem::Container::size_type i = 0; i < seen.size(); ++i)
        {
            // Cheap hash comparison first; full structural comparison only on a hash hit.
            if (seen_hashes[i] == h && SynPatternResult::Equals(candidate, seen[i]))
            {
                duplicate = true;
                break;
            }
        }

        if (!duplicate)
        {
            seen_hashes.push_back(h);
            seen.push_back(candidate);
        }
    }

    results = seen;
    return;
}
// Collects this token and up to `count` of its predecessors, walking the
// chain right-to-left, into inverted_path.
void LexerTextPos::Collect_Right2Left(int count, lem::MCollect<const LexerTextPos*> & inverted_path) const
{
    inverted_path.push_back(this);

    const bool can_go_left = count > 0 && !IsBegin() && previous != nullptr;
    if (can_go_left)
    {
        GetPrev()->Collect_Right2Left(count - 1, inverted_path);
    }

    return;
}
// Appends every exported (id_coord, id_state) pair to `pairs`.
void SynPatternResult::GetExportCoordPairs(lem::MCollect< std::pair<int, int> > & pairs) const
{
    for (const auto & coord_pair : exported_coords)
    {
        pairs.push_back(coord_pair);
    }
    return;
}
// Keeps only the exported coordinates whose id appears in `must_be_exported`.
// An empty filter list means "export nothing".
void SynPatternResult::FilterExportedCoords(const lem::MCollect<int> & must_be_exported)
{
    if (must_be_exported.empty())
    {
        exported_coords.clear();
        return;
    }

    std::multimap< int /*id_coord*/, int /*id_state*/ > kept;
    for (const auto & coord : exported_coords)
    {
        if (must_be_exported.find(coord.first) != UNKNOWN)
            kept.insert(coord);
    }

    exported_coords = kept;
    return;
}
// Appends every entry of `debug_trace2` to this result's own debug trace.
void SynPatternResult::AppendDebugTrace(const lem::MCollect<SynPatternDebugTrace> & debug_trace2)
{
    for (const auto & entry : debug_trace2)
    {
        debug_trace.push_back(entry);
    }
    return;
}
// Looks up a previously cached knowledge-base check result for the given fact
// id and argument list. On a hit, copies the cached result into *res and
// returns true; otherwise returns false.
bool TreeMatchingExperience::FindKBCheckerMatching( int id_facts, const lem::MCollect< const Solarix::Word_Form * > & arg_values, KB_CheckingResult * res ) const
{
 LEM_CHECKIT_Z( id_facts!=UNKNOWN );
 LEM_CHECKIT_Z( arg_values.size()>0 );

 // Candidates share the (fact id, first argument) key; the full argument
 // list disambiguates among them.
 const auto range = kbid2item.equal_range( std::make_pair( id_facts, arg_values.front() ) );

 for( auto it=range.first; it!=range.second; ++it )
  {
   if( it->second->arg_values == arg_values )
    {
     *res = it->second->res;
     return true;
    }
  }

 return false;
}
void GeneratorLexer::CollectUsedWords( const LexerTextPos * t, lem::MCollect<int> & indeces ) const { TOKEN2WORD::const_iterator it = token2word.find(t); if( it!=token2word.end() ) indeces.push_back( it->second ); if( !t->IsBegin() && t->GetPrev()!=NULL ) CollectUsedWords( t->GetPrev(), indeces ); return; }
// Builds a merged wordform from several alternatives: the first variant
// becomes the main form and the rest are deep-copied into the `alt` list.
Word_Form::Word_Form( const lem::MCollect<const Word_Form*> &variants )
{
 LEM_CHECKIT_Z( !variants.empty() );

 // The first version becomes the main one, so it is not copied into the alternatives.
 for( lem::Container::size_type i=1; i<variants.size(); ++i )
  alt.push_back( new Word_Form(*variants[i]) );

 // Copy the fields of the main (first) variant.
 name = variants[0]->name;
 normalized = variants[0]->normalized;
 pair = variants[0]->pair;
 entry_key = variants[0]->entry_key;
 val = variants[0]->val;
 score=variants[0]->score;
 origin_pos = variants[0]->origin_pos;
 tokenizer_flags = variants[0]->tokenizer_flags;

 // Every instance gets a fresh sequence version number.
 iversion = seq_iversion++;
 return;
}
// Collects the wordform of this token and of up to `count` predecessors,
// walking the token chain leftwards.
void LexerTextPos::CollectPathToLeft(int count, lem::MCollect<const Word_Form*> & org) const
{
    LEM_CHECKIT_Z(count >= 0);

    org.push_back(wordform);

    const bool keep_walking = count > 0 && previous != NULL;
    if (keep_walking)
        previous->CollectPathToLeft(count - 1, org);

    return;
}
// Collects tokens right-to-left from this one down to (and including)
// left_boundary, appending them to inverted_path.
void LexerTextPos::Collect_Right2Left(const LexerTextPos *left_boundary, lem::MCollect<const LexerTextPos*> & inverted_path) const
{
    LEM_CHECKIT_Z(left_boundary != nullptr);

    inverted_path.push_back(this);

    // Stop at the boundary token or at the start of the chain.
    if (this == left_boundary || previous == nullptr)
        return;

    previous->Collect_Right2Left(left_boundary, inverted_path);
}
void SG_DeclensionTable::GenerateForms( const Lexem &entry_name, lem::MCollect<Lexem> &res, const SynGram &sg, const SG_DeclensionAutomat &dsa ) const { res.reserve(form.size()); for( lem::Container::size_type i=0; i<form.size(); i++ ) { UCString frm( dsa.ProduceForm( entry_name, GetClass(), *form[i], sg ) ); // Без повторов if( std::find( res.begin(), res.end(), frm )==res.end() ) res.push_back( frm ); } return; }
void SG_DeclensionTable::GenerateForms( const Lexem &entry_name, lem::MCollect<Lexem> & res, lem::PtrCollect<CP_Array> & form_dims, const SynGram &sg, const SG_DeclensionAutomat &dsa ) const { res.reserve(form.size()); for( lem::Container::size_type i=0; i<form.size(); i++ ) { UCString frm( dsa.ProduceForm( entry_name, GetClass(), *form[i], sg ) ); res.push_back( frm); form_dims.push_back( new CP_Array( form[i]->GetDim() ) ); } return; }
void SynPatterns::GetUnresolvedForwardDeclarations( lem::MCollect<lem::UCString> & unresolved_names ) const { typedef std::map< lem::UCString, int >::const_iterator IT; for( IT it=name2id.begin(); it!=name2id.end(); ++it ) { if( id2count.find( it->second )==id2count.end() ) unresolved_names.push_back( it->first ); } return; }
// Prints the lexer performance report for every final token, one line each,
// prefixed with the token's ordinal number.
void SyntaxShell::PrintLexerPerformance( Solarix::BasicLexer & lexer, const lem::MCollect<const LexerTextPos*> & final_tokens )
{
 for( lem::Container::size_type itoken=0; itoken<final_tokens.size(); ++itoken )
  {
   lem::mout->printf( "#%vf9%d%vn-->", CastSizeToInt(itoken) );
   PrintLexerPerformance( lexer, final_tokens[itoken] );
   lem::mout->eol();
  }

 return;
}
// Collects the syllables of the final decomposition, skipping the artificial
// left/right boundary points. When Normalized is true the syllables are built
// in normalized form.
// NOTE(review): SOURCE appears to contain a second definition of
// GetResultSyllabs with an identical signature (the index-based loop variant).
// If both live in one translation unit this is a redefinition error — verify
// and keep only one.
void SyllabContext::GetResultSyllabs(lem::MCollect<lem::UCString> & result_syllabs, bool Normalized) const
{
    for (auto point : points)
    {
        if (point->IsLeftBoundary() || point->IsRightBoundary())
            continue;

        result_syllabs.push_back(point->BuildSyllab(Normalized));
    }
    return;
}
// Collects the syllables of the final decomposition, skipping the artificial
// left/right boundary points. When Normalized is true the syllables are built
// in normalized form.
// NOTE(review): this duplicates the range-for GetResultSyllabs definition
// above with an identical signature. If both live in one translation unit
// this is a redefinition error — verify and keep only one.
void SyllabContext::GetResultSyllabs( lem::MCollect<lem::UCString> & result_syllabs, bool Normalized ) const
{
 for( lem::Container::size_type i=0; i<points.size(); ++i )
  {
   const SyllabContextPoint * p = points[i];

   if( p->IsLeftBoundary() || p->IsRightBoundary() )
    continue;

   result_syllabs.push_back( p->BuildSyllab(Normalized) );
  }
 return;
}
// Returns true when `tag` names an HTML element that needs no closing tag
// (<br>, <hr>, <link>, ...). The comparison is case-insensitive (eq_begi) and
// tolerates attributes after the tag name (name followed by a space).
static bool IsHtmlClosed( const lem::UFString &tag )
{
 // Populate the lookup list on first use.
 // NOTE(review): this lazy init of the shared `tags1` is not guarded —
 // confirm single-threaded use.
 if( tags1.empty() )
  {
   const wchar_t* stags[] = { L"br", L"hr", L"link", L"meta", L"img", L"input", NULL };
   for( int k=0; stags[k]!=NULL; ++k )
    tags1.push_back( lem::UFString(stags[k]) );
  }

 for( lem::Container::size_type k=0; k<tags1.size(); ++k )
  {
   const lem::UFString &known = tags1[k];
   if( tag.eq_begi(known) && ( tag.length()==known.length() || tag[ known.length() ]==L' ' ) )
    return true;
  }

 return false;
}
static bool IsTextDelimiterTag( const UFString &tag ) { if( tags2.empty() ) { const wchar_t* stags[] = { L"p", L"br", L"table", L"td", L"tr", L"th", L"ol", L"ul", L"li", L"dd", L"input", L"frame", L"div", NULL }; int i=0; while(stags[i]!=NULL) tags2.push_back( lem::UFString(stags[i++]) ); } for( lem::Container::size_type i=0; i<tags2.size(); ++i ) { const lem::UFString &t = tags2[i]; if( tag.eq_begi(t) && (tag.length()==t.length() || tag[ t.length() ]==L' ' ) ) return true; } return false; }
// ************************************************************************************* // Ищем парадигмы, чьи условия подходят для указанной базовой формы, возвращает // список id таких парадигм. // ************************************************************************************* void ParadigmaFinder::Find( int PartOfSpeech, const lem::UCString &entry_name, lem::MCollect<int> &found_ids ) { #if defined LEM_THREADS lem::Process::RWU_ReaderGuard rlock(cs); #endif if( !loaded ) { #if defined LEM_THREADS lem::Process::RWU_WriterGuard wlock(rlock); #endif LoadFromDB(); } if( PartOfSpeech==UNKNOWN || PartOfSpeech==ANY_STATE ) { for( lem::Container::size_type i=0; i<matchers.size(); ++i ) if( matchers[i]->Match(entry_name) ) { found_ids.push_back(ids[i]); } } else { CLASS2DECL::const_iterator it=class2decl.find(PartOfSpeech); if( it!=class2decl.end() ) { for( lem::Container::size_type i=0; i<it->second->size(); ++i ) if( it->second->get(i).second->Match(entry_name) ) { found_ids.push_back( it->second->get(i).first ); } } } return; }
// Emits a pairwise suffix feature for the tokens at ifocus+offset1 and
// ifocus+offset2, provided both positions are valid token indexes.
// The feature string encodes both offsets and both suffix ids.
void BasicModel::PullFeatures2( lem::MCollect<lem::CString> & b, const lem::PtrCollect<ModelTokenFeatures> & token_features, int ifocus, int offset1, int offset2 ) const
{
 // Hoist the size into a signed int once: avoids the signed/unsigned
 // comparison between the int indexes and the container's size_type.
 const int n = static_cast<int>(token_features.size());

 const int iword1 = ifocus + offset1;
 const int iword2 = ifocus + offset2;

 if( iword1 >= 0 && iword1 < n && iword2 >= 0 && iword2 < n )
  {
   b.push_back( lem::format_str( "sfx[%d,%d]=%d,%d", offset1, offset2,
                                 token_features[iword1]->suffix_id,
                                 token_features[iword2]->suffix_id ).c_str() );

   // Other per-word properties could be emitted here as well.
   // ... TODO
  }

 return;
}
// Replaces `count` points starting at start_index with the points from
// new_points. The two counts do not have to be equal.
void SyllabContext::Replace(int start_index, int count, lem::MCollect<SyllabContextPoint*> & new_points)
{
    // Drop the old points: each Remove shifts the tail left, so removing at
    // the same index `count` times deletes the whole
    // [start_index, start_index+count) run.
    for (int n = 0; n < count; ++n)
        points.Remove(start_index);

    // Insert the replacements at their final positions; append when the
    // insertion point is exactly at the current end.
    for (lem::Container::size_type i = 0; i < new_points.size(); ++i)
    {
        const int new_index = start_index + CastSizeToInt(i);
        if (new_index == Count())
            points.push_back(new_points[i]);
        else
            points.Insert(new_index, new_points[i]);
    }
    return;
}