// Looks up the lemmas of a word form in the SQLite-backed lemmatizer storage.
//
// word   - word form to look up; it is passed through to_upper() before being
//          placed into the query.
// lemmas - output list. It is cleared first and then receives one entry per
//          row returned by the query. When the query returns no rows, the
//          original (not upper-cased) word becomes the only element.
//
// NOTE(review): the word is spliced directly into the SQL text between single
// quotes; a word containing an apostrophe would presumably break or alter the
// statement - confirm whether callers guarantee sanitized input.
void LemmatizatorStorage_SQLITE::Lemmatize(
                                           const lem::UCString &word,
                                           lem::MCollect<lem::UCString> &lemmas
                                          )
{
 lemmas.clear();

 lem::MemFormatter sql;
 sql.printf( "SELECT L.lemma"
             " FROM lexemes_n X, lemmas L"
             " WHERE X.lexeme='%us' AND L.id=X.id_lemma", to_upper(word).c_str() );

 lem::Ptr<LS_ResultSet> rows( cnx->Select( lem::to_utf8( sql.string() ) ) );
 for( ; rows->Fetch(); )
  lemmas.push_back( rows->GetUCString(0) );

 // Nothing found in the storage: fall back to the word itself.
 if( lemmas.empty() )
  lemmas.push_back(word);
}
Exemple #2
0
// Generates words that are phonetically close to the given word.
//
// word        - source word.
// id_language - language id forwarded to AlephAuto.
// res         - receives every lexem of the pack returned by AlephAuto that is
//               not already present in res.
// rels        - receives, in step with res, the get_Val() value of each lexem
//               that was added.
// trace       - recognition trace forwarded to AlephAuto.
//
// NOTE(review): the original header comment said the source word itself is part
// of the returned list, but nothing here adds it explicitly - it only appears
// if AlephAuto returns it. Confirm the intended contract.
// NOTE(review): ownership of the pack returned by AlephAuto is not visible from
// this function; confirm whether the caller is expected to release it.
void LexicalAutomat::ProducePhonInv(
    const lem::UCString &word,
    int id_language,
    lem::MCollect<lem::UCString> &res,
    lem::MCollect<lem::Real1> &rels,
    LA_RecognitionTrace *trace
)
{
    // Mutated variants of the word.
    LA_Pack *pack = AlephAuto(id_language, word, 1, trace);

    for (Container::size_type j = 0; j < pack->size(); j++)
    {
        const Solarix::Lexem &ph_lex = *(pack->get(j));

        // Skip variants that are already in the result list.
        if (res.find(ph_lex) == UNKNOWN)
        {
            Real1 r = pack->get(j)->get_Val();
            rels.push_back(r);
            res.push_back(ph_lex);
        }
    }

    return;
}
// Restricts exported_nodes to the entries named in must_be_exported.
//
// An empty must_be_exported clears exported_nodes completely. Otherwise every
// exported node whose name equals the node_name of some element of
// must_be_exported is kept (first matching element wins) and is re-labelled
// with that element's as_name. The stored name pointer is the address of
// as_name inside must_be_exported, so that collection has to stay alive for
// as long as the names are used.
void SynPatternResult::FilterExportedNodes(const lem::MCollect< ExportNode > & must_be_exported)
{
    if (must_be_exported.empty())
    {
        exported_nodes.clear();
        return;
    }

    lem::MCollect< std::pair<const lem::UCString*, const Word_Form*> > kept;
    for (lem::Container::size_type inode = 0; inode < exported_nodes.size(); ++inode)
    {
        const lem::UCString & node_name = *exported_nodes[inode].first;

        // Locate the first export rule that mentions this node name.
        lem::Container::size_type irule = 0;
        while (irule < must_be_exported.size() && !(must_be_exported[irule].node_name == node_name))
            ++irule;

        // Found: carry the node over, possibly under a different name.
        if (irule < must_be_exported.size())
            kept.push_back(std::make_pair(&must_be_exported[irule].as_name, exported_nodes[inode].second));
    }

    exported_nodes = kept;
}
Exemple #4
0
// Lemmatizes a sequence of words.
//
// words  - input word forms.
// lemmas - output list. In the fallback branch one lemma per input word is
//          appended; in the model branch the list is passed to
//          LemmatizeViaModel together with the words.
//
// On the first call the model section is read from `bin` at model_pos: a
// boolean flag tells whether a model is available, and if so LoadModel() is
// invoked. When LEM_THREADS is defined the whole call runs under the critical
// section `cs`.
void Lemmatizator::Lemmatize( const lem::MCollect<lem::UCString> & words, lem::MCollect<lem::UCString> &lemmas )
{
 #if defined LEM_THREADS
 lem::Process::CritSecLocker lock(&cs);
 #endif

 // Deferred one-time check for (and loading of) the model.
 if( !model_loaded )
  {
   bin->seekp( model_pos );
   model_loaded = true;
   model_available = bin->read_bool();
   if( model_available )
    LoadModel();
  }

 if( !model_available )
  {
   // No model: lemmatize each word independently.
   for( lem::Container::size_type iword=0; iword<words.size(); ++iword )
    {
     lem::UCString single_lemma;
     Lemmatize( words[iword], single_lemma );
     lemmas.push_back( single_lemma );
    }

   return;
  }

 LemmatizeViaModel( words, lemmas );
}
// Reads the integer terms of one n-gram from the source text.
//
// txtfile - parser positioned at the first term. Tokens are consumed while they
//           are integers; the first non-integer token is pushed back with seekp.
// dict    - provides the error stream used for the diagnostic.
// terms   - output list; each integer token is appended via lem::to_int.
// order   - n-gram order; exactly order+1 terms are expected in `terms` afterwards.
//
// Throws lem::E_ParserError (after printing a diagnostic that points at the
// starting position) when the number of terms does not match.
void LEMM_Compiler::LoadNGram( lem::Iridium::Macro_Parser & txtfile, Dictionary & dict, lem::MCollect<int> & terms, int order ) const
{
 lem::Iridium::BSourceState start_pos = txtfile.tellp();

 for(;;)
  {
   if( txtfile.eof() )
    break;

   lem::Iridium::BethToken tok = txtfile.read();
   if( !lem::is_int(tok.string()) )
    {
     // Not part of the n-gram: return the token to the stream and stop.
     txtfile.seekp(tok);
     break;
    }

   terms.push_back( lem::to_int(tok.string()) );
  }

 if( terms.size() == order+1 )
  return;

 dict.GetIO().merr().printf( "%vfDInvalid ngram%vn\n" );
 lem::Iridium::Print_Error( start_pos, txtfile );
 throw lem::E_ParserError();
}
// Remembers the result of a knowledge-base check for later lookup.
//
// id_facts   - id of the facts group; must not be UNKNOWN.
// arg_values - argument word forms of the check; must not be empty.
// res        - checking result to store.
//
// A new TME_KBChecker holding (arg_values, res) is allocated and inserted into
// kbid2item under the key (id_facts, first argument).
void TreeMatchingExperience::AddKBCheckerMatching( int id_facts, const lem::MCollect< const Solarix::Word_Form * > & arg_values, const KB_CheckingResult & res )
{
 LEM_CHECKIT_Z( id_facts!=UNKNOWN );
 LEM_CHECKIT_Z( arg_values.size()>0 );

 TME_KBChecker * cached_item = new TME_KBChecker( arg_values, res );

 kbid2item.insert(
                  std::make_pair(
                                 std::make_pair( id_facts, arg_values.front() ),
                                 cached_item
                                )
                 );
}
// Applies the crop rules to a word.
//
// word    - source word.
// results - receives the output of every rule that applied.
// rels    - receives, in step with results, GetRel() of the applied rule.
// trace   - optional; when not null, CropRuleApplied is reported for each hit.
//
// Prefix rules are tried first, then affix rules. In both cases the candidates
// are the rules stored under the hash computed by LA_CropRule::CalcHash for the
// word. Returns true when at least one rule applied; returns false right away
// when crop_rules is empty.
bool LA_PreprocessorRules::Crop(
    const lem::UCString &word,
    lem::MCollect<lem::UCString> &results,
    lem::MCollect<lem::Real1> &rels,
    LA_RecognitionTrace *trace
) const
{
    if (crop_rules.empty())
        return false;

    typedef CROP_RULES::const_iterator IT;

    bool any_applied = false;
    lem::UCString cropped;

    // Runs every rule of one hash bucket against the word and records the hits.
    auto run_bucket = [&](const std::pair<IT, IT> &bucket)
    {
        for (IT it = bucket.first; it != bucket.second; ++it)
        {
            const LA_CropRule *rule = it->second;
            if (!rule->Apply(word, cropped))
                continue;

            any_applied = true;
            results.push_back(cropped);
            rels.push_back(rule->GetRel());
            if (trace != nullptr)
                trace->CropRuleApplied(word, cropped, rule);
        }
    };

    // Prefix rules first...
    const LA_CropRule::HashType prefix_hash = LA_CropRule::CalcHash(word.c_str(), true, false);
    run_bucket(prefix_crop_rules.equal_range(prefix_hash));

    // ...then the affix rules.
    const LA_CropRule::HashType affix_hash = LA_CropRule::CalcHash(word.c_str(), false, true);
    run_bucket(affix_crop_rules.equal_range(affix_hash));

    return any_applied;
}
Exemple #8
0
// Looks up the tag set given by `tags` in the registry and, when it is not
// known yet, adds a new record through the database layer.
//
// tags - list of (int,int) pairs describing the tag set.
//
// Returns 0 for an empty list; otherwise the id of the already registered or
// newly created record.
//
// The pairs are copied and sorted with tags_sorter so that different orderings
// of the same set map to one record. For a new set the sorted copy is stored
// in both id2tags and tag_ptr; for a known set the copy is deleted.
// When LEM_THREADS is defined the work runs under the critical section `cs`.
int TagSets::Register( const lem::MCollect< std::pair<int,int> > &tags )
{
 if( tags.empty() )
  return 0;

 #if defined LEM_THREADS
 lem::Process::CritSecLocker lock(&cs); 
 #endif

 // Canonical (sorted) copy of the incoming pairs.
 lem::MCollect< std::pair<int,int> > *sorted_tags = new lem::MCollect< std::pair<int,int> >(tags);
 std::sort( sorted_tags->begin(), sorted_tags->end(), tags_sorter );

 // Is this tuple already cached?
 const int known_index = tag_ptr.find(*sorted_tags);
 if( known_index!=UNKNOWN )
  {
   delete sorted_tags;
   return tagset_id[known_index];
  }

 // Not cached: build the textual key "a b c d ..." and ask the database.
 lem::UFString key;
 if( tags.size()==1 )
  {
   key = lem::format_str( L"%d %d", tags.front().first, tags.front().second );
  }
 else if( tags.size()==2 )
  {
   key = lem::format_str( L"%d %d %d %d", sorted_tags->get(0).first, sorted_tags->get(0).second, sorted_tags->get(1).first, sorted_tags->get(1).second );
  }
 else
  {
   for( lem::Container::size_type k=0; k<sorted_tags->size(); ++k )
    {
     if(k>0) key += L' ';
     key += lem::format_str( L"%d %d", sorted_tags->get(k).first, sorted_tags->get(k).second );
    }
  }

 const int id = db->AddTagSet(key);

 // Remember the new record in the in-memory lookup structures.
 id2tags.insert( std::make_pair(id,sorted_tags) );
 tag_ptr.push_back( sorted_tags );
 tagset_id.push_back(id);

 return id;
}
Exemple #9
0
// Returns true when at least one element of y is found in x (x.find(...) is
// not UNKNOWN). An empty y yields false. Lists of one and two elements are
// handled without a loop.
static bool x_contains_any_of_y( const lem::MCollect<int> &x, const lem::MCollect<int> &y )
{
 const lem::Container::size_type ny = y.size();

 if( ny==1 )
  return x.find(y.front())!=UNKNOWN;

 if( ny==2 )
  {
   if( x.find(y.front())!=UNKNOWN )
    return true;

   return x.find(y.back())!=UNKNOWN;
  }

 // General case: stop at the first element of y that is present in x.
 bool hit = false;
 for( lem::Container::size_type k=0; k<ny && !hit; ++k )
  hit = x.find(y[k])!=UNKNOWN;

 return hit;
}
// Removes duplicate results from the list, keeping the first occurrence of each.
//
// results - in/out list of result pointers. Two results are duplicates when
//           their CalcHash() values are equal and SynPatternResult::Equals
//           reports them equal. The dropped pointers are only left out of the
//           list; nothing is deleted here.
void SynPatternResult::SelectUnique_WithoutRemoval(lem::MCollect<const SynPatternResult*> & results)
{
    lem::MCollect<int> kept_hashes;
    lem::MCollect<const SynPatternResult*> kept;

    for (lem::Container::size_type k = 0; k < results.size(); ++k)
    {
        const SynPatternResult * candidate = results[k];
        const int hash = candidate->CalcHash();

        // Scan the results kept so far; the hash comparison is a cheap
        // pre-filter before the full equality test.
        lem::Container::size_type i = 0;
        for (; i < kept.size(); ++i)
        {
            if (kept_hashes[i] == hash && SynPatternResult::Equals(candidate, kept[i]))
                break;
        }

        const bool is_new = (i == kept.size());
        if (is_new)
        {
            kept_hashes.push_back(hash);
            kept.push_back(candidate);
        }
    }

    results = kept;
}
Exemple #11
0
// Collects this position and up to `count` positions to the left of it.
//
// count         - how many further steps to the left may be taken.
// inverted_path - receives the positions in right-to-left order, starting
//                 with this one.
//
// The walk stops when count is used up, when IsBegin() is true for the current
// position, or when there is no previous position.
void LexerTextPos::Collect_Right2Left(int count, lem::MCollect<const LexerTextPos*> & inverted_path) const
{
    inverted_path.push_back(this);

    const bool can_step_left = count > 0 && !IsBegin() && previous != nullptr;
    if (!can_step_left)
        return;

    GetPrev()->Collect_Right2Left(count - 1, inverted_path);
}
// Appends every entry of exported_coords to `pairs`, in the iteration order of
// exported_coords. `pairs` is not cleared beforehand.
void SynPatternResult::GetExportCoordPairs(lem::MCollect< std::pair<int, int> > & pairs) const
{
    for (const auto & coord_state : exported_coords)
        pairs.push_back(coord_state);
}
// Restricts exported_coords to the coordinates listed in must_be_exported.
//
// must_be_exported - ids of the coordinates to keep. An empty list clears
//                    exported_coords entirely; otherwise only the entries whose
//                    key is found in the list survive.
void SynPatternResult::FilterExportedCoords(const lem::MCollect<int> & must_be_exported)
{
    if (must_be_exported.empty())
    {
        exported_coords.clear();
        return;
    }

    std::multimap< int /*id_coord*/, int /*id_state*/ > kept;
    for (const auto & coord_state : exported_coords)
    {
        const bool wanted = must_be_exported.find(coord_state.first) != UNKNOWN;
        if (wanted)
            kept.insert(coord_state);
    }

    exported_coords = kept;
}
// Appends all items of debug_trace2, in order, to this result's debug_trace.
void SynPatternResult::AppendDebugTrace(const lem::MCollect<SynPatternDebugTrace> & debug_trace2)
{
    for (const SynPatternDebugTrace & step : debug_trace2)
        debug_trace.push_back(step);
}
// Looks for a previously stored knowledge-base check result.
//
// id_facts   - id of the facts group; must not be UNKNOWN.
// arg_values - argument word forms; must not be empty.
// res        - output; written only when a stored item is found. It is
//              dereferenced without a null check.
//
// Candidates are the items stored in kbid2item under the key
// (id_facts, first argument); the first one whose arg_values compare equal to
// the given list wins. Returns true on a hit, false otherwise.
bool TreeMatchingExperience::FindKBCheckerMatching( int id_facts, const lem::MCollect< const Solarix::Word_Form * > & arg_values, KB_CheckingResult * res ) const
{
 LEM_CHECKIT_Z( id_facts!=UNKNOWN );
 LEM_CHECKIT_Z( arg_values.size()>0 );

 typedef KBID2ITEM::const_iterator IT;
 const std::pair<IT,IT> candidates = kbid2item.equal_range( std::make_pair( id_facts, arg_values.front() ) );

 // Advance to the first candidate with an identical argument list.
 IT it = candidates.first;
 while( it!=candidates.second && !( arg_values == it->second->arg_values ) )
  ++it;

 if( it==candidates.second )
  return false;

 *res = it->second->res;
 return true;
}
Exemple #16
0
void GeneratorLexer::CollectUsedWords( const LexerTextPos * t, lem::MCollect<int> & indeces ) const
{
 TOKEN2WORD::const_iterator it = token2word.find(t);
 if( it!=token2word.end() )
  indeces.push_back( it->second );

 if( !t->IsBegin() && t->GetPrev()!=NULL )
  CollectUsedWords( t->GetPrev(), indeces );

 return;
}
Exemple #17
0
// Builds a word form from a list of alternative versions.
//
// variants - non-empty list. The first element becomes the main version: its
//            name, normalized, pair, entry_key, val, score, origin_pos and
//            tokenizer_flags are copied into this object. Each remaining
//            element is copied into a newly allocated Word_Form stored in alt.
//
// The new object takes the next value of seq_iversion as its iversion.
Word_Form::Word_Form( const lem::MCollect<const Word_Form*> &variants )
{
 LEM_CHECKIT_Z( !variants.empty() );

 // Alternatives: everything except the first (main) version.
 for( lem::Container::size_type ialt=1; ialt<variants.size(); ++ialt )
  {
   alt.push_back( new Word_Form(*variants[ialt]) );
  }

 // The main version supplies this object's own fields.
 const Word_Form &primary = *variants[0];
 name = primary.name;
 normalized = primary.normalized;
 pair = primary.pair;
 entry_key = primary.entry_key;
 val = primary.val;
 score = primary.score;
 origin_pos = primary.origin_pos;
 tokenizer_flags = primary.tokenizer_flags;

 iversion = seq_iversion++;
}
Exemple #18
0
void LexerTextPos::CollectPathToLeft(int count, lem::MCollect<const Word_Form*> & org) const
{
    LEM_CHECKIT_Z(count >= 0);

    org.push_back(wordform);

    if (count > 0 && previous != NULL)
        previous->CollectPathToLeft(count - 1, org);

    return;
}
Exemple #19
0
void LexerTextPos::Collect_Right2Left(const LexerTextPos *left_boundary, lem::MCollect<const LexerTextPos*> & inverted_path) const
{
    LEM_CHECKIT_Z(left_boundary != nullptr);

    inverted_path.push_back(this);

    if (this != left_boundary && previous != nullptr)
        previous->Collect_Right2Left(left_boundary, inverted_path);

    return;
}
Exemple #20
0
void SG_DeclensionTable::GenerateForms(
                                       const Lexem &entry_name,
                                       lem::MCollect<Lexem> &res,
                                       const SynGram &sg,
                                       const SG_DeclensionAutomat &dsa 
                                      ) const
{
 res.reserve(form.size());

 for( lem::Container::size_type i=0; i<form.size(); i++ )
  {
   UCString frm( dsa.ProduceForm( entry_name, GetClass(), *form[i], sg ) );

   // Без повторов
   if( std::find( res.begin(), res.end(), frm )==res.end() )
    res.push_back( frm );
  }

 return;
}
Exemple #21
0
void SG_DeclensionTable::GenerateForms(
                                       const Lexem &entry_name,
                                       lem::MCollect<Lexem> & res,
                                       lem::PtrCollect<CP_Array> & form_dims, 
                                       const SynGram &sg,
                                       const SG_DeclensionAutomat &dsa 
                                      ) const
{
 res.reserve(form.size());

 for( lem::Container::size_type i=0; i<form.size(); i++ )
  {
   UCString frm( dsa.ProduceForm( entry_name, GetClass(), *form[i], sg ) );

   res.push_back( frm);
   form_dims.push_back( new CP_Array( form[i]->GetDim() ) );
  }

 return;
}
Exemple #22
0
// Collects the names registered in name2id whose id has no entry in id2count.
//
// unresolved_names - receives those names, in the iteration order of name2id.
//                    The list is not cleared beforehand.
void SynPatterns::GetUnresolvedForwardDeclarations( lem::MCollect<lem::UCString> & unresolved_names ) const
{
 for( const auto & name_and_id : name2id )
  {
   const bool has_count = id2count.find( name_and_id.second )!=id2count.end();
   if( !has_count )
    unresolved_names.push_back( name_and_id.first );
  }
}
Exemple #23
0
// Prints one line per final token to lem::mout: the token's index in the list,
// followed by the output of the single-token PrintLexerPerformance overload,
// followed by an end-of-line.
void SyntaxShell::PrintLexerPerformance( Solarix::BasicLexer & lexer, const lem::MCollect<const LexerTextPos*> & final_tokens )
{
 lem::Container::size_type itoken = 0;
 while( itoken<final_tokens.size() )
  {
   lem::mout->printf( "#%vf9%d%vn-->", CastSizeToInt(itoken) );
   PrintLexerPerformance( lexer, final_tokens[itoken] );
   lem::mout->eol();

   ++itoken;
  }
}
// Appends the syllable built from every non-boundary point to result_syllabs.
//
// result_syllabs - output list; not cleared beforehand.
// Normalized     - forwarded to BuildSyllab for each point.
//
// Points for which IsLeftBoundary() or IsRightBoundary() is true are skipped.
void SyllabContext::GetResultSyllabs(lem::MCollect<lem::UCString> & result_syllabs, bool Normalized) const
{
    for (auto pt : points)
    {
        const bool is_boundary = pt->IsLeftBoundary() || pt->IsRightBoundary();
        if (!is_boundary)
            result_syllabs.push_back(pt->BuildSyllab(Normalized));
    }
}
Exemple #25
0
// Appends the syllable built from every non-boundary point to result_syllabs.
//
// result_syllabs - output list; not cleared beforehand.
// Normalized     - forwarded to BuildSyllab for each point.
//
// Points for which IsLeftBoundary() or IsRightBoundary() is true are skipped.
void SyllabContext::GetResultSyllabs( lem::MCollect<lem::UCString> & result_syllabs, bool Normalized ) const
{
 for( lem::Container::size_type ipoint=0; ipoint<points.size(); ++ipoint )
  {
   const SyllabContextPoint * pt = points[ipoint];

   const bool is_boundary = pt->IsLeftBoundary() || pt->IsRightBoundary();
   if( !is_boundary )
    result_syllabs.push_back( pt->BuildSyllab(Normalized) );
  }
}
Exemple #26
0
// Tells whether `tag` is one of the tags listed in tags1 (br, hr, link, meta,
// img, input) - presumably the HTML tags that have no separate closing tag.
//
// tag - tag text to test. It matches a listed name when it begins with that
//       name (per eq_begi) and either ends right there or continues with a space.
//
// The shared list tags1 is filled on the first call, while it is still empty.
static bool IsHtmlClosed( const lem::UFString &tag )
{
 if( tags1.empty() )
  {
   const wchar_t* names[] = { L"br", L"hr", L"link", L"meta", L"img", L"input" };
   const int n_names = sizeof(names)/sizeof(names[0]);

   for( int k=0; k<n_names; ++k )
    tags1.push_back( lem::UFString(names[k]) );
  }

 for( lem::Container::size_type itag=0; itag<tags1.size(); ++itag )
  {
   const lem::UFString &known = tags1[itag];
   if( !tag.eq_begi(known) )
    continue;

   // The name must be followed by the end of the tag text or by a space.
   if( tag.length()==known.length() || tag[ known.length() ]==L' ' )
    return true;
  }

 return false;
}
Exemple #27
0
static bool IsTextDelimiterTag( const UFString &tag )
{
 if( tags2.empty() )
  {
   const wchar_t* stags[] = { L"p", L"br", L"table", L"td", L"tr", L"th",
                              L"ol", L"ul", L"li", L"dd", L"input", L"frame", L"div",
                              NULL
                            };

   int i=0;
   while(stags[i]!=NULL)
    tags2.push_back( lem::UFString(stags[i++]) );
  }

 for( lem::Container::size_type i=0; i<tags2.size(); ++i )
  {
   const lem::UFString &t = tags2[i];
   if( tag.eq_begi(t) && (tag.length()==t.length() || tag[ t.length() ]==L' ' ) ) 
    return true;
  }

 return false;
}
// *************************************************************************************
// Finds the paradigms whose conditions match the given base form and appends
// their ids to found_ids.
//
// PartOfSpeech - class id; UNKNOWN or ANY_STATE means "check every matcher".
//                Otherwise only the matchers stored in class2decl for that
//                class are checked (none if the class has no entry).
// entry_name   - base form passed to each matcher's Match().
// found_ids    - output list; not cleared beforehand.
//
// LoadFromDB() is called first when `loaded` is false. With LEM_THREADS the
// call runs under a reader guard on `cs`, and the load is wrapped in a writer
// guard constructed from that reader guard.
// NOTE(review): `loaded` is not re-checked after the writer guard is taken -
// confirm that LoadFromDB tolerates being entered by more than one thread.
// *************************************************************************************
void ParadigmaFinder::Find( int PartOfSpeech, const lem::UCString &entry_name, lem::MCollect<int> &found_ids )
{
 #if defined LEM_THREADS
 lem::Process::RWU_ReaderGuard rlock(cs);
 #endif

 if( !loaded )
  {
   #if defined LEM_THREADS
   lem::Process::RWU_WriterGuard wlock(rlock);
   #endif
   LoadFromDB();
  }

 const bool any_class = PartOfSpeech==UNKNOWN || PartOfSpeech==ANY_STATE;
 if( any_class )
  {
   // No class restriction: try every matcher.
   for( lem::Container::size_type k=0; k<matchers.size(); ++k )
    {
     if( matchers[k]->Match(entry_name) )
      found_ids.push_back(ids[k]);
    }

   return;
  }

 const CLASS2DECL::const_iterator bucket = class2decl.find(PartOfSpeech);
 if( bucket==class2decl.end() )
  return;

 // Only the matchers registered for this class.
 for( lem::Container::size_type k=0; k<bucket->second->size(); ++k )
  {
   const auto & id_and_matcher = bucket->second->get(k);
   if( id_and_matcher.second->Match(entry_name) )
    found_ids.push_back( id_and_matcher.first );
  }
}
Exemple #29
0
// Emits a combined suffix feature for two tokens located relative to the focus token.
//
// b              - receives the feature string "sfx[offset1,offset2]=id1,id2".
// token_features - per-token features; suffix_id of both tokens is used.
// ifocus         - index of the focus token.
// offset1/2      - offsets of the two tokens relative to ifocus.
//
// Nothing is emitted unless both resulting indices lie inside token_features.
void BasicModel::PullFeatures2( lem::MCollect<lem::CString> & b, const lem::PtrCollect<ModelTokenFeatures> & token_features, int ifocus, int offset1, int offset2 ) const
{
 const int iword1 = ifocus + offset1;
 const int iword2 = ifocus + offset2;

 const bool first_inside = iword1 >= 0 && iword1 < token_features.size();
 if( !first_inside )
  return;

 const bool second_inside = iword2 >= 0 && iword2 < token_features.size();
 if( !second_inside )
  return;

 b.push_back( lem::format_str( "sfx[%d,%d]=%d,%d", offset1, offset2, token_features[iword1]->suffix_id, token_features[iword2]->suffix_id ).c_str() );

 // Other properties of the words could be emitted here as well.
 // ... TODO
}
void SyllabContext::Replace(int start_index, int count, lem::MCollect<SyllabContextPoint*> & new_points)
{
    for (int n = 0; n < count; ++n)
        points.Remove(start_index);

    for (lem::Container::size_type i = 0; i < new_points.size(); ++i)
    {
        const int new_index = start_index + CastSizeToInt(i);
        if (new_index == Count())
            points.push_back(new_points[i]);
        else
            points.Insert(new_index, new_points[i]);
    }

    return;
}