void WrittenTextAnalysisSession::AnalyzeIt(
    const lem::UFString & str,
    bool ApplyPatterns,
    bool DoSyntacticLinks,
    const ElapsedTimeConstraint & constraints
)
{
    processed_str = str;

    if (params.LanguageUnknown())
    {
        // The sentence language is not set explicitly, so guess it using statistical criteria.
        const int id_language = dict->GetLexAuto().GuessLanguage(str);
        if (id_language == UNKNOWN)
        {
            lem::MemFormatter msg;
            msg.printf("Cannot guess the language of the phrase [%us]", str.c_str());
            throw E_BaseException(msg.string());
        }

        params.SetLanguageID(id_language);

#if defined SOL_DEBUGGING
        if (trace != nullptr)
        {
            trace->LanguageGuessed(str, id_language);
        }
#endif
    }

    delete lexer;
    lexer = new WrittenTextLexer(str, params, dict, trace);

    Analyze(ApplyPatterns, DoSyntacticLinks, constraints);
    return;
}
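// A minimal usage sketch for AnalyzeIt, assuming a session already constructed
// with a loaded dictionary; the sample phrase and the default-constructed
// ElapsedTimeConstraint are illustrative assumptions, not part of the code above.
#if defined SOL_USAGE_EXAMPLE
static void Example_AnalyzeIt(WrittenTextAnalysisSession & session)
{
    // With no language set in the session parameters, the statistical
    // language guesser above runs before lexing starts.
    const lem::UFString phrase(L"Кошка ловит мышку.");
    const ElapsedTimeConstraint no_limit; // assumption: default means "no time limit"
    session.AnalyzeIt(phrase, /*ApplyPatterns=*/true, /*DoSyntacticLinks=*/true, no_limit);
}
#endif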
void TreeScorerResult::DeserializeExpression(int expr_type, const lem::UFString & serialized)
{
    type = expr_type;

    if (type == NumberScoreType)
    {
        // A plain integer score.
        score = lem::to_int(serialized);
    }
    else if (type == NGramScoreType)
    {
        // Fact id followed by a parenthesized, comma-separated argument list.
        lem::StrParser<lem::UFString> txt(serialized);
        id_fact = lem::to_int(txt.read());

        txt.read_it(L"(");
        while (!txt.eof())
        {
            if (txt.probe(L")"))
                break;

            if (!args.empty())
                txt.read_it(L",");

            args.push_back(lem::UCString(txt.read().c_str()));
        }
    }
    else if (type == FuncScoreType)
    {
        // The arguments and the function call are stored as a hex-encoded binary blob.
        lem::MemReadHexStream bin2(serialized.c_str());
        args.LoadBin(bin2);
        score_fun = TrFunCall::load_bin(bin2);
    }
    else
    {
        LEM_STOPIT;
    }

    return;
}
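// A sketch of the serialized forms accepted by DeserializeExpression; the
// concrete fact id and argument names below are invented for illustration:
//
//   NumberScoreType : "-5"          -> score = -5
//   NGramScoreType  : "17(Sbj,Obj)" -> id_fact = 17, args = { "Sbj", "Obj" }
//   FuncScoreType   : hex-encoded binary image of args plus a TrFunCall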
SG_DeclensionForm::SG_DeclensionForm(
    const lem::UFString & lexem_generator,
    const lem::UFString & rx_condition,
    const lem::UFString & rx_flexer_flags,
    const CP_Array & dims
)
    : dim(dims), form(lexem_generator.c_str())
{
    valid_condition = !rx_condition.empty();
    valid_flexer_flags = !rx_flexer_flags.empty();

    condition_str = rx_condition;
    flexer_flags_str = rx_flexer_flags;

    if (!condition_str.empty())
        condition = boost::wregex(condition_str.c_str(), boost::basic_regex<wchar_t>::icase);

    // Compile the flexer-flags pattern only when its source string is non-empty;
    // testing the freshly constructed regex object instead would always fail.
    if (!flexer_flags_str.empty())
        flexer_flags = boost::wregex(flexer_flags_str.c_str(), boost::basic_regex<wchar_t>::icase);

    return;
}
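// A construction sketch; the generator string, the condition pattern and the
// assumption that CP_Array is default-constructible are all illustrative.
#if defined SOL_USAGE_EXAMPLE
static void Example_DeclensionForm()
{
    CP_Array dims; // hypothetical: no coordinate pairs
    SG_DeclensionForm form(
        lem::UFString(L"ы"),      // lexem generator
        lem::UFString(L".+ка$"),  // compiled case-insensitively into `condition`
        lem::UFString(L""),       // empty: the flexer-flags regex stays uncompiled
        dims);
}
#endif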
bool SyntaxShell::TryCommand(const lem::UFString &_str)
{
    LEM_CHECKIT_Z(!_str.empty());

    if (_str == L"#help" || _str == L"?")
    {
        ShowHelp();
        return true;
    }

    if (_str.front() != L'#')
        return false;

    if (_str.eq_beg(L"# "))
        return true; // a comment line

    if (_str.eq_beg(L"#timeout"))
    {
        lem::MCollect<UCString> toks;
        lem::parse(_str, toks, false);
        MaxTimeout = lem::to_int(toks[1]);
        return true;
    }

    if (_str.eq_beg(L"#maxalt"))
    {
        lem::MCollect<UCString> toks;
        lem::parse(_str, toks, false);
        MaxAlt = lem::to_int(toks[1]);
        lem::mout->printf("MaxAlt=%d\n", MaxAlt);
        return true;
    }

    if (_str.eq_beg(L"#maxskiptoken"))
    {
        lem::MCollect<UCString> toks;
        lem::parse(_str, toks, false);
        MaxSkipToken = lem::to_int(toks[1]);
        lem::mout->printf("MaxSkipToken=%d\n", MaxSkipToken);

        if (MaxSkipToken > 0)
            CompleteAnalysisOnly = false;

        if (MaxAlt == 0 || MaxAlt == lem::int_max)
        {
            lem::mout->printf("Attention: it is highly recommended to use %vfE#maxalt%vn NNN in order to limit the search tree depth\n");
        }

        return true;
    }

    if (_str.eq_beg(L"#sem"))
    {
        lem::MCollect<UCString> toks;
        lem::parse(_str, toks, false);
        FindFacts = lem::to_bool(toks[1]);
        return true;
    }

    if (_str.eqi(L"#info"))
    {
        ShowDictionaryInfo();
        return true;
    }

    if (_str.eqi(L"#disconnect"))
    {
        sol_id.Delete();
        lem::mout->printf("Dictionary database is disconnected.\n");
        return true;
    }

    if (_str.eqi(L"#connect"))
    {
        LoadDictionary();
        return true;
    }

    if (_str.eq_begi(L"#tag"))
    {
        if (_str == L"#tag-")
        {
            // Reset the current tag filter.
            tags_ptr.Delete();
            tags.clear();
            return true;
        }

        lem::Collect<lem::UFString> toks;
        lem::parse(UFString(_str.c_str() + 4), toks, L"=");

        UCString tag_name, tag_value;
        if (toks.size() > 0)
            tag_name = toks[0].c_str();
        if (toks.size() > 1)
            tag_value = toks[1].c_str();

        tag_name.trim();
        tag_value.trim();

        const int itag = sol_id->GetSynGram().Get_Net().FindTag(tag_name);
        if (itag == UNKNOWN)
        {
            lem::mout->printf("Tag [%vfE%us%vn] not found\n", tag_name.c_str());
            return true;
        }

        const ThesaurusTag &tt = sol_id->GetSynGram().Get_Net().GetTagDefs()[itag];
        if (tt.CountValues() > 0)
        {
            const int ivalue = tt[tag_value];
            if (ivalue == UNKNOWN)
            {
                lem::mout->printf("Tag value [%vfE%us%vn] not found\n", tag_value.c_str());
                return true;
            }
        }

        tags_ptr = new TF_TagOrNullFilter(*sol_id, tag_name, tag_value);
        return true;
    }

    if (_str.eq_begi(L"#param"))
    {
        if (_str == L"#param-")
        {
            // Clear the parameter list.
            params.clear();
            return true;
        }

        lem::Collect<lem::UFString> toks;
        lem::parse(UFString(_str.c_str() + 7), toks, L"=");

        UCString param_name, param_value;
        if (toks.size() > 0)
            param_name = toks[0].c_str();
        if (toks.size() > 1)
            param_value = toks[1].c_str();

        param_name.trim();
        param_value.trim();

        params.push_back(std::make_pair(param_name, param_value));
        return true;
    }

    lem::UFString str = lem::right(_str, _str.length() - 1);

    lem::zbool ret;

    if (str == L"debug")
    {
        SetDebug(true);
        ret = true;
    }
    else if (str == L"nodebug")
    {
        SetDebug(false);
        ret = true;
    }
    else if (str == L"traceon")
    {
        SetDebug(true);
        traceon = true;
        debugger->Trace(true);
        ret = true;
    }
    else if (str == L"traceoff")
    {
        traceon = false;
        if (debugger.NotNull())
            debugger->Trace(false);
        ret = true;
    }
    else if (str == L"fuzzyon")
    {
        allow_fuzzy = true;
        mout->printf("Fuzzy projection is now %vfAON%vn\n");
        ret = true;
    }
    else if (str == L"fuzzyoff")
    {
        allow_fuzzy = false;
        mout->printf("Fuzzy projection is now %vfDOFF%vn\n");
        ret = true;
    }
    else if (str == L"disable_filters")
    {
        EnableFilters = false;
        ret = true;
    }
    else if (str == L"enable_filters")
    {
        EnableFilters = true;
        ret = true;
    }
    else if (str == L"schedule1")
    {
        CompleteAnalysisOnly = true;
        UseTopDownThenSparse = true;
        mout->printf("Workflow=%vfATOP-DOWN, TOP-DOWN INCOMPLETE%vn\n");
        ret = true;
    }
    else if (str == L"topdown")
    {
        UseTopDownThenSparse = false;
        CompleteAnalysisOnly = true;
        mout->printf("%vfAtop-down%vn analyzer is activated\n");
        ret = true;
    }
    else if (str == L"allow_incomplete")
    {
        CompleteAnalysisOnly = false;
        mout->printf("Incomplete analysis is %vfAALLOWED%vn\n");
        ret = true;
    }
    else if (str == L"disallow_incomplete")
    {
        CompleteAnalysisOnly = true;
        mout->printf("Incomplete analysis is %vfDDISALLOWED%vn\n");
        ret = true;
    }
    else if (str == L"allow_reco")
    {
        UseReconstructor = true;
        mout->printf("Token reconstructor is %vfAALLOWED%vn\n");
        ret = true;
    }
    else if (str == L"disallow_reco")
    {
        UseReconstructor = false;
        mout->printf("Token reconstructor is %vfDDISALLOWED%vn\n");
        ret = true;
    }
    else if (str == L"allow_model")
    {
        if (sol_id->GetLexAuto().GetModel().GetSequenceLabeler().IsAvailable() ||
            sol_id->GetLexAuto().GetModel().GetClassifier().IsAvailable())
        {
            ApplyModel = true;
            mout->printf("Morphology model is enabled\n");
        }
        else
        {
            mout->printf("Morphology model is not available\n");
        }

        ret = true;
    }
    else if (str == L"show")
    {
        if (current_analysis.NotNull())
        {
            const Res_Pack &pack = current_analysis->GetPack();
            mout->printf("\nResult pack contains %vfE%d%vn variators:\n", CastSizeToInt(pack.vars().size()));

            if (run_mode == MorphologyMode)
            {
                for (lem::Container::size_type i = 0; i < pack.vars().size(); i++)
                {
                    const Variator * var = pack.vars()[i];
                    for (lem::Container::size_type k = 0; k < var->size(); ++k)
                    {
                        const Tree_Node & root = var->get(k);
                        mout->printf("%d: ", CastSizeToInt(k));
                        root.Print(*lem::mout, sol_id->GetSynGram(), -1, true);
                        mout->eol();
                    }

                    mout->eol();
                    mout->eol();
                }
            }
            else
            {
                for (lem::Container::size_type i = 0; i < pack.vars().size(); i++)
                {
                    pack.vars()[i]->PrintV(*mout, sol_id->GetSynGram(), true);
                    mout->eol();
                    mout->eol();
                }
            }
        }

        ret = true;
    }
    else if (str == L"disallow_model")
    {
        ApplyModel = false;
        mout->printf("Morphology model is disabled\n");
        ret = true;
    }
    else if (str == L"tree")
    {
        if (current_analysis.NotNull())
        {
            Solarix::print_syntax_tree(current_analysis->GetString(), current_analysis->GetPack(), *sol_id, *lem::mout, false, true);
        }

        ret = true;
    }
    else if (str.eq_beg(L"recog"))
    {
        if (current_analysis.NotNull())
        {
            lem::mout->eol();
            current_analysis->GetLexer().PrintRecognitions(*lem::mout);
        }

        return true;
    }
    else if (str == L"tokenize")
    {
        SetMode(TokenizerMode);
        ret = true;
    }
    else if (str == L"lemmatize")
    {
        SetMode(LemmatizerMode);
        ret = true;
    }
    else if (str == L"speak")
    {
        SetMode(SpeakerMode);
        ret = true;
    }
    else if (str == L"syntax")
    {
        SetMode(SyntaxMode);
        ret = true;
    }
    else if (str == L"morphology")
    {
        SetMode(MorphologyMode);
        ret = true;
    }
    else if (str == L"debugger")
    {
        if (debugger.NotNull())
            debugger->ManageBreakpoints();
        ret = true;
    }
    else
    {
        lem::mout->printf("Invalid command %vfC%us%vn\n", str.c_str());
        ret = true;
    }

    return ret;
}
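// A sketch of a shell session driving TryCommand; only the command names come
// from the dispatch above, the described effects are paraphrased and any
// dictionary contents are assumed:
//
//   #maxalt 20       -> prints "MaxAlt=20"
//   #maxskiptoken 2  -> allows incomplete analysis, warns when MaxAlt is unlimited
//   #tag CLASS=noun  -> installs a TF_TagOrNullFilter for tag CLASS with value "noun"
//   #fuzzyon         -> enables fuzzy projection
//   #morphology      -> switches the shell into MorphologyMode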
void LexicalAutomat::TranslateLexem(lem::UFString &str, bool substitute_entries, int LanguageID)
{
    WideStringUcs4 cenum(str.c_str());

    lem::uint32_t c;
    lem::uint32_t prev_c = 0;
    lem::Ucs4ToUFString yield;

    while ((c = cenum.Fetch()) != 0)
    {
        if (c == L' ')
        {
            // Always keep a single space (more exactly, the lexem separator
            // character inside a multi-lexem).
            if (prev_c != Lexem::DelimiterChar)
            {
                yield.Add(Lexem::DelimiterChar);
                prev_c = c;
            }

            continue;
        }

        if (c == L'-' || c == L',' || c == L'\'')
        {
            if (prev_c != Lexem::DelimiterChar && prev_c != 0)
                yield.Add(Lexem::DelimiterChar);

            yield.Add(c);

            // Peek at the next character; if there is one, separate it as well.
            if (cenum.Fetch() != 0)
            {
                yield.Add(Lexem::DelimiterChar);
                cenum.Back();
            }

            prev_c = Lexem::DelimiterChar;
            continue;
        }

        Word_Coord wc;

        if (LanguageID == ANY_STATE)
            wc = GG->FindSymbol(c);
        else if (LanguageID == UNKNOWN)
            wc = GG->FindSymbol(c, GetDefaultAlphabets());
        else
            wc = GG->FindSymbol(c, LanguageID);

        if (wc.GetEntry() == UNKNOWN)
        {
            // The symbol is not found in the graphical grammar.
            yield.Add(c);
        }
        else
        {
            const GG_Entry &entry = GG->entries()[wc.GetEntry()];

            if (!substitute_entries || wc.GetForm() == UNKNOWN || wc.GetForm() == ANY_STATE)
                yield.Add(entry.GetForm(wc.GetForm()).GetName());
            else
                yield.Add(entry.GetName());
        }

        prev_c = c;
    }

    str = yield.ToString();
    return;
}
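// A behavioural sketch of TranslateLexem; the outputs are assumptions derived
// from the branches above, with Lexem::DelimiterChar rendered here as '|':
//
//   "foo  bar"    -> "foo|bar"        runs of spaces collapse to one delimiter
//   "jean-pierre" -> "jean|-|pierre"  '-', ',' and '\'' become separate lexems
//   other symbols are looked up in the graphical grammar and, when
//   substitute_entries is set and a concrete form matched, replaced by the
//   entry name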
void CasingCoder::RestoreCasing(int external_casing_state, lem::UFString &res, int ekey)
{
    if (ekey != UNKNOWN && ekey == UnknownEntries_ekey)
        return;

    const XLAT *xlat = GetXLAT(ekey);

    switch (external_casing_state)
    {
        case 1:
        {
            // Capitalize the first character, lowercase the rest.
            if (xlat->use_unicode)
            {
                res.to_Aa();
            }
            else
            {
                WideStringUcs4 src_enum(res.c_str());
                lem::UFString out;
                out.reserve(res.length() + 1);

                for (int i = 0; ; ++i)
                {
                    const lem::uint32_t src_ucs4 = src_enum.Fetch();
                    if (!src_ucs4)
                        break;

                    if (i == 0)
                        AddUpper(xlat, src_ucs4, out);
                    else
                        AddLower(xlat, src_ucs4, out);
                }

                res = out;
            }

            break;
        }

        case 2:
        {
            // All characters uppercase.
            if (xlat->use_unicode)
            {
                res.to_upper();
            }
            else
            {
                WideStringUcs4 src_enum(res.c_str());
                lem::UFString out;
                out.reserve(res.length() + 1);

                for (;;)
                {
                    const lem::uint32_t src_ucs4 = src_enum.Fetch();
                    if (!src_ucs4)
                        break;

                    AddUpper(xlat, src_ucs4, out);
                }

                res = out;
            }

            break;
        }

        case 3:
        {
            // Capitalize each lexem: the first character and every character
            // that follows a space or hyphen.
            if (xlat->use_unicode)
            {
                Solarix::MakeEachLexemAa(res);
            }
            else
            {
                bool capitalize = true;
                WideStringUcs4 src_enum(res.c_str());
                lem::UFString out;
                out.reserve(res.length() + 1);

                for (;;)
                {
                    const lem::uint32_t src_ucs4 = src_enum.Fetch();
                    if (!src_ucs4)
                        break;

                    if (capitalize)
                    {
                        AddUpper(xlat, src_ucs4, out);
                        capitalize = false;
                    }
                    else
                    {
                        AddLower(xlat, src_ucs4, out);
                        if (src_ucs4 == L' ' || src_ucs4 == L'-')
                            capitalize = true;
                    }
                }

                res = out;
            }

            break;
        }

        case 0:
        default:
        {
            // All characters lowercase.
            if (xlat->use_unicode)
            {
                res.to_lower();
            }
            else
            {
                WideStringUcs4 src_enum(res.c_str());
                lem::UFString out;
                out.reserve(res.length() + 1);

                for (;;)
                {
                    const lem::uint32_t src_ucs4 = src_enum.Fetch();
                    if (!src_ucs4)
                        break;

                    AddLower(xlat, src_ucs4, out);
                }

                res = out;
            }

            break;
        }
    }

    // res.subst_all( L" - ", L"-" );
    // res.subst_all( L" ' ", L"'" );
    return;
}
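// A sketch of the casing states restored above; the sample words are invented
// and the outputs assume a plain Latin XLAT table:
//
//   state 0 (default) : "paris"       -> "paris"        all lowercase
//   state 1           : "paris"       -> "Paris"        leading capital
//   state 2           : "paris"       -> "PARIS"        all uppercase
//   state 3           : "jean-pierre" -> "Jean-Pierre"  each lexem capitalized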
int SegmentingSentenceTokenizer::MatchLen(const lem::UFString & s, int i0) const
{
    return lookup->match_len(s.c_str() + i0);
}
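// A usage sketch for MatchLen, assuming the tokenizer was built with a filled
// lookup structure and that MatchLen is reachable from the calling code; the
// sample string is invented.
#if defined SOL_USAGE_EXAMPLE
static void Example_MatchLen(const SegmentingSentenceTokenizer & tokenizer)
{
    const lem::UFString s(L"словарь");
    // Length of the lookup match starting at offset 0 (assumed 0 when nothing matches).
    const int len = tokenizer.MatchLen(s, 0);
    (void)len;
}
#endif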