FastFileReader::FastFileReader( const lem::Path& filename ) :Stream(true,false,filename) { hFile = NULL; hFileMap = NULL; OrgAdr = EndAdr = NULL; filesize.LowPart = filesize.HighPart = 0; binary=true; hFile = CreateFileW( filename.GetUnicode().c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL ); LEM_CHECKIT_Z(hFile!=(HANDLE)-1); if( hFile==(HANDLE)-1 ) return; hFileMap = CreateFileMapping( hFile, NULL, PAGE_READONLY, 0, 0, NULL ); LEM_CHECKIT_Z(hFileMap!=NULL); if( hFileMap==NULL ) return; OrgAdr = (lem::uint8_t*)MapViewOfFile( hFileMap, FILE_MAP_READ, 0, 0, 0 ); LEM_CHECKIT_Z(OrgAdr!=NULL); if( OrgAdr==NULL ) return; #if defined LEM_QT quint64 i64 = QFileInfo(filename.GetUnicode().to_qt()).size(); filesize.QuadPart = i64; #elif (defined LEM_MSC && _MSC_VER<1300) || defined LEM_WIN98 filesize.LowPart = GetFileSize( hFile, (unsigned long*)&filesize.HighPart ); LEM_CHECKIT_Z( filesize.LowPart!=INVALID_FILE_SIZE ); #else BOOL res = GetFileSizeEx( hFile, &filesize ); LEM_CHECKIT_Z( res!=0 ); #endif CurAdr = OrgAdr; EndAdr = OrgAdr+filesize.LowPart; return; }
/// Wraps a newly created std::fstream opened on `filename`.
///
/// Open mode selection (first matching rule wins):
///   - for_write (with or without for_read) : ios::in | ios::out
///   - for_read only                        : ios::in
///   - for_append only                      : ios::in | ios::app
///   - no flag set                          : no stream is created, `stream` stays NULL
///
/// The mode is passed as a literal expression in each branch instead of being
/// stored in a variable: an earlier variant kept it in a local, which was
/// abandoned because the type of ios::in / ios::out / ios::app differed
/// between the compilers this code was built with.
///
/// NOTE(review): write-only streams are opened with ios::in | ios::out and
/// for_append is ignored when for_read or for_write is set - confirm that this
/// is intended.
///
/// @param filename    path of the file to open (converted with GetAscii()).
/// @param for_read    open for reading.
/// @param for_write   open for writing.
/// @param for_append  open for appending.
StdFileStream::StdFileStream(
                             const lem::Path& filename,
                             bool for_read,
                             bool for_write,
                             bool for_append
                            )
 : Stream(for_read,for_write,filename)
{
 stream = NULL;

 if( for_write )
  {
   // Read-write and write-only requests share the same mode.
   stream = new fstream( filename.GetAscii().c_str(), ios::in | ios::out );
  }
 else if( for_read )
  {
   stream = new fstream( filename.GetAscii().c_str(), ios::in );
  }
 else if( for_append )
  {
   stream = new fstream( filename.GetAscii().c_str(), ios::in | ios::app );
  }

 do_del = true;
}
/// Loads application settings from the ini file `filename`.
///
/// @param filename  path of the configuration file.
/// @return true when the file exists and was passed through Open_Ini_Parser
///         and TryRead without an exception; false when the file does not
///         exist or when any exception was thrown along the way.
bool Base_Application::Read_Ini( const lem::Path& filename )
{
 bool loaded = false;

 try
  {
   // A missing config file is not worth an attempt to read it.
   if( filename.DoesExist() )
    {
     // Load the ini file with the settings.
     Ini_Parser parser;
     Open_Ini_Parser( parser, filename );
     TryRead( parser );
     loaded = true;
    }
  }
 catch(...)
  {
   // The configuration file was not found, or an unhandled error occurred
   // while reading it.
   #if LEM_DEBUGGING==1
   // merr->printf( "Error occured while loading ini file '%us'\n", filename.Get_Unicode().c_str() );
   #endif
   loaded = false;
  }

 return loaded;
}
/// Opens the SQLite database for this storage and configures the connection.
///
/// The handle returned by lem::sqlite_open_serialized is stored in `hdb`, then
/// a PRAGMA statement is executed on it.
///
/// @param connection_string  path passed (as Unicode) to lem::sqlite_open_serialized.
void NGramsStorage_SQLITE::Connect( const lem::Path &connection_string )
{
 // Statement executed once, right after the database has been opened.
 const char * const temp_store_pragma = "PRAGMA temp_store = MEMORY";

 hdb = lem::sqlite_open_serialized( connection_string.GetUnicode() );
 lem::sqlite_execute( hdb, temp_store_pragma );
}
void NGramsCollectors::BuildKnowledgeBase( Solarix::Search_Engine::File_Type_Detector& detector, const Solarix::Search_Engine::Scan_Options& scanning, const lem::Path &DocumentsFolder, const lem::Path &DestinationFolder, Solarix::Language language, bool echo ) { lem::Collect<Solarix::NGramsCollectorFlags::Flags> flags_step; lem::BoolCollect lemmatize_step; switch(language) { case Solarix::Russian: { Solarix::NGramsCollectorFlags::Flags ngrams_flags=0; ngrams_flags = Solarix::NGramsCollectorFlags::Words; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(true); ngrams_flags = Solarix::NGramsCollectorFlags::Raw2Grams; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(true); ngrams_flags=0; ngrams_flags = Solarix::NGramsCollectorFlags::Covalent2Grams; ngrams_flags |= Solarix::NGramsCollectorFlags::Noun_Noun; ngrams_flags |= Solarix::NGramsCollectorFlags::Adj_Noun | Solarix::NGramsCollectorFlags::Covalent2Grams; ngrams_flags |= Solarix::NGramsCollectorFlags::Noun_Verb | Solarix::NGramsCollectorFlags::Covalent2Grams; ngrams_flags |= Solarix::NGramsCollectorFlags::Adverb_Verb | Solarix::NGramsCollectorFlags::Covalent2Grams; ngrams_flags |= Solarix::NGramsCollectorFlags::Adverb_Adj | Solarix::NGramsCollectorFlags::Covalent2Grams; ngrams_flags |= Solarix::NGramsCollectorFlags::Prepos_Noun | Solarix::NGramsCollectorFlags::Covalent2Grams; ngrams_flags |= Solarix::NGramsCollectorFlags::Verb_Object | Solarix::NGramsCollectorFlags::Covalent2Grams; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(true); ngrams_flags = Solarix::NGramsCollectorFlags::Raw3Grams; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(true); /* ngrams_flags = Solarix::NGramsCollectorFlags::Raw4Grams; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(true); ngrams_flags = Solarix::NGramsCollectorFlags::Raw5Grams; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(true); */ break; } case Solarix::English: { Solarix::NGramsCollectorFlags::Flags 
ngrams_flags=0; ngrams_flags = Solarix::NGramsCollectorFlags::Words; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(true); ngrams_flags = Solarix::NGramsCollectorFlags::Raw2Grams; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(false); ngrams_flags = Solarix::NGramsCollectorFlags::Raw2Grams; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(true); ngrams_flags = Solarix::NGramsCollectorFlags::Raw3Grams; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(false); ngrams_flags = Solarix::NGramsCollectorFlags::Raw3Grams; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(true); /* ngrams_flags = Solarix::NGramsCollectorFlags::Raw4Grams; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(false); ngrams_flags = Solarix::NGramsCollectorFlags::Raw4Grams; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(true); ngrams_flags = Solarix::NGramsCollectorFlags::Raw5Grams; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(false); ngrams_flags = Solarix::NGramsCollectorFlags::Raw5Grams; flags_step.push_back(ngrams_flags); lemmatize_step.push_back(true); */ } default: LEM_STOPIT; } for( lem::Container::size_type k=0; k<flags_step.size(); k++ ) { const Solarix::NGramsCollectorFlags::Flags ngrams_flags=flags_step[k]; const bool lemmatize = lemmatize_step[k]!=0; const bool cleanup_ngrams = true; ngrams.clear(); Prepare( DestinationFolder, language, ngrams_flags, false, true, lemmatize, false, true, "", cleanup_ngrams ); std::vector<lem::Path> files; if( DocumentsFolder.IsFolder() ) DocumentsFolder.ListFiles( files, true ); else files.push_back(DocumentsFolder); for( lem::Container::size_type i=0; i<files.size(); i++ ) { const lem::Path &filename = files[i]; try { lem::StreamPtr file( new BinaryReader(filename) ); lem::Ptr<Solarix::Search_Engine::Base_File_Reader> reader = detector.FindReader( scanning, filename.GetUnicode(), to_upper(filename.GetExtension()), file ); if( reader.NotNull() ) { 
BeginDocument(echo); while( !reader->eof() ) { Solarix::Search_Engine::Base_File_Lexem bl; // Retrive next lexem from stream reader->read(bl); bool process=false; // Do process this lexem? switch(language) { case Solarix::All: process = true; break; case Solarix::Russian: // Only russian words (and numbers and punctuators) are allowed. process = bl.word.length() && (are_cyr_only(bl.word.c_str()) || is_int(bl.word.c_str()) || is_upunct(bl.word.front())); break; case Solarix::English: // Only english words are to be processed. process = bl.word.length() && (are_lat_only(bl.word.c_str()) || is_int(bl.word.c_str()) || is_upunct(bl.word.front())); break; case Solarix::French: process = bl.word.length() && (are_french_only(bl.word.c_str()) || is_int(bl.word.c_str()) || is_upunct(bl.word.front())); break; case Solarix::German: process = bl.word.length() && (are_german_only(bl.word.c_str()) || is_int(bl.word.c_str()) || is_upunct(bl.word.front())); break; case Solarix::Italian: process = bl.word.length() && (are_italian_only(bl.word.c_str()) || is_int(bl.word.c_str()) || is_upunct(bl.word.front())); break; case Solarix::Spanish: process = bl.word.length() && (are_spanish_only(bl.word.c_str()) || is_int(bl.word.c_str()) || is_upunct(bl.word.front())); break; } if( process ) { if( bl.word==L'\'' || bl.word==L'"' || bl.word.front()=='_' ) continue; Process(bl.word); } } EndDocument(echo); } } catch(...) { } } EndAll(echo); } return; }