Beispiel #1
0
// Opens `filename` read-only and maps the whole file into memory (Win32).
//
// On success:
//   OrgAdr - address of the first mapped byte,
//   CurAdr - current read position, initially equal to OrgAdr,
//   EndAdr - address one past the last byte of the file.
//
// If opening the file, creating the mapping or mapping the view fails, the
// constructor returns early (after the LEM_CHECKIT_Z check) and all three
// pointers stay NULL. When CreateFileW fails, hFile keeps the (HANDLE)-1 value
// it returned.
FastFileReader::FastFileReader( const lem::Path& filename )
:Stream(true,false,filename)
{
 hFile = NULL;
 hFileMap = NULL;
 OrgAdr = EndAdr = NULL;
 // Clear the read cursor as well, so that an early return below never leaves
 // it holding an indeterminate value.
 CurAdr = OrgAdr;
 filesize.LowPart = filesize.HighPart = 0;
 binary=true;

 hFile = CreateFileW(
                     filename.GetUnicode().c_str(),
                     GENERIC_READ,
                     FILE_SHARE_READ,
                     NULL,
                     OPEN_EXISTING,
                     0,
                     NULL
                    );

 LEM_CHECKIT_Z(hFile!=(HANDLE)-1);

 if( hFile==(HANDLE)-1 )
  return;

 // Size arguments of 0/0 make the mapping cover the current size of the file.
 hFileMap = CreateFileMapping( hFile, NULL, PAGE_READONLY, 0, 0, NULL );
 LEM_CHECKIT_Z(hFileMap!=NULL);

 if( hFileMap==NULL )
  return;

 // Offset 0 and length 0: map the view from the start to the end of the mapping.
 OrgAdr = (lem::uint8_t*)MapViewOfFile( hFileMap, FILE_MAP_READ, 0, 0, 0 );
 LEM_CHECKIT_Z(OrgAdr!=NULL);

 if( OrgAdr==NULL )
  return;

 // Obtain the file size; the API used depends on the build configuration.
 #if defined LEM_QT
 quint64 i64 = QFileInfo(filename.GetUnicode().to_qt()).size();
 filesize.QuadPart = i64;
 #elif (defined LEM_MSC && _MSC_VER<1300) || defined LEM_WIN98
 filesize.LowPart = GetFileSize( hFile, (unsigned long*)&filesize.HighPart );
 LEM_CHECKIT_Z( filesize.LowPart!=INVALID_FILE_SIZE );
 #else
 BOOL res = GetFileSizeEx( hFile, &filesize );
 LEM_CHECKIT_Z( res!=0 );
 #endif

 CurAdr = OrgAdr;

 // Use the full 64-bit size: taking only LowPart would place EndAdr in the
 // middle of the view for files of 4 GiB and larger.
 EndAdr = OrgAdr + static_cast<size_t>(filesize.QuadPart);

 return;
}
// Creates a stream backed by a heap-allocated fstream opened on `filename`.
//
//  for_read   - open for reading;
//  for_write  - open for writing;
//  for_append - open for appending (only consulted when neither for_read nor
//               for_write is set).
//
// Open mode actually used:
//   read only            -> ios::in
//   read+write, or write -> ios::in | ios::out
//   append               -> ios::in | ios::app
// When all three flags are false no fstream is created and `stream` stays NULL.
//
// The mode is written out as a literal expression at every call site instead
// of being stored in a variable: an earlier version kept it in a local and was
// abandoned because the type of ios::in / ios::out / ios::app differs between
// C++ compilers.
//
// NOTE(review): write-only mode uses ios::in | ios::out, which differs from
// the plain ios::out of the abandoned version - confirm this is intended.
StdFileStream::StdFileStream(
                             const lem::Path& filename,
                             bool for_read,
                             bool for_write,
                             bool for_append
                            )
 : Stream(for_read,for_write,filename)
{
 stream = NULL;

 if( for_read && !for_write )
  {
   // Pure reading.
   stream = new fstream( filename.GetAscii().c_str(), ios::in );
  }
 else if( for_read || for_write )
  {
   // Either read+write or write-only: both use the same mode.
   stream = new fstream( filename.GetAscii().c_str(), ios::in | ios::out );
  }
 else if( for_append )
  {
   stream = new fstream( filename.GetAscii().c_str(), ios::in | ios::app );
  }

 // Presumably marks `stream` as owned by this object (to be deleted by it)
 // - TODO confirm against the destructor.
 do_del = true;
}
// Loads application settings from the ini file `filename`.
//
// Returns true when the file exists and was opened and read without an
// exception. Returns false when the file does not exist or when anything
// thrown during the existence check, opening or reading was caught here.
// No exception leaves this function.
bool Base_Application::Read_Ini( const lem::Path& filename )
{
 try
  {
   // A missing config file is not worth trying to read.
   if( filename.DoesExist() )
    {
     // Load the ini file with the settings.
     Ini_Parser parser;

     Open_Ini_Parser(parser,filename);
     TryRead(parser);

     return true;
    }
  }
 catch(...)
  {
   // The configuration file was not found, or unhandled errors occurred
   // while reading it. No diagnostic is printed: the debug-only message that
   // used to live here is disabled.
  }

 return false;
}
// Opens the SQLite database designated by `connection_string` and stores the
// resulting handle in `hdb`, then sets the temp_store pragma on that
// connection.
void NGramsStorage_SQLITE::Connect( const lem::Path &connection_string )
{
 // The path is handed over in its Unicode form.
 hdb = lem::sqlite_open_serialized( connection_string.GetUnicode() );

 // Per the SQLite documentation, temp_store = MEMORY keeps temporary tables
 // and indices in memory instead of in temporary files.
 lem::sqlite_execute( hdb, "PRAGMA temp_store = MEMORY" );
}
// Builds the n-grams knowledge base from a set of documents.
//
// The work is done in several passes ("steps"). Each step is described by a
// set of collector flags plus a lemmatization switch, and the list of steps
// depends on `language` (only Russian and English have a step list; any other
// value reaches LEM_STOPIT). For every step the collector is re-prepared and
// all documents are read again, lexem by lexem.
//
//  detector          - used to find a suitable reader for every file;
//  scanning          - scan options passed through to detector.FindReader;
//  DocumentsFolder   - folder whose files are processed, or a single file;
//  DestinationFolder - passed to Prepare() for every step;
//  language          - selects the step list and the per-lexem filter;
//  echo              - passed through to BeginDocument/EndDocument/EndAll.
//
// Exceptions raised while processing an individual file are swallowed and the
// next file is processed.
void NGramsCollectors::BuildKnowledgeBase(
                                          Solarix::Search_Engine::File_Type_Detector& detector,
                                          const Solarix::Search_Engine::Scan_Options& scanning,
                                          const lem::Path &DocumentsFolder,
                                          const lem::Path &DestinationFolder,
                                          Solarix::Language language,
                                          bool echo 
                                         )
{
 // Parallel lists: flags_step[k] and lemmatize_step[k] describe step k.
 lem::Collect<Solarix::NGramsCollectorFlags::Flags> flags_step;
 lem::BoolCollect lemmatize_step;

 switch(language)
 {
  case Solarix::Russian:
  {
   Solarix::NGramsCollectorFlags::Flags ngrams_flags=0;

   ngrams_flags = Solarix::NGramsCollectorFlags::Words;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(true);

   ngrams_flags = Solarix::NGramsCollectorFlags::Raw2Grams;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(true);

   // Covalent 2-grams for all listed part-of-speech pairings in one step.
   ngrams_flags = Solarix::NGramsCollectorFlags::Covalent2Grams;
   ngrams_flags |= Solarix::NGramsCollectorFlags::Noun_Noun;
   ngrams_flags |= Solarix::NGramsCollectorFlags::Adj_Noun;
   ngrams_flags |= Solarix::NGramsCollectorFlags::Noun_Verb;
   ngrams_flags |= Solarix::NGramsCollectorFlags::Adverb_Verb;
   ngrams_flags |= Solarix::NGramsCollectorFlags::Adverb_Adj;
   ngrams_flags |= Solarix::NGramsCollectorFlags::Prepos_Noun;
   ngrams_flags |= Solarix::NGramsCollectorFlags::Verb_Object;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(true);

   ngrams_flags = Solarix::NGramsCollectorFlags::Raw3Grams;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(true);

/*
   ngrams_flags = Solarix::NGramsCollectorFlags::Raw4Grams;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(true);
  
   ngrams_flags = Solarix::NGramsCollectorFlags::Raw5Grams;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(true);
*/
   break;
  }

  case Solarix::English:
  {
   Solarix::NGramsCollectorFlags::Flags ngrams_flags=0;

   ngrams_flags = Solarix::NGramsCollectorFlags::Words;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(true);

   // Raw 2-grams and 3-grams are collected twice each: first without
   // lemmatization, then with it.
   ngrams_flags = Solarix::NGramsCollectorFlags::Raw2Grams;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(false);

   ngrams_flags = Solarix::NGramsCollectorFlags::Raw2Grams;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(true);

   ngrams_flags = Solarix::NGramsCollectorFlags::Raw3Grams;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(false);

   ngrams_flags = Solarix::NGramsCollectorFlags::Raw3Grams;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(true);
/*
   ngrams_flags = Solarix::NGramsCollectorFlags::Raw4Grams;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(false);

   ngrams_flags = Solarix::NGramsCollectorFlags::Raw4Grams;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(true);

   ngrams_flags = Solarix::NGramsCollectorFlags::Raw5Grams;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(false);

   ngrams_flags = Solarix::NGramsCollectorFlags::Raw5Grams;
   flags_step.push_back(ngrams_flags);
   lemmatize_step.push_back(true);
*/
   // Without this break control fell through into the default branch and
   // hit LEM_STOPIT for every English run.
   break;
  }

  default:
   // No step list is defined for any other language.
   LEM_STOPIT;
 }

 for( lem::Container::size_type k=0; k<flags_step.size(); k++ )
  {
   const Solarix::NGramsCollectorFlags::Flags ngrams_flags=flags_step[k];
   const bool lemmatize = lemmatize_step[k]!=0;

   const bool cleanup_ngrams = true;

   // Start every step with an empty collector list and re-prepare it for the
   // flags of this step.
   ngrams.clear();
   Prepare( DestinationFolder, language, ngrams_flags, false, true, lemmatize, false, true, "", cleanup_ngrams );

   // Collect the documents: either the contents of the folder or the single
   // file that DocumentsFolder designates.
   std::vector<lem::Path> files;

   if( DocumentsFolder.IsFolder() )
    DocumentsFolder.ListFiles( files, true );
   else
    files.push_back(DocumentsFolder);

   for( lem::Container::size_type i=0; i<files.size(); i++ )
    {
     const lem::Path &filename = files[i];

     try
      {
       lem::StreamPtr file( new BinaryReader(filename) );
  
       // Ask the detector for a reader matching this file; the extension is
       // passed in upper case.
       lem::Ptr<Solarix::Search_Engine::Base_File_Reader> reader = detector.FindReader(
                                                   scanning,
                                                   filename.GetUnicode(),
                                                   to_upper(filename.GetExtension()),
                                                   file 
                                                  );

       // Files for which no reader was found are skipped.
       if( reader.NotNull() )
        {
         BeginDocument(echo);
           
         while( !reader->eof() )
          {
           Solarix::Search_Engine::Base_File_Lexem bl;

           // Retrieve the next lexem from the stream
           reader->read(bl);
     
           bool process=false;

           // Should this lexem be processed? A non-empty lexem is accepted
           // when it consists only of the language's letters, is an integer,
           // or starts with a punctuator. Languages not listed here leave
           // `process` false.
           switch(language)
           { 
            case Solarix::All: process = true; break;

            case Solarix::Russian:
            // Only russian words (and numbers and punctuators) are allowed.
            process = bl.word.length() && (are_cyr_only(bl.word.c_str()) || is_int(bl.word.c_str()) || is_upunct(bl.word.front()));
            break;

            case Solarix::English:
            // Only english words (and numbers and punctuators) are allowed.
            process = bl.word.length() && (are_lat_only(bl.word.c_str()) || is_int(bl.word.c_str()) || is_upunct(bl.word.front()));
            break;

            case Solarix::French:
            process = bl.word.length() && (are_french_only(bl.word.c_str()) || is_int(bl.word.c_str()) || is_upunct(bl.word.front()));
            break;

            case Solarix::German:
            process = bl.word.length() && (are_german_only(bl.word.c_str()) || is_int(bl.word.c_str()) || is_upunct(bl.word.front()));
            break;

            case Solarix::Italian:
            process = bl.word.length() && (are_italian_only(bl.word.c_str()) || is_int(bl.word.c_str()) || is_upunct(bl.word.front()));
            break;

            case Solarix::Spanish:
            process = bl.word.length() && (are_spanish_only(bl.word.c_str()) || is_int(bl.word.c_str()) || is_upunct(bl.word.front()));
            break;
           }
            
           if( process )
            {
             // Quote characters and lexems starting with '_' are never
             // passed to the collectors.
             if( bl.word==L'\'' || bl.word==L'"'  || bl.word.front()=='_' )
              continue;

             Process(bl.word);
            }        
          }

         EndDocument(echo);
        }
      }
     catch(...)
      {
       // Best effort: a failure on one file is ignored and processing goes on
       // with the next file.
       // NOTE(review): if the exception is thrown after BeginDocument(), the
       // matching EndDocument() is not called for that file - confirm that
       // this is acceptable.
      }
    }

   EndAll(echo);
  }

 return;
}