// Runs a query and returns column 0 of its first row as an int.
//
// hdb    - SQLite connection handle; checked non-NULL with LEM_CHECKIT_Z.
// Select - SQL text; checked non-empty with LEM_CHECKIT_Z.
//
// Returns -1 when the first sqlite3_step() does not yield a row.
// Throws E_BaseException (carrying the query text and the SQLite error
// message) when the statement cannot be prepared.
int lem::sqlite_select_int( struct sqlite3* hdb, const lem::FString & Select )
{
    LEM_CHECKIT_Z( hdb!=NULL );
    LEM_CHECKIT_Z( !Select.empty() );

    sqlite3_stmt *stmt = NULL;

    // The statement tail is never inspected, so no tail pointer is requested.
    const int prepared = sqlite3_prepare_v2( hdb, Select.c_str(), Select.length(), &stmt, NULL );
    if( prepared!=SQLITE_OK )
    {
        lem::MemFormatter msg;
        msg.printf( "SQLite error: can not execute query %s, message=%us", Select.c_str(), lem::sqlite_errmsg(hdb).c_str() );
        throw E_BaseException( msg.string() );
    }

    int retval = -1;

    // sqlite3_column_int is a plain C call and cannot throw, so it needs no
    // try/catch wrapper.
    if( sqlite3_step( stmt )==SQLITE_ROW )
        retval = sqlite3_column_int( stmt, 0 );

    sqlite3_finalize( stmt );
    return retval;
}
// Folds the statistics row of table NGRAM_STATS<order><suffix><segment>
// into the caller's accumulators.
//
// max_w - raised to the table's max_w<order> value if that is larger.
// min_w - combined with 0 (the table contributes a constant minimum of 0).
// n     - incremented by the table's count_n<order> value.
//
// Returns true only when the table exists and a row was fetched; the
// accumulators are left untouched otherwise.
bool NGramsDBMS::GetStat(
    int order,
    const lem::FString &suffix,
    const lem::FString &segment,
    float &min_w,
    float &max_w,
    lem::int64_t &n
)
{
    if (!TableExists(lem::format_str("NGRAM_STATS%d%s%s", order, suffix.c_str(), segment.c_str())))
        return false;

    lem::FString sql(lem::format_str("SELECT max_w%d, count_n%d FROM NGRAM_STATS%d%s%s", order, order, order, suffix.c_str(), segment.c_str()));

    std::unique_ptr<LS_ResultSet> rs(Select(sql));
    if (!rs->Fetch())
        return false;

    const float table_max = static_cast<float>(rs->GetInt(0));
    const float table_min = 0.F;
    const int table_count = rs->GetInt(1);

    max_w = max(max_w, table_max);
    min_w = min(min_w, table_min);
    n += table_count;

    return true;
}
// Collects 1-grams from NGRAM1<suffix><sgm> joined with the word table,
// read in descending order of w.
//
// Rows whose w is below min_freq are skipped. For every remaining word not
// already present in reslist, an entry (w, w / max_w1) is inserted; entries
// that already exist are left unchanged.
void NGramsDBMS::Find1Grams(
    const lem::FString &suffix,
    const lem::FString &sgm,
    int max_w1,
    int min_freq,
    std::map< UCString, std::pair<int, float> > & reslist
)
{
    FString sql = lem::format_str(
        "SELECT wrd.word, w"
        " FROM NGRAM1%s%s, NGRAM_WORDS%s%s WRD"
        " WHERE iword1=wrd.id ORDER BY w DESC",
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str());

    std::unique_ptr<LS_ResultSet> rs(Select(sql));

    while (rs->Fetch())
    {
        const int freq = rs->GetInt(1);
        if (freq < min_freq)
            continue;

        UCString word(rs->GetUCString(0));
        const float weight = float(freq) / max_w1;

        if (reslist.find(word) == reslist.end())
            reslist.insert(std::make_pair(word, std::make_pair(freq, weight)));
    }
}
// Prints every 3-gram of NGRAM3<suffix><sgm> to `to`, one per line, as
// "<word1> <word2> <word3> [<w>]".
void NGramsDBMS::Dump3Grams(
    const lem::FString &suffix,
    const lem::FString &sgm,
    lem::OFormatter &to
)
{
    FString sql = lem::format_str(
        "SELECT w1.word, w2.word, w3.word, w"
        " FROM NGRAM3%s%s, NGRAM_WORDS%s%s w1, NGRAM_WORDS%s%s w2, NGRAM_WORDS%s%s w3"
        " WHERE w1.id=iword1 AND w2.id=iword2 AND w3.id=iword3",
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str());

    std::unique_ptr<LS_ResultSet> rs(Select(sql));

    while (rs->Fetch())
    {
        UCString first(rs->GetUCString(0));
        UCString second(rs->GetUCString(1));
        UCString third(rs->GetUCString(2));
        const int weight = rs->GetInt(3);

        to.printf("%us %us %us [%d]\n", first.c_str(), second.c_str(), third.c_str(), weight);
    }
}
// Runs a query on this connection and returns column 0 of its first row as
// an int.
//
// sql           - SQL text to execute.
// default_value - returned when the first sqlite3_step() does not yield a row.
//
// Throws lem::E_BaseException (carrying the query text and the SQLite error
// message) when the statement cannot be prepared.
int StorageConnection_SQLITE::SelectInt( const lem::FString &sql, int default_value )
{
    sqlite3_stmt *stmt = NULL;

    // The statement tail is never inspected, so no tail pointer is requested.
    const int prepared = sqlite3_prepare_v2( hdb, sql.c_str(), sql.length(), &stmt, NULL );
    if( prepared!=SQLITE_OK )
    {
        lem::MemFormatter msg;
        msg.printf( "SQLite error: can not execute query %s, message=%us", sql.c_str(), lem::sqlite_errmsg(hdb).c_str() );
        throw lem::E_BaseException( msg.string() );
    }

    int retval = default_value;

    // sqlite3_column_int is a plain C call and cannot throw, so it needs no
    // try/catch wrapper.
    if( sqlite3_step( stmt )==SQLITE_ROW )
        retval = sqlite3_column_int( stmt, 0 );

    sqlite3_finalize( stmt );
    return retval;
}
// Builds the leading "INSERT INTO NGRAM<order><suffix>( columns ) " fragment
// for the n-gram table of the given order (1..5).
//
// Any other order triggers LEM_STOPIT and yields an empty string.
lem::FString NGramsStorage_SQLITE::StartBulkInsert( const lem::FString &suffix, int order )
{
    const char *head_format = NULL;

    switch( order )
    {
        case 1: head_format = "INSERT INTO NGRAM1%s( id, iword1, w ) "; break;
        case 2: head_format = "INSERT INTO NGRAM2%s( id, iword1, iword2, w ) "; break;
        case 3: head_format = "INSERT INTO NGRAM3%s( id, iword1, iword2, iword3, w ) "; break;
        case 4: head_format = "INSERT INTO NGRAM4%s( id, iword1, iword2, iword3, iword4, w ) "; break;
        case 5: head_format = "INSERT INTO NGRAM5%s( id, iword1, iword2, iword3, iword4, iword5, w ) "; break;
        default:
            LEM_STOPIT;
            return lem::FString();
    }

    return lem::format_str( head_format, suffix.c_str() );
}
// Looks up the id of `word` in NGRAM_WORDS<suffix><sgm>.
//
// Returns the id selected by the query, or -1 (the default passed to
// SelectInt) when no id is produced.
//
// NOTE(review): the UTF-8 word is pasted between single quotes without
// escaping, so a word that itself contains a single quote yields malformed
// SQL - confirm whether callers can pass such words.
int NGramsDBMS::FindWord(
    const lem::FString &suffix,
    const lem::FString &sgm,
    const lem::UCString &word
)
{
    FString query = lem::format_str(
        "SELECT id FROM NGRAM_WORDS%s%s WHERE word='%s'",
        suffix.c_str(),
        sgm.c_str(),
        to_utf8(word.c_str()).c_str());

    const int word_id = SelectInt(query, -1);
    return word_id;
}
// Creates one single-column index per word position of NGRAM<order><suffix>.
//
// Position k (0-based) indexes column iword<k+1> and is named
// ngrams_idx<order><k+3><suffix>. Indexes that already exist are skipped.
void NGramsDBMS::CreateIndeces2(const lem::FString &suffix, int order)
{
    for (int k = 0; k < order; ++k)
    {
        const int index_no = k + 3;
        const int column_no = k + 1;

        const bool already_there = IndexExists(
            format_str("ngrams_idx%d%d%s", order, index_no, suffix.c_str()),
            lem::format_str("NGRAM%d%s", order, suffix.c_str()));
        if (already_there)
            continue;

        ExecuteSql(lem::format_str(
            "CREATE INDEX ngrams_idx%d%d%s ON NGRAM%d%s(iword%d)",
            order, index_no, suffix.c_str(), order, suffix.c_str(), column_no));
    }
}
// Rebuilds the single statistics row for NGRAM<order><suffix>.
//
// Steps: clear NGRAM_STATS<order><suffix>, read max(w) (0 when the table is
// empty, via Coalesce) and count(*) from the n-gram table, then insert both
// values as max_w<order> / count_n<order>.
void NGramsDBMS::UpdateStatistics(const lem::FString &suffix, int order)
{
    ExecuteSql(lem::format_str("DELETE FROM NGRAM_STATS%d%s", order, suffix.c_str()).c_str());

    std::pair<int, int> totals = SelectIntPair(
        lem::format_str("SELECT Coalesce(max(w),0), count(*) FROM NGRAM%d%s", order, suffix.c_str()).c_str());

    FString insert_sql = format_str(
        "INSERT INTO NGRAM_STATS%d%s( max_w%d, count_n%d ) VALUES ( %d, %d )",
        order, suffix.c_str(), order, order,
        totals.first, totals.second);

    ExecuteSql(insert_sql);
}
// Returns sum(w) over NGRAM<order><suffix><sgm>, or 0 (the default passed to
// SelectInt) when the query yields no value.
//
// NOTE(review): the sum is read through SelectInt into an int before being
// widened to int64_t, so totals beyond the int range cannot be represented
// here - confirm whether that matters for real table sizes.
lem::int64_t NGramsDBMS::SumFrequency(
    int order,
    const lem::FString &suffix,
    const lem::FString &sgm
)
{
    FString query = lem::format_str(
        "SELECT sum(w)"
        " FROM NGRAM%d%s%s",
        order, suffix.c_str(), sgm.c_str());

    const int total = SelectInt(query, 0);
    return total;
}
// Finds the 1-gram made of the word `left` in NGRAM1<suffix><segment>.
//
// Returns whatever SelectIntPair yields for the (w, N.id) columns of the
// matching row.
//
// NOTE(review): the UTF-8 word is pasted between single quotes without
// escaping, so a word containing a single quote yields malformed SQL.
std::pair<int, int> NGramsDBMS::FindRawNGramsID(
    const lem::FString &suffix,
    const lem::FString &segment,
    const lem::UCString &left
)
{
    FString query = lem::format_str(
        "SELECT w, N.id"
        " FROM NGRAM_WORDS%s%s wrd1, NGRAM1%s%s N"
        " WHERE wrd1.word='%s'AND iword1=wrd1.id",
        suffix.c_str(), segment.c_str(),
        suffix.c_str(), segment.c_str(),
        to_utf8(left.c_str()).c_str());

    std::pair<int, int> found = SelectIntPair(query);
    return found;
}
// Итератация по всем 3-граммам void NGramsDBMS::ForEachRaw3( Ngrams &ngrams, const lem::FString &suffix, const lem::FString &sgm, NGramHandler *handler, int sorting_type ) { const char *sorting = ""; switch (sorting_type) { case 1: sorting = "ORDER BY id"; break; case 2: sorting = "ORDER BY w"; break; case 3: sorting = "ORDER BY w DESC"; break; } FString sql = lem::format_str("SELECT NG.id, iword1, iword2, iword3, wrd1.word, wrd2.word, wrd3.word, NG.w" " FROM NGRAM3%s%s NG, NGRAM_WORDS%s%s wrd1, NGRAM_WORDS%s%s wrd2, NGRAM_WORDS%s%s wrd3" " WHERE wrd1.id=iword1 AND wrd2.id=iword2 AND wrd3.id=iword3" " %s" , suffix.c_str(), sgm.c_str() , suffix.c_str(), sgm.c_str() , suffix.c_str(), sgm.c_str() , suffix.c_str(), sgm.c_str() , sorting ); std::unique_ptr<LS_ResultSet> rs(Select(sql)); while (rs->Fetch()) { const int id_ngram = rs->GetInt(0); const int id_word1 = rs->GetInt(1); const int id_word2 = rs->GetInt(2); const int id_word3 = rs->GetInt(3); UCString word1(rs->GetUCString(4)); UCString word2(rs->GetUCString(5)); UCString word3(rs->GetUCString(6)); const int f = rs->GetInt(7); const int ciw = f; if (!handler->Do(ngrams, id_ngram, ciw, id_word1, id_word2, id_word3, word1, word2, word3)) break; } return; }
void NGramsStorage_SQLITE::CreateTable_NGrams5( const lem::FString &suffix ) { char asql[1000]; #if LEM_DEBUGGING==1 sprintf( asql, "CREATE TABLE NGRAM5%s( " "id integer NOT NULL," "iword1 integer NOT NULL," "iword2 integer NOT NULL," "iword3 integer NOT NULL," "iword4 integer NOT NULL," "iword5 integer NOT NULL," "w integer NOT NULL" ")", suffix.c_str() ); #else sprintf( asql, "CREATE TABLE NGRAM5%s( " "id integer," "iword1 integer," "iword2 integer," "iword3 integer," "iword4 integer," "iword5 integer," "w integer" ")", suffix.c_str() ); #endif ExecuteSql(asql); sprintf( asql, "CREATE TABLE NGRAM_STATS5%s( " "max_w5 integer NOT NULL," "count_n5 integer NOT NULL" ")", suffix.c_str() ); ExecuteSql(asql); #if LEM_DEBUGGING==1 sprintf( asql, "CREATE UNIQUE INDEX NGRAM_UNQ5_%s ON NGRAM5%s( iword1, iword2, iword3, iword4, iword5 )", suffix.c_str(), suffix.c_str() ); ExecuteSql(asql); sprintf( asql, "CREATE UNIQUE INDEX ngrams_idx51%s ON NGRAM5%s(id)", suffix.c_str(), suffix.c_str() ); ExecuteSql(asql); #endif return; }
// Returns the word stored under id_word in NGRAM_WORDS<suffix><segment>.
//
// When no row matches, LEM_STOPIT is invoked and an empty string is returned.
lem::UCString NGramsDBMS::GetWord(const lem::FString &suffix, const lem::FString &segment, int id_word)
{
    FString query = lem::format_str(
        "SELECT word FROM NGRAM_WORDS%s%s WHERE id=%d",
        suffix.c_str(), segment.c_str(), id_word);

    std::unique_ptr<LS_ResultSet> rs(Select(query));

    if (!rs->Fetch())
    {
        LEM_STOPIT;
        return lem::UCString();
    }

    UCString word(rs->GetUCString(0));
    return word;
}
// Prints every row of NGRAM2<suffix><sgm> to `to` as raw word ids:
// "<iword1> <iword2> [<w>]", ids right-aligned to six columns.
void NGramsDBMS::Dump2Grams_1(
    const lem::FString &suffix,
    const lem::FString &sgm,
    lem::OFormatter &to
)
{
    FString query = lem::format_str(
        "SELECT iword1, iword2, w FROM NGRAM2%s%s",
        suffix.c_str(), sgm.c_str());

    std::unique_ptr<LS_ResultSet> rs(Select(query));

    while (rs->Fetch())
    {
        const int first_id = rs->GetInt(0);
        const int second_id = rs->GetInt(1);
        const int weight = rs->GetInt(2);

        to.printf("%6d %6d [%d]\n", first_id, second_id, weight);
    }
}
// Executes `sql` on this connection through sqlite3_exec.
//
// Throws lem::E_BaseException when sqlite3_exec does not return SQLITE_OK.
// The message carries this source location and the SQLite error text (not
// the failing SQL itself).
void StorageConnection_SQLITE::Execute( const lem::FString &sql )
{
    if( sqlite3_exec( hdb, sql.c_str(), NULL, NULL, NULL )==SQLITE_OK )
        return;

    lem::MemFormatter report;
    report.printf( "SQLite error in file %s:%d, message=%us", __FILE__, __LINE__, lem::sqlite_errmsg(hdb).c_str() );
    throw lem::E_BaseException( report.string() );
}
void StorageConnection_MySQL::Error( const lem::FString &sql ) { MySQLCnx *cnx = GetDb(); #if defined LEM_THREADS lem::Process::CritSecLocker guard(&cnx->cs); #endif lem::MemFormatter mem; mem.printf( "Error in MySQL, sql=%s, error=%us", sql.c_str(), lem::mysql_errmsg(cnx->mysql).c_str() ); throw lem::E_BaseException(mem.string()); }
// Finds the 5-gram (w1, w2, w3, w4, w5) in NGRAM5<suffix><sgm>.
//
// Returns whatever SelectIntPair yields for the (w, N.id) columns of the
// matching row.
//
// NOTE(review): the UTF-8 words are pasted between single quotes without
// escaping, so a word containing a single quote yields malformed SQL.
std::pair<int, int> NGramsDBMS::FindRawNGramsID(
    const lem::FString &suffix,
    const lem::FString &sgm,
    const lem::UCString &w1,
    const lem::UCString &w2,
    const lem::UCString &w3,
    const lem::UCString &w4,
    const lem::UCString &w5
)
{
    FString query = lem::format_str(
        "SELECT w, N.id"
        " FROM NGRAM_WORDS%s%s wrd1, NGRAM_WORDS%s%s wrd2, NGRAM_WORDS%s%s wrd3, NGRAM_WORDS%s%s wrd4, NGRAM_WORDS%s%s wrd5, NGRAM5%s%s N"
        " WHERE wrd1.word='%s' AND wrd2.word='%s' AND wrd3.word='%s' AND wrd4.word='%s' AND wrd5.word='%s' AND iword1=wrd1.id AND iword2=wrd2.id AND iword3=wrd3.id AND iword4=wrd4.id AND iword5=wrd5.id",
        // one (suffix, segment) pair per table reference: five word tables + NGRAM5
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str(),
        // the five words, in position order
        to_utf8(w1.c_str()).c_str(),
        to_utf8(w2.c_str()).c_str(),
        to_utf8(w3.c_str()).c_str(),
        to_utf8(w4.c_str()).c_str(),
        to_utf8(w5.c_str()).c_str());

    std::pair<int, int> found = SelectIntPair(query);
    return found;
}
// Checks INFORMATION_SCHEMA.TABLES for `table_name` in this connection's
// schema.
//
// Returns true when the count query yields exactly 1.
// With LEM_THREADS defined, the connection's critical section is held for
// the duration of the query.
bool StorageConnection_MySQL::DoesTableExist( const lem::FString &table_name )
{
    MySQLCnx *db = GetDb();

#if defined LEM_THREADS
    lem::Process::CritSecLocker guard(&db->cs);
#endif

    lem::FString query( lem::format_str(
        "SELECT count(*) FROM INFORMATION_SCHEMA.TABLES"
        " WHERE TABLE_SCHEMA='%s' AND"
        " TABLE_NAME='%s'",
        schema.c_str(), table_name.c_str() ) );

    const bool found = lem::mysql_select_int( db->mysql, query.c_str() )==1;
    return found;
}
// Collects the (left, right) neighbours of `center` over all 3-grams of
// NGRAM3<suffix><sgm> whose middle word equals `center`.
//
// Rows whose w is below min_freq are skipped. For each remaining row the
// value w / max_w3 is inserted into reslist under the key (left, right), or
// added to the existing entry for that key.
//
// NOTE(review): the UTF-8 word is pasted between single quotes without
// escaping, so a word containing a single quote yields malformed SQL.
void NGramsDBMS::FindRaw3GramsWithCenter(
    const lem::FString &suffix,
    const lem::FString &sgm,
    float max_w3,
    const lem::UCString &center,
    int min_freq,
    std::map< std::pair<lem::UCString, lem::UCString>, float > & reslist
)
{
    FString sql = lem::format_str(
        "SELECT wrd1.word, wrd3.word, w"
        " FROM NGRAM3%s%s, NGRAM_WORDS%s%s wrd1, NGRAM_WORDS%s%s wrd2, NGRAM_WORDS%s%s wrd3"
        " WHERE wrd2.word='%s' AND iword2=wrd2.id AND wrd1.id=iword1 AND wrd3.id=iword3",
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str(),
        to_utf8(center.c_str()).c_str());

    std::unique_ptr<LS_ResultSet> rs(Select(sql));

    while (rs->Fetch())
    {
        const int f = rs->GetInt(2);
        if (f < min_freq)
            continue;

        UCString cw1(rs->GetUCString(0));
        UCString cw3(rs->GetUCString(1));

        const float ff = float(f) / max_w3;

        auto it = reslist.find(std::make_pair(cw1, cw3));
        if (it == reslist.end())
            reslist.insert(std::make_pair(std::make_pair(cw1, cw3), ff));
        else
            it->second += ff;
    }
}
// Executes `sql` through mysql_query; a non-zero result is handed to
// Error(sql), which throws.
//
// With LEM_THREADS defined, the connection's critical section is held for
// the duration of the call.
void StorageConnection_MySQL::Execute( const lem::FString &sql )
{
    MySQLCnx *db = GetDb();

#if defined LEM_THREADS
    lem::Process::CritSecLocker guard(&db->cs);
#endif

    const int status = mysql_query( db->mysql, sql.c_str() );
    if( status!=0 )
        Error(sql);
}
// Checks whether `index_name` exists on `table_name` in this connection's
// schema, using SHOW INDEX.
//
// Returns true when the result set contains at least one row.
// Throws lem::E_BaseException (with the SQL text and the MySQL error
// message) when the query itself fails.
// With LEM_THREADS defined, the connection's critical section is held for
// the duration of the call.
bool StorageConnection_MySQL::DoesIndexExist( const lem::FString &index_name, const lem::FString &table_name )
{
    MySQLCnx *db = GetDb();

#if defined LEM_THREADS
    lem::Process::CritSecLocker guard(&db->cs);
#endif

    lem::FString query( lem::format_str(
        "SHOW INDEX FROM %s FROM %s WHERE Key_name='%s'",
        table_name.c_str(), schema.c_str(), index_name.c_str() ) );

    if( mysql_query( db->mysql, query.c_str() )!=0 )
    {
        lem::MemFormatter report;
        report.printf( "Error in MySQL, sql=%s, error=%us", query.c_str(), lem::mysql_errmsg(db->mysql).c_str() );
        throw lem::E_BaseException( report.string() );
    }

    bool exists = false;

    MYSQL_RES *rows = mysql_store_result( db->mysql );
    if( rows!=NULL )
    {
        // Any row at all means the index is present.
        if( mysql_fetch_row(rows) )
            exists = true;

        mysql_free_result(rows);
    }

    return exists;
}
// Runs a query and returns column 0 of its first row parsed as an int.
//
// sql           - SQL text to execute.
// default_value - returned when there is no result set, no row, or the
//                 first column is SQL NULL.
//
// Returns -1 when the column text cannot be parsed by lem::to_int.
// Throws lem::E_BaseException (with the SQL text and the MySQL error
// message) when the query itself fails.
// With LEM_THREADS defined, the connection's critical section is held for
// the duration of the call.
int StorageConnection_MySQL::SelectInt( const lem::FString &sql, int default_value )
{
    int retval=default_value;

    MySQLCnx *cnx = GetDb();

#if defined LEM_THREADS
    lem::Process::CritSecLocker guard(&cnx->cs);
#endif

    int ok = mysql_query( cnx->mysql, sql.c_str() );
    if( ok!=0 )
    {
        lem::MemFormatter mem;
        mem.printf( "Error in MySQL, sql=%s, error=%us", sql.c_str(), lem::mysql_errmsg(cnx->mysql).c_str() );
        throw lem::E_BaseException(mem.string());
    }

    MYSQL_RES *res = mysql_store_result(cnx->mysql);
    if( res!=NULL )
    {
        MYSQL_ROW row = mysql_fetch_row(res);
        if( row )
        {
            const char *str = row[0];

            // MySQL hands back a NULL pointer for a SQL NULL column (for
            // example an aggregate over an empty table). Treat that like
            // "no value" instead of passing a null pointer to the parser.
            if( str!=NULL )
            {
                if( !lem::to_int( str, &retval ) )
                    retval=-1;
            }
        }

        mysql_free_result(res);
    }

    return retval;
}
// Collects the left-hand words of all 2-grams in NGRAM2<suffix><sgm> whose
// second word equals `right`.
//
// Rows whose w is below min_freq are skipped. For each remaining row the
// value w / max_w2 is inserted into reslist under the left word, or added
// to the existing entry for that word.
//
// NOTE(review): the UTF-8 word is pasted between single quotes without
// escaping, so a word containing a single quote yields malformed SQL.
void NGramsDBMS::Find2GramsWithRight(
    const lem::FString &suffix,
    const lem::FString &sgm,
    float max_w2,
    const lem::UCString &right,
    int min_freq,
    std::map< lem::UCString, float > & reslist
)
{
    FString query = lem::format_str(
        "SELECT wrd2.word, w"
        " FROM NGRAM_WORDS%s%s wrd1, NGRAM2%s%s, NGRAM_WORDS%s%s wrd2"
        " WHERE wrd1.word='%s' AND iword2=wrd1.id AND wrd2.id=iword1",
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str(),
        suffix.c_str(), sgm.c_str(),
        to_utf8(right.c_str()).c_str());

    std::unique_ptr<LS_ResultSet> rs(Select(query));

    while (rs->Fetch())
    {
        const int freq = rs->GetInt(1);
        if (freq < min_freq)
            continue;

        UCString left_word(rs->GetUCString(0));
        const float share = float(freq) / max_w2;

        auto slot = reslist.find(left_word);
        if (slot != reslist.end())
            slot->second += share;
        else
            reslist.insert(std::make_pair(left_word, share));
    }
}
// Collects the ids of all words that appear to the left of word id
// `ie_right` in NGRAM2<suffix><segment>.
//
// ie_right - id of the right-hand word; checked against UNKNOWN with
//            LEM_CHECKIT_Z.
// reslist2 - receives, per left word id, the accumulated weight.
// min_w,
// max_w    - each row contributes (w - min_w) / max_w, clamped below at 0;
//            the contribution is 0 when max_w is not positive.
// min_freq - rows whose w is below this value are skipped.
void NGramsDBMS::FindLeftWords(
    const lem::FString &suffix,
    const lem::FString &segment,
    int ie_right,
    std::map< int, float > & reslist2,
    int min_w,
    int max_w,
    int min_freq
)
{
    LEM_CHECKIT_Z(ie_right != UNKNOWN);

    lem::FString sql(lem::format_str(
        "SELECT iword1, w FROM NGRAM2%s%s WHERE iword2=%d",
        suffix.c_str(), segment.c_str(), ie_right));

    std::unique_ptr<LS_ResultSet> rs(Select(sql));

    while (rs->Fetch())
    {
        const int freq = rs->GetInt(1);
        if (freq < min_freq)
            continue;

        const int ie1 = rs->GetInt(0);

        // All three operands are ints: convert before dividing, otherwise
        // the quotient is truncated to a whole number before it ever
        // becomes a float.
        float w = max_w > 0 ? static_cast<float>(freq - min_w) / max_w : 0.0F;
        if (w < 0.0F)
            w = 0.0F;

        std::map< int, float >::iterator it = reslist2.find(ie1);
        if (it == reslist2.end())
            reslist2.insert(std::make_pair(ie1, w));
        else
            it->second += w;
    }
}
// Adds a delta to the w column of many rows of NGRAM<order><suffix>.
//
// list - pairs of (row id, amount to add to w); one UPDATE is executed per
//        pair through a single prepared statement.
//
// Throws E_BaseException with the SQLite error message when the statement
// cannot be prepared, or when binding or stepping fails for any pair. The
// prepared statement is finalized on every exit path.
void NGramsStorage_SQLITE::UpdateNGrams(
    const lem::FString &suffix,
    int order,
    const lem::MCollect< std::pair<int,int> > & list
)
{
    // Built with lem::format_str (as elsewhere in this file) rather than
    // sprintf into a fixed-size stack buffer.
    const lem::FString sql = lem::format_str( "UPDATE NGRAM%d%s SET w=w+? WHERE id=?", order, suffix.c_str() );

    sqlite3_stmt *stmt2=NULL;
    int res = sqlite3_prepare_v2( hdb, sql.c_str(), -1, &stmt2, NULL );
    if( res!=SQLITE_OK )
    {
        const char *errmsg = sqlite3_errmsg(hdb);
        throw E_BaseException( lem::format_str( L"SQLite error %S", errmsg ).c_str() );
    }

    for( lem::Container::size_type i=0; i<list.size(); ++i )
    {
        // Parameter 1 is the increment, parameter 2 is the row id.
        res = sqlite3_bind_int( stmt2, 1, list[i].second );

        if( res==SQLITE_OK )
            res = sqlite3_bind_int( stmt2, 2, list[i].first );

        if( res==SQLITE_OK )
            res = sqlite3_step( stmt2 );

        // A failed bind leaves res at its error code, which is never
        // SQLITE_DONE, so it is reported here as well.
        if( res!=SQLITE_DONE )
        {
            // Capture the message before finalizing: later SQLite calls may
            // replace the text returned by sqlite3_errmsg.
            const auto message = lem::format_str( L"SQLite error %S", sqlite3_errmsg(hdb) );
            sqlite3_finalize(stmt2);
            throw E_BaseException( message.c_str() );
        }

        res = sqlite3_reset(stmt2);
    }

    sqlite3_finalize(stmt2);
}
// Appends column 0 of every row produced by `Select` to `list`.
//
// Select is checked non-empty with LEM_CHECKIT_Z. A statement that fails to
// prepare is silently ignored and leaves `list` unchanged.
void NGramsStorage_SQLITE::SelectFStrings( const lem::FString &Select, lem::Collect<lem::FString> &list )
{
    LEM_CHECKIT_Z( !Select.empty() );

    sqlite3_stmt *stmt = NULL;
    if( sqlite3_prepare_v2( hdb, Select.c_str(), -1, &stmt, NULL )!=SQLITE_OK )
        return;

    for( int rc = sqlite3_step( stmt ); rc==SQLITE_ROW; rc = sqlite3_step( stmt ) )
    {
        lem::FString value = lem::sqlite_column_fstring( stmt, 0 );
        list.push_back( value );
    }

    sqlite3_finalize( stmt );
}
// Runs `Select` and returns column 0 of its first row as an int.
//
// Select is checked non-empty with LEM_CHECKIT_Z. `def` is returned when the
// statement fails to prepare or its first step does not yield a row; no
// error is reported in either case.
int NGramsStorage_SQLITE::SelectInt( const lem::FString &Select, int def )
{
    LEM_CHECKIT_Z( !Select.empty() );

    sqlite3_stmt *stmt = NULL;
    if( sqlite3_prepare_v2( hdb, Select.c_str(), -1, &stmt, NULL )!=SQLITE_OK )
        return def;

    int value = def;
    if( sqlite3_step( stmt )==SQLITE_ROW )
        value = sqlite3_column_int( stmt, 0 );

    sqlite3_finalize( stmt );
    return value;
}
// Runs `Select` and returns columns 0 and 1 of its first row as a pair of
// ints.
//
// (-1, -1) is returned when the statement fails to prepare or its first
// step does not yield a row; no error is reported in either case.
std::pair<int,int> NGramsStorage_SQLITE::SelectIntPair( const lem::FString & Select )
{
    int first = -1;
    int second = -1;

    sqlite3_stmt *stmt = NULL;
    const int prepared = sqlite3_prepare_v2( hdb, Select.c_str(), -1, &stmt, NULL );

    if( prepared==SQLITE_OK )
    {
        const bool has_row = sqlite3_step( stmt )==SQLITE_ROW;
        if( has_row )
        {
            first = sqlite3_column_int( stmt, 0 );
            second = sqlite3_column_int( stmt, 1 );
        }

        sqlite3_finalize( stmt );
    }

    return std::make_pair( first, second );
}
// Creates index ngrams_idxw1<suffix> on the id column of
// NGRAM_WORDS<suffix>, unless IndexExists reports it is already there.
void NGramsDBMS::CreateIndecesWord2(const lem::FString &suffix)
{
    const bool already_there = IndexExists(
        format_str("ngrams_idxw1%s", suffix.c_str()),
        lem::format_str("NGRAM_WORDS%s", suffix.c_str()));
    if (already_there)
        return;

    ExecuteSql(lem::format_str(
        "CREATE INDEX ngrams_idxw1%s ON NGRAM_WORDS%s(id)",
        suffix.c_str(), suffix.c_str()));
}