bool Sqlite3Table::insert(sqlite3 * db, QueryData & inQuerydata) const { std::vector<std::string> all_columns (extract_column_names(inQuerydata)); if(all_columns.empty()) { std::cerr << "No insertion values !" << std::endl; return false; } std::stringstream _mainQuerySs; std::stringstream _subQuerySs; _mainQuerySs << "INSERT INTO " + mName + " ("; bool first_add(true); for(auto it(all_columns.begin()); it != all_columns.end(); it++) { if(!first_add) { _mainQuerySs << ", "; _subQuerySs << ", "; } _mainQuerySs << *it; _subQuerySs << "@" << *it; first_add = false; } _mainQuerySs << ") " << "VALUES (" << _subQuerySs.str() << ");"; bool _success(true); sqlite3_stmt * statement; std::cout << "Insert: " << _mainQuerySs.str() << std::endl; // Prepare the statement checkIfSqlError(sqlite3_prepare_v2(db, _mainQuerySs.str().c_str(),-1/*null-terminated*/,&statement,NULL), __FILE__, __LINE__); // PERFORM BINDING for(auto it(inQuerydata.mStringColumns.begin()); it != inQuerydata.mStringColumns.end(); it++) _success = (_success && checkIfSqlError(mColumns.find(it->first)->second->bind(statement, &it->second), __FILE__, __LINE__)); for(auto it(inQuerydata.mIntColumns.begin()); it != inQuerydata.mIntColumns.end(); it++) _success = (_success && checkIfSqlError(mColumns.find(it->first)->second->bind(statement, &it->second), __FILE__, __LINE__)); for(auto it(inQuerydata.mDoubleColumns.begin()); it != inQuerydata.mDoubleColumns.end(); it++) _success = (_success && checkIfSqlError(mColumns.find(it->first)->second->bind(statement, &it->second), __FILE__, __LINE__)); for(auto it(inQuerydata.mNullColumns.begin()); it != inQuerydata.mNullColumns.end(); it++) _success = (_success && checkIfSqlError(mColumns.find(*it)->second->bind(statement, nullptr), __FILE__, __LINE__)); if(!_success) // Binding failed return false; _success = (_success && checkIfSqlError(sqlite3_step(statement), __FILE__, __LINE__)); sqlite3_finalize(statement); return _success; }
bool Sqlite3Table::remove(sqlite3 * db, QueryData & inQuerydata, int & outColumnsRemoved) const { std::vector<std::string> all_columns (extract_column_names(inQuerydata)); std::stringstream ss; ss << "DELETE FROM " + mName; if(!all_columns.empty()) { ss << + " WHERE "; } bool first_add(true); for(auto it(all_columns.begin()); it != all_columns.end(); it++) { if(!first_add) { ss << " AND "; first_add = false; } ss << *it << " = @" << *it; } ss << ";"; // Prepare the statement sqlite3_stmt * statement; bool _success(true); _success = (_success && checkIfSqlError(sqlite3_prepare_v2(db, ss.str().c_str(),-1/*null-terminated*/,&statement,NULL), __FILE__, __LINE__)); // PERFORM BINDING for(auto it(inQuerydata.mStringColumns.begin()); it != inQuerydata.mStringColumns.end(); it++) _success = (_success && checkIfSqlError(mColumns.find(it->first)->second->bind(statement, &it->second), __FILE__, __LINE__)); for(auto it(inQuerydata.mIntColumns.begin()); it != inQuerydata.mIntColumns.end(); it++) _success = (_success && checkIfSqlError(mColumns.find(it->first)->second->bind(statement, &it->second), __FILE__, __LINE__)); for(auto it(inQuerydata.mDoubleColumns.begin()); it != inQuerydata.mDoubleColumns.end(); it++) _success = (_success && checkIfSqlError(mColumns.find(it->first)->second->bind(statement, &it->second), __FILE__, __LINE__)); for(auto it(inQuerydata.mNullColumns.begin()); it != inQuerydata.mNullColumns.end(); it++) _success = (_success && checkIfSqlError(mColumns.find(*it)->second->bind(statement, nullptr), __FILE__, __LINE__)); sqlite3_step(statement); outColumnsRemoved = sqlite3_changes(db); sqlite3_finalize(statement); return _success; }
bool Sqlite3Table::select(sqlite3 * db, QueryData & inQuerydata, std::vector<QueryData> & outQuerydata) const { // exit_on_error(sqlite3_step(statement), __LINE__); std::vector<std::string> all_columns (extract_column_names(inQuerydata)); std::stringstream ss; ss << "SELECT * FROM " + mName; if(!all_columns.empty()) { ss << + " WHERE "; } bool first_add(true); for(auto it(all_columns.begin()); it != all_columns.end(); it++) { if(!first_add) { ss << " AND "; first_add = false; } ss << *it << " = @" << *it; } ss << ";"; // Prepare the statement sqlite3_stmt * statement; bool _success(true); _success = (_success && checkIfSqlError(sqlite3_prepare_v2(db, ss.str().c_str(),-1/*null-terminated*/,&statement,NULL), __FILE__, __LINE__)); // PERFORM BINDING for(auto it(inQuerydata.mStringColumns.begin()); it != inQuerydata.mStringColumns.end(); it++) _success = (_success && checkIfSqlError(mColumns.find(it->first)->second->bind(statement, &it->second), __FILE__, __LINE__)); for(auto it(inQuerydata.mIntColumns.begin()); it != inQuerydata.mIntColumns.end(); it++) _success = (_success && checkIfSqlError(mColumns.find(it->first)->second->bind(statement, &it->second), __FILE__, __LINE__)); for(auto it(inQuerydata.mDoubleColumns.begin()); it != inQuerydata.mDoubleColumns.end(); it++) _success = (_success && checkIfSqlError(mColumns.find(it->first)->second->bind(statement, &it->second), __FILE__, __LINE__)); for(auto it(inQuerydata.mNullColumns.begin()); it != inQuerydata.mNullColumns.end(); it++) _success = (_success && checkIfSqlError(mColumns.find(*it)->second->bind(statement, nullptr), __FILE__, __LINE__)); // Fill return data int _columnCount(sqlite3_column_count(statement)); while(sqlite3_step(statement) == SQLITE_ROW) { QueryData row_result; for(int c (0); c < _columnCount; c++) { std::string column_name(sqlite3_column_name(statement, c)); Sqlite3Column * column(mColumns.find(column_name)->second.get()); Sqlite3Type column_type(column->getType()); if(column_type == 
Sqlite3Null::_NAME) { row_result.mNullColumns.push_back(column_name); } else if(column_type == Sqlite3Integer::_NAME) { int * v = new int; column->value(statement, c, v); row_result.mIntColumns.emplace(column_name, *v); delete v; } else if(column_type == Sqlite3Real::_NAME) { double * v = new double; column->value(statement, c, v); row_result.mDoubleColumns.emplace(column_name, *v); delete v; } else if(column_type == Sqlite3Text::_NAME) { std::string * v = new std::string; column->value(statement, c, v); row_result.mStringColumns.emplace(column_name, *v); delete v; } else { _success = false; std::cerr << "Unimplemented type!" << std::endl; } } outQuerydata.push_back(row_result); } sqlite3_finalize(statement); return _success; }
// Groups the rows of `source` by the `keys` columns and evaluates one
// aggregator per entry of `groups`, writing the result to a new sframe.
//
// source              : input frame.
// keys                : names of the columns to group by.
// output_column_names : one name per group; an empty string requests an
//                       automatically generated name.
// groups              : for each output column, the input column names and
//                       the aggregator applied to them.
// max_buffer_size     : forwarded to the group_aggregate_container.
//
// Returns the output sframe after it has been closed. Its columns are the key
// columns (in std::set order, i.e. sorted by name) followed by one column per
// group. Invalid arguments are reported through log_and_throw.
sframe groupby_aggregate(const sframe& source,
                         const std::vector<std::string>& keys,
                         const std::vector<std::string>& output_column_names,
                         const std::vector<std::pair<std::vector<std::string>,
                                                     std::shared_ptr<group_aggregate_value>>>& groups,
                         size_t max_buffer_size) {
  // first, sanity checks
  if (output_column_names.size() != groups.size()) {
    log_and_throw("There must be as many output columns as there are groups");
  }
  {
    // check that output column names are all unique, and do not intersect with
    // keys. Since empty values will be automatically assigned, we will skip
    // those.
    std::set<std::string> all_output_columns(keys.begin(), keys.end());
    size_t named_column_count = 0;
    for (const auto& s: output_column_names) {
      if (!s.empty()) {
        all_output_columns.insert(s);
        ++named_column_count;
      }
    }
    if (all_output_columns.size() != keys.size() + named_column_count) {
      log_and_throw("Output columns names are not unique");
    }
  }

  // check that group keys exist
  for (const auto& key: keys) {
    if (!source.contains_column(key)) {
      log_and_throw("SFrame does not contain column " + key);
    }
  }

  // check that each group is valid
  for (const auto& group: groups) {
    if (group.first.size() > 0) {
      for (size_t index = 0; index < group.first.size(); index++) {
        auto& col_name = group.first[index];
        // check that the column name is valid
        if (!source.contains_column(col_name)) {
          log_and_throw("SFrame does not contain column " + col_name);
        }

        // for aggregators listed in registered_arg_functions only the first
        // column is type-checked against the aggregator
        if (graphlab::registered_arg_functions.count(group.second->name()) != 0 && index > 0)
          continue;
        // check that the types are valid
        size_t column_number = source.column_index(col_name);
        if (!group.second->support_type(source.column_type(column_number))) {
          log_and_throw("Requested operation: " + group.second->name() +
                        " not supported on the type of column " + col_name);
        }
      }
    }
  }

  // key should not have repeated columns
  std::set<std::string> key_columns;
  std::set<std::string> group_columns;
  for (const auto& key: keys) key_columns.insert(key);
  for (const auto& group: groups) {
    for (auto& col_name : group.first) {
      group_columns.insert(col_name);
    }
  }
  // NOTE: repeated keys already fail the uniqueness check near the top (with
  // the "Output columns names are not unique" message), so this guard is not
  // expected to fire; it is kept as a safety net.
  if (key_columns.size() != keys.size()) {
    log_and_throw("Group by key cannot have repeated column names");
  }

  // ok. select out just the columns I care about
  // begin with the key columns
  std::vector<std::string> all_columns(key_columns.begin(), key_columns.end());
  // then all the group columns (as long as they are not also key columns)
  for (const auto& group_column: group_columns) {
    if (group_column != "" && key_columns.count(group_column) == 0) {
      all_columns.push_back(group_column);
    }
  }
  sframe frame_with_relevant_cols = source.select_columns(all_columns);

  // prepare the output frame
  sframe output;
  std::vector<std::string> column_names;
  std::vector<flex_type_enum> column_types;
  // output frame has the key column name and types
  for (const auto& key: key_columns) {
    column_names.push_back(key);
    column_types.push_back(source.column_type(source.column_index(key)));
  }

  // A generated name is unusable if it equals a column emitted so far OR a
  // name explicitly requested for any group. Checking only the emitted
  // columns would let a generated name collide with an explicit name that
  // belongs to a later group.
  const auto name_taken = [&](const std::string& name) -> bool {
    return std::find(column_names.begin(), column_names.end(), name) != column_names.end() ||
           std::find(output_column_names.begin(), output_column_names.end(), name) != output_column_names.end();
  };

  // then for each group, make a unique name and determine the output group type
  for (size_t i = 0; i < groups.size(); ++i) {
    const auto& group = groups[i];
    std::string candidate_name = output_column_names[i];
    if (candidate_name.empty()) {
      std::string root_candidate_name;
      if (graphlab::registered_arg_functions.count(group.second->name()) == 0) {
        // "<aggregator> of <col0>_<col1>_..."
        for (auto& col_name: group.first) {
          if (root_candidate_name.empty()) {
            root_candidate_name += " of " + col_name;
          } else {
            root_candidate_name += "_" + col_name;
          }
        }
        root_candidate_name = group.second->name() + root_candidate_name;
      } else {
        // "<col1> for <aggregator> of <col0>"
        if (group.first.size() != 2)
          log_and_throw("arg functions takes exactly two arguments");
        root_candidate_name += group.first[1] + " for " + group.second->name() + " of " + group.first[0];
      }
      candidate_name = root_candidate_name;
      size_t ctr = 1;
      // keep trying to come up with a unique column name
      while (name_taken(candidate_name)) {
        candidate_name = root_candidate_name + "." + std::to_string(ctr);
        ++ctr;
      }
    }
    column_names.push_back(candidate_name);

    std::vector<flex_type_enum> input_types;
    for (const auto& col_name : group.first) {
      input_types.push_back(source.column_type(source.column_index(col_name)));
    }
    // this statement is valid for argmax and argmin as well, because their
    // set_input_types(...) simply return input_types.
    auto output_type = group.second->set_input_types(input_types);
    column_types.push_back(output_type);
  }

  // done! now we can start on the groupby
  size_t nsegments = frame_with_relevant_cols.num_segments();
  // either nsegments, or n*log n buckets
  nsegments = std::max(nsegments, thread::cpu_count() * std::max<size_t>(1, log2(thread::cpu_count())));

  output.open_for_write(column_names, column_types, "", nsegments);

  groupby_aggregate_impl::group_aggregate_container container(max_buffer_size, nsegments);

  // ok the input sframe (frame_with_relevant_cols) contains all the values
  // we care about. However, the challenge here is to figure out how the keys
  // and values line up. By construction, all the key columns come first.
  // which is good. But group columns can be pretty much anywhere.
  size_t num_keys = keys.size();
  for (const auto& group: groups) {
    std::vector<size_t> column_numbers;
    for (auto& col_name : group.first) {
      column_numbers.push_back(frame_with_relevant_cols.column_index(col_name));
    }
    container.define_group(column_numbers, group.second);
  }

  // done. now we can begin parallel processing
  // shuffle the rows based on the value of the key column.
  auto input_reader = frame_with_relevant_cols.get_reader(thread::cpu_count());
  graphlab::timer ti;
  logstream(LOG_INFO) << "Filling group container: " << std::endl;
  parallel_for (0, input_reader->num_segments(),
                [&](size_t i) {
                  auto iter = input_reader->begin(i);
                  auto enditer = input_reader->end(i);
                  while (iter != enditer) {
                    auto& row = *iter;
                    container.add(row, num_keys);
                    ++iter;
                  }
                });

  logstream(LOG_INFO) << "Group container filled in " << ti.current_time() << std::endl;
  logstream(LOG_INFO) << "Writing output: " << std::endl;
  ti.start();
  container.group_and_write(output);
  logstream(LOG_INFO) << "Output written in: " << ti.current_time() << std::endl;
  output.close();
  return output;
}