void SequentialHeapMerger::mergeValues(const std::vector<c_atable_ptr_t>& input_tables,
                                       size_t source_column_index,
                                       atable_ptr_t merged_table,
                                       size_t destination_column_index,
                                       value_id_mapping_t& value_id_mapping,
                                       bool useValid,
                                       const std::vector<bool>& valid) {
  std::vector<AbstractTable::SharedDictionaryPtr> value_id_maps;
  AbstractTable::SharedDictionaryPtr new_dict;

  // Shortcut for the input dictionaries
  value_id_maps.reserve(input_tables.size());
  for (size_t table = 0; table < input_tables.size(); table++) {
    if (!types::isCompatible(merged_table->metadataAt(destination_column_index).getType(),
                             input_tables[table]->metadataAt(source_column_index).getType())) {
      throw std::runtime_error("Dictionary types don't match");
    }
    auto dict = std::dynamic_pointer_cast<BaseDictionary<T>>(input_tables[table]->dictionaryAt(source_column_index));
    value_id_maps.push_back(dict);
  }

  // Create the new BaseDictionary - shrink when the merge has finished?
  new_dict = createNewDict<T>(input_tables, value_id_maps, value_id_mapping, source_column_index, useValid, valid);

  // Set the new dictionary for the merged column
  merged_table->setDictionaryAt(new_dict, destination_column_index);
}
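createNewDict<T> is not shown here. As a self-contained illustration of what it has to produce, the following sketch (hypothetical names, standard-library types only) merges sorted, duplicate-free dictionaries and records one old-id-to-new-id vector per input table, which is the role of the value_id_mapping consumed later in copyValues:

// Hypothetical illustration of the dictionary merge that createNewDict<T>
// performs: k-way merge of sorted inputs, deduplicate, and record an
// old-id -> new-id mapping per input table.
#include <cstddef>
#include <string>
#include <vector>

using value_id = size_t;

std::vector<std::string> mergeDicts(const std::vector<std::vector<std::string>>& dicts,
                                    std::vector<std::vector<value_id>>& mapping) {
  std::vector<std::string> merged;
  mapping.assign(dicts.size(), {});
  std::vector<size_t> pos(dicts.size(), 0);  // cursor per input dictionary
  while (true) {
    // Find the smallest value among all cursors (the "heap" step).
    const std::string* min = nullptr;
    for (size_t t = 0; t < dicts.size(); ++t)
      if (pos[t] < dicts[t].size() && (!min || dicts[t][pos[t]] < *min))
        min = &dicts[t][pos[t]];
    if (!min) break;  // all inputs exhausted
    merged.push_back(*min);
    // Advance every cursor that points at this value; record its new id.
    for (size_t t = 0; t < dicts.size(); ++t)
      if (pos[t] < dicts[t].size() && dicts[t][pos[t]] == merged.back()) {
        mapping[t].push_back(merged.size() - 1);
        ++pos[t];
      }
  }
  return merged;
}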
void SimpleTableDump::dumpHeader(std::string name, atable_ptr_t table) {
  std::stringstream header;
  std::vector<std::string> names, types;
  std::vector<uint32_t> parts;

  // Get names and types
  for (size_t i = 0; i < table->columnCount(); ++i) {
    names.push_back(table->nameOfColumn(i));
    types.push_back(data_type_to_string(table->typeOfColumn(i)));
  }

  // This calculation will break if the width of the value_id changes,
  // or if someone forgets to update the width accordingly in the
  // constructor of the table.
  for (size_t i = 0; i < table->partitionCount(); ++i) {
    parts.push_back(table->partitionWidth(i));
  }

  // Dump and join
  header << std::accumulate(names.begin(), names.end(), std::string(), infix(" | ")) << "\n";
  header << std::accumulate(types.begin(), types.end(), std::string(), infix(" | ")) << "\n";

  std::vector<std::string> allParts;
  for (size_t i = 0; i < parts.size(); ++i) {
    auto p = parts[i];
    auto tmp = std::vector<std::string>(p, std::to_string(i) + "_R");
    allParts.insert(allParts.end(), tmp.begin(), tmp.end());
  }
  header << std::accumulate(allParts.begin(), allParts.end(), std::string(), infix(" | ")) << "\n";
  header << "===";

  std::string fullPath = _baseDirectory + "/" + name + "/header.dat";
  std::ofstream data(fullPath, std::ios::out | std::ios::binary);
  data << header.str();
  data.close();
}
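As a worked example of the format this emits: for a hypothetical two-column table (columns id and amount, one partition of width 2), and assuming data_type_to_string renders the types as INTEGER and FLOAT, the resulting header.dat would look like:

  id | amount
  INTEGER | FLOAT
  0_R | 0_R
  ===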
value_type operator()() {
  auto d = std::dynamic_pointer_cast<OrderPreservingDictionary<R>>(_dict);
  size_t tabSize = _main->size();
  // The merged table already contains room for the delta rows; the delta
  // values start right after the original main rows.
  size_t start = _main->size() - _delta->size();
  for (size_t row = start; row < tabSize; ++row) {
    _main->setValueId(_dstCol, row,
                      ValueId{d->getValueIdForValue(_delta->getValue<R>(_col, row - start)), 0});
  }
}
Store::Store(atable_ptr_t main_table)
    : _delta_size(0),
      _main_table(main_table),
      delta(main_table->copy_structure(create_concurrent_dict, create_concurrent_storage)),
      merger(createDefaultMerger()),
      _cidBeginVector(main_table->size(), 0),
      _cidEndVector(main_table->size(), tx::INF_CID),
      _tidVector(main_table->size(), tx::UNKNOWN) {
  setUuid();
}
void RawTable::appendRows(const atable_ptr_t& rows) {
  type_switch<hyrise_basic_types> ts;
  for (size_t row = 0; row < rows->size(); ++row) {
    rawtable::RowHelper rh(_metadata);
    for (size_t column = 0; column < _metadata.size(); ++column) {
      type_func tf(rows, rh, column, row);
      ts(rows->typeOfColumn(column), tf);
    }
    std::unique_ptr<byte, void (*)(void*)> data(rh.build(), &std::free);
    appendRow(data.get());
  }
}
void SimpleStoreMerger::mergeValues(const std::vector<c_atable_ptr_t>& input_tables,
                                    atable_ptr_t merged_table,
                                    const column_mapping_t& column_mapping,
                                    const uint64_t newSize,
                                    bool useValid,
                                    const std::vector<bool>& valid) {
  if (useValid)
    throw std::runtime_error("SimpleStoreMerger does not support valid vectors");
  if (input_tables.size() != 2)
    throw std::runtime_error("SimpleStoreMerger does not support more than two tables");

  auto delta = std::dynamic_pointer_cast<const RawTable>(input_tables[1]);
  auto main = input_tables[0];

  // Prepare type handling
  MergeDictFunctor fun;
  type_switch<hyrise_basic_types> ts;
  std::vector<MergeDictFunctor::result> mergedDictionaries(column_mapping.size());

  // Extract the unique values of the delta
  for (const auto& kv : column_mapping) {
    const auto& col = kv.first;
    const auto& dst = kv.second;
    fun.prepare(main, delta, col);
    auto result = ts(main->typeOfColumn(col), fun);
    merged_table->setDictionaryAt(result.dict, dst);
    mergedDictionaries[col] = result;
  }

  // Update the values of the new table
  merged_table->resize(newSize);
  size_t tabSize = main->size();
  for (size_t row = 0; row < tabSize; ++row) {
    for (const auto& kv : column_mapping) {
      const auto& col = kv.first;
      const auto& dst = kv.second;
      merged_table->setValueId(dst, row,
                               ValueId{mergedDictionaries[col].mapping[main->getValueId(col, row).valueId], 0});
    }
  }

  // Map the values for the rows in the uncompressed delta
  MapValueForValueId map;
  for (const auto& kv : column_mapping) {
    const auto& col = kv.first;
    const auto& dst = kv.second;
    map.prepare(merged_table, dst, mergedDictionaries[col].dict, col, delta);
    ts(merged_table->typeOfColumn(dst), map);
  }
}
void SequentialHeapMerger::mergeValues(const std::vector<c_atable_ptr_t>& input_tables,
                                       atable_ptr_t merged_table,
                                       const column_mapping_t& column_mapping,
                                       const uint64_t newSize,
                                       bool useValid,
                                       const std::vector<bool>& valid) {
  // if (input_tables.size() != 2)
  //   throw std::runtime_error("Merging more than 2 tables is not supported with this merger...");
  std::vector<value_id_mapping_t> mappingPerAttribute(input_tables[0]->columnCount());

  for (const auto& kv : column_mapping) {
    const auto& source = kv.first;
    const auto& destination = kv.second;
    switch (merged_table->metadataAt(destination).getType()) {
      case IntegerType:
      case IntegerTypeDelta:
      case IntegerTypeDeltaConcurrent:
        mergeValues<hyrise_int_t>(input_tables, source, merged_table, destination,
                                  mappingPerAttribute[source], useValid, valid);
        break;
      case FloatType:
      case FloatTypeDelta:
      case FloatTypeDeltaConcurrent:
        mergeValues<hyrise_float_t>(input_tables, source, merged_table, destination,
                                    mappingPerAttribute[source], useValid, valid);
        break;
      case StringType:
      case StringTypeDelta:
      case StringTypeDeltaConcurrent:
        mergeValues<hyrise_string_t>(input_tables, source, merged_table, destination,
                                     mappingPerAttribute[source], useValid, valid);
        break;
      case IntegerNoDictType:
      case FloatNoDictType:
        merged_table->setDictionaryAt(makeDictionary(merged_table->typeOfColumn(destination)), destination);
        break;
      default:
        break;
    }
  }

  merged_table->resize(newSize);

  // Only after the dictionaries are merged, copy the values
  for (const auto& kv : column_mapping) {
    const auto& source = kv.first;
    const auto& destination = kv.second;
    // Copy the actual values and apply the mapping
    copyValues(input_tables, source, merged_table, destination, mappingPerAttribute[source], useValid, valid);
  }
}
void SequentialHeapMerger::copyValues(const std::vector<c_atable_ptr_t>& input_tables,
                                      size_t source_column_index,
                                      atable_ptr_t& merged_table,
                                      size_t destination_column_index,
                                      std::vector<std::vector<value_id_t>>& value_id_mapping,
                                      bool useValid,
                                      const std::vector<bool>& valid) {
  ValueId value_id;
  // Copy all value ids to the new doc vector and apply the value id mapping.
  size_t merged_table_row = 0;

  // Only apply the mapping if we have one; for non-dict columns we just copy
  // the "value ids". We use almost identical source code here to avoid the
  // additional branch in the inner loop. Not pretty, but it works.
  if (value_id_mapping.size() > 0) {
    size_t part_counter = 0;
    for (size_t table = 0; table < input_tables.size(); table++) {
      for (size_t row = 0; row < input_tables[table]->size(); row++) {
        if (!useValid || valid[part_counter + row]) {
          value_id.valueId = input_tables[table]->getValueId(source_column_index, row).valueId;
          // Translate the value id into the new dictionary
          value_id.valueId = value_id_mapping[table][value_id.valueId];
          merged_table->setValueId(destination_column_index, merged_table_row, value_id);
          merged_table_row++;
        }
      }
      part_counter += input_tables[table]->size();
    }
  } else {  // non-dict columns
    size_t part_counter = 0;
    for (size_t table = 0; table < input_tables.size(); table++) {
      for (size_t row = 0; row < input_tables[table]->size(); row++) {
        if (!useValid || valid[part_counter + row]) {
          value_id.valueId = input_tables[table]->getValueId(source_column_index, row).valueId;
          merged_table->setValueId(destination_column_index, merged_table_row, value_id);
          merged_table_row++;
        }
      }
      part_counter += input_tables[table]->size();
    }
  }
}
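A compact, self-contained illustration of the remapping step above (the function name is hypothetical; value ids and the valid vector are plain standard-library containers here):

// Hypothetical illustration of copyValues: concatenate the value ids of the
// input tables, drop rows whose `valid` bit is false, and translate each id
// through the per-table mapping produced by the dictionary merge.
#include <cstddef>
#include <vector>

std::vector<size_t> copyValueIds(const std::vector<std::vector<size_t>>& table_ids,
                                 const std::vector<std::vector<size_t>>& mapping,
                                 const std::vector<bool>& valid) {
  std::vector<size_t> merged;
  size_t part_counter = 0;  // offset of the current table within `valid`
  for (size_t t = 0; t < table_ids.size(); ++t) {
    for (size_t row = 0; row < table_ids[t].size(); ++row)
      if (valid[part_counter + row])
        merged.push_back(mapping[t][table_ids[t][row]]);
    part_counter += table_ids[t].size();
  }
  return merged;
}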
inline void operator()() {
  auto dict = std::dynamic_pointer_cast<BaseDictionary<R>>(table->dictionaryAt(col));
  const R* ptr = (R*)(data + sizeof(size_t));
  size_t size = *((size_t*)data);  // the first sizeof(size_t) bytes store the dictionary size
  dict->reserve(size);
  for (size_t i = 0; i < size; ++i) {
    dict->addValue(*(ptr++));
  }
}
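The reader above implies a fixed-width layout: a size_t element count followed by the raw values. A sketch of a writer producing that layout (the function name and use of std::ofstream are illustrative, not the project's dump code):

// Hypothetical writer for the fixed-width dictionary layout consumed above:
// [size_t count][count raw values of type R].
#include <cstddef>
#include <fstream>
#include <string>
#include <vector>

template <typename R>
void writeFixedWidthDict(const std::string& path, const std::vector<R>& values) {
  std::ofstream out(path, std::ios::out | std::ios::binary);
  size_t count = values.size();
  out.write(reinterpret_cast<const char*>(&count), sizeof(size_t));
  out.write(reinterpret_cast<const char*>(values.data()), count * sizeof(R));
}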
inline void operator()() {
  auto dict = checked_pointer_cast<ConcurrentUnorderedDictionary<R>>(table->dictionaryAt(col));
  size_t size;
  data.read((char*)&size, sizeof(size_t));
  std::vector<R> values(size);
  data.read((char*)&values[0], size * sizeof(R));
  for (const auto value : values) {
    dict->addValue(value);
  }
}
void SimpleTableDump::dumpAttribute(std::string name, atable_ptr_t table, size_t col) {
  // This should never be called with a store directly, but with the main and
  // delta tables separately.
  assert(std::dynamic_pointer_cast<Store>(table) == nullptr);
  std::string fullPath = _baseDirectory + "/" + name + "/" + table->nameOfColumn(col) + ".attr.dat";
  std::ofstream data(fullPath, std::ios::out | std::ios::binary);
  // size_t tableSize = table->size(); // get the size up front, to avoid chasing updates
  auto tableSize = table->checkpointSize();
  std::vector<value_id_t> vidVector;
  vidVector.resize(tableSize);
  for (size_t i = 0; i < tableSize; ++i) {
    ValueId v;
    v = table->getValueId(col, i);
    vidVector[i] = v.valueId;
  }
  data.write((char*)&vidVector[0], tableSize * sizeof(value_id_t));
  data.close();
}
void SimpleTableDump::dumpDictionary(std::string name, atable_ptr_t table, size_t col, bool delta) {
  std::string fullPath = _baseDirectory + "/" + name + "/" + table->nameOfColumn(col) + ".dict.dat";
  std::ofstream data(fullPath, std::ios::out | std::ios::binary);
  if (!delta) {
    // Small hack: first we obtain the size of the dictionary, then we
    // virtually create all value ids. This can break if the dictionary does
    // not have contiguous value ids.
    // size_t dictionarySize = table->dictionaryAt(col)->size();
    write_to_stream_functor fun(data, table->dictionaryAt(col));  // will pick the main dictionary by default for stores
    type_switch<hyrise_basic_types> ts;
    ts(table->typeOfColumn(col), fun);
    // for (size_t i = 0; i < dictionarySize; ++i) {
    //   fun.setCol(col);
    //   fun.setVid(i);
    //   ts(table->typeOfColumn(col), fun);
    // }
  } else {
    write_to_stream_functor_delta_dict fun(data, table->dictionaryAt(col));  // will pick the main dictionary by default for stores
    type_switch<hyrise_basic_types> ts;
    ts(table->typeOfColumn(col), fun);
  }
  data.close();
}
template <>
void write_to_dict_functor_mmap::operator()<hyrise_string_t>() {
  auto dict = std::dynamic_pointer_cast<BaseDictionary<hyrise_string_t>>(table->dictionaryAt(col));
  size_t size = *((size_t*)data);  // the first sizeof(size_t) bytes store the dictionary size
  dict->reserve(size);
  // Each entry is stored as [size_t length][length bytes of string data].
  const size_t* sptr = (size_t*)(data + sizeof(size_t));
  const char* cptr = data + 2 * sizeof(size_t);
  size_t read;
  for (size_t i = 0; i < size; ++i) {
    std::string val(cptr, *sptr);
    dict->addValue(val);
    read = *sptr;
    sptr = (size_t*)(cptr + read);
    cptr = cptr + read + sizeof(size_t);
  }
}
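The pointer arithmetic above decodes the layout [size_t count]([size_t length][bytes])*. As a self-contained sanity check, here is a hypothetical serializer/deserializer pair for that layout using only the standard library:

// Hypothetical round-trip for the string-dictionary layout decoded above:
// [size_t count] followed by count entries of [size_t length][raw bytes].
#include <cstddef>
#include <cstring>
#include <string>
#include <vector>

std::string encodeStrings(const std::vector<std::string>& values) {
  std::string buf;
  size_t count = values.size();
  buf.append(reinterpret_cast<const char*>(&count), sizeof(size_t));
  for (const auto& v : values) {
    size_t len = v.size();
    buf.append(reinterpret_cast<const char*>(&len), sizeof(size_t));
    buf.append(v);
  }
  return buf;
}

std::vector<std::string> decodeStrings(const char* data) {
  size_t count;
  std::memcpy(&count, data, sizeof(size_t));
  const char* p = data + sizeof(size_t);
  std::vector<std::string> values;
  for (size_t i = 0; i < count; ++i) {
    size_t len;
    std::memcpy(&len, p, sizeof(size_t));
    p += sizeof(size_t);
    values.emplace_back(p, len);
    p += len;
  }
  return values;
}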
template <>
void write_to_delta_vector_functor::operator()<hyrise_string_t>() {
  auto dict = checked_pointer_cast<ConcurrentUnorderedDictionary<hyrise_string_t>>(table->dictionaryAt(col));
  size_t size;

  // Copy the whole file into a buffer first
  data.seekg(0, data.end);
  int length = data.tellg();
  data.seekg(0, data.beg);
  char* buffer = new char[length];
  data.read(buffer, length);
  char* position_in_buffer = buffer;

  // The file's format is (size_t)nr_of_entries, [(size_t)length_of_string, string]*
  memcpy(&size, position_in_buffer, sizeof(size_t));
  position_in_buffer += sizeof(size_t);

  for (size_t i = 0; i < size; ++i) {
    size_t s;
    memcpy(&s, position_in_buffer, sizeof(size_t));
    position_in_buffer += sizeof(size_t);
    std::string tmp(s, '\0');
    memcpy(&tmp[0], position_in_buffer, s);
    position_in_buffer += s;
    dict->addValue(tmp);
  }

  // Check completeness before freeing the buffer; comparing against a freed
  // pointer would be undefined behavior.
  const bool read_complete = (position_in_buffer == buffer + length);
  delete[] buffer;
  if (!read_complete) {
    throw std::runtime_error("Warning, did not read whole file.");
  }

  // Equivalent on a regular file object, without the buffer:
  // for (size_t i = 0; i < size; ++i) {
  //   size_t s;
  //   data.read((char*)&s, sizeof(size_t));
  //   std::string tmp(s, '\0');
  //   data.read(&tmp[0], s);
  //   dict->addValue(tmp);
  // }
}
column_mapping_t identityMap(atable_ptr_t input) {
  column_mapping_t map;
  for (size_t column_index = 0; column_index < input->columnCount(); ++column_index)
    map[column_index] = column_index;
  return map;
}
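A minimal usage sketch, assuming a default-constructible merger, a main and delta table, and a merged_table already created with the combined structure (all three tables are hypothetical here; only the identityMap call and the mergeValues signature are taken from above):

// Hypothetical call site: merge two parts column-for-column.
SequentialHeapMerger merger;
std::vector<c_atable_ptr_t> inputs = {main_table, delta_table};
merger.mergeValues(inputs, merged_table, identityMap(main_table),
                   /*newSize=*/main_table->size() + delta_table->size(),
                   /*useValid=*/false, /*valid=*/{});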
void SimpleTableDump::dumpMetaData(std::string name, atable_ptr_t table) {
  std::string fullPath = _baseDirectory + "/" + name + "/metadata.dat";
  std::ofstream data(fullPath, std::ios::out | std::ios::binary);
  data << table->checkpointSize();
  data.close();
}