/** Builds the next output block by repeatedly taking the top cursor of `queue`
  * and appending the row it points at to freshly cloned empty columns of blocks[0].
  *
  * Returns as soon as one of these happens:
  *  - `limit` is non-zero and `total_merged_rows` reaches it (in that case `blocks` is cleared);
  *  - `max_merged_block_size` rows were appended during this call;
  *  - the queue runs empty (an empty Block is returned if nothing was appended).
  */
Block MergeSortingBlocksBlockInputStream::mergeImpl(std::priority_queue<TSortCursor> & queue)
{
    const size_t column_count = blocks[0].columns();
    MutableColumns merged_columns = blocks[0].cloneEmptyColumns();
    /// TODO: reserve (in each column)

    /// Wraps the accumulated columns into a block with the structure of blocks[0].
    auto make_result = [&]() { return blocks[0].cloneWithColumns(std::move(merged_columns)); };

    /// Number of rows appended during this call.
    size_t rows_in_result = 0;

    while (!queue.empty())
    {
        TSortCursor cursor = queue.top();
        queue.pop();

        /// Copy the row under the cursor into the result columns.
        for (size_t col = 0; col != column_count; ++col)
            merged_columns[col]->insertFrom(*cursor->all_columns[col], cursor->pos);

        /// A cursor that still has rows goes back into the queue at its next position.
        if (!cursor->isLast())
        {
            cursor->next();
            queue.push(cursor);
        }

        ++total_merged_rows;
        if (limit && total_merged_rows == limit)
        {
            /// The result must be built before `blocks` is cleared, because it is cloned from blocks[0].
            auto result = make_result();
            blocks.clear();
            return result;
        }

        if (++rows_in_result == max_merged_block_size)
            return make_result();
    }

    if (rows_in_result == 0)
        return {};

    return make_result();
}
void MergingSortedBlockInputStream::merge(Block & merged_block, ColumnPlainPtrs & merged_columns, std::priority_queue<TSortCursor> & queue) { size_t merged_rows = 0; /** Увеличить счётчики строк. * Вернуть true, если пора закончить формировать текущий блок данных. */ auto count_row_and_check_limit = [&, this]() { ++total_merged_rows; if (limit && total_merged_rows == limit) { // std::cerr << "Limit reached\n"; cancel(); finished = true; return true; } ++merged_rows; if (merged_rows == max_block_size) { // std::cerr << "max_block_size reached\n"; return true; } return false; }; /// Вынимаем строки в нужном порядке и кладём в merged_block, пока строк не больше max_block_size while (!queue.empty()) { TSortCursor current = queue.top(); queue.pop(); while (true) { /** А вдруг для текущего курсора блок целиком меньше или равен, чем остальные? * Или в очереди остался только один источник данных? Тогда можно целиком взять блок текущего курсора. */ if (current.impl->isFirst() && (queue.empty() || current.totallyLessOrEquals(queue.top()))) { // std::cerr << "current block is totally less or equals\n"; /// Если в текущем блоке уже есть данные, то сначала вернём его. Мы попадём сюда снова при следующем вызове функции merge. 
if (merged_rows != 0) { // std::cerr << "merged rows is non-zero\n"; queue.push(current); return; } size_t source_num = 0; size_t size = cursors.size(); for (; source_num < size; ++source_num) if (&cursors[source_num] == current.impl) break; if (source_num == size) throw Exception("Logical error in MergingSortedBlockInputStream", ErrorCodes::LOGICAL_ERROR); for (size_t i = 0; i < num_columns; ++i) merged_block.unsafeGetByPosition(i).column = source_blocks[source_num]->unsafeGetByPosition(i).column; // std::cerr << "copied columns\n"; size_t merged_rows = merged_block.rows(); if (limit && total_merged_rows + merged_rows > limit) { merged_rows = limit - total_merged_rows; for (size_t i = 0; i < num_columns; ++i) { auto & column = merged_block.unsafeGetByPosition(i).column; column = column->cut(0, merged_rows); } cancel(); finished = true; } // std::cerr << "fetching next block\n"; total_merged_rows += merged_rows; fetchNextBlock(current, queue); return; } // std::cerr << "total_merged_rows: " << total_merged_rows << ", merged_rows: " << merged_rows << "\n"; // std::cerr << "Inserting row\n"; for (size_t i = 0; i < num_columns; ++i) merged_columns[i]->insertFrom(*current->all_columns[i], current->pos); if (!current->isLast()) { // std::cerr << "moving to next row\n"; current->next(); if (queue.empty() || !(current.greater(queue.top()))) { if (count_row_and_check_limit()) { // std::cerr << "pushing back to queue\n"; queue.push(current); return; } /// Не кладём курсор обратно в очередь, а продолжаем работать с текущим курсором. // std::cerr << "current is still on top, using current row\n"; continue; } else { // std::cerr << "next row is not least, pushing back to queue\n"; queue.push(current); } } else { /// Достаём из соответствующего источника следующий блок, если есть. // std::cerr << "It was last row, fetching next block\n"; fetchNextBlock(current, queue); } break; } if (count_row_and_check_limit()) return; } cancel(); finished = true; }
void AggregatingSortedBlockInputStream::merge(ColumnPlainPtrs & merged_columns, std::priority_queue<TSortCursor> & queue) { size_t merged_rows = 0; /// Вынимаем строки в нужном порядке и кладём в merged_block, пока строк не больше max_block_size while (!queue.empty()) { TSortCursor current = queue.top(); setPrimaryKeyRef(next_key, current); bool key_differs; if (current_key.empty()) /// Первый встретившийся ключ. { current_key.columns.resize(description.size()); setPrimaryKeyRef(current_key, current); key_differs = true; } else key_differs = next_key != current_key; /// если накопилось достаточно строк и последняя посчитана полностью if (key_differs && merged_rows >= max_block_size) return; queue.pop(); if (key_differs) { current_key.swap(next_key); /// Запишем данные для очередной группы. Копируем значения обычных столбцов. for (size_t i = 0, size = column_numbers_not_to_aggregate.size(); i < size; ++i) { size_t j = column_numbers_not_to_aggregate[i]; merged_columns[j]->insertFrom(*current->all_columns[j], current->pos); } /// Добавляем в агрегатные столбцы пустое состояние агрегации. Состояние будет обновлено в функции addRow. for (auto & column_to_aggregate : columns_to_aggregate) column_to_aggregate->insertDefault(); ++merged_rows; } addRow(current); if (!current->isLast()) { current->next(); queue.push(current); } else { /// Достаём из соответствующего источника следующий блок, если есть. fetchNextBlock(current, queue); } } finished = true; }
void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::priority_queue<TSortCursor> & queue) { size_t merged_rows = 0; /** Increase row counters. * Return true if it's time to finish generating the current data block. */ auto count_row_and_check_limit = [&, this]() { ++total_merged_rows; if (limit && total_merged_rows == limit) { // std::cerr << "Limit reached\n"; cancel(false); finished = true; return true; } ++merged_rows; if (merged_rows == max_block_size) { // std::cerr << "max_block_size reached\n"; return true; } return false; }; /// Take rows in required order and put them into `merged_columns`, while the rows are no more than `max_block_size` while (!queue.empty()) { TSortCursor current = queue.top(); queue.pop(); while (true) { /** And what if the block is totally less or equal than the rest for the current cursor? * Or is there only one data source left in the queue? Then you can take the entire block on current cursor. */ if (current.impl->isFirst() && (queue.empty() || current.totallyLessOrEquals(queue.top()))) { // std::cerr << "current block is totally less or equals\n"; /// If there are already data in the current block, we first return it. We'll get here again the next time we call the merge function. if (merged_rows != 0) { // std::cerr << "merged rows is non-zero\n"; queue.push(current); return; } /// Actually, current.impl->order stores source number (i.e. 
cursors[current.impl->order] == current.impl) size_t source_num = current.impl->order; if (source_num >= cursors.size()) throw Exception("Logical error in MergingSortedBlockInputStream", ErrorCodes::LOGICAL_ERROR); for (size_t i = 0; i < num_columns; ++i) merged_columns[i] = (*std::move(source_blocks[source_num]->getByPosition(i).column)).mutate(); // std::cerr << "copied columns\n"; merged_rows = merged_columns.at(0)->size(); if (limit && total_merged_rows + merged_rows > limit) { merged_rows = limit - total_merged_rows; for (size_t i = 0; i < num_columns; ++i) { auto & column = merged_columns[i]; column = (*column->cut(0, merged_rows)).mutate(); } cancel(false); finished = true; } if (out_row_sources_buf) { RowSourcePart row_source(source_num); for (size_t i = 0; i < merged_rows; ++i) out_row_sources_buf->write(row_source.data); } // std::cerr << "fetching next block\n"; total_merged_rows += merged_rows; fetchNextBlock(current, queue); return; } // std::cerr << "total_merged_rows: " << total_merged_rows << ", merged_rows: " << merged_rows << "\n"; // std::cerr << "Inserting row\n"; for (size_t i = 0; i < num_columns; ++i) merged_columns[i]->insertFrom(*current->all_columns[i], current->pos); if (out_row_sources_buf) { /// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl) RowSourcePart row_source(current.impl->order); out_row_sources_buf->write(row_source.data); } if (!current->isLast()) { // std::cerr << "moving to next row\n"; current->next(); if (queue.empty() || !(current.greater(queue.top()))) { if (count_row_and_check_limit()) { // std::cerr << "pushing back to queue\n"; queue.push(current); return; } /// Do not put the cursor back in the queue, but continue to work with the current cursor. 
// std::cerr << "current is still on top, using current row\n"; continue; } else { // std::cerr << "next row is not least, pushing back to queue\n"; queue.push(current); } } else { /// We get the next block from the corresponding source, if there is one. // std::cerr << "It was last row, fetching next block\n"; fetchNextBlock(current, queue); } break; } if (count_row_and_check_limit()) return; } cancel(false); finished = true; }