void MergingSortedBlockInputStream::merge(Block & merged_block, ColumnPlainPtrs & merged_columns, std::priority_queue<TSortCursor> & queue) { size_t merged_rows = 0; /** Увеличить счётчики строк. * Вернуть true, если пора закончить формировать текущий блок данных. */ auto count_row_and_check_limit = [&, this]() { ++total_merged_rows; if (limit && total_merged_rows == limit) { // std::cerr << "Limit reached\n"; cancel(); finished = true; return true; } ++merged_rows; if (merged_rows == max_block_size) { // std::cerr << "max_block_size reached\n"; return true; } return false; }; /// Вынимаем строки в нужном порядке и кладём в merged_block, пока строк не больше max_block_size while (!queue.empty()) { TSortCursor current = queue.top(); queue.pop(); while (true) { /** А вдруг для текущего курсора блок целиком меньше или равен, чем остальные? * Или в очереди остался только один источник данных? Тогда можно целиком взять блок текущего курсора. */ if (current.impl->isFirst() && (queue.empty() || current.totallyLessOrEquals(queue.top()))) { // std::cerr << "current block is totally less or equals\n"; /// Если в текущем блоке уже есть данные, то сначала вернём его. Мы попадём сюда снова при следующем вызове функции merge. if (merged_rows != 0) { // std::cerr << "merged rows is non-zero\n"; queue.push(current); return; } size_t source_num = 0; size_t size = cursors.size(); for (; source_num < size; ++source_num) if (&cursors[source_num] == current.impl) break; if (source_num == size) throw Exception("Logical error in MergingSortedBlockInputStream", ErrorCodes::LOGICAL_ERROR); for (size_t i = 0; i < num_columns; ++i) merged_block.unsafeGetByPosition(i).column = source_blocks[source_num]->unsafeGetByPosition(i).column; // std::cerr << "copied columns\n"; size_t merged_rows = merged_block.rows(); if (limit && total_merged_rows + merged_rows > limit) { merged_rows = limit - total_merged_rows; for (size_t i = 0; i < num_columns; ++i) { auto & column = merged_block.unsafeGetByPosition(i).column; column = column->cut(0, merged_rows); } cancel(); finished = true; } // std::cerr << "fetching next block\n"; total_merged_rows += merged_rows; fetchNextBlock(current, queue); return; } // std::cerr << "total_merged_rows: " << total_merged_rows << ", merged_rows: " << merged_rows << "\n"; // std::cerr << "Inserting row\n"; for (size_t i = 0; i < num_columns; ++i) merged_columns[i]->insertFrom(*current->all_columns[i], current->pos); if (!current->isLast()) { // std::cerr << "moving to next row\n"; current->next(); if (queue.empty() || !(current.greater(queue.top()))) { if (count_row_and_check_limit()) { // std::cerr << "pushing back to queue\n"; queue.push(current); return; } /// Не кладём курсор обратно в очередь, а продолжаем работать с текущим курсором. // std::cerr << "current is still on top, using current row\n"; continue; } else { // std::cerr << "next row is not least, pushing back to queue\n"; queue.push(current); } } else { /// Достаём из соответствующего источника следующий блок, если есть. // std::cerr << "It was last row, fetching next block\n"; fetchNextBlock(current, queue); } break; } if (count_row_and_check_limit()) return; } cancel(); finished = true; }
void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::priority_queue<TSortCursor> & queue) { size_t merged_rows = 0; /** Increase row counters. * Return true if it's time to finish generating the current data block. */ auto count_row_and_check_limit = [&, this]() { ++total_merged_rows; if (limit && total_merged_rows == limit) { // std::cerr << "Limit reached\n"; cancel(false); finished = true; return true; } ++merged_rows; if (merged_rows == max_block_size) { // std::cerr << "max_block_size reached\n"; return true; } return false; }; /// Take rows in required order and put them into `merged_columns`, while the rows are no more than `max_block_size` while (!queue.empty()) { TSortCursor current = queue.top(); queue.pop(); while (true) { /** And what if the block is totally less or equal than the rest for the current cursor? * Or is there only one data source left in the queue? Then you can take the entire block on current cursor. */ if (current.impl->isFirst() && (queue.empty() || current.totallyLessOrEquals(queue.top()))) { // std::cerr << "current block is totally less or equals\n"; /// If there are already data in the current block, we first return it. We'll get here again the next time we call the merge function. if (merged_rows != 0) { // std::cerr << "merged rows is non-zero\n"; queue.push(current); return; } /// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl) size_t source_num = current.impl->order; if (source_num >= cursors.size()) throw Exception("Logical error in MergingSortedBlockInputStream", ErrorCodes::LOGICAL_ERROR); for (size_t i = 0; i < num_columns; ++i) merged_columns[i] = (*std::move(source_blocks[source_num]->getByPosition(i).column)).mutate(); // std::cerr << "copied columns\n"; merged_rows = merged_columns.at(0)->size(); if (limit && total_merged_rows + merged_rows > limit) { merged_rows = limit - total_merged_rows; for (size_t i = 0; i < num_columns; ++i) { auto & column = merged_columns[i]; column = (*column->cut(0, merged_rows)).mutate(); } cancel(false); finished = true; } if (out_row_sources_buf) { RowSourcePart row_source(source_num); for (size_t i = 0; i < merged_rows; ++i) out_row_sources_buf->write(row_source.data); } // std::cerr << "fetching next block\n"; total_merged_rows += merged_rows; fetchNextBlock(current, queue); return; } // std::cerr << "total_merged_rows: " << total_merged_rows << ", merged_rows: " << merged_rows << "\n"; // std::cerr << "Inserting row\n"; for (size_t i = 0; i < num_columns; ++i) merged_columns[i]->insertFrom(*current->all_columns[i], current->pos); if (out_row_sources_buf) { /// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl) RowSourcePart row_source(current.impl->order); out_row_sources_buf->write(row_source.data); } if (!current->isLast()) { // std::cerr << "moving to next row\n"; current->next(); if (queue.empty() || !(current.greater(queue.top()))) { if (count_row_and_check_limit()) { // std::cerr << "pushing back to queue\n"; queue.push(current); return; } /// Do not put the cursor back in the queue, but continue to work with the current cursor. // std::cerr << "current is still on top, using current row\n"; continue; } else { // std::cerr << "next row is not least, pushing back to queue\n"; queue.push(current); } } else { /// We get the next block from the corresponding source, if there is one. // std::cerr << "It was last row, fetching next block\n"; fetchNextBlock(current, queue); } break; } if (count_row_and_check_limit()) return; } cancel(false); finished = true; }