/** Reads a single row in binary form.
  * For every entry of `columns`, the data type found at the same position of `header`
  * deserializes one value from `istr` into that column.
  * Returns false, leaving `columns` untouched, if `istr` is already at end of data; true otherwise.
  */
bool BinaryRowInputStream::read(MutableColumns & columns, RowReadExtension &)
{
    if (istr.eof())
        return false;

    const size_t column_count = columns.size();
    for (size_t position = 0; position != column_count; ++position)
    {
        const auto & data_type = header.getByPosition(position).type;
        data_type->deserializeBinary(*columns[position], istr);
    }

    return true;
}
/** Prepares the merge state and allocates the output columns.
  *
  * On the first call only (guarded by `first`):
  *  - for every slot of `source_blocks` that is still empty, one block is read from the child with the same index;
  *  - for each such block that has rows, a sort cursor is created, its column pointers are stored
  *    back into the shared block, and `expected_block_size` is raised up to at most `max_block_size`;
  *  - the priority queue matching `has_collation` is initialized.
  *
  * On every call: the structure of each non-empty source block is checked against `header`,
  * and `merged_columns` is refilled with `num_columns` empty clones of the header columns,
  * each with `expected_block_size` reserved.
  */
void MergingSortedBlockInputStream::init(MutableColumns & merged_columns)
{
    if (first)
    {
        first = false;

        for (size_t source_num = 0; source_num < source_blocks.size(); ++source_num)
        {
            SharedBlockPtr & source_block = source_blocks[source_num];

            /// Only slots that do not hold a block yet are filled from the corresponding child.
            if (!source_block.get())
            {
                source_block = new detail::SharedBlock(children[source_num]->read());

                const size_t block_rows = source_block->rows();

                /// A block without rows gets no cursor.
                if (block_rows != 0)
                {
                    if (expected_block_size < block_rows)
                        expected_block_size = std::min(block_rows, max_block_size);

                    cursors[source_num] = SortCursorImpl(*source_block, description, source_num);

                    auto & cursor = cursors[source_num];
                    source_block->all_columns = cursor.all_columns;
                    source_block->sort_columns = cursor.sort_columns;
                    has_collation |= cursor.has_collation;
                }
            }
        }

        if (!has_collation)
            initQueue(queue_without_collation);
        else
            initQueue(queue_with_collation);
    }

    /// Every non-empty source block must have the same structure as the header.
    for (const SharedBlockPtr & source_block : source_blocks)
    {
        const bool block_is_empty = !*source_block;
        if (block_is_empty)
            continue;

        assertBlocksHaveEqualStructure(*source_block, header, getName());
    }

    /// Start the result from empty clones of the header columns.
    merged_columns.resize(num_columns);
    for (size_t column_num = 0; column_num < num_columns; ++column_num)
    {
        auto & merged_column = merged_columns[column_num];
        merged_column = header.safeGetByPosition(column_num).column->cloneEmpty();
        merged_column->reserve(expected_block_size);
    }
}
/** Reads one parenthesized row of the Values format into `columns`.
  *
  * Returns false if, after skipping whitespace, the input is exhausted or the next character is ';'.
  * Otherwise expects '(' followed by one value per column, separated by ',' and closed by ')',
  * consumes an optional ',' after the row and returns true.
  *
  * Each value is first parsed by the streaming text deserializer of the column type.
  * If that fails with one of the listed parse error codes and `format_settings.values.interpret_expressions`
  * is set, the text is parsed with the SQL expression parser instead, evaluated as a constant expression
  * and converted to the column type. As the original comment notes, this fallback is very inefficient,
  * but costs nothing when no expressions are present.
  *
  * Any other exception, or a parse failure while the fallback is disabled, is rethrown unchanged.
  */
bool ValuesRowInputStream::read(MutableColumns & columns)
{
    size_t num_columns = columns.size();

    skipWhitespaceIfAny(istr);

    if (istr.eof() || *istr.position() == ';')
        return false;

    ParserExpression parser;

    /// After a value: ',' is expected between values and ')' after the last one.
    const auto assert_value_delimiter = [&](size_t column_num)
    {
        assertChar(column_num + 1 != num_columns ? ',' : ')', istr);
    };

    assertChar('(', istr);

    for (size_t i = 0; i < num_columns; ++i)
    {
        skipWhitespaceIfAny(istr);

        /// Remember where the value starts, so that it can be re-parsed as an expression.
        char * value_begin = istr.position();
        const size_t bytes_before_buffer = istr.count() - istr.offset();

        /// Set once the streaming parser has appended a value that may have to be removed again.
        bool value_was_inserted = false;

        try
        {
            header.getByPosition(i).type->deserializeTextQuoted(*columns[i], istr, format_settings);
            value_was_inserted = true;

            skipWhitespaceIfAny(istr);
            assert_value_delimiter(i);
        }
        catch (const Exception & e)
        {
            if (!format_settings.values.interpret_expressions)
                throw;

            /// Only plain parse failures are retried with the SQL parser; this is an exceptional case.
            const auto code = e.code();
            const bool is_parse_failure =
                code == ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED
                || code == ErrorCodes::CANNOT_PARSE_QUOTED_STRING
                || code == ErrorCodes::CANNOT_PARSE_NUMBER
                || code == ErrorCodes::CANNOT_PARSE_DATE
                || code == ErrorCodes::CANNOT_PARSE_DATETIME
                || code == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT;

            if (!is_parse_failure)
                throw;

            /// TODO Case when the expression does not fit entirely in the buffer.

            /// The beginning of the value is no longer in the buffer: nothing to re-parse.
            if (istr.count() - istr.offset() != bytes_before_buffer)
                throw;

            if (value_was_inserted)
                columns[i]->popBack(1);

            const IDataType & type = *header.getByPosition(i).type;

            /// Piece of input, starting at the value, that is shown in error messages.
            const auto input_excerpt = [&]
            {
                return String(value_begin, std::min(SHOW_CHARS_ON_SYNTAX_ERROR, istr.buffer().end() - value_begin));
            };

            Expected expected;

            Tokens tokens(value_begin, istr.buffer().end());
            TokenIterator token_iterator(tokens);

            ASTPtr ast;
            const bool parsed = parser.parse(token_iterator, ast, expected);
            if (!parsed)
                throw Exception(
                    "Cannot parse expression of type " + type.getName() + " here: " + input_excerpt(),
                    ErrorCodes::SYNTAX_ERROR);

            /// Continue reading right after the parsed expression.
            istr.position() = const_cast<char *>(token_iterator->begin);

            std::pair<Field, DataTypePtr> value_raw = evaluateConstantExpression(ast, *context);
            Field value = convertFieldToType(value_raw.first, type, value_raw.second.get());

            /// Check that we are indeed allowed to insert a NULL.
            if (value.isNull() && !type.isNullable())
                throw Exception{
                    "Expression returns value " + applyVisitor(FieldVisitorToString(), value)
                        + ", that is out of range of type " + type.getName()
                        + ", at: " + input_excerpt(),
                    ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE};

            columns[i]->insert(value);

            skipWhitespaceIfAny(istr);
            assert_value_delimiter(i);
        }
    }

    /// Rows may be separated by a comma.
    skipWhitespaceIfAny(istr);
    if (!istr.eof() && *istr.position() == ',')
        ++istr.position();

    return true;
}
void MergingSortedBlockInputStream::merge(MutableColumns & merged_columns, std::priority_queue<TSortCursor> & queue) { size_t merged_rows = 0; /** Increase row counters. * Return true if it's time to finish generating the current data block. */ auto count_row_and_check_limit = [&, this]() { ++total_merged_rows; if (limit && total_merged_rows == limit) { // std::cerr << "Limit reached\n"; cancel(false); finished = true; return true; } ++merged_rows; if (merged_rows == max_block_size) { // std::cerr << "max_block_size reached\n"; return true; } return false; }; /// Take rows in required order and put them into `merged_columns`, while the rows are no more than `max_block_size` while (!queue.empty()) { TSortCursor current = queue.top(); queue.pop(); while (true) { /** And what if the block is totally less or equal than the rest for the current cursor? * Or is there only one data source left in the queue? Then you can take the entire block on current cursor. */ if (current.impl->isFirst() && (queue.empty() || current.totallyLessOrEquals(queue.top()))) { // std::cerr << "current block is totally less or equals\n"; /// If there are already data in the current block, we first return it. We'll get here again the next time we call the merge function. if (merged_rows != 0) { // std::cerr << "merged rows is non-zero\n"; queue.push(current); return; } /// Actually, current.impl->order stores source number (i.e. 
cursors[current.impl->order] == current.impl) size_t source_num = current.impl->order; if (source_num >= cursors.size()) throw Exception("Logical error in MergingSortedBlockInputStream", ErrorCodes::LOGICAL_ERROR); for (size_t i = 0; i < num_columns; ++i) merged_columns[i] = (*std::move(source_blocks[source_num]->getByPosition(i).column)).mutate(); // std::cerr << "copied columns\n"; merged_rows = merged_columns.at(0)->size(); if (limit && total_merged_rows + merged_rows > limit) { merged_rows = limit - total_merged_rows; for (size_t i = 0; i < num_columns; ++i) { auto & column = merged_columns[i]; column = (*column->cut(0, merged_rows)).mutate(); } cancel(false); finished = true; } if (out_row_sources_buf) { RowSourcePart row_source(source_num); for (size_t i = 0; i < merged_rows; ++i) out_row_sources_buf->write(row_source.data); } // std::cerr << "fetching next block\n"; total_merged_rows += merged_rows; fetchNextBlock(current, queue); return; } // std::cerr << "total_merged_rows: " << total_merged_rows << ", merged_rows: " << merged_rows << "\n"; // std::cerr << "Inserting row\n"; for (size_t i = 0; i < num_columns; ++i) merged_columns[i]->insertFrom(*current->all_columns[i], current->pos); if (out_row_sources_buf) { /// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl) RowSourcePart row_source(current.impl->order); out_row_sources_buf->write(row_source.data); } if (!current->isLast()) { // std::cerr << "moving to next row\n"; current->next(); if (queue.empty() || !(current.greater(queue.top()))) { if (count_row_and_check_limit()) { // std::cerr << "pushing back to queue\n"; queue.push(current); return; } /// Do not put the cursor back in the queue, but continue to work with the current cursor. 
// std::cerr << "current is still on top, using current row\n"; continue; } else { // std::cerr << "next row is not least, pushing back to queue\n"; queue.push(current); } } else { /// We get the next block from the corresponding source, if there is one. // std::cerr << "It was last row, fetching next block\n"; fetchNextBlock(current, queue); } break; } if (count_row_and_check_limit()) return; } cancel(false); finished = true; }