/** Reads a single CSV row from `istr`.
  *
  * Every input field is either deserialized into the column it is mapped to
  * (with surrounding spaces and tabs skipped) or, when it has no mapping,
  * read into a scratch string and discarded. A delimiter is skipped after
  * each field; the last field is flagged as such to skipDelimiter.
  * Finally fillUnreadColumnsWithDefaults is called with `columns` and `ext`.
  *
  * Returns false without touching `columns` if the stream is already at eof,
  * true otherwise.
  */
bool CSVRowInputStream::read(MutableColumns & columns, RowReadExtension & ext)
{
    if (istr.eof())
        return false;

    updateDiagnosticInfo();

    const size_t fields_count = column_indexes_for_input_fields.size();

    /// Scratch buffer for values of fields that are not mapped to any column.
    String discarded_value;

    for (size_t field = 0; field != fields_count; ++field)
    {
        const auto & target_column = column_indexes_for_input_fields[field];
        const bool is_last_field = (field + 1 == fields_count);

        if (!target_column)
        {
            /// Unmapped field: consume its text and throw it away.
            readCSVString(discarded_value, istr, format_settings.csv);
        }
        else
        {
            skipWhitespacesAndTabs(istr);
            data_types[*target_column]->deserializeAsTextCSV(*columns[*target_column], istr, format_settings);
            skipWhitespacesAndTabs(istr);
        }

        skipDelimiter(istr, format_settings.csv.delimiter, is_last_field);
    }

    fillUnreadColumnsWithDefaults(columns, ext);
    return true;
}
void CSVRowInputStream::readPrefix() { /// In this format, we assume, that if first string field contain BOM as value, it will be written in quotes, /// so BOM at beginning of stream cannot be confused with BOM in first string value, and it is safe to skip it. skipBOMIfExists(istr); if (with_names) { if (format_settings.with_names_use_header) { String column_name; do { skipWhitespacesAndTabs(istr); readCSVString(column_name, istr, format_settings.csv); skipWhitespacesAndTabs(istr); addInputColumn(column_name); } while (checkChar(format_settings.csv.delimiter, istr)); skipDelimiter(istr, format_settings.csv.delimiter, true); } else { setupAllColumnsByTableSchema(); skipRow(istr, format_settings.csv, column_indexes_for_input_fields.size()); } } else { setupAllColumnsByTableSchema(); } }
/** Reads one CSV string from `istr` (using `delimiter`), looks it up with getValue
  * and appends the result to the data of `column`, which is cast to ColumnType.
  */
void DataTypeEnum<Type>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const char delimiter) const
{
    std::string element_name;
    readCSVString(element_name, istr, delimiter);

    /// Resolve the name first; nothing is appended if the lookup throws.
    const auto element_value = getValue(StringRef(element_name));

    auto & column_data = static_cast<ColumnType &>(column).getData();
    column_data.push_back(element_value);
}
/** gcc-7 generates wrong code with optimization level greater than 1.
  * See tests: dbms/src/IO/tests/write_int.cpp
  *  and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh
  * This is a compiler bug. The bug is not present in gcc-8 and clang-8.
  * Nevertheless, we don't need high optimization of this function.
  *
  * Re-parses one row field by field and writes a human-readable report to `out`.
  *
  * For every input field a line "Column N, name: ..., type: ..., " is printed, padded to
  * `max_length_of_column_name` / `max_length_of_data_type_name`, followed by the parsed text
  * or an "ERROR: ..." explanation. Fields not mapped to a column are reported as
  * "<SKIPPED COLUMN>" and their value is read and discarded.
  *
  * Padding is clamped at zero: if a printed name or type is longer than the requested width
  * (e.g. "<SKIPPED COLUMN>" when all real column names are short) no padding is added,
  * instead of the unsigned subtraction wrapping around to a huge string length.
  *
  * Returns false if the stream is at eof before the first field or after the last one,
  * or if an error was reported for some field or delimiter; returns true otherwise.
  * Throws Exception(LOGICAL_ERROR) if the read position moved backwards while parsing a value.
  */
bool OPTIMIZE(1) CSVRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
    WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
{
    const char delimiter = format_settings.csv.delimiter;

    /// Spaces needed to pad a value of `used` characters up to `width`; empty if it is already wider.
    const auto padding = [](size_t width, size_t used)
    {
        return std::string(width > used ? width - used : 0, ' ');
    };

    for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position)
    {
        if (input_position == 0 && istr.eof())
        {
            out << "<End of stream>\n";
            return false;
        }

        if (column_indexes_for_input_fields[input_position].has_value())
        {
            const auto & column_index = *column_indexes_for_input_fields[input_position];
            const auto & current_column_type = data_types[column_index];

            out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
                << "name: " << header.safeGetByPosition(column_index).name << ", "
                << padding(max_length_of_column_name, header.safeGetByPosition(column_index).name.size())
                << "type: " << current_column_type->getName() << ", "
                << padding(max_length_of_data_type_name, current_column_type->getName().size());

            /// prev_position .. curr_position delimits the text consumed by the value itself
            /// (leading and trailing spaces/tabs are excluded).
            BufferBase::Position prev_position = istr.position();
            BufferBase::Position curr_position = istr.position();
            std::exception_ptr exception;

            try
            {
                skipWhitespacesAndTabs(istr);
                prev_position = istr.position();
                current_column_type->deserializeAsTextCSV(*columns[column_index], istr, format_settings);
                curr_position = istr.position();
                skipWhitespacesAndTabs(istr);
            }
            catch (...)
            {
                /// Remember the failure; it is reported below, after the parsed text is printed.
                exception = std::current_exception();
            }

            if (curr_position < prev_position)
                throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);

            if (isNumber(current_column_type) || isDateOrDateTime(current_column_type))
            {
                /// An empty string instead of a value.
                if (curr_position == prev_position)
                {
                    out << "ERROR: text ";
                    verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
                    out << " is not like " << current_column_type->getName() << "\n";
                    return false;
                }
            }

            out << "parsed text: ";
            verbosePrintString(prev_position, curr_position, out);

            if (exception)
            {
                if (current_column_type->getName() == "DateTime")
                    out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
                else if (current_column_type->getName() == "Date")
                    out << "ERROR: Date must be in YYYY-MM-DD format.\n";
                else
                    out << "ERROR\n";
                return false;
            }

            out << "\n";

            if (current_column_type->haveMaximumSizeOfValue())
            {
                /// The value must be followed directly by a line break or the delimiter.
                if (*curr_position != '\n' && *curr_position != '\r' && *curr_position != delimiter)
                {
                    out << "ERROR: garbage after " << current_column_type->getName() << ": ";
                    verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
                    out << "\n";

                    if (current_column_type->getName() == "DateTime")
                        out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
                    else if (current_column_type->getName() == "Date")
                        out << "ERROR: Date must be in YYYY-MM-DD format.\n";

                    return false;
                }
            }
        }
        else
        {
            static const String skipped_column_str = "<SKIPPED COLUMN>";

            /// The placeholder may be longer than the widest real name/type, so padding must not underflow.
            out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
                << "name: " << skipped_column_str << ", "
                << padding(max_length_of_column_name, skipped_column_str.length())
                << "type: " << skipped_column_str << ", "
                << padding(max_length_of_data_type_name, skipped_column_str.length());

            String tmp;
            readCSVString(tmp, istr, format_settings.csv);
        }

        /// Delimiters
        if (input_position + 1 == column_indexes_for_input_fields.size())
        {
            if (istr.eof())
                return false;

            /// we support the extra delimiter at the end of the line
            if (*istr.position() == delimiter)
            {
                ++istr.position();
                if (istr.eof())
                    break;
            }

            if (!istr.eof() && *istr.position() != '\n' && *istr.position() != '\r')
            {
                out << "ERROR: There is no line feed. ";
                verbosePrintString(istr.position(), istr.position() + 1, out);
                out << " found instead.\n"
                    " It's like your file has more columns than expected.\n"
                    "And if your file have right number of columns, maybe it have unquoted string value with comma.\n";

                return false;
            }

            skipEndOfLine(istr);
        }
        else
        {
            try
            {
                assertChar(delimiter, istr);
            }
            catch (const DB::Exception &)
            {
                /// NOTE(review): position() is dereferenced here without an eof() check —
                /// confirm whether assertChar can fail at the end of the stream.
                if (*istr.position() == '\n' || *istr.position() == '\r')
                {
                    out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected."
                        " It's like your file has less columns than expected.\n"
                        "And if your file have right number of columns, maybe it have unescaped quotes in values.\n";
                }
                else
                {
                    out << "ERROR: There is no delimiter (" << delimiter << "). ";
                    verbosePrintString(istr.position(), istr.position() + 1, out);
                    out << " found instead.\n";
                }
                return false;
            }
        }
    }

    return true;
}