C++ (Cpp) readCSVString Examples

Example #1

0

Show file

File: CSVRowInputStream.cpp Project: yandex/ClickHouse

bool CSVRowInputStream::read(MutableColumns & columns, RowReadExtension & ext)
{
    if (istr.eof())
        return false;

    updateDiagnosticInfo();

    String tmp;
    for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position)
    {
        const auto & column_index = column_indexes_for_input_fields[input_position];
        if (column_index)
        {
            skipWhitespacesAndTabs(istr);
            data_types[*column_index]->deserializeAsTextCSV(*columns[*column_index], istr, format_settings);
            skipWhitespacesAndTabs(istr);
        }
        else
        {
            readCSVString(tmp, istr, format_settings.csv);
        }

        skipDelimiter(istr, format_settings.csv.delimiter, input_position + 1 == column_indexes_for_input_fields.size());
    }

    fillUnreadColumnsWithDefaults(columns, ext);

    return true;
}

Example #2

0

Show file

File: CSVRowInputStream.cpp Project: yandex/ClickHouse

void CSVRowInputStream::readPrefix()
{
    /// In this format, we assume, that if first string field contain BOM as value, it will be written in quotes,
    ///  so BOM at beginning of stream cannot be confused with BOM in first string value, and it is safe to skip it.
    skipBOMIfExists(istr);

    if (with_names)
    {
        if (format_settings.with_names_use_header)
        {
            String column_name;
            do
            {
                skipWhitespacesAndTabs(istr);
                readCSVString(column_name, istr, format_settings.csv);
                skipWhitespacesAndTabs(istr);

                addInputColumn(column_name);
            }
            while (checkChar(format_settings.csv.delimiter, istr));

            skipDelimiter(istr, format_settings.csv.delimiter, true);
        }
        else
        {
            setupAllColumnsByTableSchema();
            skipRow(istr, format_settings.csv, column_indexes_for_input_fields.size());
        }
    }
    else
    {
        setupAllColumnsByTableSchema();
    }
}

Example #3

0

Show file

File: DataTypeEnum.cpp Project: yurial/ClickHouse

void DataTypeEnum<Type>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const char delimiter) const
{
    std::string name;
    readCSVString(name, istr, delimiter);
    static_cast<ColumnType &>(column).getData().push_back(getValue(StringRef(name)));
}

Example #4

0

Show file

File: CSVRowInputStream.cpp Project: yandex/ClickHouse

/** gcc-7 generates wrong code with optimization level greater than 1.
  * See tests: dbms/src/IO/tests/write_int.cpp
  *  and dbms/tests/queries/0_stateless/00898_parsing_bad_diagnostic_message.sh
  * This is compiler bug. The bug does not present in gcc-8 and clang-8.
  * Nevertheless, we don't need high optimization of this function.
  */
bool OPTIMIZE(1) CSVRowInputStream::parseRowAndPrintDiagnosticInfo(MutableColumns & columns,
    WriteBuffer & out, size_t max_length_of_column_name, size_t max_length_of_data_type_name)
{
    const char delimiter = format_settings.csv.delimiter;

    for (size_t input_position = 0; input_position < column_indexes_for_input_fields.size(); ++input_position)
    {
        if (input_position == 0 && istr.eof())
        {
            out << "<End of stream>\n";
            return false;
        }

        if (column_indexes_for_input_fields[input_position].has_value())
        {
            const auto & column_index = *column_indexes_for_input_fields[input_position];
            const auto & current_column_type = data_types[column_index];

            out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
                << "name: " << header.safeGetByPosition(column_index).name << ", " << std::string(max_length_of_column_name - header.safeGetByPosition(column_index).name.size(), ' ')
                << "type: " << current_column_type->getName() << ", " << std::string(max_length_of_data_type_name - current_column_type->getName().size(), ' ');

            BufferBase::Position prev_position = istr.position();
            BufferBase::Position curr_position = istr.position();
            std::exception_ptr exception;

            try
            {
                skipWhitespacesAndTabs(istr);
                prev_position = istr.position();
                current_column_type->deserializeAsTextCSV(*columns[column_index], istr, format_settings);
                curr_position = istr.position();
                skipWhitespacesAndTabs(istr);
            }
            catch (...)
            {
                exception = std::current_exception();
            }

            if (curr_position < prev_position)
                throw Exception("Logical error: parsing is non-deterministic.", ErrorCodes::LOGICAL_ERROR);

            if (isNumber(current_column_type) || isDateOrDateTime(current_column_type))
            {
                /// An empty string instead of a value.
                if (curr_position == prev_position)
                {
                    out << "ERROR: text ";
                    verbosePrintString(prev_position, std::min(prev_position + 10, istr.buffer().end()), out);
                    out << " is not like " << current_column_type->getName() << "\n";
                    return false;
                }
            }

            out << "parsed text: ";
            verbosePrintString(prev_position, curr_position, out);

            if (exception)
            {
                if (current_column_type->getName() == "DateTime")
                    out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
                else if (current_column_type->getName() == "Date")
                    out << "ERROR: Date must be in YYYY-MM-DD format.\n";
                else
                    out << "ERROR\n";
                return false;
            }

            out << "\n";

            if (current_column_type->haveMaximumSizeOfValue())
            {
                if (*curr_position != '\n' && *curr_position != '\r' && *curr_position != delimiter)
                {
                    out << "ERROR: garbage after " << current_column_type->getName() << ": ";
                    verbosePrintString(curr_position, std::min(curr_position + 10, istr.buffer().end()), out);
                    out << "\n";

                    if (current_column_type->getName() == "DateTime")
                        out << "ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format.\n";
                    else if (current_column_type->getName() == "Date")
                        out << "ERROR: Date must be in YYYY-MM-DD format.\n";

                    return false;
                }
            }
        }
        else
        {
            static const String skipped_column_str = "<SKIPPED COLUMN>";
            out << "Column " << input_position << ", " << std::string((input_position < 10 ? 2 : input_position < 100 ? 1 : 0), ' ')
                << "name: " << skipped_column_str << ", " << std::string(max_length_of_column_name - skipped_column_str.length(), ' ')
                << "type: " << skipped_column_str << ", " << std::string(max_length_of_data_type_name - skipped_column_str.length(), ' ');

            String tmp;
            readCSVString(tmp, istr, format_settings.csv);
        }

        /// Delimiters
        if (input_position + 1 == column_indexes_for_input_fields.size())
        {
            if (istr.eof())
                return false;

            /// we support the extra delimiter at the end of the line
            if (*istr.position() == delimiter)
            {
                ++istr.position();
                if (istr.eof())
                    break;
            }

            if (!istr.eof() && *istr.position() != '\n' && *istr.position() != '\r')
            {
                out << "ERROR: There is no line feed. ";
                verbosePrintString(istr.position(), istr.position() + 1, out);
                out << " found instead.\n"
                    " It's like your file has more columns than expected.\n"
                    "And if your file have right number of columns, maybe it have unquoted string value with comma.\n";

                return false;
            }

            skipEndOfLine(istr);
        }
        else
        {
            try
            {
                assertChar(delimiter, istr);
            }
            catch (const DB::Exception &)
            {
                if (*istr.position() == '\n' || *istr.position() == '\r')
                {
                    out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected."
                        " It's like your file has less columns than expected.\n"
                        "And if your file have right number of columns, maybe it have unescaped quotes in values.\n";
                }
                else
                {
                    out << "ERROR: There is no delimiter (" << delimiter << "). ";
                    verbosePrintString(istr.position(), istr.position() + 1, out);
                    out << " found instead.\n";
                }
                return false;
            }
        }
    }

    return true;
}