void genericLineBasedParsing(std::istream &file,
                             field_cb_t cb_per_field,
                             line_cb_t cb_per_line,
                             void *data,
                             const csv::params &params) {
  struct csv_parser parser;
  if (!csv_init(&parser, 0)) {
    csv_set_opts(&parser, CSV_APPEND_NULL);
    csv_set_delim(&parser, params.getDelimiter());

    std::string line;

    // Skip lines until the configured start line is reached
    int line_start = params.getLineStart();
    while (line_start > 1) {
      std::getline(file, line);
      --line_start;
    }

    // Feed the parser one line at a time
    int lineCount = 0;
    while (std::getline(file, line)) {
      ++lineCount;
      line.append("\n");
      if (csv_parse(&parser, line.c_str(), line.size(), cb_per_field, cb_per_line, data) != line.size()) {
        throw ParserError(csv_strerror(csv_error(&parser)));
      }
      // Stop once the requested number of lines has been parsed
      if (params.getLineCount() != -1 && lineCount >= params.getLineCount())
        break;
      if (file.bad())
        break;
    }
    csv_fini(&parser, cb_per_field, cb_per_line, data);
  }
  csv_free(&parser);
}
void genericParse(std::string filename,
                  field_cb_t cb_per_field,
                  line_cb_t cb_per_line,
                  void *data,
                  const csv::params &params) {
  // Open the file
  typedef std::unique_ptr<std::FILE, int (*)(std::FILE *)> unique_file_ptr;
  unique_file_ptr file(fopen(filename.c_str(), "rb"), fclose);
  if (!file) {
    throw ParserError(std::string("File Opening Failed: ") + std::strerror(errno));
  }

  struct csv_parser parser;
  if (!csv_init(&parser, 0)) {
    csv_set_opts(&parser, CSV_APPEND_NULL);
    csv_set_delim(&parser, params.getDelimiter());

    // Skip lines until the configured start line is reached
    int line_start = params.getLineStart();
    if (line_start > 1) {
      int c;
      do {
        c = fgetc(file.get());
        if (c == '\n')
          --line_start;
      } while (c != EOF && line_start > 1);
    }

    // Block size: 1MB by default, overridable via HYRISE_LOAD_BLOCK_SIZE
    size_t block_size;
    if (getenv("HYRISE_LOAD_BLOCK_SIZE"))
      block_size = strtoul(getenv("HYRISE_LOAD_BLOCK_SIZE"), nullptr, 0);
    else
      block_size = 1024 * 1024;

    // Read the file block-wise until we cannot extract more bytes
    size_t readBytes = 0;
    char rdbuf[block_size];
    do {
      readBytes = fread(rdbuf, 1, block_size, file.get());
      if (csv_parse(&parser, rdbuf, readBytes, cb_per_field, cb_per_line, data) != readBytes) {
        throw ParserError(csv_strerror(csv_error(&parser)));
      }
    } while (readBytes == block_size);

    if (ferror(file.get())) {
      throw ParserError("Could not read file");
    }

    csv_fini(&parser, cb_per_field, cb_per_line, data);
  }
  csv_free(&parser);
}
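// Hedged usage sketch, not part of the original loader code: it assumes that
// field_cb_t and line_cb_t match libcsv's callback shapes (void(void*, size_t, void*)
// and void(int, void*)) and that a default-constructed csv::params selects usable
// delimiter and line settings, as RawTableLoader::load below suggests.
// csv_row_counter, count_field, count_row and exampleCountRows are hypothetical
// names introduced only for illustration.
struct csv_row_counter {
  size_t fields = 0;
  size_t rows = 0;
};

static void count_field(void * /*value*/, size_t /*length*/, void *data) {
  // Invoked by libcsv once per parsed field
  ++static_cast<csv_row_counter *>(data)->fields;
}

static void count_row(int /*terminator*/, void *data) {
  // Invoked by libcsv once per completed row
  ++static_cast<csv_row_counter *>(data)->rows;
}

static void exampleCountRows(const std::string &filename) {
  csv_row_counter counts;
  csv::params params;
  genericParse(filename,
               (field_cb_t) count_field,
               (line_cb_t) count_row,
               (void *) &counts,
               params);
  // counts.rows and counts.fields now hold the totals for the whole file
}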
std::shared_ptr<storage::AbstractTable> RawTableLoader::load(std::shared_ptr<storage::AbstractTable> in,
                                                             const storage::compound_metadata_list *ml,
                                                             const Loader::params &args) {
  csv::params params;
  if (detectHeader(args.getBasePath() + _filename))
    params.setLineStart(5);

  // Create the result table
  storage::metadata_vec_t v(in->columnCount());
  for (size_t i = 0; i < in->columnCount(); ++i) {
    v[i] = in->metadataAt(i);
  }
  auto result = std::make_shared<storage::RawTable>(v);

  // CSV parsing
  std::ifstream file(args.getBasePath() + _filename, std::ios::binary);
  if (!file || file.bad()) {
    throw csv::ParserError("CSV file '" + _filename + "' does not exist");
  }

  struct csv_parser parser;
  if (!csv_init(&parser, 0)) {
    csv_set_opts(&parser, CSV_APPEND_NULL);
    csv_set_delim(&parser, params.getDelimiter());

    // If there is a header in the file, we will ignore it
    std::string line;
    int line_start = params.getLineStart();
    while (line_start > 1) {
      std::getline(file, line);
      --line_start;
    }

    // Prepare the callback data handler
    struct raw_table_cb_data data(v);
    data.table = result;

    // Parse the file block-wise
    const size_t block_size = 16 * 1024;
    char rdbuf[block_size];
    while (file.read(rdbuf, block_size).good()) {
      auto extracted = file.gcount();
      if (extracted == 0)
        break;
      if (csv_parse(&parser, rdbuf, extracted,
                    (field_cb_t) raw_table_cb_per_field,
                    (line_cb_t) raw_table_cb_per_line,
                    (void *) &data) != (size_t) extracted) {
        throw csv::ParserError(csv_strerror(csv_error(&parser)));
      }
    }

    // Parse the remainder of the last, partially filled block
    if (csv_parse(&parser, rdbuf, file.gcount(),
                  (field_cb_t) raw_table_cb_per_field,
                  (line_cb_t) raw_table_cb_per_line,
                  (void *) &data) != (size_t) file.gcount()) {
      throw csv::ParserError(csv_strerror(csv_error(&parser)));
    }

    csv_fini(&parser, (field_cb_t) raw_table_cb_per_field, (line_cb_t) raw_table_cb_per_line, (void *) &data);
  }
  csv_free(&parser);
  return result;
}