Beispiel #1
0
void genericLineBasedParsing(
    std::istream &file,
    field_cb_t cb_per_field,
    line_cb_t cb_per_line,
    void *data,
    const csv::params &params) {
  struct csv_parser parser;

  if (!csv_init(&parser, 0)) {
    csv_set_opts(&parser, CSV_APPEND_NULL);
    csv_set_delim(&parser, params.getDelimiter());

    std::string line;
    int line_start = params.getLineStart();

    if (line_start != 1) {
      while (line_start > 1) {
        std::getline(file, line);
        --line_start;
      }
    }


    int lineCount = 0;
    while (std::getline(file, line)) {

      ++lineCount;
      line.append("\n");
      if (csv_parse(&parser,
                    line.c_str(),
                    line.size(),
                    cb_per_field,
                    cb_per_line,
                    data) != line.size()) {
        throw ParserError(csv_strerror(csv_error(&parser)));
      }

      if (params.getLineCount() != -1 && lineCount >= params.getLineCount())
        break;

      if (file.bad())
        break;
    }

    csv_fini(&parser,
             cb_per_field,
             cb_per_line,
             data);
  }
  csv_free(&parser);

}
Beispiel #2
0
void genericParse(
    /*std::istream &file,*/
    std::string filename,
    field_cb_t cb_per_field,
    line_cb_t cb_per_line,
    void *data,
    const csv::params &params
                  ) {
  // Open the file
  typedef std::unique_ptr<std::FILE, int (*)(std::FILE *)> unique_file_ptr;
  unique_file_ptr file(fopen(filename.c_str(), "rb"), fclose);
  if (!file) {
    throw ParserError(std::string("File Opening Failed") +  std::strerror(errno));
  }

  struct csv_parser parser;

  if (!csv_init(&parser, 0)) {
    csv_set_opts(&parser, CSV_APPEND_NULL);
    csv_set_delim(&parser, params.getDelimiter());

    int line_start = params.getLineStart();
    if (line_start > 1) {
      int c;
      do {
        c = fgetc(file.get());
        if ( c== '\n') --line_start;
      } while (c!= EOF  && line_start > 1);
    }

    // 1GB Buffer
    size_t block_size;
    if (getenv("HYRISE_LOAD_BLOCK_SIZE"))
      block_size = strtoul(getenv("HYRISE_LOAD_BLOCK_SIZE"), nullptr, 0);
    else
      block_size = 1024 * 1024;

    // Read from the buffer
    size_t readBytes = 0;
    char rdbuf[block_size];

    // Read the file until we cannot extract more bytes
    do {
      readBytes = fread(rdbuf, 1, block_size, file.get());
      if (csv_parse(&parser,
                    rdbuf,
                    readBytes,
                    cb_per_field,
                    cb_per_line,
                    data) != (size_t) readBytes) {
        throw ParserError(csv_strerror(csv_error(&parser)));
      }
    } while (readBytes == block_size);

    if (ferror(file.get())) {
      throw ParserError("Could not read file");
    }

    csv_fini(&parser,
             cb_per_field,
             cb_per_line,
             data);
  }
  csv_free(&parser);
}
Beispiel #3
0
std::shared_ptr<storage::AbstractTable> RawTableLoader::load(std::shared_ptr<storage::AbstractTable> in,
        const storage::compound_metadata_list *ml,
        const Loader::params &args) {



    csv::params params;
    if (detectHeader(args.getBasePath() + _filename)) params.setLineStart(5);

    // Create the result table
    storage::metadata_vec_t v(in->columnCount());
    for(size_t i=0; i < in->columnCount(); ++i) {
        v[i] = in->metadataAt(i);
    }
    auto result = std::make_shared<storage::RawTable>(v);

    // CSV Parsing
    std::ifstream file(args.getBasePath() + _filename, std::ios::binary);
    if (!file || file.bad()) {
        throw csv::ParserError("CSV file '" + _filename + "' does not exist");
    }

    struct csv_parser parser;

    if (!csv_init(&parser, 0)) {
        csv_set_opts(&parser, CSV_APPEND_NULL);
        csv_set_delim(&parser, params.getDelimiter());

        // If there is a header in the file, we will ignore it
        std::string line;
        int line_start = params.getLineStart();

        if (line_start != 1) {
            while (line_start > 1) {
                std::getline(file, line);
                --line_start;
            }
        }

        // Prepare cb data handler
        struct raw_table_cb_data data(v);
        data.table = result;

        const size_t block_size = 16 * 1024;
        char rdbuf [block_size];

        while (file.read(rdbuf, block_size).good()) {
            auto extracted = file.gcount();
            if (extracted == 0)
                break;

            if (csv_parse(&parser,
                          rdbuf,
                          extracted,
                          (field_cb_t) raw_table_cb_per_field,
                          (line_cb_t) raw_table_cb_per_line,
                          (void*) &data) != (size_t) extracted) {
                throw csv::ParserError(csv_strerror(csv_error(&parser)));
            }
        }

        // Parse the rest
        if (csv_parse(&parser,
                      rdbuf,
                      file.gcount(),
                      (field_cb_t) raw_table_cb_per_field,
                      (line_cb_t) raw_table_cb_per_line,
                      (void*) &data) != (size_t) file.gcount()) {
            throw csv::ParserError(csv_strerror(csv_error(&parser)));
        }

        csv_fini(&parser,
                 (field_cb_t) raw_table_cb_per_field,
                 (line_cb_t) raw_table_cb_per_line,
                 (void*) &data);

    }
    csv_free(&parser);
    return result;
}