void genericLineBasedParsing( std::istream &file, field_cb_t cb_per_field, line_cb_t cb_per_line, void *data, const csv::params ¶ms) { struct csv_parser parser; if (!csv_init(&parser, 0)) { csv_set_opts(&parser, CSV_APPEND_NULL); csv_set_delim(&parser, params.getDelimiter()); std::string line; int line_start = params.getLineStart(); if (line_start != 1) { while (line_start > 1) { std::getline(file, line); --line_start; } } int lineCount = 0; while (std::getline(file, line)) { ++lineCount; line.append("\n"); if (csv_parse(&parser, line.c_str(), line.size(), cb_per_field, cb_per_line, data) != line.size()) { throw ParserError(csv_strerror(csv_error(&parser))); } if (params.getLineCount() != -1 && lineCount >= params.getLineCount()) break; if (file.bad()) break; } csv_fini(&parser, cb_per_field, cb_per_line, data); } csv_free(&parser); }
/* ================================================================== * Parser for comma-separated argument list * ================================================================== */ void ParseVarList(int nLn, FILE *flp, char *dataName, char *leftPart, char *argString) { struct csv_parser p; unsigned char options = 0; LIST_DATA ldata; // fill in data for the callback memset(&ldata, '\x0', sizeof(LIST_DATA)); ldata.nLn = nLn; ldata.flp = flp; ldata.cnt = 0; strcpy(ldata.dataName, dataName); strcpy(ldata.lp, leftPart); // Initialize csv parser if (csv_init(&p, options) != 0) { fprintf(stderr, "Failed to initialize csv parser\n"); return; } // set white space, eol and delimiter csv_set_space_func(&p, is_space_list); csv_set_term_func(&p, is_term_list); csv_set_delim(&p, ','); unsigned int agrLen = strlen(argString); fprintf(stderr, "ParseVarList: argString = %s argLen - %d\n", argString, agrLen); memset(inputsLst, '\x0', sizeof(inputsLst)); InpCnt = 0; if (csv_parse(&p, argString, strlen(argString), cbProcessListElement, NULL, &ldata) != agrLen) { fprintf(stderr, "ParseVarList: %s\n", csv_strerror(csv_error(&p))); return; } csv_fini(&p, cbProcessListElement, NULL, &ldata); csv_free(&p); return; }
int main (int argc, char ** argv){ if (argc != 4){ printf("Usage: %s nodes.csv ways.csv direct.csv\n",argv[0]); return 1; } char * nodescsvname = argv[1]; char * wayscsvname = argv[2]; char * directcsvname = argv[3]; struct csv_parser parser; csv_init(&parser,CSV_APPEND_NULL); csv_set_delim(&parser,';'); struct parse_t * p_struct; p_struct = malloc(sizeof(struct parse_t)); p_struct->state=0; p_struct->count=0; p_struct->ok=1; GARY_INIT(p_struct->vertices,0); GARY_INIT(p_struct->edges,0); parseFile(nodescsvname,&parser,p_struct,node_item_cb,node_line_cb); nodesIdx_refresh(GARY_SIZE(p_struct->vertices),p_struct->vertices); parseFile(wayscsvname,&parser,p_struct,way_item_cb,way_line_cb); parseFile(directcsvname,&parser,p_struct,direct_item_cb,direct_line_cb); Graph__Graph * graph; graph = malloc(sizeof(Graph__Graph)); graph__graph__init(graph); graph->n_edges=GARY_SIZE(p_struct->edges); graph->edges=p_struct->edges; graph->n_vertices=GARY_SIZE(p_struct->vertices); graph->vertices=p_struct->vertices; printf("Created graph with %d edges and %d vertices\n",graph->n_edges,graph->n_vertices); struct vertexedges_t * vedges; vedges = makeVertexEdges(graph); largestComponent(graph,vedges); saveSearchGraph(graph,"../data/postgis-graph.pbf"); }
/**
 * Initialize the libcsv parser embedded in @p csvin, wiring up the
 * project-specific whitespace/terminator predicates and the configured
 * field delimiter.  Exits the process if parser initialization fails.
 *
 * CSV_APPEND_NULL is only requested when the installed libcsv is new
 * enough (major version >= 3) to support it.
 */
void csv_init_parser(csv_file_t &csvin)
{
#if CSV_MAJOR >= 3
#define PARSER_OPTIONS CSV_APPEND_NULL
#else
#define PARSER_OPTIONS 0
#endif
    unsigned char parser_options = PARSER_OPTIONS;
/* BUG FIX: #undef the helper so the macro does not leak past this function. */
#undef PARSER_OPTIONS

    if (csv_init(&csvin.csv_file_parser, parser_options) != 0) {
        fprintf(stderr, "Failed to initialize csv parser\n");
        exit(EXIT_FAILURE);
    }
    csv_set_space_func(&csvin.csv_file_parser, csv_is_space);
    csv_set_term_func(&csvin.csv_file_parser, csv_is_term);
    csv_set_delim(&csvin.csv_file_parser, csvin.tab_delimter);
}/* ----- end of function csv_init_parser ----- */
void genericParse( /*std::istream &file,*/ std::string filename, field_cb_t cb_per_field, line_cb_t cb_per_line, void *data, const csv::params ¶ms ) { // Open the file typedef std::unique_ptr<std::FILE, int (*)(std::FILE *)> unique_file_ptr; unique_file_ptr file(fopen(filename.c_str(), "rb"), fclose); if (!file) { throw ParserError(std::string("File Opening Failed") + std::strerror(errno)); } struct csv_parser parser; if (!csv_init(&parser, 0)) { csv_set_opts(&parser, CSV_APPEND_NULL); csv_set_delim(&parser, params.getDelimiter()); int line_start = params.getLineStart(); if (line_start > 1) { int c; do { c = fgetc(file.get()); if ( c== '\n') --line_start; } while (c!= EOF && line_start > 1); } // 1GB Buffer size_t block_size; if (getenv("HYRISE_LOAD_BLOCK_SIZE")) block_size = strtoul(getenv("HYRISE_LOAD_BLOCK_SIZE"), nullptr, 0); else block_size = 1024 * 1024; // Read from the buffer size_t readBytes = 0; char rdbuf[block_size]; // Read the file until we cannot extract more bytes do { readBytes = fread(rdbuf, 1, block_size, file.get()); if (csv_parse(&parser, rdbuf, readBytes, cb_per_field, cb_per_line, data) != (size_t) readBytes) { throw ParserError(csv_strerror(csv_error(&parser))); } } while (readBytes == block_size); if (ferror(file.get())) { throw ParserError("Could not read file"); } csv_fini(&parser, cb_per_field, cb_per_line, data); } csv_free(&parser); }
/* An rb_rescue()-compatible Ruby pseudo-method that handles the actual parsing.
 *
 * ensure_container packs [options_hash, csv_io, metadata_ptr, parser_ptr];
 * the two pointers travel as Ruby integers and are unpacked via NUM2LONG.
 * Reads the IO in buffer_size chunks and feeds each chunk to libcsv, with
 * end_of_field_callback / end_of_line_callback building the Ruby result.
 * Returns Qnil; raises rcsv_parse_error on malformed input or option misuse.
 */
VALUE rcsv_raw_parse(VALUE ensure_container) {
  /* Unpacking multiple variables from a single Ruby VALUE */
  VALUE options = rb_ary_entry(ensure_container, 0);
  VALUE csvio = rb_ary_entry(ensure_container, 1);
  struct rcsv_metadata * meta = (struct rcsv_metadata *)NUM2LONG(rb_ary_entry(ensure_container, 2));
  struct csv_parser * cp = (struct csv_parser *)NUM2LONG(rb_ary_entry(ensure_container, 3));

  /* Helper temporary variables */
  VALUE option, csvstr, buffer_size;

  /* libcsv-related temporary variables */
  char * csv_string;
  size_t csv_string_len;
  int error;

  /* Generic iterator */
  size_t i = 0;

  /* IO buffer size can be controlled via an option */
  buffer_size = rb_hash_aref(options, ID2SYM(rb_intern("buffer_size")));

  /* By default, parse as Array of Arrays */
  option = rb_hash_aref(options, ID2SYM(rb_intern("row_as_hash")));
  if (option && (option != Qnil)) {
    meta->row_as_hash = true;
  }

  /* :col_sep sets the column separator, default is comma (,) */
  option = rb_hash_aref(options, ID2SYM(rb_intern("col_sep")));
  if (option != Qnil) {
    csv_set_delim(cp, (unsigned char)*StringValuePtr(option));
  }

  /* :quote_char sets the character used for quoting data; default is double-quote (") */
  option = rb_hash_aref(options, ID2SYM(rb_intern("quote_char")));
  if (option != Qnil) {
    csv_set_quote(cp, (unsigned char)*StringValuePtr(option));
  }

  /* Specify how many rows to skip from the beginning of CSV */
  option = rb_hash_aref(options, ID2SYM(rb_intern("offset_rows")));
  if (option != Qnil) {
    meta->offset_rows = (size_t)NUM2INT(option);
  }

  /* Specify the character encoding of the input data */
  option = rb_hash_aref(options, ID2SYM(rb_intern("output_encoding")));
  if (option && (option != Qnil)) {
    meta->encoding_index = RB_ENC_FIND_INDEX(StringValueCStr(option));
  }

  /* :only_rows is a list of values where row is only parsed if its fields match those in the passed array.
     [nil, nil, ["ABC", nil, 1]] skips all rows where 3rd column isn't equal to "ABC", nil or 1 */
  option = rb_hash_aref(options, ID2SYM(rb_intern("only_rows")));
  if (option != Qnil) {
    meta->num_only_rows = (size_t)RARRAY_LEN(option);
    meta->only_rows = (VALUE *)malloc(meta->num_only_rows * sizeof(VALUE));

    for (i = 0; i < meta->num_only_rows; i++) {
      VALUE only_row = rb_ary_entry(option, i);
      meta->only_rows[i] = validate_filter_row("only_rows", only_row);
    }
  }

  /* :except_rows is a list of values where row is only parsed if its fields don't match those in the passed array.
     [nil, nil, ["ABC", nil, 1]] skips all rows where 3rd column is equal to "ABC", nil or 1 */
  option = rb_hash_aref(options, ID2SYM(rb_intern("except_rows")));
  if (option != Qnil) {
    meta->num_except_rows = (size_t)RARRAY_LEN(option);
    meta->except_rows = (VALUE *)malloc(meta->num_except_rows * sizeof(VALUE));

    for (i = 0; i < meta->num_except_rows; i++) {
      VALUE except_row = rb_ary_entry(option, i);
      meta->except_rows[i] = validate_filter_row("except_rows", except_row);
    }
  }

  /* :row_defaults is an array of default values that are assigned to fields containing empty strings
     according to matching field positions */
  option = rb_hash_aref(options, ID2SYM(rb_intern("row_defaults")));
  if (option != Qnil) {
    meta->num_row_defaults = RARRAY_LEN(option);
    meta->row_defaults = (VALUE*)malloc(meta->num_row_defaults * sizeof(VALUE*));

    for (i = 0; i < meta->num_row_defaults; i++) {
      VALUE row_default = rb_ary_entry(option, i);
      meta->row_defaults[i] = row_default;
    }
  }

  /* :row_conversions specifies Ruby types that CSV field values should be converted into.
     Each char of row_conversions string represents Ruby type for CSV field with matching position. */
  option = rb_hash_aref(options, ID2SYM(rb_intern("row_conversions")));
  if (option != Qnil) {
    meta->num_row_conversions = RSTRING_LEN(option);
    meta->row_conversions = StringValuePtr(option);
  }

  /* Column names should be declared explicitly when parsing fields as Hashes */
  if (meta->row_as_hash) { /* Only matters for hash results */
    option = rb_hash_aref(options, ID2SYM(rb_intern("column_names")));
    if (option == Qnil) {
      rb_raise(rcsv_parse_error, ":row_as_hash requires :column_names to be set.");
    } else {
      meta->last_entry = rb_hash_new();

      meta->num_columns = (size_t)RARRAY_LEN(option);
      meta->column_names = (VALUE*)malloc(meta->num_columns * sizeof(VALUE*));

      for (i = 0; i < meta->num_columns; i++) {
        meta->column_names[i] = rb_ary_entry(option, i);
      }
    }
  } else {
    meta->last_entry = rb_ary_new();
  }

  while(true) {
    csvstr = rb_funcall(csvio, rb_intern("read"), 1, buffer_size);
    if ((csvstr == Qnil) || (RSTRING_LEN(csvstr) == 0)) { break; }

    csv_string = StringValuePtr(csvstr);
    /* BUG FIX: use the Ruby string's byte length instead of strlen(), which
       silently truncated the chunk at the first embedded NUL byte. */
    csv_string_len = (size_t)RSTRING_LEN(csvstr);

    /* Actual parsing and error handling */
    if (csv_string_len != csv_parse(cp, csv_string, csv_string_len,
                                    &end_of_field_callback, &end_of_line_callback, meta)) {
      error = csv_error(cp);
      switch(error) {
        case CSV_EPARSE:
          rb_raise(rcsv_parse_error, "Error when parsing malformed data");
          break;
        case CSV_ENOMEM:
          rb_raise(rcsv_parse_error, "No memory");
          break;
        case CSV_ETOOBIG:
          rb_raise(rcsv_parse_error, "Field data is too large");
          break;
        case CSV_EINVALID:
          rb_raise(rcsv_parse_error, "%s", (const char *)csv_strerror(error));
          break;
        default:
          rb_raise(rcsv_parse_error, "Failed due to unknown reason");
      }
    }
  }

  /* Flushing libcsv's buffer */
  csv_fini(cp, &end_of_field_callback, &end_of_line_callback, meta);

  return Qnil;
}
std::shared_ptr<storage::AbstractTable> RawTableLoader::load(std::shared_ptr<storage::AbstractTable> in, const storage::compound_metadata_list *ml, const Loader::params &args) { csv::params params; if (detectHeader(args.getBasePath() + _filename)) params.setLineStart(5); // Create the result table storage::metadata_vec_t v(in->columnCount()); for(size_t i=0; i < in->columnCount(); ++i) { v[i] = in->metadataAt(i); } auto result = std::make_shared<storage::RawTable>(v); // CSV Parsing std::ifstream file(args.getBasePath() + _filename, std::ios::binary); if (!file || file.bad()) { throw csv::ParserError("CSV file '" + _filename + "' does not exist"); } struct csv_parser parser; if (!csv_init(&parser, 0)) { csv_set_opts(&parser, CSV_APPEND_NULL); csv_set_delim(&parser, params.getDelimiter()); // If there is a header in the file, we will ignore it std::string line; int line_start = params.getLineStart(); if (line_start != 1) { while (line_start > 1) { std::getline(file, line); --line_start; } } // Prepare cb data handler struct raw_table_cb_data data(v); data.table = result; const size_t block_size = 16 * 1024; char rdbuf [block_size]; while (file.read(rdbuf, block_size).good()) { auto extracted = file.gcount(); if (extracted == 0) break; if (csv_parse(&parser, rdbuf, extracted, (field_cb_t) raw_table_cb_per_field, (line_cb_t) raw_table_cb_per_line, (void*) &data) != (size_t) extracted) { throw csv::ParserError(csv_strerror(csv_error(&parser))); } } // Parse the rest if (csv_parse(&parser, rdbuf, file.gcount(), (field_cb_t) raw_table_cb_per_field, (line_cb_t) raw_table_cb_per_line, (void*) &data) != (size_t) file.gcount()) { throw csv::ParserError(csv_strerror(csv_error(&parser))); } csv_fini(&parser, (field_cb_t) raw_table_cb_per_field, (line_cb_t) raw_table_cb_per_line, (void*) &data); } csv_free(&parser); return result; }
/*
 * Parse a CSV file described by JSON metadata through the readstat io
 * abstraction.  Runs as pass 1 (column discovery) when md->column_width is
 * NULL, otherwise as pass 2.  Returns READSTAT_OK or a readstat error code.
 */
readstat_error_t readstat_parse_csv(readstat_parser_t *parser, const char *path, const char *jsonpath, struct csv_metadata* md, void *user_ctx) {
    readstat_error_t retval = READSTAT_OK;
    readstat_io_t *io = parser->io;
    size_t file_size = 0;
    size_t bytes_read;
    struct csv_parser csvparser;
    struct csv_parser *p = &csvparser;
    char buf[BUFSIZ];
    size_t* column_width = md->column_width;
    int file_opened = 0;   /* tracks whether io->open succeeded */
    int parser_ready = 0;  /* tracks whether csv_init succeeded */

    md->pass = column_width ? 2 : 1;
    md->open_row = 0;
    md->columns = 0;
    md->_rows = md->rows;
    md->rows = 0;
    md->parser = parser;
    md->user_ctx = user_ctx;
    md->json_md = NULL;

    if ((md->json_md = get_json_metadata(jsonpath)) == NULL) {
        fprintf(stderr, "Could not get JSON metadata\n");
        retval = READSTAT_ERROR_PARSE;
        goto cleanup;
    }

    if (io->open(path, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_OPEN;
        goto cleanup;
    }
    file_opened = 1;

    file_size = io->seek(0, READSTAT_SEEK_END, io->io_ctx);
    if (file_size == (size_t)-1) {
        retval = READSTAT_ERROR_SEEK;
        goto cleanup;
    }
    if (io->seek(0, READSTAT_SEEK_SET, io->io_ctx) == -1) {
        retval = READSTAT_ERROR_SEEK;
        goto cleanup;
    }

    if (csv_init(p, CSV_APPEND_NULL) != 0) {
        retval = READSTAT_ERROR_OPEN;
        goto cleanup;
    }
    parser_ready = 1;

    csv_set_delim(p, get_separator(md->json_md));

    /* NOTE(review): assumes io->read returns 0 at EOF and never a negative
       value wrapped into size_t -- confirm against the io contract. */
    while ((bytes_read = io->read(buf, sizeof(buf), io->io_ctx)) > 0) {
        if (csv_parse(p, buf, bytes_read, csv_metadata_cell, csv_metadata_row, md) != bytes_read) {
            fprintf(stderr, "Error while parsing file: %s\n", csv_strerror(csv_error(p)));
            retval = READSTAT_ERROR_PARSE;
            goto cleanup;
        }
    }
    csv_fini(p, csv_metadata_cell, csv_metadata_row, md);

    /* Presumably compensates for a trailing newline producing one extra
       row count when no row is left open -- verify against the callbacks. */
    if (!md->open_row) {
        md->rows--;
    }

    if (parser->info_handler && md->pass == 1) {
        parser->info_handler(md->rows, md->_columns, user_ctx);
    }

cleanup:
    if (md->variables) {
        free(md->variables);
        md->variables = NULL;
    }
    if (md->is_date) {
        free(md->is_date);
        md->is_date = NULL;
    }
    if (md->json_md) {
        free_json_metadata(md->json_md);
        md->json_md = NULL;
    }
    /* BUG FIX: the original unconditionally called csv_free() and io->close()
       on every cleanup path, even when csv_init()/io->open() had failed or
       had not yet run (e.g. on JSON metadata failure). */
    if (parser_ready) {
        csv_free(p);
    }
    if (file_opened) {
        io->close(io->io_ctx);
    }
    return retval;
}