lrec_reader_t* lrec_reader_alloc(char* fmtdesc, int use_mmap, char irs, char ifs, int allow_repeat_ifs, char ips, int allow_repeat_ips) { // xxx refactor for https://github.com/johnkerl/miller/issues/51 et al. byte_reader_t* pbr = use_mmap ? mmap_byte_reader_alloc() : stdio_byte_reader_alloc(); if (streq(fmtdesc, "dkvp")) { if (use_mmap) return lrec_reader_mmap_dkvp_alloc(irs, ifs, ips, allow_repeat_ifs); else return lrec_reader_stdio_dkvp_alloc(irs, ifs, ips, allow_repeat_ifs); } else if (streq(fmtdesc, "csv")) { return lrec_reader_csv_alloc(pbr, irs, ifs); } else if (streq(fmtdesc, "csvlite")) { if (use_mmap) return lrec_reader_mmap_csvlite_alloc(irs, ifs, allow_repeat_ifs); else return lrec_reader_stdio_csvlite_alloc(irs, ifs, allow_repeat_ifs); } else if (streq(fmtdesc, "nidx")) { if (use_mmap) return lrec_reader_mmap_nidx_alloc(irs, ifs, allow_repeat_ifs); else return lrec_reader_stdio_nidx_alloc(irs, ifs, allow_repeat_ifs); } else if (streq(fmtdesc, "xtab")) { if (use_mmap) return lrec_reader_mmap_xtab_alloc(irs, ips, TRUE/*allow_repeat_ips*/); else return lrec_reader_stdio_xtab_alloc(ips, TRUE); // xxx parameterize allow_repeat_ips } else { return NULL; } }
static int read_file_pfr_psb(char* filename, int do_write) { byte_reader_t* pbr = stdio_byte_reader_alloc(); string_builder_t* psb = sb_alloc(STRING_BUILDER_INIT_SIZE); pbr->popen_func(pbr, NULL, filename); peek_file_reader_t* pfr = pfr_alloc(pbr, PEEK_BUF_LEN); parse_trie_t* ptrie = parse_trie_alloc(); parse_trie_add_string(ptrie, "\n", IRS_STRIDX); parse_trie_add_string(ptrie, "\xff", EOF_STRIDX); parse_trie_add_string(ptrie, "\n\xff", IRSEOF_STRIDX); int bc = 0; while (TRUE) { char* line = read_line_pfr_psb(pfr, psb, ptrie); if (line == NULL) break; if (do_write) { fputs(line, stdout); fputc('\n', stdout); } bc += strlen(line); free(line); } sb_free(psb); pbr->pclose_func(pbr, NULL); return bc; }
lrec_reader_t* lrec_reader_alloc(char* fmtdesc, int use_mmap, char* irs, char* ifs, int allow_repeat_ifs, char* ips, int allow_repeat_ips) { // Only for RFC-CSV reader: see https://github.com/johnkerl/miller/issues/51 et al. byte_reader_t* pbr = use_mmap ? mmap_byte_reader_alloc() : stdio_byte_reader_alloc(); if (streq(fmtdesc, "dkvp")) { if (use_mmap) return lrec_reader_mmap_dkvp_alloc(irs, ifs, ips, allow_repeat_ifs); else return lrec_reader_stdio_dkvp_alloc(irs, ifs, ips, allow_repeat_ifs); } else if (streq(fmtdesc, "csv")) { return lrec_reader_csv_alloc(pbr, irs, ifs); } else if (streq(fmtdesc, "csvlite")) { if (use_mmap) return lrec_reader_mmap_csvlite_alloc(irs, ifs, allow_repeat_ifs); else return lrec_reader_stdio_csvlite_alloc(irs, ifs, allow_repeat_ifs); } else if (streq(fmtdesc, "nidx")) { if (use_mmap) return lrec_reader_mmap_nidx_alloc(irs, ifs, allow_repeat_ifs); else return lrec_reader_stdio_nidx_alloc(irs, ifs, allow_repeat_ifs); } else if (streq(fmtdesc, "xtab")) { if (strlen(ips) != 1) { fprintf(stderr, "%s: IPS for XTAB format must be single-character; got \"%s\".\n", MLR_GLOBALS.argv0, ips); return NULL; } if (use_mmap) return lrec_reader_mmap_xtab_alloc(ifs, ips[0], allow_repeat_ips); else return lrec_reader_stdio_xtab_alloc(ifs, ips[0], allow_repeat_ips); } else { return NULL; } }
// ---------------------------------------------------------------- lrec_reader_t* lrec_reader_stdio_csv_alloc(char* irs, char* ifs, int use_implicit_header, comment_handling_t comment_handling, char* comment_string) { lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t)); lrec_reader_stdio_csv_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_stdio_csv_state_t)); pstate->ilno = 0LL; pstate->do_auto_line_term = FALSE; if (streq(irs, "auto")) { irs = "\n"; pstate->do_auto_line_term = TRUE; } pstate->comment_handling = comment_handling; pstate->comment_string = comment_string; pstate->comment_string_length = comment_string == NULL ? 0 : strlen(comment_string); pstate->eof = "\xff"; pstate->irs = irs; pstate->ifs = ifs; pstate->ifs_eof = mlr_paste_2_strings(pstate->ifs, "\xff"); pstate->dquote = "\""; pstate->dquote_ifs = mlr_paste_2_strings("\"", pstate->ifs); pstate->dquote_eof = "\"\xff"; pstate->dquote_dquote = "\"\""; pstate->dquotelen = strlen(pstate->dquote); // Parse trie for UTF-8 BOM pstate->putf8_bom_parse_trie = parse_trie_alloc(); parse_trie_add_string(pstate->putf8_bom_parse_trie, UTF8_BOM, UTF8_BOM_STRIDX); // Parse trie for non-double-quoted fields pstate->pno_dquote_parse_trie = parse_trie_alloc(); parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->eof, EOF_STRIDX); parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->irs, IRS_STRIDX); parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->ifs_eof, IFS_EOF_STRIDX); parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->ifs, IFS_STRIDX); parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->dquote, DQUOTE_STRIDX); // Parse trie for double-quoted fields pstate->pdquote_parse_trie = parse_trie_alloc(); if (pstate->do_auto_line_term) { pstate->dquote_irs = mlr_paste_2_strings("\"", "\n"); pstate->dquote_irs2 = mlr_paste_2_strings("\"", "\r\n"); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_irs, DQUOTE_IRS_STRIDX); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_irs2, DQUOTE_IRS2_STRIDX); } else { pstate->dquote_irs = mlr_paste_2_strings("\"", pstate->irs); pstate->dquote_irs2 = NULL; parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_irs, DQUOTE_IRS_STRIDX); } parse_trie_add_string(pstate->pdquote_parse_trie, pstate->eof, EOF_STRIDX); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_irs, DQUOTE_IRS_STRIDX); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_ifs, DQUOTE_IFS_STRIDX); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_eof, DQUOTE_EOF_STRIDX); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_dquote, DQUOTE_DQUOTE_STRIDX); pstate->pfields = rslls_alloc(); pstate->psb = sb_alloc(STRING_BUILDER_INIT_SIZE); pstate->pbr = stdio_byte_reader_alloc(); pstate->pfr = pfr_alloc(pstate->pbr, mlr_imax3( pstate->putf8_bom_parse_trie->maxlen, pstate->pno_dquote_parse_trie->maxlen, pstate->pdquote_parse_trie->maxlen)); pstate->expect_header_line_next = use_implicit_header ? FALSE : TRUE; pstate->use_implicit_header = use_implicit_header; pstate->pheader_keeper = NULL; pstate->pheader_keepers = lhmslv_alloc(); plrec_reader->pvstate = (void*)pstate; plrec_reader->popen_func = lrec_reader_stdio_csv_open; plrec_reader->pclose_func = lrec_reader_stdio_csv_close; plrec_reader->pprocess_func = lrec_reader_stdio_csv_process; plrec_reader->psof_func = lrec_reader_stdio_csv_sof; plrec_reader->pfree_func = lrec_reader_stdio_csv_free; return plrec_reader; }