// Accumulates characters from pfr into psb until one of the tokens registered in ptrie is
// matched at the current read position.
//
// * IRS_STRIDX or IRSEOF_STRIDX: the token is consumed and the text accumulated so far is
//   returned via sb_finish(psb).
// * EOF_STRIDX: the token is consumed and NULL is returned. Note that sb_finish is not called
//   on this path, so whatever has been appended to psb is left in the builder.
// * Any other matched token is consumed without being appended, and scanning continues.
// * No match: a single character is read from pfr and appended to psb.
static char* read_line_pfr_psb(peek_file_reader_t* pfr, string_builder_t* psb, parse_trie_t* ptrie) {
	for (;;) {
		int token_id;
		int token_length;

		// Make sure enough lookahead is buffered for the longest token in the trie.
		pfr_buffer_by(pfr, ptrie->maxlen);

		int matched = parse_trie_ring_match(ptrie,
			pfr->peekbuf, pfr->sob, pfr->npeeked, pfr->peekbuflenmask,
			&token_id, &token_length);

		if (!matched) {
			// Ordinary data character: move it from the reader into the line buffer.
			sb_append_char(psb, pfr_read_char(pfr));
			continue;
		}

		pfr_advance_by(pfr, token_length);

		if (token_id == EOF_STRIDX)
			return NULL;
		if (token_id == IRS_STRIDX || token_id == IRSEOF_STRIDX)
			return sb_finish(psb);
		// Unrecognized token: already skipped past; keep scanning.
	}
}
// Finishes the field currently accumulated in pstate->psb for a record that ended at a line
// terminator, and returns it.
//
// The line-ending '\n' is not part of the field buffer. When pstate->do_auto_line_term is set:
// if the field's last character is '\r', that character is overwritten with a NUL and CRLF is
// recorded as autodetected in pctx; otherwise LF is recorded as autodetected.
static char* lrec_reader_stdio_csv_finish_line_ending_field(lrec_reader_stdio_csv_state_t* pstate, context_t* pctx) {
	int field_length = 0;
	char* field = sb_finish_with_length(pstate->psb, &field_length);

	if (pstate->do_auto_line_term) {
		if (field_length > 0 && field[field_length-1] == '\r') {
			field[field_length-1] = 0;
			context_set_autodetected_crlf(pctx);
		} else {
			context_set_autodetected_lf(pctx);
		}
	}

	return field;
}

// Reads the fields of one CSV record from pstate->pfr and appends them to pfields.
//
// pstate:    reader state; supplies the peek-file reader, the string builder, the double-quote
//            string and the parse tries used for tokenizing.
// pfields:   list which receives one entry per field. Entries from double-quoted fields are
//            appended with FIELD_QUOTED_ON_INPUT; others with flags 0.
// pctx:      context which receives the autodetected line-terminator (LF vs. CRLF) when
//            pstate->do_auto_line_term is set.
// is_header: when true, a UTF-8 BOM at the current read position is skipped first.
//
// Returns FALSE, without appending anything, if the first peeked character compares equal to
// (char)EOF; returns TRUE once a record has been read.
//
// Writes a message to stderr and calls exit(1) on: a record-ending field separator, a double
// quote inside an unquoted field, an unmatched double quote, or an unexpected parser token.
static int lrec_reader_stdio_csv_get_fields(lrec_reader_stdio_csv_state_t* pstate, rslls_t* pfields, context_t* pctx, int is_header) {
	// stridx and matchlen are initialized so that the DEBUG_PARSER print in the BOM check never
	// reads indeterminate values, in case the matcher leaves them untouched when nothing matches.
	int rc = 0;
	int stridx = 0;
	int matchlen = 0;
	int record_done;
	int field_done;
	peek_file_reader_t* pfr = pstate->pfr;
	string_builder_t* psb = pstate->psb;

	if (pfr_peek_char(pfr) == (char)EOF) // char defaults to unsigned on some platforms
		return FALSE;

	// Strip the UTF-8 BOM, if any. This is MUCH simpler for mmap, and for stdio on files. For mmap
	// we can test the first 3 bytes, then skip past them or not. For stdio on files we can fread
	// the first 3 bytes, then rewind the fp if they're not the UTF-8 BOM. But for stdio on stdin
	// (which is the primary reason we support stdio in Miller), we cannot rewind: stdin is not
	// rewindable.
	if (is_header) {
		pfr_buffer_by(pfr, UTF8_BOM_LENGTH);
		rc = parse_trie_ring_match(pstate->putf8_bom_parse_trie,
			pfr->peekbuf, pfr->sob, pfr->npeeked, pfr->peekbuflenmask,
			&stridx, &matchlen);
#ifdef DEBUG_PARSER
		printf("RC=%d stridx=0x%04x matchlen=%d\n", rc, stridx, matchlen);
#endif
		if (rc == TRUE && stridx == UTF8_BOM_STRIDX) {
			pfr_advance_by(pfr, matchlen);
		}
	}

	// Loop over fields in record
	record_done = FALSE;
	while (!record_done) {
		// Assumption is dquote is "\""
		if (pfr_peek_char(pfr) != pstate->dquote[0]) { // NOT DOUBLE-QUOTED

			// Loop over characters in field
			field_done = FALSE;
			while (!field_done) {
				pfr_buffer_by(pfr, pstate->pno_dquote_parse_trie->maxlen);

				rc = parse_trie_ring_match(pstate->pno_dquote_parse_trie,
					pfr->peekbuf, pfr->sob, pfr->npeeked, pfr->peekbuflenmask,
					&stridx, &matchlen);
#ifdef DEBUG_PARSER
				pfr_print(pfr);
#endif
				if (rc) {
#ifdef DEBUG_PARSER
					printf("RC=%d stridx=0x%04x matchlen=%d\n", rc, stridx, matchlen);
#endif
					switch(stridx) {
					case EOF_STRIDX: // end of record
						rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, 0);
						field_done = TRUE;
						record_done = TRUE;
						break;
					case IFS_EOF_STRIDX:
						fprintf(stderr, "%s: syntax error: record-ending field separator at line %lld.\n",
							MLR_GLOBALS.bargv0, pstate->ilno);
						exit(1);
						break;
					case IFS_STRIDX: // end of field
						rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, 0);
						field_done = TRUE;
						break;
					case IRS_STRIDX: // end of record
						rslls_append(pfields,
							lrec_reader_stdio_csv_finish_line_ending_field(pstate, pctx),
							FREE_ENTRY_VALUE, 0);
						field_done = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_STRIDX: // CSV syntax error: fields containing quotes must be fully wrapped in quotes
						fprintf(stderr, "%s: syntax error: unwrapped double quote at line %lld.\n",
							MLR_GLOBALS.bargv0, pstate->ilno);
						exit(1);
						break;
					default:
						fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
							MLR_GLOBALS.bargv0, stridx, pstate->ilno);
						exit(1);
						break;
					}
					// The matched token is consumed only after the field has been finished.
					pfr_advance_by(pfr, matchlen);
				} else {
#ifdef DEBUG_PARSER
					char c = pfr_read_char(pfr);
					// Go through unsigned char so bytes >= 0x80 print as two hex digits rather
					// than being sign-extended where char is signed.
					printf("CHAR=%c [%02x]\n", isprint((unsigned char)c) ? c : ' ', (unsigned)(unsigned char)c);
					sb_append_char(psb, c);
#else
					sb_append_char(psb, pfr_read_char(pfr));
#endif
				}
			}

		} else { // DOUBLE-QUOTED
			// Skip the opening double quote.
			pfr_advance_by(pfr, pstate->dquotelen);

			// Loop over characters in field
			field_done = FALSE;
			while (!field_done) {
				pfr_buffer_by(pfr, pstate->pdquote_parse_trie->maxlen);

				rc = parse_trie_ring_match(pstate->pdquote_parse_trie,
					pfr->peekbuf, pfr->sob, pfr->npeeked, pfr->peekbuflenmask,
					&stridx, &matchlen);

				if (rc) {
					switch(stridx) {
					case EOF_STRIDX: // end of input inside an open quoted field
						fprintf(stderr, "%s: unmatched double quote at line %lld.\n",
							MLR_GLOBALS.bargv0, pstate->ilno);
						exit(1);
						break;
					case DQUOTE_EOF_STRIDX: // end of record
						rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						field_done = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_IFS_STRIDX: // end of field
						rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						field_done = TRUE;
						break;
					case DQUOTE_IRS_STRIDX: // end of record
					case DQUOTE_IRS2_STRIDX: // end of record
						rslls_append(pfields,
							lrec_reader_stdio_csv_finish_line_ending_field(pstate, pctx),
							FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						field_done = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_DQUOTE_STRIDX: // RFC-4180 CSV: "" inside a dquoted field is an escape for "
						sb_append_char(psb, pstate->dquote[0]);
						break;
					default:
						fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
							MLR_GLOBALS.bargv0, stridx, pstate->ilno);
						exit(1);
						break;
					}
					// The matched token is consumed only after it has been handled.
					pfr_advance_by(pfr, matchlen);
				} else {
					sb_append_char(psb, pfr_read_char(pfr));
				}
			}

		}
	}

	return TRUE;
}