Example #1
0
// Accumulates characters from pfr into psb until one of the tokens known to ptrie is matched
// at the current read position.
//
// * On an IRS_STRIDX or IRSEOF_STRIDX match, the token is consumed and the result of
//   sb_finish(psb) is returned.
// * On an EOF_STRIDX match, the token is consumed and NULL is returned; sb_finish is not
//   called on this path.
// * Any other matched token is consumed and scanning continues.
// * When nothing matches, a single character is moved from pfr to psb.
static char* read_line_pfr_psb(peek_file_reader_t* pfr, string_builder_t* psb, parse_trie_t* ptrie) {
	int token, token_length;

	for (;;) {
		// Make sure enough lookahead is buffered for the longest token in the trie.
		pfr_buffer_by(pfr, ptrie->maxlen);

		int matched = parse_trie_ring_match(ptrie, pfr->peekbuf, pfr->sob, pfr->npeeked,
			pfr->peekbuflenmask, &token, &token_length);

		if (!matched) {
			// Ordinary data character: copy it through and keep scanning.
			sb_append_char(psb, pfr_read_char(pfr));
			continue;
		}

		// Step past the matched token before deciding what it means.
		pfr_advance_by(pfr, token_length);

		if (token == IRS_STRIDX || token == IRSEOF_STRIDX)
			return sb_finish(psb);
		if (token == EOF_STRIDX)
			return NULL;
	}
}
Example #2
0
// Reads one CSV record from pstate->pfr and appends each of its fields to pfields.
//
// Parameters:
// * pstate:    reader state; supplies the peek-file reader, the string builder, the parse tries,
//              the double-quote string and its length, the auto-line-termination flag, and the
//              line number used in error messages.
// * pfields:   list receiving one entry per field. Every entry is appended with FREE_ENTRY_VALUE;
//              fields that began with the double-quote character are additionally flagged
//              FIELD_QUOTED_ON_INPUT.
// * pctx:      context notified of the autodetected line ending (only when
//              pstate->do_auto_line_term is set).
// * is_header: when true, a UTF-8 BOM at the current read position is skipped if present.
//
// Returns FALSE if the first peeked character compares equal to (char)EOF, in which case nothing
// is appended; otherwise returns TRUE once a record-ending token has been consumed.
//
// Syntax errors (field separator immediately followed by end of input, a double quote inside an
// unquoted field, an unterminated quoted field) and unexpected tokens print a message to stderr
// and terminate the process with exit(1).
static int lrec_reader_stdio_csv_get_fields(lrec_reader_stdio_csv_state_t* pstate, rslls_t* pfields,
	context_t* pctx, int is_header)
{
	// These locals are shared by the BOM check and by both field-scanning branches below; they
	// are deliberately declared once here rather than re-declared (shadowed) in inner scopes.
	int rc, stridx, matchlen, record_done, field_done;
	peek_file_reader_t* pfr = pstate->pfr;
	string_builder_t*   psb = pstate->psb;
	char* field = NULL;
	int field_length = 0;

	if (pfr_peek_char(pfr) == (char)EOF) // char defaults to unsigned on some platforms
		return FALSE;

	// Strip the UTF-8 BOM, if any. This is MUCH simpler for mmap, and for stdio on files.  For mmap
	// we can test the first 3 bytes, then skip past them or not. For stdio on files we can fread
	// the first 3 bytes, then rewind the fp if they're not the UTF-8 BOM. But for stdio on stdin
	// (which is the primary reason we support stdio in Miller), we cannot rewind: stdin is not
	// rewindable.
	if (is_header) {
		pfr_buffer_by(pfr, UTF8_BOM_LENGTH);
		rc = parse_trie_ring_match(pstate->putf8_bom_parse_trie,
			pfr->peekbuf, pfr->sob, pfr->npeeked, pfr->peekbuflenmask,
			&stridx, &matchlen);
#ifdef DEBUG_PARSER
		printf("RC=%d stridx=0x%04x matchlen=%d\n", rc, stridx, matchlen);
#endif
		if (rc == TRUE && stridx == UTF8_BOM_STRIDX) {
			pfr_advance_by(pfr, matchlen);
		}
	}

	// Loop over fields in record
	record_done = FALSE;
	while (!record_done) {
		// Assumption is dquote is "\""
		if (pfr_peek_char(pfr) != pstate->dquote[0]) { // NOT DOUBLE-QUOTED

			// Loop over characters in field
			field_done = FALSE;
			while (!field_done) {
				// Buffer enough lookahead for the longest token in this trie.
				pfr_buffer_by(pfr, pstate->pno_dquote_parse_trie->maxlen);

				rc = parse_trie_ring_match(pstate->pno_dquote_parse_trie,
					pfr->peekbuf, pfr->sob, pfr->npeeked, pfr->peekbuflenmask,
					&stridx, &matchlen);
#ifdef DEBUG_PARSER
				pfr_print(pfr);
#endif
				if (rc) {
#ifdef DEBUG_PARSER
					printf("RC=%d stridx=0x%04x matchlen=%d\n", rc, stridx, matchlen);
#endif
					switch(stridx) {
					case EOF_STRIDX: // end of record
						rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, 0);
						field_done  = TRUE;
						record_done = TRUE;
						break;
					case IFS_EOF_STRIDX:
						fprintf(stderr, "%s: syntax error: record-ending field separator at line %lld.\n",
							MLR_GLOBALS.bargv0, pstate->ilno);
						exit(1);
						break;
					case IFS_STRIDX: // end of field
						rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, 0);
						field_done  = TRUE;
						break;
					case IRS_STRIDX: // end of record
						field = sb_finish_with_length(psb, &field_length);

						// The line-ending '\n' won't be included in the field buffer.
						// A trailing '\r' therefore indicates CRLF: drop it and record that.
						if (pstate->do_auto_line_term) {
							if (field_length > 0 && field[field_length-1] == '\r') {
								field[field_length-1] = 0;
								context_set_autodetected_crlf(pctx);
							} else {
								context_set_autodetected_lf(pctx);
							}
						}

						rslls_append(pfields, field, FREE_ENTRY_VALUE, 0);
						field_done  = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_STRIDX: // CSV syntax error: fields containing quotes must be fully wrapped in quotes
						fprintf(stderr, "%s: syntax error: unwrapped double quote at line %lld.\n",
							MLR_GLOBALS.bargv0, pstate->ilno);
						exit(1);
						break;
					default:
						fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
							MLR_GLOBALS.bargv0, stridx, pstate->ilno);
						exit(1);
						break;
					}
					// Consume the matched token only after it has been handled.
					pfr_advance_by(pfr, matchlen);
				} else {
#ifdef DEBUG_PARSER
					char c = pfr_read_char(pfr);
					// Go through unsigned char so that bytes >= 0x80 are not sign-extended
					// (which would print as ffffffXX) on platforms where char is signed.
					printf("CHAR=%c [%02x]\n", isprint((unsigned char)c) ? c : ' ',
						(unsigned)(unsigned char)c);
					sb_append_char(psb, c);
#else
					sb_append_char(psb, pfr_read_char(pfr));
#endif
				}
			}

		} else { // DOUBLE-QUOTED
			// Skip the opening double quote.
			pfr_advance_by(pfr, pstate->dquotelen);

			// loop over characters in field
			field_done = FALSE;
			while (!field_done) {
				// Buffer enough lookahead for the longest token in this trie.
				pfr_buffer_by(pfr, pstate->pdquote_parse_trie->maxlen);

				rc = parse_trie_ring_match(pstate->pdquote_parse_trie,
					pfr->peekbuf, pfr->sob, pfr->npeeked, pfr->peekbuflenmask,
					&stridx, &matchlen);

				if (rc) {
					switch(stridx) {
					case EOF_STRIDX: // end of record
						fprintf(stderr, "%s: unmatched double quote at line %lld.\n",
							MLR_GLOBALS.bargv0, pstate->ilno);
						exit(1);
						break;
					case DQUOTE_EOF_STRIDX: // end of record
						rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						field_done  = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_IFS_STRIDX: // end of field
						rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						field_done  = TRUE;
						break;
					case DQUOTE_IRS_STRIDX: // end of record
					case DQUOTE_IRS2_STRIDX: // end of record

						field = sb_finish_with_length(psb, &field_length);

						// The line-ending '\n' won't be included in the field buffer.
						// Same trailing-'\r' handling as for unquoted fields.
						if (pstate->do_auto_line_term) {
							if (field_length > 0 && field[field_length-1] == '\r') {
								field[field_length-1] = 0;
								context_set_autodetected_crlf(pctx);
							} else {
								context_set_autodetected_lf(pctx);
							}
						}

						rslls_append(pfields, field, FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						field_done  = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_DQUOTE_STRIDX: // RFC-4180 CSV: "" inside a dquoted field is an escape for "
						sb_append_char(psb, pstate->dquote[0]);
						break;
					default:
						fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
							MLR_GLOBALS.bargv0, stridx, pstate->ilno);
						exit(1);
						break;
					}
					// Consume the matched token only after it has been handled.
					pfr_advance_by(pfr, matchlen);
				} else {
					sb_append_char(psb, pfr_read_char(pfr));
				}
			}

		}
	}

	return TRUE;
}