/*
 * Finish the input line that the tokenizer has just reached the end of.
 *
 *   - Header mode (header != 0): only the first line is wanted, so step past
 *     the current character and stop with NO_ERROR.
 *   - Row shorter than num_cols with fill_extra_cols set: keep appending
 *     placeholder fields ('\x01') until col reaches num_cols.
 *   - Row shorter than num_cols otherwise: stop with NOT_ENOUGH_COLS.
 *
 * For a completed data row, num_rows is incremented and *old_state is reset
 * to START_LINE.  If `end` is not -1 and exactly `end` rows have now been
 * read, step past the current character and stop with NO_ERROR.
 *
 * Returns -1 when tokenizing should simply continue.  Every other exit goes
 * through the RETURN macro, which is defined elsewhere in this file.
 *
 * NOTE(review): PUSH and END_FIELD are macros defined elsewhere; they
 * presumably use the local names `self` and `col`, and the padding loop
 * only terminates if END_FIELD advances `col` -- confirm against the macro
 * definitions before renaming anything here.
 */
static inline int end_line(tokenizer_t *self, int col, int header, int end,
                           tokenizer_state *old_state)
{
    /* Header mode: nothing beyond the first line is tokenized. */
    if (header) {
        self->source_pos += 1;
        RETURN(NO_ERROR);
    }

    if (self->fill_extra_cols) {
        /* Pad a short row out to the expected width. */
        while (col < self->num_cols) {
            PUSH('\x01');
            END_FIELD();
        }
    } else if (col < self->num_cols) {
        RETURN(NOT_ENOUGH_COLS);
    }

    /* The row is complete; the next character starts a fresh line. */
    *old_state = START_LINE;
    self->num_rows += 1;

    /* Stop early once the requested number of rows has been read. */
    if (end != -1 && self->num_rows == end) {
        self->source_pos += 1;
        RETURN(NO_ERROR);
    }

    return -1;
}
/*
 * Tokenize the unread bytes of self->data (indices self->datapos up to, but
 * not including, self->datalen), separating fields on runs of whitespace
 * instead of on a delimiter character.
 *
 * The function is a character-driven state machine over self->state.
 * Characters are stored with PUSH_CHAR, and fields and lines are closed with
 * END_FIELD, END_LINE, END_LINE_STATE and END_LINE_AND_FIELD_STATE; the
 * final bookkeeping is done by _TOKEN_CLEANUP.  All of these are macros
 * defined elsewhere in this file.
 *
 * NOTE(review): those macros presumably reference the locals `stream`,
 * `slen`, `i` and `start_lines`, the `line_limit` parameter, and the
 * `parsingerror` / `linelimit` labels (none of the latter three is used
 * directly in this body) -- confirm before renaming or removing any of them.
 *
 * Returns 0 when the input was consumed or the `linelimit` label was
 * reached, and -1 on error.  On the two error paths handled directly in
 * this body, self->error_msg is set to a heap-allocated message, or to NULL
 * if that small allocation itself failed.
 */
int tokenize_whitespace(parser_t *self, size_t line_limit)
{
    int i, slen, start_lines;
    char c;
    char *stream;
    char *buf = self->data + self->datapos;

    start_lines = self->lines;

    if (make_stream_space(self, self->datalen - self->datapos) < 0) {
        /*
         * The message is heap-allocated, like the other message set further
         * down in this function, so that whoever owns error_msg can release
         * both in the same way (a string literal must never be freed).
         */
        static const char oom_text[] = "out of memory";

        self->error_msg = (char *) malloc(sizeof oom_text);
        if (self->error_msg != NULL) {
            snprintf(self->error_msg, sizeof oom_text, "%s", oom_text);
        }
        return -1;
    }

    stream = self->stream + self->stream_len;
    slen = self->stream_len;

    TRACE(("%s\n", buf));

    for (i = self->datapos; i < self->datalen; ++i) {
        // Next character in file
        c = *buf++;

        TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
               i, c, self->file_lines + 1, self->line_fields[self->lines],
               self->state));

        switch (self->state) {

        case EAT_WHITESPACE:
            if (!IS_WHITESPACE(c)) {
                // First non-whitespace character: a field is expected next
                self->state = START_FIELD;
            } else {
                // if whitespace char, keep slurping
                break;
            }
            /* fallthrough: the character is examined again below */

        case START_RECORD:
            // start of record
            if (c == '\n') {
                // \n\r possible?
                END_LINE();
                break;
            } else if (c == '\r') {
                self->state = EAT_CRNL;
                break;
            } else if (IS_WHITESPACE(c)) {
                END_FIELD();
                self->state = EAT_WHITESPACE;
                break;
            } else {
                /* normal character - handle as START_FIELD */
                self->state = START_FIELD;
            }
            /* fallthrough */

        case START_FIELD:
            /* expecting field */
            if (c == '\n') {
                END_FIELD();
                END_LINE();
            } else if (c == '\r') {
                END_FIELD();
                self->state = EAT_CRNL;
            } else if (c == self->quotechar &&
                       self->quoting != QUOTE_NONE) {
                /* start quoted field */
                self->state = IN_QUOTED_FIELD;
            } else if (c == self->escapechar) {
                /* possible escaped character */
                self->state = ESCAPED_CHAR;
            } else if (IS_WHITESPACE(c)) {
                self->state = EAT_WHITESPACE;
            } else if (c == self->commentchar) {
                END_FIELD();
                self->state = EAT_COMMENT;
            } else {
                /* begin new unquoted field */
                if (self->quoting == QUOTE_NONNUMERIC) {
                    self->numeric_field = 1;
                }
                PUSH_CHAR(c);
                self->state = IN_FIELD;
            }
            break;

        case ESCAPED_CHAR:
            /* the character after the escape character is stored as-is */
            PUSH_CHAR(c);
            self->state = IN_FIELD;
            break;

        case IN_FIELD:
            /* in unquoted field */
            if (c == '\n') {
                END_FIELD();
                END_LINE();
            } else if (c == '\r') {
                END_FIELD();
                self->state = EAT_CRNL;
            } else if (c == self->escapechar) {
                /* possible escaped character */
                self->state = ESCAPED_CHAR;
            } else if (IS_WHITESPACE(c)) {
                // End of field. End of line not reached yet
                END_FIELD();
                self->state = EAT_WHITESPACE;
            } else if (c == self->commentchar) {
                END_FIELD();
                self->state = EAT_COMMENT;
            } else {
                /* normal character - save in field */
                PUSH_CHAR(c);
            }
            break;

        case IN_QUOTED_FIELD:
            /* in quoted field */
            if (c == self->escapechar) {
                /* Possible escape character */
                self->state = ESCAPE_IN_QUOTED_FIELD;
            } else if (c == self->quotechar &&
                       self->quoting != QUOTE_NONE) {
                if (self->doublequote) {
                    /* doublequote; " represented by "" */
                    self->state = QUOTE_IN_QUOTED_FIELD;
                } else {
                    /* end of quote part of field */
                    self->state = IN_FIELD;
                }
            } else {
                /* normal character - save in field */
                PUSH_CHAR(c);
            }
            break;

        case ESCAPE_IN_QUOTED_FIELD:
            /* the character after the escape character is stored as-is */
            PUSH_CHAR(c);
            self->state = IN_QUOTED_FIELD;
            break;

        case QUOTE_IN_QUOTED_FIELD:
            /* doublequote - seen a quote in a quoted field */
            if (self->quoting != QUOTE_NONE && c == self->quotechar) {
                /* save "" as " */
                PUSH_CHAR(c);
                self->state = IN_QUOTED_FIELD;
            } else if (IS_WHITESPACE(c)) {
                // End of field. End of line not reached yet
                END_FIELD();
                self->state = EAT_WHITESPACE;
            } else if (c == '\n') {
                END_FIELD();
                END_LINE();
            } else if (c == '\r') {
                END_FIELD();
                self->state = EAT_CRNL;
            } else if (!self->strict) {
                PUSH_CHAR(c);
                self->state = IN_FIELD;
            } else {
                /*
                 * Strict mode: any other character right after a closing
                 * quote is an error.  The allocation is checked so that a
                 * failed malloc leaves error_msg NULL instead of being
                 * written through, and the write is bounded by the buffer
                 * size.
                 */
                const size_t msg_cap = 50;

                self->error_msg = (char *) malloc(msg_cap);
                if (self->error_msg != NULL) {
                    snprintf(self->error_msg, msg_cap,
                             "'%c' expected after '%c'",
                             self->delimiter, self->quotechar);
                }
                goto parsingerror;
            }
            break;

        case EAT_CRNL:
            if (c == '\n') {
                END_LINE();
            } else if (IS_WHITESPACE(c)) {
                // Handle \r-delimited files
                END_LINE_AND_FIELD_STATE(EAT_WHITESPACE);
            } else {
                /* a bare \r ended the line; c is stored before the line is
                   closed, exactly as in the original statement order */
                PUSH_CHAR(c);
                END_LINE_STATE(IN_FIELD);
            }
            break;

        case EAT_COMMENT:
            /* skip everything up to the end of the line */
            if (c == '\n') {
                END_LINE();
            } else if (c == '\r') {
                self->state = EAT_CRNL;
            }
            break;

        default:
            break;
        }
    }

    _TOKEN_CLEANUP();

    TRACE(("Finished tokenizing input\n"));

    return 0;

parsingerror:
    /* count the offending character as consumed before cleaning up */
    i++;
    _TOKEN_CLEANUP();

    return -1;

linelimit:
    /* not jumped to directly from this body; presumably the target of the
       END_LINE* macros once the line limit is reached */
    i++;
    _TOKEN_CLEANUP();

    return 0;
}
/*
 * Tokenize self->source into per-column output buffers.
 *
 *   end      - 0 means "allocate the buffers but read nothing".  Any other
 *              value is not interpreted in this body; it is presumably
 *              consumed by the END_LINE macro (defined elsewhere).
 *   header   - non-zero: everything is stored in a single output column and
 *              comment text is not recorded.
 *   num_cols - number of output columns when header is zero.
 *
 * Input handling visible in this body: '\r' is treated exactly like '\n',
 * and one synthetic '\n' is processed when source_pos reaches source_len so
 * that a final line without a terminator is still closed.
 *
 * The function is a state machine over self->state.  PUSH, BEGIN_FIELD,
 * END_FIELD, END_LINE and RETURN are macros defined elsewhere in this file.
 *
 * NOTE(review): those macros presumably use the locals `col`, `whitespace`,
 * `old_state` and `parse_newline` as well as the parameters `self`, `end`
 * and `header` -- confirm before renaming or removing any of them.
 *
 * NOTE(review): the results of malloc/calloc below are not checked; no
 * error code for allocation failure is visible in this part of the file.
 *
 * All exits go through RETURN; a normal run to the end of input uses
 * RETURN(0).
 */
int tokenize(tokenizer_t *self, int end, int header, int num_cols)
{
    char c;                                 // input character
    int col = 0;                            // current column ignoring possibly excluded columns
    tokenizer_state old_state = START_LINE; // last state the tokenizer was in before CR mode
    int parse_newline = 0;                  // explicit flag to treat current char as a newline
    int i = 0;
    int whitespace = 1;                     // cleared in FIELD once a non-blank character is stored

    delete_data(self); // clear old reading data
    self->num_rows = 0;
    self->comment_lines_len = INITIAL_COMMENT_LEN;

    if (header)
        self->num_cols = 1; // store header output in one column
    else
        self->num_cols = num_cols;

    // Allocate memory for structures used during tokenization
    self->output_cols = (char **) malloc(self->num_cols * sizeof(char *));
    self->col_ptrs = (char **) malloc(self->num_cols * sizeof(char *));
    self->output_len = (size_t *) malloc(self->num_cols * sizeof(size_t));

    for (i = 0; i < self->num_cols; ++i)
    {
        self->output_cols[i] = (char *) calloc(1, INITIAL_COL_SIZE * sizeof(char));
        // Make each col_ptrs pointer point to the beginning of the
        // column string
        self->col_ptrs[i] = self->output_cols[i];
        self->output_len[i] = INITIAL_COL_SIZE;
    }

    if (end == 0)
        RETURN(NO_ERROR); // don't read if end == 0

    self->state = START_LINE;

    // Loop until all of self->source has been read, plus one extra pass
    // for the synthetic trailing newline
    while (self->source_pos < self->source_len + 1)
    {
        if (self->source_pos == self->source_len || parse_newline)
            c = '\n';
        else
            c = self->source[self->source_pos];

        if (c == '\r')
            c = '\n';

        parse_newline = 0;

        switch (self->state)
        {
        case START_LINE:
            if (c == '\n')
                break; // blank line
            else if ((c == ' ' || c == '\t') && self->strip_whitespace_lines)
                break; // leading blanks are skipped when stripping lines
            else if (self->comment != 0 && c == self->comment)
            {
                // comment line; ignore
                self->state = COMMENT;
                break;
            }
            // initialize variables for the beginning of line parsing
            col = 0;
            BEGIN_FIELD();
            // parse in mode START_FIELD
            /* fallthrough */

        case START_FIELD:
            // strip whitespace before field begins
            if ((c == ' ' || c == '\t') && self->strip_whitespace_fields)
                break;
            else if (!self->strip_whitespace_lines && self->comment != 0
                     && c == self->comment)
            {
                // comment line, not caught earlier because of no stripping
                self->state = COMMENT;
                break;
            }
            else if (c == self->delimiter) // field ends before it begins
            {
                if (col >= self->num_cols)
                {
                    RETURN(TOO_MANY_COLS);
                }
                END_FIELD();
                BEGIN_FIELD();
                break;
            }
            else if (c == '\n')
            {
                if (self->strip_whitespace_lines)
                {
                    // Move on if the delimiter is whitespace, e.g.
                    // '1 2 3   '->['1','2','3']
                    if (self->delimiter == ' ' || self->delimiter == '\t')
                    {
                        // nothing to register
                    }
                    // Register an empty field if non-whitespace delimiter,
                    // e.g. '1,2, '->['1','2','']
                    else
                    {
                        if (col >= self->num_cols)
                        {
                            RETURN(TOO_MANY_COLS);
                        }
                        END_FIELD();
                    }
                }
                else
                {
                    // In this case we don't want to left-strip the field,
                    // so we look back for where the skipped blanks began.
                    //
                    // The scan uses a local cursor that never goes below
                    // zero, so it gives the same result whether
                    // self->source_pos is a signed or an unsigned field.
                    size_t tmp = self->source_pos; // position of the newline
                    size_t scan = tmp;             // source[scan - 1] is the character under test

                    while (scan > 0
                           && self->source[scan - 1] != self->delimiter
                           && self->source[scan - 1] != '\n'
                           && self->source[scan - 1] != '\r')
                    {
                        --scan;
                    }

                    if (scan == 0
                        || self->source[scan - 1] == '\n'
                        || self->source[scan - 1] == '\r')
                    {
                        // backtracked to line beginning: no field to add,
                        // self->source_pos still points at the newline
                    }
                    else
                    {
                        // Stopped just after a delimiter.  Copy whatever
                        // lies between it and the newline (nothing at all
                        // for a plain empty field); this leaves
                        // self->source_pos back on the newline.
                        self->source_pos = scan;

                        while (self->source_pos < tmp)
                        {
                            // append whitespace characters
                            PUSH(self->source[self->source_pos]);
                            ++self->source_pos;
                        }

                        if (col >= self->num_cols)
                        {
                            RETURN(TOO_MANY_COLS);
                        }
                        END_FIELD(); // whitespace counts as a field
                    }
                }

                END_LINE();
                self->state = START_LINE;
                break;
            }
            else if (c == self->quotechar) // start parsing quoted field
            {
                self->state = START_QUOTED_FIELD;
                break;
            }
            else
            {
                if (col >= self->num_cols)
                {
                    RETURN(TOO_MANY_COLS);
                }
                // Valid field character, parse again in FIELD mode
                self->state = FIELD;
            }
            /* fallthrough */

        case FIELD:
            if (self->comment != 0 && c == self->comment && whitespace
                && col == 0)
            {
                // No whitespace stripping, but the comment char is found
                // before any data, e.g. '  # a b c'
                self->state = COMMENT;
            }
            else if (c == self->delimiter)
            {
                // End of field, look for new field
                END_FIELD();
                BEGIN_FIELD();
            }
            else if (c == '\n')
            {
                // Line ending, stop parsing both field and line
                END_FIELD();
                END_LINE();
                self->state = START_LINE;
            }
            else
            {
                if (c != ' ' && c != '\t')
                    whitespace = 0; // field is not all whitespace
                PUSH(c);
            }
            break;

        case START_QUOTED_FIELD:
            if ((c == ' ' || c == '\t') && self->strip_whitespace_fields)
            {
                // ignore initial whitespace
                break;
            }
            else if (c == self->quotechar)
            {
                // Lookahead check for double quote inside quoted field,
                // e.g. """cd" => "cd
                if (self->source_pos < self->source_len - 1)
                {
                    if (self->source[self->source_pos + 1] == self->quotechar)
                    {
                        self->state = QUOTED_FIELD_DOUBLE_QUOTE;
                        PUSH(c);
                        break;
                    }
                }
                // Parse rest of field normally, e.g. ""c
                // (control falls into the next case, whose quote branch
                // sets FIELD again and breaks)
                self->state = FIELD;
            }
            else
            {
                // Valid field character, parse again in QUOTED_FIELD mode
                self->state = QUOTED_FIELD;
            }
            /* fallthrough */

        case QUOTED_FIELD_NEWLINE:
            if (self->state == QUOTED_FIELD)
            {
                // arrived from START_QUOTED_FIELD with a data character:
                // go straight on to QUOTED_FIELD
            }
            // Ignore initial whitespace if strip_whitespace_lines and
            // newlines regardless
            else if (((c == ' ' || c == '\t') && self->strip_whitespace_lines)
                     || c == '\n')
                break;
            else if (c == self->quotechar)
            {
                self->state = FIELD;
                break;
            }
            else
            {
                // Once data begins, parse it as a normal quoted field
                self->state = QUOTED_FIELD;
            }
            /* fallthrough */

        case QUOTED_FIELD:
            if (c == self->quotechar)
            {
                // Lookahead check for double quote inside quoted field,
                // e.g. "ab""cd" => ab"cd
                if (self->source_pos < self->source_len - 1)
                {
                    if (self->source[self->source_pos + 1] == self->quotechar)
                    {
                        self->state = QUOTED_FIELD_DOUBLE_QUOTE;
                        PUSH(c);
                        break;
                    }
                }
                // Parse rest of field normally, e.g. "ab"c
                self->state = FIELD;
            }
            else if (c == '\n')
                self->state = QUOTED_FIELD_NEWLINE;
            else
            {
                PUSH(c);
            }
            break;

        case QUOTED_FIELD_DOUBLE_QUOTE:
            // Ignore the second double quote from "ab""cd" and parse rest of
            // field normally as quoted field.
            self->state = QUOTED_FIELD;
            break;

        case COMMENT:
            if (c == '\n')
            {
                self->state = START_LINE;
                if (!header)
                    end_comment(self);
            }
            else if (!header)
                push_comment(self, c);
            break; // keep looping until we find a newline

        default:
            // states not handled above consume the character unchanged
            break;
        }

        ++self->source_pos;
    }

    RETURN(0);
}