// ---------------------------------------------------------------- static void test_case( char* test_name, char** strings, int num_strings, char* buf, int* prc, int* pstridx, int* pmatchlen) { int stridx, matchlen, rc; parse_trie_t* ptrie = parse_trie_alloc(); printf("%s %s\n", sep, test_name); parse_trie_print(ptrie); for (stridx = 0; stridx < num_strings; stridx++) { printf("Adding string[%d] = [%s]\n", stridx, strings[stridx]); parse_trie_add_string(ptrie, strings[stridx], stridx); parse_trie_print(ptrie); } stridx = -2; matchlen = -2; rc = parse_trie_match(ptrie, buf, 0, strlen(buf), 0xff, &stridx, &matchlen); parse_trie_free(ptrie); printf("buf = %s\n", buf); printf("rc = %d\n", rc); printf("stridx = %d (%s)\n", stridx, strings[stridx]); printf("matchlen = %d\n", matchlen); *prc = rc; *pstridx = stridx; *pmatchlen = matchlen; }
// ---------------------------------------------------------------- static char* show_it() { char* test_name = "show_it"; char* strings[] = { "=" , ",", "\r\n", "\xff" }; const int EOF_TOKEN = 3; int num_strings = sizeof(strings) / sizeof(strings[0]); char* buf = "abc=123,def=456\r\n" "ghi=789\xff"; char* p = buf; printf("%s %s\n", sep, test_name); int stridx, matchlen, rc; parse_trie_t* ptrie = parse_trie_alloc(); parse_trie_print(ptrie); for (stridx = 0; stridx < num_strings; stridx++) { printf("Adding string[%d] = [%s]\n", stridx, strings[stridx]); parse_trie_add_string(ptrie, strings[stridx], stridx); } parse_trie_print(ptrie); while (TRUE) { rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); if (rc) { printf("match token %d (%s)\n", stridx, strings[stridx]); p += matchlen; if (stridx == EOF_TOKEN) { break; } } else { char c = *p; printf("c %c[%02x]\n", isprint((unsigned char)c) ? c : '?', (unsigned)c); p++; } } mu_assert_lf(*p == 0); return 0; }
// Splits one CSV record, starting at phandle->sol, into fields which are
// appended to pfields.
//
// Separators (and the closing double quote of a quoted field) are overwritten
// in place with NUL so that a field can be appended as a pointer into the
// input (NO_FREE). Fields that have to be built or copied are appended with
// FREE_ENTRY_VALUE: quoted fields containing an embedded "" escape, and a
// final unquoted field that runs to end of file.
//
// Parameters:
//   pstate  -- reader state: the two parse tries, the double-quote string and
//              its length, the string builder, the auto-line-term flag, and
//              the line number used in error messages.
//   pfields -- output list the fields are appended to.
//   phandle -- supplies sol (start of the record) and eof; sol is advanced
//              past the consumed record on return.
//   pctx    -- receives the autodetected line-ending notification when
//              pstate->do_auto_line_term is set.
//
// Returns FALSE without touching pfields when sol >= eof on entry; TRUE
// otherwise. Prints a message and exits the process on an unwrapped or
// unmatched double quote, or on an unexpected token index.
static int lrec_reader_mmap_csv_get_fields(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pfields,
	file_reader_mmap_state_t* phandle, context_t* pctx)
{
	int rc, stridx, matchlen, record_done, field_done;
	string_builder_t* psb = pstate->psb;

	if (phandle->sol >= phandle->eof)
		return FALSE;

	char* p = phandle->sol; // start of the current field
	char* e = p;            // scan cursor

	// loop over fields in record
	record_done = FALSE;
	while (!record_done) {
		// Assumption is dquote is "\""
		if (*e != pstate->dquote[0]) { // start of non-quoted field

			// Loop over characters in field
			field_done = FALSE;
			while (!field_done) {
				MLR_INTERNAL_CODING_ERROR_IF(e > phandle->eof);
				rc = parse_trie_match(pstate->pno_dquote_parse_trie, e, phandle->eof, &stridx, &matchlen);
				if (rc) {
					switch(stridx) {
					case IFS_STRIDX: // end of field
						*e = 0;
						rslls_append(pfields, p, NO_FREE, 0);
						p = e + matchlen;
						field_done = TRUE;
						break;
					case IRS_STRIDX: // end of record
						*e = 0;
						if (pstate->do_auto_line_term) {
							// A carriage return just before the record separator
							// is stripped and reported as CRLF; otherwise LF.
							if (e > p && e[-1] == '\r') {
								e[-1] = 0;
								context_set_autodetected_crlf(pctx);
							} else {
								context_set_autodetected_lf(pctx);
							}
						}
						rslls_append(pfields, p, NO_FREE, 0);
						p = e + matchlen;
						field_done = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_STRIDX: // CSV syntax error: fields containing quotes must be fully wrapped in quotes
						fprintf(stderr, "%s: syntax error: unwrapped double quote at line %lld.\n",
							MLR_GLOBALS.bargv0, pstate->ilno);
						exit(1);
						break;
					default:
						fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
							MLR_GLOBALS.bargv0, stridx, pstate->ilno);
						exit(1);
						break;
					}
					e += matchlen;
				} else if (e >= phandle->eof) {
					// We read to end of file without seeing end of line. We can't always zero-poke a null character to
					// terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's
					// our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking
					// at EOF is one byte past the page and that will segv us.
					char* copy = mlr_alloc_string_from_char_range(p, phandle->eof - p);
					rslls_append(pfields, copy, FREE_ENTRY_VALUE, 0);
					// No match was reported on this path, so matchlen is not
					// meaningful here (it is declared without an initializer
					// and may never have been written). p is not read again
					// before return, so it is deliberately left alone.
					field_done = TRUE;
					record_done = TRUE;
					break;
				} else {
					e++;
				}
			}

		} else { // start of quoted field
			e += pstate->dquotelen;
			p = e;

			// loop over characters in field
			field_done = FALSE;
			int contiguous = TRUE;
			// If there are no embedded double-double quotes, then the field value is a contiguous
			// array of bytes between the start and end double-quotes (non-inclusive). E.g. "ab,c"
			// has contents ab,c. In that case we can point the rslls at that range of bytes
			// with no data-copying. However, if there are embedded double-double quotes, then
			// we use the string-build logic to build up a dynamically allocated string. E.g.
			// "ab""c" becomes ab"c.
			while (!field_done) {
				if (e >= phandle->eof) {
					fprintf(stderr, "%s: unmatched double quote at line %lld.\n",
						MLR_GLOBALS.bargv0, pstate->ilno);
					exit(1);
				}

				rc = parse_trie_match(pstate->pdquote_parse_trie, e, phandle->eof, &stridx, &matchlen);

				if (rc) {
					switch(stridx) {
					case DQUOTE_IFS_STRIDX: // end of field
						*e = 0;
						if (contiguous)
							rslls_append(pfields, p, NO_FREE, FIELD_QUOTED_ON_INPUT);
						else
							rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						p = e + matchlen;
						field_done = TRUE;
						break;
					case DQUOTE_IRS_STRIDX: // end of record
					case DQUOTE_IRS2_STRIDX: // end of record
						*e = 0;
						if (pstate->do_auto_line_term) {
							if (e > p && e[-1] == '\r') {
								e[-1] = 0;
								context_set_autodetected_crlf(pctx);
							} else {
								context_set_autodetected_lf(pctx);
							}
						}
						if (contiguous)
							rslls_append(pfields, p, NO_FREE, FIELD_QUOTED_ON_INPUT);
						else
							rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						p = e + matchlen;
						field_done = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_DQUOTE_STRIDX: // RFC-4180 CSV: "" inside a dquoted field is an escape for "
						if (contiguous) { // not anymore it isn't
							// First escape seen in this field: switch over to
							// the string builder, seeding it from the bytes
							// scanned so far.
							sb_append_char_range(psb, p, e);
							contiguous = FALSE;
						} else {
							sb_append_char(psb, pstate->dquote[0]);
						}
						break;
					default:
						fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
							MLR_GLOBALS.bargv0, stridx, pstate->ilno);
						exit(1);
						break;
					}
					e += matchlen;
				} else {
					// Ordinary character: it only needs copying once the field
					// is being assembled in the string builder.
					if (!contiguous)
						sb_append_char(psb, *e);
					e++;
				}
			}

		}
	}

	phandle->sol = e;

	return TRUE;
}
// ---------------------------------------------------------------- static char* test_dkvp() { char* test_name = "dkvp"; char* strings[] = { "=" , ",", "\r\n", "\xff" }; const int PS_TOKEN = 0; const int FS_TOKEN = 1; const int RS_TOKEN = 2; const int EOF_TOKEN = 3; int num_strings = sizeof(strings) / sizeof(strings[0]); char* buf = "abc=123,def=456\r\n" "ghi=789\xff"; char* p = buf; printf("%s %s\n", sep, test_name); int stridx, matchlen, rc; parse_trie_t* ptrie = parse_trie_alloc(); parse_trie_print(ptrie); for (stridx = 0; stridx < num_strings; stridx++) { printf("Adding string[%d] = [%s]\n", stridx, strings[stridx]); parse_trie_add_string(ptrie, strings[stridx], stridx); } parse_trie_print(ptrie); rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == TRUE); mu_assert_lf(stridx == PS_TOKEN); mu_assert_lf(matchlen == strlen(strings[PS_TOKEN])); p += matchlen; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == TRUE); mu_assert_lf(stridx == FS_TOKEN); mu_assert_lf(matchlen == strlen(strings[FS_TOKEN])); p += matchlen; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 
0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == TRUE); mu_assert_lf(stridx == PS_TOKEN); mu_assert_lf(matchlen == strlen(strings[PS_TOKEN])); p += matchlen; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == TRUE); mu_assert_lf(stridx == RS_TOKEN); mu_assert_lf(matchlen == strlen(strings[RS_TOKEN])); p += matchlen; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == TRUE); mu_assert_lf(stridx == PS_TOKEN); mu_assert_lf(matchlen == strlen(strings[PS_TOKEN])); p += matchlen; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == TRUE); mu_assert_lf(stridx == EOF_TOKEN); mu_assert_lf(matchlen == strlen(strings[EOF_TOKEN])); p += matchlen; return 0; }