static int read_file_pfr_psb(char* filename, int do_write) { byte_reader_t* pbr = stdio_byte_reader_alloc(); string_builder_t* psb = sb_alloc(STRING_BUILDER_INIT_SIZE); pbr->popen_func(pbr, NULL, filename); peek_file_reader_t* pfr = pfr_alloc(pbr, PEEK_BUF_LEN); parse_trie_t* ptrie = parse_trie_alloc(); parse_trie_add_string(ptrie, "\n", IRS_STRIDX); parse_trie_add_string(ptrie, "\xff", EOF_STRIDX); parse_trie_add_string(ptrie, "\n\xff", IRSEOF_STRIDX); int bc = 0; while (TRUE) { char* line = read_line_pfr_psb(pfr, psb, ptrie); if (line == NULL) break; if (do_write) { fputs(line, stdout); fputc('\n', stdout); } bc += strlen(line); free(line); } sb_free(psb); pbr->pclose_func(pbr, NULL); return bc; }
// ---------------------------------------------------------------- static void test_case( char* test_name, char** strings, int num_strings, char* buf, int* prc, int* pstridx, int* pmatchlen) { int stridx, matchlen, rc; parse_trie_t* ptrie = parse_trie_alloc(); printf("%s %s\n", sep, test_name); parse_trie_print(ptrie); for (stridx = 0; stridx < num_strings; stridx++) { printf("Adding string[%d] = [%s]\n", stridx, strings[stridx]); parse_trie_add_string(ptrie, strings[stridx], stridx); parse_trie_print(ptrie); } stridx = -2; matchlen = -2; rc = parse_trie_match(ptrie, buf, 0, strlen(buf), 0xff, &stridx, &matchlen); parse_trie_free(ptrie); printf("buf = %s\n", buf); printf("rc = %d\n", rc); printf("stridx = %d (%s)\n", stridx, strings[stridx]); printf("matchlen = %d\n", matchlen); *prc = rc; *pstridx = stridx; *pmatchlen = matchlen; }
// ---------------------------------------------------------------- lrec_reader_t* lrec_reader_mmap_csv_alloc(char* irs, char* ifs, int use_implicit_header) { lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t)); lrec_reader_mmap_csv_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_csv_state_t)); pstate->ilno = 0LL; pstate->eof = "\xff"; pstate->irs = irs; pstate->ifs = ifs; pstate->ifs_eof = mlr_paste_2_strings(pstate->ifs, "\xff"); pstate->dquote = "\""; pstate->dquote_irs = mlr_paste_2_strings("\"", pstate->irs); pstate->dquote_ifs = mlr_paste_2_strings("\"", pstate->ifs); pstate->dquote_eof = "\"\xff"; pstate->dquote_dquote = "\"\""; pstate->dquotelen = strlen(pstate->dquote); pstate->pno_dquote_parse_trie = parse_trie_alloc(); parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->irs, IRS_STRIDX); parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->ifs, IFS_STRIDX); parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->dquote, DQUOTE_STRIDX); pstate->pdquote_parse_trie = parse_trie_alloc(); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_irs, DQUOTE_IRS_STRIDX); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_ifs, DQUOTE_IFS_STRIDX); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_dquote, DQUOTE_DQUOTE_STRIDX); pstate->pfields = rslls_alloc(); pstate->psb = sb_alloc(STRING_BUILDER_INIT_SIZE); pstate->expect_header_line_next = use_implicit_header ? FALSE : TRUE; pstate->use_implicit_header = use_implicit_header; pstate->pheader_keeper = NULL; pstate->pheader_keepers = lhmslv_alloc(); plrec_reader->pvstate = (void*)pstate; plrec_reader->popen_func = file_reader_mmap_vopen; plrec_reader->pclose_func = file_reader_mmap_vclose; plrec_reader->pprocess_func = lrec_reader_mmap_csv_process; plrec_reader->psof_func = lrec_reader_mmap_csv_sof; plrec_reader->pfree_func = lrec_reader_mmap_csv_free; return plrec_reader; }
// ---------------------------------------------------------------- static char* show_it() { char* test_name = "show_it"; char* strings[] = { "=" , ",", "\r\n", "\xff" }; const int EOF_TOKEN = 3; int num_strings = sizeof(strings) / sizeof(strings[0]); char* buf = "abc=123,def=456\r\n" "ghi=789\xff"; char* p = buf; printf("%s %s\n", sep, test_name); int stridx, matchlen, rc; parse_trie_t* ptrie = parse_trie_alloc(); parse_trie_print(ptrie); for (stridx = 0; stridx < num_strings; stridx++) { printf("Adding string[%d] = [%s]\n", stridx, strings[stridx]); parse_trie_add_string(ptrie, strings[stridx], stridx); } parse_trie_print(ptrie); while (TRUE) { rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); if (rc) { printf("match token %d (%s)\n", stridx, strings[stridx]); p += matchlen; if (stridx == EOF_TOKEN) { break; } } else { char c = *p; printf("c %c[%02x]\n", isprint((unsigned char)c) ? c : '?', (unsigned)c); p++; } } mu_assert_lf(*p == 0); return 0; }
// ---------------------------------------------------------------- lrec_reader_t* lrec_reader_stdio_csv_alloc(char* irs, char* ifs, int use_implicit_header, comment_handling_t comment_handling, char* comment_string) { lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t)); lrec_reader_stdio_csv_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_stdio_csv_state_t)); pstate->ilno = 0LL; pstate->do_auto_line_term = FALSE; if (streq(irs, "auto")) { irs = "\n"; pstate->do_auto_line_term = TRUE; } pstate->comment_handling = comment_handling; pstate->comment_string = comment_string; pstate->comment_string_length = comment_string == NULL ? 0 : strlen(comment_string); pstate->eof = "\xff"; pstate->irs = irs; pstate->ifs = ifs; pstate->ifs_eof = mlr_paste_2_strings(pstate->ifs, "\xff"); pstate->dquote = "\""; pstate->dquote_ifs = mlr_paste_2_strings("\"", pstate->ifs); pstate->dquote_eof = "\"\xff"; pstate->dquote_dquote = "\"\""; pstate->dquotelen = strlen(pstate->dquote); // Parse trie for UTF-8 BOM pstate->putf8_bom_parse_trie = parse_trie_alloc(); parse_trie_add_string(pstate->putf8_bom_parse_trie, UTF8_BOM, UTF8_BOM_STRIDX); // Parse trie for non-double-quoted fields pstate->pno_dquote_parse_trie = parse_trie_alloc(); parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->eof, EOF_STRIDX); parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->irs, IRS_STRIDX); parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->ifs_eof, IFS_EOF_STRIDX); parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->ifs, IFS_STRIDX); parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->dquote, DQUOTE_STRIDX); // Parse trie for double-quoted fields pstate->pdquote_parse_trie = parse_trie_alloc(); if (pstate->do_auto_line_term) { pstate->dquote_irs = mlr_paste_2_strings("\"", "\n"); pstate->dquote_irs2 = mlr_paste_2_strings("\"", "\r\n"); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_irs, DQUOTE_IRS_STRIDX); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_irs2, DQUOTE_IRS2_STRIDX); } else { pstate->dquote_irs = mlr_paste_2_strings("\"", pstate->irs); pstate->dquote_irs2 = NULL; parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_irs, DQUOTE_IRS_STRIDX); } parse_trie_add_string(pstate->pdquote_parse_trie, pstate->eof, EOF_STRIDX); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_irs, DQUOTE_IRS_STRIDX); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_ifs, DQUOTE_IFS_STRIDX); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_eof, DQUOTE_EOF_STRIDX); parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_dquote, DQUOTE_DQUOTE_STRIDX); pstate->pfields = rslls_alloc(); pstate->psb = sb_alloc(STRING_BUILDER_INIT_SIZE); pstate->pbr = stdio_byte_reader_alloc(); pstate->pfr = pfr_alloc(pstate->pbr, mlr_imax3( pstate->putf8_bom_parse_trie->maxlen, pstate->pno_dquote_parse_trie->maxlen, pstate->pdquote_parse_trie->maxlen)); pstate->expect_header_line_next = use_implicit_header ? FALSE : TRUE; pstate->use_implicit_header = use_implicit_header; pstate->pheader_keeper = NULL; pstate->pheader_keepers = lhmslv_alloc(); plrec_reader->pvstate = (void*)pstate; plrec_reader->popen_func = lrec_reader_stdio_csv_open; plrec_reader->pclose_func = lrec_reader_stdio_csv_close; plrec_reader->pprocess_func = lrec_reader_stdio_csv_process; plrec_reader->psof_func = lrec_reader_stdio_csv_sof; plrec_reader->pfree_func = lrec_reader_stdio_csv_free; return plrec_reader; }
// ---------------------------------------------------------------- static char* test_dkvp() { char* test_name = "dkvp"; char* strings[] = { "=" , ",", "\r\n", "\xff" }; const int PS_TOKEN = 0; const int FS_TOKEN = 1; const int RS_TOKEN = 2; const int EOF_TOKEN = 3; int num_strings = sizeof(strings) / sizeof(strings[0]); char* buf = "abc=123,def=456\r\n" "ghi=789\xff"; char* p = buf; printf("%s %s\n", sep, test_name); int stridx, matchlen, rc; parse_trie_t* ptrie = parse_trie_alloc(); parse_trie_print(ptrie); for (stridx = 0; stridx < num_strings; stridx++) { printf("Adding string[%d] = [%s]\n", stridx, strings[stridx]); parse_trie_add_string(ptrie, strings[stridx], stridx); } parse_trie_print(ptrie); rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == TRUE); mu_assert_lf(stridx == PS_TOKEN); mu_assert_lf(matchlen == strlen(strings[PS_TOKEN])); p += matchlen; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == TRUE); mu_assert_lf(stridx == FS_TOKEN); mu_assert_lf(matchlen == strlen(strings[FS_TOKEN])); p += matchlen; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == TRUE); mu_assert_lf(stridx == PS_TOKEN); mu_assert_lf(matchlen == strlen(strings[PS_TOKEN])); p += matchlen; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == TRUE); mu_assert_lf(stridx == RS_TOKEN); mu_assert_lf(matchlen == strlen(strings[RS_TOKEN])); p += matchlen; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == TRUE); mu_assert_lf(stridx == PS_TOKEN); mu_assert_lf(matchlen == strlen(strings[PS_TOKEN])); p += matchlen; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == FALSE); p++; rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen); mu_assert_lf(rc == TRUE); mu_assert_lf(stridx == EOF_TOKEN); mu_assert_lf(matchlen == strlen(strings[EOF_TOKEN])); p += matchlen; return 0; }