Exemplo n.º 1
0
// ----------------------------------------------------------------
// Builds a parse trie from strings[0 .. num_strings-1], runs a single match
// against buf, prints the trie after every insertion plus the match results,
// and hands the results back through the output pointers.
//
// test_name:   label printed on the banner line.
// strings:     strings to insert; each one is registered under its array index.
// num_strings: number of entries in strings.
// buf:         NUL-terminated text handed to parse_trie_match.
// prc:         receives the return value of parse_trie_match.
// pstridx:     receives the string index written by the matcher, or the -2
//              sentinel if the matcher did not overwrite it.
// pmatchlen:   receives the match length written by the matcher, or the -2
//              sentinel if the matcher did not overwrite it.
static void test_case(
	char*  test_name,
	char** strings,
	int    num_strings,
	char*  buf,
	int*   prc,
	int*   pstridx,
	int*   pmatchlen)
{
	int stridx, matchlen, rc;

	parse_trie_t* ptrie = parse_trie_alloc();
	printf("%s %s\n", sep, test_name);
	parse_trie_print(ptrie);
	for (stridx = 0; stridx < num_strings; stridx++) {
		printf("Adding string[%d] = [%s]\n", stridx, strings[stridx]);
		parse_trie_add_string(ptrie, strings[stridx], stridx);
		parse_trie_print(ptrie);
	}

	// Sentinels: lets the caller (and the printout below) distinguish values the
	// matcher actually wrote from ones it left alone.
	stridx = -2;
	matchlen = -2;
	rc = parse_trie_match(ptrie, buf, 0, strlen(buf), 0xff, &stridx, &matchlen);

	parse_trie_free(ptrie);

	// Only index strings[] when the matcher produced an in-range index; with the
	// sentinel still in place strings[stridx] would read outside the array.
	int stridx_in_range = (stridx >= 0 && stridx < num_strings);

	printf("buf      = %s\n", buf);
	printf("rc       = %d\n", rc);
	printf("stridx   = %d (%s)\n", stridx, stridx_in_range ? strings[stridx] : "none");
	printf("matchlen = %d\n", matchlen);

	*prc       = rc;
	*pstridx   = stridx;
	*pmatchlen = matchlen;
}
Exemplo n.º 2
0
// ----------------------------------------------------------------
// Demonstration/test: tokenizes a small two-line key=value buffer with a parse
// trie holding "=", ",", "\r\n" and "\xff", printing every delimiter match and
// every non-delimiter byte along the way. The "\xff" entry plays the role of
// an end-of-input marker: scanning stops once it has been matched.
//
// Returns 0 on success. NOTE(review): mu_assert_lf is defined elsewhere; it
// presumably returns a non-null message from this function on failure.
static char* show_it() {
	char* test_name = "show_it";
	char* strings[] = { "=" , ",", "\r\n", "\xff" };
	const int EOF_TOKEN = 3;
	int num_strings = sizeof(strings) / sizeof(strings[0]);
	char* buf =
		"abc=123,def=456\r\n"
		"ghi=789\xff";
	char* p = buf;

	printf("%s %s\n", sep, test_name);
	int stridx, matchlen, rc;

	parse_trie_t* ptrie = parse_trie_alloc();
	parse_trie_print(ptrie);
	for (stridx = 0; stridx < num_strings; stridx++) {
		printf("Adding string[%d] = [%s]\n", stridx, strings[stridx]);
		parse_trie_add_string(ptrie, strings[stridx], stridx);
	}
	parse_trie_print(ptrie);

	while (TRUE) {
		rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen);
		if (rc) {
			printf("match token %d (%s)\n", stridx, strings[stridx]);
			p += matchlen;
			if (stridx == EOF_TOKEN) {
				break;
			}
		} else {
			// Go through unsigned char so a byte >= 0x80 prints as two hex digits
			// instead of being sign-extended to eight.
			unsigned char uc = (unsigned char)*p;
			printf("c %c[%02x]\n", isprint(uc) ? (char)uc : '?', (unsigned)uc);
			p++;
		}
	}

	// Release the trie before the final assertion so it is freed whether or not
	// the assertion leaves the function early; p points into buf, not the trie.
	parse_trie_free(ptrie);

	// After consuming the end marker the cursor must sit on the terminating NUL.
	mu_assert_lf(*p == 0);

	return 0;
}
Exemplo n.º 3
0
// Splits one CSV record, starting at phandle->sol, into fields and appends each
// field to pfields.
//
// Fields are delimited in place: a NUL byte is written over the first byte of
// each matched separator/terminator and pfields is given a pointer into the
// buffer. Two cases append a separately built string instead: a final field
// that runs to end of file without a line terminator, and a quoted field that
// contains a doubled double quote.
//
// pstate:  reader state; supplies the two parse tries, the double-quote string
//          and its length, the string builder, the auto-line-terminator flag,
//          and the line number used in error messages.
// pfields: list that receives the fields of this record.
// phandle: supplies the current start-of-line pointer (sol) and the end of
//          data (eof); sol is advanced past the consumed record on return.
// pctx:    context notified of the detected line ending (CRLF vs. LF) when
//          pstate->do_auto_line_term is set.
//
// Returns FALSE, without touching pfields, when sol is already at or beyond
// eof; returns TRUE after a record has been consumed. Syntax errors (a double
// quote inside an unquoted field, an unterminated quoted field) and unexpected
// token indices print a message to stderr and exit(1).
static int lrec_reader_mmap_csv_get_fields(lrec_reader_mmap_csv_state_t* pstate,
	rslls_t* pfields, file_reader_mmap_state_t* phandle, context_t* pctx)
{
	int rc, stridx, matchlen, record_done, field_done;
	string_builder_t* psb = pstate->psb;

	// Nothing left to read.
	if (phandle->sol >= phandle->eof)
		return FALSE;

	// p marks the start of the current field; e is the scan cursor.
	char* p = phandle->sol;
	char* e = p;

	// loop over fields in record
	record_done = FALSE;
	while (!record_done) {
		// Assumption is dquote is "\""
		if (*e != pstate->dquote[0]) { // start of non-quoted field

			// Loop over characters in field
			field_done = FALSE;
			while (!field_done) {
				MLR_INTERNAL_CODING_ERROR_IF(e > phandle->eof);
				// Check whether one of the trie's delimiter strings starts at e.
				rc = parse_trie_match(pstate->pno_dquote_parse_trie, e, phandle->eof, &stridx, &matchlen);
				if (rc) {
					switch(stridx) {
					case IFS_STRIDX: // end of field
						// Terminate the field in place and start the next one after the separator.
						*e = 0;
						rslls_append(pfields, p, NO_FREE, 0);
						p = e + matchlen;
						field_done  = TRUE;
						break;
					case IRS_STRIDX: // end of record
						*e = 0;

						// Auto line-ending detection: a carriage return immediately before the
						// terminator is dropped from the field and reported as CRLF; anything
						// else is reported as LF.
						if (pstate->do_auto_line_term) {
							if (e > p && e[-1] == '\r') {
								e[-1] = 0;
								context_set_autodetected_crlf(pctx);
							} else {
								context_set_autodetected_lf(pctx);
							}
						}

						rslls_append(pfields, p, NO_FREE, 0);
						p = e + matchlen;
						field_done  = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_STRIDX: // CSV syntax error: fields containing quotes must be fully wrapped in quotes
						fprintf(stderr, "%s: syntax error: unwrapped double quote at line %lld.\n",
							MLR_GLOBALS.bargv0, pstate->ilno);
						exit(1);
						break;
					default:
						fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
							MLR_GLOBALS.bargv0, stridx, pstate->ilno);
						exit(1);
						break;
					}
					// Step the cursor over the matched delimiter.
					e += matchlen;
				} else if (e >= phandle->eof) {
					// We read to end of file without seeing end of line.  We can't always zero-poke a null character to
					// terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's
					// our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking
					// at EOF is one byte past the page and that will segv us.
				    char* copy = mlr_alloc_string_from_char_range(p, phandle->eof - p);
					rslls_append(pfields, copy, FREE_ENTRY_VALUE, 0);
					// NOTE(review): matchlen here is whatever the non-matching parse_trie_match
					// call above left in it (possibly a stale or unset value). p is not read
					// again before this function returns, so the result is unused — confirm
					// and consider dropping this assignment.
					p = e + matchlen;
					field_done  = TRUE;
					record_done = TRUE;
					break;
				} else {
					// Ordinary field byte: keep scanning.
					e++;
				}
			}

		} else { // start of quoted field
			// Skip the opening quote; the field contents begin right after it.
			e += pstate->dquotelen;
			p = e;

			// loop over characters in field
			field_done = FALSE;
			int contiguous = TRUE;
			// If there are no embedded double-double quotes, then the field value is a contiguous
			// array of bytes between the start and end double-quotes (non-inclusive). E.g. "ab,c"
			// has contents ab,c. In that case we can point the rslls at that range of bytes
			// with no data-copying. However, if there are embedded double-double quotes, then
			// we use the string-build logic to build up a dynamically allocated string. E.g.
			// "ab""c" becomes ab"c.
			while (!field_done) {
				// Running out of data inside a quoted field means the closing quote is missing.
				if (e >= phandle->eof) {
					fprintf(stderr, "%s: unmatched double quote at line %lld.\n",
						MLR_GLOBALS.bargv0, pstate->ilno);
					exit(1);
				}

				rc = parse_trie_match(pstate->pdquote_parse_trie, e, phandle->eof, &stridx, &matchlen);

				if (rc) {
					switch(stridx) {
					case DQUOTE_IFS_STRIDX: // end of field
						// NUL-poke over the closing quote; hand out either the in-place bytes or
						// the string accumulated in the builder.
						*e = 0;
						if (contiguous)
							rslls_append(pfields, p, NO_FREE, FIELD_QUOTED_ON_INPUT);
						else
							rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						p = e + matchlen;
						field_done  = TRUE;
						break;
					case DQUOTE_IRS_STRIDX: // end of record
					case DQUOTE_IRS2_STRIDX: // end of record
						*e = 0;

						// Same line-ending detection as in the unquoted branch; here e[-1] is the
						// byte just before the matched closing-quote/terminator sequence.
						if (pstate->do_auto_line_term) {
							if (e > p && e[-1] == '\r') {
								e[-1] = 0;
								context_set_autodetected_crlf(pctx);
							} else {
								context_set_autodetected_lf(pctx);
							}
						}

						if (contiguous)
							rslls_append(pfields, p, NO_FREE, FIELD_QUOTED_ON_INPUT);
						else
							rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						p = e + matchlen;
						field_done  = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_DQUOTE_STRIDX: // RFC-4180 CSV: "" inside a dquoted field is an escape for "
						// First escape in this field: seed the builder with what has been scanned so
						// far and switch to copy mode. Later escapes append one quote character.
						// NOTE(review): on the first escape no explicit quote is appended; whether
						// sb_append_char_range(psb, p, e) includes the byte at e (the first quote of
						// the pair) is not visible here — confirm the quote reaches the output.
						if (contiguous) { // not anymore it isn't
							sb_append_char_range(psb, p, e);
							contiguous = FALSE;
						} else {
							sb_append_char(psb, pstate->dquote[0]);
						}
						break;
					default:
						fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
							MLR_GLOBALS.bargv0, stridx, pstate->ilno);
						exit(1);
						break;
					}
					e += matchlen;
				} else {
					// Ordinary byte inside the quotes: in copy mode it must also go to the builder.
					if (!contiguous)
						sb_append_char(psb, *e);
					e++;
				}
			}
		}
	}
	// The next call resumes where this record's scan stopped.
	phandle->sol = e;

	return TRUE;
}
Exemplo n.º 4
0
// ----------------------------------------------------------------
// Unit test: steps a parse trie across a two-record key=value buffer and
// checks, position by position, where delimiters are and are not recognized.
//
// The buffer is laid out so that every delimiter is preceded by exactly three
// non-delimiter bytes: abc = 123 , def = 456 \r\n ghi = 789 \xff. For each
// expected delimiter the test asserts three non-matches (advancing one byte
// each time) followed by a match with the expected index and length.
//
// Returns 0 on success. NOTE(review): mu_assert_lf is defined elsewhere; it
// presumably returns a non-null message from this function on failure, in
// which case the trie is not released.
static char* test_dkvp() {
	char* test_name = "dkvp";
	char* strings[] = { "=" , ",", "\r\n", "\xff" };
	const int PS_TOKEN  = 0;
	const int FS_TOKEN  = 1;
	const int RS_TOKEN  = 2;
	const int EOF_TOKEN = 3;
	int num_strings = sizeof(strings) / sizeof(strings[0]);
	char* buf =
		"abc=123,def=456\r\n"
		"ghi=789\xff";
	char* p = buf;

	// Delimiters in the order they occur in buf.
	const int expected_tokens[] = {
		PS_TOKEN, FS_TOKEN, PS_TOKEN, RS_TOKEN, PS_TOKEN, EOF_TOKEN
	};
	const int num_expected = sizeof(expected_tokens) / sizeof(expected_tokens[0]);
	const int run_length = 3; // non-delimiter bytes ahead of every delimiter

	printf("%s %s\n", sep, test_name);
	int stridx, matchlen, rc;

	parse_trie_t* ptrie = parse_trie_alloc();
	parse_trie_print(ptrie);
	for (stridx = 0; stridx < num_strings; stridx++) {
		printf("Adding string[%d] = [%s]\n", stridx, strings[stridx]);
		parse_trie_add_string(ptrie, strings[stridx], stridx);
	}
	parse_trie_print(ptrie);

	for (int i = 0; i < num_expected; i++) {
		int token = expected_tokens[i];

		// No delimiter may be reported at any of the plain bytes.
		for (int j = 0; j < run_length; j++) {
			rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen);
			mu_assert_lf(rc == FALSE);
			p++;
		}

		// The delimiter itself must match with the right index and full length.
		rc = parse_trie_match(ptrie, p, 0, strlen(p), 0xff, &stridx, &matchlen);
		mu_assert_lf(rc == TRUE);
		mu_assert_lf(stridx == token);
		mu_assert_lf(matchlen == (int)strlen(strings[token]));
		p += matchlen;
	}

	parse_trie_free(ptrie);

	return 0;
}