/**
   Strip the backslash from every backslash-newline pair, which makes a
   string taken from the history file better formatted for on-screen
   display. A backslash that is not directly followed by a newline is
   copied through unchanged.

   The returned pointer refers to a function-local static buffer, so the
   memory is reused by subsequent calls to this function.
*/
static wchar_t *history_unescape_newlines( wchar_t *in )
{
    static string_buffer_t *out = 0;
    const wchar_t *cursor;

    /* Create the shared buffer on first use, otherwise just empty it */
    if( out )
    {
        sb_clear( out );
    }
    else
    {
        out = sb_halloc( global_context );
        if( !out )
        {
            DIE_MEM();
        }
    }

    for( cursor = in; *cursor != L'\0'; cursor++ )
    {
        /* Only the backslash of an escaped newline is dropped; the
           newline itself is appended on the next iteration */
        if( cursor[0] == L'\\' && cursor[1] == L'\n' )
        {
            continue;
        }
        sb_append_char( out, *cursor );
    }

    return (wchar_t *)out->buff;
}
/**
   Returns the interpreter for the specified script. Returns 0 if file
   is not a script with a shebang.

   The first line of the file is read into a string buffer that is never
   released, and on success the return value points into that buffer, so
   this function leaks memory on every call. Only use it in the execve
   error handler, which calls exit right afterwards anyway. The file
   stream itself is closed before returning.

   \param file name of the script to inspect
   \return pointer to the interpreter path (starting at its leading '/'),
           or 0 if the first line does not start with "#!/" or "#! /"
*/
static wchar_t *get_interpreter( wchar_t *file )
{
    string_buffer_t sb;
    FILE *fp = wfopen( file, "r" );
    sb_init( &sb );
    wchar_t *res = 0;

    if( fp )
    {
        /* Collect the first line, without its terminating newline */
        while( 1 )
        {
            wint_t ch = getwc( fp );
            if( ch == WEOF )
                break;
            if( ch == L'\n' )
                break;
            sb_append_char( &sb, (wchar_t)ch );
        }
        /* The stream was only read from, so a close failure loses nothing */
        fclose( fp );
    }

    res = (wchar_t *)sb.buff;

    /* Nothing was buffered at all: there is no shebang to inspect */
    if( !res )
        return 0;

    /* Skip "#! " or "#!" so that the result starts at the '/' */
    if( !wcsncmp( L"#! /", res, 4 ) )
        return res+3;
    if( !wcsncmp( L"#!/", res, 3 ) )
        return res+2;
    return 0;
}
// ---------------------------------------------------------------- // Keep and reuse the internal buffer. Allocate to the caller only // the size needed. char* sb_finish(string_builder_t* psb) { sb_append_char(psb, '\0'); char* rv = mlr_malloc_or_die(psb->used_length); memcpy(rv, psb->buffer, psb->used_length); psb->used_length = 0; return rv; }
// ---------------------------------------------------------------- // Keep and reuse the internal buffer. Allocate to the caller only // the size needed. char* sb_finish(string_builder_t* psb) { sb_append_char(psb, '\0'); int alloc_length = (psb->used_length + BLOCK_LENGTH_MASK) & BLOCK_LENGTH_NMASK; char* rv = mlr_malloc_or_die(alloc_length); memcpy(rv, psb->buffer, psb->used_length); psb->used_length = 0; return rv; }
// Same as finishing the builder into a freshly allocated, null-terminated
// copy, but additionally reports the text length through pline_length.
// The allocation size is the used length adjusted with BLOCK_LENGTH_MASK
// and BLOCK_LENGTH_NMASK (presumably rounding up to a whole number of
// blocks -- confirm against the macro definitions). The builder's buffer
// is kept and its used length is reset to zero.
char* sb_finish_with_length(string_builder_t* psb, int* pline_length) {
	sb_append_char(psb, '\0');

	int padded_length = (psb->used_length + BLOCK_LENGTH_MASK) & BLOCK_LENGTH_NMASK;
	char* copy = mlr_malloc_or_die(padded_length);
	memcpy(copy, psb->buffer, psb->used_length);

	// The used length counts the null terminator; the reported length does not.
	*pline_length = psb->used_length - 1;

	psb->used_length = 0;
	return copy;
}
// ================================================================ static char* read_line_fgetc_psb(FILE* fp, string_builder_t* psb, char* irs) { while (TRUE) { int c = fgetc(fp); if (c == EOF) { if (sb_is_empty(psb)) return NULL; else return sb_finish(psb); } else if (c == irs[0]) { return sb_finish(psb); } else { sb_append_char(psb, c); } } }
// ================================================================ static char* read_line_pfr_psb(peek_file_reader_t* pfr, string_builder_t* psb, char* irs, int irs_len) { while (TRUE) { if (pfr_at_eof(pfr)) { if (sb_is_empty(psb)) return NULL; else return sb_finish(psb); } else if (pfr_next_is(pfr, irs, irs_len)) { if (!pfr_advance_past(pfr, irs)) { fprintf(stderr, "%s: Internal coding error: IRS found and lost.\n", MLR_GLOBALS.argv0); exit(1); } return sb_finish(psb); } else { sb_append_char(psb, pfr_read_char(pfr)); } } }
// ================================================================ static char* read_line_mmap_psb(file_reader_mmap_state_t* ph, string_builder_t* psb, char* irs) { char *p = ph->sol; while (TRUE) { if (p == ph->eof) { ph->sol = p; if (sb_is_empty(psb)) return NULL; else return sb_finish(psb); } else if (*p == irs[0]) { ph->sol = p+1; return sb_finish(psb); } else { sb_append_char(psb, *p); p++; } } }
/** Add backslashes to all newlines, so that the returning string is suitable for writing to the history file. The memory for the return value will be reused by subsequent calls to this function. */ static wchar_t *history_escape_newlines( wchar_t *in ) { static string_buffer_t *out = 0; if( !out ) { out = sb_halloc( global_context ); if( !out ) { DIE_MEM(); } } else { sb_clear( out ); } for( ; *in; in++ ) { if( *in == L'\\' ) { sb_append_char( out, *in ); if( *(in+1) ) { in++; sb_append_char( out, *in ); } else { /* This is a weird special case. When we are trying to save a string that ends with a backslash, we need to handle it specially, otherwise this command would be combined with the one following it. We hack around this by adding an additional newline. */ sb_append_char( out, L'\n' ); } } else if( *in == L'\n' ) { sb_append_char( out, L'\\' ); sb_append_char( out, *in ); } else { sb_append_char( out, *in ); } } return (wchar_t *)out->buff; }
/** Test the escaping/unescaping code by escaping/unescaping random strings and verifying that the original string comes back. */ static void test_escape() { int i; string_buffer_t sb; say( L"Testing escaping and unescaping" ); sb_init( &sb ); for( i=0; i<ESCAPE_TEST_COUNT; i++ ) { wchar_t *o, *e, *u; sb_clear( &sb ); while( rand() % ESCAPE_TEST_LENGTH ) { sb_append_char( &sb, (rand() %ESCAPE_TEST_CHAR) +1 ); } o = (wchar_t *)sb.buff; e = escape(o, 1); u = unescape( e, 0 ); if( !o || !e || !u ) { err( L"Escaping cycle of string %ls produced null pointer on %ls", o, e?L"unescaping":L"escaping" ); } if( wcscmp(o, u) ) { err( L"Escaping cycle of string %ls produced different string %ls", o, u ); } free( e ); free( u ); } }
/*
 * Append a one-line description of an Ethernet payload to the string
 * buffer `sb` (declared outside this function).
 *
 * usr, pkt: not used by this function.
 * d:        start of the payload that follows the Ethernet header.
 * ethtype:  EtherType field as read from the frame; it is passed through
 *           ntohs() before being interpreted.
 *
 * IPv4 payloads are printed via iphdr_print(), followed by the TCP or UDP
 * header when the protocol is one of those, or the protocol name
 * otherwise. Any other EtherType is printed as a hex number and a name.
 */
static void etherframe_print(u_char *usr, const struct pcap_pkthdr *pkt,
			     const u_char *d, uint16_t ethtype)
{
	struct iphdr *ip;
	struct tcphdr *th;
	struct udphdr *uh;
	uint16_t ethtype_le;
	char hex[16];	/* "0x" + 4 hex digits + space + NUL fits easily */

	/* Despite the variable name, ntohs() yields host byte order. */
	ethtype_le = ntohs(ethtype);

	switch (ethtype_le) {
	case ETH_P_IP:
		ip = ip_hdr(d);
		sb_append_str(&sb, "IP: ");
		iphdr_print(ip, &sb);

		switch (ip->protocol) {
		case IPPROTO_TCP:
			th = tcp_hdr(d + ip_hdrlen(ip));
			sb_append_str(&sb, "; TCP: ");
			tcp_print(th, &sb);
			break;
		case IPPROTO_UDP:
			uh = udp_hdr(d + ip_hdrlen(ip));
			sb_append_str(&sb, "; UDP: ");
			udp_print(uh, &sb);
			break;
		default:
			sb_append_char(&sb, ' ');
			sb_append_str(&sb, ipproto_str(ip->protocol));
			break;
		}
		break;
	default:
		sb_append_str(&sb, "ether type: ");
		/*
		 * Format into a bounded local buffer and go through the
		 * regular append helper instead of writing into the shared
		 * buffer with an unbounded sprintf().
		 * NOTE(review): this assumes sb_append_str() does its own
		 * capacity handling -- confirm.
		 */
		snprintf(hex, sizeof hex, "0x%04x ", (unsigned)ethtype_le);
		sb_append_str(&sb, hex);
		sb_append_str(&sb, ethertype_to_str(ethtype_le));
		break;
	}
}
/*
 * Replace every non-overlapping occurrence of old_val in sb with new_val,
 * scanning left to right.
 *
 * The result is assembled in a fresh builder; the old contents are then
 * released with sb_free() and *sb is overwritten with the new builder.
 *
 * Nothing is changed when old_val is NULL or empty, when the builder is
 * empty, or when old_val is longer than the builder's contents. (An empty
 * pattern would match at every position without consuming any input, so
 * the scan could never advance.)
 */
void sb_replace(string_builder *sb, const char* old_val, const char* new_val) {
    if (old_val == NULL || sb->count == 0)
        return;

    size_t old_val_len = strlen(old_val);
    if (old_val_len == 0 || old_val_len > sb->count)
        return;

    string_builder new_sb;
    sb_init(&new_sb);

    size_t i = 0;
    while (i < sb->count) {
        size_t remaining = sb->count - i;

        /* Check the bounds first so the comparison never looks past the
         * stored characters. */
        if (old_val_len <= remaining &&
            strncmp(sb->_str_p + i, old_val, old_val_len) == 0) {
            sb_append(&new_sb, new_val);
            i += old_val_len;
        } else {
            sb_append_char(&new_sb, sb->_str_p[i]);
            i++;
        }
    }

    sb_free(sb);
    *sb = new_sb;
}
// Reads one line through the peek-file reader, using the parse trie to
// recognize line endings. Input is buffered up to the trie's longest
// pattern and matched; unmatched characters are accumulated into psb.
// On a match the matched bytes are skipped: IRS_STRIDX and IRSEOF_STRIDX
// finish and return the line, EOF_STRIDX returns NULL, and any other
// match index simply continues scanning.
static char* read_line_pfr_psb(peek_file_reader_t* pfr, string_builder_t* psb, parse_trie_t* ptrie) {
	int stridx, matchlen;

	for (;;) {
		pfr_buffer_by(pfr, ptrie->maxlen);
		int matched = parse_trie_ring_match(ptrie, pfr->peekbuf, pfr->sob, pfr->npeeked,
			pfr->peekbuflenmask, &stridx, &matchlen);

		if (!matched) {
			sb_append_char(psb, pfr_read_char(pfr));
			continue;
		}

		pfr_advance_by(pfr, matchlen);

		if (stridx == IRS_STRIDX || stridx == IRSEOF_STRIDX)
			return sb_finish(psb);
		if (stridx == EOF_STRIDX)
			return NULL;
	}
}
/**
   Append msg to buff, word-wrapped to the width reported by
   common_get_width(), followed by a final newline.

   The message is split into tokens on space, newline, carriage return
   and tab. Tokens are joined with single spaces, and a newline is
   inserted whenever the next token would not fit on the current line. A
   token wider than a line is broken, each broken piece ending in "-"
   and a newline. If the reported width is zero, msg is appended
   unmodified.

   \param msg the message to format
   \param buff the string buffer that receives the formatted text
*/
void write_screen( const wchar_t *msg, string_buffer_t *buff )
{
    const wchar_t *start, *pos;
    int line_width = 0;
    int tok_width = 0;
    int screen_width = common_get_width();

    CHECK( msg, );
    CHECK( buff, );

    if( screen_width )
    {
        start = pos = msg;
        while( 1 )
        {
            int overflow = 0;

            tok_width=0;

            /*
              Tokenize on whitespace, and also calculate the width of the token
            */
            while( *pos && ( !wcschr( L" \n\r\t", *pos ) ) )
            {
                int char_width = wcwidth( *pos );

                /*
                  Check if the token is wider than one line. If so, mark
                  it as an overflow and break the token.
                */
                if( (tok_width + char_width) > (screen_width-1) )
                {
                    overflow = 1;
                    break;
                }

                tok_width += char_width;
                pos++;
            }

            if( pos == start )
            {
                /*
                  Zero-length token. If we are sitting on the string
                  terminator (e.g. an empty message), stop here instead
                  of stepping past the end of the string.
                */
                if( !*pos )
                {
                    break;
                }

                /* Otherwise skip the single character and carry on */
                start = pos = pos+1;
            }
            else if( overflow )
            {
                /*
                  In case of overflow, we print a newline, except if we
                  already are at position 0
                */
                wchar_t *token = wcsndup( start, pos-start );
                if( line_width != 0 )
                    sb_append_char( buff, L'\n' );
                sb_printf( buff, L"%ls-\n", token );
                free( token );
                line_width=0;
            }
            else
            {
                /*
                  Print the token, on a new line if it (plus a separating
                  space) does not fit on the current one
                */
                wchar_t *token = wcsndup( start, pos-start );
                if( (line_width + (line_width!=0?1:0) + tok_width) > screen_width )
                {
                    sb_append_char( buff, L'\n' );
                    line_width=0;
                }
                sb_printf( buff, L"%ls%ls", line_width?L" ":L"", token );
                free( token );
                line_width += (line_width!=0?1:0) + tok_width;
            }

            /*
              Break on end of string
            */
            if( !*pos )
            {
                break;
            }

            start=pos;
        }
    }
    else
    {
        sb_printf( buff, L"%ls", msg );
    }
    sb_append_char( buff, L'\n' );
}
// ---------------------------------------------------------------- void sb_append_string(string_builder_t* psb, char* s) { for (char* p = s; *p; p++) sb_append_char(psb, *p); }
// Reads the fields of one CSV record from the peek-file reader in pstate
// and appends them to pfields.
//
// pstate:    reader state; supplies the peek-file reader, the string
//            builder used to accumulate field text, the parse tries, the
//            double-quote string and the current line number for messages.
// pfields:   list receiving one entry per field. Entries are appended with
//            FREE_ENTRY_VALUE; quoted fields also carry FIELD_QUOTED_ON_INPUT.
// pctx:      receives the autodetected line ending (LF or CRLF) when
//            pstate->do_auto_line_term is set.
// is_header: when true, a leading UTF-8 byte-order mark is skipped first.
//
// Returns FALSE if the reader is already at end of file, else TRUE once a
// full record has been read. Syntax errors (field separator at end of
// input, unwrapped or unmatched double quote) and unexpected trie tokens
// print a message to stderr and exit(1).
static int lrec_reader_stdio_csv_get_fields(lrec_reader_stdio_csv_state_t* pstate, rslls_t* pfields,
	context_t* pctx, int is_header)
{
	int rc, stridx, matchlen, record_done, field_done;
	peek_file_reader_t* pfr = pstate->pfr;
	string_builder_t* psb = pstate->psb;
	char* field = NULL;
	int field_length = 0;

	if (pfr_peek_char(pfr) == (char)EOF) // char defaults to unsigned on some platforms
		return FALSE;

	// Strip the UTF-8 BOM, if any. This is MUCH simpler for mmap, and for stdio on files. For mmap
	// we can test the first 3 bytes, then skip past them or not. For stdio on files we can fread
	// the first 3 bytes, then rewind the fp if they're not the UTF-8 BOM. But for stdio on stdin
	// (which is the primary reason we support stdio in Miller), we cannot rewind: stdin is not
	// rewindable.
	if (is_header) {
		pfr_buffer_by(pfr, UTF8_BOM_LENGTH);
		// Note: this rc is local to the block and shadows the function-level rc.
		int rc = parse_trie_ring_match(pstate->putf8_bom_parse_trie,
			pfr->peekbuf, pfr->sob, pfr->npeeked, pfr->peekbuflenmask,
			&stridx, &matchlen);
#ifdef DEBUG_PARSER
		printf("RC=%d stridx=0x%04x matchlen=%d\n", rc, stridx, matchlen);
#endif
		if (rc == TRUE && stridx == UTF8_BOM_STRIDX) {
			pfr_advance_by(pfr, matchlen);
		}
	}

	// Loop over fields in record
	record_done = FALSE;
	while (!record_done) {
		// Assumption is dquote is "\""
		if (pfr_peek_char(pfr) != pstate->dquote[0]) { // NOT DOUBLE-QUOTED

			// Loop over characters in field
			field_done = FALSE;
			while (!field_done) {
				// Make sure enough input is buffered for the longest trie pattern.
				pfr_buffer_by(pfr, pstate->pno_dquote_parse_trie->maxlen);

				rc = parse_trie_ring_match(pstate->pno_dquote_parse_trie,
					pfr->peekbuf, pfr->sob, pfr->npeeked, pfr->peekbuflenmask,
					&stridx, &matchlen);
#ifdef DEBUG_PARSER
				pfr_print(pfr);
#endif
				if (rc) {
#ifdef DEBUG_PARSER
					printf("RC=%d stridx=0x%04x matchlen=%d\n", rc, stridx, matchlen);
#endif
					switch(stridx) {
					case EOF_STRIDX: // end of record
						rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, 0);
						field_done = TRUE;
						record_done = TRUE;
						break;
					case IFS_EOF_STRIDX:
						fprintf(stderr, "%s: syntax error: record-ending field separator at line %lld.\n",
							MLR_GLOBALS.bargv0, pstate->ilno);
						exit(1);
						break;
					case IFS_STRIDX: // end of field
						rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, 0);
						field_done = TRUE;
						break;
					case IRS_STRIDX: // end of record
						field = sb_finish_with_length(psb, &field_length);

						// The line-ending '\n' won't be included in the field buffer.
						// With auto line-term detection, a trailing '\r' is trimmed
						// from the field and CRLF is recorded; otherwise LF is recorded.
						if (pstate->do_auto_line_term) {
							if (field_length > 0 && field[field_length-1] == '\r') {
								field[field_length-1] = 0;
								context_set_autodetected_crlf(pctx);
							} else {
								context_set_autodetected_lf(pctx);
							}
						}

						rslls_append(pfields, field, FREE_ENTRY_VALUE, 0);
						field_done = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_STRIDX: // CSV syntax error: fields containing quotes must be fully wrapped in quotes
						fprintf(stderr, "%s: syntax error: unwrapped double quote at line %lld.\n",
							MLR_GLOBALS.bargv0, pstate->ilno);
						exit(1);
						break;
					default:
						fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
							MLR_GLOBALS.bargv0, stridx, pstate->ilno);
						exit(1);
						break;
					}
					// Skip over the matched separator/terminator.
					pfr_advance_by(pfr, matchlen);
				} else {
					// No token here: the next character is ordinary field content.
#ifdef DEBUG_PARSER
					char c = pfr_read_char(pfr);
					printf("CHAR=%c [%02x]\n", isprint((unsigned char)c) ? c : ' ', (unsigned)c);
					sb_append_char(psb, c);
#else
					sb_append_char(psb, pfr_read_char(pfr));
#endif
				}
			}

		} else { // DOUBLE-QUOTED
			// Skip the opening double quote.
			pfr_advance_by(pfr, pstate->dquotelen);

			// loop over characters in field
			field_done = FALSE;
			// Note: these two shadow the function-level field and field_length.
			char* field = NULL;
			int field_length = 0;
			while (!field_done) {
				pfr_buffer_by(pfr, pstate->pdquote_parse_trie->maxlen);

				rc = parse_trie_ring_match(pstate->pdquote_parse_trie,
					pfr->peekbuf, pfr->sob, pfr->npeeked, pfr->peekbuflenmask,
					&stridx, &matchlen);

				if (rc) {
					switch(stridx) {
					case EOF_STRIDX: // end of record
						// Input ended before the closing double quote was seen.
						fprintf(stderr, "%s: unmatched double quote at line %lld.\n",
							MLR_GLOBALS.bargv0, pstate->ilno);
						exit(1);
						break;
					case DQUOTE_EOF_STRIDX: // end of record
						rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						field_done = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_IFS_STRIDX: // end of field
						rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						field_done = TRUE;
						break;
					case DQUOTE_IRS_STRIDX: // end of record
					case DQUOTE_IRS2_STRIDX: // end of record
						field = sb_finish_with_length(psb, &field_length);

						// The line-ending '\n' won't be included in the field buffer.
						if (pstate->do_auto_line_term) {
							if (field_length > 0 && field[field_length-1] == '\r') {
								field[field_length-1] = 0;
								context_set_autodetected_crlf(pctx);
							} else {
								context_set_autodetected_lf(pctx);
							}
						}

						rslls_append(pfields, field, FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						field_done = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_DQUOTE_STRIDX: // RFC-4180 CSV: "" inside a dquoted field is an escape for "
						sb_append_char(psb, pstate->dquote[0]);
						break;
					default:
						fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
							MLR_GLOBALS.bargv0, stridx, pstate->ilno);
						exit(1);
						break;
					}
					// Skip over the matched token.
					pfr_advance_by(pfr, matchlen);
				} else {
					sb_append_char(psb, pfr_read_char(pfr));
				}
			}

		}
	}

	return TRUE;
}
// Reads the fields of one CSV record from the memory region
// [phandle->sol, phandle->eof) and appends them to pfields.
//
// Where possible a field is not copied: the byte at its end is overwritten
// with a null terminator in place and the list entry points into the
// region (NO_FREE). Fields that must be built (embedded "" escapes) or
// that run up to end of file are allocated and appended with
// FREE_ENTRY_VALUE. Quoted fields carry FIELD_QUOTED_ON_INPUT.
//
// pstate:  reader state; supplies the string builder, the parse tries, the
//          double-quote string and the current line number for messages.
// pfields: list receiving one entry per field.
// phandle: region being read; phandle->sol is advanced past the record.
// pctx:    receives the autodetected line ending (LF or CRLF) when
//          pstate->do_auto_line_term is set.
//
// Returns FALSE if phandle->sol is already at or past phandle->eof, else
// TRUE. An unwrapped or unmatched double quote, or an unexpected trie
// token, prints a message to stderr and calls exit(1).
static int lrec_reader_mmap_csv_get_fields(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pfields,
	file_reader_mmap_state_t* phandle, context_t* pctx)
{
	int rc, stridx, matchlen, record_done, field_done;
	string_builder_t* psb = pstate->psb;

	if (phandle->sol >= phandle->eof)
		return FALSE;

	// p marks the start of the current field; e is the scan position.
	char* p = phandle->sol;
	char* e = p;

	// loop over fields in record
	record_done = FALSE;
	while (!record_done) {
		// Assumption is dquote is "\""
		// NOTE(review): if the previous field ended with a separator that was the
		// last thing in the region, e may equal phandle->eof here and *e would
		// read one byte past it -- confirm.
		if (*e != pstate->dquote[0]) { // start of non-quoted field

			// Loop over characters in field
			field_done = FALSE;
			while (!field_done) {
				MLR_INTERNAL_CODING_ERROR_IF(e > phandle->eof);
				rc = parse_trie_match(pstate->pno_dquote_parse_trie, e, phandle->eof, &stridx, &matchlen);
				if (rc) {
					switch(stridx) {
					case IFS_STRIDX: // end of field
						// Terminate the field in place and point the list at it.
						*e = 0;
						rslls_append(pfields, p, NO_FREE, 0);
						p = e + matchlen;
						field_done = TRUE;
						break;
					case IRS_STRIDX: // end of record
						*e = 0;
						// With auto line-term detection, a '\r' just before the
						// terminator is trimmed and CRLF is recorded; otherwise LF.
						if (pstate->do_auto_line_term) {
							if (e > p && e[-1] == '\r') {
								e[-1] = 0;
								context_set_autodetected_crlf(pctx);
							} else {
								context_set_autodetected_lf(pctx);
							}
						}
						rslls_append(pfields, p, NO_FREE, 0);
						p = e + matchlen;
						field_done = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_STRIDX: // CSV syntax error: fields containing quotes must be fully wrapped in quotes
						fprintf(stderr, "%s: syntax error: unwrapped double quote at line %lld.\n",
							MLR_GLOBALS.bargv0, pstate->ilno);
						exit(1);
						break;
					default:
						fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
							MLR_GLOBALS.bargv0, stridx, pstate->ilno);
						exit(1);
						break;
					}
					// Step past the matched separator/terminator.
					e += matchlen;
				} else if (e >= phandle->eof) {
					// We read to end of file without seeing end of line. We can't always zero-poke a null character to
					// terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's
					// our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking
					// at EOF is one byte past the page and that will segv us.
					char* copy = mlr_alloc_string_from_char_range(p, phandle->eof - p);
					rslls_append(pfields, copy, FREE_ENTRY_VALUE, 0);
					// NOTE(review): no match happened on this path, so matchlen holds
					// whatever an earlier call (or the failed one) left in it; p is
					// not read again before returning -- confirm.
					p = e + matchlen;
					field_done = TRUE;
					record_done = TRUE;
					break;
				} else {
					e++;
				}
			}

		} else { // start of quoted field
			// Skip the opening double quote; the field content starts after it.
			e += pstate->dquotelen;
			p = e;

			// loop over characters in field
			field_done = FALSE;
			int contiguous = TRUE;
			// If there are no embedded double-double quotes, then the field value is a contiguous
			// array of bytes between the start and end double-quotes (non-inclusive). E.g. "ab,c"
			// has contents ab,c. In that case we can point the rslls at that range of bytes
			// with no data-copying. However, if there are embedded double-double quotes, then
			// we use the string-build logic to build up a dynamically allocated string. E.g.
			// "ab""c" becomes ab"c.
			while (!field_done) {
				if (e >= phandle->eof) {
					fprintf(stderr, "%s: unmatched double quote at line %lld.\n",
						MLR_GLOBALS.bargv0, pstate->ilno);
					exit(1);
				}

				rc = parse_trie_match(pstate->pdquote_parse_trie, e, phandle->eof, &stridx, &matchlen);

				if (rc) {
					switch(stridx) {
					case DQUOTE_IFS_STRIDX: // end of field
						*e = 0;
						if (contiguous)
							rslls_append(pfields, p, NO_FREE, FIELD_QUOTED_ON_INPUT);
						else
							rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						p = e + matchlen;
						field_done = TRUE;
						break;
					case DQUOTE_IRS_STRIDX: // end of record
					case DQUOTE_IRS2_STRIDX: // end of record
						*e = 0;
						// NOTE(review): the '\r' trim below edits the region in place;
						// when the field was built in psb (not contiguous) the built
						// copy is not trimmed -- confirm whether that is intended.
						if (pstate->do_auto_line_term) {
							if (e > p && e[-1] == '\r') {
								e[-1] = 0;
								context_set_autodetected_crlf(pctx);
							} else {
								context_set_autodetected_lf(pctx);
							}
						}
						if (contiguous)
							rslls_append(pfields, p, NO_FREE, FIELD_QUOTED_ON_INPUT);
						else
							rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
						p = e + matchlen;
						field_done = TRUE;
						record_done = TRUE;
						break;
					case DQUOTE_DQUOTE_STRIDX: // RFC-4180 CSV: "" inside a dquoted field is an escape for "
						if (contiguous) { // not anymore it isn't
							// First escape seen: seed the builder with the bytes from p to e.
							// NOTE(review): whether e itself (the first quote) is included
							// depends on sb_append_char_range -- confirm.
							sb_append_char_range(psb, p, e);
							contiguous = FALSE;
						} else {
							sb_append_char(psb, pstate->dquote[0]);
						}
						break;
					default:
						fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
							MLR_GLOBALS.bargv0, stridx, pstate->ilno);
						exit(1);
						break;
					}
					// Step past the matched token.
					e += matchlen;
				} else {
					// Ordinary character: only copied once the field is being built.
					if (!contiguous)
						sb_append_char(psb, *e);
					e++;
				}
			}
		}
	}

	// The next record starts where scanning stopped.
	phandle->sol = e;

	return TRUE;
}
/**
   Returns an item_t for the specified address. The address must come
   from the item list of the specified mode.

   An address for which item_is_new() is true is returned as-is. Any
   other address is treated as a position inside the mode's memory
   mapped file: the multibyte text starting there is decoded up to the
   first newline that is not preceded by a backslash. If the item starts
   with '#', its first line is parsed for a number that becomes the
   timestamp. The result lives in static storage, so later calls to this
   function may erase the output of a previous call.
*/
static item_t *item_get( history_mode_t *m, void *d )
{
    static string_buffer_t *out = 0;
    static item_t narrow_item;

    char *begin = (char *)d;
    char *limit;
    char *cursor;
    int escaped = 0;
    int at_first_char = 1;
    int in_timestamp = 0;

    if( item_is_new( m, d ) )
    {
        return (item_t *)d;
    }

    limit = m->mmap_start + m->mmap_length;
    cursor = begin;
    narrow_item.timestamp = 0;

    /* Create the shared buffer on first use, otherwise just empty it */
    if( out )
    {
        sb_clear( out );
    }
    else
    {
        out = sb_halloc( global_context );
        if( !out )
        {
            DIE_MEM();
        }
    }

    for( ;; )
    {
        wchar_t c;
        mbstate_t state;
        size_t res;

        /* Every character is decoded from a fresh conversion state */
        memset( &state, 0, sizeof(state) );
        res = mbrtowc( &c, cursor, limit-cursor, &state );

        if( res == (size_t)-2 )
        {
            /* Incomplete sequence: nothing more can be decoded */
            break;
        }
        if( res == (size_t)-1 || res == (size_t)0 )
        {
            /* Invalid byte or embedded null: skip a single byte */
            cursor++;
            continue;
        }
        cursor += res;

        if( c == L'\n' )
        {
            if( in_timestamp )
            {
                /* End of the '#' line: pick out the number it contains */
                wchar_t *digits = (wchar_t *)out->buff;
                while( *digits && !iswdigit( *digits ) )
                {
                    digits++;
                }

                errno = 0;
                if( *digits )
                {
                    wchar_t *num_end;
                    time_t tm = (time_t)wcstol( digits, &num_end, 10 );

                    /* Accept only a non-zero number that runs to the end
                       of the line */
                    if( tm && !errno && !*num_end )
                    {
                        narrow_item.timestamp = tm;
                    }
                }

                sb_clear( out );
                in_timestamp = 0;
                continue;
            }

            /* An unescaped newline ends the item */
            if( !escaped )
            {
                break;
            }
        }

        if( at_first_char && c == L'#' )
        {
            in_timestamp = 1;
        }
        at_first_char = 0;

        sb_append_char( out, c );

        /* A backslash escapes the next character unless it is itself escaped */
        escaped = ( c == L'\\' ) && !escaped;
    }

    narrow_item.data = history_unescape_newlines( (wchar_t *)out->buff );
    return &narrow_item;
}