/**
 * Compile the regular expression carried by the argument atoms into x->re2
 * and post a summary of the compiled state.
 *
 * @param x     object whose re2 member receives the compiled expression
 * @param sym   selector the call was made with (not used here)
 * @param argc  number of atoms in argv: 1 = search pattern only,
 *              2 = search pattern followed by a replacement string;
 *              any other count skips compilation
 * @param argv  the argument atoms
 */
void dict_re_compile(t_dict_recurse *x, t_symbol *sym, long argc, t_atom *argv)
{
    /*
     * Text of the second atom for the summary line.  argv[1] only exists
     * when at least two atoms were passed; reading it unconditionally runs
     * past the end of the atom array in the one-argument form.
     */
    const char *second_s = "";

    TRACE("dict_re_compile");

    if (argc >= 2) {
        second_s = atom_getsym(argv + 1)->s_name;
    }

    if (argc == 1) {
        re_compile(x->re2, atom_getsym(argv)->s_name, NULL);
    } else if (argc == 2) {
        re_compile(x->re2, atom_getsym(argv)->s_name, second_s);
    }

    /*
     * NOTE(review): MY_ASSERT is defined elsewhere.  The condition passed is
     * "err != ERR_NONE", so the macro presumably reports and bails out when
     * its condition is TRUE -- confirm against its definition.
     */
    MY_ASSERT(x->re2->err != ERR_NONE, "Compilation error.");

    POST("Compile:  %s  %s  -  RPN:  %s  -  States:  %i  -  Flags:  %i  -  Substr:  %i  %s",
         x->re2->re_search_s, second_s, x->re2->rpn_s, x->re2->state_cnt,
         x->re2->capt_flags, x->re2->repl_sub_cnt, x->re2->repl_sub_s);
}
// Reads input char sequence line by line and test it against the // regular expression. If line is matched with regexp it is printed // to stdout. // Input files usage: sgrep REGEXP FILE1 FILE2 // stdin usage: echo foo | sgrep REGEXP // If at lease one matching found exit code is 0, otherwise it is 1. int main(int argc, char *argv[]) { if (argc < 2) { return print_usage(); } re *regexp; int matched; regexp = re_compile(argv[1]); if (argc > 2) { // Use input files. for (int i = 2; i < argc; i++) { FILE *f = fopen(argv[i], "r"); if (f == NULL) { die("Failed to open file %s", argv[i]); } matched |= match_file(f, regexp); fclose(f); } } else { // Use stdin as input. matched = match_file(stdin, regexp); } re_free(regexp); return !matched; }
Re *re_new(const char *rep, int opts) { Re *re = malloc(sizeof(Re)); bzero(re, sizeof re); input = (char *)rep; re_setopt(re, opts); yyparse(re); re->ast = ast_new(Paren, 0, re->ast, NULL); if (!re_getopt(re, RE_ANCHOR_HEAD)) { ReAst *ast = ast_new(Star, 0, ast_new(Any, 0, NULL, NULL), NULL); ast->nongreedy = 1; re->ast = ast_new(Concat, 0, ast, re->ast); } dumpast(re->ast, 0); int nr_insts = visit_ast(re->ast, collect_insts) + 1; // plus 1 for IMatch debug("insts size: %d\n", nr_insts); re->insts = malloc(sizeof(Inst) * nr_insts); re_compile(re, re->ast); re_addInst(re, IMatch, 0, NULL, NULL); dumpinsts(re); return re; }
/** * Determine the portion of the original file name to preserve. * * If an error occurs in this function it will be appended to the log and * error mail messages, and the process status will be set appropriately. * * @param ds_id datastream ID * @param file_name name of the raw data file * * @retval 1 if successful * @retval 0 if a pattern matching error occurred */ int dsproc_set_preserve_dots_from_name(int ds_id, const char *file_name) { DataStream *ds = _DSProc->datastreams[ds_id]; char pattern[512]; regex_t preg; regmatch_t pmatch[1]; const char *strp; int status; int ndots; sprintf(pattern, "^%s\\.[[:digit:]]{8}\\.[[:digit:]]{6}\\.[[:alpha:]]+\\.", ds->name); if (!re_compile(&preg, pattern, REG_EXTENDED)) { DSPROC_ERROR( NULL, "Could not compile regular expression: '%s'", pattern); return(0); } status = re_execute(&preg, file_name, 1, pmatch, 0); if (status < 0) { DSPROC_ERROR( NULL, "Could not execute regular expression: '%s'", pattern); re_free(&preg); return(0); } strp = file_name; if (status == 1) { strp += pmatch[0].rm_eo; } ndots = 1; while ((strp = strchr(strp, '.'))) { ++ndots; ++strp; } re_free(&preg); DEBUG_LV1( DSPROC_LIB_NAME, "%s: Setting rename preserve dots value to: %d\n", ds->name, ndots); ds->preserve_dots = ndots; return(1); }
static void set_rs_shadow() { CELL c ; STRING *sval ; char *s ; unsigned len ; if (posix_space_flag && mawk_state == EXECUTION) scan_code['\n'] = SC_UNEXPECTED ; if (rs_shadow.type == SEP_STR) free_STRING((STRING *) rs_shadow.ptr) ; cast_for_split(cellcpy(&c, RS)) ; switch (c.type) { case C_RE: if ( (s = is_string_split(c.ptr, &len)) ) { if (len == 1) { rs_shadow.type = SEP_CHAR ; rs_shadow.c = s[0] ; } else { rs_shadow.type = SEP_STR ; rs_shadow.ptr = (PTR) new_STRING(s) ; } } else { rs_shadow.type = SEP_RE ; rs_shadow.ptr = c.ptr ; } break ; case C_SPACE: rs_shadow.type = SEP_CHAR ; rs_shadow.c = ' ' ; break ; case C_SNULL: /* RS becomes one or more blank lines */ if (mawk_state == EXECUTION) scan_code['\n'] = SC_SPACE ; rs_shadow.type = SEP_MLR ; sval = new_STRING("\n\n+") ; rs_shadow.ptr = re_compile(sval) ; free_STRING(sval) ; break ; default: bozo("bad cell in set_rs_shadow") ; } }
/*
 * ex_subtilde --
 *	[line [,line]] ~ [cgr] [count] [#lp]]
 *
 *	Repeat the substitution, pairing the most recent RE with the most
 *	recent substitute replacement pattern.
 *
 *	Returns 1 when there is no previous RE or it cannot be compiled;
 *	otherwise returns whatever s() returns.
 *
 * PUBLIC: int ex_subtilde(SCR *, EXCMD *);
 */
int
ex_subtilde(SCR *sp, EXCMD *cmdp)
{
	/* Without a saved RE there is nothing to repeat. */
	if (sp->re == NULL) {
		ex_emsg(sp, NULL, EXM_NOPREVRE);
		return (1);
	}

	/* Compile the saved RE unless SC_RE_SEARCH says it already is. */
	if (!F_ISSET(sp, SC_RE_SEARCH)) {
		if (re_compile(sp, sp->re, sp->re_len,
		    NULL, NULL, &sp->re_c, RE_C_SEARCH) != 0)
			return (1);
	}

	return (s(sp, cmdp,
	    cmdp->argc ? cmdp->argv[0]->bp : NULL, &sp->re_c, 0));
}
/*
 * Compile the regular expressions shared across the program: the nm output
 * line, the bracketed address and the unbracketed address.  Each compiled
 * expression and its match array live in the file-level re_* / re_*_pmatch
 * objects passed to re_compile().
 */
void re_compile_common(void)
{
	/* All three patterns are compiled with the same flag set. */
	const int common_cflags = REG_NEWLINE|REG_EXTENDED;

	/* nm: address, type, symbol */
	re_compile(&re_nm,
		   "^([0-9a-fA-F]{4,}) +([^ ]) +([^ ]+)$",
		   common_cflags,
		   &re_nm_pmatch);

	/* bracketed address preceded by optional white space */
	re_compile(&re_bracketed_address,
		   "^[ \t]*" BRACKETED_ADDRESS,
		   common_cflags,
		   &re_bracketed_address_pmatch);

	/* unbracketed address preceded by optional white space */
	re_compile(&re_unbracketed_address,
		   "^[ \t*]*" UNBRACKETED_ADDRESS,
		   common_cflags,
		   &re_unbracketed_address_pmatch);
}
/*
 * Convert the cell *cp, in place, into a compiled regular expression.
 *
 * A cell whose type is below C_STRING is first converted with
 * cast1_to_s().  The cell's string is then compiled and released, and the
 * cell is retagged C_RE with the compiled result as its pointer.
 */
void
cast_to_RE(CELL * cp)
{
    PTR compiled;

    if (cp->type < C_STRING) {
	cast1_to_s(cp);
    }

    /*
     * string(cp) is evaluated again for the release, so the cell is only
     * overwritten after both calls are done.
     */
    compiled = re_compile(string(cp));
    free_STRING(string(cp));

    cp->ptr = compiled;
    cp->type = C_RE;
}
static void set_rs_shadow(void) { CELL c; STRING *sval; char *s; unsigned len; if (posix_space_flag && mawk_state == EXECUTION) scan_code['\n'] = SC_UNEXPECTED; if (rs_shadow.type == SEP_STR) { free_STRING((STRING *) rs_shadow.ptr); } cast_for_split(cellcpy(&c, RS)); switch (c.type) { case C_RE: if ((s = is_string_split(c.ptr, &len))) { if (len == 1) { rs_shadow.type = SEP_CHAR; rs_shadow.c = s[0]; } else { rs_shadow.type = SEP_STR; rs_shadow.ptr = (PTR) new_STRING(s); } } else { rs_shadow.type = SEP_RE; rs_shadow.ptr = c.ptr; } break; case C_SPACE: rs_shadow.type = SEP_CHAR; rs_shadow.c = ' '; break; case C_SNULL: /* RS becomes one or more blank lines */ if (mawk_state == EXECUTION) scan_code['\n'] = SC_SPACE; rs_shadow.type = SEP_MLR; sval = new_STRING("\n\n+"); rs_shadow.ptr = re_compile(sval); free_STRING(sval); break; case C_STRING: /* * Check for special case where we retained the cell as a string, * bypassing regular-expression compiling. */ if (string(&c)->len == 1) { rs_shadow.type = SEP_CHAR; rs_shadow.c = string(&c)->str[0]; break; } /* FALLTHRU */ default: bozo("bad cell in set_rs_shadow"); } }
/*
 * collect_RE --
 *	Gather the text of a regular-expression literal into string_buff,
 *	compile it with re_compile(), store the compiled result in
 *	yylval.ptr and return the RE token.
 *
 *	Collection ends at a '/' (SC_DIV) that is not inside a bracket
 *	expression.  A newline or a zero scan code before that point is a
 *	"runaway regular expression" compile error, and running past the
 *	size limit of string_buff is a compile error too; both exit through
 *	mawk_exit(2).
 *
 *	Locals:
 *	    p      write position in string_buff
 *	    first  position just after the opening of the current bracket
 *		   expression (moved past a leading '^'), used to recognise
 *		   a ']' that is the first list member
 *	    boxed  bracket nesting depth; zero when outside any bracket
 *		   expression
 *
 *	NOTE(review): CheckStringSize and NextUChar are macros defined
 *	elsewhere; presumably NextUChar fetches the next input character
 *	and stores it through its argument -- confirm.
 */
static int
collect_RE(void)
{
    char *p = string_buff;
    const char *first = NULL;
    int limit = sizeof(string_buff) - 2;
    int c;
    int boxed = 0;
    STRING *sval;

    while (1) {
	/* refuse to write past the usable part of string_buff */
	if (p >= (string_buff + limit)) {
	    compile_error(
			     "regular expression /%.10s ..."
			     " exceeds implementation size limit (%d)",
			     string_buff,
			     limit);
	    mawk_exit(2);
	}
	CheckStringSize(p);
	switch (scan_code[NextUChar(c = *p++)]) {
	case SC_POW:
	    /* Handle [^]] and [^^] correctly. */
	    if ((p - 1) == first && first != 0 && first[-1] == '[') {
		first = p;
	    }
	    break;

	case SC_LBOX:
	    /*
	     * If we're starting a bracket expression, remember where that
	     * started, so we can make comparisons to handle things like
	     * "[]xxxx]" and "[^]xxxx]".
	     */
	    if (!boxed) {
		first = p;
		++boxed;
	    } else {
		/* XXX. Does not handle collating symbols or equivalence
		 * class expressions. */
		/* XXX. Does not match logic used in rexp0.c to check for
		 * a character class expression, though probably the
		 * latter should be adjusted.
		 * POSIX and common sense give us license to complain about
		 * expressions such as '[[:not a special character class]]'.
		 */
		/* peek one character ahead, then push it back */
		if (next() == ':') {
		    ++boxed;
		}
		un_next();
	    }
	    break;

	case SC_RBOX:
	    /*
	     * A right square-bracket loses its special meaning if it occurs
	     * first in the list (after an optional "^").
	     */
	    if (boxed && p - 1 != first) {
		--boxed;
	    }
	    break;

	case SC_DIV:		/* done */
	    if (!boxed) {
		/* overwrite the '/' with the terminator */
		*--p = 0;
		goto out;
	    }
	    break;

	case SC_NL:
	    /* cut the newline off so the error message shows clean text */
	    p[-1] = 0;
	    /* FALLTHRU */

	case 0:		/* unterminated re */
	    compile_error(
			     "runaway regular expression /%.10s ...",
			     string_buff);
	    mawk_exit(2);

	case SC_ESCAPE:
	    switch (c = next()) {
	    case '/':
		/* "\/" collapses to a plain '/' over the backslash */
		p[-1] = '/';
		break;

	    case '\n':
		/* backslash-newline: drop the backslash as well */
		p--;
		break;

	    case 0:
		un_next();
		break;

	    default:
		/* keep the backslash and append the escaped character */
		*p++ = (char) c;
		break;
	    }
	    break;
	}
    }

  out:
    /* now we've got the RE, so compile it */
    sval = new_STRING(string_buff);
    yylval.ptr = re_compile(sval);
    free_STRING(sval);
    return RE;
}
/*
 * search_init --
 *	Set up a search.
 *
 *	sp	screen; sp->re, sp->re_len and sp->re_c hold the saved
 *		pattern and its compiled form, sp->searchdir the direction
 *	dir	direction recorded in sp->searchdir when SEARCH_SET is set
 *	ptrn	pattern text; with SEARCH_PARSE it starts with the delimiter
 *		and may be NULL to request the saved pattern
 *	plen	length of ptrn
 *	epp	if not NULL, receives the position just past the parsed
 *		pattern (only written when SEARCH_PARSE is set)
 *	flags	SEARCH_* bits
 *
 *	Returns 0 when a compiled pattern is ready to use, 1 on any failure
 *	(empty file, no previous pattern, or a pattern that does not compile).
 *
 *	NOTE(review): flags is never named directly in the body; presumably
 *	the LF_ISSET macro tests it -- confirm against the macro definition.
 */
static int
search_init(
	SCR *sp,
	dir_t dir,
	CHAR_T *ptrn,
	size_t plen,
	CHAR_T **epp,
	u_int flags)
{
	recno_t lno;
	int delim;
	CHAR_T *p, *t;

	/* If the file is empty, it's a fast search. */
	if (sp->lno <= 1) {
		if (db_last(sp, &lno))
			return (1);
		if (lno == 0) {
			if (LF_ISSET(SEARCH_MSG))
				search_msg(sp, S_EMPTY);
			return (1);
		}
	}

	if (LF_ISSET(SEARCH_PARSE)) {		/* Parse the string. */
		/*
		 * Use the saved pattern if no pattern specified, or if only
		 * one or two delimiter characters specified.
		 *
		 * !!!
		 * Historically, only the pattern itself was saved, vi didn't
		 * preserve addressing or delta information.
		 */
		if (ptrn == NULL)
			goto prev;
		if (plen == 1) {
			if (epp != NULL)
				*epp = ptrn + 1;
			goto prev;
		}
		if (ptrn[0] == ptrn[1]) {
			if (epp != NULL)
				*epp = ptrn + 2;

			/*
			 * The two gotos above jump into this block, so all
			 * three "use the saved pattern" cases share it.
			 */
			/* Complain if we don't have a previous pattern. */
prev:			if (sp->re == NULL) {
				search_msg(sp, S_NOPREV);
				return (1);
			}
			/* Re-compile the search pattern if necessary. */
			if (!F_ISSET(sp, SC_RE_SEARCH) && re_compile(sp,
			    sp->re, sp->re_len, NULL, NULL, &sp->re_c,
			    RE_C_SEARCH |
			    (LF_ISSET(SEARCH_MSG) ? 0 : RE_C_SILENT)))
				return (1);

			/* Set the search direction. */
			if (LF_ISSET(SEARCH_SET))
				sp->searchdir = dir;
			return (0);
		}

		/*
		 * Set the delimiter, and move forward to the terminating
		 * delimiter, handling escaped delimiters.
		 *
		 * QUOTING NOTE:
		 * Only discard an escape character if it escapes a delimiter.
		 */
		/*
		 * p reads and t writes within the same buffer, so the
		 * pattern is compacted in place as escapes are dropped.
		 */
		for (delim = *ptrn, p = t = ++ptrn;; *t++ = *p++) {
			if (--plen == 0 || p[0] == delim) {
				if (plen != 0)
					++p;
				break;
			}
			if (plen > 1 && p[0] == '\\' && p[1] == delim) {
				++p;
				--plen;
			}
		}
		if (epp != NULL)
			*epp = p;

		/* Length of the compacted pattern. */
		plen = t - ptrn;
	}

	/* Compile the RE. */
	if (re_compile(sp, ptrn, plen, &sp->re, &sp->re_len, &sp->re_c,
	    RE_C_SEARCH |
	    (LF_ISSET(SEARCH_MSG) ? 0 : RE_C_SILENT) |
	    (LF_ISSET(SEARCH_TAG) ? RE_C_TAG : 0) |
	    (LF_ISSET(SEARCH_CSCOPE) ? RE_C_CSCOPE : 0)))
		return (1);

	/* Set the search direction. */
	if (LF_ISSET(SEARCH_SET))
		sp->searchdir = dir;

	return (0);
}
/*
 * Exchange the two branch targets of a split instruction.  Used for the
 * non-greedy form of an operator, which tries the branches in the opposite
 * order.
 */
static void re_swap_branches(Inst *split)
{
    Inst *tmp = split->br1;
    split->br1 = split->br2;
    split->br2 = tmp;
}

/*
 * Emit the instructions for the AST rooted at `ast` by appending to
 * re->insts through re_addInst().
 *
 * Forward targets are written as &re->insts[re->size], i.e. the slot the
 * next emitted instruction will occupy.
 *
 * Returns the first instruction emitted for `ast`, or NULL when `ast` is
 * NULL.
 */
static Inst *re_compile(Re *re, ReAst *ast)
{
    if (!ast) {
        return NULL;
    }

    switch (ast->type) {
    case Alt: {
        /* split L1, L2; L1: lhs; jmp END; L2: rhs; END: */
        Inst *split = re_addInst(re, ISplit, 0, NULL, NULL);
        split->br1 = re_compile(re, ast->lhs);
        Inst *jmp = re_addInst(re, IJmp, 0, NULL, NULL);
        split->br2 = re_compile(re, ast->rhs);
        jmp->br1 = &re->insts[re->size];
        return split;
    }
    case Concat: {
        Inst *first = re_compile(re, ast->lhs);
        re_compile(re, ast->rhs);
        return first;
    }
    case Char: {
        return re_addInst(re, IChar, ast->c, NULL, NULL);
    }
    case Any: {
        return re_addInst(re, IAny, 0, NULL, NULL);
    }
    case Star: {
        /* L1: split L2, END; L2: lhs; jmp L1; END: */
        Inst *split = re_addInst(re, ISplit, 0, NULL, NULL);
        Inst *body = re_compile(re, ast->lhs);
        re_addInst(re, IJmp, 0, split, NULL);
        split->br1 = body;
        split->br2 = &re->insts[re->size];
        if (ast->nongreedy) {
            re_swap_branches(split);
        }
        return split;
    }
    case Plus: {
        /* L1: lhs; split L1, END; END: */
        Inst *body = re_compile(re, ast->lhs);
        Inst *split = re_addInst(re, ISplit, 0, body, NULL);
        split->br2 = &re->insts[re->size];
        if (ast->nongreedy) {
            /*
             * The branches to exchange are those of the split emitted
             * after the operand.  `body` is the operand's own first
             * instruction and must be left alone.
             */
            re_swap_branches(split);
        }
        return body;
    }
    case Quest: {
        /* split L1, END; L1: lhs; END: */
        Inst *split = re_addInst(re, ISplit, 0, NULL, NULL);
        split->br1 = re_compile(re, ast->lhs);
        split->br2 = &re->insts[re->size];
        if (ast->nongreedy) {
            re_swap_branches(split);
        }
        return split;
    }
    case Paren: {
        /* save 2c; lhs; save 2c+1 */
        Inst *open = re_addInst(re, ISave, 2*ast->c, NULL, NULL);
        re_compile(re, ast->lhs);
        re_addInst(re, ISave, 2*ast->c + 1, NULL, NULL);
        return open;
    }
    default:
        assert(0);
        return NULL;
    }
}
/**
 *  Compile a list of regular expression patterns.
 *
 *  See the regcomp man page for the descriptions of the pattern
 *  strings and compile flags.
 *
 *  Error messages from this function are sent to the message handler
 *  (see msngr_init_log() and msngr_init_mail()).
 *
 *  If an existing list is passed in and an error occurs part way through,
 *  the patterns compiled before the error remain in that list.
 *
 *  @param  re_list   - pointer to the regular expressions list to add the
 *                      patterns to, or NULL to create a new list
 *  @param  npatterns - number of patterns to compile; a count of zero or
 *                      less adds nothing
 *  @param  patterns  - list of patterns to compile
 *  @param  cflags    - compile flags
 *
 *  @return
 *    - pointer to the regular expressions list
 *    - NULL if an error occurred
 */
REList *relist_compile(
    REList      *re_list,
    int          npatterns,
    const char **patterns,
    int          cflags)
{
    REList   *new_re_list = (REList *)NULL;
    int       new_nregs;
    char    **new_patterns;
    int      *new_cflags;
    regex_t **new_regs;
    regex_t  *preg;
    int       pi;

    /* Create a new REList if one was not specified */

    if (!re_list) {
        new_re_list = (REList *)calloc(1, sizeof(REList));
        if (!new_re_list) {
            goto MEMORY_ERROR;
        }
        re_list = new_re_list;
        re_list->mindex = -1;
    }

    /* With nothing to add the arrays must not be resized: on an empty list
     * that would be realloc(ptr, 0), which may free the array and return
     * NULL, and a negative count would shrink arrays still in use. */

    if (npatterns <= 0) {
        return(re_list);
    }

    /* Allocate space for the new patterns list */

    new_nregs = re_list->nregs + npatterns;

    new_patterns = (char **)realloc(
        re_list->patterns, new_nregs * sizeof(char *));

    if (!new_patterns) {
        goto MEMORY_ERROR;
    }

    re_list->patterns = new_patterns;

    /* Allocate space for the new cflags list */

    new_cflags = (int *)realloc(
        re_list->cflags, new_nregs * sizeof(int));

    if (!new_cflags) {
        goto MEMORY_ERROR;
    }

    re_list->cflags = new_cflags;

    /* Allocate space for the new regs list */

    new_regs = (regex_t **)realloc(
        re_list->regs, new_nregs * sizeof(regex_t *));

    if (!new_regs) {
        goto MEMORY_ERROR;
    }

    re_list->regs = new_regs;

    /* Compile the new regular expressions */

    for (pi = 0; pi < npatterns; pi++) {

        preg = (regex_t *)calloc(1, sizeof(regex_t));
        if (!preg) {
            goto MEMORY_ERROR;
        }

        if (!re_compile(preg, patterns[pi], cflags)) {
            free(preg);
            goto REGCOMP_ERROR;
        }

        re_list->patterns[re_list->nregs] = strdup(patterns[pi]);
        if (!re_list->patterns[re_list->nregs]) {
            regfree(preg);
            free(preg);
            goto MEMORY_ERROR;
        }

        /* nregs is only advanced once the entry is complete */
        re_list->cflags[re_list->nregs] = cflags;
        re_list->regs[re_list->nregs]   = preg;
        re_list->nregs++;
    }

    return(re_list);

MEMORY_ERROR:

    ERROR( ARMUTILS_LIB_NAME,
        "Could not compile list of regular expression patterns\n"
        " -> memory allocation error\n");

    /* falls through into the shared cleanup */

REGCOMP_ERROR:

    /* Only a list created by this call is released;
     * a caller-supplied list stays with the caller. */
    if (new_re_list) relist_free(new_re_list);
    return((REList *)NULL);
}
/*
 * ex_s --
 *	[line [,line]] s[ubstitute] [[/;]pat[/;]/repl[/;] [cgr] [count] [#lp]]
 *
 *	Substitute on lines matching a pattern.
 *
 *	Parses the delimiter, the pattern and the replacement out of the
 *	command argument, compiles the pattern (or re-uses the saved RE when
 *	the pattern is empty), stores the expanded replacement in sp->repl /
 *	sp->repl_len, and hands the rest of the argument to s().
 *
 *	Returns 1 on failure, otherwise whatever s() or ex_subagain()
 *	returns.
 *
 * PUBLIC: int ex_s(SCR *, EXCMD *);
 */
int
ex_s(SCR *sp, EXCMD *cmdp)
{
	regex_t *re;
	size_t blen, len;
	u_int flags;
	int delim;
	char *bp, *ptrn, *rep, *p, *t;

	/*
	 * Skip leading white space.
	 *
	 * !!!
	 * Historic vi allowed any non-alphanumeric to serve as the
	 * substitution command delimiter.
	 *
	 * !!!
	 * If the arguments are empty, it's the same as &, i.e. we
	 * repeat the last substitution.
	 */
	if (cmdp->argc == 0)
		goto subagain;
	for (p = cmdp->argv[0]->bp,
	    len = cmdp->argv[0]->len; len > 0; --len, ++p) {
		/* <ctype.h> functions need an unsigned char value. */
		if (!isblank((unsigned char)*p))
			break;
	}
	if (len == 0)
subagain:	return (ex_subagain(sp, cmdp));

	/*
	 * delim keeps the plain char value so that the p[n] == delim
	 * comparisons below still match; only the classification call
	 * converts it.
	 */
	delim = *p++;
	if (isalnum((unsigned char)delim) || delim == '\\')
		return (s(sp, cmdp, p, &sp->subre_c, SUB_MUSTSETR));

	/*
	 * !!!
	 * The full-blown substitute command reset the remembered
	 * state of the 'c' and 'g' suffices.
	 */
	sp->c_suffix = sp->g_suffix = 0;

	/*
	 * Get the pattern string, toss escaping characters.
	 *
	 * !!!
	 * Historic vi accepted any of the following forms:
	 *
	 *	:s/abc/def/		change "abc" to "def"
	 *	:s/abc/def		change "abc" to "def"
	 *	:s/abc/			delete "abc"
	 *	:s/abc			delete "abc"
	 *
	 * QUOTING NOTE:
	 *
	 * Only toss an escaping character if it escapes a delimiter.
	 * This means that "s/A/\\\\f" replaces "A" with "\\f".  It
	 * would be nice to be more regular, i.e. for each layer of
	 * escaping a single escaping character is removed, but that's
	 * not how the historic vi worked.
	 */
	for (ptrn = t = p;;) {
		if (p[0] == '\0' || p[0] == delim) {
			if (p[0] == delim)
				++p;
			/*
			 * !!!
			 * Nul terminate the pattern string -- it's passed
			 * to regcomp which doesn't understand anything else.
			 */
			*t = '\0';
			break;
		}
		if (p[0] == '\\') {
			if (p[1] == delim)
				++p;
			else if (p[1] == '\\')
				*t++ = *p++;
		}
		*t++ = *p++;
	}

	/*
	 * If the pattern string is empty, use the last RE (not just the
	 * last substitution RE).
	 */
	if (*ptrn == '\0') {
		if (sp->re == NULL) {
			ex_emsg(sp, NULL, EXM_NOPREVRE);
			return (1);
		}

		/* Re-compile the RE if necessary. */
		if (!F_ISSET(sp, SC_RE_SEARCH) && re_compile(sp,
		    sp->re, sp->re_len, NULL, NULL, &sp->re_c, RE_C_SEARCH))
			return (1);
		flags = 0;
	} else {
		/*
		 * !!!
		 * Compile the RE.  Historic practice is that substitutes set
		 * the search direction as well as both substitute and search
		 * RE's.  We compile the RE twice, as we don't want to bother
		 * ref counting the pattern string and (opaque) structure.
		 */
		if (re_compile(sp, ptrn, t - ptrn,
		    &sp->re, &sp->re_len, &sp->re_c, RE_C_SEARCH))
			return (1);
		if (re_compile(sp, ptrn, t - ptrn,
		    &sp->subre, &sp->subre_len, &sp->subre_c, RE_C_SUBST))
			return (1);

		flags = SUB_FIRST;
		sp->searchdir = FORWARD;
	}
	re = &sp->re_c;

	/*
	 * Get the replacement string.
	 *
	 * The special character & (\& if O_MAGIC not set) matches the
	 * entire RE.  No handling of & is required here, it's done by
	 * re_sub().
	 *
	 * The special character ~ (\~ if O_MAGIC not set) inserts the
	 * previous replacement string into this replacement string.
	 * Count ~'s to figure out how much space we need.  We could
	 * special case nonexistent last patterns or whether or not
	 * O_MAGIC is set, but it's probably not worth the effort.
	 *
	 * QUOTING NOTE:
	 *
	 * Only toss an escaping character if it escapes a delimiter or
	 * if O_MAGIC is set and it escapes a tilde.
	 *
	 * !!!
	 * If the entire replacement pattern is "%", then use the last
	 * replacement pattern.  This semantic was added to vi in System
	 * V and then percolated elsewhere, presumably around the time
	 * that it was added to their version of ed(1).
	 */
	if (p[0] == '\0' || p[0] == delim) {
		if (p[0] == delim)
			++p;
		if (sp->repl != NULL)
			free(sp->repl);
		sp->repl = NULL;
		sp->repl_len = 0;
	} else if (p[0] == '%' && (p[1] == '\0' || p[1] == delim))
		p += p[1] == delim ? 2 : 1;
	else {
		/*
		 * Size the buffer.  Every input byte yields at most one
		 * output byte, except a tilde, which may expand to the
		 * whole previous replacement.  A backslash and the byte it
		 * escapes are stepped over as a pair, so an escaped
		 * delimiter does not end the count early: the copy loop
		 * below continues past escaped delimiters and would
		 * otherwise write more than was reserved.
		 */
		for (rep = p, len = 0;
		    p[0] != '\0' && p[0] != delim; ++p, ++len) {
			if (p[0] == '\\' && p[1] != '\0') {
				++p;
				++len;
			}
			if (p[0] == '~')
				len += sp->repl_len;
		}
		GET_SPACE_RET(sp, bp, blen, len);

		/* Copy from the start again, dropping escapes as we go. */
		for (t = bp, len = 0, p = rep;;) {
			if (p[0] == '\0' || p[0] == delim) {
				if (p[0] == delim)
					++p;
				break;
			}
			if (p[0] == '\\') {
				if (p[1] == delim)
					++p;
				else if (p[1] == '\\') {
					*t++ = *p++;
					++len;
				} else if (p[1] == '~') {
					++p;
					if (!O_ISSET(sp, O_MAGIC))
						goto tilde;
				}
			} else if (p[0] == '~' && O_ISSET(sp, O_MAGIC)) {
tilde:				++p;
				memcpy(t, sp->repl, sp->repl_len);
				t += sp->repl_len;
				len += sp->repl_len;
				continue;
			}
			*t++ = *p++;
			++len;
		}

		/* Save a private copy as the new "previous replacement". */
		if ((sp->repl_len = len) != 0) {
			if (sp->repl != NULL)
				free(sp->repl);
			if ((sp->repl = malloc(len)) == NULL) {
				msgq(sp, M_SYSERR, NULL);
				FREE_SPACE(sp, bp, blen);
				return (1);
			}
			memcpy(sp->repl, bp, len);
		}
		FREE_SPACE(sp, bp, blen);
	}
	return (s(sp, cmdp, p, re, flags));
}
int main(int argc, char *argv[]) { LinkedList *ll = NULL; TreeSet *ts = NULL; char *sp; char pattern[4096]; RegExp *reg; Iterator *it; if (argc < 2) { fprintf(stderr, "Usage: ./fileCrawler pattern [dir] ...\n"); return -1; } /* * convert bash expression to regular expression and compile */ cvtPattern(pattern, argv[1]); if ((reg = re_create()) == NULL) { fprintf(stderr, "Error creating Regular Expression Instance\n"); return -1; } if (! re_compile(reg, pattern)) { char eb[4096]; re_status(reg, eb, sizeof eb); fprintf(stderr, "Compile error - pattern: `%s', error message: `%s'\n", pattern, eb); re_destroy(reg); return -1; } /* * create linked list and treeset */ if ((ll = ll_create()) == NULL) { fprintf(stderr, "Unable to create linked list\n"); goto done; } if ((ts = ts_create(scmp)) == NULL) { fprintf(stderr, "Unable to create tree set\n"); goto done; } /* * populate linked list */ if (argc == 2) { if (! processDirectory(".", ll, 1)) goto done; } else { int i; for (i = 2; i < argc; i++) { if (! processDirectory(argv[i], ll, 1)) goto done; } } /* * for each directory in the linked list, apply regular expression */ while (ll_removeFirst(ll, (void **)&sp)) { int stat = applyRe(sp, reg, ts); free(sp); if (! stat) break; } /* * create iterator to traverse files matching pattern in sorted order */ if ((it = ts_it_create(ts)) == NULL) { fprintf(stderr, "Unable to create iterator over tree set\n"); goto done; } while (it_hasNext(it)) { char *s; (void) it_next(it, (void **)&s); printf("%s\n", s); } it_destroy(it); /* * cleanup after ourselves so there are no memory leaks */ done: if (ll != NULL) ll_destroy(ll, free); if (ts != NULL) ts_destroy(ts, free); re_destroy(reg); return 0; }