/**
 * Feed the next piece of subject text to the DFA matcher.
 *
 * PCRE_DFA_RESTART is always OR-ed into the caller's @p flag, so the call
 * continues from whatever state an earlier call left in @p workspace.
 *
 * @param str           subject text
 * @param str_len       length of @p str
 * @param flag          extra PCRE option bits
 * @param ovector       output vector for match offsets
 * @param ovector_size  number of ints in @p ovector
 * @param workspace     DFA workspace
 * @param wscount       number of ints in @p workspace
 * @return the pcre_dfa_exec() result, or -1 when the PCRE headers in use do
 *         not define PCRE_DFA_SHORTEST
 */
int KReg::matchNext(const char *str, int str_len, int flag, int *ovector, int ovector_size, int *workspace, int wscount)
{
#ifndef PCRE_DFA_SHORTEST
	return -1;
#else
	const int exec_options = PCRE_DFA_RESTART | flag;
	return pcre_dfa_exec(c_model, pe,
	                     str, str_len,
	                     0, /* start offset */
	                     exec_options,
	                     ovector, ovector_size,
	                     workspace, wscount);
#endif
}
//----------------------------------------------------------------------------// RegexMatcher::MatchState PCRERegexMatcher::getMatchStateOfString( const String& str) const { // if the regex is not valid, then an exception is thrown if (!d_regex) CEGUI_THROW(InvalidRequestException( "Attempt to use invalid RegEx '" + d_string + "'.")); int match[3]; const char* utf8_str = str.c_str(); const int len = static_cast<int>(strlen(utf8_str)); #ifdef PCRE_PARTIAL_SOFT // we are using a new version of pcre const int result = pcre_exec(d_regex, 0, utf8_str, len, 0, PCRE_PARTIAL_SOFT | PCRE_ANCHORED, match, 3); #else // PCRE_PARTIAL is a backwards compatible synonym for PCRE_PARTIAL_SOFT // using it is a requirement if we want to support pcre < 8.0 // Older versions of pcre have problems doing partial matching of // single repeated characters if using pcre_exec, // It is suggested to use pcre_dfa_exec instead. int workspace[100]; // FIXME: persist the workspace between match attempts const int result = pcre_dfa_exec(d_regex, 0, utf8_str, len, 0, PCRE_PARTIAL | PCRE_ANCHORED, match, 3, workspace, 100); #endif if (result == PCRE_ERROR_PARTIAL) return MS_PARTIAL; // a match must be for the entire string if (result >= 0) return (match[1] - match[0] == len) ? MS_VALID : MS_INVALID; // no match found or if test string or regex is 0 if (result == PCRE_ERROR_NOMATCH || result == PCRE_ERROR_NULL) return MS_INVALID; // anything else is an error CEGUI_THROW(InvalidRequestException( "PCRE Error: " + PropertyHelper<int>::toString(result) + " occurred while attempting to match the RegEx '" + d_string + "'.")); }
/*
** Run the DFA matcher over a C string and store the outcome in prog.
**
** prog->matches receives the raw pcre_dfa_exec() result. On success
** (result >= 0) prog->orig is pointed at str and ajTrue is returned; a result
** of exactly 0 additionally produces a warning. On failure prog->orig is
** cleared and ajFalse is returned; any status other than -1 is also written
** to the debug output together with a trace of prog.
*/
AjBool ajRegExecallC(AjPRegexp prog, const char* str)
{
    /* allocate the workspace the first time it is found unset */
    if(!regDfaWorkspace)
    {
        AJCNEW(regDfaWorkspace, regDfaWsCount);
    }

    prog->matches = pcre_dfa_exec(prog->pcre, prog->extra,
                                  str, strlen(str),
                                  0,    /* start offset */
                                  0,    /* options */
                                  prog->ovector, 3*prog->ovecsize,
                                  regDfaWorkspace, regDfaWsCount);

    if(prog->matches < 0)
    {
        /* -1 is a simple fail to match; others are recursion limits etc. */
        if(prog->matches != -1)
        {
            ajDebug("ajRegExecallC returned unexpected status '%d'\n",
                    prog->matches);
            prog->orig = str;           /* needed for the trace */
            ajRegTrace(prog);
        }

        prog->orig = NULL;

        return ajFalse;
    }

    prog->orig = str;

    if(!prog->matches)
        ajWarn("ajRegExecallC too many substrings");

    return ajTrue;
}
/*
 * Lua binding around pcre_dfa_exec().
 *
 * On a full or partial match three values are pushed: the start offset plus
 * one, a table holding the end offset of every reported match, and the raw
 * pcre_dfa_exec() result. On ALG_NOMATCH a single nil is pushed. Any other
 * result is handed to generate_error().
 */
static int Lpcre_dfa_exec (lua_State *L)
{
  TArgExec argE;
  TPcre *ud;
  int *block, *offsets, *work;
  int rc, count, k;

  checkarg_dfa_exec (L, &argE, &ud);

  /* a single allocation holds the offset vector followed by the workspace */
  block = (int*) Lmalloc (L, (argE.ovecsize + argE.wscount) * sizeof(int));
  offsets = block;
  work = block + argE.ovecsize;

  rc = pcre_dfa_exec (ud->pr, ud->extra, argE.text, (int)argE.textlen,
    argE.startoffset, argE.eflags, offsets, argE.ovecsize, work, argE.wscount);

  if (!ALG_ISMATCH (rc) && rc != PCRE_ERROR_PARTIAL) {
    free (block);
    if (rc == ALG_NOMATCH) {
      lua_pushnil (L);
      return 1;
    }
    return generate_error (L, ud, rc);
  }

  /* number of end offsets to report:
   *   rc > 0  -> rc of them
   *   rc == 0 -> as many pairs as the offset vector can hold
   *   rc < 0  -> (partial match) just one */
  if (rc > 0)
    count = rc;
  else if (rc == 0)
    count = (int)argE.ovecsize/2;
  else
    count = 1;

  lua_pushinteger (L, offsets[0] + 1);       /* 1-st return value */
  lua_newtable (L);                          /* 2-nd return value */
  for (k = 1; k <= count; k++) {
    lua_pushinteger (L, offsets[2*k - 1]);
    lua_rawseti (L, -2, k);
  }
  lua_pushinteger (L, rc);                   /* 3-rd return value */

  free (block);
  return 3;
}
/** * @brief Execute the rule. * * @param[in] ib Ironbee engine * @param[in] tx The transaction. * @param[in,out] User data. A @c pcre_rule_data_t. * @param[in] flags Operator instance flags * @param[in] field The field content. * @param[out] result The result. * @returns IB_OK most times. IB_EALLOC when a memory allocation error handles. */ static ib_status_t dfa_operator_execute(ib_engine_t *ib, ib_tx_t *tx, const ib_rule_t *rule, void *data, ib_flags_t flags, ib_field_t *field, ib_num_t *result) { IB_FTRACE_INIT(); assert(tx); assert(data); int matches; ib_status_t ib_rc; const int ovecsize = 3 * MATCH_MAX; dfa_rule_data_t *rule_data; int *ovector; const char* subject; size_t subject_len; const ib_bytestr_t* bytestr; dfa_workspace_t *dfa_workspace; int options; /* dfa exec options. */ ovector = (int *)malloc(ovecsize*sizeof(*ovector)); if (ovector==NULL) { IB_FTRACE_RET_STATUS(IB_EALLOC); } /* Pull out the rule data. */ rule_data = (dfa_rule_data_t *)data; if (field->type == IB_FTYPE_NULSTR) { ib_rc = ib_field_value(field, ib_ftype_nulstr_out(&subject)); if (ib_rc != IB_OK) { free(ovector); IB_FTRACE_RET_STATUS(ib_rc); } subject_len = strlen(subject); } else if (field->type == IB_FTYPE_BYTESTR) { ib_rc = ib_field_value(field, ib_ftype_bytestr_out(&bytestr)); if (ib_rc != IB_OK) { free(ovector); IB_FTRACE_RET_STATUS(ib_rc); } subject_len = ib_bytestr_length(bytestr); subject = (const char *) ib_bytestr_const_ptr(bytestr); } else { free(ovector); IB_FTRACE_RET_STATUS(IB_EINVAL); } /* Debug block. Escapes a string and prints it to the log. * Memory is freed. */ if (ib_log_get_level(ib) >= 9) { /* Worst case, we can have a string that is 4x larger. * Consider if a string of 0xF7 is passed. That single character * will expand to a string of 4 printed characters +1 for the \0 * character. 
*/ char *debug_str = ib_util_hex_escape(subject, subject_len); if ( debug_str != NULL ) { ib_log_debug3_tx(tx, "Matching against: %s", debug_str); free( debug_str ); } } /* Get the per-tx workspace data for this rule data id. */ ib_rc = get_dfa_tx_data(tx, rule_data->id, &dfa_workspace); if (ib_rc == IB_ENOENT) { options = PCRE_PARTIAL_SOFT; ib_rc = alloc_dfa_tx_data(tx, rule_data->id, &dfa_workspace); if (ib_rc != IB_OK) { free(ovector); ib_log_error_tx(tx, "Unexpected error creating tx storage " "for dfa operator %s", rule_data->id); IB_FTRACE_RET_STATUS(ib_rc); } ib_log_debug_tx(tx, "Created DFA workspace at %p for id %s.", dfa_workspace, rule_data->id); } else if (ib_rc == IB_OK) { options = PCRE_PARTIAL_SOFT | PCRE_DFA_RESTART; ib_log_debug_tx(tx, "Reusing existing DFA workspace %p for id %s.", dfa_workspace, rule_data->id); } else { free(ovector); ib_log_error_tx(tx, "Unexpected error fetching dfa data " "for dfa operator %s", rule_data->id); IB_FTRACE_RET_STATUS(ib_rc); } /* Actually do the DFA match. */ matches = pcre_dfa_exec(rule_data->cpatt, rule_data->edata, subject, subject_len, 0, /* Starting offset. */ options, ovector, ovecsize, dfa_workspace->workspace, dfa_workspace->wscount); if (matches >= 0) { ib_rc = IB_OK; *result = 1; } else if (matches == PCRE_ERROR_PARTIAL) { ib_log_debug2_tx(tx, "Partial match found, but not a full match."); ib_rc = IB_OK; *result = 0; } else if (matches == PCRE_ERROR_NOMATCH) { if (ib_log_get_level(ib) >= 7) { char* tmp_c = malloc(subject_len+1); memcpy(tmp_c, subject, subject_len); tmp_c[subject_len] = '\0'; /* No match. Return false to the caller (*result = 0). */ ib_log_debug2_tx(tx, "No match for [%s] using pattern [%s].", tmp_c, rule_data->patt); free(tmp_c); } ib_rc = IB_OK; *result = 0; } else { /* Some other error occurred. Set the status to false and report the error. */ ib_rc = IB_EUNKNOWN; *result = 0; } free(ovector); IB_FTRACE_RET_STATUS(ib_rc); }
CAMLprim value pcre_exec_stub0( intnat v_opt, value v_rex, intnat v_pos, intnat v_subj_start, value v_subj, value v_ovec, value v_maybe_cof, value v_workspace) { int ret; int is_dfa = v_workspace != (value) NULL; long pos = v_pos, len = caml_string_length(v_subj), subj_start = v_subj_start; long ovec_len = Wosize_val(v_ovec); if (pos > len || pos < subj_start) caml_invalid_argument("Pcre.pcre_exec_stub: illegal position"); if (subj_start > len || subj_start < 0) caml_invalid_argument("Pcre.pcre_exec_stub: illegal subject start"); pos -= subj_start; len -= subj_start; { const pcre *code = get_rex(v_rex); /* Compiled pattern */ const pcre_extra *extra = get_extra(v_rex); /* Extra info */ const char *ocaml_subj = String_val(v_subj) + subj_start; /* Subject string */ const int opt = v_opt; /* Runtime options */ /* Special case when no callout functions specified */ if (v_maybe_cof == None) { int *ovec = (int *) &Field(v_ovec, 0); /* Performs the match */ if (is_dfa) ret = pcre_dfa_exec(code, extra, ocaml_subj, len, pos, opt, ovec, ovec_len, (int *) &Field(v_workspace, 0), Wosize_val(v_workspace)); else ret = pcre_exec(code, extra, ocaml_subj, len, pos, opt, ovec, ovec_len); if (ret < 0) handle_exec_error("pcre_exec_stub", ret); else handle_pcre_exec_result(ovec, v_ovec, ovec_len, subj_start, ret); } /* There are callout functions */ else { value v_cof = Field(v_maybe_cof, 0); value v_substrings; char *subj = caml_stat_alloc(sizeof(char) * len); int *ovec = caml_stat_alloc(sizeof(int) * ovec_len); int workspace_len; int *workspace; struct cod cod = { 0, (value *) NULL, (value *) NULL, (value) NULL }; struct pcre_extra new_extra = #ifdef PCRE_EXTRA_MATCH_LIMIT_RECURSION # ifdef PCRE_EXTRA_MARK # ifdef PCRE_EXTRA_EXECUTABLE_JIT { PCRE_EXTRA_CALLOUT_DATA, NULL, 0, NULL, NULL, 0, NULL, NULL }; # else { PCRE_EXTRA_CALLOUT_DATA, NULL, 0, NULL, NULL, 0, NULL }; # endif # else { PCRE_EXTRA_CALLOUT_DATA, NULL, 0, NULL, NULL, 0 }; # endif #else { PCRE_EXTRA_CALLOUT_DATA, NULL, 0, 
NULL, NULL }; #endif cod.subj_start = subj_start; memcpy(subj, ocaml_subj, len); Begin_roots4(v_rex, v_cof, v_substrings, v_ovec); Begin_roots1(v_subj); v_substrings = caml_alloc_small(2, 0); End_roots(); Field(v_substrings, 0) = v_subj; Field(v_substrings, 1) = v_ovec; cod.v_substrings_p = &v_substrings; cod.v_cof_p = &v_cof; new_extra.callout_data = &cod; if (extra != NULL) { new_extra.flags = PCRE_EXTRA_CALLOUT_DATA | extra->flags; new_extra.study_data = extra->study_data; new_extra.match_limit = extra->match_limit; new_extra.tables = extra->tables; #ifdef PCRE_EXTRA_MATCH_LIMIT_RECURSION new_extra.match_limit_recursion = extra->match_limit_recursion; #endif } if (is_dfa) { workspace_len = Wosize_val(v_workspace); workspace = caml_stat_alloc(sizeof(int) * workspace_len); ret = pcre_dfa_exec(code, extra, subj, len, pos, opt, ovec, ovec_len, (int *) &Field(v_workspace, 0), workspace_len); } else ret = pcre_exec(code, &new_extra, subj, len, pos, opt, ovec, ovec_len); caml_stat_free(subj); End_roots(); if (ret < 0) { if (is_dfa) caml_stat_free(workspace); caml_stat_free(ovec); if (ret == PCRE_ERROR_CALLOUT) caml_raise(cod.v_exn); else handle_exec_error("pcre_exec_stub(callout)", ret); } else { handle_pcre_exec_result(ovec, v_ovec, ovec_len, subj_start, ret); if (is_dfa) { caml_int_ptr ocaml_workspace_dst = (caml_int_ptr) &Field(v_workspace, 0); const int *workspace_src = workspace; const int *workspace_src_stop = workspace + workspace_len; while (workspace_src != workspace_src_stop) { *ocaml_workspace_dst = *workspace_src; ocaml_workspace_dst++; workspace_src++; } caml_stat_free(workspace); } caml_stat_free(ovec); } } } return Val_unit; } CAMLprim value pcre_exec_stub( intnat v_opt, value v_rex, intnat v_pos, intnat v_subj_start, value v_subj, value v_ovec, value v_maybe_cof) { return pcre_exec_stub0(v_opt, v_rex, v_pos, v_subj_start, v_subj, v_ovec, v_maybe_cof, (value) NULL); } /* Byte-code hook for pcre_exec_stub Needed, because there are more than 5 arguments */ 
CAMLprim value pcre_exec_stub_bc(value *argv, int __unused argn) { return pcre_exec_stub0( Int_val(argv[0]), argv[1], Int_val(argv[2]), Int_val(argv[3]), argv[4], argv[5], argv[6], (value) NULL); } /* Byte-code hook for pcre_dfa_exec_stub Needed, because there are more than 5 arguments */ CAMLprim value pcre_dfa_exec_stub_bc(value *argv, int __unused argn) { return pcre_exec_stub0( Int_val(argv[0]), argv[1], Int_val(argv[2]), Int_val(argv[3]), argv[4], argv[5], argv[6], argv[7]); } static struct custom_operations tables_ops = { "pcre_ocaml_tables", pcre_dealloc_tables, custom_compare_default, custom_hash_default, custom_serialize_default, custom_deserialize_default, custom_compare_ext_default }; /* Generates a new set of chartables for the current locale (see man page of PCRE */ CAMLprim value pcre_maketables_stub(value __unused v_unit) { /* GC will do a full cycle every 1_000_000 table set allocations (one table set consumes 864 bytes -> maximum of 864_000_000 bytes unreclaimed table sets) */ const value v_tables = caml_alloc_custom( &tables_ops, sizeof(struct pcre_ocaml_tables), 1, 1000000); set_tables(v_tables, pcre_maketables()); return v_tables; } /* Wraps around the isspace-function */ CAMLprim value pcre_isspace_stub(value v_c) { return Val_bool(isspace(Int_val(v_c))); } /* Returns number of substring associated with a name */ CAMLprim intnat pcre_get_stringnumber_stub(value v_rex, value v_name) { const int ret = pcre_get_stringnumber(get_rex(v_rex), String_val(v_name)); if (ret == PCRE_ERROR_NOSUBSTRING) caml_invalid_argument("Named string not found"); return ret; } CAMLprim value pcre_get_stringnumber_stub_bc(value v_rex, value v_name) { return Val_int(pcre_get_stringnumber_stub(v_rex, v_name)); } /* Returns array of names of named substrings in a regexp */ CAMLprim value pcre_names_stub(value v_rex) { CAMLparam0(); CAMLlocal1(v_res); int name_count; int entry_size; const char *tbl_ptr; int i; int ret = pcre_fullinfo_stub(v_rex, PCRE_INFO_NAMECOUNT, 
&name_count); if (ret != 0) raise_internal_error("pcre_names_stub: namecount"); ret = pcre_fullinfo_stub(v_rex, PCRE_INFO_NAMEENTRYSIZE, &entry_size); if (ret != 0) raise_internal_error("pcre_names_stub: nameentrysize"); ret = pcre_fullinfo_stub(v_rex, PCRE_INFO_NAMETABLE, &tbl_ptr); if (ret != 0) raise_internal_error("pcre_names_stub: nametable"); v_res = caml_alloc(name_count, 0); for (i = 0; i < name_count; ++i) { value v_name = caml_copy_string(tbl_ptr + 2); Store_field(v_res, i, v_name); tbl_ptr += entry_size; } CAMLreturn(v_res); } /* Generic stub for getting integer results from pcre_config */ static inline int pcre_config_int(int what) { int ret; pcre_config(what, (void *) &ret); return ret; } /* Generic stub for getting long integer results from pcre_config */ static inline int pcre_config_long(int what) { long ret; pcre_config(what, (void *) &ret); return ret; }
/** * @brief Execute the dfa operator * * @param[in] tx Current transaction. * @param[in] instance_data Instance data needed for execution. * @param[in] field The field to operate on. * @param[in] capture If non-NULL, the collection to capture to. * @param[out] result The result of the operator 1=true 0=false. * @param[in] cbdata Callback data. * * @returns IB_OK most times. IB_EALLOC when a memory allocation error handles. */ static ib_status_t dfa_operator_execute( ib_tx_t *tx, void *instance_data, const ib_field_t *field, ib_field_t *capture, ib_num_t *result, void *cbdata ) { assert(instance_data != NULL); assert(tx != NULL); int matches; ib_status_t ib_rc; const int ovecsize = 3 * MATCH_MAX; modpcre_operator_data_t *operator_data = (modpcre_operator_data_t *)instance_data; int *ovector; const char *subject; size_t subject_len; const ib_bytestr_t *bytestr; dfa_workspace_t *dfa_workspace; const char *id = operator_data->id; int options; /* dfa exec options. */ int start_offset; int match_count; const ib_module_t *m = (const ib_module_t *)cbdata; assert(m != NULL); assert(operator_data->cpdata->is_dfa == true); ovector = (int *)malloc(ovecsize*sizeof(*ovector)); if (ovector==NULL) { return IB_EALLOC; } if (field->type == IB_FTYPE_NULSTR) { ib_rc = ib_field_value(field, ib_ftype_nulstr_out(&subject)); if (ib_rc != IB_OK) { free(ovector); return ib_rc; } subject_len = strlen(subject); } else if (field->type == IB_FTYPE_BYTESTR) { ib_rc = ib_field_value(field, ib_ftype_bytestr_out(&bytestr)); if (ib_rc != IB_OK) { free(ovector); return ib_rc; } subject_len = ib_bytestr_length(bytestr); subject = (const char *) ib_bytestr_const_ptr(bytestr); } else { free(ovector); return IB_EINVAL; } /* Get the per-tx workspace data for this rule data id. */ ib_rc = get_dfa_tx_data(m, tx, id, &dfa_workspace); if (ib_rc == IB_ENOENT) { /* First time we are called, clear the captures. 
*/ if (capture) { ib_rc = ib_capture_clear(capture); if (ib_rc != IB_OK) { ib_log_error_tx(tx, "Error clearing captures: %s", ib_status_to_string(ib_rc)); } } options = PCRE_PARTIAL_SOFT; ib_rc = alloc_dfa_tx_data(m, tx, operator_data->cpdata, id, &dfa_workspace); if (ib_rc != IB_OK) { free(ovector); return ib_rc; } } else if (ib_rc == IB_OK) { options = PCRE_PARTIAL_SOFT | PCRE_DFA_RESTART; } else { free(ovector); return ib_rc; } /* Perform the match. * If capturing is specified, then find all matches. */ start_offset = 0; match_count = 0; do { matches = pcre_dfa_exec(operator_data->cpdata->cpatt, operator_data->cpdata->edata, subject, subject_len, start_offset, /* Starting offset. */ options, ovector, ovecsize, dfa_workspace->workspace, dfa_workspace->wscount); if (matches > 0) { ++match_count; /* Use the longest match - the first in ovector - * to set the offset in the subject for the next * match. */ start_offset = ovector[1] + 1; if (capture) { pcre_dfa_set_match(tx, capture, ovector, 1, subject); } } } while (capture && (matches > 0)); if (match_count > 0) { ib_rc = IB_OK; *result = 1; } else if ((matches == 0) || (matches == PCRE_ERROR_NOMATCH)) { ib_rc = IB_OK; *result = 0; } else if (matches == PCRE_ERROR_PARTIAL) { ib_rc = IB_OK; *result = 0; } else { /* Some other error occurred. Set the status to false and * return the error. */ ib_rc = IB_EUNKNOWN; *result = 0; } free(ovector); return ib_rc; }
void pcre_find_all(char* pattern, char* subject, int subject_len, int repeat, int mode) { pcre *re; const char *error; int err_val, match[64]; pcre_extra *extra; pcre_jit_stack *stack = NULL; char *ptr; int len; clock_t best_time = 0, time = 0; int found; static int work_space[4096]; re = pcre_compile( pattern, /* the pattern */ PCRE_MULTILINE | PCRE_NEWLINE_ANYCRLF, /* options */ &error, /* for error message */ &err_val, /* for error offset */ NULL); /* use default character tables */ if (!re) { printf("PCRE compilation failed at offset %d: %s\n", err_val, error); return; } error = NULL; extra = pcre_study(re, mode == 2 ? PCRE_STUDY_JIT_COMPILE : 0, &error); if (error) { printf("PCRE study failed: %s\n", error); return; } if (mode == 2) { found = 0; pcre_fullinfo(re, extra, PCRE_INFO_JIT, &found); if (!found) { printf("PCRE JIT compilation failed: %s\n", error); return; } stack = pcre_jit_stack_alloc(65536, 65536); } do { found = 0; ptr = subject; len = subject_len; switch (mode) { case 0: time = clock(); while (1) { err_val = pcre_exec( re, /* the compiled pattern */ extra, /* extra data */ ptr, /* the subject string */ len, /* the length of the subject */ 0, /* start at offset 0 in the subject */ 0, /* default options */ match, /* output vector for substring information */ 64); /* number of elements in the output vector */ if (err_val <= 0) { if (err_val == PCRE_ERROR_NOMATCH) break; printf("PCRE pcre_exec failed with: %d\n", err_val); break; } // printf("match: %d %d\n", (ptr - subject) + match[0], (ptr - subject) + match[1]); ptr += match[1]; len -= match[1]; found++; } time = clock() - time; break; case 1: time = clock(); while (1) { err_val = pcre_dfa_exec( re, /* the compiled pattern */ extra, /* extra data */ ptr, /* the subject string */ len, /* the length of the subject */ 0, /* start at offset 0 in the subject */ 0, /* default options */ match, /* output vector for substring information */ 2, /* number of elements in the output vector */ work_space, /* 
number of elements (NOT size in bytes) */ 4096); if (err_val < 0) { if (err_val == PCRE_ERROR_NOMATCH) break; printf("PCRE pcre_exec failed\n"); break; } // printf("match: %d %d\n", (ptr - subject) + match[0], (ptr - subject) + match[1]); ptr += match[1]; len -= match[1]; found++; } time = clock() - time; break; case 2: time = clock(); while (1) { err_val = pcre_jit_exec( re, /* the compiled pattern */ extra, /* extra data */ ptr, /* the subject string */ len, /* the length of the subject */ 0, /* start at offset 0 in the subject */ 0, /* default options */ match, /* output vector for substring information */ 64, /* number of elements in the output vector */ stack); /* jit stack */ if (err_val <= 0) { if (err_val == PCRE_ERROR_NOMATCH) break; printf("PCRE pcre_exec failed with: %d\n", err_val); break; } // printf("match: %d %d\n", (ptr - subject) + match[0], (ptr - subject) + match[1]); ptr += match[1]; len -= match[1]; found++; } time = clock() - time; break; } if (!best_time || time < best_time) best_time = time; } while (--repeat > 0); printResult(mode == 0 ? "pcre" : (mode == 1 ? "pcre-dfa" : "pcre-jit"), best_time * 1000 / CLOCKS_PER_SEC, found); if (extra) pcre_free_study(extra); if (stack) pcre_jit_stack_free(stack); pcre_free(re); }