/**
 * Initialize a RegexTokenizer.
 *
 * Runs the Analyzer initializer on `self`, stores the token pattern and asks
 * the host (via the "compile_token_re" callback) for a compiled regex.
 *
 * @param self     Object to initialize.
 * @param pattern  Token pattern, or NULL to use DEFAULT_PATTERN.  Patterns
 *                 containing "\p" or "\P" are rejected.
 * @return `self`.
 *
 * Throws LUCY_ERR if the pattern contains a forbidden construct or if the
 * host callback does not hand back a reference.  On those paths `self` is
 * released before throwing.
 */
lucy_RegexTokenizer*
lucy_RegexTokenizer_init(lucy_RegexTokenizer *self,
                         const lucy_CharBuf *pattern) {
    SV *token_re_sv;

    lucy_Analyzer_init((lucy_Analyzer*)self);

    #define DEFAULT_PATTERN "\\w+(?:['\\x{2019}]\\w+)*"
    if (pattern) {
        if (Lucy_CB_Find_Str(pattern, "\\p", 2) != -1
            || Lucy_CB_Find_Str(pattern, "\\P", 2) != -1
           ) {
            CFISH_DECREF(self);
            THROW(LUCY_ERR, "\\p and \\P constructs forbidden");
        }
        self->pattern = Lucy_CB_Clone(pattern);
    }
    else {
        self->pattern = lucy_CB_new_from_trusted_utf8(
                            DEFAULT_PATTERN, sizeof(DEFAULT_PATTERN) - 1);
    }

    // Acquire a compiled regex engine for matching one token.
    token_re_sv = (SV*)lucy_Host_callback_host(
                      LUCY_REGEXTOKENIZER, "compile_token_re", 1,
                      CFISH_ARG_STR("pattern", self->pattern));

    // SvRV() is only meaningful on a reference; refuse anything else rather
    // than dereferencing an arbitrary scalar.  Clean up the same way the
    // pattern-validation failure above does, since THROW is presumed not to
    // return.
    if (!SvROK(token_re_sv)) {
        SvREFCNT_dec(token_re_sv);
        CFISH_DECREF(self);
        THROW(LUCY_ERR, "token_re is not a qr// entity");
    }

    S_set_token_re_but_not_pattern(self, SvRV(token_re_sv));

    // Drop our reference to the callback result now that the helper has
    // been given the referent.
    SvREFCNT_dec(token_re_sv);

    return self;
}
/**
 * Initialize a RegexTokenizer.
 *
 * Runs the Analyzer initializer on `self`, stores the token pattern in the
 * object's ivars, compiles it through S_compile_token_re() and keeps a
 * counted reference to the resulting REGEXP in `ivars->token_re`.
 *
 * @param self     Object to initialize.
 * @param pattern  Token pattern, or NULL to use DEFAULT_PATTERN.  Patterns
 *                 containing "\p" or "\P" are rejected.
 * @return `self`.
 *
 * Throws CFISH_ERR if the pattern contains a forbidden construct, if the
 * compiled value is not a qr// entity (Perl <= 5.10 code path), or if no
 * REGEXP can be extracted from it.  Every failure path releases `self` and
 * the temporary compiled SV before throwing, because THROW is presumed not
 * to return (the pattern-validation path already relied on that).
 */
lucy_RegexTokenizer*
lucy_RegexTokenizer_init(lucy_RegexTokenizer *self,
                         cfish_String *pattern) {
    lucy_Analyzer_init((lucy_Analyzer*)self);
    lucy_RegexTokenizerIVARS *const ivars = lucy_RegexTokenizer_IVARS(self);

    #define DEFAULT_PATTERN "\\w+(?:['\\x{2019}]\\w+)*"
    if (pattern) {
        if (CFISH_Str_Contains_Utf8(pattern, "\\p", 2)
            || CFISH_Str_Contains_Utf8(pattern, "\\P", 2)
           ) {
            CFISH_DECREF(self);
            THROW(CFISH_ERR, "\\p and \\P constructs forbidden");
        }
        ivars->pattern = CFISH_Str_Clone(pattern);
    }
    else {
        ivars->pattern = cfish_Str_new_from_trusted_utf8(
                             DEFAULT_PATTERN, sizeof(DEFAULT_PATTERN) - 1);
    }

    // Acquire a compiled regex engine for matching one token.
    dTHX;
    SV *token_re = S_compile_token_re(aTHX_ ivars->pattern);
#if (PERL_VERSION > 10)
    REGEXP *rx = SvRX((SV*)token_re);
#else
    if (!SvROK(token_re)) {
        // Release the compiled SV and the half-built object before throwing.
        SvREFCNT_dec(token_re);
        CFISH_DECREF(self);
        THROW(CFISH_ERR, "token_re is not a qr// entity");
    }
    SV *inner = SvRV(token_re);
    MAGIC *magic = NULL;
    if (SvMAGICAL((SV*)inner)) {
        magic = mg_find((SV*)inner, PERL_MAGIC_qr);
    }
    if (!magic) {
        SvREFCNT_dec(token_re);
        CFISH_DECREF(self);
        THROW(CFISH_ERR, "token_re is not a qr// entity");
    }
    REGEXP *rx = (REGEXP*)magic->mg_obj;
#endif
    if (rx == NULL) {
        // The error message still needs token_re's string value, so hand the
        // SV to the temps stack instead of freeing it immediately; Perl
        // reclaims it when the exception unwinds.
        sv_2mortal(token_re);
        CFISH_DECREF(self);
        THROW(CFISH_ERR, "Failed to extract REGEXP from token_re '%s'",
              SvPV_nolen((SV*)token_re));
    }

    // Hold our own reference to the REGEXP, then drop the SV that
    // S_compile_token_re() returned.
    ivars->token_re = rx;
    (void)ReREFCNT_inc(((REGEXP*)ivars->token_re));
    SvREFCNT_dec(token_re);

    return self;
}