/*
 * Initialize a RegexTokenizer.
 *
 * Runs the parent Analyzer initializer, records the token pattern, compiles
 * it through S_compile_token_re() and stores the resulting Perl REGEXP in
 * the object's ivars, holding one reference count on it.
 *
 * self     Object to initialize.  On every failure path it is released with
 *          CFISH_DECREF before the error is thrown.
 * pattern  Token pattern, or NULL to use DEFAULT_PATTERN.  A supplied
 *          pattern is cloned.  Patterns containing "\p" or "\P" are
 *          rejected.
 *
 * Returns self.
 *
 * Throws CFISH_ERR if the pattern contains "\p" or "\P", or if no REGEXP
 * can be extracted from the compiled pattern.
 */
lucy_RegexTokenizer*
lucy_RegexTokenizer_init(lucy_RegexTokenizer *self, cfish_String *pattern) {
    lucy_Analyzer_init((lucy_Analyzer*)self);
    lucy_RegexTokenizerIVARS *const ivars = lucy_RegexTokenizer_IVARS(self);
#define DEFAULT_PATTERN "\\w+(?:['\\x{2019}]\\w+)*"
    if (pattern) {
        if (CFISH_Str_Contains_Utf8(pattern, "\\p", 2)
            || CFISH_Str_Contains_Utf8(pattern, "\\P", 2)
           ) {
            CFISH_DECREF(self);
            THROW(CFISH_ERR, "\\p and \\P constructs forbidden");
        }
        ivars->pattern = CFISH_Str_Clone(pattern);
    }
    else {
        ivars->pattern = cfish_Str_new_from_trusted_utf8(
                             DEFAULT_PATTERN, sizeof(DEFAULT_PATTERN) - 1);
    }

    // Acquire a compiled regex engine for matching one token.
    dTHX;
    // Mortalize the compiled SV so that our reference to it is dropped on
    // every exit path, including the THROWs below, while keeping it alive
    // long enough to be stringified for the error message.
    SV *token_re = sv_2mortal(S_compile_token_re(aTHX_ ivars->pattern));
#if (PERL_VERSION > 10)
    REGEXP *rx = SvRX((SV*)token_re);
#else
    // Older Perls: the REGEXP hangs off qr magic attached to the referent.
    if (!SvROK(token_re)) {
        CFISH_DECREF(self);
        THROW(CFISH_ERR, "token_re is not a qr// entity");
    }
    SV *inner = SvRV(token_re);
    MAGIC *magic = NULL;
    if (SvMAGICAL((SV*)inner)) {
        magic = mg_find((SV*)inner, PERL_MAGIC_qr);
    }
    if (!magic) {
        CFISH_DECREF(self);
        THROW(CFISH_ERR, "token_re is not a qr// entity");
    }
    REGEXP *rx = (REGEXP*)magic->mg_obj;
#endif
    if (rx == NULL) {
        // Release self first, as the pattern check above does; token_re is
        // mortal and therefore still valid for the message.
        CFISH_DECREF(self);
        THROW(CFISH_ERR, "Failed to extract REGEXP from token_re '%s'",
              SvPV_nolen((SV*)token_re));
    }

    // Keep the REGEXP alive independently of the temporary token_re SV.
    ivars->token_re = rx;
    (void)ReREFCNT_inc(((REGEXP*)ivars->token_re));

    return self;
}
/*
 * Replace the tokenizer's compiled regex without touching its pattern.
 *
 * self      Tokenizer whose token_re member is replaced.
 * token_re  Perl SV from which the REGEXP is extracted.  On Perl versions
 *           above 5.10 it is passed to SvRX(); on older versions it must
 *           itself carry PERL_MAGIC_qr magic.
 *
 * The new REGEXP gains one reference count and the previously stored one,
 * if any, loses one.
 *
 * Throws LUCY_ERR if no REGEXP can be extracted; in that case the stored
 * regex is left unchanged.
 */
static void
S_set_token_re_but_not_pattern(lucy_RegexTokenizer *self, void *token_re) {
    // Fetch the interpreter context, as lucy_RegexTokenizer_init does, so
    // the Perl API macros below work when the context is not implicit.
    dTHX;
#if (PERL_VERSION > 10)
    REGEXP *rx = SvRX((SV*)token_re);
#else
    MAGIC *magic = NULL;
    if (SvMAGICAL((SV*)token_re)) {
        magic = mg_find((SV*)token_re, PERL_MAGIC_qr);
    }
    if (!magic) {
        THROW(LUCY_ERR, "token_re is not a qr// entity");
    }
    REGEXP *rx = (REGEXP*)magic->mg_obj;
#endif
    if (rx == NULL) {
        THROW(LUCY_ERR, "Failed to extract REGEXP from token_re '%s'",
              SvPV_nolen((SV*)token_re));
    }

    // Take the new reference before dropping the old one: if rx is the
    // regexp already stored, decrementing first could free it before the
    // increment touches it.
    REGEXP *old_rx = (REGEXP*)self->token_re;
    (void)ReREFCNT_inc(rx);
    self->token_re = rx;
    if (old_rx) {
        ReREFCNT_dec(old_rx);
    }
}