/* Initialize a RegexTokenizer.
 *
 * @param self    Freshly allocated tokenizer; on error this routine takes
 *                responsibility for releasing it before throwing.
 * @param pattern Optional token-matching pattern.  If NULL, a default
 *                "word characters, optionally joined by apostrophes"
 *                pattern is used.  \p / \P constructs are forbidden.
 * @return self on success.  Throws (does not return) on invalid pattern
 *         or failure to extract a compiled REGEXP.
 */
lucy_RegexTokenizer*
lucy_RegexTokenizer_init(lucy_RegexTokenizer *self, cfish_String *pattern) {
    lucy_Analyzer_init((lucy_Analyzer*)self);
    lucy_RegexTokenizerIVARS *const ivars = lucy_RegexTokenizer_IVARS(self);
#define DEFAULT_PATTERN "\\w+(?:['\\x{2019}]\\w+)*"
    if (pattern) {
        // \p{...} / \P{...} property escapes are rejected because their
        // behavior varies across Perl versions.
        if (CFISH_Str_Contains_Utf8(pattern, "\\p", 2)
            || CFISH_Str_Contains_Utf8(pattern, "\\P", 2)
           ) {
            CFISH_DECREF(self);
            THROW(CFISH_ERR, "\\p and \\P constructs forbidden");
        }
        ivars->pattern = CFISH_Str_Clone(pattern);
    }
    else {
        ivars->pattern = cfish_Str_new_from_trusted_utf8(
                             DEFAULT_PATTERN, sizeof(DEFAULT_PATTERN) - 1);
    }

    // Acquire a compiled regex engine for matching one token.
    dTHX;
    SV *token_re = S_compile_token_re(aTHX_ ivars->pattern);
#if (PERL_VERSION > 10)
    REGEXP *rx = SvRX((SV*)token_re);
#else
    // Pre-5.12 Perls: dig the REGEXP out of the qr// scalar's magic by
    // hand.  Release token_re and self before throwing — the original
    // code leaked both on these error paths.
    if (!SvROK(token_re)) {
        SvREFCNT_dec(token_re);
        CFISH_DECREF(self);
        THROW(CFISH_ERR, "token_re is not a qr// entity");
    }
    SV *inner = SvRV(token_re);
    MAGIC *magic = NULL;
    if (SvMAGICAL((SV*)inner)) {
        magic = mg_find((SV*)inner, PERL_MAGIC_qr);
    }
    if (!magic) {
        SvREFCNT_dec(token_re);
        CFISH_DECREF(self);
        THROW(CFISH_ERR, "token_re is not a qr// entity");
    }
    REGEXP *rx = (REGEXP*)magic->mg_obj;
#endif
    if (rx == NULL) {
        // token_re must stay alive here: its buffer backs the %s in the
        // error message.  It is deliberately not released on this path.
        CFISH_DECREF(self);
        THROW(CFISH_ERR, "Failed to extract REGEXP from token_re '%s'",
              SvPV_nolen((SV*)token_re));
    }
    // Keep the REGEXP alive independently of the wrapping SV, then drop
    // our reference to the wrapper.
    ivars->token_re = rx;
    (void)ReREFCNT_inc(((REGEXP*)ivars->token_re));
    SvREFCNT_dec(token_re);

    return self;
}
/* Dump the Doc to a Clownfish Hash suitable for serialization.
 *
 * @param self The Doc to dump.
 * @return A new Hash (caller takes ownership) with keys "_class",
 *         "doc_id", and "fields".
 */
cfish_Hash*
LUCY_Doc_Dump_IMP(lucy_Doc *self) {
    dTHX;
    lucy_DocIVARS *const ivars = lucy_Doc_IVARS(self);
    cfish_Hash *dump = cfish_Hash_new(0);
    // Record the class name so Load() can reconstruct the right type.
    CFISH_Hash_Store_Utf8(dump, "_class", 6,
                          (cfish_Obj*)CFISH_Str_Clone(lucy_Doc_get_class_name(self)));
    // FIX: key length was 7, which included the trailing NUL byte and
    // stored the value under the 7-byte key "doc_id\0" — lookups of
    // "doc_id" would then miss it.  "doc_id" is 6 bytes.
    CFISH_Hash_Store_Utf8(dump, "doc_id", 6,
                          (cfish_Obj*)cfish_Str_newf("%i32", ivars->doc_id));
    // Wrap the Perl fields hash in a temporary reference so it can be
    // converted to a Clownfish Hash, then drop our reference.
    SV *fields_sv = newRV_inc((SV*)ivars->fields);
    CFISH_Hash_Store_Utf8(dump, "fields", 6,
                          XSBind_perl_to_cfish(aTHX_ fields_sv, CFISH_HASH));
    SvREFCNT_dec(fields_sv);
    return dump;
}