// Token pattern used when the caller passes no pattern of their own.
#define DEFAULT_PATTERN "\\w+(?:['\\x{2019}]\\w+)*"

/* Initialize a RegexTokenizer.
 *
 * @param self    The tokenizer being initialized.
 * @param pattern Regex source for matching a single token, or NULL to use
 *                DEFAULT_PATTERN.  A non-NULL pattern is cloned.
 * @return self.
 *
 * Throws CFISH_ERR when `pattern` contains a "\p" or "\P" construct (self is
 * decref'd first), when the compiled value is not a qr// entity (older
 * perls only), or when no REGEXP can be extracted from it.
 */
lucy_RegexTokenizer*
lucy_RegexTokenizer_init(lucy_RegexTokenizer *self, cfish_String *pattern) {
    lucy_Analyzer_init((lucy_Analyzer*)self);
    lucy_RegexTokenizerIVARS *const ivars = lucy_RegexTokenizer_IVARS(self);

    if (pattern == NULL) {
        // No pattern supplied: fall back to the built-in default.
        ivars->pattern = cfish_Str_new_from_trusted_utf8(
                             DEFAULT_PATTERN, sizeof(DEFAULT_PATTERN) - 1);
    }
    else {
        // Reject patterns which use the \p / \P constructs.
        const int has_forbidden
            = CFISH_Str_Contains_Utf8(pattern, "\\p", 2)
              || CFISH_Str_Contains_Utf8(pattern, "\\P", 2);
        if (has_forbidden) {
            CFISH_DECREF(self);
            THROW(CFISH_ERR, "\\p and \\P constructs forbidden");
        }
        ivars->pattern = CFISH_Str_Clone(pattern);
    }

    // Acquire a compiled regex engine for matching one token.
    dTHX;
    SV *re_sv = S_compile_token_re(aTHX_ ivars->pattern);
    REGEXP *regex = NULL;

#if (PERL_VERSION > 10)
    regex = SvRX((SV*)re_sv);
#else
    // Older perls: the REGEXP hangs off qr magic attached to the referent.
    if (!SvROK(re_sv)) {
        THROW(CFISH_ERR, "token_re is not a qr// entity");
    }
    SV *referent = SvRV(re_sv);
    MAGIC *qr_magic = SvMAGICAL((SV*)referent)
                      ? mg_find((SV*)referent, PERL_MAGIC_qr)
                      : NULL;
    if (!qr_magic) {
        THROW(CFISH_ERR, "token_re is not a qr// entity");
    }
    regex = (REGEXP*)qr_magic->mg_obj;
#endif

    if (regex == NULL) {
        THROW(CFISH_ERR, "Failed to extract REGEXP from token_re '%s'",
              SvPV_nolen((SV*)re_sv));
    }

    // Keep our own reference to the REGEXP, then drop our reference to the
    // SV it was extracted from.
    ivars->token_re = regex;
    (void)ReREFCNT_inc(((REGEXP*)ivars->token_re));
    SvREFCNT_dec(re_sv);

    return self;
}
/* Return a new Vector holding the name of every field in the Doc.
 *
 * Each key of the Doc's underlying Perl hash is converted to UTF-8 and
 * pushed onto the result as a newly created String.
 *
 * @param self The Doc whose field names are wanted.
 * @return a new Vector of Strings, one per hash key, in hash iteration
 *         order.
 */
cfish_Vector*
LUCY_Doc_Field_Names_IMP(lucy_Doc *self) {
    dTHX;
    lucy_DocIVARS *const ivars = lucy_Doc_IVARS(self);
    HV *fields = (HV*)ivars->fields;

    // The count returned by hv_iterinit() is only used as a capacity hint.
    // perlapi documents it as meaningful only for hashes without tie
    // magic, so it must not drive the loop.
    I32 num_fields = hv_iterinit(fields);
    cfish_Vector *retval
        = cfish_Vec_new(num_fields > 0 ? (size_t)num_fields : 0);

    // Iterate until hv_iternext() signals exhaustion, so that a NULL entry
    // is never handed on to the key conversion.
    HE *entry;
    while ((entry = hv_iternext(fields)) != NULL) {
        STRLEN key_size;
        const char *key = XSBind_hash_key_to_utf8(aTHX_ entry, &key_size);
        cfish_String *key_str
            = cfish_Str_new_from_trusted_utf8(key, key_size);
        CFISH_Vec_Push(retval, (cfish_Obj*)key_str);
    }

    return retval;
}
Obj* cfish_inc_refcount(void *vself) { Obj *self = (Obj*)vself; // Handle special cases. cfish_Class *const klass = self->klass; if (klass->flags & CFISH_fREFCOUNTSPECIAL) { if (SI_is_string_type(klass)) { // Only copy-on-incref Strings get special-cased. Ordinary // strings fall through to the general case. if (CFISH_Str_Is_Copy_On_IncRef((cfish_String*)self)) { const char *utf8 = CFISH_Str_Get_Ptr8((cfish_String*)self); size_t size = CFISH_Str_Get_Size((cfish_String*)self); return (cfish_Obj*)cfish_Str_new_from_trusted_utf8(utf8, size); } } else if (SI_immortal(klass)) { return self; } } self->refcount++; return self; }