// Split txt into whitespace-separated tokens and add each token of at least
// two characters to features.
//
// txt is cleaned up by a series of replace() passes and foldcase() before
// splitting (presumably in place -- confirm against those helpers). strsep()
// then overwrites delimiters in txt with '\0', so the caller's buffer is
// modified and every token handed to add_token() points into that buffer.
//
//   txt      - NUL-terminated, writable text buffer
//   length   - forwarded unchanged to replace(); presumably the size of txt
//              (TODO confirm against replace())
//   features - collection that tokens are added to via add_token()
//
// Returns the collection handle after the last add_token() call (or the
// handle passed in when no token qualified).
static Pvoid_t tokenize_text(char * txt, int length, Pvoid_t features) {
    char *token;

    // Remove HTML entities
    replace(txt, length, "&[^;]+;", " ");

    // Remove all non-alphanums (dashes are kept at this stage)
    replace(txt, length, "[^a-zA-Z0-9\\-]", " ");

    // Remove leading and trailing dashes
    replace(txt, length, "[[:space:]]+[\\-]+", " ");
    replace(txt, length, "\\-+[[:space:]]+", " ");

    // Normalize whitespace
    replace(txt, length, "[[:space:]]+", " ");

    foldcase(txt);

    while ((token = strsep(&txt, "\t\n ")) != NULL) {
        // strsep() yields an empty string for each pair of adjacent
        // delimiters; test the first character, not the pointer itself.
        if (*token != '\0') {
            size_t toklen = strlen(token) + 1; // +1 for \0
            // Skip single-character tokens.
            if (toklen > 2) {
                features = add_token(token, features);
            }
        }
    }

    return features;
}
// Replaces the string held in the VM accumulator with its case-folded form
// and hands control back through KevesVM::returnValueSafe().
//
//   vm - virtual machine whose acc_ is read as a StringKev and then overwritten
//   pc - iterator forwarded unchanged to returnValueSafe()
void LibRnrsUnicode::StringFoldcase::func(KevesVM* vm, const_KevesIterator pc) {
  const StringKev* source(vm->acc_);

  // The folded string lives in this stack frame; acc_ refers to it only for
  // the duration of the returnValueSafe() call below.
  // NOTE(review): presumably returnValueSafe() copies the value somewhere
  // longer-lived before this frame unwinds -- confirm.
  StringKev folded(source->toCaseFolded(vm->gc()));
  vm->acc_ = &folded;

  KevesVM::returnValueSafe(vm, pc);
}