Exemple #1
0
// Normalize a text buffer, split it on whitespace, and add every token of at
// least two characters to `features`.
//
// txt      - mutable, NUL-terminated buffer. It is rewritten by replace() and
//            foldcase(), then cut into tokens in place by strsep(), so the
//            caller's buffer contents are destroyed.
// length   - passed through to replace() unchanged.
//            NOTE(review): the same `length` is handed to every replace() call
//            even though earlier substitutions may shorten the text -- confirm
//            replace() tolerates that.
// features - token collection to extend; each add_token() call returns the
//            (possibly new) handle, which is carried forward.
//
// Returns the handle produced by the last add_token() call, or `features`
// unchanged if no token qualified.
static Pvoid_t tokenize_text(char * txt, int length, Pvoid_t features) {
	// Shortest token (in characters, excluding the terminator) worth keeping.
	const size_t min_token_len = 2;
	char *token;

	// Remove HTML entities
	replace(txt, length, "&[^;]+;", " ");
	// Remove all non-alphanums
	replace(txt, length, "[^a-zA-Z0-9\\-]", " ");
	// Remove leading and trailing dashes
	replace(txt, length, "[[:space:]]+[\\-]+", " ");
	replace(txt, length, "\\-+[[:space:]]+", " ");
	// Normalize whitespace
	replace(txt, length, "[[:space:]]+", " ");
	foldcase(txt);

	while ((token = strsep(&txt, "\t\n ")) != NULL) {
		// strsep() yields an empty string for each pair of adjacent
		// delimiters. Test the first character, not the pointer: the
		// pointer is already known to be non-NULL here.
		if (*token == '\0') {
			continue;
		}
		if (strlen(token) >= min_token_len) {
			features = add_token(token, features);
		}
	}

	return features;
}
Exemple #2
0
// Replaces the VM accumulator with the case-folded form of the string it
// currently holds, then hands control back through returnValueSafe().
void LibRnrsUnicode::StringFoldcase::func(KevesVM* vm, const_KevesIterator pc) {
  // The accumulator is read as a string; no type check happens in this function.
  const StringKev* source(vm->acc_);

  // Build the folded string as a local, using the VM's gc() for toCaseFolded().
  StringKev folded(source->toCaseFolded(vm->gc()));

  // NOTE(review): acc_ now points at a stack object. It is still alive during
  // the call below; presumably returnValueSafe() copies it somewhere longer
  // lived before this frame unwinds -- confirm.
  vm->acc_ = &folded;
  KevesVM::returnValueSafe(vm, pc);
}