Exemplo n.º 1
 * \brief Basic search case less with a bad characters array. The array badchars contains
 *        flags at character's ascii index that can't be inside the needle. So the skips can be
 *        faster
 * \param haystack pointer to the buffer to search in
 * \param haystack_len length limit of the buffer
 * \param neddle pointer to the pattern we ar searching for
 * \param needle_len length limit of the needle
 * \param badchars pointer to an array of bachars prepared by Bs2BmBadchars()
 * \retval ptr to start of the match; NULL if no match
uint8_t *Bs2BmNocase(const uint8_t *haystack, uint32_t haystack_len, const uint8_t *needle, uint16_t needle_len, uint8_t badchars[])
    const uint8_t *h, *n;
    const uint8_t *hmax = haystack + haystack_len;
    const uint8_t *nmax = needle + needle_len;

    if (needle_len == 0 || needle_len > haystack_len)
        return NULL;

    for (n = needle; nmax - n <= hmax - haystack; haystack++) {
        if (u8_tolower(*haystack) != u8_tolower(*n)) {
        /* one byte needles */
        if (needle_len == 1)
            return (uint8_t *)haystack;

        for (h = haystack+1, n++; nmax - n <= hmax - haystack; h++, n++) {
            if (u8_tolower(*h) != u8_tolower(*n)) {
                if (badchars[u8_tolower(*h)] == 1) {
                    /* skip it! */
                    haystack = h;
            /* if we run out of needle we fully matched */
            if (n == nmax - 1) {
                return (uint8_t *)haystack;
        n = needle;

    return NULL;
Exemplo n.º 2
 * \brief Basic search case less
 * \param haystack pointer to the buffer to search in
 * \param haystack_len length limit of the buffer
 * \param neddle pointer to the pattern we ar searching for
 * \param needle_len length limit of the needle
 * \retval ptr to start of the match; NULL if no match
uint8_t *BasicSearchNocase(const uint8_t *haystack, uint32_t haystack_len, const uint8_t *needle, uint32_t needle_len) {
    const uint8_t *h, *n;
    const uint8_t *hmax = haystack + haystack_len;
    const uint8_t *nmax = needle + needle_len;

    if (needle_len == 0 || needle_len > haystack_len)
        return NULL;

    n = needle;
    for (n = needle; nmax - n <= hmax - haystack; haystack++) {
        if (u8_tolower(*haystack) != u8_tolower(*n)) {
        /* one byte needles */
        if (needle_len == 1) {
            return (uint8_t *)haystack;

        for (h = haystack+1, n++; nmax - n <= hmax - h ; h++, n++) {
            if (u8_tolower(*h) != u8_tolower(*n)) {
            /* if we run out of needle we fully matched */
            if (n == nmax - 1) {
                return (uint8_t *)haystack;
        n = needle;

    return NULL;
Exemplo n.º 3
/* Run u8_tolower () on INPUT and compare the outcome with EXPECTED under the
   three result-buffer conventions: no buffer supplied, a buffer that is one
   byte too small, and a buffer of exactly the expected size.
   Return 0 if every check passes, otherwise a code from 1 to 11 that
   identifies the first check that failed.  */
static int
check (const uint8_t *input, size_t input_length,
       const char *iso639_language, uninorm_t nf,
       const uint8_t *expected, size_t expected_length)
{
  size_t length;
  uint8_t *result;

  /* Test return conventions with resultbuf == NULL.  */
  result = u8_tolower (input, input_length, iso639_language, nf, NULL, &length);
  if (!(result != NULL))
    return 1;
  if (!(length == expected_length))
    return 2;
  if (!(u8_cmp (result, expected, expected_length) == 0))
    return 3;
  free (result);

  /* Test return conventions with resultbuf too small.  */
  if (expected_length > 0)
    {
      uint8_t *preallocated;

      length = expected_length - 1;
      preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
      result = u8_tolower (input, input_length, iso639_language, nf, preallocated, &length);
      if (!(result != NULL))
        return 4;
      /* The result cannot fit, so a different buffer must come back.  */
      if (!(result != preallocated))
        return 5;
      if (!(length == expected_length))
        return 6;
      if (!(u8_cmp (result, expected, expected_length) == 0))
        return 7;
      free (result);
      free (preallocated);
    }

  /* Test return conventions with resultbuf large enough.  */
  {
    uint8_t *preallocated;

    length = expected_length;
    preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
    result = u8_tolower (input, input_length, iso639_language, nf, preallocated, &length);
    if (!(result != NULL))
      return 8;
    /* preallocated may be NULL here (e.g. a zero-sized request), in which
       case the result is allowed to live elsewhere.  */
    if (!(preallocated == NULL || result == preallocated))
      return 9;
    if (!(length == expected_length))
      return 10;
    if (!(u8_cmp (result, expected, expected_length) == 0))
      return 11;
    free (preallocated);
  }

  return 0;
}
Exemplo n.º 4
// Lowercase and NFKC-normalize a UTF-8 word with libunistring.
//
// localword: UTF-8 encoded input word.
// lang:      language code handed to u8_tolower() for language-specific
//            casing rules.
// Returns the lowercased word. When the input is not valid UTF-8 or the
// conversion fails, a diagnostic is written to cerr and an empty string is
// returned, so the word is really dropped instead of yielding the contents
// of an uninitialized buffer.
string downstring(string localword, string lang) {
  const size_t length = localword.size();
  // libunistring works on uint8_t units
  const uint8_t * word = reinterpret_cast<const uint8_t*>(localword.c_str());

  if (u8_check(word, length)) {
    cerr << endl << "Invalid UTF-8 in word: "<< word << " : Dropping it." << endl;
    return string();
  }

  // u8_tolower() only uses the caller's buffer when the result fits in it;
  // otherwise it returns a freshly malloc()ed buffer and stores the real
  // length in outLength. The result must therefore always be read through
  // the returned pointer, never through `output` directly.
  uint8_t output[200];
  size_t outLength = sizeof output;
  uint8_t * lowered = u8_tolower(word, length, lang.c_str(), UNINORM_NFKC, output, &outLength);
  if (!lowered) {
    cerr << endl << "Error during lowercase conversion for word : "<< word << " : Dropping it." << endl;
    return string();
  }

  // copy out before releasing a possibly heap-allocated result
  string result(reinterpret_cast<const char*>(lowered), outLength);
  if (lowered != output)
    free(lowered);  // declared in <cstdlib>
  return result;
}
Exemplo n.º 5
Arquivo: sss_utf8.c Projeto: SSSD/sssd
/**
 * Lowercase a UTF-8 buffer with u8_tolower().
 *
 * @param s      UTF-8 input, not required to be NUL-terminated
 * @param len    number of bytes in s
 * @param _nlen  if not NULL, receives the length in bytes of the result
 *
 * @return the lowercased buffer, or NULL if u8_tolower() failed. No result
 *         buffer is handed to u8_tolower(), so it allocates the result;
 *         the result is not NUL-terminated.
 */
uint8_t *sss_utf8_tolower(const uint8_t *s, size_t len, size_t *_nlen)
{
    size_t llen;
    uint8_t *lower;

    lower = u8_tolower(s, len, NULL, NULL, NULL, &llen);
    if (lower == NULL) {
        return NULL;
    }

    if (_nlen != NULL) {
        *_nlen = llen;
    }
    return lower;
}
Exemplo n.º 6
 * Convert the utf-8 input string to lowercase
 * Output needs to be allocated appropriately
 * @param input input string
 * @param output output buffer
GNUNET_STRINGS_utf8_tolower(const char* input, char** output)
  uint8_t *tmp_in;
  size_t len;

  tmp_in = u8_tolower ((uint8_t*)input, strlen ((char *) input),
                       NULL, UNINORM_NFD, NULL, &len);
  memcpy(*output, tmp_in, len);
  (*output)[len] = '\0';
Exemplo n.º 7
 * \brief Array setup function for Bs2BmNocase of bad characters index (not found at the needle)
 * \param neddle pointer to the pattern we ar searching for
 * \param needle_len length limit of the needle
 * \param badchars pointer to an empty array of bachars. The array prepared contains
 *                 characters that can't be inside the needle_len. So the skips can be
 *                 faster
void Bs2BmBadcharsNocase(const uint8_t *needle, uint16_t needle_len, uint8_t *badchars)
    uint32_t i;
    for (i = 0; i < ALPHABET_SIZE; i++)
        badchars[i] = 1;

    /* set to 0 the values where index as ascii is present
     * because they are not badchars
    for (i = 0; i < needle_len; i++) {
        badchars[u8_tolower(needle[i])] = 0;
Exemplo n.º 8
/* u8_downcase:
    Arguments: a null-terminated utf-8 C string
    Returns: a copy of the string in lowercase
*/
/* Walks the NUL-terminated input one character at a time and lowercases
   each character into `c`; the value returned is the buffer of the
   U8_OUTPUT stream `ss`. */
u8_string u8_downcase (u8_string string)
  const u8_byte *scan=string;
  struct U8_OUTPUT ss; int c;
  /* NOTE(review): no statement that initializes `ss`, or that appends `c`
     to it inside the loop, is visible in this excerpt, and the function
     and loop braces are missing -- confirm against the complete source. */
  while (*scan) {
    /* bytes below 0x80 are plain ASCII: lowercase them directly and step
       one byte forward */
    if (*scan < 0x80) c=tolower(*scan++);
    /* otherwise u8_sgetc() is handed &scan -- presumably it decodes one
       multi-byte character and advances scan (TODO confirm) -- and the
       value is lowercased with u8_tolower() */
    else c=u8_tolower(u8_sgetc(&scan));
  return (u8_string) ss.u8_outbuf;
Exemplo n.º 9
Arquivo: psl.c Projeto: jcajka/libpsl
/*
 * If the label of entry 'e' contains non-ASCII characters, look up its
 * IDNA (punycode) form and, when that differs from the stored label, add
 * an additional entry carrying the punycode label (and the same wildcard
 * flag) to vector 'v'.
 *
 * Nothing is added when the label is pure ASCII, when the lowercase
 * conversion fails, when memory runs out, or when the IDNA lookup fails.
 */
static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e)
{
	char *lookupname = NULL;
	int rc;
	uint8_t *lower, resbuf[256];
	size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */

	/* pure ASCII labels need no punycode variant */
	if (_str_is_ascii(e->label_buf))
		return;

	/* we need a conversion to lowercase */
	lower = u8_tolower((uint8_t *)e->label_buf, u8_strlen((uint8_t *)e->label_buf), 0, UNINORM_NFKC, resbuf, &len);
	if (!lower) {
		/* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", e->label_buf, errno); */
		return;
	}

	/* u8_tolower() does not terminate the result string */
	if (lower == resbuf) {
		/* len was capped at sizeof(resbuf) - 1, so this stays in bounds */
		lower[len] = 0;
	} else {
		/* result did not fit into resbuf and was allocated:
		 * replace it with a terminated copy */
		uint8_t *tmp = lower;
		lower = (uint8_t *)strndup((char *)lower, len);
		free(tmp);
		if (!lower)
			return; /* out of memory */
	}

	if ((rc = idn2_lookup_u8(lower, (uint8_t **)&lookupname, 0)) == IDN2_OK) {
		if (strcmp(e->label_buf, lookupname)) {
			_psl_entry_t suffix, *suffixp;

			/* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */
			_suffix_init(&suffix, lookupname, strlen(lookupname));
			suffix.wildcard = e->wildcard;
			suffixp = _vector_get(v, _vector_add(v, &suffix));
			suffixp->label = suffixp->label_buf; /* set label to changed address */
		} /* else ignore */
	} /* else
		fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */

	/* only the strndup()ed copy is heap memory */
	if (lower != resbuf)
		free(lower);
}
Exemplo n.º 10
/* Read all of standard input and write its lower-case form, using the
   casing rules of the current locale's language, to standard output.
   Exit status: 0 on success; 1 if command-line arguments were given or the
   conversion failed.  */
int
main (int argc, char * argv[])
{
  setlocale (LC_ALL, "");
  if (argc == 1)
    {
      /* Display the lower case of the input string.  */
      char *input = read_file (stdin);
      size_t length = strlen (input);
      size_t output_length;
      /* No normalization form and no result buffer are supplied, so
         u8_tolower allocates the result.  */
      uint8_t *output =
        u8_tolower ((uint8_t *) input, length, uc_locale_language (),
                    NULL,
                    NULL, &output_length);

      if (output == NULL)
        return 1;

      fwrite (output, 1, output_length, stdout);

      return 0;
    }
  else
    return 1;
}
Exemplo n.º 11
// Convert `src` to its ASCII (IDNA) form when wget_str_needs_encoding() says
// it needs encoding. Returns `src` unchanged when nothing was converted,
// otherwise the string produced by the IDN library call.
// NOTE(review): ownership of a converted result is not visible here --
// presumably the caller has to release it; confirm against the callers.
// NOTE(review): three consecutive `if (wget_str_needs_encoding(src))`
// sections follow (idn2 with/without a u8_tolower() step, idna_to_ascii_8z,
// and "not available") without visible preprocessor guards or closing braces.
// They look like alternative build variants -- confirm against the complete
// source before relying on the control flow shown here.
const char *wget_str_to_ascii(const char *src)
	if (wget_str_needs_encoding(src)) {
		char *asc = NULL;
		int rc;
		uint8_t *lower, resbuf[256];
		size_t len = sizeof(resbuf) - 1; // leave space for additional \0 byte

		// we need a conversion to lowercase
		lower = u8_tolower((uint8_t *)src, u8_strlen((uint8_t *)src), 0, UNINORM_NFKC, resbuf, &len);
		if (!lower) {
			error_printf("u8_tolower(%s) failed (%d)\n", src, errno);
			return src;

		// u8_tolower() does not terminate the result string
		// NOTE(review): the statement that terminates the in-place result
		// (lower == resbuf case) is not visible in this excerpt.
		if (lower == resbuf) {
		} else {
			// result was allocated by u8_tolower(): make a terminated copy
			// NOTE(review): `tmp` keeps the old pointer, presumably so it can
			// be released -- that statement is not visible here.
			uint8_t *tmp = lower;
			lower = (uint8_t *)wget_strmemdup((char *)lower, len);

		// IDNA lookup on the lowercased string; on success src now refers to
		// the converted string
		if ((rc = idn2_lookup_u8(lower, (uint8_t **)&asc, 0)) == IDN2_OK) {
			debug_printf("idn2 '%s' -> '%s'\n", src, asc);
			src = asc;
		} else
			error_printf(_("toASCII(%s) failed (%d): %s\n"), lower, rc, idn2_strerror(rc));

		// NOTE(review): the body of this `if` (presumably releasing the
		// duplicated `lower`) is not visible; the lookup below operates on
		// `src` directly and looks like the variant without u8_tolower().
		if (lower != resbuf)
		if ((rc = idn2_lookup_u8((uint8_t *)src, (uint8_t **)&asc, 0)) == IDN2_OK) {
			debug_printf("idn2 '%s' -> '%s'\n", src, asc);
			src = asc;
		} else
			error_printf(_("toASCII(%s) failed (%d): %s\n"), src, rc, idn2_strerror(rc));
	// variant using idna_to_ascii_8z(): only attempted for valid UTF-8
	if (wget_str_needs_encoding(src)) {
		char *asc = NULL;
		int rc;

		if (_utf8_is_valid(src)) {
			// idna_to_ascii_8z() automatically converts UTF-8 to lowercase

			if ((rc = idna_to_ascii_8z(src, &asc, IDNA_USE_STD3_ASCII_RULES)) == IDNA_SUCCESS) {
				// debug_printf("toASCII '%s' -> '%s'\n", src, asc);
				src = asc;
			} else
				error_printf(_("toASCII failed (%d): %s\n"), rc, idna_strerror(rc));
			// NOTE(review): this message belongs to the invalid-UTF-8 case; the
			// `else` that should introduce it is not visible here.
			error_printf(_("Invalid UTF-8 sequence not converted: '%s'\n"), src);
	// variant without any IDN support: only report that conversion is unavailable
	if (wget_str_needs_encoding(src)) {
		error_printf(_("toASCII not available: '%s'\n"), src);

	return src;
Exemplo n.º 12
/**
 *  \internal
 *  \brief Apply the nocase keyword to the last pattern match, either content or uricontent
 *  \param de_ctx detection engine ctx
 *  \param s signature
 *  \param nullstr should be null
 *  \retval 0 ok
 *  \retval -1 failure
 */
/* Setup handler for the `nocase` keyword: finds the most recent
 * DETECT_CONTENT match of the signature, rejects a second nocase on the same
 * content, lower-cases the content bytes in place, sets
 * DETECT_CONTENT_NOCASE and re-creates the pattern's spm context.
 * `ret` starts at -1 and is only set to 0 after all steps succeeded.
 * NOTE(review): the function braces, the `end:` label targeted by the gotos
 * and the final return are not visible in this excerpt. */
static int DetectNocaseSetup (DetectEngineCtx *de_ctx, Signature *s, char *nullstr)

    SigMatch *pm = NULL;
    int ret = -1;

    /* nocase takes no value */
    if (nullstr != NULL) {
        SCLogError(SC_ERR_INVALID_VALUE, "nocase has value");
        goto end;

    /* retrieve the sm to apply nocase against */
    if (s->list != DETECT_SM_LIST_NOTSET) {
        /* a specific list is selected: only look at that list's tail */
        pm = SigMatchGetLastSMFromLists(s, 2, DETECT_CONTENT, s->sm_lists_tail[s->list]);
    } else {
        /* 28 = 14 (type, list tail) pairs follow -- presumably the second
         * argument is the count of variadic arguments; TODO confirm */
        pm =  SigMatchGetLastSMFromLists(s, 28,
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_PMATCH],
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_UMATCH],
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HRUDMATCH],
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HCBDMATCH],
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_FILEDATA],
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HHDMATCH],
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HRHDMATCH],
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HMDMATCH],
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HCDMATCH],
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HSCDMATCH],
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HSMDMATCH],
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HUADMATCH],
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HHHDMATCH],
                                         DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HRHHDMATCH]);
    /* nocase is a modifier: without a preceding content match it is an error */
    if (pm == NULL) {
        SCLogError(SC_ERR_NOCASE_MISSING_PATTERN, "nocase needs "
                   "preceding content, uricontent option, http_client_body, "
                   "http_server_body, http_header option, http_raw_header option, "
                   "http_method option, http_cookie, http_raw_uri, "
                   "http_stat_msg, http_stat_code, http_user_agent or "
                   "file_data/dce_stub_data sticky buffer options");
        goto end;

    /* verify other conditions. */
    DetectContentData *cd = (DetectContentData *)pm->ctx;;

    if (cd->flags & DETECT_CONTENT_NOCASE) {
        SCLogError(SC_ERR_INVALID_SIGNATURE, "can't use multiple nocase modifiers with the same content");
        goto end;

    /* for consistency in later use (e.g. by MPM construction and hashing),
     * coerce the content string to lower-case. */
    for (uint8_t *c = cd->content; c < cd->content + cd->content_len; c++) {
        *c = u8_tolower(*c);

    cd->flags |= DETECT_CONTENT_NOCASE;
    /* Recreate the context with nocase chars */
    /* NOTE(review): the call below is cut off after its third argument in
     * this excerpt, and no statement releasing the previous cd->spm_ctx is
     * visible -- confirm against the complete source. */
    cd->spm_ctx = SpmInitCtx(cd->content, cd->content_len, 1,
    if (cd->spm_ctx == NULL) {
        goto end;

    ret = 0;
Exemplo n.º 13
// Reads rule lines from fp (fname is only used in log messages) and inserts
// each rule, label by label, into the trie rooted at *root.
// NOTE(review): braces and the statements that skip or abandon a line
// (after the "//" comment test, after each error report, etc.) are not
// visible in this excerpt, and neither are the updates of `lineno`/`bad`
// or the final return -- confirm control flow against the complete source.
static int read_rules(FILE *fp, char const *fname, loose_trie **root)
* populate the root node with all rules.
* return number of bad lines, or -1 if out of memory
	char buf[512];
	char *s;
	int lineno = 0, bad = 0;

	while ((s = fgets(buf, sizeof buf, fp)) != NULL)
		// lines starting with "//" are tested for here (comment lines)
		if (s[0] == '/' && s[1] == '/')

		// is_ascii: 1 = only ASCII seen, 0 = valid non-ASCII UTF-8 seen,
		// -1 = malformed UTF-8 sequence
		int is_ascii = 1;
		int ch;
		while ((ch = *(unsigned char *)s++) != 0)
			if (ch & 0x80) // utf-8, check it is a valid sequence
				// a lead byte must have bit 0x40 set; every further set bit
				// below it (0x20, 0x10, ...) demands one continuation byte
				// of the form 10xxxxxx
				int m = 0x40;
				is_ascii = (ch & m) != 0? 0: -1;
				while ((ch & m) != 0 && is_ascii == 0)
					is_ascii = (*(unsigned char*)s++ & 0xc0) == 0x80? 0: -1;
					m >>= 1;

			if (isspace(ch)) // end of rule
				// s was already advanced past the space: step back and
				// terminate the rule there
				*--s = 0;

		// reaching the terminator without meeting whitespace means fgets()
		// did not deliver a complete line: report it and drain the rest
		if (ch == 0)
			(*do_report)(LOG_CRIT, "Line too long at %s:%d: \"%.10s...\"",
				fname, lineno, buf);
			while ((ch = fgetc(fp)) != '\n' && ch != EOF)

		assert(*s == 0);

		size_t len = s - &buf[0];
		if (len == 0) // empty line

		// non-ASCII rules are lowercased/normalized and converted with
		// idn2_lookup_u8() before being stored
		if (!is_ascii)
			if (is_ascii < 0)
				(*do_report)(LOG_CRIT, "Bad UTF-8 sequence at %s:%d: \"%s\"",
					fname, lineno, buf);

			uint8_t norm[128];
			// one byte of norm is held back for the terminator written below
			size_t ulen = sizeof norm - 1;
			uint8_t* n = u8_tolower((uint8_t*)buf, len, NULL, UNINORM_NFC, norm, &ulen);
			// any result other than norm itself (NULL, or a buffer allocated
			// because the output did not fit) is reported as a failure
			if (n != &norm[0])
				(*do_report)(LOG_CRIT, "Failed u8_tolower at %s:%d: %s, len = %zu for \"%s\"",
					fname, lineno, strerror(errno), ulen, buf);

			n[ulen] = 0;

			uint8_t *xn = NULL;
			int rtc = idn2_lookup_u8(n, &xn, 0);
			// the converted name must also fit back into buf
			if (rtc != IDN2_OK || xn == NULL || (len = strlen((char*)xn)) >= sizeof buf)
				(*do_report)(LOG_CRIT, "IDNA failed at %s:%d: %s for \"%s\"",
					fname, lineno, idn2_strerror_name(rtc), buf);

			// NOTE(review): no release of xn is visible in this excerpt --
			// confirm against the complete source.
			memcpy(buf, xn, len);
			buf[len] = 0;

		// split the rule into labels; "!*" presumably lists the rule
		// prefix characters reverse_labels() should recognize -- TODO confirm
		char **labels = reverse_labels(buf, len, "!*");
		if (labels == NULL)
			(*do_report)(LOG_CRIT, "Invalid domain at %s:%d for \"%s\"",
				fname, lineno, buf);

		// descend/extend the trie one label at a time, starting at the root
		loose_trie *node = add_trie_node(root, *labels);
		for (size_t i = 1; labels[i] && node; ++i)
			node = add_trie_node(&node->child, labels[i]);

		if (node == NULL) // out of memory
			return -1;

		// the node reached by the last label marks a complete rule
		node->is_terminal = 1;
Exemplo n.º 14
Arquivo: psl.c Projeto: jcajka/libpsl
/**
 * psl_str_to_utf8lower:
 * @str: string to convert
 * @encoding: charset encoding of @str, e.g. 'iso-8859-1' or %NULL
 * @locale: locale of @str used for the lowercase conversion, e.g. 'de' or %NULL
 * @lower: return value containing the converted string
 *
 * This helper function converts a string to lowercase UTF-8 representation.
 * Lowercase UTF-8 is needed as input to the domain checking functions.
 *
 * @lower is set to %NULL on error.
 *
 * The return value 'lower' must be freed after usage.
 *
 * Returns: psl_error_t value.
 *   PSL_SUCCESS: Success
 *   PSL_ERR_INVALID_ARG: @str is a %NULL value.
 *   PSL_ERR_CONVERTER: Failed to open the unicode converter with name @encoding
 *   PSL_ERR_TO_UTF16: Failed to convert @str to unicode
 *   PSL_ERR_TO_LOWER: Failed to convert unicode to lowercase
 *   PSL_ERR_TO_UTF8: Failed to convert unicode to UTF-8
 *
 * Since: 0.4
 */
/* Converts str (in charset `encoding`) to lowercase UTF-8 and stores a newly
 * allocated copy in *lower when lower is not NULL. Pure-ASCII input takes a
 * shortcut; otherwise one of two conversion paths is used: a UTF-16 round
 * trip via ucnv_toUChars/u_strToLower/u_strToUTF8, or iconv followed by
 * u8_tolower.
 * NOTE(review): the declaration of `ret`, the function braces, several
 * closing braces and the preprocessor line opening the first conversion
 * path are not visible in this excerpt (only the `#elif` is) -- confirm
 * against the complete source. */
psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower)

	if (lower)
		*lower = NULL;

	/* NOTE(review): the statement executed for a NULL str is not visible here */
	if (!str)

	/* shortcut to avoid costly conversion */
	if (_str_is_ascii(str)) {
		if (lower) {
			char *p;

			/* NOTE(review): the strdup() result is dereferenced by the loop
			 * below without a NULL check */
			*lower = strdup(str);

			/* convert ASCII string to lowercase */
			for (p = *lower; *p; p++)
				if (isupper(*p))
					*p = tolower(*p);
		return PSL_SUCCESS;

	/* conversion path using a UTF-16 intermediate */
	do {
	size_t str_length = strlen(str);
	UErrorCode status = 0;
	UChar *utf16_dst, *utf16_lower;
	int32_t utf16_dst_length;
	char *utf8_lower;
	UConverter *uconv;

	/* C89 allocation */
	/* all three scratch buffers live on the stack and are sized from the
	 * input length */
	utf16_dst   = alloca(sizeof(UChar) * (str_length * 2 + 1));
	utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1));
	utf8_lower  = alloca(str_length * 2 + 1);

	uconv = ucnv_open(encoding, &status);
	if (U_SUCCESS(status)) {
		utf16_dst_length = ucnv_toUChars(uconv, utf16_dst, str_length * 2 + 1, str, str_length, &status);

		if (U_SUCCESS(status)) {
			int32_t utf16_lower_length = u_strToLower(utf16_lower, str_length * 2 + 1, utf16_dst, utf16_dst_length, locale, &status);
			if (U_SUCCESS(status)) {
				/* NOTE(review): the capacity passed here (str_length * 8 + 1)
				 * is larger than the str_length * 2 + 1 bytes allocated for
				 * utf8_lower above -- looks like a possible overflow of the
				 * stack buffer; confirm and align the two sizes */
				u_strToUTF8(utf8_lower, str_length * 8 + 1, NULL, utf16_lower, utf16_lower_length, &status);
				if (U_SUCCESS(status)) {
					if (lower)
						*lower = strdup(utf8_lower);
					ret = PSL_SUCCESS;
				} else {
					ret = PSL_ERR_TO_UTF8;
					/* fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */
			} else {
				ret = PSL_ERR_TO_LOWER;
				/* fprintf(stderr, "Failed to convert UTF-16 to lowercase (status %d)\n", status); */
		} else {
			ret = PSL_ERR_TO_UTF16;
			/* fprintf(stderr, "Failed to convert string to UTF-16 (status %d)\n", status); */
	} else {
		/* NOTE(review): no assignment to ret and no release of uconv are
		 * visible in this excerpt */
		/* fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding, status); */
	} while (0);
#elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN)
	/* conversion path using iconv and u8_tolower */
	do {
		/* find out local charset encoding */
		if (!encoding) {
			encoding = nl_langinfo(CODESET);

			if (!encoding || !*encoding)
				encoding = "ASCII";

		/* convert to UTF-8 */
		if (strcasecmp(encoding, "utf-8")) {
			iconv_t cd = iconv_open("utf-8", encoding);

			if (cd != (iconv_t)-1) {
				char *tmp = (char *)str; /* iconv won't change where str points to, but changes tmp itself */
				size_t tmp_len = strlen(str);
				/* output buffer sized at 6 bytes per input byte */
				size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len;
				/* NOTE(review): malloc() result is used without a NULL check */
				char *dst = malloc(dst_len + 1), *dst_tmp = dst;

				if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1) {
					uint8_t *resbuf = malloc(dst_len * 2 + 1);
					size_t len = dst_len * 2; /* leave space for additional \0 byte */

					/* dst_len - dst_len_tmp is the number of bytes iconv() wrote.
					 * NOTE(review): assigning the result to dst overwrites the
					 * only pointer to the iconv output buffer, and no free() of
					 * dst or resbuf is visible -- confirm against the complete
					 * source */
					if ((dst = (char *)u8_tolower((uint8_t *)dst, dst_len - dst_len_tmp, 0, UNINORM_NFKC, resbuf, &len))) {
						/* u8_tolower() does not terminate the result string */
						if (lower)
							*lower = strndup((char *)dst, len);
					} else {
						ret = PSL_ERR_TO_LOWER;
						/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */

					/* NOTE(review): as shown, this second assignment would
					 * replace the *lower set just above; it may belong to a
					 * different build variant whose guard is not visible */
					if (lower)
						*lower = strndup(dst, dst_len - dst_len_tmp);
					ret = PSL_SUCCESS;
				} else {
					ret = PSL_ERR_TO_UTF8;
					/* fprintf(stderr, "Failed to convert '%s' string into '%s' (%d)\n", src_encoding, dst_encoding, errno); */

			} else {
				ret = PSL_ERR_TO_UTF8;
				/* fprintf(stderr, "Failed to prepare encoding '%s' into '%s' (%d)\n", src_encoding, dst_encoding, errno); */
		} else
			/* input is already UTF-8: nothing to convert */
			ret = PSL_SUCCESS;

		/* convert to lowercase */
		if (ret == PSL_SUCCESS) {
			uint8_t *dst, resbuf[256];
			size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */

			/* we need a conversion to lowercase */
			/* NOTE(review): this lowercases the original `str`, not the
			 * UTF-8 text produced by iconv() above; the result is also not
			 * released when u8_tolower() had to allocate it -- confirm
			 * against the complete source */
			if ((dst = u8_tolower((uint8_t *)str, u8_strlen((uint8_t *)str), 0, UNINORM_NFKC, resbuf, &len))) {
				/* u8_tolower() does not terminate the result string */
				if (lower)
					*lower = strndup((char *)dst, len);
			} else {
				ret = PSL_ERR_TO_LOWER;
				/* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */

	} while (0);

	return ret;