/**
 * \brief Case-insensitive basic search that uses a bad characters array.
 *        badchars holds a flag at every (lowercased) byte value that cannot
 *        occur inside the needle, so a mismatch on such a byte lets the
 *        search skip past it.
 *
 * \param haystack pointer to the buffer to search in
 * \param haystack_len length limit of the buffer
 * \param needle pointer to the pattern we are searching for
 * \param needle_len length limit of the needle
 * \param badchars pointer to an array of badchars prepared by
 *                 Bs2BmBadcharsNocase()
 *
 * \retval ptr to start of the match; NULL if no match
 */
uint8_t *Bs2BmNocase(const uint8_t *haystack, uint32_t haystack_len, const uint8_t *needle, uint16_t needle_len, uint8_t badchars[])
{
    const uint8_t *h, *n;
    const uint8_t *hmax = haystack + haystack_len;
    const uint8_t *nmax = needle + needle_len;

    if (needle_len == 0 || needle_len > haystack_len)
        return NULL;

    /* keep going for as long as a whole needle still fits in what is left */
    for (n = needle; nmax - n <= hmax - haystack; haystack++) {
        if (u8_tolower(*haystack) != u8_tolower(*n)) {
            continue;
        }
        /* one byte needles */
        if (needle_len == 1)
            return (uint8_t *)haystack;

        /* compare the rest of the needle; the bound is measured from the
         * comparison cursor h (not from the candidate start), the same way
         * BasicSearchNocase() does it */
        for (h = haystack + 1, n++; nmax - n <= hmax - h; h++, n++) {
            if (u8_tolower(*h) != u8_tolower(*n)) {
                if (badchars[u8_tolower(*h)] == 1) {
                    /* skip it! this byte can't be part of any match, so
                     * the outer loop resumes right after it */
                    haystack = h;
                }
                break;
            }
            /* if we run out of needle we fully matched */
            if (n == nmax - 1) {
                return (uint8_t *)haystack;
            }
        }
        n = needle;
    }

    return NULL;
}
/**
 * \brief Basic case-insensitive search
 *
 * Tries every start offset at which the whole needle still fits and
 * compares byte by byte after folding both sides with u8_tolower().
 *
 * \param haystack pointer to the buffer to search in
 * \param haystack_len length limit of the buffer
 * \param needle pointer to the pattern we are searching for
 * \param needle_len length limit of the needle
 *
 * \retval ptr to start of the first match; NULL if no match
 */
uint8_t *BasicSearchNocase(const uint8_t *haystack, uint32_t haystack_len, const uint8_t *needle, uint32_t needle_len)
{
    if (needle_len == 0 || needle_len > haystack_len)
        return NULL;

    /* last offset at which a full needle still fits */
    const uint32_t last_start = haystack_len - needle_len;

    for (uint32_t start = 0; start <= last_start; start++) {
        const uint8_t *window = haystack + start;
        uint32_t matched = 0;

        /* count how many leading bytes of the window agree with the needle */
        while (matched < needle_len &&
               u8_tolower(window[matched]) == u8_tolower(needle[matched])) {
            matched++;
        }

        if (matched == needle_len) {
            return (uint8_t *)window;
        }
    }

    return NULL;
}
/* Run u8_tolower() on INPUT under the three result-buffer conventions
   (no buffer, buffer one byte too small, buffer exactly large enough) and
   compare each outcome with EXPECTED.

   Returns 0 when every convention behaves as expected, otherwise a small
   positive code (1..11) identifying the first check that failed.  Every
   buffer obtained here is released before returning, on failure paths too.  */
static int
check (const uint8_t *input, size_t input_length,
       const char *iso639_language, uninorm_t nf,
       const uint8_t *expected, size_t expected_length)
{
  size_t length;
  uint8_t *result;
  int status = 0;

  /* Test return conventions with resultbuf == NULL.  */
  result = u8_tolower (input, input_length, iso639_language, nf, NULL, &length);
  if (result == NULL)
    return 1;
  if (length != expected_length)
    status = 2;
  else if (u8_cmp (result, expected, expected_length) != 0)
    status = 3;
  free (result);
  if (status != 0)
    return status;

  /* Test return conventions with resultbuf too small.  */
  if (expected_length > 0)
    {
      uint8_t *preallocated;

      length = expected_length - 1;
      preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
      result = u8_tolower (input, input_length, iso639_language, nf, preallocated, &length);
      if (result == NULL)
        status = 4;
      else if (result == preallocated)
        status = 5;
      else if (length != expected_length)
        status = 6;
      else if (u8_cmp (result, expected, expected_length) != 0)
        status = 7;
      /* result is a separate allocation unless it aliases preallocated.  */
      if (result != preallocated)
        free (result);
      free (preallocated);
      if (status != 0)
        return status;
    }

  /* Test return conventions with resultbuf large enough.  */
  {
    uint8_t *preallocated;

    length = expected_length;
    preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
    result = u8_tolower (input, input_length, iso639_language, nf, preallocated, &length);
    if (result == NULL)
      status = 8;
    else if (!(preallocated == NULL || result == preallocated))
      status = 9;
    else if (length != expected_length)
      status = 10;
    else if (u8_cmp (result, expected, expected_length) != 0)
      status = 11;
    /* When malloc (0) yielded NULL the function had to allocate its own
       buffer; release that one as well.  */
    if (result != preallocated)
      free (result);
    free (preallocated);
  }

  return status;
}
// Lowercase and NFKC-normalize a UTF-8 word using libunistring.
//
// localword: the word to convert (expected to be valid UTF-8)
// lang:      ISO 639 language code forwarded to u8_tolower()
//
// Returns the converted word, or an empty string (after logging to cerr)
// when the input is not valid UTF-8 or the conversion fails.
string downstring(string localword, string lang)
{
    // Get string length
    const size_t length = localword.size();

    // create correct type for c-style unicode string
    const uint8_t *word = (const uint8_t *)localword.c_str();

    // stack buffer used for the common case of short words; on input
    // outLength is its capacity, on output the length of the result
    uint8_t output[200];
    size_t outLength = sizeof(output);

    // u8_check() returns non-NULL when it finds an invalid unit
    if (u8_check(word, length)) {
        cerr << endl << "Invalid UTF-8 in word: " << word << " : Dropping it." << endl;
        return(string(""));
    }

    // make lowercase and normalize. The result is NOT necessarily in
    // `output`: when it does not fit, u8_tolower() returns a freshly
    // allocated buffer instead, so always read from the returned pointer.
    uint8_t *lowered = u8_tolower(word, length, lang.c_str(), UNINORM_NFKC, output, &outLength);
    if (!lowered) {
        cerr << endl << "Error during lowercase conversion for word : " << word << " : Dropping it." << endl;
        return(string(""));
    }

    // build a c++ string from begin and end pointers (result is not
    // NUL-terminated)
    string result((const char *)lowered, (const char *)lowered + outLength);

    // release the heap buffer if the stack buffer was too small
    if (lowered != output)
        free(lowered);

    return result;
}
/*
 * Lowercase the first len bytes of s via u8_tolower(), letting it
 * allocate the result buffer.
 *
 * Returns the buffer handed back by u8_tolower(), or NULL when it fails.
 * When _nlen is not NULL it receives the result length on success and is
 * left untouched on failure.
 */
uint8_t *sss_utf8_tolower(const uint8_t *s, size_t len, size_t *_nlen)
{
    size_t out_len;
    uint8_t *result = u8_tolower(s, len, NULL, NULL, NULL, &out_len);

    if (result != NULL && _nlen != NULL) {
        *_nlen = out_len;
    }

    return result;
}
/**
 * Convert the utf-8 input string to lowercase (NFD normalized).
 * Output needs to be allocated appropriately by the caller: it must have
 * room for the converted string plus the terminating 0 byte.
 *
 * If the conversion fails, an empty string is stored in the output buffer.
 *
 * @param input input string
 * @param output output buffer
 */
void GNUNET_STRINGS_utf8_tolower(const char* input, char** output)
{
  uint8_t *tmp_in;
  size_t len;

  tmp_in = u8_tolower ((uint8_t*)input, strlen ((char *) input),
                       NULL, UNINORM_NFD, NULL, &len);
  if (NULL == tmp_in)
  {
    /* conversion failed: len is not meaningful, so hand back a valid
       empty string instead of copying from a NULL pointer */
    (*output)[0] = '\0';
    return;
  }
  memcpy(*output, tmp_in, len);
  /* u8_tolower() does not 0-terminate its result */
  (*output)[len] = '\0';
  free(tmp_in);
}
/**
 * \brief Build the bad characters table used by Bs2BmNocase(): every entry
 *        is 1 except those indexed by the lowercased bytes of the needle.
 *
 * \param needle pointer to the pattern we are searching for
 * \param needle_len length limit of the needle
 * \param badchars pointer to an array of ALPHABET_SIZE entries to fill in.
 *                 On return an entry is 1 when that byte value cannot
 *                 occur inside the (lowercased) needle, which allows
 *                 faster skips.
 */
void Bs2BmBadcharsNocase(const uint8_t *needle, uint16_t needle_len, uint8_t *badchars)
{
    /* start pessimistic: flag every byte value as a bad character */
    uint32_t slot = ALPHABET_SIZE;
    while (slot > 0) {
        slot--;
        badchars[slot] = 1;
    }

    /* then clear the flag for each byte that does appear in the needle,
     * using its lowercase form as the index */
    for (uint32_t pos = 0; pos != needle_len; pos++) {
        badchars[u8_tolower(needle[pos])] = 0;
    }
}
U8_EXPORT
/* u8_downcase:
    Arguments: a null-terminated utf-8 C string
    Returns: a lowercased copy of the string, i.e. the buffer of the
     output stream the converted characters were written to

   Bytes below 0x80 are folded with tolower(); anything else is decoded
   with u8_sgetc() and folded with u8_tolower().
*/
u8_string u8_downcase (u8_string string)
{
  struct U8_OUTPUT out;
  const u8_byte *cursor=string;
  U8_INIT_STATIC_OUTPUT(out,32);
  for (;;) {
    int lowered;
    const u8_byte lead=*cursor;
    if (lead == 0) break;
    if (lead < 0x80) {
      /* single byte character: fold it and step over it */
      lowered=tolower(lead);
      cursor++;}
    else {
      /* multi-byte sequence: u8_sgetc advances the cursor itself */
      lowered=u8_tolower(u8_sgetc(&cursor));}
    u8_putc(&out,lowered);}
  return (u8_string) out.u8_outbuf;
}
/*
 * If the label of entry e is not pure ASCII, add a second entry to vector v
 * that carries the punycode (IDNA ASCII) form of the same label.
 *
 * The label is first lowercased/NFKC-normalized with u8_tolower() and then
 * converted with idn2_lookup_u8(). Nothing is added when the label is ASCII,
 * when any conversion step fails, or when the converted name equals the
 * original label.
 */
static void _add_punycode_if_needed(_psl_vector_t *v, _psl_entry_t *e)
{
	char *lookupname = NULL;
	int rc;
	uint8_t *lower, resbuf[256];
	size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */

	if (_str_is_ascii(e->label_buf))
		return;

	/* we need a conversion to lowercase */
	lower = u8_tolower((uint8_t *)e->label_buf, u8_strlen((uint8_t *)e->label_buf), 0, UNINORM_NFKC, resbuf, &len);
	if (!lower) {
		/* fprintf(stderr, "u8_tolower(%s) failed (%d)\n", e->label_buf, errno); */
		return;
	}

	/* u8_tolower() does not terminate the result string */
	if (lower == resbuf) {
		lower[len] = 0;
	} else {
		/* result did not fit into resbuf: make a terminated copy */
		uint8_t *tmp = lower;

		lower = (uint8_t *)strndup((char *)lower, len);
		free(tmp);
		if (!lower)
			return; /* out of memory, nothing sensible to look up */
	}

	if ((rc = idn2_lookup_u8(lower, (uint8_t **)&lookupname, 0)) == IDN2_OK) {
		if (strcmp(e->label_buf, lookupname)) {
			_psl_entry_t suffix, *suffixp;

			/* fprintf(stderr, "libidn '%s' -> '%s'\n", e->label_buf, lookupname); */
			_suffix_init(&suffix, lookupname, strlen(lookupname));
			suffix.wildcard = e->wildcard;
			/* the vector may fail to store the entry */
			if ((suffixp = _vector_get(v, _vector_add(v, &suffix))))
				suffixp->label = suffixp->label_buf; /* set label to changed address */
		} /* else ignore */

		/* idn2_lookup_u8() hands us an allocated string. The entry keeps
		 * its label in its own label_buf (see above), so the name is
		 * presumed to have been copied by _suffix_init() and can go. */
		free(lookupname);
	} /* else
		fprintf(stderr, "toASCII(%s) failed (%d): %s\n", lower, rc, idn2_strerror(rc)); */

	if (lower != resbuf)
		free(lower);
}
/* Read all of standard input, convert it to lower case according to the
   language of the current locale, and write the result to standard output.

   Takes no command line arguments.  Returns 0 on success and 1 when
   arguments were given, the input could not be read, the conversion failed
   or the output could not be written completely.  */
int
main (int argc, char * argv[])
{
  setlocale (LC_ALL, "");
  if (argc == 1)
    {
      /* Display the lower case of the input string.  */
      char *input = read_file (stdin);
      size_t length;
      size_t output_length;
      uint8_t *output;

      if (input == NULL)
        return 1;
      length = strlen (input);

      output =
        u8_tolower ((uint8_t *) input, length, uc_locale_language (),
                    NULL,
                    NULL, &output_length);
      /* On failure output_length is not set; do not try to print.  */
      if (output == NULL)
        return 1;

      if (fwrite (output, 1, output_length, stdout) != output_length)
        return 1;

      return 0;
    }
  else
    return 1;
}
/*
 * Convert src to its IDNA ASCII form when wget_str_needs_encoding() says it
 * needs one.
 *
 * Returns src itself when no conversion is needed, when the conversion
 * fails, or when no IDN library was compiled in; otherwise returns the
 * string produced by the IDN library.
 */
const char *wget_str_to_ascii(const char *src)
{
#ifdef WITH_LIBIDN2
	if (!wget_str_needs_encoding(src))
		return src;

	char *ascii = NULL;
	int status;

#ifdef WITH_LIBUNISTRING
	uint8_t stackbuf[256];
	size_t lowered_len = sizeof(stackbuf) - 1; // keep one byte for the terminating \0

	// idn2 wants lowercase input
	uint8_t *lowered = u8_tolower((uint8_t *)src, u8_strlen((uint8_t *)src), 0, UNINORM_NFKC, stackbuf, &lowered_len);

	if (!lowered) {
		error_printf("u8_tolower(%s) failed (%d)\n", src, errno);
		return src;
	}

	// the result of u8_tolower() is not terminated
	if (lowered != stackbuf) {
		// result did not fit the stack buffer: swap it for a terminated copy
		uint8_t *unterminated = lowered;

		lowered = (uint8_t *)wget_strmemdup((char *)lowered, lowered_len);
		xfree(unterminated);
	} else {
		lowered[lowered_len] = 0;
	}

	status = idn2_lookup_u8(lowered, (uint8_t **)&ascii, 0);
	if (status != IDN2_OK) {
		error_printf(_("toASCII(%s) failed (%d): %s\n"), lowered, status, idn2_strerror(status));
	} else {
		debug_printf("idn2 '%s' -> '%s'\n", src, ascii);
		src = ascii;
	}

	if (lowered != stackbuf)
		xfree(lowered);
#else
	status = idn2_lookup_u8((uint8_t *)src, (uint8_t **)&ascii, 0);
	if (status != IDN2_OK) {
		error_printf(_("toASCII(%s) failed (%d): %s\n"), src, status, idn2_strerror(status));
	} else {
		debug_printf("idn2 '%s' -> '%s'\n", src, ascii);
		src = ascii;
	}
#endif
#elif WITH_LIBIDN
	if (!wget_str_needs_encoding(src))
		return src;

	if (!_utf8_is_valid(src)) {
		error_printf(_("Invalid UTF-8 sequence not converted: '%s'\n"), src);
		return src;
	}

	char *ascii = NULL;
	// idna_to_ascii_8z() lowercases UTF-8 on its own
	int status = idna_to_ascii_8z(src, &ascii, IDNA_USE_STD3_ASCII_RULES);

	if (status != IDNA_SUCCESS)
		error_printf(_("toASCII failed (%d): %s\n"), status, idna_strerror(status));
	else
		src = ascii;
#else
	if (wget_str_needs_encoding(src))
		error_printf(_("toASCII not available: '%s'\n"), src);
#endif

	return src;
}
/**
 *  \internal
 *  \brief Apply the nocase keyword to the last content pattern match of the
 *         signature: lowercase the pattern, flag it as nocase and rebuild
 *         its single pattern matcher context.
 *  \param de_ctx detection engine ctx
 *  \param s signature
 *  \param nullstr should be null
 *  \retval 0 ok
 *  \retval -1 failure
 */
static int DetectNocaseSetup (DetectEngineCtx *de_ctx, Signature *s, char *nullstr)
{
    SCEnter();

    SigMatch *last_content = NULL;

    /* the keyword takes no value */
    if (nullstr != NULL) {
        SCLogError(SC_ERR_INVALID_VALUE, "nocase has value");
        SCReturnInt(-1);
    }

    /* retrieve the sm to apply nocase against */
    if (s->list == DETECT_SM_LIST_NOTSET) {
        last_content = SigMatchGetLastSMFromLists(s, 28,
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_PMATCH],
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_UMATCH],
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HRUDMATCH],
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HCBDMATCH],
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_FILEDATA],
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HHDMATCH],
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HRHDMATCH],
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HMDMATCH],
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HCDMATCH],
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HSCDMATCH],
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HSMDMATCH],
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HUADMATCH],
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HHHDMATCH],
                DETECT_CONTENT, s->sm_lists_tail[DETECT_SM_LIST_HRHHDMATCH]);
    } else {
        /* an explicit list is selected: only look there */
        last_content = SigMatchGetLastSMFromLists(s, 2,
                DETECT_CONTENT, s->sm_lists_tail[s->list]);
    }

    if (last_content == NULL) {
        SCLogError(SC_ERR_NOCASE_MISSING_PATTERN, "nocase needs "
                   "preceding content, uricontent option, http_client_body, "
                   "http_server_body, http_header option, http_raw_header option, "
                   "http_method option, http_cookie, http_raw_uri, "
                   "http_stat_msg, http_stat_code, http_user_agent or "
                   "file_data/dce_stub_data sticky buffer options");
        SCReturnInt(-1);
    }

    /* verify other conditions. */
    DetectContentData *cd = (DetectContentData *)last_content->ctx;

    if (cd->flags & DETECT_CONTENT_NOCASE) {
        SCLogError(SC_ERR_INVALID_SIGNATURE, "can't use multiple nocase modifiers with the same content");
        SCReturnInt(-1);
    }

    /* for consistency in later use (e.g. by MPM construction and hashing),
     * coerce the content string to lower-case. */
    uint8_t *const content_end = cd->content + cd->content_len;
    for (uint8_t *byte = cd->content; byte < content_end; byte++) {
        *byte = u8_tolower(*byte);
    }

    cd->flags |= DETECT_CONTENT_NOCASE;

    /* Recreate the context with nocase chars */
    SpmDestroyCtx(cd->spm_ctx);
    cd->spm_ctx = SpmInitCtx(cd->content, cd->content_len, 1,
                             de_ctx->spm_global_thread_ctx);
    if (cd->spm_ctx == NULL) {
        SCReturnInt(-1);
    }

    SCReturnInt(0);
}
static int read_rules(FILE *fp, char const *fname, loose_trie **root) /* * populate the root node with all rules. * return number of bad lines, or -1 if out of memory */ { char buf[512]; char *s; int lineno = 0, bad = 0; while ((s = fgets(buf, sizeof buf, fp)) != NULL) { ++lineno; if (s[0] == '/' && s[1] == '/') continue; int is_ascii = 1; int ch; while ((ch = *(unsigned char *)s++) != 0) { if (ch & 0x80) // utf-8, check it is a valid sequence { int m = 0x40; is_ascii = (ch & m) != 0? 0: -1; while ((ch & m) != 0 && is_ascii == 0) { is_ascii = (*(unsigned char*)s++ & 0xc0) == 0x80? 0: -1; m >>= 1; } continue; } if (isspace(ch)) // end of rule { *--s = 0; break; } } if (ch == 0) { (*do_report)(LOG_CRIT, "Line too long at %s:%d: \"%.10s...\"", fname, lineno, buf); ++bad; while ((ch = fgetc(fp)) != '\n' && ch != EOF) ; continue; } assert(*s == 0); size_t len = s - &buf[0]; if (len == 0) // empty line continue; if (!is_ascii) { if (is_ascii < 0) { (*do_report)(LOG_CRIT, "Bad UTF-8 sequence at %s:%d: \"%s\"", fname, lineno, buf); ++bad; continue; } uint8_t norm[128]; size_t ulen = sizeof norm - 1; uint8_t* n = u8_tolower((uint8_t*)buf, len, NULL, UNINORM_NFC, norm, &ulen); if (n != &norm[0]) { (*do_report)(LOG_CRIT, "Failed u8_tolower at %s:%d: %s, len = %zu for \"%s\"", fname, lineno, strerror(errno), ulen, buf); free(n); ++bad; continue; } n[ulen] = 0; uint8_t *xn = NULL; int rtc = idn2_lookup_u8(n, &xn, 0); if (rtc != IDN2_OK || xn == NULL || (len = strlen((char*)xn)) >= sizeof buf) { (*do_report)(LOG_CRIT, "IDNA failed at %s:%d: %s for \"%s\"", fname, lineno, idn2_strerror_name(rtc), buf); ++bad; continue; } memcpy(buf, xn, len); idn2_free(xn); buf[len] = 0; } char **labels = reverse_labels(buf, len, "!*"); if (labels == NULL) { (*do_report)(LOG_CRIT, "Invalid domain at %s:%d for \"%s\"", fname, lineno, buf); ++bad; continue; } loose_trie *node = add_trie_node(root, *labels); for (size_t i = 1; labels[i] && node; ++i) node = add_trie_node(&node->child, labels[i]); 
free(labels); if (node == NULL) // out of memory return -1; node->is_terminal = 1; }
/** * psl_str_to_utf8lower: * @str: string to convert * @encoding: charset encoding of @str, e.g. 'iso-8859-1' or %NULL * @locale: locale of @str for to lowercase conversion, e.g. 'de' or %NULL * @lower: return value containing the converted string * * This helper function converts a string to lowercase UTF-8 representation. * Lowercase UTF-8 is needed as input to the domain checking functions. * * @lower is set to %NULL on error. * * The return value 'lower' must be freed after usage. * * Returns: psl_error_t value. * PSL_SUCCESS: Success * PSL_ERR_INVALID_ARG: @str is a %NULL value. * PSL_ERR_CONVERTER: Failed to open the unicode converter with name @encoding * PSL_ERR_TO_UTF16: Failed to convert @str to unicode * PSL_ERR_TO_LOWER: Failed to convert unicode to lowercase * PSL_ERR_TO_UTF8: Failed to convert unicode to UTF-8 * * Since: 0.4 */ psl_error_t psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower) { int ret = PSL_ERR_INVALID_ARG; if (lower) *lower = NULL; if (!str) return PSL_ERR_INVALID_ARG; /* shortcut to avoid costly conversion */ if (_str_is_ascii(str)) { if (lower) { char *p; *lower = strdup(str); /* convert ASCII string to lowercase */ for (p = *lower; *p; p++) if (isupper(*p)) *p = tolower(*p); } return PSL_SUCCESS; } #ifdef WITH_LIBICU do { size_t str_length = strlen(str); UErrorCode status = 0; UChar *utf16_dst, *utf16_lower; int32_t utf16_dst_length; char *utf8_lower; UConverter *uconv; /* C89 allocation */ utf16_dst = alloca(sizeof(UChar) * (str_length * 2 + 1)); utf16_lower = alloca(sizeof(UChar) * (str_length * 2 + 1)); utf8_lower = alloca(str_length * 2 + 1); uconv = ucnv_open(encoding, &status); if (U_SUCCESS(status)) { utf16_dst_length = ucnv_toUChars(uconv, utf16_dst, str_length * 2 + 1, str, str_length, &status); ucnv_close(uconv); if (U_SUCCESS(status)) { int32_t utf16_lower_length = u_strToLower(utf16_lower, str_length * 2 + 1, utf16_dst, utf16_dst_length, locale, &status); if (U_SUCCESS(status)) 
{ u_strToUTF8(utf8_lower, str_length * 8 + 1, NULL, utf16_lower, utf16_lower_length, &status); if (U_SUCCESS(status)) { if (lower) *lower = strdup(utf8_lower); ret = PSL_SUCCESS; } else { ret = PSL_ERR_TO_UTF8; /* fprintf(stderr, "Failed to convert UTF-16 to UTF-8 (status %d)\n", status); */ } } else { ret = PSL_ERR_TO_LOWER; /* fprintf(stderr, "Failed to convert UTF-16 to lowercase (status %d)\n", status); */ } } else { ret = PSL_ERR_TO_UTF16; /* fprintf(stderr, "Failed to convert string to UTF-16 (status %d)\n", status); */ } } else { ret = PSL_ERR_CONVERTER; /* fprintf(stderr, "Failed to open converter for '%s' (status %d)\n", encoding, status); */ } } while (0); #elif defined(WITH_LIBIDN2) || defined(WITH_LIBIDN) do { /* find out local charset encoding */ if (!encoding) { encoding = nl_langinfo(CODESET); if (!encoding || !*encoding) encoding = "ASCII"; } /* convert to UTF-8 */ if (strcasecmp(encoding, "utf-8")) { iconv_t cd = iconv_open("utf-8", encoding); if (cd != (iconv_t)-1) { char *tmp = (char *)str; /* iconv won't change where str points to, but changes tmp itself */ size_t tmp_len = strlen(str); size_t dst_len = tmp_len * 6, dst_len_tmp = dst_len; char *dst = malloc(dst_len + 1), *dst_tmp = dst; if (iconv(cd, &tmp, &tmp_len, &dst_tmp, &dst_len_tmp) != (size_t)-1) { uint8_t *resbuf = malloc(dst_len * 2 + 1); size_t len = dst_len * 2; /* leave space for additional \0 byte */ if ((dst = (char *)u8_tolower((uint8_t *)dst, dst_len - dst_len_tmp, 0, UNINORM_NFKC, resbuf, &len))) { /* u8_tolower() does not terminate the result string */ if (lower) *lower = strndup((char *)dst, len); } else { ret = PSL_ERR_TO_LOWER; /* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */ } if (lower) *lower = strndup(dst, dst_len - dst_len_tmp); ret = PSL_SUCCESS; } else { ret = PSL_ERR_TO_UTF8; /* fprintf(stderr, "Failed to convert '%s' string into '%s' (%d)\n", src_encoding, dst_encoding, errno); */ } free(dst); iconv_close(cd); } else { ret = 
PSL_ERR_TO_UTF8; /* fprintf(stderr, "Failed to prepare encoding '%s' into '%s' (%d)\n", src_encoding, dst_encoding, errno); */ } } else ret = PSL_SUCCESS; /* convert to lowercase */ if (ret == PSL_SUCCESS) { uint8_t *dst, resbuf[256]; size_t len = sizeof(resbuf) - 1; /* leave space for additional \0 byte */ /* we need a conversion to lowercase */ if ((dst = u8_tolower((uint8_t *)str, u8_strlen((uint8_t *)str), 0, UNINORM_NFKC, resbuf, &len))) { /* u8_tolower() does not terminate the result string */ if (lower) *lower = strndup((char *)dst, len); } else { ret = PSL_ERR_TO_LOWER; /* fprintf(stderr, "Failed to convert UTF-8 to lowercase (errno %d)\n", errno); */ } } } while (0); #endif return ret; }