/*
 * Return a newly allocated, NUL-terminated copy of buf in which every byte
 * that u8_check() reports as the start of an invalid UTF-8 sequence has been
 * replaced by U+FFFD (the 3-byte sequence EF BF BD).
 *
 * Scanning resumes one byte after each reported error, so a run of k bad
 * bytes produces k replacement runes.  The caller owns the returned buffer.
 */
static char *sanitize_utf8(const char *buf) {
	// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
	static const char replacement[3] = { (char)0xef, (char)0xbf, (char)0xbd };

	const size_t len = strlen(buf);
	const char *const end = buf + len;

	// Pass 1: count the errors so the output can be sized exactly.
	// Every iteration advances ptr by at least one byte, so this terminates.
	size_t errors = 0;
	const char *ptr = buf;
	while (ptr < end) {
		const char *bad = (const char *)u8_check((const uint8_t *)ptr,
				(size_t)(end - ptr));
		if (bad == NULL) {
			break;
		}
		assert(bad >= ptr);
		assert(bad < end);
		errors++;
		ptr = bad + 1;
	}

	// Each error swaps 1 input byte for 3 output bytes: 2 extra per error.
	const size_t safe_len = len + errors * 2;
	char *safe_buf = xmalloc(safe_len + 1);

	// Pass 2: copy valid stretches verbatim and emit a replacement rune for
	// each offending byte.  Writing straight into the output avoids shifting
	// the tail in place (overlapping copy) and reading uninitialised memory.
	char *out = safe_buf;
	ptr = buf;
	while (ptr < end) {
		const char *bad = (const char *)u8_check((const uint8_t *)ptr,
				(size_t)(end - ptr));
		const size_t valid = (size_t)((bad != NULL ? bad : end) - ptr);
		memcpy(out, ptr, valid);
		out += valid;
		ptr += valid;
		if (bad == NULL) {
			break;
		}
		memcpy(out, replacement, sizeof replacement);
		out += sizeof replacement;
		ptr++;
	}
	*out = '\0';

	// We should be the right length
	assert((size_t)(out - safe_buf) == safe_len);
	// We now have a valid utf8 string
	assert(u8_check((const uint8_t *)safe_buf, safe_len) == NULL);

	return safe_buf;
}
/*
 * Lowercase a UTF-8 word using libunistring.
 *
 * localword: the word to convert (must be valid UTF-8).
 * lang:      language code passed through to u8_tolower().
 *
 * Returns the lowercased word, with UNINORM_NFKC passed to u8_tolower() as
 * the normalization form.  If the input is not valid UTF-8, or the conversion
 * fails, a diagnostic is written to cerr and an empty string is returned
 * (no exception is thrown).
 */
string downstring(string localword, string lang) {
    const size_t length = localword.size();
    // libunistring works on uint8_t units rather than char.
    const uint8_t *word = (const uint8_t *)localword.c_str();

    if (u8_check(word, length)) {
        cerr << endl << "Invalid UTF-8 in word: "<< word << " : Dropping it." << endl;
        return(string(""));
    }

    // Stack buffer used for the common case of short words.
    uint8_t output[200];
    size_t outLength = sizeof(output);

    // u8_tolower() only uses `output` when the result fits; otherwise it
    // returns a freshly allocated array.  The returned pointer, not `output`,
    // is therefore the authoritative location of the result.
    uint8_t *lowered = u8_tolower(word, length, lang.c_str(), UNINORM_NFKC, output, &outLength);
    if (!lowered) {
        cerr << endl << "Error during lowercase conversion for word : "<< word << " : Dropping it." << endl;
        return(string(""));
    }

    // Build the C++ string from begin/end pointers into the result.
    string result((const char *)lowered, (const char *)lowered + outLength);

    // Release the heap buffer if libunistring had to allocate one.
    if (lowered != output) {
        free(lowered);
    }
    return result;
}
/*
 * Make sure a string is UTF-8.
 *
 * str:      NUL-terminated input; may be NULL.
 * fromcode: encoding name handed to u8_strconv_from_encoding() when str is
 *           not already valid UTF-8.
 *
 * Returns NULL if str is NULL or if the conversion fails (the failure is
 * logged).  If str is already valid UTF-8 it is returned as-is, after a
 * leading byte-order mark has been removed in place.  Otherwise the pointer
 * produced by u8_strconv_from_encoding() is returned; str is left untouched.
 */
char *
unicode_fixup_string(char *str, const char *fromcode)
{
  static const char bom[] = "\xef\xbb\xbf";
  uint8_t *converted;
  size_t n;

  if (str == NULL)
    return NULL;

  n = strlen(str);

  if (u8_check((uint8_t *)str, n) != NULL)
    {
      /* Not valid UTF-8: treat the bytes as being in 'fromcode' */
      converted = u8_strconv_from_encoding(str, fromcode, iconveh_question_mark);
      if (converted == NULL)
	{
	  DPRINTF(E_LOG, L_MISC, "Could not convert string '%s' to UTF-8: %s\n", str, strerror(errno));

	  return NULL;
	}

      return (char *)converted;
    }

  /* Already valid UTF-8: drop a leading byte-order mark, moving the
   * terminating NUL along with the remaining bytes */
  if (n >= 3 && memcmp(bom, str, 3) == 0)
    memmove(str, str + 3, n - 3 + 1);

  return str;
}
/*
 * Report whether the n bytes starting at s pass u8_check().
 *
 * Returns true when u8_check() finds no invalid unit (it returns NULL),
 * false otherwise.
 */
bool sss_utf8_check(const uint8_t *s, size_t n)
{
    return u8_check(s, n) == NULL;
}
/* A tricky optimization, but probably worth it. */ unsigned long scm_i_utf8_string_hash (const char *str, size_t len) { const scm_t_uint8 *end, *ustr = (const scm_t_uint8 *) str; unsigned long ret; /* The length of the string in characters. This name corresponds to Jenkins' original name. */ size_t length; scm_t_uint32 a, b, c, u32; if (len == (size_t) -1) len = strlen (str); end = ustr + len; if (u8_check (ustr, len) != NULL) /* Invalid UTF-8; punt. */ return scm_i_string_hash (scm_from_utf8_stringn (str, len)); length = u8_strnlen (ustr, len); /* Set up the internal state. */ a = b = c = 0xdeadbeef + ((scm_t_uint32)(length<<2)) + 47; /* Handle most of the key. */ while (length > 3) { ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr); a += u32; ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr); b += u32; ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr); c += u32; mix (a, b, c); length -= 3; } /* Handle the last 3 elements's. */ ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr); a += u32; if (--length) { ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr); b += u32; if (--length) { ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr); c += u32; } } final (a, b, c);
/* Guesses whether the N bytes of text starting at DATA are UTF-8.

   DATA should begin at the first non-ASCII text character of the input (as
   determined by encoding_guess_is_ascii_text()).  N must be at least
   ENCODING_GUESS_MIN, unless fewer bytes than that remain in the file from
   that character on.

   Return value:

     -1: the input contains only ASCII characters.  (It may then be treated
         as UTF-8, because ASCII is a subset of UTF-8.)

      0: the encoding is definitely not UTF-8, because the input contains
         byte sequences that are not valid in UTF-8.

      1: the encoding appears to be UTF-8, because the input contains valid
         UTF-8 multibyte sequences.

   See encoding-guesser.h for the intended use of this function. */
int
encoding_guess_tail_is_utf8 (const void *data, size_t n)
{
  /* Nothing but ASCII bytes: plain ASCII. */
  if (encoding_guess_count_ascii (data, n) == n)
    return -1;

  /* With fewer than ENCODING_GUESS_MIN bytes the whole tail is validated
     with u8_check(); otherwise is_all_utf8_text() makes the call. */
  if (n < ENCODING_GUESS_MIN)
    return u8_check (data, n) == NULL;

  return is_all_utf8_text (data, n);
}
char * u8_strconv_to_encoding (const uint8_t *string, const char *tocode, enum iconv_ilseq_handler handler) { char *result; size_t length; if (STRCASEEQ (tocode, "UTF-8", 'U','T','F','-','8',0,0,0,0)) { /* Conversion from UTF-8 to UTF-8. No need to go through iconv(). */ length = u8_strlen (string) + 1; #if CONFIG_UNICODE_SAFETY if (u8_check (string, length)) { errno = EILSEQ; return NULL; } #endif result = (char *) malloc (length); if (result == NULL) { errno = ENOMEM; return NULL; } memcpy (result, (const char *) string, length); return result; } else { result = NULL; length = 0; if (mem_iconveha ((const char *) string, u8_strlen (string) + 1, "UTF-8", tocode, handler == iconveh_question_mark, handler, NULL, &result, &length) < 0) return NULL; /* Verify the result has exactly one NUL byte, at the end. */ if (!(length > 0 && result[length-1] == '\0' && strlen (result) == length-1)) { free (result); errno = EILSEQ; return NULL; } return result; } }
char * u8_conv_to_encoding (const char *tocode, enum iconv_ilseq_handler handler, const uint8_t *src, size_t srclen, size_t *offsets, char *resultbuf, size_t *lengthp) { if (STRCASEEQ (tocode, "UTF-8", 'U','T','F','-','8',0,0,0,0)) { char *result; /* Conversion from UTF-8 to UTF-8. No need to go through iconv(). */ #if CONFIG_UNICODE_SAFETY if (u8_check (src, srclen)) { errno = EILSEQ; return NULL; } #endif /* Memory allocation. */ if (resultbuf != NULL && *lengthp >= srclen) result = resultbuf; else { result = (char *) malloc (srclen > 0 ? srclen : 1); if (result == NULL) { errno = ENOMEM; return NULL; } } memcpy (result, (const char *) src, srclen); *lengthp = srclen; return result; } else { char *result = resultbuf; size_t length = *lengthp; if (mem_iconveha ((const char *) src, srclen, "UTF-8", tocode, handler == iconveh_question_mark, handler, offsets, &result, &length) < 0) return NULL; if (result == NULL) /* when (resultbuf == NULL && length == 0) */ { result = (char *) malloc (1); if (result == NULL) { errno = ENOMEM; return NULL; } } *lengthp = length; return result; } }
/* Convert the SRCLEN bytes at SRC from the encoding FROMCODE to UTF-8.

   If OFFSETS is non-NULL, the UTF-8 fast path fills OFFSETS[0..SRCLEN-1]:
   the index of each character's first unit maps to itself, and every
   following unit of the same character maps to (size_t)(-1).  For other
   encodings OFFSETS is handed to mem_iconveha() together with HANDLER.

   If RESULTBUF is non-NULL and *LENGTHP is large enough, the fast path
   stores the result there; otherwise it allocates.

   On success the result pointer is returned and *LENGTHP holds the result
   length.  On failure NULL is returned; errno is set here to EILSEQ or
   ENOMEM for the failures detected by this function.  */
uint8_t *
u8_conv_from_encoding (const char *fromcode,
                       enum iconv_ilseq_handler handler,
                       const char *src, size_t srclen,
                       size_t *offsets,
                       uint8_t *resultbuf, size_t *lengthp)
{
  char *converted;
  size_t converted_len;

  if (STRCASEEQ (fromcode, "UTF-8", 'U','T','F','-','8',0,0,0,0))
    {
      /* UTF-8 to UTF-8: validate and copy, iconv() is not needed.  */
      const uint8_t *units = (const uint8_t *) src;
      uint8_t *copy;

      if (u8_check (units, srclen))
        {
          errno = EILSEQ;
          return NULL;
        }

      if (offsets != NULL)
        {
          size_t pos = 0;

          while (pos < srclen)
            {
              int nunits = u8_mblen (units + pos, srclen - pos);
              int k;

              /* The u8_check above guarantees nunits > 0.  */
              if (nunits <= 0)
                abort ();

              offsets[pos] = pos;
              for (k = 1; k < nunits; k++)
                offsets[pos + k] = (size_t)(-1);
              pos += nunits;
            }
        }

      /* Use the caller's buffer when it is big enough, else allocate
         (at least one byte, so that the allocation size is never 0).  */
      if (resultbuf == NULL || *lengthp < srclen)
        {
          copy = (uint8_t *) malloc (srclen > 0 ? srclen : 1);
          if (copy == NULL)
            {
              errno = ENOMEM;
              return NULL;
            }
        }
      else
        copy = resultbuf;

      memcpy ((char *) copy, src, srclen);
      *lengthp = srclen;
      return copy;
    }

  converted = (char *) resultbuf;
  converted_len = *lengthp;
  if (mem_iconveha (src, srclen, fromcode, "UTF-8", true, handler,
                    offsets, &converted, &converted_len) < 0)
    return NULL;

  /* This happens when resultbuf == NULL and the result is empty: still
     hand back a non-NULL pointer so that NULL keeps meaning failure.  */
  if (converted == NULL)
    {
      converted = (char *) malloc (1);
      if (converted == NULL)
        {
          errno = ENOMEM;
          return NULL;
        }
    }

  *lengthp = converted_len;
  return (uint8_t *) converted;
}
/* Check that u8_check accepts the byte string BYTES (returns NULL).
   The length passed excludes the literal's terminating NUL.  */
#define CHECK_VALID(bytes) \
  do                                                                   \
    {                                                                  \
      static const uint8_t input[] = bytes;                            \
      ASSERT (u8_check (input, sizeof (input) - 1) == NULL);           \
    }                                                                  \
  while (0)

/* Check that u8_check rejects BYTES and points at offset OFF.  */
#define CHECK_INVALID_AT(bytes, off) \
  do                                                                   \
    {                                                                  \
      static const uint8_t input[] = bytes;                            \
      ASSERT (u8_check (input, sizeof (input) - 1) == input + (off));  \
    }                                                                  \
  while (0)

/* Exercises u8_check on valid and invalid UTF-8.  Every invalid case starts
   with the same two valid 2-byte characters, so the error is expected at
   offset 4.  */
int
main ()
{
  /* Empty string.  */
  CHECK_VALID ("");

  /* Valid non-empty string: "Данило Шеган".  */
  CHECK_VALID ("\320\224\320\260\320\275\320\270\320\273\320\276 \320\250\320\265\320\263\320\260\320\275");

  /* Out-of-range character with 4 bytes: U+110000.  */
  CHECK_INVALID_AT ("\320\224\320\260\364\220\200\200", 4);

  /* Out-of-range character with 5 bytes: U+200000.  */
  CHECK_INVALID_AT ("\320\224\320\260\370\210\200\200\200", 4);

  /* Out-of-range character with 6 bytes: U+4000000.  */
  CHECK_INVALID_AT ("\320\224\320\260\374\204\200\200\200\200", 4);

  /* Invalid lead byte.  */
  CHECK_INVALID_AT ("\320\224\320\260\376\200\200\200\200\200", 4);
  CHECK_INVALID_AT ("\320\224\320\260\377\200\200\200\200\200", 4);

  /* Overlong 2-byte character.  */
  CHECK_INVALID_AT ("\320\224\320\260\301\200", 4);

  /* Overlong 3-byte character.  */
  CHECK_INVALID_AT ("\320\224\320\260\340\200\277", 4);

  /* Overlong 4-byte character.  */
  CHECK_INVALID_AT ("\320\224\320\260\360\200\277\277", 4);

  /* Invalid bytes in 2-byte character (the first case is the valid
     baseline).  */
  CHECK_VALID ("\320\224\320\260\302\200");
  CHECK_INVALID_AT ("\320\224\320\260\302\100", 4);
  CHECK_INVALID_AT ("\320\224\320\260\302\300", 4);

  /* Invalid bytes in 3-byte character (the first case is the valid
     baseline).  */
  CHECK_VALID ("\320\224\320\260\342\200\200");
  CHECK_INVALID_AT ("\320\224\320\260\342\100\200", 4);
  CHECK_INVALID_AT ("\320\224\320\260\342\300\200", 4);
  CHECK_INVALID_AT ("\320\224\320\260\342\200\100", 4);
  CHECK_INVALID_AT ("\320\224\320\260\342\200\300", 4);

  /* Invalid bytes in 4-byte character (the first case is the valid
     baseline).  */
  CHECK_VALID ("\320\224\320\260\362\200\200\200");
  CHECK_INVALID_AT ("\320\224\320\260\362\100\200\200", 4);
  CHECK_INVALID_AT ("\320\224\320\260\362\300\200\200", 4);
  CHECK_INVALID_AT ("\320\224\320\260\362\200\100\200", 4);
  CHECK_INVALID_AT ("\320\224\320\260\362\200\300\200", 4);
  CHECK_INVALID_AT ("\320\224\320\260\362\200\200\100", 4);
  CHECK_INVALID_AT ("\320\224\320\260\362\200\200\300", 4);

  /* Truncated/incomplete 2-byte character.  */
  CHECK_INVALID_AT ("\320\224\320\260\302", 4);

  /* Truncated/incomplete 3-byte character.  */
  CHECK_INVALID_AT ("\320\224\320\260\342\200", 4);

  /* Truncated/incomplete 4-byte character.  */
  CHECK_INVALID_AT ("\320\224\320\260\362\200\200", 4);

  /* Missing lead byte.  */
  CHECK_INVALID_AT ("\320\224\320\260\200\200\200\200\200", 4);

  /* Surrogate codepoints.  */
  CHECK_INVALID_AT ("\320\224\320\260\355\240\200\355\260\200", 4);
  CHECK_INVALID_AT ("\320\224\320\260\355\260\200", 4);

  return 0;
}
/** * gnutls_utf8_password_normalize: * @password: contain the UTF-8 formatted password * @plen: the length of the provided password * @out: the result in an null-terminated allocated string * @flags: should be zero * * This function will convert the provided UTF-8 password according * to the normalization rules in RFC7613. * * If the flag %GNUTLS_UTF8_IGNORE_ERRS is specified, any UTF-8 encoding * errors will be ignored, and in that case the output will be a copy of the input. * * Returns: %GNUTLS_E_INVALID_UTF8_STRING on invalid UTF-8 data, or 0 on success. * * Since: 3.5.7 **/ int gnutls_utf8_password_normalize(const unsigned char *password, unsigned plen, gnutls_datum_t *out, unsigned flags) { size_t ucs4_size = 0, nrm_size = 0; size_t final_size = 0; uint8_t *final = NULL; uint32_t *ucs4 = NULL; uint32_t *nrm = NULL; uint8_t *nrmu8 = NULL; int ret; if (plen == 0) { out->data = (uint8_t*)gnutls_strdup(""); out->size = 0; if (out->data == NULL) return gnutls_assert_val(GNUTLS_E_MEMORY_ERROR); return 0; } /* check for invalid UTF-8 */ if (u8_check((uint8_t*)password, plen) != NULL) { gnutls_assert(); if (flags & GNUTLS_UTF8_IGNORE_ERRS) { raw_copy: out->data = gnutls_malloc(plen+1); if (out->data == NULL) return gnutls_assert_val(GNUTLS_E_MEMORY_ERROR); out->size = plen; memcpy(out->data, password, plen); out->data[plen] = 0; return 0; } else { return GNUTLS_E_INVALID_UTF8_STRING; } } /* convert to UTF-32 */ ucs4 = u8_to_u32((uint8_t*)password, plen, NULL, &ucs4_size); if (ucs4 == NULL) { gnutls_assert(); ret = GNUTLS_E_PARSING_ERROR; goto fail; } ret = check_for_valid_freeformclass(ucs4, ucs4_size); if (ret < 0) { gnutls_assert(); if (flags & GNUTLS_UTF8_IGNORE_ERRS) { free(ucs4); goto raw_copy; } if (ret == GNUTLS_E_INVALID_UTF8_STRING) ret = GNUTLS_E_INVALID_PASSWORD_STRING; goto fail; } /* normalize to NFC */ nrm = u32_normalize(UNINORM_NFC, ucs4, ucs4_size, NULL, &nrm_size); if (nrm == NULL) { gnutls_assert(); ret = GNUTLS_E_INVALID_PASSWORD_STRING; goto 
fail; } /* convert back to UTF-8 */ final_size = 0; nrmu8 = u32_to_u8(nrm, nrm_size, NULL, &final_size); if (nrmu8 == NULL) { gnutls_assert(); ret = GNUTLS_E_INVALID_PASSWORD_STRING; goto fail; } /* copy to output with null terminator */ final = gnutls_malloc(final_size+1);
/**
 * mongo_bson_iter_next:
 * @iter: the iterator to advance; must not be NULL.
 *
 * Advances @iter to the next field of the raw buffer it walks.
 *
 * The iterator state lives in the user_data fields: user_data1 is the raw
 * buffer, user_data2 its length, user_data3 the current offset, user_data4
 * the current key, user_data5 the current type, and user_data6/user_data7
 * up to two pointers into the current value.  Within this function the
 * offset is left on the last byte consumed, because the next call
 * pre-increments it before reading the type byte.
 *
 * If a UTF-8 string value fails validation (and ITER_TRUST_UTF8 is not
 * set), the value is truncated in place at the first invalid byte,
 * ITER_INVALID_UTF8 is set in the iterator flags, and iteration still
 * succeeds.
 *
 * Returns: TRUE if the iterator now rests on a field.  FALSE at the end of
 *   the buffer, on a zero type byte, or on malformed/unknown data; in that
 *   case the whole iterator is zeroed.
 */
gboolean
mongo_bson_iter_next (MongoBsonIter *iter)
{
   const guint8 *rawbuf;
   gsize rawbuf_len;
   gsize offset;
   const gchar *key;
   MongoBsonType type;
   const guint8 *value1;
   const guint8 *value2;
   const gchar *end = NULL;
   guint32 max_len;

   g_return_val_if_fail(iter != NULL, FALSE);

   /*
    * Copy values onto stack from iter.
    */
   rawbuf = iter->user_data1;
   rawbuf_len = GPOINTER_TO_SIZE(iter->user_data2);
   offset = GPOINTER_TO_SIZE(iter->user_data3);
   key = (const gchar *)iter->user_data4;
   type = GPOINTER_TO_INT(iter->user_data5);
   value1 = (const guint8 *)iter->user_data6;
   value2 = (const guint8 *)iter->user_data7;

   /*
    * Unset the invalid utf8 field.
    */
   iter->flags &= ~ITER_INVALID_UTF8;

   /*
    * Check for end of buffer.
    */
   if ((offset + 1) >= rawbuf_len) {
      GOTO(failure);
   }

   /*
    * Get the type of the next field.
    */
   if (!(type = rawbuf[++offset])) {
      /*
       * This is the end of the iterator.
       */
      GOTO(failure);
   }

   /*
    * Get the key of the next field.
    */
   key = (const gchar *)&rawbuf[++offset];
   max_len = first_nul(key, rawbuf_len - offset - 1);
   if (!(iter->flags & ITER_TRUST_UTF8)) {
      if (!g_utf8_validate(key, max_len, &end)) {
         GOTO(failure);
      }
   }
   /*
    * NOTE(review): this relies on the key being NUL-terminated inside the
    * buffer; presumably first_nul() establishes that -- confirm.
    */
   offset += strlen(key) + 1;

   switch (type) {
   case MONGO_BSON_UTF8:
      if ((offset + 5) < rawbuf_len) {
         value1 = &rawbuf[offset];
         offset += 4;
         value2 = &rawbuf[offset];
         /*
          * Read the length prefix with memcpy(): the value is not
          * guaranteed to be aligned for a guint32 load.
          */
         memcpy(&max_len, value1, sizeof max_len);
         max_len = GUINT32_FROM_LE(max_len);
         /*
          * The declared length must be non-zero (it is decremented below)
          * and the string bytes must lie entirely inside the buffer.
          * offset < rawbuf_len holds here, so the subtraction cannot wrap.
          */
         if ((max_len > 0) && (max_len <= (rawbuf_len - offset))) {
            if (!(iter->flags & ITER_TRUST_UTF8)) {
               if ((end = (char *)u8_check((guint8 *)value2, max_len - 1))) {
                  /*
                   * The string is not valid UTF-8, but there was
                   * definitely a key here, and consumers may need the
                   * data that follows it.  Keep the longest valid prefix
                   * by writing a NUL at the first invalid byte, flag the
                   * iterator, and carry on.
                   */
                  *(gchar *)end = '\0';
                  offset += max_len - 1;
                  iter->flags |= ITER_INVALID_UTF8;
                  GOTO(success);
               }
            }
            offset += max_len - 1;
            /*
             * The last byte of the declared length must be the
             * terminating NUL.
             */
            if (value2[max_len - 1] == '\0') {
               GOTO(success);
            }
         }
      }
      GOTO(failure);
   case MONGO_BSON_DOCUMENT:
   case MONGO_BSON_ARRAY:
      if ((offset + 5) < rawbuf_len) {
         value1 = &rawbuf[offset];
         value2 = NULL;
         memcpy(&max_len, value1, sizeof max_len);
         max_len = GUINT32_FROM_LE(max_len);
         if ((offset + max_len) <= rawbuf_len) {
            offset += max_len - 1;
            GOTO(success);
         }
      }
      GOTO(failure);
   case MONGO_BSON_NULL:
   case MONGO_BSON_UNDEFINED:
      /*
       * No value bytes: step back so that offset rests on the key's
       * terminating NUL.
       */
      value1 = NULL;
      value2 = NULL;
      offset--;
      GOTO(success);
   case MONGO_BSON_OBJECT_ID:
      if ((offset + 12) < rawbuf_len) {
         value1 = &rawbuf[offset];
         value2 = NULL;
         offset += 11;
         GOTO(success);
      }
      GOTO(failure);
   case MONGO_BSON_BOOLEAN:
      if ((offset + 1) < rawbuf_len) {
         value1 = &rawbuf[offset];
         value2 = NULL;
         GOTO(success);
      }
      GOTO(failure);
   case MONGO_BSON_DATE_TIME:
   case MONGO_BSON_DOUBLE:
   case MONGO_BSON_INT64:
      if ((offset + 8) < rawbuf_len) {
         value1 = &rawbuf[offset];
         value2 = NULL;
         offset += 7;
         GOTO(success);
      }
      GOTO(failure);
   case MONGO_BSON_REGEX:
      /*
       * Two consecutive NUL-terminated strings: value1, then value2.
       */
      value1 = &rawbuf[offset];
      max_len = first_nul((gchar *)value1, rawbuf_len - offset - 1);
      if (!(iter->flags & ITER_TRUST_UTF8)) {
         if (!g_utf8_validate((gchar *)value1, max_len, &end)) {
            GOTO(failure);
         }
      }
      offset += max_len + 1;
      if ((offset + 1) >= rawbuf_len) {
         GOTO(failure);
      }
      value2 = &rawbuf[offset];
      max_len = first_nul((gchar *)value2, rawbuf_len - offset - 1);
      if (!(iter->flags & ITER_TRUST_UTF8)) {
         if (!g_utf8_validate((gchar *)value2, max_len, &end)) {
            GOTO(failure);
         }
      }
      offset += max_len + 1;
      GOTO(success);
   case MONGO_BSON_INT32:
      if ((offset + 4) < rawbuf_len) {
         value1 = &rawbuf[offset];
         value2 = NULL;
         offset += 3;
         GOTO(success);
      }
      GOTO(failure);
   default:
      g_warning("Unknown type: %d key: %s", type, key);
      GOTO(failure);
   }

success:
   /*
    * Publish the new position back into the iterator.
    */
   iter->user_data3 = GSIZE_TO_POINTER(offset);
   iter->user_data4 = (gpointer)key;
   iter->user_data5 = GINT_TO_POINTER(type);
   iter->user_data6 = (gpointer)value1;
   iter->user_data7 = (gpointer)value2;
   return TRUE;

failure:
   /*
    * Zero the iterator: with a zero buffer length the end-of-buffer check
    * above fails on any further call.
    */
   memset(iter, 0, sizeof *iter);
   return FALSE;
}
/*
 * Make sure a string is UTF-8.
 *
 * str: NUL-terminated input; may be NULL.
 *
 * Returns NULL if str is NULL or if the conversion fails (the failure is
 * logged).  If str is already valid UTF-8 it is returned unchanged.
 * Otherwise the bytes are converted from "ascii" with the
 * iconveh_question_mark handler and the NUL-terminated buffer produced by
 * u8_conv_from_encoding() is returned; str is left untouched.
 */
char *
unicode_fixup_string(char *str)
{
  uint8_t *ret;
  size_t len;
  size_t outlen;

  if (!str)
    return NULL;

  len = strlen(str);

  /* String is valid UTF-8 */
  if (!u8_check((uint8_t *)str, len))
    return str;

  /* u8_conv_from_encoding() converts exactly the number of bytes it is given
   * and does not terminate its result, so include the terminating NUL in the
   * conversion; the returned buffer is then a proper C string. */
  outlen = 0;
  ret = u8_conv_from_encoding("ascii", iconveh_question_mark, str, len + 1, NULL, NULL, &outlen);
  if (!ret)
    {
      DPRINTF(E_LOG, L_MISC, "Could not convert string '%s' to UTF-8: %s\n", str, strerror(errno));

      return NULL;
    }

  return (char *)ret;
}