/**
 * Normalize a string, preferring its Latin-ASCII transliteration.
 *
 * The first `len` bytes of `str` are passed to transliterate() with the
 * LATIN_ASCII transliterator. If that yields a string, the transliterated
 * form is normalized; otherwise `str` itself is normalized.
 *
 * @param str      input string
 * @param len      number of bytes of `str` handed to the transliterator
 * @param options  NORMALIZE_STRING_* flags forwarded to normalize_string_utf8()
 * @return whatever normalize_string_utf8() returns for the chosen input
 *         (may be NULL). NOTE(review): other callers in this file free()
 *         that result, so the caller presumably owns it -- confirm.
 */
char *normalize_string_latin(char *str, size_t len, uint64_t options) {
    char *latin_ascii = transliterate(LATIN_ASCII, str, len);

    // Nothing came back from the transliterator: normalize the input as-is.
    if (latin_ascii == NULL) {
        return normalize_string_utf8(str, options);
    }

    // The intermediate transliteration is only needed for this one call.
    char *result = normalize_string_utf8(latin_ascii, options);
    free(latin_ascii);

    return result;
}
/**
 * Add the Latin normalizations of a string to `tree`.
 *
 * Up to two candidate strings are produced:
 *   1. (only with NORMALIZE_STRING_LATIN_ASCII) transliterate first, then
 *      normalize the transliterated text;
 *   2. normalize first, then (only with NORMALIZE_STRING_LATIN_ASCII)
 *      transliterate the normalized text.
 * Candidate 2 is added only when it differs from candidate 1, so the tree
 * never receives the same alternative twice from this function.
 *
 * Every intermediate string is freed here after being handed to
 * string_tree_add_string().
 *
 * @param tree     tree that receives the alternatives
 * @param str      input string
 * @param len      number of bytes of `str` to consider
 * @param options  NORMALIZE_STRING_* flags
 */
void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options) {
    char *transliterated = NULL;
    char *utf8_normalized = NULL;
    char *prev_string = NULL;

    // Candidate 1: transliterate -> normalize.
    if (options & NORMALIZE_STRING_LATIN_ASCII) {
        transliterated = transliterate(LATIN_ASCII, str, len);
        if (transliterated != NULL) {
            utf8_normalized = normalize_string_utf8(transliterated, options);
            free(transliterated);
            transliterated = NULL;
        }

        if (utf8_normalized != NULL) {
            string_tree_add_string(tree, utf8_normalized);
            // Keep it around to de-duplicate against candidate 2.
            prev_string = utf8_normalized;
            utf8_normalized = NULL;
        }
    }

    // Candidate 2: normalize -> transliterate.
    // normalize_string_utf8() takes no length, so it is given a
    // NUL-terminated copy of the first `len` bytes.
    char *str_copy = strndup(str, len);
    if (str_copy == NULL) {
        // Allocation failed: do not pass NULL on to the normalizer; keep
        // whatever was already added and release what we still own.
        free(prev_string);
        return;
    }
    utf8_normalized = normalize_string_utf8(str_copy, options);
    free(str_copy);

    if ((options & NORMALIZE_STRING_LATIN_ASCII) && utf8_normalized != NULL) {
        transliterated = transliterate(LATIN_ASCII, utf8_normalized, strlen(utf8_normalized));
        free(utf8_normalized);
    } else {
        // No transliteration requested (or nothing to transliterate):
        // the normalized string itself is the candidate.
        transliterated = utf8_normalized;
    }
    utf8_normalized = NULL;

    if (transliterated != NULL) {
        if (prev_string == NULL || strcmp(prev_string, transliterated) != 0) {
            string_tree_add_string(tree, transliterated);
        }
        free(transliterated);
        transliterated = NULL;
    }

    // free(NULL) is a no-op, so no guard is needed.
    free(prev_string);
}
/**
 * Python entry point: normalize(obj, options) -> unicode string.
 *
 * Converts `obj` to a unicode object, encodes it as UTF-8, runs
 * normalize_string_utf8() on the bytes with the given option flags and
 * decodes the result back into a Python unicode object.
 *
 * @param self  unused
 * @param args  tuple (object, unsigned integer options), parsed with "OK"
 * @return a new reference to the normalized unicode string, or NULL with a
 *         Python exception set on any failure.
 */
static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args) {
    PyObject *arg1;
    // The "K" format unit stores through an `unsigned long long *`; parse
    // into that exact type and convert, rather than aliasing a uint64_t.
    unsigned long long options_arg = 0;
    PyObject *result = NULL;
    #ifndef IS_PY3K
    PyObject *str = NULL;
    #endif

    if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options_arg)) {
        return NULL;
    }
    uint64_t options = (uint64_t)options_arg;

    PyObject *unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return NULL;
    }

    #ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    // NOTE(review): newer CPython returns `const char *` here; the cast
    // assumes normalize_string_utf8() does not write through its input --
    // confirm.
    char *input = (char *)PyUnicode_AsUTF8(unistr);
    #else
    // Python 2 encoding
    str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError, "Parameter could not be utf-8 encoded");
        goto cleanup;
    }

    char *input = PyBytes_AsString(str);
    #endif

    if (input == NULL) {
        // The conversion call above has already set the exception.
        goto cleanup;
    }

    char *normalized = normalize_string_utf8(input, options);
    if (normalized == NULL) {
        // Returning NULL to the interpreter without a pending exception
        // surfaces as "SystemError: error return without exception set",
        // so make sure one is set before bailing out.
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_ValueError, "String could not be normalized");
        }
        goto cleanup;
    }

    result = PyUnicode_DecodeUTF8((const char *)normalized,
                                  (Py_ssize_t)strlen(normalized), "strict");
    free(normalized);
    if (result == NULL) {
        PyErr_SetString(PyExc_ValueError, "Result could not be utf-8 decoded");
    }

cleanup:
    // Shared exit path for success and failure; `result` stays NULL on error.
    #ifndef IS_PY3K
    Py_XDECREF(str);
    #endif
    Py_DECREF(unistr);
    return result;
}
string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) { size_t len = strlen(str); string_tree_t *tree = string_tree_new_size(len); size_t consumed = 0; khash_t(int_set) *scripts = kh_init(int_set); char *utf8_normalized = NULL; char *ptr = str; script_t script; while (consumed < len) { string_script_t script_span = get_string_script(ptr, len - consumed); script = script_span.script; size_t script_len = script_span.len; bool is_ascii = script_span.ascii; // Shortcut if the string is all ASCII if (options & NORMALIZE_STRING_LOWERCASE && is_ascii && script_len == len) { utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE); if (utf8_normalized != NULL) { string_tree_add_string(tree, utf8_normalized); string_tree_finalize_token(tree); free(utf8_normalized); utf8_normalized = NULL; } kh_destroy(int_set, scripts); return tree; } log_debug("script_len=%zu\n", script_len); if (script != SCRIPT_LATIN && script_len > 0) { int ret; khiter_t key = kh_put(int_set, scripts, (khint_t)script, &ret); if (ret < 0) { log_error("Error in kh_put\n"); string_tree_destroy(tree); kh_destroy(int_set, scripts); return NULL; } } consumed += script_len; ptr += script_len; } add_latin_alternatives(tree, str, len, options); size_t non_latin_scripts = kh_size(scripts); if (non_latin_scripts > 0) { string_tree_t *transliterators = string_tree_new_size(non_latin_scripts); khint_t key; char *trans_name = NULL; kh_foreach_key(scripts, key, { script = (script_t)key; for (size_t i = 0; i < num_languages; i++) { char *lang = languages[i]; foreach_transliterator(script, lang, trans_name, { string_tree_add_string(transliterators, trans_name); }) } foreach_transliterator(script, "", trans_name, { string_tree_add_string(transliterators, trans_name); })