Example #1
0
/*
 * Transliterate the first len bytes of str with the LATIN_ASCII
 * transliterator, then run the result through normalize_string_utf8()
 * with the given options.
 *
 * If transliterate() returns NULL, the original str is normalized instead.
 *
 * Returns whatever normalize_string_utf8() returns. The intermediate
 * transliterated buffer is released here before returning.
 */
char *normalize_string_latin(char *str, size_t len, uint64_t options) {
    char *ascii = transliterate(LATIN_ASCII, str, len);

    // No transliteration available: normalize the caller's string directly.
    if (ascii == NULL) {
        return normalize_string_utf8(str, options);
    }

    char *result = normalize_string_utf8(ascii, options);
    free(ascii);
    return result;
}
Example #2
0
/*
 * Add the Latin-script alternatives of a string to a string tree.
 *
 * Up to two candidates are built from the first len bytes of str:
 *
 *   1. Only when NORMALIZE_STRING_LATIN_ASCII is set in options:
 *      transliterate(LATIN_ASCII, ...) followed by normalize_string_utf8().
 *   2. normalize_string_utf8() on a copy of the input, followed by
 *      transliterate(LATIN_ASCII, ...) when NORMALIZE_STRING_LATIN_ASCII is
 *      set, or left as-is otherwise.
 *
 * Candidate 1 is added whenever it could be produced. Candidate 2 is added
 * only if it is not byte-for-byte equal to candidate 1.
 *
 * Every string built here is freed before returning, including those that
 * were passed to string_tree_add_string() (so that function presumably
 * copies its argument -- confirm against its definition).
 */
void add_latin_alternatives(string_tree_t *tree, char *str, size_t len, uint64_t options) {
    
    char *transliterated = NULL;
    char *utf8_normalized = NULL;
    char *prev_string = NULL;

    if (options & NORMALIZE_STRING_LATIN_ASCII) {
        transliterated = transliterate(LATIN_ASCII, str, len);
        if (transliterated != NULL) {
            utf8_normalized = normalize_string_utf8(transliterated, options);
            free(transliterated);
            transliterated = NULL;
        }

        if (utf8_normalized != NULL) {
            string_tree_add_string(tree, utf8_normalized);
            // Keep candidate 1 around so candidate 2 can be de-duplicated against it.
            prev_string = utf8_normalized;
            utf8_normalized = NULL;
        }
    }

    // Work on a NUL-terminated copy limited to len bytes. strndup() can fail;
    // in that case skip the second candidate instead of handing NULL to
    // normalize_string_utf8().
    char *str_copy = strndup(str, len);
    if (str_copy != NULL) {
        utf8_normalized = normalize_string_utf8(str_copy, options);
        free(str_copy);
    }

    if (options & NORMALIZE_STRING_LATIN_ASCII && utf8_normalized != NULL) {
        // NOTE(review): if this transliterate() call returns NULL the
        // normalized string is discarded and no second candidate is added.
        transliterated = transliterate(LATIN_ASCII, utf8_normalized, strlen(utf8_normalized));
        free(utf8_normalized);
        utf8_normalized = NULL;
    } else {
        // Ownership of the normalized buffer moves to transliterated.
        transliterated = utf8_normalized;
        utf8_normalized = NULL;
    }

    if (transliterated != NULL) {
        if (prev_string == NULL || strcmp(prev_string, transliterated) != 0) {
            string_tree_add_string(tree, transliterated);
        }
        free(transliterated);
        transliterated = NULL;
    }

    // free(NULL) is a no-op, so no guard is needed.
    free(prev_string);

}
Example #3
0
/*
 * Python entry point "normalize": takes (object, options), converts the
 * object to unicode, encodes it as UTF-8, runs normalize_string_utf8() on it
 * with the given option bits, and returns the result decoded back into a
 * Python unicode object.
 *
 * Returns a new reference on success. On failure returns NULL with a Python
 * exception set:
 *   - TypeError  if the argument cannot be converted to unicode or encoded;
 *   - ValueError if normalization yields no string or the result is not
 *                valid UTF-8.
 */
static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args) 
{
    PyObject *arg1;
    // The "K" format unit stores an unsigned long long; parse into exactly
    // that type and convert, rather than passing a uint64_t pointer.
    unsigned long long options_arg;
    PyObject *result = NULL;

    if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options_arg)) {
        return NULL;
    }
    uint64_t options = (uint64_t)options_arg;

    PyObject *unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return NULL;
    }

    #ifdef IS_PY3K
        // Python 3 encoding, supported by Python 3.3+

        char *input = PyUnicode_AsUTF8(unistr);

    #else
        // Python 2 encoding

        PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
        if (str == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "Parameter could not be utf-8 encoded");
            goto exit_decref_unistr;
        }

        char *input = PyBytes_AsString(str);

    #endif

    if (input == NULL) {
        // The Python API call that produced input has already set an exception.
        goto exit_decref_str;
    }

    char *normalized = normalize_string_utf8(input, options);

    if (normalized == NULL) {
        // normalize_string_utf8() is plain C and sets no Python exception;
        // returning NULL without one would surface as a SystemError.
        PyErr_SetString(PyExc_ValueError,
                        "String could not be normalized");
        goto exit_decref_str;
    }

    result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), "strict");
    free(normalized);
    if (result == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "Result could not be utf-8 decoded");
    }

    // Success and failure share the same cleanup; result is NULL on failure.
exit_decref_str:
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
exit_decref_unistr:
    Py_XDECREF(unistr);
    return result;
}
Example #4
0
string_tree_t *normalize_string_languages(char *str, uint64_t options, size_t num_languages, char **languages) {
    size_t len = strlen(str);
    string_tree_t *tree = string_tree_new_size(len);

    size_t consumed = 0;

    khash_t(int_set) *scripts = kh_init(int_set);
    char *utf8_normalized = NULL;

    char *ptr = str;

    script_t script;

    while (consumed < len)  {
        string_script_t script_span = get_string_script(ptr, len - consumed);
        script = script_span.script;
        size_t script_len = script_span.len;
        bool is_ascii = script_span.ascii;

        // Shortcut if the string is all ASCII
        if (options & NORMALIZE_STRING_LOWERCASE && is_ascii && script_len == len) {
            utf8_normalized = normalize_string_utf8(str, NORMALIZE_STRING_LOWERCASE);
            if (utf8_normalized != NULL) {
                string_tree_add_string(tree, utf8_normalized);
                string_tree_finalize_token(tree);
                free(utf8_normalized);
                utf8_normalized = NULL;
            }

            kh_destroy(int_set, scripts);
            return tree;
        }

        log_debug("script_len=%zu\n", script_len);

        if (script != SCRIPT_LATIN && script_len > 0) {
            int ret;
            khiter_t key = kh_put(int_set, scripts, (khint_t)script, &ret);
            if (ret < 0) {
                log_error("Error in kh_put\n");
                string_tree_destroy(tree);
                kh_destroy(int_set, scripts);
                return NULL;
            }
        }

        consumed += script_len;
        ptr += script_len;
    }

    add_latin_alternatives(tree, str, len, options);

    size_t non_latin_scripts = kh_size(scripts);

    if (non_latin_scripts > 0) {
        string_tree_t *transliterators = string_tree_new_size(non_latin_scripts);

        khint_t key;
        char *trans_name = NULL;

        kh_foreach_key(scripts, key, {
            script = (script_t)key;
            for (size_t i = 0; i < num_languages; i++) {
                char *lang = languages[i];
                foreach_transliterator(script, lang, trans_name, {
                    string_tree_add_string(transliterators, trans_name);
                })
            }

            foreach_transliterator(script, "", trans_name, {
                string_tree_add_string(transliterators, trans_name);
            })