Esempio n. 1
0
/*
 * Determines the single language that every relevant codepoint of `input`
 * belongs to.
 *
 * handle - varnam instance; used here only to obtain a pooled string buffer.
 * input  - text to inspect; it is copied into the pooled buffer and decoded
 *          as UTF-8.
 *
 * Returns the value produced by get_language() for the inspected codepoints
 * when they all agree. Returns VARNAM_LANG_CODE_UNKNOWN when:
 *   - handle or input is NULL,
 *   - the copied text is blank according to strbuf_is_blank(),
 *   - get_language() reports VARNAM_LANG_CODE_UNKNOWN for any codepoint that
 *     should_skip() did not filter out,
 *   - two inspected codepoints map to different languages, or
 *   - no codepoint was inspected at all (everything was skipped).
 *
 * NOTE(review): decoding stops at UTF8_ERROR exactly as it does at UTF8_END,
 * so malformed input yields the language of whatever was decoded before the
 * error - confirm this best-effort behaviour is intended.
 * NOTE(review): the pooled buffer is not handed back in this function;
 * presumably the pool is reclaimed elsewhere - verify against the pool API.
 */
int
varnam_detect_lang(varnam *handle, const char *input)
{
    strbuf *text;
    utf8_decoder dec;
    int cp;
    int detected = VARNAM_LANG_CODE_UNKNOWN;
    /* 0 means "no language recorded yet"; assumes get_language() never uses
     * 0 for a real language - TODO confirm. */
    int seen = 0;

    if (!handle || !input)
        return VARNAM_LANG_CODE_UNKNOWN;

    text = get_pooled_string (handle);
    strbuf_add (text, input);
    if (strbuf_is_blank (text))
        return VARNAM_LANG_CODE_UNKNOWN;

    utf8_decode_init (text->buffer, (int) text->length, &dec);

    /* Walk the codepoints until the decoder reports end-of-input or an error. */
    while ((cp = utf8_decode_next (&dec)) != UTF8_END && cp != UTF8_ERROR)
    {
        int current;

        if (should_skip (cp))
            continue;

        current = get_language (cp);
        if (current == VARNAM_LANG_CODE_UNKNOWN)
            return VARNAM_LANG_CODE_UNKNOWN;

        /* A second, different language means the text is mixed. */
        if (seen != 0 && seen != current)
            return VARNAM_LANG_CODE_UNKNOWN;

        seen = current;
        detected = current;
    }

    return detected;
}
Esempio n. 2
0
/**
 * Constructs a decoder over the given UTF-8 byte sequence.
 *
 * Stores `loose` in m_loose, resets m_low_surrogate to 0, and initialises the
 * embedded decoder state m_decode through utf8_decode_init() with the supplied
 * buffer and length.
 *
 * @param utf8   Pointer to the input bytes; passed straight to
 *               utf8_decode_init(). NOTE(review): the buffer does not appear
 *               to be copied here, so it presumably must outlive this
 *               object - confirm against utf8_decode_init().
 * @param length Length handed to utf8_decode_init() (presumably a byte
 *               count - TODO confirm).
 * @param loose  Flag saved in m_loose; its effect is defined by the decoding
 *               methods, which are not visible here.
 */
UTF8To16Decoder::UTF8To16Decoder(const char *utf8, int length, bool loose)
    : m_loose(loose), m_low_surrogate(0) {
  utf8_decode_init(&m_decode, utf8, length);
}