/**
 * Detects the single language that the characters of `input` belong to.
 *
 * The input is copied into a pooled string buffer obtained from `handle`
 * and decoded one UTF-8 codepoint at a time. Codepoints for which
 * should_skip() reports true are ignored; every remaining codepoint is
 * classified with get_language().
 *
 * @param handle  varnam handle used to obtain the pooled string buffer.
 * @param input   text to inspect.
 *
 * @return the language code shared by all classified codepoints, or
 *         VARNAM_LANG_CODE_UNKNOWN when:
 *           - `handle` or `input` is NULL,
 *           - the input is blank according to strbuf_is_blank(),
 *           - any codepoint classifies as VARNAM_LANG_CODE_UNKNOWN,
 *           - two codepoints classify as different languages, or
 *           - no codepoint was classified at all.
 *
 * Decoding stops at UTF8_END or UTF8_ERROR; in both cases the language
 * observed up to that point is returned.
 */
int varnam_detect_lang(varnam *handle, const char *input)
{
    strbuf *text;
    utf8_decoder dec;
    int cp;
    int current = VARNAM_LANG_CODE_UNKNOWN;
    int previous = 0; /* 0 acts as "no codepoint classified yet" */

    if (!handle || !input)
        return VARNAM_LANG_CODE_UNKNOWN;

    text = get_pooled_string (handle);
    strbuf_add (text, input);
    if (strbuf_is_blank (text))
        return VARNAM_LANG_CODE_UNKNOWN;

    utf8_decode_init (text->buffer, (int) text->length, &dec);

    while ((cp = utf8_decode_next (&dec)) != UTF8_END && cp != UTF8_ERROR) {
        if (should_skip (cp))
            continue;

        current = get_language (cp);
        if (current == VARNAM_LANG_CODE_UNKNOWN)
            return VARNAM_LANG_CODE_UNKNOWN;

        /* A change of language between classified codepoints means the
         * input mixes characters from multiple languages. */
        if (previous != 0 && previous != current)
            return VARNAM_LANG_CODE_UNKNOWN;

        previous = current;
    }

    return current;
}
/**
 * Constructs a decoder over the UTF-8 input `utf8`.
 *
 * Stores `loose` in m_loose, resets m_low_surrogate to 0 and initialises
 * the embedded decode state m_decode via utf8_decode_init().
 *
 * @param utf8    pointer to the UTF-8 input handed to utf8_decode_init().
 * @param length  length of the input (presumably in bytes -- confirm
 *                against utf8_decode_init()).
 * @param loose   decoding-mode flag kept in m_loose; its exact effect is
 *                determined by code outside this constructor.
 *
 * NOTE(review): the buffer is only passed on as a pointer here; it
 * presumably must outlive the decoder -- confirm against
 * utf8_decode_init().
 */
UTF8To16Decoder::UTF8To16Decoder(const char *utf8, int length, bool loose)
    : m_loose(loose),
      m_low_surrogate(0)
{
    utf8_decode_init(&m_decode, utf8, length);
}