/*
 * Guess the character set of buf with uchardet.
 *
 * talloc_ctx: talloc parent for the returned string.
 * log:        receives a warning when the detected charset is unusable.
 * buf:        raw bytes to analyze.
 *
 * Returns a copy of the detected charset name allocated on talloc_ctx, or
 * NULL when the detector cannot be created, the data is rejected, no
 * charset is reported, or iconv cannot convert from that charset to UTF-8.
 */
static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
{
    uchardet_t det = uchardet_new();
    if (!det)
        return NULL;

    char *res = NULL;
    if (uchardet_handle_data(det, buf.start, buf.len) == 0) {
        uchardet_data_end(det);
        // Only inspected while det is alive; copied before uchardet_delete().
        const char *name = uchardet_get_charset(det);
        if (name && name[0]) {
            // Probe iconv first so that nothing is allocated on talloc_ctx
            // for a charset that is going to be rejected anyway.
            iconv_t icdsc = iconv_open("UTF-8", name);
            if (icdsc == (iconv_t)(-1)) {
                mp_warn(log, "Charset detected as %s, but not supported by iconv.\n",
                        name);
            } else {
                iconv_close(icdsc);
                res = talloc_strdup(talloc_ctx, name);
            }
        }
    }

    uchardet_delete(det);
    return res;
}
// Releases the uchardet handle held in m_handle, if there is one.
CCharsetDetector::~CCharsetDetector()
{
    if (m_handle == NULL)
        return;

    uchardet_delete(m_handle);
}
/*
 * Feed the remaining content of fp to uchardet and return the detected
 * charset name, lowercased, as a newly allocated string that the caller
 * must free().
 *
 * Any failure (detector creation, read error, detector error, out of
 * memory) prints a message to stderr and terminates with exit(1).
 */
char * detect(FILE *fp)
{
    uchardet_t  handle = uchardet_new();
    char       *charset;
    char        buffer[BUFFER_SIZE];
    size_t      len;
    int         i;

    if (handle == NULL)
    {
        fprintf(stderr, "uchardet-tests: cannot create detector.\n");
        exit(1);
    }

    /* Loop on the fread() result instead of feof(): a read error never
     * sets the EOF flag, so testing feof() alone could spin forever. */
    while ((len = fread(buffer, 1, BUFFER_SIZE, fp)) > 0)
    {
        if (uchardet_handle_data(handle, buffer, len) != 0)
        {
            fprintf(stderr, "uchardet-tests: handle data error.\n");
            exit(1);
        }
    }
    if (ferror(fp))
    {
        fprintf(stderr, "uchardet-tests: read error.\n");
        exit(1);
    }
    uchardet_data_end(handle);

    charset = strdup(uchardet_get_charset(handle));
    if (charset == NULL)
    {
        fprintf(stderr, "uchardet-tests: out of memory.\n");
        exit(1);
    }
    for (i = 0; charset[i]; i++)
    {
        /* Our test files are lowercase. tolower() requires a value
         * representable as unsigned char, hence the cast. */
        charset[i] = (char) tolower((unsigned char) charset[i]);
    }

    uchardet_delete(handle);

    return charset;
}
/**
 * Converts the raw bytes of a tag into a QString.
 *
 * Tags flagged as encoded are always decoded with the ENC_CODEC codec.
 * Other tags use mTextCodec; when it is not set yet, it is detected once by
 * feeding the performer and title values of every track to uchardet, with
 * UTF-8 as the fallback when no codec matches the detected name.
 */
QString TagSetData::decode(const Tag &tag) const
{
    if (tag.encoded)
        return QTextCodec::codecForName(ENC_CODEC)->toUnicode(tag.value);

    if (mTextCodec)
        return mTextCodec->toUnicode(tag.value);

    // No codec chosen so far: guess the code page from the track tags.
    uchardet_t detector = uchardet_new();

    for (int track = 0; track < mTrackCount; ++track)
    {
        const QByteArray &performer = mTags.value(key(track, TAG_PERFORMER)).value;
        const QByteArray &title     = mTags.value(key(track, TAG_TITLE)).value;

        uchardet_handle_data(detector, performer.constData(), performer.size());
        uchardet_handle_data(detector, title.constData(), title.size());
    }

    uchardet_data_end(detector);

    // The codec lookup must happen while the detector is still alive.
    mTextCodec = QTextCodec::codecForName(uchardet_get_charset(detector));
    uchardet_delete(detector);

    if (!mTextCodec)
        mTextCodec = QTextCodec::codecForName("UTF-8");

    return mTextCodec->toUnicode(tag.value);
}
int FileManager::detectCodepage(char* buf, size_t len) { uchardet_t ud = uchardet_new(); uchardet_handle_data(ud, buf, len); uchardet_data_end(ud); const char* cs = uchardet_get_charset(ud); int codepage = EncodingMapper::getInstance()->getEncodingFromString(cs); uchardet_delete(ud); return codepage; }
/*
 * Detect the character encoding of the str_len bytes at in_str.
 *
 * Returns the detected charset name, or NULL when the detector cannot be
 * created, rejects the data, or reports no charset.
 *
 * The name is copied into function-local static storage before the detector
 * is destroyed, so the returned pointer never refers to memory released by
 * uchardet_delete(). Consequences for callers: the result must not be
 * freed, it is overwritten by the next successful call, and the function is
 * not reentrant.
 */
const char * get_file_encoding(const char* in_str, unsigned int str_len){
    static char enc_buf[64];
    const char * enc_name = NULL;
    uchardet_t handler = uchardet_new();

    if (!handler) {
        return NULL;
    }

    if (uchardet_handle_data(handler, in_str, str_len) == 0) {
        const char * detected;

        uchardet_data_end(handler);
        detected = uchardet_get_charset(handler);
        if (detected && detected[0] != 0) {
            /* Bounded manual copy; a longer name would be truncated. */
            unsigned int i = 0;
            while (detected[i] != 0 && i + 1 < sizeof(enc_buf)) {
                enc_buf[i] = detected[i];
                ++i;
            }
            enc_buf[i] = 0;
            enc_name = enc_buf;
        }
    }

    uchardet_delete(handler);
    return enc_name;
}
gchar* guessEncoding(const char* buffer, size_t len) { gchar* result = NULL; #ifdef HAVE_CHARDET uchardet_t cd = uchardet_new(); if (!uchardet_handle_data(cd, buffer, len)) { uchardet_data_end(cd); const char* chardet = uchardet_get_charset(cd); if (chardet && strcmp(chardet, "")!=0) { result = g_strdup(chardet); } } uchardet_delete(cd); #else magic_t cookie = magic_open(MAGIC_MIME); magic_load(cookie, NULL); const char* resp = magic_buffer(cookie, buffer, len); if (!resp) { printf("magic error: %s\n", magic_error(cookie)); goto done; } char* charset = strstr(resp, "charset="); if (!charset) { goto done; } charset += 8; // len of "charset=" result = g_strdup(charset); done: magic_close(cookie); #endif return result; }
/**
 * Looks up a QTextCodec matching the charset uchardet detects in data.
 *
 * @return the codec found for the detected charset name, or 0 when the
 *         detector rejects the data, reports an empty name, or
 *         QTextCodec::codecForName() finds no codec for it.
 */
QTextCodec * Util::detectEncoding( const QByteArray &data )
{
  uchardet_t detector = uchardet_new();
  QTextCodec * result = 0;

  const bool accepted =
      uchardet_handle_data( detector, data.constData(), data.size() ) == 0;

  if ( accepted )
  {
    // Tell the detector that no more data is coming before asking for a result.
    uchardet_data_end( detector );

    const char * name = uchardet_get_charset( detector );
    if ( *name != '\0' )
      result = QTextCodec::codecForName( name );
  }

  uchardet_delete( detector );
  return result;
}
/* * @Function Name : GetFileEncode * @Description : 分析获取文件的文本内容 * @Return Value : 返回操作状态 * @Example : */ bool C_TxtFile::GetFileEncode() { if (m_emEncode != en_unknowEncode) { return true; } if (m_strText.length() > 0) { uchardet_t ud = uchardet_new(); if (uchardet_handle_data(ud, m_strText.c_str(), m_ulTextLen) != 0) { std::cout<<"分析编码失败!"<<std::endl; m_emEncode = en_unknowEncode; return false; } uchardet_data_end(ud); // 获取文件内容编码方式 const char * charset = uchardet_get_charset(ud); if (strcmp(charset, "UTF-7") == 0) { m_emEncode = en_utf_7; uchardet_delete(ud); return true; } if (strcmp(charset, "UTF-8") == 0) { m_emEncode = en_utf_8; uchardet_delete(ud); return true; } if (strcmp(charset, "GB2312") == 0) { m_emEncode = en_gb2312; uchardet_delete(ud); return true; } if (strcmp(charset, "BIG5") == 0) { m_emEncode = en_big5; uchardet_delete(ud); return true; } if (strcmp(charset, "UTF-16") == 0) { if ((unsigned char)m_strText[0] == 0xFE && (unsigned char)m_strText[1] == 0xFF) { m_emEncode = en_utf_16be; } else { if ((unsigned char)m_strText[0] == 0xFF && (unsigned char)m_strText[1] == 0xFE) { m_emEncode = en_utf_16le; } else { m_emEncode = en_utf_16; } } uchardet_delete(ud); return true; } if (strcmp(charset, "UTF-32") == 0) { if ((unsigned char)m_strText[0] == 0x00 && (unsigned char)m_strText[1] == 0x00 && (unsigned char)m_strText[2] == 0xFE && (unsigned char)m_strText[3] == 0xFF) { m_emEncode = en_utf_32be; } else { if ((unsigned char)m_strText[0] == 0xFF && (unsigned char)m_strText[1] == 0xFE && (unsigned char)m_strText[2] == 0x00 && (unsigned char)m_strText[3] == 0x00) { m_emEncode = en_utf_32le; } else { m_emEncode = en_utf_32; } } uchardet_delete(ud); return true; } if (strcmp(charset, "US-ASCII") == 0) { m_emEncode = en_ascii; uchardet_delete(ud); return true; } if (strcmp(charset, "gb18030") == 0) { m_emEncode = en_gb18030; uchardet_delete(ud); return true; } if (*charset == 0) { m_emEncode = en_unknowEncode; uchardet_delete(ud); return true; } m_emEncode = 
en_unknowEncode; uchardet_delete(ud); return true; } else { std::cout<<"文件内容为容"<<std::endl; return false; } }