Ejemplo n.º 1
0
/*
 * Guess the character set of buf with uchardet.
 *
 * Returns a copy of the detected charset name allocated under talloc_ctx,
 * or NULL when the detector cannot be created, rejects the data, reports an
 * empty name, or names a charset that iconv cannot convert to UTF-8 (a
 * warning is logged in that last case).
 */
static const char *mp_uchardet(void *talloc_ctx, struct mp_log *log, bstr buf)
{
    uchardet_t detector = uchardet_new();
    if (detector == NULL)
        return NULL;

    char *charset = NULL;
    if (uchardet_handle_data(detector, buf.start, buf.len) == 0) {
        uchardet_data_end(detector);

        /* Copy the name while the detector is still alive. */
        charset = talloc_strdup(talloc_ctx, uchardet_get_charset(detector));
        if (charset && charset[0] == '\0')
            charset = NULL;

        if (charset) {
            /* Probe iconv: a name it cannot open is of no use to callers. */
            iconv_t probe = iconv_open("UTF-8", charset);
            if (probe != (iconv_t)(-1)) {
                iconv_close(probe);
            } else {
                mp_warn(log, "Charset detected as %s, but not supported by iconv.\n",
                        charset);
                charset = NULL;
            }
        }
    }

    uchardet_delete(detector);
    return charset;
}
Ejemplo n.º 2
0
/*
 * Detect the character set of the stream fp with uchardet.
 *
 * Reads fp to the end in BUFFER_SIZE chunks and feeds every chunk to the
 * detector. Returns a newly strdup()-allocated, lowercased charset name that
 * the caller must free(). Any failure (detector creation, read error,
 * detector error, out of memory) prints a message to stderr and terminates
 * the process with exit status 1.
 */
char *
detect(FILE *fp)
{
    uchardet_t  handle = uchardet_new();
    char       *charset;
    char        buffer[BUFFER_SIZE];
    size_t      len;
    size_t      i;

    if (handle == NULL)
    {
        fprintf(stderr,
                "uchardet-tests: unable to create detector.\n");
        exit(1);
    }

    /* Loop on fread()'s result rather than feof(): feof() never becomes
     * true after a read error, which would spin forever, and it only turns
     * true after a short read, which would feed a stale chunk. */
    while ((len = fread(buffer, 1, BUFFER_SIZE, fp)) > 0)
    {
        if (uchardet_handle_data(handle, buffer, len) != 0)
        {
            fprintf(stderr,
                    "uchardet-tests: handle data error.\n");
            exit(1);
        }
    }
    if (ferror(fp))
    {
        fprintf(stderr,
                "uchardet-tests: read error.\n");
        exit(1);
    }
    uchardet_data_end(handle);

    /* Copy the name before the handle that owns it is destroyed. */
    charset = strdup(uchardet_get_charset(handle));
    if (charset == NULL)
    {
        fprintf(stderr,
                "uchardet-tests: out of memory.\n");
        exit(1);
    }
    for (i = 0; charset[i]; i++)
    {
        /* Our test files are lowercase. The cast keeps tolower() defined
         * for bytes above 0x7F on platforms where char is signed. */
        charset[i] = (char) tolower((unsigned char) charset[i]);
    }

    uchardet_delete(handle);

    return charset;
}
Ejemplo n.º 3
0
/**
 * Converts a tag's raw bytes to a QString.
 *
 * Tags flagged as encoded are decoded with the ENC_CODEC codec. All other
 * tags use mTextCodec; when it is not set yet, it is determined once by
 * running uchardet over the performer and title of every track, falling
 * back to UTF-8 if no Qt codec matches the detected name.
 */
QString TagSetData::decode(const Tag &tag) const
{
    if (tag.encoded)
        return QTextCodec::codecForName(ENC_CODEC)->toUnicode(tag.value);

    // Codec already chosen (or detected by an earlier call).
    if (mTextCodec)
        return mTextCodec->toUnicode(tag.value);

    // Auto-detect the codepage from the textual tags of all tracks.
    uchardet_t detector = uchardet_new();

    for (int track = 0; track < mTrackCount; ++track)
    {
        const QByteArray &performer = mTags.value(key(track, TAG_PERFORMER)).value;
        const QByteArray &title = mTags.value(key(track, TAG_TITLE)).value;

        uchardet_handle_data(detector, performer.constData(), performer.size());
        uchardet_handle_data(detector, title.constData(), title.size());
    }

    uchardet_data_end(detector);

    mTextCodec = QTextCodec::codecForName(uchardet_get_charset(detector));
    if (!mTextCodec)
        mTextCodec = QTextCodec::codecForName("UTF-8");

    uchardet_delete(detector);

    return mTextCodec->toUnicode(tag.value);
}
Ejemplo n.º 4
0
/// Guess the character set of a file.
///
/// Returns "binary" for files larger than 100 MB or files where more than
/// one byte in eight is a control character other than CR, LF or TAB.
/// With WITH_UCHARDET the file is fed to uchardet in 4096-byte chunks and the
/// first non-empty charset name it reports is returned; without it, only the
/// first 4096 bytes are checked for binary content and "utf-8" is assumed.
std::string Detect(agi::fs::path const& file) {
	agi::read_file_mapping fp(file);

	// If it's over 100 MB it's either binary or big enough that we won't
	// be able to do anything useful with it anyway
	if (fp.size() > 100 * 1024 * 1024)
		return "binary";

	// A byte counts as "binaryish" when it is a control character that does
	// not normally appear in text files.
	auto is_binaryish = [](unsigned char c) {
		return c < 32 && c != '\r' && c != '\n' && c != '\t';
	};

	uint64_t binaryish = 0;

#ifdef WITH_UCHARDET
	agi::scoped_holder<uchardet_t> ud(uchardet_new(), uchardet_delete);
	uint64_t offset = 0;
	while (offset < fp.size()) {
		auto chunk_len = std::min<uint64_t>(4096, fp.size() - offset);
		auto chunk = fp.read(offset, chunk_len);
		uchardet_handle_data(ud, chunk, chunk_len);
		uchardet_data_end(ud);
		// Stop as soon as the detector has settled on a charset
		if (*uchardet_get_charset(ud))
			return uchardet_get_charset(ud);

		offset += chunk_len;

		// A dumb heuristic to detect binary files
		for (size_t i = 0; i < chunk_len; ++i) {
			if (is_binaryish(chunk[i]))
				++binaryish;
		}

		if (binaryish > offset / 8)
			return "binary";
	}
	return uchardet_get_charset(ud);
#else
	auto sample_len = std::min<uint64_t>(4096, fp.size());
	auto sample = fp.read(0, sample_len);
	for (size_t i = 0; i < sample_len; ++i) {
		if (is_binaryish(sample[i]))
			++binaryish;
	}

	if (binaryish > sample_len / 8)
		return "binary";
	return "utf-8";
#endif
}
Ejemplo n.º 5
0
int FileManager::detectCodepage(char* buf, size_t len)
{
	uchardet_t ud = uchardet_new();
	uchardet_handle_data(ud, buf, len);
	uchardet_data_end(ud);
	const char* cs = uchardet_get_charset(ud);
	int codepage = EncodingMapper::getInstance()->getEncodingFromString(cs);
	uchardet_delete(ud);
	return codepage;
}
Ejemplo n.º 6
0
/*
 * Detect the character encoding of the first str_len bytes of in_str.
 *
 * Returns the detected charset name, or NULL when the detector cannot be
 * created, rejects the data, or cannot decide on a charset.
 *
 * The returned pointer refers to static storage inside this function: the
 * caller must not free it, it is overwritten by the next successful call,
 * and the function is therefore not reentrant.
 */
const char * get_file_encoding(const char* in_str, unsigned int str_len){
    /* The string returned by uchardet_get_charset() belongs to the detector
     * handle and dies with uchardet_delete(), so the name is copied here
     * before the handle is released. */
    static char enc_buf[64];
    const char * enc_name = NULL;
    uchardet_t handler = uchardet_new();

    if (handler == NULL) {
        return NULL;
    }

    if (uchardet_handle_data(handler, in_str, str_len) == 0) {
        const char * detected;

        uchardet_data_end(handler);
        detected = uchardet_get_charset(handler);
        if (detected != NULL && detected[0] != '\0') {
            unsigned int i = 0;

            /* Bounded copy; names longer than the buffer are truncated. */
            while (i < sizeof(enc_buf) - 1 && detected[i] != '\0') {
                enc_buf[i] = detected[i];
                ++i;
            }
            enc_buf[i] = '\0';
            enc_name = enc_buf;
        }
    }

    uchardet_delete(handler);
    return enc_name;
}
Ejemplo n.º 7
0
/*
 * Guess the character encoding of the first len bytes of buffer.
 *
 * With HAVE_CHARDET the guess comes from uchardet; otherwise it is taken from
 * the "charset=" part of the string libmagic returns for MAGIC_MIME.
 * Returns a g_strdup()-allocated name, or NULL when no encoding could be
 * determined.
 */
gchar* guessEncoding(const char* buffer, size_t len) {
  gchar* encoding = NULL;
#ifdef HAVE_CHARDET
  uchardet_t detector = uchardet_new();

  if (uchardet_handle_data(detector, buffer, len) == 0) {
    uchardet_data_end(detector);

    const char* name = uchardet_get_charset(detector);

    /* An empty name is treated as "no result". */
    if (name != NULL && name[0] != '\0') {
      encoding = g_strdup(name);
    }
  }

  uchardet_delete(detector);
#else
  magic_t cookie = magic_open(MAGIC_MIME);
  magic_load(cookie, NULL);

  const char* mime = magic_buffer(cookie, buffer, len);

  if (mime == NULL) {
    printf("magic error: %s\n", magic_error(cookie));
  } else {
    const char* marker = strstr(mime, "charset=");

    if (marker != NULL) {
      /* Skip the 8 characters of "charset=" and keep the rest. */
      encoding = g_strdup(marker + 8);
    }
  }

  magic_close(cookie);
#endif
  return encoding;
}
Ejemplo n.º 8
0
	/**
	 * Detects the character set of the file i_fileName.
	 *
	 * @param i_fileName path of the file to inspect.
	 * @return the charset name reported by uchardet; "ascii/unknown" when the
	 *         detector reports an empty name; "error" when the detector
	 *         rejects the data; "unknown" when there is no detector handle,
	 *         the file cannot be opened, or the read buffer is unavailable.
	 */
	string CCharsetDetector::detect(string i_fileName)
	{
		Ptr<CFile> file = new CFile();

		if (NULL == m_handle || !file->open(i_fileName, CFile::READ))
		{
			return "unknown";
		}

		uint8_t * pBuffer = new uint8_t[BUFF_SIZE];
		if (NULL == pBuffer)
		{
			return "unknown";
		}

		// m_handle is shared by every call: clear whatever a previous
		// detection left behind so this file is judged on its own content.
		uchardet_reset(m_handle);

		size_t len = 0;
		while ((len = file->read(pBuffer, BUFF_SIZE)) > 0)
		{
			if (uchardet_handle_data(m_handle, (const char*)pBuffer, len) != 0)
			{
				Log::err("Unable to detect charset of file '%s'\n", file->getFileName().c_str());

				// Release the read buffer on the error path as well.
				delete [] pBuffer;
				return "error";
			}
		}

		delete [] pBuffer;

		uchardet_data_end(m_handle);

		const char * pCharset = uchardet_get_charset(m_handle);

		// An empty name means the detector could not decide.
		if (*pCharset)
		{
			return string(pCharset);
		}

		return "ascii/unknown";
	}
Ejemplo n.º 9
0
/**
 * Looks up the Qt text codec for the encoding uchardet detects in data.
 *
 * Returns a null pointer when the detector rejects the data, reports an empty
 * charset name, or names a charset for which QTextCodec::codecForName finds
 * no codec.
 */
QTextCodec * Util::detectEncoding( const QByteArray &data )
{
    uchardet_t detector = uchardet_new();
    QTextCodec * result = 0;

    const bool accepted =
        uchardet_handle_data( detector, data.constData(), data.size() ) == 0;

    if ( accepted )
    {
        // Tell the encoding detector that no more data will follow.
        uchardet_data_end( detector );

        const char * name = uchardet_get_charset( detector );

        if ( *name )
        {
            result = QTextCodec::codecForName( name );
        }
    }

    uchardet_delete( detector );
    return result;
}
Ejemplo n.º 10
0
/*
 * @Function Name	: GetFileEncode
 * @Description		: 分析获取文件的文本内容
 * @Return Value	: 返回操作状态
 * @Example			:
 */
bool C_TxtFile::GetFileEncode() {
	if (m_emEncode != en_unknowEncode) {
		return true;
	}

	if (m_strText.length() > 0) {
		uchardet_t ud = uchardet_new();
		if (uchardet_handle_data(ud, m_strText.c_str(), m_ulTextLen) != 0) {
			std::cout<<"分析编码失败!"<<std::endl;
			m_emEncode = en_unknowEncode;

			return false;
		}

		uchardet_data_end(ud);

		// 获取文件内容编码方式
		const char * charset = uchardet_get_charset(ud);
		if (strcmp(charset, "UTF-7") == 0) {
			m_emEncode = en_utf_7;
			uchardet_delete(ud);
			return true;
		}
		if (strcmp(charset, "UTF-8") == 0) {
			m_emEncode = en_utf_8;
			uchardet_delete(ud);
			return true;
		}
		if (strcmp(charset, "GB2312") == 0) {
			m_emEncode = en_gb2312;
			uchardet_delete(ud);
			return true;
		}
		if (strcmp(charset, "BIG5") == 0) {
			m_emEncode = en_big5;
			uchardet_delete(ud);
			return true;
		}
		if (strcmp(charset, "UTF-16") == 0) {

			if ((unsigned char)m_strText[0] == 0xFE && (unsigned char)m_strText[1] == 0xFF) {
				m_emEncode = en_utf_16be;
			} else {
				if ((unsigned char)m_strText[0] == 0xFF && (unsigned char)m_strText[1] == 0xFE) {
					m_emEncode = en_utf_16le;
				} else {
					m_emEncode = en_utf_16;
				}
			}
			uchardet_delete(ud);
			return true;
		}
		if (strcmp(charset, "UTF-32") == 0) {

			if ((unsigned char)m_strText[0] == 0x00 &&
					(unsigned char)m_strText[1] == 0x00 &&
					(unsigned char)m_strText[2] == 0xFE &&
					(unsigned char)m_strText[3] == 0xFF) {
				m_emEncode = en_utf_32be;
			} else {
				if ((unsigned char)m_strText[0] == 0xFF &&
						(unsigned char)m_strText[1] == 0xFE &&
						(unsigned char)m_strText[2] == 0x00 &&
						(unsigned char)m_strText[3] == 0x00) {
					m_emEncode = en_utf_32le;
				} else {
					m_emEncode = en_utf_32;
				}
			}
			uchardet_delete(ud);
			return true;
		}
		if (strcmp(charset, "US-ASCII") == 0) {
			m_emEncode = en_ascii;
			uchardet_delete(ud);
			return true;
		}
		if (strcmp(charset, "gb18030") == 0) {
			m_emEncode = en_gb18030;
			uchardet_delete(ud);
			return true;
		}
		if (*charset == 0) {
			m_emEncode = en_unknowEncode;
			uchardet_delete(ud);
			return true;
		}

		m_emEncode = en_unknowEncode;
		uchardet_delete(ud);
		return true;

	} else {
		std::cout<<"文件内容为容"<<std::endl;
		return false;
	}
}