示例#1
0
/*
 * Convert everything readable from `fp` from encoding `from` to encoding
 * `to` and write the converted bytes to stdout.
 *
 *   fp            input stream, consumed in chunks of up to INBUFSIZE bytes
 *   from, to      encoding names passed to iconv_open(to, from)
 *   silent        when true, the closing "invalid characters" warning is
 *                 not printed
 *   hide_invalid  when true, ICONV_SET_DISCARD_ILSEQ is set to 1 on the
 *                 conversion descriptor via iconvctl()
 *
 * Every failure is fatal: the function exits through err()/errx().
 * The per-call invalid-character count reported by __iconv() is added to
 * `invalids`, which is declared outside this function.
 */
static void
do_conv(FILE *fp, const char *from, const char *to, bool silent,
    bool hide_invalid)
{
	iconv_t cd;
	char inbuf[INBUFSIZE], outbuf[OUTBUFSIZE], *out;
	char *in;
	size_t inbytes, outbytes, ret;
	int saved_errno;

	if ((cd = iconv_open(to, from)) == (iconv_t)-1)
		err(EXIT_FAILURE, "iconv_open(%s, %s)", to, from);

	if (hide_invalid) {
		int arg = 1;

		if (iconvctl(cd, ICONV_SET_DISCARD_ILSEQ, (void *)&arg) == -1)
			err(1, NULL);
	}
	while ((inbytes = fread(inbuf, 1, INBUFSIZE, fp)) > 0) {
		in = inbuf;
		while (inbytes > 0) {
			size_t inval;

			out = outbuf;
			outbytes = OUTBUFSIZE;
			ret = __iconv(cd, &in, &inbytes, &out, &outbytes,
			    0, &inval);
			/*
			 * Capture errno now: the fwrite() below may overwrite
			 * it before the conversion result is classified.
			 */
			saved_errno = errno;
			invalids += inval;
			/* Emit whatever was converted, even on failure. */
			if (outbytes < OUTBUFSIZE)
				(void)fwrite(outbuf, 1, OUTBUFSIZE - outbytes,
				    stdout);
			/* E2BIG only means outbuf filled up; just loop again. */
			if (ret == (size_t)-1 && saved_errno != E2BIG) {
				/*
				 * Anything other than EINVAL is fatal, and so
				 * is EINVAL when no input was consumed at all
				 * (refilling the buffer could not help).
				 */
				if (saved_errno != EINVAL || in == inbuf) {
					errno = saved_errno;
					err(EXIT_FAILURE, "iconv()");
				}

				/*
				 * incomplete input character: move the
				 * unconsumed tail to the front of inbuf and
				 * top the buffer up from the stream.
				 */
				(void)memmove(inbuf, in, inbytes);
				ret = fread(inbuf + inbytes, 1,
				    INBUFSIZE - inbytes, fp);
				if (ret == 0) {
					/* fflush() may overwrite errno too. */
					saved_errno = errno;
					fflush(stdout);
					if (feof(fp))
						errx(EXIT_FAILURE,
						    "unexpected end of file; "
						    "the last character is "
						    "incomplete.");
					else {
						errno = saved_errno;
						err(EXIT_FAILURE, "fread()");
					}
				}
				in = inbuf;
				inbytes += ret;
			}
		}
	}
	/* reset the shift state of the output buffer */
	outbytes = OUTBUFSIZE;
	out = outbuf;
	ret = iconv(cd, NULL, NULL, &out, &outbytes);
	if (ret == (size_t)-1)
		err(EXIT_FAILURE, "iconv()");
	if (outbytes < OUTBUFSIZE)
		(void)fwrite(outbuf, 1, OUTBUFSIZE - outbytes, stdout);

	if (invalids > 0 && !silent)
		warnx("warning: invalid characters: %llu", invalids);

	iconv_close(cd);
}
示例#2
0
/**
 * Prepare the converter for a fromCharset -> toCharset conversion.
 *
 * The iconv descriptor held in m_iconv is reused when it is already open
 * for the same charset pair; otherwise any open descriptor is closed and
 * a new one is opened.
 *
 * @param fromCharset source charset name; may be NULL only if m_iconv is
 *  already open
 * @param toCharset target charset name; may be NULL only if m_iconv is
 *  already open
 * @return true when the converter is ready (or EQ2 reports both names as
 *  equal); false when an argument is NULL with no open descriptor, when
 *  opening the descriptor fails, or when built without HAVE_H_ICONV.
 *  m_errmsg is set on every false return.
 */
bool charset_conv::update_begin(const char* fromCharset,
	const char* toCharset)
{
#ifdef  HAVE_H_ICONV
	// Nothing to set up when EQ2 reports both charset names as equal.
	// NOTE(review): EQ2 is defined elsewhere and runs before the NULL
	// check below -- confirm it tolerates NULL arguments.
	if (EQ2(fromCharset, toCharset))
		return (true);

	if (fromCharset == NULL || toCharset == NULL)
	{
		// Without explicit charsets, keep using an already opened
		// descriptor if there is one.
		if (m_iconv != (iconv_t) -1)
			return (true);

		logger_error("input invalid, from: %s, to: %s, m_conv: %s",
			fromCharset ? fromCharset : "null",
			toCharset ? toCharset : "null",
			m_iconv == (iconv_t) -1 ? "invalid" : "valid");
		m_errmsg = "input invalid";
		return (false);
	}

	// If the source is UTF-8, m_pUtf8Pre starts matching at the first
	// byte of UTF8_HEADER; otherwise it starts at the last byte '\0',
	// which skips the UTF-8 header matching step altogether.
	if (EQ(fromCharset, "utf-8") || EQ(fromCharset, "utf8"))
		m_pUtf8Pre = UTF8_HEADER;
	else
		m_pUtf8Pre = &UTF8_HEADER[3];

	// The open descriptor already serves this charset pair: reuse it.
	if (m_iconv != (iconv_t) -1
		&& EQ(m_fromCharset, fromCharset)
		&& EQ(m_toCharset, toCharset))
	{
		return (true);
	}

	// Record the new pair before reopening; note that the names stay
	// recorded even if the open below fails.
	SCOPY(m_fromCharset, fromCharset, sizeof(m_fromCharset));
	SCOPY(m_toCharset, toCharset, sizeof(m_toCharset));

	if (m_iconv != (iconv_t) -1)
		__iconv_close(m_iconv);
	m_iconv = __iconv_open(toCharset, fromCharset);
	if (m_iconv == (iconv_t) -1)
	{
		logger_error("iconv_open(%s, %s) error(%s)",
			toCharset, fromCharset, acl_last_serror());
		m_errmsg.format("iconv_open(%s, %s) error(%s)",
			toCharset, fromCharset, acl_last_serror());
		return (false);
	}
	else
	{
#ifdef WIN32
# ifndef USE_WIN_ICONV
		// Issue three control requests with the value 1; their
		// return values are not checked.
		int  n = 1;
		__iconvctl(m_iconv, ICONV_TRIVIALP, &n);

		n = 1;
		__iconvctl(m_iconv, ICONV_SET_DISCARD_ILSEQ, &n);

		n = 1;
		__iconvctl(m_iconv, ICONV_SET_TRANSLITERATE, &n);
# endif // USE_WIN_ICONV
#endif

		// Call the converter once with null buffers and zero lengths
		// (presumably to put the new descriptor into its initial
		// state); the signature differs per platform, hence the
		// preprocessor branches.
		char *pNil = NULL;
		size_t zero = 0;
#ifdef	WIN32
# ifdef USE_WIN_ICONV
		__iconv(m_iconv, (const char**) &pNil, &zero, &pNil, &zero);
# else
		__iconv(m_iconv, (const char**) &pNil, &zero, &pNil, &zero, NULL);
# endif // USE_WIN_ICONV
#elif defined(ACL_SUNOS5) || defined(ACL_FREEBSD)
		__iconv(m_iconv, (const char**) &pNil, &zero, &pNil, &zero);
#else
		__iconv(m_iconv, &pNil, &zero, &pNil, &zero);
#endif
		return (true);
	}
#else
	logger_error("no iconv lib");
	m_errmsg = "no iconv lib";
	return (false);
#endif
}
示例#3
0
/**
 * Feed one chunk of input through the converter and append the result
 * to *out.
 *
 * @param in input bytes; a NULL value is reported through logger_fatal
 * @param len number of bytes available at in
 * @param out string that receives the output; a NULL value is reported
 *  through logger_fatal
 * @return false only when m_iconv is not open or when built without
 *  HAVE_H_ICONV (m_errmsg is set in both cases); true on every other
 *  path, including those where invalid or incomplete input was met.
 */
bool charset_conv::update(const char* in, size_t len, acl::string* out)
{
#ifdef  HAVE_H_ICONV
	// NOTE(review): the code below dereferences in/out, so logger_fatal
	// presumably does not return -- confirm.
	if (in == NULL)
		logger_fatal("in null");
	if (out == NULL)
		logger_fatal("out null");

	// Identical source and target charset: pass the data through as-is.
	if (EQ(m_fromCharset, m_toCharset))
	{
		out->append(in, len);
		return (true);
	}

	if (m_iconv == (iconv_t) -1)
	{
		logger_error("m_iconv invalid");
		m_errmsg = "m_iconv invalid";
		return (false);
	}

	// Strip the UTF-8 lead marker that some UTF-8 documents start with.
	// m_pUtf8Pre is a cursor into UTF8_HEADER that survives across calls,
	// so the marker may be split over several input chunks.
	if (*m_pUtf8Pre)
	{
		while (len > 0)
		{
			if (*m_pUtf8Pre == 0x00)
				break;
			else if (*m_pUtf8Pre != *in)
			{
				// Mismatch: UTF-8 prefix matching must be disabled
				m_pUtf8Pre = &UTF8_HEADER[3];
				break;
			}
			m_pUtf8Pre++;
			in++;
			len--;
		}
	}

	// The whole chunk was consumed by prefix matching.
	if (len == 0)
		return (true);

	if (m_pInBuf == NULL)
		m_pInBuf = acl_vstring_alloc(len);

	if (m_pOutBuf == NULL)
		m_pOutBuf = acl_vstring_alloc(len);
	else
		ACL_VSTRING_SPACE(m_pOutBuf, len);

	// Buffer the input data first
	// NOTE(review): assuming UTF8_HEADER[3] is the terminating '\0',
	// *m_pUtf8Pre is always 0 when this line is reached with len > 0, so
	// the memcpy below looks unreachable and prefix bytes matched before
	// a mismatch appear to be dropped -- confirm intended behavior.
	if (*m_pUtf8Pre && m_pUtf8Pre - UTF8_HEADER > 0)
		acl_vstring_memcpy(m_pInBuf, UTF8_HEADER,
			m_pUtf8Pre - UTF8_HEADER);
	acl_vstring_memcat(m_pInBuf, in, len);
	ACL_VSTRING_TERMINATE(m_pInBuf);

	char  *pIn, *pOut;
	size_t ret, nIn, nOut;

	// Convert until m_pInBuf is drained or a branch below breaks out.
	while (true)
	{
		nIn  = LEN(m_pInBuf);
		if (nIn == 0)
			break;
		pIn  = STR(m_pInBuf);
		pOut = STR(m_pOutBuf);
		nOut = SIZE(m_pOutBuf);

		// The converter signature differs per platform.
#ifdef	WIN32
# ifdef USE_WIN_ICONV
		ret = __iconv(m_iconv, (const char**) &pIn, &nIn,
				&pOut, &nOut);
# else
		int   err;
		ret = __iconv(m_iconv, (const char**) &pIn, &nIn,
				&pOut, &nOut, &err);
		errno = err;
# endif // USE_WIN_ICONV
#elif defined(ACL_SUNOS5) || defined(ACL_FREEBSD)
		ret = __iconv(m_iconv, (const char**) &pIn, &nIn, &pOut, &nOut);
#else
		ret = __iconv(m_iconv, &pIn, &nIn, &pOut, &nOut);
#endif


		if (ret != (size_t) -1)
		{
			// Success: hand over the produced bytes.
			if ((ret = SIZE(m_pOutBuf) - nOut) > 0)
				out->append(STR(m_pOutBuf), ret);
			else  // xxx: nothing was produced, append the raw input instead
				out->append(in, len);
			ACL_VSTRING_RESET(m_pInBuf);
			break;
		}
		else if (errno == E2BIG)
		{
			// Output buffer too small: flush what was produced, keep
			// the unconverted tail and retry with a larger buffer.
			if ((ret = SIZE(m_pOutBuf) - nOut) > 0)
				out->append(STR(m_pOutBuf), ret);
			if (pIn > STR(m_pInBuf) && nIn < LEN(m_pInBuf))
				acl_vstring_memmove(m_pInBuf, pIn, nIn);
			// Enlarge the output buffer
			ACL_VSTRING_SPACE(m_pOutBuf, SIZE(m_pOutBuf) * 2);
			continue;
		}
		else if (errno == EILSEQ)
		{
			char *pNil = NULL;
			size_t zero = 0;

			// Reset the state; seems to be of little use
#ifdef	WIN32
# ifdef USE_WIN_ICONV
			__iconv(m_iconv, (const char**) &pNil,
				&zero, &pNil, &zero);
# else
			__iconv(m_iconv, (const char**) &pNil,
				&zero, &pNil, &zero, NULL);
# endif
#elif defined(ACL_SUNOS5) || defined(ACL_FREEBSD)
			__iconv(m_iconv, (const char**) &pNil,
				&zero, &pNil, &zero);
#else
			__iconv(m_iconv, &pNil, &zero, &pNil, &zero);
#endif

			// An invalid multibyte sequence was hit; pIn points at
			// the first invalid position

			// Copy out the data converted so far first
			if ((ret = SIZE(m_pOutBuf) - nOut) > 0)
				out->append(STR(m_pOutBuf), ret);

			if (nIn == 0)
			{
				ACL_VSTRING_RESET(m_pInBuf);
				break;
			}

			acl_assert(pIn >= STR(m_pInBuf));

			// Skip the invalid byte
			(*out) += (char)(*pIn); // copy the invalid byte through as-is
			nIn--;
			pIn++;
			// Keep the remaining bytes for the next loop iteration.
			if (nIn > 0)
				acl_vstring_memmove(m_pInBuf, pIn, nIn);
			else
				ACL_VSTRING_RESET(m_pInBuf);
		}
		else if (errno == EINVAL)
		{
			char *pNil = NULL;
			size_t zero = 0;

			// Reset the state; seems to be of little use
#ifdef	WIN32
# ifdef USE_WIN_ICONV
			__iconv(m_iconv, (const char**) &pNil,
				&zero, &pNil, &zero);
# else
			__iconv(m_iconv, (const char**) &pNil,
				&zero, &pNil, &zero, NULL);
# endif // USE_WIN_ICONV
#elif defined(ACL_SUNOS5) || defined(ACL_FREEBSD)
			__iconv(m_iconv, (const char**) &pNil, &zero, &pNil, &zero);
#else
			__iconv(m_iconv, &pNil, &zero, &pNil, &zero);
#endif

			// The input multibyte sequence is incomplete; pIn points
			// at the incomplete position

			// Copy out the data converted so far first
			if ((ret = SIZE(m_pOutBuf) - nOut) > 0)
				out->append(STR(m_pOutBuf), ret);

			// Move the unconverted data to the start of the buffer;
			// it stays in m_pInBuf, where the next call appends its
			// input
			if (nIn > 0)
				acl_vstring_memmove(m_pInBuf, pIn, nIn);
			else
				ACL_VSTRING_RESET(m_pInBuf);
			break;
		}
		else if (LEN(m_pInBuf) > 0)
		{
			// Some other error with data still buffered: the
			// m_addInvalid flag decides whether the buffered bytes
			// are copied through directly
			if (m_addInvalid)
			{
				out->append(STR(m_pInBuf), LEN(m_pInBuf));
				ACL_VSTRING_RESET(m_pInBuf);
			}
			break;
		}
		else
			break;
	}

	return (true);
#else
	(void) in;
	(void) len;
	(void) out;
	logger_error("no iconv lib");
	m_errmsg = "no iconv lib";
	return (false);
#endif
}
示例#4
0
/*
 * Stream `fp` through an iconv conversion from charset `from` to charset
 * `to`, writing the converted bytes to stdout.
 *
 *   silent        non-zero suppresses the final invalid-character warning
 *   hide_invalid  non-zero passes __ICONV_F_HIDE_INVALID to __iconv()
 *
 * All failures terminate the program through err()/errx().
 */
static void
do_conv(FILE *fp, const char *from, const char *to, int silent,
	int hide_invalid)
{
	char inbuf[INBUFSIZE], outbuf[OUTBUFSIZE];
	const u_int32_t conv_flags =
	    hide_invalid ? __ICONV_F_HIDE_INVALID : 0;
	size_t bad_total = 0;
	size_t in_left, out_left;
	const char *src;
	char *dst;
	ssize_t rc;
	iconv_t conv;

	conv = iconv_open(to, from);
	if (conv == (iconv_t)-1)
		err(EXIT_FAILURE, "iconv_open(%s, %s)", to, from);

	for (;;) {
		in_left = fread(inbuf, 1, INBUFSIZE, fp);
		if (in_left == 0)
			break;

		src = inbuf;
		/* in_left is non-zero here, so the body runs at least once. */
		do {
			size_t bad_now;

			dst = outbuf;
			out_left = OUTBUFSIZE;
			rc = __iconv(conv, &src, &in_left, &dst, &out_left,
				     conv_flags, &bad_now);
			bad_total += bad_now;

			/* E2BIG just means outbuf is full; go round again. */
			if (rc == -1 && errno != E2BIG) {
				/*
				 * Plain iconv(3) would lose the invalid
				 * character count at this point, which is
				 * why __iconv() is used instead.
				 */
				if (errno != EINVAL || src == inbuf)
					err(EXIT_FAILURE, "iconv()");

				/*
				 * The input ends inside a character: slide
				 * the unread tail to the front of inbuf and
				 * refill the rest from the stream.
				 */
				memmove(inbuf, src, in_left);
				rc = fread(inbuf + in_left, 1,
					   INBUFSIZE - in_left, fp);
				if (rc == 0) {
					if (!feof(fp))
						err(EXIT_FAILURE, "fread()");
					errx(EXIT_FAILURE, "iconv(): %s",
					     strerror(EINVAL));
				}
				src = inbuf;
				in_left += rc;
			}

			if (out_left < OUTBUFSIZE)
				fwrite(outbuf, 1, OUTBUFSIZE - out_left,
				       stdout);
		} while (in_left > 0);
	}

	/* Flush the converter's shift state into the output. */
	dst = outbuf;
	out_left = OUTBUFSIZE;
	rc = iconv(conv, NULL, NULL, &dst, &out_left);
	if (rc == -1)
		err(EXIT_FAILURE, "iconv()");
	if (out_left < OUTBUFSIZE)
		fwrite(outbuf, 1, OUTBUFSIZE - out_left, stdout);

	if (!silent && bad_total > 0)
		warnx("warning: invalid characters: %lu",
		      (unsigned long)bad_total);

	iconv_close(conv);
}