static void do_conv(FILE *fp, const char *from, const char *to, bool silent, bool hide_invalid) { iconv_t cd; char inbuf[INBUFSIZE], outbuf[OUTBUFSIZE], *out; char *in; size_t inbytes, outbytes, ret; if ((cd = iconv_open(to, from)) == (iconv_t)-1) err(EXIT_FAILURE, "iconv_open(%s, %s)", to, from); if (hide_invalid) { int arg = 1; if (iconvctl(cd, ICONV_SET_DISCARD_ILSEQ, (void *)&arg) == -1) err(1, NULL); } while ((inbytes = fread(inbuf, 1, INBUFSIZE, fp)) > 0) { in = inbuf; while (inbytes > 0) { size_t inval; out = outbuf; outbytes = OUTBUFSIZE; ret = __iconv(cd, &in, &inbytes, &out, &outbytes, 0, &inval); invalids += inval; if (outbytes < OUTBUFSIZE) (void)fwrite(outbuf, 1, OUTBUFSIZE - outbytes, stdout); if (ret == (size_t)-1 && errno != E2BIG) { if (errno != EINVAL || in == inbuf) err(EXIT_FAILURE, "iconv()"); /* incomplete input character */ (void)memmove(inbuf, in, inbytes); ret = fread(inbuf + inbytes, 1, INBUFSIZE - inbytes, fp); if (ret == 0) { fflush(stdout); if (feof(fp)) errx(EXIT_FAILURE, "unexpected end of file; " "the last character is " "incomplete."); else err(EXIT_FAILURE, "fread()"); } in = inbuf; inbytes += ret; } } } /* reset the shift state of the output buffer */ outbytes = OUTBUFSIZE; out = outbuf; ret = iconv(cd, NULL, NULL, &out, &outbytes); if (ret == (size_t)-1) err(EXIT_FAILURE, "iconv()"); if (outbytes < OUTBUFSIZE) (void)fwrite(outbuf, 1, OUTBUFSIZE - outbytes, stdout); if (invalids > 0 && !silent) warnx("warning: invalid characters: %llu", invalids); iconv_close(cd); }
bool charset_conv::update_begin(const char* fromCharset, const char* toCharset) { #ifdef HAVE_H_ICONV if (EQ2(fromCharset, toCharset)) return (true); if (fromCharset == NULL || toCharset == NULL) { if (m_iconv != (iconv_t) -1) return (true); logger_error("input invalid, from: %s, to: %s, m_conv: %s", fromCharset ? fromCharset : "null", toCharset ? toCharset : "null", m_iconv == (iconv_t) -1 ? "invalid" : "valud"); m_errmsg = "input invalid"; return (false); } // 如果源是 UTF-8 编码,则 m_pTuf8Pre 从 UTF8_HEADER 头部第 // 一个字节开始进行匹配,否则从最后一个字节 '\0' 开始匹配, // 即跳过 UTF-8 头部匹配过程 if (EQ(fromCharset, "utf-8") || EQ(fromCharset, "utf8")) m_pUtf8Pre = UTF8_HEADER; else m_pUtf8Pre = &UTF8_HEADER[3]; if (m_iconv != (iconv_t) -1 && EQ(m_fromCharset, fromCharset) && EQ(m_toCharset, toCharset)) { return (true); } SCOPY(m_fromCharset, fromCharset, sizeof(m_fromCharset)); SCOPY(m_toCharset, toCharset, sizeof(m_toCharset)); if (m_iconv != (iconv_t) -1) __iconv_close(m_iconv); m_iconv = __iconv_open(toCharset, fromCharset); if (m_iconv == (iconv_t) -1) { logger_error("iconv_open(%s, %s) error(%s)", toCharset, fromCharset, acl_last_serror()); m_errmsg.format("iconv_open(%s, %s) error(%s)", toCharset, fromCharset, acl_last_serror()); return (false); } else { #ifdef WIN32 # ifndef USE_WIN_ICONV int n = 1; __iconvctl(m_iconv, ICONV_TRIVIALP, &n); n = 1; __iconvctl(m_iconv, ICONV_SET_DISCARD_ILSEQ, &n); n = 1; __iconvctl(m_iconv, ICONV_SET_TRANSLITERATE, &n); # endif // USE_WIN_ICONV #endif char *pNil = NULL; size_t zero = 0; #ifdef WIN32 # ifdef USE_WIN_ICONV __iconv(m_iconv, (const char**) &pNil, &zero, &pNil, &zero); # else __iconv(m_iconv, (const char**) &pNil, &zero, &pNil, &zero, NULL); # endif // USE_WIN_ICONV #elif defined(ACL_SUNOS5) || defined(ACL_FREEBSD) __iconv(m_iconv, (const char**) &pNil, &zero, &pNil, &zero); #else __iconv(m_iconv, &pNil, &zero, &pNil, &zero); #endif return (true); } #else logger_error("no iconv lib"); m_errmsg = "no iconv lib"; return (false); #endif }
bool charset_conv::update(const char* in, size_t len, acl::string* out) { #ifdef HAVE_H_ICONV if (in == NULL) logger_fatal("in null"); if (out == NULL) logger_fatal("out null"); if (EQ(m_fromCharset, m_toCharset)) { out->append(in, len); return (true); } if (m_iconv == (iconv_t) -1) { logger_error("m_iconv invalid"); m_errmsg = "m_iconv invalid"; return (false); } // 去掉有些 UTF-8 文档中开始的 UTF-8 引导符 if (*m_pUtf8Pre) { while (len > 0) { if (*m_pUtf8Pre == 0x00) break; else if (*m_pUtf8Pre != *in) { // 必须使 UTF-8 前缀失效 m_pUtf8Pre = &UTF8_HEADER[3]; break; } m_pUtf8Pre++; in++; len--; } } if (len == 0) return (true); if (m_pInBuf == NULL) m_pInBuf = acl_vstring_alloc(len); if (m_pOutBuf == NULL) m_pOutBuf = acl_vstring_alloc(len); else ACL_VSTRING_SPACE(m_pOutBuf, len); // 先将输入数据进行缓冲 if (*m_pUtf8Pre && m_pUtf8Pre - UTF8_HEADER > 0) acl_vstring_memcpy(m_pInBuf, UTF8_HEADER, m_pUtf8Pre - UTF8_HEADER); acl_vstring_memcat(m_pInBuf, in, len); ACL_VSTRING_TERMINATE(m_pInBuf); char *pIn, *pOut; size_t ret, nIn, nOut; while (true) { nIn = LEN(m_pInBuf); if (nIn == 0) break; pIn = STR(m_pInBuf); pOut = STR(m_pOutBuf); nOut = SIZE(m_pOutBuf); #ifdef WIN32 # ifdef USE_WIN_ICONV ret = __iconv(m_iconv, (const char**) &pIn, &nIn, &pOut, &nOut); # else int err; ret = __iconv(m_iconv, (const char**) &pIn, &nIn, &pOut, &nOut, &err); errno = err; # endif // USE_WIN_ICONV #elif defined(ACL_SUNOS5) || defined(ACL_FREEBSD) ret = __iconv(m_iconv, (const char**) &pIn, &nIn, &pOut, &nOut); #else ret = __iconv(m_iconv, &pIn, &nIn, &pOut, &nOut); #endif if (ret != (size_t) -1) { if ((ret = SIZE(m_pOutBuf) - nOut) > 0) out->append(STR(m_pOutBuf), ret); else // xxx out->append(in, len); ACL_VSTRING_RESET(m_pInBuf); break; } else if (errno == E2BIG) { if ((ret = SIZE(m_pOutBuf) - nOut) > 0) out->append(STR(m_pOutBuf), ret); if (pIn > STR(m_pInBuf) && nIn < LEN(m_pInBuf)) acl_vstring_memmove(m_pInBuf, pIn, nIn); // 扩大内存空间 ACL_VSTRING_SPACE(m_pOutBuf, SIZE(m_pOutBuf) * 2); continue; } else if (errno == 
EILSEQ) { char *pNil = NULL; size_t zero = 0; // 重置状态, 似乎也没啥用处 #ifdef WIN32 # ifdef USE_WIN_ICONV __iconv(m_iconv, (const char**) &pNil, &zero, &pNil, &zero); # else __iconv(m_iconv, (const char**) &pNil, &zero, &pNil, &zero, NULL); # endif #elif defined(ACL_SUNOS5) || defined(ACL_FREEBSD) __iconv(m_iconv, (const char**) &pNil, &zero, &pNil, &zero); #else __iconv(m_iconv, &pNil, &zero, &pNil, &zero); #endif // 遇到无效的多字节序列,pIn 指向第一个无效的位置 // 先拷贝已经转换的数据 if ((ret = SIZE(m_pOutBuf) - nOut) > 0) out->append(STR(m_pOutBuf), ret); if (nIn == 0) { ACL_VSTRING_RESET(m_pInBuf); break; } acl_assert(pIn >= STR(m_pInBuf)); // 跳过无效字节 (*out) += (char)(*pIn); // 直接拷贝无效字节 nIn--; pIn++; if (nIn > 0) acl_vstring_memmove(m_pInBuf, pIn, nIn); else ACL_VSTRING_RESET(m_pInBuf); } else if (errno == EINVAL) { char *pNil = NULL; size_t zero = 0; // 重置状态, 似乎也没啥用处 #ifdef WIN32 # ifdef USE_WIN_ICONV __iconv(m_iconv, (const char**) &pNil, &zero, &pNil, &zero); # else __iconv(m_iconv, (const char**) &pNil, &zero, &pNil, &zero, NULL); # endif // USE_WIN_ICONV #elif defined(ACL_SUNOS5) || defined(ACL_FREEBSD) __iconv(m_iconv, (const char**) &pNil, &zero, &pNil, &zero); #else __iconv(m_iconv, &pNil, &zero, &pNil, &zero); #endif // 输入的多字节序列不完整,pIn 指向该不完整的位置 // 先拷贝已经转换的数据 if ((ret = SIZE(m_pOutBuf) - nOut) > 0) out->append(STR(m_pOutBuf), ret); // 移动数据,将未转换的数据移至缓冲区起始位置 if (nIn > 0) acl_vstring_memmove(m_pInBuf, pIn, nIn); else ACL_VSTRING_RESET(m_pInBuf); break; } else if (LEN(m_pInBuf) > 0) { // 如果遇到了无效的字符集,根据设置的标志位 // 决定是否直接拷贝 if (m_addInvalid) { out->append(STR(m_pInBuf), LEN(m_pInBuf)); ACL_VSTRING_RESET(m_pInBuf); } break; } else break; } return (true); #else (void) in; (void) len; (void) out; logger_error("no iconv lib"); m_errmsg = "no iconv lib"; return (false); #endif }
/*
 * Read `fp` to the end, convert its contents from encoding `from` to
 * encoding `to`, and write the converted bytes to stdout.
 *
 *   silent        non-zero: do not print the invalid-character warning
 *   hide_invalid  non-zero: pass __ICONV_F_HIDE_INVALID to __iconv()
 *
 * Failures to open the converter, to convert, or to read more input end
 * the process via err()/errx().
 */
static void
do_conv(FILE *fp, const char *from, const char *to, int silent,
    int hide_invalid)
{
	char inbuf[INBUFSIZE], outbuf[OUTBUFSIZE];
	const char *src;
	char *dst;
	size_t src_left, dst_left;
	size_t bad_total = 0;
	ssize_t rc;
	u_int32_t conv_flags = hide_invalid ? __ICONV_F_HIDE_INVALID : 0;
	iconv_t cd = iconv_open(to, from);

	if (cd == (iconv_t)-1)
		err(EXIT_FAILURE, "iconv_open(%s, %s)", to, from);

	for (;;) {
		src_left = fread(inbuf, 1, INBUFSIZE, fp);
		if (src_left == 0)
			break;

		for (src = inbuf; src_left > 0; ) {
			size_t bad_now;

			dst = outbuf;
			dst_left = OUTBUFSIZE;
			rc = __iconv(cd, &src, &src_left, &dst, &dst_left,
			    conv_flags, &bad_now);
			bad_total += bad_now;

			if (rc == -1 && errno != E2BIG) {
				/*
				 * XXX: iconv(3) is a bad interface: the
				 * invalid character count would be lost
				 * here, which is why __iconv is used.
				 *
				 * Only a truncated character that leaves
				 * room to read more input is recoverable.
				 */
				if (!(errno == EINVAL && src != inbuf))
					err(EXIT_FAILURE, "iconv()");

				/* incomplete input character: refill */
				memmove(inbuf, src, src_left);
				rc = fread(inbuf + src_left, 1,
				    INBUFSIZE - src_left, fp);
				if (rc == 0) {
					if (!feof(fp))
						err(EXIT_FAILURE, "fread()");
					errx(EXIT_FAILURE, "iconv(): %s",
					    strerror(EINVAL));
				}
				src = inbuf;
				src_left += rc;
			}

			/* Flush whatever this round produced. */
			if (OUTBUFSIZE > dst_left)
				fwrite(outbuf, 1, OUTBUFSIZE - dst_left,
				    stdout);
		}
	}

	/* reset the shift state of the output buffer */
	dst = outbuf;
	dst_left = OUTBUFSIZE;
	rc = iconv(cd, NULL, NULL, &dst, &dst_left);
	if (rc == -1)
		err(EXIT_FAILURE, "iconv()");
	if (OUTBUFSIZE > dst_left)
		fwrite(outbuf, 1, OUTBUFSIZE - dst_left, stdout);

	if (!silent && bad_total > 0)
		warnx("warning: invalid characters: %lu",
		    (unsigned long)bad_total);

	iconv_close(cd);
}