/*
 * Decode one SHIFT_JIS character starting at s into *pwc.
 *
 * Single-byte input (below 0x80, or 0xa1..0xdf) is delegated to
 * jisx0201_mbtowc.  Lead bytes 0x81..0x9f and 0xe0..0xea introduce a
 * two-byte character that is re-expressed as a two-byte code and handed to
 * jisx0208_mbtowc.  Lead bytes 0xf0..0xf9 introduce a two-byte user-defined
 * character that is mapped linearly starting at 0xe000.
 *
 * Returns the result of the delegated converter, 2 for a user-defined
 * character, RET_TOOFEW(0) when a second byte is needed but n < 2, and
 * RET_ILSEQ for any other input.
 */
int sjis_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
{
  const unsigned char lead = *s;
  unsigned char trail;
  unsigned char cell;
  int in_jisx0208_rows;
  int in_user_rows;

  if (lead < 0x80 || (lead >= 0xa1 && lead <= 0xdf))
    return jisx0201_mbtowc(conv,pwc,s,n);

  in_jisx0208_rows = (lead >= 0x81 && lead <= 0x9f) || (lead >= 0xe0 && lead <= 0xea);
  in_user_rows = (lead >= 0xf0 && lead <= 0xf9);
  if (!in_jisx0208_rows && !in_user_rows)
    return RET_ILSEQ;

  /* Both two-byte families accept the same set of trail bytes. */
  if (n < 2)
    return RET_TOOFEW(0);
  trail = s[1];
  if (!((trail >= 0x40 && trail <= 0x7e) || (trail >= 0x80 && trail <= 0xfc)))
    return RET_ILSEQ;

  /* Position of the trail byte among the 188 valid ones (0x7f is skipped). */
  cell = (trail < 0x80 ? trail-0x40 : trail-0x41);

  if (in_user_rows) {
    /* User-defined range. See
     * Ken Lunde's "CJKV Information Processing", table 4-66, p. 206. */
    *pwc = 0xe000 + 188*(lead - 0xf0) + cell;
    return 2;
  }

  {
    /* Each lead byte covers two rows of 94 cells; the upper half of the
       trail range selects the second row. */
    unsigned char row_pair = (lead < 0xe0 ? lead-0x81 : lead-0xc1);
    unsigned char buf[2];
    buf[0] = 2*row_pair + (cell < 0x5e ? 0 : 1) + 0x21;
    buf[1] = (cell < 0x5e ? cell : cell-0x5e) + 0x21;
    return jisx0208_mbtowc(conv,pwc,buf,2);
  }
}
/*
 * Decode one UTF-16BE character starting at s into *pwc.
 *
 * Returns 2 for a character encoded by a single 16-bit unit, 4 for a
 * surrogate pair, RET_TOOFEW(0) when fewer bytes are available than the
 * character needs, and RET_ILSEQ for a lone low surrogate or a high
 * surrogate that is not followed by a low surrogate.
 */
static int utf16be_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
{
  ucs4_t unit;
  ucs4_t low;

  if (n < 2)
    return RET_TOOFEW(0);

  unit = (s[0] << 8) + s[1];

  /* A low surrogate may never start a character. */
  if (unit >= 0xdc00 && unit < 0xe000)
    return RET_ILSEQ;

  /* Anything that is not a high surrogate stands for itself. */
  if (!(unit >= 0xd800 && unit < 0xdc00)) {
    *pwc = unit;
    return 2;
  }

  /* High surrogate: the second unit of the pair is required. */
  if (n < 4)
    return RET_TOOFEW(0);

  low = (s[2] << 8) + s[3];
  if (low < 0xdc00 || low >= 0xe000)
    return RET_ILSEQ;

  *pwc = 0x10000 + ((unit - 0xd800) << 10) + (low - 0xdc00);
  return 4;
}
/*
 * Convert UTF-8 text at *from into single-byte characters at *to.
 *
 * Each decoded character whose value fits in 8 bits is stored as that byte.
 * A character above 0xff, or an invalid input byte (which is skipped one
 * byte at a time), is stored as BAD_CHAR and counted.  Conversion stops at
 * the end of the input, at an incomplete trailing sequence, or when the
 * output is full.
 *
 * On return *from / *from_left and *to / *to_left describe the unconsumed
 * input and the unused output.  Returns the number of BAD_CHAR
 * substitutions, or 0 without touching anything when from or *from is NULL.
 */
static int
utf8tostr(
    XlcConv conv,
    XPointer *from,
    int *from_left,
    XPointer *to,
    int *to_left,
    XPointer *args,
    int num_args)
{
    const unsigned char *in;
    const unsigned char *in_end;
    unsigned char *out;
    unsigned char *out_end;
    int substituted = 0;

    if (from == NULL || *from == NULL)
	return 0;

    in = (const unsigned char *) *from;
    in_end = in + *from_left;
    out = (unsigned char *) *to;
    out_end = out + *to_left;

    while (in < in_end) {
	ucs4_t wc;
	int len = utf8_mbtowc(NULL, &wc, in, in_end - in);

	/* Stop on a truncated sequence or when no output room is left. */
	if (len == RET_TOOFEW(0) || out == out_end)
	    break;

	if (len == RET_ILSEQ) {
	    /* Resynchronise by skipping exactly one input byte. */
	    len = 1;
	    *out = BAD_CHAR;
	    substituted++;
	} else if ((wc & ~(ucs4_t)0xff) != 0) {
	    /* Valid character, but not representable in one byte. */
	    *out = BAD_CHAR;
	    substituted++;
	} else {
	    *out = (unsigned char) wc;
	}

	out++;
	in += len;
    }

    *from = (XPointer) in;
    *from_left = in_end - in;
    *to = (XPointer) out;
    *to_left = out_end - out;

    return substituted;
}
/*
 * Decode one byte of the source encoding into *pwc.
 *
 * Byte 0xff yields the character 0.  Bytes 0xf7..0xfe are commands and are
 * rejected.  Every other byte is translated by GBAToUCS4(byte, jap).
 *
 * Returns 1 on success, RET_TOOFEW(0) when n < 1, and RET_ILSEQ for a
 * command byte.
 */
int pkmgba_mbtowc(conv_t conv, ucs4_t *pwc, const unsigned char* s, int n, bool jap)
{
    if (n < 1)
        return RET_TOOFEW(0);

    /* Command bytes have no character value. */
    if (*s >= 0xf7 && *s <= 0xfe)
        return RET_ILSEQ;

    if (*s == 0xff) {
        *pwc = 0;
    } else {
        *pwc = GBAToUCS4(*s, jap);
    }
    return 1;
}
/*
 * Convert UTF-8 text at *from into wide characters at *to.
 *
 * *to_left counts wchar_t elements.  Every decoded character is stored
 * as-is; an invalid input byte is skipped (one byte at a time), stored as
 * BAD_WCHAR and counted.  Conversion stops at the end of the input, when
 * the output is full, or at an incomplete trailing sequence.
 *
 * On return *from / *from_left and *to / *to_left describe the unconsumed
 * input and the unused output.  Returns the number of BAD_WCHAR
 * substitutions, or 0 without touching anything when from or *from is NULL.
 */
static int
utf8towcs(
    XlcConv conv,
    XPointer *from,
    int *from_left,
    XPointer *to,
    int *to_left,
    XPointer *args,
    int num_args)
{
    const unsigned char *in;
    const unsigned char *in_end;
    wchar_t *out;
    wchar_t *out_end;
    int substituted = 0;

    if (from == NULL || *from == NULL)
	return 0;

    in = (const unsigned char *) *from;
    in_end = in + *from_left;
    out = (wchar_t *) *to;
    out_end = out + *to_left;

    /* One output element is produced per iteration unless we stop early. */
    for (; in < in_end && out < out_end; out++) {
	ucs4_t wc;
	int len = utf8_mbtowc(NULL, &wc, in, in_end - in);

	if (len == RET_TOOFEW(0))
	    break;

	if (len != RET_ILSEQ) {
	    *out = wc;
	    in += len;
	} else {
	    /* Resynchronise by skipping exactly one input byte. */
	    *out = BAD_WCHAR;
	    in++;
	    substituted++;
	}
    }

    *from = (XPointer) in;
    *from_left = in_end - in;
    *to = (XPointer) out;
    *to_left = out_end - out;

    return substituted;
}
/*
 * Decode one BIG5-2003 character starting at s into *pwc.
 *
 * Bytes below 0x80 are delegated to ascii_mbtowc.  Lead bytes 0x81..0xfe
 * must be followed by a trail byte in 0x40..0x7e or 0xa1..0xfe; the pair is
 * then resolved, in this order, through:
 *   - a linear mapping for lead bytes 0x81..0xa0,
 *   - big5_2003_2uni_pagea1 for lead bytes 0xa1..0xa2 (falling through when
 *     the table holds 0xfffd),
 *   - the 0xc6 (trail >= 0xa1) / 0xc7 block: big5_2003_2uni_pagec6,
 *     Hiragana and Katakana,
 *   - the single special case 0xc2 0x55,
 *   - plain big5_mbtowc,
 *   - and, only if big5_mbtowc reports RET_ILSEQ, the extensions for lead
 *     bytes 0xa3, 0xf9 and 0xfa..0xfe.
 *
 * Returns 2 (or whatever the delegated converter returns) on success,
 * RET_TOOFEW(0) when n < 2 for a two-byte character, RET_ILSEQ otherwise.
 */
int big5_2003_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
{
  const unsigned char lead = *s;
  unsigned char trail;
  int col;

  /* Code set 0 (ASCII) */
  if (lead < 0x80)
    return ascii_mbtowc(conv,pwc,s,n);

  /* Code set 1 (BIG5 extended) */
  if (lead < 0x81 || lead >= 0xff)
    return RET_ILSEQ;
  if (n < 2)
    return RET_TOOFEW(0);

  trail = s[1];
  if (!((trail >= 0x40 && trail < 0x7f) || (trail >= 0xa1 && trail < 0xff)))
    return RET_ILSEQ;

  /* Column 0..156 of the trail byte within its row of 157 cells. */
  col = trail - (trail >= 0xa1 ? 0x62 : 0x40);

  if (lead < 0xa1) {
    /* 0x81 <= lead < 0xa1. */
    *pwc = (lead >= 0x8e ? 0xdb18 : 0xeeb8) + 157 * (lead - 0x81) + col;
    return 2;
  }

  if (lead < 0xa3) {
    /* 0xa1 <= lead < 0xa3: table lookup, else fall through to the rest. */
    unsigned int i = 157 * (lead - 0xa1) + col;
    unsigned short wc = big5_2003_2uni_pagea1[i];
    if (wc != 0xfffd) {
      *pwc = (ucs4_t) wc;
      return 2;
    }
  }

  if ((lead == 0xc6 && trail >= 0xa1) || lead == 0xc7) {
    unsigned int i = 157 * (lead - 0xc6) + col;
    if (i < 133) {
      /* 63 <= i < 133. */
      unsigned short wc = big5_2003_2uni_pagec6[i-63];
      if (wc != 0xfffd) {
        *pwc = (ucs4_t) wc;
        return 2;
      }
    } else if (i < 216) {
      /* 133 <= i < 216. Hiragana. */
      *pwc = 0x3041 - 133 + i;
      return 2;
    } else if (i < 302) {
      /* 216 <= i < 302. Katakana. */
      *pwc = 0x30a1 - 216 + i;
      return 2;
    }
    return RET_ILSEQ;
  }

  if (lead == 0xc2 && trail == 0x55) {
    *pwc = 0x5f5e;
    return 2;
  }

  {
    /* Plain BIG5 takes precedence over the remaining extensions. */
    int ret = big5_mbtowc(conv,pwc,s,2);
    if (ret != RET_ILSEQ)
      return ret;
  }

  if (lead == 0xa3) {
    if (trail >= 0xc0 && trail <= 0xe1) {
      *pwc = (trail == 0xe1 ? 0x20ac : trail == 0xe0 ? 0x2421 : 0x2340 + trail);
      return 2;
    }
  } else if (lead == 0xf9) {
    if (trail >= 0xd6) {
      *pwc = big5_2003_2uni_pagef9[trail-0xd6];
      return 2;
    }
  } else if (lead >= 0xfa) {
    *pwc = 0xe000 + 157 * (lead - 0xfa) + col;
    return 2;
  }

  return RET_ILSEQ;
}
/*
 * Decode one UTF-8 sequence starting at s into *pwc.
 *
 * Sequences of 1 to 3 bytes are always accepted; sequences of 4 to 6 bytes
 * are accepted only when ucs4_t is at least 32 bits wide.  Non-shortest
 * ("overlong") encodings are rejected.
 *
 * Returns the number of bytes consumed (1..6), RET_TOOFEW(0) when n is
 * smaller than the length announced by the lead byte, and RET_ILSEQ for an
 * invalid lead byte, an invalid continuation byte or an overlong form.
 * *pwc is written only on success.
 */
static int utf8_mbtowc (ucs4_t *pwc, const unsigned char *s, int n)
{
  const unsigned char lead = s[0];
  int len;                    /* total length announced by the lead byte */
  unsigned char payload_mask; /* data bits carried by the lead byte */
  unsigned char class_first;  /* lowest lead byte of this length class */
  unsigned char min_second;   /* lowest non-overlong s[1] for class_first */
  ucs4_t wc;
  int i;

  if (lead < 0x80) {
    *pwc = lead;
    return 1;
  }
  /* 0x80..0xbf are continuation bytes, 0xc0/0xc1 can only be overlong. */
  if (lead < 0xc2)
    return RET_ILSEQ;

  if (lead < 0xe0) {
    len = 2; payload_mask = 0x1f; class_first = 0xc0; min_second = 0x80;
  } else if (lead < 0xf0) {
    len = 3; payload_mask = 0x0f; class_first = 0xe0; min_second = 0xa0;
  } else if (!(sizeof(ucs4_t)*8 >= 32) || lead >= 0xfe) {
    /* Long forms need a 32-bit ucs4_t; 0xfe and 0xff are never valid. */
    return RET_ILSEQ;
  } else if (lead < 0xf8) {
    len = 4; payload_mask = 0x07; class_first = 0xf0; min_second = 0x90;
  } else if (lead < 0xfc) {
    len = 5; payload_mask = 0x03; class_first = 0xf8; min_second = 0x88;
  } else {
    len = 6; payload_mask = 0x01; class_first = 0xfc; min_second = 0x84;
  }

  if (n < len)
    return RET_TOOFEW(0);

  /* Validate the continuation bytes while accumulating 6 bits from each. */
  wc = lead & payload_mask;
  for (i = 1; i < len; i++) {
    unsigned char bits = s[i] ^ 0x80;
    if (bits >= 0x40)
      return RET_ILSEQ;
    wc = (wc << 6) | bits;
  }

  /* Only the lowest lead byte of a class can start an overlong form. */
  if (lead == class_first && s[1] < min_second)
    return RET_ILSEQ;

  *pwc = wc;
  return len;
}
int tiniconv_convert(struct tiniconv_ctx_s *ctx, unsigned char const *in_buf, int in_size, int *p_in_size_consumed, unsigned char *out_buf, int out_size, int *p_out_size_consumed) { ucs4_t wc; int in_idx, out_idx; int result, last_result; conv_state_t last_istate; assert(ctx != NULL); assert(in_buf != NULL); assert(out_buf != NULL); for (in_idx = 0, out_idx = 0; in_idx < in_size && out_idx < out_size;) { last_istate = ctx->istate; /* typedef int (*xxx_mb2wc_t) (conv_t conv, ucs4_t *pwc, unsigned char const *s, int n); */ result = ctx->mb2wc(ctx, &wc, in_buf + in_idx, in_size - in_idx); assert(result <= in_size - in_idx); if (result < 0) { if (result == RET_ILSEQ) { if (ctx->options & TINICONV_OPTION_IGNORE_IN_ILSEQ) { ctx->istate = 0; in_idx ++; continue; } else { result = TINICONV_CONVERT_IN_ILSEQ; goto exit; } } else if (result == RET_TOOSMALL) { result = TINICONV_CONVERT_IN_TOO_SMALL; goto exit; } else { in_idx += RET_TOOFEW(result); continue; } } in_idx += last_result = result; /* typedef int (*xxx_wc2mb_t) (conv_t conv, unsigned char *r, ucs4_t wc, int n); */ result = ctx->wc2mb(ctx, out_buf + out_idx, wc, out_size - out_idx); assert(result <= out_size - out_idx); if (result < 0) { if (result == RET_ILUNI) { if (ctx->options & TINICONV_OPTION_IGNORE_OUT_ILSEQ) { out_buf[out_idx ++] = TINICONV_OPTION_GET_OUT_ILSEQ_CHAR(ctx->options); ctx->ostate = 0; continue; } else { result = TINICONV_CONVERT_OUT_ILSEQ; in_idx -= last_result; /* discarding the last read sequence */ ctx->istate = last_istate; goto exit; } } else if (result == RET_TOOSMALL) { result = TINICONV_CONVERT_OUT_TOO_SMALL; in_idx -= last_result; /* discarding the last read sequence */ ctx->istate = last_istate; goto exit; } } out_idx += result; } result = TINICONV_CONVERT_OK; exit: if (p_in_size_consumed) *p_in_size_consumed = in_idx; if (p_out_size_consumed) *p_out_size_consumed = out_idx; return result; }
/*
 * Decode one UTF-8 sequence of 1 to 6 bytes starting at src into *wc.
 *
 * Returns 0 without reading src when wc is NULL.  Otherwise returns the
 * number of bytes consumed (1..6), RET_TOOFEW(0) when src_len is smaller
 * than the length announced by the lead byte, and RET_ILSEQ for an invalid
 * lead byte, an invalid continuation byte or a non-shortest ("overlong")
 * encoding.  *wc is written only on success.
 */
int utf8_mbtowc(uchar *wc, const unsigned char *src, int src_len)
{
    if (!wc)
        return 0;

    const unsigned char lead = src[0];
    int len;                    /* total length announced by the lead byte */
    unsigned char payload_mask; /* data bits carried by the lead byte */
    unsigned char class_first;  /* lowest lead byte of this length class */
    unsigned char min_second;   /* lowest non-overlong src[1] for class_first */
    uchar value;
    int i;

    if (lead < 0x80) {
        *wc = lead;
        return 1;
    }
    /* Continuation bytes, the overlong-only leads 0xc0/0xc1, and 0xfe/0xff. */
    if (lead < 0xc2 || lead >= 0xfe)
        return RET_ILSEQ;

    if (lead < 0xe0) {
        len = 2; payload_mask = 0x1f; class_first = 0xc0; min_second = 0x80;
    } else if (lead < 0xf0) {
        len = 3; payload_mask = 0x0f; class_first = 0xe0; min_second = 0xa0;
    } else if (lead < 0xf8) {
        len = 4; payload_mask = 0x07; class_first = 0xf0; min_second = 0x90;
    } else if (lead < 0xfc) {
        len = 5; payload_mask = 0x03; class_first = 0xf8; min_second = 0x88;
    } else {
        len = 6; payload_mask = 0x01; class_first = 0xfc; min_second = 0x84;
    }

    if (src_len < len)
        return RET_TOOFEW(0);

    for (i = 1; i < len; i++) {
        if (!((src[i] ^ 0x80) < 0x40))
            return RET_ILSEQ;
    }
    /* Only the lowest lead byte of a class can start an overlong form. */
    if (lead == class_first && src[1] < min_second)
        return RET_ILSEQ;

    /* Lead byte supplies the top bits; each continuation adds 6 more. */
    value = (uchar)(lead & payload_mask) << (6 * (len - 1));
    for (i = 1; i < len; i++)
        value |= (uchar)(src[i] ^ 0x80) << (6 * (len - 1 - i));

    *wc = value;
    return len;
}
/*
 * Convert a single character of UTF-8 text at *from into the first charset
 * from conv->state that can represent it, writing the bytes at *to.
 *
 * Invalid input bytes (skipped one at a time), characters that
 * charset_wctocs cannot represent, and characters whose charset cannot be
 * resolved by _XlcGetCharSetWithSide are skipped and counted.  The loop ends
 * as soon as one character has been written, or when the input is
 * exhausted or truncated, or when the output has no room.
 *
 * Returns -1, leaving all in/out arguments untouched, when no character
 * was converted.  Otherwise updates *from / *from_left / *to / *to_left,
 * stores the charset through args[0] when num_args >= 1, and returns the
 * number of skipped characters.  Returns 0 immediately when from or *from
 * is NULL.
 */
static int
utf8tocs1(
    XlcConv conv,
    XPointer *from,
    int *from_left,
    XPointer *to,
    int *to_left,
    XPointer *args,
    int num_args)
{
    Utf8Conv *candidates;
    XlcCharSet found = NULL;
    const unsigned char *in;
    const unsigned char *in_end;
    unsigned char *out;
    unsigned char *out_end;
    int skipped = 0;

    if (from == NULL || *from == NULL)
	return 0;

    candidates = (Utf8Conv *) conv->state;
    in = (const unsigned char *) *from;
    in_end = in + *from_left;
    out = (unsigned char *) *to;
    out_end = out + *to_left;

    while (in < in_end && out < out_end) {
	Utf8Conv chosen_charset = NULL;
	XlcSide chosen_side = XlcNONE;
	ucs4_t wc;
	int in_len;
	int out_len;
	int skip;

	in_len = utf8_mbtowc(NULL, &wc, in, in_end - in);
	if (in_len == RET_TOOFEW(0))
	    break;
	if (in_len == RET_ILSEQ) {
	    in++;
	    skipped++;
	    continue;
	}

	out_len = charset_wctocs(candidates, &chosen_charset, &chosen_side,
				 conv, out, wc, out_end - out);
	if (out_len == RET_TOOSMALL)
	    break;

	skip = (out_len == RET_ILSEQ);
	if (!skip) {
	    if (found == NULL) {
		found = _XlcGetCharSetWithSide(chosen_charset->name,
					       chosen_side);
		skip = (found == NULL);
	    } else if (!(found->xrm_encoding_name == chosen_charset->xrm_name
			 && (found->side == XlcGLGR
			     || found->side == chosen_side))) {
		/* The character belongs to a different charset. */
		break;
	    }
	}

	in += in_len;
	if (skip) {
	    skipped++;
	    continue;
	}

	/* Exactly one character is converted per call. */
	out += out_len;
	break;
    }

    if (found == NULL)
	return -1;

    *from = (XPointer) in;
    *from_left = in_end - in;
    *to = (XPointer) out;
    *to_left = out_end - out;

    if (num_args >= 1)
	*((XlcCharSet *)args[0]) = found;

    return skipped;
}
/*
 * Convert text in the charset given by args[0] at *from into UTF-8 at *to.
 *
 * The converter is looked up in all_charsets by charset->encoding_name; the
 * last table entry is never considered.  Each input character is decoded
 * with the charset's cstowc and re-encoded with utf8_wctomb; a character
 * that utf8_wctomb rejects is replaced by BAD_WCHAR and counted.
 * Conversion stops at the end of the input, at an incomplete trailing
 * sequence, or when the output has no room for the next character.
 *
 * Returns -1 when num_args < 1, when the charset is not found, or when the
 * input contains an invalid sequence (in which case the in/out arguments
 * are not updated).  Otherwise updates *from / *from_left / *to / *to_left
 * and returns the number of substitutions.  Returns 0 immediately when from
 * or *from is NULL.
 */
static int
cstoutf8(
    XlcConv conv,
    XPointer *from,
    int *from_left,
    XPointer *to,
    int *to_left,
    XPointer *args,
    int num_args)
{
    XlcCharSet charset;
    const char *wanted;
    Utf8Conv entry;
    int remaining;
    const unsigned char *in;
    const unsigned char *in_end;
    unsigned char *out;
    unsigned char *out_end;
    int substituted = 0;

    if (from == NULL || *from == NULL)
	return 0;
    if (num_args < 1)
	return -1;

    charset = (XlcCharSet) args[0];
    /* not charset->name because the latter has a ":GL"/":GR" suffix */
    wanted = charset->encoding_name;

    /* Search every entry except the last one. */
    entry = all_charsets;
    remaining = all_charsets_count-1;
    while (remaining > 0 && strcmp(entry->name, wanted) != 0) {
	entry++;
	remaining--;
    }
    if (remaining == 0)
	return -1;

    in = (const unsigned char *) *from;
    in_end = in + *from_left;
    out = (unsigned char *) *to;
    out_end = out + *to_left;

    while (in < in_end) {
	ucs4_t wc;
	int in_len;
	int out_len;

	in_len = entry->cstowc(conv, &wc, in, in_end - in);
	if (in_len == RET_ILSEQ)
	    return -1;
	if (in_len == RET_TOOFEW(0))
	    break;

	out_len = utf8_wctomb(NULL, out, wc, out_end - out);
	if (out_len == RET_TOOSMALL)
	    break;
	if (out_len == RET_ILSEQ) {
	    /* Not encodable: emit the replacement character instead. */
	    out_len = utf8_wctomb(NULL, out, BAD_WCHAR, out_end - out);
	    if (out_len == RET_TOOSMALL)
		break;
	    substituted++;
	}

	in += in_len;
	out += out_len;
    }

    *from = (XPointer) in;
    *from_left = in_end - in;
    *to = (XPointer) out;
    *to_left = out_end - out;

    return substituted;
}
/*
 * Decode one ISO-2022-KR character starting at s into *pwc.
 *
 * Leading control sequences are consumed first: ESC $ ) C records that
 * KS C 5601 has been designated, SO switches to two-byte mode (only legal
 * after the designation), SI switches back to ASCII.  The character that
 * follows is decoded by ascii_mbtowc or ksc5601_mbtowc according to the
 * current mode.
 *
 * The shift state is read from conv->istate and written back when a
 * character is returned or when the input ends in the middle of the
 * sequence; it is not written back on an error return.
 *
 * Returns the total number of bytes consumed (control sequences plus the
 * character), RET_TOOFEW(count) when more input is needed after count bytes
 * of control sequences, RET_ILSEQ for invalid input, and SIG_ABRT when an
 * internal consistency check fails.
 */
int iso2022_kr_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
{
  state_t state = conv->istate;
  SPLIT_STATE;
  int count = 0;
  unsigned char c;
  int result;

  /* Consume any designation / shift sequences in front of the character. */
  for (;;) {
    c = *s;
    if (c == ESC) {
      if (n < count+4)
        goto none;
      if (!(s[1] == '$' && s[2] == ')' && s[3] == 'C'))
        return RET_ILSEQ;
      state2 = STATE2_DESIGNATED_KSC5601;
      s += 4; count += 4;
    } else if (c == SO) {
      if (state2 != STATE2_DESIGNATED_KSC5601)
        return RET_ILSEQ;
      state1 = STATE_TWOBYTE;
      s++; count++;
    } else if (c == SI) {
      state1 = STATE_ASCII;
      s++; count++;
    } else {
      break;
    }
    /* A control sequence must be followed by at least one more byte. */
    if (n < count+1)
      goto none;
  }

  if (state1 == STATE_ASCII) {
    int ret;
    if (!(c < 0x80))
      return RET_ILSEQ;
    ret = ascii_mbtowc(conv,pwc,s,1);
    if (ret == RET_ILSEQ)
      return RET_ILSEQ;
    if (ret != 1)
      return SIG_ABRT;
#if 0 /* Accept ISO-2022-KR according to CJK.INF. */
    if (*pwc == 0x000a || *pwc == 0x000d)
      state2 = STATE2_NONE;
#endif
    result = count+1;
    goto commit;
  }

  if (state1 == STATE_TWOBYTE) {
    int ret;
    if (n < count+2)
      goto none;
    if (state2 != STATE2_DESIGNATED_KSC5601)
      return SIG_ABRT;
    if (!(s[0] < 0x80 && s[1] < 0x80))
      return RET_ILSEQ;
    ret = ksc5601_mbtowc(conv,pwc,s,2);
    if (ret == RET_ILSEQ)
      return RET_ILSEQ;
    if (ret != 2)
      return SIG_ABRT;
    result = count+2;
    goto commit;
  }

  /* Unknown shift state. */
  return SIG_ABRT;

none:
  result = RET_TOOFEW(count);
commit:
  COMBINE_STATE;
  conv->istate = state;
  return result;
}
/*
 * Convert the *inbytesleft bytes at *inbuf from one encoding to another and
 * store the NUL-terminated result at *outbuf.
 *
 * cd packs both encodings: the source id is cd >> 16 and the target id is
 * cd & 0xFFFF.  Supported ids are CP866, CP1251, CP1252, ISO8859_5, KOI8_RU
 * and UTF_8.
 *
 * Unlike POSIX iconv(), *inbuf and *outbuf are NOT advanced; only the two
 * byte counters are updated.  One byte of the output is always kept for the
 * terminating NUL, which is written but not subtracted from *outbytesleft.
 * Characters the target encoding cannot take are dropped.
 *
 * Returns (as size_t):
 *    0   success, all input consumed
 *   -2   unknown source encoding
 *   -3   unknown target encoding
 *  -10   the input is invalid or ends in the middle of a character
 *  -12   the output filled up before all input was consumed
 *
 * NOTE(review): the encoder pointer type assumes the xxx_wctomb converters
 * have the shape (conv, out, wc, room), which is how this function has
 * always called them -- confirm against their declarations.
 */
size_t iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft)
{
    /* Upper bound on the room offered to the encoder for one character;
       keeps the size_t -> int conversion safe for huge output buffers. */
    enum { MAX_ROOM_PER_CHAR = 8 };

    int (*decode)(conv_t, ucs4_t *, const unsigned char *, int);
    int (*encode)(conv_t, unsigned char *, ucs4_t, int);
    const unsigned char *src;
    char *dst;
    int from, to;
    int in_pos = 0;
    int out_pos = 0;

    dst = *outbuf;
    from = cd >> 16;
    to = cd & 0xFFFF;

    switch (from) {
    case CP866:     decode = cp866_mbtowc;     break;
    case CP1251:    decode = cp1251_mbtowc;    break;
    case CP1252:    decode = cp1252_mbtowc;    break;
    case ISO8859_5: decode = iso8859_5_mbtowc; break;
    case KOI8_RU:   decode = koi8_ru_mbtowc;   break;
    case UTF_8:     decode = utf8_mbtowc;      break;
    default:        return (size_t)-2;
    }

    switch (to) {
    case CP866:     encode = cp866_wctomb;     break;
    case CP1251:    encode = cp1251_wctomb;    break;
    case CP1252:    encode = cp1252_wctomb;    break;
    case ISO8859_5: encode = iso8859_5_wctomb; break;
    case KOI8_RU:   encode = koi8_ru_wctomb;   break;
    case UTF_8:     encode = utf8_wctomb;      break;
    default:        return (size_t)-3;
    }

    src = (const unsigned char *)*inbuf;

    while (*inbytesleft > 0 && *outbytesleft > 1) {
        ucs4_t wc;
        size_t room = *outbytesleft - 1;    /* one byte is reserved for NUL */
        int avail = 0;
        int converted;
        int written;

        /* Offer the decoder one more byte at a time, but never more than
           the input actually holds. */
        do {
            avail++;
            converted = decode(0, &wc, src + in_pos, avail);
        } while (converted == RET_TOOFEW(0) && (size_t)avail < *inbytesleft);
        if (converted < 0)
            return (size_t)-10;

        /* Let the encoder use all remaining room so multi-byte output
           (e.g. UTF-8) is not lost. */
        written = encode(0, (unsigned char *)dst + out_pos, wc,
                         room > MAX_ROOM_PER_CHAR ? MAX_ROOM_PER_CHAR : (int)room);
        if (written < 0)
            written = 0;    /* best effort: drop what cannot be stored */

        *inbytesleft -= converted;
        *outbytesleft -= written;
        in_pos += converted;
        out_pos += written;
    }

    /* The loop always leaves the reserved byte free unless the caller
       passed an empty output buffer. */
    if (*outbytesleft > 0)
        dst[out_pos] = '\0';

    /* Leftover input means the loop stopped for lack of output space. */
    if (*inbytesleft > 0)
        return (size_t)-12;
    return 0;
}