int file_knjchk(FILE *fp) { int c; int f_sjis,f_euc; int n_sjis,n_sjis_i,n_euc,n_euc_i; n_sjis=0; n_sjis_i=0; n_euc=0; n_euc_i=0; f_sjis=FALSE; f_euc=FALSE; for(;;) { c=fgetc(fp); if (c==EOF|| n_euc>32||n_sjis>32||n_euc_i>8||n_sjis_i>8) return n_euc- n_euc_i*4>= n_sjis- n_sjis_i*4? KC_euc:KC_sjis; if (c==ESC) { c=fgetc(fp); if (c=='K') return KC_jis; if (c=='$') { c=fgetc(fp); if (c=='B'||c=='@') return KC_jis; } } if (f_euc) { if (iseuc(c)) ++n_euc; else ++n_euc_i; f_euc=FALSE; } else { if (iseuc(c)) f_euc=TRUE; } if (f_sjis) { if (issjis2(c)) ++n_sjis; else ++n_sjis_i; f_sjis=FALSE; } else { if (issjis1(c)) f_sjis=TRUE; } } }
/*
 * Guess the Japanese encoding of buf, looking at up to imax positions.
 *
 * Pass 1 returns KCODE_JIS if any of the known JIS escape sequences starts
 * at a scanned position.  Passes 2 and 3 accumulate a penalty for reading
 * the bytes as EUC and as SJIS respectively: +10 for a malformed sequence,
 * +1 for a plausible but less common one.  The smaller penalty wins; equal
 * penalties give KCODE_ASCII when both are zero and KCODE_UNKNOWN otherwise.
 *
 * NOTE(review): every pass stops when it sees a NUL a few bytes ahead
 * (buf[pos + 5] or buf[pos + 2]) and that look-ahead is not checked against
 * imax; presumably callers pass a NUL-terminated buffer - confirm.
 */
gint guess_kanji(gint imax, guchar *buf)
{
    int pos;
    int euc_penalty = 0;
    int sjis_penalty = 0;

    /* Pass 1: JIS escape sequences. */
    for (pos = 0; pos < imax && buf[pos + 5] != '\0'; pos++) {
        guchar *cur = &buf[pos];

        if (!strncmp(cur, JIS0208_1978, strlen(JIS0208_1978)) ||
            !strncmp(cur, JIS0208_1983, strlen(JIS0208_1983)) ||
            !strncmp(cur, JIS0208_1990, strlen(JIS0208_1990)) ||
            !strncmp(cur, JIS0212, strlen(JIS0212)) ||
            !strncmp(cur, JIS_ASC, strlen(JIS_ASC)) ||
            !strncmp(cur, JIS_ASC2, strlen(JIS_ASC2)) ||
            !strncmp(cur, JIS_KANA, strlen(JIS_KANA))) {
            return KCODE_JIS;
        }
    }

    /* Pass 2: penalty for interpreting the bytes as EUC. */
    for (pos = 0; pos < imax && buf[pos + 2] != '\0'; pos++) {
        if (iseuc(buf[pos]) && ++pos < imax) {
            if (!iseuc(buf[pos])) {
                /* Lead byte without a valid trail byte: rescan the trail. */
                euc_penalty += 10;
                pos--;
            } else if (buf[pos - 1] >= 0xd0) {
                /* JIS level 2 kanji */
                /* 1999-02-01 bug fixed. Thanks: massangeana */
                euc_penalty++;
            }
        } else if (buf[pos] == 0x8e && ++pos < imax) {
            if (ishankana(buf[pos])) {
                euc_penalty++;
            } else {
                euc_penalty += 10;
                pos--;
            }
        } else if (buf[pos] >= 0x80) {
            euc_penalty += 10;
        }
    }

    /* Pass 3: penalty for interpreting the bytes as SJIS. */
    for (pos = 0; pos < imax && buf[pos + 2] != '\0'; pos++) {
        if (issjis1(buf[pos]) && ++pos < imax) {
            if (!issjis2(buf[pos])) {
                sjis_penalty += 10;
                pos--;
            } else if ((unsigned) (buf[pos - 1] * 256U + buf[pos]) >= 0x989f) {
                /* JIS level 2 kanji */
                sjis_penalty++;
            }
        } else if (buf[pos] >= 0x80) {
            if (ishankana(buf[pos]))
                sjis_penalty++;
            else
                sjis_penalty += 10;
        }
    }

    if (sjis_penalty != euc_penalty)
        return (sjis_penalty < euc_penalty) ? KCODE_SJIS : KCODE_EUC;
    if (euc_penalty == 0)
        return KCODE_ASCII;
    return KCODE_UNKNOWN;
}
/*
 * Return a byte count for the character whose first byte is c:
 * 0 for NUL, 2 when iseuc(c) holds or the byte is 0x8e, 3 when the byte
 * is 0x8f, and 1 for anything else.
 * (Presumably the length of one EUC character in the buffer - confirm.)
 */
int kanji_countbuf(char c)
{
    if (c == 0)
        return 0;
    if (iseuc(c) || (u_char)c == 0x8e)
        return 2;
    return ((u_char)c == 0x8f) ? 3 : 1;
}
int kanji_countdsp(char c, int n) { if (c==0) return 0; if (c=='\t' && n!=-1) return (n/sysinfo.tabstop+1)*sysinfo.tabstop - n; if ((u_char)c==0x8e) // 半角かな return 2; if (iseuc(c)|| (u_char)c==0x8f|| iscntrl(c)) return 2; return 1; }
/*
 * Walk str up to its terminating NUL and check every position.
 *
 * A position accepted by iseuc() is skipped as a two-byte unit, a printable
 * byte is skipped as-is, and a byte that is whitespace but not printable is
 * overwritten in place with 0x20.  Any other byte makes the function
 * return FALSE immediately; TRUE is returned when the end is reached.
 *
 * NOTE(review): the two-byte skip relies on iseuc() having checked the
 * second byte as well - confirm against the macro definition.
 */
static gboolean validate_euc_str(guchar *str)
{
    guchar *cur;

    for (cur = str; *cur; ) {
        if (iseuc(cur)) {
            cur += 2;
            continue;
        }
        if (isprint(*cur)) {
            cur++;
            continue;
        }
        if (!isspace(*cur))
            return FALSE;

        /* Normalize non-printable whitespace to a plain space. */
        *cur = 0x20;
        cur++;
    }
    return TRUE;
}
/*
 * Convert hiragana in word to katakana, in place.
 *
 * word must not be NULL (asserted).  Alphabetic bytes are skipped one at a
 * time.  A position accepted by iseuc() is treated as a two-byte character;
 * if iseuchiragana() also accepts it, its first byte is replaced with 0xa5
 * and the second byte is left unchanged.  Any other byte is skipped.
 */
void hiragana_to_katakana(gchar *word)
{
    gint i = 0;

    g_assert(word != NULL);

    while (word[i] != '\0') {
        /* gchar may be signed and EUC bytes are >= 0x80; <ctype.h>
         * functions need a value representable as unsigned char. */
        if (!isalpha((guchar)word[i]) && iseuc(&word[i])) {
            if (iseuchiragana(&word[i])) {
                word[i] = 0xa5;
            }
            i += 2;
        } else {
            i++;
        }
    }
}
/*
 * Guess the encoding of the first len bytes of str.
 *
 * Order of checks:
 *   1. Byte-order marks: FF FE -> ENCODING_UTF16_LE, FE FF ->
 *      ENCODING_UTF16_BE, EF BB BF -> ENCODING_UTF8_BOM.
 *   2. An SJIS scoring pass, which also records whether a byte >= 0x80 or
 *      an ESC (0x1b) was seen at a visited position.  If no high byte was
 *      seen the result is ENCODING_JIS (ESC seen) or ENCODING_ASCII.
 *   3. EUC-JP and UTF-8 scoring passes.
 *   4. Each encoding's score is (good - bad).  SJIS or EUC-JP is returned
 *      only when it strictly beats both others; ENCODING_UTF8 is returned
 *      when UTF-8 strictly beats the better of SJIS/EUC-JP, provided those
 *      two are not tied.  Everything else is ENCODING_UNKNOWN.
 */
Encoding GetEncoding(PCSTR str, size_t len)
{
    const unsigned char *buf = (const unsigned char *)str;
    size_t i;
    int all_ascii = 1;
    int saw_esc = 0;
    int kana_run;                       /* consecutive half-width kana seen */
    int sjis_good = 0, sjis_bad = 0;
    int euc_good = 0, euc_bad = 0;
    int utf8_good = 0, utf8_bad = 0;
    int sjis_score, euc_score, utf8_score;

    /* Byte-order marks */
    if (len >= 2) {
        if (buf[0] == 0xff && buf[1] == 0xfe)
            return ENCODING_UTF16_LE;
        if (buf[0] == 0xfe && buf[1] == 0xff)
            return ENCODING_UTF16_BE;
    }
    if (len >= 3 && !memcmp(buf, "\xef\xbb\xbf", 3))
        return ENCODING_UTF8_BOM;

    /* Shift_JIS pass (also gathers the ASCII / ESC flags) */
    kana_run = 0;
    for (i = 0; i < len; i++) {
        const unsigned char b = buf[i];

        if (b >= 0x80)
            all_ascii = 0;
        if (b == 0x1b)
            saw_esc = 1;

        /* 0x8e, kana, 0x8e pattern counts against SJIS. */
        if (b == 0x8e && i + 2 < len && buf[i + 2] == 0x8e && ishankana(buf[i + 1]))
            sjis_bad += 1;

        if (ishankana(b)) {
            sjis_good += 0x10 / 2 - 1;
            kana_run++;
            continue;
        }

        /* A run of exactly one half-width kana is slightly penalized. */
        if (kana_run == 1)
            sjis_bad++;
        kana_run = 0;

        if (!issjis1(b)) {
            if (b >= 0x80)
                sjis_bad += 0x100;
            continue;
        }
        if (i + 1 >= len)
            continue;
        if (issjis2(buf[i + 1])) {
            sjis_good += 0x10;
            i++;
        } else {
            sjis_bad += 0x100;
        }
    }

    if (all_ascii)
        return saw_esc ? ENCODING_JIS : ENCODING_ASCII;

    /* EUC-JP pass */
    kana_run = 0;
    for (i = 0; i < len; i++) {
        const unsigned char b = buf[i];

        if (b == 0x8e) {
            /* 0x8e introduces a half-width kana. */
            if (i + 1 < len) {
                if (ishankana(buf[i + 1])) {
                    euc_good += 10;
                    i++;
                    kana_run++;
                } else {
                    euc_bad += 0x100;
                }
            }
            continue;
        }

        if (kana_run == 1)
            euc_bad++;
        kana_run = 0;

        if (iseuc(b)) {
            if (i + 1 < len) {
                if (iseuc(buf[i + 1])) {
                    i++;
                    euc_good += 0x10;
                } else {
                    euc_bad += 0x100;
                }
            }
        } else if (b == 0x8f) {
            /* 0x8f introduces a three-byte sequence. */
            if (i + 2 < len) {
                if (iseuc(buf[i + 1]) && iseuc(buf[i + 2])) {
                    i += 2;
                    euc_good += 0x10;
                } else {
                    /* NOTE(review): decimal 100 here, 0x100 in the other
                     * EUC-JP penalties - kept as in the original. */
                    euc_bad += 100;
                }
            }
        } else if (b >= 0x80) {
            euc_bad += 0x100;
        }
    }

    /* UTF-8 pass (two- and three-byte sequences only) */
    for (i = 0; i < len; i++) {
        const unsigned char b = buf[i];

        if (isutf8_2byte(b)) {
            if (i + 1 < len) {
                if (isutf8_trail(buf[i + 1])) {
                    utf8_good += 10;
                    i++;
                } else {
                    utf8_bad += 100;
                }
            }
        } else if (isutf8_3byte(b)) {
            if (i + 2 < len) {
                if (isutf8_trail(buf[i + 1]) && isutf8_trail(buf[i + 2])) {
                    utf8_good += 15;
                    i += 2;
                } else {
                    utf8_bad += 1000;
                }
            }
        } else if (b >= 0x80) {
            utf8_bad += 1000;
        }
    }

    sjis_score = sjis_good - sjis_bad;
    euc_score = euc_good - euc_bad;
    utf8_score = utf8_good - utf8_bad;

    if (sjis_score > euc_score) {
        if (sjis_score > utf8_score)
            return ENCODING_SJIS;
        if (sjis_score < utf8_score)
            return ENCODING_UTF8;
    } else if (sjis_score < euc_score) {
        if (euc_score > utf8_score)
            return ENCODING_EUCJP;
        if (euc_score < utf8_score)
            return ENCODING_UTF8;
    }
    return ENCODING_UNKNOWN;
}
const char *kanji_fromeuc(char *s,size_t bytes,const char *t,int kc) { u_char c; const char *p; enum {KM_ank, KM_kanji, KM_kana} km; p=s; switch(kc) { case KC_euc: return t; case KC_sjis: for(;*t!='\0'&& bytes>0;) { c=*t++; if (c==0x8e&& iskana(*t)) c=*t++; else { if (iseuc(c)&& iseuc(*t)) { *s++= EUCtoSJIStable1[c - 0xa1]; --bytes; c= *(u_char *)t - ((c&1)==0 ? 2 : *(u_char *)t>=0xe0? 0x60: 0x61); ++t; } } *s++=c; --bytes; } *s='\0'; break; case KC_jis: km=KM_ank; for(;*t!='\0'&& bytes>0;) { c=*t++; if (iseuc(c)&& km!=KM_kanji) { if (bytes<3) break; *s++='\x1b'; *s++='$'; *s++='B'; bytes-=3; km=KM_kanji; } if (c==0x8e && iskana(*t)) { if (km!=KM_kana) { if (bytes<3) break; *s++='\x1b'; *s++='('; *s++='I'; bytes-=3; km=KM_kana; } c=*t++; } else { if (!iseuc(c) && km!=KM_ank) { if (bytes<3) break; *s++='\x1b'; *s++='('; *s++='B'; bytes-=3; km=KM_ank; } } *s++= (c&0x7f); --bytes; } if (km!=KM_ank&& bytes>=3) strcpy(s,"\x1b(B"); else *s='\0'; } return p; }