Exemple #1
0
int 	file_knjchk(FILE *fp)
{
	int 	c;
	int 	f_sjis,f_euc;
	int 	n_sjis,n_sjis_i,n_euc,n_euc_i;

	n_sjis=0;
	n_sjis_i=0;
	n_euc=0;
	n_euc_i=0;

	f_sjis=FALSE;
	f_euc=FALSE;

	for(;;)
		{
		 c=fgetc(fp);
		 if (c==EOF|| n_euc>32||n_sjis>32||n_euc_i>8||n_sjis_i>8)
			return n_euc- n_euc_i*4>= n_sjis- n_sjis_i*4? KC_euc:KC_sjis;
		 if (c==ESC)
		 	{
		 	 c=fgetc(fp);
		 	 if (c=='K')
		 	 	return KC_jis;
		 	 if (c=='$')
		 	 	{
		 	 	 c=fgetc(fp);
		 	 	 if (c=='B'||c=='@')
		 	 	 	return KC_jis;
		 	 	}
		 	}

		 if (f_euc)
		 	{
		 	 if (iseuc(c))
		 	 	++n_euc; else
		 	 	++n_euc_i;
		 	 f_euc=FALSE;
		 	} else
		 	{
		 	 if (iseuc(c))
		 	 	f_euc=TRUE;
		 	}
		 
		 if (f_sjis)
		 	{
		 	 if (issjis2(c))
		 	 	++n_sjis; else
		 	 	++n_sjis_i;
		 	 f_sjis=FALSE;
		 	} else
		 	{
		 	 if (issjis1(c))
		 	 	f_sjis=TRUE;
		 	}
		}
}
Exemple #2
0
/*
 * Guess the Japanese encoding of buf.
 *
 * imax: upper bound for the scan index i in all three loops.
 * buf:  bytes to examine; the loops also stop on a NUL seen a few bytes
 *       ahead of the current index (see the notes below).
 *
 * Returns KCODE_JIS if one of the JIS escape-sequence constants is found,
 * otherwise compares an EUC penalty score with a Shift-JIS penalty score:
 * the encoding with the lower penalty wins, two zero scores give
 * KCODE_ASCII and any other tie gives KCODE_UNKNOWN.
 */
gint guess_kanji(gint imax, guchar *buf)
{
	int i, bad_euc, bad_sjis;
	/* Pass 1: look for a JIS escape sequence starting at each offset. */
	for (i = 0; i < imax; i++) {
		/* NOTE(review): buf[i+5] is read with only i < imax checked, and
		 * bytes i..i+4 are not tested for NUL first - confirm callers
		 * guarantee the buffer is long enough for this look-ahead. */
		if(buf[i+5] == '\0')
			break;
		if((strncmp(&buf[i], JIS0208_1978, strlen(JIS0208_1978)) == 0) ||
		   (strncmp(&buf[i], JIS0208_1983, strlen(JIS0208_1983)) == 0) ||
		   (strncmp(&buf[i], JIS0208_1990, strlen(JIS0208_1990)) == 0) ||
		   (strncmp(&buf[i], JIS0212, strlen(JIS0212)) == 0) ||
		   (strncmp(&buf[i], JIS_ASC, strlen(JIS_ASC)) == 0) ||
		   (strncmp(&buf[i], JIS_ASC2, strlen(JIS_ASC2)) == 0) ||
		   (strncmp(&buf[i], JIS_KANA, strlen(JIS_KANA)) == 0))
			return(KCODE_JIS);
	}

	/* Pass 2: penalty score for reading the data as EUC. */
	bad_euc = 0;
	for (i = 0; i < imax; i++) {
		/* Same kind of look-ahead stop as above, two bytes ahead. */
		if(buf[i+2] == '\0')
			break;

		/* The `++i < imax` in each condition advances i to the second
		 * byte as a side effect.  NOTE(review): if that comparison fails,
		 * i has already been incremented when the next else-if is
		 * evaluated, so buf[i] there is the byte at the new index. */
		if (iseuc(buf[i]) && ++i < imax) {
			/* Broken pair: heavy penalty, then step back so the loop's
			 * i++ re-examines this second byte as a possible lead byte. */
			if (! iseuc(buf[i])) {  bad_euc += 10;  i--;  }
			else if (buf[i-1] >= 0xd0) bad_euc++; /* JIS level 2 kanji (Dai 2 Suijun) */
			/* 1999-02-01 bug fixed.  Thanks: massangeana */
		} else if (buf[i] == 0x8e && ++i < imax) {
			/* 0x8e followed by a byte accepted by ishankana(): small penalty. */
			if (ishankana(buf[i])) bad_euc++;
			else {  bad_euc += 10;  i--;  }
		} else if (buf[i] >= 0x80) bad_euc += 10; /* stray high-bit byte */
	}
	/* Pass 3: penalty score for reading the data as Shift-JIS. */
	bad_sjis = 0;
	for (i = 0; i < imax; i++) {
		if(buf[i+2] == '\0')
			break;
		if (issjis1(buf[i]) && ++i < imax) {
			if (! issjis2(buf[i])) {  bad_sjis += 10;  i--;  }
			/* Small penalty when the two-byte code is 0x989f or above. */
			else if ((unsigned) (buf[i-1] * 256U + buf[i]) >= 0x989f)
				bad_sjis++;  /* JIS level 2 kanji (Dai 2 Suijun) */
		} else if (buf[i] >= 0x80) {
			/* High-bit byte that is not a lead byte: small penalty if
			 * ishankana() accepts it, heavy penalty otherwise. */
			if (ishankana(buf[i])) bad_sjis++;
			else                   bad_sjis += 10;
		}
	}

	/* Lower penalty wins; no penalties at all means plain ASCII. */
	if(bad_sjis < bad_euc)
		return(KCODE_SJIS);
	else if (bad_sjis > bad_euc)
		return(KCODE_EUC);
	else if ((bad_euc == 0) && (bad_sjis == 0))
		return(KCODE_ASCII);
	else
		return(KCODE_UNKNOWN);
}
Exemple #3
0
int 	kanji_countbuf(char c)
{
	if (c==0)
		return 0;

	if (iseuc(c))
		return 2;
	if ((u_char)c==0x8e)
		return 2;
	if ((u_char)c==0x8f)
		return 3;
	return 1;
}
Exemple #4
0
/*
 * Return the count associated with lead byte c (presumably the number of
 * display columns it takes) when it appears at column n.
 *
 *   c  byte to classify
 *   n  current column, used only for tab expansion; -1 disables tab
 *      expansion, in which case a tab falls through to the tests below
 *
 * Returns 0 for NUL; for a tab with n != -1, the distance from n to the
 * next multiple of sysinfo.tabstop; 2 for 0x8e, 0x8f, bytes accepted by
 * iseuc() and control characters; 1 otherwise.
 */
int 	kanji_countdsp(char c, int n)
{
	if (c==0)
		return 0;

	if (c=='\t' && n!=-1)
		return (n/sysinfo.tabstop+1)*sysinfo.tabstop - n;

	if ((u_char)c==0x8e)	/* half-width kana */
		return 2;

	/* The <ctype.h> classifiers are only defined for values representable
	   as unsigned char (or EOF); plain char may be negative for high-bit
	   bytes, so convert before calling iscntrl(). */
	if (iseuc(c)|| (u_char)c==0x8f|| iscntrl((u_char)c))
		return 2;
	return 1;
}
Exemple #5
0
/*
 * Walk the NUL-terminated string str and check that it consists only of
 * units accepted by iseuc() (each skipped as two bytes), printable bytes
 * and whitespace bytes.
 *
 * Side effect: every non-printable whitespace byte is overwritten with
 * 0x20 in place, including those seen before a failure is detected.
 *
 * Returns TRUE when the end of the string is reached, FALSE at the first
 * byte that fits none of the categories.
 */
static gboolean validate_euc_str(guchar *str){
	guchar *cur = str;

	for(;;){
		if(*cur == 0)
			return(TRUE);

		if (iseuc(cur)) {
			cur += 2;
			continue;
		}

		/* isprint() is tested before isspace(), so an ordinary space is
		 * just skipped and only the other whitespace bytes are rewritten. */
		if(isprint(*cur)) {
			cur++;
			continue;
		}

		if(!isspace(*cur))
			return(FALSE);

		*cur = 0x20;
		cur++;
	}
}
Exemple #6
0
/*
 * Rewrite the NUL-terminated string word in place: for every two-byte
 * unit accepted by iseuc() for which iseuchiragana() also holds, the
 * first byte is replaced by 0xa5 and the second byte is kept.
 * Alphabetic bytes and all other bytes are skipped one at a time.
 *
 * word must not be NULL (checked with g_assert).
 */
void hiragana_to_katakana(gchar *word)
{
	gint i=0;

	g_assert(word != NULL);

	while(word[i] != '\0'){
		/* <ctype.h> classifiers are only defined for values representable
		 * as unsigned char (or EOF).  gchar is a plain char, which may be
		 * signed, and multi-byte text has the high bit set, so convert
		 * before calling isalpha(). */
		if(isalpha((unsigned char)word[i])) {
			i++;
			continue;
		}
		if(iseuc(&word[i])) {
			if(iseuchiragana(&word[i])) {
				word[i] = 0xa5;
			}
			i += 2;
			continue;
		}
		i++;
	}

}
Exemple #7
0
// Guess the encoding of the len bytes at str.
//
// A byte-order mark decides immediately (UTF-16 LE/BE, UTF-8 with BOM).
// Otherwise the buffer is scored three times - as Shift-JIS, as EUC-JP and
// as UTF-8 - each pass adding to a "good" total for well-formed sequences
// and to a "bad" total for malformed ones.  Data without any byte >= 0x80
// is reported as ENCODING_JIS if it contains an ESC (0x1b) byte, else as
// ENCODING_ASCII.  The final verdict compares good-minus-bad scores and
// yields ENCODING_UNKNOWN whenever the comparison ends in a tie.
Encoding GetEncoding(PCSTR str, size_t len)
{
	const unsigned char* bytes = (const unsigned char*)str;
	size_t	pos;
	int	only_ascii = 1;
	int	saw_escape = 0;
	int	kana_run;
	int	sjis_good = 0, sjis_bad = 0;
	int	euc_good = 0, euc_bad = 0;
	int	utf8_good = 0, utf8_bad = 0;
	int	sjis_score, euc_score, utf8_score;

	// Byte-order marks
	if (len >= 2 && bytes[0] == 0xff && bytes[1] == 0xfe)
		return ENCODING_UTF16_LE;
	if (len >= 2 && bytes[0] == 0xfe && bytes[1] == 0xff)
		return ENCODING_UTF16_BE;
	if (len >= 3 && memcmp(bytes, "\xef\xbb\xbf", 3) == 0)
		return ENCODING_UTF8_BOM;

	// Pass 1: Shift-JIS score; this pass also collects the ASCII/ESC flags.
	kana_run = 0;
	for (pos = 0; pos < len; pos++)
	{
		const unsigned char cur = bytes[pos];

		if (cur >= 0x80)
			only_ascii = 0;
		if (cur == 0x1b)
			saw_escape = 1;

		// 0x8e <kana> 0x8e is counted against Shift-JIS.
		if (cur == 0x8e &&
			pos + 2 < len &&
			bytes[pos + 2] == 0x8e &&
			ishankana(bytes[pos + 1]))
		{
			sjis_bad += 1;
		}

		if (ishankana(cur))
		{
			sjis_good += 0x10/2 - 1;
			kana_run++;
			continue;
		}

		// A run of exactly one kana byte that just ended is penalised.
		if (kana_run == 1)
			sjis_bad++;
		kana_run = 0;

		if (!issjis1(cur))
		{
			if (cur >= 0x80)
				sjis_bad += 0x100;
			continue;
		}

		// Lead byte as the very last byte: neither rewarded nor penalised.
		if (pos + 1 >= len)
			continue;

		if (issjis2(bytes[pos + 1]))
		{
			sjis_good += 0x10;
			pos++;		// consume the trail byte
		}
		else
			sjis_bad += 0x100;
	}

	if (only_ascii)
		return saw_escape ? ENCODING_JIS : ENCODING_ASCII;

	// Pass 2: EUC-JP score.
	kana_run = 0;
	for (pos = 0; pos < len; pos++)
	{
		const unsigned char cur = bytes[pos];

		if (cur == 0x8e)
		{
			// 0x8e prefix followed by a kana byte
			if (pos + 1 >= len)
				continue;
			if (ishankana(bytes[pos + 1]))
			{
				euc_good += 10;
				kana_run++;
				pos++;
			}
			else
				euc_bad += 0x100;
			continue;
		}

		if (kana_run == 1)
			euc_bad++;
		kana_run = 0;

		if (iseuc(cur))
		{
			// two-byte sequence
			if (pos + 1 >= len)
				continue;
			if (iseuc(bytes[pos + 1]))
			{
				euc_good += 0x10;
				pos++;
			}
			else
				euc_bad += 0x100;
		}
		else if (cur == 0x8f)
		{
			// 0x8f prefix followed by two bytes
			if (pos + 2 >= len)
				continue;
			if (iseuc(bytes[pos + 1]) && iseuc(bytes[pos + 2]))
			{
				euc_good += 0x10;
				pos += 2;
			}
			else
				euc_bad += 100;
		}
		else if (cur >= 0x80)
			euc_bad += 0x100;
	}

	// Pass 3: UTF-8 score (two- and three-byte sequences only).
	for (pos = 0; pos < len; pos++)
	{
		const unsigned char cur = bytes[pos];

		if (isutf8_2byte(cur))
		{
			if (pos + 1 >= len)
				continue;
			if (isutf8_trail(bytes[pos + 1]))
			{
				utf8_good += 10;
				pos++;
			}
			else
				utf8_bad += 100;
		}
		else if (isutf8_3byte(cur))
		{
			if (pos + 2 >= len)
				continue;
			if (isutf8_trail(bytes[pos + 1]) && isutf8_trail(bytes[pos + 2]))
			{
				utf8_good += 15;
				pos += 2;
			}
			else
				utf8_bad += 1000;
		}
		else if (cur >= 0x80)
			utf8_bad += 1000;
	}

	// Verdict: the better of Shift-JIS/EUC-JP is compared with UTF-8;
	// any exact tie along the way ends in ENCODING_UNKNOWN.
	sjis_score = sjis_good - sjis_bad;
	euc_score = euc_good - euc_bad;
	utf8_score = utf8_good - utf8_bad;

	if (sjis_score > euc_score)
	{
		if (sjis_score > utf8_score)
			return ENCODING_SJIS;
		if (sjis_score < utf8_score)
			return ENCODING_UTF8;
	}
	else if (sjis_score < euc_score)
	{
		if (euc_score > utf8_score)
			return ENCODING_EUCJP;
		if (euc_score < utf8_score)
			return ENCODING_UTF8;
	}

	return ENCODING_UNKNOWN;
}
Exemple #8
0
/*
 * Convert the NUL-terminated EUC string t into the encoding selected by kc.
 *
 *   s      destination buffer
 *   bytes  maximum number of converted bytes stored in s; the terminating
 *          NUL is stored after them, so s needs room for bytes + 1 chars
 *   t      source string
 *   kc     KC_euc, KC_sjis or KC_jis
 *
 * Returns t itself for KC_euc (nothing is copied), otherwise s.  For any
 * other kc value s is returned without being written to.
 *
 * Output that does not fit in `bytes` is dropped.  In the KC_jis case each
 * byte of a two-byte character is emitted in its own loop iteration, so
 * truncation can fall between the two bytes, and the trailing "ESC ( B"
 * is only appended when at least 3 bytes of budget remain.
 */
const	char	*kanji_fromeuc(char *s,size_t bytes,const char *t,int kc)
{
	u_char	c;
	const	char	*p;
	enum	{KM_ank, KM_kanji, KM_kana}	km;

	p=s;
	switch(kc)
		{
	 case KC_euc:
		 return t;

	 case KC_sjis:
		 for(;*t!='\0'&& bytes>0;)
		 	{
		 	 c=*t++;
		 	 if (c==0x8e&& iskana(*t))
		 	 	c=*t++; else	/* drop the 0x8e prefix, keep the kana byte */
		 	 	{
		 	 	 if (iseuc(c)&& iseuc(*t))
		 	 		{
		 	 		 /* A two-byte character needs two bytes of budget.
		 	 		    Without this check the second --bytes below would
		 	 		    wrap the unsigned counter and the loop would run
		 	 		    past the end of s. */
		 	 		 if (bytes<2)
		 	 		 	break;

		 	 	 	 *s++= EUCtoSJIStable1[c - 0xa1];
		 	 	 	 --bytes;

		 	 	 	 /* Second byte: the offset depends on the parity of
		 	 	 	    the first EUC byte and on the range of the second. */
		 	 	 	 c= *(u_char *)t - ((c&1)==0 ? 2 : *(u_char *)t>=0xe0? 0x60: 0x61);
		 	 	 	 ++t;
		 	 	 	}
		 	 	}
		 	 *s++=c;
		 	 --bytes;
		 	}
		 *s='\0';
		 break;

	 case KC_jis:
		 /* km tracks which character set the output is currently
		    switched to; an escape sequence is emitted on every change. */
		 km=KM_ank;
		 for(;*t!='\0'&& bytes>0;)
		 	{
		 	 c=*t++;
		 	 if (iseuc(c)&& km!=KM_kanji)
		 	 	{
		 	 	 if (bytes<3)
		 	 	 	break;
		 	 	 *s++='\x1b';
		 	 	 *s++='$';
		 	 	 *s++='B';
		 	 	 bytes-=3;
		 	 	 km=KM_kanji;
		 	 	}
		 	 if (c==0x8e && iskana(*t))
		 	 	{
		 	 	 if (km!=KM_kana)
		 	 	 	{
		 	 	 	 if (bytes<3)
		 	 	 	 	break;
		 	 	 	 *s++='\x1b';
		 	 	 	 *s++='(';
		 	 	 	 *s++='I';
		 	 	 	 bytes-=3;
		 	 	 	 km=KM_kana;
		 	 	 	}

		 	 	 c=*t++;
		 	 	} else
		 	 	{
		 	 	 if (!iseuc(c) && km!=KM_ank)
		 	 	 	{
		 	 	 	 if (bytes<3)
		 	 	 	 	break;
		 	 	 	 *s++='\x1b';
		 	 	 	 *s++='(';
		 	 	 	 *s++='B';
		 	 	 	 bytes-=3;
		 	 	 	 km=KM_ank;
		 	 	 	}
		 	 	}

			 /* An escape sequence above may have used up the whole budget;
			    stop instead of letting --bytes wrap the unsigned counter. */
			 if (bytes==0)
			 	break;

			 *s++= (c&0x7f);
			 --bytes;
		 	}
		 if (km!=KM_ank&& bytes>=3)
		 	strcpy(s,"\x1b(B"); else
		 	*s='\0';
		}
	return p;
}