Exemple #1
0
/*
 * EUC_KR ---> MIC
 */
static void
euc_kr2mic(const unsigned char *euc, unsigned char *p, int len)
{
	int			c1;
	int			l;

	while (len > 0)
	{
		c1 = *euc;
		if (IS_HIGHBIT_SET(c1))
		{
			l = pg_encoding_verifymb(PG_EUC_KR, (const char *) euc, len);
			if (l != 2)
				report_invalid_encoding(PG_EUC_KR,
										(const char *) euc, len);
			*p++ = LC_KS5601;
			*p++ = c1;
			*p++ = euc[1];
			euc += 2;
			len -= 2;
		}
		else
		{						/* should be ASCII */
			if (c1 == 0)
				report_invalid_encoding(PG_EUC_KR,
										(const char *) euc, len);
			*p++ = c1;
			euc++;
			len--;
		}
	}
	*p = '\0';
}
Exemple #2
0
/*
 * EUC_CN ---> MIC
 */
static void
euc_cn2mic(const unsigned char *euc, unsigned char *p, int len)
{
	int			c1;

	while (len > 0)
	{
		c1 = *euc;
		if (IS_HIGHBIT_SET(c1))
		{
			if (len < 2 || !IS_HIGHBIT_SET(euc[1]))
				report_invalid_encoding(PG_EUC_CN, (const char *) euc, len);
			*p++ = LC_GB2312_80;
			*p++ = c1;
			*p++ = euc[1];
			euc += 2;
			len -= 2;
		}
		else
		{						/* should be ASCII */
			if (c1 == 0)
				report_invalid_encoding(PG_EUC_CN, (const char *) euc, len);
			*p++ = c1;
			euc++;
			len--;
		}
	}
	*p = '\0';
}
Exemple #3
0
/*
 * MIC ---> EUC_CN
 */
static void
mic2euc_cn(const unsigned char *mic, unsigned char *p, int len)
{
	int			c1;

	while (len > 0)
	{
		c1 = *mic;
		if (IS_HIGHBIT_SET(c1))
		{
			if (c1 != LC_GB2312_80)
				report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_CN,
										   (const char *) mic, len);
			if (len < 3 || !IS_HIGHBIT_SET(mic[1]) || !IS_HIGHBIT_SET(mic[2]))
				report_invalid_encoding(PG_MULE_INTERNAL,
										(const char *) mic, len);
			mic++;
			*p++ = *mic++;
			*p++ = *mic++;
			len -= 3;
		}
		else
		{						/* should be ASCII */
			if (c1 == 0)
				report_invalid_encoding(PG_MULE_INTERNAL,
										(const char *) mic, len);
			*p++ = c1;
			mic++;
			len--;
		}
	}
	*p = '\0';
}
Exemple #4
0
/*
 * MIC ---> LATINn when the charset's local codes map directly to MIC
 *
 * mic points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 */
void
mic2latin(const unsigned char *mic, unsigned char *p, int len,
		  int lc, int encoding)
{
	int			c1;

	while (len > 0)
	{
		c1 = *mic;
		if (c1 == 0)
			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
		if (!IS_HIGHBIT_SET(c1))
		{
			/* easy for ASCII */
			*p++ = c1;
			mic++;
			len--;
		}
		else
		{
			int			l = pg_mic_mblen(mic);

			if (len < l)
				report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
										len);
			if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
				report_untranslatable_char(PG_MULE_INTERNAL, encoding,
										   (const char *) mic, len);
			*p++ = mic[1];
			mic += 2;
			len -= 2;
		}
	}
	*p = '\0';
}
Exemple #5
0
/*
 * Verify mbstr to make sure that it is validly encoded in the specified
 * encoding.
 *
 * mbstr is not necessarily zero terminated; length of mbstr is
 * specified by len.
 *
 * If OK, return TRUE.	If a problem is found, return FALSE when noError is
 * true; when noError is false, ereport() a descriptive message.
 */
bool
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
{
	mbverifier	mbverify;

	Assert(PG_VALID_ENCODING(encoding));

	/*
	 * In single-byte encodings, we need only reject nulls (\0).
	 */
	if (pg_encoding_max_length(encoding) <= 1)
	{
		const char *nullpos = memchr(mbstr, 0, len);

		if (nullpos == NULL)
			return true;
		if (noError)
			return false;
		report_invalid_encoding(encoding, nullpos, 1);
	}

	/* fetch function pointer just once */
	mbverify = pg_wchar_table[encoding].mbverify;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*mbstr))
		{
			if (*mbstr != '\0')
			{
				mbstr++;
				len--;
				continue;
			}
			if (noError)
				return false;
			report_invalid_encoding(encoding, mbstr, len);
		}

		l = (*mbverify) ((const unsigned char *) mbstr, len);

		if (l < 0)
		{
			if (noError)
				return false;
			report_invalid_encoding(encoding, mbstr, len);
		}

		mbstr += l;
		len -= l;
	}
	return true;
}
Exemple #6
0
/*
 * EUC_TW ---> MIC
 */
static void
euc_tw2mic(const unsigned char *euc, unsigned char *p, int len)
{
	int			c1;
	int			l;

	while (len > 0)
	{
		c1 = *euc;
		if (IS_HIGHBIT_SET(c1))
		{
			l = pg_encoding_verifymb(PG_EUC_TW, (const char *) euc, len);
			if (l < 0)
				report_invalid_encoding(PG_EUC_TW,
										(const char *) euc, len);
			if (c1 == SS2)
			{
				c1 = euc[1];	/* plane No. */
				if (c1 == 0xa1)
					*p++ = LC_CNS11643_1;
				else if (c1 == 0xa2)
					*p++ = LC_CNS11643_2;
				else
				{
					/* other planes are MULE private charsets */
					*p++ = LCPRV2_B;
					*p++ = c1 - 0xa3 + LC_CNS11643_3;
				}
				*p++ = euc[2];
				*p++ = euc[3];
			}
			else
			{					/* CNS11643-1 */
				*p++ = LC_CNS11643_1;
				*p++ = c1;
				*p++ = euc[1];
			}
			euc += l;
			len -= l;
		}
		else
		{						/* should be ASCII */
			if (c1 == 0)
				report_invalid_encoding(PG_EUC_TW,
										(const char *) euc, len);
			*p++ = c1;
			euc++;
			len--;
		}
	}
	*p = '\0';
}
Exemple #7
0
/*
 * MIC ---> EUC_TW
 */
static void
mic2euc_tw(const unsigned char *mic, unsigned char *p, int len)
{
	int			c1;
	int			l;

	while (len > 0)
	{
		c1 = *mic;
		if (!IS_HIGHBIT_SET(c1))
		{
			/* ASCII */
			if (c1 == 0)
				report_invalid_encoding(PG_MULE_INTERNAL,
										(const char *) mic, len);
			*p++ = c1;
			mic++;
			len--;
			continue;
		}
		l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
		if (l < 0)
			report_invalid_encoding(PG_MULE_INTERNAL,
									(const char *) mic, len);
		if (c1 == LC_CNS11643_1)
		{
			*p++ = mic[1];
			*p++ = mic[2];
		}
		else if (c1 == LC_CNS11643_2)
		{
			*p++ = SS2;
			*p++ = 0xa2;
			*p++ = mic[1];
			*p++ = mic[2];
		}
		else if (c1 == 0x9d &&
				 mic[1] >= LC_CNS11643_3 && mic[1] <= LC_CNS11643_7)
		{						/* LCPRV2? */
			*p++ = SS2;
			*p++ = mic[1] - LC_CNS11643_3 + 0xa3;
			*p++ = mic[2];
			*p++ = mic[3];
		}
		else
			report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_TW,
									   (const char *) mic, len);
		mic += l;
		len -= l;
	}
	*p = '\0';
}
Datum
utf8_to_iso8859_1(PG_FUNCTION_ARGS)
{
	unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
	unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
	int			len = PG_GETARG_INT32(4);
	unsigned short c,
				c1;

	Assert(PG_GETARG_INT32(0) == PG_UTF8);
	Assert(PG_GETARG_INT32(1) == PG_LATIN1);
	Assert(len >= 0);

	while (len > 0)
	{
		c = *src;
		if (c == 0)
			report_invalid_encoding(PG_UTF8, (const char *) src, len);
		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(c))
		{
			*dest++ = c;
			src++;
			len--;
		}
		else
		{
			int			l = pg_utf_mblen(src);

			if (l > len || !pg_utf8_islegal(src, l))
				report_invalid_encoding(PG_UTF8, (const char *) src, len);
			if (l != 2)
				report_untranslatable_char(PG_UTF8, PG_LATIN1,
										   (const char *) src, len);
			c1 = src[1] & 0x3f;
			c = ((c & 0x1f) << 6) | c1;
			if (c >= 0x80 && c <= 0xff)
			{
				*dest++ = (unsigned char) c;
				src += 2;
				len -= 2;
			}
			else
				report_untranslatable_char(PG_UTF8, PG_LATIN1,
										   (const char *) src, len);
		}
	}
	*dest = '\0';

	PG_RETURN_VOID();
}
Exemple #9
0
/*
 * latin2mic_with_table: a generic single byte charset encoding
 * conversion from a local charset to the mule internal code.
 *
 * l points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 * tab holds conversion entries for the local charset
 * starting from 128 (0x80). each entry in the table
 * holds the corresponding code point for the mule internal code.
 */
void
latin2mic_with_table(const unsigned char *l,
					 unsigned char *p,
					 int len,
					 int lc,
					 int encoding,
					 const unsigned char *tab)
{
	unsigned char c1,
				c2;

	while (len > 0)
	{
		c1 = *l;
		if (c1 == 0)
			report_invalid_encoding(encoding, (const char *) l, len);
		if (!IS_HIGHBIT_SET(c1))
			*p++ = c1;
		else
		{
			c2 = tab[c1 - HIGHBIT];
			if (c2)
			{
				*p++ = lc;
				*p++ = c2;
			}
			else
				report_untranslatable_char(encoding, PG_MULE_INTERNAL,
										   (const char *) l, len);
		}
		l++;
		len--;
	}
	*p = '\0';
}
Exemple #10
0
/*
 * local2local: a generic single byte charset encoding
 * conversion between two ASCII-superset encodings.
 *
 * l points to the source string of length len
 * p is the output area (must be large enough!)
 * src_encoding is the PG identifier for the source encoding
 * dest_encoding is the PG identifier for the target encoding
 * tab holds conversion entries for the source charset
 * starting from 128 (0x80). each entry in the table holds the corresponding
 * code point for the target charset, or 0 if there is no equivalent code.
 */
void
local2local(const unsigned char *l,
			unsigned char *p,
			int len,
			int src_encoding,
			int dest_encoding,
			const unsigned char *tab)
{
	unsigned char c1,
				c2;

	while (len > 0)
	{
		c1 = *l;
		if (c1 == 0)
			report_invalid_encoding(src_encoding, (const char *) l, len);
		if (!IS_HIGHBIT_SET(c1))
			*p++ = c1;
		else
		{
			c2 = tab[c1 - HIGHBIT];
			if (c2)
				*p++ = c2;
			else
				report_untranslatable_char(src_encoding, dest_encoding,
										   (const char *) l, len);
		}
		l++;
		len--;
	}
	*p = '\0';
}
Datum
iso8859_1_to_utf8(PG_FUNCTION_ARGS)
{
	unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
	unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
	int			len = PG_GETARG_INT32(4);
	unsigned short c;

	Assert(PG_GETARG_INT32(0) == PG_LATIN1);
	Assert(PG_GETARG_INT32(1) == PG_UTF8);
	Assert(len >= 0);

	while (len > 0)
	{
		c = *src;
		if (c == 0)
			report_invalid_encoding(PG_LATIN1, (const char *) src, len);
		if (!IS_HIGHBIT_SET(c))
			*dest++ = c;
		else
		{
			*dest++ = (c >> 6) | 0xc0;
			*dest++ = (c & 0x003f) | HIGHBIT;
		}
		src++;
		len--;
	}
	*dest = '\0';

	PG_RETURN_VOID();
}
Exemple #12
0
/*
 * Big5 ---> MIC
 */
static void
big52mic(const unsigned char *big5, unsigned char *p, int len)
{
	unsigned short c1;
	unsigned short big5buf,
				cnsBuf;
	unsigned char lc;
	int			l;

	while (len > 0)
	{
		c1 = *big5;
		if (!IS_HIGHBIT_SET(c1))
		{
			/* ASCII */
			if (c1 == 0)
				report_invalid_encoding(PG_BIG5,
										(const char *) big5, len);
			*p++ = c1;
			big5++;
			len--;
			continue;
		}
		l = pg_encoding_verifymb(PG_BIG5, (const char *) big5, len);
		if (l < 0)
			report_invalid_encoding(PG_BIG5,
									(const char *) big5, len);
		big5buf = (c1 << 8) | big5[1];
		cnsBuf = BIG5toCNS(big5buf, &lc);
		if (lc != 0)
		{
			if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4)
			{
				*p++ = 0x9d;	/* LCPRV2 */
			}
			*p++ = lc;			/* Plane No. */
			*p++ = (cnsBuf >> 8) & 0x00ff;
			*p++ = cnsBuf & 0x00ff;
		}
		else
			report_untranslatable_char(PG_BIG5, PG_MULE_INTERNAL,
									   (const char *) big5, len);
		big5 += l;
		len -= l;
	}
Exemple #13
0
/*
 * mic2latin_with_table: a generic single byte charset encoding
 * conversion from the mule internal code to a local charset.
 *
 * mic points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 * tab holds conversion entries for the mule internal code's
 * second byte, starting from 128 (0x80). each entry in the table
 * holds the corresponding code point for the local charset.
 */
void
mic2latin_with_table(const unsigned char *mic,
					 unsigned char *p,
					 int len,
					 int lc,
					 int encoding,
					 const unsigned char *tab)
{
	unsigned char c1,
				c2;

	while (len > 0)
	{
		c1 = *mic;
		if (c1 == 0)
			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
		if (!IS_HIGHBIT_SET(c1))
		{
			/* easy for ASCII */
			*p++ = c1;
			mic++;
			len--;
		}
		else
		{
			int			l = pg_mic_mblen(mic);

			if (len < l)
				report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
										len);
			if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
				(c2 = tab[mic[1] - HIGHBIT]) == 0)
			{
				report_untranslatable_char(PG_MULE_INTERNAL, encoding,
										   (const char *) mic, len);
				break;			/* keep compiler quiet */
			}
			*p++ = c2;
			mic += 2;
			len -= 2;
		}
	}
	*p = '\0';
}
Exemple #14
0
/*
 * MIC ---> EUC_KR
 */
static void
mic2euc_kr(const unsigned char *mic, unsigned char *p, int len)
{
	int			c1;
	int			l;

	while (len > 0)
	{
		c1 = *mic;
		if (!IS_HIGHBIT_SET(c1))
		{
			/* ASCII */
			if (c1 == 0)
				report_invalid_encoding(PG_MULE_INTERNAL,
										(const char *) mic, len);
			*p++ = c1;
			mic++;
			len--;
			continue;
		}
		l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
		if (l < 0)
			report_invalid_encoding(PG_MULE_INTERNAL,
									(const char *) mic, len);
		if (c1 == LC_KS5601)
		{
			*p++ = mic[1];
			*p++ = mic[2];
		}
		else
			report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_KR,
									   (const char *) mic, len);
		mic += l;
		len -= l;
	}
	*p = '\0';
}
Exemple #15
0
/*
 * ASCII ---> MIC
 *
 * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
 * characters, here we must take a hard line because we don't know
 * the appropriate MIC equivalent.
 */
void
pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
{
	int			c1;

	while (len > 0)
	{
		c1 = *l;
		if (c1 == 0 || IS_HIGHBIT_SET(c1))
			report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
		*p++ = c1;
		l++;
		len--;
	}
	*p = '\0';
}
Exemple #16
0
/*
 * LATINn ---> MIC when the charset's local codes map directly to MIC
 *
 * l points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	int			c1;

	while (len > 0)
	{
		c1 = *l;
		if (c1 == 0)
			report_invalid_encoding(encoding, (const char *) l, len);
		if (IS_HIGHBIT_SET(c1))
			*p++ = lc;
		*p++ = c1;
		l++;
		len--;
	}
	*p = '\0';
}
Exemple #17
0
/*
 * local code ---> UTF8
 *
 * iso: input string in local encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * utf: pointer to the output area (must be large enough!)
		  (output string will be null-terminated)
 * map: conversion map for single characters
 * mapsize: number of entries in the conversion map
 * cmap: conversion map for combined characters
 *		  (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *		  (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *		  (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 *
 * For each character, the map is consulted first; if no match, the cmap
 * (if provided) is consulted next; if still no match, the conv_func
 * (if provided) is applied.  An error is raised if no match is found.
 *
 * See pg_wchar.h for more details about the data structures used here.
 */
void
LocalToUtf(const unsigned char *iso, int len,
		   unsigned char *utf,
		   const pg_local_to_utf *map, int mapsize,
		   const pg_local_to_utf_combined *cmap, int cmapsize,
		   utf_local_conversion_func conv_func,
		   int encoding)
{
	uint32		iiso;
	int			l;
	const pg_local_to_utf *p;
	const pg_local_to_utf_combined *cp;

	if (!PG_VALID_ENCODING(encoding))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid encoding number: %d", encoding)));

	for (; len > 0; len -= l)
	{
		/* "break" cases all represent errors */
		if (*iso == '\0')
			break;

		if (!IS_HIGHBIT_SET(*iso))
		{
			/* ASCII case is easy, assume it's one-to-one conversion */
			*utf++ = *iso++;
			l = 1;
			continue;
		}

		l = pg_encoding_verifymb(encoding, (const char *) iso, len);
		if (l < 0)
			break;

		/* collect coded char of length l */
		if (l == 1)
			iiso = *iso++;
		else if (l == 2)
		{
			iiso = *iso++ << 8;
			iiso |= *iso++;
		}
		else if (l == 3)
		{
			iiso = *iso++ << 16;
			iiso |= *iso++ << 8;
			iiso |= *iso++;
		}
		else if (l == 4)
		{
			iiso = *iso++ << 24;
			iiso |= *iso++ << 16;
			iiso |= *iso++ << 8;
			iiso |= *iso++;
		}
		else
		{
			elog(ERROR, "unsupported character length %d", l);
			iiso = 0;			/* keep compiler quiet */
		}

		/* First check ordinary map */
		p = bsearch(&iiso, map, mapsize,
					sizeof(pg_local_to_utf), compare2);

		if (p)
		{
			utf = store_coded_char(utf, p->utf);
			continue;
		}

		/* If there's a combined character map, try that */
		if (cmap)
		{
			cp = bsearch(&iiso, cmap, cmapsize,
						 sizeof(pg_local_to_utf_combined), compare4);

			if (cp)
			{
				utf = store_coded_char(utf, cp->utf1);
				utf = store_coded_char(utf, cp->utf2);
				continue;
			}
		}

		/* if there's a conversion function, try that */
		if (conv_func)
		{
			uint32		converted = (*conv_func) (iiso);

			if (converted)
			{
				utf = store_coded_char(utf, converted);
				continue;
			}
		}

		/* failed to translate this character */
		report_untranslatable_char(encoding, PG_UTF8,
								   (const char *) (iso - l), len);
	}

	/* if we broke out of loop early, must be invalid input */
	if (len > 0)
		report_invalid_encoding(encoding, (const char *) iso, len);

	*utf = '\0';
}
Exemple #18
0
/* ----------
 * conv_proc(
 *		INTEGER,	-- source encoding id
 *		INTEGER,	-- destination encoding id
 *		CSTRING,	-- source string (null terminated C string)
 *		CSTRING,	-- destination string (null terminated C string)
 *		INTEGER		-- source string length
 * ) returns VOID;
 * ----------
 */
Datum
sjis_eudc_to_utf8(PG_FUNCTION_ARGS)
{
	unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
	unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
	unsigned char *p;
	unsigned char *fallback_character = NULL;
	int			len = PG_GETARG_INT32(4);
	int			sjis_len;
	int			clen;

	CHECK_ENCODING_CONVERSION_ARGS(PG_SJIS, PG_UTF8);

	if (sjis_to_utf8 == NULL)
		sjis_to_utf8 = load_external_function(
			"utf8_and_sjis", "sjis_to_utf8", true, NULL);

	*dest = '\0';
	p = src;
	sjis_len = 0;
	for (; len > 0; len -= clen)
	{
		const unsigned char *c = p + sjis_len;

		if (c[0] == '\0')
			report_invalid_encoding(PG_SJIS, (const char *) p + sjis_len, len);

		if (c[0] >= 0xf0 && c[0] <= 0xf9 && len >= 2 && ISSJISTAIL(c[1]))
		{
			int	ucs;
			int	m;
			int	n;

			clen = 2;

			/* SJIS to UTF8 */
			if (sjis_len > 0)
			{
				DirectFunctionCall5(sjis_to_utf8, PG_SJIS, PG_UTF8,
									CStringGetDatum(p), CStringGetDatum(dest),
									sjis_len);
				dest = dest + strlen((char *) dest);
				p += sjis_len;
				sjis_len = 0;
			}
			p += clen;

			elog(eudc_log_level,
				"eudc character found: %02x%02x in SJIS to UTF8 conversion",
				c[0], c[1]);

			/* SJIS EUDC to UTF8 */
			if (eudc_fallback_character && eudc_fallback_character[0])
			{
				/* map to fallback character */
				int		i;

				if (fallback_character == NULL)
				{
					fallback_character = pg_do_encoding_conversion(
						(unsigned char *) eudc_fallback_character,
						strlen(eudc_fallback_character),
						GetDatabaseEncoding(),
						PG_UTF8);
				}

				for (i = 0; fallback_character[i]; i++)
					*dest++ = fallback_character[i];
			}
			else
			{
				/* linear mapping */
				n = c[0] - 0xf0;
				m = c[1] - 0x40;

				if (m >= 0x40)
					m--;

				ucs = 0xe000 + n * 188 + m;

				*dest++ = (ucs >> 12) | 0xe0;
				*dest++ = (ucs & 0x0fc0) >> 6 | 0x80;
				*dest++ = (ucs & 0x003f) | 0x80;
			}
			*dest = '\0';
		}
		else
		{
Exemple #19
0
/*
 * UTF8 ---> local code
 *
 * utf: input UTF8 string (need not be null-terminated).
 * iso: pointer to the output area (must be large enough!)
 * map: the conversion map.
 * cmap: the conversion map for combined characters.
 *		  (optional)
 * size1: the size of the conversion map.
 * size2: the size of the conversion map for combined characters
 *		  (optional)
 * encoding: the PG identifier for the local encoding.
 * len: length of input string.
 */
void
UtfToLocal(const unsigned char *utf, unsigned char *iso,
		   const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,
		   int size1, int size2, int encoding, int len)
{
	uint32		iutf;
	uint32		cutf[2];
	uint32		code;
	pg_utf_to_local *p;
	pg_utf_to_local_combined *cp;
	int			l;

	for (; len > 0; len -= l)
	{
		/* "break" cases all represent errors */
		if (*utf == '\0')
			break;

		l = pg_utf_mblen(utf);

		if (len < l)
			break;

		if (!pg_utf8_islegal(utf, l))
			break;

		if (l == 1)
		{
			/* ASCII case is easy */
			*iso++ = *utf++;
			continue;
		}
		else if (l == 2)
		{
			iutf = *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 3)
		{
			iutf = *utf++ << 16;
			iutf |= *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 4)
		{
			iutf = *utf++ << 24;
			iutf |= *utf++ << 16;
			iutf |= *utf++ << 8;
			iutf |= *utf++;
		}

		/*
		 * first, try with combined map if possible
		 */
		if (cmap && len > l)
		{
			const unsigned char *utf_save = utf;
			int			len_save = len;
			int			l_save = l;

			len -= l;

			l = pg_utf_mblen(utf);
			if (len < l)
				break;

			if (!pg_utf8_islegal(utf, l))
				break;

			cutf[0] = iutf;

			if (l == 1)
			{
				if (len_save > 1)
				{
					p = bsearch(&cutf[0], map, size1,
								sizeof(pg_utf_to_local), compare1);
					if (p == NULL)
						report_untranslatable_char(PG_UTF8, encoding,
							   (const char *) (utf_save - l_save), len_save);
					iso = set_iso_code(iso, p->code);
				}

				/* ASCII case is easy */
				*iso++ = *utf++;
				continue;
			}
			else if (l == 2)
			{
				iutf = *utf++ << 8;
				iutf |= *utf++;
			}
			else if (l == 3)
			{
				iutf = *utf++ << 16;
				iutf |= *utf++ << 8;
				iutf |= *utf++;
			}
			else if (l == 4)
			{
				iutf = *utf++ << 24;
				iutf |= *utf++ << 16;
				iutf |= *utf++ << 8;
				iutf |= *utf++;
			}

			cutf[1] = iutf;
			cp = bsearch(cutf, cmap, size2,
						 sizeof(pg_utf_to_local_combined), compare3);
			if (cp)
				code = cp->code;
			else
			{
				/* not found in combined map. try with ordinary map */
				p = bsearch(&cutf[0], map, size1,
							sizeof(pg_utf_to_local), compare1);
				if (p == NULL)
					report_untranslatable_char(PG_UTF8, encoding,
							   (const char *) (utf_save - l_save), len_save);
				iso = set_iso_code(iso, p->code);

				p = bsearch(&cutf[1], map, size1,
							sizeof(pg_utf_to_local), compare1);
				if (p == NULL)
					report_untranslatable_char(PG_UTF8, encoding,
											   (const char *) (utf - l), len);
				code = p->code;
			}
		}
		else	/* no cmap or no remaining data */
		{
			p = bsearch(&iutf, map, size1,
						sizeof(pg_utf_to_local), compare1);
			if (p == NULL)
				report_untranslatable_char(PG_UTF8, encoding,
										   (const char *) (utf - l), len);
			code = p->code;
		}
		iso = set_iso_code(iso, code);
	}

	if (len > 0)
		report_invalid_encoding(PG_UTF8, (const char *) utf, len);

	*iso = '\0';
}
Exemple #20
0
/*
 * UTF8 ---> local code
 *
 * utf: input string in UTF8 encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * iso: pointer to the output area (must be large enough!)
		  (output string will be null-terminated)
 * map: conversion map for single characters
 * mapsize: number of entries in the conversion map
 * cmap: conversion map for combined characters
 *		  (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *		  (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *		  (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 *
 * For each character, the cmap (if provided) is consulted first; if no match,
 * the map is consulted next; if still no match, the conv_func (if provided)
 * is applied.  An error is raised if no match is found.
 *
 * See pg_wchar.h for more details about the data structures used here.
 */
void
UtfToLocal(const unsigned char *utf, int len,
		   unsigned char *iso,
		   const pg_utf_to_local *map, int mapsize,
		   const pg_utf_to_local_combined *cmap, int cmapsize,
		   utf_local_conversion_func conv_func,
		   int encoding)
{
	uint32		iutf;
	int			l;
	const pg_utf_to_local *p;
	const pg_utf_to_local_combined *cp;

	if (!PG_VALID_ENCODING(encoding))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid encoding number: %d", encoding)));

	for (; len > 0; len -= l)
	{
		/* "break" cases all represent errors */
		if (*utf == '\0')
			break;

		l = pg_utf_mblen(utf);
		if (len < l)
			break;

		if (!pg_utf8_islegal(utf, l))
			break;

		if (l == 1)
		{
			/* ASCII case is easy, assume it's one-to-one conversion */
			*iso++ = *utf++;
			continue;
		}

		/* collect coded char of length l */
		if (l == 2)
		{
			iutf = *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 3)
		{
			iutf = *utf++ << 16;
			iutf |= *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 4)
		{
			iutf = *utf++ << 24;
			iutf |= *utf++ << 16;
			iutf |= *utf++ << 8;
			iutf |= *utf++;
		}
		else
		{
			elog(ERROR, "unsupported character length %d", l);
			iutf = 0;			/* keep compiler quiet */
		}

		/* First, try with combined map if possible */
		if (cmap && len > l)
		{
			const unsigned char *utf_save = utf;
			int			len_save = len;
			int			l_save = l;

			/* collect next character, same as above */
			len -= l;

			l = pg_utf_mblen(utf);
			if (len < l)
				break;

			if (!pg_utf8_islegal(utf, l))
				break;

			/* We assume ASCII character cannot be in combined map */
			if (l > 1)
			{
				uint32		iutf2;
				uint32		cutf[2];

				if (l == 2)
				{
					iutf2 = *utf++ << 8;
					iutf2 |= *utf++;
				}
				else if (l == 3)
				{
					iutf2 = *utf++ << 16;
					iutf2 |= *utf++ << 8;
					iutf2 |= *utf++;
				}
				else if (l == 4)
				{
					iutf2 = *utf++ << 24;
					iutf2 |= *utf++ << 16;
					iutf2 |= *utf++ << 8;
					iutf2 |= *utf++;
				}
				else
				{
					elog(ERROR, "unsupported character length %d", l);
					iutf2 = 0;	/* keep compiler quiet */
				}

				cutf[0] = iutf;
				cutf[1] = iutf2;

				cp = bsearch(cutf, cmap, cmapsize,
							 sizeof(pg_utf_to_local_combined), compare3);

				if (cp)
				{
					iso = store_coded_char(iso, cp->code);
					continue;
				}
			}

			/* fail, so back up to reprocess second character next time */
			utf = utf_save;
			len = len_save;
			l = l_save;
		}

		/* Now check ordinary map */
		p = bsearch(&iutf, map, mapsize,
					sizeof(pg_utf_to_local), compare1);

		if (p)
		{
			iso = store_coded_char(iso, p->code);
			continue;
		}

		/* if there's a conversion function, try that */
		if (conv_func)
		{
			uint32		converted = (*conv_func) (iutf);

			if (converted)
			{
				iso = store_coded_char(iso, converted);
				continue;
			}
		}

		/* failed to translate this character */
		report_untranslatable_char(PG_UTF8, encoding,
								   (const char *) (utf - l), len);
	}

	/* if we broke out of loop early, must be invalid input */
	if (len > 0)
		report_invalid_encoding(PG_UTF8, (const char *) utf, len);

	*iso = '\0';
}