Ejemplo n.º 1
0
static void
read_dictionary(DictSyn *d, char *filename)
{
	char	   *real_filename = get_tsearch_config_filename(filename, "rules");
	tsearch_readline_state trst;
	char	   *line;
	int			cur = 0;

	if (!tsearch_readline_begin(&trst, real_filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open synonym file \"%s\": %m",
						real_filename)));

	while ((line = tsearch_readline(&trst)) != NULL)
	{
		char	   *value;
		char	   *key;
		char	   *end = NULL;

		if (*line == '\0')
			continue;

		value = lowerstr(line);
		pfree(line);

		key = find_word(value, &end);
		if (!key)
		{
			pfree(value);
			continue;
		}

		if (cur == d->len)
		{
			d->len = (d->len > 0) ? 2 * d->len : 16;
			if (d->syn)
				d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
			else
				d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
		}

		d->syn[cur].key = pnstrdup(key, end - key);
		d->syn[cur].value = value;

		cur++;
	}

	tsearch_readline_end(&trst);

	d->len = cur;
	if (cur > 1)
		qsort(d->syn, d->len, sizeof(Syn), compare_syn);

	pfree(real_filename);
}
Ejemplo n.º 2
0
/*
 * initSuffixTree  - create suffix tree from file. Function converts
 * UTF8-encoded file into current encoding.
 */
static SuffixChar *
initSuffixTree(char *filename)
{
	SuffixChar *volatile rootSuffixTree = NULL;
	MemoryContext ccxt = CurrentMemoryContext;
	tsearch_readline_state trst;
	volatile bool skip;

	filename = get_tsearch_config_filename(filename, "rules");
	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open unaccent file \"%s\": %m",
						filename)));

	do
	{
		char		src[4096];
		char		trg[4096];
		int			srclen;
		int			trglen;
		char	   *line = NULL;

		skip = true;

		PG_TRY();
		{
			/*
			 * pg_do_encoding_conversion() (called by tsearch_readline()) will
			 * emit exception if it finds untranslatable characters in current
			 * locale. We just skip such characters.
			 */
			while ((line = tsearch_readline(&trst)) != NULL)
			{
				if (sscanf(line, "%s\t%s\n", src, trg) != 2)
					continue;

				srclen = strlen(src);
				trglen = strlen(trg);

				rootSuffixTree = placeChar(rootSuffixTree,
										   (unsigned char *) src, srclen,
										   trg, trglen);
				skip = false;
				pfree(line);
			}
		}
		PG_CATCH();
		{
			ErrorData  *errdata;
			MemoryContext ecxt;

			ecxt = MemoryContextSwitchTo(ccxt);
			errdata = CopyErrorData();
			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
			{
				FlushErrorState();
			}
			else
			{
				MemoryContextSwitchTo(ecxt);
				PG_RE_THROW();
			}
		}
		PG_END_TRY();
	}
	while (skip);

	tsearch_readline_end(&trst);

	return rootSuffixTree;
}
Ejemplo n.º 3
0
static void
read_dictionary(DictSyn *d, char *filename)
{
	char	   *real_filename = get_tsearch_config_filename(filename, "rules");
	tsearch_readline_state trst;
	char	   *line;
	int			cur = 0;

	if (!tsearch_readline_begin(&trst, real_filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open synonym file \"%s\": %m",
						real_filename)));

	while ((line = tsearch_readline(&trst)) != NULL)
	{
		char	   *value;
		char	   *key;
		char	   *pos;
		char	   *end;

		if (*line == '\0')
			continue;

		value = lowerstr(line);
		pfree(line);

		pos = value;
		while ((key = find_word(pos, &end)) != NULL)
		{
			/* Enlarge syn structure if full */
			if (cur == d->len)
			{
				d->len = (d->len > 0) ? 2 * d->len : 16;
				if (d->syn)
					d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
				else
					d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
			}

			/* Save first word only if we will match it */
			if (pos != value || d->matchorig)
			{
				d->syn[cur].key = pnstrdup(key, end - key);
				d->syn[cur].value = pstrdup(value);

				cur++;
			}

			pos = end;

			/* Don't bother scanning synonyms if we will not match them */
			if (!d->matchsynonyms)
				break;
		}

		pfree(value);
	}

	tsearch_readline_end(&trst);

	d->len = cur;
	if (cur > 1)
		qsort(d->syn, d->len, sizeof(Syn), compare_syn);

	pfree(real_filename);
}
Ejemplo n.º 4
0
static void
thesaurusRead(char *filename, DictThesaurus *d)
{
	tsearch_readline_state trst;
	uint16		idsubst = 0;
	bool		useasis = false;
	char	   *line;

	filename = get_tsearch_config_filename(filename, "ths");
	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open thesaurus file \"%s\": %m",
						filename)));

	while ((line = tsearch_readline(&trst)) != NULL)
	{
		char	   *ptr;
		int			state = TR_WAITLEX;
		char	   *beginwrd = NULL;
		uint16		posinsubst = 0;
		uint16		nwrd = 0;

		ptr = line;

		/* is it a comment? */
		while (*ptr && t_isspace(ptr))
			ptr += pg_mblen(ptr);

		if (t_iseq(ptr, '#') || *ptr == '\0' ||
			t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
		{
			pfree(line);
			continue;
		}

		while (*ptr)
		{
			if (state == TR_WAITLEX)
			{
				if (t_iseq(ptr, ':'))
				{
					if (posinsubst == 0)
						ereport(ERROR,
								(errcode(ERRCODE_CONFIG_FILE_ERROR),
								 errmsg("unexpected delimiter")));
					state = TR_WAITSUBS;
				}
				else if (!t_isspace(ptr))
				{
					beginwrd = ptr;
					state = TR_INLEX;
				}
			}
			else if (state == TR_INLEX)
			{
				if (t_iseq(ptr, ':'))
				{
					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
					state = TR_WAITSUBS;
				}
				else if (t_isspace(ptr))
				{
					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
					state = TR_WAITLEX;
				}
			}
			else if (state == TR_WAITSUBS)
			{
				if (t_iseq(ptr, '*'))
				{
					useasis = true;
					state = TR_INSUBS;
					beginwrd = ptr + pg_mblen(ptr);
				}
				else if (t_iseq(ptr, '\\'))
				{
					useasis = false;
					state = TR_INSUBS;
					beginwrd = ptr + pg_mblen(ptr);
				}
				else if (!t_isspace(ptr))
				{
					useasis = false;
					beginwrd = ptr;
					state = TR_INSUBS;
				}
			}
			else if (state == TR_INSUBS)
			{
				if (t_isspace(ptr))
				{
					if (ptr == beginwrd)
						ereport(ERROR,
								(errcode(ERRCODE_CONFIG_FILE_ERROR),
								 errmsg("unexpected end of line or lexeme")));
					addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
					state = TR_WAITSUBS;
				}
			}
			else
				elog(ERROR, "unrecognized thesaurus state: %d", state);

			ptr += pg_mblen(ptr);
		}

		if (state == TR_INSUBS)
		{
			if (ptr == beginwrd)
				ereport(ERROR,
						(errcode(ERRCODE_CONFIG_FILE_ERROR),
						 errmsg("unexpected end of line or lexeme")));
			addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
		}

		idsubst++;

		if (!(nwrd && posinsubst))
			ereport(ERROR,
					(errcode(ERRCODE_CONFIG_FILE_ERROR),
					 errmsg("unexpected end of line")));

		pfree(line);
	}

	d->nsubst = idsubst;

	tsearch_readline_end(&trst);
}
Ejemplo n.º 5
0
/*
 * Reads a stop-word file. Each word is run through 'wordop'
 * function, if given.	wordop may either modify the input in-place,
 * or palloc a new version.
 */
void
readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *))
{
	char	  **stop = NULL;

	s->len = 0;
	if (fname && *fname)
	{
		char	   *filename = get_tsearch_config_filename(fname, "stop");
		tsearch_readline_state trst;
		char	   *line;
		int			reallen = 0;

		if (!tsearch_readline_begin(&trst, filename))
			ereport(ERROR,
					(errcode(ERRCODE_CONFIG_FILE_ERROR),
					 errmsg("could not open stop-word file \"%s\": %m",
							filename)));

		while ((line = tsearch_readline(&trst)) != NULL)
		{
			char	   *pbuf = line;

			/* Trim trailing space */
			while (*pbuf && !t_isspace(pbuf))
				pbuf += pg_mblen(pbuf);
			*pbuf = '\0';

			/* Skip empty lines */
			if (*line == '\0')
			{
				pfree(line);
				continue;
			}

			if (s->len >= reallen)
			{
				if (reallen == 0)
				{
					reallen = 64;
					stop = (char **) palloc(sizeof(char *) * reallen);
				}
				else
				{
					reallen *= 2;
					stop = (char **) repalloc((void *) stop,
											  sizeof(char *) * reallen);
				}
			}

			if (wordop)
			{
				stop[s->len] = wordop(line);
				if (stop[s->len] != line)
					pfree(line);
			}
			else
				stop[s->len] = line;

			(s->len)++;
		}

		tsearch_readline_end(&trst);
		pfree(filename);
	}

	s->stop = stop;

	/* Sort to allow binary searching */
	if (s->stop && s->len > 0)
		qsort(s->stop, s->len, sizeof(char *), pg_qsort_strcmp);
}
Ejemplo n.º 6
0
/*
 * initTrie  - create trie from file.
 *
 * Function converts UTF8-encoded file into current encoding.
 */
static TrieChar *
initTrie(char *filename)
{
	TrieChar   *volatile rootTrie = NULL;
	MemoryContext ccxt = CurrentMemoryContext;
	tsearch_readline_state trst;
	volatile bool skip;

	filename = get_tsearch_config_filename(filename, "rules");
	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open unaccent file \"%s\": %m",
						filename)));

	do
	{
		/*
		 * pg_do_encoding_conversion() (called by tsearch_readline()) will
		 * emit exception if it finds untranslatable characters in current
		 * locale. We just skip such lines, continuing with the next.
		 */
		skip = true;

		PG_TRY();
		{
			char	   *line;

			while ((line = tsearch_readline(&trst)) != NULL)
			{
				/*----------
				 * The format of each line must be "src" or "src trg", where
				 * src and trg are sequences of one or more non-whitespace
				 * characters, separated by whitespace.  Whitespace at start
				 * or end of line is ignored.  If trg is omitted, an empty
				 * string is used as the replacement.
				 *
				 * We use a simple state machine, with states
				 *	0	initial (before src)
				 *	1	in src
				 *	2	in whitespace after src
				 *	3	in trg
				 *	4	in whitespace after trg
				 *	-1	syntax error detected
				 *----------
				 */
				int			state;
				char	   *ptr;
				char	   *src = NULL;
				char	   *trg = NULL;
				int			ptrlen;
				int			srclen = 0;
				int			trglen = 0;

				state = 0;
				for (ptr = line; *ptr; ptr += ptrlen)
				{
					ptrlen = pg_mblen(ptr);
					/* ignore whitespace, but end src or trg */
					if (t_isspace(ptr))
					{
						if (state == 1)
							state = 2;
						else if (state == 3)
							state = 4;
						continue;
					}
					switch (state)
					{
						case 0:
							/* start of src */
							src = ptr;
							srclen = ptrlen;
							state = 1;
							break;
						case 1:
							/* continue src */
							srclen += ptrlen;
							break;
						case 2:
							/* start of trg */
							trg = ptr;
							trglen = ptrlen;
							state = 3;
							break;
						case 3:
							/* continue trg */
							trglen += ptrlen;
							break;
						default:
							/* bogus line format */
							state = -1;
							break;
					}
				}

				if (state == 1 || state == 2)
				{
					/* trg was omitted, so use "" */
					trg = "";
					trglen = 0;
				}

				if (state > 0)
					rootTrie = placeChar(rootTrie,
										 (unsigned char *) src, srclen,
										 trg, trglen);
				else if (state < 0)
					ereport(WARNING,
							(errcode(ERRCODE_CONFIG_FILE_ERROR),
							 errmsg("invalid syntax: more than two strings in unaccent rule")));

				pfree(line);
			}
			skip = false;
		}
		PG_CATCH();
		{
			ErrorData  *errdata;
			MemoryContext ecxt;

			ecxt = MemoryContextSwitchTo(ccxt);
			errdata = CopyErrorData();
			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
			{
				FlushErrorState();
			}
			else
			{
				MemoryContextSwitchTo(ecxt);
				PG_RE_THROW();
			}
		}
		PG_END_TRY();
	}
	while (skip);

	tsearch_readline_end(&trst);

	return rootTrie;
}
Ejemplo n.º 7
0
/*
 * initSuffixTree  - create suffix tree from file. Function converts
 * UTF8-encoded file into current encoding.
 */
static SuffixChar *
initSuffixTree(char *filename)
{
	SuffixChar *volatile rootSuffixTree = NULL;
	MemoryContext ccxt = CurrentMemoryContext;
	tsearch_readline_state trst;
	volatile bool skip;

	filename = get_tsearch_config_filename(filename, "rules");
	if (!tsearch_readline_begin(&trst, filename))
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open unaccent file \"%s\": %m",
						filename)));

	do
	{
		/*
		 * pg_do_encoding_conversion() (called by tsearch_readline()) will
		 * emit exception if it finds untranslatable characters in current
		 * locale. We just skip such lines, continuing with the next.
		 */
		skip = true;

		PG_TRY();
		{
			char	   *line;

			while ((line = tsearch_readline(&trst)) != NULL)
			{
				/*
				 * The format of each line must be "src trg" where src and trg
				 * are sequences of one or more non-whitespace characters,
				 * separated by whitespace.  Whitespace at start or end of
				 * line is ignored.
				 */
				int			state;
				char	   *ptr;
				char	   *src = NULL;
				char	   *trg = NULL;
				int			ptrlen;
				int			srclen = 0;
				int			trglen = 0;

				state = 0;
				for (ptr = line; *ptr; ptr += ptrlen)
				{
					ptrlen = pg_mblen(ptr);
					/* ignore whitespace, but end src or trg */
					if (t_isspace(ptr))
					{
						if (state == 1)
							state = 2;
						else if (state == 3)
							state = 4;
						continue;
					}
					switch (state)
					{
						case 0:
							/* start of src */
							src = ptr;
							srclen = ptrlen;
							state = 1;
							break;
						case 1:
							/* continue src */
							srclen += ptrlen;
							break;
						case 2:
							/* start of trg */
							trg = ptr;
							trglen = ptrlen;
							state = 3;
							break;
						case 3:
							/* continue trg */
							trglen += ptrlen;
							break;
						default:
							/* bogus line format */
							state = -1;
							break;
					}
				}

				if (state >= 3)
					rootSuffixTree = placeChar(rootSuffixTree,
											   (unsigned char *) src, srclen,
											   trg, trglen);

				pfree(line);
			}
			skip = false;
		}
		PG_CATCH();
		{
			ErrorData  *errdata;
			MemoryContext ecxt;

			ecxt = MemoryContextSwitchTo(ccxt);
			errdata = CopyErrorData();
			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
			{
				FlushErrorState();
			}
			else
			{
				MemoryContextSwitchTo(ecxt);
				PG_RE_THROW();
			}
		}
		PG_END_TRY();
	}
	while (skip);

	tsearch_readline_end(&trst);

	return rootSuffixTree;
}