/*
 * Get next token from string being parsed. Returns true if successful,
 * false if end of input string is reached.  On success, these output
 * parameters are filled in:
 *
 * *strval		pointer to token
 * *lenval		length of *strval
 * *pos_ptr		pointer to a palloc'd array of positions and weights
 *				associated with the token. If the caller is not interested
 *				in the information, NULL can be supplied. Otherwise
 *				the caller is responsible for pfreeing the array.
 * *poslen		number of elements in *pos_ptr
 * *endptr		scan resumption point
 *
 * Pass NULL for unwanted output parameters.
 */
bool
gettoken_tsvector(TSVectorParseState state,
				  char **strval, int *lenval,
				  WordEntryPos **pos_ptr, int *poslen,
				  char **endptr)
{
	int			oldstate = 0;
	char	   *curpos = state->word;
	int			statecode = WAITWORD;

	/*
	 * pos is for collecting the comma delimited list of positions followed by
	 * the actual token.
	 */
	WordEntryPos *pos = NULL;
	int			npos = 0;		/* elements of pos used */
	int			posalen = 0;	/* allocated size of pos */

	while (1)
	{
		if (statecode == WAITWORD)
		{
			if (*(state->prsbuf) == '\0')
				return false;
			else if (t_iseq(state->prsbuf, '\''))
				statecode = WAITENDCMPLX;
			else if (t_iseq(state->prsbuf, '\\'))
			{
				statecode = WAITNEXTCHAR;
				oldstate = WAITENDWORD;
			}
			else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
				PRSSYNTAXERROR;
			else if (!t_isspace(state->prsbuf))
			{
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
				statecode = WAITENDWORD;
			}
		}
		else if (statecode == WAITNEXTCHAR)
		{
			if (*(state->prsbuf) == '\0')
				ereport(ERROR,
						(errcode(ERRCODE_SYNTAX_ERROR),
						 errmsg("there is no escaped character: \"%s\"",
								state->bufstart)));
			else
			{
				RESIZEPRSBUF;
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
				Assert(oldstate != 0);
				statecode = oldstate;
			}
		}
		else if (statecode == WAITENDWORD)
		{
			if (t_iseq(state->prsbuf, '\\'))
			{
				statecode = WAITNEXTCHAR;
				oldstate = WAITENDWORD;
			}
			else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
					 (state->oprisdelim && ISOPERATOR(state->prsbuf)))
			{
				RESIZEPRSBUF;
				if (curpos == state->word)
					PRSSYNTAXERROR;
				*(curpos) = '\0';
				RETURN_TOKEN;
			}
			else if (t_iseq(state->prsbuf, ':'))
			{
				if (curpos == state->word)
					PRSSYNTAXERROR;
				*(curpos) = '\0';
				if (state->oprisdelim)
					RETURN_TOKEN;
				else
					statecode = INPOSINFO;
			}
			else
			{
				RESIZEPRSBUF;
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
			}
		}
		else if (statecode == WAITENDCMPLX)
		{
			if (t_iseq(state->prsbuf, '\''))
			{
				statecode = WAITCHARCMPLX;
			}
			else if (t_iseq(state->prsbuf, '\\'))
			{
				statecode = WAITNEXTCHAR;
				oldstate = WAITENDCMPLX;
			}
			else if (*(state->prsbuf) == '\0')
				PRSSYNTAXERROR;
			else
			{
				RESIZEPRSBUF;
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
			}
		}
		else if (statecode == WAITCHARCMPLX)
		{
			if (t_iseq(state->prsbuf, '\''))
			{
				RESIZEPRSBUF;
				COPYCHAR(curpos, state->prsbuf);
				curpos += pg_mblen(state->prsbuf);
				statecode = WAITENDCMPLX;
			}
			else
			{
				RESIZEPRSBUF;
				*(curpos) = '\0';
				if (curpos == state->word)
					PRSSYNTAXERROR;
				if (state->oprisdelim)
				{
					/* state->prsbuf+=pg_mblen(state->prsbuf); */
					RETURN_TOKEN;
				}
				else
					statecode = WAITPOSINFO;
				continue;		/* recheck current character */
			}
		}
		else if (statecode == WAITPOSINFO)
		{
			if (t_iseq(state->prsbuf, ':'))
				statecode = INPOSINFO;
			else
				RETURN_TOKEN;
		}
		else if (statecode == INPOSINFO)
		{
			if (t_isdigit(state->prsbuf))
			{
				if (posalen == 0)
				{
					posalen = 4;
					pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
					npos = 0;
				}
				else if (npos + 1 >= posalen)
				{
					posalen *= 2;
					pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
				}
				npos++;
				WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
				/* we cannot get here in tsquery, so no need for 2 errmsgs */
				if (WEP_GETPOS(pos[npos - 1]) == 0)
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("wrong position info in tsvector: \"%s\"",
									state->bufstart)));
				WEP_SETWEIGHT(pos[npos - 1], 0);
				statecode = WAITPOSDELIM;
			}
			else
				PRSSYNTAXERROR;
		}
		else if (statecode == WAITPOSDELIM)
		{
			if (t_iseq(state->prsbuf, ','))
				statecode = INPOSINFO;
			else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
			{
				if (WEP_GETWEIGHT(pos[npos - 1]))
					PRSSYNTAXERROR;
				WEP_SETWEIGHT(pos[npos - 1], 3);
			}
			else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
			{
				if (WEP_GETWEIGHT(pos[npos - 1]))
					PRSSYNTAXERROR;
				WEP_SETWEIGHT(pos[npos - 1], 2);
			}
			else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
			{
				if (WEP_GETWEIGHT(pos[npos - 1]))
					PRSSYNTAXERROR;
				WEP_SETWEIGHT(pos[npos - 1], 1);
			}
			else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
			{
				if (WEP_GETWEIGHT(pos[npos - 1]))
					PRSSYNTAXERROR;
				WEP_SETWEIGHT(pos[npos - 1], 0);
			}
			else if (t_isspace(state->prsbuf) ||
					 *(state->prsbuf) == '\0')
				RETURN_TOKEN;
			else if (!t_isdigit(state->prsbuf))
				PRSSYNTAXERROR;
		}
		else	/* internal error */
			elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
				 statecode);

		/* get next char */
		state->prsbuf += pg_mblen(state->prsbuf);
	}
}
Exemple #2
0
int4
gettoken_tsvector(TI_IN_STATE * state)
{
	int4		oldstate = 0;

	state->curpos = state->word;
	state->state = WAITWORD;
	state->alen = 0;

	while (1)
	{
		if (state->state == WAITWORD)
		{
			if (*(state->prsbuf) == '\0')
				return 0;
			else if (*(state->prsbuf) == '\'')
				state->state = WAITENDCMPLX;
			else if (*(state->prsbuf) == '\\')
			{
				state->state = WAITNEXTCHAR;
				oldstate = WAITENDWORD;
			}
			else if (state->oprisdelim && ISOPERATOR(*(state->prsbuf)))
				ereport(ERROR,
						(errcode(ERRCODE_SYNTAX_ERROR),
						 errmsg("syntax error")));
			else if (*(state->prsbuf) != ' ')
			{
				*(state->curpos) = *(state->prsbuf);
				state->curpos++;
				state->state = WAITENDWORD;
			}
		}
		else if (state->state == WAITNEXTCHAR)
		{
			if (*(state->prsbuf) == '\0')
				ereport(ERROR,
						(errcode(ERRCODE_SYNTAX_ERROR),
						 errmsg("there is no escaped character")));
			else
			{
				RESIZEPRSBUF;
				*(state->curpos) = *(state->prsbuf);
				state->curpos++;
				state->state = oldstate;
			}
		}
		else if (state->state == WAITENDWORD)
		{
			if (*(state->prsbuf) == '\\')
			{
				state->state = WAITNEXTCHAR;
				oldstate = WAITENDWORD;
			}
			else if (*(state->prsbuf) == ' ' || *(state->prsbuf) == '\0' ||
					 (state->oprisdelim && ISOPERATOR(*(state->prsbuf))))
			{
				RESIZEPRSBUF;
				if (state->curpos == state->word)
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("syntax error")));
				*(state->curpos) = '\0';
				return 1;
			}
			else if (*(state->prsbuf) == ':')
			{
				if (state->curpos == state->word)
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("syntax error")));
				*(state->curpos) = '\0';
				if (state->oprisdelim)
					return 1;
				else
					state->state = INPOSINFO;
			}
			else
			{
				RESIZEPRSBUF;
				*(state->curpos) = *(state->prsbuf);
				state->curpos++;
			}
		}
		else if (state->state == WAITENDCMPLX)
		{
			if (*(state->prsbuf) == '\'')
			{
				RESIZEPRSBUF;
				*(state->curpos) = '\0';
				if (state->curpos == state->word)
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("syntax error")));
				if (state->oprisdelim)
				{
					state->prsbuf++;
					return 1;
				}
				else
					state->state = WAITPOSINFO;
			}
			else if (*(state->prsbuf) == '\\')
			{
				state->state = WAITNEXTCHAR;
				oldstate = WAITENDCMPLX;
			}
			else if (*(state->prsbuf) == '\0')
				ereport(ERROR,
						(errcode(ERRCODE_SYNTAX_ERROR),
						 errmsg("syntax error")));
			else
			{
				RESIZEPRSBUF;
				*(state->curpos) = *(state->prsbuf);
				state->curpos++;
			}
		}
		else if (state->state == WAITPOSINFO)
		{
			if (*(state->prsbuf) == ':')
				state->state = INPOSINFO;
			else
				return 1;
		}
		else if (state->state == INPOSINFO)
		{
			if (isdigit(*(state->prsbuf)))
			{
				if (state->alen == 0)
				{
					state->alen = 4;
					state->pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * state->alen);
					*(uint16 *) (state->pos) = 0;
				}
				else if (*(uint16 *) (state->pos) + 1 >= state->alen)
				{
					state->alen *= 2;
					state->pos = (WordEntryPos *) repalloc(state->pos, sizeof(WordEntryPos) * state->alen);
				}
				(*(uint16 *) (state->pos))++;
				state->pos[*(uint16 *) (state->pos)].pos = LIMITPOS(atoi(state->prsbuf));
				if (state->pos[*(uint16 *) (state->pos)].pos == 0)
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("wrong position info")));
				state->pos[*(uint16 *) (state->pos)].weight = 0;
				state->state = WAITPOSDELIM;
			}
			else
				ereport(ERROR,
						(errcode(ERRCODE_SYNTAX_ERROR),
						 errmsg("syntax error")));
		}
		else if (state->state == WAITPOSDELIM)
		{
			if (*(state->prsbuf) == ',')
				state->state = INPOSINFO;
			else if (tolower(*(state->prsbuf)) == 'a' || *(state->prsbuf) == '*')
			{
				if (state->pos[*(uint16 *) (state->pos)].weight)
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("syntax error")));
				state->pos[*(uint16 *) (state->pos)].weight = 3;
			}
			else if (tolower(*(state->prsbuf)) == 'b')
			{
				if (state->pos[*(uint16 *) (state->pos)].weight)
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("syntax error")));
				state->pos[*(uint16 *) (state->pos)].weight = 2;
			}
			else if (tolower(*(state->prsbuf)) == 'c')
			{
				if (state->pos[*(uint16 *) (state->pos)].weight)
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("syntax error")));
				state->pos[*(uint16 *) (state->pos)].weight = 1;
			}
			else if (tolower(*(state->prsbuf)) == 'd')
			{
				if (state->pos[*(uint16 *) (state->pos)].weight)
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("syntax error")));
				state->pos[*(uint16 *) (state->pos)].weight = 0;
			}
			else if (isspace(*(state->prsbuf)) || *(state->prsbuf) == '\0')
				return 1;
			else if (!isdigit(*(state->prsbuf)))
				ereport(ERROR,
						(errcode(ERRCODE_SYNTAX_ERROR),
						 errmsg("syntax error")));
		}
		else
			/* internal error */
			elog(ERROR, "internal error");
		state->prsbuf++;
	}

	return 0;
}