Esempio n. 1
0
/*
 * Tokenize buf with the parser selected by cfg->prs_id, push every token
 * through the lexize machinery, and forward each LexizeExec() outcome to
 * addHLParsedLex().
 *
 * cfg    - configuration; also handed to LexizeInit()
 * prs    - passed unchanged to addHLParsedLex()
 * query  - passed unchanged to addHLParsedLex()
 * buf    - text given to the parser's start function
 * buflen - length given to the parser's start function
 *
 * A token of positive type whose length is >= MAXSTRLEN is reported: with
 * IGNORE_LONGLEXEME defined it is a NOTICE and the token is skipped,
 * otherwise it is an ERROR.
 */
void
hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 buflen)
{
    WParserInfo *parser = findprs(cfg->prs_id);
    LexizeData  lexState;
    ParsedLex  *sourceLexemes;
    TSLexeme   *normalized;
    char       *token = NULL;
    int         tokenLen;
    int         tokenType;

    /* Start the parser; its working state is stored in the parser object. */
    parser->prs = (void *) DatumGetPointer(FunctionCall2(&(parser->start_info),
                                                         PointerGetDatum(buf),
                                                         Int32GetDatum(buflen)));

    LexizeInit(&lexState, cfg);

    /*
     * The token whose type is not positive is still queued and processed
     * before the loop stops.
     */
    do
    {
        tokenType = DatumGetInt32(FunctionCall3(&(parser->getlexeme_info),
                                                PointerGetDatum(parser->prs),
                                                PointerGetDatum(&token),
                                                PointerGetDatum(&tokenLen)));

        if (tokenType > 0 && tokenLen >= MAXSTRLEN)
        {
#ifdef IGNORE_LONGLEXEME
            ereport(NOTICE,
                    (errcode(ERRCODE_SYNTAX_ERROR),
                     errmsg("A word you are indexing is too long. It will be ignored.")));
            continue;
#else
            ereport(ERROR,
                    (errcode(ERRCODE_SYNTAX_ERROR),
                     errmsg("A word you are indexing is too long")));
#endif
        }

        LexizeAddLemm(&lexState, tokenType, token, tokenLen);

        /*
         * Forward every LexizeExec() outcome, including the final NULL one,
         * together with the source lexemes it reported.
         */
        do
        {
            normalized = LexizeExec(&lexState, &sourceLexemes);
            addHLParsedLex(prs, query, sourceLexemes, normalized);
        } while (normalized != NULL);
    } while (tokenType > 0);

    FunctionCall1(&(parser->end_info), PointerGetDatum(parser->prs));
}
Esempio n. 2
0
/*
 * Tokenize buf with the parser that belongs to text search configuration
 * cfgId, push every token through the lexize machinery, and forward each
 * LexizeExec() outcome to addHLParsedLex().
 *
 * cfgId  - configuration OID, resolved through lookup_ts_config_cache()
 * prs    - passed unchanged to addHLParsedLex()
 * query  - passed unchanged to addHLParsedLex()
 * buf    - text given to the parser's start function
 * buflen - length given to the parser's start function
 *
 * A token of positive type whose length is >= MAXSTRLEN is reported: with
 * IGNORE_LONGLEXEME defined it is a NOTICE and the token is skipped,
 * otherwise it is an ERROR.
 */
void
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
	TSConfigCacheEntry *config = lookup_ts_config_cache(cfgId);
	TSParserCacheEntry *parser = lookup_ts_parser_cache(config->prsId);
	LexizeData	lexState;
	ParsedLex  *sourceLexemes;
	TSLexeme   *normalized;
	void	   *parserState;
	char	   *token = NULL;
	int			tokenLen;
	int			tokenType;

	parserState = (void *) DatumGetPointer(FunctionCall2(&(parser->prsstart),
														 PointerGetDatum(buf),
														 Int32GetDatum(buflen)));

	LexizeInit(&lexState, config);

	/*
	 * The token whose type is not positive is still queued and processed
	 * before the loop stops.
	 */
	do
	{
		tokenType = DatumGetInt32(FunctionCall3(&(parser->prstoken),
												PointerGetDatum(parserState),
												PointerGetDatum(&token),
												PointerGetDatum(&tokenLen)));

		if (tokenType > 0 && tokenLen >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			ereport(NOTICE,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
#endif
		}

		LexizeAddLemm(&lexState, tokenType, token, tokenLen);

		/*
		 * Forward every LexizeExec() outcome, including the final NULL one,
		 * together with the source lexemes it reported.
		 */
		do
		{
			normalized = LexizeExec(&lexState, &sourceLexemes);
			addHLParsedLex(prs, query, sourceLexemes, normalized);
		} while (normalized != NULL);
	} while (tokenType > 0);

	FunctionCall1(&(parser->prsend), PointerGetDatum(parserState));
}
Esempio n. 3
0
/*
 * Tokenize buf with the parser of text search configuration cfgId, lexize
 * each token, and append the resulting words to prs.
 *
 * cfgId  - configuration OID, resolved through lookup_ts_config_cache()
 * prs    - output; words/curwords/lenwords/pos are updated in place
 * buf    - text given to the parser's start function
 * buflen - length given to the parser's start function
 *
 * A token of positive type whose length is >= MAXSTRLEN is reported: with
 * IGNORE_LONGLEXEME defined it is a NOTICE and the token is skipped,
 * otherwise it is an ERROR.
 */
void
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
{
	TSConfigCacheEntry *config = lookup_ts_config_cache(cfgId);
	TSParserCacheEntry *parser = lookup_ts_parser_cache(config->prsId);
	LexizeData	lexState;
	TSLexeme   *normalized;
	void	   *parserState;
	char	   *token = NULL;
	int			tokenLen;
	int			tokenType;

	parserState = (void *) DatumGetPointer(FunctionCall2(&parser->prsstart,
														 PointerGetDatum(buf),
														 Int32GetDatum(buflen)));

	LexizeInit(&lexState, config);

	/*
	 * The token whose type is not positive is still queued and processed
	 * before the loop stops.
	 */
	do
	{
		tokenType = DatumGetInt32(FunctionCall3(&(parser->prstoken),
												PointerGetDatum(parserState),
												PointerGetDatum(&token),
												PointerGetDatum(&tokenLen)));

		if (tokenType > 0 && tokenLen >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			ereport(NOTICE,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
#endif
		}

		LexizeAddLemm(&lexState, tokenType, token, tokenLen);

		/* Consume every result array that is currently available. */
		for (normalized = LexizeExec(&lexState, NULL);
			 normalized != NULL;
			 normalized = LexizeExec(&lexState, NULL))
		{
			TSLexeme   *lex;

			/* each result array advances the position once */
			prs->pos++;

			/* the array ends at the entry whose lexeme is NULL */
			for (lex = normalized; lex->lexeme; lex++)
			{
				/* words[] is full: double its capacity */
				if (prs->curwords == prs->lenwords)
				{
					prs->lenwords *= 2;
					prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
				}

				if (lex->flags & TSL_ADDPOS)
					prs->pos++;

				/* the lexeme string is referenced, not copied */
				prs->words[prs->curwords].word = lex->lexeme;
				prs->words[prs->curwords].len = strlen(lex->lexeme);
				prs->words[prs->curwords].nvariant = lex->nvariant;
				/* of the lexeme flags only TSL_PREFIX is kept */
				prs->words[prs->curwords].flags = lex->flags & TSL_PREFIX;
				prs->words[prs->curwords].alen = 0;
				prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);

				prs->curwords++;
			}

			/* only the array is released; words[] still points at the strings */
			pfree(normalized);
		}
	} while (tokenType > 0);

	FunctionCall1(&(parser->prsend), PointerGetDatum(parserState));
}
Esempio n. 4
0
/*
 * Produce the next normalized result from the lexemes queued in ld.
 *
 * The function works in one of two modes, selected by ld->curDictId:
 *
 *	- InvalidOid ("basic" mode): each queued lexeme, starting at
 *	  ld->towork.head, is offered to the dictionaries mapped to its type.
 *	- a valid OID ("multiword" mode): that dictionary has requested more
 *	  input (dictState.getnext), so the following lexemes, starting at
 *	  ld->curSub, are fed to it.
 *
 * Returns the TSLexeme pointer obtained from a dictionary, or NULL when the
 * queued lexemes yield no result.  setCorrLex(ld, correspondLexem) is called
 * right before every non-recursive return.
 *
 * NOTE(review): RemoveHead(), setNewTmpRes(), moveToWaste() and setCorrLex()
 * are defined elsewhere; their exact effects on ld should be confirmed there.
 */
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
	int			i;
	ListDictionary *map;
	TSDictionaryCacheEntry *dict;
	TSLexeme   *res;

	if (ld->curDictId == InvalidOid)
	{
		/*
		 * usual mode: dictionary wants only one word, but we should keep in
		 * mind that we should go through all stack
		 */

		while (ld->towork.head)
		{
			ParsedLex  *curVal = ld->towork.head;
			char	   *curValLemm = curVal->lemm;
			int			curValLenLemm = curVal->lenlemm;

			/*
			 * map is computed before the range check below, but map->len is
			 * only read once the type is known to be below lenmap
			 * (short-circuit evaluation).
			 */
			map = ld->cfg->map + curVal->type;

			if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
			{
				/* skip this type of lexeme */
				RemoveHead(ld);
				continue;
			}

			/*
			 * Start at ld->posDict rather than 0: after a multiword attempt
			 * posDict holds the index following the dictionary that asked
			 * for more input.
			 */
			for (i = ld->posDict; i < map->len; i++)
			{
				dict = lookup_ts_dictionary_cache(map->dictIds[i]);

				/* fresh dictionary state for every single-word call */
				ld->dictState.isend = ld->dictState.getnext = false;
				ld->dictState.private_state = NULL;
				res = (TSLexeme *) DatumGetPointer(FunctionCall4(
															 &(dict->lexize),
											 PointerGetDatum(dict->dictData),
												 PointerGetDatum(curValLemm),
												Int32GetDatum(curValLenLemm),
											  PointerGetDatum(&ld->dictState)
																 ));

				if (ld->dictState.getnext)
				{
					/*
					 * dictionary wants next word, so setup and store current
					 * position and go to multiword mode
					 */

					ld->curDictId = DatumGetObjectId(map->dictIds[i]);
					ld->posDict = i + 1;
					ld->curSub = curVal->next;
					if (res)
						setNewTmpRes(ld, curVal, res);
					return LexizeExec(ld, correspondLexem);
				}

				if (!res)		/* dictionary doesn't know this lexeme */
					continue;

				if (res->flags & TSL_FILTER)
				{
					/*
					 * filtering result: its lexeme replaces the input that
					 * is passed to the remaining dictionaries
					 */
					curValLemm = res->lexeme;
					curValLenLemm = strlen(res->lexeme);
					continue;
				}

				RemoveHead(ld);
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/* no mapped dictionary produced a final result for this lexeme */
			RemoveHead(ld);
		}
	}
	else
	{							/* curDictId is valid */
		dict = lookup_ts_dictionary_cache(ld->curDictId);

		/*
		 * Dictionary ld->curDictId asks us about following words
		 */

		while (ld->curSub)
		{
			ParsedLex  *curVal = ld->curSub;

			map = ld->cfg->map + curVal->type;

			if (curVal->type != 0)
			{
				bool		dictExists = false;

				if (curVal->type >= ld->cfg->lenmap || map->len == 0)
				{
					/* skip this type of lexeme */
					ld->curSub = curVal->next;
					continue;
				}

				/*
				 * We should be sure that the current type of lexeme is
				 * recognized by our dictionary: we just check whether it
				 * exists in the list of dictionaries.
				 */
				for (i = 0; i < map->len && !dictExists; i++)
					if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
						dictExists = true;

				if (!dictExists)
				{
					/*
					 * Dictionary can't work with current type of lexeme,
					 * return to basic mode and redo all stored lexemes
					 */
					ld->curDictId = InvalidOid;
					return LexizeExec(ld, correspondLexem);
				}
			}

			/* a lexeme of type 0 is passed to the dictionary with isend set */
			ld->dictState.isend = (curVal->type == 0) ? true : false;
			ld->dictState.getnext = false;

			res = (TSLexeme *) DatumGetPointer(FunctionCall4(
															 &(dict->lexize),
											 PointerGetDatum(dict->dictData),
											   PointerGetDatum(curVal->lemm),
											  Int32GetDatum(curVal->lenlemm),
											  PointerGetDatum(&ld->dictState)
															 ));

			if (ld->dictState.getnext)
			{
				/* Dictionary wants one more */
				ld->curSub = curVal->next;
				if (res)
					setNewTmpRes(ld, curVal, res);
				continue;
			}

			if (res || ld->tmpRes)
			{
				/*
				 * Dictionary normalizes lexemes, so we remove from stack all
				 * used lexemes, return to basic mode and redo end of stack
				 * (if it exists)
				 */
				if (res)
				{
					moveToWaste(ld, ld->curSub);
				}
				else
				{
					/* fall back to the result stored earlier in ld->tmpRes */
					res = ld->tmpRes;
					moveToWaste(ld, ld->lastRes);
				}

				/* reset to initial state */
				ld->curDictId = InvalidOid;
				ld->posDict = 0;
				ld->lastRes = NULL;
				ld->tmpRes = NULL;
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/*
			 * Dict doesn't want the next lexeme and didn't recognize
			 * anything, redo from ld->towork.head
			 */
			ld->curDictId = InvalidOid;
			return LexizeExec(ld, correspondLexem);
		}
	}

	/* nothing (more) to return for the lexemes queued so far */
	setCorrLex(ld, correspondLexem);
	return NULL;
}
Esempio n. 5
0
/*
 * Tokenize buf with the parser selected by cfg->prs_id, lexize each token,
 * and append the resulting words to prs.
 *
 * cfg    - configuration; also handed to LexizeInit()
 * prs    - output; words/curwords/lenwords/pos are updated in place
 * buf    - text given to the parser's start function
 * buflen - length given to the parser's start function
 *
 * A token of positive type whose length is >= MAXSTRLEN is reported: with
 * IGNORE_LONGLEXEME defined it is a NOTICE and the token is skipped,
 * otherwise it is an ERROR.
 */
void
parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
{
    WParserInfo *parser = findprs(cfg->prs_id);
    LexizeData  lexState;
    TSLexeme   *normalized;
    char       *token = NULL;
    int         tokenLen;
    int         tokenType;

    /* Start the parser; its working state is stored in the parser object. */
    parser->prs = (void *) DatumGetPointer(FunctionCall2(&(parser->start_info),
                                                         PointerGetDatum(buf),
                                                         Int32GetDatum(buflen)));

    LexizeInit(&lexState, cfg);

    /*
     * The token whose type is not positive is still queued and processed
     * before the loop stops.
     */
    do
    {
        tokenType = DatumGetInt32(FunctionCall3(&(parser->getlexeme_info),
                                                PointerGetDatum(parser->prs),
                                                PointerGetDatum(&token),
                                                PointerGetDatum(&tokenLen)));

        if (tokenType > 0 && tokenLen >= MAXSTRLEN)
        {
#ifdef IGNORE_LONGLEXEME
            ereport(NOTICE,
                    (errcode(ERRCODE_SYNTAX_ERROR),
                     errmsg("A word you are indexing is too long. It will be ignored.")));
            continue;
#else
            ereport(ERROR,
                    (errcode(ERRCODE_SYNTAX_ERROR),
                     errmsg("A word you are indexing is too long")));
#endif
        }

        LexizeAddLemm(&lexState, tokenType, token, tokenLen);

        /* Consume every result array that is currently available. */
        for (normalized = LexizeExec(&lexState, NULL);
             normalized != NULL;
             normalized = LexizeExec(&lexState, NULL))
        {
            TSLexeme   *lex;

            /* each result array advances the position once */
            prs->pos++;

            /* the array ends at the entry whose lexeme is NULL */
            for (lex = normalized; lex->lexeme; lex++)
            {
                /* words[] is full: double its capacity */
                if (prs->curwords == prs->lenwords)
                {
                    prs->lenwords *= 2;
                    prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD));
                }

                if (lex->flags & TSL_ADDPOS)
                    prs->pos++;

                /* the lexeme string is referenced, not copied */
                prs->words[prs->curwords].word = lex->lexeme;
                prs->words[prs->curwords].len = strlen(lex->lexeme);
                prs->words[prs->curwords].nvariant = lex->nvariant;
                prs->words[prs->curwords].alen = 0;
                prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);

                prs->curwords++;
            }

            /* only the array is released; words[] still points at the strings */
            pfree(normalized);
        }
    } while (tokenType > 0);

    FunctionCall1(&(parser->end_info), PointerGetDatum(parser->prs));
}