/*
 * hlparsetext
 *		Tokenize buf with the parser selected by cfg->prs_id, feed every
 *		token to the lexizer and forward each lexizer result to
 *		addHLParsedLex().
 *
 * cfg		configuration; also handed to LexizeInit()
 * prs		forwarded unchanged to addHLParsedLex()
 * query	forwarded unchanged to addHLParsedLex()
 * buf		text handed to the parser's start function
 * buflen	length handed to the parser's start function along with buf
 *
 * A token with type > 0 whose length is >= MAXSTRLEN is reported: as a
 * NOTICE followed by skipping the token when IGNORE_LONGLEXEME is defined,
 * as an ERROR otherwise.
 */
void
hlparsetext(TSCfgInfo * cfg, HLPRSTEXT * prs, QUERYTYPE * query, char *buf, int4 buflen)
{
	WParserInfo *parser = findprs(cfg->prs_id);
	LexizeData	lexState;
	ParsedLex  *srcLex;
	TSLexeme   *normalized;
	char	   *token = NULL;
	int			tokenLen;
	int			tokenType;

	/* Start the parser; its state pointer is kept in parser->prs. */
	parser->prs = (void *) DatumGetPointer(FunctionCall2(&(parser->start_info),
														 PointerGetDatum(buf),
														 Int32GetDatum(buflen)));

	LexizeInit(&lexState, cfg);

	for (;;)
	{
		tokenType = DatumGetInt32(FunctionCall3(&(parser->getlexeme_info),
												PointerGetDatum(parser->prs),
												PointerGetDatum(&token),
												PointerGetDatum(&tokenLen)));

		if (tokenType > 0 && tokenLen >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			ereport(NOTICE,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("A word you are indexing is too long. It will be ignored.")));
			/* tokenType > 0 here, so we just go on to the next token */
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("A word you are indexing is too long")));
#endif
		}

		LexizeAddLemm(&lexState, tokenType, token, tokenLen);

		/*
		 * Drain the lexizer.  The final call, which yields NULL, is still
		 * forwarded to addHLParsedLex() before the loop stops.
		 */
		do
		{
			normalized = LexizeExec(&lexState, &srcLex);
			addHLParsedLex(prs, query, srcLex, normalized);
		} while (normalized != NULL);

		/* a token type <= 0 ends the loop, after that token was queued */
		if (tokenType <= 0)
			break;
	}

	FunctionCall1(&(parser->end_info), PointerGetDatum(parser->prs));
}
/*
 * hlparsetext
 *		Tokenize buf with the parser of text search configuration cfgId,
 *		feed every token to the lexizer and forward each lexizer result to
 *		addHLParsedLex().
 *
 * cfgId	configuration OID, resolved through lookup_ts_config_cache()
 * prs		forwarded unchanged to addHLParsedLex()
 * query	forwarded unchanged to addHLParsedLex()
 * buf		text handed to the parser's start function
 * buflen	length handed to the parser's start function along with buf
 *
 * A token with type > 0 whose length is >= MAXSTRLEN is reported: as a
 * NOTICE followed by skipping the token when IGNORE_LONGLEXEME is defined,
 * as an ERROR otherwise.
 */
void
hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
{
	TSConfigCacheEntry *config = lookup_ts_config_cache(cfgId);
	TSParserCacheEntry *parser = lookup_ts_parser_cache(config->prsId);
	LexizeData	lexState;
	ParsedLex  *srcLex;
	TSLexeme   *normalized;
	void	   *parserState;
	char	   *token = NULL;
	int			tokenLen;
	int			tokenType;

	parserState = (void *) DatumGetPointer(FunctionCall2(&(parser->prsstart),
														 PointerGetDatum(buf),
														 Int32GetDatum(buflen)));

	LexizeInit(&lexState, config);

	for (;;)
	{
		tokenType = DatumGetInt32(FunctionCall3(&(parser->prstoken),
												PointerGetDatum(parserState),
												PointerGetDatum(&token),
												PointerGetDatum(&tokenLen)));

		if (tokenType > 0 && tokenLen >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			ereport(NOTICE,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
			/* tokenType > 0 here, so we just go on to the next token */
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
#endif
		}

		LexizeAddLemm(&lexState, tokenType, token, tokenLen);

		/*
		 * Drain the lexizer.  The final call, which yields NULL, is still
		 * forwarded to addHLParsedLex() before the loop stops.
		 */
		do
		{
			normalized = LexizeExec(&lexState, &srcLex);
			addHLParsedLex(prs, query, srcLex, normalized);
		} while (normalized != NULL);

		/* a token type <= 0 ends the loop, after that token was queued */
		if (tokenType <= 0)
			break;
	}

	FunctionCall1(&(parser->prsend), PointerGetDatum(parserState));
}
/*
 * parsetext
 *		Parse a string and lexize its words, appending the results to prs.
 *
 * cfgId	configuration OID, resolved through lookup_ts_config_cache()
 * prs		output; words/curwords/lenwords/pos are updated in place
 * buf		text handed to the parser's start function
 * buflen	length handed to the parser's start function along with buf
 *
 * prs->pos is advanced once for every non-NULL lexizer result and once more
 * for every lexeme carrying TSL_ADDPOS.  The words array is doubled with
 * repalloc() whenever it is full.
 *
 * A token with type > 0 whose length is >= MAXSTRLEN is reported: as a
 * NOTICE followed by skipping the token when IGNORE_LONGLEXEME is defined,
 * as an ERROR otherwise.
 */
void
parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
{
	TSConfigCacheEntry *config = lookup_ts_config_cache(cfgId);
	TSParserCacheEntry *parser = lookup_ts_parser_cache(config->prsId);
	LexizeData	lexState;
	TSLexeme   *normalized;
	void	   *parserState;
	char	   *token = NULL;
	int			tokenLen;
	int			tokenType;

	parserState = (void *) DatumGetPointer(FunctionCall2(&parser->prsstart,
														 PointerGetDatum(buf),
														 Int32GetDatum(buflen)));

	LexizeInit(&lexState, config);

	for (;;)
	{
		tokenType = DatumGetInt32(FunctionCall3(&(parser->prstoken),
												PointerGetDatum(parserState),
												PointerGetDatum(&token),
												PointerGetDatum(&tokenLen)));

		if (tokenType > 0 && tokenLen >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			ereport(NOTICE,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
			/* tokenType > 0 here, so we just go on to the next token */
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long to be indexed"),
					 errdetail("Words longer than %d characters are ignored.",
							   MAXSTRLEN)));
#endif
		}

		LexizeAddLemm(&lexState, tokenType, token, tokenLen);

		/* Collect every result the lexizer can produce right now. */
		for (;;)
		{
			TSLexeme   *lex;

			normalized = LexizeExec(&lexState, NULL);
			if (normalized == NULL)
				break;

			prs->pos++;			/* set pos */

			/* the result is walked until an entry with a NULL lexeme */
			for (lex = normalized; lex->lexeme; lex++)
			{
				ParsedWord *slot;

				/* grow the output array before taking a pointer into it */
				if (prs->curwords == prs->lenwords)
				{
					prs->lenwords *= 2;
					prs->words = (ParsedWord *) repalloc((void *) prs->words,
										 prs->lenwords * sizeof(ParsedWord));
				}

				if (lex->flags & TSL_ADDPOS)
					prs->pos++;

				slot = &prs->words[prs->curwords];
				slot->len = strlen(lex->lexeme);
				slot->word = lex->lexeme;
				slot->nvariant = lex->nvariant;
				/* only the TSL_PREFIX bit is carried over */
				slot->flags = lex->flags & TSL_PREFIX;
				slot->alen = 0;
				slot->pos.pos = LIMITPOS(prs->pos);

				prs->curwords++;
			}

			/* only the array is freed; the lexeme strings stay in prs->words */
			pfree(normalized);
		}

		/* a token type <= 0 ends the loop, after that token was queued */
		if (tokenType <= 0)
			break;
	}

	FunctionCall1(&(parser->prsend), PointerGetDatum(parserState));
}
/*
 * LexizeExec
 *		Run the lexemes queued in ld->towork through the dictionaries mapped
 *		to their token types and return the next dictionary result.
 *
 * ld				lexizer state: the work queue (towork), the configuration
 *					(cfg), and the multiword bookkeeping fields curDictId,
 *					posDict, curSub, tmpRes and lastRes.
 * correspondLexem	passed to setCorrLex() on every path that returns from
 *					this invocation without recursing; callers in this file
 *					pass either a ParsedLex ** or NULL.
 *
 * Returns the pointer produced by a dictionary's lexize function (or, in
 * multiword mode, the previously stored ld->tmpRes), or NULL when the work
 * queue has been walked without any dictionary producing a result.
 *
 * The function works in two modes selected by ld->curDictId:
 *
 *	- InvalidOid ("usual" mode): each queued lexeme is offered to the
 *	  dictionaries of its token type, starting at index ld->posDict.
 *
 *	- a valid OID ("multiword" mode): a dictionary previously set
 *	  dictState.getnext, so the lexemes following the current one
 *	  (ld->curSub) are offered to that same dictionary.
 *
 * Mode switches are done by updating ld and calling LexizeExec recursively.
 */
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
	int			i;
	ListDictionary *map;
	TSDictionaryCacheEntry *dict;
	TSLexeme   *res;

	if (ld->curDictId == InvalidOid)
	{
		/*
		 * usual mode: dictionary wants only one word, but we should keep in
		 * mind that we should go through all stack
		 */

		while (ld->towork.head)
		{
			ParsedLex  *curVal = ld->towork.head;
			char	   *curValLemm = curVal->lemm;
			int			curValLenLemm = curVal->lenlemm;

			map = ld->cfg->map + curVal->type;

			/*
			 * map is only dereferenced when the type is non-zero and below
			 * lenmap; the || chain short-circuits otherwise.
			 */
			if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
			{
				/* skip this type of lexeme */
				RemoveHead(ld);
				continue;
			}

			/*
			 * Start at ld->posDict rather than 0: when we come back from
			 * multiword mode, posDict points just past the dictionary that
			 * asked for more words.
			 */
			for (i = ld->posDict; i < map->len; i++)
			{
				dict = lookup_ts_dictionary_cache(map->dictIds[i]);

				/* fresh dictionary state for every single-word call */
				ld->dictState.isend = ld->dictState.getnext = false;
				ld->dictState.private_state = NULL;
				res = (TSLexeme *) DatumGetPointer(FunctionCall4(
															 &(dict->lexize),
											 PointerGetDatum(dict->dictData),
												 PointerGetDatum(curValLemm),
												Int32GetDatum(curValLenLemm),
											  PointerGetDatum(&ld->dictState)
																 ));

				if (ld->dictState.getnext)
				{
					/*
					 * dictionary wants next word, so setup and store current
					 * position and go to multiword mode
					 */

					ld->curDictId = DatumGetObjectId(map->dictIds[i]);
					ld->posDict = i + 1;
					ld->curSub = curVal->next;
					/* remember the partial result, if the dictionary gave one */
					if (res)
						setNewTmpRes(ld, curVal, res);
					return LexizeExec(ld, correspondLexem);
				}

				if (!res)		/* dictionary doesn't know this lexeme */
					continue;

				if (res->flags & TSL_FILTER)
				{
					/*
					 * The dictionary's output replaces the lexeme offered to
					 * the remaining dictionaries in the list.
					 */
					curValLemm = res->lexeme;
					curValLenLemm = strlen(res->lexeme);
					continue;
				}

				/* a dictionary recognized the lexeme: done with the head */
				RemoveHead(ld);
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/* no dictionary produced a result for this lexeme */
			RemoveHead(ld);
		}
	}
	else
	{							/* curDictId is valid */
		dict = lookup_ts_dictionary_cache(ld->curDictId);

		/*
		 * Dictionary ld->curDictId asks us about following words
		 */

		while (ld->curSub)
		{
			ParsedLex  *curVal = ld->curSub;

			map = ld->cfg->map + curVal->type;

			if (curVal->type != 0)
			{
				bool		dictExists = false;

				if (curVal->type >= ld->cfg->lenmap || map->len == 0)
				{
					/* skip this type of lexeme */
					ld->curSub = curVal->next;
					continue;
				}

				/*
				 * We should be sure that the current type of lexeme is
				 * recognized by our dictionary: we just check whether it
				 * exists in the list of dictionaries for this type.
				 */
				for (i = 0; i < map->len && !dictExists; i++)
					if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
						dictExists = true;

				if (!dictExists)
				{
					/*
					 * Dictionary can't work with current type of lexeme,
					 * return to basic mode and redo all stored lexemes
					 */
					ld->curDictId = InvalidOid;
					return LexizeExec(ld, correspondLexem);
				}
			}

			/*
			 * A lexeme of type 0 is passed to the dictionary with isend set.
			 * Note that private_state is not reset here, unlike in usual
			 * mode.
			 */
			ld->dictState.isend = (curVal->type == 0) ? true : false;
			ld->dictState.getnext = false;

			res = (TSLexeme *) DatumGetPointer(FunctionCall4(
															 &(dict->lexize),
											 PointerGetDatum(dict->dictData),
											   PointerGetDatum(curVal->lemm),
											  Int32GetDatum(curVal->lenlemm),
											  PointerGetDatum(&ld->dictState)
															 ));

			if (ld->dictState.getnext)
			{
				/* Dictionary wants one more */
				ld->curSub = curVal->next;
				if (res)
					setNewTmpRes(ld, curVal, res);
				continue;
			}

			if (res || ld->tmpRes)
			{
				/*
				 * Dictionary normalizes lexemes, so we remove from stack all
				 * used lexemes, return to basic mode and redo end of stack
				 * (if it exists)
				 */
				if (res)
				{
					moveToWaste(ld, ld->curSub);
				}
				else
				{
					/* fall back to the result stored by setNewTmpRes() */
					res = ld->tmpRes;
					moveToWaste(ld, ld->lastRes);
				}

				/* reset to initial state */
				ld->curDictId = InvalidOid;
				ld->posDict = 0;
				ld->lastRes = NULL;
				ld->tmpRes = NULL;
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/*
			 * Dict doesn't want the next lexeme and didn't recognize
			 * anything, redo from ld->towork.head
			 */
			ld->curDictId = InvalidOid;
			return LexizeExec(ld, correspondLexem);
		}
	}

	/* nothing (more) to return for the lexemes queued so far */
	setCorrLex(ld, correspondLexem);
	return NULL;
}
/*
 * parsetext_v2
 *		Parse a string and lexize its words, appending the results to prs.
 *
 * cfg		configuration; cfg->prs_id selects the parser via findprs()
 * prs		output; words/curwords/lenwords/pos are updated in place
 * buf		text handed to the parser's start function
 * buflen	length handed to the parser's start function along with buf
 *
 * prs->pos is advanced once for every non-NULL lexizer result and once more
 * for every lexeme carrying TSL_ADDPOS.  The words array is doubled with
 * repalloc() whenever it is full.
 *
 * A token with type > 0 whose length is >= MAXSTRLEN is reported: as a
 * NOTICE followed by skipping the token when IGNORE_LONGLEXEME is defined,
 * as an ERROR otherwise.
 */
void
parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen)
{
	WParserInfo *parser = findprs(cfg->prs_id);
	LexizeData	lexState;
	TSLexeme   *normalized;
	char	   *token = NULL;
	int			tokenLen;
	int			tokenType;

	/* Start the parser; its state pointer is kept in parser->prs. */
	parser->prs = (void *) DatumGetPointer(FunctionCall2(&(parser->start_info),
														 PointerGetDatum(buf),
														 Int32GetDatum(buflen)));

	LexizeInit(&lexState, cfg);

	for (;;)
	{
		tokenType = DatumGetInt32(FunctionCall3(&(parser->getlexeme_info),
												PointerGetDatum(parser->prs),
												PointerGetDatum(&token),
												PointerGetDatum(&tokenLen)));

		if (tokenType > 0 && tokenLen >= MAXSTRLEN)
		{
#ifdef IGNORE_LONGLEXEME
			ereport(NOTICE,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("A word you are indexing is too long. It will be ignored.")));
			/* tokenType > 0 here, so we just go on to the next token */
			continue;
#else
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("A word you are indexing is too long")));
#endif
		}

		LexizeAddLemm(&lexState, tokenType, token, tokenLen);

		/* Collect every result the lexizer can produce right now. */
		for (;;)
		{
			TSLexeme   *lex;

			normalized = LexizeExec(&lexState, NULL);
			if (normalized == NULL)
				break;

			prs->pos++;			/* set pos */

			/* the result is walked until an entry with a NULL lexeme */
			for (lex = normalized; lex->lexeme; lex++)
			{
				TSWORD	   *slot;

				/* grow the output array before taking a pointer into it */
				if (prs->curwords == prs->lenwords)
				{
					prs->lenwords *= 2;
					prs->words = (TSWORD *) repalloc((void *) prs->words,
											 prs->lenwords * sizeof(TSWORD));
				}

				if (lex->flags & TSL_ADDPOS)
					prs->pos++;

				slot = &prs->words[prs->curwords];
				slot->len = strlen(lex->lexeme);
				slot->word = lex->lexeme;
				slot->nvariant = lex->nvariant;
				slot->alen = 0;
				slot->pos.pos = LIMITPOS(prs->pos);

				prs->curwords++;
			}

			/* only the array is freed; the lexeme strings stay in prs->words */
			pfree(normalized);
		}

		/* a token type <= 0 ends the loop, after that token was queued */
		if (tokenType <= 0)
			break;
	}

	FunctionCall1(&(parser->end_info), PointerGetDatum(parser->prs));
}