/* * Add positions from src to dest after offsetting them by maxpos. * Return the number added (might be less than expected due to overflow) */ static int32 add_pos(TSVector src, WordEntry *srcptr, TSVector dest, WordEntry *destptr, int32 maxpos) { uint16 *clen = &_POSVECPTR(dest, destptr)->npos; int i; uint16 slen = POSDATALEN(src, srcptr), startlen; WordEntryPos *spos = POSDATAPTR(src, srcptr), *dpos = POSDATAPTR(dest, destptr); if (!destptr->haspos) *clen = 0; startlen = *clen; for (i = 0; i < slen && *clen < MAXNUMPOS && (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1); i++) { WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i])); WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos)); (*clen)++; } if (*clen != startlen) destptr->haspos = 1; return *clen - startlen; }
static void hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen) { int i; QueryItem *item = GETQUERY(query); HeadlineWordEntry *word; while (prs->curwords + query->size >= prs->lenwords) { prs->lenwords *= 2; prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry)); } word = &(prs->words[prs->curwords - 1]); word->pos = LIMITPOS(pos); for (i = 0; i < query->size; i++) { if (item->type == QI_VAL && tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length, buf, buflen, item->qoperand.prefix) == 0) { if (word->item) { memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry)); prs->words[prs->curwords].item = &item->qoperand; prs->words[prs->curwords].repeated = 1; prs->curwords++; } else word->item = &item->qoperand; } item++; } }
static int uniqueWORD(ParsedWord *a, int4 l) { ParsedWord *ptr, *res; int tmppos; if (l == 1) { tmppos = LIMITPOS(a->pos.pos); a->alen = 2; a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); a->pos.apos[0] = 1; a->pos.apos[1] = tmppos; return l; } res = a; ptr = a + 1; /* * Sort words with its positions */ qsort((void *) a, l, sizeof(ParsedWord), compareWORD); /* * Initialize first word and its first position */ tmppos = LIMITPOS(a->pos.pos); a->alen = 2; a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); a->pos.apos[0] = 1; a->pos.apos[1] = tmppos; /* * Summarize position information for each word */ while (ptr - a < l) { if (!(ptr->len == res->len && strncmp(ptr->word, res->word, res->len) == 0)) { /* * Got a new word, so put it in result */ res++; res->len = ptr->len; res->word = ptr->word; tmppos = LIMITPOS(ptr->pos.pos); res->alen = 2; res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen); res->pos.apos[0] = 1; res->pos.apos[1] = tmppos; } else { /* * The word already exists, so adjust position information. But * before we should check size of position's array, max allowed * value for position and uniqueness of position */ pfree(ptr->word); if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 && res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos)) { if (res->pos.apos[0] + 1 >= res->alen) { res->alen *= 2; res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen); } if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos)) { res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos); res->pos.apos[0]++; } } } ptr++; } return res + 1 - a; }
/* * Parse string and lexize words. * * prs will be filled in. */ void parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen) { int type, lenlemm; char *lemm = NULL; LexizeData ldata; TSLexeme *norms; TSConfigCacheEntry *cfg; TSParserCacheEntry *prsobj; void *prsdata; cfg = lookup_ts_config_cache(cfgId); prsobj = lookup_ts_parser_cache(cfg->prsId); prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart, PointerGetDatum(buf), Int32GetDatum(buflen))); LexizeInit(&ldata, cfg); do { type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken), PointerGetDatum(prsdata), PointerGetDatum(&lemm), PointerGetDatum(&lenlemm))); if (type > 0 && lenlemm >= MAXSTRLEN) { #ifdef IGNORE_LONGLEXEME ereport(NOTICE, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("word is too long to be indexed"), errdetail("Words longer than %d characters are ignored.", MAXSTRLEN))); continue; #else ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("word is too long to be indexed"), errdetail("Words longer than %d characters are ignored.", MAXSTRLEN))); #endif } LexizeAddLemm(&ldata, type, lemm, lenlemm); while ((norms = LexizeExec(&ldata, NULL)) != NULL) { TSLexeme *ptr = norms; prs->pos++; /* set pos */ while (ptr->lexeme) { if (prs->curwords == prs->lenwords) { prs->lenwords *= 2; prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord)); } if (ptr->flags & TSL_ADDPOS) prs->pos++; prs->words[prs->curwords].len = strlen(ptr->lexeme); prs->words[prs->curwords].word = ptr->lexeme; prs->words[prs->curwords].nvariant = ptr->nvariant; prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX; prs->words[prs->curwords].alen = 0; prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos); ptr++; prs->curwords++; } pfree(norms); } } while (type > 0); FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata)); }
/* * Get next token from string being parsed. Returns true if successful, * false if end of input string is reached. On success, these output * parameters are filled in: * * *strval pointer to token * *lenval length of *strval * *pos_ptr pointer to a palloc'd array of positions and weights * associated with the token. If the caller is not interested * in the information, NULL can be supplied. Otherwise * the caller is responsible for pfreeing the array. * *poslen number of elements in *pos_ptr * *endptr scan resumption point * * Pass NULL for unwanted output parameters. */ bool gettoken_tsvector(TSVectorParseState state, char **strval, int *lenval, WordEntryPos **pos_ptr, int *poslen, char **endptr) { int oldstate = 0; char *curpos = state->word; int statecode = WAITWORD; /* * pos is for collecting the comma delimited list of positions followed by * the actual token. */ WordEntryPos *pos = NULL; int npos = 0; /* elements of pos used */ int posalen = 0; /* allocated size of pos */ while (1) { if (statecode == WAITWORD) { if (*(state->prsbuf) == '\0') return false; else if (t_iseq(state->prsbuf, '\'')) statecode = WAITENDCMPLX; else if (t_iseq(state->prsbuf, '\\')) { statecode = WAITNEXTCHAR; oldstate = WAITENDWORD; } else if (state->oprisdelim && ISOPERATOR(state->prsbuf)) PRSSYNTAXERROR; else if (!t_isspace(state->prsbuf)) { COPYCHAR(curpos, state->prsbuf); curpos += pg_mblen(state->prsbuf); statecode = WAITENDWORD; } } else if (statecode == WAITNEXTCHAR) { if (*(state->prsbuf) == '\0') ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("there is no escaped character: \"%s\"", state->bufstart))); else { RESIZEPRSBUF; COPYCHAR(curpos, state->prsbuf); curpos += pg_mblen(state->prsbuf); Assert(oldstate != 0); statecode = oldstate; } } else if (statecode == WAITENDWORD) { if (t_iseq(state->prsbuf, '\\')) { statecode = WAITNEXTCHAR; oldstate = WAITENDWORD; } else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || (state->oprisdelim && ISOPERATOR(state->prsbuf))) { RESIZEPRSBUF; if (curpos == state->word) PRSSYNTAXERROR; *(curpos) = '\0'; RETURN_TOKEN; } else if (t_iseq(state->prsbuf, ':')) { if (curpos == state->word) PRSSYNTAXERROR; *(curpos) = '\0'; if (state->oprisdelim) RETURN_TOKEN; else statecode = INPOSINFO; } else { RESIZEPRSBUF; COPYCHAR(curpos, state->prsbuf); curpos += pg_mblen(state->prsbuf); } } else if (statecode == WAITENDCMPLX) { if (t_iseq(state->prsbuf, '\'')) { statecode = WAITCHARCMPLX; } else if (t_iseq(state->prsbuf, '\\')) { statecode = WAITNEXTCHAR; oldstate = WAITENDCMPLX; } else if (*(state->prsbuf) == '\0') PRSSYNTAXERROR; else { RESIZEPRSBUF; COPYCHAR(curpos, state->prsbuf); curpos += pg_mblen(state->prsbuf); } } else if (statecode == WAITCHARCMPLX) { if (t_iseq(state->prsbuf, '\'')) { RESIZEPRSBUF; COPYCHAR(curpos, state->prsbuf); curpos += pg_mblen(state->prsbuf); statecode = WAITENDCMPLX; } else { RESIZEPRSBUF; *(curpos) = '\0'; if (curpos == state->word) PRSSYNTAXERROR; if (state->oprisdelim) { /* state->prsbuf+=pg_mblen(state->prsbuf); */ RETURN_TOKEN; } else statecode = WAITPOSINFO; continue; /* recheck current character */ } } else if (statecode == WAITPOSINFO) { if (t_iseq(state->prsbuf, ':')) statecode = INPOSINFO; else RETURN_TOKEN; } else if (statecode == INPOSINFO) { if (t_isdigit(state->prsbuf)) { if (posalen == 0) { posalen = 4; pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen); npos = 0; } else if (npos + 1 >= posalen) { posalen *= 2; pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen); } npos++; WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf))); /* we cannot get here in tsquery, so no need for 2 errmsgs */ if (WEP_GETPOS(pos[npos - 1]) == 0) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("wrong position info in tsvector: \"%s\"", state->bufstart))); WEP_SETWEIGHT(pos[npos - 1], 0); statecode = WAITPOSDELIM; } else PRSSYNTAXERROR; } else if (statecode == WAITPOSDELIM) { if (t_iseq(state->prsbuf, ',')) statecode = INPOSINFO; else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*')) { if (WEP_GETWEIGHT(pos[npos - 1])) PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 3); } else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B')) { if (WEP_GETWEIGHT(pos[npos - 1])) PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 2); } else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C')) { if (WEP_GETWEIGHT(pos[npos - 1])) PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 1); } else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D')) { if (WEP_GETWEIGHT(pos[npos - 1])) PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 0); } else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0') RETURN_TOKEN; else if (!t_isdigit(state->prsbuf)) PRSSYNTAXERROR; } else /* internal error */ elog(ERROR, "unrecognized state in gettoken_tsvector: %d", statecode); /* get next char */ state->prsbuf += pg_mblen(state->prsbuf); } }
static int uniqueWORD(WORD * a, int4 l) { WORD *ptr, *res; int tmppos; if (l == 1) { tmppos = LIMITPOS(a->pos.pos); a->alen = 2; a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); a->pos.apos[0] = 1; a->pos.apos[1] = tmppos; return l; } res = a; ptr = a + 1; qsort((void *) a, l, sizeof(WORD), compareWORD); tmppos = LIMITPOS(a->pos.pos); a->alen = 2; a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); a->pos.apos[0] = 1; a->pos.apos[1] = tmppos; while (ptr - a < l) { if (!(ptr->len == res->len && strncmp(ptr->word, res->word, res->len) == 0)) { res++; res->len = ptr->len; res->word = ptr->word; tmppos = LIMITPOS(ptr->pos.pos); res->alen = 2; res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen); res->pos.apos[0] = 1; res->pos.apos[1] = tmppos; } else { pfree(ptr->word); if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1) { if (res->pos.apos[0] + 1 >= res->alen) { res->alen *= 2; res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen); } if ( res->pos.apos[0]==0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos) ) { res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos); res->pos.apos[0]++; } } } ptr++; } return res + 1 - a; }
int4 gettoken_tsvector(TI_IN_STATE * state) { int4 oldstate = 0; state->curpos = state->word; state->state = WAITWORD; state->alen = 0; while (1) { if (state->state == WAITWORD) { if (*(state->prsbuf) == '\0') return 0; else if (*(state->prsbuf) == '\'') state->state = WAITENDCMPLX; else if (*(state->prsbuf) == '\\') { state->state = WAITNEXTCHAR; oldstate = WAITENDWORD; } else if (state->oprisdelim && ISOPERATOR(*(state->prsbuf))) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"))); else if (*(state->prsbuf) != ' ') { *(state->curpos) = *(state->prsbuf); state->curpos++; state->state = WAITENDWORD; } } else if (state->state == WAITNEXTCHAR) { if (*(state->prsbuf) == '\0') ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("there is no escaped character"))); else { RESIZEPRSBUF; *(state->curpos) = *(state->prsbuf); state->curpos++; state->state = oldstate; } } else if (state->state == WAITENDWORD) { if (*(state->prsbuf) == '\\') { state->state = WAITNEXTCHAR; oldstate = WAITENDWORD; } else if (*(state->prsbuf) == ' ' || *(state->prsbuf) == '\0' || (state->oprisdelim && ISOPERATOR(*(state->prsbuf)))) { RESIZEPRSBUF; if (state->curpos == state->word) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"))); *(state->curpos) = '\0'; return 1; } else if (*(state->prsbuf) == ':') { if (state->curpos == state->word) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"))); *(state->curpos) = '\0'; if (state->oprisdelim) return 1; else state->state = INPOSINFO; } else { RESIZEPRSBUF; *(state->curpos) = *(state->prsbuf); state->curpos++; } } else if (state->state == WAITENDCMPLX) { if (*(state->prsbuf) == '\'') { RESIZEPRSBUF; *(state->curpos) = '\0'; if (state->curpos == state->word) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"))); if (state->oprisdelim) { state->prsbuf++; return 1; } else state->state = WAITPOSINFO; } else if (*(state->prsbuf) == '\\') { state->state = WAITNEXTCHAR; oldstate = WAITENDCMPLX; } else if (*(state->prsbuf) == '\0') ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"))); else { RESIZEPRSBUF; *(state->curpos) = *(state->prsbuf); state->curpos++; } } else if (state->state == WAITPOSINFO) { if (*(state->prsbuf) == ':') state->state = INPOSINFO; else return 1; } else if (state->state == INPOSINFO) { if (isdigit(*(state->prsbuf))) { if (state->alen == 0) { state->alen = 4; state->pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * state->alen); *(uint16 *) (state->pos) = 0; } else if (*(uint16 *) (state->pos) + 1 >= state->alen) { state->alen *= 2; state->pos = (WordEntryPos *) repalloc(state->pos, sizeof(WordEntryPos) * state->alen); } (*(uint16 *) (state->pos))++; state->pos[*(uint16 *) (state->pos)].pos = LIMITPOS(atoi(state->prsbuf)); if (state->pos[*(uint16 *) (state->pos)].pos == 0) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("wrong position info"))); state->pos[*(uint16 *) (state->pos)].weight = 0; state->state = WAITPOSDELIM; } else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"))); } else if (state->state == WAITPOSDELIM) { if (*(state->prsbuf) == ',') state->state = INPOSINFO; else if (tolower(*(state->prsbuf)) == 'a' || *(state->prsbuf) == '*') { if (state->pos[*(uint16 *) (state->pos)].weight) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"))); state->pos[*(uint16 *) (state->pos)].weight = 3; } else if (tolower(*(state->prsbuf)) == 'b') { if (state->pos[*(uint16 *) (state->pos)].weight) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"))); state->pos[*(uint16 *) (state->pos)].weight = 2; } else if (tolower(*(state->prsbuf)) == 'c') { if (state->pos[*(uint16 *) (state->pos)].weight) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"))); state->pos[*(uint16 *) (state->pos)].weight = 1; } else if (tolower(*(state->prsbuf)) == 'd') { if (state->pos[*(uint16 *) (state->pos)].weight) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"))); state->pos[*(uint16 *) (state->pos)].weight = 0; } else if (isspace(*(state->prsbuf)) || *(state->prsbuf) == '\0') return 1; else if (!isdigit(*(state->prsbuf))) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"))); } else /* internal error */ elog(ERROR, "internal error"); state->prsbuf++; } return 0; }
void parsetext_v2(TSCfgInfo * cfg, PRSTEXT * prs, char *buf, int4 buflen) { int type, lenlemm; char *lemm = NULL; WParserInfo *prsobj = findprs(cfg->prs_id); LexizeData ldata; TSLexeme *norms; prsobj->prs = (void *) DatumGetPointer( FunctionCall2( &(prsobj->start_info), PointerGetDatum(buf), Int32GetDatum(buflen) ) ); LexizeInit(&ldata, cfg); do { type = DatumGetInt32(FunctionCall3( &(prsobj->getlexeme_info), PointerGetDatum(prsobj->prs), PointerGetDatum(&lemm), PointerGetDatum(&lenlemm))); if (type > 0 && lenlemm >= MAXSTRLEN) { #ifdef IGNORE_LONGLEXEME ereport(NOTICE, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("A word you are indexing is too long. It will be ignored."))); continue; #else ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("A word you are indexing is too long"))); #endif } LexizeAddLemm(&ldata, type, lemm, lenlemm); while ((norms = LexizeExec(&ldata, NULL)) != NULL) { TSLexeme *ptr = norms; prs->pos++; /* set pos */ while (ptr->lexeme) { if (prs->curwords == prs->lenwords) { prs->lenwords *= 2; prs->words = (TSWORD *) repalloc((void *) prs->words, prs->lenwords * sizeof(TSWORD)); } if (ptr->flags & TSL_ADDPOS) prs->pos++; prs->words[prs->curwords].len = strlen(ptr->lexeme); prs->words[prs->curwords].word = ptr->lexeme; prs->words[prs->curwords].nvariant = ptr->nvariant; prs->words[prs->curwords].alen = 0; prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos); ptr++; prs->curwords++; } pfree(norms); } } while (type > 0); FunctionCall1( &(prsobj->end_info), PointerGetDatum(prsobj->prs) ); }