/* * Add positions from src to dest after offsetting them by maxpos. * Return the number added (might be less than expected due to overflow) */ static int32 add_pos(TSVector src, WordEntry *srcptr, TSVector dest, WordEntry *destptr, int32 maxpos) { uint16 *clen = &_POSVECPTR(dest, destptr)->npos; int i; uint16 slen = POSDATALEN(src, srcptr), startlen; WordEntryPos *spos = POSDATAPTR(src, srcptr), *dpos = POSDATAPTR(dest, destptr); if (!destptr->haspos) *clen = 0; startlen = *clen; for (i = 0; i < slen && *clen < MAXNUMPOS && (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1); i++) { WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i])); WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos)); (*clen)++; } if (*clen != startlen) destptr->haspos = 1; return *clen - startlen; }
static float calc_rank_and(float *w, tsvector * t, QUERYTYPE * q) { uint16 **pos; int i, k, l, p; WordEntry *entry; WordEntryPos *post, *ct; int4 dimt, lenct, dist; float res = -1.0; ITEM **item; int size = q->size; item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size); if (size < 2) { pfree(item); return calc_rank_or(w, t, q); } pos = (uint16 **) palloc(sizeof(uint16 *) * q->size); memset(pos, 0, sizeof(uint16 *) * q->size); *(uint16 *) POSNULL = lengthof(POSNULL) - 1; WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1); for (i = 0; i < size; i++) { entry = find_wordentry(t, q, item[i]); if (!entry) continue; if (entry->haspos) pos[i] = (uint16 *) _POSDATAPTR(t, entry); else pos[i] = (uint16 *) POSNULL; dimt = *(uint16 *) (pos[i]); post = (WordEntryPos *) (pos[i] + 1); for (k = 0; k < i; k++) { if (!pos[k]) continue; lenct = *(uint16 *) (pos[k]); ct = (WordEntryPos *) (pos[k] + 1); for (l = 0; l < dimt; l++) { for (p = 0; p < lenct; p++) { dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p])); if (dist || (dist == 0 && (pos[i] == (uint16 *) POSNULL || pos[k] == (uint16 *) POSNULL))) { float curw; if (!dist) dist = MAXENTRYPOS; curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist)); res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw); } } } } } pfree(pos); pfree(item); return res; }
static float calc_rank_and(float *w, TSVector t, TSQuery q) { WordEntryPosVector **pos; int i, k, l, p; WordEntry *entry, *firstentry; WordEntryPos *post, *ct; int32 dimt, lenct, dist, nitem; float res = -1.0; QueryOperand **item; int size = q->size; item = SortAndUniqItems(q, &size); if (size < 2) { pfree(item); return calc_rank_or(w, t, q); } pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size); WEP_SETPOS(POSNULL.pos[0], MAXENTRYPOS - 1); for (i = 0; i < size; i++) { firstentry = entry = find_wordentry(t, q, item[i], &nitem); if (!entry) continue; while (entry - firstentry < nitem) { if (entry->haspos) pos[i] = _POSVECPTR(t, entry); else pos[i] = &POSNULL; dimt = pos[i]->npos; post = pos[i]->pos; for (k = 0; k < i; k++) { if (!pos[k]) continue; lenct = pos[k]->npos; ct = pos[k]->pos; for (l = 0; l < dimt; l++) { for (p = 0; p < lenct; p++) { dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p])); if (dist || (dist == 0 && (pos[i] == &POSNULL || pos[k] == &POSNULL))) { float curw; if (!dist) dist = MAXENTRYPOS; curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist)); res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw); } } } } entry++; } } pfree(pos); pfree(item); return res; }
/* * make value of tsvector, given parsed text */ TSVector make_tsvector(ParsedText *prs) { int i, j, lenstr = 0, totallen; TSVector in; WordEntry *ptr; char *str; int stroff; prs->curwords = uniqueWORD(prs->words, prs->curwords); for (i = 0; i < prs->curwords; i++) { lenstr += prs->words[i].len; if (prs->words[i].alen) { lenstr = SHORTALIGN(lenstr); lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos); } } if (lenstr > MAXSTRPOS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS))); totallen = CALCDATASIZE(prs->curwords, lenstr); in = (TSVector) palloc0(totallen); SET_VARSIZE(in, totallen); in->size = prs->curwords; ptr = ARRPTR(in); str = STRPTR(in); stroff = 0; for (i = 0; i < prs->curwords; i++) { ptr->len = prs->words[i].len; ptr->pos = stroff; memcpy(str + stroff, prs->words[i].word, prs->words[i].len); stroff += prs->words[i].len; pfree(prs->words[i].word); if (prs->words[i].alen) { int k = prs->words[i].pos.apos[0]; WordEntryPos *wptr; if (k > 0xFFFF) elog(ERROR, "positions array too long"); ptr->haspos = 1; stroff = SHORTALIGN(stroff); *(uint16 *) (str + stroff) = (uint16) k; wptr = POSDATAPTR(in, ptr); for (j = 0; j < k; j++) { WEP_SETWEIGHT(wptr[j], 0); WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]); } stroff += sizeof(uint16) + k * sizeof(WordEntryPos); pfree(prs->words[i].pos.apos); } else ptr->haspos = 0; ptr++; } pfree(prs->words); return in; }
/* * Get next token from string being parsed. Returns true if successful, * false if end of input string is reached. On success, these output * parameters are filled in: * * *strval pointer to token * *lenval length of *strval * *pos_ptr pointer to a palloc'd array of positions and weights * associated with the token. If the caller is not interested * in the information, NULL can be supplied. Otherwise * the caller is responsible for pfreeing the array. * *poslen number of elements in *pos_ptr * *endptr scan resumption point * * Pass NULL for unwanted output parameters. */ bool gettoken_tsvector(TSVectorParseState state, char **strval, int *lenval, WordEntryPos **pos_ptr, int *poslen, char **endptr) { int oldstate = 0; char *curpos = state->word; int statecode = WAITWORD; /* * pos is for collecting the comma delimited list of positions followed by * the actual token. */ WordEntryPos *pos = NULL; int npos = 0; /* elements of pos used */ int posalen = 0; /* allocated size of pos */ while (1) { if (statecode == WAITWORD) { if (*(state->prsbuf) == '\0') return false; else if (t_iseq(state->prsbuf, '\'')) statecode = WAITENDCMPLX; else if (t_iseq(state->prsbuf, '\\')) { statecode = WAITNEXTCHAR; oldstate = WAITENDWORD; } else if (state->oprisdelim && ISOPERATOR(state->prsbuf)) PRSSYNTAXERROR; else if (!t_isspace(state->prsbuf)) { COPYCHAR(curpos, state->prsbuf); curpos += pg_mblen(state->prsbuf); statecode = WAITENDWORD; } } else if (statecode == WAITNEXTCHAR) { if (*(state->prsbuf) == '\0') ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("there is no escaped character: \"%s\"", state->bufstart))); else { RESIZEPRSBUF; COPYCHAR(curpos, state->prsbuf); curpos += pg_mblen(state->prsbuf); Assert(oldstate != 0); statecode = oldstate; } } else if (statecode == WAITENDWORD) { if (t_iseq(state->prsbuf, '\\')) { statecode = WAITNEXTCHAR; oldstate = WAITENDWORD; } else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' || (state->oprisdelim && ISOPERATOR(state->prsbuf))) { RESIZEPRSBUF; if (curpos == state->word) PRSSYNTAXERROR; *(curpos) = '\0'; RETURN_TOKEN; } else if (t_iseq(state->prsbuf, ':')) { if (curpos == state->word) PRSSYNTAXERROR; *(curpos) = '\0'; if (state->oprisdelim) RETURN_TOKEN; else statecode = INPOSINFO; } else { RESIZEPRSBUF; COPYCHAR(curpos, state->prsbuf); curpos += pg_mblen(state->prsbuf); } } else if (statecode == WAITENDCMPLX) { if (t_iseq(state->prsbuf, '\'')) { statecode = WAITCHARCMPLX; } else if (t_iseq(state->prsbuf, '\\')) { statecode = WAITNEXTCHAR; oldstate = WAITENDCMPLX; } else if (*(state->prsbuf) == '\0') PRSSYNTAXERROR; else { RESIZEPRSBUF; COPYCHAR(curpos, state->prsbuf); curpos += pg_mblen(state->prsbuf); } } else if (statecode == WAITCHARCMPLX) { if (t_iseq(state->prsbuf, '\'')) { RESIZEPRSBUF; COPYCHAR(curpos, state->prsbuf); curpos += pg_mblen(state->prsbuf); statecode = WAITENDCMPLX; } else { RESIZEPRSBUF; *(curpos) = '\0'; if (curpos == state->word) PRSSYNTAXERROR; if (state->oprisdelim) { /* state->prsbuf+=pg_mblen(state->prsbuf); */ RETURN_TOKEN; } else statecode = WAITPOSINFO; continue; /* recheck current character */ } } else if (statecode == WAITPOSINFO) { if (t_iseq(state->prsbuf, ':')) statecode = INPOSINFO; else RETURN_TOKEN; } else if (statecode == INPOSINFO) { if (t_isdigit(state->prsbuf)) { if (posalen == 0) { posalen = 4; pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen); npos = 0; } else if (npos + 1 >= posalen) { posalen *= 2; pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen); } npos++; WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf))); /* we cannot get here in tsquery, so no need for 2 errmsgs */ if (WEP_GETPOS(pos[npos - 1]) == 0) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("wrong position info in tsvector: \"%s\"", state->bufstart))); WEP_SETWEIGHT(pos[npos - 1], 0); statecode = WAITPOSDELIM; } else PRSSYNTAXERROR; } else if (statecode == WAITPOSDELIM) { if (t_iseq(state->prsbuf, ',')) statecode = INPOSINFO; else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*')) { if (WEP_GETWEIGHT(pos[npos - 1])) PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 3); } else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B')) { if (WEP_GETWEIGHT(pos[npos - 1])) PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 2); } else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C')) { if (WEP_GETWEIGHT(pos[npos - 1])) PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 1); } else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D')) { if (WEP_GETWEIGHT(pos[npos - 1])) PRSSYNTAXERROR; WEP_SETWEIGHT(pos[npos - 1], 0); } else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0') RETURN_TOKEN; else if (!t_isdigit(state->prsbuf)) PRSSYNTAXERROR; } else /* internal error */ elog(ERROR, "unrecognized state in gettoken_tsvector: %d", statecode); /* get next char */ state->prsbuf += pg_mblen(state->prsbuf); } }