/*
 * Append the positions of src's entry 'srcptr' to dest's entry 'destptr',
 * adding maxpos to each position (the sum is passed through LIMITPOS).
 *
 * Copying stops early once the destination holds MAXNUMPOS positions, or
 * once its last stored position equals MAXENTRYPOS - 1.  The return value is
 * the number of positions actually appended, which may therefore be smaller
 * than the number of source positions.
 */
static int32
add_pos(TSVector src, WordEntry *srcptr,
		TSVector dest, WordEntry *destptr,
		int32 maxpos)
{
	uint16	   *count = &_POSVECPTR(dest, destptr)->npos;
	uint16		nsrc = POSDATALEN(src, srcptr);
	uint16		before;
	WordEntryPos *from = POSDATAPTR(src, srcptr);
	WordEntryPos *to = POSDATAPTR(dest, destptr);
	int			k = 0;

	/* A destination entry without positions starts from an empty list */
	if (!destptr->haspos)
		*count = 0;

	before = *count;

	while (k < nsrc)
	{
		/* destination list is full */
		if (*count >= MAXNUMPOS)
			break;
		/* last stored position already sits at the top of the range */
		if (*count != 0 && WEP_GETPOS(to[*count - 1]) == MAXENTRYPOS - 1)
			break;

		WEP_SETWEIGHT(to[*count], WEP_GETWEIGHT(from[k]));
		WEP_SETPOS(to[*count], LIMITPOS(WEP_GETPOS(from[k]) + maxpos));
		(*count)++;
		k++;
	}

	if (*count != before)
		destptr->haspos = 1;

	return *count - before;
}
/* * Order: haspos, len, word, for all positions (pos, weight) */ static int silly_cmp_tsvector(const TSVector a, const TSVector b) { if (VARSIZE(a) < VARSIZE(b)) return -1; else if (VARSIZE(a) > VARSIZE(b)) return 1; else if (a->size < b->size) return -1; else if (a->size > b->size) return 1; else { WordEntry *aptr = ARRPTR(a); WordEntry *bptr = ARRPTR(b); int i = 0; int res; for (i = 0; i < a->size; i++) { if (aptr->haspos != bptr->haspos) { return (aptr->haspos > bptr->haspos) ? -1 : 1; } else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0) { return res; } else if (aptr->haspos) { WordEntryPos *ap = POSDATAPTR(a, aptr); WordEntryPos *bp = POSDATAPTR(b, bptr); int j; if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr)) return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1; for (j = 0; j < POSDATALEN(a, aptr); j++) { if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp)) { return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1; } else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp)) { return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1; } ap++, bp++; } } aptr++; bptr++; } } return 0; }
Datum tsvector_setweight(PG_FUNCTION_ARGS) { TSVector in = PG_GETARG_TSVECTOR(0); char cw = PG_GETARG_CHAR(1); TSVector out; int i, j; WordEntry *entry; WordEntryPos *p; int w = 0; switch (cw) { case 'A': case 'a': w = 3; break; case 'B': case 'b': w = 2; break; case 'C': case 'c': w = 1; break; case 'D': case 'd': w = 0; break; default: /* internal error */ elog(ERROR, "unrecognized weight: %d", cw); } out = (TSVector) palloc(VARSIZE(in)); memcpy(out, in, VARSIZE(in)); entry = ARRPTR(out); i = out->size; while (i--) { if ((j = POSDATALEN(out, entry)) != 0) { p = POSDATAPTR(out, entry); while (j--) { WEP_SETWEIGHT(*p, w); p++; } } entry++; } PG_FREE_IF_COPY(in, 0); PG_RETURN_POINTER(out); }
/*
 * Count the positions of entry 'wptr' in tsvector 'txt' whose weight is
 * selected by the 'weight' bitmask (bit N set means weight N is accepted).
 */
static int
check_weight(TSVector txt, WordEntry *wptr, int8 weight)
{
	WordEntryPos *pos = POSDATAPTR(txt, wptr);
	int			npos = POSDATALEN(txt, wptr);
	int			matches = 0;
	int			k;

	for (k = 0; k < npos; k++)
	{
		if (weight & (1 << WEP_GETWEIGHT(pos[k])))
			matches++;
	}

	return matches;
}
Datum tsvectorsend(PG_FUNCTION_ARGS) { TSVector vec = PG_GETARG_TSVECTOR(0); StringInfoData buf; int i, j; WordEntry *weptr = ARRPTR(vec); pq_begintypsend(&buf); pq_sendint(&buf, vec->size, sizeof(int32)); for (i = 0; i < vec->size; i++) { uint16 npos; /* * the strings in the TSVector array are not null-terminated, so we * have to send the null-terminator separately */ pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len); pq_sendbyte(&buf, '\0'); npos = POSDATALEN(vec, weptr); pq_sendint(&buf, npos, sizeof(uint16)); if (npos > 0) { WordEntryPos *wepptr = POSDATAPTR(vec, weptr); for (j = 0; j < npos; j++) pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos)); } weptr++; } PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); }
Datum tsvector_concat(PG_FUNCTION_ARGS) { TSVector in1 = PG_GETARG_TSVECTOR(0); TSVector in2 = PG_GETARG_TSVECTOR(1); TSVector out; WordEntry *ptr; WordEntry *ptr1, *ptr2; WordEntryPos *p; int maxpos = 0, i, j, i1, i2, dataoff, output_bytes, output_size; char *data, *data1, *data2; /* Get max position in in1; we'll need this to offset in2's positions */ ptr = ARRPTR(in1); i = in1->size; while (i--) { if ((j = POSDATALEN(in1, ptr)) != 0) { p = POSDATAPTR(in1, ptr); while (j--) { if (WEP_GETPOS(*p) > maxpos) maxpos = WEP_GETPOS(*p); p++; } } ptr++; } ptr1 = ARRPTR(in1); ptr2 = ARRPTR(in2); data1 = STRPTR(in1); data2 = STRPTR(in2); i1 = in1->size; i2 = in2->size; /* * Conservative estimate of space needed. We might need all the data in * both inputs, and conceivably add a pad byte before position data for * each item where there was none before. */ output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2; out = (TSVector) palloc0(output_bytes); SET_VARSIZE(out, output_bytes); /* * We must make out->size valid so that STRPTR(out) is sensible. We'll * collapse out any unused space at the end. 
*/ out->size = in1->size + in2->size; ptr = ARRPTR(out); data = STRPTR(out); dataoff = 0; while (i1 && i2) { int cmp = compareEntry(data1, ptr1, data2, ptr2); if (cmp < 0) { /* in1 first */ ptr->haspos = ptr1->haspos; ptr->len = ptr1->len; memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); ptr->pos = dataoff; dataoff += ptr1->len; if (ptr->haspos) { dataoff = SHORTALIGN(dataoff); memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } ptr++; ptr1++; i1--; } else if (cmp > 0) { /* in2 first */ ptr->haspos = ptr2->haspos; ptr->len = ptr2->len; memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); ptr->pos = dataoff; dataoff += ptr2->len; if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); if (addlen == 0) ptr->haspos = 0; else { dataoff = SHORTALIGN(dataoff); dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); } } ptr++; ptr2++; i2--; } else { ptr->haspos = ptr1->haspos | ptr2->haspos; ptr->len = ptr1->len; memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); ptr->pos = dataoff; dataoff += ptr1->len; if (ptr->haspos) { if (ptr1->haspos) { dataoff = SHORTALIGN(dataoff); memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); if (ptr2->haspos) dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos); } else /* must have ptr2->haspos */ { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); if (addlen == 0) ptr->haspos = 0; else { dataoff = SHORTALIGN(dataoff); dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); } } } ptr++; ptr1++; ptr2++; i1--; i2--; } } while (i1) { ptr->haspos = ptr1->haspos; ptr->len = ptr1->len; memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); ptr->pos = dataoff; dataoff += ptr1->len; if (ptr->haspos) { dataoff = SHORTALIGN(dataoff); 
memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } ptr++; ptr1++; i1--; } while (i2) { ptr->haspos = ptr2->haspos; ptr->len = ptr2->len; memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); ptr->pos = dataoff; dataoff += ptr2->len; if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); if (addlen == 0) ptr->haspos = 0; else { dataoff = SHORTALIGN(dataoff); dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); } } ptr++; ptr2++; i2--; } /* * Instead of checking each offset individually, we check for overflow of * pos fields once at the end. */ if (dataoff > MAXSTRPOS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("string is too long for tsvector (%d bytes, max %d bytes)", dataoff, MAXSTRPOS))); /* * Adjust sizes (asserting that we didn't overrun the original estimates) * and collapse out any unused array entries. */ output_size = ptr - ARRPTR(out); Assert(output_size <= out->size); out->size = output_size; if (data != STRPTR(out)) memmove(STRPTR(out), data, dataoff); output_bytes = CALCDATASIZE(out->size, dataoff); Assert(output_bytes <= VARSIZE(out)); SET_VARSIZE(out, output_bytes); PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in2, 1); PG_RETURN_POINTER(out); }
Datum tsvector_concat(PG_FUNCTION_ARGS) { TSVector in1 = PG_GETARG_TSVECTOR(0); TSVector in2 = PG_GETARG_TSVECTOR(1); TSVector out; WordEntry *ptr; WordEntry *ptr1, *ptr2; WordEntryPos *p; int maxpos = 0, i, j, i1, i2, dataoff; char *data, *data1, *data2; ptr = ARRPTR(in1); i = in1->size; while (i--) { if ((j = POSDATALEN(in1, ptr)) != 0) { p = POSDATAPTR(in1, ptr); while (j--) { if (WEP_GETPOS(*p) > maxpos) maxpos = WEP_GETPOS(*p); p++; } } ptr++; } ptr1 = ARRPTR(in1); ptr2 = ARRPTR(in2); data1 = STRPTR(in1); data2 = STRPTR(in2); i1 = in1->size; i2 = in2->size; /* conservative estimate of space needed */ out = (TSVector) palloc0(VARSIZE(in1) + VARSIZE(in2)); SET_VARSIZE(out, VARSIZE(in1) + VARSIZE(in2)); out->size = in1->size + in2->size; ptr = ARRPTR(out); data = STRPTR(out); dataoff = 0; while (i1 && i2) { int cmp = compareEntry(data1, ptr1, data2, ptr2); if (cmp < 0) { /* in1 first */ ptr->haspos = ptr1->haspos; ptr->len = ptr1->len; memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); ptr->pos = dataoff; dataoff += ptr1->len; if (ptr->haspos) { dataoff = SHORTALIGN(dataoff); memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } ptr++; ptr1++; i1--; } else if (cmp > 0) { /* in2 first */ ptr->haspos = ptr2->haspos; ptr->len = ptr2->len; memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); ptr->pos = dataoff; dataoff += ptr2->len; if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); if (addlen == 0) ptr->haspos = 0; else { dataoff = SHORTALIGN(dataoff); dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); } } ptr++; ptr2++; i2--; } else { ptr->haspos = ptr1->haspos | ptr2->haspos; ptr->len = ptr1->len; memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); ptr->pos = dataoff; dataoff += ptr1->len; if (ptr->haspos) { if (ptr1->haspos) { dataoff = SHORTALIGN(dataoff); memcpy(data + dataoff, _POSVECPTR(in1, 
ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); if (ptr2->haspos) dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos); } else /* must have ptr2->haspos */ { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); if (addlen == 0) ptr->haspos = 0; else { dataoff = SHORTALIGN(dataoff); dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); } } } ptr++; ptr1++; ptr2++; i1--; i2--; } } while (i1) { ptr->haspos = ptr1->haspos; ptr->len = ptr1->len; memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); ptr->pos = dataoff; dataoff += ptr1->len; if (ptr->haspos) { dataoff = SHORTALIGN(dataoff); memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } ptr++; ptr1++; i1--; } while (i2) { ptr->haspos = ptr2->haspos; ptr->len = ptr2->len; memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); ptr->pos = dataoff; dataoff += ptr2->len; if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); if (addlen == 0) ptr->haspos = 0; else { dataoff = SHORTALIGN(dataoff); dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); } } ptr++; ptr2++; i2--; } /* * Instead of checking each offset individually, we check for overflow of * pos fields once at the end. */ if (dataoff > MAXSTRPOS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("string is too long for tsvector (%d bytes, max %d bytes)", dataoff, MAXSTRPOS))); out->size = ptr - ARRPTR(out); SET_VARSIZE(out, CALCDATASIZE(out->size, dataoff)); if (data != STRPTR(out)) memmove(STRPTR(out), data, dataoff); PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in2, 1); PG_RETURN_POINTER(out); }
Datum get_covers(PG_FUNCTION_ARGS) { tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)); WordEntry *pptr = ARRPTR(txt); int i, dlen = 0, j, cur = 0, len = 0, rlen; DocWord *dw, *dwptr; text *out; char *cptr; DocRepresentation *doc; int olddwpos = 0; int ncover = 1; Extention ext; doc = get_docrep(txt, query, &rlen); if (!doc) { out = palloc(VARHDRSZ); VARATT_SIZEP(out) = VARHDRSZ; PG_FREE_IF_COPY(txt, 0); PG_FREE_IF_COPY(query, 1); PG_RETURN_POINTER(out); } for (i = 0; i < txt->size; i++) { if (!pptr[i].haspos) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("no pos info"))); dlen += POSDATALEN(txt, &(pptr[i])); } dwptr = dw = palloc(sizeof(DocWord) * dlen); memset(dw, 0, sizeof(DocWord) * dlen); for (i = 0; i < txt->size; i++) { WordEntryPos *posdata = POSDATAPTR(txt, &(pptr[i])); for (j = 0; j < POSDATALEN(txt, &(pptr[i])); j++) { dw[cur].w = STRPTR(txt) + pptr[i].pos; dw[cur].len = pptr[i].len; dw[cur].pos = WEP_GETPOS(posdata[j]); cur++; } len += (pptr[i].len + 1) * (int) POSDATALEN(txt, &(pptr[i])); } qsort((void *) dw, dlen, sizeof(DocWord), compareDocWord); MemSet(&ext, 0, sizeof(Extention)); while (Cover(doc, rlen, query, &ext)) { dwptr = dw + olddwpos; while (dwptr->pos < ext.p && dwptr - dw < dlen) dwptr++; olddwpos = dwptr - dw; dwptr->start = ncover; while (dwptr->pos < ext.q + 1 && dwptr - dw < dlen) dwptr++; (dwptr - 1)->finish = ncover; len += 4 /* {}+two spaces */ + 2 * 16 /* numbers */ ; ncover++; } out = palloc(VARHDRSZ + len); cptr = ((char *) out) + VARHDRSZ; dwptr = dw; while (dwptr - dw < dlen) { if (dwptr->start) { sprintf(cptr, "{%d ", dwptr->start); cptr = strchr(cptr, '\0'); } memcpy(cptr, dwptr->w, dwptr->len); cptr += dwptr->len; *cptr = ' '; cptr++; if (dwptr->finish) { sprintf(cptr, "}%d ", dwptr->finish); cptr = strchr(cptr, '\0'); } dwptr++; } VARATT_SIZEP(out) = cptr - ((char *) out); pfree(dw); for (i = 0; i < rlen; i++) if 
(doc[i].needfree) pfree(doc[i].item); pfree(doc); PG_FREE_IF_COPY(txt, 0); PG_FREE_IF_COPY(query, 1); PG_RETURN_POINTER(out); }
/*
 * Build the document representation of 'txt' for 'query': one
 * DocRepresentation element per position of every query operand that is
 * found in the tsvector, sorted with compareDocR.
 *
 * Operands equal to an already processed one (per compareITEM) are attached
 * to the same element group and flagged istrue so they are skipped later.
 * Entries without positions use the POSNULL placeholder.
 *
 * *doclen receives the number of elements.  Returns NULL (with *doclen = 0)
 * when nothing matched.
 */
static DocRepresentation *
get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
{
	ITEM	   *item = GETQUERY(query);
	WordEntry  *we;
	WordEntryPos *positions;
	int4		npos,
				p,
				q;
	int			cap = query->size * 4,
				used = 0;
	DocRepresentation *doc;
	char	   *operand;

	/* first slot of POSNULL is used as its position count */
	*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
	doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * cap);
	operand = GETOPERAND(query);
	reset_istrue_flag(query);

	for (q = 0; q < query->size; q++)
	{
		if (item[q].type != VAL || item[q].istrue)
			continue;

		we = find_wordentry(txt, query, &(item[q]));
		if (!we)
			continue;

		if (we->haspos)
		{
			npos = POSDATALEN(txt, we);
			positions = POSDATAPTR(txt, we);
		}
		else
		{
			npos = *(uint16 *) POSNULL;
			positions = POSNULL + 1;
		}

		/* grow the array until the new elements fit */
		while (used + npos >= cap)
		{
			cap *= 2;
			doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * cap);
		}

		for (p = 0; p < npos; p++)
		{
			if (p != 0)
			{
				/* later positions share the item list of the first one */
				doc[used].needfree = false;
				doc[used].nitem = doc[used - 1].nitem;
				doc[used].item = doc[used - 1].item;
			}
			else
			{
				ITEM	   *kptr,
						   *iptr = item + q;
				int			k;

				doc[used].needfree = false;
				doc[used].nitem = 0;
				doc[used].item = (ITEM **) palloc(sizeof(ITEM *) * query->size);

				/* collect this operand and all operands equal to it */
				for (k = 0; k < query->size; k++)
				{
					kptr = item + k;
					if (k == q ||
						(item[k].type == VAL &&
						 compareITEM(&kptr, &iptr, operand) == 0))
					{
						doc[used].item[doc[used].nitem] = item + k;
						doc[used].nitem++;
						kptr->istrue = 1;
					}
				}
			}

			doc[used].pos = WEP_GETPOS(positions[p]);
			doc[used].wclass = WEP_GETWEIGHT(positions[p]);
			used++;
		}
	}

	*doclen = used;

	if (used == 0)
	{
		pfree(doc);
		return NULL;
	}

	if (used > 1)
		qsort((void *) doc, used, sizeof(DocRepresentation), compareDocR);
	return doc;
}
/*
 * Build a tsvector from the parsed words in 'prs'.
 *
 * The words are first reduced with uniqueWORD().  Each lexeme is copied
 * into the string area at a SHORTALIGN'ed step; when a word has positions
 * (alen != 0) a uint16 count taken from apos[0] and the positions
 * apos[1..] follow it, all with weight 0.
 *
 * The word strings, the position arrays and prs->words itself are pfree'd
 * along the way.
 */
static tsvector *
makevalue(PRSTEXT * prs)
{
	int4		w,
				k,
				lenstr = 0,
				totallen;
	tsvector   *in;
	WordEntry  *entry;
	char	   *base,
			   *wr;

	prs->curwords = uniqueWORD(prs->words, prs->curwords);

	/* Work out how much string-area space is needed */
	for (w = 0; w < prs->curwords; w++)
	{
		lenstr += SHORTALIGN(prs->words[w].len);
		if (!prs->words[w].alen)
			continue;
		lenstr += sizeof(uint16) + prs->words[w].pos.apos[0] * sizeof(WordEntryPos);
	}

	totallen = CALCDATASIZE(prs->curwords, lenstr);
	in = (tsvector *) palloc(totallen);
	memset(in, 0, totallen);
	in->len = totallen;
	in->size = prs->curwords;

	entry = ARRPTR(in);
	base = STRPTR(in);
	wr = base;

	for (w = 0; w < prs->curwords; w++, entry++)
	{
		entry->len = prs->words[w].len;
		if (wr - base > MAXSTRPOS)
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("value is too big")));
		entry->pos = wr - base;
		memcpy((void *) wr, (void *) prs->words[w].word, prs->words[w].len);
		pfree(prs->words[w].word);
		wr += SHORTALIGN(prs->words[w].len);

		if (!prs->words[w].alen)
		{
			entry->haspos = 0;
			continue;
		}

		{
			WordEntryPos *wptr;
			uint16		npos;

			entry->haspos = 1;
			/* position count goes first, then the positions themselves */
			*(uint16 *) wr = prs->words[w].pos.apos[0];
			npos = *(uint16 *) wr;
			wptr = POSDATAPTR(in, entry);
			for (k = 0; k < npos; k++)
			{
				wptr[k].weight = 0;
				wptr[k].pos = prs->words[w].pos.apos[k + 1];
			}
			wr += sizeof(uint16) + prs->words[w].pos.apos[0] * sizeof(WordEntryPos);
			pfree(prs->words[w].pos.apos);
		}
	}

	pfree(prs->words);
	return in;
}
/*
 * Build a TSVector from parsed text.
 *
 * The words are first reduced with uniqueWORD().  Lexemes are packed back
 * to back in the string area; a word with positions (alen != 0) is followed
 * by a SHORTALIGN'ed uint16 count (apos[0]) and the positions apos[1..],
 * all stored with weight 0.
 *
 * Errors out if the string area would exceed MAXSTRPOS or a position count
 * does not fit in 16 bits.  The word strings, the position arrays and
 * prs->words itself are pfree'd.
 */
TSVector
make_tsvector(ParsedText *prs)
{
	int			w,
				k,
				lenstr = 0,
				totallen;
	TSVector	in;
	WordEntry  *entry;
	char	   *base;
	int			off;

	prs->curwords = uniqueWORD(prs->words, prs->curwords);

	/* Compute the size of the string area */
	for (w = 0; w < prs->curwords; w++)
	{
		lenstr += prs->words[w].len;
		if (!prs->words[w].alen)
			continue;
		lenstr = SHORTALIGN(lenstr);
		lenstr += sizeof(uint16) + prs->words[w].pos.apos[0] * sizeof(WordEntryPos);
	}

	if (lenstr > MAXSTRPOS)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS)));

	totallen = CALCDATASIZE(prs->curwords, lenstr);
	in = (TSVector) palloc0(totallen);
	SET_VARSIZE(in, totallen);
	in->size = prs->curwords;

	entry = ARRPTR(in);
	base = STRPTR(in);
	off = 0;

	for (w = 0; w < prs->curwords; w++, entry++)
	{
		entry->len = prs->words[w].len;
		entry->pos = off;
		memcpy(base + off, prs->words[w].word, prs->words[w].len);
		off += prs->words[w].len;
		pfree(prs->words[w].word);

		if (!prs->words[w].alen)
		{
			entry->haspos = 0;
			continue;
		}

		{
			int			npos = prs->words[w].pos.apos[0];
			WordEntryPos *wptr;

			/* the count is stored in a uint16 */
			if (npos > 0xFFFF)
				elog(ERROR, "positions array too long");

			entry->haspos = 1;
			off = SHORTALIGN(off);
			*(uint16 *) (base + off) = (uint16) npos;
			wptr = POSDATAPTR(in, entry);
			for (k = 0; k < npos; k++)
			{
				WEP_SETWEIGHT(wptr[k], 0);
				WEP_SETPOS(wptr[k], prs->words[w].pos.apos[k + 1]);
			}
			off += sizeof(uint16) + npos * sizeof(WordEntryPos);
			pfree(prs->words[w].pos.apos);
		}
	}

	pfree(prs->words);
	return in;
}
/*
 * Build the document representation of 'txt' for the query in 'qr': one
 * DocRepresentation element per position of every tsvector entry returned
 * by find_wordentry() for a query operand, sorted with compareDocR.
 *
 * Operands equal to the one being processed (per compareQueryOperand) are
 * attached to the same element group and marked via QR_SET_OPERAND_EXISTS,
 * so they are skipped when reached later.  Entries without positions use
 * the POSNULL placeholder.
 *
 * *doclen receives the number of elements.  Returns NULL (with *doclen = 0)
 * when nothing matched.
 */
static DocRepresentation *
get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
{
	QueryItem  *item = GETQUERY(qr->query);
	WordEntry  *we,
			   *we_first;
	WordEntryPos *positions;
	int32		npos,
				p,
				q,
				nmatch;
	int			cap = qr->query->size * 4,
				used = 0;
	DocRepresentation *doc;
	char	   *operand;

	doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * cap);
	operand = GETOPERAND(qr->query);

	for (q = 0; q < qr->query->size; q++)
	{
		QueryOperand *curoperand;

		if (item[q].type != QI_VAL)
			continue;

		curoperand = &item[q].qoperand;

		/* already covered as a duplicate of an earlier operand */
		if (QR_GET_OPERAND_EXISTS(qr, &item[q]))
			continue;

		we_first = find_wordentry(txt, qr->query, curoperand, &nmatch);
		if (!we_first)
			continue;

		for (we = we_first; we - we_first < nmatch; we++)
		{
			if (we->haspos)
			{
				npos = POSDATALEN(txt, we);
				positions = POSDATAPTR(txt, we);
			}
			else
			{
				npos = POSNULL.npos;
				positions = POSNULL.pos;
			}

			/* grow the array until the new elements fit */
			while (used + npos >= cap)
			{
				cap *= 2;
				doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * cap);
			}

			for (p = 0; p < npos; p++)
			{
				if (p != 0)
				{
					/* later positions share the first one's item list */
					doc[used].nitem = doc[used - 1].nitem;
					doc[used].item = doc[used - 1].item;
				}
				else
				{
					int			k;

					doc[used].nitem = 0;
					doc[used].item = (QueryItem **) palloc(sizeof(QueryItem *) * qr->query->size);

					for (k = 0; k < qr->query->size; k++)
					{
						QueryOperand *kptr = &item[k].qoperand;
						QueryOperand *iptr = &item[q].qoperand;

						/*
						 * For k == q the type was already verified to be
						 * QI_VAL at the top of the outer loop.
						 */
						if (k == q ||
							(item[k].type == QI_VAL &&
							 compareQueryOperand(&kptr, &iptr, operand) == 0))
						{
							doc[used].item[doc[used].nitem] = item + k;
							doc[used].nitem++;
							QR_SET_OPERAND_EXISTS(qr, item + k);
						}
					}
				}

				doc[used].pos = WEP_GETPOS(positions[p]);
				doc[used].wclass = WEP_GETWEIGHT(positions[p]);
				used++;
			}
		}
	}

	*doclen = used;

	if (used == 0)
	{
		pfree(doc);
		return NULL;
	}

	qsort((void *) doc, used, sizeof(DocRepresentation), compareDocR);
	return doc;
}
/*
 * Rank tsvector 't' against query 'q', treating the query operands as
 * alternatives.
 *
 * For every tsvector entry matching a (sorted, de-duplicated) operand the
 * position weights are combined and added to the result; the total is
 * divided by the number of distinct operands.  'w' is the weight table
 * read through the wpos() macro.
 */
static float
calc_rank_or(float *w, TSVector t, TSQuery q)
{
	WordEntry  *we,
			   *we_first;
	WordEntryPos *positions;
	int32		npos,
				k,
				n,
				nmatch;
	float		res = 0.0;
	QueryOperand **item;
	int			size = q->size;

	item = SortAndUniqItems(q, &size);

	for (n = 0; n < size; n++)
	{
		float		resj,
					wjm;
		int32		jm;

		we_first = find_wordentry(t, q, item[n], &nmatch);
		if (!we_first)
			continue;

		for (we = we_first; we - we_first < nmatch; we++)
		{
			if (we->haspos)
			{
				npos = POSDATALEN(t, we);
				positions = POSDATAPTR(t, we);
			}
			else
			{
				npos = POSNULL.npos;
				positions = POSNULL.pos;
			}

			/* sum w_k / k^2 and remember the heaviest position */
			resj = 0.0;
			wjm = -1.0;
			jm = 0;
			for (k = 0; k < npos; k++)
			{
				resj = resj + wpos(positions[k]) / ((k + 1) * (k + 1));
				if (wpos(positions[k]) > wjm)
				{
					wjm = wpos(positions[k]);
					jm = k;
				}
			}

			/*
			 * limit(sum(1/i^2), i->inf) = pi^2/6, hence the divisor.
			 * resj = sum(w_i/i^2), i = 1..noccurrence; the w_i ought to be
			 * sorted in descending order.  That is not done for now: the
			 * maximum weight is simply moved to the first place.  (Noted by
			 * Oleg Bartunov as something that should be corrected.)
			 */
			res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
		}
	}

	if (size > 0)
		res = res / size;
	pfree(item);
	return res;
}
Datum tsvectorrecv(PG_FUNCTION_ARGS) { StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); TSVector vec; int i; int32 nentries; int datalen; /* number of bytes used in the variable size * area after fixed size TSVector header and * WordEntries */ Size hdrlen; Size len; /* allocated size of vec */ bool needSort = false; nentries = pq_getmsgint(buf, sizeof(int32)); if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry))) elog(ERROR, "invalid size of tsvector"); hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries; len = hdrlen * 2; /* times two to make room for lexemes */ vec = (TSVector) palloc0(len); vec->size = nentries; datalen = 0; for (i = 0; i < nentries; i++) { const char *lexeme; uint16 npos; size_t lex_len; lexeme = pq_getmsgstring(buf); npos = (uint16) pq_getmsgint(buf, sizeof(uint16)); /* sanity checks */ lex_len = strlen(lexeme); if (lex_len > MAXSTRLEN) elog(ERROR, "invalid tsvector: lexeme too long"); if (datalen > MAXSTRPOS) elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded"); if (npos > MAXNUMPOS) elog(ERROR, "unexpected number of tsvector positions"); /* * Looks valid. Fill the WordEntry struct, and copy lexeme. * * But make sure the buffer is large enough first. */ while (hdrlen + SHORTALIGN(datalen + lex_len) + (npos + 1) * sizeof(WordEntryPos) >= len) { len *= 2; vec = (TSVector) repalloc(vec, len); } vec->entries[i].haspos = (npos > 0) ? 1 : 0; vec->entries[i].len = lex_len; vec->entries[i].pos = datalen; memcpy(STRPTR(vec) + datalen, lexeme, lex_len); datalen += lex_len; if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0) needSort = true; /* Receive positions */ if (npos > 0) { uint16 j; WordEntryPos *wepptr; /* * Pad to 2-byte alignment if necessary. Though we used palloc0 * for the initial allocation, subsequent repalloc'd memory areas * are not initialized to zero. 
*/ if (datalen != SHORTALIGN(datalen)) { *(STRPTR(vec) + datalen) = '\0'; datalen = SHORTALIGN(datalen); } memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16)); wepptr = POSDATAPTR(vec, &vec->entries[i]); for (j = 0; j < npos; j++) { wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos)); if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) elog(ERROR, "position information is misordered"); } datalen += (npos + 1) * sizeof(WordEntry); } } SET_VARSIZE(vec, hdrlen + datalen); if (needSort) qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry), compareentry, (void *) STRPTR(vec)); PG_RETURN_TSVECTOR(vec); }
Datum tsvectorout(PG_FUNCTION_ARGS) { TSVector out = PG_GETARG_TSVECTOR(0); char *outbuf; int32 i, lenbuf = 0, pp; WordEntry *ptr = ARRPTR(out); char *curbegin, *curin, *curout; lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; for (i = 0; i < out->size; i++) { lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ; if (ptr[i].haspos) lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i])); } curout = outbuf = (char *) palloc(lenbuf); for (i = 0; i < out->size; i++) { curbegin = curin = STRPTR(out) + ptr->pos; if (i != 0) *curout++ = ' '; *curout++ = '\''; while (curin - curbegin < ptr->len) { int len = pg_mblen(curin); if (t_iseq(curin, '\'')) *curout++ = '\''; else if (t_iseq(curin, '\\')) *curout++ = '\\'; while (len--) *curout++ = *curin++; } *curout++ = '\''; if ((pp = POSDATALEN(out, ptr)) != 0) { WordEntryPos *wptr; *curout++ = ':'; wptr = POSDATAPTR(out, ptr); while (pp) { curout += sprintf(curout, "%d", WEP_GETPOS(*wptr)); switch (WEP_GETWEIGHT(*wptr)) { case 3: *curout++ = 'A'; break; case 2: *curout++ = 'B'; break; case 1: *curout++ = 'C'; break; case 0: default: break; } if (pp > 1) *curout++ = ','; pp--; wptr++; } } ptr++; } *curout = '\0'; PG_FREE_IF_COPY(out, 0); PG_RETURN_CSTRING(outbuf); }
/*
 * Rank tsvector 't' against query 'q', treating the query operands as
 * alternatives.
 *
 * For every (sorted, de-duplicated) operand found in the tsvector the
 * position weights are combined and added to the result; the total is
 * divided by the number of distinct operands.  'w' is the weight table
 * read through the wpos() macro.  Entries without positions use the
 * POSNULL placeholder.
 */
static float
calc_rank_or(float *w, tsvector * t, QUERYTYPE * q)
{
	WordEntry  *we;
	WordEntryPos *positions;
	int4		npos,
				k,
				n;
	float		res = 0.0;
	ITEM	  **item;
	int			size = q->size;

	/* first slot of POSNULL is used as its position count */
	*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
	item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size);

	for (n = 0; n < size; n++)
	{
		float		resj,
					wjm;
		int4		jm;

		we = find_wordentry(t, q, item[n]);
		if (!we)
			continue;

		if (we->haspos)
		{
			npos = POSDATALEN(t, we);
			positions = POSDATAPTR(t, we);
		}
		else
		{
			npos = *(uint16 *) POSNULL;
			positions = POSNULL + 1;
		}

		/* sum w_k / k^2 and remember the heaviest position */
		resj = 0.0;
		wjm = -1.0;
		jm = 0;
		for (k = 0; k < npos; k++)
		{
			resj = resj + wpos(positions[k]) / ((k + 1) * (k + 1));
			if (wpos(positions[k]) > wjm)
			{
				wjm = wpos(positions[k]);
				jm = k;
			}
		}

		/*
		 * limit(sum(1/i^2), i->inf) = pi^2/6, hence the divisor.
		 * resj = sum(w_i/i^2), i = 1..noccurrence; the w_i ought to be
		 * sorted in descending order.  That is not done for now: the
		 * maximum weight is simply moved to the first place.  (Noted by
		 * Oleg Bartunov as something that should be corrected.)
		 */
		res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
	}

	if (size > 0)
		res = res / size;
	pfree(item);
	return res;
}
Datum tsvector_out(PG_FUNCTION_ARGS) { tsvector *out = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); char *outbuf; int4 i, j, lenbuf = 0, pp; WordEntry *ptr = ARRPTR(out); char *curin, *curout; lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; for (i = 0; i < out->size; i++) { lenbuf += ptr[i].len * 2 /* for escape */ ; if (ptr[i].haspos) lenbuf += 7 * POSDATALEN(out, &(ptr[i])); } curout = outbuf = (char *) palloc(lenbuf); for (i = 0; i < out->size; i++) { curin = STRPTR(out) + ptr->pos; if (i != 0) *curout++ = ' '; *curout++ = '\''; j = ptr->len; while (j--) { if (*curin == '\'') { int4 pos = curout - outbuf; outbuf = (char *) repalloc((void *) outbuf, ++lenbuf); curout = outbuf + pos; *curout++ = '\\'; } *curout++ = *curin++; } *curout++ = '\''; if ((pp = POSDATALEN(out, ptr)) != 0) { WordEntryPos *wptr; *curout++ = ':'; wptr = POSDATAPTR(out, ptr); while (pp) { sprintf(curout, "%d", wptr->pos); curout = strchr(curout, '\0'); switch (wptr->weight) { case 3: *curout++ = 'A'; break; case 2: *curout++ = 'B'; break; case 1: *curout++ = 'C'; break; case 0: default: break; } if (pp > 1) *curout++ = ','; pp--; wptr++; } } ptr++; } *curout = '\0'; outbuf[lenbuf - 1] = '\0'; PG_FREE_IF_COPY(out, 0); PG_RETURN_POINTER(outbuf); }