/* * Order: haspos, len, word, for all positions (pos, weight) */ static int silly_cmp_tsvector(const TSVector a, const TSVector b) { if (VARSIZE(a) < VARSIZE(b)) return -1; else if (VARSIZE(a) > VARSIZE(b)) return 1; else if (a->size < b->size) return -1; else if (a->size > b->size) return 1; else { WordEntry *aptr = ARRPTR(a); WordEntry *bptr = ARRPTR(b); int i = 0; int res; for (i = 0; i < a->size; i++) { if (aptr->haspos != bptr->haspos) { return (aptr->haspos > bptr->haspos) ? -1 : 1; } else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0) { return res; } else if (aptr->haspos) { WordEntryPos *ap = POSDATAPTR(a, aptr); WordEntryPos *bp = POSDATAPTR(b, bptr); int j; if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr)) return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1; for (j = 0; j < POSDATALEN(a, aptr); j++) { if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp)) { return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1; } else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp)) { return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1; } ap++, bp++; } } aptr++; bptr++; } } return 0; }
/*
 * Append the positions of entry 'srcptr' (in 'src') to the position list of
 * entry 'destptr' (in 'dest'), shifting each source position by 'maxpos'
 * (clamped through LIMITPOS).
 *
 * Copying stops early once the destination holds MAXNUMPOS positions or its
 * last stored position equals MAXENTRYPOS - 1.  destptr->haspos is set when
 * at least one position was added.
 *
 * Returns the number of positions actually added.
 */
static int32
add_pos(TSVector src, WordEntry *srcptr,
        TSVector dest, WordEntry *destptr,
        int32 maxpos)
{
    uint16     *ndest = &_POSVECPTR(dest, destptr)->npos;
    uint16      nsrc = POSDATALEN(src, srcptr);
    uint16      before;
    WordEntryPos *from = POSDATAPTR(src, srcptr);
    WordEntryPos *to = POSDATAPTR(dest, destptr);
    int         k;

    /* A destination without positions starts from an empty list. */
    if (!destptr->haspos)
        *ndest = 0;
    before = *ndest;

    for (k = 0; k < nsrc; k++)
    {
        /* Destination list is full. */
        if (*ndest >= MAXNUMPOS)
            break;
        /* Last stored position already sits at the top of the range. */
        if (*ndest != 0 && WEP_GETPOS(to[*ndest - 1]) == MAXENTRYPOS - 1)
            break;

        WEP_SETWEIGHT(to[*ndest], WEP_GETWEIGHT(from[k]));
        WEP_SETPOS(to[*ndest], LIMITPOS(WEP_GETPOS(from[k]) + maxpos));
        (*ndest)++;
    }

    if (*ndest != before)
        destptr->haspos = 1;

    return *ndest - before;
}
Datum tsvector_setweight(PG_FUNCTION_ARGS) { TSVector in = PG_GETARG_TSVECTOR(0); char cw = PG_GETARG_CHAR(1); TSVector out; int i, j; WordEntry *entry; WordEntryPos *p; int w = 0; switch (cw) { case 'A': case 'a': w = 3; break; case 'B': case 'b': w = 2; break; case 'C': case 'c': w = 1; break; case 'D': case 'd': w = 0; break; default: /* internal error */ elog(ERROR, "unrecognized weight: %d", cw); } out = (TSVector) palloc(VARSIZE(in)); memcpy(out, in, VARSIZE(in)); entry = ARRPTR(out); i = out->size; while (i--) { if ((j = POSDATALEN(out, entry)) != 0) { p = POSDATAPTR(out, entry); while (j--) { WEP_SETWEIGHT(*p, w); p++; } } entry++; } PG_FREE_IF_COPY(in, 0); PG_RETURN_POINTER(out); }
/*
 * Count the positions of entry 'wptr' (inside tsvector 'txt') whose weight
 * is selected by the 'weight' bitmask; bit k of the mask selects weight
 * value k.
 */
static int
check_weight(TSVector txt, WordEntry *wptr, int8 weight)
{
    int         npos = POSDATALEN(txt, wptr);
    WordEntryPos *pos = POSDATAPTR(txt, wptr);
    int         matched = 0;
    int         k;

    for (k = 0; k < npos; k++, pos++)
    {
        if (weight & (1 << WEP_GETWEIGHT(*pos)))
            matched++;
    }

    return matched;
}
/*
 * Compute the "length" of a tsvector: the sum over all entries of the
 * number of stored positions, where an entry with no positions counts as 1.
 *
 * The entry array is walked from ARRPTR(t) up to (but excluding) STRPTR(t).
 */
static int
cnt_length(tsvector * t)
{
    WordEntry  *cur;
    WordEntry  *stop = (WordEntry *) STRPTR(t);
    int         total = 0;

    for (cur = ARRPTR(t); cur < stop; cur++)
    {
        int         npos = POSDATALEN(t, cur);

        total += (npos == 0) ? 1 : npos;
    }

    return total;
}
/*
 * Compute the "length" of a tsvector: the sum over all entries of the
 * number of stored positions, where an entry with no positions counts as 1.
 *
 * The entry array is walked from ARRPTR(t) up to (but excluding) STRPTR(t).
 */
static int
cnt_length(TSVector t)
{
    WordEntry  *cur;
    WordEntry  *stop = (WordEntry *) STRPTR(t);
    int         total = 0;

    for (cur = ARRPTR(t); cur < stop; cur++)
    {
        int         npos = POSDATALEN(t, cur);

        total += (npos == 0) ? 1 : npos;
    }

    return total;
}
Datum tsvectorsend(PG_FUNCTION_ARGS) { TSVector vec = PG_GETARG_TSVECTOR(0); StringInfoData buf; int i, j; WordEntry *weptr = ARRPTR(vec); pq_begintypsend(&buf); pq_sendint(&buf, vec->size, sizeof(int32)); for (i = 0; i < vec->size; i++) { uint16 npos; /* * the strings in the TSVector array are not null-terminated, so we * have to send the null-terminator separately */ pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len); pq_sendbyte(&buf, '\0'); npos = POSDATALEN(vec, weptr); pq_sendint(&buf, npos, sizeof(uint16)); if (npos > 0) { WordEntryPos *wepptr = POSDATAPTR(vec, weptr); for (j = 0; j < npos; j++) pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos)); } weptr++; } PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); }
/*
 * tsvector_concat(tsvector, tsvector) -> tsvector
 *
 * Merge two tsvectors into a newly allocated one.  Both entry arrays are
 * walked in parallel, ordered by compareEntry(); an entry present in both
 * inputs is emitted once with the position lists combined.  Positions taken
 * from in2 are shifted by the largest position found in in1 (see add_pos).
 *
 * Raises ERRCODE_PROGRAM_LIMIT_EXCEEDED if the resulting lexeme/position
 * data area is larger than MAXSTRPOS bytes.
 */
Datum
tsvector_concat(PG_FUNCTION_ARGS)
{
    TSVector    in1 = PG_GETARG_TSVECTOR(0);
    TSVector    in2 = PG_GETARG_TSVECTOR(1);
    TSVector    out;
    WordEntry  *ptr;
    WordEntry  *ptr1,
               *ptr2;
    WordEntryPos *p;
    int         maxpos = 0,
                i,
                j,
                i1,
                i2,
                dataoff,
                output_bytes,
                output_size;
    char       *data,
               *data1,
               *data2;

    /* Get max position in in1; we'll need this to offset in2's positions */
    ptr = ARRPTR(in1);
    i = in1->size;
    while (i--)
    {
        if ((j = POSDATALEN(in1, ptr)) != 0)
        {
            p = POSDATAPTR(in1, ptr);
            while (j--)
            {
                if (WEP_GETPOS(*p) > maxpos)
                    maxpos = WEP_GETPOS(*p);
                p++;
            }
        }
        ptr++;
    }

    /* Set up cursors over both inputs; i1/i2 count the entries left. */
    ptr1 = ARRPTR(in1);
    ptr2 = ARRPTR(in2);
    data1 = STRPTR(in1);
    data2 = STRPTR(in2);
    i1 = in1->size;
    i2 = in2->size;

    /*
     * Conservative estimate of space needed.  We might need all the data in
     * both inputs, and conceivably add a pad byte before position data for
     * each item where there was none before.
     */
    output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2;

    /* palloc0: add_pos() relies on a zeroed position count for new entries */
    out = (TSVector) palloc0(output_bytes);
    SET_VARSIZE(out, output_bytes);

    /*
     * We must make out->size valid so that STRPTR(out) is sensible.  We'll
     * collapse out any unused space at the end.
     */
    out->size = in1->size + in2->size;

    /* ptr walks the output entry array; dataoff is the write offset in data */
    ptr = ARRPTR(out);
    data = STRPTR(out);
    dataoff = 0;

    /* Merge step: runs while both inputs still have entries. */
    while (i1 && i2)
    {
        int         cmp = compareEntry(data1, ptr1, data2, ptr2);

        if (cmp < 0)
        {                       /* in1 first */
            ptr->haspos = ptr1->haspos;
            ptr->len = ptr1->len;
            memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
            ptr->pos = dataoff;
            dataoff += ptr1->len;
            if (ptr->haspos)
            {
                /* in1 positions are copied verbatim, count field included */
                dataoff = SHORTALIGN(dataoff);
                memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
                dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
            }

            ptr++;
            ptr1++;
            i1--;
        }
        else if (cmp > 0)
        {                       /* in2 first */
            ptr->haspos = ptr2->haspos;
            ptr->len = ptr2->len;
            memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
            ptr->pos = dataoff;
            dataoff += ptr2->len;
            if (ptr->haspos)
            {
                /* in2 positions go through add_pos() so they get offset */
                int         addlen = add_pos(in2, ptr2, out, ptr, maxpos);

                if (addlen == 0)
                    ptr->haspos = 0;
                else
                {
                    dataoff = SHORTALIGN(dataoff);
                    dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
                }
            }

            ptr++;
            ptr2++;
            i2--;
        }
        else
        {
            /* Same entry in both inputs: emit once, combine positions. */
            ptr->haspos = ptr1->haspos | ptr2->haspos;
            ptr->len = ptr1->len;
            memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
            ptr->pos = dataoff;
            dataoff += ptr1->len;
            if (ptr->haspos)
            {
                if (ptr1->haspos)
                {
                    dataoff = SHORTALIGN(dataoff);
                    memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
                    dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
                    /* count field already accounted for; add only new slots */
                    if (ptr2->haspos)
                        dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos);
                }
                else    /* must have ptr2->haspos */
                {
                    int         addlen = add_pos(in2, ptr2, out, ptr, maxpos);

                    if (addlen == 0)
                        ptr->haspos = 0;
                    else
                    {
                        dataoff = SHORTALIGN(dataoff);
                        dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
                    }
                }
            }

            ptr++;
            ptr1++;
            ptr2++;
            i1--;
            i2--;
        }
    }

    /* Drain whatever is left of in1 (same handling as the "in1 first" case) */
    while (i1)
    {
        ptr->haspos = ptr1->haspos;
        ptr->len = ptr1->len;
        memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
        ptr->pos = dataoff;
        dataoff += ptr1->len;
        if (ptr->haspos)
        {
            dataoff = SHORTALIGN(dataoff);
            memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
            dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
        }

        ptr++;
        ptr1++;
        i1--;
    }

    /* Drain whatever is left of in2 (same handling as the "in2 first" case) */
    while (i2)
    {
        ptr->haspos = ptr2->haspos;
        ptr->len = ptr2->len;
        memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
        ptr->pos = dataoff;
        dataoff += ptr2->len;
        if (ptr->haspos)
        {
            int         addlen = add_pos(in2, ptr2, out, ptr, maxpos);

            if (addlen == 0)
                ptr->haspos = 0;
            else
            {
                dataoff = SHORTALIGN(dataoff);
                dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
            }
        }

        ptr++;
        ptr2++;
        i2--;
    }

    /*
     * Instead of checking each offset individually, we check for overflow of
     * pos fields once at the end.
     */
    if (dataoff > MAXSTRPOS)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", dataoff, MAXSTRPOS)));

    /*
     * Adjust sizes (asserting that we didn't overrun the original estimates)
     * and collapse out any unused array entries.
     */
    output_size = ptr - ARRPTR(out);
    Assert(output_size <= out->size);
    out->size = output_size;
    /* STRPTR(out) is re-evaluated after out->size changed; slide data down */
    if (data != STRPTR(out))
        memmove(STRPTR(out), data, dataoff);
    output_bytes = CALCDATASIZE(out->size, dataoff);
    Assert(output_bytes <= VARSIZE(out));
    SET_VARSIZE(out, output_bytes);

    PG_FREE_IF_COPY(in1, 0);
    PG_FREE_IF_COPY(in2, 1);
    PG_RETURN_POINTER(out);
}
Datum tsvector_concat(PG_FUNCTION_ARGS) { TSVector in1 = PG_GETARG_TSVECTOR(0); TSVector in2 = PG_GETARG_TSVECTOR(1); TSVector out; WordEntry *ptr; WordEntry *ptr1, *ptr2; WordEntryPos *p; int maxpos = 0, i, j, i1, i2, dataoff; char *data, *data1, *data2; ptr = ARRPTR(in1); i = in1->size; while (i--) { if ((j = POSDATALEN(in1, ptr)) != 0) { p = POSDATAPTR(in1, ptr); while (j--) { if (WEP_GETPOS(*p) > maxpos) maxpos = WEP_GETPOS(*p); p++; } } ptr++; } ptr1 = ARRPTR(in1); ptr2 = ARRPTR(in2); data1 = STRPTR(in1); data2 = STRPTR(in2); i1 = in1->size; i2 = in2->size; /* conservative estimate of space needed */ out = (TSVector) palloc0(VARSIZE(in1) + VARSIZE(in2)); SET_VARSIZE(out, VARSIZE(in1) + VARSIZE(in2)); out->size = in1->size + in2->size; ptr = ARRPTR(out); data = STRPTR(out); dataoff = 0; while (i1 && i2) { int cmp = compareEntry(data1, ptr1, data2, ptr2); if (cmp < 0) { /* in1 first */ ptr->haspos = ptr1->haspos; ptr->len = ptr1->len; memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); ptr->pos = dataoff; dataoff += ptr1->len; if (ptr->haspos) { dataoff = SHORTALIGN(dataoff); memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } ptr++; ptr1++; i1--; } else if (cmp > 0) { /* in2 first */ ptr->haspos = ptr2->haspos; ptr->len = ptr2->len; memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); ptr->pos = dataoff; dataoff += ptr2->len; if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); if (addlen == 0) ptr->haspos = 0; else { dataoff = SHORTALIGN(dataoff); dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); } } ptr++; ptr2++; i2--; } else { ptr->haspos = ptr1->haspos | ptr2->haspos; ptr->len = ptr1->len; memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); ptr->pos = dataoff; dataoff += ptr1->len; if (ptr->haspos) { if (ptr1->haspos) { dataoff = SHORTALIGN(dataoff); memcpy(data + dataoff, _POSVECPTR(in1, 
ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); if (ptr2->haspos) dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos); } else /* must have ptr2->haspos */ { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); if (addlen == 0) ptr->haspos = 0; else { dataoff = SHORTALIGN(dataoff); dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); } } } ptr++; ptr1++; ptr2++; i1--; i2--; } } while (i1) { ptr->haspos = ptr1->haspos; ptr->len = ptr1->len; memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len); ptr->pos = dataoff; dataoff += ptr1->len; if (ptr->haspos) { dataoff = SHORTALIGN(dataoff); memcpy(data + dataoff, _POSVECPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16)); dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16); } ptr++; ptr1++; i1--; } while (i2) { ptr->haspos = ptr2->haspos; ptr->len = ptr2->len; memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len); ptr->pos = dataoff; dataoff += ptr2->len; if (ptr->haspos) { int addlen = add_pos(in2, ptr2, out, ptr, maxpos); if (addlen == 0) ptr->haspos = 0; else { dataoff = SHORTALIGN(dataoff); dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16); } } ptr++; ptr2++; i2--; } /* * Instead of checking each offset individually, we check for overflow of * pos fields once at the end. */ if (dataoff > MAXSTRPOS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("string is too long for tsvector (%d bytes, max %d bytes)", dataoff, MAXSTRPOS))); out->size = ptr - ARRPTR(out); SET_VARSIZE(out, CALCDATASIZE(out->size, dataoff)); if (data != STRPTR(out)) memmove(STRPTR(out), data, dataoff); PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in2, 1); PG_RETURN_POINTER(out); }
Datum get_covers(PG_FUNCTION_ARGS) { tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); QUERYTYPE *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1)); WordEntry *pptr = ARRPTR(txt); int i, dlen = 0, j, cur = 0, len = 0, rlen; DocWord *dw, *dwptr; text *out; char *cptr; DocRepresentation *doc; int olddwpos = 0; int ncover = 1; Extention ext; doc = get_docrep(txt, query, &rlen); if (!doc) { out = palloc(VARHDRSZ); VARATT_SIZEP(out) = VARHDRSZ; PG_FREE_IF_COPY(txt, 0); PG_FREE_IF_COPY(query, 1); PG_RETURN_POINTER(out); } for (i = 0; i < txt->size; i++) { if (!pptr[i].haspos) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("no pos info"))); dlen += POSDATALEN(txt, &(pptr[i])); } dwptr = dw = palloc(sizeof(DocWord) * dlen); memset(dw, 0, sizeof(DocWord) * dlen); for (i = 0; i < txt->size; i++) { WordEntryPos *posdata = POSDATAPTR(txt, &(pptr[i])); for (j = 0; j < POSDATALEN(txt, &(pptr[i])); j++) { dw[cur].w = STRPTR(txt) + pptr[i].pos; dw[cur].len = pptr[i].len; dw[cur].pos = WEP_GETPOS(posdata[j]); cur++; } len += (pptr[i].len + 1) * (int) POSDATALEN(txt, &(pptr[i])); } qsort((void *) dw, dlen, sizeof(DocWord), compareDocWord); MemSet(&ext, 0, sizeof(Extention)); while (Cover(doc, rlen, query, &ext)) { dwptr = dw + olddwpos; while (dwptr->pos < ext.p && dwptr - dw < dlen) dwptr++; olddwpos = dwptr - dw; dwptr->start = ncover; while (dwptr->pos < ext.q + 1 && dwptr - dw < dlen) dwptr++; (dwptr - 1)->finish = ncover; len += 4 /* {}+two spaces */ + 2 * 16 /* numbers */ ; ncover++; } out = palloc(VARHDRSZ + len); cptr = ((char *) out) + VARHDRSZ; dwptr = dw; while (dwptr - dw < dlen) { if (dwptr->start) { sprintf(cptr, "{%d ", dwptr->start); cptr = strchr(cptr, '\0'); } memcpy(cptr, dwptr->w, dwptr->len); cptr += dwptr->len; *cptr = ' '; cptr++; if (dwptr->finish) { sprintf(cptr, "}%d ", dwptr->finish); cptr = strchr(cptr, '\0'); } dwptr++; } VARATT_SIZEP(out) = cptr - ((char *) out); pfree(dw); for (i = 0; i < rlen; i++) if 
(doc[i].needfree) pfree(doc[i].item); pfree(doc); PG_FREE_IF_COPY(txt, 0); PG_FREE_IF_COPY(query, 1); PG_RETURN_POINTER(out); }
Datum tsvectorout(PG_FUNCTION_ARGS) { TSVector out = PG_GETARG_TSVECTOR(0); char *outbuf; int32 i, lenbuf = 0, pp; WordEntry *ptr = ARRPTR(out); char *curbegin, *curin, *curout; lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; for (i = 0; i < out->size; i++) { lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ; if (ptr[i].haspos) lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i])); } curout = outbuf = (char *) palloc(lenbuf); for (i = 0; i < out->size; i++) { curbegin = curin = STRPTR(out) + ptr->pos; if (i != 0) *curout++ = ' '; *curout++ = '\''; while (curin - curbegin < ptr->len) { int len = pg_mblen(curin); if (t_iseq(curin, '\'')) *curout++ = '\''; else if (t_iseq(curin, '\\')) *curout++ = '\\'; while (len--) *curout++ = *curin++; } *curout++ = '\''; if ((pp = POSDATALEN(out, ptr)) != 0) { WordEntryPos *wptr; *curout++ = ':'; wptr = POSDATAPTR(out, ptr); while (pp) { curout += sprintf(curout, "%d", WEP_GETPOS(*wptr)); switch (WEP_GETWEIGHT(*wptr)) { case 3: *curout++ = 'A'; break; case 2: *curout++ = 'B'; break; case 1: *curout++ = 'C'; break; case 0: default: break; } if (pp > 1) *curout++ = ','; pp--; wptr++; } } ptr++; } *curout = '\0'; PG_FREE_IF_COPY(out, 0); PG_RETURN_CSTRING(outbuf); }
/*
 * Build the document representation used for cover search: one
 * DocRepresentation element per position of every query operand found in
 * 'txt', sorted with compareDocR.
 *
 * Operands that compare equal (compareITEM) are handled together: the first
 * one found marks all of them via istrue, and its elements list every such
 * query item.  An entry without position data uses the POSNULL placeholder.
 *
 * *doclen receives the number of elements.  Returns NULL (after freeing the
 * array) when no element was produced.
 */
static DocRepresentation *
get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
{
    ITEM       *item = GETQUERY(query);
    DocRepresentation *doc;
    char       *operand;
    int         len = query->size * 4;
    int         cur = 0;
    int4        i;

    /* First slot of POSNULL holds its own element count. */
    *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
    doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
    operand = GETOPERAND(query);
    reset_istrue_flag(query);

    for (i = 0; i < query->size; i++)
    {
        WordEntry  *entry;
        WordEntryPos *post;
        int4        dimt;
        int4        j;

        /* Skip operators and operands already covered by an equal one. */
        if (item[i].type != VAL || item[i].istrue)
            continue;

        entry = find_wordentry(txt, query, &(item[i]));
        if (!entry)
            continue;

        if (entry->haspos)
        {
            dimt = POSDATALEN(txt, entry);
            post = POSDATAPTR(txt, entry);
        }
        else
        {
            dimt = *(uint16 *) POSNULL;
            post = POSNULL + 1;
        }

        /* Grow the array until the new elements fit. */
        while (cur + dimt >= len)
        {
            len *= 2;
            doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
        }

        for (j = 0; j < dimt; j++, cur++)
        {
            doc[cur].needfree = false;

            if (j != 0)
            {
                /* Later positions share the item list of the first one. */
                doc[cur].nitem = doc[cur - 1].nitem;
                doc[cur].item = doc[cur - 1].item;
            }
            else
            {
                ITEM       *iptr = item + i;
                ITEM       *kptr;
                int         k;

                doc[cur].nitem = 0;
                doc[cur].item = (ITEM **) palloc(sizeof(ITEM *) * query->size);

                /* Collect this operand and every operand equal to it. */
                for (k = 0; k < query->size; k++)
                {
                    kptr = item + k;
                    if (k == i ||
                        (item[k].type == VAL && compareITEM(&kptr, &iptr, operand) == 0))
                    {
                        doc[cur].item[doc[cur].nitem] = item + k;
                        doc[cur].nitem++;
                        kptr->istrue = 1;
                    }
                }
            }

            doc[cur].pos = WEP_GETPOS(post[j]);
            doc[cur].wclass = WEP_GETWEIGHT(post[j]);
        }
    }

    *doclen = cur;

    if (cur <= 0)
    {
        pfree(doc);
        return NULL;
    }

    if (cur > 1)
        qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
    return doc;
}
/*
 * Rank of tsvector 't' against query 'q', treating the query operands as
 * alternatives.
 *
 * For every distinct operand found in 't', the weights of its positions
 * (looked up through wpos(), which reads 'w') are combined as
 * sum(w_j / j^2), with the largest weight promoted to the first slot, and
 * scaled by 1.64493406685.  The total is divided by the number of distinct
 * operands.  An entry without position data uses the POSNULL placeholder.
 */
static float
calc_rank_or(float *w, tsvector * t, QUERYTYPE * q)
{
    ITEM      **item;
    int         size = q->size;
    float       res = 0.0;
    int4        i;

    /* First slot of POSNULL holds its own element count. */
    *(uint16 *) POSNULL = lengthof(POSNULL) - 1;
    item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size);

    for (i = 0; i < size; i++)
    {
        WordEntry  *entry;
        WordEntryPos *post;
        int4        dimt;
        int4        j;
        int4        jm = 0;
        float       resj = 0.0;
        float       wjm = -1.0;

        entry = find_wordentry(t, q, item[i]);
        if (!entry)
            continue;

        if (entry->haspos)
        {
            dimt = POSDATALEN(t, entry);
            post = POSDATAPTR(t, entry);
        }
        else
        {
            dimt = *(uint16 *) POSNULL;
            post = POSNULL + 1;
        }

        /* Accumulate w_j / j^2 and remember where the largest weight is. */
        for (j = 0; j < dimt; j++)
        {
            resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
            if (wpos(post[j]) > wjm)
            {
                wjm = wpos(post[j]);
                jm = j;
            }
        }

        /*
         * limit (sum(i/i^2),i->inf) = pi^2/6
         * resj = sum(wi/i^2),i=1,noccurence,
         * wi - should be sorted desc,
         * don't sort for now, just choose maximum weight. This should be corrected
         * Oleg Bartunov
         */
        res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
    }

    if (size > 0)
        res = res / size;
    pfree(item);
    return res;
}
/*
 * Build the document representation used for cover search: one
 * DocRepresentation element per position of every tsvector entry that
 * find_wordentry() returns for a query operand, sorted with compareDocR.
 *
 * Operands that compare equal (compareQueryOperand) are handled together:
 * the first one found marks all of them through QR_SET_OPERAND_EXISTS, and
 * its elements list every such query item.  An entry without position data
 * uses the POSNULL placeholder.
 *
 * *doclen receives the number of elements.  Returns NULL (after freeing the
 * array) when no element was produced.
 */
static DocRepresentation *
get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
{
    QueryItem  *item = GETQUERY(qr->query);
    DocRepresentation *doc;
    char       *operand;
    int         len = qr->query->size * 4;
    int         cur = 0;
    int32       i;

    doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
    operand = GETOPERAND(qr->query);

    for (i = 0; i < qr->query->size; i++)
    {
        QueryOperand *curoperand;
        WordEntry  *entry;
        WordEntry  *firstentry;
        int32       nitem;

        if (item[i].type != QI_VAL)
            continue;

        curoperand = &item[i].qoperand;

        /* Already covered by an equal operand seen earlier. */
        if (QR_GET_OPERAND_EXISTS(qr, &item[i]))
            continue;

        firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
        if (!entry)
            continue;

        /* find_wordentry may report a run of nitem matching entries. */
        for (; entry - firstentry < nitem; entry++)
        {
            WordEntryPos *post;
            int32       dimt;
            int32       j;

            if (entry->haspos)
            {
                dimt = POSDATALEN(txt, entry);
                post = POSDATAPTR(txt, entry);
            }
            else
            {
                dimt = POSNULL.npos;
                post = POSNULL.pos;
            }

            /* Grow the array until the new elements fit. */
            while (cur + dimt >= len)
            {
                len *= 2;
                doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
            }

            for (j = 0; j < dimt; j++, cur++)
            {
                if (j != 0)
                {
                    /* Later positions share the item list of the first. */
                    doc[cur].nitem = doc[cur - 1].nitem;
                    doc[cur].item = doc[cur - 1].item;
                }
                else
                {
                    int         k;

                    doc[cur].nitem = 0;
                    doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * qr->query->size);

                    for (k = 0; k < qr->query->size; k++)
                    {
                        QueryOperand *kptr = &item[k].qoperand;
                        QueryOperand *iptr = &item[i].qoperand;

                        /*
                         * For k == i the QI_VAL type check was already done
                         * at the top of the outer loop.
                         */
                        if (k == i ||
                            (item[k].type == QI_VAL &&
                             compareQueryOperand(&kptr, &iptr, operand) == 0))
                        {
                            doc[cur].item[doc[cur].nitem] = item + k;
                            doc[cur].nitem++;
                            QR_SET_OPERAND_EXISTS(qr, item + k);
                        }
                    }
                }

                doc[cur].pos = WEP_GETPOS(post[j]);
                doc[cur].wclass = WEP_GETWEIGHT(post[j]);
            }
        }
    }

    *doclen = cur;

    if (cur <= 0)
    {
        pfree(doc);
        return NULL;
    }

    qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
    return doc;
}
/*
 * Rank of tsvector 't' against query 'q', treating the query operands as
 * alternatives.
 *
 * For every distinct operand, each tsvector entry returned by
 * find_wordentry() contributes: the weights of its positions (looked up
 * through wpos(), which reads 'w') combined as sum(w_j / j^2), with the
 * largest weight promoted to the first slot, scaled by 1.64493406685.  The
 * total is divided by the number of distinct operands.  An entry without
 * position data uses the POSNULL placeholder.
 */
static float
calc_rank_or(float *w, TSVector t, TSQuery q)
{
    QueryOperand **item;
    int         size = q->size;
    float       res = 0.0;
    int32       i;

    item = SortAndUniqItems(q, &size);

    for (i = 0; i < size; i++)
    {
        WordEntry  *entry;
        WordEntry  *firstentry;
        int32       nitem;

        firstentry = entry = find_wordentry(t, q, item[i], &nitem);
        if (!entry)
            continue;

        /* find_wordentry may report a run of nitem matching entries. */
        for (; entry - firstentry < nitem; entry++)
        {
            WordEntryPos *post;
            int32       dimt;
            int32       j;
            int32       jm = 0;
            float       resj = 0.0;
            float       wjm = -1.0;

            if (entry->haspos)
            {
                dimt = POSDATALEN(t, entry);
                post = POSDATAPTR(t, entry);
            }
            else
            {
                dimt = POSNULL.npos;
                post = POSNULL.pos;
            }

            /* Accumulate w_j / j^2 and remember the largest weight. */
            for (j = 0; j < dimt; j++)
            {
                resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
                if (wpos(post[j]) > wjm)
                {
                    wjm = wpos(post[j]);
                    jm = j;
                }
            }

            /*
             * limit (sum(i/i^2),i->inf) = pi^2/6
             * resj = sum(wi/i^2),i=1,noccurence,
             * wi - should be sorted desc,
             * don't sort for now, just choose maximum weight. This should be corrected
             * Oleg Bartunov
             */
            res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
        }
    }

    if (size > 0)
        res = res / size;
    pfree(item);
    return res;
}
/*
 * Record one occurrence of the lexeme at entry index 'off' of 'txt' in the
 * binary tree rooted at stat->root.
 *
 * The occurrence count is the entry's number of positions (1 if it has
 * none) when stat->weight is 0; otherwise it is the number of positions
 * matching the stat->weight mask (0 if the entry has no positions).  A zero
 * count inserts nothing.
 *
 * An existing node gets ndoc incremented and nentry increased by the count;
 * otherwise a new node is allocated in 'persistentContext' and linked in.
 * stat->maxdepth is raised to the depth reached by the search.
 */
static void
insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, uint32 off)
{
    WordEntry  *we = ARRPTR(txt) + off;
    StatEntry  *node = stat->root;
    StatEntry  *parent = NULL;
    int         nocc;
    int         cmp = 0;
    uint32      depth = 1;

    if (stat->weight == 0)
        nocc = (we->haspos) ? POSDATALEN(txt, we) : 1;
    else
        nocc = (we->haspos) ? check_weight(txt, we, stat->weight) : 0;

    if (nocc == 0)
        return;                 /* nothing to insert */

    /* Descend; depth only advances when we step down to a child. */
    for (; node != NULL; depth++)
    {
        cmp = compareStatWord(node, we, txt);
        if (cmp == 0)
            break;

        parent = node;
        node = (cmp < 0) ? node->left : node->right;
    }

    if (depth > stat->maxdepth)
        stat->maxdepth = depth;

    if (node != NULL)
    {
        /* Lexeme already known: just bump its counters. */
        node->ndoc++;
        node->nentry += nocc;
        return;
    }

    node = MemoryContextAlloc(persistentContext, STATENTRYHDRSZ + we->len);
    node->left = node->right = NULL;
    node->ndoc = 1;
    node->nentry = nocc;
    node->lenlexeme = we->len;
    memcpy(node->lexeme, STRPTR(txt) + we->pos, node->lenlexeme);

    /* Hook the new node in on the side the last comparison pointed to. */
    if (parent == NULL)
        stat->root = node;
    else if (cmp < 0)
        parent->left = node;
    else
        parent->right = node;
}
Datum tsvector_out(PG_FUNCTION_ARGS) { tsvector *out = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); char *outbuf; int4 i, j, lenbuf = 0, pp; WordEntry *ptr = ARRPTR(out); char *curin, *curout; lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; for (i = 0; i < out->size; i++) { lenbuf += ptr[i].len * 2 /* for escape */ ; if (ptr[i].haspos) lenbuf += 7 * POSDATALEN(out, &(ptr[i])); } curout = outbuf = (char *) palloc(lenbuf); for (i = 0; i < out->size; i++) { curin = STRPTR(out) + ptr->pos; if (i != 0) *curout++ = ' '; *curout++ = '\''; j = ptr->len; while (j--) { if (*curin == '\'') { int4 pos = curout - outbuf; outbuf = (char *) repalloc((void *) outbuf, ++lenbuf); curout = outbuf + pos; *curout++ = '\\'; } *curout++ = *curin++; } *curout++ = '\''; if ((pp = POSDATALEN(out, ptr)) != 0) { WordEntryPos *wptr; *curout++ = ':'; wptr = POSDATAPTR(out, ptr); while (pp) { sprintf(curout, "%d", wptr->pos); curout = strchr(curout, '\0'); switch (wptr->weight) { case 3: *curout++ = 'A'; break; case 2: *curout++ = 'B'; break; case 1: *curout++ = 'C'; break; case 0: default: break; } if (pp > 1) *curout++ = ','; pp--; wptr++; } } ptr++; } *curout = '\0'; outbuf[lenbuf - 1] = '\0'; PG_FREE_IF_COPY(out, 0); PG_RETURN_POINTER(outbuf); }