/* * Forms union of oldtup and addtup, if union == oldtup then return NULL */ IndexTuple gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *giststate) { bool neednew = FALSE; GISTENTRY oldentries[INDEX_MAX_KEYS], addentries[INDEX_MAX_KEYS]; bool oldisnull[INDEX_MAX_KEYS], addisnull[INDEX_MAX_KEYS]; IndexTuple newtup = NULL; int i; if (GistTupleIsInvalid(oldtup) || GistTupleIsInvalid(addtup)) return gist_form_invalid_tuple(ItemPointerGetBlockNumber(&(oldtup->t_tid))); gistDeCompressAtt(giststate, r, oldtup, NULL, (OffsetNumber) 0, oldentries, oldisnull); gistDeCompressAtt(giststate, r, addtup, NULL, (OffsetNumber) 0, addentries, addisnull); for (i = 0; i < r->rd_att->natts; i++) { gistMakeUnionKey(giststate, i, oldentries + i, oldisnull[i], addentries + i, addisnull[i], attrS + i, isnullS + i); if (neednew) /* we already need new key, so we can skip check */ continue; if (isnullS[i]) /* union of key may be NULL if and only if both keys are NULL */ continue; if (!addisnull[i]) { if (oldisnull[i] || gistKeyIsEQ(giststate, i, oldentries[i].key, attrS[i]) == false) neednew = true; } } if (neednew) { /* need to update key */ newtup = gistFormTuple(giststate, r, attrS, isnullS, false); newtup->t_tid = oldtup->t_tid; } return newtup; }
/* * if it was invalid tuple then we need special processing. * We move all invalid tuples on right page. * * if there is no place on left page, gistSplit will be called one more * time for left page. * * Normally, we never exec this code, but after crash replay it's possible * to get 'invalid' tuples (probability is low enough) */ static void gistSplitByInvalid(GISTSTATE *giststate, GistSplitVector *v, IndexTuple *itup, int len) { int i; static OffsetNumber offInvTuples[MaxOffsetNumber]; int nOffInvTuples = 0; for (i = 1; i <= len; i++) if (GistTupleIsInvalid(itup[i - 1])) offInvTuples[nOffInvTuples++] = i; if (nOffInvTuples == len) { /* corner case, all tuples are invalid */ v->spl_rightvalid = v->spl_leftvalid = false; gistSplitHalf(&v->splitVector, len); } else { GistSplitUnion gsvp; v->splitVector.spl_right = offInvTuples; v->splitVector.spl_nright = nOffInvTuples; v->spl_rightvalid = false; v->splitVector.spl_left = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); v->splitVector.spl_nleft = 0; for (i = 1; i <= len; i++) if (!GistTupleIsInvalid(itup[i - 1])) v->splitVector.spl_left[v->splitVector.spl_nleft++] = i; v->spl_leftvalid = true; gsvp.equiv = NULL; gsvp.attr = v->spl_lattr; gsvp.len = v->splitVector.spl_nleft; gsvp.entries = v->splitVector.spl_left; gsvp.isnull = v->spl_lisnull; gistunionsubkeyvec(giststate, itup, &gsvp, 0); } }
/* * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? * * The index tuple might represent either a heap tuple or a lower index page, * depending on whether the containing page is a leaf page or not. * * On success return for a heap tuple, *recheck_p is set to indicate * whether recheck is needed. We recheck if any of the consistent() functions * request it. recheck is not interesting when examining a non-leaf entry, * since we must visit the lower index page if there's any doubt. * * If we are doing an ordered scan, so->distances[] is filled with distance * data from the distance() functions before returning success. * * We must decompress the key in the IndexTuple before passing it to the * sk_funcs (which actually are the opclass Consistent or Distance methods). * * Note that this function is always invoked in a short-lived memory context, * so we don't need to worry about cleaning up allocated memory, either here * or in the implementation of any Consistent or Distance methods. */ static bool gistindex_keytest(IndexScanDesc scan, IndexTuple tuple, Page page, OffsetNumber offset, bool *recheck_p) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; GISTSTATE *giststate = so->giststate; ScanKey key = scan->keyData; int keySize = scan->numberOfKeys; double *distance_p; Relation r = scan->indexRelation; *recheck_p = false; /* * If it's a leftover invalid tuple from pre-9.1, treat it as a match with * minimum possible distances. This means we'll always follow it to the * referenced page. * * GPDB: the virtual TIDs created for AO tables use the full range of * offset numbers from 0 to 65535. So a tuple on leaf page that looks like * an invalid tuple, is actually ok. */ if (!GistPageIsLeaf(page) && GistTupleIsInvalid(tuple)) { int i; for (i = 0; i < scan->numberOfOrderBys; i++) so->distances[i] = -get_float8_infinity(); return true; } /* Check whether it matches according to the Consistent functions */ while (keySize > 0) { Datum datum; bool isNull; datum = index_getattr(tuple, key->sk_attno, giststate->tupdesc, &isNull); if (key->sk_flags & SK_ISNULL) { /* * On non-leaf page we can't conclude that child hasn't NULL * values because of assumption in GiST: union (VAL, NULL) is VAL. * But if on non-leaf page key IS NULL, then all children are * NULL. */ if (key->sk_flags & SK_SEARCHNULL) { if (GistPageIsLeaf(page) && !isNull) return false; } else { Assert(key->sk_flags & SK_SEARCHNOTNULL); if (isNull) return false; } } else if (isNull) { return false; } else { Datum test; bool recheck; GISTENTRY de; gistdentryinit(giststate, key->sk_attno - 1, &de, datum, r, page, offset, FALSE, isNull); /* * Call the Consistent function to evaluate the test. The * arguments are the index datum (as a GISTENTRY*), the comparison * datum, the comparison operator's strategy number and subtype * from pg_amop, and the recheck flag. * * (Presently there's no need to pass the subtype since it'll * always be zero, but might as well pass it for possible future * use.) * * We initialize the recheck flag to true (the safest assumption) * in case the Consistent function forgets to set it. */ recheck = true; test = FunctionCall5Coll(&key->sk_func, key->sk_collation, PointerGetDatum(&de), key->sk_argument, Int32GetDatum(key->sk_strategy), ObjectIdGetDatum(key->sk_subtype), PointerGetDatum(&recheck)); if (!DatumGetBool(test)) return false; *recheck_p |= recheck; } key++; keySize--; } /* OK, it passes --- now let's compute the distances */ key = scan->orderByData; distance_p = so->distances; keySize = scan->numberOfOrderBys; while (keySize > 0) { Datum datum; bool isNull; datum = index_getattr(tuple, key->sk_attno, giststate->tupdesc, &isNull); if ((key->sk_flags & SK_ISNULL) || isNull) { /* Assume distance computes as null and sorts to the end */ *distance_p = get_float8_infinity(); } else { Datum dist; GISTENTRY de; gistdentryinit(giststate, key->sk_attno - 1, &de, datum, r, page, offset, FALSE, isNull); /* * Call the Distance function to evaluate the distance. The * arguments are the index datum (as a GISTENTRY*), the comparison * datum, and the ordering operator's strategy number and subtype * from pg_amop. * * (Presently there's no need to pass the subtype since it'll * always be zero, but might as well pass it for possible future * use.) * * Note that Distance functions don't get a recheck argument. We * can't tolerate lossy distance calculations on leaf tuples; * there is no opportunity to re-sort the tuples afterwards. */ dist = FunctionCall4Coll(&key->sk_func, key->sk_collation, PointerGetDatum(&de), key->sk_argument, Int32GetDatum(key->sk_strategy), ObjectIdGetDatum(key->sk_subtype)); *distance_p = DatumGetFloat8(dist); } key++; distance_p++; keySize--; } return true; }
/* * Bulk deletion of all index entries pointing to a set of heap tuples and * check invalid tuples left after upgrade. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation rel = info->index; GistBDItem *stack, *ptr; /* first time through? */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* we'll re-count the tuples each time */ stats->estimated_count = false; stats->num_index_tuples = 0; stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; while (stack) { Buffer buffer; Page page; OffsetNumber i, maxoff; IndexTuple idxtuple; ItemId iid; buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); gistcheckpage(rel, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { OffsetNumber todelete[MaxOffsetNumber]; int ntodelete = 0; LockBuffer(buffer, GIST_UNLOCK); LockBuffer(buffer, GIST_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page)) { /* only the root can become non-leaf during relock */ UnlockReleaseBuffer(buffer); /* one more check */ continue; } /* * check for split proceeded after look at parent, we should check * it after relock */ pushStackIfSplited(page, stack); /* * Remove deletable tuples from page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); if (callback(&(idxtuple->t_tid), callback_state)) todelete[ntodelete++] = i; else stats->num_index_tuples += 1; } stats->tuples_removed += ntodelete; if (ntodelete) { START_CRIT_SECTION(); MarkBufferDirty(buffer); PageIndexMultiDelete(page, todelete, ntodelete); GistMarkTuplesDeleted(page); if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; recptr = gistXLogUpdate(buffer, todelete, ntodelete, NULL, 0, InvalidBuffer); PageSetLSN(page, recptr); } else PageSetLSN(page, gistGetFakeLSN(rel)); END_CRIT_SECTION(); } } else { /* check for split proceeded after look at parent */ pushStackIfSplited(page, stack); maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); ptr = (GistBDItem *) palloc(sizeof(GistBDItem)); ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); ptr->parentlsn = BufferGetLSNAtomic(buffer); ptr->next = stack->next; stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) ereport(LOG, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(rel)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); } } UnlockReleaseBuffer(buffer); ptr = stack->next; pfree(stack); stack = ptr; vacuum_delay_point(); } return stats; }
/* * Workhouse routine for doing insertion into a GiST index. Note that * this routine assumes it is invoked in a short-lived memory context, * so it does not bother releasing palloc'd allocations. */ void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) { ItemId iid; IndexTuple idxtuple; GISTInsertStack firststack; GISTInsertStack *stack; GISTInsertState state; bool xlocked = false; memset(&state, 0, sizeof(GISTInsertState)); state.freespace = freespace; state.r = r; /* Start from the root */ firststack.blkno = GIST_ROOT_BLKNO; firststack.lsn = 0; firststack.parent = NULL; firststack.downlinkoffnum = InvalidOffsetNumber; state.stack = stack = &firststack; /* * Walk down along the path of smallest penalty, updating the parent * pointers with the key we're inserting as we go. If we crash in the * middle, the tree is consistent, although the possible parent updates * were a waste. */ for (;;) { if (XLogRecPtrIsInvalid(stack->lsn)) stack->buffer = ReadBuffer(state.r, stack->blkno); /* * Be optimistic and grab shared lock first. Swap it for an exclusive * lock later if we need to update the page. */ if (!xlocked) { LockBuffer(stack->buffer, GIST_SHARE); gistcheckpage(state.r, stack->buffer); } stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = PageGetLSN(stack->page); Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn)); /* * If this page was split but the downlink was never inserted to the * parent because the inserting backend crashed before doing that, fix * that now. */ if (GistFollowRight(stack->page)) { if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; /* someone might've completed the split when we unlocked */ if (!GistFollowRight(stack->page)) continue; } gistfixsplit(&state, giststate); UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (stack->blkno != GIST_ROOT_BLKNO && stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * Concurrent split detected. There's no guarantee that the * downlink for this page is consistent with the tuple we're * inserting anymore, so go back to parent and rechoose the best * child. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (!GistPageIsLeaf(stack->page)) { /* * This is an internal page so continue to walk down the tree. * Find the child node that has the minimum insertion penalty. */ BlockNumber childblkno; IndexTuple newtup; GISTInsertStack *item; OffsetNumber downlinkoffnum; downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); iid = PageGetItemId(stack->page, downlinkoffnum); idxtuple = (IndexTuple) PageGetItem(stack->page, iid); childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); /* * Check that it's not a leftover invalid tuple from pre-9.1 */ if (GistTupleIsInvalid(idxtuple)) ereport(ERROR, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(r)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); /* * Check that the key representing the target child node is * consistent with the key we're inserting. Update it if it's not. */ newtup = gistgetadjusted(state.r, idxtuple, itup, giststate); if (newtup) { /* * Swap shared lock for an exclusive one. Beware, the page may * change while we unlock/lock the page... */ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); if (PageGetLSN(stack->page) != stack->lsn) { /* the page was changed while we unlocked it, retry */ continue; } } /* * Update the tuple. * * We still hold the lock after gistinserttuple(), but it * might have to split the page to make the updated tuple fit. * In that case the updated tuple might migrate to the other * half of the split, so we have to go back to the parent and * descend back to the half that's a better fit for the new * tuple. */ if (gistinserttuple(&state, stack, giststate, newtup, downlinkoffnum)) { /* * If this was a root split, the root page continues to be * the parent and the updated tuple went to one of the * child pages, so we just need to retry from the root * page. */ if (stack->blkno != GIST_ROOT_BLKNO) { UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; } continue; } } LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; /* descend to the chosen child */ item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); item->blkno = childblkno; item->parent = stack; item->downlinkoffnum = downlinkoffnum; state.stack = stack = item; } else { /* * Leaf page. Insert the new key. We've already updated all the * parents on the way down, but we might have to split the page if * it doesn't fit. gistinserthere() will take care of that. */ /* * Swap shared lock for an exclusive one. Be careful, the page may * change while we unlock/lock the page... */ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = PageGetLSN(stack->page); if (stack->blkno == GIST_ROOT_BLKNO) { /* * the only page that can become inner instead of leaf is * the root page, so for root we should recheck it */ if (!GistPageIsLeaf(stack->page)) { /* * very rare situation: during unlock/lock index with * number of pages = 1 was increased */ LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; continue; } /* * we don't need to check root split, because checking * leaf/inner is enough to recognize split for root */ } else if (GistFollowRight(stack->page) || stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * The page was split while we momentarily unlocked the * page. Go back to parent. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } } /* now state.stack->(page, buffer and blkno) points to leaf page */ gistinserttuple(&state, stack, giststate, itup, InvalidOffsetNumber); LockBuffer(stack->buffer, GIST_UNLOCK); /* Release any pins we might still hold before exiting */ for (; stack; stack = stack->parent) ReleaseBuffer(stack->buffer); break; } } }
/* * Bulk deletion of all index entries pointing to a set of heap tuples and * check invalid tuples after crash recovery. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ Datum gistbulkdelete(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(3); Relation rel = info->index; GistBDItem *stack, *ptr; /* first time through? */ if (stats == NULL) stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult)); /* we'll re-count the tuples each time */ stats->std.num_index_tuples = 0; stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; while (stack) { Buffer buffer = ReadBufferWithStrategy(rel, stack->blkno, info->strategy); Page page; OffsetNumber i, maxoff; IndexTuple idxtuple; ItemId iid; LockBuffer(buffer, GIST_SHARE); gistcheckpage(rel, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { OffsetNumber todelete[MaxOffsetNumber]; int ntodelete = 0; LockBuffer(buffer, GIST_UNLOCK); LockBuffer(buffer, GIST_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page)) { /* only the root can become non-leaf during relock */ UnlockReleaseBuffer(buffer); /* one more check */ continue; } /* * check for split proceeded after look at parent, we should check * it after relock */ pushStackIfSplited(page, stack); /* * Remove deletable tuples from page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); if (callback(&(idxtuple->t_tid), callback_state)) { todelete[ntodelete] = i - ntodelete; ntodelete++; stats->std.tuples_removed += 1; } else stats->std.num_index_tuples += 1; } if (ntodelete) { START_CRIT_SECTION(); MarkBufferDirty(buffer); for (i = 0; i < ntodelete; i++) PageIndexTupleDelete(page, todelete[i]); GistMarkTuplesDeleted(page); if (!rel->rd_istemp) { XLogRecData *rdata; XLogRecPtr recptr; gistxlogPageUpdate *xlinfo; rdata = formUpdateRdata(rel->rd_node, buffer, todelete, ntodelete, NULL, 0, NULL); xlinfo = (gistxlogPageUpdate *) rdata->next->data; recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); pfree(xlinfo); pfree(rdata); } else PageSetLSN(page, XLogRecPtrForTemp); END_CRIT_SECTION(); } } else { /* check for split proceeded after look at parent */ pushStackIfSplited(page, stack); maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); ptr = (GistBDItem *) palloc(sizeof(GistBDItem)); ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); ptr->parentlsn = PageGetLSN(page); ptr->next = stack->next; stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) stats->needFullVacuum = true; } } UnlockReleaseBuffer(buffer); ptr = stack->next; pfree(stack); stack = ptr; vacuum_delay_point(); } PG_RETURN_POINTER(stats); }
static ArrayTuple gistVacuumUpdate(GistVacuum *gv, BlockNumber blkno, bool needunion) { ArrayTuple res = {NULL, 0, false}; Buffer buffer; Page page, tempPage = NULL; OffsetNumber i, maxoff; ItemId iid; int lenaddon = 4, curlenaddon = 0, nOffToDelete = 0, nBlkToDelete = 0; IndexTuple idxtuple, *addon = NULL; bool needwrite = false; OffsetNumber offToDelete[MaxOffsetNumber]; BlockNumber blkToDelete[MaxOffsetNumber]; ItemPointerData *completed = NULL; int ncompleted = 0, lencompleted = 16; vacuum_delay_point(); buffer = ReadBufferWithStrategy(gv->index, blkno, gv->strategy); LockBuffer(buffer, GIST_EXCLUSIVE); gistcheckpage(gv->index, buffer); page = (Page) BufferGetPage(buffer); maxoff = PageGetMaxOffsetNumber(page); if (GistPageIsLeaf(page)) { if (GistTuplesDeleted(page)) needunion = needwrite = true; } else { completed = (ItemPointerData *) palloc(sizeof(ItemPointerData) * lencompleted); addon = (IndexTuple *) palloc(sizeof(IndexTuple) * lenaddon); /* get copy of page to work */ tempPage = GistPageGetCopyPage(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { ArrayTuple chldtuple; bool needchildunion; iid = PageGetItemId(tempPage, i); idxtuple = (IndexTuple) PageGetItem(tempPage, iid); needchildunion = (GistTupleIsInvalid(idxtuple)) ? true : false; if (needchildunion) elog(DEBUG2, "gistVacuumUpdate: need union for block %u", ItemPointerGetBlockNumber(&(idxtuple->t_tid))); chldtuple = gistVacuumUpdate(gv, ItemPointerGetBlockNumber(&(idxtuple->t_tid)), needchildunion); if (chldtuple.ituplen || chldtuple.emptypage) { /* update tuple or/and inserts new */ if (chldtuple.emptypage) blkToDelete[nBlkToDelete++] = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); offToDelete[nOffToDelete++] = i; PageIndexTupleDelete(tempPage, i); i--; maxoff--; needwrite = needunion = true; if (chldtuple.ituplen) { Assert(chldtuple.emptypage == false); while (curlenaddon + chldtuple.ituplen >= lenaddon) { lenaddon *= 2; addon = (IndexTuple *) repalloc(addon, sizeof(IndexTuple) * lenaddon); } memcpy(addon + curlenaddon, chldtuple.itup, chldtuple.ituplen * sizeof(IndexTuple)); curlenaddon += chldtuple.ituplen; if (chldtuple.ituplen > 1) { /* * child was split, so we need mark completion * insert(split) */ int j; while (ncompleted + chldtuple.ituplen > lencompleted) { lencompleted *= 2; completed = (ItemPointerData *) repalloc(completed, sizeof(ItemPointerData) * lencompleted); } for (j = 0; j < chldtuple.ituplen; j++) { ItemPointerCopy(&(chldtuple.itup[j]->t_tid), completed + ncompleted); ncompleted++; } } pfree(chldtuple.itup); } } } Assert(maxoff == PageGetMaxOffsetNumber(tempPage)); if (curlenaddon) { /* insert updated tuples */ if (gistnospace(tempPage, addon, curlenaddon, InvalidOffsetNumber, 0)) { /* there is no space on page to insert tuples */ res = vacuumSplitPage(gv, tempPage, buffer, addon, curlenaddon); tempPage = NULL; /* vacuumSplitPage() free tempPage */ needwrite = needunion = false; /* gistSplit already forms * unions and writes pages */ } else /* enough free space */ gistfillbuffer(gv->index, tempPage, addon, curlenaddon, InvalidOffsetNumber); } } /* * If page is empty, we should remove pointer to it before deleting page * (except root) */ if (blkno != GIST_ROOT_BLKNO && (PageIsEmpty(page) || (tempPage && PageIsEmpty(tempPage)))) { /* * New version of page is empty, so leave it unchanged, upper call * will mark our page as deleted. In case of page split we never will * be here... * * If page was empty it can't become non-empty during processing */ res.emptypage = true; UnlockReleaseBuffer(buffer); } else { /* write page and remove its childs if it need */ START_CRIT_SECTION(); if (tempPage && needwrite) { PageRestoreTempPage(tempPage, page); tempPage = NULL; } /* Empty index */ if (PageIsEmpty(page) && blkno == GIST_ROOT_BLKNO) { needwrite = true; GistPageSetLeaf(page); } if (needwrite) { MarkBufferDirty(buffer); GistClearTuplesDeleted(page); if (!gv->index->rd_istemp) { XLogRecData *rdata; XLogRecPtr recptr; char *xlinfo; rdata = formUpdateRdata(gv->index->rd_node, buffer, offToDelete, nOffToDelete, addon, curlenaddon, NULL); xlinfo = rdata->next->data; recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); pfree(xlinfo); pfree(rdata); } else PageSetLSN(page, XLogRecPtrForTemp); } END_CRIT_SECTION(); if (needunion && !PageIsEmpty(page)) { res.itup = (IndexTuple *) palloc(sizeof(IndexTuple)); res.ituplen = 1; res.itup[0] = PageMakeUnionKey(gv, buffer); } UnlockReleaseBuffer(buffer); /* delete empty children, now we havn't any links to pointed subtrees */ for (i = 0; i < nBlkToDelete; i++) gistDeleteSubtree(gv, blkToDelete[i]); if (ncompleted && !gv->index->rd_istemp) gistxlogInsertCompletion(gv->index->rd_node, completed, ncompleted); } for (i = 0; i < curlenaddon; i++) pfree(addon[i]); if (addon) pfree(addon); if (completed) pfree(completed); if (tempPage) pfree(tempPage); return res; }
/* * find entry with lowest penalty */ OffsetNumber gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ GISTSTATE *giststate) { OffsetNumber maxoff; OffsetNumber i; OffsetNumber which; float sum_grow, which_grow[INDEX_MAX_KEYS]; GISTENTRY entry, identry[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; maxoff = PageGetMaxOffsetNumber(p); *which_grow = -1.0; which = InvalidOffsetNumber; sum_grow = 1; gistDeCompressAtt(giststate, r, it, NULL, (OffsetNumber) 0, identry, isnull); Assert(maxoff >= FirstOffsetNumber); Assert(!GistPageIsLeaf(p)); for (i = FirstOffsetNumber; i <= maxoff && sum_grow; i = OffsetNumberNext(i)) { int j; IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i)); if (!GistPageIsLeaf(p) && GistTupleIsInvalid(itup)) { ereport(LOG, (errmsg("index \"%s\" needs VACUUM or REINDEX to finish crash recovery", RelationGetRelationName(r)))); continue; } sum_grow = 0; for (j = 0; j < r->rd_att->natts; j++) { Datum datum; float usize; bool IsNull; datum = index_getattr(itup, j + 1, giststate->tupdesc, &IsNull); gistdentryinit(giststate, j, &entry, datum, r, p, i, FALSE, IsNull); usize = gistpenalty(giststate, j, &entry, IsNull, &identry[j], isnull[j]); if (which_grow[j] < 0 || usize < which_grow[j]) { which = i; which_grow[j] = usize; if (j < r->rd_att->natts - 1 && i == FirstOffsetNumber) which_grow[j + 1] = -1; sum_grow += which_grow[j]; } else if (which_grow[j] == usize) sum_grow += usize; else { sum_grow = 1; break; } } } if (which == InvalidOffsetNumber) which = FirstOffsetNumber; return which; }
bool gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startkey, Datum *attr, bool *isnull) { int i; GistEntryVector *evec; int attrsize; evec = (GistEntryVector *) palloc((len + 2) * sizeof(GISTENTRY) + GEVHDRSZ); for (i = startkey; i < giststate->tupdesc->natts; i++) { int j; evec->n = 0; if (!isnull[i]) { gistentryinit(evec->vector[evec->n], attr[i], NULL, NULL, (OffsetNumber) 0, FALSE); evec->n++; } for (j = 0; j < len; j++) { Datum datum; bool IsNull; if (GistTupleIsInvalid(itvec[j])) return FALSE; /* signals that union with invalid tuple => * result is invalid */ datum = index_getattr(itvec[j], i + 1, giststate->tupdesc, &IsNull); if (IsNull) continue; gistdentryinit(giststate, i, evec->vector + evec->n, datum, NULL, NULL, (OffsetNumber) 0, FALSE, IsNull); evec->n++; } /* If this tuple vector was all NULLs, the union is NULL */ if (evec->n == 0) { attr[i] = (Datum) 0; isnull[i] = TRUE; } else { if (evec->n == 1) { evec->n = 2; evec->vector[1] = evec->vector[0]; } /* Make union and store in attr array */ attr[i] = FunctionCall2(&giststate->unionFn[i], PointerGetDatum(evec), PointerGetDatum(&attrsize)); isnull[i] = FALSE; } } return TRUE; }
/* * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? * * The index tuple might represent either a heap tuple or a lower index page, * depending on whether the containing page is a leaf page or not. * * On success return for a heap tuple, *recheck_p is set to indicate whether * the quals need to be rechecked. We recheck if any of the consistent() * functions request it. recheck is not interesting when examining a non-leaf * entry, since we must visit the lower index page if there's any doubt. * Similarly, *recheck_distances_p is set to indicate whether the distances * need to be rechecked, and it is also ignored for non-leaf entries. * * If we are doing an ordered scan, so->distances[] is filled with distance * data from the distance() functions before returning success. * * We must decompress the key in the IndexTuple before passing it to the * sk_funcs (which actually are the opclass Consistent or Distance methods). * * Note that this function is always invoked in a short-lived memory context, * so we don't need to worry about cleaning up allocated memory, either here * or in the implementation of any Consistent or Distance methods. */ static bool gistindex_keytest(IndexScanDesc scan, IndexTuple tuple, Page page, OffsetNumber offset, bool *recheck_p, bool *recheck_distances_p) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; GISTSTATE *giststate = so->giststate; ScanKey key = scan->keyData; int keySize = scan->numberOfKeys; double *distance_p; Relation r = scan->indexRelation; *recheck_p = false; *recheck_distances_p = false; /* * If it's a leftover invalid tuple from pre-9.1, treat it as a match with * minimum possible distances. This means we'll always follow it to the * referenced page. */ if (GistTupleIsInvalid(tuple)) { int i; if (GistPageIsLeaf(page)) /* shouldn't happen */ elog(ERROR, "invalid GiST tuple found on leaf page"); for (i = 0; i < scan->numberOfOrderBys; i++) so->distances[i] = -get_float8_infinity(); return true; } /* Check whether it matches according to the Consistent functions */ while (keySize > 0) { Datum datum; bool isNull; datum = index_getattr(tuple, key->sk_attno, giststate->tupdesc, &isNull); if (key->sk_flags & SK_ISNULL) { /* * On non-leaf page we can't conclude that child hasn't NULL * values because of assumption in GiST: union (VAL, NULL) is VAL. * But if on non-leaf page key IS NULL, then all children are * NULL. */ if (key->sk_flags & SK_SEARCHNULL) { if (GistPageIsLeaf(page) && !isNull) return false; } else { Assert(key->sk_flags & SK_SEARCHNOTNULL); if (isNull) return false; } } else if (isNull) { return false; } else { Datum test; bool recheck; GISTENTRY de; gistdentryinit(giststate, key->sk_attno - 1, &de, datum, r, page, offset, false, isNull); /* * Call the Consistent function to evaluate the test. The * arguments are the index datum (as a GISTENTRY*), the comparison * datum, the comparison operator's strategy number and subtype * from pg_amop, and the recheck flag. * * (Presently there's no need to pass the subtype since it'll * always be zero, but might as well pass it for possible future * use.) * * We initialize the recheck flag to true (the safest assumption) * in case the Consistent function forgets to set it. */ recheck = true; test = FunctionCall5Coll(&key->sk_func, key->sk_collation, PointerGetDatum(&de), key->sk_argument, Int16GetDatum(key->sk_strategy), ObjectIdGetDatum(key->sk_subtype), PointerGetDatum(&recheck)); if (!DatumGetBool(test)) return false; *recheck_p |= recheck; } key++; keySize--; } /* OK, it passes --- now let's compute the distances */ key = scan->orderByData; distance_p = so->distances; keySize = scan->numberOfOrderBys; while (keySize > 0) { Datum datum; bool isNull; datum = index_getattr(tuple, key->sk_attno, giststate->tupdesc, &isNull); if ((key->sk_flags & SK_ISNULL) || isNull) { /* Assume distance computes as null and sorts to the end */ *distance_p = get_float8_infinity(); } else { Datum dist; bool recheck; GISTENTRY de; gistdentryinit(giststate, key->sk_attno - 1, &de, datum, r, page, offset, false, isNull); /* * Call the Distance function to evaluate the distance. The * arguments are the index datum (as a GISTENTRY*), the comparison * datum, the ordering operator's strategy number and subtype from * pg_amop, and the recheck flag. * * (Presently there's no need to pass the subtype since it'll * always be zero, but might as well pass it for possible future * use.) * * If the function sets the recheck flag, the returned distance is * a lower bound on the true distance and needs to be rechecked. * We initialize the flag to 'false'. This flag was added in * version 9.5; distance functions written before that won't know * about the flag, but are expected to never be lossy. */ recheck = false; dist = FunctionCall5Coll(&key->sk_func, key->sk_collation, PointerGetDatum(&de), key->sk_argument, Int16GetDatum(key->sk_strategy), ObjectIdGetDatum(key->sk_subtype), PointerGetDatum(&recheck)); *recheck_distances_p |= recheck; *distance_p = DatumGetFloat8(dist); } key++; distance_p++; keySize--; } return true; }
/* * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? * * We must decompress the key in the IndexTuple before passing it to the * sk_func (and we have previously overwritten the sk_func to use the * user-defined Consistent method, so we actually are invoking that). * * Note that this function is always invoked in a short-lived memory context, * so we don't need to worry about cleaning up allocated memory, either here * or in the implementation of any Consistent methods. */ static bool gistindex_keytest(IndexTuple tuple, IndexScanDesc scan, OffsetNumber offset) { int keySize = scan->numberOfKeys; ScanKey key = scan->keyData; Relation r = scan->indexRelation; GISTScanOpaque so; Page p; GISTSTATE *giststate; so = (GISTScanOpaque) scan->opaque; giststate = so->giststate; p = BufferGetPage(so->curbuf); IncrIndexProcessed(); /* * Tuple doesn't restore after crash recovery because of incomplete insert */ if (!GistPageIsLeaf(p) && GistTupleIsInvalid(tuple)) return true; while (keySize > 0) { Datum datum; bool isNull; Datum test; GISTENTRY de; datum = index_getattr(tuple, key->sk_attno, giststate->tupdesc, &isNull); if (key->sk_flags & SK_ISNULL) { /* * On non-leaf page we can't conclude that child hasn't NULL * values because of assumption in GiST: uinon (VAL, NULL) is VAL * But if on non-leaf page key IS NULL then all childs has NULL. */ Assert(key->sk_flags & SK_SEARCHNULL); if (GistPageIsLeaf(p) && !isNull) return false; } else if (isNull) { return false; } else { gistdentryinit(giststate, key->sk_attno - 1, &de, datum, r, p, offset, FALSE, isNull); /* * Call the Consistent function to evaluate the test. The * arguments are the index datum (as a GISTENTRY*), the comparison * datum, and the comparison operator's strategy number and * subtype from pg_amop. * * (Presently there's no need to pass the subtype since it'll * always be zero, but might as well pass it for possible future * use.) */ test = FunctionCall4(&key->sk_func, PointerGetDatum(&de), key->sk_argument, Int32GetDatum(key->sk_strategy), ObjectIdGetDatum(key->sk_subtype)); if (!DatumGetBool(test)) return false; } keySize--; key++; } return true; }
/* * trys to split page by attno key, in a case of null * values move its to separate page. */ void gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate, GistSplitVector *v, GistEntryVector *entryvec, int attno) { int i; static OffsetNumber offNullTuples[MaxOffsetNumber]; int nOffNullTuples = 0; for (i = 1; i <= len; i++) { Datum datum; bool IsNull; if (!GistPageIsLeaf(page) && GistTupleIsInvalid(itup[i - 1])) { gistSplitByInvalid(giststate, v, itup, len); return; } datum = index_getattr(itup[i - 1], attno + 1, giststate->tupdesc, &IsNull); gistdentryinit(giststate, attno, &(entryvec->vector[i]), datum, r, page, i, FALSE, IsNull); if (IsNull) offNullTuples[nOffNullTuples++] = i; } v->spl_leftvalid = v->spl_rightvalid = true; if (nOffNullTuples == len) { /* * Corner case: All keys in attno column are null, we should try to * split by keys in next column. It all keys in all columns are NULL * just split page half by half */ v->spl_risnull[attno] = v->spl_lisnull[attno] = TRUE; if (attno + 1 == r->rd_att->natts) gistSplitHalf(&v->splitVector, len); else gistSplitByKey(r, page, itup, len, giststate, v, entryvec, attno + 1); } else if (nOffNullTuples > 0) { int j = 0; /* * We don't want to mix NULLs and not-NULLs keys on one page, so move * nulls to right page */ v->splitVector.spl_right = offNullTuples; v->splitVector.spl_nright = nOffNullTuples; v->spl_risnull[attno] = TRUE; v->splitVector.spl_left = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); v->splitVector.spl_nleft = 0; for (i = 1; i <= len; i++) if (j < v->splitVector.spl_nright && offNullTuples[j] == i) j++; else v->splitVector.spl_left[v->splitVector.spl_nleft++] = i; v->spl_equiv = NULL; gistunionsubkey(giststate, itup, v, attno); } else { /* * all keys are not-null */ entryvec->n = len + 1; if (gistUserPicksplit(r, entryvec, attno, v, itup, len, giststate) && attno + 1 != r->rd_att->natts) { /* * Splitting on attno column is not optimized: there is a tuples * which can be freely left or right page, we will try to split * page by following columns */ if (v->spl_equiv == NULL) { /* * simple case: left and right keys for attno column are * equial */ gistSplitByKey(r, page, itup, len, giststate, v, entryvec, attno + 1); } else { /* we should clean up vector from already distributed tuples */ IndexTuple *newitup = (IndexTuple *) palloc((len + 1) * sizeof(IndexTuple)); OffsetNumber *map = (OffsetNumber *) palloc((len + 1) * sizeof(IndexTuple)); int newlen = 0; GIST_SPLITVEC backupSplit = v->splitVector; for (i = 0; i < len; i++) if (v->spl_equiv[i + 1]) { map[newlen] = i + 1; newitup[newlen++] = itup[i]; } Assert(newlen > 0); backupSplit.spl_left = (OffsetNumber *) palloc(sizeof(OffsetNumber) * len); memcpy(backupSplit.spl_left, v->splitVector.spl_left, sizeof(OffsetNumber) * v->splitVector.spl_nleft); backupSplit.spl_right = (OffsetNumber *) palloc(sizeof(OffsetNumber) * len); memcpy(backupSplit.spl_right, v->splitVector.spl_right, sizeof(OffsetNumber) * v->splitVector.spl_nright); gistSplitByKey(r, page, newitup, newlen, giststate, v, entryvec, attno + 1); /* merge result of subsplit */ for (i = 0; i < v->splitVector.spl_nleft; i++) backupSplit.spl_left[backupSplit.spl_nleft++] = map[v->splitVector.spl_left[i] - 1]; for (i = 0; i < v->splitVector.spl_nright; i++) backupSplit.spl_right[backupSplit.spl_nright++] = map[v->splitVector.spl_right[i] - 1]; v->splitVector = backupSplit; /* reunion left and right datums */ gistunionsubkey(giststate, itup, v, attno); } } } }