/* * Mask a Gist page before running consistency checks on it. */ void gist_mask(char *pagedata, BlockNumber blkno) { Page page = (Page) pagedata; mask_page_lsn_and_checksum(page); mask_page_hint_bits(page); mask_unused_space(page); /* * NSN is nothing but a special purpose LSN. Hence, mask it for the same * reason as mask_page_lsn_and_checksum. */ GistPageSetNSN(page, (uint64) MASK_MARKER); /* * We update F_FOLLOW_RIGHT flag on the left child after writing WAL * record. Hence, mask this flag. See gistplacetopage() for details. */ GistMarkFollowRight(page); if (GistPageIsLeaf(page)) { /* * In gist leaf pages, it is possible to modify the LP_FLAGS without * emitting any WAL record. Hence, mask the line pointer flags. See * gistkillitems() for details. */ mask_lp_flags(page); } /* * During gist redo, we never mark a page as garbage. Hence, mask it to * ignore any differences. */ GistClearPageHasGarbage(page); }
/* * pgstat_gist_page -- check tuples in a gist page */ static void pgstat_gist_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno) { Buffer buf; Page page; buf = ReadBuffer(rel, blkno); LockBuffer(buf, GIST_SHARE); gistcheckpage(rel, buf); page = BufferGetPage(buf); if (GistPageIsLeaf(page)) { pgstat_index_page(stat, page, FirstOffsetNumber, PageGetMaxOffsetNumber(page)); } else { /* root or node */ } UnlockReleaseBuffer(buf); }
/* * pgstat_gist_page -- check tuples in a gist page */ static void pgstat_gist_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, BufferAccessStrategy bstrategy) { Buffer buf; Page page; buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buf, GIST_SHARE); gistcheckpage(rel, buf); page = BufferGetPage(buf); if (GistPageIsLeaf(page)) { pgstat_index_page(stat, page, FirstOffsetNumber, PageGetMaxOffsetNumber(page)); } else { /* root or node */ } UnlockReleaseBuffer(buf); }
static ArrayTuple gistVacuumUpdate(GistVacuum *gv, BlockNumber blkno, bool needunion) { ArrayTuple res = {NULL, 0, false}; Buffer buffer; Page page, tempPage = NULL; OffsetNumber i, maxoff; ItemId iid; int lenaddon = 4, curlenaddon = 0, nOffToDelete = 0, nBlkToDelete = 0; IndexTuple idxtuple, *addon = NULL; bool needwrite = false; OffsetNumber offToDelete[MaxOffsetNumber]; BlockNumber blkToDelete[MaxOffsetNumber]; ItemPointerData *completed = NULL; int ncompleted = 0, lencompleted = 16; vacuum_delay_point(); buffer = ReadBufferWithStrategy(gv->index, blkno, gv->strategy); LockBuffer(buffer, GIST_EXCLUSIVE); gistcheckpage(gv->index, buffer); page = (Page) BufferGetPage(buffer); maxoff = PageGetMaxOffsetNumber(page); if (GistPageIsLeaf(page)) { if (GistTuplesDeleted(page)) needunion = needwrite = true; } else { completed = (ItemPointerData *) palloc(sizeof(ItemPointerData) * lencompleted); addon = (IndexTuple *) palloc(sizeof(IndexTuple) * lenaddon); /* get copy of page to work */ tempPage = GistPageGetCopyPage(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { ArrayTuple chldtuple; bool needchildunion; iid = PageGetItemId(tempPage, i); idxtuple = (IndexTuple) PageGetItem(tempPage, iid); needchildunion = (GistTupleIsInvalid(idxtuple)) ? true : false; if (needchildunion) elog(DEBUG2, "gistVacuumUpdate: need union for block %u", ItemPointerGetBlockNumber(&(idxtuple->t_tid))); chldtuple = gistVacuumUpdate(gv, ItemPointerGetBlockNumber(&(idxtuple->t_tid)), needchildunion); if (chldtuple.ituplen || chldtuple.emptypage) { /* update tuple or/and inserts new */ if (chldtuple.emptypage) blkToDelete[nBlkToDelete++] = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); offToDelete[nOffToDelete++] = i; PageIndexTupleDelete(tempPage, i); i--; maxoff--; needwrite = needunion = true; if (chldtuple.ituplen) { Assert(chldtuple.emptypage == false); while (curlenaddon + chldtuple.ituplen >= lenaddon) { lenaddon *= 2; addon = (IndexTuple *) repalloc(addon, sizeof(IndexTuple) * lenaddon); } memcpy(addon + curlenaddon, chldtuple.itup, chldtuple.ituplen * sizeof(IndexTuple)); curlenaddon += chldtuple.ituplen; if (chldtuple.ituplen > 1) { /* * child was split, so we need mark completion * insert(split) */ int j; while (ncompleted + chldtuple.ituplen > lencompleted) { lencompleted *= 2; completed = (ItemPointerData *) repalloc(completed, sizeof(ItemPointerData) * lencompleted); } for (j = 0; j < chldtuple.ituplen; j++) { ItemPointerCopy(&(chldtuple.itup[j]->t_tid), completed + ncompleted); ncompleted++; } } pfree(chldtuple.itup); } } } Assert(maxoff == PageGetMaxOffsetNumber(tempPage)); if (curlenaddon) { /* insert updated tuples */ if (gistnospace(tempPage, addon, curlenaddon, InvalidOffsetNumber, 0)) { /* there is no space on page to insert tuples */ res = vacuumSplitPage(gv, tempPage, buffer, addon, curlenaddon); tempPage = NULL; /* vacuumSplitPage() free tempPage */ needwrite = needunion = false; /* gistSplit already forms * unions and writes pages */ } else /* enough free space */ gistfillbuffer(gv->index, tempPage, addon, curlenaddon, InvalidOffsetNumber); } } /* * If page is empty, we should remove pointer to it before deleting page * (except root) */ if (blkno != GIST_ROOT_BLKNO && (PageIsEmpty(page) || (tempPage && PageIsEmpty(tempPage)))) { /* * New version of page is empty, so leave it unchanged, upper call * will mark our page as deleted. In case of page split we never will * be here... * * If page was empty it can't become non-empty during processing */ res.emptypage = true; UnlockReleaseBuffer(buffer); } else { /* write page and remove its childs if it need */ START_CRIT_SECTION(); if (tempPage && needwrite) { PageRestoreTempPage(tempPage, page); tempPage = NULL; } /* Empty index */ if (PageIsEmpty(page) && blkno == GIST_ROOT_BLKNO) { needwrite = true; GistPageSetLeaf(page); } if (needwrite) { MarkBufferDirty(buffer); GistClearTuplesDeleted(page); if (!gv->index->rd_istemp) { XLogRecData *rdata; XLogRecPtr recptr; char *xlinfo; rdata = formUpdateRdata(gv->index->rd_node, buffer, offToDelete, nOffToDelete, addon, curlenaddon, NULL); xlinfo = rdata->next->data; recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); pfree(xlinfo); pfree(rdata); } else PageSetLSN(page, XLogRecPtrForTemp); } END_CRIT_SECTION(); if (needunion && !PageIsEmpty(page)) { res.itup = (IndexTuple *) palloc(sizeof(IndexTuple)); res.ituplen = 1; res.itup[0] = PageMakeUnionKey(gv, buffer); } UnlockReleaseBuffer(buffer); /* delete empty children, now we havn't any links to pointed subtrees */ for (i = 0; i < nBlkToDelete; i++) gistDeleteSubtree(gv, blkToDelete[i]); if (ncompleted && !gv->index->rd_istemp) gistxlogInsertCompletion(gv->index->rd_node, completed, ncompleted); } for (i = 0; i < curlenaddon; i++) pfree(addon[i]); if (addon) pfree(addon); if (completed) pfree(completed); if (tempPage) pfree(tempPage); return res; }
/* * Fetch a tuples that matchs the search key; this can be invoked * either to fetch the first such tuple or subsequent matching * tuples. Returns true iff a matching tuple was found. */ static int gistnext(IndexScanDesc scan, ScanDirection dir, ItemPointer tids, int maxtids, bool ignore_killed_tuples) { MIRROREDLOCK_BUFMGR_DECLARE; Page p; OffsetNumber n; GISTScanOpaque so; GISTSearchStack *stk; IndexTuple it; GISTPageOpaque opaque; int ntids = 0; so = (GISTScanOpaque) scan->opaque; // -------- MirroredLock ---------- MIRROREDLOCK_BUFMGR_LOCK; if ( so->qual_ok == false ) return 0; if (ItemPointerIsValid(&so->curpos) == false) { /* Being asked to fetch the first entry, so start at the root */ Assert(so->curbuf == InvalidBuffer); Assert(so->stack == NULL); so->curbuf = ReadBuffer(scan->indexRelation, GIST_ROOT_BLKNO); stk = so->stack = (GISTSearchStack *) palloc0(sizeof(GISTSearchStack)); stk->next = NULL; stk->block = GIST_ROOT_BLKNO; pgstat_count_index_scan(scan->indexRelation); } else if (so->curbuf == InvalidBuffer) { MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- return 0; } /* * check stored pointers from last visit */ if ( so->nPageData > 0 ) { while( ntids < maxtids && so->curPageData < so->nPageData ) { tids[ ntids ] = scan->xs_ctup.t_self = so->pageData[ so->curPageData ].heapPtr; ItemPointerSet(&(so->curpos), BufferGetBlockNumber(so->curbuf), so->pageData[ so->curPageData ].pageOffset); so->curPageData ++; ntids++; } if ( ntids == maxtids ) { MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- return ntids; } /* * Go to the next page */ stk = so->stack->next; pfree(so->stack); so->stack = stk; /* If we're out of stack entries, we're done */ if (so->stack == NULL) { ReleaseBuffer(so->curbuf); so->curbuf = InvalidBuffer; MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- return ntids; } so->curbuf = ReleaseAndReadBuffer(so->curbuf, scan->indexRelation, stk->block); } for (;;) { /* First of all, we need lock buffer */ Assert(so->curbuf != InvalidBuffer); LockBuffer(so->curbuf, GIST_SHARE); gistcheckpage(scan->indexRelation, so->curbuf); p = BufferGetPage(so->curbuf); opaque = GistPageGetOpaque(p); /* remember lsn to identify page changed for tuple's killing */ so->stack->lsn = PageGetLSN(p); /* check page split, occured from last visit or visit to parent */ if (!XLogRecPtrIsInvalid(so->stack->parentlsn) && XLByteLT(so->stack->parentlsn, opaque->nsn) && opaque->rightlink != InvalidBlockNumber /* sanity check */ && (so->stack->next == NULL || so->stack->next->block != opaque->rightlink) /* check if already added */ ) { /* detect page split, follow right link to add pages */ stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack)); stk->next = so->stack->next; stk->block = opaque->rightlink; stk->parentlsn = so->stack->parentlsn; memset(&(stk->lsn), 0, sizeof(GistNSN)); so->stack->next = stk; } /* if page is empty, then just skip it */ if (PageIsEmpty(p)) { LockBuffer(so->curbuf, GIST_UNLOCK); stk = so->stack->next; pfree(so->stack); so->stack = stk; if (so->stack == NULL) { ReleaseBuffer(so->curbuf); so->curbuf = InvalidBuffer; MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- return ntids; } so->curbuf = ReleaseAndReadBuffer(so->curbuf, scan->indexRelation, stk->block); continue; } if (ScanDirectionIsBackward(dir)) n = PageGetMaxOffsetNumber(p); else n = FirstOffsetNumber; /* wonderful, we can look at page */ so->nPageData = so->curPageData = 0; for (;;) { n = gistfindnext(scan, n, dir); if (!OffsetNumberIsValid(n)) { while( ntids < maxtids && so->curPageData < so->nPageData ) { tids[ ntids ] = scan->xs_ctup.t_self = so->pageData[ so->curPageData ].heapPtr; ItemPointerSet(&(so->curpos), BufferGetBlockNumber(so->curbuf), so->pageData[ so->curPageData ].pageOffset); so->curPageData ++; ntids++; } if ( ntids == maxtids ) { LockBuffer(so->curbuf, GIST_UNLOCK); MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- return ntids; } /* * We ran out of matching index entries on the current page, * so pop the top stack entry and use it to continue the * search. */ LockBuffer(so->curbuf, GIST_UNLOCK); stk = so->stack->next; pfree(so->stack); so->stack = stk; /* If we're out of stack entries, we're done */ if (so->stack == NULL) { ReleaseBuffer(so->curbuf); so->curbuf = InvalidBuffer; MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- return ntids; } so->curbuf = ReleaseAndReadBuffer(so->curbuf, scan->indexRelation, stk->block); /* XXX go up */ break; } if (GistPageIsLeaf(p)) { /* * We've found a matching index entry in a leaf page, so * return success. Note that we keep "curbuf" pinned so that * we can efficiently resume the index scan later. */ if (!(ignore_killed_tuples && ItemIdIsDead(PageGetItemId(p, n)))) { it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); so->pageData[ so->nPageData ].heapPtr = it->t_tid; so->pageData[ so->nPageData ].pageOffset = n; so->nPageData ++; } } else { /* * We've found an entry in an internal node whose key is * consistent with the search key, so push it to stack */ stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack)); it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n)); stk->block = ItemPointerGetBlockNumber(&(it->t_tid)); memset(&(stk->lsn), 0, sizeof(GistNSN)); stk->parentlsn = so->stack->lsn; stk->next = so->stack->next; so->stack->next = stk; } if (ScanDirectionIsBackward(dir)) n = OffsetNumberPrev(n); else n = OffsetNumberNext(n); } } MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- return ntids; }
/* * Scan all items on the GiST index page identified by *pageItem, and insert * them into the queue (or directly to output areas) * * scan: index scan we are executing * pageItem: search queue item identifying an index page to scan * myDistances: distances array associated with pageItem, or NULL at the root * tbm: if not NULL, gistgetbitmap's output bitmap * ntids: if not NULL, gistgetbitmap's output tuple counter * * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap * tuples should be reported directly into the bitmap. If they are NULL, * we're doing a plain or ordered indexscan. For a plain indexscan, heap * tuple TIDs are returned into so->pageData[]. For an ordered indexscan, * heap tuple TIDs are pushed into individual search queue items. * * If we detect that the index page has split since we saw its downlink * in the parent, we push its new right sibling onto the queue so the * sibling will be processed next. */ static void gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, TIDBitmap *tbm, int64 *ntids) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; Buffer buffer; Page page; GISTPageOpaque opaque; OffsetNumber maxoff; OffsetNumber i; GISTSearchTreeItem *tmpItem = so->tmpTreeItem; bool isNew; MemoryContext oldcxt; Assert(!GISTSearchItemIsHeap(*pageItem)); buffer = ReadBuffer(scan->indexRelation, pageItem->blkno); LockBuffer(buffer, GIST_SHARE); gistcheckpage(scan->indexRelation, buffer); page = BufferGetPage(buffer); opaque = GistPageGetOpaque(page); /* * Check if we need to follow the rightlink. We need to follow it if the * page was concurrently split since we visited the parent (in which case * parentlsn < nsn), or if the system crashed after a page split but * before the downlink was inserted into the parent. */ if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) && (GistFollowRight(page) || pageItem->data.parentlsn < GistPageGetNSN(page)) && opaque->rightlink != InvalidBlockNumber /* sanity check */ ) { /* There was a page split, follow right link to add pages */ GISTSearchItem *item; /* This can't happen when starting at the root */ Assert(myDistances != NULL); oldcxt = MemoryContextSwitchTo(so->queueCxt); /* Create new GISTSearchItem for the right sibling index page */ item = palloc(sizeof(GISTSearchItem)); item->next = NULL; item->blkno = opaque->rightlink; item->data.parentlsn = pageItem->data.parentlsn; /* Insert it into the queue using same distances as for this page */ tmpItem->head = item; tmpItem->lastHeap = NULL; memcpy(tmpItem->distances, myDistances, sizeof(double) * scan->numberOfOrderBys); (void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew); MemoryContextSwitchTo(oldcxt); } so->nPageData = so->curPageData = 0; /* * check all tuples on page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { IndexTuple it = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); bool match; bool recheck; /* * Must call gistindex_keytest in tempCxt, and clean up any leftover * junk afterward. */ oldcxt = MemoryContextSwitchTo(so->giststate->tempCxt); match = gistindex_keytest(scan, it, page, i, &recheck); MemoryContextSwitchTo(oldcxt); MemoryContextReset(so->giststate->tempCxt); /* Ignore tuple if it doesn't match */ if (!match) continue; if (tbm && GistPageIsLeaf(page)) { /* * getbitmap scan, so just push heap tuple TIDs into the bitmap * without worrying about ordering */ tbm_add_tuples(tbm, &it->t_tid, 1, recheck); (*ntids)++; } else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page)) { /* * Non-ordered scan, so report heap tuples in so->pageData[] */ so->pageData[so->nPageData].heapPtr = it->t_tid; so->pageData[so->nPageData].recheck = recheck; so->nPageData++; } else { /* * Must push item into search queue. We get here for any lower * index page, and also for heap tuples if doing an ordered * search. */ GISTSearchItem *item; oldcxt = MemoryContextSwitchTo(so->queueCxt); /* Create new GISTSearchItem for this item */ item = palloc(sizeof(GISTSearchItem)); item->next = NULL; if (GistPageIsLeaf(page)) { /* Creating heap-tuple GISTSearchItem */ item->blkno = InvalidBlockNumber; item->data.heap.heapPtr = it->t_tid; item->data.heap.recheck = recheck; } else { /* Creating index-page GISTSearchItem */ item->blkno = ItemPointerGetBlockNumber(&it->t_tid); /* * LSN of current page is lsn of parent page for child. We * only have a shared lock, so we need to get the LSN * atomically. */ item->data.parentlsn = BufferGetLSNAtomic(buffer); } /* Insert it into the queue using new distance data */ tmpItem->head = item; tmpItem->lastHeap = GISTSearchItemIsHeap(*item) ? item : NULL; memcpy(tmpItem->distances, so->distances, sizeof(double) * scan->numberOfOrderBys); (void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew); MemoryContextSwitchTo(oldcxt); } } UnlockReleaseBuffer(buffer); }
/* * Traverse the tree to find path from root page to specified "child" block. * * returns a new insertion stack, starting from the parent of "child", up * to the root. *downlinkoffnum is set to the offset of the downlink in the * direct parent of child. * * To prevent deadlocks, this should lock only one page at a time. */ static GISTInsertStack * gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) { Page page; Buffer buffer; OffsetNumber i, maxoff; ItemId iid; IndexTuple idxtuple; List *fifo; GISTInsertStack *top, *ptr; BlockNumber blkno; top = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); top->blkno = GIST_ROOT_BLKNO; top->downlinkoffnum = InvalidOffsetNumber; fifo = list_make1(top); while (fifo != NIL) { /* Get next page to visit */ top = linitial(fifo); fifo = list_delete_first(fifo); buffer = ReadBuffer(r, top->blkno); LockBuffer(buffer, GIST_SHARE); gistcheckpage(r, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { /* * Because we scan the index top-down, all the rest of the pages * in the queue must be leaf pages as well. */ UnlockReleaseBuffer(buffer); break; } top->lsn = PageGetLSN(page); /* * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a * downlink. This should not normally happen.. */ if (GistFollowRight(page)) elog(ERROR, "concurrent GiST page split was incomplete"); if (top->parent && top->parent->lsn < GistPageGetNSN(page) && GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ ) { /* * Page was split while we looked elsewhere. We didn't see the * downlink to the right page when we scanned the parent, so add * it to the queue now. * * Put the right page ahead of the queue, so that we visit it * next. That's important, because if this is the lowest internal * level, just above leaves, we might already have queued up some * leaf pages, and we assume that there can't be any non-leaf * pages behind leaf pages. */ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); ptr->blkno = GistPageGetOpaque(page)->rightlink; ptr->downlinkoffnum = InvalidOffsetNumber; ptr->parent = top->parent; fifo = lcons(ptr, fifo); } maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); if (blkno == child) { /* Found it! */ UnlockReleaseBuffer(buffer); *downlinkoffnum = i; return top; } else { /* Append this child to the list of pages to visit later */ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); ptr->blkno = blkno; ptr->downlinkoffnum = i; ptr->parent = top; fifo = lappend(fifo, ptr); } } UnlockReleaseBuffer(buffer); } elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u", RelationGetRelationName(r), child); return NULL; /* keep compiler quiet */ }
/* * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple * at that offset is atomically removed along with inserting the new tuples. * This is used to replace a tuple with a new one. * * If 'leftchildbuf' is valid, we're inserting the downlink for the page * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'. * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set. * * If 'markfollowright' is true and the page is split, the left child is * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered * index build, however, there is no concurrent access and the page splitting * is done in a slightly simpler fashion, and false is passed. * * If there is not enough room on the page, it is split. All the split * pages are kept pinned and locked and returned in *splitinfo, the caller * is responsible for inserting the downlinks for them. However, if * 'buffer' is the root page and it needs to be split, gistplacetopage() * performs the split as one atomic operation, and *splitinfo is set to NIL. * In that case, we continue to hold the root page locked, and the child * pages are released; note that new tuple(s) are *not* on the root page * but in one of the new child pages. * * If 'newblkno' is not NULL, returns the block number of page the first * new/updated tuple was inserted to. Usually it's the given page, but could * be its right sibling if the page was split. * * Returns 'true' if the page was split, 'false' otherwise. */ bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, Buffer buffer, IndexTuple *itup, int ntup, OffsetNumber oldoffnum, BlockNumber *newblkno, Buffer leftchildbuf, List **splitinfo, bool markfollowright) { BlockNumber blkno = BufferGetBlockNumber(buffer); Page page = BufferGetPage(buffer); bool is_leaf = (GistPageIsLeaf(page)) ? true : false; XLogRecPtr recptr; int i; bool is_split; /* * Refuse to modify a page that's incompletely split. This should not * happen because we finish any incomplete splits while we walk down the * tree. However, it's remotely possible that another concurrent inserter * splits a parent page, and errors out before completing the split. We * will just throw an error in that case, and leave any split we had in * progress unfinished too. The next insert that comes along will clean up * the mess. */ if (GistFollowRight(page)) elog(ERROR, "concurrent GiST page split was incomplete"); *splitinfo = NIL; /* * if isupdate, remove old key: This node's key has been modified, either * because a child split occurred or because we needed to adjust our key * for an insert in a child node. Therefore, remove the old version of * this node's key. * * for WAL replay, in the non-split case we handle this by setting up a * one-element todelete array; in the split case, it's handled implicitly * because the tuple vector passed to gistSplit won't include this tuple. */ is_split = gistnospace(page, itup, ntup, oldoffnum, freespace); if (is_split) { /* no space for insertion */ IndexTuple *itvec; int tlen; SplitedPageLayout *dist = NULL, *ptr; BlockNumber oldrlink = InvalidBlockNumber; GistNSN oldnsn = 0; SplitedPageLayout rootpg; bool is_rootsplit; is_rootsplit = (blkno == GIST_ROOT_BLKNO); /* * Form index tuples vector to split. If we're replacing an old tuple, * remove the old version from the vector. */ itvec = gistextractpage(page, &tlen); if (OffsetNumberIsValid(oldoffnum)) { /* on inner page we should remove old tuple */ int pos = oldoffnum - FirstOffsetNumber; tlen--; if (pos != tlen) memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos)); } itvec = gistjoinvector(itvec, &tlen, itup, ntup); dist = gistSplit(rel, page, itvec, tlen, giststate); /* * Set up pages to work with. Allocate new buffers for all but the * leftmost page. The original page becomes the new leftmost page, and * is just replaced with the new contents. * * For a root-split, allocate new buffers for all child pages, the * original page is overwritten with new root page containing * downlinks to the new child pages. */ ptr = dist; if (!is_rootsplit) { /* save old rightlink and NSN */ oldrlink = GistPageGetOpaque(page)->rightlink; oldnsn = GistPageGetNSN(page); dist->buffer = buffer; dist->block.blkno = BufferGetBlockNumber(buffer); dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer)); /* clean all flags except F_LEAF */ GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0; ptr = ptr->next; } for (; ptr; ptr = ptr->next) { /* Allocate new page */ ptr->buffer = gistNewBuffer(rel); GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); ptr->page = BufferGetPage(ptr->buffer); ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); } /* * Now that we know which blocks the new pages go to, set up downlink * tuples to point to them. */ for (ptr = dist; ptr; ptr = ptr->next) { ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); GistTupleSetValid(ptr->itup); } /* * If this is a root split, we construct the new root page with the * downlinks here directly, instead of requiring the caller to insert * them. Add the new root page to the list along with the child pages. */ if (is_rootsplit) { IndexTuple *downlinks; int ndownlinks = 0; int i; rootpg.buffer = buffer; rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer)); GistPageGetOpaque(rootpg.page)->flags = 0; /* Prepare a vector of all the downlinks */ for (ptr = dist; ptr; ptr = ptr->next) ndownlinks++; downlinks = palloc(sizeof(IndexTuple) * ndownlinks); for (i = 0, ptr = dist; ptr; ptr = ptr->next) downlinks[i++] = ptr->itup; rootpg.block.blkno = GIST_ROOT_BLKNO; rootpg.block.num = ndownlinks; rootpg.list = gistfillitupvec(downlinks, ndownlinks, &(rootpg.lenlist)); rootpg.itup = NULL; rootpg.next = dist; dist = &rootpg; } else { /* Prepare split-info to be returned to caller */ for (ptr = dist; ptr; ptr = ptr->next) { GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); si->buf = ptr->buffer; si->downlink = ptr->itup; *splitinfo = lappend(*splitinfo, si); } } /* * Fill all pages. All the pages are new, ie. freshly allocated empty * pages, or a temporary copy of the old page. */ for (ptr = dist; ptr; ptr = ptr->next) { char *data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { IndexTuple thistup = (IndexTuple) data; if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel)); /* * If this is the first inserted/updated tuple, let the caller * know which page it landed on. */ if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid)) *newblkno = ptr->block.blkno; data += IndexTupleSize(thistup); } /* Set up rightlinks */ if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO) GistPageGetOpaque(ptr->page)->rightlink = ptr->next->block.blkno; else GistPageGetOpaque(ptr->page)->rightlink = oldrlink; /* * Mark the all but the right-most page with the follow-right * flag. It will be cleared as soon as the downlink is inserted * into the parent, but this ensures that if we error out before * that, the index is still consistent. (in buffering build mode, * any error will abort the index build anyway, so this is not * needed.) */ if (ptr->next && !is_rootsplit && markfollowright) GistMarkFollowRight(ptr->page); else GistClearFollowRight(ptr->page); /* * Copy the NSN of the original page to all pages. The * F_FOLLOW_RIGHT flags ensure that scans will follow the * rightlinks until the downlinks are inserted. */ GistPageSetNSN(ptr->page, oldnsn); } START_CRIT_SECTION(); /* * Must mark buffers dirty before XLogInsert, even though we'll still * be changing their opaque fields below. */ for (ptr = dist; ptr; ptr = ptr->next) MarkBufferDirty(ptr->buffer); if (BufferIsValid(leftchildbuf)) MarkBufferDirty(leftchildbuf); /* * The first page in the chain was a temporary working copy meant to * replace the old page. Copy it over the old page. */ PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); dist->page = BufferGetPage(dist->buffer); /* Write the WAL record */ if (RelationNeedsWAL(rel)) recptr = gistXLogSplit(rel->rd_node, blkno, is_leaf, dist, oldrlink, oldnsn, leftchildbuf, markfollowright); else recptr = gistGetFakeLSN(rel); for (ptr = dist; ptr; ptr = ptr->next) { PageSetLSN(ptr->page, recptr); } /* * Return the new child buffers to the caller. * * If this was a root split, we've already inserted the downlink * pointers, in the form of a new root page. Therefore we can release * all the new buffers, and keep just the root page locked. */ if (is_rootsplit) { for (ptr = dist->next; ptr; ptr = ptr->next) UnlockReleaseBuffer(ptr->buffer); } } else { /* * Enough space. We also get here if ntuples==0. */ START_CRIT_SECTION(); if (OffsetNumberIsValid(oldoffnum)) PageIndexTupleDelete(page, oldoffnum); gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); MarkBufferDirty(buffer); if (BufferIsValid(leftchildbuf)) MarkBufferDirty(leftchildbuf); if (RelationNeedsWAL(rel)) { OffsetNumber ndeloffs = 0, deloffs[1]; if (OffsetNumberIsValid(oldoffnum)) { deloffs[0] = oldoffnum; ndeloffs = 1; } recptr = gistXLogUpdate(rel->rd_node, buffer, deloffs, ndeloffs, itup, ntup, leftchildbuf); PageSetLSN(page, recptr); } else { recptr = gistGetFakeLSN(rel); PageSetLSN(page, recptr); } if (newblkno) *newblkno = blkno; } /* * If we inserted the downlink for a child page, set NSN and clear * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to * follow the rightlink if and only if they looked at the parent page * before we inserted the downlink. * * Note that we do this *after* writing the WAL record. That means that * the possible full page image in the WAL record does not include these * changes, and they must be replayed even if the page is restored from * the full page image. There's a chicken-and-egg problem: if we updated * the child pages first, we wouldn't know the recptr of the WAL record * we're about to write. */ if (BufferIsValid(leftchildbuf)) { Page leftpg = BufferGetPage(leftchildbuf); GistPageSetNSN(leftpg, recptr); GistClearFollowRight(leftpg); PageSetLSN(leftpg, recptr); } END_CRIT_SECTION(); return is_split; }
static void gistfindleaf(GISTInsertState *state, GISTSTATE *giststate) { ItemId iid; IndexTuple idxtuple; GISTPageOpaque opaque; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; /* * walk down, We don't lock page for a long time, but so we should be * ready to recheck path in a bad case... We remember, that page->lsn * should never be invalid. */ for (;;) { if (XLogRecPtrIsInvalid(state->stack->lsn)) state->stack->buffer = ReadBuffer(state->r, state->stack->blkno); LockBuffer(state->stack->buffer, GIST_SHARE); gistcheckpage(state->r, state->stack->buffer); state->stack->page = (Page) BufferGetPage(state->stack->buffer); opaque = GistPageGetOpaque(state->stack->page); state->stack->lsn = PageGetLSN(state->stack->page); Assert(state->r->rd_istemp || !XLogRecPtrIsInvalid(state->stack->lsn)); if (state->stack->blkno != GIST_ROOT_BLKNO && XLByteLT(state->stack->parent->lsn, opaque->nsn)) { /* * caused split non-root page is detected, go up to parent to * choose best child */ UnlockReleaseBuffer(state->stack->buffer); state->stack = state->stack->parent; continue; } if (!GistPageIsLeaf(state->stack->page)) { /* * This is an internal page, so continue to walk down the tree. We * find the child node that has the minimum insertion penalty and * recursively invoke ourselves to modify that node. Once the * recursive call returns, we may need to adjust the parent node * for two reasons: the child node split, or the key in this node * needs to be adjusted for the newly inserted key below us. */ GISTInsertStack *item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); state->stack->childoffnum = gistchoose(state->r, state->stack->page, state->itup[0], giststate); iid = PageGetItemId(state->stack->page, state->stack->childoffnum); idxtuple = (IndexTuple) PageGetItem(state->stack->page, iid); item->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); LockBuffer(state->stack->buffer, GIST_UNLOCK); item->parent = state->stack; item->child = NULL; if (state->stack) state->stack->child = item; state->stack = item; } else { /* be carefull, during unlock/lock page may be changed... */ LockBuffer(state->stack->buffer, GIST_UNLOCK); LockBuffer(state->stack->buffer, GIST_EXCLUSIVE); state->stack->page = (Page) BufferGetPage(state->stack->buffer); opaque = GistPageGetOpaque(state->stack->page); if (state->stack->blkno == GIST_ROOT_BLKNO) { /* * the only page can become inner instead of leaf is a root * page, so for root we should recheck it */ if (!GistPageIsLeaf(state->stack->page)) { /* * very rarely situation: during unlock/lock index with * number of pages = 1 was increased */ LockBuffer(state->stack->buffer, GIST_UNLOCK); continue; } /* * we don't need to check root split, because checking * leaf/inner is enough to recognize split for root */ } else if (XLByteLT(state->stack->parent->lsn, opaque->nsn)) { /* * detecting split during unlock/lock, so we should find * better child on parent */ /* forget buffer */ UnlockReleaseBuffer(state->stack->buffer); state->stack = state->stack->parent; continue; } state->stack->lsn = PageGetLSN(state->stack->page); /* ok we found a leaf page and it X-locked */ break; } } /* now state->stack->(page, buffer and blkno) points to leaf page */ }
static bool gistplacetopage(GISTInsertState *state, GISTSTATE *giststate) { bool is_splitted = false; bool is_leaf = (GistPageIsLeaf(state->stack->page)) ? true : false; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; /* * if (!is_leaf) remove old key: This node's key has been modified, either * because a child split occurred or because we needed to adjust our key * for an insert in a child node. Therefore, remove the old version of * this node's key. * * for WAL replay, in the non-split case we handle this by setting up a * one-element todelete array; in the split case, it's handled implicitly * because the tuple vector passed to gistSplit won't include this tuple. * * XXX: If we want to change fillfactors between node and leaf, fillfactor * = (is_leaf ? state->leaf_fillfactor : state->node_fillfactor) */ if (gistnospace(state->stack->page, state->itup, state->ituplen, is_leaf ? InvalidOffsetNumber : state->stack->childoffnum, state->freespace)) { /* no space for insertion */ IndexTuple *itvec; int tlen; SplitedPageLayout *dist = NULL, *ptr; BlockNumber rrlink = InvalidBlockNumber; GistNSN oldnsn; is_splitted = true; /* * Form index tuples vector to split: remove old tuple if t's needed * and add new tuples to vector */ itvec = gistextractpage(state->stack->page, &tlen); if (!is_leaf) { /* on inner page we should remove old tuple */ int pos = state->stack->childoffnum - FirstOffsetNumber; tlen--; if (pos != tlen) memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos)); } itvec = gistjoinvector(itvec, &tlen, state->itup, state->ituplen); dist = gistSplit(state->r, state->stack->page, itvec, tlen, giststate); state->itup = (IndexTuple *) palloc(sizeof(IndexTuple) * tlen); state->ituplen = 0; if (state->stack->blkno != GIST_ROOT_BLKNO) { /* * if non-root split then we should not allocate new buffer, but * we must create temporary page to operate */ dist->buffer = state->stack->buffer; dist->page = PageGetTempPage(BufferGetPage(dist->buffer), sizeof(GISTPageOpaqueData)); /* clean all flags except F_LEAF */ GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0; } /* make new pages and fills them */ for (ptr = dist; ptr; ptr = ptr->next) { int i; char *data; /* get new page */ if (ptr->buffer == InvalidBuffer) { ptr->buffer = gistNewBuffer(state->r); GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); ptr->page = BufferGetPage(ptr->buffer); } ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); /* * fill page, we can do it because all these pages are new * (ie not linked in tree or masked by temp page */ data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, LP_USED) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->r)); data += IndexTupleSize((IndexTuple) data); } /* set up ItemPointer and remember it for parent */ ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); state->itup[state->ituplen] = ptr->itup; state->ituplen++; } /* saves old rightlink */ if (state->stack->blkno != GIST_ROOT_BLKNO) rrlink = GistPageGetOpaque(dist->page)->rightlink; START_CRIT_SECTION(); /* * must mark buffers dirty before XLogInsert, even though we'll still * be changing their opaque fields below. set up right links. */ for (ptr = dist; ptr; ptr = ptr->next) { MarkBufferDirty(ptr->buffer); GistPageGetOpaque(ptr->page)->rightlink = (ptr->next) ? ptr->next->block.blkno : rrlink; } /* restore splitted non-root page */ if (state->stack->blkno != GIST_ROOT_BLKNO) { PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); dist->page = BufferGetPage(dist->buffer); } if (!state->r->rd_istemp) { XLogRecPtr recptr; XLogRecData *rdata; rdata = formSplitRdata(state->r, state->stack->blkno, is_leaf, &(state->key), dist); recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata); for (ptr = dist; ptr; ptr = ptr->next) { PageSetLSN(ptr->page, recptr); PageSetTLI(ptr->page, ThisTimeLineID); } } else { for (ptr = dist; ptr; ptr = ptr->next) { PageSetLSN(ptr->page, XLogRecPtrForTemp); } } /* set up NSN */ oldnsn = GistPageGetOpaque(dist->page)->nsn; if (state->stack->blkno == GIST_ROOT_BLKNO) /* if root split we should put initial value */ oldnsn = PageGetLSN(dist->page); for (ptr = dist; ptr; ptr = ptr->next) { /* only for last set oldnsn */ GistPageGetOpaque(ptr->page)->nsn = (ptr->next) ? PageGetLSN(ptr->page) : oldnsn; } /* * release buffers, if it was a root split then release all buffers * because we create all buffers */ ptr = (state->stack->blkno == GIST_ROOT_BLKNO) ? dist : dist->next; for (; ptr; ptr = ptr->next) UnlockReleaseBuffer(ptr->buffer); if (state->stack->blkno == GIST_ROOT_BLKNO) { gistnewroot(state->r, state->stack->buffer, state->itup, state->ituplen, &(state->key)); state->needInsertComplete = false; } END_CRIT_SECTION(); } else { /* enough space */ START_CRIT_SECTION(); if (!is_leaf) PageIndexTupleDelete(state->stack->page, state->stack->childoffnum); gistfillbuffer(state->r, state->stack->page, state->itup, state->ituplen, InvalidOffsetNumber); MarkBufferDirty(state->stack->buffer); if (!state->r->rd_istemp) { OffsetNumber noffs = 0, offs[1]; XLogRecPtr recptr; XLogRecData *rdata; if (!is_leaf) { /* only on inner page we should delete previous version */ offs[0] = state->stack->childoffnum; noffs = 1; } rdata = formUpdateRdata(state->r, state->stack->buffer, offs, noffs, state->itup, state->ituplen, &(state->key)); recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); PageSetLSN(state->stack->page, recptr); PageSetTLI(state->stack->page, ThisTimeLineID); } else PageSetLSN(state->stack->page, XLogRecPtrForTemp); if (state->stack->blkno == GIST_ROOT_BLKNO) state->needInsertComplete = false; END_CRIT_SECTION(); if (state->ituplen > 1) { /* previous is_splitted==true */ /* * child was splited, so we must form union for insertion in * parent */ IndexTuple newtup = gistunion(state->r, state->itup, state->ituplen, giststate); ItemPointerSetBlockNumber(&(newtup->t_tid), state->stack->blkno); state->itup[0] = newtup; state->ituplen = 1; } else if (is_leaf) { /* * itup[0] store key to adjust parent, we set it to valid to * correct check by GistTupleIsInvalid macro in gistgetadjusted() */ ItemPointerSetBlockNumber(&(state->itup[0]->t_tid), state->stack->blkno); GistTupleSetValid(state->itup[0]); } } return is_splitted; }
/* * redo any page update (except page split) */ static void gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) { char *begin = XLogRecGetData(record); gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) begin; Buffer buffer; Page page; char *data; if (BlockNumberIsValid(xldata->leftchild)) gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild); /* nothing more to do if page was backed up (and no info to do it with) */ if (record->xl_info & XLR_BKP_BLOCK_1) return; buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); if (!BufferIsValid(buffer)) return; page = (Page) BufferGetPage(buffer); if (XLByteLE(lsn, PageGetLSN(page))) { UnlockReleaseBuffer(buffer); return; } data = begin + sizeof(gistxlogPageUpdate); /* Delete old tuples */ if (xldata->ntodelete > 0) { int i; OffsetNumber *todelete = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntodelete; for (i = 0; i < xldata->ntodelete; i++) PageIndexTupleDelete(page, todelete[i]); if (GistPageIsLeaf(page)) GistMarkTuplesDeleted(page); } /* add tuples */ if (data - begin < record->xl_len) { OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); while (data - begin < record->xl_len) { IndexTuple itup = (IndexTuple) data; Size sz = IndexTupleSize(itup); OffsetNumber l; data += sz; l = PageAddItem(page, (Item) itup, sz, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to GiST index page, size %d bytes", (int) sz); off++; } } else { /* * special case: leafpage, nothing to insert, nothing to delete, then * vacuum marks page */ if (GistPageIsLeaf(page) && xldata->ntodelete == 0) GistClearTuplesDeleted(page); } if (!GistPageIsLeaf(page) && PageGetMaxOffsetNumber(page) == InvalidOffsetNumber && xldata->blkno == GIST_ROOT_BLKNO) /* * all links on non-leaf root page was deleted by vacuum full, so root * page becomes a leaf */ GistPageSetLeaf(page); GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); }
/* * redo any page update (except page split) */ static void gistRedoPageUpdateRecord(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record); Buffer buffer; Page page; if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { char *begin; char *data; Size datalen; int ninserted = 0; data = begin = XLogRecGetBlockData(record, 0, &datalen); page = (Page) BufferGetPage(buffer); /* Delete old tuples */ if (xldata->ntodelete > 0) { int i; OffsetNumber *todelete = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntodelete; for (i = 0; i < xldata->ntodelete; i++) PageIndexTupleDelete(page, todelete[i]); if (GistPageIsLeaf(page)) GistMarkTuplesDeleted(page); } /* add tuples */ if (data - begin < datalen) { OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); while (data - begin < datalen) { IndexTuple itup = (IndexTuple) data; Size sz = IndexTupleSize(itup); OffsetNumber l; data += sz; l = PageAddItem(page, (Item) itup, sz, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to GiST index page, size %d bytes", (int) sz); off++; ninserted++; } } Assert(ninserted == xldata->ntoinsert); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } /* * Fix follow-right data on left child page * * This must be done while still holding the lock on the target page. Note * that even if the target page no longer exists, we still attempt to * replay the change on the child page. */ if (XLogRecHasBlockRef(record, 1)) gistRedoClearFollowRight(record, 1); if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
/* * redo any page update (except page split) */ static void gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) { char *begin = XLogRecGetData(record); gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) begin; Buffer buffer; Page page; char *data; /* * We need to acquire and hold lock on target page while updating the left * child page. If we have a full-page image of target page, getting the * lock is a side-effect of restoring that image. Note that even if the * target page no longer exists, we'll still attempt to replay the change * on the child page. */ if (record->xl_info & XLR_BKP_BLOCK(0)) buffer = RestoreBackupBlock(lsn, record, 0, false, true); else buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); /* Fix follow-right data on left child page */ if (BlockNumberIsValid(xldata->leftchild)) gistRedoClearFollowRight(lsn, record, 1, xldata->node, xldata->leftchild); /* Done if target page no longer exists */ if (!BufferIsValid(buffer)) return; /* nothing more to do if page was backed up (and no info to do it with) */ if (record->xl_info & XLR_BKP_BLOCK(0)) { UnlockReleaseBuffer(buffer); return; } page = (Page) BufferGetPage(buffer); /* nothing more to do if change already applied */ if (lsn <= PageGetLSN(page)) { UnlockReleaseBuffer(buffer); return; } data = begin + sizeof(gistxlogPageUpdate); /* Delete old tuples */ if (xldata->ntodelete > 0) { int i; OffsetNumber *todelete = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntodelete; for (i = 0; i < xldata->ntodelete; i++) PageIndexTupleDelete(page, todelete[i]); if (GistPageIsLeaf(page)) GistMarkTuplesDeleted(page); } /* add tuples */ if (data - begin < record->xl_len) { OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); while (data - begin < record->xl_len) { IndexTuple itup = (IndexTuple) data; Size sz = IndexTupleSize(itup); OffsetNumber l; data += sz; l = PageAddItem(page, (Item) itup, sz, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to GiST index page, size %d bytes", (int) sz); off++; } } else { /* * special case: leafpage, nothing to insert, nothing to delete, then * vacuum marks page */ if (GistPageIsLeaf(page) && xldata->ntodelete == 0) GistClearTuplesDeleted(page); } if (!GistPageIsLeaf(page) && PageGetMaxOffsetNumber(page) == InvalidOffsetNumber && xldata->blkno == GIST_ROOT_BLKNO) { /* * all links on non-leaf root page was deleted by vacuum full, so root * page becomes a leaf */ GistPageSetLeaf(page); } GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); }
/* * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? * * We must decompress the key in the IndexTuple before passing it to the * sk_func (and we have previously overwritten the sk_func to use the * user-defined Consistent method, so we actually are invoking that). * * Note that this function is always invoked in a short-lived memory context, * so we don't need to worry about cleaning up allocated memory, either here * or in the implementation of any Consistent methods. */ static bool gistindex_keytest(IndexTuple tuple, IndexScanDesc scan, OffsetNumber offset) { int keySize = scan->numberOfKeys; ScanKey key = scan->keyData; Relation r = scan->indexRelation; GISTScanOpaque so; Page p; GISTSTATE *giststate; so = (GISTScanOpaque) scan->opaque; giststate = so->giststate; p = BufferGetPage(so->curbuf); IncrIndexProcessed(); /* * Tuple doesn't restore after crash recovery because of incomplete insert */ if (!GistPageIsLeaf(p) && GistTupleIsInvalid(tuple)) return true; while (keySize > 0) { Datum datum; bool isNull; Datum test; GISTENTRY de; datum = index_getattr(tuple, key->sk_attno, giststate->tupdesc, &isNull); if (key->sk_flags & SK_ISNULL) { /* * On non-leaf page we can't conclude that child hasn't NULL * values because of assumption in GiST: uinon (VAL, NULL) is VAL * But if on non-leaf page key IS NULL then all childs has NULL. */ Assert(key->sk_flags & SK_SEARCHNULL); if (GistPageIsLeaf(p) && !isNull) return false; } else if (isNull) { return false; } else { gistdentryinit(giststate, key->sk_attno - 1, &de, datum, r, p, offset, FALSE, isNull); /* * Call the Consistent function to evaluate the test. The * arguments are the index datum (as a GISTENTRY*), the comparison * datum, and the comparison operator's strategy number and * subtype from pg_amop. * * (Presently there's no need to pass the subtype since it'll * always be zero, but might as well pass it for possible future * use.) */ test = FunctionCall4(&key->sk_func, PointerGetDatum(&de), key->sk_argument, Int32GetDatum(key->sk_strategy), ObjectIdGetDatum(key->sk_subtype)); if (!DatumGetBool(test)) return false; } keySize--; key++; } return true; }
/* * Bulk deletion of all index entries pointing to a set of heap tuples and * check invalid tuples after crash recovery. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ Datum gistbulkdelete(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(3); Relation rel = info->index; GistBDItem *stack, *ptr; /* first time through? */ if (stats == NULL) stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult)); /* we'll re-count the tuples each time */ stats->std.num_index_tuples = 0; stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; while (stack) { Buffer buffer = ReadBufferWithStrategy(rel, stack->blkno, info->strategy); Page page; OffsetNumber i, maxoff; IndexTuple idxtuple; ItemId iid; LockBuffer(buffer, GIST_SHARE); gistcheckpage(rel, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { OffsetNumber todelete[MaxOffsetNumber]; int ntodelete = 0; LockBuffer(buffer, GIST_UNLOCK); LockBuffer(buffer, GIST_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page)) { /* only the root can become non-leaf during relock */ UnlockReleaseBuffer(buffer); /* one more check */ continue; } /* * check for split proceeded after look at parent, we should check * it after relock */ pushStackIfSplited(page, stack); /* * Remove deletable tuples from page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); if (callback(&(idxtuple->t_tid), callback_state)) { todelete[ntodelete] = i - ntodelete; ntodelete++; stats->std.tuples_removed += 1; } else stats->std.num_index_tuples += 1; } if (ntodelete) { START_CRIT_SECTION(); MarkBufferDirty(buffer); for (i = 0; i < ntodelete; i++) PageIndexTupleDelete(page, todelete[i]); GistMarkTuplesDeleted(page); if (!rel->rd_istemp) { XLogRecData *rdata; XLogRecPtr recptr; gistxlogPageUpdate *xlinfo; rdata = formUpdateRdata(rel->rd_node, buffer, todelete, ntodelete, NULL, 0, NULL); xlinfo = (gistxlogPageUpdate *) rdata->next->data; recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); pfree(xlinfo); pfree(rdata); } else PageSetLSN(page, XLogRecPtrForTemp); END_CRIT_SECTION(); } } else { /* check for split proceeded after look at parent */ pushStackIfSplited(page, stack); maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); ptr = (GistBDItem *) palloc(sizeof(GistBDItem)); ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); ptr->parentlsn = PageGetLSN(page); ptr->next = stack->next; stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) stats->needFullVacuum = true; } } UnlockReleaseBuffer(buffer); ptr = stack->next; pfree(stack); stack = ptr; vacuum_delay_point(); } PG_RETURN_POINTER(stats); }
static void gistDeleteSubtree(GistVacuum *gv, BlockNumber blkno) { Buffer buffer; Page page; buffer = ReadBufferWithStrategy(gv->index, blkno, gv->strategy); LockBuffer(buffer, GIST_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (!GistPageIsLeaf(page)) { int i; for (i = FirstOffsetNumber; i <= PageGetMaxOffsetNumber(page); i = OffsetNumberNext(i)) { ItemId iid = PageGetItemId(page, i); IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); gistDeleteSubtree(gv, ItemPointerGetBlockNumber(&(idxtuple->t_tid))); } } START_CRIT_SECTION(); MarkBufferDirty(buffer); page = (Page) BufferGetPage(buffer); GistPageSetDeleted(page); gv->result->std.pages_deleted++; if (!gv->index->rd_istemp) { XLogRecData rdata[2]; XLogRecPtr recptr; gistxlogPageDelete xlrec; xlrec.node = gv->index->rd_node; xlrec.blkno = blkno; rdata[0].buffer = buffer; rdata[0].buffer_std = true; rdata[0].data = NULL; rdata[0].len = 0; rdata[0].next = &(rdata[1]); rdata[1].buffer = InvalidBuffer; rdata[1].data = (char *) &xlrec; rdata[1].len = sizeof(gistxlogPageDelete); rdata[1].next = NULL; recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } else PageSetLSN(page, XLogRecPtrForTemp); END_CRIT_SECTION(); UnlockReleaseBuffer(buffer); }
/* * Traverse the tree to find path from root page to specified "child" block. * * returns from the beginning of closest parent; * * To prevent deadlocks, this should lock only one page simultaneously. */ GISTInsertStack * gistFindPath(Relation r, BlockNumber child) { Page page; Buffer buffer; OffsetNumber i, maxoff; ItemId iid; IndexTuple idxtuple; GISTInsertStack *top, *tail, *ptr; BlockNumber blkno; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; top = tail = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); top->blkno = GIST_ROOT_BLKNO; while (top && top->blkno != child) { buffer = ReadBuffer(r, top->blkno); LockBuffer(buffer, GIST_SHARE); gistcheckpage(r, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { /* we can safety go away, follows only leaf pages */ UnlockReleaseBuffer(buffer); return NULL; } top->lsn = PageGetLSN(page); if (top->parent && XLByteLT(top->parent->lsn, GistPageGetOpaque(page)->nsn) && GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ ) { /* page splited while we thinking of... */ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); ptr->blkno = GistPageGetOpaque(page)->rightlink; ptr->childoffnum = InvalidOffsetNumber; ptr->parent = top; ptr->next = NULL; tail->next = ptr; tail = ptr; } maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); if (blkno == child) { OffsetNumber poff = InvalidOffsetNumber; /* make childs links */ ptr = top; while (ptr->parent) { /* set child link */ ptr->parent->child = ptr; /* move childoffnum.. */ if (ptr == top) { /* first iteration */ poff = ptr->parent->childoffnum; ptr->parent->childoffnum = ptr->childoffnum; } else { OffsetNumber tmp = ptr->parent->childoffnum; ptr->parent->childoffnum = poff; poff = tmp; } ptr = ptr->parent; } top->childoffnum = i; UnlockReleaseBuffer(buffer); return top; } else { /* Install next inner page to the end of stack */ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); ptr->blkno = blkno; ptr->childoffnum = i; /* set offsetnumber of child to child * !!! */ ptr->parent = top; ptr->next = NULL; tail->next = ptr; tail = ptr; } } UnlockReleaseBuffer(buffer); top = top->next; } return NULL; }
/* * Search an upper index page for the entry with lowest penalty for insertion * of the new index key contained in "it". * * Returns the index of the page entry to insert into. */ OffsetNumber gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ GISTSTATE *giststate) { OffsetNumber result; OffsetNumber maxoff; OffsetNumber i; float best_penalty[INDEX_MAX_KEYS]; GISTENTRY entry, identry[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; int keep_current_best; Assert(!GistPageIsLeaf(p)); gistDeCompressAtt(giststate, r, it, NULL, (OffsetNumber) 0, identry, isnull); /* we'll return FirstOffsetNumber if page is empty (shouldn't happen) */ result = FirstOffsetNumber; /* * The index may have multiple columns, and there's a penalty value for * each column. The penalty associated with a column that appears earlier * in the index definition is strictly more important than the penalty of * a column that appears later in the index definition. * * best_penalty[j] is the best penalty we have seen so far for column j, * or -1 when we haven't yet examined column j. Array entries to the * right of the first -1 are undefined. */ best_penalty[0] = -1; /* * If we find a tuple that's exactly as good as the currently best one, we * could use either one. When inserting a lot of tuples with the same or * similar keys, it's preferable to descend down the same path when * possible, as that's more cache-friendly. On the other hand, if all * inserts land on the same leaf page after a split, we're never going to * insert anything to the other half of the split, and will end up using * only 50% of the available space. Distributing the inserts evenly would * lead to better space usage, but that hurts cache-locality during * insertion. To get the best of both worlds, when we find a tuple that's * exactly as good as the previous best, choose randomly whether to stick * to the old best, or use the new one. Once we decide to stick to the * old best, we keep sticking to it for any subsequent equally good tuples * we might find. This favors tuples with low offsets, but still allows * some inserts to go to other equally-good subtrees. * * keep_current_best is -1 if we haven't yet had to make a random choice * whether to keep the current best tuple. If we have done so, and * decided to keep it, keep_current_best is 1; if we've decided to * replace, keep_current_best is 0. (This state will be reset to -1 as * soon as we've made the replacement, but sometimes we make the choice in * advance of actually finding a replacement best tuple.) */ keep_current_best = -1; /* * Loop over tuples on page. */ maxoff = PageGetMaxOffsetNumber(p); Assert(maxoff >= FirstOffsetNumber); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i)); bool zero_penalty; int j; zero_penalty = true; /* Loop over index attributes. */ for (j = 0; j < r->rd_att->natts; j++) { Datum datum; float usize; bool IsNull; /* Compute penalty for this column. */ datum = index_getattr(itup, j + 1, giststate->tupdesc, &IsNull); gistdentryinit(giststate, j, &entry, datum, r, p, i, false, IsNull); usize = gistpenalty(giststate, j, &entry, IsNull, &identry[j], isnull[j]); if (usize > 0) zero_penalty = false; if (best_penalty[j] < 0 || usize < best_penalty[j]) { /* * New best penalty for column. Tentatively select this tuple * as the target, and record the best penalty. Then reset the * next column's penalty to "unknown" (and indirectly, the * same for all the ones to its right). This will force us to * adopt this tuple's penalty values as the best for all the * remaining columns during subsequent loop iterations. */ result = i; best_penalty[j] = usize; if (j < r->rd_att->natts - 1) best_penalty[j + 1] = -1; /* we have new best, so reset keep-it decision */ keep_current_best = -1; } else if (best_penalty[j] == usize) { /* * The current tuple is exactly as good for this column as the * best tuple seen so far. The next iteration of this loop * will compare the next column. */ } else { /* * The current tuple is worse for this column than the best * tuple seen so far. Skip the remaining columns and move on * to the next tuple, if any. */ zero_penalty = false; /* so outer loop won't exit */ break; } } /* * If we looped past the last column, and did not update "result", * then this tuple is exactly as good as the prior best tuple. */ if (j == r->rd_att->natts && result != i) { if (keep_current_best == -1) { /* we didn't make the random choice yet for this old best */ keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0; } if (keep_current_best == 0) { /* we choose to use the new tuple */ result = i; /* choose again if there are even more exactly-as-good ones */ keep_current_best = -1; } } /* * If we find a tuple with zero penalty for all columns, and we've * decided we don't want to search for another tuple with equal * penalty, there's no need to examine remaining tuples; just break * out of the loop and return it. */ if (zero_penalty) { if (keep_current_best == -1) { /* we didn't make the random choice yet for this old best */ keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0; } if (keep_current_best == 1) break; } } return result; }
/* * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? * * The index tuple might represent either a heap tuple or a lower index page, * depending on whether the containing page is a leaf page or not. * * On success return for a heap tuple, *recheck_p is set to indicate whether * the quals need to be rechecked. We recheck if any of the consistent() * functions request it. recheck is not interesting when examining a non-leaf * entry, since we must visit the lower index page if there's any doubt. * Similarly, *recheck_distances_p is set to indicate whether the distances * need to be rechecked, and it is also ignored for non-leaf entries. * * If we are doing an ordered scan, so->distances[] is filled with distance * data from the distance() functions before returning success. * * We must decompress the key in the IndexTuple before passing it to the * sk_funcs (which actually are the opclass Consistent or Distance methods). * * Note that this function is always invoked in a short-lived memory context, * so we don't need to worry about cleaning up allocated memory, either here * or in the implementation of any Consistent or Distance methods. */ static bool gistindex_keytest(IndexScanDesc scan, IndexTuple tuple, Page page, OffsetNumber offset, bool *recheck_p, bool *recheck_distances_p) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; GISTSTATE *giststate = so->giststate; ScanKey key = scan->keyData; int keySize = scan->numberOfKeys; double *distance_p; Relation r = scan->indexRelation; *recheck_p = false; *recheck_distances_p = false; /* * If it's a leftover invalid tuple from pre-9.1, treat it as a match with * minimum possible distances. This means we'll always follow it to the * referenced page. */ if (GistTupleIsInvalid(tuple)) { int i; if (GistPageIsLeaf(page)) /* shouldn't happen */ elog(ERROR, "invalid GiST tuple found on leaf page"); for (i = 0; i < scan->numberOfOrderBys; i++) so->distances[i] = -get_float8_infinity(); return true; } /* Check whether it matches according to the Consistent functions */ while (keySize > 0) { Datum datum; bool isNull; datum = index_getattr(tuple, key->sk_attno, giststate->tupdesc, &isNull); if (key->sk_flags & SK_ISNULL) { /* * On non-leaf page we can't conclude that child hasn't NULL * values because of assumption in GiST: union (VAL, NULL) is VAL. * But if on non-leaf page key IS NULL, then all children are * NULL. */ if (key->sk_flags & SK_SEARCHNULL) { if (GistPageIsLeaf(page) && !isNull) return false; } else { Assert(key->sk_flags & SK_SEARCHNOTNULL); if (isNull) return false; } } else if (isNull) { return false; } else { Datum test; bool recheck; GISTENTRY de; gistdentryinit(giststate, key->sk_attno - 1, &de, datum, r, page, offset, false, isNull); /* * Call the Consistent function to evaluate the test. The * arguments are the index datum (as a GISTENTRY*), the comparison * datum, the comparison operator's strategy number and subtype * from pg_amop, and the recheck flag. * * (Presently there's no need to pass the subtype since it'll * always be zero, but might as well pass it for possible future * use.) * * We initialize the recheck flag to true (the safest assumption) * in case the Consistent function forgets to set it. */ recheck = true; test = FunctionCall5Coll(&key->sk_func, key->sk_collation, PointerGetDatum(&de), key->sk_argument, Int16GetDatum(key->sk_strategy), ObjectIdGetDatum(key->sk_subtype), PointerGetDatum(&recheck)); if (!DatumGetBool(test)) return false; *recheck_p |= recheck; } key++; keySize--; } /* OK, it passes --- now let's compute the distances */ key = scan->orderByData; distance_p = so->distances; keySize = scan->numberOfOrderBys; while (keySize > 0) { Datum datum; bool isNull; datum = index_getattr(tuple, key->sk_attno, giststate->tupdesc, &isNull); if ((key->sk_flags & SK_ISNULL) || isNull) { /* Assume distance computes as null and sorts to the end */ *distance_p = get_float8_infinity(); } else { Datum dist; bool recheck; GISTENTRY de; gistdentryinit(giststate, key->sk_attno - 1, &de, datum, r, page, offset, false, isNull); /* * Call the Distance function to evaluate the distance. The * arguments are the index datum (as a GISTENTRY*), the comparison * datum, the ordering operator's strategy number and subtype from * pg_amop, and the recheck flag. * * (Presently there's no need to pass the subtype since it'll * always be zero, but might as well pass it for possible future * use.) * * If the function sets the recheck flag, the returned distance is * a lower bound on the true distance and needs to be rechecked. * We initialize the flag to 'false'. This flag was added in * version 9.5; distance functions written before that won't know * about the flag, but are expected to never be lossy. */ recheck = false; dist = FunctionCall5Coll(&key->sk_func, key->sk_collation, PointerGetDatum(&de), key->sk_argument, Int16GetDatum(key->sk_strategy), ObjectIdGetDatum(key->sk_subtype), PointerGetDatum(&recheck)); *recheck_distances_p |= recheck; *distance_p = DatumGetFloat8(dist); } key++; distance_p++; keySize--; } return true; }
/* * Workhouse routine for doing insertion into a GiST index. Note that * this routine assumes it is invoked in a short-lived memory context, * so it does not bother releasing palloc'd allocations. */ void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) { ItemId iid; IndexTuple idxtuple; GISTInsertStack firststack; GISTInsertStack *stack; GISTInsertState state; bool xlocked = false; memset(&state, 0, sizeof(GISTInsertState)); state.freespace = freespace; state.r = r; /* Start from the root */ firststack.blkno = GIST_ROOT_BLKNO; firststack.lsn = 0; firststack.parent = NULL; firststack.downlinkoffnum = InvalidOffsetNumber; state.stack = stack = &firststack; /* * Walk down along the path of smallest penalty, updating the parent * pointers with the key we're inserting as we go. If we crash in the * middle, the tree is consistent, although the possible parent updates * were a waste. */ for (;;) { if (XLogRecPtrIsInvalid(stack->lsn)) stack->buffer = ReadBuffer(state.r, stack->blkno); /* * Be optimistic and grab shared lock first. Swap it for an exclusive * lock later if we need to update the page. */ if (!xlocked) { LockBuffer(stack->buffer, GIST_SHARE); gistcheckpage(state.r, stack->buffer); } stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = PageGetLSN(stack->page); Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn)); /* * If this page was split but the downlink was never inserted to the * parent because the inserting backend crashed before doing that, fix * that now. */ if (GistFollowRight(stack->page)) { if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; /* someone might've completed the split when we unlocked */ if (!GistFollowRight(stack->page)) continue; } gistfixsplit(&state, giststate); UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (stack->blkno != GIST_ROOT_BLKNO && stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * Concurrent split detected. There's no guarantee that the * downlink for this page is consistent with the tuple we're * inserting anymore, so go back to parent and rechoose the best * child. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (!GistPageIsLeaf(stack->page)) { /* * This is an internal page so continue to walk down the tree. * Find the child node that has the minimum insertion penalty. */ BlockNumber childblkno; IndexTuple newtup; GISTInsertStack *item; OffsetNumber downlinkoffnum; downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); iid = PageGetItemId(stack->page, downlinkoffnum); idxtuple = (IndexTuple) PageGetItem(stack->page, iid); childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); /* * Check that it's not a leftover invalid tuple from pre-9.1 */ if (GistTupleIsInvalid(idxtuple)) ereport(ERROR, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(r)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); /* * Check that the key representing the target child node is * consistent with the key we're inserting. Update it if it's not. */ newtup = gistgetadjusted(state.r, idxtuple, itup, giststate); if (newtup) { /* * Swap shared lock for an exclusive one. Beware, the page may * change while we unlock/lock the page... */ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); if (PageGetLSN(stack->page) != stack->lsn) { /* the page was changed while we unlocked it, retry */ continue; } } /* * Update the tuple. * * We still hold the lock after gistinserttuple(), but it * might have to split the page to make the updated tuple fit. * In that case the updated tuple might migrate to the other * half of the split, so we have to go back to the parent and * descend back to the half that's a better fit for the new * tuple. */ if (gistinserttuple(&state, stack, giststate, newtup, downlinkoffnum)) { /* * If this was a root split, the root page continues to be * the parent and the updated tuple went to one of the * child pages, so we just need to retry from the root * page. */ if (stack->blkno != GIST_ROOT_BLKNO) { UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; } continue; } } LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; /* descend to the chosen child */ item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); item->blkno = childblkno; item->parent = stack; item->downlinkoffnum = downlinkoffnum; state.stack = stack = item; } else { /* * Leaf page. Insert the new key. We've already updated all the * parents on the way down, but we might have to split the page if * it doesn't fit. gistinserthere() will take care of that. */ /* * Swap shared lock for an exclusive one. Be careful, the page may * change while we unlock/lock the page... */ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = PageGetLSN(stack->page); if (stack->blkno == GIST_ROOT_BLKNO) { /* * the only page that can become inner instead of leaf is * the root page, so for root we should recheck it */ if (!GistPageIsLeaf(stack->page)) { /* * very rare situation: during unlock/lock index with * number of pages = 1 was increased */ LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; continue; } /* * we don't need to check root split, because checking * leaf/inner is enough to recognize split for root */ } else if (GistFollowRight(stack->page) || stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * The page was split while we momentarily unlocked the * page. Go back to parent. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } } /* now state.stack->(page, buffer and blkno) points to leaf page */ gistinserttuple(&state, stack, giststate, itup, InvalidOffsetNumber); LockBuffer(stack->buffer, GIST_UNLOCK); /* Release any pins we might still hold before exiting */ for (; stack; stack = stack->parent) ReleaseBuffer(stack->buffer); break; } } }
/* * Scan all items on the GiST index page identified by *pageItem, and insert * them into the queue (or directly to output areas) * * scan: index scan we are executing * pageItem: search queue item identifying an index page to scan * myDistances: distances array associated with pageItem, or NULL at the root * tbm: if not NULL, gistgetbitmap's output bitmap * ntids: if not NULL, gistgetbitmap's output tuple counter * * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap * tuples should be reported directly into the bitmap. If they are NULL, * we're doing a plain or ordered indexscan. For a plain indexscan, heap * tuple TIDs are returned into so->pageData[]. For an ordered indexscan, * heap tuple TIDs are pushed into individual search queue items. In an * index-only scan, reconstructed index tuples are returned along with the * TIDs. * * If we detect that the index page has split since we saw its downlink * in the parent, we push its new right sibling onto the queue so the * sibling will be processed next. */ static void gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances, TIDBitmap *tbm, int64 *ntids) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; GISTSTATE *giststate = so->giststate; Relation r = scan->indexRelation; Buffer buffer; Page page; GISTPageOpaque opaque; OffsetNumber maxoff; OffsetNumber i; MemoryContext oldcxt; Assert(!GISTSearchItemIsHeap(*pageItem)); buffer = ReadBuffer(scan->indexRelation, pageItem->blkno); LockBuffer(buffer, GIST_SHARE); PredicateLockPage(r, BufferGetBlockNumber(buffer), scan->xs_snapshot); gistcheckpage(scan->indexRelation, buffer); page = BufferGetPage(buffer); TestForOldSnapshot(scan->xs_snapshot, r, page); opaque = GistPageGetOpaque(page); /* * Check if we need to follow the rightlink. We need to follow it if the * page was concurrently split since we visited the parent (in which case * parentlsn < nsn), or if the system crashed after a page split but * before the downlink was inserted into the parent. */ if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) && (GistFollowRight(page) || pageItem->data.parentlsn < GistPageGetNSN(page)) && opaque->rightlink != InvalidBlockNumber /* sanity check */ ) { /* There was a page split, follow right link to add pages */ GISTSearchItem *item; /* This can't happen when starting at the root */ Assert(myDistances != NULL); oldcxt = MemoryContextSwitchTo(so->queueCxt); /* Create new GISTSearchItem for the right sibling index page */ item = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys)); item->blkno = opaque->rightlink; item->data.parentlsn = pageItem->data.parentlsn; /* Insert it into the queue using same distances as for this page */ memcpy(item->distances, myDistances, sizeof(double) * scan->numberOfOrderBys); pairingheap_add(so->queue, &item->phNode); MemoryContextSwitchTo(oldcxt); } so->nPageData = so->curPageData = 0; scan->xs_hitup = NULL; /* might point into pageDataCxt */ if (so->pageDataCxt) MemoryContextReset(so->pageDataCxt); /* * We save the LSN of the page as we read it, so that we know whether it * safe to apply LP_DEAD hints to the page later. This allows us to drop * the pin for MVCC scans, which allows vacuum to avoid blocking. */ so->curPageLSN = BufferGetLSNAtomic(buffer); /* * check all tuples on page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { ItemId iid = PageGetItemId(page, i); IndexTuple it; bool match; bool recheck; bool recheck_distances; /* * If the scan specifies not to return killed tuples, then we treat a * killed tuple as not passing the qual. */ if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) continue; it = (IndexTuple) PageGetItem(page, iid); /* * Must call gistindex_keytest in tempCxt, and clean up any leftover * junk afterward. */ oldcxt = MemoryContextSwitchTo(so->giststate->tempCxt); match = gistindex_keytest(scan, it, page, i, &recheck, &recheck_distances); MemoryContextSwitchTo(oldcxt); MemoryContextReset(so->giststate->tempCxt); /* Ignore tuple if it doesn't match */ if (!match) continue; if (tbm && GistPageIsLeaf(page)) { /* * getbitmap scan, so just push heap tuple TIDs into the bitmap * without worrying about ordering */ tbm_add_tuples(tbm, &it->t_tid, 1, recheck); (*ntids)++; } else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page)) { /* * Non-ordered scan, so report tuples in so->pageData[] */ so->pageData[so->nPageData].heapPtr = it->t_tid; so->pageData[so->nPageData].recheck = recheck; so->pageData[so->nPageData].offnum = i; /* * In an index-only scan, also fetch the data from the tuple. The * reconstructed tuples are stored in pageDataCxt. */ if (scan->xs_want_itup) { oldcxt = MemoryContextSwitchTo(so->pageDataCxt); so->pageData[so->nPageData].recontup = gistFetchTuple(giststate, r, it); MemoryContextSwitchTo(oldcxt); } so->nPageData++; } else { /* * Must push item into search queue. We get here for any lower * index page, and also for heap tuples if doing an ordered * search. */ GISTSearchItem *item; oldcxt = MemoryContextSwitchTo(so->queueCxt); /* Create new GISTSearchItem for this item */ item = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys)); if (GistPageIsLeaf(page)) { /* Creating heap-tuple GISTSearchItem */ item->blkno = InvalidBlockNumber; item->data.heap.heapPtr = it->t_tid; item->data.heap.recheck = recheck; item->data.heap.recheckDistances = recheck_distances; /* * In an index-only scan, also fetch the data from the tuple. */ if (scan->xs_want_itup) item->data.heap.recontup = gistFetchTuple(giststate, r, it); } else { /* Creating index-page GISTSearchItem */ item->blkno = ItemPointerGetBlockNumber(&it->t_tid); /* * LSN of current page is lsn of parent page for child. We * only have a shared lock, so we need to get the LSN * atomically. */ item->data.parentlsn = BufferGetLSNAtomic(buffer); } /* Insert it into the queue using new distance data */ memcpy(item->distances, so->distances, sizeof(double) * scan->numberOfOrderBys); pairingheap_add(so->queue, &item->phNode); MemoryContextSwitchTo(oldcxt); } } UnlockReleaseBuffer(buffer); }
/* * Bulk deletion of all index entries pointing to a set of heap tuples and * check invalid tuples left after upgrade. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation rel = info->index; GistBDItem *stack, *ptr; /* first time through? */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* we'll re-count the tuples each time */ stats->estimated_count = false; stats->num_index_tuples = 0; stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; while (stack) { Buffer buffer; Page page; OffsetNumber i, maxoff; IndexTuple idxtuple; ItemId iid; buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); gistcheckpage(rel, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { OffsetNumber todelete[MaxOffsetNumber]; int ntodelete = 0; LockBuffer(buffer, GIST_UNLOCK); LockBuffer(buffer, GIST_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page)) { /* only the root can become non-leaf during relock */ UnlockReleaseBuffer(buffer); /* one more check */ continue; } /* * check for split proceeded after look at parent, we should check * it after relock */ pushStackIfSplited(page, stack); /* * Remove deletable tuples from page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); if (callback(&(idxtuple->t_tid), callback_state)) todelete[ntodelete++] = i; else stats->num_index_tuples += 1; } stats->tuples_removed += ntodelete; if (ntodelete) { START_CRIT_SECTION(); MarkBufferDirty(buffer); PageIndexMultiDelete(page, todelete, ntodelete); GistMarkTuplesDeleted(page); if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; recptr = gistXLogUpdate(buffer, todelete, ntodelete, NULL, 0, InvalidBuffer); PageSetLSN(page, recptr); } else PageSetLSN(page, gistGetFakeLSN(rel)); END_CRIT_SECTION(); } } else { /* check for split proceeded after look at parent */ pushStackIfSplited(page, stack); maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); ptr = (GistBDItem *) palloc(sizeof(GistBDItem)); ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); ptr->parentlsn = BufferGetLSNAtomic(buffer); ptr->next = stack->next; stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) ereport(LOG, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(rel)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); } } UnlockReleaseBuffer(buffer); ptr = stack->next; pfree(stack); stack = ptr; vacuum_delay_point(); } return stats; }
/* * gistkillitems() -- set LP_DEAD state for items an indexscan caller has * told us were killed. * * We re-read page here, so it's important to check page LSN. If the page * has been modified since the last read (as determined by LSN), we cannot * flag any entries because it is possible that the old entry was vacuumed * away and the TID was re-used by a completely different heap tuple. */ static void gistkillitems(IndexScanDesc scan) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; Buffer buffer; Page page; OffsetNumber offnum; ItemId iid; int i; bool killedsomething = false; Assert(so->curBlkno != InvalidBlockNumber); Assert(!XLogRecPtrIsInvalid(so->curPageLSN)); Assert(so->killedItems != NULL); buffer = ReadBuffer(scan->indexRelation, so->curBlkno); if (!BufferIsValid(buffer)) return; LockBuffer(buffer, GIST_SHARE); gistcheckpage(scan->indexRelation, buffer); page = BufferGetPage(buffer); /* * If page LSN differs it means that the page was modified since the last * read. killedItems could be not valid so LP_DEAD hints applying is not * safe. */ if (BufferGetLSNAtomic(buffer) != so->curPageLSN) { UnlockReleaseBuffer(buffer); so->numKilled = 0; /* reset counter */ return; } Assert(GistPageIsLeaf(page)); /* * Mark all killedItems as dead. We need no additional recheck, because, * if page was modified, pageLSN must have changed. */ for (i = 0; i < so->numKilled; i++) { offnum = so->killedItems[i]; iid = PageGetItemId(page, offnum); ItemIdMarkDead(iid); killedsomething = true; } if (killedsomething) { GistMarkPageHasGarbage(page); MarkBufferDirtyHint(buffer, true); } UnlockReleaseBuffer(buffer); /* * Always reset the scan state, so we don't look for same items on other * pages. */ so->numKilled = 0; }
/* * gistindex_keytest() -- does this index tuple satisfy the scan key(s)? * * The index tuple might represent either a heap tuple or a lower index page, * depending on whether the containing page is a leaf page or not. * * On success return for a heap tuple, *recheck_p is set to indicate * whether recheck is needed. We recheck if any of the consistent() functions * request it. recheck is not interesting when examining a non-leaf entry, * since we must visit the lower index page if there's any doubt. * * If we are doing an ordered scan, so->distances[] is filled with distance * data from the distance() functions before returning success. * * We must decompress the key in the IndexTuple before passing it to the * sk_funcs (which actually are the opclass Consistent or Distance methods). * * Note that this function is always invoked in a short-lived memory context, * so we don't need to worry about cleaning up allocated memory, either here * or in the implementation of any Consistent or Distance methods. */ static bool gistindex_keytest(IndexScanDesc scan, IndexTuple tuple, Page page, OffsetNumber offset, bool *recheck_p) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; GISTSTATE *giststate = so->giststate; ScanKey key = scan->keyData; int keySize = scan->numberOfKeys; double *distance_p; Relation r = scan->indexRelation; *recheck_p = false; /* * If it's a leftover invalid tuple from pre-9.1, treat it as a match with * minimum possible distances. This means we'll always follow it to the * referenced page. * * GPDB: the virtual TIDs created for AO tables use the full range of * offset numbers from 0 to 65535. So a tuple on leaf page that looks like * an invalid tuple, is actually ok. */ if (!GistPageIsLeaf(page) && GistTupleIsInvalid(tuple)) { int i; for (i = 0; i < scan->numberOfOrderBys; i++) so->distances[i] = -get_float8_infinity(); return true; } /* Check whether it matches according to the Consistent functions */ while (keySize > 0) { Datum datum; bool isNull; datum = index_getattr(tuple, key->sk_attno, giststate->tupdesc, &isNull); if (key->sk_flags & SK_ISNULL) { /* * On non-leaf page we can't conclude that child hasn't NULL * values because of assumption in GiST: union (VAL, NULL) is VAL. * But if on non-leaf page key IS NULL, then all children are * NULL. */ if (key->sk_flags & SK_SEARCHNULL) { if (GistPageIsLeaf(page) && !isNull) return false; } else { Assert(key->sk_flags & SK_SEARCHNOTNULL); if (isNull) return false; } } else if (isNull) { return false; } else { Datum test; bool recheck; GISTENTRY de; gistdentryinit(giststate, key->sk_attno - 1, &de, datum, r, page, offset, FALSE, isNull); /* * Call the Consistent function to evaluate the test. The * arguments are the index datum (as a GISTENTRY*), the comparison * datum, the comparison operator's strategy number and subtype * from pg_amop, and the recheck flag. * * (Presently there's no need to pass the subtype since it'll * always be zero, but might as well pass it for possible future * use.) * * We initialize the recheck flag to true (the safest assumption) * in case the Consistent function forgets to set it. */ recheck = true; test = FunctionCall5Coll(&key->sk_func, key->sk_collation, PointerGetDatum(&de), key->sk_argument, Int32GetDatum(key->sk_strategy), ObjectIdGetDatum(key->sk_subtype), PointerGetDatum(&recheck)); if (!DatumGetBool(test)) return false; *recheck_p |= recheck; } key++; keySize--; } /* OK, it passes --- now let's compute the distances */ key = scan->orderByData; distance_p = so->distances; keySize = scan->numberOfOrderBys; while (keySize > 0) { Datum datum; bool isNull; datum = index_getattr(tuple, key->sk_attno, giststate->tupdesc, &isNull); if ((key->sk_flags & SK_ISNULL) || isNull) { /* Assume distance computes as null and sorts to the end */ *distance_p = get_float8_infinity(); } else { Datum dist; GISTENTRY de; gistdentryinit(giststate, key->sk_attno - 1, &de, datum, r, page, offset, FALSE, isNull); /* * Call the Distance function to evaluate the distance. The * arguments are the index datum (as a GISTENTRY*), the comparison * datum, and the ordering operator's strategy number and subtype * from pg_amop. * * (Presently there's no need to pass the subtype since it'll * always be zero, but might as well pass it for possible future * use.) * * Note that Distance functions don't get a recheck argument. We * can't tolerate lossy distance calculations on leaf tuples; * there is no opportunity to re-sort the tuples afterwards. */ dist = FunctionCall4Coll(&key->sk_func, key->sk_collation, PointerGetDatum(&de), key->sk_argument, Int32GetDatum(key->sk_strategy), ObjectIdGetDatum(key->sk_subtype)); *distance_p = DatumGetFloat8(dist); } key++; distance_p++; keySize--; } return true; }
/* * find entry with lowest penalty */ OffsetNumber gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ GISTSTATE *giststate) { OffsetNumber maxoff; OffsetNumber i; OffsetNumber which; float sum_grow, which_grow[INDEX_MAX_KEYS]; GISTENTRY entry, identry[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; maxoff = PageGetMaxOffsetNumber(p); *which_grow = -1.0; which = InvalidOffsetNumber; sum_grow = 1; gistDeCompressAtt(giststate, r, it, NULL, (OffsetNumber) 0, identry, isnull); Assert(maxoff >= FirstOffsetNumber); Assert(!GistPageIsLeaf(p)); for (i = FirstOffsetNumber; i <= maxoff && sum_grow; i = OffsetNumberNext(i)) { int j; IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i)); if (!GistPageIsLeaf(p) && GistTupleIsInvalid(itup)) { ereport(LOG, (errmsg("index \"%s\" needs VACUUM or REINDEX to finish crash recovery", RelationGetRelationName(r)))); continue; } sum_grow = 0; for (j = 0; j < r->rd_att->natts; j++) { Datum datum; float usize; bool IsNull; datum = index_getattr(itup, j + 1, giststate->tupdesc, &IsNull); gistdentryinit(giststate, j, &entry, datum, r, p, i, FALSE, IsNull); usize = gistpenalty(giststate, j, &entry, IsNull, &identry[j], isnull[j]); if (which_grow[j] < 0 || usize < which_grow[j]) { which = i; which_grow[j] = usize; if (j < r->rd_att->natts - 1 && i == FirstOffsetNumber) which_grow[j + 1] = -1; sum_grow += which_grow[j]; } else if (which_grow[j] == usize) sum_grow += usize; else { sum_grow = 1; break; } } } if (which == InvalidOffsetNumber) which = FirstOffsetNumber; return which; }
/* * redo any page update (except page split) */ static void gistRedoPageUpdateRecord(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record); Buffer buffer; Page page; if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { char *begin; char *data; Size datalen; int ninserted = 0; data = begin = XLogRecGetBlockData(record, 0, &datalen); page = (Page) BufferGetPage(buffer); if (xldata->ntodelete == 1 && xldata->ntoinsert == 1) { /* * When replacing one tuple with one other tuple, we must use * PageIndexTupleOverwrite for consistency with gistplacetopage. */ OffsetNumber offnum = *((OffsetNumber *) data); IndexTuple itup; Size itupsize; data += sizeof(OffsetNumber); itup = (IndexTuple) data; itupsize = IndexTupleSize(itup); if (!PageIndexTupleOverwrite(page, offnum, (Item) itup, itupsize)) elog(ERROR, "failed to add item to GiST index page, size %d bytes", (int) itupsize); data += itupsize; /* should be nothing left after consuming 1 tuple */ Assert(data - begin == datalen); /* update insertion count for assert check below */ ninserted++; } else if (xldata->ntodelete > 0) { /* Otherwise, delete old tuples if any */ OffsetNumber *todelete = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntodelete; PageIndexMultiDelete(page, todelete, xldata->ntodelete); if (GistPageIsLeaf(page)) GistMarkTuplesDeleted(page); } /* Add new tuples if any */ if (data - begin < datalen) { OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); while (data - begin < datalen) { IndexTuple itup = (IndexTuple) data; Size sz = IndexTupleSize(itup); OffsetNumber l; data += sz; l = PageAddItem(page, (Item) itup, sz, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to GiST index page, size %d bytes", (int) sz); off++; ninserted++; } } /* Check that XLOG record contained expected number of tuples */ Assert(ninserted == xldata->ntoinsert); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } /* * Fix follow-right data on left child page * * This must be done while still holding the lock on the target page. Note * that even if the target page no longer exists, we still attempt to * replay the change on the child page. */ if (XLogRecHasBlockRef(record, 1)) gistRedoClearFollowRight(record, 1); if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
/* * trys to split page by attno key, in a case of null * values move its to separate page. */ void gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate, GistSplitVector *v, GistEntryVector *entryvec, int attno) { int i; static OffsetNumber offNullTuples[MaxOffsetNumber]; int nOffNullTuples = 0; for (i = 1; i <= len; i++) { Datum datum; bool IsNull; if (!GistPageIsLeaf(page) && GistTupleIsInvalid(itup[i - 1])) { gistSplitByInvalid(giststate, v, itup, len); return; } datum = index_getattr(itup[i - 1], attno + 1, giststate->tupdesc, &IsNull); gistdentryinit(giststate, attno, &(entryvec->vector[i]), datum, r, page, i, FALSE, IsNull); if (IsNull) offNullTuples[nOffNullTuples++] = i; } v->spl_leftvalid = v->spl_rightvalid = true; if (nOffNullTuples == len) { /* * Corner case: All keys in attno column are null, we should try to * split by keys in next column. It all keys in all columns are NULL * just split page half by half */ v->spl_risnull[attno] = v->spl_lisnull[attno] = TRUE; if (attno + 1 == r->rd_att->natts) gistSplitHalf(&v->splitVector, len); else gistSplitByKey(r, page, itup, len, giststate, v, entryvec, attno + 1); } else if (nOffNullTuples > 0) { int j = 0; /* * We don't want to mix NULLs and not-NULLs keys on one page, so move * nulls to right page */ v->splitVector.spl_right = offNullTuples; v->splitVector.spl_nright = nOffNullTuples; v->spl_risnull[attno] = TRUE; v->splitVector.spl_left = (OffsetNumber *) palloc(len * sizeof(OffsetNumber)); v->splitVector.spl_nleft = 0; for (i = 1; i <= len; i++) if (j < v->splitVector.spl_nright && offNullTuples[j] == i) j++; else v->splitVector.spl_left[v->splitVector.spl_nleft++] = i; v->spl_equiv = NULL; gistunionsubkey(giststate, itup, v, attno); } else { /* * all keys are not-null */ entryvec->n = len + 1; if (gistUserPicksplit(r, entryvec, attno, v, itup, len, giststate) && attno + 1 != r->rd_att->natts) { /* * Splitting on attno column is not optimized: there is a tuples * which can be freely left or right page, we will try to split * page by following columns */ if (v->spl_equiv == NULL) { /* * simple case: left and right keys for attno column are * equial */ gistSplitByKey(r, page, itup, len, giststate, v, entryvec, attno + 1); } else { /* we should clean up vector from already distributed tuples */ IndexTuple *newitup = (IndexTuple *) palloc((len + 1) * sizeof(IndexTuple)); OffsetNumber *map = (OffsetNumber *) palloc((len + 1) * sizeof(IndexTuple)); int newlen = 0; GIST_SPLITVEC backupSplit = v->splitVector; for (i = 0; i < len; i++) if (v->spl_equiv[i + 1]) { map[newlen] = i + 1; newitup[newlen++] = itup[i]; } Assert(newlen > 0); backupSplit.spl_left = (OffsetNumber *) palloc(sizeof(OffsetNumber) * len); memcpy(backupSplit.spl_left, v->splitVector.spl_left, sizeof(OffsetNumber) * v->splitVector.spl_nleft); backupSplit.spl_right = (OffsetNumber *) palloc(sizeof(OffsetNumber) * len); memcpy(backupSplit.spl_right, v->splitVector.spl_right, sizeof(OffsetNumber) * v->splitVector.spl_nright); gistSplitByKey(r, page, newitup, newlen, giststate, v, entryvec, attno + 1); /* merge result of subsplit */ for (i = 0; i < v->splitVector.spl_nleft; i++) backupSplit.spl_left[backupSplit.spl_nleft++] = map[v->splitVector.spl_left[i] - 1]; for (i = 0; i < v->splitVector.spl_nright; i++) backupSplit.spl_right[backupSplit.spl_nright++] = map[v->splitVector.spl_right[i] - 1]; v->splitVector = backupSplit; /* reunion left and right datums */ gistunionsubkey(giststate, itup, v, attno); } } } }