/* * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page * * This is a common subroutine of the redo functions of all the WAL record * types that can insert a downlink: insert, split, and newroot. */ static void _bt_clear_incomplete_split(XLogReaderState *record, uint8 block_id) { XLogRecPtr lsn = record->EndRecPtr; Buffer buf; if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO) { Page page = (Page) BufferGetPage(buf); BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); Assert(P_INCOMPLETE_SPLIT(pageop)); pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; PageSetLSN(page, lsn); MarkBufferDirty(buf); } if (BufferIsValid(buf)) UnlockReleaseBuffer(buf); }
/* * _bt_moveright() -- move right in the btree if necessary. * * When we follow a pointer to reach a page, it is possible that * the page has changed in the meanwhile. If this happens, we're * guaranteed that the page has "split right" -- that is, that any * data that appeared on the page originally is either on the page * or strictly to the right of it. * * This routine decides whether or not we need to move right in the * tree by examining the high key entry on the page. If that entry * is strictly less than the scankey, or <= the scankey in the nextkey=true * case, then we followed the wrong link and we need to move right. * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey. When nextkey is true, we are looking for the first * item strictly greater than scankey. * * If forupdate is true, we will attempt to finish any incomplete splits * that we encounter. This is required when locking a target page for an * insertion, because we don't allow inserting on a page before the split * is completed. 'stack' is only used if forupdate is true. * * On entry, we have the buffer pinned and a lock of the type specified by * 'access'. If we move right, we release the buffer and lock and acquire * the same on the right sibling. Return value is the buffer we stop at. */ Buffer _bt_moveright(Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey, bool forupdate, BTStack stack, int access) { Page page; BTPageOpaque opaque; int32 cmpval; /* * When nextkey = false (normal case): if the scan key that brought us to * this page is > the high key stored on the page, then the page has split * and we need to move right. (If the scan key is equal to the high key, * we might or might not need to move right; have to scan the page first * anyway.) * * When nextkey = true: move right if the scan key is >= page's high key. * * The page could even have split more than once, so scan as far as * needed. * * We also have to move right if we followed a link that brought us to a * dead page. */ cmpval = nextkey ? 0 : 1; for (;;) { page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_RIGHTMOST(opaque)) break; /* * Finish any incomplete splits we encounter along the way. */ if (forupdate && P_INCOMPLETE_SPLIT(opaque)) { BlockNumber blkno = BufferGetBlockNumber(buf); /* upgrade our lock if necessary */ if (access == BT_READ) { LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBuffer(buf, BT_WRITE); } if (P_INCOMPLETE_SPLIT(opaque)) _bt_finish_split(rel, buf, stack); else _bt_relbuf(rel, buf); /* re-acquire the lock in the right mode, and re-check */ buf = _bt_getbuf(rel, blkno, access); continue; } if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval) { /* step right one page */ buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access); continue; } else break; } if (P_IGNORE(opaque)) elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); return buf; }