Datum
rtgetmulti(PG_FUNCTION_ARGS)
{
	IndexScanDesc s = (IndexScanDesc) PG_GETARG_POINTER(0);
	ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1);
	int32		max_tids = PG_GETARG_INT32(2);
	int32	   *returned_tids = (int32 *) PG_GETARG_POINTER(3);
	RTreeScanOpaque so = (RTreeScanOpaque) s->opaque;
	bool		res = true;
	int32		ntids = 0;

	/* XXX generic implementation: loop around guts of rtgettuple */
	while (ntids < max_tids)
	{
		res = rtnext(s, ForwardScanDirection);
		if (res && s->ignore_killed_tuples)
		{
			Page		page;
			OffsetNumber offnum;

			offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
			page = BufferGetPage(so->curbuf);
			if (ItemIdDeleted(PageGetItemId(page, offnum)))
				continue;
		}
		if (!res)
			break;
		tids[ntids] = s->xs_ctup.t_self;
		ntids++;
	}

	*returned_tids = ntids;
	PG_RETURN_BOOL(res);
}
/*
 * MUST BE CALLED ONLY DURING RECOVERY.
 *
 * Check whether a valid heap tuple (one inserted by a transaction that
 * did not abort) exists at the given item pointer.
 */
bool
XLogIsValidTuple(RelFileNode hnode, ItemPointer iptr)
{
	Relation	reln;
	Buffer		buffer;
	Page		page;
	ItemId		lp;
	HeapTupleHeader htup;

	reln = XLogOpenRelation(false, RM_HEAP_ID, hnode);
	if (!RelationIsValid(reln))
		return (false);

	buffer = ReadBuffer(reln, ItemPointerGetBlockNumber(iptr));
	if (!BufferIsValid(buffer))
		return (false);

	LockBuffer(buffer, BUFFER_LOCK_SHARE);
	page = (Page) BufferGetPage(buffer);
	if (PageIsNew((PageHeader) page) ||
		ItemPointerGetOffsetNumber(iptr) > PageGetMaxOffsetNumber(page))
	{
		UnlockAndReleaseBuffer(buffer);
		return (false);
	}

	if (PageGetSUI(page) != ThisStartUpID)
	{
		Assert(PageGetSUI(page) < ThisStartUpID);
		UnlockAndReleaseBuffer(buffer);
		return (true);
	}

	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(iptr));
	if (!ItemIdIsUsed(lp) || ItemIdDeleted(lp))
	{
		UnlockAndReleaseBuffer(buffer);
		return (false);
	}

	htup = (HeapTupleHeader) PageGetItem(page, lp);

	/* MUST CHECK THAT THE TUPLE WASN'T INSERTED IN A PREVIOUS STARTUP */
	if (!(htup->t_infomask & HEAP_XMIN_COMMITTED))
	{
		if (htup->t_infomask & HEAP_XMIN_INVALID ||
			(htup->t_infomask & HEAP_MOVED_IN &&
			 TransactionIdDidAbort(HeapTupleHeaderGetXvac(htup))) ||
			TransactionIdDidAbort(HeapTupleHeaderGetXmin(htup)))
		{
			UnlockAndReleaseBuffer(buffer);
			return (false);
		}
	}

	UnlockAndReleaseBuffer(buffer);
	return (true);
}
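For reference, these are the t_infomask hint bits tested above, as I recall them from access/htup.h of this vintage; treat the exact values as an assumption rather than a verified excerpt. The effect of the test is that a tuple counts as valid unless its inserter is known-aborted, or it was moved in by a VACUUM FULL whose xvac transaction aborted.

/* Heap tuple infomask bits used by XLogIsValidTuple (recalled from
 * access/htup.h of this era; verify against the actual header). */
#define HEAP_XMIN_COMMITTED		0x0100	/* t_xmin committed */
#define HEAP_XMIN_INVALID		0x0200	/* t_xmin invalid/aborted */
#define HEAP_MOVED_IN			0x8000	/* moved in by VACUUM FULL; the
										 * moving transaction is in xvac */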
/*
 * Check whether the specified heap tuple was inserted by the given
 * xaction/command. Returns:
 *
 *	-1	if it was not
 *	 0	if there is no tuple at all
 *	 1	if it was
 */
int
XLogIsOwnerOfTuple(RelFileNode hnode, ItemPointer iptr,
				   TransactionId xid, CommandId cid)
{
	Relation	reln;
	Buffer		buffer;
	Page		page;
	ItemId		lp;
	HeapTupleHeader htup;

	reln = XLogOpenRelation(false, RM_HEAP_ID, hnode);
	if (!RelationIsValid(reln))
		return (0);

	buffer = ReadBuffer(reln, ItemPointerGetBlockNumber(iptr));
	if (!BufferIsValid(buffer))
		return (0);

	LockBuffer(buffer, BUFFER_LOCK_SHARE);
	page = (Page) BufferGetPage(buffer);
	if (PageIsNew((PageHeader) page) ||
		ItemPointerGetOffsetNumber(iptr) > PageGetMaxOffsetNumber(page))
	{
		UnlockAndReleaseBuffer(buffer);
		return (0);
	}

	lp = PageGetItemId(page, ItemPointerGetOffsetNumber(iptr));
	if (!ItemIdIsUsed(lp) || ItemIdDeleted(lp))
	{
		UnlockAndReleaseBuffer(buffer);
		return (0);
	}

	htup = (HeapTupleHeader) PageGetItem(page, lp);

	Assert(PageGetSUI(page) == ThisStartUpID);

	if (!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), xid) ||
		HeapTupleHeaderGetCmin(htup) != cid)
	{
		UnlockAndReleaseBuffer(buffer);
		return (-1);
	}

	UnlockAndReleaseBuffer(buffer);
	return (1);
}
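A minimal caller sketch, hypothetical and not from the source, showing how the three-way result might be consumed; the helper name is illustrative only.

/* Hypothetical helper (illustrative only): classify the three-way result,
 * e.g. for logging while replaying a record. Not part of the source. */
static const char *
OwnershipResultString(int result)
{
	switch (result)
	{
		case 1:
			return "inserted by this xaction/command";
		case 0:
			return "no tuple at this item pointer";
		default:				/* -1 */
			return "tuple belongs to another xaction/command";
	}
}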
static text *
istatus_text(ItemId itemid)
{
	StringInfoData buf;

	initStringInfo(&buf);

	if (ItemIdDeleted(itemid))
		appendStringInfoString(&buf, "DELETED ");
	if (ItemIdIsNormal(itemid))
		appendStringInfoString(&buf, "USED ");
	if (ItemIdIsDead(itemid))
		appendStringInfoString(&buf, "DEAD ");

	if (buf.len == 0)
		appendStringInfoString(&buf, "UNUSED ");

	/* every branch appends a trailing space; chop it off */
	buf.data[buf.len - 1] = '\0';

	return cstring_to_text(buf.data);
}
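A possible usage sketch, assuming a hypothetical debugging helper and a page that the caller has already pinned and locked; the function name is mine, not the source's.

/* Hypothetical usage (illustrative only): report the status of every
 * line pointer on a page. Assumes the caller holds a pin and lock. */
static void
report_item_status(Page page)
{
	OffsetNumber off,
				maxoff = PageGetMaxOffsetNumber(page);

	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		text	   *t = istatus_text(PageGetItemId(page, off));

		elog(DEBUG1, "item %u: %s", (unsigned int) off, text_to_cstring(t));
	}
}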
Datum
rtgettuple(PG_FUNCTION_ARGS)
{
	IndexScanDesc s = (IndexScanDesc) PG_GETARG_POINTER(0);
	ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
	RTreeScanOpaque so = (RTreeScanOpaque) s->opaque;
	Page		page;
	OffsetNumber offnum;

	/*
	 * If we've already produced a tuple and the executor has informed us
	 * that it should be marked "killed", do so now.
	 */
	if (s->kill_prior_tuple && ItemPointerIsValid(&(s->currentItemData)))
	{
		offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
		page = BufferGetPage(so->curbuf);
		PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;
		SetBufferCommitInfoNeedsSave(so->curbuf);
	}

	/*
	 * Get the next tuple that matches the search key; if asked to skip
	 * killed tuples, find the first non-killed tuple that matches. Return
	 * as soon as we've run out of matches or we've found an acceptable
	 * match.
	 */
	for (;;)
	{
		bool		res = rtnext(s, dir);

		if (res && s->ignore_killed_tuples)
		{
			offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
			page = BufferGetPage(so->curbuf);
			if (ItemIdDeleted(PageGetItemId(page, offnum)))
				continue;
		}

		PG_RETURN_BOOL(res);
	}
}
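The kill step above sets LP_DELETE directly on the line pointer, and the skip loop tests it with ItemIdDeleted(). For reference, the old-style flag bits and macros used throughout this section, reconstructed from memory of the pre-8.3 storage/itemid.h; treat them as an assumption.

/* Old-style line pointer flags and tests (pre-8.3 storage/itemid.h,
 * reconstructed from memory; verify against the actual header). */
#define LP_USED			0x01	/* this line pointer is being used */
#define LP_DELETE		0x02	/* item is marked deleted ("killed") */

#define ItemIdIsUsed(itemId) \
	(((itemId)->lp_flags & LP_USED) != 0)

#define ItemIdDeleted(itemId) \
	(((itemId)->lp_flags & LP_DELETE) != 0)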
/* -------------------------------------------------
 * GetBTPageStatistics()
 *
 * Collect statistics for a single b-tree leaf page
 * -------------------------------------------------
 */
static bool
GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
{
	Page		page = BufferGetPage(buffer);
	PageHeader	phdr = (PageHeader) page;
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	int			item_size = 0;
	int			off;

	stat->blkno = blkno;

	stat->max_avail = BLCKSZ - (BLCKSZ - phdr->pd_special + SizeOfPageHeaderData);

	stat->dead_items = stat->live_items = 0;

	stat->page_size = PageGetPageSize(page);

	/* page type (flags) */
	if (P_ISDELETED(opaque))
	{
		stat->type = 'd';
		return true;
	}
	else if (P_IGNORE(opaque))
		stat->type = 'e';
	else if (P_ISLEAF(opaque))
		stat->type = 'l';
	else if (P_ISROOT(opaque))
		stat->type = 'r';
	else
		stat->type = 'i';

	/* btpage opaque data */
	stat->btpo_prev = opaque->btpo_prev;
	stat->btpo_next = opaque->btpo_next;
	if (P_ISDELETED(opaque))
		stat->btpo.xact = opaque->btpo.xact;
	else
		stat->btpo.level = opaque->btpo.level;
	stat->btpo_flags = opaque->btpo_flags;
	stat->btpo_cycleid = opaque->btpo_cycleid;

	/*----------------------------------------------
	 * If the next leaf page lives on an earlier block,
	 * the index is fragmented at this point.
	 *----------------------------------------------
	 */
	stat->fragments = 0;
	if (stat->type == 'l')
	{
		if (opaque->btpo_next != P_NONE && opaque->btpo_next < blkno)
			stat->fragments++;
	}

	/* count live and dead tuples, and free space */
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		IndexTuple	itup;
		ItemId		id = PageGetItemId(page, off);

		itup = (IndexTuple) PageGetItem(page, id);

		item_size += IndexTupleSize(itup);

		if (!ItemIdDeleted(id))
			stat->live_items++;
		else
			stat->dead_items++;
	}
	stat->free_size = PageGetFreeSpace(page);

	if ((stat->live_items + stat->dead_items) > 0)
		stat->avg_item_size = item_size / (stat->live_items + stat->dead_items);
	else
		stat->avg_item_size = 0;

	return true;
}
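For example, if the leaf chain visits blocks 3 -> 1 -> 2, then page 3's btpo_next (1) is less than 3 and counts as one fragment. A caller summing these per-page counts might report a percentage with something like the following hypothetical helper (not part of the excerpt).

/* Hypothetical aggregation helper (illustrative only): per-page fragment
 * counts summed over the scan, expressed as a percentage of leaf pages. */
static double
leaf_fragmentation_pct(uint64 fragments, uint64 leaf_pages)
{
	if (leaf_pages == 0)
		return 0.0;
	return (double) fragments * 100.0 / (double) leaf_pages;
}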
/*
 * Test whether an indextuple satisfies all the scankey conditions.
 *
 * If so, copy its TID into scan->xs_ctup.t_self, and return TRUE.
 * If not, return FALSE (xs_ctup is not changed).
 *
 * If the tuple fails to pass the qual, we also determine whether there's
 * any need to continue the scan beyond this tuple, and set *continuescan
 * accordingly.  See comments for _bt_preprocess_keys(), above, about how
 * this is done.
 *
 * scan: index scan descriptor (containing a search-type scankey)
 * page: buffer page containing index tuple
 * offnum: offset number of index tuple (must be a valid item!)
 * dir: direction we are scanning in
 * continuescan: output parameter (will be set correctly in all cases)
 */
bool
_bt_checkkeys(IndexScanDesc scan, Page page, OffsetNumber offnum,
			  ScanDirection dir, bool *continuescan)
{
	ItemId		iid = PageGetItemId(page, offnum);
	bool		tuple_valid;
	IndexTuple	tuple;
	TupleDesc	tupdesc;
	BTScanOpaque so;
	int			keysz;
	int			ikey;
	ScanKey		key;

	*continuescan = true;		/* default assumption */

	/*
	 * If the scan specifies not to return killed tuples, then we treat a
	 * killed tuple as not passing the qual.  Most of the time, it's a win
	 * to not bother examining the tuple's index keys, but just return
	 * immediately with continuescan = true to proceed to the next tuple.
	 * However, if this is the last tuple on the page, we should check the
	 * index keys to prevent uselessly advancing to the next page.
	 */
	if (scan->ignore_killed_tuples && ItemIdDeleted(iid))
	{
		/* return immediately if there are more tuples on the page */
		if (ScanDirectionIsForward(dir))
		{
			if (offnum < PageGetMaxOffsetNumber(page))
				return false;
		}
		else
		{
			BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

			if (offnum > P_FIRSTDATAKEY(opaque))
				return false;
		}

		/*
		 * OK, we want to check the keys, but we'll return FALSE even if the
		 * tuple passes the key tests.
		 */
		tuple_valid = false;
	}
	else
		tuple_valid = true;

	tuple = (IndexTuple) PageGetItem(page, iid);

	IncrIndexProcessed();

	tupdesc = RelationGetDescr(scan->indexRelation);
	so = (BTScanOpaque) scan->opaque;
	keysz = so->numberOfKeys;

	for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
	{
		Datum		datum;
		bool		isNull;
		Datum		test;

		/* row-comparison keys need special processing */
		if (key->sk_flags & SK_ROW_HEADER)
		{
			if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
				continue;
			return false;
		}

		datum = index_getattr(tuple,
							  key->sk_attno,
							  tupdesc,
							  &isNull);

		/* btree doesn't support 'A is null' clauses, yet */
		if (key->sk_flags & SK_ISNULL)
		{
			/* we shouldn't get here, really; see _bt_preprocess_keys() */
			*continuescan = false;
			return false;
		}

		if (isNull)
		{
			if (key->sk_flags & SK_BT_NULLS_FIRST)
			{
				/*
				 * Since NULLs are sorted before non-NULLs, we know we have
				 * reached the lower limit of the range of values for this
				 * index attr.  On a backward scan, we can stop if this qual
				 * is one of the "must match" subset.  On a forward scan,
				 * however, we should keep going.
				 */
				if ((key->sk_flags & SK_BT_REQBKWD) &&
					ScanDirectionIsBackward(dir))
					*continuescan = false;
			}
			else
			{
				/*
				 * Since NULLs are sorted after non-NULLs, we know we have
				 * reached the upper limit of the range of values for this
				 * index attr.  On a forward scan, we can stop if this qual
				 * is one of the "must match" subset.  On a backward scan,
				 * however, we should keep going.
				 */
				if ((key->sk_flags & SK_BT_REQFWD) &&
					ScanDirectionIsForward(dir))
					*continuescan = false;
			}

			/*
			 * In any case, this indextuple doesn't match the qual.
			 */
			return false;
		}

		test = FunctionCall2(&key->sk_func, datum, key->sk_argument);

		if (!DatumGetBool(test))
		{
			/*
			 * Tuple fails this qual.  If it's a required qual for the
			 * current scan direction, then we can conclude no further
			 * tuples will pass, either.
			 *
			 * Note: because we stop the scan as soon as any required
			 * equality qual fails, it is critical that equality quals be
			 * used for the initial positioning in _bt_first() when they
			 * are available.  See comments in _bt_first().
			 */
			if ((key->sk_flags & SK_BT_REQFWD) &&
				ScanDirectionIsForward(dir))
				*continuescan = false;
			else if ((key->sk_flags & SK_BT_REQBKWD) &&
					 ScanDirectionIsBackward(dir))
				*continuescan = false;

			/*
			 * In any case, this indextuple doesn't match the qual.
			 */
			return false;
		}
	}

	/* If we get here, the tuple passes all index quals. */
	if (tuple_valid)
		scan->xs_ctup.t_self = tuple->t_tid;

	return tuple_valid;
}
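For reference, the private nbtree scan-key flag bits tested above, as I recall them from access/nbtree.h of this era; treat the exact values as an assumption.

/* Private scan-key flags used by _bt_checkkeys (recalled from
 * access/nbtree.h of this vintage; verify before relying on values). */
#define SK_BT_REQFWD	0x00010000	/* required to continue forward scan */
#define SK_BT_REQBKWD	0x00020000	/* required to continue backward scan */
#define SK_BT_INDOPTION_SHIFT  24	/* must clear the above bits */
#define SK_BT_NULLS_FIRST	(INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)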
/*
 * hashgetmulti() -- get multiple tuples at once
 *
 * This is a somewhat generic implementation: it avoids lock reacquisition
 * overhead, but there's no smarts about picking especially good stopping
 * points such as index page boundaries.
 */
Datum
hashgetmulti(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1);
	int32		max_tids = PG_GETARG_INT32(2);
	int32	   *returned_tids = (int32 *) PG_GETARG_POINTER(3);
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	bool		res = true;
	int32		ntids = 0;

	/*
	 * We hold pin but not lock on current buffer while outside the hash AM.
	 * Reacquire the read lock here.
	 */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);

	while (ntids < max_tids)
	{
		/*
		 * Start scan, or advance to next tuple.
		 */
		if (ItemPointerIsValid(&(scan->currentItemData)))
			res = _hash_next(scan, ForwardScanDirection);
		else
			res = _hash_first(scan, ForwardScanDirection);

		/*
		 * Skip killed tuples if asked to.
		 */
		if (scan->ignore_killed_tuples)
		{
			while (res)
			{
				Page		page;
				OffsetNumber offnum;

				offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData));
				page = BufferGetPage(so->hashso_curbuf);
				if (!ItemIdDeleted(PageGetItemId(page, offnum)))
					break;
				res = _hash_next(scan, ForwardScanDirection);
			}
		}

		if (!res)
			break;
		/* Save tuple ID, and continue scanning */
		tids[ntids] = scan->xs_ctup.t_self;
		ntids++;
	}

	/* Release read lock on current buffer, but keep it pinned */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);

	*returned_tids = ntids;
	PG_RETURN_BOOL(res);
}
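The lock juggling above uses the hash AM's buffer access codes. As best I recall from access/hash.h of this era (an assumption, not a verified excerpt), they map directly onto the generic buffer lock modes, which is what makes the pin-but-no-lock handoff cheap.

/* Hash AM buffer access codes (recalled from access/hash.h; verify).
 * _hash_chgbufaccess(rel, buf, from, to) switches between these states:
 * (HASH_NOLOCK -> HASH_READ) takes the shared lock on a pinned buffer,
 * (HASH_READ -> HASH_NOLOCK) drops the lock while keeping the pin. */
#define HASH_READ		BUFFER_LOCK_SHARE
#define HASH_WRITE		BUFFER_LOCK_EXCLUSIVE
#define HASH_NOLOCK		(-1)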
/*
 * hashgettuple() -- Get the next tuple in the scan.
 */
Datum
hashgettuple(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	Page		page;
	OffsetNumber offnum;
	bool		res;

	/*
	 * We hold pin but not lock on current buffer while outside the hash AM.
	 * Reacquire the read lock here.
	 */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);

	/*
	 * If we've already initialized this scan, we can just advance it in the
	 * appropriate direction.  If we haven't done so yet, we call a routine
	 * to get the first item in the scan.
	 */
	if (ItemPointerIsValid(&(scan->currentItemData)))
	{
		/*
		 * Check to see if we should kill the previously-fetched tuple.
		 */
		if (scan->kill_prior_tuple)
		{
			/*
			 * Yes, so mark it by setting the LP_DELETE bit in the item
			 * flags.
			 */
			offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData));
			page = BufferGetPage(so->hashso_curbuf);
			PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;

			/*
			 * Since this can be redone later if needed, it's treated the
			 * same as a commit-hint-bit status update for heap tuples: we
			 * mark the buffer dirty but don't make a WAL log entry.
			 */
			SetBufferCommitInfoNeedsSave(so->hashso_curbuf);
		}

		/*
		 * Now continue the scan.
		 */
		res = _hash_next(scan, dir);
	}
	else
		res = _hash_first(scan, dir);

	/*
	 * Skip killed tuples if asked to.
	 */
	if (scan->ignore_killed_tuples)
	{
		while (res)
		{
			offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData));
			page = BufferGetPage(so->hashso_curbuf);
			if (!ItemIdDeleted(PageGetItemId(page, offnum)))
				break;
			res = _hash_next(scan, dir);
		}
	}

	/* Release read lock on current buffer, but keep it pinned */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);

	PG_RETURN_BOOL(res);
}
/*
 * PageRepairFragmentation
 *
 * Frees fragmented space on a page.
 * It doesn't remove unused line pointers! Please don't change this.
 *
 * This routine is usable for heap pages only, but see PageIndexMultiDelete.
 *
 * Returns number of unused line pointers on page.  If "unused" is not NULL
 * then the unused[] array is filled with indexes of unused line pointers.
 */
int
PageRepairFragmentation(Page page, OffsetNumber *unused)
{
	Offset		pd_lower = ((PageHeader) page)->pd_lower;
	Offset		pd_upper = ((PageHeader) page)->pd_upper;
	Offset		pd_special = ((PageHeader) page)->pd_special;
	itemIdSort	itemidbase,
				itemidptr;
	ItemId		lp;
	int			nline,
				nused;
	int			i;
	Size		totallen;
	Offset		upper;

	/*
	 * It's worth the trouble to be more paranoid here than in most places,
	 * because we are about to reshuffle data in (what is usually) a shared
	 * disk buffer.  If we aren't careful then corrupted pointers, lengths,
	 * etc could cause us to clobber adjacent disk buffers, spreading the
	 * data loss further.  So, check everything.
	 */
	if (pd_lower < SizeOfPageHeaderData ||
		pd_lower > pd_upper ||
		pd_upper > pd_special ||
		pd_special > BLCKSZ ||
		pd_special != MAXALIGN(pd_special))
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
						pd_lower, pd_upper, pd_special),
				 errSendAlert(true)));

	nline = PageGetMaxOffsetNumber(page);
	nused = 0;
	for (i = 0; i < nline; i++)
	{
		lp = PageGetItemId(page, i + 1);
		if (ItemIdDeleted(lp))	/* marked for deletion */
			lp->lp_flags &= ~(LP_USED | LP_DELETE);
		if (ItemIdIsUsed(lp))
			nused++;
		else if (unused)
			unused[i - nused] = (OffsetNumber) i;
	}

	if (nused == 0)
	{
		/* Page is completely empty, so just reset it quickly */
		for (i = 0; i < nline; i++)
		{
			lp = PageGetItemId(page, i + 1);
			lp->lp_len = 0;		/* indicate unused & deallocated */
		}
		((PageHeader) page)->pd_upper = pd_special;
	}
	else
	{							/* nused != 0 */
		/* Need to compact the page the hard way */
		itemidbase = (itemIdSort) palloc(sizeof(itemIdSortData) * nused);
		itemidptr = itemidbase;
		totallen = 0;
		for (i = 0; i < nline; i++)
		{
			lp = PageGetItemId(page, i + 1);
			if (ItemIdIsUsed(lp))
			{
				itemidptr->offsetindex = i;
				itemidptr->itemoff = ItemIdGetOffset(lp);
				if (itemidptr->itemoff < (int) pd_upper ||
					itemidptr->itemoff >= (int) pd_special)
					ereport(ERROR,
							(errcode(ERRCODE_DATA_CORRUPTED),
							 errmsg("corrupted item pointer: %u",
									itemidptr->itemoff),
							 errSendAlert(true)));
				itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp));
				totallen += itemidptr->alignedlen;
				itemidptr++;
			}
			else
			{
				lp->lp_len = 0; /* indicate unused & deallocated */
			}
		}

		if (totallen > (Size) (pd_special - pd_lower))
			ereport(ERROR,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("corrupted item lengths: total %u, available space %u",
							(unsigned int) totallen, pd_special - pd_lower),
					 errSendAlert(true)));

		/* sort itemIdSortData array into decreasing itemoff order */
		qsort((char *) itemidbase, nused, sizeof(itemIdSortData),
			  itemoffcompare);

		/* compactify page */
		upper = pd_special;

		for (i = 0, itemidptr = itemidbase; i < nused; i++, itemidptr++)
		{
			lp = PageGetItemId(page, itemidptr->offsetindex + 1);
			upper -= itemidptr->alignedlen;
			memmove((char *) page + upper,
					(char *) page + itemidptr->itemoff,
					itemidptr->alignedlen);
			lp->lp_off = upper;
		}

		((PageHeader) page)->pd_upper = upper;

		pfree(itemidbase);
	}

	/* Set hint bit for PageAddItem */
	if (nused < nline)
		PageSetHasFreeLinePointers(page);
	else
		PageClearHasFreeLinePointers(page);

	return (nline - nused);
}
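PageRepairFragmentation relies on an itemIdSortData scratch array and an itemoffcompare() qsort comparator that this excerpt does not show. Below is a minimal version consistent with the usage above; the real definitions live in bufpage.c, so treat this as a sketch rather than the authoritative source.

/* Scratch entry describing one live item; a sketch consistent with the
 * usage in PageRepairFragmentation above. */
typedef struct itemIdSortData
{
	int			offsetindex;	/* linp array index */
	int			itemoff;		/* page offset of item data */
	Size		alignedlen;		/* MAXALIGN(item data len) */
} itemIdSortData;
typedef itemIdSortData *itemIdSort;

/* qsort comparator: sort into decreasing itemoff order, so the compaction
 * loop can place items from the end of the page downward. */
static int
itemoffcompare(const void *itemidp1, const void *itemidp2)
{
	return ((itemIdSort) itemidp2)->itemoff -
		((itemIdSort) itemidp1)->itemoff;
}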