/*
 * Check if our cached information about a datatype is still valid
 */
static bool
PLy_procedure_argument_valid(PLyTypeInfo *arg)
{
	HeapTuple	relTup;
	bool		valid;

	/* Nothing to cache unless type is composite */
	if (arg->is_rowtype != 1)
		return true;

	/*
	 * Zero typ_relid means that we got called on an output argument of a
	 * function returning an unnamed record type; the info for it can't
	 * change.
	 */
	if (!OidIsValid(arg->typ_relid))
		return true;

	/* Else we should have some cached data */
	Assert(TransactionIdIsValid(arg->typrel_xmin));
	Assert(ItemPointerIsValid(&arg->typrel_tid));

	/* Get the pg_class tuple for the data type */
	relTup = SearchSysCache1(RELOID, ObjectIdGetDatum(arg->typ_relid));
	if (!HeapTupleIsValid(relTup))
		elog(ERROR, "cache lookup failed for relation %u", arg->typ_relid);

	/* If it has changed, the cached data is not valid */
	valid = (arg->typrel_xmin == HeapTupleHeaderGetRawXmin(relTup->t_data) &&
			 ItemPointerEquals(&arg->typrel_tid, &relTup->t_self));

	ReleaseSysCache(relTup);

	return valid;
}
/*
 * Decide whether a cached PLyProcedure struct is still valid
 */
static bool
PLy_procedure_valid(PLyProcedure *proc, HeapTuple procTup)
{
	int			i;
	bool		valid;

	Assert(proc != NULL);

	/* If the pg_proc tuple has changed, it's not valid */
	if (!(proc->fn_xmin == HeapTupleHeaderGetRawXmin(procTup->t_data) &&
		  ItemPointerEquals(&proc->fn_tid, &procTup->t_self)))
		return false;

	/* Else check the input argument datatypes */
	valid = true;
	for (i = 0; i < proc->nargs; i++)
	{
		valid = PLy_procedure_argument_valid(&proc->args[i]);

		/* Short-circuit on first changed argument */
		if (!valid)
			break;
	}

	/* if the output type is composite, it might have changed */
	if (valid)
		valid = PLy_procedure_argument_valid(&proc->result);

	return valid;
}
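/*
 * Illustrative sketch only (not part of PL/Python): both validity checks
 * above rely on the same idiom of remembering the raw xmin and the TID of
 * the catalog tuple the cached state was built from, and treating the cache
 * as stale as soon as either differs on a later lookup.  The MyCatalogCache
 * struct and the cache_remember()/cache_is_current() names below are
 * hypothetical and exist only to show the pattern in isolation.
 */
#include "postgres.h"
#include "access/htup_details.h"
#include "storage/itemptr.h"

typedef struct MyCatalogCache
{
	TransactionId src_xmin;		/* raw xmin of the source catalog tuple */
	ItemPointerData src_tid;	/* t_self of that tuple */
	/* ... expensive-to-rebuild derived state would live here ... */
} MyCatalogCache;

/* Record which catalog tuple version the cached state was derived from. */
static void
cache_remember(MyCatalogCache *cache, HeapTuple tup)
{
	cache->src_xmin = HeapTupleHeaderGetRawXmin(tup->t_data);
	cache->src_tid = tup->t_self;
}

/*
 * After re-fetching the catalog tuple, decide whether the cached state can
 * still be trusted: any update of the row creates a new tuple version with
 * a different xmin and/or TID, so a mismatch means "rebuild".
 */
static bool
cache_is_current(MyCatalogCache *cache, HeapTuple tup)
{
	return cache->src_xmin == HeapTupleHeaderGetRawXmin(tup->t_data) &&
		ItemPointerEquals(&cache->src_tid, &tup->t_self);
}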
/*
 * Collect data from a pending-list page in preparation for insertion into
 * the main index.
 *
 * Go through all tuples >= startoff on page and collect values in accum
 *
 * Note that ka is just workspace --- it does not carry any state across
 * calls.
 */
static void
processPendingPage(BuildAccumulator *accum, KeyArray *ka,
				   Page page, OffsetNumber startoff)
{
	ItemPointerData heapptr;
	OffsetNumber i,
				maxoff;
	OffsetNumber attrnum;

	/* reset *ka to empty */
	ka->nvalues = 0;

	maxoff = PageGetMaxOffsetNumber(page);
	Assert(maxoff >= FirstOffsetNumber);
	ItemPointerSetInvalid(&heapptr);
	attrnum = 0;

	for (i = startoff; i <= maxoff; i = OffsetNumberNext(i))
	{
		IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
		OffsetNumber curattnum;
		Datum		curkey;
		GinNullCategory curcategory;

		/* Check for change of heap TID or attnum */
		curattnum = gintuple_get_attrnum(accum->ginstate, itup);

		if (!ItemPointerIsValid(&heapptr))
		{
			heapptr = itup->t_tid;
			attrnum = curattnum;
		}
		else if (!(ItemPointerEquals(&heapptr, &itup->t_tid) &&
				   curattnum == attrnum))
		{
			/*
			 * ginInsertBAEntries can insert several datums per call, but
			 * only for one heap tuple and one column.  So call it at a
			 * boundary, and reset ka.
			 */
			ginInsertBAEntries(accum, &heapptr, attrnum,
							   ka->keys, ka->categories, ka->nvalues);
			ka->nvalues = 0;
			heapptr = itup->t_tid;
			attrnum = curattnum;
		}

		/* Add key to KeyArray */
		curkey = gintuple_get_key(accum->ginstate, itup, &curcategory);
		addDatum(ka, curkey, curcategory);
	}

	/* Dump out all remaining keys */
	ginInsertBAEntries(accum, &heapptr, attrnum,
					   ka->keys, ka->categories, ka->nvalues);
}
/*
 * Update a tuple in an in-memory heap table.
 *
 * If the target tuple is already in memory, update it in place and mark it
 * with the flag INMEM_HEAP_TUPLE_UPDATED; otherwise report an error.
 *
 * The update must not change the otid of the old tuple, since the updated
 * tuple is written back to the master and updated there.
 */
void
InMemHeap_Update(InMemHeapRelation relation, ItemPointer otid,
				 HeapTuple tup)
{
	int			pos;
	HeapTuple	target;
	MemoryContext oldmem = CurrentMemoryContext;

	Assert(ItemPointerIsValid(otid));

	pos = InMemHeap_Find(relation, otid);

	CurrentMemoryContext = relation->memcxt;

	/*
	 * not found, report an error
	 */
	if (pos >= relation->tupsize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("update a tuple which does not exist,"
						" relname = %s, relid = %u",
						relation->rel->rd_rel->relname.data,
						relation->relid)));
	}

	Insist(relation->hashIndex == NULL &&
		   "cannot handle index in in-memory heap when update");

	/*
	 * already in table
	 */
	Assert(relation->tuples[pos].flags == INMEM_HEAP_TUPLE_DISPATCHED ||
		   relation->tuples[pos].flags == INMEM_HEAP_TUPLE_UPDATED);
	relation->tuples[pos].flags = INMEM_HEAP_TUPLE_UPDATED;

	target = heaptuple_copy_to(tup, NULL, NULL);

	/*
	 * do not modify original tuple header
	 */
	ItemPointerCopy(&target->t_self, &relation->tuples[pos].tuple->t_self);

	Assert(ItemPointerEquals(&target->t_self, otid));

	memcpy(target->t_data, relation->tuples[pos].tuple->t_data,
		   sizeof(HeapTupleHeaderData));

	CurrentMemoryContext = oldmem;

	pfree(relation->tuples[pos].tuple);
	relation->tuples[pos].tuple = target;
}
/*
 * Decide whether a cached PLyProcedure struct is still valid
 */
static bool
PLy_procedure_valid(PLyProcedure *proc, HeapTuple procTup)
{
	if (proc == NULL)
		return false;

	/* If the pg_proc tuple has changed, it's not valid */
	if (!(proc->fn_xmin == HeapTupleHeaderGetRawXmin(procTup->t_data) &&
		  ItemPointerEquals(&proc->fn_tid, &procTup->t_self)))
		return false;

	return true;
}
static void
killtuple(Relation r, GISTScanOpaque so, ItemPointer iptr)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	Page		p;
	OffsetNumber offset;

	// -------- MirroredLock ----------
	MIRROREDLOCK_BUFMGR_LOCK;

	LockBuffer(so->curbuf, GIST_SHARE);
	gistcheckpage(r, so->curbuf);
	p = (Page) BufferGetPage(so->curbuf);

	if (XLByteEQ(so->stack->lsn, PageGetLSN(p)))
	{
		/* page unchanged, so all is simple */
		offset = ItemPointerGetOffsetNumber(iptr);
		ItemIdMarkDead(PageGetItemId(p, offset));
		SetBufferCommitInfoNeedsSave(so->curbuf);
	}
	else
	{
		OffsetNumber maxoff = PageGetMaxOffsetNumber(p);

		for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset))
		{
			IndexTuple	ituple = (IndexTuple) PageGetItem(p, PageGetItemId(p, offset));

			if (ItemPointerEquals(&(ituple->t_tid), iptr))
			{
				/* found */
				ItemIdMarkDead(PageGetItemId(p, offset));
				SetBufferCommitInfoNeedsSave(so->curbuf);
				break;
			}
		}
	}

	LockBuffer(so->curbuf, GIST_UNLOCK);

	MIRROREDLOCK_BUFMGR_UNLOCK;
	// -------- MirroredLock ----------
}
/*
 * Add TID to pendingList, but only if not already present.
 *
 * Note that new items are always appended at the end of the list; this
 * ensures that scans of the list don't miss items added during the scan.
 */
static void
spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid)
{
	spgVacPendingItem *pitem;
	spgVacPendingItem **listLink;

	/* search the list for pre-existing entry */
	listLink = &bds->pendingList;
	while (*listLink != NULL)
	{
		pitem = *listLink;
		if (ItemPointerEquals(tid, &pitem->tid))
			return;				/* already in list, do nothing */
		listLink = &pitem->next;
	}
	/* not there, so append new entry */
	pitem = (spgVacPendingItem *) palloc(sizeof(spgVacPendingItem));
	pitem->tid = *tid;
	pitem->done = false;
	pitem->next = NULL;
	*listLink = pitem;
}
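/*
 * Illustrative sketch only (not part of the SP-GiST code): the listLink
 * double pointer used in spgAddPendingTID above is the standard idiom for
 * appending to a singly linked list without special-casing an empty list,
 * because *link is either the head pointer or some node's next field.  The
 * Node type and append_if_absent() below are hypothetical.
 */
#include <stdbool.h>
#include <stdlib.h>

typedef struct Node
{
	int			value;
	struct Node *next;
} Node;

/* Append 'value' unless it is already present; returns true if appended. */
static bool
append_if_absent(Node **head, int value)
{
	Node	  **link = head;
	Node	   *node;

	/* walk to the end of the list, checking for an existing entry */
	while (*link != NULL)
	{
		if ((*link)->value == value)
			return false;		/* already in list, do nothing */
		link = &(*link)->next;
	}

	/* not there, so append a new entry at *link (head or last ->next) */
	node = (Node *) malloc(sizeof(Node));
	if (node == NULL)
		return false;
	node->value = value;
	node->next = NULL;
	*link = node;
	return true;
}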
/*
 * PyPgFunction_IsCurrent - determine whether 'func' still matches the
 * current pg_proc entry
 */
bool
PyPgFunction_IsCurrent(PyObj func)
{
	HeapTuple	ht;
	ItemPointerData fn_tid;
	TransactionId fn_xmin,
				last_fn_xmin;

	last_fn_xmin = PyPgFunction_GetXMin(func);
	if (last_fn_xmin == InvalidTransactionId)
	{
		/* pseudo-function */
		return(true);
	}

	ht = SearchSysCache(PROCOID, PyPgFunction_GetOid(func), 0, 0, 0);
	if (!HeapTupleIsValid(ht))
		return(false);

	fn_xmin = HeapTupleHeaderGetXmin(ht->t_data);
	fn_tid = ht->t_self;
	ReleaseSysCache(ht);

	if (last_fn_xmin != fn_xmin ||
		!ItemPointerEquals(PyPgFunction_GetItemPointer(func), &fn_tid))
	{
		return(false);
	}

	if (!PyPgTupleDesc_IsCurrent(PyPgFunction_GetInput(func)))
	{
		return(false);
	}

	if (!PyPgType_IsCurrent(PyPgFunction_GetOutput(func)))
		return(false);

	return(true);
}
/* * Fetch the BrinTuple for a given heap block. * * The buffer containing the tuple is locked, and returned in *buf. As an * optimization, the caller can pass a pinned buffer *buf on entry, which will * avoid a pin-unpin cycle when the next tuple is on the same page as a * previous one. * * If no tuple is found for the given heap range, returns NULL. In that case, * *buf might still be updated, but it's not locked. * * The output tuple offset within the buffer is returned in *off, and its size * is returned in *size. */ BrinTuple * brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, Buffer *buf, OffsetNumber *off, Size *size, int mode) { Relation idxRel = revmap->rm_irel; BlockNumber mapBlk; RevmapContents *contents; ItemPointerData *iptr; BlockNumber blk; Page page; ItemId lp; BrinTuple *tup; ItemPointerData previptr; /* normalize the heap block number to be the first page in the range */ heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange; /* Compute the revmap page number we need */ mapBlk = revmap_get_blkno(revmap, heapBlk); if (mapBlk == InvalidBlockNumber) { *off = InvalidOffsetNumber; return NULL; } ItemPointerSetInvalid(&previptr); for (;;) { CHECK_FOR_INTERRUPTS(); if (revmap->rm_currBuf == InvalidBuffer || BufferGetBlockNumber(revmap->rm_currBuf) != mapBlk) { if (revmap->rm_currBuf != InvalidBuffer) ReleaseBuffer(revmap->rm_currBuf); Assert(mapBlk != InvalidBlockNumber); revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk); } LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_SHARE); contents = (RevmapContents *) PageGetContents(BufferGetPage(revmap->rm_currBuf)); iptr = contents->rm_tids; iptr += HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk); if (!ItemPointerIsValid(iptr)) { LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK); return NULL; } /* * Check the TID we got in a previous iteration, if any, and save the * current TID we got from the revmap; if we loop, we can sanity-check * that the next one we get is different. Otherwise we might be stuck * looping forever if the revmap is somehow badly broken. */ if (ItemPointerIsValid(&previptr) && ItemPointerEquals(&previptr, iptr)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("corrupted BRIN index: inconsistent range map"))); previptr = *iptr; blk = ItemPointerGetBlockNumber(iptr); *off = ItemPointerGetOffsetNumber(iptr); LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK); /* Ok, got a pointer to where the BrinTuple should be. Fetch it. */ if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != blk) { if (BufferIsValid(*buf)) ReleaseBuffer(*buf); *buf = ReadBuffer(idxRel, blk); } LockBuffer(*buf, mode); page = BufferGetPage(*buf); /* If we land on a revmap page, start over */ if (BRIN_IS_REGULAR_PAGE(page)) { lp = PageGetItemId(page, *off); if (ItemIdIsUsed(lp)) { tup = (BrinTuple *) PageGetItem(page, lp); if (tup->bt_blkno == heapBlk) { if (size) *size = ItemIdGetLength(lp); /* found it! */ return tup; } } } /* * No luck. Assume that the revmap was updated concurrently. */ LockBuffer(*buf, BUFFER_LOCK_UNLOCK); } /* not reached, but keep compiler quiet */ return NULL; }
/* * CatalogCacheIdInvalidate * * Invalidate entries in the specified cache, given a hash value and * item pointer. Positive entries are deleted if they match the item * pointer. Negative entries must be deleted if they match the hash * value (since we do not have the exact key of the tuple that's being * inserted). But this should only rarely result in loss of a cache * entry that could have been kept. * * Note that it's not very relevant whether the tuple identified by * the item pointer is being inserted or deleted. We don't expect to * find matching positive entries in the one case, and we don't expect * to find matching negative entries in the other; but we will do the * right things in any case. * * This routine is only quasi-public: it should only be used by inval.c. */ void CatalogCacheIdInvalidate(int cacheId, uint32 hashValue, ItemPointer pointer) { CatCache *ccp; /* * sanity checks */ Assert(ItemPointerIsValid(pointer)); CACHE1_elog(DEBUG2, "CatalogCacheIdInvalidate: called"); /* * inspect caches to find the proper cache */ for (ccp = CacheHdr->ch_caches; ccp; ccp = ccp->cc_next) { Index hashIndex; Dlelem *elt, *nextelt; if (cacheId != ccp->id) continue; /* * We don't bother to check whether the cache has finished * initialization yet; if not, there will be no entries in it so * no problem. */ /* * Invalidate *all* CatCLists in this cache; it's too hard to tell * which searches might still be correct, so just zap 'em all. */ for (elt = DLGetHead(&ccp->cc_lists); elt; elt = nextelt) { CatCList *cl = (CatCList *) DLE_VAL(elt); nextelt = DLGetSucc(elt); if (cl->refcount > 0) cl->dead = true; else CatCacheRemoveCList(ccp, cl); } /* * inspect the proper hash bucket for tuple matches */ hashIndex = HASH_INDEX(hashValue, ccp->cc_nbuckets); for (elt = DLGetHead(&ccp->cc_bucket[hashIndex]); elt; elt = nextelt) { CatCTup *ct = (CatCTup *) DLE_VAL(elt); nextelt = DLGetSucc(elt); if (hashValue != ct->hash_value) continue; /* ignore non-matching hash values */ if (ct->negative || ItemPointerEquals(pointer, &ct->tuple.t_self)) { if (ct->refcount > 0) ct->dead = true; else CatCacheRemoveCTup(ccp, ct); CACHE1_elog(DEBUG2, "CatalogCacheIdInvalidate: invalidated"); #ifdef CATCACHE_STATS ccp->cc_invals++; #endif /* could be multiple matches, so keep looking! */ } } break; /* need only search this one cache */ } }
/* * hashgettuple() -- Get the next tuple in the scan. */ bool hashgettuple(IndexScanDesc scan, ScanDirection dir) { HashScanOpaque so = (HashScanOpaque) scan->opaque; Relation rel = scan->indexRelation; Buffer buf; Page page; OffsetNumber offnum; ItemPointer current; bool res; /* Hash indexes are always lossy since we store only the hash code */ scan->xs_recheck = true; /* * We hold pin but not lock on current buffer while outside the hash AM. * Reacquire the read lock here. */ if (BufferIsValid(so->hashso_curbuf)) _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ); /* * If we've already initialized this scan, we can just advance it in the * appropriate direction. If we haven't done so yet, we call a routine to * get the first item in the scan. */ current = &(so->hashso_curpos); if (ItemPointerIsValid(current)) { /* * An insertion into the current index page could have happened while * we didn't have read lock on it. Re-find our position by looking * for the TID we previously returned. (Because we hold share lock on * the bucket, no deletions or splits could have occurred; therefore * we can expect that the TID still exists in the current index page, * at an offset >= where we were.) */ OffsetNumber maxoffnum; buf = so->hashso_curbuf; Assert(BufferIsValid(buf)); page = BufferGetPage(buf); TestForOldSnapshot(scan->xs_snapshot, rel, page); maxoffnum = PageGetMaxOffsetNumber(page); for (offnum = ItemPointerGetOffsetNumber(current); offnum <= maxoffnum; offnum = OffsetNumberNext(offnum)) { IndexTuple itup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid))) break; } if (offnum > maxoffnum) elog(ERROR, "failed to re-find scan position within index \"%s\"", RelationGetRelationName(rel)); ItemPointerSetOffsetNumber(current, offnum); /* * Check to see if we should kill the previously-fetched tuple. */ if (scan->kill_prior_tuple) { /* * Yes, so mark it by setting the LP_DEAD state in the item flags. */ ItemIdMarkDead(PageGetItemId(page, offnum)); /* * Since this can be redone later if needed, mark as a hint. */ MarkBufferDirtyHint(buf, true); } /* * Now continue the scan. */ res = _hash_next(scan, dir); } else res = _hash_first(scan, dir); /* * Skip killed tuples if asked to. */ if (scan->ignore_killed_tuples) { while (res) { offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(so->hashso_curbuf); if (!ItemIdIsDead(PageGetItemId(page, offnum))) break; res = _hash_next(scan, dir); } } /* Release read lock on current buffer, but keep it pinned */ if (BufferIsValid(so->hashso_curbuf)) _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK); /* Return current heap TID on success */ scan->xs_ctup.t_self = so->hashso_heappos; return res; }
/* * _bt_killitems - set LP_DEAD state for items an indexscan caller has * told us were killed * * scan->so contains information about the current page and killed tuples * thereon (generally, this should only be called if so->numKilled > 0). * * The caller must have pin on so->currPos.buf, but may or may not have * read-lock, as indicated by haveLock. Note that we assume read-lock * is sufficient for setting LP_DEAD status (which is only a hint). * * We match items by heap TID before assuming they are the right ones to * delete. We cope with cases where items have moved right due to insertions. * If an item has moved off the current page due to a split, we'll fail to * find it and do nothing (this is not an error case --- we assume the item * will eventually get marked in a future indexscan). Note that because we * hold pin on the target page continuously from initially reading the items * until applying this function, VACUUM cannot have deleted any items from * the page, and so there is no need to search left from the recorded offset. * (This observation also guarantees that the item is still the right one * to delete, which might otherwise be questionable since heap TIDs can get * recycled.) */ void _bt_killitems(IndexScanDesc scan, bool haveLock) { BTScanOpaque so = (BTScanOpaque) scan->opaque; Page page; BTPageOpaque opaque; OffsetNumber minoff; OffsetNumber maxoff; int i; bool killedsomething = false; Assert(BufferIsValid(so->currPos.buf)); if (!haveLock) LockBuffer(so->currPos.buf, BT_READ); page = BufferGetPage(so->currPos.buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); for (i = 0; i < so->numKilled; i++) { int itemIndex = so->killedItems[i]; BTScanPosItem *kitem = &so->currPos.items[itemIndex]; OffsetNumber offnum = kitem->indexOffset; Assert(itemIndex >= so->currPos.firstItem && itemIndex <= so->currPos.lastItem); if (offnum < minoff) continue; /* pure paranoia */ while (offnum <= maxoff) { ItemId iid = PageGetItemId(page, offnum); IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) { /* found the item */ ItemIdMarkDead(iid); killedsomething = true; break; /* out of inner search loop */ } offnum = OffsetNumberNext(offnum); } } /* * Since this can be redone later if needed, it's treated the same as a * commit-hint-bit status update for heap tuples: we mark the buffer dirty * but don't make a WAL log entry. * * Whenever we mark anything LP_DEAD, we also set the page's * BTP_HAS_GARBAGE flag, which is likewise just a hint. */ if (killedsomething) { opaque->btpo_flags |= BTP_HAS_GARBAGE; SetBufferCommitInfoNeedsSave(so->currPos.buf); } if (!haveLock) LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); /* * Always reset the scan state, so we don't look for same items on other * pages. */ so->numKilled = 0; }
/* ----------------------------------------------------------------
 *		ExecDelete
 *
 *		DELETE is like UPDATE, except that we delete the tuple and no
 *		index modifications are needed.
 *		DELETE can be part of an update operation when
 *		there is a preceding SplitUpdate node.
 *
 * ----------------------------------------------------------------
 */
void
ExecDelete(ItemPointer tupleid,
		   TupleTableSlot *planSlot,
		   DestReceiver *dest,
		   EState *estate,
		   PlanGenerator planGen,
		   bool isUpdate)
{
	ResultRelInfo *resultRelInfo;
	Relation	resultRelationDesc;
	HTSU_Result result;
	ItemPointerData update_ctid;
	TransactionId update_xmax;

	/*
	 * Get information on the (current) result relation.
	 */
	if (estate->es_result_partitions && planGen == PLANGEN_OPTIMIZER)
	{
		Assert(estate->es_result_partitions->part->parrelid);

#ifdef USE_ASSERT_CHECKING
		Oid			parent = estate->es_result_partitions->part->parrelid;
#endif

		/* Obtain part for current tuple. */
		resultRelInfo = slot_get_partition(planSlot, estate);
		estate->es_result_relation_info = resultRelInfo;

#ifdef USE_ASSERT_CHECKING
		Oid			part = RelationGetRelid(resultRelInfo->ri_RelationDesc);
#endif

		Assert(parent != part);
	}
	else
	{
		resultRelInfo = estate->es_result_relation_info;
	}
	resultRelationDesc = resultRelInfo->ri_RelationDesc;

	Assert(!resultRelInfo->ri_projectReturning);

	if (planGen == PLANGEN_PLANNER)
	{
		/* BEFORE ROW DELETE Triggers */
		if (resultRelInfo->ri_TrigDesc &&
			resultRelInfo->ri_TrigDesc->n_before_row[TRIGGER_EVENT_DELETE] > 0)
		{
			bool		dodelete;

			dodelete = ExecBRDeleteTriggers(estate, resultRelInfo, tupleid,
											estate->es_snapshot->curcid);

			if (!dodelete)		/* "do nothing" */
				return;
		}
	}

	/*
	 * delete the tuple
	 *
	 * Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check that
	 * the row to be deleted is visible to that snapshot, and throw a can't-
	 * serialize error if not. This is a special-case behavior needed for
	 * referential integrity updates in serializable transactions.
	 */
ldelete:;
	result = heap_delete(resultRelationDesc, tupleid,
						 &update_ctid, &update_xmax,
						 estate->es_snapshot->curcid,
						 estate->es_crosscheck_snapshot,
						 true /* wait for commit */ );

	switch (result)
	{
		case HeapTupleSelfUpdated:
			/* already deleted by self; nothing to do */

			/*
			 * In a scenario in which R(a,b) and S(a,b) have
			 *
			 *    R        S
			 * ________ ________
			 * (1, 1)   (1, 2)
			 *          (1, 7)
			 *
			 * an update query such as:
			 *
			 * UPDATE R SET a = S.b FROM S WHERE R.b = S.a;
			 *
			 * will have a non-deterministic output. The tuple in R can be
			 * updated to (2,1) or (7,1). Since the introduction of
			 * SplitUpdate, these queries will send multiple requests to
			 * delete the same tuple. Therefore, in order to avoid a
			 * non-deterministic output, an error is reported in such a
			 * scenario.
			 */
			if (isUpdate)
			{
				ereport(ERROR,
						(errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION),
						 errmsg("multiple updates to a row by the same query is not allowed")));
			}
			return;

		case HeapTupleMayBeUpdated:
			break;

		case HeapTupleUpdated:
			if (IsXactIsoLevelSerializable)
				ereport(ERROR,
						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
						 errmsg("could not serialize access due to concurrent update")));
			else if (!ItemPointerEquals(tupleid, &update_ctid))
			{
				TupleTableSlot *epqslot;

				epqslot = EvalPlanQual(estate,
									   resultRelInfo->ri_RangeTableIndex,
									   &update_ctid,
									   update_xmax,
									   estate->es_snapshot->curcid);
				if (!TupIsNull(epqslot))
				{
					*tupleid = update_ctid;
					goto ldelete;
				}
			}
			/* tuple already deleted; nothing to do */
			return;

		default:
			elog(ERROR, "unrecognized heap_delete status: %u", result);
			return;
	}

	if (!isUpdate)
	{
		IncrDeleted();
		(estate->es_processed)++;
	}

	/*
	 * Note: Normally one would think that we have to delete index tuples
	 * associated with the heap tuple now...
	 *
	 * ... but in POSTGRES, we have no need to do this because VACUUM will
	 * take care of it later. We can't delete index tuples immediately
	 * anyway, since the tuple is still visible to other transactions.
	 */
	if (planGen == PLANGEN_PLANNER)
	{
		/* AFTER ROW DELETE Triggers */
		ExecARDeleteTriggers(estate, resultRelInfo, tupleid);
	}
}
/* * CatalogCacheIdInvalidate * * Invalidate entries in the specified cache, given a hash value and * item pointer. Positive entries are deleted if they match the item * pointer. Negative entries must be deleted if they match the hash * value (since we do not have the exact key of the tuple that's being * inserted). But this should only rarely result in loss of a cache * entry that could have been kept. * * Note that it's not very relevant whether the tuple identified by * the item pointer is being inserted or deleted. We don't expect to * find matching positive entries in the one case, and we don't expect * to find matching negative entries in the other; but we will do the * right things in any case. * * This routine is only quasi-public: it should only be used by inval.c. */ void CatalogCacheIdInvalidate(int cacheId, uint32 hashValue, ItemPointer pointer) { CatCache *ccp; /* * sanity checks */ #ifdef USE_ASSERT_CHECKING /* Add some debug info for MPP-5739 */ if (!ItemPointerIsValid(pointer)) { elog(LOG, "CatalogCacheIdInvalidate: cacheId %d, hash %u IP %p", cacheId, hashValue, pointer); if (pointer != NULL) { elog(LOG, "CatalogCacheIdInvalidate: bogus item (?): (blkid.hi %d blkid.lo %d posid %d)", pointer->ip_blkid.bi_hi, pointer->ip_blkid.bi_lo, pointer->ip_posid); } } #endif Assert(ItemPointerIsValid(pointer)); CACHE1_elog(DEBUG2, "CatalogCacheIdInvalidate: called"); /* * inspect caches to find the proper cache */ for (ccp = CacheHdr->ch_caches; ccp; ccp = ccp->cc_next) { Index hashIndex; Dlelem *elt, *nextelt; if (cacheId != ccp->id) continue; /* * We don't bother to check whether the cache has finished * initialization yet; if not, there will be no entries in it so no * problem. */ /* * Invalidate *all* CatCLists in this cache; it's too hard to tell * which searches might still be correct, so just zap 'em all. */ for (elt = DLGetHead(&ccp->cc_lists); elt; elt = nextelt) { CatCList *cl = (CatCList *) DLE_VAL(elt); nextelt = DLGetSucc(elt); if (cl->refcount > 0) cl->dead = true; else CatCacheRemoveCList(ccp, cl); } /* * inspect the proper hash bucket for tuple matches */ hashIndex = HASH_INDEX(hashValue, ccp->cc_nbuckets); for (elt = DLGetHead(&ccp->cc_bucket[hashIndex]); elt; elt = nextelt) { CatCTup *ct = (CatCTup *) DLE_VAL(elt); nextelt = DLGetSucc(elt); if (hashValue != ct->hash_value) continue; /* ignore non-matching hash values */ if (ct->negative || ItemPointerEquals(pointer, &ct->tuple.t_self)) { if (ct->refcount > 0 || (ct->c_list && ct->c_list->refcount > 0)) { ct->dead = true; /* list, if any, was marked dead above */ Assert(ct->c_list == NULL || ct->c_list->dead); } else CatCacheRemoveCTup(ccp, ct); CACHE1_elog(DEBUG2, "CatalogCacheIdInvalidate: invalidated"); #ifdef CATCACHE_STATS ccp->cc_invals++; #endif /* could be multiple matches, so keep looking! */ } } break; /* need only search this one cache */ } }
/* ---------------------------------------------------------------- * ExecUpdate * * note: we can't run UPDATE queries with transactions * off because UPDATEs are actually INSERTs and our * scan will mistakenly loop forever, updating the tuple * it just inserted.. This should be fixed but until it * is, we don't want to get stuck in an infinite loop * which corrupts your database.. * ---------------------------------------------------------------- */ void ExecUpdate(TupleTableSlot *slot, ItemPointer tupleid, TupleTableSlot *planSlot, DestReceiver *dest, EState *estate) { void* tuple; ResultRelInfo *resultRelInfo; Relation resultRelationDesc; HTSU_Result result; ItemPointerData update_ctid; TransactionId update_xmax; AOTupleId aoTupleId = AOTUPLEID_INIT; TupleTableSlot *partslot = NULL; /* * abort the operation if not running transactions */ if (IsBootstrapProcessingMode()) elog(ERROR, "cannot UPDATE during bootstrap"); /* * get information on the (current) result relation */ resultRelInfo = estate->es_result_relation_info; resultRelationDesc = resultRelInfo->ri_RelationDesc; bool rel_is_heap = RelationIsHeap(resultRelationDesc); bool rel_is_aorows = RelationIsAoRows(resultRelationDesc); bool rel_is_aocols = RelationIsAoCols(resultRelationDesc); bool rel_is_external = RelationIsExternal(resultRelationDesc); /* * get the heap tuple out of the tuple table slot, making sure we have a * writable copy */ if (rel_is_heap) { partslot = slot; tuple = ExecFetchSlotHeapTuple(partslot); } else if (rel_is_aorows || rel_is_aocols) { /* * It is necessary to reconstruct a logically compatible tuple to * a phyiscally compatible tuple. The slot's tuple descriptor comes * from the projection target list, which doesn't indicate dropped * columns, and MemTuple cannot deal with cases without converting * the target list back into the original relation's tuple desc. */ partslot = reconstructMatchingTupleSlot(slot, resultRelInfo); /* * We directly inline toasted columns here as update with toasted columns * would create two references to the same toasted value. */ tuple = ExecFetchSlotMemTuple(partslot, true); } else if (rel_is_external) { if (estate->es_result_partitions && estate->es_result_partitions->part->parrelid != 0) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Update external partitions not supported."))); return; } else { partslot = slot; tuple = ExecFetchSlotHeapTuple(partslot); } } else { Insist(false); } /* see if this update would move the tuple to a different partition */ if (estate->es_result_partitions) checkPartitionUpdate(estate, partslot, resultRelInfo); /* BEFORE ROW UPDATE Triggers */ if (resultRelInfo->ri_TrigDesc && resultRelInfo->ri_TrigDesc->n_before_row[TRIGGER_EVENT_UPDATE] > 0) { HeapTuple newtuple; newtuple = ExecBRUpdateTriggers(estate, resultRelInfo, tupleid, tuple, estate->es_snapshot->curcid); if (newtuple == NULL) /* "do nothing" */ return; if (newtuple != tuple) /* modified by Trigger(s) */ { /* * Put the modified tuple into a slot for convenience of routines * below. We assume the tuple was allocated in per-tuple memory * context, and therefore will go away by itself. The tuple table * slot should not try to clear it. 
*/ TupleTableSlot *newslot = estate->es_trig_tuple_slot; if (newslot->tts_tupleDescriptor != partslot->tts_tupleDescriptor) ExecSetSlotDescriptor(newslot, partslot->tts_tupleDescriptor); ExecStoreGenericTuple(newtuple, newslot, false); newslot->tts_tableOid = partslot->tts_tableOid; /* for constraints */ partslot = newslot; tuple = newtuple; } } /* * Check the constraints of the tuple * * If we generate a new candidate tuple after EvalPlanQual testing, we * must loop back here and recheck constraints. (We don't need to redo * triggers, however. If there are any BEFORE triggers then trigger.c * will have done heap_lock_tuple to lock the correct tuple, so there's no * need to do them again.) */ lreplace:; if (resultRelationDesc->rd_att->constr) ExecConstraints(resultRelInfo, partslot, estate); if (!GpPersistent_IsPersistentRelation(resultRelationDesc->rd_id)) { /* * Normal UPDATE path. */ /* * replace the heap tuple * * Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check that * the row to be updated is visible to that snapshot, and throw a can't- * serialize error if not. This is a special-case behavior needed for * referential integrity updates in serializable transactions. */ if (rel_is_heap) { result = heap_update(resultRelationDesc, tupleid, tuple, &update_ctid, &update_xmax, estate->es_snapshot->curcid, estate->es_crosscheck_snapshot, true /* wait for commit */ ); } else if (rel_is_aorows) { if (IsXactIsoLevelSerializable) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Updates on append-only tables are not supported in serializable transactions."))); } if (resultRelInfo->ri_updateDesc == NULL) { ResultRelInfoSetSegno(resultRelInfo, estate->es_result_aosegnos); resultRelInfo->ri_updateDesc = (AppendOnlyUpdateDesc) appendonly_update_init(resultRelationDesc, ActiveSnapshot, resultRelInfo->ri_aosegno); } result = appendonly_update(resultRelInfo->ri_updateDesc, tuple, (AOTupleId *) tupleid, &aoTupleId); } else if (rel_is_aocols) { if (IsXactIsoLevelSerializable) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("Updates on append-only tables are not supported in serializable transactions."))); } if (resultRelInfo->ri_updateDesc == NULL) { ResultRelInfoSetSegno(resultRelInfo, estate->es_result_aosegnos); resultRelInfo->ri_updateDesc = (AppendOnlyUpdateDesc) aocs_update_init(resultRelationDesc, resultRelInfo->ri_aosegno); } result = aocs_update(resultRelInfo->ri_updateDesc, partslot, (AOTupleId *) tupleid, &aoTupleId); } else { Assert(!"We should not be here"); } switch (result) { case HeapTupleSelfUpdated: /* already deleted by self; nothing to do */ return; case HeapTupleMayBeUpdated: break; case HeapTupleUpdated: if (IsXactIsoLevelSerializable) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); else if (!ItemPointerEquals(tupleid, &update_ctid)) { TupleTableSlot *epqslot; epqslot = EvalPlanQual(estate, resultRelInfo->ri_RangeTableIndex, &update_ctid, update_xmax, estate->es_snapshot->curcid); if (!TupIsNull(epqslot)) { *tupleid = update_ctid; partslot = ExecFilterJunk(estate->es_junkFilter, epqslot); tuple = ExecFetchSlotHeapTuple(partslot); goto lreplace; } } /* tuple already deleted; nothing to do */ return; default: elog(ERROR, "unrecognized heap_update status: %u", result); return; } } else { HeapTuple persistentTuple; /* * Persistent metadata path. 
*/ persistentTuple = heap_copytuple(tuple); persistentTuple->t_self = *tupleid; frozen_heap_inplace_update(resultRelationDesc, persistentTuple); heap_freetuple(persistentTuple); } IncrReplaced(); (estate->es_processed)++; (resultRelInfo->ri_aoprocessed)++; /* * Note: instead of having to update the old index tuples associated with * the heap tuple, all we do is form and insert new index tuples. This is * because UPDATEs are actually DELETEs and INSERTs, and index tuple * deletion is done later by VACUUM (see notes in ExecDelete). All we do * here is insert new index tuples. -cim 9/27/89 */ /* * insert index entries for tuple * * Note: heap_update returns the tid (location) of the new tuple in the * t_self field. */ if (rel_is_aorows || rel_is_aocols) { if (resultRelInfo->ri_NumIndices > 0) ExecInsertIndexTuples(partslot, (ItemPointer)&aoTupleId, estate, false); } else { if (resultRelInfo->ri_NumIndices > 0) ExecInsertIndexTuples(partslot, &(((HeapTuple) tuple)->t_self), estate, false); } /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(estate, resultRelInfo, tupleid, tuple); }
/* * hashgettuple() -- Get the next tuple in the scan. */ bool hashgettuple(IndexScanDesc scan, ScanDirection dir) { HashScanOpaque so = (HashScanOpaque) scan->opaque; Relation rel = scan->indexRelation; Buffer buf; Page page; OffsetNumber offnum; ItemPointer current; bool res; /* Hash indexes are always lossy since we store only the hash code */ scan->xs_recheck = true; /* * We hold pin but not lock on current buffer while outside the hash AM. * Reacquire the read lock here. */ if (BufferIsValid(so->hashso_curbuf)) LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); /* * If we've already initialized this scan, we can just advance it in the * appropriate direction. If we haven't done so yet, we call a routine to * get the first item in the scan. */ current = &(so->hashso_curpos); if (ItemPointerIsValid(current)) { /* * An insertion into the current index page could have happened while * we didn't have read lock on it. Re-find our position by looking * for the TID we previously returned. (Because we hold a pin on the * primary bucket page, no deletions or splits could have occurred; * therefore we can expect that the TID still exists in the current * index page, at an offset >= where we were.) */ OffsetNumber maxoffnum; buf = so->hashso_curbuf; Assert(BufferIsValid(buf)); page = BufferGetPage(buf); /* * We don't need test for old snapshot here as the current buffer is * pinned, so vacuum can't clean the page. */ maxoffnum = PageGetMaxOffsetNumber(page); for (offnum = ItemPointerGetOffsetNumber(current); offnum <= maxoffnum; offnum = OffsetNumberNext(offnum)) { IndexTuple itup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid))) break; } if (offnum > maxoffnum) elog(ERROR, "failed to re-find scan position within index \"%s\"", RelationGetRelationName(rel)); ItemPointerSetOffsetNumber(current, offnum); /* * Check to see if we should kill the previously-fetched tuple. */ if (scan->kill_prior_tuple) { /* * Yes, so remember it for later. (We'll deal with all such tuples * at once right after leaving the index page or at end of scan.) * In case if caller reverses the indexscan direction it is quite * possible that the same item might get entered multiple times. * But, we don't detect that; instead, we just forget any excess * entries. */ if (so->killedItems == NULL) so->killedItems = palloc(MaxIndexTuplesPerPage * sizeof(HashScanPosItem)); if (so->numKilled < MaxIndexTuplesPerPage) { so->killedItems[so->numKilled].heapTid = so->hashso_heappos; so->killedItems[so->numKilled].indexOffset = ItemPointerGetOffsetNumber(&(so->hashso_curpos)); so->numKilled++; } } /* * Now continue the scan. */ res = _hash_next(scan, dir); } else res = _hash_first(scan, dir); /* * Skip killed tuples if asked to. */ if (scan->ignore_killed_tuples) { while (res) { offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(so->hashso_curbuf); if (!ItemIdIsDead(PageGetItemId(page, offnum))) break; res = _hash_next(scan, dir); } } /* Release read lock on current buffer, but keep it pinned */ if (BufferIsValid(so->hashso_curbuf)) LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); /* Return current heap TID on success */ scan->xs_ctup.t_self = so->hashso_heappos; return res; }
/* * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple * at that offset is atomically removed along with inserting the new tuples. * This is used to replace a tuple with a new one. * * If 'leftchildbuf' is valid, we're inserting the downlink for the page * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'. * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set. * * If 'markfollowright' is true and the page is split, the left child is * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered * index build, however, there is no concurrent access and the page splitting * is done in a slightly simpler fashion, and false is passed. * * If there is not enough room on the page, it is split. All the split * pages are kept pinned and locked and returned in *splitinfo, the caller * is responsible for inserting the downlinks for them. However, if * 'buffer' is the root page and it needs to be split, gistplacetopage() * performs the split as one atomic operation, and *splitinfo is set to NIL. * In that case, we continue to hold the root page locked, and the child * pages are released; note that new tuple(s) are *not* on the root page * but in one of the new child pages. * * If 'newblkno' is not NULL, returns the block number of page the first * new/updated tuple was inserted to. Usually it's the given page, but could * be its right sibling if the page was split. * * Returns 'true' if the page was split, 'false' otherwise. */ bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, Buffer buffer, IndexTuple *itup, int ntup, OffsetNumber oldoffnum, BlockNumber *newblkno, Buffer leftchildbuf, List **splitinfo, bool markfollowright) { BlockNumber blkno = BufferGetBlockNumber(buffer); Page page = BufferGetPage(buffer); bool is_leaf = (GistPageIsLeaf(page)) ? true : false; XLogRecPtr recptr; int i; bool is_split; /* * Refuse to modify a page that's incompletely split. This should not * happen because we finish any incomplete splits while we walk down the * tree. However, it's remotely possible that another concurrent inserter * splits a parent page, and errors out before completing the split. We * will just throw an error in that case, and leave any split we had in * progress unfinished too. The next insert that comes along will clean up * the mess. */ if (GistFollowRight(page)) elog(ERROR, "concurrent GiST page split was incomplete"); *splitinfo = NIL; /* * if isupdate, remove old key: This node's key has been modified, either * because a child split occurred or because we needed to adjust our key * for an insert in a child node. Therefore, remove the old version of * this node's key. * * for WAL replay, in the non-split case we handle this by setting up a * one-element todelete array; in the split case, it's handled implicitly * because the tuple vector passed to gistSplit won't include this tuple. */ is_split = gistnospace(page, itup, ntup, oldoffnum, freespace); if (is_split) { /* no space for insertion */ IndexTuple *itvec; int tlen; SplitedPageLayout *dist = NULL, *ptr; BlockNumber oldrlink = InvalidBlockNumber; GistNSN oldnsn = 0; SplitedPageLayout rootpg; bool is_rootsplit; is_rootsplit = (blkno == GIST_ROOT_BLKNO); /* * Form index tuples vector to split. If we're replacing an old tuple, * remove the old version from the vector. 
*/ itvec = gistextractpage(page, &tlen); if (OffsetNumberIsValid(oldoffnum)) { /* on inner page we should remove old tuple */ int pos = oldoffnum - FirstOffsetNumber; tlen--; if (pos != tlen) memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos)); } itvec = gistjoinvector(itvec, &tlen, itup, ntup); dist = gistSplit(rel, page, itvec, tlen, giststate); /* * Set up pages to work with. Allocate new buffers for all but the * leftmost page. The original page becomes the new leftmost page, and * is just replaced with the new contents. * * For a root-split, allocate new buffers for all child pages, the * original page is overwritten with new root page containing * downlinks to the new child pages. */ ptr = dist; if (!is_rootsplit) { /* save old rightlink and NSN */ oldrlink = GistPageGetOpaque(page)->rightlink; oldnsn = GistPageGetNSN(page); dist->buffer = buffer; dist->block.blkno = BufferGetBlockNumber(buffer); dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer)); /* clean all flags except F_LEAF */ GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0; ptr = ptr->next; } for (; ptr; ptr = ptr->next) { /* Allocate new page */ ptr->buffer = gistNewBuffer(rel); GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0); ptr->page = BufferGetPage(ptr->buffer); ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); } /* * Now that we know which blocks the new pages go to, set up downlink * tuples to point to them. */ for (ptr = dist; ptr; ptr = ptr->next) { ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); GistTupleSetValid(ptr->itup); } /* * If this is a root split, we construct the new root page with the * downlinks here directly, instead of requiring the caller to insert * them. Add the new root page to the list along with the child pages. */ if (is_rootsplit) { IndexTuple *downlinks; int ndownlinks = 0; int i; rootpg.buffer = buffer; rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer)); GistPageGetOpaque(rootpg.page)->flags = 0; /* Prepare a vector of all the downlinks */ for (ptr = dist; ptr; ptr = ptr->next) ndownlinks++; downlinks = palloc(sizeof(IndexTuple) * ndownlinks); for (i = 0, ptr = dist; ptr; ptr = ptr->next) downlinks[i++] = ptr->itup; rootpg.block.blkno = GIST_ROOT_BLKNO; rootpg.block.num = ndownlinks; rootpg.list = gistfillitupvec(downlinks, ndownlinks, &(rootpg.lenlist)); rootpg.itup = NULL; rootpg.next = dist; dist = &rootpg; } else { /* Prepare split-info to be returned to caller */ for (ptr = dist; ptr; ptr = ptr->next) { GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo)); si->buf = ptr->buffer; si->downlink = ptr->itup; *splitinfo = lappend(*splitinfo, si); } } /* * Fill all pages. All the pages are new, ie. freshly allocated empty * pages, or a temporary copy of the old page. */ for (ptr = dist; ptr; ptr = ptr->next) { char *data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { IndexTuple thistup = (IndexTuple) data; if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel)); /* * If this is the first inserted/updated tuple, let the caller * know which page it landed on. 
*/ if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid)) *newblkno = ptr->block.blkno; data += IndexTupleSize(thistup); } /* Set up rightlinks */ if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO) GistPageGetOpaque(ptr->page)->rightlink = ptr->next->block.blkno; else GistPageGetOpaque(ptr->page)->rightlink = oldrlink; /* * Mark the all but the right-most page with the follow-right * flag. It will be cleared as soon as the downlink is inserted * into the parent, but this ensures that if we error out before * that, the index is still consistent. (in buffering build mode, * any error will abort the index build anyway, so this is not * needed.) */ if (ptr->next && !is_rootsplit && markfollowright) GistMarkFollowRight(ptr->page); else GistClearFollowRight(ptr->page); /* * Copy the NSN of the original page to all pages. The * F_FOLLOW_RIGHT flags ensure that scans will follow the * rightlinks until the downlinks are inserted. */ GistPageSetNSN(ptr->page, oldnsn); } START_CRIT_SECTION(); /* * Must mark buffers dirty before XLogInsert, even though we'll still * be changing their opaque fields below. */ for (ptr = dist; ptr; ptr = ptr->next) MarkBufferDirty(ptr->buffer); if (BufferIsValid(leftchildbuf)) MarkBufferDirty(leftchildbuf); /* * The first page in the chain was a temporary working copy meant to * replace the old page. Copy it over the old page. */ PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); dist->page = BufferGetPage(dist->buffer); /* Write the WAL record */ if (RelationNeedsWAL(rel)) recptr = gistXLogSplit(rel->rd_node, blkno, is_leaf, dist, oldrlink, oldnsn, leftchildbuf, markfollowright); else recptr = gistGetFakeLSN(rel); for (ptr = dist; ptr; ptr = ptr->next) { PageSetLSN(ptr->page, recptr); } /* * Return the new child buffers to the caller. * * If this was a root split, we've already inserted the downlink * pointers, in the form of a new root page. Therefore we can release * all the new buffers, and keep just the root page locked. */ if (is_rootsplit) { for (ptr = dist->next; ptr; ptr = ptr->next) UnlockReleaseBuffer(ptr->buffer); } } else { /* * Enough space. We also get here if ntuples==0. */ START_CRIT_SECTION(); if (OffsetNumberIsValid(oldoffnum)) PageIndexTupleDelete(page, oldoffnum); gistfillbuffer(page, itup, ntup, InvalidOffsetNumber); MarkBufferDirty(buffer); if (BufferIsValid(leftchildbuf)) MarkBufferDirty(leftchildbuf); if (RelationNeedsWAL(rel)) { OffsetNumber ndeloffs = 0, deloffs[1]; if (OffsetNumberIsValid(oldoffnum)) { deloffs[0] = oldoffnum; ndeloffs = 1; } recptr = gistXLogUpdate(rel->rd_node, buffer, deloffs, ndeloffs, itup, ntup, leftchildbuf); PageSetLSN(page, recptr); } else { recptr = gistGetFakeLSN(rel); PageSetLSN(page, recptr); } if (newblkno) *newblkno = blkno; } /* * If we inserted the downlink for a child page, set NSN and clear * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to * follow the rightlink if and only if they looked at the parent page * before we inserted the downlink. * * Note that we do this *after* writing the WAL record. That means that * the possible full page image in the WAL record does not include these * changes, and they must be replayed even if the page is restored from * the full page image. There's a chicken-and-egg problem: if we updated * the child pages first, we wouldn't know the recptr of the WAL record * we're about to write. 
*/ if (BufferIsValid(leftchildbuf)) { Page leftpg = BufferGetPage(leftchildbuf); GistPageSetNSN(leftpg, recptr); GistClearFollowRight(leftpg); PageSetLSN(leftpg, recptr); } END_CRIT_SECTION(); return is_split; }
/* ---------------------------------------------------------------- * ExecUpdate * * note: we can't run UPDATE queries with transactions * off because UPDATEs are actually INSERTs and our * scan will mistakenly loop forever, updating the tuple * it just inserted.. This should be fixed but until it * is, we don't want to get stuck in an infinite loop * which corrupts your database.. * ---------------------------------------------------------------- */ void ExecUpdate(TupleTableSlot *slot, ItemPointer tupleid, TupleTableSlot *planSlot, DestReceiver *dest, EState *estate) { HeapTuple tuple; ResultRelInfo *resultRelInfo; Relation resultRelationDesc; HTSU_Result result; ItemPointerData update_ctid; TransactionId update_xmax; /* * abort the operation if not running transactions */ if (IsBootstrapProcessingMode()) elog(ERROR, "cannot UPDATE during bootstrap"); /* * get the heap tuple out of the tuple table slot, making sure we have a * writable copy */ tuple = ExecFetchSlotHeapTuple(slot); /* * get information on the (current) result relation */ resultRelInfo = estate->es_result_relation_info; resultRelationDesc = resultRelInfo->ri_RelationDesc; /* see if this update would move the tuple to a different partition */ if (estate->es_result_partitions) { AttrNumber max_attr; Datum *values; bool *nulls; Oid targetid; Assert(estate->es_partition_state != NULL && estate->es_partition_state->accessMethods != NULL); if (!estate->es_partition_state->accessMethods->part_cxt) estate->es_partition_state->accessMethods->part_cxt = GetPerTupleExprContext(estate)->ecxt_per_tuple_memory; Assert(PointerIsValid(estate->es_result_partitions)); max_attr = estate->es_partition_state->max_partition_attr; slot_getsomeattrs(slot, max_attr); values = slot_get_values(slot); nulls = slot_get_isnull(slot); targetid = selectPartition(estate->es_result_partitions, values, nulls, slot->tts_tupleDescriptor, estate->es_partition_state->accessMethods); if (!OidIsValid(targetid)) ereport(ERROR, (errcode(ERRCODE_NO_PARTITION_FOR_PARTITIONING_KEY), errmsg("no partition for partitioning key"))); if (RelationGetRelid(resultRelationDesc) != targetid) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("moving tuple from partition \"%s\" to " "partition \"%s\" not supported", get_rel_name(RelationGetRelid(resultRelationDesc)), get_rel_name(targetid)), errOmitLocation(true))); } } /* BEFORE ROW UPDATE Triggers */ if (resultRelInfo->ri_TrigDesc && resultRelInfo->ri_TrigDesc->n_before_row[TRIGGER_EVENT_UPDATE] > 0) { HeapTuple newtuple; newtuple = ExecBRUpdateTriggers(estate, resultRelInfo, tupleid, tuple, estate->es_snapshot->curcid); if (newtuple == NULL) /* "do nothing" */ return; if (newtuple != tuple) /* modified by Trigger(s) */ { /* * Put the modified tuple into a slot for convenience of routines * below. We assume the tuple was allocated in per-tuple memory * context, and therefore will go away by itself. The tuple table * slot should not try to clear it. */ TupleTableSlot *newslot = estate->es_trig_tuple_slot; if (newslot->tts_tupleDescriptor != slot->tts_tupleDescriptor) ExecSetSlotDescriptor(newslot, slot->tts_tupleDescriptor); ExecStoreGenericTuple(newtuple, newslot, false); newslot->tts_tableOid = slot->tts_tableOid; /* for constraints */ slot = newslot; tuple = newtuple; } } /* * Check the constraints of the tuple * * If we generate a new candidate tuple after EvalPlanQual testing, we * must loop back here and recheck constraints. (We don't need to redo * triggers, however. 
If there are any BEFORE triggers then trigger.c * will have done heap_lock_tuple to lock the correct tuple, so there's no * need to do them again.) */ lreplace:; if (resultRelationDesc->rd_att->constr) ExecConstraints(resultRelInfo, slot, estate); if (!GpPersistent_IsPersistentRelation(resultRelationDesc->rd_id)) { /* * Normal UPDATE path. */ /* * replace the heap tuple * * Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check that * the row to be updated is visible to that snapshot, and throw a can't- * serialize error if not. This is a special-case behavior needed for * referential integrity updates in serializable transactions. */ result = heap_update(resultRelationDesc, tupleid, tuple, &update_ctid, &update_xmax, estate->es_snapshot->curcid, estate->es_crosscheck_snapshot, true /* wait for commit */ ); switch (result) { case HeapTupleSelfUpdated: /* already deleted by self; nothing to do */ return; case HeapTupleMayBeUpdated: break; case HeapTupleUpdated: if (IsXactIsoLevelSerializable) ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); else if (!ItemPointerEquals(tupleid, &update_ctid)) { TupleTableSlot *epqslot; epqslot = EvalPlanQual(estate, resultRelInfo->ri_RangeTableIndex, &update_ctid, update_xmax, estate->es_snapshot->curcid); if (!TupIsNull(epqslot)) { *tupleid = update_ctid; slot = ExecFilterJunk(estate->es_junkFilter, epqslot); tuple = ExecFetchSlotHeapTuple(slot); goto lreplace; } } /* tuple already deleted; nothing to do */ return; default: elog(ERROR, "unrecognized heap_update status: %u", result); return; } } else { HeapTuple persistentTuple; /* * Persistent metadata path. */ persistentTuple = heap_copytuple(tuple); persistentTuple->t_self = *tupleid; frozen_heap_inplace_update(resultRelationDesc, persistentTuple); heap_freetuple(persistentTuple); } IncrReplaced(); (estate->es_processed)++; /* * Note: instead of having to update the old index tuples associated with * the heap tuple, all we do is form and insert new index tuples. This is * because UPDATEs are actually DELETEs and INSERTs, and index tuple * deletion is done later by VACUUM (see notes in ExecDelete). All we do * here is insert new index tuples. -cim 9/27/89 */ /* * insert index entries for tuple * * Note: heap_update returns the tid (location) of the new tuple in the * t_self field. */ if (resultRelInfo->ri_NumIndices > 0) ExecInsertIndexTuples(slot, &(tuple->t_self), estate, false); /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(estate, resultRelInfo, tupleid, tuple); }
/* * Add a tuple to the new heap. * * Visibility information is copied from the original tuple, except that * we "freeze" very-old tuples. Note that since we scribble on new_tuple, * it had better be temp storage not a pointer to the original tuple. * * state opaque state as returned by begin_heap_rewrite * old_tuple original tuple in the old heap * new_tuple new, rewritten tuple to be inserted to new heap */ void rewrite_heap_tuple(RewriteState state, HeapTuple old_tuple, HeapTuple new_tuple) { MemoryContext old_cxt; ItemPointerData old_tid; TidHashKey hashkey; bool found; bool free_new; old_cxt = MemoryContextSwitchTo(state->rs_cxt); /* * Copy the original tuple's visibility information into new_tuple. * * XXX we might later need to copy some t_infomask2 bits, too? Right now, * we intentionally clear the HOT status bits. */ memcpy(&new_tuple->t_data->t_choice.t_heap, &old_tuple->t_data->t_choice.t_heap, sizeof(HeapTupleFields)); new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK; new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK; new_tuple->t_data->t_infomask |= old_tuple->t_data->t_infomask & HEAP_XACT_MASK; /* * While we have our hands on the tuple, we may as well freeze any * eligible xmin or xmax, so that future VACUUM effort can be saved. */ heap_freeze_tuple(new_tuple->t_data, state->rs_freeze_xid, state->rs_cutoff_multi); /* * Invalid ctid means that ctid should point to the tuple itself. We'll * override it later if the tuple is part of an update chain. */ ItemPointerSetInvalid(&new_tuple->t_data->t_ctid); /* * If the tuple has been updated, check the old-to-new mapping hash table. */ if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || HeapTupleHeaderIsOnlyLocked(old_tuple->t_data)) && !(ItemPointerEquals(&(old_tuple->t_self), &(old_tuple->t_data->t_ctid)))) { OldToNewMapping mapping; memset(&hashkey, 0, sizeof(hashkey)); hashkey.xmin = HeapTupleHeaderGetUpdateXid(old_tuple->t_data); hashkey.tid = old_tuple->t_data->t_ctid; mapping = (OldToNewMapping) hash_search(state->rs_old_new_tid_map, &hashkey, HASH_FIND, NULL); if (mapping != NULL) { /* * We've already copied the tuple that t_ctid points to, so we can * set the ctid of this tuple to point to the new location, and * insert it right away. */ new_tuple->t_data->t_ctid = mapping->new_tid; /* We don't need the mapping entry anymore */ hash_search(state->rs_old_new_tid_map, &hashkey, HASH_REMOVE, &found); Assert(found); } else { /* * We haven't seen the tuple t_ctid points to yet. Stash this * tuple into unresolved_tups to be written later. */ UnresolvedTup unresolved; unresolved = hash_search(state->rs_unresolved_tups, &hashkey, HASH_ENTER, &found); Assert(!found); unresolved->old_tid = old_tuple->t_self; unresolved->tuple = heap_copytuple(new_tuple); /* * We can't do anything more now, since we don't know where the * tuple will be written. */ MemoryContextSwitchTo(old_cxt); return; } } /* * Now we will write the tuple, and then check to see if it is the B tuple * in any new or known pair. When we resolve a known pair, we will be * able to write that pair's A tuple, and then we have to check if it * resolves some other pair. Hence, we need a loop here. 
*/ old_tid = old_tuple->t_self; free_new = false; for (;;) { ItemPointerData new_tid; /* Insert the tuple and find out where it's put in new_heap */ raw_heap_insert(state, new_tuple); new_tid = new_tuple->t_self; /* * If the tuple is the updated version of a row, and the prior version * wouldn't be DEAD yet, then we need to either resolve the prior * version (if it's waiting in rs_unresolved_tups), or make an entry * in rs_old_new_tid_map (so we can resolve it when we do see it). The * previous tuple's xmax would equal this one's xmin, so it's * RECENTLY_DEAD if and only if the xmin is not before OldestXmin. */ if ((new_tuple->t_data->t_infomask & HEAP_UPDATED) && !TransactionIdPrecedes(HeapTupleHeaderGetXmin(new_tuple->t_data), state->rs_oldest_xmin)) { /* * Okay, this is B in an update pair. See if we've seen A. */ UnresolvedTup unresolved; memset(&hashkey, 0, sizeof(hashkey)); hashkey.xmin = HeapTupleHeaderGetXmin(new_tuple->t_data); hashkey.tid = old_tid; unresolved = hash_search(state->rs_unresolved_tups, &hashkey, HASH_FIND, NULL); if (unresolved != NULL) { /* * We have seen and memorized the previous tuple already. Now * that we know where we inserted the tuple its t_ctid points * to, fix its t_ctid and insert it to the new heap. */ if (free_new) heap_freetuple(new_tuple); new_tuple = unresolved->tuple; free_new = true; old_tid = unresolved->old_tid; new_tuple->t_data->t_ctid = new_tid; /* * We don't need the hash entry anymore, but don't free its * tuple just yet. */ hash_search(state->rs_unresolved_tups, &hashkey, HASH_REMOVE, &found); Assert(found); /* loop back to insert the previous tuple in the chain */ continue; } else { /* * Remember the new tid of this tuple. We'll use it to set the * ctid when we find the previous tuple in the chain. */ OldToNewMapping mapping; mapping = hash_search(state->rs_old_new_tid_map, &hashkey, HASH_ENTER, &found); Assert(!found); mapping->new_tid = new_tid; } } /* Done with this (chain of) tuples, for now */ if (free_new) heap_freetuple(new_tuple); break; } MemoryContextSwitchTo(old_cxt); }
/* * SearchCatCacheList * * Generate a list of all tuples matching a partial key (that is, * a key specifying just the first K of the cache's N key columns). * * The caller must not modify the list object or the pointed-to tuples, * and must call ReleaseCatCacheList() when done with the list. */ CatCList * SearchCatCacheList(CatCache *cache, int nkeys, Datum v1, Datum v2, Datum v3, Datum v4) { ScanKeyData cur_skey[CATCACHE_MAXKEYS]; uint32 lHashValue; Dlelem *elt; CatCList *cl; CatCTup *ct; List *volatile ctlist; ListCell *ctlist_item; int nmembers; bool ordered; HeapTuple ntp; MemoryContext oldcxt; int i; /* * one-time startup overhead for each cache */ if (cache->cc_tupdesc == NULL) CatalogCacheInitializeCache(cache); Assert(nkeys > 0 && nkeys < cache->cc_nkeys); #ifdef CATCACHE_STATS cache->cc_lsearches++; #endif /* * initialize the search key information */ memcpy(cur_skey, cache->cc_skey, sizeof(cur_skey)); cur_skey[0].sk_argument = v1; cur_skey[1].sk_argument = v2; cur_skey[2].sk_argument = v3; cur_skey[3].sk_argument = v4; /* * compute a hash value of the given keys for faster search. We don't * presently divide the CatCList items into buckets, but this still lets * us skip non-matching items quickly most of the time. */ lHashValue = CatalogCacheComputeHashValue(cache, nkeys, cur_skey); /* * scan the items until we find a match or exhaust our list */ for (elt = DLGetHead(&cache->cc_lists); elt; elt = DLGetSucc(elt)) { bool res; cl = (CatCList *) DLE_VAL(elt); if (cl->dead) continue; /* ignore dead entries */ if (cl->hash_value != lHashValue) continue; /* quickly skip entry if wrong hash val */ /* * see if the cached list matches our key. */ if (cl->nkeys != nkeys) continue; HeapKeyTest(&cl->tuple, cache->cc_tupdesc, nkeys, cur_skey, res); if (!res) continue; /* * We found a matching list. Move the list to the front of the * cache's list-of-lists, to speed subsequent searches. (We do not * move the members to the fronts of their hashbucket lists, however, * since there's no point in that unless they are searched for * individually.) */ DLMoveToFront(&cl->cache_elem); /* Bump the list's refcount and return it */ ResourceOwnerEnlargeCatCacheListRefs(CurrentResourceOwner); cl->refcount++; ResourceOwnerRememberCatCacheListRef(CurrentResourceOwner, cl); CACHE2_elog(DEBUG2, "SearchCatCacheList(%s): found list", cache->cc_relname); #ifdef CATCACHE_STATS cache->cc_lhits++; #endif return cl; } /* * List was not found in cache, so we have to build it by reading the * relation. For each matching tuple found in the relation, use an * existing cache entry if possible, else build a new one. * * We have to bump the member refcounts temporarily to ensure they won't * get dropped from the cache while loading other members. We use a PG_TRY * block to ensure we can undo those refcounts if we get an error before * we finish constructing the CatCList. */ ResourceOwnerEnlargeCatCacheListRefs(CurrentResourceOwner); ctlist = NIL; PG_TRY(); { Relation relation; SysScanDesc scandesc; relation = heap_open(cache->cc_reloid, AccessShareLock); scandesc = systable_beginscan(relation, cache->cc_indexoid, IndexScanOK(cache, cur_skey), SnapshotNow, nkeys, cur_skey); /* The list will be ordered iff we are doing an index scan */ ordered = (scandesc->irel != NULL); while (HeapTupleIsValid(ntp = systable_getnext(scandesc))) { uint32 hashValue; Index hashIndex; /* * See if there's an entry for this tuple already. 
*/ ct = NULL; hashValue = CatalogCacheComputeTupleHashValue(cache, ntp); hashIndex = HASH_INDEX(hashValue, cache->cc_nbuckets); for (elt = DLGetHead(&cache->cc_bucket[hashIndex]); elt; elt = DLGetSucc(elt)) { ct = (CatCTup *) DLE_VAL(elt); if (ct->dead || ct->negative) continue; /* ignore dead and negative entries */ if (ct->hash_value != hashValue) continue; /* quickly skip entry if wrong hash val */ if (!ItemPointerEquals(&(ct->tuple.t_self), &(ntp->t_self))) continue; /* not same tuple */ /* * Found a match, but can't use it if it belongs to another * list already */ if (ct->c_list) continue; break; /* A-OK */ } if (elt == NULL) { /* We didn't find a usable entry, so make a new one */ ct = CatalogCacheCreateEntry(cache, ntp, hashValue, hashIndex, false); } /* Careful here: add entry to ctlist, then bump its refcount */ /* This way leaves state correct if lappend runs out of memory */ ctlist = lappend(ctlist, ct); ct->refcount++; } systable_endscan(scandesc); heap_close(relation, AccessShareLock); /* * Now we can build the CatCList entry. First we need a dummy tuple * containing the key values... */ ntp = build_dummy_tuple(cache, nkeys, cur_skey); oldcxt = MemoryContextSwitchTo(CacheMemoryContext); nmembers = list_length(ctlist); cl = (CatCList *) palloc(sizeof(CatCList) + nmembers * sizeof(CatCTup *)); heap_copytuple_with_tuple(ntp, &cl->tuple); MemoryContextSwitchTo(oldcxt); heap_freetuple(ntp); /* * We are now past the last thing that could trigger an elog before we * have finished building the CatCList and remembering it in the * resource owner. So it's OK to fall out of the PG_TRY, and indeed * we'd better do so before we start marking the members as belonging * to the list. */ } PG_CATCH(); { foreach(ctlist_item, ctlist) { ct = (CatCTup *) lfirst(ctlist_item); Assert(ct->c_list == NULL); Assert(ct->refcount > 0); ct->refcount--; if ( #ifndef CATCACHE_FORCE_RELEASE ct->dead && #endif ct->refcount == 0 && (ct->c_list == NULL || ct->c_list->refcount == 0)) CatCacheRemoveCTup(cache, ct); } PG_RE_THROW(); }
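The excerpt above stops inside the PG_TRY/PG_CATCH construct, but the pattern it protects is worth spelling out: each member found during the scan is pinned (ct->refcount++) as it is appended to ctlist, and if anything errors out before the CatCList is fully built and registered with the resource owner, the PG_CATCH branch walks the partial ctlist and drops every pin so no entry is left stranded in the cache. Below is a compact, self-contained model of that pin-then-roll-back discipline in plain C; Entry, unpin and build_list are illustrative stand-ins, and a goto-based error path takes the place of PG_CATCH.

/*
 * Sketch of the refcount bookkeeping protected by the PG_TRY block above.
 * Invented types and names, for illustration only.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct
{
    int  refcount;
    bool dead;
} Entry;

/* Drop one pin; free the entry if nothing references it any more. */
static void unpin(Entry *e)
{
    e->refcount--;
    if (e->dead && e->refcount == 0)
        free(e);
}

/*
 * Pin n cache entries while assembling a list.  If anything fails
 * part-way through, every entry pinned so far is released again, which
 * is what the PG_CATCH branch above does for ctlist.
 */
static bool build_list(Entry **entries, int n, bool fail_midway)
{
    int pinned = 0;

    for (int i = 0; i < n; i++)
    {
        entries[i]->refcount++;         /* pin before anything can fail */
        pinned++;

        if (fail_midway && i == n / 2)
            goto error;                 /* simulate an elog(ERROR) */
    }
    return true;

error:
    for (int i = 0; i < pinned; i++)    /* undo the pins already taken */
        unpin(entries[i]);
    return false;
}

int main(void)
{
    Entry *e[3];

    for (int i = 0; i < 3; i++)
        e[i] = calloc(1, sizeof(Entry));

    if (!build_list(e, 3, true))
        printf("build failed; refcounts rolled back to %d,%d,%d\n",
               e[0]->refcount, e[1]->refcount, e[2]->refcount);

    for (int i = 0; i < 3; i++)
        free(e[i]);
    return 0;
}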
/* * For a newly inserted heap tid, check if an entry with this tid * already exists in a unique index. If it does, abort the inserting * transaction. */ static void _bt_validate_tid(Relation irel, ItemPointer h_tid) { MIRROREDLOCK_BUFMGR_DECLARE; BlockNumber blkno; BlockNumber num_pages; Buffer buf; Page page; BTPageOpaque opaque; IndexTuple itup; OffsetNumber maxoff, minoff, offnum; elog(DEBUG1, "validating tid (%d,%d) for index (%s)", ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid), RelationGetRelationName(irel)); blkno = BTREE_METAPAGE + 1; num_pages = RelationGetNumberOfBlocks(irel); MIRROREDLOCK_BUFMGR_LOCK; for (; blkno < num_pages; blkno++) { buf = ReadBuffer(irel, blkno); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!PageIsNew(page)) _bt_checkpage(irel, buf); if (P_ISLEAF(opaque)) { minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (ItemPointerEquals(&itup->t_tid, h_tid)) { Form_pg_attribute key_att = RelationGetDescr(irel)->attrs[0]; Oid key = InvalidOid; bool isnull; if (key_att->atttypid == OIDOID) { key = DatumGetInt32( index_getattr(itup, 1, RelationGetDescr(irel), &isnull)); elog(ERROR, "found tid (%d,%d), %s (%d) already in index (%s)", ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid), NameStr(key_att->attname), key, RelationGetRelationName(irel)); } else { elog(ERROR, "found tid (%d,%d) already in index (%s)", ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid), RelationGetRelationName(irel)); } } } } ReleaseBuffer(buf); } MIRROREDLOCK_BUFMGR_UNLOCK; }
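_bt_validate_tid is a brute-force consistency check: it reads every leaf page of the btree and compares each entry's heap TID against the freshly inserted TID, raising an error on a match, so the cost is a full index scan per validated TID. The sketch below isolates just the comparison loop, run over an in-memory array; Tid and tid_equals are simplified stand-ins for ItemPointerData and ItemPointerEquals, and the real code of course walks buffer pages under the mirrored-lock protocol rather than an array.

/*
 * Simplified model of the duplicate-TID check: walk every "leaf entry"
 * and compare its heap TID with the one just inserted.  Invented names,
 * for illustration only.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { unsigned block; unsigned off; } Tid;

static bool tid_equals(const Tid *a, const Tid *b)
{
    return a->block == b->block && a->off == b->off;
}

/* Abort (here: exit) if new_tid already appears among the index entries. */
static void validate_tid(const Tid *entries, int n, const Tid *new_tid)
{
    for (int i = 0; i < n; i++)
    {
        if (tid_equals(&entries[i], new_tid))
        {
            fprintf(stderr, "found tid (%u,%u) already in index\n",
                    new_tid->block, new_tid->off);
            exit(1);
        }
    }
}

int main(void)
{
    Tid entries[] = { {0, 1}, {0, 2}, {3, 7} };
    Tid fresh = { 5, 1 };

    validate_tid(entries, 3, &fresh);   /* passes: (5,1) is not indexed */
    printf("tid (%u,%u) not found; insert may proceed\n",
           fresh.block, fresh.off);
    return 0;
}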
/* * SearchCatCacheList * * Generate a list of all tuples matching a partial key (that is, * a key specifying just the first K of the cache's N key columns). * * The caller must not modify the list object or the pointed-to tuples, * and must call ReleaseCatCacheList() when done with the list. */ CatCList * SearchCatCacheList(CatCache *cache, int nkeys, Datum v1, Datum v2, Datum v3, Datum v4) { ScanKeyData cur_skey[4]; uint32 lHashValue; Dlelem *elt; CatCList *cl; CatCTup *ct; List *ctlist; int nmembers; Relation relation; SysScanDesc scandesc; bool ordered; HeapTuple ntp; MemoryContext oldcxt; int i; /* * one-time startup overhead for each cache */ if (cache->cc_tupdesc == NULL) CatalogCacheInitializeCache(cache); Assert(nkeys > 0 && nkeys < cache->cc_nkeys); #ifdef CATCACHE_STATS cache->cc_lsearches++; #endif /* * initialize the search key information */ memcpy(cur_skey, cache->cc_skey, sizeof(cur_skey)); cur_skey[0].sk_argument = v1; cur_skey[1].sk_argument = v2; cur_skey[2].sk_argument = v3; cur_skey[3].sk_argument = v4; /* * compute a hash value of the given keys for faster search. We don't * presently divide the CatCList items into buckets, but this still * lets us skip non-matching items quickly most of the time. */ lHashValue = CatalogCacheComputeHashValue(cache, nkeys, cur_skey); /* * scan the items until we find a match or exhaust our list */ for (elt = DLGetHead(&cache->cc_lists); elt; elt = DLGetSucc(elt)) { bool res; cl = (CatCList *) DLE_VAL(elt); if (cl->dead) continue; /* ignore dead entries */ if (cl->hash_value != lHashValue) continue; /* quickly skip entry if wrong hash val */ /* * see if the cached list matches our key. */ if (cl->nkeys != nkeys) continue; HeapKeyTest(&cl->tuple, cache->cc_tupdesc, nkeys, cur_skey, res); if (!res) continue; /* * we found a matching list: move each of its members to the front * of the global LRU list. Also move the list itself to the front * of the cache's list-of-lists, to speed subsequent searches. (We * do not move the members to the fronts of their hashbucket * lists, however, since there's no point in that unless they are * searched for individually.) Also bump the members' refcounts. */ for (i = 0; i < cl->n_members; i++) { cl->members[i]->refcount++; DLMoveToFront(&cl->members[i]->lrulist_elem); } DLMoveToFront(&cl->cache_elem); /* Bump the list's refcount and return it */ cl->refcount++; CACHE2_elog(DEBUG2, "SearchCatCacheList(%s): found list", cache->cc_relname); #ifdef CATCACHE_STATS cache->cc_lhits++; #endif return cl; } /* * List was not found in cache, so we have to build it by reading the * relation. For each matching tuple found in the relation, use an * existing cache entry if possible, else build a new one. */ relation = heap_open(cache->cc_reloid, AccessShareLock); scandesc = systable_beginscan(relation, cache->cc_indname, true, SnapshotNow, nkeys, cur_skey); /* The list will be ordered iff we are doing an index scan */ ordered = (scandesc->irel != NULL); ctlist = NIL; nmembers = 0; while (HeapTupleIsValid(ntp = systable_getnext(scandesc))) { uint32 hashValue; Index hashIndex; /* * See if there's an entry for this tuple already. 
*/ ct = NULL; hashValue = CatalogCacheComputeTupleHashValue(cache, ntp); hashIndex = HASH_INDEX(hashValue, cache->cc_nbuckets); for (elt = DLGetHead(&cache->cc_bucket[hashIndex]); elt; elt = DLGetSucc(elt)) { ct = (CatCTup *) DLE_VAL(elt); if (ct->dead || ct->negative) continue; /* ignore dead and negative entries */ if (ct->hash_value != hashValue) continue; /* quickly skip entry if wrong hash val */ if (!ItemPointerEquals(&(ct->tuple.t_self), &(ntp->t_self))) continue; /* not same tuple */ /* * Found a match, but can't use it if it belongs to another * list already */ if (ct->c_list) continue; /* Found a match, so bump its refcount and move to front */ ct->refcount++; DLMoveToFront(&ct->lrulist_elem); break; } if (elt == NULL) { /* We didn't find a usable entry, so make a new one */ ct = CatalogCacheCreateEntry(cache, ntp, hashValue, hashIndex, false); } ctlist = lcons(ct, ctlist); nmembers++; } systable_endscan(scandesc); heap_close(relation, AccessShareLock); /* * Now we can build the CatCList entry. First we need a dummy tuple * containing the key values... */ ntp = build_dummy_tuple(cache, nkeys, cur_skey); oldcxt = MemoryContextSwitchTo(CacheMemoryContext); cl = (CatCList *) palloc(sizeof(CatCList) + nmembers * sizeof(CatCTup *)); heap_copytuple_with_tuple(ntp, &cl->tuple); MemoryContextSwitchTo(oldcxt); heap_freetuple(ntp); cl->cl_magic = CL_MAGIC; cl->my_cache = cache; DLInitElem(&cl->cache_elem, (void *) cl); cl->refcount = 1; /* count this first reference */ cl->dead = false; cl->ordered = ordered; cl->nkeys = nkeys; cl->hash_value = lHashValue; cl->n_members = nmembers; /* The list is backwards because we built it with lcons */ for (i = nmembers; --i >= 0;) { cl->members[i] = ct = (CatCTup *) lfirst(ctlist); Assert(ct->c_list == NULL); ct->c_list = cl; /* mark list dead if any members already dead */ if (ct->dead) cl->dead = true; ctlist = lnext(ctlist); } DLAddHead(&cache->cc_lists, &cl->cache_elem); CACHE3_elog(DEBUG2, "SearchCatCacheList(%s): made list of %d members", cache->cc_relname, nmembers); return cl; }
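This older variant of SearchCatCacheList accumulates its members with lcons, so ctlist ends up in reverse scan order and the members[] array is deliberately filled from the back with for (i = nmembers; --i >= 0;), restoring scan order without a separate reversal pass (which matters when cl->ordered is set). The fragment below isolates that idiom with an ordinary singly linked list; Node and push are illustrative stand-ins for List and lcons, not the real list API.

/*
 * Sketch of the "list is backwards because we built it with lcons" step.
 * Invented names, for illustration only.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct Node
{
    int          value;
    struct Node *next;
} Node;

/* Prepend, as lcons() does: O(1), but reverses the order of arrival. */
static Node *push(int value, Node *head)
{
    Node *n = malloc(sizeof(Node));

    n->value = value;
    n->next = head;
    return n;
}

int main(void)
{
    Node *list = NULL;
    int   nmembers = 0;

    for (int v = 1; v <= 4; v++)        /* the "scan" produces 1,2,3,4 */
    {
        list = push(v, list);           /* list is now 4,3,2,1 */
        nmembers++;
    }

    int *members = malloc(nmembers * sizeof(int));

    /* Fill from the back, like the for (i = nmembers; --i >= 0;) loop above. */
    for (int i = nmembers; --i >= 0;)
    {
        Node *next = list->next;

        members[i] = list->value;
        free(list);
        list = next;
    }

    for (int i = 0; i < nmembers; i++)
        printf("%d ", members[i]);      /* prints 1 2 3 4, the scan order */
    printf("\n");

    free(members);
    return 0;
}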