/*
 * See whether two tuples (presumably of the same hash value) match
 *
 * As above, the passed pointers are pointers to TupleHashEntryData.
 */
static int
TupleHashTableMatch(struct tuplehash_hash *tb, const MinimalTuple tuple1,
                    const MinimalTuple tuple2)
{
    TupleTableSlot *slot1;
    TupleTableSlot *slot2;
    TupleHashTable hashtable = (TupleHashTable) tb->private_data;
    ExprContext *econtext = hashtable->exprcontext;

    /*
     * We assume that simplehash.h will only ever call us with the first
     * argument being an actual table entry, and the second argument being
     * LookupTupleHashEntry's dummy TupleHashEntryData.  The other direction
     * could be supported too, but is not currently required.
     */
    Assert(tuple1 != NULL);
    slot1 = hashtable->tableslot;
    ExecStoreMinimalTuple(tuple1, slot1, false);
    Assert(tuple2 == NULL);
    slot2 = hashtable->inputslot;

    /* For crosstype comparisons, the inputslot must be first */
    econtext->ecxt_innertuple = slot2;
    econtext->ecxt_outertuple = slot1;

    return !ExecQualAndReset(hashtable->cur_eq_func, econtext);
}
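/*
 * Illustrative sketch (not part of this file): the match callback above
 * follows a memcmp-style convention -- it returns 0 when the two tuples
 * compare equal and nonzero otherwise, which is why the boolean result of
 * the equality qual is negated.  The standalone fragment below shows the
 * same inversion for a trivial string-keyed entry type; the names
 * example_entry and example_match are invented for the example.
 */
#include <stdbool.h>
#include <string.h>

typedef struct example_entry
{
    const char *key;            /* hash key */
    int         value;          /* payload */
} example_entry;

/*
 * Return 0 if the two entries' keys are equal, nonzero otherwise, mirroring
 * the "0 means match" convention used by TupleHashTableMatch.
 */
static int
example_match(const example_entry *a, const example_entry *b)
{
    bool        keys_equal = (strcmp(a->key, b->key) == 0);

    return !keys_equal;         /* equal => 0, not equal => 1 */
}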
/*
 * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
 */
static bool
BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
{
    ExprContext *econtext;

    /*
     * extract necessary information from index scan node
     */
    econtext = node->ss.ps.ps_ExprContext;

    /* Does the tuple meet the original qual conditions? */
    econtext->ecxt_scantuple = slot;

    return ExecQualAndReset(node->bitmapqualorig, econtext);
}
/* ----------------------------------------------------------------
 *      IndexOnlyNext
 *
 *      Retrieve a tuple from the IndexOnlyScan node's index.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
IndexOnlyNext(IndexOnlyScanState *node)
{
    EState     *estate;
    ExprContext *econtext;
    ScanDirection direction;
    IndexScanDesc scandesc;
    TupleTableSlot *slot;
    ItemPointer tid;

    /*
     * extract necessary information from index scan node
     */
    estate = node->ss.ps.state;
    direction = estate->es_direction;
    /* flip direction if this is an overall backward scan */
    if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir))
    {
        if (ScanDirectionIsForward(direction))
            direction = BackwardScanDirection;
        else if (ScanDirectionIsBackward(direction))
            direction = ForwardScanDirection;
    }
    scandesc = node->ioss_ScanDesc;
    econtext = node->ss.ps.ps_ExprContext;
    slot = node->ss.ss_ScanTupleSlot;

    if (scandesc == NULL)
    {
        /*
         * We reach here if the index only scan is not parallel, or if we're
         * serially executing an index only scan that was planned to be
         * parallel.
         */
        scandesc = index_beginscan(node->ss.ss_currentRelation,
                                   node->ioss_RelationDesc,
                                   estate->es_snapshot,
                                   node->ioss_NumScanKeys,
                                   node->ioss_NumOrderByKeys);

        node->ioss_ScanDesc = scandesc;

        /* Set it up for index-only scan */
        node->ioss_ScanDesc->xs_want_itup = true;
        node->ioss_VMBuffer = InvalidBuffer;

        /*
         * If no run-time keys to calculate or they are ready, go ahead and
         * pass the scankeys to the index AM.
         */
        if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady)
            index_rescan(scandesc,
                         node->ioss_ScanKeys,
                         node->ioss_NumScanKeys,
                         node->ioss_OrderByKeys,
                         node->ioss_NumOrderByKeys);
    }

    /*
     * OK, now that we have what we need, fetch the next tuple.
     */
    while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
    {
        HeapTuple   tuple = NULL;

        CHECK_FOR_INTERRUPTS();

        /*
         * We can skip the heap fetch if the TID references a heap page on
         * which all tuples are known visible to everybody.  In any case,
         * we'll use the index tuple not the heap tuple as the data source.
         *
         * Note on Memory Ordering Effects: visibilitymap_get_status does not
         * lock the visibility map buffer, and therefore the result we read
         * here could be slightly stale.  However, it can't be stale enough
         * to matter.
         *
         * We need to detect clearing a VM bit due to an insert right away,
         * because the tuple is present in the index page but not visible.
         * The reading of the TID by this scan (using a shared lock on the
         * index buffer) is serialized with the insert of the TID into the
         * index (using an exclusive lock on the index buffer).  Because the
         * VM bit is cleared before updating the index, and locking/unlocking
         * of the index page acts as a full memory barrier, we are sure to
         * see the cleared bit if we see a recently-inserted TID.
         *
         * Deletes do not update the index page (only VACUUM will clear out
         * the TID), so the clearing of the VM bit by a delete is not
         * serialized with this test below, and we may see a value that is
         * significantly stale.  However, we don't care about the delete
         * right away, because the tuple is still visible until the deleting
         * transaction commits or the statement ends (if it's our
         * transaction).  In either case, the lock on the VM buffer will have
         * been released (acting as a write barrier) after clearing the bit.
         * And for us to have a snapshot that includes the deleting
         * transaction (making the tuple invisible), we must have acquired
         * ProcArrayLock after that time, acting as a read barrier.
         *
         * It's worth going through this complexity to avoid needing to lock
         * the VM buffer, which could cause significant contention.
         */
        if (!VM_ALL_VISIBLE(scandesc->heapRelation,
                            ItemPointerGetBlockNumber(tid),
                            &node->ioss_VMBuffer))
        {
            /*
             * Rats, we have to visit the heap to check visibility.
             */
            InstrCountTuples2(node, 1);
            tuple = index_fetch_heap(scandesc);
            if (tuple == NULL)
                continue;       /* no visible tuple, try next index entry */

            /*
             * Only MVCC snapshots are supported here, so there should be no
             * need to keep following the HOT chain once a visible entry has
             * been found.  If we did want to allow that, we'd need to keep
             * more state to remember not to call index_getnext_tid next
             * time.
             */
            if (scandesc->xs_continue_hot)
                elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");

            /*
             * Note: at this point we are holding a pin on the heap page, as
             * recorded in scandesc->xs_cbuf.  We could release that pin now,
             * but it's not clear whether it's a win to do so.  The next
             * index entry might require a visit to the same heap page.
             */
        }

        /*
         * Fill the scan tuple slot with data from the index.  This might be
         * provided in either HeapTuple or IndexTuple format.  Conceivably an
         * index AM might fill both fields, in which case we prefer the heap
         * format, since it's probably a bit cheaper to fill a slot from.
         */
        if (scandesc->xs_hitup)
        {
            /*
             * We don't take the trouble to verify that the provided tuple
             * has exactly the slot's format, but it seems worth doing a
             * quick check on the number of fields.
             */
            Assert(slot->tts_tupleDescriptor->natts ==
                   scandesc->xs_hitupdesc->natts);
            ExecStoreHeapTuple(scandesc->xs_hitup, slot, false);
        }
        else if (scandesc->xs_itup)
            StoreIndexTuple(slot, scandesc->xs_itup, scandesc->xs_itupdesc);
        else
            elog(ERROR, "no data returned for index-only scan");

        /*
         * If the index was lossy, we have to recheck the index quals.
         * (Currently, this can never happen, but we should support the case
         * for possible future use, eg with GiST indexes.)
         */
        if (scandesc->xs_recheck)
        {
            econtext->ecxt_scantuple = slot;
            if (!ExecQualAndReset(node->indexqual, econtext))
            {
                /* Fails recheck, so drop it and loop back for another */
                InstrCountFiltered2(node, 1);
                continue;
            }
        }

        /*
         * We don't currently support rechecking ORDER BY distances.  (In
         * principle, if the index can support retrieval of the originally
         * indexed value, it should be able to produce an exact distance
         * calculation too.  So it's not clear that adding code here for
         * recheck/re-sort would be worth the trouble.  But we should at
         * least throw an error if someone tries it.)
         */
        if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("lossy distance functions are not supported in index-only scans")));

        /*
         * Predicate locks for index-only scans must be acquired at the page
         * level when the heap is not accessed, since tuple-level predicate
         * locks need the tuple's xmin value.  If we had to visit the tuple
         * anyway, then we already have the tuple-level lock and can skip the
         * page lock.
         */
        if (tuple == NULL)
            PredicateLockPage(scandesc->heapRelation,
                              ItemPointerGetBlockNumber(tid),
                              estate->es_snapshot);

        return slot;
    }

    /*
     * if we get here it means the index scan failed, so we are at the end of
     * the scan.
     */
    return ExecClearTuple(slot);
}
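/*
 * Illustrative sketch (not part of this file): the direction-flipping logic
 * at the top of IndexOnlyNext composes the executor's requested direction
 * with the plan's indexorderdir.  The standalone helper below restates that
 * rule with locally defined constants; the enum and function names are
 * invented for the example and are not PostgreSQL's definitions.
 */
typedef enum example_dir
{
    EXAMPLE_BACKWARD = -1,
    EXAMPLE_NO_MOVEMENT = 0,
    EXAMPLE_FORWARD = 1
} example_dir;

/*
 * If the index is to be read in overall backward order, a "forward" request
 * from the executor must walk the index backward, and vice versa; a
 * no-movement request is left unchanged.
 */
static example_dir
example_effective_direction(example_dir requested, example_dir index_order)
{
    if (index_order == EXAMPLE_BACKWARD)
    {
        if (requested == EXAMPLE_FORWARD)
            return EXAMPLE_BACKWARD;
        if (requested == EXAMPLE_BACKWARD)
            return EXAMPLE_FORWARD;
    }
    return requested;
}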
/*
 * ExecSetOp for non-hashed case
 */
static TupleTableSlot *
setop_retrieve_direct(SetOpState *setopstate)
{
    PlanState  *outerPlan;
    SetOpStatePerGroup pergroup;
    TupleTableSlot *outerslot;
    TupleTableSlot *resultTupleSlot;
    ExprContext *econtext = setopstate->ps.ps_ExprContext;

    /*
     * get state info from node
     */
    outerPlan = outerPlanState(setopstate);
    pergroup = (SetOpStatePerGroup) setopstate->pergroup;
    resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;

    /*
     * We loop retrieving groups until we find one we should return
     */
    while (!setopstate->setop_done)
    {
        /*
         * If we don't already have the first tuple of the new group, fetch
         * it from the outer plan.
         */
        if (setopstate->grp_firstTuple == NULL)
        {
            outerslot = ExecProcNode(outerPlan);
            if (!TupIsNull(outerslot))
            {
                /* Make a copy of the first input tuple */
                setopstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
            }
            else
            {
                /* outer plan produced no tuples at all */
                setopstate->setop_done = true;
                return NULL;
            }
        }

        /*
         * Store the copied first input tuple in the tuple table slot
         * reserved for it.  The tuple will be deleted when it is cleared
         * from the slot.
         */
        ExecStoreHeapTuple(setopstate->grp_firstTuple,
                           resultTupleSlot,
                           true);
        setopstate->grp_firstTuple = NULL;  /* don't keep two pointers */

        /* Initialize working state for a new input tuple group */
        initialize_counts(pergroup);

        /* Count the first input tuple */
        advance_counts(pergroup,
                       fetch_tuple_flag(setopstate, resultTupleSlot));

        /*
         * Scan the outer plan until we exhaust it or cross a group boundary.
         */
        for (;;)
        {
            outerslot = ExecProcNode(outerPlan);
            if (TupIsNull(outerslot))
            {
                /* no more outer-plan tuples available */
                setopstate->setop_done = true;
                break;
            }

            /*
             * Check whether we've crossed a group boundary.
             */
            econtext->ecxt_outertuple = resultTupleSlot;
            econtext->ecxt_innertuple = outerslot;

            if (!ExecQualAndReset(setopstate->eqfunction, econtext))
            {
                /*
                 * Save the first input tuple of the next group.
                 */
                setopstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot);
                break;
            }

            /* Still in same group, so count this tuple */
            advance_counts(pergroup,
                           fetch_tuple_flag(setopstate, outerslot));
        }

        /*
         * Done scanning input tuple group.  See if we should emit any copies
         * of result tuple, and if so return the first copy.
         */
        set_output_count(setopstate, pergroup);

        if (setopstate->numOutput > 0)
        {
            setopstate->numOutput--;
            return resultTupleSlot;
        }
    }

    /* No more groups */
    ExecClearTuple(resultTupleSlot);
    return NULL;
}
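/*
 * Illustrative sketch (not part of this file): setop_retrieve_direct reads
 * sorted input, counts how many tuples of the current group came from each
 * side (via the flag column), and then emits some number of copies of the
 * group's representative tuple.  The fragment below mimics that control flow
 * over a plain array; the struct and function names are invented, and the
 * output rule shown (the smaller of the two per-side counts) is the SQL
 * INTERSECT ALL rule, used here only as one example of an output policy.
 */
typedef struct example_row
{
    int         key;            /* grouping value (input is sorted on this) */
    int         flag;           /* 0 = left input, 1 = right input */
} example_row;

static long
example_intersect_all_count(const example_row *rows, int nrows, int start,
                            int *next_group_start)
{
    long        counts[2] = {0, 0};
    int         i = start;

    /* count rows of this group on each side, stopping at a group boundary */
    while (i < nrows && rows[i].key == rows[start].key)
    {
        counts[rows[i].flag]++;
        i++;
    }
    *next_group_start = i;

    /* INTERSECT ALL: emit the smaller of the two per-side counts */
    return (counts[0] < counts[1]) ? counts[0] : counts[1];
}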
/* ----------------------------------------------------------------
 *      BitmapHeapNext
 *
 *      Retrieve next tuple from the BitmapHeapScan node's currentRelation
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
BitmapHeapNext(BitmapHeapScanState *node)
{
    ExprContext *econtext;
    HeapScanDesc scan;
    TIDBitmap  *tbm;
    TBMIterator *tbmiterator = NULL;
    TBMSharedIterator *shared_tbmiterator = NULL;
    TBMIterateResult *tbmres;
    OffsetNumber targoffset;
    TupleTableSlot *slot;
    ParallelBitmapHeapState *pstate = node->pstate;
    dsa_area   *dsa = node->ss.ps.state->es_query_dsa;

    /*
     * extract necessary information from index scan node
     */
    econtext = node->ss.ps.ps_ExprContext;
    slot = node->ss.ss_ScanTupleSlot;
    scan = node->ss.ss_currentScanDesc;
    tbm = node->tbm;
    if (pstate == NULL)
        tbmiterator = node->tbmiterator;
    else
        shared_tbmiterator = node->shared_tbmiterator;
    tbmres = node->tbmres;

    /*
     * If we haven't yet performed the underlying index scan, do it, and
     * begin the iteration over the bitmap.
     *
     * For prefetching, we use *two* iterators, one for the pages we are
     * actually scanning and another that runs ahead of the first for
     * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
     * the prefetch iterator is.  Also, node->prefetch_target tracks the
     * desired prefetch distance, which starts small and increases up to the
     * node->prefetch_maximum.  This is to avoid doing a lot of prefetching
     * in a scan that stops after a few tuples because of a LIMIT.
     */
    if (!node->initialized)
    {
        if (!pstate)
        {
            tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));

            if (!tbm || !IsA(tbm, TIDBitmap))
                elog(ERROR, "unrecognized result from subplan");

            node->tbm = tbm;
            node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
            node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
            if (node->prefetch_maximum > 0)
            {
                node->prefetch_iterator = tbm_begin_iterate(tbm);
                node->prefetch_pages = 0;
                node->prefetch_target = -1;
            }
#endif                          /* USE_PREFETCH */
        }
        else
        {
            /*
             * The leader will immediately come out of the function, but
             * others will be blocked until leader populates the TBM and
             * wakes them up.
             */
            if (BitmapShouldInitializeSharedState(pstate))
            {
                tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
                if (!tbm || !IsA(tbm, TIDBitmap))
                    elog(ERROR, "unrecognized result from subplan");

                node->tbm = tbm;

                /*
                 * Prepare to iterate over the TBM.  This will return the
                 * dsa_pointer of the iterator state which will be used by
                 * multiple processes to iterate jointly.
                 */
                pstate->tbmiterator = tbm_prepare_shared_iterate(tbm);
#ifdef USE_PREFETCH
                if (node->prefetch_maximum > 0)
                {
                    pstate->prefetch_iterator =
                        tbm_prepare_shared_iterate(tbm);

                    /*
                     * We don't need the mutex here as we haven't yet woken
                     * up others.
                     */
                    pstate->prefetch_pages = 0;
                    pstate->prefetch_target = -1;
                }
#endif

                /* We have initialized the shared state so wake up others. */
                BitmapDoneInitializingSharedState(pstate);
            }

            /* Allocate a private iterator and attach the shared state to it */
            node->shared_tbmiterator = shared_tbmiterator =
                tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
            node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
            if (node->prefetch_maximum > 0)
            {
                node->shared_prefetch_iterator =
                    tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator);
            }
#endif                          /* USE_PREFETCH */
        }
        node->initialized = true;
    }

    for (;;)
    {
        Page        dp;
        ItemId      lp;

        CHECK_FOR_INTERRUPTS();

        /*
         * Get next page of results if needed
         */
        if (tbmres == NULL)
        {
            if (!pstate)
                node->tbmres = tbmres = tbm_iterate(tbmiterator);
            else
                node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator);
            if (tbmres == NULL)
            {
                /* no more entries in the bitmap */
                break;
            }

            BitmapAdjustPrefetchIterator(node, tbmres);

            /*
             * Ignore any claimed entries past what we think is the end of
             * the relation.  (This is probably not necessary given that we
             * got at least AccessShareLock on the table before performing
             * any of the indexscans, but let's be safe.)
             */
            if (tbmres->blockno >= scan->rs_nblocks)
            {
                node->tbmres = tbmres = NULL;
                continue;
            }

            /*
             * We can skip fetching the heap page if we don't need any fields
             * from the heap, and the bitmap entries don't need rechecking,
             * and all tuples on the page are visible to our transaction.
             */
            node->skip_fetch = (node->can_skip_fetch &&
                                !tbmres->recheck &&
                                VM_ALL_VISIBLE(node->ss.ss_currentRelation,
                                               tbmres->blockno,
                                               &node->vmbuffer));

            if (node->skip_fetch)
            {
                /*
                 * The number of tuples on this page is put into
                 * scan->rs_ntuples; note we don't fill scan->rs_vistuples.
                 */
                scan->rs_ntuples = tbmres->ntuples;
            }
            else
            {
                /*
                 * Fetch the current heap page and identify candidate tuples.
                 */
                bitgetpage(scan, tbmres);
            }

            if (tbmres->ntuples >= 0)
                node->exact_pages++;
            else
                node->lossy_pages++;

            /*
             * Set rs_cindex to first slot to examine
             */
            scan->rs_cindex = 0;

            /* Adjust the prefetch target */
            BitmapAdjustPrefetchTarget(node);
        }
        else
        {
            /*
             * Continuing in previously obtained page; advance rs_cindex
             */
            scan->rs_cindex++;

#ifdef USE_PREFETCH

            /*
             * Try to prefetch at least a few pages even before we get to the
             * second page if we don't stop reading after the first tuple.
             */
            if (!pstate)
            {
                if (node->prefetch_target < node->prefetch_maximum)
                    node->prefetch_target++;
            }
            else if (pstate->prefetch_target < node->prefetch_maximum)
            {
                /* take spinlock while updating shared state */
                SpinLockAcquire(&pstate->mutex);
                if (pstate->prefetch_target < node->prefetch_maximum)
                    pstate->prefetch_target++;
                SpinLockRelease(&pstate->mutex);
            }
#endif                          /* USE_PREFETCH */
        }

        /*
         * Out of range?  If so, nothing more to look at on this page
         */
        if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples)
        {
            node->tbmres = tbmres = NULL;
            continue;
        }

        /*
         * We issue prefetch requests *after* fetching the current page to
         * try to avoid having prefetching interfere with the main I/O.
         * Also, this should happen only when we have determined there is
         * still something to do on the current page, else we may uselessly
         * prefetch the same page we are just about to request for real.
         */
        BitmapPrefetch(node, scan);

        if (node->skip_fetch)
        {
            /*
             * If we don't have to fetch the tuple, just return nulls.
             */
            ExecStoreAllNullTuple(slot);
        }
        else
        {
            /*
             * Okay to fetch the tuple.
             */
            targoffset = scan->rs_vistuples[scan->rs_cindex];
            dp = (Page) BufferGetPage(scan->rs_cbuf);
            lp = PageGetItemId(dp, targoffset);
            Assert(ItemIdIsNormal(lp));

            scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
            scan->rs_ctup.t_len = ItemIdGetLength(lp);
            scan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
            ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);

            pgstat_count_heap_fetch(scan->rs_rd);

            /*
             * Set up the result slot to point to this tuple.  Note that the
             * slot acquires a pin on the buffer.
             */
            ExecStoreBufferHeapTuple(&scan->rs_ctup,
                                     slot,
                                     scan->rs_cbuf);

            /*
             * If we are using lossy info, we have to recheck the qual
             * conditions at every tuple.
             */
            if (tbmres->recheck)
            {
                econtext->ecxt_scantuple = slot;
                if (!ExecQualAndReset(node->bitmapqualorig, econtext))
                {
                    /* Fails recheck, so drop it and loop back for another */
                    InstrCountFiltered2(node, 1);
                    ExecClearTuple(slot);
                    continue;
                }
            }
        }

        /* OK to return this tuple */
        return slot;
    }

    /*
     * if we get here it means we are at the end of the scan.
     */
    return ExecClearTuple(slot);
}
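/*
 * Illustrative sketch (not part of this file): BitmapHeapNext keeps a second
 * bitmap iterator running ahead of the one it is actually scanning, and lets
 * the allowed lead distance grow gradually so that a scan cut short by a
 * LIMIT does not pay for a large prefetch burst up front.  The loop below
 * restates that idea over a flat array of block numbers; issue_prefetch()
 * stands in for the real prefetch request and, like the other names here, is
 * invented for the example.  The actual node uses TBM iterators and a more
 * elaborate ramp-up policy.
 */
static void
example_prefetching_scan(const int *blocks, int nblocks, int prefetch_maximum,
                         void (*issue_prefetch) (int blockno),
                         void (*read_block) (int blockno))
{
    int         prefetch_head = 1;  /* next block to prefetch, ahead of cur */
    int         prefetch_target = 0;    /* current allowed lead distance */

    for (int cur = 0; cur < nblocks; cur++)
    {
        /* ramp the prefetch distance up slowly, capped at the maximum */
        if (prefetch_target < prefetch_maximum)
            prefetch_target++;

        /* keep the prefetch cursor at most prefetch_target blocks ahead */
        while (prefetch_head < nblocks &&
               prefetch_head - cur <= prefetch_target)
            issue_prefetch(blocks[prefetch_head++]);

        /* now do the real read of the current block */
        read_block(blocks[cur]);
    }
}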