/* ----------------------------------------------------------------
 *		BitmapHeapNext
 *
 *		Retrieve next tuple from the BitmapHeapScan node's currentRelation
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
BitmapHeapNext(BitmapHeapScanState *node)
{
	ExprContext *econtext;
	HeapScanDesc scan;
	TIDBitmap  *tbm;
	TBMIterator *tbmiterator;
	TBMIterateResult *tbmres;
	TBMIterator *prefetch_iterator;
	OffsetNumber targoffset;
	TupleTableSlot *slot;

	/*
	 * extract necessary information from index scan node
	 */
	econtext = node->ss.ps.ps_ExprContext;
	slot = node->ss.ss_ScanTupleSlot;
	scan = node->ss.ss_currentScanDesc;
	tbm = node->tbm;
	tbmiterator = node->tbmiterator;
	tbmres = node->tbmres;
	prefetch_iterator = node->prefetch_iterator;

	/*
	 * If we haven't yet performed the underlying index scan, do it, and
	 * begin the iteration over the bitmap.
	 *
	 * For prefetching, we use *two* iterators, one for the pages we are
	 * actually scanning and another that runs ahead of the first for
	 * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
	 * the prefetch iterator is.  Also, node->prefetch_target tracks the
	 * desired prefetch distance, which starts small and increases up to the
	 * GUC-controlled maximum, target_prefetch_pages.  This is to avoid doing
	 * a lot of prefetching in a scan that stops after a few tuples because
	 * of a LIMIT.
	 */
	if (tbm == NULL)
	{
		tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));

		if (!tbm || !IsA(tbm, TIDBitmap))
			elog(ERROR, "unrecognized result from subplan");

		node->tbm = tbm;
		node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
		node->tbmres = tbmres = NULL;

#ifdef USE_PREFETCH
		if (target_prefetch_pages > 0)
		{
			node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
			node->prefetch_pages = 0;
			node->prefetch_target = -1;
		}
#endif   /* USE_PREFETCH */
	}

	for (;;)
	{
		Page		dp;
		ItemId		lp;

		/*
		 * Get next page of results if needed
		 */
		if (tbmres == NULL)
		{
			node->tbmres = tbmres = tbm_iterate(tbmiterator);
			if (tbmres == NULL)
			{
				/* no more entries in the bitmap */
				break;
			}

#ifdef USE_PREFETCH
			if (node->prefetch_pages > 0)
			{
				/* The main iterator has closed the distance by one page */
				node->prefetch_pages--;
			}
			else if (prefetch_iterator)
			{
				/* Do not let the prefetch iterator get behind the main one */
				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

				if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
					elog(ERROR, "prefetch and main iterators are out of sync");
			}
#endif   /* USE_PREFETCH */

			/*
			 * Ignore any claimed entries past what we think is the end of
			 * the relation.  (This is probably not necessary given that we
			 * got at least AccessShareLock on the table before performing
			 * any of the indexscans, but let's be safe.)
			 */
			if (tbmres->blockno >= scan->rs_nblocks)
			{
				node->tbmres = tbmres = NULL;
				continue;
			}

			/*
			 * Fetch the current heap page and identify candidate tuples.
			 */
			bitgetpage(scan, tbmres);

			/*
			 * Set rs_cindex to first slot to examine
			 */
			scan->rs_cindex = 0;

#ifdef USE_PREFETCH

			/*
			 * Increase prefetch target if it's not yet at the max.  Note
			 * that we will increase it to zero after fetching the very
			 * first page/tuple, then to one after the second tuple is
			 * fetched, then it doubles as later pages are fetched.
			 */
			if (node->prefetch_target >= target_prefetch_pages)
				 /* don't increase any further */ ;
			else if (node->prefetch_target >= target_prefetch_pages / 2)
				node->prefetch_target = target_prefetch_pages;
			else if (node->prefetch_target > 0)
				node->prefetch_target *= 2;
			else
				node->prefetch_target++;
#endif   /* USE_PREFETCH */
		}
		else
		{
			/*
			 * Continuing in previously obtained page; advance rs_cindex
			 */
			scan->rs_cindex++;

#ifdef USE_PREFETCH

			/*
			 * Try to prefetch at least a few pages even before we get to
			 * the second page if we don't stop reading after the first
			 * tuple.
			 */
			if (node->prefetch_target < target_prefetch_pages)
				node->prefetch_target++;
#endif   /* USE_PREFETCH */
		}

		/*
		 * Out of range?  If so, nothing more to look at on this page
		 */
		if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples)
		{
			node->tbmres = tbmres = NULL;
			continue;
		}

#ifdef USE_PREFETCH

		/*
		 * We issue prefetch requests *after* fetching the current page to
		 * try to avoid having prefetching interfere with the main I/O.
		 * Also, this should happen only when we have determined there is
		 * still something to do on the current page, else we may uselessly
		 * prefetch the same page we are just about to request for real.
		 */
		if (prefetch_iterator)
		{
			while (node->prefetch_pages < node->prefetch_target)
			{
				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_iterate(prefetch_iterator);
					node->prefetch_iterator = prefetch_iterator = NULL;
					break;
				}
				node->prefetch_pages++;
				PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}
#endif   /* USE_PREFETCH */

		/*
		 * Okay to fetch the tuple
		 */
		targoffset = scan->rs_vistuples[scan->rs_cindex];
		dp = (Page) BufferGetPage(scan->rs_cbuf);
		lp = PageGetItemId(dp, targoffset);
		Assert(ItemIdIsNormal(lp));

		scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
		scan->rs_ctup.t_len = ItemIdGetLength(lp);
		ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);

		pgstat_count_heap_fetch(scan->rs_rd);

		/*
		 * Set up the result slot to point to this tuple.  Note that the
		 * slot acquires a pin on the buffer.
		 */
		ExecStoreTuple(&scan->rs_ctup,
					   slot,
					   scan->rs_cbuf,
					   false);

		/*
		 * If we are using lossy info, we have to recheck the qual
		 * conditions at every tuple.
		 */
		if (tbmres->recheck)
		{
			econtext->ecxt_scantuple = slot;
			ResetExprContext(econtext);

			if (!ExecQual(node->bitmapqualorig, econtext, false))
			{
				/* Fails recheck, so drop it and loop back for another */
				ExecClearTuple(slot);
				continue;
			}
		}

		/* OK to return this tuple */
		return slot;
	}

	/*
	 * if we get here it means we are at the end of the scan..
	 */
	return ExecClearTuple(slot);
}
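
/*
 * Illustrative sketch, not part of the original executor code: the ramp-up
 * rule above grows the prefetch distance as -1 -> 0 -> 1 -> 2 -> 4 -> ...,
 * jumping straight to the cap once it reaches half of it.  The standalone
 * demo below (guarded so it is never compiled into the server) assumes a
 * cap of 32 and prints the distance a scan would use after each page.
 */
#ifdef PREFETCH_RAMP_DEMO
#include <stdio.h>

static int
advance_prefetch_target(int prefetch_target, int max_target)
{
	if (prefetch_target >= max_target)
		return prefetch_target; /* don't increase any further */
	else if (prefetch_target >= max_target / 2)
		return max_target;		/* jump straight to the cap */
	else if (prefetch_target > 0)
		return prefetch_target * 2;		/* exponential growth phase */
	else
		return prefetch_target + 1;		/* -1 -> 0 -> 1 start-up steps */
}

int
main(void)
{
	int			target = -1;	/* initial value, as set when the scan starts */
	int			page;

	for (page = 1; page <= 10; page++)
	{
		target = advance_prefetch_target(target, 32);
		printf("after page %d: prefetch distance %d\n", page, target);
	}
	return 0;
}
#endif   /* PREFETCH_RAMP_DEMO */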
/*
 *	BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target
 */
static inline void
BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan)
{
#ifdef USE_PREFETCH
	ParallelBitmapHeapState *pstate = node->pstate;

	if (pstate == NULL)
	{
		TBMIterator *prefetch_iterator = node->prefetch_iterator;

		if (prefetch_iterator)
		{
			while (node->prefetch_pages < node->prefetch_target)
			{
				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);

				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_iterate(prefetch_iterator);
					node->prefetch_iterator = NULL;
					break;
				}
				node->prefetch_pages++;
				PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}

		return;
	}

	if (pstate->prefetch_pages < pstate->prefetch_target)
	{
		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;

		if (prefetch_iterator)
		{
			while (1)
			{
				TBMIterateResult *tbmpre;
				bool		do_prefetch = false;

				/*
				 * Recheck under the mutex.  If some other process has
				 * already done enough prefetching then we need not do
				 * anything more.
				 */
				SpinLockAcquire(&pstate->mutex);
				if (pstate->prefetch_pages < pstate->prefetch_target)
				{
					pstate->prefetch_pages++;
					do_prefetch = true;
				}
				SpinLockRelease(&pstate->mutex);

				if (!do_prefetch)
					return;

				tbmpre = tbm_shared_iterate(prefetch_iterator);
				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_shared_iterate(prefetch_iterator);
					node->shared_prefetch_iterator = NULL;
					break;
				}

				PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}
	}
#endif   /* USE_PREFETCH */
}
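
/*
 * Illustrative sketch, not part of the original executor code: the parallel
 * branch above follows a "claim under the lock, work outside it" pattern.
 * Each worker bumps the shared counter while holding the spinlock, and only
 * a worker that successfully claimed a slot goes on to advance the shared
 * iterator and issue the prefetch.  A minimal standalone version of the
 * claiming step, using a pthread mutex in place of a PostgreSQL spinlock:
 */
#ifdef CLAIM_PATTERN_DEMO
#include <pthread.h>
#include <stdbool.h>

typedef struct SharedPrefetchState
{
	pthread_mutex_t mutex;		/* stands in for pstate->mutex */
	int			prefetch_pages; /* pages claimed by all workers so far */
	int			prefetch_target;	/* desired prefetch distance */
} SharedPrefetchState;

/*
 * Returns true if the calling worker claimed a prefetch slot and should
 * advance the shared iterator; false if enough prefetching is already done.
 */
static bool
claim_prefetch_slot(SharedPrefetchState *state)
{
	bool		claimed = false;

	pthread_mutex_lock(&state->mutex);
	if (state->prefetch_pages < state->prefetch_target)
	{
		state->prefetch_pages++;
		claimed = true;
	}
	pthread_mutex_unlock(&state->mutex);

	return claimed;
}
#endif   /* CLAIM_PATTERN_DEMO */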
/* Static buffer used to read blocks in PREWARM_READ mode. */
static char blockbuffer[BLCKSZ];

/*
 * pg_prewarm(regclass, mode text, fork text,
 *			  first_block int8, last_block int8)
 *
 * The first argument is the relation to be prewarmed; the second controls
 * how prewarming is done; legal options are 'prefetch', 'read', and 'buffer'.
 * The third is the name of the relation fork to be prewarmed.  The fourth
 * and fifth arguments specify the first and last block to be prewarmed.
 * If the fourth argument is NULL, it will be taken as 0; if the fifth
 * argument is NULL, it will be taken as the number of blocks in the
 * relation.  The return value is the number of blocks successfully
 * prewarmed.
 */
Datum
pg_prewarm(PG_FUNCTION_ARGS)
{
	Oid			relOid;
	text	   *forkName;
	text	   *type;
	int64		first_block;
	int64		last_block;
	int64		nblocks;
	int64		blocks_done = 0;
	int64		block;
	Relation	rel;
	ForkNumber	forkNumber;
	char	   *forkString;
	char	   *ttype;
	PrewarmType ptype;
	AclResult	aclresult;

	/* Basic sanity checking. */
	if (PG_ARGISNULL(0))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("relation cannot be null")));
	relOid = PG_GETARG_OID(0);
	if (PG_ARGISNULL(1))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("prewarm type cannot be null")));
	type = PG_GETARG_TEXT_P(1);
	ttype = text_to_cstring(type);
	if (strcmp(ttype, "prefetch") == 0)
		ptype = PREWARM_PREFETCH;
	else if (strcmp(ttype, "read") == 0)
		ptype = PREWARM_READ;
	else if (strcmp(ttype, "buffer") == 0)
		ptype = PREWARM_BUFFER;
	else
	{
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid prewarm type"),
				 errhint("Valid prewarm types are \"prefetch\", \"read\", and \"buffer\".")));
		PG_RETURN_INT64(0);		/* Placate compiler. */
	}
	if (PG_ARGISNULL(2))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("relation fork cannot be null")));
	forkName = PG_GETARG_TEXT_P(2);
	forkString = text_to_cstring(forkName);
	forkNumber = forkname_to_number(forkString);

	/* Open relation and check privileges. */
	rel = relation_open(relOid, AccessShareLock);
	aclresult = pg_class_aclcheck(relOid, GetUserId(), ACL_SELECT);
	if (aclresult != ACLCHECK_OK)
		aclcheck_error(aclresult, ACL_KIND_CLASS, get_rel_name(relOid));

	/* Check that the fork exists. */
	RelationOpenSmgr(rel);
	if (!smgrexists(rel->rd_smgr, forkNumber))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("fork \"%s\" does not exist for this relation",
						forkString)));

	/* Validate block numbers, or handle nulls. */
	nblocks = RelationGetNumberOfBlocksInFork(rel, forkNumber);
	if (PG_ARGISNULL(3))
		first_block = 0;
	else
	{
		first_block = PG_GETARG_INT64(3);
		if (first_block < 0 || first_block >= nblocks)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("starting block number must be between 0 and " INT64_FORMAT,
							nblocks - 1)));
	}
	if (PG_ARGISNULL(4))
		last_block = nblocks - 1;
	else
	{
		last_block = PG_GETARG_INT64(4);
		if (last_block < 0 || last_block >= nblocks)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("ending block number must be between 0 and " INT64_FORMAT,
							nblocks - 1)));
	}

	/* Now we're ready to do the real work. */
	if (ptype == PREWARM_PREFETCH)
	{
#ifdef USE_PREFETCH

		/*
		 * In prefetch mode, we just hint the OS to read the blocks, but we
		 * don't know whether it really does it, and we don't wait for it to
		 * finish.
		 *
		 * It would probably be better to pass our prefetch requests in
		 * chunks of a megabyte or maybe even a whole segment at a time, but
		 * there's no practical way to do that at present without a gross
		 * modularity violation, so we just do this.
		 */
		for (block = first_block; block <= last_block; ++block)
		{
			CHECK_FOR_INTERRUPTS();
			PrefetchBuffer(rel, forkNumber, block);
			++blocks_done;
		}
#else
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("prefetch is not supported by this build")));
#endif
	}
	else if (ptype == PREWARM_READ)
	{
		/*
		 * In read mode, we actually read the blocks, but not into shared
		 * buffers.  This is more portable than prefetch mode (it works
		 * everywhere) and is synchronous.
		 */
		for (block = first_block; block <= last_block; ++block)
		{
			CHECK_FOR_INTERRUPTS();
			smgrread(rel->rd_smgr, forkNumber, block, blockbuffer);
			++blocks_done;
		}
	}
	else if (ptype == PREWARM_BUFFER)
	{
		/*
		 * In buffer mode, we actually pull the data into shared_buffers.
		 */
		for (block = first_block; block <= last_block; ++block)
		{
			Buffer		buf;

			CHECK_FOR_INTERRUPTS();
			buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL);
			ReleaseBuffer(buf);
			++blocks_done;
		}
	}

	/* Close relation, release lock. */
	relation_close(rel, AccessShareLock);

	PG_RETURN_INT64(blocks_done);
}
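
/*
 * Example usage from SQL (table name and block range are illustrative):
 *
 *	-- pull the whole main fork of a table into shared_buffers
 *	SELECT pg_prewarm('my_table', 'buffer', 'main', NULL, NULL);
 *
 *	-- merely hint the OS to read the first 1000 blocks
 *	SELECT pg_prewarm('my_table', 'prefetch', 'main', 0, 999);
 */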
/*
 *	BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target
 */
static inline void
BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan)
{
#ifdef USE_PREFETCH
	ParallelBitmapHeapState *pstate = node->pstate;

	if (pstate == NULL)
	{
		TBMIterator *prefetch_iterator = node->prefetch_iterator;

		if (prefetch_iterator)
		{
			while (node->prefetch_pages < node->prefetch_target)
			{
				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
				bool		skip_fetch;

				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_iterate(prefetch_iterator);
					node->prefetch_iterator = NULL;
					break;
				}
				node->prefetch_pages++;

				/*
				 * If we expect not to have to actually read this heap page,
				 * skip this prefetch call, but continue to run the prefetch
				 * logic normally.  (Would it be better not to increment
				 * prefetch_pages?)
				 *
				 * This depends on the assumption that the index AM will
				 * report the same recheck flag for this future heap page as
				 * it did for the current heap page; which is not a certainty
				 * but is true in many cases.
				 */
				skip_fetch = (node->can_skip_fetch &&
							  (node->tbmres ? !node->tbmres->recheck : false) &&
							  VM_ALL_VISIBLE(node->ss.ss_currentRelation,
											 tbmpre->blockno,
											 &node->pvmbuffer));

				if (!skip_fetch)
					PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}

		return;
	}

	if (pstate->prefetch_pages < pstate->prefetch_target)
	{
		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;

		if (prefetch_iterator)
		{
			while (1)
			{
				TBMIterateResult *tbmpre;
				bool		do_prefetch = false;
				bool		skip_fetch;

				/*
				 * Recheck under the mutex.  If some other process has
				 * already done enough prefetching then we need not do
				 * anything more.
				 */
				SpinLockAcquire(&pstate->mutex);
				if (pstate->prefetch_pages < pstate->prefetch_target)
				{
					pstate->prefetch_pages++;
					do_prefetch = true;
				}
				SpinLockRelease(&pstate->mutex);

				if (!do_prefetch)
					return;

				tbmpre = tbm_shared_iterate(prefetch_iterator);
				if (tbmpre == NULL)
				{
					/* No more pages to prefetch */
					tbm_end_shared_iterate(prefetch_iterator);
					node->shared_prefetch_iterator = NULL;
					break;
				}

				/* As above, skip prefetch if we expect not to need the page */
				skip_fetch = (node->can_skip_fetch &&
							  (node->tbmres ? !node->tbmres->recheck : false) &&
							  VM_ALL_VISIBLE(node->ss.ss_currentRelation,
											 tbmpre->blockno,
											 &node->pvmbuffer));

				if (!skip_fetch)
					PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
			}
		}
	}
#endif   /* USE_PREFETCH */
}
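
/*
 * Illustrative sketch, a hypothetical helper rather than part of the
 * original file: the skip_fetch test above reads as a single predicate.
 * The heap page may be skipped only when the plan allows skipping at all,
 * the current bitmap page needs no recheck, and the visibility map says
 * every tuple on the block is visible to all transactions.
 */
static inline bool
prefetch_can_skip_fetch(BitmapHeapScanState *node, BlockNumber blockno)
{
	return node->can_skip_fetch &&
		(node->tbmres ? !node->tbmres->recheck : false) &&
		VM_ALL_VISIBLE(node->ss.ss_currentRelation, blockno,
					   &node->pvmbuffer);
}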