/*
 * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
 *      ring is empty.
 *
 * The bufhdr spin lock is held on the returned buffer.
 */
static volatile BufferDesc *
GetBufferFromRing(BufferAccessStrategy strategy)
{
    volatile BufferDesc *buf;
    Buffer      bufnum;

    /* Advance to next ring slot */
    if (++strategy->current >= strategy->ring_size)
        strategy->current = 0;

    /*
     * If the slot hasn't been filled yet, tell the caller to allocate a new
     * buffer with the normal allocation strategy.  He will then fill this
     * slot by calling AddBufferToRing with the new buffer.
     */
    bufnum = strategy->buffers[strategy->current];
    if (bufnum == InvalidBuffer)
    {
        strategy->current_was_in_ring = false;
        return NULL;
    }

    /*
     * If the buffer is pinned we cannot use it under any circumstances.
     *
     * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
     * since our own previous usage of the ring element would have left it
     * there, but it might've been decremented by clock sweep since then). A
     * higher usage_count indicates someone else has touched the buffer, so
     * we shouldn't re-use it.
     */
    buf = GetBufferDescriptor(bufnum - 1);
    LockBufHdr(buf);
    if (buf->refcount == 0 && buf->usage_count <= 1)
    {
        strategy->current_was_in_ring = true;
        return buf;
    }
    UnlockBufHdr(buf);

    /*
     * Tell caller to allocate a new buffer with the normal allocation
     * strategy.  He'll then replace this ring element via AddBufferToRing.
     */
    strategy->current_was_in_ring = false;
    return NULL;
}
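The comments above refer to AddBufferToRing(), defined in the same file (freelist.c). For context, a sketch of what it does: it simply records the buffer the caller just allocated in the ring slot that GetBufferFromRing() advanced to. The body shown here is an approximation for illustration, not a verbatim copy of the source.

/*
 * AddBufferToRing -- add a buffer to the buffer ring (sketch, for context).
 *
 * The caller allocated `buf` with the normal strategy after
 * GetBufferFromRing() returned NULL; remember it in the current slot so the
 * ring can hand it back on the next pass.
 */
static void
AddBufferToRing(BufferAccessStrategy strategy, volatile BufferDesc *buf)
{
    strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
}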
/*
 * Initialize shared buffer pool
 *
 * This is called once during shared-memory initialization (either in the
 * postmaster, or in a standalone backend).
 */
void
InitBufferPool(void)
{
    bool        foundBufs,
                foundDescs,
                foundIOLocks,
                foundBufCkpt;

    /* Align descriptors to a cacheline boundary. */
    BufferDescriptors = (BufferDescPadded *)
        ShmemInitStruct("Buffer Descriptors",
                        NBuffers * sizeof(BufferDescPadded),
                        &foundDescs);

    BufferBlocks = (char *)
        ShmemInitStruct("Buffer Blocks",
                        NBuffers * (Size) BLCKSZ, &foundBufs);

    /* Align lwlocks to cacheline boundary */
    BufferIOLWLockArray = (LWLockMinimallyPadded *)
        ShmemInitStruct("Buffer IO Locks",
                        NBuffers * (Size) sizeof(LWLockMinimallyPadded),
                        &foundIOLocks);

    LWLockRegisterTranche(LWTRANCHE_BUFFER_IO_IN_PROGRESS, "buffer_io");
    LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT, "buffer_content");

    /*
     * The array used to sort to-be-checkpointed buffer ids is located in
     * shared memory, to avoid having to allocate significant amounts of
     * memory at runtime. As that'd be in the middle of a checkpoint, or when
     * the checkpointer is restarted, memory allocation failures would be
     * painful.
     */
    CkptBufferIds = (CkptSortItem *)
        ShmemInitStruct("Checkpoint BufferIds",
                        NBuffers * sizeof(CkptSortItem), &foundBufCkpt);

    if (foundDescs || foundBufs || foundIOLocks || foundBufCkpt)
    {
        /* should find all of these, or none of them */
        Assert(foundDescs && foundBufs && foundIOLocks && foundBufCkpt);
        /* note: this path is only taken in EXEC_BACKEND case */
    }
    else
    {
        int         i;

        /*
         * Initialize all the buffer headers.
         */
        for (i = 0; i < NBuffers; i++)
        {
            BufferDesc *buf = GetBufferDescriptor(i);

            CLEAR_BUFFERTAG(buf->tag);

            pg_atomic_init_u32(&buf->state, 0);
            buf->wait_backend_pid = 0;

            buf->buf_id = i;

            /*
             * Initially link all the buffers together as unused. Subsequent
             * management of this list is done by freelist.c.
             */
            buf->freeNext = i + 1;

            LWLockInitialize(BufferDescriptorGetContentLock(buf),
                             LWTRANCHE_BUFFER_CONTENT);

            LWLockInitialize(BufferDescriptorGetIOLock(buf),
                             LWTRANCHE_BUFFER_IO_IN_PROGRESS);
        }

        /* Correct last entry of linked list */
        GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
    }

    /* Init other shared buffer-management stuff */
    StrategyInitialize(!foundDescs);

    /* Initialize per-backend file flush context */
    WritebackContextInit(&BackendWritebackContext,
                         &backend_flush_after);
}
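The BufferDescPadded cast above is what the "align descriptors to a cacheline boundary" comment relies on: each descriptor is padded out to a full cache line so that header updates on one buffer do not cause false sharing with its neighbors. A sketch of the padding idea follows; the actual definition lives in buf_internals.h, and the 64-byte figure is the usual cache-line size rather than a guaranteed constant.

/*
 * Sketch of the padding behind BufferDescPadded: union the descriptor with
 * a pad array so every array element occupies one full cache line.  The pad
 * size shown is illustrative (64 bytes on typical 64-bit platforms).
 */
typedef union BufferDescPadded
{
    BufferDesc  bufferdesc;
    char        pad[BUFFERDESC_PAD_TO_SIZE];    /* typically 64 bytes */
} BufferDescPadded;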
/*
 * Initialize shared buffer pool
 *
 * This is called once during shared-memory initialization (either in the
 * postmaster, or in a standalone backend).
 */
void
InitBufferPool(void)
{
    bool        foundBufs,
                foundDescs,
                foundIOLocks;

    /* Align descriptors to a cacheline boundary. */
    BufferDescriptors = (BufferDescPadded *) CACHELINEALIGN(
                    ShmemInitStruct("Buffer Descriptors",
                                    NBuffers * sizeof(BufferDescPadded)
                                    + PG_CACHE_LINE_SIZE,
                                    &foundDescs));

    BufferBlocks = (char *)
        ShmemInitStruct("Buffer Blocks",
                        NBuffers * (Size) BLCKSZ, &foundBufs);

    /* Align lwlocks to cacheline boundary */
    BufferIOLWLockArray = (LWLockMinimallyPadded *)
        CACHELINEALIGN(ShmemInitStruct("Buffer IO Locks",
                                       NBuffers * (Size) sizeof(LWLockMinimallyPadded)
                                       + PG_CACHE_LINE_SIZE,
                                       &foundIOLocks));

    BufferIOLWLockTranche.name = "buffer_io";
    BufferIOLWLockTranche.array_base = BufferIOLWLockArray;
    BufferIOLWLockTranche.array_stride = sizeof(LWLockMinimallyPadded);
    LWLockRegisterTranche(LWTRANCHE_BUFFER_IO_IN_PROGRESS,
                          &BufferIOLWLockTranche);

    BufferContentLWLockTranche.name = "buffer_content";
    BufferContentLWLockTranche.array_base =
        ((char *) BufferDescriptors) + offsetof(BufferDesc, content_lock);
    BufferContentLWLockTranche.array_stride = sizeof(BufferDescPadded);
    LWLockRegisterTranche(LWTRANCHE_BUFFER_CONTENT,
                          &BufferContentLWLockTranche);

    if (foundDescs || foundBufs || foundIOLocks)
    {
        /* should find all of these, or none of them */
        Assert(foundDescs && foundBufs && foundIOLocks);
        /* note: this path is only taken in EXEC_BACKEND case */
    }
    else
    {
        int         i;

        /*
         * Initialize all the buffer headers.
         */
        for (i = 0; i < NBuffers; i++)
        {
            BufferDesc *buf = GetBufferDescriptor(i);

            CLEAR_BUFFERTAG(buf->tag);
            buf->flags = 0;
            buf->usage_count = 0;
            buf->refcount = 0;
            buf->wait_backend_pid = 0;

            SpinLockInit(&buf->buf_hdr_lock);

            buf->buf_id = i;

            /*
             * Initially link all the buffers together as unused. Subsequent
             * management of this list is done by freelist.c.
             */
            buf->freeNext = i + 1;

            LWLockInitialize(BufferDescriptorGetContentLock(buf),
                             LWTRANCHE_BUFFER_CONTENT);

            LWLockInitialize(BufferDescriptorGetIOLock(buf),
                             LWTRANCHE_BUFFER_IO_IN_PROGRESS);
        }

        /* Correct last entry of linked list */
        GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
    }

    /* Init other shared buffer-management stuff */
    StrategyInitialize(!foundDescs);
}
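This older variant registers its LWLock tranches through the descriptor-based API: each tranche records where its lock array starts and how far apart consecutive locks are, so the lock manager can translate a lock address back into "the Nth lock of tranche X" for wait reporting. A sketch of the structure being filled in above, written from memory of the lwlock.h of that era, so field order and details may differ from the exact source.

/*
 * Sketch of the tranche descriptor filled in above (older LWLock API).
 * array_base and array_stride together describe the layout of the tranche's
 * lock array.
 */
typedef struct LWLockTranche
{
    const char *name;           /* e.g. "buffer_io", "buffer_content" */
    void       *array_base;     /* start of the array of locks */
    Size        array_stride;   /* distance between consecutive locks */
} LWLockTranche;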
/*
 * StrategyGetBuffer
 *
 *  Called by the bufmgr to get the next candidate buffer to use in
 *  BufferAlloc(). The only hard requirement BufferAlloc() has is that
 *  the selected buffer must not currently be pinned by anyone.
 *
 *  strategy is a BufferAccessStrategy object, or NULL for default strategy.
 *
 *  To ensure that no one else can pin the buffer before we do, we must
 *  return the buffer with the buffer header spinlock still held.
 */
volatile BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy)
{
    volatile BufferDesc *buf;
    int         bgwprocno;
    int         trycounter;

    /*
     * If given a strategy object, see whether it can select a buffer. We
     * assume strategy objects don't need buffer_strategy_lock.
     */
    if (strategy != NULL)
    {
        buf = GetBufferFromRing(strategy);
        if (buf != NULL)
            return buf;
    }

    /*
     * If asked, we need to waken the bgwriter. Since we don't want to rely
     * on a spinlock for this we force a read from shared memory once, and
     * then set the latch based on that value. We need to go through that
     * length because otherwise bgprocno might be reset while/after we check
     * because the compiler might just reread from memory.
     *
     * This can possibly set the latch of the wrong process if the bgwriter
     * dies in the wrong moment. But since PGPROC->procLatch is never
     * deallocated the worst consequence of that is that we set the latch of
     * some arbitrary process.
     */
    bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
    if (bgwprocno != -1)
    {
        /* reset bgwprocno first, before setting the latch */
        StrategyControl->bgwprocno = -1;

        /*
         * Not acquiring ProcArrayLock here which is slightly icky. It's
         * actually fine because procLatch isn't ever freed, so we just can
         * potentially set the wrong process' (or no process') latch.
         */
        SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
    }

    /*
     * We count buffer allocation requests so that the bgwriter can estimate
     * the rate of buffer consumption.  Note that buffers recycled by a
     * strategy object are intentionally not counted here.
     */
    pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);

    /*
     * First check, without acquiring the lock, whether there's buffers in
     * the freelist. Since we otherwise don't require the spinlock in every
     * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
     * uselessly in most cases. That obviously leaves a race where a buffer
     * is put on the freelist but we don't see the store yet - but that's
     * pretty harmless, it'll just get used during the next buffer
     * acquisition.
     *
     * If there's buffers on the freelist, acquire the spinlock to pop one
     * buffer of the freelist. Then check whether that buffer is usable and
     * repeat if not.
     *
     * Note that the freeNext fields are considered to be protected by the
     * buffer_strategy_lock not the individual buffer spinlocks, so it's OK
     * to manipulate them without holding the spinlock.
     */
    if (StrategyControl->firstFreeBuffer >= 0)
    {
        while (true)
        {
            /* Acquire the spinlock to remove element from the freelist */
            SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

            if (StrategyControl->firstFreeBuffer < 0)
            {
                SpinLockRelease(&StrategyControl->buffer_strategy_lock);
                break;
            }

            buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
            Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);

            /* Unconditionally remove buffer from freelist */
            StrategyControl->firstFreeBuffer = buf->freeNext;
            buf->freeNext = FREENEXT_NOT_IN_LIST;

            /*
             * Release the lock so someone else can access the freelist while
             * we check out this buffer.
             */
            SpinLockRelease(&StrategyControl->buffer_strategy_lock);

            /*
             * If the buffer is pinned or has a nonzero usage_count, we
             * cannot use it; discard it and retry.  (This can only happen if
             * VACUUM put a valid buffer in the freelist and then someone
             * else used it before we got to it. It's probably impossible
             * altogether as of 8.3, but we'd better check anyway.)
             */
            LockBufHdr(buf);
            if (buf->refcount == 0 && buf->usage_count == 0)
            {
                if (strategy != NULL)
                    AddBufferToRing(strategy, buf);
                return buf;
            }
            UnlockBufHdr(buf);
        }
    }

    /* Nothing on the freelist, so run the "clock sweep" algorithm */
    trycounter = NBuffers;
    for (;;)
    {
        buf = GetBufferDescriptor(ClockSweepTick());

        /*
         * If the buffer is pinned or has a nonzero usage_count, we cannot
         * use it; decrement the usage_count (unless pinned) and keep
         * scanning.
         */
        LockBufHdr(buf);
        if (buf->refcount == 0)
        {
            if (buf->usage_count > 0)
            {
                buf->usage_count--;
                trycounter = NBuffers;
            }
            else
            {
                /* Found a usable buffer */
                if (strategy != NULL)
                    AddBufferToRing(strategy, buf);
                return buf;
            }
        }
        else if (--trycounter == 0)
        {
            /*
             * We've scanned all the buffers without making any state
             * changes, so all the buffers are pinned (or were when we looked
             * at them). We could hope that someone will free one eventually,
             * but it's probably better to fail than to risk getting stuck in
             * an infinite loop.
             */
            UnlockBufHdr(buf);
            elog(ERROR, "no unpinned buffers available");
        }
        UnlockBufHdr(buf);
    }
}
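ClockSweepTick() above advances the shared clock hand and returns the id of the next buffer to examine. A deliberately simplified sketch of that step follows, assuming the same StrategyControl->nextVictimBuffer atomic counter; the real function in freelist.c also wraps the counter itself back into [0, NBuffers) and maintains the completePasses statistic for the bgwriter, both of which are omitted here.

/*
 * Simplified sketch of the clock-hand advance: atomically bump the shared
 * counter and map it onto a buffer id.  Not the actual ClockSweepTick();
 * wraparound of nextVictimBuffer and completePasses accounting are elided.
 */
static inline uint32
ClockSweepTickSketch(void)
{
    uint32      victim;

    victim = pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
    return victim % NBuffers;
}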
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
    FuncCallContext *funcctx;
    Datum       result;
    MemoryContext oldcontext;
    BufferCachePagesContext *fctx;      /* User function context. */
    TupleDesc   tupledesc;
    TupleDesc   expected_tupledesc;
    HeapTuple   tuple;

    if (SRF_IS_FIRSTCALL())
    {
        int         i;

        funcctx = SRF_FIRSTCALL_INIT();

        /* Switch context when allocating stuff to be used in later calls */
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

        /* Create a user function context for cross-call persistence */
        fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));

        /*
         * To smoothly support upgrades from version 1.0 of this extension
         * transparently handle the (non-)existence of the pinning_backends
         * column. We unfortunately have to get the result type for that...
         * - we can't use the result type determined by the function
         * definition without potentially crashing when somebody uses the old
         * (or even wrong) function definition though.
         */
        if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
            elog(ERROR, "return type must be a row type");

        if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
            expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
            elog(ERROR, "incorrect number of output arguments");

        /* Construct a tuple descriptor for the result rows. */
        tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts, false);
        TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
                           INT4OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
                           INT2OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
                           INT8OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
                           BOOLOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
                           INT2OID, -1, 0);

        if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
            TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
                               INT4OID, -1, 0);

        fctx->tupdesc = BlessTupleDesc(tupledesc);

        /* Allocate NBuffers worth of BufferCachePagesRec records. */
        fctx->record = (BufferCachePagesRec *)
            MemoryContextAllocHuge(CurrentMemoryContext,
                                   sizeof(BufferCachePagesRec) * NBuffers);

        /* Set max calls and remember the user function context. */
        funcctx->max_calls = NBuffers;
        funcctx->user_fctx = fctx;

        /* Return to original context when allocating transient memory */
        MemoryContextSwitchTo(oldcontext);

        /*
         * Scan through all the buffers, saving the relevant fields in the
         * fctx->record structure.
         *
         * We don't hold the partition locks, so we don't get a consistent
         * snapshot across all buffers, but we do grab the buffer header
         * locks, so the information of each buffer is self-consistent.
         */
        for (i = 0; i < NBuffers; i++)
        {
            BufferDesc *bufHdr;
            uint32      buf_state;

            bufHdr = GetBufferDescriptor(i);
            /* Lock each buffer header before inspecting. */
            buf_state = LockBufHdr(bufHdr);

            fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
            fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
            fctx->record[i].reltablespace = bufHdr->tag.rnode.spcNode;
            fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
            fctx->record[i].forknum = bufHdr->tag.forkNum;
            fctx->record[i].blocknum = bufHdr->tag.blockNum;
            fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
            fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);

            if (buf_state & BM_DIRTY)
                fctx->record[i].isdirty = true;
            else
                fctx->record[i].isdirty = false;

            /* Note if the buffer is valid, and has storage created */
            if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
                fctx->record[i].isvalid = true;
            else
                fctx->record[i].isvalid = false;

            UnlockBufHdr(bufHdr, buf_state);
        }
    }

    funcctx = SRF_PERCALL_SETUP();

    /* Get the saved state */
    fctx = funcctx->user_fctx;

    if (funcctx->call_cntr < funcctx->max_calls)
    {
        uint32      i = funcctx->call_cntr;
        Datum       values[NUM_BUFFERCACHE_PAGES_ELEM];
        bool        nulls[NUM_BUFFERCACHE_PAGES_ELEM];

        values[0] = Int32GetDatum(fctx->record[i].bufferid);
        nulls[0] = false;

        /*
         * Set all fields except the bufferid to null if the buffer is unused
         * or not valid.
         */
        if (fctx->record[i].blocknum == InvalidBlockNumber ||
            fctx->record[i].isvalid == false)
        {
            nulls[1] = true;
            nulls[2] = true;
            nulls[3] = true;
            nulls[4] = true;
            nulls[5] = true;
            nulls[6] = true;
            nulls[7] = true;
            /* unused for v1.0 callers, but the array is always long enough */
            nulls[8] = true;
        }
        else
        {
            values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode);
            nulls[1] = false;
            values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
            nulls[2] = false;
            values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
            nulls[3] = false;
            values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
            nulls[4] = false;
            values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
            nulls[5] = false;
            values[6] = BoolGetDatum(fctx->record[i].isdirty);
            nulls[6] = false;
            values[7] = Int16GetDatum(fctx->record[i].usagecount);
            nulls[7] = false;
            /* unused for v1.0 callers, but the array is always long enough */
            values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
            nulls[8] = false;
        }

        /* Build and return the tuple. */
        tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
        result = HeapTupleGetDatum(tuple);

        SRF_RETURN_NEXT(funcctx, result);
    }
    else
        SRF_RETURN_DONE(funcctx);
}
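pg_buffercache_pages() follows PostgreSQL's standard two-phase set-returning-function pattern: the first call snapshots every buffer header into an array allocated in the multi-call memory context, and each subsequent call emits one row from that array. Stripped of the buffer-specific work, the skeleton looks roughly like the sketch below; srf_skeleton and its placeholder values are illustrative only and return no real data.

/*
 * Minimal set-returning-function skeleton, using the same funcapi.h macros
 * as pg_buffercache_pages() above.  Illustrative only: per-query state and
 * tuple construction are elided.
 */
Datum
srf_skeleton(PG_FUNCTION_ARGS)
{
    FuncCallContext *funcctx;

    if (SRF_IS_FIRSTCALL())
    {
        MemoryContext oldcontext;

        funcctx = SRF_FIRSTCALL_INIT();

        /* allocate cross-call state in the multi-call memory context */
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
        funcctx->max_calls = 0;         /* number of rows this SRF will return */
        funcctx->user_fctx = NULL;      /* per-query state would be stashed here */
        MemoryContextSwitchTo(oldcontext);
    }

    funcctx = SRF_PERCALL_SETUP();

    if (funcctx->call_cntr < funcctx->max_calls)
        SRF_RETURN_NEXT(funcctx, (Datum) 0);    /* a real SRF returns a tuple */
    else
        SRF_RETURN_DONE(funcctx);
}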