/*
 * PinBuffer -- make buffer unavailable for replacement.
 *
 * This should be applied only to shared buffers, never local ones.
 * Bufmgr lock must be held by caller.
 */
void
PinBuffer(BufferDesc *buf)
{
    int         b = BufferDescriptorGetBuffer(buf) - 1;

    if (buf->refcount == 0)
    {
        IsInQueue(buf);

        /* remove from freelist queue */
        /********* BEGIN NEW CODE ************/
        if (buf->buf_id == HeadPosition)
        {
            /* removing the head buffer; advance the head position */
            HeadPosition = buf->freeNext;
        }
        /********* END NEW CODE ************/
        BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
        BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
        buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;

        /* mark buffer as no longer free */
        buf->flags &= ~BM_FREE;
    }
    else
        IsNotInQueue(buf);

    if (PrivateRefCount[b] == 0)
        buf->refcount++;
    PrivateRefCount[b]++;
    Assert(PrivateRefCount[b] > 0);
}
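As an aside, the list manipulation above is the standard unlink-from-a-circular-doubly-linked-list pattern, plus the one extra step the NEW CODE adds: when the node being removed is the current head, the head index must advance to its successor before the splice. A minimal standalone sketch; ListSlot, slots, head, and unlink_slot are hypothetical names, not part of the PostgreSQL source:

/*
 * Sketch of the unlink + head-update pattern (hypothetical names).
 * Like the PostgreSQL freelist with its sentinel descriptor, this
 * assumes the list is never left completely empty.
 */
#define INVALID_SLOT (-1)

typedef struct ListSlot
{
    int     next;               /* index of next slot in the circular list */
    int     prev;               /* index of previous slot */
} ListSlot;

static ListSlot slots[16];
static int  head = INVALID_SLOT;    /* plays the role of HeadPosition */

static void
unlink_slot(int id)
{
    /* If we are removing the head, the head moves to its successor. */
    if (id == head)
        head = slots[id].next;

    /* Splice the slot out of the circular chain. */
    slots[slots[id].next].prev = slots[id].prev;
    slots[slots[id].prev].next = slots[id].next;
    slots[id].next = slots[id].prev = INVALID_SLOT;
}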
/*
 * AddBufferToRing -- add a buffer to the buffer ring
 *
 * Caller must hold the buffer header spinlock on the buffer.  Since this
 * is called with the spinlock held, it had better be quite cheap.
 */
static void
AddBufferToRing(BufferAccessStrategy strategy, volatile BufferDesc *buf)
{
    buf->order = maxOrder() + 1;
    strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
    clean();
}
/*
 * AddBufferToFreelist
 *
 * In theory, this is the only routine that needs to be changed
 * if the buffer replacement strategy changes.  Just change
 * the manner in which buffers are added to the freelist queue.
 * Currently, they are added on an LRU basis.
 */
static void
AddBufferToFreelist(BufferDesc *bf)
{
#ifdef BMTRACE
    _bm_trace(bf->tag.relId.dbId, bf->tag.relId.relId, bf->tag.blockNum,
              BufferDescriptorGetBuffer(bf), BMT_DEALLOC);
#endif                          /* BMTRACE */
    IsNotInQueue(bf);

    /* change bf so it points to inFrontOfNew and its successor */
    /********* BEGIN OLD CODE ***************/
    /* bf->freePrev = SharedFreeList->freePrev; */
    /* bf->freeNext = Free_List_Descriptor; */
    /********* END OLD CODE *****************/

    /********* BEGIN NEW CODE ************/
    bf->freePrev = BufferDescriptors[HeadPosition].freePrev;
    bf->freeNext = HeadPosition;
    HeadPosition = bf->buf_id;  /* update the head position */
    /********* END NEW CODE *************/

    /* insert new into chain */
    BufferDescriptors[bf->freeNext].freePrev = bf->buf_id;
    BufferDescriptors[bf->freePrev].freeNext = bf->buf_id;
}
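For contrast: the OLD CODE spliced the freed buffer in just before the sentinel descriptor, a tail insert, so with victims taken from the front the least recently freed buffer is reclaimed first (LRU). The NEW CODE performs the same splice but then moves HeadPosition to the new buffer, which appears to make the most recently freed buffer the next reclaim candidate, an MRU-style policy. Continuing the toy list from the sketch after PinBuffer above (same hypothetical slots/head declarations):

/*
 * Toy contrast of the two insertion policies (hypothetical names);
 * victims are assumed to be taken from 'head'.
 */
static void
insert_tail(int id)             /* OLD CODE: least recently freed reused first */
{
    slots[id].prev = slots[head].prev;  /* old tail */
    slots[id].next = head;
    slots[slots[id].next].prev = id;
    slots[slots[id].prev].next = id;
}

static void
insert_head(int id)             /* NEW CODE: most recently freed reused first */
{
    insert_tail(id);            /* same splice before the head... */
    head = id;                  /* ...but the new node becomes the head */
}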
/*
 * UnpinBuffer -- make buffer available for replacement.
 *
 * This should be applied only to shared buffers, never local ones.
 * Bufmgr lock must be held by caller.
 */
void
UnpinBuffer(BufferDesc *buf)
{
    int         b = BufferDescriptorGetBuffer(buf) - 1;

    IsNotInQueue(buf);
    Assert(buf->refcount > 0);
    Assert(PrivateRefCount[b] > 0);
    PrivateRefCount[b]--;
    if (PrivateRefCount[b] == 0)
        buf->refcount--;

    if (buf->refcount == 0)
    {
        /* buffer is now unpinned */
        AddBufferToFreelist(buf);
        buf->flags |= BM_FREE;
    }
    else if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
             buf->refcount == 1)
    {
        /* we just released the last pin other than the waiter's */
        buf->flags &= ~BM_PIN_COUNT_WAITER;
        ProcSendSignal(buf->wait_backend_id);
    }
    else
    {
        /* do nothing */
    }
}
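The two counters deserve a note: the shared refcount counts backends holding at least one pin, while PrivateRefCount[b] counts this backend's own pins, so the shared state is touched only on a backend's first pin and last unpin. A self-contained sketch of that invariant, with hypothetical standalone names:

/*
 * Sketch of the two-level pin accounting used by PinBuffer/UnpinBuffer
 * (hypothetical names): 'shared_refcount' stands in for buf->refcount,
 * 'my_pins' for this backend's PrivateRefCount entry.
 */
static int  shared_refcount;    /* shared: # of backends pinning the buffer */
static int  my_pins;            /* backend-local: # of pins I hold */

static void
pin(void)
{
    if (my_pins == 0)
        shared_refcount++;      /* first pin by this backend */
    my_pins++;
}

static void
unpin(void)
{
    my_pins--;
    if (my_pins == 0)
        shared_refcount--;      /* last pin by this backend released */
    /* when shared_refcount hits 0, the buffer may become a victim */
}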
void
UnpinBuffer_Debug(char *file, int line, BufferDesc *buf)
{
    UnpinBuffer(buf);
    if (ShowPinTrace)
    {
        Buffer      buffer = BufferDescriptorGetBuffer(buf);

        /*
         * Use string-literal concatenation rather than a backslash
         * continuation, which would embed the next line's leading
         * whitespace in the format string.
         */
        fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, "
                "refcount = %ld, file: %s, line: %d\n",
                buffer, buf->blind.relname, buf->tag.blockNum,
                PrivateRefCount[buffer - 1], file, line);
    }
}
/*
 * StrategyRejectBuffer -- consider rejecting a dirty buffer
 *
 * When a nondefault strategy is used, the buffer manager calls this function
 * when it turns out that the buffer selected by StrategyGetBuffer needs to
 * be written out and doing so would require flushing WAL too.  This gives us
 * a chance to choose a different victim.
 *
 * Returns true if buffer manager should ask for a new victim, and false
 * if this buffer should be written and re-used.
 */
bool
StrategyRejectBuffer(BufferAccessStrategy strategy, volatile BufferDesc *buf)
{
    /* We only do this in bulkread mode */
    if (strategy->btype != BAS_BULKREAD)
        return false;

    /* Don't muck with behavior of normal buffer-replacement strategy */
    if (!strategy->current_was_in_ring ||
        strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
        return false;

    /*
     * Remove the dirty buffer from the ring; necessary to prevent infinite
     * loop if all ring members are dirty.
     */
    strategy->buffers[strategy->current] = InvalidBuffer;

    return true;
}
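A simplified sketch of the calling pattern this function is designed for; the helper names below are hypothetical stand-ins, not the actual bufmgr.c code:

/*
 * Sketch of the expected caller loop (hypothetical helpers):
 * strategy_get_buffer() stands in for the real StrategyGetBuffer, and
 * needs_wal_flush() for the real check that the victim is dirty and
 * writing it out would first require flushing WAL.
 */
static volatile BufferDesc *
choose_victim(BufferAccessStrategy strategy)
{
    for (;;)
    {
        volatile BufferDesc *buf = strategy_get_buffer(strategy);

        if (needs_wal_flush(buf) && StrategyRejectBuffer(strategy, buf))
            continue;           /* the ring dropped this buffer; retry */

        return buf;             /* caller writes it out (if dirty) and reuses it */
    }
}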
/*
 * AddBufferToRing -- add a buffer to the buffer ring
 *
 * Caller must hold the buffer header spinlock on the buffer.  Since this
 * is called with the spinlock held, it had better be quite cheap.
 */
static void
AddBufferToRing(BufferAccessStrategy strategy, volatile BufferDesc *buf)
{
    strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
}
/*
 * LocalBufferAlloc -
 *    Find or create a local buffer for the given page of the given relation.
 *
 * API is similar to bufmgr.c's BufferAlloc, except that we do not need
 * to do any locking since this is all local.  Also, IO_IN_PROGRESS
 * does not get set.  Lastly, we support only default access strategy
 * (hence, usage_count is always advanced).
 */
BufferDesc *
LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
                 bool *foundPtr)
{
    BufferTag   newTag;         /* identity of requested block */
    LocalBufferLookupEnt *hresult;
    BufferDesc *bufHdr;
    int         b;
    int         trycounter;
    bool        found;

    INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);

    /* Initialize local buffers if first request in this session */
    if (LocalBufHash == NULL)
        InitLocalBuffers();

    /* See if the desired buffer already exists */
    hresult = (LocalBufferLookupEnt *)
        hash_search(LocalBufHash, (void *) &newTag, HASH_FIND, NULL);

    if (hresult)
    {
        b = hresult->id;
        bufHdr = &LocalBufferDescriptors[b];
        Assert(BUFFERTAGS_EQUAL(bufHdr->tag, newTag));
#ifdef LBDEBUG
        fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
                smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1);
#endif
        /* this part is equivalent to PinBuffer for a shared buffer */
        if (LocalRefCount[b] == 0)
        {
            if (bufHdr->usage_count < BM_MAX_USAGE_COUNT)
                bufHdr->usage_count++;
        }
        LocalRefCount[b]++;
        ResourceOwnerRememberBuffer(CurrentResourceOwner,
                                    BufferDescriptorGetBuffer(bufHdr));
        if (bufHdr->flags & BM_VALID)
            *foundPtr = TRUE;
        else
        {
            /* Previous read attempt must have failed; try again */
            *foundPtr = FALSE;
        }
        return bufHdr;
    }

#ifdef LBDEBUG
    fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n",
            smgr->smgr_rnode.node.relNode, forkNum, blockNum,
            -nextFreeLocalBuf - 1);
#endif

    /*
     * Need to get a new buffer.  We use a clock sweep algorithm (essentially
     * the same as what freelist.c does now...)
     */
    trycounter = NLocBuffer;
    for (;;)
    {
        b = nextFreeLocalBuf;

        if (++nextFreeLocalBuf >= NLocBuffer)
            nextFreeLocalBuf = 0;

        bufHdr = &LocalBufferDescriptors[b];

        if (LocalRefCount[b] == 0)
        {
            if (bufHdr->usage_count > 0)
            {
                bufHdr->usage_count--;
                trycounter = NLocBuffer;
            }
            else
            {
                /* Found a usable buffer */
                LocalRefCount[b]++;
                ResourceOwnerRememberBuffer(CurrentResourceOwner,
                                            BufferDescriptorGetBuffer(bufHdr));
                break;
            }
        }
        else if (--trycounter == 0)
            ereport(ERROR,
                    (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
                     errmsg("no empty local buffer available")));
    }

    /*
     * this buffer is not referenced but it might still be dirty. if that's
     * the case, write it out before reusing it!
     */
    if (bufHdr->flags & BM_DIRTY)
    {
        SMgrRelation oreln;

        /* Find smgr relation for buffer */
        oreln = smgropen(bufHdr->tag.rnode, MyBackendId);

        /* And write... */
        smgrwrite(oreln,
                  bufHdr->tag.forkNum,
                  bufHdr->tag.blockNum,
                  (char *) LocalBufHdrGetBlock(bufHdr),
                  false);

        /* Mark not-dirty now in case we error out below */
        bufHdr->flags &= ~BM_DIRTY;

        pgBufferUsage.local_blks_written++;
    }

    /*
     * lazy memory allocation: allocate space on first use of a buffer.
     */
    if (LocalBufHdrGetBlock(bufHdr) == NULL)
    {
        /* Set pointer for use by BufferGetBlock() macro */
        LocalBufHdrGetBlock(bufHdr) = GetLocalBufferStorage();
    }

    /*
     * Update the hash table: remove old entry, if any, and make new one.
     */
    if (bufHdr->flags & BM_TAG_VALID)
    {
        hresult = (LocalBufferLookupEnt *)
            hash_search(LocalBufHash, (void *) &bufHdr->tag,
                        HASH_REMOVE, NULL);
        if (!hresult)           /* shouldn't happen */
            elog(ERROR, "local buffer hash table corrupted");
        /* mark buffer invalid just in case hash insert fails */
        CLEAR_BUFFERTAG(bufHdr->tag);
        bufHdr->flags &= ~(BM_VALID | BM_TAG_VALID);
    }

    hresult = (LocalBufferLookupEnt *)
        hash_search(LocalBufHash, (void *) &newTag, HASH_ENTER, &found);
    if (found)                  /* shouldn't happen */
        elog(ERROR, "local buffer hash table corrupted");
    hresult->id = b;

    /*
     * it's all ours now.
     */
    bufHdr->tag = newTag;
    bufHdr->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR);
    bufHdr->flags |= BM_TAG_VALID;
    bufHdr->usage_count = 1;

    *foundPtr = FALSE;
    return bufHdr;
}
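The victim-selection loop above is a textbook clock sweep. A generic, self-contained version of the same algorithm follows; all names (N_BUFFERS, ref_counts, usage_counts, hand) are hypothetical, simplified from the code above:

/*
 * Generic clock-sweep sketch (hypothetical names).  Sweep circularly,
 * decrementing usage counts; a buffer is a victim when it is unpinned
 * and its usage count has reached zero.
 */
#define N_BUFFERS 1024

static int  ref_counts[N_BUFFERS];      /* pins held on each buffer */
static int  usage_counts[N_BUFFERS];    /* recency heuristic per buffer */

static int
clock_sweep(void)
{
    static int  hand = 0;       /* clock hand: next buffer to examine */
    int         tries = N_BUFFERS;

    for (;;)
    {
        int         b = hand;

        if (++hand >= N_BUFFERS)
            hand = 0;

        if (ref_counts[b] == 0)
        {
            if (usage_counts[b] > 0)
            {
                usage_counts[b]--;  /* recently used: give it another lap */
                tries = N_BUFFERS;  /* progress was made; reset the limit */
            }
            else
                return b;       /* unpinned and cold: evict this one */
        }
        else if (--tries == 0)
            return -1;          /* every buffer is pinned; give up */
    }
}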
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
    FuncCallContext *funcctx;
    Datum       result;
    MemoryContext oldcontext;
    BufferCachePagesContext *fctx;  /* User function context. */
    TupleDesc   tupledesc;
    HeapTuple   tuple;

    if (SRF_IS_FIRSTCALL())
    {
        int         i;
        volatile BufferDesc *bufHdr;

        funcctx = SRF_FIRSTCALL_INIT();

        /* Switch context when allocating stuff to be used in later calls */
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

        /* Create a user function context for cross-call persistence */
        fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));

        /* Construct a tuple descriptor for the result rows. */
        tupledesc = CreateTemplateTupleDesc(NUM_BUFFERCACHE_PAGES_ELEM, false);
        TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
                           INT4OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
                           INT2OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
                           INT8OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
                           BOOLOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
                           INT2OID, -1, 0);

        fctx->tupdesc = BlessTupleDesc(tupledesc);

        /* Allocate NBuffers worth of BufferCachePagesRec records. */
        fctx->record = (BufferCachePagesRec *)
            palloc(sizeof(BufferCachePagesRec) * NBuffers);

        /* Set max calls and remember the user function context. */
        funcctx->max_calls = NBuffers;
        funcctx->user_fctx = fctx;

        /* Return to original context when allocating transient memory */
        MemoryContextSwitchTo(oldcontext);

        /*
         * To get a consistent picture of the buffer state, we must lock all
         * partitions of the buffer map.  Needless to say, this is horrible
         * for concurrency.  Must grab locks in increasing order to avoid
         * possible deadlocks.
         */
        for (i = 0; i < NUM_BUFFER_PARTITIONS; i++)
            LWLockAcquire(FirstBufMappingLock + i, LW_SHARED);

        /*
         * Scan through all the buffers, saving the relevant fields in the
         * fctx->record structure.
         */
        for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++)
        {
            /* Lock each buffer header before inspecting. */
            LockBufHdr(bufHdr);

            fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
            fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
            fctx->record[i].reltablespace = bufHdr->tag.rnode.spcNode;
            fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
            fctx->record[i].forknum = bufHdr->tag.forkNum;
            fctx->record[i].blocknum = bufHdr->tag.blockNum;
            fctx->record[i].usagecount = bufHdr->usage_count;

            if (bufHdr->flags & BM_DIRTY)
                fctx->record[i].isdirty = true;
            else
                fctx->record[i].isdirty = false;

            /* Note if the buffer is valid, and has storage created */
            if ((bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
                fctx->record[i].isvalid = true;
            else
                fctx->record[i].isvalid = false;

            UnlockBufHdr(bufHdr);
        }

        /*
         * And release locks.  We do this in reverse order for two reasons:
         * (1) Anyone else who needs more than one of the locks will be
         * trying to lock them in increasing order; we don't want to release
         * the other process until it can get all the locks it needs.
         * (2) This avoids O(N^2) behavior inside LWLockRelease.
         */
        for (i = NUM_BUFFER_PARTITIONS; --i >= 0;)
            LWLockRelease(FirstBufMappingLock + i);
    }

    funcctx = SRF_PERCALL_SETUP();

    /* Get the saved state */
    fctx = funcctx->user_fctx;

    if (funcctx->call_cntr < funcctx->max_calls)
    {
        uint32      i = funcctx->call_cntr;
        Datum       values[NUM_BUFFERCACHE_PAGES_ELEM];
        bool        nulls[NUM_BUFFERCACHE_PAGES_ELEM];

        values[0] = Int32GetDatum(fctx->record[i].bufferid);
        nulls[0] = false;

        /*
         * Set all fields except the bufferid to null if the buffer is unused
         * or not valid.
         */
        if (fctx->record[i].blocknum == InvalidBlockNumber ||
            fctx->record[i].isvalid == false)
        {
            nulls[1] = true;
            nulls[2] = true;
            nulls[3] = true;
            nulls[4] = true;
            nulls[5] = true;
            nulls[6] = true;
            nulls[7] = true;
        }
        else
        {
            values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode);
            nulls[1] = false;
            values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
            nulls[2] = false;
            values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
            nulls[3] = false;
            values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
            nulls[4] = false;
            values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
            nulls[5] = false;
            values[6] = BoolGetDatum(fctx->record[i].isdirty);
            nulls[6] = false;
            values[7] = Int16GetDatum(fctx->record[i].usagecount);
            nulls[7] = false;
        }

        /* Build and return the tuple. */
        tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
        result = HeapTupleGetDatum(tuple);

        SRF_RETURN_NEXT(funcctx, result);
    }
    else
        SRF_RETURN_DONE(funcctx);
}
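The lock choreography in that function, acquiring all partition locks in increasing order and releasing in reverse, is a general deadlock-avoidance idiom: if every process takes the locks in the same global order, no cycle of waiters can form. A minimal standalone sketch using POSIX mutexes; the names are hypothetical and the mutexes are assumed to be initialized elsewhere with pthread_mutex_init:

#include <pthread.h>

#define NUM_PARTITIONS 16

/* Hypothetical stand-ins for the buffer-mapping partition locks. */
static pthread_mutex_t partition_locks[NUM_PARTITIONS];

static void
scan_all_partitions(void)
{
    int         i;

    /* Acquire in increasing order: no waiter cycle can form. */
    for (i = 0; i < NUM_PARTITIONS; i++)
        pthread_mutex_lock(&partition_locks[i]);

    /* ... take a consistent snapshot of the protected state ... */

    /* Release in reverse order, for the reasons given in the comment above. */
    for (i = NUM_PARTITIONS; --i >= 0;)
        pthread_mutex_unlock(&partition_locks[i]);
}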
Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
    FuncCallContext *funcctx;
    Datum       result;
    MemoryContext oldcontext;
    BufferCachePagesContext *fctx;  /* User function context. */
    TupleDesc   tupledesc;
    TupleDesc   expected_tupledesc;
    HeapTuple   tuple;

    if (SRF_IS_FIRSTCALL())
    {
        int         i;

        funcctx = SRF_FIRSTCALL_INIT();

        /* Switch context when allocating stuff to be used in later calls */
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

        /* Create a user function context for cross-call persistence */
        fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));

        /*
         * To smoothly support upgrades from version 1.0 of this extension,
         * transparently handle the (non-)existence of the pinning_backends
         * column.  We unfortunately have to get the result type for that;
         * we can't use the result type determined by the function definition
         * without potentially crashing when somebody uses the old (or even
         * wrong) function definition though.
         */
        if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
            elog(ERROR, "return type must be a row type");

        if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
            expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
            elog(ERROR, "incorrect number of output arguments");

        /* Construct a tuple descriptor for the result rows. */
        tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts, false);
        TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
                           INT4OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
                           INT2OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
                           INT8OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
                           BOOLOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
                           INT2OID, -1, 0);

        if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
            TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
                               INT4OID, -1, 0);

        fctx->tupdesc = BlessTupleDesc(tupledesc);

        /* Allocate NBuffers worth of BufferCachePagesRec records. */
        fctx->record = (BufferCachePagesRec *)
            MemoryContextAllocHuge(CurrentMemoryContext,
                                   sizeof(BufferCachePagesRec) * NBuffers);

        /* Set max calls and remember the user function context. */
        funcctx->max_calls = NBuffers;
        funcctx->user_fctx = fctx;

        /* Return to original context when allocating transient memory */
        MemoryContextSwitchTo(oldcontext);

        /*
         * Scan through all the buffers, saving the relevant fields in the
         * fctx->record structure.
         *
         * We don't hold the partition locks, so we don't get a consistent
         * snapshot across all buffers, but we do grab the buffer header
         * locks, so the information of each buffer is self-consistent.
         */
        for (i = 0; i < NBuffers; i++)
        {
            BufferDesc *bufHdr;
            uint32      buf_state;

            bufHdr = GetBufferDescriptor(i);
            /* Lock each buffer header before inspecting. */
            buf_state = LockBufHdr(bufHdr);

            fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
            fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode;
            fctx->record[i].reltablespace = bufHdr->tag.rnode.spcNode;
            fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode;
            fctx->record[i].forknum = bufHdr->tag.forkNum;
            fctx->record[i].blocknum = bufHdr->tag.blockNum;
            fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
            fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);

            if (buf_state & BM_DIRTY)
                fctx->record[i].isdirty = true;
            else
                fctx->record[i].isdirty = false;

            /* Note if the buffer is valid, and has storage created */
            if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
                fctx->record[i].isvalid = true;
            else
                fctx->record[i].isvalid = false;

            UnlockBufHdr(bufHdr, buf_state);
        }
    }

    funcctx = SRF_PERCALL_SETUP();

    /* Get the saved state */
    fctx = funcctx->user_fctx;

    if (funcctx->call_cntr < funcctx->max_calls)
    {
        uint32      i = funcctx->call_cntr;
        Datum       values[NUM_BUFFERCACHE_PAGES_ELEM];
        bool        nulls[NUM_BUFFERCACHE_PAGES_ELEM];

        values[0] = Int32GetDatum(fctx->record[i].bufferid);
        nulls[0] = false;

        /*
         * Set all fields except the bufferid to null if the buffer is unused
         * or not valid.
         */
        if (fctx->record[i].blocknum == InvalidBlockNumber ||
            fctx->record[i].isvalid == false)
        {
            nulls[1] = true;
            nulls[2] = true;
            nulls[3] = true;
            nulls[4] = true;
            nulls[5] = true;
            nulls[6] = true;
            nulls[7] = true;
            /* unused for v1.0 callers, but the array is always long enough */
            nulls[8] = true;
        }
        else
        {
            values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode);
            nulls[1] = false;
            values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
            nulls[2] = false;
            values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
            nulls[3] = false;
            values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
            nulls[4] = false;
            values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
            nulls[5] = false;
            values[6] = BoolGetDatum(fctx->record[i].isdirty);
            nulls[6] = false;
            values[7] = Int16GetDatum(fctx->record[i].usagecount);
            nulls[7] = false;
            /* unused for v1.0 callers, but the array is always long enough */
            values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
            nulls[8] = false;
        }

        /* Build and return the tuple. */
        tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
        result = HeapTupleGetDatum(tuple);

        SRF_RETURN_NEXT(funcctx, result);
    }
    else
        SRF_RETURN_DONE(funcctx);
}