Datum pg_buffercache_pages(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; Datum result; MemoryContext oldcontext; BufferCachePagesContext *fctx; /* User function context. */ TupleDesc tupledesc; TupleDesc expected_tupledesc; HeapTuple tuple; if (SRF_IS_FIRSTCALL()) { int i; funcctx = SRF_FIRSTCALL_INIT(); /* Switch context when allocating stuff to be used in later calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* Create a user function context for cross-call persistence */ fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext)); /* * To smoothly support upgrades from version 1.0 of this extension * transparently handle the (non-)existence of the pinning_backends * column. We unfortunately have to get the result type for that... - * we can't use the result type determined by the function definition * without potentially crashing when somebody uses the old (or even * wrong) function definition though. */ if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM || expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM) elog(ERROR, "incorrect number of output arguments"); /* Construct a tuple descriptor for the result rows. */ tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts, false); TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", INT4OID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", OIDOID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", OIDOID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", OIDOID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber", INT2OID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber", INT8OID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty", BOOLOID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count", INT2OID, -1, 0); if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends", INT4OID, -1, 0); fctx->tupdesc = BlessTupleDesc(tupledesc); /* Allocate NBuffers worth of BufferCachePagesRec records. */ fctx->record = (BufferCachePagesRec *) MemoryContextAllocHuge(CurrentMemoryContext, sizeof(BufferCachePagesRec) * NBuffers); /* Set max calls and remember the user function context. */ funcctx->max_calls = NBuffers; funcctx->user_fctx = fctx; /* Return to original context when allocating transient memory */ MemoryContextSwitchTo(oldcontext); /* * Scan through all the buffers, saving the relevant fields in the * fctx->record structure. * * We don't hold the partition locks, so we don't get a consistent * snapshot across all buffers, but we do grab the buffer header * locks, so the information of each buffer is self-consistent. */ for (i = 0; i < NBuffers; i++) { BufferDesc *bufHdr; uint32 buf_state; bufHdr = GetBufferDescriptor(i); /* Lock each buffer header before inspecting. */ buf_state = LockBufHdr(bufHdr); fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr); fctx->record[i].relfilenode = bufHdr->tag.rnode.relNode; fctx->record[i].reltablespace = bufHdr->tag.rnode.spcNode; fctx->record[i].reldatabase = bufHdr->tag.rnode.dbNode; fctx->record[i].forknum = bufHdr->tag.forkNum; fctx->record[i].blocknum = bufHdr->tag.blockNum; fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state); fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state); if (buf_state & BM_DIRTY) fctx->record[i].isdirty = true; else fctx->record[i].isdirty = false; /* Note if the buffer is valid, and has storage created */ if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) fctx->record[i].isvalid = true; else fctx->record[i].isvalid = false; UnlockBufHdr(bufHdr, buf_state); } } funcctx = SRF_PERCALL_SETUP(); /* Get the saved state */ fctx = funcctx->user_fctx; if (funcctx->call_cntr < funcctx->max_calls) { uint32 i = funcctx->call_cntr; Datum values[NUM_BUFFERCACHE_PAGES_ELEM]; bool nulls[NUM_BUFFERCACHE_PAGES_ELEM]; values[0] = Int32GetDatum(fctx->record[i].bufferid); nulls[0] = false; /* * Set all fields except the bufferid to null if the buffer is unused * or not valid. */ if (fctx->record[i].blocknum == InvalidBlockNumber || fctx->record[i].isvalid == false) { nulls[1] = true; nulls[2] = true; nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; nulls[7] = true; /* unused for v1.0 callers, but the array is always long enough */ nulls[8] = true; } else { values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode); nulls[1] = false; values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace); nulls[2] = false; values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase); nulls[3] = false; values[4] = ObjectIdGetDatum(fctx->record[i].forknum); nulls[4] = false; values[5] = Int64GetDatum((int64) fctx->record[i].blocknum); nulls[5] = false; values[6] = BoolGetDatum(fctx->record[i].isdirty); nulls[6] = false; values[7] = Int16GetDatum(fctx->record[i].usagecount); nulls[7] = false; /* unused for v1.0 callers, but the array is always long enough */ values[8] = Int32GetDatum(fctx->record[i].pinning_backends); nulls[8] = false; } /* Build and return the tuple. */ tuple = heap_form_tuple(fctx->tupdesc, values, nulls); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } else SRF_RETURN_DONE(funcctx); }
/* * LocalBufferAlloc - * Find or create a local buffer for the given page of the given relation. * * API is similar to bufmgr.c's BufferAlloc, except that we do not need * to do any locking since this is all local. Also, IO_IN_PROGRESS * does not get set. Lastly, we support only default access strategy * (hence, usage_count is always advanced). */ BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr) { BufferTag newTag; /* identity of requested block */ LocalBufferLookupEnt *hresult; BufferDesc *bufHdr; int b; int trycounter; bool found; uint32 buf_state; INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum); /* Initialize local buffers if first request in this session */ if (LocalBufHash == NULL) InitLocalBuffers(); /* See if the desired buffer already exists */ hresult = (LocalBufferLookupEnt *) hash_search(LocalBufHash, (void *) &newTag, HASH_FIND, NULL); if (hresult) { b = hresult->id; bufHdr = GetLocalBufferDescriptor(b); Assert(BUFFERTAGS_EQUAL(bufHdr->tag, newTag)); #ifdef LBDEBUG fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n", smgr->smgr_rnode.node.relNode, forkNum, blockNum, -b - 1); #endif buf_state = pg_atomic_read_u32(&bufHdr->state); /* this part is equivalent to PinBuffer for a shared buffer */ if (LocalRefCount[b] == 0) { if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT) { buf_state += BUF_USAGECOUNT_ONE; pg_atomic_write_u32(&bufHdr->state, buf_state); } } LocalRefCount[b]++; ResourceOwnerRememberBuffer(CurrentResourceOwner, BufferDescriptorGetBuffer(bufHdr)); if (buf_state & BM_VALID) *foundPtr = TRUE; else { /* Previous read attempt must have failed; try again */ *foundPtr = FALSE; } return bufHdr; } #ifdef LBDEBUG fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n", smgr->smgr_rnode.node.relNode, forkNum, blockNum, -nextFreeLocalBuf - 1); #endif /* * Need to get a new buffer. We use a clock sweep algorithm (essentially * the same as what freelist.c does now...) */ trycounter = NLocBuffer; for (;;) { b = nextFreeLocalBuf; if (++nextFreeLocalBuf >= NLocBuffer) nextFreeLocalBuf = 0; bufHdr = GetLocalBufferDescriptor(b); if (LocalRefCount[b] == 0) { buf_state = pg_atomic_read_u32(&bufHdr->state); if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0) { buf_state -= BUF_USAGECOUNT_ONE; pg_atomic_write_u32(&bufHdr->state, buf_state); trycounter = NLocBuffer; } else { /* Found a usable buffer */ LocalRefCount[b]++; ResourceOwnerRememberBuffer(CurrentResourceOwner, BufferDescriptorGetBuffer(bufHdr)); break; } } else if (--trycounter == 0) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("no empty local buffer available"))); } /* * this buffer is not referenced but it might still be dirty. if that's * the case, write it out before reusing it! */ if (buf_state & BM_DIRTY) { SMgrRelation oreln; Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); /* Find smgr relation for buffer */ oreln = smgropen(bufHdr->tag.rnode, MyBackendId); PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); /* And write... */ smgrwrite(oreln, bufHdr->tag.forkNum, bufHdr->tag.blockNum, localpage, false); /* Mark not-dirty now in case we error out below */ buf_state &= ~BM_DIRTY; pg_atomic_write_u32(&bufHdr->state, buf_state); pgBufferUsage.local_blks_written++; } /* * lazy memory allocation: allocate space on first use of a buffer. */ if (LocalBufHdrGetBlock(bufHdr) == NULL) { /* Set pointer for use by BufferGetBlock() macro */ LocalBufHdrGetBlock(bufHdr) = GetLocalBufferStorage(); } /* * Update the hash table: remove old entry, if any, and make new one. */ if (buf_state & BM_TAG_VALID) { hresult = (LocalBufferLookupEnt *) hash_search(LocalBufHash, (void *) &bufHdr->tag, HASH_REMOVE, NULL); if (!hresult) /* shouldn't happen */ elog(ERROR, "local buffer hash table corrupted"); /* mark buffer invalid just in case hash insert fails */ CLEAR_BUFFERTAG(bufHdr->tag); buf_state &= ~(BM_VALID | BM_TAG_VALID); pg_atomic_write_u32(&bufHdr->state, buf_state); } hresult = (LocalBufferLookupEnt *) hash_search(LocalBufHash, (void *) &newTag, HASH_ENTER, &found); if (found) /* shouldn't happen */ elog(ERROR, "local buffer hash table corrupted"); hresult->id = b; /* * it's all ours now. */ bufHdr->tag = newTag; buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_IO_ERROR); buf_state |= BM_TAG_VALID; buf_state &= ~BUF_USAGECOUNT_MASK; buf_state += BUF_USAGECOUNT_ONE; pg_atomic_write_u32(&bufHdr->state, buf_state); *foundPtr = FALSE; return bufHdr; }