StringInfo rest_call_with_lock(char *method, char *url, char *params, StringInfo postData, int64 mutex, bool shared, bool allowCancel) { CURL *curl; struct curl_slist *headers = NULL; char *errorbuff; StringInfo response = makeStringInfo(); CURLcode ret; int64 response_code; errorbuff = (char *) palloc0(CURL_ERROR_SIZE); curl = curl_easy_init(); if (curl) { headers = curl_slist_append(headers, "Transfer-Encoding:"); headers = curl_slist_append(headers, "Expect:"); curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); curl_easy_setopt(curl, CURLOPT_FORBID_REUSE, 0L); /* allow connections to be reused */ if (allowCancel) { curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0); /* we want progress ... */ curl_easy_setopt(curl, CURLOPT_PROGRESSFUNCTION, curl_progress_func); /* to go here so we can detect a ^C within postgres */ } curl_easy_setopt(curl, CURLOPT_USERAGENT, "zombodb for PostgreSQL"); curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 0); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write_func); curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0); curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, errorbuff); curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); curl_easy_setopt(curl, CURLOPT_TIMEOUT, 60 * 60L); /* timeout of 60 minutes */ curl_easy_setopt(curl, CURLOPT_URL, url); curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, method); curl_easy_setopt(curl, CURLOPT_WRITEDATA, response); curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, postData ? postData->len : 0); curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postData ? postData->data : NULL); curl_easy_setopt(curl, CURLOPT_POST, (strcmp(method, "POST") == 0) || (strcmp(method, "GET") != 0 && postData && postData->data) ? 1 : 0); } else { elog(IsTransactionState() ? ERROR : WARNING, "Unable to initialize libcurl"); } // if (mutex != 0) // { // if (shared) DirectFunctionCall1(pg_advisory_lock_shared_int8, Int64GetDatum(mutex)); // else DirectFunctionCall1(pg_advisory_lock_int8, Int64GetDatum(mutex)); // } ret = curl_easy_perform(curl); // if (mutex != 0) // { // if (shared) DirectFunctionCall1(pg_advisory_unlock_shared_int8, Int64GetDatum(mutex)); // else DirectFunctionCall1(pg_advisory_unlock_int8, Int64GetDatum(mutex)); // } if (allowCancel && IsTransactionState() && InterruptPending) { /* we might have detected one in the progress function, so check for sure */ CHECK_FOR_INTERRUPTS(); } if (ret != 0) { /* curl messed up */ elog(IsTransactionState() ? ERROR : WARNING, "libcurl error-code: %s(%d); message: %s; req=-X%s %s ", curl_easy_strerror(ret), ret, errorbuff, method, url); } curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code); if (response_code < 200 || (response_code >=300 && response_code != 404)) { text *errorText = DatumGetTextP(DirectFunctionCall2(json_object_field_text, CStringGetTextDatum(response->data), CStringGetTextDatum("error"))); elog(IsTransactionState() ? ERROR : WARNING, "rc=%ld; %s", response_code, errorText != NULL ? TextDatumGetCString(errorText) : response->data); } if (headers) curl_slist_free_all(headers); curl_easy_cleanup(curl); pfree(errorbuff); return response; }
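/*
 * For context, a minimal sketch of the two callbacks installed above.  These
 * are assumptions about their shape, not the actual ZomboDB source:
 * curl_write_func is assumed to append response bytes to the StringInfo
 * passed via CURLOPT_WRITEDATA, and curl_progress_func is assumed to return
 * non-zero once a cancel interrupt is pending, which makes
 * curl_easy_perform() fail with CURLE_ABORTED_BY_CALLBACK so the ^C can be
 * serviced back in the caller.
 */
#include "postgres.h"
#include "lib/stringinfo.h"
#include "miscadmin.h"
#include <curl/curl.h>

static size_t
curl_write_func(char *ptr, size_t size, size_t nmemb, void *userdata)
{
	StringInfo	response = (StringInfo) userdata;

	appendBinaryStringInfo(response, ptr, size * nmemb);
	return size * nmemb;		/* returning anything less aborts the transfer */
}

static int
curl_progress_func(void *clientp, double dltotal, double dlnow,
				   double ultotal, double ulnow)
{
	/* Non-zero cancels the transfer; check for a pending cancel/die. */
	return InterruptPending ? 1 : 0;
}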
/* * Summarize the given page range of the given index. * * This routine can run in parallel with insertions into the heap. To avoid * missing those values from the summary tuple, we first insert a placeholder * index tuple into the index, then execute the heap scan; transactions * concurrent with the scan update the placeholder tuple. After the scan, we * union the placeholder tuple with the one computed by this routine. The * update of the index value happens in a loop, so that if somebody updates * the placeholder tuple after we read it, we detect the case and try again. * This ensures that the concurrently inserted tuples are not lost. */ static void summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, BlockNumber heapBlk) { Buffer phbuf; BrinTuple *phtup; Size phsz; OffsetNumber offset; /* * Insert the placeholder tuple */ phbuf = InvalidBuffer; phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz); offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess, &phbuf, heapBlk, phtup, phsz); /* * Execute the partial heap scan covering the heap blocks in the specified * page range, summarizing the heap tuples in it. This scan stops just * short of brinbuildCallback creating the new index entry. */ state->bs_currRangeStart = heapBlk; IndexBuildHeapRangeScan(heapRel, state->bs_irel, indexInfo, false, heapBlk, state->bs_pagesPerRange, brinbuildCallback, (void *) state); /* * Now we update the values obtained by the scan with the placeholder * tuple. We do this in a loop which only terminates if we're able to * update the placeholder tuple successfully; if we are not, this means * somebody else modified the placeholder tuple after we read it. */ for (;;) { BrinTuple *newtup; Size newsize; bool didupdate; bool samepage; CHECK_FOR_INTERRUPTS(); /* * Update the summary tuple and try to update. */ newtup = brin_form_tuple(state->bs_bdesc, heapBlk, state->bs_dtuple, &newsize); samepage = brin_can_do_samepage_update(phbuf, phsz, newsize); didupdate = brin_doupdate(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess, heapBlk, phbuf, offset, phtup, phsz, newtup, newsize, samepage); brin_free_tuple(phtup); brin_free_tuple(newtup); /* If the update succeeded, we're done. */ if (didupdate) break; /* * If the update didn't work, it might be because somebody updated the * placeholder tuple concurrently. Extract the new version, union it * with the values we have from the scan, and start over. (There are * other reasons for the update to fail, but it's simple to treat them * the same.) */ phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf, &offset, &phsz, BUFFER_LOCK_SHARE); /* the placeholder tuple must exist */ if (phtup == NULL) elog(ERROR, "missing placeholder tuple"); phtup = brin_copy_tuple(phtup, phsz); LockBuffer(phbuf, BUFFER_LOCK_UNLOCK); /* merge it into the tuple from the heap scan */ union_tuples(state->bs_bdesc, state->bs_dtuple, phtup); } ReleaseBuffer(phbuf); }
/* * On the given BRIN index, summarize the heap page range that corresponds * to the heap block number given. * * This routine can run in parallel with insertions into the heap. To avoid * missing those values from the summary tuple, we first insert a placeholder * index tuple into the index, then execute the heap scan; transactions * concurrent with the scan update the placeholder tuple. After the scan, we * union the placeholder tuple with the one computed by this routine. The * update of the index value happens in a loop, so that if somebody updates * the placeholder tuple after we read it, we detect the case and try again. * This ensures that the concurrently inserted tuples are not lost. * * A further corner case is this routine being asked to summarize the partial * range at the end of the table. heapNumBlocks is the (possibly outdated) * table size; if we notice that the requested range lies beyond that size, * we re-compute the table size after inserting the placeholder tuple, to * avoid missing pages that were appended recently. */ static void summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, BlockNumber heapBlk, BlockNumber heapNumBlks) { Buffer phbuf; BrinTuple *phtup; Size phsz; OffsetNumber offset; BlockNumber scanNumBlks; /* * Insert the placeholder tuple */ phbuf = InvalidBuffer; phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz); offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess, &phbuf, heapBlk, phtup, phsz); /* * Compute range end. We hold ShareUpdateExclusive lock on table, so it * cannot shrink concurrently (but it can grow). */ Assert(heapBlk % state->bs_pagesPerRange == 0); if (heapBlk + state->bs_pagesPerRange > heapNumBlks) { /* * If we're asked to scan what we believe to be the final range on the * table (i.e. a range that might be partial) we need to recompute our * idea of what the latest page is after inserting the placeholder * tuple. Anyone that grows the table later will update the * placeholder tuple, so it doesn't matter that we won't scan these * pages ourselves. Careful: the table might have been extended * beyond the current range, so clamp our result. * * Fortunately, this should occur infrequently. */ scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk, state->bs_pagesPerRange); } else { /* Easy case: range is known to be complete */ scanNumBlks = state->bs_pagesPerRange; } /* * Execute the partial heap scan covering the heap blocks in the specified * page range, summarizing the heap tuples in it. This scan stops just * short of brinbuildCallback creating the new index entry. * * Note that it is critical we use the "any visible" mode of * IndexBuildHeapRangeScan here: otherwise, we would miss tuples inserted * by transactions that are still in progress, among other corner cases. */ state->bs_currRangeStart = heapBlk; IndexBuildHeapRangeScan(heapRel, state->bs_irel, indexInfo, false, true, heapBlk, scanNumBlks, brinbuildCallback, (void *) state, NULL); /* * Now we update the values obtained by the scan with the placeholder * tuple. We do this in a loop which only terminates if we're able to * update the placeholder tuple successfully; if we are not, this means * somebody else modified the placeholder tuple after we read it. */ for (;;) { BrinTuple *newtup; Size newsize; bool didupdate; bool samepage; CHECK_FOR_INTERRUPTS(); /* * Update the summary tuple and try to update. 
*/ newtup = brin_form_tuple(state->bs_bdesc, heapBlk, state->bs_dtuple, &newsize); samepage = brin_can_do_samepage_update(phbuf, phsz, newsize); didupdate = brin_doupdate(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess, heapBlk, phbuf, offset, phtup, phsz, newtup, newsize, samepage); brin_free_tuple(phtup); brin_free_tuple(newtup); /* If the update succeeded, we're done. */ if (didupdate) break; /* * If the update didn't work, it might be because somebody updated the * placeholder tuple concurrently. Extract the new version, union it * with the values we have from the scan, and start over. (There are * other reasons for the update to fail, but it's simple to treat them * the same.) */ phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf, &offset, &phsz, BUFFER_LOCK_SHARE, NULL); /* the placeholder tuple must exist */ if (phtup == NULL) elog(ERROR, "missing placeholder tuple"); phtup = brin_copy_tuple(phtup, phsz, NULL, NULL); LockBuffer(phbuf, BUFFER_LOCK_UNLOCK); /* merge it into the tuple from the heap scan */ union_tuples(state->bs_bdesc, state->bs_dtuple, phtup); } ReleaseBuffer(phbuf); }
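/*
 * For reference, the same-page check consulted above is essentially a test
 * of whether the replacement tuple still fits where the placeholder lives
 * (a sketch of brin_can_do_samepage_update; the real definition lives in
 * brin_pageops.c):
 */
bool
brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
{
	return
		((newsz <= origsz) ||
		 PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz));
}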
/* * Write bytes into a shared message queue. */ static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data, bool nowait, Size *bytes_written) { shm_mq *mq = mqh->mqh_queue; Size sent = 0; uint64 used; Size ringsize = mq->mq_ring_size; Size available; while (sent < nbytes) { bool detached; uint64 rb; /* Compute number of ring buffer bytes used and available. */ rb = shm_mq_get_bytes_read(mq, &detached); Assert(mq->mq_bytes_written >= rb); used = mq->mq_bytes_written - rb; Assert(used <= ringsize); available = Min(ringsize - used, nbytes - sent); /* Bail out if the queue has been detached. */ if (detached) { *bytes_written = sent; return SHM_MQ_DETACHED; } if (available == 0 && !mqh->mqh_counterparty_attached) { /* * The queue is full, so if the receiver isn't yet known to be * attached, we must wait for that to happen. */ if (nowait) { if (shm_mq_counterparty_gone(mq, mqh->mqh_handle)) { *bytes_written = sent; return SHM_MQ_DETACHED; } if (shm_mq_get_receiver(mq) == NULL) { *bytes_written = sent; return SHM_MQ_WOULD_BLOCK; } } else if (!shm_mq_wait_internal(mq, &mq->mq_receiver, mqh->mqh_handle)) { mq->mq_detached = true; *bytes_written = sent; return SHM_MQ_DETACHED; } mqh->mqh_counterparty_attached = true; /* * The receiver may have read some data after attaching, so we * must not wait without rechecking the queue state. */ } else if (available == 0) { shm_mq_result res; /* Let the receiver know that we need them to read some data. */ res = shm_mq_notify_receiver(mq); if (res != SHM_MQ_SUCCESS) { *bytes_written = sent; return res; } /* Skip manipulation of our latch if nowait = true. */ if (nowait) { *bytes_written = sent; return SHM_MQ_WOULD_BLOCK; } /* * Wait for our latch to be set. It might already be set for some * unrelated reason, but that'll just result in one extra trip * through the loop. It's worth it to avoid resetting the latch * at top of loop, because setting an already-set latch is much * cheaper than setting one that has been reset. */ WaitLatch(MyLatch, WL_LATCH_SET, 0); /* Reset the latch so we don't spin. */ ResetLatch(MyLatch); /* An interrupt may have occurred while we were waiting. */ CHECK_FOR_INTERRUPTS(); } else { Size offset = mq->mq_bytes_written % (uint64) ringsize; Size sendnow = Min(available, ringsize - offset); /* Write as much data as we can via a single memcpy(). */ memcpy(&mq->mq_ring[mq->mq_ring_offset + offset], (char *) data + sent, sendnow); sent += sendnow; /* * Update count of bytes written, with alignment padding. Note * that this will never actually insert any padding except at the * end of a run of bytes, because the buffer size is a multiple of * MAXIMUM_ALIGNOF, and each read is as well. */ Assert(sent == nbytes || sendnow == MAXALIGN(sendnow)); shm_mq_inc_bytes_written(mq, MAXALIGN(sendnow)); /* * For efficiency, we don't set the reader's latch here. We'll do * that only when the buffer fills up or after writing an entire * message. */ } } *bytes_written = sent; return SHM_MQ_SUCCESS; }
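/*
 * For illustration, a hypothetical caller of the public interface (the
 * function and variable names here are invented).  shm_mq_send() prepends
 * the length word and funnels both it and the payload through
 * shm_mq_send_bytes() above.
 */
#include "postgres.h"
#include "storage/shm_mq.h"

static void
send_one_message(shm_mq_handle *outqh, const void *payload, Size len)
{
	shm_mq_result res;

	/* Blocking send: returns only on success or if the receiver detached. */
	res = shm_mq_send(outqh, len, payload, false);
	if (res == SHM_MQ_DETACHED)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("could not send message")));
}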
/* * GetNewRelFileNode * Generate a new relfilenode number that is unique within the * database of the given tablespace. * * If the relfilenode will also be used as the relation's OID, pass the * opened pg_class catalog, and this routine will guarantee that the result * is also an unused OID within pg_class. If the result is to be used only * as a relfilenode for an existing relation, pass NULL for pg_class. * * As with GetNewOid, there is some theoretical risk of a race condition, * but it doesn't seem worth worrying about. * * Note: we don't support using this in bootstrap mode. All relations * created by bootstrap have preassigned OIDs, so there's no need. */ Oid GetNewRelFileNode(Oid reltablespace, Relation pg_class, char relpersistence) { RelFileNodeBackend rnode; char *rpath; int fd; bool collides; BackendId backend; switch (relpersistence) { case RELPERSISTENCE_TEMP: backend = MyBackendId; break; case RELPERSISTENCE_UNLOGGED: case RELPERSISTENCE_PERMANENT: backend = InvalidBackendId; break; default: elog(ERROR, "invalid relpersistence: %c", relpersistence); return InvalidOid; /* placate compiler */ } /* This logic should match RelationInitPhysicalAddr */ rnode.node.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace; rnode.node.dbNode = (rnode.node.spcNode == GLOBALTABLESPACE_OID) ? InvalidOid : MyDatabaseId; /* * The relpath will vary based on the backend ID, so we must initialize * that properly here to make sure that any collisions based on filename * are properly detected. */ rnode.backend = backend; do { CHECK_FOR_INTERRUPTS(); /* Generate the OID */ if (pg_class) rnode.node.relNode = GetNewOid(pg_class); else rnode.node.relNode = GetNewObjectId(); /* Check for existing file of same name */ rpath = relpath(rnode, MAIN_FORKNUM); fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0); if (fd >= 0) { /* definite collision */ close(fd); collides = true; } else { /* * Here we have a little bit of a dilemma: if errno is something * other than ENOENT, should we declare a collision and loop? In * particular one might think this advisable for, say, EPERM. * However there really shouldn't be any unreadable files in a * tablespace directory, and if the EPERM is actually complaining * that we can't read the directory itself, we'd be in an infinite * loop. In practice it seems best to go ahead regardless of the * errno. If there is a colliding file we will get an smgr * failure when we attempt to create the new relation file. */ collides = false; } pfree(rpath); } while (collides); return rnode.node.relNode; }
/*
 * Pipelined test of the shared memory message queue infrastructure.
 *
 * As in the basic test, we set up a ring of message queues passing through
 * 1 or more background processes and eventually looping back to ourselves.
 * Then, we send N copies of the user-specified message through the ring and
 * receive them all back.  Since this might fill up all message queues in the
 * ring and then stall, we must be prepared to begin receiving the messages
 * back before we've finished sending them.
 */
Datum
test_shm_mq_pipelined(PG_FUNCTION_ARGS)
{
	int64		queue_size = PG_GETARG_INT64(0);
	text	   *message = PG_GETARG_TEXT_PP(1);
	char	   *message_contents = VARDATA_ANY(message);
	int			message_size = VARSIZE_ANY_EXHDR(message);
	int32		loop_count = PG_GETARG_INT32(2);
	int32		nworkers = PG_GETARG_INT32(3);
	bool		verify = PG_GETARG_BOOL(4);
	int32		send_count = 0;
	int32		receive_count = 0;
	dsm_segment *seg;
	shm_mq_handle *outqh;
	shm_mq_handle *inqh;
	shm_mq_result res;
	Size		len;
	void	   *data;

	/* A negative loop count is nonsensical. */
	if (loop_count < 0)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("repeat count must be a non-negative integer")));

	/*
	 * Using the nonblocking interfaces, we can even send data to ourselves,
	 * so the minimum number of workers for this test is zero.
	 */
	if (nworkers < 0)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("number of workers must be a non-negative integer")));

	/* Set up dynamic shared memory segment and background workers. */
	test_shm_mq_setup(queue_size, nworkers, &seg, &outqh, &inqh);

	/* Main loop. */
	for (;;)
	{
		bool		wait = true;

		/*
		 * If we haven't yet sent the message the requisite number of times,
		 * try again to send it now.  Note that when shm_mq_send() returns
		 * SHM_MQ_WOULD_BLOCK, the next call to that function must pass the
		 * same message size and contents; that's not an issue here because
		 * we're sending the same message every time.
		 */
		if (send_count < loop_count)
		{
			res = shm_mq_send(outqh, message_size, message_contents, true);
			if (res == SHM_MQ_SUCCESS)
			{
				++send_count;
				wait = false;
			}
			else if (res == SHM_MQ_DETACHED)
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("could not send message")));
		}

		/*
		 * If we haven't yet received the message the requisite number of
		 * times, try to receive it again now.
		 */
		if (receive_count < loop_count)
		{
			res = shm_mq_receive(inqh, &len, &data, true);
			if (res == SHM_MQ_SUCCESS)
			{
				++receive_count;
				/* Verifying every time is slow, so it's optional. */
				if (verify)
					verify_message(message_size, message_contents, len, data);
				wait = false;
			}
			else if (res == SHM_MQ_DETACHED)
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("could not receive message")));
		}
		else
		{
			/*
			 * Otherwise, we've received the message enough times.  This
			 * shouldn't happen unless we've also sent it enough times.
			 */
			if (send_count != receive_count)
				ereport(ERROR,
						(errcode(ERRCODE_INTERNAL_ERROR),
						 errmsg("message sent %d times, but received %d times",
								send_count, receive_count)));
			break;
		}

		if (wait)
		{
			/*
			 * If we made no progress, wait for one of the other processes to
			 * which we are connected to set our latch, indicating that they
			 * have read or written data and therefore there may now be work
			 * for us to do.
			 */
			WaitLatch(MyLatch, WL_LATCH_SET, 0);
			CHECK_FOR_INTERRUPTS();
			ResetLatch(MyLatch);
		}
	}

	/* Clean up. */
	dsm_detach(seg);

	PG_RETURN_VOID();
}
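/*
 * Usage sketch (queue size and message are arbitrary): after CREATE
 * EXTENSION test_shm_mq, something like
 *
 *   SELECT test_shm_mq_pipelined(16384, repeat('a', 400), 10000, 3, true);
 *
 * pushes 10000 copies of a 400-byte message through a ring of 3 background
 * workers over 16kB queues, verifying each copy as it comes back.
 */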
/* * Rescan end pages to verify that they are (still) empty of tuples. * * Returns number of nondeletable pages (last nonempty page + 1). */ static BlockNumber count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats) { MIRROREDLOCK_BUFMGR_DECLARE; BlockNumber blkno; /* Strange coding of loop control is needed because blkno is unsigned */ blkno = vacrelstats->rel_pages; while (blkno > vacrelstats->nonempty_pages) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool hastup; /* * We don't insert a vacuum delay point here, because we have an * exclusive lock on the table which we want to hold for as short a * time as possible. We still need to check for interrupts however. */ CHECK_FOR_INTERRUPTS(); blkno--; /* -------- MirroredLock ---------- */ MIRROREDLOCK_BUFMGR_LOCK; buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy); /* In this phase we only need shared access to the buffer */ LockBuffer(buf, BUFFER_LOCK_SHARE); page = BufferGetPage(buf); if (PageIsNew(page) || PageIsEmpty(page)) { /* PageIsNew probably shouldn't happen... */ UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ continue; } hastup = false; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* * Note: any non-unused item should be taken as a reason to keep * this page. We formerly thought that DEAD tuples could be * thrown away, but that's not so, because we'd not have cleaned * out their index entries. */ if (ItemIdIsUsed(itemid)) { hastup = true; break; /* can stop scanning */ } } /* scan along page */ UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ /* Done scanning if we found a tuple here */ if (hastup) return blkno + 1; } /* * If we fall out of the loop, all the previously-thought-to-be-empty * pages still are; we need not bother to look at the last known-nonempty * page. */ return vacrelstats->nonempty_pages; }
/* * pg_prewarm(regclass, mode text, fork text, * first_block int8, last_block int8) * * The first argument is the relation to be prewarmed; the second controls * how prewarming is done; legal options are 'prefetch', 'read', and 'buffer'. * The third is the name of the relation fork to be prewarmed. The fourth * and fifth arguments specify the first and last block to be prewarmed. * If the fourth argument is NULL, it will be taken as 0; if the fifth argument * is NULL, it will be taken as the number of blocks in the relation. The * return value is the number of blocks successfully prewarmed. */ Datum pg_prewarm(PG_FUNCTION_ARGS) { Oid relOid; text *forkName; text *type; int64 first_block; int64 last_block; int64 nblocks; int64 blocks_done = 0; int64 block; Relation rel; ForkNumber forkNumber; char *forkString; char *ttype; PrewarmType ptype; AclResult aclresult; /* Basic sanity checking. */ if (PG_ARGISNULL(0)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("relation cannot be null"))); relOid = PG_GETARG_OID(0); if (PG_ARGISNULL(1)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errmsg("prewarm type cannot be null")))); type = PG_GETARG_TEXT_P(1); ttype = text_to_cstring(type); if (strcmp(ttype, "prefetch") == 0) ptype = PREWARM_PREFETCH; else if (strcmp(ttype, "read") == 0) ptype = PREWARM_READ; else if (strcmp(ttype, "buffer") == 0) ptype = PREWARM_BUFFER; else { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid prewarm type"), errhint("Valid prewarm types are \"prefetch\", \"read\", and \"buffer\"."))); PG_RETURN_INT64(0); /* Placate compiler. */ } if (PG_ARGISNULL(2)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errmsg("relation fork cannot be null")))); forkName = PG_GETARG_TEXT_P(2); forkString = text_to_cstring(forkName); forkNumber = forkname_to_number(forkString); /* Open relation and check privileges. */ rel = relation_open(relOid, AccessShareLock); aclresult = pg_class_aclcheck(relOid, GetUserId(), ACL_SELECT); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_CLASS, get_rel_name(relOid)); /* Check that the fork exists. */ RelationOpenSmgr(rel); if (!smgrexists(rel->rd_smgr, forkNumber)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("fork \"%s\" does not exist for this relation", forkString))); /* Validate block numbers, or handle nulls. */ nblocks = RelationGetNumberOfBlocksInFork(rel, forkNumber); if (PG_ARGISNULL(3)) first_block = 0; else { first_block = PG_GETARG_INT64(3); if (first_block < 0 || first_block >= nblocks) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("starting block number must be between 0 and " INT64_FORMAT, nblocks - 1))); } if (PG_ARGISNULL(4)) last_block = nblocks - 1; else { last_block = PG_GETARG_INT64(4); if (last_block < 0 || last_block >= nblocks) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ending block number must be between 0 and " INT64_FORMAT, nblocks - 1))); } /* Now we're ready to do the real work. */ if (ptype == PREWARM_PREFETCH) { #ifdef USE_PREFETCH /* * In prefetch mode, we just hint the OS to read the blocks, but we * don't know whether it really does it, and we don't wait for it to * finish. * * It would probably be better to pass our prefetch requests in chunks * of a megabyte or maybe even a whole segment at a time, but there's * no practical way to do that at present without a gross modularity * violation, so we just do this. 
 */
		for (block = first_block; block <= last_block; ++block)
		{
			CHECK_FOR_INTERRUPTS();
			PrefetchBuffer(rel, forkNumber, block);
			++blocks_done;
		}
#else
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("prefetch is not supported by this build")));
#endif
	}
	else if (ptype == PREWARM_READ)
	{
		/*
		 * In read mode, we actually read the blocks, but not into shared
		 * buffers.  This is more portable than prefetch mode (it works
		 * everywhere) and is synchronous.
		 */
		for (block = first_block; block <= last_block; ++block)
		{
			CHECK_FOR_INTERRUPTS();

			/*
			 * blockbuffer is assumed to be a file-scope static scratch
			 * buffer of BLCKSZ bytes, declared elsewhere in this file.
			 */
			smgrread(rel->rd_smgr, forkNumber, block, blockbuffer);
			++blocks_done;
		}
	}
	else if (ptype == PREWARM_BUFFER)
	{
		/*
		 * In buffer mode, we actually pull the data into shared_buffers.
		 */
		for (block = first_block; block <= last_block; ++block)
		{
			Buffer		buf;

			CHECK_FOR_INTERRUPTS();
			buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL);
			ReleaseBuffer(buf);
			++blocks_done;
		}
	}

	/* Close relation, release lock. */
	relation_close(rel, AccessShareLock);

	PG_RETURN_INT64(blocks_done);
}
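/*
 * Usage sketch (the table name is invented): after CREATE EXTENSION
 * pg_prewarm,
 *
 *   SELECT pg_prewarm('my_table', 'buffer', 'main', NULL, NULL);
 *
 * loads every main-fork block of my_table into shared_buffers and returns
 * the number of blocks read.
 */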
/* * RequestCheckpoint * Called in backend processes to request a checkpoint * * flags is a bitwise OR of the following: * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, * ignoring checkpoint_completion_target parameter. * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or * CHECKPOINT_END_OF_RECOVERY). * CHECKPOINT_WAIT: wait for completion before returning (otherwise, * just signal checkpointer to do it, and return). * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling. * (This affects logging, and in particular enables CheckPointWarning.) */ void RequestCheckpoint(int flags) { int ntries; int old_failed, old_started; /* * If in a standalone backend, just do it ourselves. */ if (!IsPostmasterEnvironment) { /* * There's no point in doing slow checkpoints in a standalone backend, * because there's no other backends the checkpoint could disrupt. */ CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE); /* * After any checkpoint, close all smgr files. This is so we won't * hang onto smgr references to deleted files indefinitely. */ smgrcloseall(); return; } /* * Atomically set the request flags, and take a snapshot of the counters. * When we see ckpt_started > old_started, we know the flags we set here * have been seen by checkpointer. * * Note that we OR the flags with any existing flags, to avoid overriding * a "stronger" request by another backend. The flag senses must be * chosen to make this work! */ SpinLockAcquire(&CheckpointerShmem->ckpt_lck); old_failed = CheckpointerShmem->ckpt_failed; old_started = CheckpointerShmem->ckpt_started; CheckpointerShmem->ckpt_flags |= flags; SpinLockRelease(&CheckpointerShmem->ckpt_lck); /* * Send signal to request checkpoint. It's possible that the checkpointer * hasn't started yet, or is in process of restarting, so we will retry a * few times if needed. Also, if not told to wait for the checkpoint to * occur, we consider failure to send the signal to be nonfatal and merely * LOG it. */ for (ntries = 0;; ntries++) { if (CheckpointerShmem->checkpointer_pid == 0) { if (ntries >= 20) /* max wait 2.0 sec */ { elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, "could not request checkpoint because checkpointer not running"); break; } } else if (kill(CheckpointerShmem->checkpointer_pid, SIGINT) != 0) { if (ntries >= 20) /* max wait 2.0 sec */ { elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, "could not signal for checkpoint: %m"); break; } } else break; /* signal sent successfully */ CHECK_FOR_INTERRUPTS(); pg_usleep(100000L); /* wait 0.1 sec, then retry */ } /* * If requested, wait for completion. We detect completion according to * the algorithm given above. */ if (flags & CHECKPOINT_WAIT) { int new_started, new_failed; /* Wait for a new checkpoint to start. */ for (;;) { SpinLockAcquire(&CheckpointerShmem->ckpt_lck); new_started = CheckpointerShmem->ckpt_started; SpinLockRelease(&CheckpointerShmem->ckpt_lck); if (new_started != old_started) break; CHECK_FOR_INTERRUPTS(); pg_usleep(100000L); } /* * We are waiting for ckpt_done >= new_started, in a modulo sense. 
*/ for (;;) { int new_done; SpinLockAcquire(&CheckpointerShmem->ckpt_lck); new_done = CheckpointerShmem->ckpt_done; new_failed = CheckpointerShmem->ckpt_failed; SpinLockRelease(&CheckpointerShmem->ckpt_lck); if (new_done - new_started >= 0) break; CHECK_FOR_INTERRUPTS(); pg_usleep(100000L); } if (new_failed != old_failed) ereport(ERROR, (errmsg("checkpoint request failed"), errhint("Consult recent messages in the server log for details."))); } }
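/*
 * A small standalone illustration of the "modulo sense" comparison above.
 * The counters are plain ints that may wrap around; interpreting their
 * difference as signed still orders them correctly as long as fewer than
 * INT_MAX checkpoints intervene.  (The server code subtracts the ints
 * directly, relying on two's-complement wraparound; the unsigned arithmetic
 * below is the strictly portable spelling of the same idea.)
 */
#include <assert.h>
#include <limits.h>

int
main(void)
{
	unsigned int started = (unsigned int) INT_MAX;	/* counter about to wrap */
	unsigned int done = started + 1;	/* one checkpoint later: sign bit flips */

	assert((int) (done - started) >= 0);	/* still recognized as "done" */
	return 0;
}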
/* * Attempt to read a tuple from one of our parallel workers. */ static HeapTuple gather_readnext(GatherState *gatherstate) { int nvisited = 0; for (;;) { TupleQueueReader *reader; HeapTuple tup; bool readerdone; /* Check for async events, particularly messages from workers. */ CHECK_FOR_INTERRUPTS(); /* * Attempt to read a tuple, but don't block if none is available. * * Note that TupleQueueReaderNext will just return NULL for a worker * which fails to initialize. We'll treat that worker as having * produced no tuples; WaitForParallelWorkersToFinish will error out * when we get there. */ Assert(gatherstate->nextreader < gatherstate->nreaders); reader = gatherstate->reader[gatherstate->nextreader]; tup = TupleQueueReaderNext(reader, true, &readerdone); /* * If this reader is done, remove it from our working array of active * readers. If all readers are done, we're outta here. */ if (readerdone) { Assert(!tup); --gatherstate->nreaders; if (gatherstate->nreaders == 0) return NULL; memmove(&gatherstate->reader[gatherstate->nextreader], &gatherstate->reader[gatherstate->nextreader + 1], sizeof(TupleQueueReader *) * (gatherstate->nreaders - gatherstate->nextreader)); if (gatherstate->nextreader >= gatherstate->nreaders) gatherstate->nextreader = 0; continue; } /* If we got a tuple, return it. */ if (tup) return tup; /* * Advance nextreader pointer in round-robin fashion. Note that we * only reach this code if we weren't able to get a tuple from the * current worker. We used to advance the nextreader pointer after * every tuple, but it turns out to be much more efficient to keep * reading from the same queue until that would require blocking. */ gatherstate->nextreader++; if (gatherstate->nextreader >= gatherstate->nreaders) gatherstate->nextreader = 0; /* Have we visited every (surviving) TupleQueueReader? */ nvisited++; if (nvisited >= gatherstate->nreaders) { /* * If (still) running plan locally, return NULL so caller can * generate another tuple from the local copy of the plan. */ if (gatherstate->need_to_scan_locally) return NULL; /* Nothing to do except wait for developments. */ WaitLatch(MyLatch, WL_LATCH_SET, 0, WAIT_EVENT_EXECUTE_GATHER); ResetLatch(MyLatch); nvisited = 0; } } }
/* ---------------------------------------------------------------- * ExecGather(node) * * Scans the relation via multiple workers and returns * the next qualifying tuple. * ---------------------------------------------------------------- */ static TupleTableSlot * ExecGather(PlanState *pstate) { GatherState *node = castNode(GatherState, pstate); TupleTableSlot *slot; ExprContext *econtext; CHECK_FOR_INTERRUPTS(); /* * Initialize the parallel context and workers on first execution. We do * this on first execution rather than during node initialization, as it * needs to allocate a large dynamic segment, so it is better to do it * only if it is really needed. */ if (!node->initialized) { EState *estate = node->ps.state; Gather *gather = (Gather *) node->ps.plan; /* * Sometimes we might have to run without parallelism; but if parallel * mode is active then we can try to fire up some workers. */ if (gather->num_workers > 0 && estate->es_use_parallel_mode) { ParallelContext *pcxt; /* Initialize, or re-initialize, shared state needed by workers. */ if (!node->pei) node->pei = ExecInitParallelPlan(node->ps.lefttree, estate, gather->initParam, gather->num_workers, node->tuples_needed); else ExecParallelReinitialize(node->ps.lefttree, node->pei, gather->initParam); /* * Register backend workers. We might not get as many as we * requested, or indeed any at all. */ pcxt = node->pei->pcxt; LaunchParallelWorkers(pcxt); /* We save # workers launched for the benefit of EXPLAIN */ node->nworkers_launched = pcxt->nworkers_launched; /* Set up tuple queue readers to read the results. */ if (pcxt->nworkers_launched > 0) { ExecParallelCreateReaders(node->pei); /* Make a working array showing the active readers */ node->nreaders = pcxt->nworkers_launched; node->reader = (TupleQueueReader **) palloc(node->nreaders * sizeof(TupleQueueReader *)); memcpy(node->reader, node->pei->reader, node->nreaders * sizeof(TupleQueueReader *)); } else { /* No workers? Then never mind. */ node->nreaders = 0; node->reader = NULL; } node->nextreader = 0; } /* Run plan locally if no workers or enabled and not single-copy. */ node->need_to_scan_locally = (node->nreaders == 0) || (!gather->single_copy && parallel_leader_participation); node->initialized = true; } /* * Reset per-tuple memory context to free any expression evaluation * storage allocated in the previous tuple cycle. */ econtext = node->ps.ps_ExprContext; ResetExprContext(econtext); /* * Get next tuple, either from one of our workers, or by running the plan * ourselves. */ slot = gather_getnext(node); if (TupIsNull(slot)) return NULL; /* If no projection is required, we're done. */ if (node->ps.ps_ProjInfo == NULL) return slot; /* * Form the result tuple using ExecProject(), and return it. */ econtext->ecxt_outertuple = slot; return ExecProject(node->ps.ps_ProjInfo); }
void
mk_qsort_impl(MKEntry *a, int left, int right, int lv, bool lvdown,
			  MKContext *ctxt, bool seenNull)
{
	int			lastInLow;
	int			firstInHigh;

	Assert(ctxt);
	Assert(lv < ctxt->total_lv);

	CHECK_FOR_INTERRUPTS();

	if (right <= left)
		return;

	/* Prepare at level lv */
	if (lvdown)
		mk_prepare_array(a, left, right, lv, ctxt);

	/*
	 * According to Bentley & McIlroy [1] (1993), using insertion sort for
	 * the case n < 7 is a significant saving.  However, according to
	 * Sedgewick & Bentley [2] (2002), the wisdom of the new millennium is
	 * not to special-case smaller inputs.  Here, we do not special-case
	 * them, because we want to save memtuple_getattr calls and the expensive
	 * comparisons that have already been prepared.
	 *
	 * XXX Find out why we have a new wisdom in [2]; implement and compare.
	 */
	mk_qsort_part3(a, left, right, lv, ctxt, &lastInLow, &firstInHigh);

	/* recurse to left chunk */
	mk_qsort_impl(a, left, lastInLow, lv, false, ctxt, seenNull);

	/* recurse to middle (equal) chunk */
	if (lv < ctxt->total_lv - 1)
	{
		/*
		 * [lastInLow+1, firstInHigh-1] defines the pivot region, which was
		 * all equal at level lv.  So increase the level and compare that
		 * region!
		 */
		mk_qsort_impl(a, lastInLow + 1, firstInHigh - 1, lv + 1, true, ctxt,
					  seenNull || mke_is_null(a + lastInLow + 1));
		/* a + lastInLow + 1 points to the pivot */
	}
	else
	{
		/*
		 * Values are all equal to the deepest level... no need for more
		 * compares, but check uniqueness if requested.
		 */
		if (firstInHigh - 1 > lastInLow + 1 &&
			!seenNull && !mke_is_null(a + lastInLow + 1))
		{
			/* a + lastInLow + 1 points to the pivot */
			if (ctxt->enforceUnique)
			{
				ERROR_UNIQUENESS_VIOLATED();
			}
			else if (ctxt->unique)
			{
				int			toFreeIndex;

				/* +2 because we want to keep one around! */
				for (toFreeIndex = lastInLow + 2; toFreeIndex < firstInHigh;
					 toFreeIndex++)
				{
					MKEntry    *toFree = a + toFreeIndex;

					if (ctxt->cpfr)
						ctxt->cpfr(toFree, NULL, ctxt->lvctxt + lv);	/* todo: verify off-by-one */
					ctxt->freeTup(toFree);
					mke_set_empty(toFree);
				}
			}
		}
	}

	/* recurse to right chunk */
	mk_qsort_impl(a, firstInHigh, right, lv, false, ctxt, seenNull);

#ifdef MKQSORT_VERIFY
	if (lv == 0)
		mkqsort_verify(a, left, right, ctxt);
#endif
}
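/*
 * For context, a generic sketch of the three-way partition contract that
 * mk_qsort_part3 is assumed to satisfy (shown on plain ints rather than
 * MKEntry, so this is not the Greenplum implementation): on return,
 * a[left..*lastInLow] < pivot, a[*lastInLow+1..*firstInHigh-1] == pivot, and
 * a[*firstInHigh..right] > pivot.
 */
static void
three_way_partition(int *a, int left, int right,
					int *lastInLow, int *firstInHigh)
{
	int			pivot = a[left + (right - left) / 2];
	int			lt = left;		/* a[left..lt-1] is the "< pivot" chunk */
	int			gt = right;		/* a[gt+1..right] is the "> pivot" chunk */
	int			i = left;		/* a[lt..i-1] equals the pivot */

	while (i <= gt)
	{
		if (a[i] < pivot)
		{
			int			tmp = a[lt];

			a[lt++] = a[i];
			a[i++] = tmp;
		}
		else if (a[i] > pivot)
		{
			int			tmp = a[gt];

			a[gt--] = a[i];
			a[i] = tmp;
		}
		else
			i++;
	}
	*lastInLow = lt - 1;
	*firstInHigh = gt + 1;
}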
/* * IndexBuildHeapScan - scan the heap relation to find tuples to be indexed * * This is called back from an access-method-specific index build procedure * after the AM has done whatever setup it needs. The parent heap relation * is scanned to find tuples that should be entered into the index. Each * such tuple is passed to the AM's callback routine, which does the right * things to add it to the new index. After we return, the AM's index * build procedure does whatever cleanup is needed; in particular, it should * close the heap and index relations. * * The total count of heap tuples is returned. This is for updating pg_class * statistics. (It's annoying not to be able to do that here, but we can't * do it until after the relation is closed.) Note that the index AM itself * must keep track of the number of index tuples; we don't do so here because * the AM might reject some of the tuples for its own reasons, such as being * unable to store NULLs. */ double IndexBuildHeapScan(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo, IndexBuildCallback callback, void *callback_state) { HeapScanDesc scan; HeapTuple heapTuple; TupleDesc heapDescriptor; Datum attdata[INDEX_MAX_KEYS]; char nulls[INDEX_MAX_KEYS]; double reltuples; List *predicate; TupleTable tupleTable; TupleTableSlot *slot; EState *estate; ExprContext *econtext; Snapshot snapshot; TransactionId OldestXmin; /* * sanity checks */ Assert(OidIsValid(indexRelation->rd_rel->relam)); heapDescriptor = RelationGetDescr(heapRelation); /* * Need an EState for evaluation of index expressions and * partial-index predicates. */ estate = CreateExecutorState(); econtext = GetPerTupleExprContext(estate); /* * If this is a predicate (partial) index, we will need to evaluate * the predicate using ExecQual, which requires the current tuple to * be in a slot of a TupleTable. Likewise if there are any * expressions. */ if (indexInfo->ii_Predicate != NIL || indexInfo->ii_Expressions != NIL) { tupleTable = ExecCreateTupleTable(1); slot = ExecAllocTableSlot(tupleTable); ExecSetSlotDescriptor(slot, heapDescriptor, false); /* Arrange for econtext's scan tuple to be the tuple under test */ econtext->ecxt_scantuple = slot; /* Set up execution state for predicate. */ predicate = (List *) ExecPrepareExpr((Expr *) indexInfo->ii_Predicate, estate); } else { tupleTable = NULL; slot = NULL; predicate = NIL; } /* * Ok, begin our scan of the base relation. We use SnapshotAny * because we must retrieve all tuples and do our own time qual * checks. */ if (IsBootstrapProcessingMode()) { snapshot = SnapshotNow; OldestXmin = InvalidTransactionId; } else { snapshot = SnapshotAny; OldestXmin = GetOldestXmin(heapRelation->rd_rel->relisshared); } scan = heap_beginscan(heapRelation, /* relation */ snapshot, /* seeself */ 0, /* number of keys */ (ScanKey) NULL); /* scan key */ reltuples = 0; /* * Scan all tuples in the base relation. */ while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { bool tupleIsAlive; CHECK_FOR_INTERRUPTS(); if (snapshot == SnapshotAny) { /* do our own time qual check */ bool indexIt; uint16 sv_infomask; /* * HeapTupleSatisfiesVacuum may update tuple's hint status * bits. We could possibly get away with not locking the * buffer here, since caller should hold ShareLock on the * relation, but let's be conservative about it. 
*/ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); sv_infomask = heapTuple->t_data->t_infomask; switch (HeapTupleSatisfiesVacuum(heapTuple->t_data, OldestXmin)) { case HEAPTUPLE_DEAD: indexIt = false; tupleIsAlive = false; break; case HEAPTUPLE_LIVE: indexIt = true; tupleIsAlive = true; break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must index it * anyway to keep VACUUM from complaining. */ indexIt = true; tupleIsAlive = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* * Since caller should hold ShareLock or better, we * should not see any tuples inserted by open * transactions --- unless it's our own transaction. * (Consider INSERT followed by CREATE INDEX within a * transaction.) An exception occurs when reindexing * a system catalog, because we often release lock on * system catalogs before committing. */ if (!TransactionIdIsCurrentTransactionId( HeapTupleHeaderGetXmin(heapTuple->t_data)) && !IsSystemRelation(heapRelation)) elog(ERROR, "concurrent insert in progress"); indexIt = true; tupleIsAlive = true; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* * Since caller should hold ShareLock or better, we * should not see any tuples deleted by open * transactions --- unless it's our own transaction. * (Consider DELETE followed by CREATE INDEX within a * transaction.) An exception occurs when reindexing * a system catalog, because we often release lock on * system catalogs before committing. */ if (!TransactionIdIsCurrentTransactionId( HeapTupleHeaderGetXmax(heapTuple->t_data)) && !IsSystemRelation(heapRelation)) elog(ERROR, "concurrent delete in progress"); indexIt = true; tupleIsAlive = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); indexIt = tupleIsAlive = false; /* keep compiler quiet */ break; } /* check for hint-bit update by HeapTupleSatisfiesVacuum */ if (sv_infomask != heapTuple->t_data->t_infomask) SetBufferCommitInfoNeedsSave(scan->rs_cbuf); LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); if (!indexIt) continue; } else { /* heap_getnext did the time qual check */ tupleIsAlive = true; } reltuples += 1; MemoryContextReset(econtext->ecxt_per_tuple_memory); /* Set up for predicate or expression evaluation */ if (slot) ExecStoreTuple(heapTuple, slot, InvalidBuffer, false); /* * In a partial index, discard tuples that don't satisfy the * predicate. We can also discard recently-dead tuples, since * VACUUM doesn't complain about tuple count mismatch for partial * indexes. */ if (predicate != NIL) { if (!tupleIsAlive) continue; if (!ExecQual(predicate, econtext, false)) continue; } /* * For the current heap tuple, extract all the attributes we use * in this index, and note which are null. This also performs * evaluation of any expressions needed. */ FormIndexDatum(indexInfo, heapTuple, heapDescriptor, estate, attdata, nulls); /* * You'd think we should go ahead and build the index tuple here, * but some index AMs want to do further processing on the data * first. So pass the attdata and nulls arrays, instead. */ /* Call the AM's callback routine to process the tuple */ callback(indexRelation, heapTuple, attdata, nulls, tupleIsAlive, callback_state); } heap_endscan(scan); if (tupleTable) ExecDropTupleTable(tupleTable, true); FreeExecutorState(estate); /* These may have been pointing to the now-gone estate */ indexInfo->ii_ExpressionsState = NIL; indexInfo->ii_PredicateState = NIL; return reltuples; }
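/*
 * For illustration, a hypothetical callback matching the invocation above
 * (this is not a real access method's callback; note that in this vintage
 * the null flags arrive as a char array, historically with 'n' marking a
 * null).  A real AM would form an index tuple from attdata/nulls and insert
 * it here.
 */
static void
count_build_callback(Relation index, HeapTuple htup, Datum *attdata,
					 char *nulls, bool tupleIsAlive, void *state)
{
	double	   *indexed = (double *) state;

	if (tupleIsAlive)
		*indexed += 1;
}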
/* * calculate total size of tablespace */ static int64 calculate_tablespace_size(Oid tblspcOid) { char tblspcPath[MAXPGPATH]; char pathname[MAXPGPATH]; int64 totalsize = 0; DIR *dirdesc; struct dirent *direntry; AclResult aclresult; /* * User must have CREATE privilege for target tablespace, either * explicitly granted or implicitly because it is default for current * database. */ if (tblspcOid != MyDatabaseTableSpace) { aclresult = pg_tablespace_aclcheck(tblspcOid, GetUserId(), ACL_CREATE); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_TABLESPACE, get_tablespace_name(tblspcOid)); } if (tblspcOid == DEFAULTTABLESPACE_OID) snprintf(tblspcPath, MAXPGPATH, "base"); else if (tblspcOid == GLOBALTABLESPACE_OID) snprintf(tblspcPath, MAXPGPATH, "global"); else snprintf(tblspcPath, MAXPGPATH, "pg_tblspc/%u/%s", tblspcOid, TABLESPACE_VERSION_DIRECTORY); dirdesc = AllocateDir(tblspcPath); if (!dirdesc) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open tablespace directory \"%s\": %m", tblspcPath))); while ((direntry = ReadDir(dirdesc, tblspcPath)) != NULL) { struct stat fst; CHECK_FOR_INTERRUPTS(); if (strcmp(direntry->d_name, ".") == 0 || strcmp(direntry->d_name, "..") == 0) continue; snprintf(pathname, MAXPGPATH, "%s/%s", tblspcPath, direntry->d_name); if (stat(pathname, &fst) < 0) { if (errno == ENOENT) continue; else ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", pathname))); } if (S_ISDIR(fst.st_mode)) totalsize += db_dir_size(pathname); totalsize += fst.st_size; } FreeDir(dirdesc); return totalsize; }
/*
 * pgstat_heap -- returns live/dead tuples info in a heap
 */
static Datum
pgstat_heap(Relation rel, FunctionCallInfo fcinfo)
{
	HeapScanDesc scan;
	HeapTuple	tuple;
	BlockNumber nblocks;
	BlockNumber block = 0;		/* next block to count free space in */
	BlockNumber tupblock;
	Buffer		buffer;
	pgstattuple_type stat = {0};

	/* Disable syncscan because we assume we scan from block zero upwards */
	scan = heap_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false);

	nblocks = scan->rs_nblocks; /* # blocks to be scanned */

	/* scan the relation */
	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		CHECK_FOR_INTERRUPTS();

		/* must hold a buffer lock to call HeapTupleSatisfiesVisibility */
		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);

		if (HeapTupleSatisfiesVisibility(tuple, SnapshotNow, scan->rs_cbuf))
		{
			stat.tuple_len += tuple->t_len;
			stat.tuple_count++;
		}
		else
		{
			stat.dead_tuple_len += tuple->t_len;
			stat.dead_tuple_count++;
		}

		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * To avoid physically reading the table twice, try to do the
		 * free-space scan in parallel with the heap scan.  However,
		 * heap_getnext may find no tuples on a given page, so we cannot
		 * simply examine the pages returned by the heap scan.
		 */
		tupblock = BlockIdGetBlockNumber(&tuple->t_self.ip_blkid);

		while (block <= tupblock)
		{
			CHECK_FOR_INTERRUPTS();

			buffer = ReadBuffer(rel, block);
			LockBuffer(buffer, BUFFER_LOCK_SHARE);
			stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer));
			UnlockReleaseBuffer(buffer);
			block++;
		}
	}
	heap_endscan(scan);

	while (block < nblocks)
	{
		CHECK_FOR_INTERRUPTS();

		buffer = ReadBuffer(rel, block);
		LockBuffer(buffer, BUFFER_LOCK_SHARE);
		stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer));
		UnlockReleaseBuffer(buffer);
		block++;
	}

	relation_close(rel, AccessShareLock);

	stat.table_len = (uint64) nblocks * BLCKSZ;

	return build_pgstattuple_type(&stat, fcinfo);
}
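/*
 * Usage sketch (the table name is invented): pgstat_heap sits behind the
 * SQL-callable pgstattuple() function for heap relations, so after CREATE
 * EXTENSION pgstattuple,
 *
 *   SELECT * FROM pgstattuple('my_table');
 *
 * reports table_len, live/dead tuple counts and lengths, and free_space.
 */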
/* ---------------------------------------------------------------- * ExecScan * * Scans the relation using the 'access method' indicated and * returns the next qualifying tuple in the direction specified * in the global variable ExecDirection. * The access method returns the next tuple and execScan() is * responsible for checking the tuple returned against the qual-clause. * * A 'recheck method' must also be provided that can check an * arbitrary tuple of the relation against any qual conditions * that are implemented internal to the access method. * * Conditions: * -- the "cursor" maintained by the AMI is positioned at the tuple * returned previously. * * Initial States: * -- the relation indicated is opened for scanning so that the * "cursor" is positioned before the first qualifying tuple. * ---------------------------------------------------------------- */ TupleTableSlot * ExecScan(ScanState *node, ExecScanAccessMtd accessMtd, /* function returning a tuple */ ExecScanRecheckMtd recheckMtd) { ExprContext *econtext; List *qual; ProjectionInfo *projInfo; ExprDoneCond isDone; TupleTableSlot *resultSlot; /* * Fetch data from node */ qual = node->ps.qual; projInfo = node->ps.ps_ProjInfo; /* * If we have neither a qual to check nor a projection to do, just skip * all the overhead and return the raw scan tuple. */ if (!qual && !projInfo) return ExecScanFetch(node, accessMtd, recheckMtd); /* * Check to see if we're still projecting out tuples from a previous scan * tuple (because there is a function-returning-set in the projection * expressions). If so, try to project another one. */ if (node->ps.ps_TupFromTlist) { Assert(projInfo); /* can't get here if not projecting */ resultSlot = ExecProject(projInfo, &isDone); if (isDone == ExprMultipleResult) return resultSlot; /* Done with that source tuple... */ node->ps.ps_TupFromTlist = false; } /* * Reset per-tuple memory context to free any expression evaluation * storage allocated in the previous tuple cycle. Note this can't happen * until we're done projecting out tuples from a scan tuple. */ econtext = node->ps.ps_ExprContext; ResetExprContext(econtext); /* * get a tuple from the access method. Loop until we obtain a tuple that * passes the qualification. */ for (;;) { TupleTableSlot *slot; CHECK_FOR_INTERRUPTS(); slot = ExecScanFetch(node, accessMtd, recheckMtd); /* * if the slot returned by the accessMtd contains NULL, then it means * there is nothing more to scan so we just return an empty slot, * being careful to use the projection result slot so it has correct * tupleDesc. */ if (TupIsNull(slot)) { if (projInfo) return ExecClearTuple(projInfo->pi_slot); else return slot; } /* * place the current tuple into the expr context */ econtext->ecxt_scantuple = slot; /* * check that the current tuple satisfies the qual-clause * * check for non-nil qual here to avoid a function call to ExecQual() * when the qual is nil ... saves only a few cycles, but they add up * ... */ if (!qual || ExecQual(qual, econtext, false)) { /* * Found a satisfactory scan tuple. */ if (projInfo) { /* * Form a projection tuple, store it in the result tuple slot * and return it --- unless we find we can project no tuples * from this scan tuple, in which case continue scan. */ resultSlot = ExecProject(projInfo, &isDone); if (isDone != ExprEndResult) { node->ps.ps_TupFromTlist = (isDone == ExprMultipleResult); return resultSlot; } } else { /* * Here, we aren't projecting, so just return scan tuple. 
*/ return slot; } } /* * Tuple fails qual, so free per-tuple memory and try again. */ ResetExprContext(econtext); } }
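/*
 * For context, individual scan node types of this vintage plug into ExecScan
 * by supplying the two callbacks; e.g. the sequential scan node is
 * essentially just the following, with SeqNext and SeqRecheck being the
 * access and recheck methods from nodeSeqscan.c:
 */
TupleTableSlot *
ExecSeqScan(SeqScanState *node)
{
	return ExecScan((ScanState *) node,
					(ExecScanAccessMtd) SeqNext,
					(ExecScanRecheckMtd) SeqRecheck);
}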
/*
 * Helper function for the various SQL callable logical decoding functions.
 */
static Datum
pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool binary)
{
	Name		name;
	XLogRecPtr	upto_lsn;
	int32		upto_nchanges;
	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
	MemoryContext per_query_ctx;
	MemoryContext oldcontext;
	XLogRecPtr	end_of_wal;
	XLogRecPtr	startptr;
	LogicalDecodingContext *ctx;
	ResourceOwner old_resowner = CurrentResourceOwner;
	ArrayType  *arr;
	Size		ndim;
	List	   *options = NIL;
	DecodingOutputState *p;

	check_permissions();

	CheckLogicalDecodingRequirements();

	if (PG_ARGISNULL(0))
		ereport(ERROR,
				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
				 errmsg("slot name must not be null")));
	name = PG_GETARG_NAME(0);

	if (PG_ARGISNULL(1))
		upto_lsn = InvalidXLogRecPtr;
	else
		upto_lsn = PG_GETARG_LSN(1);

	if (PG_ARGISNULL(2))
		upto_nchanges = 0;		/* 0 means "no limit" in the checks below */
	else
		upto_nchanges = PG_GETARG_INT32(2);

	if (PG_ARGISNULL(3))
		ereport(ERROR,
				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
				 errmsg("options array must not be null")));
	arr = PG_GETARG_ARRAYTYPE_P(3);

	/* check to see if caller supports us returning a tuplestore */
	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("set-valued function called in context that cannot accept a set")));
	if (!(rsinfo->allowedModes & SFRM_Materialize))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("materialize mode required, but it is not allowed in this context")));

	/* state to write output to */
	p = palloc0(sizeof(DecodingOutputState));

	p->binary_output = binary;

	/* Build a tuple descriptor for our result type */
	if (get_call_result_type(fcinfo, NULL, &p->tupdesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
	oldcontext = MemoryContextSwitchTo(per_query_ctx);

	/* Deconstruct options array */
	ndim = ARR_NDIM(arr);
	if (ndim > 1)
	{
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("array must be one-dimensional")));
	}
	else if (array_contains_nulls(arr))
	{
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("array must not contain nulls")));
	}
	else if (ndim == 1)
	{
		int			nelems;
		Datum	   *datum_opts;
		int			i;

		Assert(ARR_ELEMTYPE(arr) == TEXTOID);

		deconstruct_array(arr, TEXTOID, -1, false, 'i',
						  &datum_opts, NULL, &nelems);

		if (nelems % 2 != 0)
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("array must have even number of elements")));

		for (i = 0; i < nelems; i += 2)
		{
			char	   *name = TextDatumGetCString(datum_opts[i]);
			char	   *opt = TextDatumGetCString(datum_opts[i + 1]);

			options = lappend(options, makeDefElem(name, (Node *) makeString(opt), -1));
		}
	}

	p->tupstore = tuplestore_begin_heap(true, false, work_mem);
	rsinfo->returnMode = SFRM_Materialize;
	rsinfo->setResult = p->tupstore;
	rsinfo->setDesc = p->tupdesc;

	/*
	 * Compute the current end-of-wal and maintain ThisTimeLineID.
	 * RecoveryInProgress() will update ThisTimeLineID on promotion.
	 */
	if (!RecoveryInProgress())
		end_of_wal = GetFlushRecPtr();
	else
		end_of_wal = GetXLogReplayRecPtr(&ThisTimeLineID);

	ReplicationSlotAcquire(NameStr(*name));

	PG_TRY();
	{
		/* restart at slot's confirmed_flush */
		ctx = CreateDecodingContext(InvalidXLogRecPtr,
									options,
									logical_read_local_xlog_page,
									LogicalOutputPrepareWrite,
									LogicalOutputWrite);

		MemoryContextSwitchTo(oldcontext);

		/*
		 * Check whether the output plugin writes textual output if that's
		 * what we need.
		 */
		if (!binary &&
			ctx->options.output_type != OUTPUT_PLUGIN_TEXTUAL_OUTPUT)
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("logical decoding output plugin \"%s\" produces binary output, but function \"%s\" expects textual data",
							NameStr(MyReplicationSlot->data.plugin),
							format_procedure(fcinfo->flinfo->fn_oid))));

		ctx->output_writer_private = p;

		/*
		 * Decoding of WAL must start at restart_lsn so that the entirety of
		 * xacts that committed after the slot's confirmed_flush can be
		 * accumulated into reorder buffers.
		 */
		startptr = MyReplicationSlot->data.restart_lsn;

		CurrentResourceOwner = ResourceOwnerCreate(CurrentResourceOwner, "logical decoding");

		/* invalidate non-timetravel entries */
		InvalidateSystemCaches();

		/* Decode until we run out of records */
		while ((startptr != InvalidXLogRecPtr && startptr < end_of_wal) ||
			   (ctx->reader->EndRecPtr != InvalidXLogRecPtr && ctx->reader->EndRecPtr < end_of_wal))
		{
			XLogRecord *record;
			char	   *errm = NULL;

			record = XLogReadRecord(ctx->reader, startptr, &errm);
			if (errm)
				elog(ERROR, "%s", errm);

			/*
			 * Now that we've set up the xlog reader state, subsequent calls
			 * pass InvalidXLogRecPtr to say "continue from last record"
			 */
			startptr = InvalidXLogRecPtr;

			/*
			 * The {begin_txn,change,commit_txn}_wrapper callbacks above will
			 * store the description into our tuplestore.
			 */
			if (record != NULL)
				LogicalDecodingProcessRecord(ctx, ctx->reader);

			/* check limits */
			if (upto_lsn != InvalidXLogRecPtr &&
				upto_lsn <= ctx->reader->EndRecPtr)
				break;
			if (upto_nchanges != 0 &&
				upto_nchanges <= p->returned_rows)
				break;
			CHECK_FOR_INTERRUPTS();
		}

		tuplestore_donestoring(p->tupstore);

		CurrentResourceOwner = old_resowner;

		/*
		 * Next time, start where we left off.  (Hunting things, the family
		 * business..)
		 */
		if (ctx->reader->EndRecPtr != InvalidXLogRecPtr && confirm)
		{
			LogicalConfirmReceivedLocation(ctx->reader->EndRecPtr);

			/*
			 * If only the confirmed_flush_lsn has changed the slot won't get
			 * marked as dirty by the above.  Callers on the walsender
			 * interface are expected to keep track of their own progress and
			 * don't need it written out.  But SQL-interface users cannot
			 * specify their own start positions and it's harder for them to
			 * keep track of their progress, so we should make more of an
			 * effort to save it for them.
			 *
			 * Dirty the slot so it's written out at the next checkpoint.
			 * We'll still lose its position on crash, as documented, but
			 * it's better than always losing the position even on clean
			 * restart.
			 */
			ReplicationSlotMarkDirty();
		}

		/* free context, call shutdown callback */
		FreeDecodingContext(ctx);

		ReplicationSlotRelease();

		InvalidateSystemCaches();
	}
	PG_CATCH();
	{
		/* clear all timetravel entries */
		InvalidateSystemCaches();

		PG_RE_THROW();
	}
	PG_END_TRY();

	return (Datum) 0;
}
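/*
 * Usage sketch (the slot name is invented; test_decoding ships with
 * PostgreSQL):
 *
 *   SELECT pg_create_logical_replication_slot('myslot', 'test_decoding');
 *   SELECT * FROM pg_logical_slot_get_changes('myslot', NULL, NULL);
 *
 * NULL upto_lsn and upto_nchanges mean "no limit"; any further arguments are
 * passed down as name/value option pairs for the output plugin.
 */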
/* * copy one file */ void copy_file(char *fromfile, char *tofile) { char *buffer; int srcfd; int dstfd; int nbytes; off_t offset; off_t flush_offset; /* Size of copy buffer (read and write requests) */ #define COPY_BUF_SIZE (8 * BLCKSZ) /* * Size of data flush requests. It seems beneficial on most platforms to * do this every 1MB or so. But macOS, at least with early releases of * APFS, is really unfriendly to small mmap/msync requests, so there we do * it only every 32MB. */ #if defined(__darwin__) #define FLUSH_DISTANCE (32 * 1024 * 1024) #else #define FLUSH_DISTANCE (1024 * 1024) #endif /* Use palloc to ensure we get a maxaligned buffer */ buffer = palloc(COPY_BUF_SIZE); /* * Open the files */ srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY); if (srcfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", fromfile))); dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); if (dstfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tofile))); /* * Do the data copying. */ flush_offset = 0; for (offset = 0;; offset += nbytes) { /* If we got a cancel signal during the copy of the file, quit */ CHECK_FOR_INTERRUPTS(); /* * We fsync the files later, but during the copy, flush them every so * often to avoid spamming the cache and hopefully get the kernel to * start writing them out before the fsync comes. */ if (offset - flush_offset >= FLUSH_DISTANCE) { pg_flush_data(dstfd, flush_offset, offset - flush_offset); flush_offset = offset; } pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_READ); nbytes = read(srcfd, buffer, COPY_BUF_SIZE); pgstat_report_wait_end(); if (nbytes < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", fromfile))); if (nbytes == 0) break; errno = 0; pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_WRITE); if ((int) write(dstfd, buffer, nbytes) != nbytes) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tofile))); } pgstat_report_wait_end(); } if (offset > flush_offset) pg_flush_data(dstfd, flush_offset, offset - flush_offset); if (CloseTransientFile(dstfd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tofile))); if (CloseTransientFile(srcfd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", fromfile))); pfree(buffer); }
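/*
 * Illustrative sketch (not from the original source): the periodic-flush
 * idiom from copy_file() in isolation. With the default BLCKSZ of 8192,
 * COPY_BUF_SIZE is 64kB, so the non-macOS FLUSH_DISTANCE of 1MB triggers a
 * flush roughly every 16 read/write cycles. flush_hint() is a hypothetical
 * stand-in for pg_flush_data(); the read/write step is elided.
 */
static void
flush_in_chunks(int fd, off_t total, off_t chunk, off_t distance,
				void (*flush_hint) (int fd, off_t offset, off_t nbytes))
{
	off_t		offset;
	off_t		flush_offset = 0;

	for (offset = 0; offset < total; offset += chunk)
	{
		/* ... read/write one chunk at 'offset' here ... */

		/* hint the kernel once enough dirty data has accumulated */
		if (offset - flush_offset >= distance)
		{
			flush_hint(fd, flush_offset, offset - flush_offset);
			flush_offset = offset;
		}
	}

	/* flush whatever is left over */
	if (offset > flush_offset)
		flush_hint(fd, flush_offset, offset - flush_offset);
}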
/* * PGSemaphoreLock * * Lock a semaphore (decrement count), blocking if count would be < 0 */ void PGSemaphoreLock(PGSemaphore sema, bool interruptOK) { int errStatus; struct sembuf sops; sops.sem_op = -1; /* decrement */ sops.sem_flg = 0; sops.sem_num = sema->semNum; /* * Note: if errStatus is -1 and errno == EINTR then it means we returned * from the operation prematurely because we were sent a signal. So we * try and lock the semaphore again. * * Each time around the loop, we check for a cancel/die interrupt. On * some platforms, if such an interrupt comes in while we are waiting, it * will cause the semop() call to exit with errno == EINTR, allowing us to * service the interrupt (if not in a critical section already) during the * next loop iteration. * * Once we acquire the lock, we do NOT check for an interrupt before * returning. The caller needs to be able to record ownership of the lock * before any interrupt can be accepted. * * There is a window of a few instructions between CHECK_FOR_INTERRUPTS * and entering the semop() call. If a cancel/die interrupt occurs in * that window, we would fail to notice it until after we acquire the lock * (or get another interrupt to escape the semop()). We can avoid this * problem by temporarily setting ImmediateInterruptOK to true before we * do CHECK_FOR_INTERRUPTS; then, a die() interrupt in this interval will * execute directly. However, there is a huge pitfall: there is another * window of a few instructions after the semop() before we are able to * reset ImmediateInterruptOK. If an interrupt occurs then, we'll lose * control, which means that the lock has been acquired but our caller did * not get a chance to record the fact. Therefore, we only set * ImmediateInterruptOK if the caller tells us it's OK to do so, ie, the * caller does not need to record acquiring the lock. (This is currently * true for lockmanager locks, since the process that granted us the lock * did all the necessary state updates. It's not true for SysV semaphores * used to implement LW locks or emulate spinlocks --- but the wait time * for such locks should not be very long, anyway.) * * On some platforms, signals marked SA_RESTART (which is most, for us) * will not interrupt the semop(); it will just keep waiting. Therefore * it's necessary for cancel/die interrupts to be serviced directly by the * signal handler. On these platforms the behavior is really the same * whether the signal arrives just before the semop() begins, or while it * is waiting. The loop on EINTR is thus important only for other types * of interrupts. */ do { ImmediateInterruptOK = interruptOK; CHECK_FOR_INTERRUPTS(); errStatus = semop(sema->semId, &sops, 1); ImmediateInterruptOK = false; } while (errStatus < 0 && errno == EINTR); if (errStatus < 0) elog(FATAL, "semop(id=%d) failed: %m", sema->semId); }
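/*
 * Illustrative sketch (not part of the original source): a non-blocking
 * variant of the same semop() idiom, modeled on PostgreSQL's
 * PGSemaphoreTryLock(). IPC_NOWAIT makes semop() fail with EAGAIN instead of
 * sleeping when the count would go negative; EINTR still just means "retry".
 * The function name and raw-int parameters are hypothetical simplifications
 * of the PGSemaphore fields used above.
 */
static bool
sem_try_lock(int semId, int semNum)
{
	struct sembuf sops;
	int			errStatus;

	sops.sem_op = -1;			/* decrement */
	sops.sem_flg = IPC_NOWAIT;	/* but do not block */
	sops.sem_num = semNum;

	do
	{
		errStatus = semop(semId, &sops, 1);
	} while (errStatus < 0 && errno == EINTR);

	if (errStatus < 0)
	{
		/* EAGAIN (EWOULDBLOCK on some platforms) means "not available" */
		if (errno == EAGAIN)
			return false;
		elog(FATAL, "semop(id=%d) failed: %m", semId);
	}

	return true;
}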
/* * copydir: copy a directory * * If recurse is false, subdirectories are ignored. Anything that's not * a directory or a regular file is ignored. */ void copydir(char *fromdir, char *todir, bool recurse) { DIR *xldir; struct dirent *xlde; char fromfile[MAXPGPATH * 2]; char tofile[MAXPGPATH * 2]; if (MakePGDirectory(todir) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create directory \"%s\": %m", todir))); xldir = AllocateDir(fromdir); while ((xlde = ReadDir(xldir, fromdir)) != NULL) { struct stat fst; /* If we got a cancel signal during the copy of the directory, quit */ CHECK_FOR_INTERRUPTS(); if (strcmp(xlde->d_name, ".") == 0 || strcmp(xlde->d_name, "..") == 0) continue; snprintf(fromfile, sizeof(fromfile), "%s/%s", fromdir, xlde->d_name); snprintf(tofile, sizeof(tofile), "%s/%s", todir, xlde->d_name); if (lstat(fromfile, &fst) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", fromfile))); if (S_ISDIR(fst.st_mode)) { /* recurse to handle subdirectories */ if (recurse) copydir(fromfile, tofile, true); } else if (S_ISREG(fst.st_mode)) copy_file(fromfile, tofile); } FreeDir(xldir); /* * Be paranoid here and fsync all files to ensure the copy is really done. * But if fsync is disabled, we're done. */ if (!enableFsync) return; xldir = AllocateDir(todir); while ((xlde = ReadDir(xldir, todir)) != NULL) { struct stat fst; if (strcmp(xlde->d_name, ".") == 0 || strcmp(xlde->d_name, "..") == 0) continue; snprintf(tofile, sizeof(tofile), "%s/%s", todir, xlde->d_name); /* * We don't need to sync subdirectories here since the recursive * copydir will do it before it returns */ if (lstat(tofile, &fst) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", tofile))); if (S_ISREG(fst.st_mode)) fsync_fname(tofile, false); } FreeDir(xldir); /* * It's important to fsync the destination directory itself as individual * file fsyncs don't guarantee that the directory entry for the file is * synced. Recent versions of ext4 have made the window much wider but * it's been true for ext3 and other filesystems in the past. */ fsync_fname(todir, true); }
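/*
 * Illustrative sketch (not from the original source): a typical call, in the
 * style of CREATE DATABASE cloning a template database's directory. The
 * function and both path arguments are hypothetical; recurse = true descends
 * into subdirectories, and copydir() fsyncs the copied files and the
 * directory itself before returning (unless fsync is disabled).
 */
static void
clone_dbspace(const char *srcpath, const char *dstpath)
{
	/* casts match copydir()'s historical non-const signature above */
	copydir((char *) srcpath, (char *) dstpath, true);
}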
/* * Assumes that the segment file lock is already held. * Assumes that the segment file should be compacted. * */ static void AppendOnlySegmentFileFullCompaction(Relation aorel, AppendOnlyEntry *aoEntry, AppendOnlyInsertDesc insertDesc, FileSegInfo* fsinfo) { const char* relname; AppendOnlyVisimap visiMap; AppendOnlyScanDesc scanDesc; TupleDesc tupDesc; MemTuple tuple; TupleTableSlot *slot; MemTupleBinding *mt_bind; int compact_segno; int64 movedTupleCount = 0; ResultRelInfo *resultRelInfo; EState *estate; AOTupleId *aoTupleId; int64 tupleCount = 0; int64 tuplePerPage = INT_MAX; Assert(Gp_role == GP_ROLE_EXECUTE || Gp_role == GP_ROLE_UTILITY); Assert(RelationIsAoRows(aorel)); Assert(insertDesc); compact_segno = fsinfo->segno; if (fsinfo->varblockcount > 0) { tuplePerPage = fsinfo->total_tupcount / fsinfo->varblockcount; } relname = RelationGetRelationName(aorel); AppendOnlyVisimap_Init(&visiMap, aoEntry->visimaprelid, aoEntry->visimapidxid, ShareUpdateExclusiveLock, SnapshotNow); elogif(Debug_appendonly_print_compaction, LOG, "Compact AO segno %d, relation %s, insert segno %d", compact_segno, relname, insertDesc->storageWrite.segmentFileNum); /* * TODO: We need to limit the scan to one file, and we need to avoid * locking the file again. * * We use SnapshotAny to get visible and invisible tuples. */ scanDesc = appendonly_beginrangescan(aorel, SnapshotAny, SnapshotNow, &compact_segno, 1, 0, NULL); tupDesc = RelationGetDescr(aorel); slot = MakeSingleTupleTableSlot(tupDesc); mt_bind = create_memtuple_binding(tupDesc); /* * We need a ResultRelInfo and an EState so we can use the regular * executor's index-entry-making machinery. */ estate = CreateExecutorState(); resultRelInfo = makeNode(ResultRelInfo); resultRelInfo->ri_RangeTableIndex = 1; /* dummy */ resultRelInfo->ri_RelationDesc = aorel; resultRelInfo->ri_TrigDesc = NULL; /* we don't fire triggers */ ExecOpenIndices(resultRelInfo); estate->es_result_relations = resultRelInfo; estate->es_num_result_relations = 1; estate->es_result_relation_info = resultRelInfo; /* * Go through all visible tuples and move them to a new segfile. */ while ((tuple = appendonly_getnext(scanDesc, ForwardScanDirection, slot)) != NULL) { /* Check interrupts as this may take time. */ CHECK_FOR_INTERRUPTS(); aoTupleId = (AOTupleId*)slot_get_ctid(slot); if (AppendOnlyVisimap_IsVisible(&scanDesc->visibilityMap, aoTupleId)) { AppendOnlyMoveTuple(tuple, slot, mt_bind, insertDesc, resultRelInfo, estate); movedTupleCount++; } else { /* Tuple is invisible and needs to be dropped */ AppendOnlyThrowAwayTuple(aorel, tuple, slot, mt_bind); } /* * Check for a vacuum delay point after approximately one varblock */ tupleCount++; if (VacuumCostActive && tupleCount % tuplePerPage == 0) { vacuum_delay_point(); } } SetFileSegInfoState(aorel, aoEntry, compact_segno, AOSEG_STATE_AWAITING_DROP); AppendOnlyVisimap_DeleteSegmentFile(&visiMap, compact_segno); /* Delete all mini pages of the segment files if block directory exists */ if (OidIsValid(aoEntry->blkdirrelid)) { AppendOnlyBlockDirectory_DeleteSegmentFile( aoEntry, SnapshotNow, compact_segno, 0); } elogif(Debug_appendonly_print_compaction, LOG, "Finished compaction: " "AO segfile %d, relation %s, moved tuple count " INT64_FORMAT, compact_segno, relname, movedTupleCount); AppendOnlyVisimap_Finish(&visiMap, NoLock); ExecCloseIndices(resultRelInfo); FreeExecutorState(estate); ExecDropSingleTupleTableSlot(slot); destroy_memtuple_binding(mt_bind); appendonly_endscan(scanDesc); }
Datum ginbuild(PG_FUNCTION_ARGS) { Relation heap = (Relation) PG_GETARG_POINTER(0); Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; double reltuples; GinBuildState buildstate; Buffer RootBuffer, MetaBuffer; ItemPointerData *list; Datum entry; uint32 nlist; MemoryContext oldCtx; OffsetNumber attnum; if (RelationGetNumberOfBlocks(index) != 0) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); initGinState(&buildstate.ginstate, index); /* initialize the meta page */ MetaBuffer = GinNewBuffer(index); /* initialize the root page */ RootBuffer = GinNewBuffer(index); START_CRIT_SECTION(); GinInitMetabuffer(MetaBuffer); MarkBufferDirty(MetaBuffer); GinInitBuffer(RootBuffer, GIN_LEAF); MarkBufferDirty(RootBuffer); if (!index->rd_istemp) { XLogRecPtr recptr; XLogRecData rdata; Page page; rdata.buffer = InvalidBuffer; rdata.data = (char *) &(index->rd_node); rdata.len = sizeof(RelFileNode); rdata.next = NULL; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX, &rdata); page = BufferGetPage(RootBuffer); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); page = BufferGetPage(MetaBuffer); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } UnlockReleaseBuffer(MetaBuffer); UnlockReleaseBuffer(RootBuffer); END_CRIT_SECTION(); /* build the index */ buildstate.indtuples = 0; /* * create a temporary memory context that is reset once for each tuple * inserted into the index */ buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext, "Gin build temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); buildstate.funcCtx = AllocSetContextCreate(buildstate.tmpCtx, "Gin build temporary context for user-defined function", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); /* * Do the heap scan. We disallow sync scan here because dataPlaceToPage * prefers to receive tuples in TID order. */ reltuples = IndexBuildHeapScan(heap, index, indexInfo, false, ginBuildCallback, (void *) &buildstate); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); while ((list = ginGetEntry(&buildstate.accum, &attnum, &entry, &nlist)) != NULL) { /* there could be many entries, so be willing to abort here */ CHECK_FOR_INTERRUPTS(); ginEntryInsert(index, &buildstate.ginstate, attnum, entry, list, nlist, TRUE); } MemoryContextSwitchTo(oldCtx); MemoryContextDelete(buildstate.tmpCtx); /* * Return statistics */ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; PG_RETURN_POINTER(result); }
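/*
 * Illustrative sketch (not part of the original source): the
 * reset-per-iteration memory context idiom that ginbuild() sets up with
 * buildstate.tmpCtx. Everything palloc'd inside the loop lives in the
 * temporary context and is freed wholesale by MemoryContextReset(), so the
 * loop cannot leak no matter what the per-item work allocates.
 * process_one_item() and nitems are hypothetical.
 */
extern void process_one_item(int itemno);	/* hypothetical per-item work */

static void
process_items_without_leaking(int nitems)
{
	MemoryContext tmpCtx;
	MemoryContext oldCtx;
	int			i;

	tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
								   "per-item temporary context",
								   ALLOCSET_DEFAULT_MINSIZE,
								   ALLOCSET_DEFAULT_INITSIZE,
								   ALLOCSET_DEFAULT_MAXSIZE);

	for (i = 0; i < nitems; i++)
	{
		/* there could be many items, so be willing to abort here */
		CHECK_FOR_INTERRUPTS();

		oldCtx = MemoryContextSwitchTo(tmpCtx);
		process_one_item(i);	/* may palloc freely */
		MemoryContextSwitchTo(oldCtx);

		/* discard everything the item allocated */
		MemoryContextReset(tmpCtx);
	}

	MemoryContextDelete(tmpCtx);
}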
/* * Recursive guts of FreeSpaceMapVacuum */ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p) { Buffer buf; Page page; uint8 max_avail; /* Read the page if it exists, or return EOF */ buf = fsm_readbuf(rel, addr, false); if (!BufferIsValid(buf)) { *eof_p = true; return 0; } else *eof_p = false; page = BufferGetPage(buf); /* * Recurse into children, and fix the information stored about them at * this level. */ if (addr.level > FSM_BOTTOM_LEVEL) { int slot; bool eof = false; for (slot = 0; slot < SlotsPerFSMPage; slot++) { int child_avail; CHECK_FOR_INTERRUPTS(); /* After we hit end-of-file, just clear the rest of the slots */ if (!eof) child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), &eof); else child_avail = 0; /* Update information about the child */ if (fsm_get_avail(page, slot) != child_avail) { LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); fsm_set_avail(BufferGetPage(buf), slot, child_avail); MarkBufferDirty(buf); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } } } max_avail = fsm_get_max_avail(BufferGetPage(buf)); /* * Reset the next slot pointer. This encourages the use of low-numbered * pages, increasing the chances that a later vacuum can truncate the * relation. */ ((FSMPage) PageGetContents(page))->fp_next_slot = 0; ReleaseBuffer(buf); return max_avail; }
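/*
 * Illustrative sketch (not part of the original source): the non-recursive
 * entry point, modeled on PostgreSQL's FreeSpaceMapVacuum(). It simply kicks
 * off fsm_vacuum_page() at the root of the FSM tree; the EOF flag only
 * matters to interior levels, so it is discarded here.
 */
void
FreeSpaceMapVacuum(Relation rel)
{
	bool		dummy;

	/* fsm_vacuum_page() recurses depth-first from the root */
	(void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, &dummy);
}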
/* ---------------------------------------------------------------- * ExecGatherMerge(node) * * Scans the relation via multiple workers and returns * the next qualifying tuple. * ---------------------------------------------------------------- */ static TupleTableSlot * ExecGatherMerge(PlanState *pstate) { GatherMergeState *node = castNode(GatherMergeState, pstate); TupleTableSlot *slot; ExprContext *econtext; CHECK_FOR_INTERRUPTS(); /* * As with Gather, we don't launch workers until this node is actually * executed. */ if (!node->initialized) { EState *estate = node->ps.state; GatherMerge *gm = castNode(GatherMerge, node->ps.plan); /* * Sometimes we might have to run without parallelism; but if parallel * mode is active then we can try to fire up some workers. */ if (gm->num_workers > 0 && estate->es_use_parallel_mode) { ParallelContext *pcxt; /* Initialize, or re-initialize, shared state needed by workers. */ if (!node->pei) node->pei = ExecInitParallelPlan(node->ps.lefttree, estate, gm->initParam, gm->num_workers, node->tuples_needed); else ExecParallelReinitialize(node->ps.lefttree, node->pei, gm->initParam); /* Try to launch workers. */ pcxt = node->pei->pcxt; LaunchParallelWorkers(pcxt); /* We save # workers launched for the benefit of EXPLAIN */ node->nworkers_launched = pcxt->nworkers_launched; /* Set up tuple queue readers to read the results. */ if (pcxt->nworkers_launched > 0) { ExecParallelCreateReaders(node->pei); /* Make a working array showing the active readers */ node->nreaders = pcxt->nworkers_launched; node->reader = (TupleQueueReader **) palloc(node->nreaders * sizeof(TupleQueueReader *)); memcpy(node->reader, node->pei->reader, node->nreaders * sizeof(TupleQueueReader *)); } else { /* No workers? Then never mind. */ node->nreaders = 0; node->reader = NULL; } } /* allow leader to participate if enabled or no choice */ if (parallel_leader_participation || node->nreaders == 0) node->need_to_scan_locally = true; node->initialized = true; } /* * Reset per-tuple memory context to free any expression evaluation * storage allocated in the previous tuple cycle. */ econtext = node->ps.ps_ExprContext; ResetExprContext(econtext); /* * Get next tuple, either from one of our workers, or by running the plan * ourselves. */ slot = gather_merge_getnext(node); if (TupIsNull(slot)) return NULL; /* If no projection is required, we're done. */ if (node->ps.ps_ProjInfo == NULL) return slot; /* * Form the result tuple using ExecProject(), and return it. */ econtext->ecxt_outertuple = slot; return ExecProject(node->ps.ps_ProjInfo); }
static size_t AsyncSourceRead(AsyncSource *self, void *buffer, size_t len) { char *data; int size; int begin; int end; char errhead; size_t bytesread; int n; /* allocate a buffer at least 4 times the requested size */ if (self->size < len * 4) { char *newbuf; int newsize; /* round the new size up to a multiple of READ_UNIT_SIZE */ newsize = (len * 4 - 1) - ((len * 4 - 1) % READ_UNIT_SIZE) + READ_UNIT_SIZE; newbuf = palloc0(newsize); pthread_mutex_lock(&self->lock); /* copy the existing data from the old buffer into the new one */ if (self->begin > self->end) { memcpy(newbuf, self->buffer + self->begin, self->size - self->begin); memcpy(newbuf + self->size - self->begin, self->buffer, self->end); self->end = self->size - self->begin + self->end; } else { memcpy(newbuf, self->buffer + self->begin, self->end - self->begin); self->end = self->end - self->begin; } pfree(self->buffer); self->buffer = newbuf; self->size = newsize; self->begin = 0; pthread_mutex_unlock(&self->lock); } /* these values are not changed by the read thread */ data = self->buffer; size = self->size; begin = self->begin; bytesread = 0; retry: end = self->end; errhead = self->errmsg[0]; /* the read thread has reported an error */ if (errhead != '\0') { /* lock and unlock once to be sure the complete message has been written */ pthread_mutex_lock(&self->lock); pthread_mutex_unlock(&self->lock); ereport(ERROR, (errcode_for_file_access(), errmsg("%s", self->errmsg))); } if (begin < end) { n = Min(len - bytesread, end - begin); memcpy((char *) buffer + bytesread, data + begin, n); begin += n; bytesread += n; } else if (begin > end) { n = Min(len - bytesread, size - begin); memcpy((char *) buffer + bytesread, data + begin, n); begin += n; bytesread += n; if (begin == size) { self->begin = begin = 0; if (bytesread < len) goto retry; } } self->begin = begin; if (bytesread == len || (self->eof && begin == end)) return bytesread; /* not enough data yet */ CHECK_FOR_INTERRUPTS(); pg_usleep(SPIN_SLEEP_MSEC * 1000); goto retry; }
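/*
 * Illustrative sketch (not part of the original source): the single-producer
 * single-consumer ring-buffer convention AsyncSourceRead() relies on.
 * 'begin' is advanced only by the consumer and 'end' only by the producer;
 * begin == end means empty, and the producer always leaves one slot free, so
 * "full" and "empty" are distinguishable without a separate counter. The
 * struct and function names are hypothetical.
 */
typedef struct RingBuf
{
	char	   *data;
	int			size;
	volatile int begin;			/* next byte to read (consumer side) */
	volatile int end;			/* next byte to write (producer side) */
} RingBuf;

/* producer side: append up to len bytes, return how many actually fit */
static int
ring_write(RingBuf *rb, const char *src, int len)
{
	int			written = 0;

	while (written < len)
	{
		int			next = (rb->end + 1) % rb->size;

		if (next == rb->begin)
			break;				/* full: keep one slot free */
		rb->data[rb->end] = src[written++];
		rb->end = next;
	}
	return written;
}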
/* * Initialize the Gather Merge. * * Reset data structures to ensure they're empty. Then pull at least one * tuple from leader + each worker (or set its "done" indicator), and set up * the heap. */ static void gather_merge_init(GatherMergeState *gm_state) { int nreaders = gm_state->nreaders; bool nowait = true; int i; /* Assert that gather_merge_setup made enough space */ Assert(nreaders <= castNode(GatherMerge, gm_state->ps.plan)->num_workers); /* Reset leader's tuple slot to empty */ gm_state->gm_slots[0] = NULL; /* Reset the tuple slot and tuple array for each worker */ for (i = 0; i < nreaders; i++) { /* Reset tuple array to empty */ gm_state->gm_tuple_buffers[i].nTuples = 0; gm_state->gm_tuple_buffers[i].readCounter = 0; /* Reset done flag to not-done */ gm_state->gm_tuple_buffers[i].done = false; /* Ensure output slot is empty */ ExecClearTuple(gm_state->gm_slots[i + 1]); } /* Reset binary heap to empty */ binaryheap_reset(gm_state->gm_heap); /* * First, try to read a tuple from each worker (including leader) in * nowait mode. After this, if not all workers were able to produce a * tuple (or a "done" indication), then re-read from remaining workers, * this time using wait mode. Add all live readers (those producing at * least one tuple) to the heap. */ reread: for (i = 0; i <= nreaders; i++) { CHECK_FOR_INTERRUPTS(); /* skip this source if already known done */ if ((i == 0) ? gm_state->need_to_scan_locally : !gm_state->gm_tuple_buffers[i - 1].done) { if (TupIsNull(gm_state->gm_slots[i])) { /* Don't have a tuple yet, try to get one */ if (gather_merge_readnext(gm_state, i, nowait)) binaryheap_add_unordered(gm_state->gm_heap, Int32GetDatum(i)); } else { /* * We already got at least one tuple from this worker, but * might as well see if it has any more ready by now. */ load_tuple_array(gm_state, i); } } } /* need not recheck leader, since nowait doesn't matter for it */ for (i = 1; i <= nreaders; i++) { if (!gm_state->gm_tuple_buffers[i - 1].done && TupIsNull(gm_state->gm_slots[i])) { nowait = false; goto reread; } } /* Now heapify the heap. */ binaryheap_build(gm_state->gm_heap); gm_state->gm_initialized = true; }
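/*
 * Illustrative sketch (not part of the original source): the "add unordered,
 * then build" binary-heap idiom used above. Loading every element first and
 * heapifying once with binaryheap_build() is O(n), versus O(n log n) for n
 * ordered insertions. binaryheap_* are the regular PostgreSQL
 * lib/binaryheap.h routines, whose heap is a max-heap with respect to the
 * comparator; inverting the comparison (as gather_merge does) therefore puts
 * the smallest element on top. The two function names are hypothetical.
 */
static int
int32_heap_cmp(Datum a, Datum b, void *arg)
{
	int32		va = DatumGetInt32(a);
	int32		vb = DatumGetInt32(b);

	/* inverted: report the smaller value as "greater" so it surfaces first */
	return (va < vb) ? 1 : (va > vb) ? -1 : 0;
}

static int32
smallest_of(const int32 *vals, int n)
{
	binaryheap *heap = binaryheap_allocate(n, int32_heap_cmp, NULL);
	int32		result;
	int			i;

	for (i = 0; i < n; i++)
		binaryheap_add_unordered(heap, Int32GetDatum(vals[i]));
	binaryheap_build(heap);		/* one O(n) heapify */

	result = DatumGetInt32(binaryheap_first(heap));
	binaryheap_free(heap);
	return result;
}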
/* ------------------------------------------------------ * pgstatindex() * * Usage: SELECT * FROM pgstatindex('t1_pkey'); * ------------------------------------------------------ */ Datum pgstatindex(PG_FUNCTION_ARGS) { text *relname = PG_GETARG_TEXT_P(0); Relation rel; RangeVar *relrv; Datum result; BlockNumber nblocks; BlockNumber blkno; BTIndexStat indexStat; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use pgstattuple functions")))); relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); rel = relation_openrv(relrv, AccessShareLock); if (!IS_INDEX(rel) || !IS_BTREE(rel)) elog(ERROR, "relation \"%s\" is not a btree index", RelationGetRelationName(rel)); /* * Reject attempts to read non-local temporary relations; we would be * likely to get wrong data since we have no visibility into the owning * session's local buffers. */ if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary tables of other sessions"))); /* * Read metapage */ { Buffer buffer = ReadBuffer(rel, 0); Page page = BufferGetPage(buffer); BTMetaPageData *metad = BTPageGetMeta(page); indexStat.version = metad->btm_version; indexStat.level = metad->btm_level; indexStat.root_blkno = metad->btm_root; ReleaseBuffer(buffer); } /* -- init counters -- */ indexStat.root_pages = 0; indexStat.internal_pages = 0; indexStat.leaf_pages = 0; indexStat.empty_pages = 0; indexStat.deleted_pages = 0; indexStat.max_avail = 0; indexStat.free_space = 0; indexStat.fragments = 0; /* * Scan all blocks except the metapage */ nblocks = RelationGetNumberOfBlocks(rel); for (blkno = 1; blkno < nblocks; blkno++) { Buffer buffer; Page page; BTPageOpaque opaque; CHECK_FOR_INTERRUPTS(); /* Read and lock buffer */ buffer = ReadBuffer(rel, blkno); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* Determine page type, and update totals */ if (P_ISLEAF(opaque)) { int max_avail; max_avail = BLCKSZ - (BLCKSZ - ((PageHeader) page)->pd_special + SizeOfPageHeaderData); indexStat.max_avail += max_avail; indexStat.free_space += PageGetFreeSpace(page); indexStat.leaf_pages++; /* * If the next leaf is on an earlier block, that indicates * fragmentation. 
*/ if (opaque->btpo_next != P_NONE && opaque->btpo_next < blkno) indexStat.fragments++; } else if (P_ISDELETED(opaque)) indexStat.deleted_pages++; else if (P_IGNORE(opaque)) indexStat.empty_pages++; else if (P_ISROOT(opaque)) indexStat.root_pages++; else indexStat.internal_pages++; /* Unlock and release buffer */ LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); } relation_close(rel, AccessShareLock); /*---------------------------- * Build a result tuple *---------------------------- */ { TupleDesc tupleDesc; int j; char *values[10]; HeapTuple tuple; /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); j = 0; values[j] = palloc(32); snprintf(values[j++], 32, "%d", indexStat.version); values[j] = palloc(32); snprintf(values[j++], 32, "%d", indexStat.level); values[j] = palloc(32); snprintf(values[j++], 32, INT64_FORMAT, (indexStat.root_pages + indexStat.leaf_pages + indexStat.internal_pages + indexStat.deleted_pages + indexStat.empty_pages) * BLCKSZ); values[j] = palloc(32); snprintf(values[j++], 32, "%u", indexStat.root_blkno); values[j] = palloc(32); snprintf(values[j++], 32, INT64_FORMAT, indexStat.internal_pages); values[j] = palloc(32); snprintf(values[j++], 32, INT64_FORMAT, indexStat.leaf_pages); values[j] = palloc(32); snprintf(values[j++], 32, INT64_FORMAT, indexStat.empty_pages); values[j] = palloc(32); snprintf(values[j++], 32, INT64_FORMAT, indexStat.deleted_pages); values[j] = palloc(32); if (indexStat.max_avail > 0) snprintf(values[j++], 32, "%.2f", 100.0 - (double) indexStat.free_space / (double) indexStat.max_avail * 100.0); else snprintf(values[j++], 32, "NaN"); values[j] = palloc(32); if (indexStat.leaf_pages > 0) snprintf(values[j++], 32, "%.2f", (double) indexStat.fragments / (double) indexStat.leaf_pages * 100.0); else snprintf(values[j++], 32, "NaN"); tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc), values); result = HeapTupleGetDatum(tuple); } PG_RETURN_DATUM(result); }
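/*
 * Illustrative sketch (not part of the original source): the two percentages
 * pgstatindex() prints, factored into helpers. avg_leaf_density is the
 * filled fraction of usable leaf space; leaf_fragmentation is the share of
 * leaf pages whose right sibling sits at a lower block number. NAN mirrors
 * the "NaN" string emitted above when there is nothing to measure. The
 * helper names are hypothetical.
 */
#include <math.h>

static double
avg_leaf_density(uint64 free_space, uint64 max_avail)
{
	if (max_avail == 0)
		return NAN;				/* no leaf pages seen */
	return 100.0 - ((double) free_space / (double) max_avail) * 100.0;
}

static double
leaf_fragmentation(uint64 fragments, uint64 leaf_pages)
{
	if (leaf_pages == 0)
		return NAN;
	return ((double) fragments / (double) leaf_pages) * 100.0;
}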
/* * _hash_init() -- Initialize the metadata page of a hash index, * the initial buckets, and the initial bitmap page. * * The initial number of buckets is dependent on num_tuples, an estimate * of the number of tuples to be loaded into the index initially. The * chosen number of buckets is returned. * * We are fairly cavalier about locking here, since we know that no one else * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum) { Buffer metabuf; Buffer buf; Buffer bitmapbuf; Page pg; HashMetaPage metap; RegProcedure procid; int32 data_width; int32 item_width; int32 ffactor; uint32 num_buckets; uint32 i; bool use_wal; /* safety check */ if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); /* * WAL log creation of pages if the relation is persistent, or this is the * init fork. Init forks for unlogged relations always need to be WAL * logged. */ use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM; /* * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. */ data_width = sizeof(uint32); item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; /* keep to a sane range */ if (ffactor < 10) ffactor = 10; procid = index_getprocid(rel, 1, HASHSTANDARD_PROC); /* * We initialize the metapage, the first N bucket pages, and the first * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. * * Critical section not required, because on error the creation of the * whole relation will be rolled back. */ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false); MarkBufferDirty(metabuf); pg = BufferGetPage(metabuf); metap = HashPageGetMeta(pg); /* XLOG stuff */ if (use_wal) { xl_hash_init_meta_page xlrec; XLogRecPtr recptr; xlrec.num_tuples = num_tuples; xlrec.procid = metap->hashm_procid; xlrec.ffactor = metap->hashm_ffactor; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage); XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE); PageSetLSN(BufferGetPage(metabuf), recptr); } num_buckets = metap->hashm_maxbucket + 1; /* * Release buffer lock on the metapage while we initialize buckets. * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS * won't accomplish anything. It's a bad idea to hold buffer locks for * long intervals in any case, since that can block the bgwriter. 
*/ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); /* * Initialize and WAL Log the first N buckets */ for (i = 0; i < num_buckets; i++) { BlockNumber blkno; /* Allow interrupts, in case N is huge */ CHECK_FOR_INTERRUPTS(); blkno = BUCKET_TO_BLKNO(metap, i); buf = _hash_getnewbuf(rel, blkno, forkNum); _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false); MarkBufferDirty(buf); if (use_wal) log_newpage(&rel->rd_node, forkNum, blkno, BufferGetPage(buf), true); _hash_relbuf(rel, buf); } /* Now reacquire buffer lock on metapage */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); /* * Initialize bitmap page */ bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum); _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false); MarkBufferDirty(bitmapbuf); /* add the new bitmap page to the metapage's list of bitmaps */ /* metapage already has a write lock */ if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("out of overflow pages in hash index \"%s\"", RelationGetRelationName(rel)))); metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; metap->hashm_nmaps++; MarkBufferDirty(metabuf); /* XLOG stuff */ if (use_wal) { xl_hash_init_bitmap_page xlrec; XLogRecPtr recptr; xlrec.bmsize = metap->hashm_bmsize; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage); XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT); /* * This is safe only because nobody else can be modifying the index at * this stage; it's only visible to the transaction that is creating * it. */ XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE); PageSetLSN(BufferGetPage(bitmapbuf), recptr); PageSetLSN(BufferGetPage(metabuf), recptr); } /* all done */ _hash_relbuf(rel, bitmapbuf); _hash_relbuf(rel, metabuf); return num_buckets; }
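/*
 * Illustrative sketch (not part of the original source): how an index-build
 * entry point might invoke _hash_init(), in the style of hashbuild(). The
 * function and variable names are hypothetical; MAIN_FORKNUM is the regular
 * on-disk fork, while an ambuildempty-style caller would pass INIT_FORKNUM
 * so the pages are always WAL-logged.
 */
static uint32
init_hash_index(Relation index, double estimated_tuples)
{
	/* returns the number of buckets chosen from the tuple-count estimate */
	return _hash_init(index, estimated_tuples, MAIN_FORKNUM);
}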
/* * Summarize page ranges that are not already summarized. If pageRange is * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the * page range containing the given heap page number is scanned. * If include_partial is true, then the partial range at the end of the table * is summarized, otherwise not. * * For each new index tuple inserted, *numSummarized (if not NULL) is * incremented; for each existing tuple, *numExisting (if not NULL) is * incremented. */ static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, bool include_partial, double *numSummarized, double *numExisting) { BrinRevmap *revmap; BrinBuildState *state = NULL; IndexInfo *indexInfo = NULL; BlockNumber heapNumBlocks; BlockNumber pagesPerRange; Buffer buf; BlockNumber startBlk; revmap = brinRevmapInitialize(index, &pagesPerRange, NULL); /* determine range of pages to process */ heapNumBlocks = RelationGetNumberOfBlocks(heapRel); if (pageRange == BRIN_ALL_BLOCKRANGES) startBlk = 0; else { startBlk = (pageRange / pagesPerRange) * pagesPerRange; heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange); } if (startBlk > heapNumBlocks) { /* Nothing to do if start point is beyond end of table */ brinRevmapTerminate(revmap); return; } /* * Scan the revmap to find unsummarized items. */ buf = InvalidBuffer; for (; startBlk < heapNumBlocks; startBlk += pagesPerRange) { BrinTuple *tup; OffsetNumber off; /* * Unless requested to summarize even a partial range, go away now if * we think the next range is partial. Caller would pass true when it * is typically run once bulk data loading is done * (brin_summarize_new_values), and false when it is typically the * result of arbitrarily-scheduled maintenance command (vacuuming). */ if (!include_partial && (startBlk + pagesPerRange > heapNumBlocks)) break; CHECK_FOR_INTERRUPTS(); tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL, BUFFER_LOCK_SHARE, NULL); if (tup == NULL) { /* no revmap entry for this heap range. Summarize it. */ if (state == NULL) { /* first time through */ Assert(!indexInfo); state = initialize_brin_buildstate(index, revmap, pagesPerRange); indexInfo = BuildIndexInfo(index); } summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks); /* and re-initialize state for the next range */ brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc); if (numSummarized) *numSummarized += 1.0; } else { if (numExisting) *numExisting += 1.0; LockBuffer(buf, BUFFER_LOCK_UNLOCK); } } if (BufferIsValid(buf)) ReleaseBuffer(buf); /* free resources */ brinRevmapTerminate(revmap); if (state) { terminate_brin_buildstate(state); pfree(indexInfo); } }
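/*
 * Illustrative sketch (not part of the original source): the two call shapes
 * described in the header comment above. A brin_summarize_new_values()-style
 * caller summarizes everything, including the trailing partial range; a
 * vacuum-style caller passes include_partial = false so the partial range is
 * left alone and can keep growing. The wrapper name is hypothetical; the
 * counter arguments are optional and may be NULL.
 */
static void
summarize_whole_table(Relation index, Relation heapRel)
{
	double		numSummarized = 0.0;

	/* bulk-load style: include the trailing partial range */
	brinsummarize(index, heapRel, BRIN_ALL_BLOCKRANGES, true,
				  &numSummarized, NULL);

	/* a vacuum-style caller would instead pass include_partial = false */
}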
/* * A tuple in the heap is being inserted. To keep a brin index up to date, * we need to obtain the relevant index tuple and compare its stored values * with those of the new tuple. If the tuple values are not consistent with * the summary tuple, we need to update the index tuple. * * If the range is not currently summarized (i.e. the revmap returns NULL for * it), there's nothing to do. */ Datum brininsert(PG_FUNCTION_ARGS) { Relation idxRel = (Relation) PG_GETARG_POINTER(0); Datum *values = (Datum *) PG_GETARG_POINTER(1); bool *nulls = (bool *) PG_GETARG_POINTER(2); ItemPointer heaptid = (ItemPointer) PG_GETARG_POINTER(3); /* we ignore the rest of our arguments */ BlockNumber pagesPerRange; BrinDesc *bdesc = NULL; BrinRevmap *revmap; Buffer buf = InvalidBuffer; MemoryContext tupcxt = NULL; MemoryContext oldcxt = NULL; revmap = brinRevmapInitialize(idxRel, &pagesPerRange); for (;;) { bool need_insert = false; OffsetNumber off; BrinTuple *brtup; BrinMemTuple *dtup; BlockNumber heapBlk; int keyno; #ifdef USE_ASSERT_CHECKING BrinTuple *tmptup; BrinMemTuple *tmpdtup; Size tmpsiz; #endif CHECK_FOR_INTERRUPTS(); heapBlk = ItemPointerGetBlockNumber(heaptid); /* normalize the block number to be the first block in the range */ heapBlk = (heapBlk / pagesPerRange) * pagesPerRange; brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, BUFFER_LOCK_SHARE); /* if range is unsummarized, there's nothing to do */ if (!brtup) break; /* First time through? */ if (bdesc == NULL) { bdesc = brin_build_desc(idxRel); tupcxt = AllocSetContextCreate(CurrentMemoryContext, "brininsert cxt", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); oldcxt = MemoryContextSwitchTo(tupcxt); } dtup = brin_deform_tuple(bdesc, brtup); #ifdef USE_ASSERT_CHECKING { /* * When assertions are enabled, we use this as an opportunity to * test the "union" method, which would otherwise be used very * rarely: first create a placeholder tuple, and addValue the * value we just got into it. Then union the existing index tuple * with the updated placeholder tuple. The tuple resulting from * that union should be identical to the one resulting from the * regular operation (straight addValue) below. * * Here we create the tuple to compare with; the actual comparison * is below. */ tmptup = brin_form_placeholder_tuple(bdesc, heapBlk, &tmpsiz); tmpdtup = brin_deform_tuple(bdesc, tmptup); for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { BrinValues *bval; FmgrInfo *addValue; bval = &tmpdtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); } union_tuples(bdesc, tmpdtup, brtup); tmpdtup->bt_placeholder = dtup->bt_placeholder; tmptup = brin_form_tuple(bdesc, heapBlk, tmpdtup, &tmpsiz); } #endif /* * Compare the key values of the new tuple to the stored index values; * our deformed tuple will get updated if the new tuple doesn't fit * the original range (note this means we can't break out of the loop * early). Make a note of whether this happens, so that we know to * insert the modified tuple later. 
*/ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { Datum result; BrinValues *bval; FmgrInfo *addValue; bval = &dtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); result = FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); /* if that returned true, we need to insert the updated tuple */ need_insert |= DatumGetBool(result); } #ifdef USE_ASSERT_CHECKING { /* * Now we can compare the tuple produced by the union function * with the one from plain addValue. */ BrinTuple *cmptup; Size cmpsz; cmptup = brin_form_tuple(bdesc, heapBlk, dtup, &cmpsz); Assert(brin_tuples_equal(tmptup, tmpsiz, cmptup, cmpsz)); } #endif if (!need_insert) { /* * The tuple is consistent with the new values, so there's nothing * to do. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else { Page page = BufferGetPage(buf); ItemId lp = PageGetItemId(page, off); Size origsz; BrinTuple *origtup; Size newsz; BrinTuple *newtup; bool samepage; /* * Make a copy of the old tuple, so that we can compare it after * re-acquiring the lock. */ origsz = ItemIdGetLength(lp); origtup = brin_copy_tuple(brtup, origsz); /* * Before releasing the lock, check if we can attempt a same-page * update. Another process could insert a tuple concurrently in * the same page though, so downstream we must be prepared to cope * if this turns out to not be possible after all. */ newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); samepage = brin_can_do_samepage_update(buf, origsz, newsz); LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* * Try to update the tuple. If this doesn't work for whatever * reason, we need to restart from the top; the revmap might be * pointing at a different tuple for this block now, so we need to * recompute to ensure both our new heap tuple and the other * inserter's are covered by the combined tuple. It might be that * we don't need to update at all. */ if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, buf, off, origtup, origsz, newtup, newsz, samepage)) { /* no luck; start over */ MemoryContextResetAndDeleteChildren(tupcxt); continue; } } /* success! */ break; } brinRevmapTerminate(revmap); if (BufferIsValid(buf)) ReleaseBuffer(buf); if (bdesc != NULL) { brin_free_desc(bdesc); MemoryContextSwitchTo(oldcxt); MemoryContextDelete(tupcxt); } return BoolGetDatum(false); }