/*
 * Primary entry point for VACUUM and ANALYZE commands.
 *
 * relid is normally InvalidOid; if it is not, then it provides the relation
 * OID to be processed, and vacstmt->relation is ignored.  (The non-invalid
 * case is currently only used by autovacuum.)
 *
 * do_toast is passed as FALSE by autovacuum, because it processes TOAST
 * tables separately.
 *
 * for_wraparound is used by autovacuum to let us know when it's forcing
 * a vacuum for wraparound, which should not be auto-cancelled.
 *
 * bstrategy is normally given as NULL, but in autovacuum it can be passed
 * in to use the same buffer strategy object across multiple vacuum() calls.
 *
 * isTopLevel should be passed down from ProcessUtility.
 *
 * It is the caller's responsibility that vacstmt and bstrategy
 * (if given) be allocated in a memory context that won't disappear
 * at transaction commit.
 */
void
vacuum(VacuumStmt *vacstmt, Oid relid, bool do_toast,
       BufferAccessStrategy bstrategy, bool for_wraparound, bool isTopLevel)
{
    const char *stmttype;
    volatile bool all_rels,
                in_outer_xact,
                use_own_xacts;
    List       *relations;

    /* sanity checks on options */
    Assert(vacstmt->options & (VACOPT_VACUUM | VACOPT_ANALYZE));
    Assert((vacstmt->options & VACOPT_VACUUM) ||
           !(vacstmt->options & (VACOPT_FULL | VACOPT_FREEZE)));
    Assert((vacstmt->options & VACOPT_ANALYZE) || vacstmt->va_cols == NIL);

    stmttype = (vacstmt->options & VACOPT_VACUUM) ? "VACUUM" : "ANALYZE";

    /*
     * We cannot run VACUUM inside a user transaction block; if we were
     * inside a transaction, then our commit- and start-transaction-command
     * calls would not have the intended effect!  There are numerous other
     * subtle dependencies on this, too.
     *
     * ANALYZE (without VACUUM) can run either way.
     */
    if (vacstmt->options & VACOPT_VACUUM)
    {
        PreventTransactionChain(isTopLevel, stmttype);
        in_outer_xact = false;
    }
    else
        in_outer_xact = IsInTransactionChain(isTopLevel);

    /*
     * Send info about dead objects to the statistics collector, unless we
     * are in autovacuum --- autovacuum.c does this for itself.
     */
    if ((vacstmt->options & VACOPT_VACUUM) && !IsAutoVacuumWorkerProcess())
        pgstat_vacuum_stat();

    /*
     * Create special memory context for cross-transaction storage.
     *
     * Since it is a child of PortalContext, it will go away eventually even
     * if we suffer an error; there's no need for special abort cleanup
     * logic.
     */
    vac_context = AllocSetContextCreate(PortalContext,
                                        "Vacuum",
                                        ALLOCSET_DEFAULT_MINSIZE,
                                        ALLOCSET_DEFAULT_INITSIZE,
                                        ALLOCSET_DEFAULT_MAXSIZE);

    /*
     * If caller didn't give us a buffer strategy object, make one in the
     * cross-transaction memory context.
     */
    if (bstrategy == NULL)
    {
        MemoryContext old_context = MemoryContextSwitchTo(vac_context);

        bstrategy = GetAccessStrategy(BAS_VACUUM);
        MemoryContextSwitchTo(old_context);
    }
    vac_strategy = bstrategy;

    /* Remember whether we are processing everything in the DB */
    all_rels = (!OidIsValid(relid) && vacstmt->relation == NULL);

    /*
     * Build list of relations to process, unless caller gave us one. (If we
     * build one, we put it in vac_context for safekeeping.)
     */
    relations = get_rel_oids(relid, vacstmt->relation);

    /*
     * Decide whether we need to start/commit our own transactions.
     *
     * For VACUUM (with or without ANALYZE): always do so, so that we can
     * release locks as soon as possible.  (We could possibly use the outer
     * transaction for a one-table VACUUM, but handling TOAST tables would be
     * problematic.)
     *
     * For ANALYZE (no VACUUM): if inside a transaction block, we cannot
     * start/commit our own transactions.  Also, there's no need to do so if
     * only processing one relation.  For multiple relations when not within
     * a transaction block, and also in an autovacuum worker, use own
     * transactions so we can release locks sooner.
     */
    if (vacstmt->options & VACOPT_VACUUM)
        use_own_xacts = true;
    else
    {
        Assert(vacstmt->options & VACOPT_ANALYZE);
        if (IsAutoVacuumWorkerProcess())
            use_own_xacts = true;
        else if (in_outer_xact)
            use_own_xacts = false;
        else if (list_length(relations) > 1)
            use_own_xacts = true;
        else
            use_own_xacts = false;
    }

    /*
     * vacuum_rel expects to be entered with no transaction active; it will
     * start and commit its own transaction.  But we are called by an SQL
     * command, and so we are executing inside a transaction already.  We
     * commit the transaction started in PostgresMain() here, and start
     * another one before exiting to match the commit waiting for us back in
     * PostgresMain().
     */
    if (use_own_xacts)
    {
        /* ActiveSnapshot is not set by autovacuum */
        if (ActiveSnapshotSet())
            PopActiveSnapshot();

        /* matches the StartTransaction in PostgresMain() */
        CommitTransactionCommand();
    }

    /* Turn vacuum cost accounting on or off */
    PG_TRY();
    {
        ListCell   *cur;

        VacuumCostActive = (VacuumCostDelay > 0);
        VacuumCostBalance = 0;

        /*
         * Loop to process each selected relation.
         */
        foreach(cur, relations)
        {
            Oid         relid = lfirst_oid(cur);
            bool        scanned_all = false;

            if (vacstmt->options & VACOPT_VACUUM)
                vacuum_rel(relid, vacstmt, do_toast, for_wraparound,
                           &scanned_all);

            if (vacstmt->options & VACOPT_ANALYZE)
            {
                /*
                 * If using separate xacts, start one for analyze. Otherwise,
                 * we can use the outer transaction.
                 */
                if (use_own_xacts)
                {
                    StartTransactionCommand();
                    /* functions in indexes may want a snapshot set */
                    PushActiveSnapshot(GetTransactionSnapshot());
                }

                analyze_rel(relid, vacstmt, vac_strategy, !scanned_all);

                if (use_own_xacts)
                {
                    PopActiveSnapshot();
                    CommitTransactionCommand();
                }
            }
        }
    }
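/*
 * Illustrative sketch (not from the original source): how a caller in the
 * autovacuum style might drive the entry point above.  Per the header
 * comment, the VacuumStmt and the buffer strategy must live in a memory
 * context that survives transaction commit; "LongLivedCxt" here stands in
 * for whatever long-lived context the caller owns, and other VacuumStmt
 * fields (freeze ages, etc.) are elided.
 */
static void
example_vacuum_one_rel(MemoryContext LongLivedCxt, Oid relid,
                       BufferAccessStrategy bstrategy, bool for_wraparound)
{
    MemoryContext oldcxt = MemoryContextSwitchTo(LongLivedCxt);
    VacuumStmt *vacstmt = makeNode(VacuumStmt);

    vacstmt->options = VACOPT_VACUUM | VACOPT_ANALYZE;
    vacstmt->relation = NULL;   /* ignored because relid is valid */
    vacstmt->va_cols = NIL;

    MemoryContextSwitchTo(oldcxt);

    /* do_toast = false: autovacuum processes TOAST tables separately */
    vacuum(vacstmt, relid, false, bstrategy, for_wraparound, true);
}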
/* ----------------------------------------------------------------
 *      ExecInitSetOp
 *
 *      This initializes the setop node state structures and
 *      the node's subplan.
 * ----------------------------------------------------------------
 */
SetOpState *
ExecInitSetOp(SetOp *node, EState *estate, int eflags)
{
    SetOpState *setopstate;

    /* check for unsupported flags */
    Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

    /*
     * create state structure
     */
    setopstate = makeNode(SetOpState);
    setopstate->ps.plan = (Plan *) node;
    setopstate->ps.state = estate;

    setopstate->ps.ps_OuterTupleSlot = NULL;
    setopstate->subplan_done = false;
    setopstate->numOutput = 0;

    /*
     * Miscellaneous initialization
     *
     * SetOp nodes have no ExprContext initialization because they never call
     * ExecQual or ExecProject.  But they do need a per-tuple memory context
     * anyway for calling execTuplesMatch.
     */
    setopstate->tempContext =
        AllocSetContextCreate(CurrentMemoryContext,
                              "SetOp",
                              ALLOCSET_DEFAULT_MINSIZE,
                              ALLOCSET_DEFAULT_INITSIZE,
                              ALLOCSET_DEFAULT_MAXSIZE);

#define SETOP_NSLOTS 1

    /*
     * Tuple table initialization
     */
    ExecInitResultTupleSlot(estate, &setopstate->ps);

    /*
     * then initialize outer plan
     */
    outerPlanState(setopstate) = ExecInitNode(outerPlan(node), estate, eflags);

    /*
     * setop nodes do no projections, so initialize projection info for this
     * node appropriately
     */
    ExecAssignResultTypeFromTL(&setopstate->ps);
    setopstate->ps.ps_ProjInfo = NULL;

    /*
     * Precompute fmgr lookup data for inner loop
     */
    setopstate->eqfunctions =
        execTuplesMatchPrepare(ExecGetResultType(&setopstate->ps),
                               node->numCols,
                               node->dupColIdx);

    return setopstate;
}
/*
 * hash_create -- create a new dynamic hash table
 *
 *  tabname: a name for the table (for debugging purposes)
 *  nelem: maximum number of elements expected
 *  *info: additional table parameters, as indicated by flags
 *  flags: bitmask indicating which parameters to take from *info
 *
 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
 * estimate, since we can't expand the table on the fly.  But an unshared
 * hashtable can be expanded on-the-fly, so it's better for nelem to be
 * on the small side and let the table grow if it's exceeded.  An overly
 * large nelem will penalize hash_seq_search speed without buying much.
 */
HTAB *
hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
{
    HTAB       *hashp;
    HASHHDR    *hctl;

    /*
     * For shared hash tables, we have a local hash header (HTAB struct) that
     * we allocate in TopMemoryContext; all else is in shared memory.
     *
     * For non-shared hash tables, everything including the hash header is in
     * a memory context created specially for the hash table --- this makes
     * hash_destroy very simple.  The memory context is made a child of
     * either a context specified by the caller, or TopMemoryContext if
     * nothing is specified.
     */
    if (flags & HASH_SHARED_MEM)
    {
        /* Set up to allocate the hash header */
        CurrentDynaHashCxt = TopMemoryContext;
    }
    else
    {
        /* Create the hash table's private memory context */
        if (flags & HASH_CONTEXT)
            CurrentDynaHashCxt = info->hcxt;
        else
            CurrentDynaHashCxt = TopMemoryContext;
        CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
                                                   tabname,
                                                   ALLOCSET_DEFAULT_MINSIZE,
                                                   ALLOCSET_DEFAULT_INITSIZE,
                                                   ALLOCSET_DEFAULT_MAXSIZE);
    }

    /* Initialize the hash header, plus a copy of the table name */
    hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);
    MemSet(hashp, 0, sizeof(HTAB));

    hashp->tabname = (char *) (hashp + 1);
    strcpy(hashp->tabname, tabname);

    if (flags & HASH_FUNCTION)
        hashp->hash = info->hash;
    else
        hashp->hash = string_hash;  /* default hash function */

    /*
     * If you don't specify a match function, it defaults to string_compare
     * if you used string_hash (either explicitly or by default) and to
     * memcmp otherwise.  (Prior to PostgreSQL 7.4, memcmp was always used.)
     */
    if (flags & HASH_COMPARE)
        hashp->match = info->match;
    else if (hashp->hash == string_hash)
        hashp->match = (HashCompareFunc) string_compare;
    else
        hashp->match = memcmp;

    /*
     * Similarly, the key-copying function defaults to strlcpy or memcpy.
     */
    if (flags & HASH_KEYCOPY)
        hashp->keycopy = info->keycopy;
    else if (hashp->hash == string_hash)
        hashp->keycopy = (HashCopyFunc) strlcpy;
    else
        hashp->keycopy = memcpy;

    if (flags & HASH_ALLOC)
        hashp->alloc = info->alloc;
    else
        hashp->alloc = DynaHashAlloc;

    if (flags & HASH_SHARED_MEM)
    {
        /*
         * ctl structure and directory are preallocated for shared memory
         * tables.  Note that HASH_DIRSIZE and HASH_ALLOC had better be set
         * as well.
         */
        hashp->hctl = info->hctl;
        hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
        hashp->hcxt = NULL;
        hashp->isshared = true;

        /* hash table already exists, we're just attaching to it */
        if (flags & HASH_ATTACH)
        {
            /* make local copies of some heavily-used values */
            hctl = hashp->hctl;
            hashp->keysize = hctl->keysize;
            hashp->ssize = hctl->ssize;
            hashp->sshift = hctl->sshift;

            return hashp;
        }
    }
    else
    {
        /* setup hash table defaults */
        hashp->hctl = NULL;
        hashp->dir = NULL;
        hashp->hcxt = CurrentDynaHashCxt;
        hashp->isshared = false;
    }

    if (!hashp->hctl)
    {
        hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
        if (!hashp->hctl)
            ereport(ERROR,
                    (errcode(ERRCODE_OUT_OF_MEMORY),
                     errmsg("out of memory")));
    }

    hashp->frozen = false;

    hdefault(hashp);

    hctl = hashp->hctl;

    if (flags & HASH_PARTITION)
    {
        /* Doesn't make sense to partition a local hash table */
        Assert(flags & HASH_SHARED_MEM);

        /*
         * The number of partitions had better be a power of 2. Also, it must
         * be less than INT_MAX (see init_htab()), so call the int version of
         * next_pow2.
         */
        Assert(info->num_partitions == next_pow2_int(info->num_partitions));

        hctl->num_partitions = info->num_partitions;
    }

    if (flags & HASH_SEGMENT)
    {
        hctl->ssize = info->ssize;
        hctl->sshift = my_log2(info->ssize);
        /* ssize had better be a power of 2 */
        Assert(hctl->ssize == (1L << hctl->sshift));
    }
    if (flags & HASH_FFACTOR)
        hctl->ffactor = info->ffactor;

    /*
     * SHM hash tables have fixed directory size passed by the caller.
     */
    if (flags & HASH_DIRSIZE)
    {
        hctl->max_dsize = info->max_dsize;
        hctl->dsize = info->dsize;
    }

    /*
     * hash table now allocates space for key and data but you have to say
     * how much space to allocate
     */
    if (flags & HASH_ELEM)
    {
        Assert(info->entrysize >= info->keysize);
        hctl->keysize = info->keysize;
        hctl->entrysize = info->entrysize;
    }

    /* make local copies of heavily-used constant fields */
    hashp->keysize = hctl->keysize;
    hashp->ssize = hctl->ssize;
    hashp->sshift = hctl->sshift;

    /* Build the hash directory structure */
    if (!init_htab(hashp, nelem))
        elog(ERROR, "failed to initialize hash table \"%s\"",
             hashp->tabname);

    /*
     * For a shared hash table, preallocate the requested number of elements.
     * This reduces problems with run-time out-of-shared-memory conditions.
     *
     * For a non-shared hash table, preallocate the requested number of
     * elements if it's less than our chosen nelem_alloc.  This avoids
     * wasting space if the caller correctly estimates a small table size.
     */
    if ((flags & HASH_SHARED_MEM) ||
        nelem < hctl->nelem_alloc)
    {
        if (!element_alloc(hashp, (int) nelem))
            ereport(ERROR,
                    (errcode(ERRCODE_OUT_OF_MEMORY),
                     errmsg("out of memory")));
    }

    if (flags & HASH_FIXED_SIZE)
        hashp->isfixed = true;
    return hashp;
}
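/*
 * Illustrative usage sketch (not part of dynahash.c): creating and probing a
 * local, non-shared hash table keyed by Oid.  "RelCacheEntry" is a
 * hypothetical entry struct; per the dynahash convention, the key must be
 * the first field of the entry.
 */
typedef struct RelCacheEntry
{
    Oid         relid;          /* hash key -- must be first */
    int         refcount;
} RelCacheEntry;

static HTAB *
example_create_oid_table(void)
{
    HASHCTL     ctl;

    MemSet(&ctl, 0, sizeof(ctl));
    ctl.keysize = sizeof(Oid);
    ctl.entrysize = sizeof(RelCacheEntry);
    ctl.hash = oid_hash;

    /* modest nelem: an unshared table grows on demand, per the note above */
    return hash_create("Example rel cache", 64, &ctl,
                       HASH_ELEM | HASH_FUNCTION);
}

/* Looking up or creating an entry would then use hash_search(), e.g.:
 *     bool found;
 *     RelCacheEntry *entry = (RelCacheEntry *)
 *         hash_search(htab, &relid, HASH_ENTER, &found);
 */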
/*
 * Move tuples from pending pages into the regular GIN structure.
 *
 * This can be called concurrently by multiple backends, so it must cope.
 * At first glance it looks completely not concurrent-safe and not crash-safe
 * either.  The reason it's okay is that multiple insertion of the same entry
 * is detected and treated as a no-op by gininsert.c.  If we crash after
 * posting entries to the main index and before removing them from the
 * pending list, it's okay because when we redo the posting later on, nothing
 * bad will happen.  Likewise, if two backends simultaneously try to post
 * a pending entry into the main index, one will succeed and one will do
 * nothing.  We try to notice when someone else is a little bit ahead of
 * us in the process, but that's just to avoid wasting cycles.  Only the
 * action of removing a page from the pending list really needs exclusive
 * lock.
 *
 * vac_delay indicates that ginInsertCleanup is called from the vacuum
 * process, so call vacuum_delay_point() periodically.
 * If stats isn't null, we count deleted pending pages into the counts.
 */
void
ginInsertCleanup(GinState *ginstate,
                 bool vac_delay, IndexBulkDeleteResult *stats)
{
    Relation    index = ginstate->index;
    Buffer      metabuffer,
                buffer;
    Page        metapage,
                page;
    GinMetaPageData *metadata;
    MemoryContext opCtx,
                oldCtx;
    BuildAccumulator accum;
    KeyArray    datums;
    BlockNumber blkno;

    metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
    LockBuffer(metabuffer, GIN_SHARE);
    metapage = BufferGetPage(metabuffer);
    metadata = GinPageGetMeta(metapage);

    if (metadata->head == InvalidBlockNumber)
    {
        /* Nothing to do */
        UnlockReleaseBuffer(metabuffer);
        return;
    }

    /*
     * Read and lock head of pending list
     */
    blkno = metadata->head;
    buffer = ReadBuffer(index, blkno);
    LockBuffer(buffer, GIN_SHARE);
    page = BufferGetPage(buffer);

    LockBuffer(metabuffer, GIN_UNLOCK);

    /*
     * Initialize.  All temporary space will be in opCtx
     */
    opCtx = AllocSetContextCreate(CurrentMemoryContext,
                                  "GIN insert cleanup temporary context",
                                  ALLOCSET_DEFAULT_MINSIZE,
                                  ALLOCSET_DEFAULT_INITSIZE,
                                  ALLOCSET_DEFAULT_MAXSIZE);

    oldCtx = MemoryContextSwitchTo(opCtx);

    initKeyArray(&datums, 128);
    ginInitBA(&accum);
    accum.ginstate = ginstate;

    /*
     * At the top of this loop, we have pin and lock on the current page of
     * the pending list.  However, we'll release that before exiting the
     * loop.  Note we also have pin but not lock on the metapage.
     */
    for (;;)
    {
        if (GinPageIsDeleted(page))
        {
            /* another cleanup process is running concurrently */
            UnlockReleaseBuffer(buffer);
            break;
        }

        /*
         * read page's datums into accum
         */
        processPendingPage(&accum, &datums, page, FirstOffsetNumber);

        vacuum_delay_point();

        /*
         * Is it time to flush memory to disk?  Flush if we are at the end of
         * the pending list, or if we have a full row and memory is getting
         * full.
         *
         * XXX using up maintenance_work_mem here is probably unreasonably
         * much, since vacuum might already be using that much.
         */
        if (GinPageGetOpaque(page)->rightlink == InvalidBlockNumber ||
            (GinPageHasFullRow(page) &&
             (accum.allocatedMemory >= maintenance_work_mem * 1024L)))
        {
            ItemPointerData *list;
            uint32      nlist;
            Datum       key;
            GinNullCategory category;
            OffsetNumber maxoff,
                        attnum;

            /*
             * Unlock current page to increase performance.  Changes of page
             * will be checked later by comparing maxoff after completion of
             * memory flush.
             */
            maxoff = PageGetMaxOffsetNumber(page);
            LockBuffer(buffer, GIN_UNLOCK);

            /*
             * Moving collected data into regular structure can take
             * significant amount of time - so, run it without locking
             * pending list.
             */
            ginBeginBAScan(&accum);
            while ((list = ginGetBAEntry(&accum,
                                         &attnum, &key, &category,
                                         &nlist)) != NULL)
            {
                ginEntryInsert(ginstate, attnum, key, category,
                               list, nlist, NULL);
                vacuum_delay_point();
            }

            /*
             * Lock the whole list to remove pages
             */
            LockBuffer(metabuffer, GIN_EXCLUSIVE);
            LockBuffer(buffer, GIN_SHARE);

            if (GinPageIsDeleted(page))
            {
                /* another cleanup process is running concurrently */
                UnlockReleaseBuffer(buffer);
                LockBuffer(metabuffer, GIN_UNLOCK);
                break;
            }

            /*
             * While we left the page unlocked, more stuff might have gotten
             * added to it.  If so, process those entries immediately.  There
             * shouldn't be very many, so we don't worry about the fact that
             * we're doing this with exclusive lock.  Insertion algorithm
             * guarantees that inserted row(s) will not continue on next
             * page.  NOTE: intentionally no vacuum_delay_point in this loop.
             */
            if (PageGetMaxOffsetNumber(page) != maxoff)
            {
                ginInitBA(&accum);
                processPendingPage(&accum, &datums, page, maxoff + 1);

                ginBeginBAScan(&accum);
                while ((list = ginGetBAEntry(&accum,
                                             &attnum, &key, &category,
                                             &nlist)) != NULL)
                    ginEntryInsert(ginstate, attnum, key, category,
                                   list, nlist, NULL);
            }

            /*
             * Remember next page - it will become the new list head
             */
            blkno = GinPageGetOpaque(page)->rightlink;
            UnlockReleaseBuffer(buffer);    /* shiftList will do exclusive
                                             * locking */

            /*
             * remove read pages from pending list, at this point all
             * content of read pages is in regular structure
             */
            if (shiftList(index, metabuffer, blkno, stats))
            {
                /* another cleanup process is running concurrently */
                LockBuffer(metabuffer, GIN_UNLOCK);
                break;
            }

            Assert(blkno == metadata->head);
            LockBuffer(metabuffer, GIN_UNLOCK);

            /*
             * if we removed the whole pending list just exit
             */
            if (blkno == InvalidBlockNumber)
                break;

            /*
             * release memory used so far and reinit state
             */
            MemoryContextReset(opCtx);
            initKeyArray(&datums, datums.maxvalues);
            ginInitBA(&accum);
        }
        else
        {
            blkno = GinPageGetOpaque(page)->rightlink;
            UnlockReleaseBuffer(buffer);
        }

        /*
         * Read next page in pending list
         */
        vacuum_delay_point();
        buffer = ReadBuffer(index, blkno);
        LockBuffer(buffer, GIN_SHARE);
        page = BufferGetPage(buffer);
    }

    ReleaseBuffer(metabuffer);

    /* Clean up temporary space */
    MemoryContextSwitchTo(oldCtx);
    MemoryContextDelete(opCtx);
}
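/*
 * For reference, an assumption based on contemporaneous GIN headers (not
 * shown in this excerpt): the GIN_* lock macros used above are thin aliases
 * for the generic buffer-lock modes, along the lines of:
 *
 *     #define GIN_UNLOCK      BUFFER_LOCK_UNLOCK
 *     #define GIN_SHARE       BUFFER_LOCK_SHARE
 *     #define GIN_EXCLUSIVE   BUFFER_LOCK_EXCLUSIVE
 *
 * so LockBuffer(buffer, GIN_SHARE) is an ordinary shared content lock.
 */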
Datum
ginbuild(PG_FUNCTION_ARGS)
{
    Relation    heap = (Relation) PG_GETARG_POINTER(0);
    Relation    index = (Relation) PG_GETARG_POINTER(1);
    IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
    IndexBuildResult *result;
    double      reltuples;
    GinBuildState buildstate;
    Buffer      RootBuffer,
                MetaBuffer;
    ItemPointerData *list;
    Datum       entry;
    uint32      nlist;
    MemoryContext oldCtx;
    OffsetNumber attnum;

    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    initGinState(&buildstate.ginstate, index);

    /* initialize the meta page */
    MetaBuffer = GinNewBuffer(index);

    /* initialize the root page */
    RootBuffer = GinNewBuffer(index);

    START_CRIT_SECTION();
    GinInitMetabuffer(MetaBuffer);
    MarkBufferDirty(MetaBuffer);
    GinInitBuffer(RootBuffer, GIN_LEAF);
    MarkBufferDirty(RootBuffer);

    if (!index->rd_istemp)
    {
        XLogRecPtr  recptr;
        XLogRecData rdata;
        Page        page;

        rdata.buffer = InvalidBuffer;
        rdata.data = (char *) &(index->rd_node);
        rdata.len = sizeof(RelFileNode);
        rdata.next = NULL;

        recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX, &rdata);

        page = BufferGetPage(RootBuffer);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);

        page = BufferGetPage(MetaBuffer);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
    }

    UnlockReleaseBuffer(MetaBuffer);
    UnlockReleaseBuffer(RootBuffer);
    END_CRIT_SECTION();

    /* build the index */
    buildstate.indtuples = 0;

    /*
     * create a temporary memory context that is reset once for each tuple
     * inserted into the index
     */
    buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
                                              "Gin build temporary context",
                                              ALLOCSET_DEFAULT_MINSIZE,
                                              ALLOCSET_DEFAULT_INITSIZE,
                                              ALLOCSET_DEFAULT_MAXSIZE);

    buildstate.funcCtx = AllocSetContextCreate(buildstate.tmpCtx,
                                               "Gin build temporary context for user-defined function",
                                               ALLOCSET_DEFAULT_MINSIZE,
                                               ALLOCSET_DEFAULT_INITSIZE,
                                               ALLOCSET_DEFAULT_MAXSIZE);

    buildstate.accum.ginstate = &buildstate.ginstate;
    ginInitBA(&buildstate.accum);

    /*
     * Do the heap scan.  We disallow sync scan here because dataPlaceToPage
     * prefers to receive tuples in TID order.
     */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, false,
                                   ginBuildCallback, (void *) &buildstate);

    /* dump remaining entries to the index */
    oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx);
    ginBeginBAScan(&buildstate.accum);
    while ((list = ginGetEntry(&buildstate.accum,
                               &attnum, &entry, &nlist)) != NULL)
    {
        /* there could be many entries, so be willing to abort here */
        CHECK_FOR_INTERRUPTS();
        ginEntryInsert(index, &buildstate.ginstate, attnum, entry,
                       list, nlist, TRUE);
    }
    MemoryContextSwitchTo(oldCtx);

    MemoryContextDelete(buildstate.tmpCtx);

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    PG_RETURN_POINTER(result);
}
Datum
Type_invokeSRF(Type self, jclass cls, jmethodID method, jvalue* args,
               PG_FUNCTION_ARGS)
{
    bool hasRow;
    CallContextData* ctxData;
    FuncCallContext* context;
    MemoryContext currCtx;

    /* stuff done only on the first call of the function */
    if(SRF_IS_FIRSTCALL())
    {
        jobject tmp;

        /* create a function context for cross-call persistence */
        context = SRF_FIRSTCALL_INIT();
        currCtx = MemoryContextSwitchTo(context->multi_call_memory_ctx);

        /* Call the declared Java function. It returns an instance that can
         * produce the rows.
         */
        tmp = Type_getSRFProducer(self, cls, method, args);
        if(tmp == 0)
        {
            Invocation_assertDisconnect();
            MemoryContextSwitchTo(currCtx);
            fcinfo->isnull = true;
            SRF_RETURN_DONE(context);
        }

        ctxData = (CallContextData*)palloc(sizeof(CallContextData));
        context->user_fctx = ctxData;

        ctxData->elemType = self;
        ctxData->rowProducer = JNI_newGlobalRef(tmp);
        JNI_deleteLocalRef(tmp);

        /* Some row producers will need a writable result set in order
         * to produce the row. If one is needed, it's created here.
         */
        tmp = Type_getSRFCollector(self, fcinfo);
        if(tmp == 0)
            ctxData->rowCollector = 0;
        else
        {
            ctxData->rowCollector = JNI_newGlobalRef(tmp);
            JNI_deleteLocalRef(tmp);
        }

        ctxData->trusted = currentInvocation->trusted;
        ctxData->hasConnected = currentInvocation->hasConnected;
        ctxData->invocation = currentInvocation->invocation;
        if(ctxData->hasConnected)
            ctxData->spiContext = CurrentMemoryContext;
        else
            ctxData->spiContext = 0;

        ctxData->rowContext = AllocSetContextCreate(context->multi_call_memory_ctx,
                                                    "PL/Java row context",
                                                    ALLOCSET_DEFAULT_MINSIZE,
                                                    ALLOCSET_DEFAULT_INITSIZE,
                                                    ALLOCSET_DEFAULT_MAXSIZE);

        /* Register callback to be called when the function ends */
        RegisterExprContextCallback(((ReturnSetInfo*)fcinfo->resultinfo)->econtext,
                                    _endOfSetCB, PointerGetDatum(ctxData));
        MemoryContextSwitchTo(currCtx);
    }

    context = SRF_PERCALL_SETUP();
    ctxData = (CallContextData*)context->user_fctx;
    MemoryContextReset(ctxData->rowContext);
    currCtx = MemoryContextSwitchTo(ctxData->rowContext);
    currentInvocation->hasConnected = ctxData->hasConnected;
    currentInvocation->invocation = ctxData->invocation;

    hasRow = Type_hasNextSRF(self, ctxData->rowProducer, ctxData->rowCollector,
                             (jint)context->call_cntr);

    ctxData->hasConnected = currentInvocation->hasConnected;
    ctxData->invocation = currentInvocation->invocation;
    currentInvocation->hasConnected = false;
    currentInvocation->invocation = 0;

    if(hasRow)
    {
        Datum result = Type_nextSRF(self, ctxData->rowProducer,
                                    ctxData->rowCollector);
        MemoryContextSwitchTo(currCtx);
        SRF_RETURN_NEXT(context, result);
    }

    MemoryContextSwitchTo(currCtx);

    /* Unregister this callback and call it manually. We do this because
     * otherwise it will be called when the backend is in the process of
     * cleaning up Portals. If we close cursors (i.e. drop portals) in
     * the close, then that mechanism fails since attempts are made to
     * delete portals more than once.
     */
    UnregisterExprContextCallback(
        ((ReturnSetInfo*)fcinfo->resultinfo)->econtext,
        _endOfSetCB,
        PointerGetDatum(ctxData));

    _closeIteration(ctxData);

    /* This is the end of the set. */
    SRF_RETURN_DONE(context);
}
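/*
 * Illustrative sketch (not part of PL/Java): the minimal shape of a C
 * set-returning function built on the same funcapi.h machinery that
 * Type_invokeSRF uses above.  "countdown" is a hypothetical function
 * returning the integers n, n-1, ..., 1.
 */
#include "postgres.h"
#include "funcapi.h"

PG_FUNCTION_INFO_V1(countdown);

Datum
countdown(PG_FUNCTION_ARGS)
{
    FuncCallContext *funcctx;

    if (SRF_IS_FIRSTCALL())
    {
        /* per-set state persists in multi_call_memory_ctx */
        funcctx = SRF_FIRSTCALL_INIT();
        funcctx->max_calls = PG_GETARG_INT32(0);
    }

    funcctx = SRF_PERCALL_SETUP();

    if (funcctx->call_cntr < funcctx->max_calls)
        SRF_RETURN_NEXT(funcctx,
                        Int32GetDatum((int32) (funcctx->max_calls -
                                               funcctx->call_cntr)));

    SRF_RETURN_DONE(funcctx);
}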
/*
 * Main entry point for walwriter process
 *
 * This is invoked from BootstrapMain, which has already created the basic
 * execution environment, but not enabled signals yet.
 */
void
WalWriterMain(void)
{
    sigjmp_buf  local_sigjmp_buf;
    MemoryContext walwriter_context;

    /*
     * If possible, make this process a group leader, so that the postmaster
     * can signal any child processes too.  (walwriter probably never has any
     * child processes, but for consistency we make all postmaster child
     * processes do this.)
     */
#ifdef HAVE_SETSID
    if (setsid() < 0)
        elog(FATAL, "setsid() failed: %m");
#endif

    /*
     * Properly accept or ignore signals the postmaster might send us
     *
     * We have no particular use for SIGINT at the moment, but seems
     * reasonable to treat like SIGTERM.
     */
    pqsignal(SIGHUP, WalSigHupHandler); /* set flag to read config file */
    pqsignal(SIGINT, WalShutdownHandler);   /* request shutdown */
    pqsignal(SIGTERM, WalShutdownHandler);  /* request shutdown */
    pqsignal(SIGQUIT, wal_quickdie);    /* hard crash time */
    pqsignal(SIGALRM, SIG_IGN);
    pqsignal(SIGPIPE, SIG_IGN);
    pqsignal(SIGUSR1, SIG_IGN);     /* reserve for ProcSignal */
    pqsignal(SIGUSR2, SIG_IGN);     /* not used */

    /*
     * Reset some signals that are accepted by postmaster but not here
     */
    pqsignal(SIGCHLD, SIG_DFL);
    pqsignal(SIGTTIN, SIG_DFL);
    pqsignal(SIGTTOU, SIG_DFL);
    pqsignal(SIGCONT, SIG_DFL);
    pqsignal(SIGWINCH, SIG_DFL);

    /* We allow SIGQUIT (quickdie) at all times */
    sigdelset(&BlockSig, SIGQUIT);

    /*
     * Create a resource owner to keep track of our resources (not clear that
     * we need this, but may as well have one).
     */
    CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Writer");

    /*
     * Create a memory context that we will do all our work in.  We do this
     * so that we can reset the context during error recovery and thereby
     * avoid possible memory leaks.  Formerly this code just ran in
     * TopMemoryContext, but resetting that would be a really bad idea.
     */
    walwriter_context = AllocSetContextCreate(TopMemoryContext,
                                              "Wal Writer",
                                              ALLOCSET_DEFAULT_MINSIZE,
                                              ALLOCSET_DEFAULT_INITSIZE,
                                              ALLOCSET_DEFAULT_MAXSIZE);
    MemoryContextSwitchTo(walwriter_context);

    /*
     * If an exception is encountered, processing resumes here.
     *
     * This code is heavily based on bgwriter.c, q.v.
     */
    if (sigsetjmp(local_sigjmp_buf, 1) != 0)
    {
        /* Since not using PG_TRY, must reset error stack by hand */
        error_context_stack = NULL;

        /* Prevent interrupts while cleaning up */
        HOLD_INTERRUPTS();

        /* Report the error to the server log */
        EmitErrorReport();

        /*
         * These operations are really just a minimal subset of
         * AbortTransaction().  We don't have very many resources to worry
         * about in walwriter, but we do have LWLocks, and perhaps buffers?
         */
        LWLockReleaseAll();
        AbortBufferIO();
        UnlockBuffers();
        /* buffer pins are released here: */
        ResourceOwnerRelease(CurrentResourceOwner,
                             RESOURCE_RELEASE_BEFORE_LOCKS,
                             false, true);
        /* we needn't bother with the other ResourceOwnerRelease phases */
        AtEOXact_Buffers(false);
        AtEOXact_Files();
        AtEOXact_HashTables(false);

        /*
         * Now return to normal top-level context and clear ErrorContext for
         * next time.
         */
        MemoryContextSwitchTo(walwriter_context);
        FlushErrorState();

        /* Flush any leaked data in the top-level context */
        MemoryContextResetAndDeleteChildren(walwriter_context);

        /* Now we can allow interrupts again */
        RESUME_INTERRUPTS();

        /*
         * Sleep at least 1 second after any error.  A write error is likely
         * to be repeated, and we don't want to be filling the error logs as
         * fast as we can.
         */
        pg_usleep(1000000L);

        /*
         * Close all open files after any error.  This is helpful on Windows,
         * where holding deleted files open causes various strange errors.
         * It's not clear we need it elsewhere, but shouldn't hurt.
         */
        smgrcloseall();
    }

    /* We can now handle ereport(ERROR) */
    PG_exception_stack = &local_sigjmp_buf;

    /*
     * Unblock signals (they were blocked when the postmaster forked us)
     */
    PG_SETMASK(&UnBlockSig);

    /*
     * Loop forever
     */
    for (;;)
    {
        long        udelay;

        /*
         * Emergency bailout if postmaster has died.  This is to avoid the
         * necessity for manual cleanup of all postmaster children.
         */
        if (!PostmasterIsAlive(true))
            exit(1);

        /*
         * Process any requests or signals received recently.
         */
        if (got_SIGHUP)
        {
            got_SIGHUP = false;
            ProcessConfigFile(PGC_SIGHUP);
        }
        if (shutdown_requested)
        {
            /* Normal exit from the walwriter is here */
            proc_exit(0);       /* done */
        }

        /*
         * Do what we're here for...
         */
        XLogBackgroundFlush();

        /*
         * Delay until time to do something more, but fall out of delay
         * reasonably quickly if signaled.
         */
        udelay = WalWriterDelay * 1000L;
        while (udelay > 999999L)
        {
            if (got_SIGHUP || shutdown_requested)
                break;
            pg_usleep(1000000L);
            udelay -= 1000000L;
        }
        if (!(got_SIGHUP || shutdown_requested))
            pg_usleep(udelay);
    }
}
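/*
 * Illustrative sketch (an assumption -- the actual handlers live elsewhere
 * in walwriter.c and are not part of this excerpt): the flags polled by the
 * main loop above are set by trivial signal handlers along these lines.
 */
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t shutdown_requested = false;

/* SIGHUP: set flag to re-read config file at next convenient time */
static void
WalSigHupHandler(SIGNAL_ARGS)
{
    got_SIGHUP = true;
}

/* SIGINT/SIGTERM: set flag to exit normally at next convenient time */
static void
WalShutdownHandler(SIGNAL_ARGS)
{
    shutdown_requested = true;
}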
/*
 * Move tuples from pending pages into the regular GIN structure.
 *
 * At first glance it looks completely not crash-safe, but if we crash
 * after posting entries to the main index and before removing them from the
 * pending list, it's okay: when we redo the posting later on, nothing bad
 * will happen.
 *
 * If full_clean is true, we ignore the remembered tail page and keep
 * cleaning until the pending list is empty, even for entries added after
 * the cleanup started.
 *
 * fill_fsm indicates that ginInsertCleanup should add deleted pages to the
 * FSM; otherwise, the caller is responsible for putting deleted pages into
 * the FSM.
 *
 * If stats isn't null, we count deleted pending pages into the counts.
 */
void
ginInsertCleanup(GinState *ginstate, bool full_clean,
                 bool fill_fsm, IndexBulkDeleteResult *stats)
{
    Relation    index = ginstate->index;
    Buffer      metabuffer,
                buffer;
    Page        metapage,
                page;
    GinMetaPageData *metadata;
    MemoryContext opCtx,
                oldCtx;
    BuildAccumulator accum;
    KeyArray    datums;
    BlockNumber blkno,
                blknoFinish;
    bool        cleanupFinish = false;
    bool        fsm_vac = false;
    Size        workMemory;
    bool        inVacuum = (stats == NULL);

    /*
     * We would like to prevent concurrent cleanup processes.  For that we
     * will lock the metapage in exclusive mode using a LockPage() call.
     * Nobody else uses that lock on the metapage, so concurrent insertion
     * into the pending list remains possible.
     */
    if (inVacuum)
    {
        /*
         * We are called from [auto]vacuum/analyze or
         * gin_clean_pending_list() and we would like to wait for any
         * concurrent cleanup to finish.
         */
        LockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock);
        workMemory =
            (IsAutoVacuumWorkerProcess() && autovacuum_work_mem != -1) ?
            autovacuum_work_mem : maintenance_work_mem;
    }
    else
    {
        /*
         * We are called from a regular insert; if we see a concurrent
         * cleanup, just exit in the hope that the concurrent process will
         * clean up the pending list.
         */
        if (!ConditionalLockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock))
            return;
        workMemory = work_mem;
    }

    metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
    LockBuffer(metabuffer, GIN_SHARE);
    metapage = BufferGetPage(metabuffer);
    metadata = GinPageGetMeta(metapage);

    if (metadata->head == InvalidBlockNumber)
    {
        /* Nothing to do */
        UnlockReleaseBuffer(metabuffer);
        UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock);
        return;
    }

    /*
     * Remember a tail page to prevent infinite cleanup if other backends add
     * new tuples faster than we can clean up.
     */
    blknoFinish = metadata->tail;

    /*
     * Read and lock head of pending list
     */
    blkno = metadata->head;
    buffer = ReadBuffer(index, blkno);
    LockBuffer(buffer, GIN_SHARE);
    page = BufferGetPage(buffer);

    LockBuffer(metabuffer, GIN_UNLOCK);

    /*
     * Initialize.  All temporary space will be in opCtx
     */
    opCtx = AllocSetContextCreate(CurrentMemoryContext,
                                  "GIN insert cleanup temporary context",
                                  ALLOCSET_DEFAULT_MINSIZE,
                                  ALLOCSET_DEFAULT_INITSIZE,
                                  ALLOCSET_DEFAULT_MAXSIZE);

    oldCtx = MemoryContextSwitchTo(opCtx);

    initKeyArray(&datums, 128);
    ginInitBA(&accum);
    accum.ginstate = ginstate;

    /*
     * At the top of this loop, we have pin and lock on the current page of
     * the pending list.  However, we'll release that before exiting the
     * loop.  Note we also have pin but not lock on the metapage.
     */
    for (;;)
    {
        Assert(!GinPageIsDeleted(page));

        /*
         * Have we reached the page that was the list tail when we started
         * our cleanup?  If the caller asked us to clean up the whole pending
         * list, ignore the old tail; we will work until the list becomes
         * empty.
         */
        if (blkno == blknoFinish && full_clean == false)
            cleanupFinish = true;

        /*
         * read page's datums into accum
         */
        processPendingPage(&accum, &datums, page, FirstOffsetNumber);

        vacuum_delay_point();

        /*
         * Is it time to flush memory to disk?  Flush if we are at the end of
         * the pending list, or if we have a full row and memory is getting
         * full.
         */
        if (GinPageGetOpaque(page)->rightlink == InvalidBlockNumber ||
            (GinPageHasFullRow(page) &&
             (accum.allocatedMemory >= workMemory * 1024L)))
        {
            ItemPointerData *list;
            uint32      nlist;
            Datum       key;
            GinNullCategory category;
            OffsetNumber maxoff,
                        attnum;

            /*
             * Unlock current page to increase performance.  Changes of page
             * will be checked later by comparing maxoff after completion of
             * memory flush.
             */
            maxoff = PageGetMaxOffsetNumber(page);
            LockBuffer(buffer, GIN_UNLOCK);

            /*
             * Moving collected data into regular structure can take
             * significant amount of time - so, run it without locking
             * pending list.
             */
            ginBeginBAScan(&accum);
            while ((list = ginGetBAEntry(&accum,
                                         &attnum, &key, &category,
                                         &nlist)) != NULL)
            {
                ginEntryInsert(ginstate, attnum, key, category,
                               list, nlist, NULL);
                vacuum_delay_point();
            }

            /*
             * Lock the whole list to remove pages
             */
            LockBuffer(metabuffer, GIN_EXCLUSIVE);
            LockBuffer(buffer, GIN_SHARE);

            Assert(!GinPageIsDeleted(page));

            /*
             * While we left the page unlocked, more stuff might have gotten
             * added to it.  If so, process those entries immediately.  There
             * shouldn't be very many, so we don't worry about the fact that
             * we're doing this with exclusive lock.  Insertion algorithm
             * guarantees that inserted row(s) will not continue on next
             * page.  NOTE: intentionally no vacuum_delay_point in this loop.
             */
            if (PageGetMaxOffsetNumber(page) != maxoff)
            {
                ginInitBA(&accum);
                processPendingPage(&accum, &datums, page, maxoff + 1);

                ginBeginBAScan(&accum);
                while ((list = ginGetBAEntry(&accum,
                                             &attnum, &key, &category,
                                             &nlist)) != NULL)
                    ginEntryInsert(ginstate, attnum, key, category,
                                   list, nlist, NULL);
            }

            /*
             * Remember next page - it will become the new list head
             */
            blkno = GinPageGetOpaque(page)->rightlink;
            UnlockReleaseBuffer(buffer);    /* shiftList will do exclusive
                                             * locking */

            /*
             * remove read pages from pending list, at this point all content
             * of read pages is in regular structure
             */
            shiftList(index, metabuffer, blkno, fill_fsm, stats);

            /* At this point, some pending pages have been freed up */
            fsm_vac = true;

            Assert(blkno == metadata->head);
            LockBuffer(metabuffer, GIN_UNLOCK);

            /*
             * if we removed the whole pending list, or have cleaned up to
             * the tail page we remembered at the start of the cleanup, just
             * exit
             */
            if (blkno == InvalidBlockNumber || cleanupFinish)
                break;

            /*
             * release memory used so far and reinit state
             */
            MemoryContextReset(opCtx);
            initKeyArray(&datums, datums.maxvalues);
            ginInitBA(&accum);
        }
        else
        {
            blkno = GinPageGetOpaque(page)->rightlink;
            UnlockReleaseBuffer(buffer);
        }

        /*
         * Read next page in pending list
         */
        vacuum_delay_point();
        buffer = ReadBuffer(index, blkno);
        LockBuffer(buffer, GIN_SHARE);
        page = BufferGetPage(buffer);
    }

    UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock);
    ReleaseBuffer(metabuffer);

    /*
     * As pending list pages can have a high churn rate, it is desirable to
     * recycle them immediately to the FreeSpace Map when ordinary backends
     * clean the list.
     */
    if (fsm_vac && fill_fsm)
        IndexFreeSpaceMapVacuum(index);

    /* Clean up temporary space */
    MemoryContextSwitchTo(oldCtx);
    MemoryContextDelete(opCtx);
}
/*
 * A tuple in the heap is being inserted.  To keep a brin index up to date,
 * we need to obtain the relevant index tuple and compare its stored values
 * with those of the new tuple.  If the tuple values are not consistent with
 * the summary tuple, we need to update the index tuple.
 *
 * If the range is not currently summarized (i.e. the revmap returns NULL for
 * it), there's nothing to do.
 */
bool
brininsert(Relation idxRel, Datum *values, bool *nulls,
           ItemPointer heaptid, Relation heapRel,
           IndexUniqueCheck checkUnique)
{
    BlockNumber pagesPerRange;
    BrinDesc   *bdesc = NULL;
    BrinRevmap *revmap;
    Buffer      buf = InvalidBuffer;
    MemoryContext tupcxt = NULL;
    MemoryContext oldcxt = NULL;

    revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL);

    for (;;)
    {
        bool        need_insert = false;
        OffsetNumber off;
        BrinTuple  *brtup;
        BrinMemTuple *dtup;
        BlockNumber heapBlk;
        int         keyno;

        CHECK_FOR_INTERRUPTS();

        heapBlk = ItemPointerGetBlockNumber(heaptid);
        /* normalize the block number to be the first block in the range */
        heapBlk = (heapBlk / pagesPerRange) * pagesPerRange;
        brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL,
                                         BUFFER_LOCK_SHARE, NULL);

        /* if range is unsummarized, there's nothing to do */
        if (!brtup)
            break;

        /* First time through? */
        if (bdesc == NULL)
        {
            bdesc = brin_build_desc(idxRel);
            tupcxt = AllocSetContextCreate(CurrentMemoryContext,
                                           "brininsert cxt",
                                           ALLOCSET_DEFAULT_SIZES);
            oldcxt = MemoryContextSwitchTo(tupcxt);
        }

        dtup = brin_deform_tuple(bdesc, brtup);

        /*
         * Compare the key values of the new tuple to the stored index
         * values; our deformed tuple will get updated if the new tuple
         * doesn't fit the original range (note this means we can't break out
         * of the loop early).  Make a note of whether this happens, so that
         * we know to insert the modified tuple later.
         */
        for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
        {
            Datum       result;
            BrinValues *bval;
            FmgrInfo   *addValue;

            bval = &dtup->bt_columns[keyno];
            addValue = index_getprocinfo(idxRel, keyno + 1,
                                         BRIN_PROCNUM_ADDVALUE);
            result = FunctionCall4Coll(addValue,
                                       idxRel->rd_indcollation[keyno],
                                       PointerGetDatum(bdesc),
                                       PointerGetDatum(bval),
                                       values[keyno],
                                       nulls[keyno]);
            /* if that returned true, we need to insert the updated tuple */
            need_insert |= DatumGetBool(result);
        }

        if (!need_insert)
        {
            /*
             * The tuple is consistent with the new values, so there's
             * nothing to do.
             */
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        }
        else
        {
            Page        page = BufferGetPage(buf);
            ItemId      lp = PageGetItemId(page, off);
            Size        origsz;
            BrinTuple  *origtup;
            Size        newsz;
            BrinTuple  *newtup;
            bool        samepage;

            /*
             * Make a copy of the old tuple, so that we can compare it after
             * re-acquiring the lock.
             */
            origsz = ItemIdGetLength(lp);
            origtup = brin_copy_tuple(brtup, origsz);

            /*
             * Before releasing the lock, check if we can attempt a same-page
             * update.  Another process could insert a tuple concurrently in
             * the same page though, so downstream we must be prepared to
             * cope if this turns out to not be possible after all.
             */
            newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
            samepage = brin_can_do_samepage_update(buf, origsz, newsz);
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);

            /*
             * Try to update the tuple.  If this doesn't work for whatever
             * reason, we need to restart from the top; the revmap might be
             * pointing at a different tuple for this block now, so we need
             * to recompute to ensure both our new heap tuple and the other
             * inserter's are covered by the combined tuple.  It might be
             * that we don't need to update at all.
             */
            if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
                               buf, off, origtup, origsz, newtup, newsz,
                               samepage))
            {
                /* no luck; start over */
                MemoryContextResetAndDeleteChildren(tupcxt);
                continue;
            }
        }

        /* success! */
        break;
    }

    brinRevmapTerminate(revmap);
    if (BufferIsValid(buf))
        ReleaseBuffer(buf);
    if (bdesc != NULL)
    {
        brin_free_desc(bdesc);
        MemoryContextSwitchTo(oldcxt);
        MemoryContextDelete(tupcxt);
    }

    return false;
}
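/*
 * Illustrative sketch (an assumption, modeled on how brininsert calls the
 * BRIN_PROCNUM_ADDVALUE support procedure above, not taken from any real
 * opclass): the general shape of an "addValue" function.  It folds one new
 * heap value into the stored summary for the range and returns whether the
 * summary was modified.
 */
Datum
example_brin_add_value(PG_FUNCTION_ARGS)
{
    BrinDesc   *bdesc = (BrinDesc *) PG_GETARG_POINTER(0);
    BrinValues *column = (BrinValues *) PG_GETARG_POINTER(1);
    Datum       newval = PG_GETARG_DATUM(2);
    bool        isnull = PG_GETARG_BOOL(3);
    bool        updated = false;

    (void) bdesc;               /* a real opclass would consult the descriptor */
    (void) newval;

    if (isnull)
    {
        /* widen the summary to admit nulls, if it doesn't already */
        if (!column->bt_hasnulls)
        {
            column->bt_hasnulls = true;
            updated = true;
        }
        PG_RETURN_BOOL(updated);
    }

    /*
     * ... a real implementation would compare newval against the stored
     * column->bt_values[] and widen the summary as needed, setting
     * updated = true when it changes anything ...
     */
    PG_RETURN_BOOL(updated);
}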
/*
 * main entry point of pcp worker child process
 */
void
pcp_worker_main(int port)
{
    sigjmp_buf  local_sigjmp_buf;
    MemoryContext PCPMemoryContext;
    int         authenticated = 0;
    char        salt[4];
    int         random_salt = 0;
    struct timeval uptime;
    char        tos;
    int         rsize;
    char       *buf = NULL;

    ereport(DEBUG1,
            (errmsg("I am PCP worker child with pid:%d", getpid())));

    /* Identify myself via ps */
    init_ps_display("", "", "", "");

    gettimeofday(&uptime, NULL);
    srandom((unsigned int) (getpid() ^ uptime.tv_usec));

    /* set up signal handlers */
    signal(SIGTERM, die);
    signal(SIGINT, die);
    signal(SIGQUIT, die);
    signal(SIGCHLD, SIG_DFL);
    signal(SIGUSR2, wakeup_handler_child);
    signal(SIGUSR1, SIG_IGN);
    signal(SIGHUP, SIG_IGN);
    signal(SIGPIPE, SIG_IGN);
    signal(SIGALRM, SIG_IGN);

    /* Create per loop iteration memory context */
    PCPMemoryContext = AllocSetContextCreate(TopMemoryContext,
                                             "PCP_worker_main_loop",
                                             ALLOCSET_DEFAULT_MINSIZE,
                                             ALLOCSET_DEFAULT_INITSIZE,
                                             ALLOCSET_DEFAULT_MAXSIZE);

    MemoryContextSwitchTo(TopMemoryContext);

    /*
     * install the call back for preparation of pcp worker child exit
     */
    on_system_exit(pcp_worker_will_go_down, (Datum) NULL);

    /* Initialize my backend status */
    pool_initialize_private_backend_status();

    /* Initialize process context */
    pool_init_process_context();

    pcp_frontend = pcp_open(port);
    unset_nonblock(pcp_frontend->fd);

    if (sigsetjmp(local_sigjmp_buf, 1) != 0)
    {
        error_context_stack = NULL;
        EmitErrorReport();

        MemoryContextSwitchTo(TopMemoryContext);
        FlushErrorState();
    }

    /* We can now handle ereport(ERROR) */
    PG_exception_stack = &local_sigjmp_buf;

    for (;;)
    {
        MemoryContextSwitchTo(PCPMemoryContext);
        MemoryContextResetAndDeleteChildren(PCPMemoryContext);

        errno = 0;

        /* read a PCP packet */
        do_pcp_read(pcp_frontend, &tos, 1);
        do_pcp_read(pcp_frontend, &rsize, sizeof(int));

        rsize = ntohl(rsize);
        if ((rsize - sizeof(int)) > 0)
        {
            buf = (char *) palloc(rsize - sizeof(int));
            do_pcp_read(pcp_frontend, buf, rsize - sizeof(int));
        }

        ereport(DEBUG1,
                (errmsg("received PCP packet"),
                 errdetail("PCP packet type of service '%c'", tos)));

        if (tos == 'R')         /* authentication */
        {
            set_ps_display("PCP: processing authentication", false);
            process_authentication(pcp_frontend, buf, salt, &random_salt);
            authenticated = 1;
            continue;
        }
        if (tos == 'M')         /* md5 salt */
        {
            set_ps_display("PCP: processing authentication", false);
            send_md5salt(pcp_frontend, salt);
            random_salt = 1;
            continue;
        }
        /* is this connection authenticated? if not, disconnect immediately */
        if (!authenticated)
            ereport(FATAL,
                    (errmsg("authentication failed for new PCP connection"),
                     errdetail("connection not authorized")));

        /* process a request */
        pcp_process_command(tos, buf, rsize);
    }
    exit(0);
}
/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
             IndexBulkDeleteCallback callback, void *callback_state,
             BTCycleId cycleid)
{
    Relation    rel = info->index;
    BTVacState  vstate;
    BlockNumber num_pages;
    BlockNumber blkno;
    bool        needLock;

    /*
     * Reset counts that will be incremented during the scan; needed in case
     * of multiple scans during a single VACUUM command
     */
    stats->estimated_count = false;
    stats->num_index_tuples = 0;
    stats->pages_deleted = 0;

    /* Set up info to pass down to btvacuumpage */
    vstate.info = info;
    vstate.stats = stats;
    vstate.callback = callback;
    vstate.callback_state = callback_state;
    vstate.cycleid = cycleid;
    vstate.lastBlockVacuumed = BTREE_METAPAGE;  /* Initialise at first block */
    vstate.lastBlockLocked = BTREE_METAPAGE;
    vstate.totFreePages = 0;

    /* Create a temporary memory context to run _bt_pagedel in */
    vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
                                                  "_bt_pagedel",
                                                  ALLOCSET_DEFAULT_MINSIZE,
                                                  ALLOCSET_DEFAULT_INITSIZE,
                                                  ALLOCSET_DEFAULT_MAXSIZE);

    /*
     * The outer loop iterates over all index pages except the metapage, in
     * physical order (we hope the kernel will cooperate in providing
     * read-ahead for speed).  It is critical that we visit all leaf pages,
     * including ones added after we start the scan, else we might fail to
     * delete some deletable tuples.  Hence, we must repeatedly check the
     * relation length.  We must acquire the relation-extension lock while
     * doing so to avoid a race condition: if someone else is extending the
     * relation, there is a window where bufmgr/smgr have created a new
     * all-zero page but it hasn't yet been write-locked by _bt_getbuf().  If
     * we manage to scan such a page here, we'll improperly assume it can be
     * recycled.  Taking the lock synchronizes things enough to prevent a
     * problem: either num_pages won't include the new page, or _bt_getbuf
     * already has write lock on the buffer and it will be fully initialized
     * before we can examine it.  (See also vacuumlazy.c, which has the same
     * issue.)  Also, we need not worry if a page is added immediately after
     * we look; the page splitting code already has write-lock on the left
     * page before it adds a right page, so we must already have processed
     * any tuples due to be moved into such a page.
     *
     * We can skip locking for new or temp relations, however, since no one
     * else could be accessing them.
     */
    needLock = !RELATION_IS_LOCAL(rel);

    blkno = BTREE_METAPAGE + 1;
    for (;;)
    {
        /* Get the current relation length */
        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);
        num_pages = RelationGetNumberOfBlocks(rel);
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Quit if we've scanned the whole relation */
        if (blkno >= num_pages)
            break;
        /* Iterate over pages, then loop back to recheck length */
        for (; blkno < num_pages; blkno++)
        {
            btvacuumpage(&vstate, blkno, blkno);
        }
    }

    /*
     * If the WAL is replayed in hot standby, the replay process needs to get
     * cleanup locks on all index leaf pages, just as we've been doing here.
     * However, we won't issue any WAL records about pages that have no items
     * to be deleted.  For pages between pages we've vacuumed, the replay
     * code will take locks under the direction of the lastBlockVacuumed
     * fields in the XLOG_BTREE_VACUUM WAL records.  To cover pages after the
     * last one we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL
     * record against the last leaf page in the index, if that one wasn't
     * vacuumed.
     */
    if (XLogStandbyInfoActive() &&
        vstate.lastBlockVacuumed < vstate.lastBlockLocked)
    {
        Buffer      buf;

        /*
         * The page should be valid, but we can't use _bt_getbuf() because we
         * want to use a nondefault buffer access strategy.  Since we aren't
         * going to delete any items, getting cleanup lock again is probably
         * overkill, but for consistency do that anyway.
         */
        buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
                                 RBM_NORMAL, info->strategy);
        LockBufferForCleanup(buf);
        _bt_checkpage(rel, buf);
        _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
        _bt_relbuf(rel, buf);
    }

    MemoryContextDelete(vstate.pagedelcontext);

    /* update statistics */
    stats->num_pages = num_pages;
    stats->pages_free = vstate.totFreePages;
}
/*
 * geqo_eval
 *
 * Returns cost of a query tree as an individual of the population.
 */
Cost
geqo_eval(PlannerInfo *root, Gene *tour, int num_gene)
{
    MemoryContext mycontext;
    MemoryContext oldcxt;
    RelOptInfo *joinrel;
    Path       *best_path;
    Cost        fitness;
    int         savelength;
    struct HTAB *savehash;

    /*
     * Create a private memory context that will hold all temp storage
     * allocated inside gimme_tree().
     *
     * Since geqo_eval() will be called many times, we can't afford to let
     * all that memory go unreclaimed until end of statement.  Note we make
     * the temp context a child of the planner's normal context, so that it
     * will be freed even if we abort via ereport(ERROR).
     */
    mycontext = AllocSetContextCreate(CurrentMemoryContext,
                                      "GEQO",
                                      ALLOCSET_DEFAULT_MINSIZE,
                                      ALLOCSET_DEFAULT_INITSIZE,
                                      ALLOCSET_DEFAULT_MAXSIZE);
    oldcxt = MemoryContextSwitchTo(mycontext);

    /*
     * gimme_tree will add entries to root->join_rel_list, which may or may
     * not already contain some entries.  The newly added entries will be
     * recycled by the MemoryContextDelete below, so we must ensure that the
     * list is restored to its former state before exiting.  We can do this
     * by truncating the list to its original length.  NOTE this assumes that
     * any added entries are appended at the end!
     *
     * We also must take care not to mess up the outer join_rel_hash, if
     * there is one.  We can do this by just temporarily setting the link to
     * NULL.  (If we are dealing with enough join rels, which we very likely
     * are, a new hash table will get built and used locally.)
     *
     * join_rel_level[] shouldn't be in use, so just Assert it isn't.
     */
    savelength = list_length(root->join_rel_list);
    savehash = root->join_rel_hash;
    Assert(root->join_rel_level == NULL);

    root->join_rel_hash = NULL;

    /* construct the best path for the given combination of relations */
    joinrel = gimme_tree(root, tour, num_gene);
    best_path = joinrel->cheapest_total_path;

    /*
     * compute fitness
     *
     * XXX geqo does not currently support optimization for partial result
     * retrieval, nor do we take any cognizance of possible use of
     * parameterized paths --- how to fix?
     */
    fitness = best_path->total_cost;

    /*
     * Restore join_rel_list to its former state, and put back original
     * hashtable if any.
     */
    root->join_rel_list = list_truncate(root->join_rel_list, savelength);
    root->join_rel_hash = savehash;

    /* release all the memory acquired within gimme_tree */
    MemoryContextSwitchTo(oldcxt);
    MemoryContextDelete(mycontext);

    return fitness;
}
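/*
 * The scratch-context idiom used above, distilled into its minimal form
 * (illustrative only, not part of geqo_eval.c): allocate temp data in a
 * private child context, switch back, then delete the context to reclaim
 * everything at once.  Because the scratch context is a child of the
 * caller's context, it is also reclaimed automatically if we abort via
 * ereport(ERROR).
 */
static void
example_with_scratch_context(void)
{
    MemoryContext scratch;
    MemoryContext oldcxt;

    scratch = AllocSetContextCreate(CurrentMemoryContext,
                                    "scratch",
                                    ALLOCSET_DEFAULT_MINSIZE,
                                    ALLOCSET_DEFAULT_INITSIZE,
                                    ALLOCSET_DEFAULT_MAXSIZE);
    oldcxt = MemoryContextSwitchTo(scratch);

    /* ... palloc() freely here; nothing needs an explicit pfree() ... */

    MemoryContextSwitchTo(oldcxt);
    MemoryContextDelete(scratch);
}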
/* ----------------------------------------------------------------
 *      ExecHashTableCreate
 *
 *      create an empty hashtable data structure for hashjoin.
 * ----------------------------------------------------------------
 */
HashJoinTable
ExecHashTableCreate(HashState *hashState, HashJoinState *hjstate,
                    List *hashOperators, uint64 operatorMemKB)
{
    HashJoinTable hashtable;
    Plan       *outerNode;
    int         nbuckets;
    int         nbatch;
    int         nkeys;
    int         i;
    ListCell   *ho;
    MemoryContext oldcxt;

    START_MEMORY_ACCOUNT(hashState->ps.plan->memoryAccount);
    {
        Hash       *node = (Hash *) hashState->ps.plan;

        /*
         * Get information about the size of the relation to be hashed (it's
         * the "outer" subtree of this node, but the inner relation of the
         * hashjoin).  Compute the appropriate size of the hash table.
         */
        outerNode = outerPlan(node);

        /*
         * Initialize the hash table control block.
         *
         * The hashtable control block is just palloc'd from the executor's
         * per-query memory context.
         */
        hashtable = (HashJoinTable) palloc0(sizeof(HashJoinTableData));
        hashtable->buckets = NULL;
        hashtable->bloom = NULL;
        hashtable->curbatch = 0;
        hashtable->growEnabled = true;
        hashtable->totalTuples = 0;
        hashtable->batches = NULL;
        hashtable->work_set = NULL;
        hashtable->state_file = NULL;
        hashtable->spaceAllowed = operatorMemKB * 1024L;
        hashtable->stats = NULL;
        hashtable->eagerlyReleased = false;
        hashtable->hjstate = hjstate;

        /*
         * Create temporary memory contexts in which to keep the hashtable
         * working storage.  See notes in executor/hashjoin.h.
         */
        hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,
                                                   "HashTableContext",
                                                   ALLOCSET_DEFAULT_MINSIZE,
                                                   ALLOCSET_DEFAULT_INITSIZE,
                                                   ALLOCSET_DEFAULT_MAXSIZE);

        hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,
                                                    "HashBatchContext",
                                                    ALLOCSET_DEFAULT_MINSIZE,
                                                    ALLOCSET_DEFAULT_INITSIZE,
                                                    ALLOCSET_DEFAULT_MAXSIZE);

        /* CDB: track temp buf file allocations in separate context */
        hashtable->bfCxt = AllocSetContextCreate(CurrentMemoryContext,
                                                 "hbbfcxt",
                                                 ALLOCSET_DEFAULT_MINSIZE,
                                                 ALLOCSET_DEFAULT_INITSIZE,
                                                 ALLOCSET_DEFAULT_MAXSIZE);

        ExecChooseHashTableSize(outerNode->plan_rows, outerNode->plan_width,
                                &hashtable->nbuckets, &hashtable->nbatch,
                                operatorMemKB);

        nbuckets = hashtable->nbuckets;
        nbatch = hashtable->nbatch;
        hashtable->nbatch_original = nbatch;
        hashtable->nbatch_outstart = nbatch;

#ifdef HJDEBUG
        elog(LOG, "HJ: nbatch = %d, nbuckets = %d\n", nbatch, nbuckets);
#endif

        /*
         * Get info about the hash functions to be used for each hash key.
         * Also remember whether the join operators are strict.
         */
        nkeys = list_length(hashOperators);
        hashtable->outer_hashfunctions =
            (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
        hashtable->inner_hashfunctions =
            (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
        hashtable->hashStrict = (bool *) palloc(nkeys * sizeof(bool));
        i = 0;
        foreach(ho, hashOperators)
        {
            Oid         hashop = lfirst_oid(ho);
            Oid         left_hashfn;
            Oid         right_hashfn;

            if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn))
                elog(ERROR, "could not find hash function for hash operator %u",
                     hashop);
            fmgr_info(left_hashfn, &hashtable->outer_hashfunctions[i]);
            fmgr_info(right_hashfn, &hashtable->inner_hashfunctions[i]);
            hashtable->hashStrict[i] = op_strict(hashop);
            i++;
        }

        /*
         * Allocate data that will live for the life of the hashjoin
         */
        oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);

#ifdef HJDEBUG
        {
            /* Memory needed to allocate hashtable->batches, which consists
             * of nbatch pointers */
            int         md_batch_size = (nbatch * sizeof(hashtable->batches[0])) / (1024 * 1024);
            /* Memory needed to allocate hashtable->batches entries, which
             * consist of nbatch HashJoinBatchData structures */
            int         md_batch_data_size = (nbatch * sizeof(HashJoinBatchData)) / (1024 * 1024);
            /* Memory needed to allocate hashtable->buckets, which consists
             * of nbuckets HashJoinTuple structures */
            int         md_buckets_size = (nbuckets * sizeof(HashJoinTuple)) / (1024 * 1024);
            /* Memory needed to allocate hashtable->bloom, which consists of
             * nbuckets int64 values */
            int         md_bloom_size = (nbuckets * sizeof(uint64)) / (1024 * 1024);

            /* Total memory needed for the hashtable metadata */
            int         md_tot = md_batch_size + md_batch_data_size +
                md_buckets_size + md_bloom_size;

            elog(LOG, "About to allocate HashTable. HT_MEMORY=%dMB Memory needed for metadata: MDBATCH_ARR=%dMB, MDBATCH_DATA=%dMB, MDBUCKETS_ARR=%dMB, MDBLOOM_ARR=%dMB, TOTAL=%dMB",
                 (int) (hashtable->spaceAllowed / (1024 * 1024)),
                 md_batch_size, md_batch_data_size, md_buckets_size,
                 md_bloom_size, md_tot);

            elog(LOG, "sizeof(hashtable->batches[0])=%d, sizeof(HashJoinBatchData)=%d, sizeof(HashJoinTuple)=%d, sizeof(uint64)=%d",
                 (int) sizeof(hashtable->batches[0]),
                 (int) sizeof(HashJoinBatchData),
                 (int) sizeof(HashJoinTuple),
                 (int) sizeof(uint64));
        }
#endif

        /* array of BatchData ptrs */
        hashtable->batches = (HashJoinBatchData **)
            palloc(nbatch * sizeof(hashtable->batches[0]));

        /* one BatchData entry per initial batch */
        for (i = 0; i < nbatch; i++)
            hashtable->batches[i] = (HashJoinBatchData *)
                palloc0(sizeof(HashJoinBatchData));

        /*
         * Prepare context for the first-scan space allocations; allocate
         * the hashbucket array therein, and set each bucket "empty".
         */
        MemoryContextSwitchTo(hashtable->batchCxt);

        hashtable->buckets = (HashJoinTuple *)
            palloc0(nbuckets * sizeof(HashJoinTuple));

        if (gp_hashjoin_bloomfilter != 0)
            hashtable->bloom = (uint64 *) palloc0(nbuckets * sizeof(uint64));

        MemoryContextSwitchTo(oldcxt);
    }
    END_MEMORY_ACCOUNT();

    return hashtable;
}
/*
 * Build an SP-GiST index.
 */
Datum
spgbuild(PG_FUNCTION_ARGS)
{
    Relation    heap = (Relation) PG_GETARG_POINTER(0);
    Relation    index = (Relation) PG_GETARG_POINTER(1);
    IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
    IndexBuildResult *result;
    double      reltuples;
    SpGistBuildState buildstate;
    Buffer      metabuffer,
                rootbuffer;

    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /*
     * Initialize the meta page and root page
     */
    metabuffer = SpGistNewBuffer(index);
    rootbuffer = SpGistNewBuffer(index);

    Assert(BufferGetBlockNumber(metabuffer) == SPGIST_METAPAGE_BLKNO);
    Assert(BufferGetBlockNumber(rootbuffer) == SPGIST_HEAD_BLKNO);

    START_CRIT_SECTION();

    SpGistInitMetapage(BufferGetPage(metabuffer));
    MarkBufferDirty(metabuffer);
    SpGistInitBuffer(rootbuffer, SPGIST_LEAF);
    MarkBufferDirty(rootbuffer);

    if (RelationNeedsWAL(index))
    {
        XLogRecPtr  recptr;
        XLogRecData rdata;

        /* WAL data is just the relfilenode */
        rdata.data = (char *) &(index->rd_node);
        rdata.len = sizeof(RelFileNode);
        rdata.buffer = InvalidBuffer;
        rdata.next = NULL;

        recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_CREATE_INDEX, &rdata);

        PageSetLSN(BufferGetPage(metabuffer), recptr);
        PageSetTLI(BufferGetPage(metabuffer), ThisTimeLineID);
        PageSetLSN(BufferGetPage(rootbuffer), recptr);
        PageSetTLI(BufferGetPage(rootbuffer), ThisTimeLineID);
    }

    END_CRIT_SECTION();

    UnlockReleaseBuffer(metabuffer);
    UnlockReleaseBuffer(rootbuffer);

    /*
     * Now insert all the heap data into the index
     */
    initSpGistState(&buildstate.spgstate, index);
    buildstate.spgstate.isBuild = true;

    buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
                                              "SP-GiST build temporary context",
                                              ALLOCSET_DEFAULT_MINSIZE,
                                              ALLOCSET_DEFAULT_INITSIZE,
                                              ALLOCSET_DEFAULT_MAXSIZE);

    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
                                   spgistBuildCallback, (void *) &buildstate);

    MemoryContextDelete(buildstate.tmpCtx);

    SpGistUpdateMetaPage(index);

    result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult));
    result->heap_tuples = result->index_tuples = reltuples;

    PG_RETURN_POINTER(result);
}
/* * CopyIntoCStoreTable handles a "COPY cstore_table FROM" statement. This * function uses the COPY command's functions to read and parse rows from * the data source specified in the COPY statement. The function then writes * each row to the file specified in the cstore foreign table options. Finally, * the function returns the number of copied rows. */ static uint64 CopyIntoCStoreTable(const CopyStmt *copyStatement, const char *queryString) { uint64 processedRowCount = 0; Relation relation = NULL; Oid relationId = InvalidOid; TupleDesc tupleDescriptor = NULL; uint32 columnCount = 0; CopyState copyState = NULL; bool nextRowFound = true; Datum *columnValues = NULL; bool *columnNulls = NULL; TableWriteState *writeState = NULL; CStoreFdwOptions *cstoreFdwOptions = NULL; MemoryContext tupleContext = NULL; List *columnNameList = copyStatement->attlist; if (columnNameList != NULL) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("copy column list is not supported"))); } /* * We disallow copy from file or program except to superusers. These checks * are based on the checks in DoCopy() function of copy.c. */ if (copyStatement->filename != NULL && !superuser()) { if (copyStatement->is_program) { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to COPY to or from a program"), errhint("Anyone can COPY to stdout or from stdin. " "psql's \\copy command also works for anyone."))); } else { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to COPY to or from a file"), errhint("Anyone can COPY to stdout or from stdin. " "psql's \\copy command also works for anyone."))); } } Assert(copyStatement->relation != NULL); /* * Open and lock the relation. We acquire ExclusiveLock to allow concurrent * reads, but block concurrent writes. */ relation = heap_openrv(copyStatement->relation, ExclusiveLock); relationId = RelationGetRelid(relation); /* allocate column values and nulls arrays */ tupleDescriptor = RelationGetDescr(relation); columnCount = tupleDescriptor->natts; columnValues = palloc0(columnCount * sizeof(Datum)); columnNulls = palloc0(columnCount * sizeof(bool)); cstoreFdwOptions = CStoreGetOptions(relationId); /* * We create a new memory context called tuple context, and read and write * each row's values within this memory context. After each read and write, * we reset the memory context. That way, we immediately release memory * allocated for each row, and don't bloat memory usage with large input * files. 
*/ tupleContext = AllocSetContextCreate(CurrentMemoryContext, "CStore COPY Row Memory Context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); /* init state to read from COPY data source */ copyState = BeginCopyFrom(relation, copyStatement->filename, copyStatement->is_program, NIL, copyStatement->options); /* init state to write to the cstore file */ writeState = CStoreBeginWrite(cstoreFdwOptions->filename, cstoreFdwOptions->compressionType, cstoreFdwOptions->stripeRowCount, cstoreFdwOptions->blockRowCount, tupleDescriptor); while (nextRowFound) { /* read the next row in tupleContext */ MemoryContext oldContext = MemoryContextSwitchTo(tupleContext); nextRowFound = NextCopyFrom(copyState, NULL, columnValues, columnNulls, NULL); MemoryContextSwitchTo(oldContext); /* write the row to the cstore file */ if (nextRowFound) { CStoreWriteRow(writeState, columnValues, columnNulls); processedRowCount++; } MemoryContextReset(tupleContext); } /* end read/write sessions and close the relation */ EndCopyFrom(copyState); CStoreEndWrite(writeState); heap_close(relation, ExclusiveLock); return processedRowCount; }
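The read-inside/reset-after shape of that loop is worth isolating, since it is the standard way to keep per-row memory bounded. A minimal sketch of the same pattern, where read_one_row() and write_one_row() are hypothetical stand-ins for NextCopyFrom() and CStoreWriteRow():

MemoryContext rowCtx = AllocSetContextCreate(CurrentMemoryContext,
											 "per-row context",
											 ALLOCSET_DEFAULT_MINSIZE,
											 ALLOCSET_DEFAULT_INITSIZE,
											 ALLOCSET_DEFAULT_MAXSIZE);

for (;;)
{
	MemoryContext oldCtx = MemoryContextSwitchTo(rowCtx);
	bool		found = read_one_row();		/* pallocs parsed values in rowCtx */

	MemoryContextSwitchTo(oldCtx);
	if (!found)
		break;
	write_one_row();						/* consume the row first ... */
	MemoryContextReset(rowCtx);				/* ... then free it wholesale */
}
MemoryContextDelete(rowCtx);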
/* * Execute the index scan. * * This works by reading index TIDs from the revmap, and obtaining the index * tuples pointed to by them; the summary values in the index tuples are * compared to the scan keys. We return into the TID bitmap all the pages in * ranges corresponding to index tuples that match the scan keys. * * If a TID from the revmap is read as InvalidTID, we know that range is * unsummarized. Pages in those ranges need to be returned regardless of scan * keys. */ int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { Relation idxRel = scan->indexRelation; Buffer buf = InvalidBuffer; BrinDesc *bdesc; Oid heapOid; Relation heapRel; BrinOpaque *opaque; BlockNumber nblocks; BlockNumber heapBlk; int totalpages = 0; FmgrInfo *consistentFn; MemoryContext oldcxt; MemoryContext perRangeCxt; opaque = (BrinOpaque *) scan->opaque; bdesc = opaque->bo_bdesc; pgstat_count_index_scan(idxRel); /* * We need to know the size of the table so that we know how long to * iterate on the revmap. */ heapOid = IndexGetRelation(RelationGetRelid(idxRel), false); heapRel = heap_open(heapOid, AccessShareLock); nblocks = RelationGetNumberOfBlocks(heapRel); heap_close(heapRel, AccessShareLock); /* * Make room for the consistent support procedures of indexed columns. We * don't look them up here; we do that lazily the first time we see a scan * key reference each of them. We rely on zeroing fn_oid to InvalidOid. */ consistentFn = palloc0(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts); /* * Setup and use a per-range memory context, which is reset every time we * loop below. This avoids having to free the tuples within the loop. */ perRangeCxt = AllocSetContextCreate(CurrentMemoryContext, "bringetbitmap cxt", ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(perRangeCxt); /* * Now scan the revmap. We start by querying for heap page 0, * incrementing by the number of pages per range; this gives us a full * view of the table. */ for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange) { bool addrange; BrinTuple *tup; OffsetNumber off; Size size; CHECK_FOR_INTERRUPTS(); MemoryContextResetAndDeleteChildren(perRangeCxt); tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf, &off, &size, BUFFER_LOCK_SHARE, scan->xs_snapshot); if (tup) { tup = brin_copy_tuple(tup, size); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } /* * For page ranges with no indexed tuple, we must return the whole * range; otherwise, compare it to the scan keys. */ if (tup == NULL) { addrange = true; } else { BrinMemTuple *dtup; dtup = brin_deform_tuple(bdesc, tup); if (dtup->bt_placeholder) { /* * Placeholder tuples are always returned, regardless of the * values stored in them. */ addrange = true; } else { int keyno; /* * Compare scan keys with summary values stored for the range. * If scan keys are matched, the page range must be added to * the bitmap. We initially assume the range needs to be * added; in particular this serves the case where there are * no keys. */ addrange = true; for (keyno = 0; keyno < scan->numberOfKeys; keyno++) { ScanKey key = &scan->keyData[keyno]; AttrNumber keyattno = key->sk_attno; BrinValues *bval = &dtup->bt_columns[keyattno - 1]; Datum add; /* * The collation of the scan key must match the collation * used in the index column (but only if the search is not * IS NULL/ IS NOT NULL). Otherwise we shouldn't be using * this index ... */ Assert((key->sk_flags & SK_ISNULL) || (key->sk_collation == bdesc->bd_tupdesc->attrs[keyattno - 1]->attcollation)); /* First time this column? 
look up consistent function */ if (consistentFn[keyattno - 1].fn_oid == InvalidOid) { FmgrInfo *tmp; tmp = index_getprocinfo(idxRel, keyattno, BRIN_PROCNUM_CONSISTENT); fmgr_info_copy(&consistentFn[keyattno - 1], tmp, CurrentMemoryContext); } /* * Check whether the scan key is consistent with the page * range values; if so, have the pages in the range added * to the output bitmap. * * When there are multiple scan keys, failure to meet the * criteria for a single one of them is enough to discard * the range as a whole, so break out of the loop as soon * as a false return value is obtained. */ add = FunctionCall3Coll(&consistentFn[keyattno - 1], key->sk_collation, PointerGetDatum(bdesc), PointerGetDatum(bval), PointerGetDatum(key)); addrange = DatumGetBool(add); if (!addrange) break; } } } /* add the pages in the range to the output bitmap, if needed */ if (addrange) { BlockNumber pageno; for (pageno = heapBlk; pageno <= heapBlk + opaque->bo_pagesPerRange - 1; pageno++) { MemoryContextSwitchTo(oldcxt); tbm_add_page(tbm, pageno); totalpages++; MemoryContextSwitchTo(perRangeCxt); } } } MemoryContextSwitchTo(oldcxt); MemoryContextDelete(perRangeCxt); if (buf != InvalidBuffer) ReleaseBuffer(buf); /* * XXX We have an approximation of the number of *pages* that our scan * returns, but we don't have a precise idea of the number of heap tuples * involved. */ return totalpages * 10; }
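One cosmetic note: this function uses the ALLOCSET_DEFAULT_SIZES shorthand rather than spelling out the three size parameters as the other functions in this collection do. The macro (available since PostgreSQL 9.6) simply expands to those three parameters, so the two calls below create identically configured contexts:

MemoryContext a = AllocSetContextCreate(CurrentMemoryContext,
										"explicit sizes",
										ALLOCSET_DEFAULT_MINSIZE,
										ALLOCSET_DEFAULT_INITSIZE,
										ALLOCSET_DEFAULT_MAXSIZE);
MemoryContext b = AllocSetContextCreate(CurrentMemoryContext,
										"via the convenience macro",
										ALLOCSET_DEFAULT_SIZES);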
/* * init_MultiFuncCall * Create an empty FuncCallContext data structure * and do some other basic Multi-function call setup * and error checking */ FuncCallContext * init_MultiFuncCall(PG_FUNCTION_ARGS) { FuncCallContext *retval; /* * Bail if we're called in the wrong context */ if (fcinfo->resultinfo == NULL || !IsA(fcinfo->resultinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); if (fcinfo->flinfo->fn_extra == NULL) { /* * First call */ ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo; MemoryContext multi_call_ctx; /* * Create a suitably long-lived context to hold cross-call data */ multi_call_ctx = AllocSetContextCreate(fcinfo->flinfo->fn_mcxt, "SRF multi-call context", ALLOCSET_SMALL_MINSIZE, ALLOCSET_SMALL_INITSIZE, ALLOCSET_SMALL_MAXSIZE); /* * Allocate suitably long-lived space and zero it */ retval = (FuncCallContext *) MemoryContextAllocZero(multi_call_ctx, sizeof(FuncCallContext)); /* * initialize the elements */ retval->call_cntr = 0; retval->max_calls = 0; retval->slot = NULL; retval->user_fctx = NULL; retval->attinmeta = NULL; retval->tuple_desc = NULL; retval->multi_call_memory_ctx = multi_call_ctx; /* * save the pointer for cross-call use */ fcinfo->flinfo->fn_extra = retval; /* * Ensure we will get shut down cleanly if the exprcontext is not run * to completion. */ RegisterExprContextCallback(rsi->econtext, shutdown_MultiFuncCall, PointerGetDatum(fcinfo->flinfo)); } else { /* second and subsequent calls */ elog(ERROR, "init_MultiFuncCall cannot be called more than once"); /* never reached, but keep compiler happy */ retval = NULL; } return retval; }
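Callers normally reach init_MultiFuncCall() through the funcapi.h macros (SRF_FIRSTCALL_INIT() expands to a call to it) rather than invoking it directly. A minimal set-returning function showing where multi_call_memory_ctx fits; the function name countdown and its behavior are hypothetical:

#include "postgres.h"
#include "funcapi.h"

PG_FUNCTION_INFO_V1(countdown);

Datum
countdown(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;

	if (SRF_IS_FIRSTCALL())
	{
		MemoryContext oldcontext;

		funcctx = SRF_FIRSTCALL_INIT();		/* calls init_MultiFuncCall() */

		/* cross-call state must live in multi_call_memory_ctx */
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
		funcctx->max_calls = PG_GETARG_UINT32(0);
		MemoryContextSwitchTo(oldcontext);
	}

	funcctx = SRF_PERCALL_SETUP();

	if (funcctx->call_cntr < funcctx->max_calls)
		SRF_RETURN_NEXT(funcctx,
						Int32GetDatum((int32) (funcctx->max_calls -
											   funcctx->call_cntr)));

	SRF_RETURN_DONE(funcctx);
}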
/* * Main entry point for checkpointer process * * This is invoked from AuxiliaryProcessMain, which has already created the * basic execution environment, but not enabled signals yet. */ void CheckpointerMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext checkpointer_context; CheckpointerShmem->checkpointer_pid = MyProcPid; /* * If possible, make this process a group leader, so that the postmaster * can signal any child processes too. (checkpointer probably never has * any child processes, but for consistency we make all postmaster child * processes do this.) */ #ifdef HAVE_SETSID if (setsid() < 0) elog(FATAL, "setsid() failed: %m"); #endif /* * Properly accept or ignore signals the postmaster might send us * * Note: we deliberately ignore SIGTERM, because during a standard Unix * system shutdown cycle, init will SIGTERM all processes at once. We * want to wait for the backends to exit, whereupon the postmaster will * tell us it's okay to shut down (via SIGUSR2). */ pqsignal(SIGHUP, ChkptSigHupHandler); /* set flag to read config * file */ pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */ pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */ pqsignal(SIGQUIT, chkpt_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, chkpt_sigusr1_handler); pqsignal(SIGUSR2, ReqShutdownHandler); /* request shutdown */ /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ sigdelset(&BlockSig, SIGQUIT); /* * Initialize so that first time-driven event happens at the correct time. */ last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); /* * Create a resource owner to keep track of our resources (currently only * buffer pins). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Checkpointer"); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ checkpointer_context = AllocSetContextCreate(TopMemoryContext, "Checkpointer", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(checkpointer_context); /* * If an exception is encountered, processing resumes here. * * See notes in postgres.c about the design of this coding. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in checkpointer, but we do have LWLocks, buffers, and temp * files. */ LWLockReleaseAll(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); /* Warn any waiting backends that the checkpoint failed. 
*/ if (ckpt_active) { /* use volatile pointer to prevent code rearrangement */ volatile CheckpointerShmemStruct *cps = CheckpointerShmem; SpinLockAcquire(&cps->ckpt_lck); cps->ckpt_failed++; cps->ckpt_done = cps->ckpt_started; SpinLockRelease(&cps->ckpt_lck); ckpt_active = false; } /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(checkpointer_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(checkpointer_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. */ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. */ smgrcloseall(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Ensure all shared memory values are set correctly for the config. Doing * this here ensures no race conditions from other concurrent updaters. */ UpdateSharedMemoryConfig(); /* * Advertise our latch that backends can use to wake us up while we're * sleeping. */ ProcGlobal->checkpointerLatch = &MyProc->procLatch; /* * Loop forever */ for (;;) { bool do_checkpoint = false; int flags = 0; pg_time_t now; int elapsed_secs; int cur_timeout; int rc; /* Clear any already-pending wakeups */ ResetLatch(&MyProc->procLatch); /* * Process any requests or signals received recently. */ AbsorbFsyncRequests(); if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); /* * Checkpointer is the last process to shut down, so we ask it to * hold the keys for a range of other tasks required most of which * have nothing to do with checkpointing at all. * * For various reasons, some config values can change dynamically * so the primary copy of them is held in shared memory to make * sure all backends see the same value. We make Checkpointer * responsible for updating the shared memory copy if the * parameter setting changes because of SIGHUP. */ UpdateSharedMemoryConfig(); } if (checkpoint_requested) { checkpoint_requested = false; do_checkpoint = true; BgWriterStats.m_requested_checkpoints++; } if (shutdown_requested) { /* * From here on, elog(ERROR) should end with exit(1), not send * control back to the sigsetjmp block above */ ExitOnAnyError = true; /* Close down the database */ ShutdownXLOG(0, 0); /* Normal exit from the checkpointer is here */ proc_exit(0); /* done */ } /* * Force a checkpoint if too much time has elapsed since the last one. * Note that we count a timed checkpoint in stats only when this * occurs without an external request, but we set the CAUSE_TIME flag * bit even if there is also an external request. */ now = (pg_time_t) time(NULL); elapsed_secs = now - last_checkpoint_time; if (elapsed_secs >= CheckPointTimeout) { if (!do_checkpoint) BgWriterStats.m_timed_checkpoints++; do_checkpoint = true; flags |= CHECKPOINT_CAUSE_TIME; } /* * Do a checkpoint if requested. */ if (do_checkpoint) { bool ckpt_performed = false; bool do_restartpoint; /* use volatile pointer to prevent code rearrangement */ volatile CheckpointerShmemStruct *cps = CheckpointerShmem; /* * Check if we should perform a checkpoint or a restartpoint. 
As a * side-effect, RecoveryInProgress() initializes TimeLineID if * it's not set yet. */ do_restartpoint = RecoveryInProgress(); /* * Atomically fetch the request flags to figure out what kind of a * checkpoint we should perform, and increase the started-counter * to acknowledge that we've started a new checkpoint. */ SpinLockAcquire(&cps->ckpt_lck); flags |= cps->ckpt_flags; cps->ckpt_flags = 0; cps->ckpt_started++; SpinLockRelease(&cps->ckpt_lck); /* * The end-of-recovery checkpoint is a real checkpoint that's * performed while we're still in recovery. */ if (flags & CHECKPOINT_END_OF_RECOVERY) do_restartpoint = false; /* * We will warn if (a) too soon since last checkpoint (whatever * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag * since the last checkpoint start. Note in particular that this * implementation will not generate warnings caused by * CheckPointTimeout < CheckPointWarning. */ if (!do_restartpoint && (flags & CHECKPOINT_CAUSE_XLOG) && elapsed_secs < CheckPointWarning) ereport(LOG, (errmsg_plural("checkpoints are occurring too frequently (%d second apart)", "checkpoints are occurring too frequently (%d seconds apart)", elapsed_secs, elapsed_secs), errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); /* * Initialize checkpointer-private variables used during * checkpoint */ ckpt_active = true; if (!do_restartpoint) ckpt_start_recptr = GetInsertRecPtr(); ckpt_start_time = now; ckpt_cached_elapsed = 0; /* * Do the checkpoint. */ if (!do_restartpoint) { CreateCheckPoint(flags); ckpt_performed = true; } else ckpt_performed = CreateRestartPoint(flags); /* * After any checkpoint, close all smgr files. This is so we * won't hang onto smgr references to deleted files indefinitely. */ smgrcloseall(); /* * Indicate checkpoint completion to any waiting backends. */ SpinLockAcquire(&cps->ckpt_lck); cps->ckpt_done = cps->ckpt_started; SpinLockRelease(&cps->ckpt_lck); if (ckpt_performed) { /* * Note we record the checkpoint start time not end time as * last_checkpoint_time. This is so that time-driven * checkpoints happen at a predictable spacing. */ last_checkpoint_time = now; } else { /* * We were not able to perform the restartpoint (checkpoints * throw an ERROR in case of error). Most likely because we * have not received any new checkpoint WAL records since the * last restartpoint. Try again in 15 s. */ last_checkpoint_time = now - CheckPointTimeout + 15; } ckpt_active = false; } /* Check for archive_timeout and switch xlog files if necessary. */ CheckArchiveTimeout(); /* * Send off activity statistics to the stats collector. (The reason * why we re-use bgwriter-related code for this is that the bgwriter * and checkpointer used to be just one process. It's probably not * worth the trouble to split the stats support into two independent * stats message types.) */ pgstat_send_bgwriter(); /* * Sleep until we are signaled or it's time for another checkpoint or * xlog file switch. */ now = (pg_time_t) time(NULL); elapsed_secs = now - last_checkpoint_time; if (elapsed_secs >= CheckPointTimeout) continue; /* no sleep for us ... */ cur_timeout = CheckPointTimeout - elapsed_secs; if (XLogArchiveTimeout > 0 && !RecoveryInProgress()) { elapsed_secs = now - last_xlog_switch_time; if (elapsed_secs >= XLogArchiveTimeout) continue; /* no sleep for us ... 
*/ cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs); } rc = WaitLatch(&MyProc->procLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, cur_timeout * 1000L /* convert to ms */ ); /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (rc & WL_POSTMASTER_DEATH) exit(1); } }
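The sigsetjmp dance above is the standard recipe for a long-lived auxiliary process: run everything in a resettable child of TopMemoryContext so that error recovery can reclaim leaked memory wholesale instead of tracking individual allocations. Stripped to its skeleton (signal setup and resource cleanup elided, names hypothetical):

static void
HypotheticalWorkerMain(void)
{
	sigjmp_buf	local_sigjmp_buf;
	MemoryContext worker_context;

	worker_context = AllocSetContextCreate(TopMemoryContext,
										   "Hypothetical worker",
										   ALLOCSET_DEFAULT_MINSIZE,
										   ALLOCSET_DEFAULT_INITSIZE,
										   ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(worker_context);

	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* error recovery: log it, then reclaim everything we leaked */
		error_context_stack = NULL;
		EmitErrorReport();
		MemoryContextSwitchTo(worker_context);
		FlushErrorState();
		MemoryContextResetAndDeleteChildren(worker_context);
	}

	/* arm ereport(ERROR) to land in the block above */
	PG_exception_stack = &local_sigjmp_buf;

	for (;;)
	{
		/* ... normal work, always running in worker_context ... */
	}
}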
int SPI_connect(void) { int newdepth; /* * When a procedure is called by the Executor, _SPI_curid is expected to * be equal to _SPI_connected */ if (_SPI_curid != _SPI_connected) return SPI_ERROR_CONNECT; if (_SPI_stack == NULL) { if (_SPI_connected != -1 || _SPI_stack_depth != 0) elog(ERROR, "SPI stack corrupted"); newdepth = 16; _SPI_stack = (_SPI_connection *) MemoryContextAlloc(TopTransactionContext, newdepth * sizeof(_SPI_connection)); _SPI_stack_depth = newdepth; } else { if (_SPI_stack_depth <= 0 || _SPI_stack_depth <= _SPI_connected) elog(ERROR, "SPI stack corrupted"); if (_SPI_stack_depth == _SPI_connected + 1) { newdepth = _SPI_stack_depth * 2; _SPI_stack = (_SPI_connection *) repalloc(_SPI_stack, newdepth * sizeof(_SPI_connection)); _SPI_stack_depth = newdepth; } } /* * We're entering a procedure where _SPI_curid == _SPI_connected - 1 */ _SPI_connected++; Assert(_SPI_connected >= 0 && _SPI_connected < _SPI_stack_depth); _SPI_current = &(_SPI_stack[_SPI_connected]); _SPI_current->processed = 0; _SPI_current->tuptable = NULL; _SPI_current->procCxt = NULL; /* in case we fail to create 'em */ _SPI_current->execCxt = NULL; _SPI_current->connectSubid = GetCurrentSubTransactionId(); /* * Create memory contexts for this procedure * * XXX it would be better to use PortalContext as the parent context, but * we may not be inside a portal (consider deferred-trigger * execution). Perhaps CurTransactionContext would do? For now it * doesn't matter because we clean up explicitly in AtEOSubXact_SPI(). */ _SPI_current->procCxt = AllocSetContextCreate(TopTransactionContext, "SPI Proc", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); _SPI_current->execCxt = AllocSetContextCreate(TopTransactionContext, "SPI Exec", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); /* ... and switch to procedure's context */ _SPI_current->savedcxt = MemoryContextSwitchTo(_SPI_current->procCxt); return SPI_OK_CONNECT; }
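From a C-language function, the connect/execute/finish cycle that exercises these contexts looks roughly like the sketch below; note that any result needed after SPI_finish() must first be copied out of the SPI procedure context, e.g. with SPI_palloc() or SPI_copytuple():

#include "executor/spi.h"

if (SPI_connect() != SPI_OK_CONNECT)
	elog(ERROR, "SPI_connect failed");

/* runs inside the "SPI Proc"/"SPI Exec" contexts created above */
if (SPI_execute("SELECT count(*) FROM pg_class", true, 0) != SPI_OK_SELECT)
	elog(ERROR, "SPI_execute failed");

/* SPI_finish() deletes procCxt and execCxt, freeing all SPI results */
if (SPI_finish() != SPI_OK_FINISH)
	elog(ERROR, "SPI_finish failed");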
Datum gistrescan(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); ScanKey key = (ScanKey) PG_GETARG_POINTER(1); ScanKey orderbys = (ScanKey) PG_GETARG_POINTER(3); /* nkeys and norderbys arguments are ignored */ GISTScanOpaque so = (GISTScanOpaque) scan->opaque; bool first_time; int i; MemoryContext oldCxt; /* rescan an existing indexscan --- reset state */ /* * The first time through, we create the search queue in the scanCxt. * Subsequent times through, we create the queue in a separate queueCxt, * which is created on the second call and reset on later calls. Thus, in * the common case where a scan is only rescan'd once, we just put the * queue in scanCxt and don't pay the overhead of making a second memory * context. If we do rescan more than once, the first RBTree is just left * for dead until end of scan; this small wastage seems worth the savings * in the common case. */ if (so->queue == NULL) { /* first time through */ Assert(so->queueCxt == so->giststate->scanCxt); first_time = true; } else if (so->queueCxt == so->giststate->scanCxt) { /* second time through */ so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt, "GiST queue context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); first_time = false; } else { /* third or later time through */ MemoryContextReset(so->queueCxt); first_time = false; } /* create new, empty RBTree for search queue */ oldCxt = MemoryContextSwitchTo(so->queueCxt); so->queue = pairingheap_allocate(pairingheap_GISTSearchItem_cmp, scan); MemoryContextSwitchTo(oldCxt); so->firstCall = true; /* Update scan key, if a new one is given */ if (key && scan->numberOfKeys > 0) { void **fn_extras = NULL; /* * If this isn't the first time through, preserve the fn_extra * pointers, so that if the consistentFns are using them to cache * data, that data is not leaked across a rescan. */ if (!first_time) { fn_extras = (void **) palloc(scan->numberOfKeys * sizeof(void *)); for (i = 0; i < scan->numberOfKeys; i++) fn_extras[i] = scan->keyData[i].sk_func.fn_extra; } memmove(scan->keyData, key, scan->numberOfKeys * sizeof(ScanKeyData)); /* * Modify the scan key so that the Consistent method is called for all * comparisons. The original operator is passed to the Consistent * function in the form of its strategy number, which is available * from the sk_strategy field, and its subtype from the sk_subtype * field. * * Next, if any of keys is a NULL and that key is not marked with * SK_SEARCHNULL/SK_SEARCHNOTNULL then nothing can be found (ie, we * assume all indexable operators are strict). 
*/ so->qual_ok = true; for (i = 0; i < scan->numberOfKeys; i++) { ScanKey skey = scan->keyData + i; fmgr_info_copy(&(skey->sk_func), &(so->giststate->consistentFn[skey->sk_attno - 1]), so->giststate->scanCxt); /* Restore prior fn_extra pointers, if not first time */ if (!first_time) skey->sk_func.fn_extra = fn_extras[i]; if (skey->sk_flags & SK_ISNULL) { if (!(skey->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL))) so->qual_ok = false; } } if (!first_time) pfree(fn_extras); } /* Update order-by key, if a new one is given */ if (orderbys && scan->numberOfOrderBys > 0) { void **fn_extras = NULL; /* As above, preserve fn_extra if not first time through */ if (!first_time) { fn_extras = (void **) palloc(scan->numberOfOrderBys * sizeof(void *)); for (i = 0; i < scan->numberOfOrderBys; i++) fn_extras[i] = scan->orderByData[i].sk_func.fn_extra; } memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); /* * Modify the order-by key so that the Distance method is called for * all comparisons. The original operator is passed to the Distance * function in the form of its strategy number, which is available * from the sk_strategy field, and its subtype from the sk_subtype * field. */ for (i = 0; i < scan->numberOfOrderBys; i++) { ScanKey skey = scan->orderByData + i; FmgrInfo *finfo = &(so->giststate->distanceFn[skey->sk_attno - 1]); /* Check we actually have a distance function ... */ if (!OidIsValid(finfo->fn_oid)) elog(ERROR, "missing support function %d for attribute %d of index \"%s\"", GIST_DISTANCE_PROC, skey->sk_attno, RelationGetRelationName(scan->indexRelation)); fmgr_info_copy(&(skey->sk_func), finfo, so->giststate->scanCxt); /* Restore prior fn_extra pointers, if not first time */ if (!first_time) skey->sk_func.fn_extra = fn_extras[i]; } if (!first_time) pfree(fn_extras); } PG_RETURN_VOID(); }
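The queue-context handling at the top of this function is a reusable idiom: borrow the long-lived context for the first cycle, split off a resettable child on the second, and merely reset it on every cycle after that. In isolation (RescanState is a hypothetical stand-in for GISTScanOpaque):

typedef struct RescanState
{
	MemoryContext scanCxt;		/* long-lived, lives for the whole scan */
	MemoryContext queueCxt;		/* starts out equal to scanCxt */
	void	   *queue;			/* NULL until first built */
} RescanState;

static void
prepare_queue_context(RescanState *s)
{
	if (s->queue == NULL)
		Assert(s->queueCxt == s->scanCxt);	/* first time: use scanCxt as-is */
	else if (s->queueCxt == s->scanCxt)
		s->queueCxt = AllocSetContextCreate(s->scanCxt,	/* second time: split */
											"resettable queue context",
											ALLOCSET_DEFAULT_MINSIZE,
											ALLOCSET_DEFAULT_INITSIZE,
											ALLOCSET_DEFAULT_MAXSIZE);
	else
		MemoryContextReset(s->queueCxt);	/* later times: just reset */
}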
/* * btvacuumscan --- scan the index for VACUUMing purposes * * This combines the functions of looking for leaf tuples that are deletable * according to the vacuum callback, looking for empty pages that can be * deleted, and looking for old deleted pages that can be recycled. Both * btbulkdelete and btvacuumcleanup invoke this (the latter only if no * btbulkdelete call occurred). * * The caller is responsible for initially allocating/zeroing a stats struct * and for obtaining a vacuum cycle ID if necessary. */ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid) { Relation rel = info->index; BTVacState vstate; BlockNumber num_pages; BlockNumber blkno; bool needLock; /* * Reset counts that will be incremented during the scan; needed in case * of multiple scans during a single VACUUM command */ stats->estimated_count = false; stats->num_index_tuples = 0; stats->pages_deleted = 0; /* Set up info to pass down to btvacuumpage */ vstate.info = info; vstate.stats = stats; vstate.callback = callback; vstate.callback_state = callback_state; vstate.cycleid = cycleid; vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ vstate.lastUsedPage = BTREE_METAPAGE; vstate.totFreePages = 0; /* Create a temporary memory context to run _bt_pagedel in */ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, "_bt_pagedel", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); /* * The outer loop iterates over all index pages except the metapage, in * physical order (we hope the kernel will cooperate in providing * read-ahead for speed). It is critical that we visit all leaf pages, * including ones added after we start the scan, else we might fail to * delete some deletable tuples. Hence, we must repeatedly check the * relation length. We must acquire the relation-extension lock while * doing so to avoid a race condition: if someone else is extending the * relation, there is a window where bufmgr/smgr have created a new * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If * we manage to scan such a page here, we'll improperly assume it can be * recycled. Taking the lock synchronizes things enough to prevent a * problem: either num_pages won't include the new page, or _bt_getbuf * already has write lock on the buffer and it will be fully initialized * before we can examine it. (See also vacuumlazy.c, which has the same * issue.) Also, we need not worry if a page is added immediately after * we look; the page splitting code already has write-lock on the left * page before it adds a right page, so we must already have processed any * tuples due to be moved into such a page. * * We can skip locking for new or temp relations, however, since no one * else could be accessing them. 
*/ needLock = !RELATION_IS_LOCAL(rel); blkno = BTREE_METAPAGE + 1; for (;;) { /* Get the current relation length */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) { btvacuumpage(&vstate, blkno, blkno); } } /* * InHotStandby we need to scan right up to the end of the index for * correct locking, so we may need to write a WAL record for the final * block in the index if it was not vacuumed. It's possible that VACUUMing * has actually removed zeroed pages at the end of the index so we need to * take care to issue the record for last actual block and not for the * last block that was scanned. Ignore empty indexes. */ if (XLogStandbyInfoActive() && num_pages > 1 && vstate.lastBlockVacuumed < (num_pages - 1)) { Buffer buf; /* * We can't use _bt_getbuf() here because it always applies * _bt_checkpage(), which will barf on an all-zero page. We want to * recycle all-zero pages, not fail. Also, we want to use a * nondefault buffer access strategy. */ buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed); _bt_relbuf(rel, buf); } MemoryContextDelete(vstate.pagedelcontext); /* update statistics */ stats->num_pages = num_pages; stats->pages_free = vstate.totFreePages; }
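For completeness, the way btvacuumpage() is expected to use vstate.pagedelcontext is the usual reset-before-use pattern: _bt_pagedel() can allocate scratch data that nobody frees individually, so the whole context is cleared before each call. A sketch (the exact _bt_pagedel() signature varies across releases):

/* inside btvacuumpage(), when a page turns out to be deletable */
MemoryContext oldcontext;
int			ndeleted;

MemoryContextReset(vstate->pagedelcontext);
oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
ndeleted = _bt_pagedel(rel, buf);	/* scratch allocations land in pagedelcontext */
MemoryContextSwitchTo(oldcontext);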
Datum gistrescan(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); ScanKey key = (ScanKey) PG_GETARG_POINTER(1); ScanKey orderbys = (ScanKey) PG_GETARG_POINTER(3); /* nkeys and norderbys arguments are ignored */ GISTScanOpaque so = (GISTScanOpaque) scan->opaque; bool first_time; int i; MemoryContext oldCxt; /* rescan an existing indexscan --- reset state */ /* * The first time through, we create the search queue in the scanCxt. * Subsequent times through, we create the queue in a separate queueCxt, * which is created on the second call and reset on later calls. Thus, in * the common case where a scan is only rescan'd once, we just put the * queue in scanCxt and don't pay the overhead of making a second memory * context. If we do rescan more than once, the first RBTree is just left * for dead until end of scan; this small wastage seems worth the savings * in the common case. */ if (so->queue == NULL) { /* first time through */ Assert(so->queueCxt == so->giststate->scanCxt); first_time = true; } else if (so->queueCxt == so->giststate->scanCxt) { /* second time through */ so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt, "GiST queue context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); first_time = false; } else { /* third or later time through */ MemoryContextReset(so->queueCxt); first_time = false; } /* * If we're doing an index-only scan, on the first call, also initialize * a tuple descriptor to represent the returned index tuples and create a * memory context to hold them during the scan. */ if (scan->xs_want_itup && !scan->xs_itupdesc) { int natts; int attno; /* * The storage type of the index can be different from the original * datatype being indexed, so we cannot just grab the index's tuple * descriptor. Instead, construct a descriptor with the original data * types. */ natts = RelationGetNumberOfAttributes(scan->indexRelation); so->giststate->fetchTupdesc = CreateTemplateTupleDesc(natts, false); for (attno = 1; attno <= natts; attno++) { TupleDescInitEntry(so->giststate->fetchTupdesc, attno, NULL, scan->indexRelation->rd_opcintype[attno - 1], -1, 0); } scan->xs_itupdesc = so->giststate->fetchTupdesc; so->pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt, "GiST page data context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); } /* create new, empty RBTree for search queue */ oldCxt = MemoryContextSwitchTo(so->queueCxt); so->queue = pairingheap_allocate(pairingheap_GISTSearchItem_cmp, scan); MemoryContextSwitchTo(oldCxt); so->firstCall = true; /* Update scan key, if a new one is given */ if (key && scan->numberOfKeys > 0) { void **fn_extras = NULL; /* * If this isn't the first time through, preserve the fn_extra * pointers, so that if the consistentFns are using them to cache * data, that data is not leaked across a rescan. */ if (!first_time) { fn_extras = (void **) palloc(scan->numberOfKeys * sizeof(void *)); for (i = 0; i < scan->numberOfKeys; i++) fn_extras[i] = scan->keyData[i].sk_func.fn_extra; } memmove(scan->keyData, key, scan->numberOfKeys * sizeof(ScanKeyData)); /* * Modify the scan key so that the Consistent method is called for all * comparisons. The original operator is passed to the Consistent * function in the form of its strategy number, which is available * from the sk_strategy field, and its subtype from the sk_subtype * field.
* * Next, if any of keys is a NULL and that key is not marked with * SK_SEARCHNULL/SK_SEARCHNOTNULL then nothing can be found (ie, we * assume all indexable operators are strict). */ so->qual_ok = true; for (i = 0; i < scan->numberOfKeys; i++) { ScanKey skey = scan->keyData + i; fmgr_info_copy(&(skey->sk_func), &(so->giststate->consistentFn[skey->sk_attno - 1]), so->giststate->scanCxt); /* Restore prior fn_extra pointers, if not first time */ if (!first_time) skey->sk_func.fn_extra = fn_extras[i]; if (skey->sk_flags & SK_ISNULL) { if (!(skey->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL))) so->qual_ok = false; } } if (!first_time) pfree(fn_extras); } /* Update order-by key, if a new one is given */ if (orderbys && scan->numberOfOrderBys > 0) { void **fn_extras = NULL; /* As above, preserve fn_extra if not first time through */ if (!first_time) { fn_extras = (void **) palloc(scan->numberOfOrderBys * sizeof(void *)); for (i = 0; i < scan->numberOfOrderBys; i++) fn_extras[i] = scan->orderByData[i].sk_func.fn_extra; } memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); so->orderByTypes = (Oid *) palloc(scan->numberOfOrderBys * sizeof(Oid)); /* * Modify the order-by key so that the Distance method is called for * all comparisons. The original operator is passed to the Distance * function in the form of its strategy number, which is available * from the sk_strategy field, and its subtype from the sk_subtype * field. */ for (i = 0; i < scan->numberOfOrderBys; i++) { ScanKey skey = scan->orderByData + i; FmgrInfo *finfo = &(so->giststate->distanceFn[skey->sk_attno - 1]); /* Check we actually have a distance function ... */ if (!OidIsValid(finfo->fn_oid)) elog(ERROR, "missing support function %d for attribute %d of index \"%s\"", GIST_DISTANCE_PROC, skey->sk_attno, RelationGetRelationName(scan->indexRelation)); fmgr_info_copy(&(skey->sk_func), finfo, so->giststate->scanCxt); /* * Look up the datatype returned by the original ordering operator. * GiST always uses a float8 for the distance function, but the * ordering operator could be anything else. * * XXX: The distance function is only allowed to be lossy if the * ordering operator's result type is float4 or float8. Otherwise * we don't know how to return the distance to the executor. But * we cannot check that here, as we won't know if the distance * function is lossy until it returns *recheck = true for the * first time. */ so->orderByTypes[i] = get_func_rettype(skey->sk_func.fn_oid); /* Restore prior fn_extra pointers, if not first time */ if (!first_time) skey->sk_func.fn_extra = fn_extras[i]; } if (!first_time) pfree(fn_extras); } PG_RETURN_VOID(); }
/* * Main entry point for bgwriter process * * This is invoked from BootstrapMain, which has already created the basic * execution environment, but not enabled signals yet. */ void BackgroundWriterMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; BgWriterShmem->bgwriter_pid = MyProcPid; am_bg_writer = true; /* * If possible, make this process a group leader, so that the postmaster * can signal any child processes too. (bgwriter probably never has any * child processes, but for consistency we make all postmaster child * processes do this.) */ #ifdef HAVE_SETSID if (setsid() < 0) elog(FATAL, "setsid() failed: %m"); #endif /* * Properly accept or ignore signals the postmaster might send us * * Note: we deliberately ignore SIGTERM, because during a standard Unix * system shutdown cycle, init will SIGTERM all processes at once. We * want to wait for the backends to exit, whereupon the postmaster will * tell us it's okay to shut down (via SIGUSR2). * * SIGUSR1 is presently unused; keep it spare in case someday we want this * process to participate in sinval messaging. */ pqsignal(SIGHUP, BgSigHupHandler); /* set flag to read config file */ pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */ pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */ pqsignal(SIGQUIT, bg_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, SIG_IGN); /* reserve for sinval */ pqsignal(SIGUSR2, ReqShutdownHandler); /* request shutdown */ /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ #ifdef HAVE_SIGPROCMASK sigdelset(&BlockSig, SIGQUIT); #else BlockSig &= ~(sigmask(SIGQUIT)); #endif /* * Initialize so that first time-driven event happens at the correct time. */ last_checkpoint_time = last_xlog_switch_time = time(NULL); /* * Create a resource owner to keep track of our resources (currently only * buffer pins). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer"); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ bgwriter_context = AllocSetContextCreate(TopMemoryContext, "Background Writer", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(bgwriter_context); /* * If an exception is encountered, processing resumes here. * * See notes in postgres.c about the design of this coding. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in bgwriter, but we do have LWLocks, buffers, and temp files. 
*/ LWLockReleaseAll(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_Files(); AtEOXact_HashTables(false); /* Warn any waiting backends that the checkpoint failed. */ if (ckpt_active) { /* use volatile pointer to prevent code rearrangement */ volatile BgWriterShmemStruct *bgs = BgWriterShmem; SpinLockAcquire(&bgs->ckpt_lck); bgs->ckpt_failed++; bgs->ckpt_done = bgs->ckpt_started; SpinLockRelease(&bgs->ckpt_lck); ckpt_active = false; } /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(bgwriter_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(bgwriter_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. */ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. */ smgrcloseall(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Loop forever */ for (;;) { bool do_checkpoint = false; int flags = 0; time_t now; int elapsed_secs; /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive(true)) exit(1); /* * Process any requests or signals received recently. */ AbsorbFsyncRequests(); if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } if (checkpoint_requested) { checkpoint_requested = false; do_checkpoint = true; BgWriterStats.m_requested_checkpoints++; } if (shutdown_requested) { /* * From here on, elog(ERROR) should end with exit(1), not send * control back to the sigsetjmp block above */ ExitOnAnyError = true; /* Close down the database */ ShutdownXLOG(0, 0); DumpFreeSpaceMap(0, 0); /* Normal exit from the bgwriter is here */ proc_exit(0); /* done */ } /* * Force a checkpoint if too much time has elapsed since the last one. * Note that we count a timed checkpoint in stats only when this * occurs without an external request, but we set the CAUSE_TIME flag * bit even if there is also an external request. */ now = time(NULL); elapsed_secs = now - last_checkpoint_time; if (elapsed_secs >= CheckPointTimeout) { if (!do_checkpoint) BgWriterStats.m_timed_checkpoints++; do_checkpoint = true; flags |= CHECKPOINT_CAUSE_TIME; } /* * Do a checkpoint if requested, otherwise do one cycle of * dirty-buffer writing. */ if (do_checkpoint) { /* use volatile pointer to prevent code rearrangement */ volatile BgWriterShmemStruct *bgs = BgWriterShmem; /* * Atomically fetch the request flags to figure out what kind of a * checkpoint we should perform, and increase the started-counter * to acknowledge that we've started a new checkpoint. 
*/ SpinLockAcquire(&bgs->ckpt_lck); flags |= bgs->ckpt_flags; bgs->ckpt_flags = 0; bgs->ckpt_started++; SpinLockRelease(&bgs->ckpt_lck); /* * We will warn if (a) too soon since last checkpoint (whatever * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag * since the last checkpoint start. Note in particular that this * implementation will not generate warnings caused by * CheckPointTimeout < CheckPointWarning. */ if ((flags & CHECKPOINT_CAUSE_XLOG) && elapsed_secs < CheckPointWarning) ereport(LOG, (errmsg("checkpoints are occurring too frequently (%d seconds apart)", elapsed_secs), errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); /* * Initialize bgwriter-private variables used during checkpoint. */ ckpt_active = true; ckpt_start_recptr = GetInsertRecPtr(); ckpt_start_time = now; ckpt_cached_elapsed = 0; /* * Do the checkpoint. */ CreateCheckPoint(flags); /* * After any checkpoint, close all smgr files. This is so we * won't hang onto smgr references to deleted files indefinitely. */ smgrcloseall(); /* * Indicate checkpoint completion to any waiting backends. */ SpinLockAcquire(&bgs->ckpt_lck); bgs->ckpt_done = bgs->ckpt_started; SpinLockRelease(&bgs->ckpt_lck); ckpt_active = false; /* * Note we record the checkpoint start time not end time as * last_checkpoint_time. This is so that time-driven checkpoints * happen at a predictable spacing. */ last_checkpoint_time = now; } else BgBufferSync(); /* Check for archive_timeout and switch xlog files if necessary. */ CheckArchiveTimeout(); /* Nap for the configured time. */ BgWriterNap(); } }
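On the requesting side, a backend can tell whether the checkpoint it asked for has completed by watching the same ckpt_started/ckpt_done counters under the spinlock. A simplified sketch of that handshake; the real logic lives in RequestCheckpoint() and additionally tracks ckpt_failed and uses signed-difference comparisons so the int counters may wrap:

volatile BgWriterShmemStruct *bgs = BgWriterShmem;
int			old_started;
int			new_started;
int			done;

/* phase 1: wait for a new checkpoint cycle to begin */
SpinLockAcquire(&bgs->ckpt_lck);
old_started = bgs->ckpt_started;
SpinLockRelease(&bgs->ckpt_lck);
do
{
	pg_usleep(100000L);			/* 100ms between polls */
	SpinLockAcquire(&bgs->ckpt_lck);
	new_started = bgs->ckpt_started;
	SpinLockRelease(&bgs->ckpt_lck);
} while (new_started == old_started);

/* phase 2: wait for that cycle's done counter to catch up */
do
{
	pg_usleep(100000L);
	SpinLockAcquire(&bgs->ckpt_lck);
	done = bgs->ckpt_done;
	SpinLockRelease(&bgs->ckpt_lck);
} while (done - new_started < 0);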
/* * Insert a new item to a page. * * Returns true if the insertion was finished. On false, the page was split and * the parent needs to be updated. (A root split returns true as it doesn't * need any further action by the caller to complete.) * * When inserting a downlink to an internal page, 'childbuf' contains the * child page that was split. Its GIN_INCOMPLETE_SPLIT flag will be cleared * atomically with the insert. Also, the existing item at offset stack->off * in the target page is updated to point to updateblkno. * * stack->buffer is locked on entry, and is kept locked. * Likewise for childbuf, if given. */ static bool ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, void *insertdata, BlockNumber updateblkno, Buffer childbuf, GinStatsData *buildStats) { Page page = BufferGetPage(stack->buffer); bool result; GinPlaceToPageRC rc; uint16 xlflags = 0; Page childpage = NULL; Page newlpage = NULL, newrpage = NULL; void *ptp_workspace = NULL; XLogRecData payloadrdata[10]; MemoryContext tmpCxt; MemoryContext oldCxt; /* * We do all the work of this function and its subfunctions in a temporary * memory context. This avoids leakages and simplifies APIs, since some * subfunctions allocate storage that has to survive until we've finished * the WAL insertion. */ tmpCxt = AllocSetContextCreate(CurrentMemoryContext, "ginPlaceToPage temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); oldCxt = MemoryContextSwitchTo(tmpCxt); if (GinPageIsData(page)) xlflags |= GIN_INSERT_ISDATA; if (GinPageIsLeaf(page)) { xlflags |= GIN_INSERT_ISLEAF; Assert(!BufferIsValid(childbuf)); Assert(updateblkno == InvalidBlockNumber); } else { Assert(BufferIsValid(childbuf)); Assert(updateblkno != InvalidBlockNumber); childpage = BufferGetPage(childbuf); } /* * See if the incoming tuple will fit on the page. beginPlaceToPage will * decide if the page needs to be split, and will compute the split * contents if so. See comments for beginPlaceToPage and execPlaceToPage * functions for more details of the API here. */ rc = btree->beginPlaceToPage(btree, stack->buffer, stack, insertdata, updateblkno, &ptp_workspace, &newlpage, &newrpage, payloadrdata); if (rc == GPTP_NO_WORK) { /* Nothing to do */ result = true; } else if (rc == GPTP_INSERT) { /* It will fit, perform the insertion */ START_CRIT_SECTION(); /* Perform the page update, and set up WAL data about it */ btree->execPlaceToPage(btree, stack->buffer, stack, insertdata, updateblkno, ptp_workspace, payloadrdata); MarkBufferDirty(stack->buffer); /* An insert to an internal page finishes the split of the child. */ if (BufferIsValid(childbuf)) { GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; MarkBufferDirty(childbuf); } if (RelationNeedsWAL(btree->index)) { XLogRecPtr recptr; XLogRecData rdata[3]; ginxlogInsert xlrec; BlockIdData childblknos[2]; xlrec.node = btree->index->rd_node; xlrec.blkno = BufferGetBlockNumber(stack->buffer); xlrec.flags = xlflags; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &xlrec; rdata[0].len = sizeof(ginxlogInsert); /* * Log information about child if this was an insertion of a * downlink. 
*/ if (BufferIsValid(childbuf)) { rdata[0].next = &rdata[1]; BlockIdSet(&childblknos[0], BufferGetBlockNumber(childbuf)); BlockIdSet(&childblknos[1], GinPageGetOpaque(childpage)->rightlink); rdata[1].buffer = InvalidBuffer; rdata[1].data = (char *) childblknos; rdata[1].len = sizeof(BlockIdData) * 2; rdata[1].next = &rdata[2]; rdata[2].buffer = childbuf; rdata[2].buffer_std = true; rdata[2].data = NULL; rdata[2].len = 0; rdata[2].next = payloadrdata; } else rdata[0].next = payloadrdata; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, rdata); PageSetLSN(page, recptr); if (BufferIsValid(childbuf)) PageSetLSN(childpage, recptr); } END_CRIT_SECTION(); /* Insertion is complete. */ result = true; } else if (rc == GPTP_SPLIT) { /* * Didn't fit, need to split. The split has been computed in newlpage * and newrpage, which are pointers to palloc'd pages, not associated * with buffers. stack->buffer is not touched yet. */ Buffer rbuffer; BlockNumber savedRightLink; ginxlogSplit data; Buffer lbuffer = InvalidBuffer; Page newrootpg = NULL; /* Get a new index page to become the right page */ rbuffer = GinNewBuffer(btree->index); /* During index build, count the new page */ if (buildStats) { if (btree->isData) buildStats->nDataPages++; else buildStats->nEntryPages++; } savedRightLink = GinPageGetOpaque(page)->rightlink; /* Begin setting up WAL record (which we might not use) */ data.node = btree->index->rd_node; data.rblkno = BufferGetBlockNumber(rbuffer); data.flags = xlflags; if (BufferIsValid(childbuf)) { data.leftChildBlkno = BufferGetBlockNumber(childbuf); data.rightChildBlkno = GinPageGetOpaque(childpage)->rightlink; } else data.leftChildBlkno = data.rightChildBlkno = InvalidBlockNumber; if (stack->parent == NULL) { /* * splitting the root, so we need to allocate new left page and * place pointers to left and right page on root page. */ lbuffer = GinNewBuffer(btree->index); /* During index build, count the new left page */ if (buildStats) { if (btree->isData) buildStats->nDataPages++; else buildStats->nEntryPages++; } /* * root never has a right-link, so we borrow the rrlink field to * store the root block number. */ data.rrlink = BufferGetBlockNumber(stack->buffer); data.lblkno = BufferGetBlockNumber(lbuffer); data.flags |= GIN_SPLIT_ROOT; GinPageGetOpaque(newrpage)->rightlink = InvalidBlockNumber; GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); /* * Construct a new root page containing downlinks to the new left * and right pages. (Do this in a temporary copy rather than * overwriting the original page directly, since we're not in the * critical section yet.) */ newrootpg = PageGetTempPage(newrpage); GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ); btree->fillRoot(btree, newrootpg, BufferGetBlockNumber(lbuffer), newlpage, BufferGetBlockNumber(rbuffer), newrpage); } else { /* splitting a non-root page */ data.rrlink = savedRightLink; data.lblkno = BufferGetBlockNumber(stack->buffer); GinPageGetOpaque(newrpage)->rightlink = savedRightLink; GinPageGetOpaque(newlpage)->flags |= GIN_INCOMPLETE_SPLIT; GinPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); } /* * OK, we have the new contents of the left page in a temporary copy * now (newlpage), and likewise for the new contents of the * newly-allocated right block. The original page is still unchanged. * * If this is a root split, we also have a temporary page containing * the new contents of the root. 
*/ START_CRIT_SECTION(); MarkBufferDirty(rbuffer); MarkBufferDirty(stack->buffer); /* * Restore the temporary copies over the real buffers. */ if (stack->parent == NULL) { /* Splitting the root, three pages to update */ MarkBufferDirty(lbuffer); memcpy(page, newrootpg, BLCKSZ); memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ); memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); } else { /* Normal split, only two pages to update */ memcpy(page, newlpage, BLCKSZ); memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); } /* We also clear childbuf's INCOMPLETE_SPLIT flag, if passed */ if (BufferIsValid(childbuf)) { GinPageGetOpaque(childpage)->flags &= ~GIN_INCOMPLETE_SPLIT; MarkBufferDirty(childbuf); } /* write WAL record */ if (RelationNeedsWAL(btree->index)) { XLogRecData rdata[2]; XLogRecPtr recptr; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &data; rdata[0].len = sizeof(ginxlogSplit); if (BufferIsValid(childbuf)) { rdata[0].next = &rdata[1]; rdata[1].buffer = childbuf; rdata[1].buffer_std = true; rdata[1].data = NULL; rdata[1].len = 0; rdata[1].next = payloadrdata; } else rdata[0].next = payloadrdata; recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_SPLIT, rdata); PageSetLSN(page, recptr); PageSetLSN(BufferGetPage(rbuffer), recptr); if (stack->parent == NULL) PageSetLSN(BufferGetPage(lbuffer), recptr); if (BufferIsValid(childbuf)) PageSetLSN(childpage, recptr); } END_CRIT_SECTION(); /* * We can release the locks/pins on the new pages now, but keep * stack->buffer locked. childbuf doesn't get unlocked either. */ UnlockReleaseBuffer(rbuffer); if (stack->parent == NULL) UnlockReleaseBuffer(lbuffer); /* * If we split the root, we're done. Otherwise the split is not * complete until the downlink for the new page has been inserted to * the parent. */ result = (stack->parent == NULL); } else { elog(ERROR, "invalid return code from GIN placeToPage method: %d", rc); result = false; /* keep compiler quiet */ } /* Clean up temp context */ MemoryContextSwitchTo(oldCxt); MemoryContextDelete(tmpCxt); return result; }
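Both branches above follow the same modify-log-stamp protocol: every page change plus its WAL insertion happens inside one critical section, the buffers are marked dirty before the WAL record is built, and each touched page's LSN is stamped before the section ends. A skeleton of that protocol using the same pre-9.5 XLogRecData API (apply_page_change and xlrec are hypothetical; page, buffer, and rel are assumed to be set up as in the function above):

START_CRIT_SECTION();

apply_page_change(page);			/* hypothetical in-place page update */
MarkBufferDirty(buffer);

if (RelationNeedsWAL(rel))
{
	XLogRecData rdata;
	XLogRecPtr	recptr;

	rdata.data = (char *) &xlrec;	/* hypothetical WAL record body */
	rdata.len = sizeof(xlrec);
	rdata.buffer = InvalidBuffer;
	rdata.next = NULL;

	recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT, &rdata);
	PageSetLSN(page, recptr);		/* stamp while still in the crit section */
}

END_CRIT_SECTION();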
void initMotionLayerStructs(MotionLayerState **mlStates) { MemoryContext oldCtxt; MemoryContext ml_mctx; uint8 *pData; if (Gp_role == GP_ROLE_UTILITY) return; if (Gp_interconnect_type == INTERCONNECT_TYPE_UDPIFC) Gp_max_tuple_chunk_size = Gp_max_packet_size - sizeof(struct icpkthdr) - TUPLE_CHUNK_HEADER_SIZE; else if (Gp_interconnect_type == INTERCONNECT_TYPE_TCP) Gp_max_tuple_chunk_size = Gp_max_packet_size - PACKET_HEADER_SIZE - TUPLE_CHUNK_HEADER_SIZE; /* * Use the statically allocated chunk that is intended for sending end-of- * stream messages so that we don't incur allocation and deallocation * overheads. */ s_eos_chunk_data->p_next = NULL; s_eos_chunk_data->inplace = NULL; s_eos_chunk_data->chunk_length = TUPLE_CHUNK_HEADER_SIZE; pData = s_eos_chunk_data->chunk_data; SetChunkDataSize(pData, 0); SetChunkType(pData, TC_END_OF_STREAM); /* * Create the memory-contexts that we will use within the Motion Layer. * * We make the Motion Layer memory-context a child of the ExecutorState * Context, as it lives inside the estate of a specific query and needs * to get freed when the query is finished. * * The tuple-serial memory-context is a child of the Motion Layer * memory-context. * * NOTE: we need to be sure the caller is in ExecutorState memory context * (estate->es_query_cxt) before calling us. */ ml_mctx = AllocSetContextCreate(CurrentMemoryContext, "MotionLayerMemCtxt", ALLOCSET_SMALL_MINSIZE, ALLOCSET_SMALL_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); /* use a setting bigger * than "small" */ /* * Switch to the Motion Layer memory context, so that we can clean things * up easily. */ oldCtxt = MemoryContextSwitchTo(ml_mctx); Assert(*mlStates == NULL); *mlStates = palloc0(sizeof(MotionLayerState)); (*mlStates)->mnEntries = palloc0(MNE_INITIAL_COUNT * sizeof(MotionNodeEntry)); (*mlStates)->mneCount = MNE_INITIAL_COUNT; /* Allocation is done. Go back to caller memory-context. */ MemoryContextSwitchTo(oldCtxt); /* * Keep our motion layer memory context in our newly created motion layer. */ (*mlStates)->motion_layer_mctx = ml_mctx; }
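Because ml_mctx is created as a child of the caller's ExecutorState context, no explicit teardown is needed here: deleting a parent context recursively deletes its children, and every palloc made under a child goes with it. The same property in miniature:

MemoryContext parent = AllocSetContextCreate(CurrentMemoryContext,
											 "parent",
											 ALLOCSET_SMALL_MINSIZE,
											 ALLOCSET_SMALL_INITSIZE,
											 ALLOCSET_SMALL_MAXSIZE);
MemoryContext child = AllocSetContextCreate(parent,
											"child",
											ALLOCSET_SMALL_MINSIZE,
											ALLOCSET_SMALL_INITSIZE,
											ALLOCSET_SMALL_MAXSIZE);
char	   *data = MemoryContextAlloc(child, 64);	/* lives in child */

MemoryContextDelete(parent);	/* also deletes child, freeing data */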
/* * Set the client encoding and save fmgrinfo for the conversion * function if necessary. Returns 0 if okay, -1 if not (bad encoding * or can't support conversion) */ int SetClientEncoding(int encoding, bool doit) { int current_server_encoding; Oid to_server_proc, to_client_proc; FmgrInfo *to_server; FmgrInfo *to_client; MemoryContext oldcontext; if (!PG_VALID_FE_ENCODING(encoding)) return -1; /* Can't do anything during startup, per notes above */ if (!backend_startup_complete) { if (doit) pending_client_encoding = encoding; return 0; } current_server_encoding = GetDatabaseEncoding(); /* * Check for cases that require no conversion function. */ if (current_server_encoding == encoding || current_server_encoding == PG_SQL_ASCII || encoding == PG_SQL_ASCII) { if (doit) { ClientEncoding = &pg_enc2name_tbl[encoding]; ToServerConvProc = NULL; ToClientConvProc = NULL; if (MbProcContext) MemoryContextReset(MbProcContext); } return 0; } /* * If we're not inside a transaction then we can't do catalog lookups, so * fail. After backend startup, this could only happen if we are * re-reading postgresql.conf due to SIGHUP --- so basically this just * constrains the ability to change client_encoding on the fly from * postgresql.conf. Which would probably be a stupid thing to do anyway. */ if (!IsTransactionState()) return -1; /* * Look up the conversion functions. */ to_server_proc = FindDefaultConversionProc(encoding, current_server_encoding); if (!OidIsValid(to_server_proc)) return -1; to_client_proc = FindDefaultConversionProc(current_server_encoding, encoding); if (!OidIsValid(to_client_proc)) return -1; /* * Done if not wanting to actually apply setting. */ if (!doit) return 0; /* Before loading the new fmgr info, remove the old info, if any */ ToServerConvProc = NULL; ToClientConvProc = NULL; if (MbProcContext != NULL) { MemoryContextReset(MbProcContext); } else { /* * This is the first time through, so create the context. Make it a * child of TopMemoryContext so that these values survive across * transactions. */ MbProcContext = AllocSetContextCreate(TopMemoryContext, "MbProcContext", ALLOCSET_SMALL_MINSIZE, ALLOCSET_SMALL_INITSIZE, ALLOCSET_SMALL_MAXSIZE); } /* Load the fmgr info into MbProcContext */ oldcontext = MemoryContextSwitchTo(MbProcContext); to_server = palloc(sizeof(FmgrInfo)); to_client = palloc(sizeof(FmgrInfo)); fmgr_info(to_server_proc, to_server); fmgr_info(to_client_proc, to_client); MemoryContextSwitchTo(oldcontext); ClientEncoding = &pg_enc2name_tbl[encoding]; ToServerConvProc = to_server; ToClientConvProc = to_client; return 0; }
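The MbProcContext handling above is a general pattern for cached lookup data: keep one small long-lived context under TopMemoryContext, reset it before repopulating, and never pfree individual entries. A sketch of the same pattern in isolation (cacheCtx, refresh_cache, and rebuild_cache_entries are hypothetical names):

static MemoryContext cacheCtx = NULL;

void
refresh_cache(void)
{
	MemoryContext oldcontext;

	if (cacheCtx == NULL)
		cacheCtx = AllocSetContextCreate(TopMemoryContext,
										 "hypothetical cache context",
										 ALLOCSET_SMALL_MINSIZE,
										 ALLOCSET_SMALL_INITSIZE,
										 ALLOCSET_SMALL_MAXSIZE);
	else
		MemoryContextReset(cacheCtx);	/* drop the old generation wholesale */

	oldcontext = MemoryContextSwitchTo(cacheCtx);
	rebuild_cache_entries();			/* hypothetical; pallocs land in cacheCtx */
	MemoryContextSwitchTo(oldcontext);
}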
IndexBulkDeleteResult *
ginbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
              IndexBulkDeleteCallback callback, void *callback_state)
{
    Relation    index = info->index;
    BlockNumber blkno = GIN_ROOT_BLKNO;
    GinVacuumState gvs;
    Buffer      buffer;
    BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))];
    uint32      nRoot;

    gvs.tmpCxt = AllocSetContextCreate(CurrentMemoryContext,
                                       "Gin vacuum temporary context",
                                       ALLOCSET_DEFAULT_MINSIZE,
                                       ALLOCSET_DEFAULT_INITSIZE,
                                       ALLOCSET_DEFAULT_MAXSIZE);
    gvs.index = index;
    gvs.callback = callback;
    gvs.callback_state = callback_state;
    gvs.strategy = info->strategy;
    initGinState(&gvs.ginstate, index);

    /* first time through? */
    if (stats == NULL)
    {
        /* Yes, so initialize stats to zeroes */
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
        /* and clean up any pending inserts */
        ginInsertCleanup(&gvs.ginstate, false, stats);
    }

    /* we'll re-count the tuples each time */
    stats->num_index_tuples = 0;
    gvs.result = stats;

    buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
                                RBM_NORMAL, info->strategy);

    /* find leaf page */
    for (;;)
    {
        Page        page = BufferGetPage(buffer);
        IndexTuple  itup;

        LockBuffer(buffer, GIN_SHARE);

        Assert(!GinPageIsData(page));

        if (GinPageIsLeaf(page))
        {
            LockBuffer(buffer, GIN_UNLOCK);
            LockBuffer(buffer, GIN_EXCLUSIVE);

            if (blkno == GIN_ROOT_BLKNO && !GinPageIsLeaf(page))
            {
                LockBuffer(buffer, GIN_UNLOCK);
                continue;       /* check it once more */
            }
            break;
        }

        Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);

        itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
        blkno = GinGetDownlink(itup);
        Assert(blkno != InvalidBlockNumber);

        UnlockReleaseBuffer(buffer);
        buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
                                    RBM_NORMAL, info->strategy);
    }

    /* we have now found the leftmost page in the entry B-tree */
    for (;;)
    {
        Page        page = BufferGetPage(buffer);
        Page        resPage;
        uint32      i;

        Assert(!GinPageIsData(page));

        resPage = ginVacuumEntryPage(&gvs, buffer, rootOfPostingTree, &nRoot);

        blkno = GinPageGetOpaque(page)->rightlink;

        if (resPage)
        {
            START_CRIT_SECTION();
            PageRestoreTempPage(resPage, page);
            MarkBufferDirty(buffer);
            xlogVacuumPage(gvs.index, buffer);
            UnlockReleaseBuffer(buffer);
            END_CRIT_SECTION();
        }
        else
        {
            UnlockReleaseBuffer(buffer);
        }

        vacuum_delay_point();

        for (i = 0; i < nRoot; i++)
        {
            ginVacuumPostingTree(&gvs, rootOfPostingTree[i]);
            vacuum_delay_point();
        }

        if (blkno == InvalidBlockNumber)    /* rightmost page */
            break;

        buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
                                    RBM_NORMAL, info->strategy);
        LockBuffer(buffer, GIN_EXCLUSIVE);
    }

    MemoryContextDelete(gvs.tmpCxt);

    return gvs.result;
}
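/*
 * Hedged sketch (lock_if_leaf_exclusive is hypothetical, not part of the
 * source above) of the lock-upgrade idiom in the descent loop: a buffer's
 * content lock cannot be upgraded in place, so we release the share lock,
 * take the exclusive lock, and then re-check the condition, retrying if
 * the page changed while it was unlocked.
 */
static bool
lock_if_leaf_exclusive(Buffer buffer)
{
    for (;;)
    {
        LockBuffer(buffer, GIN_SHARE);
        if (!GinPageIsLeaf(BufferGetPage(buffer)))
        {
            LockBuffer(buffer, GIN_UNLOCK);
            return false;       /* internal page: caller should descend */
        }

        /* Upgrade: drop the share lock, then take the exclusive lock... */
        LockBuffer(buffer, GIN_UNLOCK);
        LockBuffer(buffer, GIN_EXCLUSIVE);

        /* ...and re-verify, since the page may have changed meanwhile. */
        if (GinPageIsLeaf(BufferGetPage(buffer)))
            return true;        /* still a leaf: exclusive lock is held */

        LockBuffer(buffer, GIN_UNLOCK);     /* changed under us; retry */
    }
}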
/* ----------------
 *      CreateExecutorState
 *
 *      Create and initialize an EState node, which is the root of
 *      working storage for an entire Executor invocation.
 *
 * Principally, this creates the per-query memory context that will be
 * used to hold all working data that lives till the end of the query.
 * Note that the per-query context will become a child of the caller's
 * CurrentMemoryContext.
 * ----------------
 */
EState *
CreateExecutorState(void)
{
    EState     *estate;
    MemoryContext qcontext;
    MemoryContext oldcontext;

    /*
     * Create the per-query context for this Executor run.
     */
    qcontext = AllocSetContextCreate(CurrentMemoryContext,
                                     "ExecutorState",
                                     ALLOCSET_DEFAULT_MINSIZE,
                                     ALLOCSET_DEFAULT_INITSIZE,
                                     ALLOCSET_DEFAULT_MAXSIZE);

    /*
     * Make the EState node within the per-query context.  This way, we don't
     * need a separate pfree() operation for it at shutdown.
     */
    oldcontext = MemoryContextSwitchTo(qcontext);

    estate = makeNode(EState);

    /*
     * Initialize all fields of the Executor State structure
     */
    estate->es_direction = ForwardScanDirection;
    estate->es_snapshot = SnapshotNow;
    estate->es_crosscheck_snapshot = InvalidSnapshot;   /* no crosscheck */
    estate->es_range_table = NIL;
    estate->es_plannedstmt = NULL;

    estate->es_junkFilter = NULL;

    estate->es_output_cid = (CommandId) 0;

    estate->es_result_relations = NULL;
    estate->es_num_result_relations = 0;
    estate->es_result_relation_info = NULL;

    estate->es_trig_target_relations = NIL;
    estate->es_trig_tuple_slot = NULL;
    estate->es_trig_oldtup_slot = NULL;
    estate->es_trig_newtup_slot = NULL;

    estate->es_param_list_info = NULL;
    estate->es_param_exec_vals = NULL;

    estate->es_query_cxt = qcontext;

    estate->es_tupleTable = NIL;

    estate->es_rowMarks = NIL;

    estate->es_processed = 0;
    estate->es_lastoid = InvalidOid;

    estate->es_top_eflags = 0;
    estate->es_instrument = 0;
    estate->es_finished = false;

    estate->es_exprcontexts = NIL;

    estate->es_subplanstates = NIL;

    estate->es_auxmodifytables = NIL;

    estate->es_per_tuple_exprcontext = NULL;

    estate->es_epqTuple = NULL;
    estate->es_epqTupleSet = NULL;
    estate->es_epqScanDone = NULL;

    /*
     * Return the executor state structure
     */
    MemoryContextSwitchTo(oldcontext);

    return estate;
}
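/*
 * Hedged usage sketch (run_query is a hypothetical caller): create the
 * EState, do query-lifespan allocations inside estate->es_query_cxt, and
 * finally call FreeExecutorState(), which deletes the per-query context
 * and thereby everything allocated in it, including the EState itself.
 */
static void
run_query(void)
{
    EState     *estate = CreateExecutorState();
    MemoryContext oldcontext;

    oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
    /* ... allocate per-query working data here ... */
    MemoryContextSwitchTo(oldcontext);

    /* ... run the executor ... */

    FreeExecutorState(estate);  /* frees estate and all per-query data */
}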
/*
 * SQL function json_array_elements
 *
 * get the elements from a json array
 *
 * a lot of this processing is similar to the json_each* functions
 */
Datum
json_array_elements(PG_FUNCTION_ARGS)
{
    text       *json = PG_GETARG_TEXT_P(0);

    /* elements doesn't need any escaped strings, so use false here */
    JsonLexContext *lex = makeJsonLexContext(json, false);
    JsonSemAction *sem;
    ReturnSetInfo *rsi;
    MemoryContext old_cxt;
    TupleDesc   tupdesc;
    ElementsState *state;

    state = palloc0(sizeof(ElementsState));
    sem = palloc0(sizeof(JsonSemAction));

    rsi = (ReturnSetInfo *) fcinfo->resultinfo;

    if (!rsi || !IsA(rsi, ReturnSetInfo) ||
        (rsi->allowedModes & SFRM_Materialize) == 0 ||
        rsi->expectedDesc == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("set-valued function called in context that "
                        "cannot accept a set")));

    rsi->returnMode = SFRM_Materialize;

    /* it's a simple type, so don't use get_call_result_type() */
    tupdesc = rsi->expectedDesc;

    /* make these in a sufficiently long-lived memory context */
    old_cxt = MemoryContextSwitchTo(rsi->econtext->ecxt_per_query_memory);

    state->ret_tdesc = CreateTupleDescCopy(tupdesc);
    BlessTupleDesc(state->ret_tdesc);
    state->tuple_store =
        tuplestore_begin_heap(rsi->allowedModes & SFRM_Materialize_Random,
                              false, work_mem);

    MemoryContextSwitchTo(old_cxt);

    sem->semstate = (void *) state;
    sem->object_start = elements_object_start;
    sem->scalar = elements_scalar;
    sem->array_element_start = elements_array_element_start;
    sem->array_element_end = elements_array_element_end;

    state->lex = lex;
    state->tmp_cxt = AllocSetContextCreate(CurrentMemoryContext,
                                           "json_array_elements temporary cxt",
                                           ALLOCSET_DEFAULT_MINSIZE,
                                           ALLOCSET_DEFAULT_INITSIZE,
                                           ALLOCSET_DEFAULT_MAXSIZE);

    pg_parse_json(lex, sem);

    rsi->setResult = state->tuple_store;
    rsi->setDesc = state->ret_tdesc;

    PG_RETURN_NULL();
}
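/*
 * Hedged skeleton of the materialize-mode set-returning-function protocol
 * followed above; my_materialize_srf is a hypothetical function, not part
 * of the source.  The essential steps: validate that the caller can accept
 * a materialized set, build the tuplestore in per-query memory so it
 * outlives this call, hand it back through the ReturnSetInfo, and return a
 * dummy Datum.
 */
Datum
my_materialize_srf(PG_FUNCTION_ARGS)
{
    ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
    Tuplestorestate *tupstore;
    MemoryContext old_cxt;

    if (!rsi || !IsA(rsi, ReturnSetInfo) ||
        (rsi->allowedModes & SFRM_Materialize) == 0 ||
        rsi->expectedDesc == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("set-valued function called in context that "
                        "cannot accept a set")));

    /* the tuplestore must outlive this call: use per-query memory */
    old_cxt = MemoryContextSwitchTo(rsi->econtext->ecxt_per_query_memory);
    tupstore = tuplestore_begin_heap(rsi->allowedModes & SFRM_Materialize_Random,
                                     false, work_mem);
    MemoryContextSwitchTo(old_cxt);

    /* ... tuplestore_putvalues(tupstore, tupdesc, values, nulls) per row ... */

    rsi->returnMode = SFRM_Materialize;
    rsi->setResult = tupstore;
    rsi->setDesc = rsi->expectedDesc;   /* a blessed copy is safer; see above */

    PG_RETURN_NULL();
}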
/*
 * Insert a new tuple into the bloom index.
 */
bool
blinsert(Relation index, Datum *values, bool *isnull,
         ItemPointer ht_ctid, Relation heapRel,
         IndexUniqueCheck checkUnique,
         IndexInfo *indexInfo)
{
    BloomState  blstate;
    BloomTuple *itup;
    MemoryContext oldCtx;
    MemoryContext insertCtx;
    BloomMetaPageData *metaData;
    Buffer      buffer,
                metaBuffer;
    Page        page,
                metaPage;
    BlockNumber blkno = InvalidBlockNumber;
    OffsetNumber nStart;
    GenericXLogState *state;

    insertCtx = AllocSetContextCreate(CurrentMemoryContext,
                                      "Bloom insert temporary context",
                                      ALLOCSET_DEFAULT_SIZES);

    oldCtx = MemoryContextSwitchTo(insertCtx);

    initBloomState(&blstate, index);
    itup = BloomFormTuple(&blstate, ht_ctid, values, isnull);

    /*
     * At first, try to insert new tuple to the first page in notFullPage
     * array.  If successful, we don't need to modify the meta page.
     */
    metaBuffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO);
    LockBuffer(metaBuffer, BUFFER_LOCK_SHARE);
    metaData = BloomPageGetMeta(BufferGetPage(metaBuffer));

    if (metaData->nEnd > metaData->nStart)
    {
        Page        page;

        blkno = metaData->notFullPage[metaData->nStart];
        Assert(blkno != InvalidBlockNumber);

        /* Don't hold metabuffer lock while doing insert */
        LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK);

        buffer = ReadBuffer(index, blkno);
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

        state = GenericXLogStart(index);
        page = GenericXLogRegisterBuffer(state, buffer, 0);

        /*
         * We might have found a page that was recently deleted by VACUUM.
         * If so, we can reuse it, but we must reinitialize it.
         */
        if (PageIsNew(page) || BloomPageIsDeleted(page))
            BloomInitPage(page, 0);

        if (BloomPageAddItem(&blstate, page, itup))
        {
            /* Success!  Apply the change, clean up, and exit */
            GenericXLogFinish(state);
            UnlockReleaseBuffer(buffer);
            ReleaseBuffer(metaBuffer);
            MemoryContextSwitchTo(oldCtx);
            MemoryContextDelete(insertCtx);
            return false;
        }

        /* Didn't fit, must try other pages */
        GenericXLogAbort(state);
        UnlockReleaseBuffer(buffer);
    }
    else
    {
        /* No entries in notFullPage */
        LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK);
    }

    /*
     * Try other pages in notFullPage array.  We will have to change nStart
     * in metapage.  Thus, grab exclusive lock on metapage.
     */
    LockBuffer(metaBuffer, BUFFER_LOCK_EXCLUSIVE);

    /* nStart might have changed while we didn't have lock */
    nStart = metaData->nStart;

    /* Skip first page if we already tried it above */
    if (nStart < metaData->nEnd &&
        blkno == metaData->notFullPage[nStart])
        nStart++;

    /*
     * This loop iterates for each page we try from the notFullPage array,
     * and will also initialize a GenericXLogState for the fallback case of
     * having to allocate a new page.
     */
    for (;;)
    {
        state = GenericXLogStart(index);

        /* get modifiable copy of metapage */
        metaPage = GenericXLogRegisterBuffer(state, metaBuffer, 0);
        metaData = BloomPageGetMeta(metaPage);

        if (nStart >= metaData->nEnd)
            break;              /* no more entries in notFullPage array */

        blkno = metaData->notFullPage[nStart];
        Assert(blkno != InvalidBlockNumber);

        buffer = ReadBuffer(index, blkno);
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        page = GenericXLogRegisterBuffer(state, buffer, 0);

        /* Basically same logic as above */
        if (PageIsNew(page) || BloomPageIsDeleted(page))
            BloomInitPage(page, 0);

        if (BloomPageAddItem(&blstate, page, itup))
        {
            /* Success!  Apply the changes, clean up, and exit */
            metaData->nStart = nStart;
            GenericXLogFinish(state);
            UnlockReleaseBuffer(buffer);
            UnlockReleaseBuffer(metaBuffer);
            MemoryContextSwitchTo(oldCtx);
            MemoryContextDelete(insertCtx);
            return false;
        }

        /* Didn't fit, must try other pages */
        GenericXLogAbort(state);
        UnlockReleaseBuffer(buffer);
        nStart++;
    }

    /*
     * Didn't find place to insert in notFullPage array.  Allocate new page.
     * (XXX is it good to do this while holding ex-lock on the metapage??)
     */
    buffer = BloomNewBuffer(index);

    page = GenericXLogRegisterBuffer(state, buffer, GENERIC_XLOG_FULL_IMAGE);
    BloomInitPage(page, 0);

    if (!BloomPageAddItem(&blstate, page, itup))
    {
        /* We shouldn't be here since we're inserting to an empty page */
        elog(ERROR, "could not add new bloom tuple to empty page");
    }

    /* Reset notFullPage array to contain just this new page */
    metaData->nStart = 0;
    metaData->nEnd = 1;
    metaData->notFullPage[0] = BufferGetBlockNumber(buffer);

    /* Apply the changes, clean up, and exit */
    GenericXLogFinish(state);

    UnlockReleaseBuffer(buffer);
    UnlockReleaseBuffer(metaBuffer);

    MemoryContextSwitchTo(oldCtx);
    MemoryContextDelete(insertCtx);

    return false;
}
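/*
 * Hedged sketch of the generic-WAL protocol blinsert() relies on;
 * try_add_to_page() and page_has_room() are hypothetical names.  The rule
 * the code above follows: register each buffer you intend to touch, edit
 * only the scratch copy handed back, then GenericXLogFinish() to apply and
 * WAL-log all registered changes atomically, or GenericXLogAbort() to
 * discard them.  The caller is assumed to hold an exclusive lock on the
 * buffer throughout.
 */
static bool
try_add_to_page(Relation index, Buffer buffer)
{
    GenericXLogState *state;
    Page        page;

    state = GenericXLogStart(index);
    page = GenericXLogRegisterBuffer(state, buffer, 0);

    if (page_has_room(page))    /* hypothetical test */
    {
        /* ... modify the scratch copy of the page here ... */
        GenericXLogFinish(state);   /* copy back, mark dirty, emit WAL */
        return true;
    }

    GenericXLogAbort(state);        /* throw the scratch copy away */
    return false;
}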