/* ---------------------------------------------------------------- * ExecReScanRecursiveUnion * * Rescans the relation. * ---------------------------------------------------------------- */ void ExecReScanRecursiveUnion(RecursiveUnionState *node) { PlanState *outerPlan = outerPlanState(node); PlanState *innerPlan = innerPlanState(node); RecursiveUnion *plan = (RecursiveUnion *) node->ps.plan; /* * Set recursive term's chgParam to tell it that we'll modify the working * table and therefore it has to rescan. */ innerPlan->chgParam = bms_add_member(innerPlan->chgParam, plan->wtParam); /* * if chgParam of subnode is not null then plan will be re-scanned by * first ExecProcNode. Because of above, we only have to do this to the * non-recursive term. */ if (outerPlan->chgParam == NULL) ExecReScan(outerPlan); /* Release any hashtable storage */ if (node->tableContext) MemoryContextResetAndDeleteChildren(node->tableContext); /* And rebuild empty hashtable if needed */ if (plan->numCols > 0) build_hash_table(node); /* reset processing state */ node->recursing = false; node->intermediate_empty = true; tuplestore_clear(node->working_table); tuplestore_clear(node->intermediate_table); }
/* * Release current scan keys, if any. */ void ginFreeScanKeys(GinScanOpaque so) { uint32 i; if (so->keys == NULL) return; for (i = 0; i < so->totalentries; i++) { GinScanEntry entry = so->entries[i]; if (entry->buffer != InvalidBuffer) ReleaseBuffer(entry->buffer); if (entry->list) pfree(entry->list); if (entry->matchIterator) tbm_end_iterate(entry->matchIterator); if (entry->matchBitmap) tbm_free(entry->matchBitmap); } MemoryContextResetAndDeleteChildren(so->keyCtx); so->keys = NULL; so->nkeys = 0; so->entries = NULL; so->totalentries = 0; }
void ExecReScanSetOp(SetOpState *node) { ExecClearTuple(node->ps.ps_ResultTupleSlot); node->setop_done = false; node->numOutput = 0; if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED) { /* * In the hashed case, if we haven't yet built the hash table then we * can just return; nothing done yet, so nothing to undo. If subnode's * chgParam is not NULL then it will be re-scanned by ExecProcNode, * else no reason to re-scan it at all. */ if (!node->table_filled) return; /* * If we do have the hash table and the subplan does not have any * parameter changes, then we can just rescan the existing hash table; * no need to build it again. */ if (node->ps.lefttree->chgParam == NULL) { ResetTupleHashIterator(node->hashtable, &node->hashiter); return; } } /* Release first tuple of group, if we have made a copy */ if (node->grp_firstTuple != NULL) { heap_freetuple(node->grp_firstTuple); node->grp_firstTuple = NULL; } /* Release any hashtable storage */ if (node->tableContext) MemoryContextResetAndDeleteChildren(node->tableContext); /* And rebuild empty hashtable if needed */ if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED) { build_hash_table(node); node->table_filled = false; } /* * if chgParam of subnode is not null then plan will be re-scanned by * first ExecProcNode. */ if (node->ps.lefttree->chgParam == NULL) ExecReScan(node->ps.lefttree); }
/* * Flush all cache entries when pg_event_trigger is updated. * * This should be rare enough that we don't need to be very granular about * it, so we just blow away everything, which also avoids the possibility of * memory leaks. */ static void InvalidateEventCacheCallback(Datum arg, int cacheid, uint32 hashvalue) { /* * If the cache isn't valid, then there might be a rebuild in progress, * so we can't immediately blow it away. But it's advantageous to do * this when possible, so as to immediately free memory. */ if (EventTriggerCacheState == ETCS_VALID) { MemoryContextResetAndDeleteChildren(EventTriggerCacheContext); EventTriggerCache = NULL; } /* Mark cache for rebuild. */ EventTriggerCacheState = ETCS_NEEDS_REBUILD; }
static void resetTemporaryContext( twopoEssentials *essentials ) { # ifdef TWOPO_CACHE_PLANS int i; # endif Assert( essentials != NULL ); if ( ! essentials->ctx ) return; # ifdef TWOPO_DEBUG # ifdef TWOPO_CACHE_PLANS if( twopo_cache_plans ){ fprintf(stderr, "TwoPO DEBUG: resetting temporary memory context.\n"); //MemoryContextStats(essentials->ctx->mycontext); } # endif # endif essentials->root->join_rel_list = list_truncate(essentials->root->join_rel_list, essentials->ctx->savelength); essentials->root->join_rel_hash = NULL; # ifdef TWOPO_CACHE_PLANS /* * Cleaning parent nodes in nodeList deleted by * MemoryContextResetAndDeleteChildren() */ for( i=0; i<essentials->numNodes; i++ ){ essentials->nodeList[i].parents = NULL; } # endif MemoryContextResetAndDeleteChildren(essentials->ctx->mycontext); }
/* * Rebuild the event trigger cache. */ static void BuildEventTriggerCache(void) { HASHCTL ctl; HTAB *cache; MemoryContext oldcontext; Relation rel; Relation irel; SysScanDesc scan; if (EventTriggerCacheContext != NULL) { /* * Free up any memory already allocated in EventTriggerCacheContext. * This can happen either because a previous rebuild failed, or * because an invalidation happened before the rebuild was complete. */ MemoryContextResetAndDeleteChildren(EventTriggerCacheContext); } else { /* * This is our first time attempting to build the cache, so we need * to set up the memory context and register a syscache callback to * capture future invalidation events. */ if (CacheMemoryContext == NULL) CreateCacheMemoryContext(); EventTriggerCacheContext = AllocSetContextCreate(CacheMemoryContext, "EventTriggerCache", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); CacheRegisterSyscacheCallback(EVENTTRIGGEROID, InvalidateEventCacheCallback, (Datum) 0); } /* Switch to correct memory context. */ oldcontext = MemoryContextSwitchTo(EventTriggerCacheContext); /* Prevent the memory context from being nuked while we're rebuilding. */ EventTriggerCacheState = ETCS_REBUILD_STARTED; /* Create new hash table. */ MemSet(&ctl, 0, sizeof(ctl)); ctl.keysize = sizeof(EventTriggerEvent); ctl.entrysize = sizeof(EventTriggerCacheEntry); ctl.hash = tag_hash; ctl.hcxt = EventTriggerCacheContext; cache = hash_create("Event Trigger Cache", 32, &ctl, HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); /* * Prepare to scan pg_event_trigger in name order. We use an MVCC * snapshot to avoid getting inconsistent results if the table is * being concurrently updated. */ rel = relation_open(EventTriggerRelationId, AccessShareLock); irel = index_open(EventTriggerNameIndexId, AccessShareLock); scan = systable_beginscan_ordered(rel, irel, GetLatestSnapshot(), 0, NULL); /* * Build a cache item for each pg_event_trigger tuple, and append each * one to the appropriate cache entry. */ for (;;) { HeapTuple tup; Form_pg_event_trigger form; char *evtevent; EventTriggerEvent event; EventTriggerCacheItem *item; Datum evttags; bool evttags_isnull; EventTriggerCacheEntry *entry; bool found; /* Get next tuple. */ tup = systable_getnext_ordered(scan, ForwardScanDirection); if (!HeapTupleIsValid(tup)) break; /* Skip trigger if disabled. */ form = (Form_pg_event_trigger) GETSTRUCT(tup); if (form->evtenabled == TRIGGER_DISABLED) continue; /* Decode event name. */ evtevent = NameStr(form->evtevent); if (strcmp(evtevent, "ddl_command_start") == 0) event = EVT_DDLCommandStart; else continue; /* Allocate new cache item. */ item = palloc0(sizeof(EventTriggerCacheItem)); item->fnoid = form->evtfoid; item->enabled = form->evtenabled; /* Decode and sort tags array. */ evttags = heap_getattr(tup, Anum_pg_event_trigger_evttags, RelationGetDescr(rel), &evttags_isnull); if (!evttags_isnull) { item->ntags = DecodeTextArrayToCString(evttags, &item->tag); qsort(item->tag, item->ntags, sizeof(char *), pg_qsort_strcmp); } /* Add to cache entry. */ entry = hash_search(cache, &event, HASH_ENTER, &found); if (found) entry->triggerlist = lappend(entry->triggerlist, item); else entry->triggerlist = list_make1(item); } /* Done with pg_event_trigger scan. */ systable_endscan_ordered(scan); index_close(irel, AccessShareLock); relation_close(rel, AccessShareLock); /* Restore previous memory context. */ MemoryContextSwitchTo(oldcontext); /* Install new cache. */ EventTriggerCache = cache; /* * If the cache has been invalidated since we entered this routine, we * still use and return the cache we just finished constructing, to avoid * infinite loops, but we leave the cache marked stale so that we'll * rebuild it again on next access. Otherwise, we mark the cache valid. */ if (EventTriggerCacheState == ETCS_REBUILD_STARTED) EventTriggerCacheState = ETCS_VALID; }
/* * Execute the index scan. * * This works by reading index TIDs from the revmap, and obtaining the index * tuples pointed to by them; the summary values in the index tuples are * compared to the scan keys. We return into the TID bitmap all the pages in * ranges corresponding to index tuples that match the scan keys. * * If a TID from the revmap is read as InvalidTID, we know that range is * unsummarized. Pages in those ranges need to be returned regardless of scan * keys. */ int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { Relation idxRel = scan->indexRelation; Buffer buf = InvalidBuffer; BrinDesc *bdesc; Oid heapOid; Relation heapRel; BrinOpaque *opaque; BlockNumber nblocks; BlockNumber heapBlk; int totalpages = 0; FmgrInfo *consistentFn; MemoryContext oldcxt; MemoryContext perRangeCxt; opaque = (BrinOpaque *) scan->opaque; bdesc = opaque->bo_bdesc; pgstat_count_index_scan(idxRel); /* * We need to know the size of the table so that we know how long to * iterate on the revmap. */ heapOid = IndexGetRelation(RelationGetRelid(idxRel), false); heapRel = heap_open(heapOid, AccessShareLock); nblocks = RelationGetNumberOfBlocks(heapRel); heap_close(heapRel, AccessShareLock); /* * Make room for the consistent support procedures of indexed columns. We * don't look them up here; we do that lazily the first time we see a scan * key reference each of them. We rely on zeroing fn_oid to InvalidOid. */ consistentFn = palloc0(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts); /* * Setup and use a per-range memory context, which is reset every time we * loop below. This avoids having to free the tuples within the loop. */ perRangeCxt = AllocSetContextCreate(CurrentMemoryContext, "bringetbitmap cxt", ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(perRangeCxt); /* * Now scan the revmap. We start by querying for heap page 0, * incrementing by the number of pages per range; this gives us a full * view of the table. */ for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange) { bool addrange; BrinTuple *tup; OffsetNumber off; Size size; CHECK_FOR_INTERRUPTS(); MemoryContextResetAndDeleteChildren(perRangeCxt); tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf, &off, &size, BUFFER_LOCK_SHARE, scan->xs_snapshot); if (tup) { tup = brin_copy_tuple(tup, size); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } /* * For page ranges with no indexed tuple, we must return the whole * range; otherwise, compare it to the scan keys. */ if (tup == NULL) { addrange = true; } else { BrinMemTuple *dtup; dtup = brin_deform_tuple(bdesc, tup); if (dtup->bt_placeholder) { /* * Placeholder tuples are always returned, regardless of the * values stored in them. */ addrange = true; } else { int keyno; /* * Compare scan keys with summary values stored for the range. * If scan keys are matched, the page range must be added to * the bitmap. We initially assume the range needs to be * added; in particular this serves the case where there are * no keys. */ addrange = true; for (keyno = 0; keyno < scan->numberOfKeys; keyno++) { ScanKey key = &scan->keyData[keyno]; AttrNumber keyattno = key->sk_attno; BrinValues *bval = &dtup->bt_columns[keyattno - 1]; Datum add; /* * The collation of the scan key must match the collation * used in the index column (but only if the search is not * IS NULL/ IS NOT NULL). Otherwise we shouldn't be using * this index ... */ Assert((key->sk_flags & SK_ISNULL) || (key->sk_collation == bdesc->bd_tupdesc->attrs[keyattno - 1]->attcollation)); /* First time this column? look up consistent function */ if (consistentFn[keyattno - 1].fn_oid == InvalidOid) { FmgrInfo *tmp; tmp = index_getprocinfo(idxRel, keyattno, BRIN_PROCNUM_CONSISTENT); fmgr_info_copy(&consistentFn[keyattno - 1], tmp, CurrentMemoryContext); } /* * Check whether the scan key is consistent with the page * range values; if so, have the pages in the range added * to the output bitmap. * * When there are multiple scan keys, failure to meet the * criteria for a single one of them is enough to discard * the range as a whole, so break out of the loop as soon * as a false return value is obtained. */ add = FunctionCall3Coll(&consistentFn[keyattno - 1], key->sk_collation, PointerGetDatum(bdesc), PointerGetDatum(bval), PointerGetDatum(key)); addrange = DatumGetBool(add); if (!addrange) break; } } } /* add the pages in the range to the output bitmap, if needed */ if (addrange) { BlockNumber pageno; for (pageno = heapBlk; pageno <= heapBlk + opaque->bo_pagesPerRange - 1; pageno++) { MemoryContextSwitchTo(oldcxt); tbm_add_page(tbm, pageno); totalpages++; MemoryContextSwitchTo(perRangeCxt); } } } MemoryContextSwitchTo(oldcxt); MemoryContextDelete(perRangeCxt); if (buf != InvalidBuffer) ReleaseBuffer(buf); /* * XXX We have an approximation of the number of *pages* that our scan * returns, but we don't have a precise idea of the number of heap tuples * involved. */ return totalpages * 10; }
/* * Main entry point for walwriter process * * This is invoked from BootstrapMain, which has already created the basic * execution environment, but not enabled signals yet. */ void WalWriterMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext walwriter_context; /* * If possible, make this process a group leader, so that the postmaster * can signal any child processes too. (walwriter probably never has any * child processes, but for consistency we make all postmaster child * processes do this.) */ #ifdef HAVE_SETSID if (setsid() < 0) elog(FATAL, "setsid() failed: %m"); #endif /* * Properly accept or ignore signals the postmaster might send us * * We have no particular use for SIGINT at the moment, but seems * reasonable to treat like SIGTERM. */ pqsignal(SIGHUP, WalSigHupHandler); /* set flag to read config file */ pqsignal(SIGINT, WalShutdownHandler); /* request shutdown */ pqsignal(SIGTERM, WalShutdownHandler); /* request shutdown */ pqsignal(SIGQUIT, wal_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, SIG_IGN); /* reserve for ProcSignal */ pqsignal(SIGUSR2, SIG_IGN); /* not used */ /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ sigdelset(&BlockSig, SIGQUIT); /* * Create a resource owner to keep track of our resources (not clear that * we need this, but may as well have one). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Writer"); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ walwriter_context = AllocSetContextCreate(TopMemoryContext, "Wal Writer", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(walwriter_context); /* * If an exception is encountered, processing resumes here. * * This code is heavily based on bgwriter.c, q.v. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in walwriter, but we do have LWLocks, and perhaps buffers? */ LWLockReleaseAll(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_Files(); AtEOXact_HashTables(false); /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(walwriter_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(walwriter_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. */ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. */ smgrcloseall(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Loop forever */ for (;;) { long udelay; /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive(true)) exit(1); /* * Process any requests or signals received recently. */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } if (shutdown_requested) { /* Normal exit from the walwriter is here */ proc_exit(0); /* done */ } /* * Do what we're here for... */ XLogBackgroundFlush(); /* * Delay until time to do something more, but fall out of delay * reasonably quickly if signaled. */ udelay = WalWriterDelay * 1000L; while (udelay > 999999L) { if (got_SIGHUP || shutdown_requested) break; pg_usleep(1000000L); udelay -= 1000000L; } if (!(got_SIGHUP || shutdown_requested)) pg_usleep(udelay); } }
/* * main entry pont of pcp worker child process */ void pcp_worker_main(int port) { sigjmp_buf local_sigjmp_buf; MemoryContext PCPMemoryContext; int authenticated = 0; char salt[4]; int random_salt = 0; struct timeval uptime; char tos; int rsize; char *buf = NULL; ereport(DEBUG1, (errmsg("I am PCP worker child with pid:%d",getpid()))); /* Identify myself via ps */ init_ps_display("", "", "", ""); gettimeofday(&uptime, NULL); srandom((unsigned int) (getpid() ^ uptime.tv_usec)); /* set up signal handlers */ signal(SIGTERM, die); signal(SIGINT, die); signal(SIGQUIT, die); signal(SIGCHLD, SIG_DFL); signal(SIGUSR2, wakeup_handler_child); signal(SIGUSR1, SIG_IGN); signal(SIGHUP, SIG_IGN); signal(SIGPIPE, SIG_IGN); signal(SIGALRM, SIG_IGN); /* Create per loop iteration memory context */ PCPMemoryContext = AllocSetContextCreate(TopMemoryContext, "PCP_worker_main_loop", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(TopMemoryContext); /* * install the call back for preparation of pcp worker child exit */ on_system_exit(pcp_worker_will_go_down, (Datum)NULL); /* Initialize my backend status */ pool_initialize_private_backend_status(); /* Initialize process context */ pool_init_process_context(); pcp_frontend = pcp_open(port); unset_nonblock(pcp_frontend->fd); if (sigsetjmp(local_sigjmp_buf, 1) != 0) { error_context_stack = NULL; EmitErrorReport(); MemoryContextSwitchTo(TopMemoryContext); FlushErrorState(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; for(;;) { MemoryContextSwitchTo(PCPMemoryContext); MemoryContextResetAndDeleteChildren(PCPMemoryContext); errno = 0; /* read a PCP packet */ do_pcp_read(pcp_frontend, &tos, 1); do_pcp_read(pcp_frontend, &rsize, sizeof(int)); rsize = ntohl(rsize); if ((rsize - sizeof(int)) > 0) { buf = (char *)palloc(rsize - sizeof(int)); do_pcp_read(pcp_frontend, buf, rsize - sizeof(int)); } ereport(DEBUG1, (errmsg("received PCP packet"), errdetail("PCP packet type of service '%c'", tos))); if (tos == 'R') /* authentication */ { set_ps_display("PCP: processing authentication", false); process_authentication(pcp_frontend, buf,salt, &random_salt); authenticated = 1; continue; } if (tos == 'M') /* md5 salt */ { set_ps_display("PCP: processing authentication", false); send_md5salt(pcp_frontend, salt); random_salt = 1; continue; } /* is this connection authenticated? if not disconnect immediately*/ if (!authenticated) ereport(FATAL, (errmsg("authentication failed for new PCP connection"), errdetail("connection not authorized"))); /* process a request */ pcp_process_command(tos, buf, rsize); } exit(0); }
/* * A tuple in the heap is being inserted. To keep a brin index up to date, * we need to obtain the relevant index tuple and compare its stored values * with those of the new tuple. If the tuple values are not consistent with * the summary tuple, we need to update the index tuple. * * If the range is not currently summarized (i.e. the revmap returns NULL for * it), there's nothing to do. */ bool brininsert(Relation idxRel, Datum *values, bool *nulls, ItemPointer heaptid, Relation heapRel, IndexUniqueCheck checkUnique) { BlockNumber pagesPerRange; BrinDesc *bdesc = NULL; BrinRevmap *revmap; Buffer buf = InvalidBuffer; MemoryContext tupcxt = NULL; MemoryContext oldcxt = NULL; revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL); for (;;) { bool need_insert = false; OffsetNumber off; BrinTuple *brtup; BrinMemTuple *dtup; BlockNumber heapBlk; int keyno; CHECK_FOR_INTERRUPTS(); heapBlk = ItemPointerGetBlockNumber(heaptid); /* normalize the block number to be the first block in the range */ heapBlk = (heapBlk / pagesPerRange) * pagesPerRange; brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, BUFFER_LOCK_SHARE, NULL); /* if range is unsummarized, there's nothing to do */ if (!brtup) break; /* First time through? */ if (bdesc == NULL) { bdesc = brin_build_desc(idxRel); tupcxt = AllocSetContextCreate(CurrentMemoryContext, "brininsert cxt", ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(tupcxt); } dtup = brin_deform_tuple(bdesc, brtup); /* * Compare the key values of the new tuple to the stored index values; * our deformed tuple will get updated if the new tuple doesn't fit * the original range (note this means we can't break out of the loop * early). Make a note of whether this happens, so that we know to * insert the modified tuple later. */ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { Datum result; BrinValues *bval; FmgrInfo *addValue; bval = &dtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); result = FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); /* if that returned true, we need to insert the updated tuple */ need_insert |= DatumGetBool(result); } if (!need_insert) { /* * The tuple is consistent with the new values, so there's nothing * to do. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else { Page page = BufferGetPage(buf); ItemId lp = PageGetItemId(page, off); Size origsz; BrinTuple *origtup; Size newsz; BrinTuple *newtup; bool samepage; /* * Make a copy of the old tuple, so that we can compare it after * re-acquiring the lock. */ origsz = ItemIdGetLength(lp); origtup = brin_copy_tuple(brtup, origsz); /* * Before releasing the lock, check if we can attempt a same-page * update. Another process could insert a tuple concurrently in * the same page though, so downstream we must be prepared to cope * if this turns out to not be possible after all. */ newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); samepage = brin_can_do_samepage_update(buf, origsz, newsz); LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* * Try to update the tuple. If this doesn't work for whatever * reason, we need to restart from the top; the revmap might be * pointing at a different tuple for this block now, so we need to * recompute to ensure both our new heap tuple and the other * inserter's are covered by the combined tuple. It might be that * we don't need to update at all. */ if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, buf, off, origtup, origsz, newtup, newsz, samepage)) { /* no luck; start over */ MemoryContextResetAndDeleteChildren(tupcxt); continue; } } /* success! */ break; } brinRevmapTerminate(revmap); if (BufferIsValid(buf)) ReleaseBuffer(buf); if (bdesc != NULL) { brin_free_desc(bdesc); MemoryContextSwitchTo(oldcxt); MemoryContextDelete(tupcxt); } return false; }
/* fork heartbeat sender child */ pid_t wd_hb_sender(int fork_wait_time, WdHbIf *hb_if) { int sock; pid_t pid = 0; WdHbPacket pkt; WdInfo * p = WD_List; char pack_str[WD_MAX_PACKET_STRING]; int pack_str_len; sigjmp_buf local_sigjmp_buf; pid = fork(); if (pid != 0) { if (pid == -1) ereport(PANIC, (errmsg("failed to fork a heartbeat sender child"))); return pid; } on_exit_reset(); processType = PT_HB_SENDER; if (fork_wait_time > 0) { sleep(fork_wait_time); } POOL_SETMASK(&UnBlockSig); signal(SIGTERM, hb_sender_exit); signal(SIGINT, hb_sender_exit); signal(SIGQUIT, hb_sender_exit); signal(SIGCHLD, SIG_IGN); signal(SIGHUP, SIG_IGN); signal(SIGUSR1, SIG_IGN); signal(SIGUSR2, SIG_IGN); signal(SIGPIPE, SIG_IGN); signal(SIGALRM, SIG_IGN); init_ps_display("", "", "", ""); /* Create per loop iteration memory context */ ProcessLoopContext = AllocSetContextCreate(TopMemoryContext, "wdhb_sender", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(TopMemoryContext); sock = wd_create_hb_send_socket(hb_if); set_ps_display("heartbeat sender", false); if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; EmitErrorReport(); MemoryContextSwitchTo(TopMemoryContext); FlushErrorState(); sleep(pool_config->wd_heartbeat_keepalive); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; for(;;) { MemoryContextSwitchTo(ProcessLoopContext); MemoryContextResetAndDeleteChildren(ProcessLoopContext); /* contents of packet */ gettimeofday(&pkt.send_time, NULL); strlcpy(pkt.from, pool_config->wd_hostname, sizeof(pkt.from)); pkt.from_pgpool_port = pool_config->port; pkt.status = p->status; /* authentication key */ if (strlen(pool_config->wd_authkey)) { /* calculate hash from packet */ pack_str_len = packet_to_string_hb(&pkt, pack_str, sizeof(pack_str)); wd_calc_hash(pack_str, pack_str_len, pkt.hash); } /* send heartbeat signal */ wd_hb_send(sock, &pkt, sizeof(pkt), hb_if->addr, hb_if->dest_port); ereport(DEBUG1, (errmsg("watchdog heartbeat: send heartbeat signal to %s:%d", hb_if->addr, hb_if->dest_port))); sleep(pool_config->wd_heartbeat_keepalive); } return pid; }
/* * ContinuousQueryWorkerStartup * * Launches a CQ worker, which continuously generates partial query results to send * back to the combiner process. */ void ContinuousQueryWorkerRun(Portal portal, ContinuousViewState *state, QueryDesc *queryDesc, ResourceOwner owner) { EState *estate = NULL; DestReceiver *dest; CmdType operation; MemoryContext oldcontext; int timeoutms = state->maxwaitms; MemoryContext runcontext; CQProcEntry *entry = GetCQProcEntry(MyCQId); ResourceOwner cqowner = ResourceOwnerCreate(NULL, "CQResourceOwner"); bool savereadonly = XactReadOnly; cq_stat_initialize(state->viewid, MyProcPid); dest = CreateDestReceiver(DestCombiner); SetCombinerDestReceiverParams(dest, MyCQId); /* workers only need read-only transactions */ XactReadOnly = true; runcontext = AllocSetContextCreate(TopMemoryContext, "CQRunContext", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); elog(LOG, "\"%s\" worker %d running", queryDesc->plannedstmt->cq_target->relname, MyProcPid); MarkWorkerAsRunning(MyCQId, MyWorkerId); pgstat_report_activity(STATE_RUNNING, queryDesc->sourceText); TupleBufferInitLatch(WorkerTupleBuffer, MyCQId, MyWorkerId, &MyProc->procLatch); oldcontext = MemoryContextSwitchTo(runcontext); retry: PG_TRY(); { bool xact_commit = true; TimestampTz last_process = GetCurrentTimestamp(); TimestampTz last_commit = GetCurrentTimestamp(); start_executor(queryDesc, runcontext, cqowner); CurrentResourceOwner = cqowner; estate = queryDesc->estate; operation = queryDesc->operation; /* * Initialize context that lives for the duration of a single iteration * of the main worker loop */ CQExecutionContext = AllocSetContextCreate(estate->es_query_cxt, "CQExecutionContext", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); estate->es_lastoid = InvalidOid; /* * Startup combiner receiver */ (*dest->rStartup) (dest, operation, queryDesc->tupDesc); for (;;) { if (!TupleBufferHasUnreadSlots()) { if (TimestampDifferenceExceeds(last_process, GetCurrentTimestamp(), state->emptysleepms)) { /* force stats flush */ cq_stat_report(true); pgstat_report_activity(STATE_IDLE, queryDesc->sourceText); TupleBufferWait(WorkerTupleBuffer, MyCQId, MyWorkerId); pgstat_report_activity(STATE_RUNNING, queryDesc->sourceText); } else pg_usleep(Min(WAIT_SLEEP_MS, state->emptysleepms) * 1000); } TupleBufferResetNotify(WorkerTupleBuffer, MyCQId, MyWorkerId); if (xact_commit) StartTransactionCommand(); set_snapshot(estate, cqowner); CurrentResourceOwner = cqowner; MemoryContextSwitchTo(estate->es_query_cxt); estate->es_processed = 0; estate->es_filtered = 0; /* * Run plan on a microbatch */ ExecutePlan(estate, queryDesc->planstate, operation, true, 0, timeoutms, ForwardScanDirection, dest); IncrementCQExecutions(1); TupleBufferClearPinnedSlots(); if (state->long_xact) { if (TimestampDifferenceExceeds(last_commit, GetCurrentTimestamp(), LONG_RUNNING_XACT_DURATION)) xact_commit = true; else xact_commit = false; } unset_snapshot(estate, cqowner); if (xact_commit) { CommitTransactionCommand(); last_commit = GetCurrentTimestamp(); } MemoryContextResetAndDeleteChildren(CQExecutionContext); MemoryContextSwitchTo(runcontext); CurrentResourceOwner = cqowner; if (estate->es_processed || estate->es_filtered) { /* * If the CV query is such that the select does not return any tuples * ex: select id where id=99; and id=99 does not exist, then this reset * will fail. What will happen is that the worker will block at the latch for every * allocated slot, TILL a cv returns a non-zero tuple, at which point * the worker will resume a simple sleep for the threshold time. */ last_process = GetCurrentTimestamp(); /* * Send stats to the collector */ cq_stat_report(false); } /* Has the CQ been deactivated? */ if (!entry->active) { if (ActiveSnapshotSet()) unset_snapshot(estate, cqowner); if (IsTransactionState()) CommitTransactionCommand(); break; } } CurrentResourceOwner = cqowner; /* * The cleanup functions below expect these things to be registered */ RegisterSnapshotOnOwner(estate->es_snapshot, cqowner); RegisterSnapshotOnOwner(queryDesc->snapshot, cqowner); RegisterSnapshotOnOwner(queryDesc->crosscheck_snapshot, cqowner); /* cleanup */ ExecutorFinish(queryDesc); ExecutorEnd(queryDesc); FreeQueryDesc(queryDesc); } PG_CATCH(); { EmitErrorReport(); FlushErrorState(); /* Since the worker is read-only, we can simply commit the transaction. */ if (ActiveSnapshotSet()) unset_snapshot(estate, cqowner); if (IsTransactionState()) CommitTransactionCommand(); TupleBufferUnpinAllPinnedSlots(); TupleBufferClearReaders(); /* This resets the es_query_ctx and in turn the CQExecutionContext */ MemoryContextResetAndDeleteChildren(runcontext); IncrementCQErrors(1); if (continuous_query_crash_recovery) goto retry; } PG_END_TRY(); (*dest->rShutdown) (dest); MemoryContextSwitchTo(oldcontext); MemoryContextDelete(runcontext); XactReadOnly = savereadonly; /* * Remove proc-level stats */ cq_stat_report(true); cq_stat_send_purge(state->viewid, MyProcPid, CQ_STAT_WORKER); CurrentResourceOwner = owner; }
/* * Main entry point for checkpointer process * * This is invoked from AuxiliaryProcessMain, which has already created the * basic execution environment, but not enabled signals yet. */ void CheckpointerMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext checkpointer_context; CheckpointerShmem->checkpointer_pid = MyProcPid; /* * Properly accept or ignore signals the postmaster might send us * * Note: we deliberately ignore SIGTERM, because during a standard Unix * system shutdown cycle, init will SIGTERM all processes at once. We * want to wait for the backends to exit, whereupon the postmaster will * tell us it's okay to shut down (via SIGUSR2). */ pqsignal(SIGHUP, ChkptSigHupHandler); /* set flag to read config * file */ pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */ pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */ pqsignal(SIGQUIT, chkpt_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, chkpt_sigusr1_handler); pqsignal(SIGUSR2, ReqShutdownHandler); /* request shutdown */ /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ sigdelset(&BlockSig, SIGQUIT); /* * Initialize so that first time-driven event happens at the correct time. */ last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); /* * Create a resource owner to keep track of our resources (currently only * buffer pins). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Checkpointer"); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ checkpointer_context = AllocSetContextCreate(TopMemoryContext, "Checkpointer", ALLOCSET_DEFAULT_SIZES); MemoryContextSwitchTo(checkpointer_context); /* * If an exception is encountered, processing resumes here. * * See notes in postgres.c about the design of this coding. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in checkpointer, but we do have LWLocks, buffers, and temp * files. */ LWLockReleaseAll(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); /* Warn any waiting backends that the checkpoint failed. */ if (ckpt_active) { SpinLockAcquire(&CheckpointerShmem->ckpt_lck); CheckpointerShmem->ckpt_failed++; CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started; SpinLockRelease(&CheckpointerShmem->ckpt_lck); ckpt_active = false; } /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(checkpointer_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(checkpointer_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. */ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. */ smgrcloseall(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Ensure all shared memory values are set correctly for the config. Doing * this here ensures no race conditions from other concurrent updaters. */ UpdateSharedMemoryConfig(); /* * Advertise our latch that backends can use to wake us up while we're * sleeping. */ ProcGlobal->checkpointerLatch = &MyProc->procLatch; /* * Loop forever */ for (;;) { bool do_checkpoint = false; int flags = 0; pg_time_t now; int elapsed_secs; int cur_timeout; int rc; /* Clear any already-pending wakeups */ ResetLatch(MyLatch); /* * Process any requests or signals received recently. */ AbsorbFsyncRequests(); if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); /* * Checkpointer is the last process to shut down, so we ask it to * hold the keys for a range of other tasks required most of which * have nothing to do with checkpointing at all. * * For various reasons, some config values can change dynamically * so the primary copy of them is held in shared memory to make * sure all backends see the same value. We make Checkpointer * responsible for updating the shared memory copy if the * parameter setting changes because of SIGHUP. */ UpdateSharedMemoryConfig(); } if (checkpoint_requested) { checkpoint_requested = false; do_checkpoint = true; BgWriterStats.m_requested_checkpoints++; } if (shutdown_requested) { /* * From here on, elog(ERROR) should end with exit(1), not send * control back to the sigsetjmp block above */ ExitOnAnyError = true; /* Close down the database */ ShutdownXLOG(0, 0); /* Normal exit from the checkpointer is here */ proc_exit(0); /* done */ } /* * Force a checkpoint if too much time has elapsed since the last one. * Note that we count a timed checkpoint in stats only when this * occurs without an external request, but we set the CAUSE_TIME flag * bit even if there is also an external request. */ now = (pg_time_t) time(NULL); elapsed_secs = now - last_checkpoint_time; if (elapsed_secs >= CheckPointTimeout) { if (!do_checkpoint) BgWriterStats.m_timed_checkpoints++; do_checkpoint = true; flags |= CHECKPOINT_CAUSE_TIME; } /* * Do a checkpoint if requested. */ if (do_checkpoint) { bool ckpt_performed = false; bool do_restartpoint; /* * Check if we should perform a checkpoint or a restartpoint. As a * side-effect, RecoveryInProgress() initializes TimeLineID if * it's not set yet. */ do_restartpoint = RecoveryInProgress(); /* * Atomically fetch the request flags to figure out what kind of a * checkpoint we should perform, and increase the started-counter * to acknowledge that we've started a new checkpoint. */ SpinLockAcquire(&CheckpointerShmem->ckpt_lck); flags |= CheckpointerShmem->ckpt_flags; CheckpointerShmem->ckpt_flags = 0; CheckpointerShmem->ckpt_started++; SpinLockRelease(&CheckpointerShmem->ckpt_lck); /* * The end-of-recovery checkpoint is a real checkpoint that's * performed while we're still in recovery. */ if (flags & CHECKPOINT_END_OF_RECOVERY) do_restartpoint = false; /* * We will warn if (a) too soon since last checkpoint (whatever * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag * since the last checkpoint start. Note in particular that this * implementation will not generate warnings caused by * CheckPointTimeout < CheckPointWarning. */ if (!do_restartpoint && (flags & CHECKPOINT_CAUSE_XLOG) && elapsed_secs < CheckPointWarning) ereport(LOG, (errmsg_plural("checkpoints are occurring too frequently (%d second apart)", "checkpoints are occurring too frequently (%d seconds apart)", elapsed_secs, elapsed_secs), errhint("Consider increasing the configuration parameter \"max_wal_size\"."))); /* * Initialize checkpointer-private variables used during * checkpoint. */ ckpt_active = true; if (do_restartpoint) ckpt_start_recptr = GetXLogReplayRecPtr(NULL); else ckpt_start_recptr = GetInsertRecPtr(); ckpt_start_time = now; ckpt_cached_elapsed = 0; /* * Do the checkpoint. */ if (!do_restartpoint) { CreateCheckPoint(flags); ckpt_performed = true; } else ckpt_performed = CreateRestartPoint(flags); /* * After any checkpoint, close all smgr files. This is so we * won't hang onto smgr references to deleted files indefinitely. */ smgrcloseall(); /* * Indicate checkpoint completion to any waiting backends. */ SpinLockAcquire(&CheckpointerShmem->ckpt_lck); CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started; SpinLockRelease(&CheckpointerShmem->ckpt_lck); if (ckpt_performed) { /* * Note we record the checkpoint start time not end time as * last_checkpoint_time. This is so that time-driven * checkpoints happen at a predictable spacing. */ last_checkpoint_time = now; } else { /* * We were not able to perform the restartpoint (checkpoints * throw an ERROR in case of error). Most likely because we * have not received any new checkpoint WAL records since the * last restartpoint. Try again in 15 s. */ last_checkpoint_time = now - CheckPointTimeout + 15; } ckpt_active = false; } /* Check for archive_timeout and switch xlog files if necessary. */ CheckArchiveTimeout(); /* * Send off activity statistics to the stats collector. (The reason * why we re-use bgwriter-related code for this is that the bgwriter * and checkpointer used to be just one process. It's probably not * worth the trouble to split the stats support into two independent * stats message types.) */ pgstat_send_bgwriter(); /* * Sleep until we are signaled or it's time for another checkpoint or * xlog file switch. */ now = (pg_time_t) time(NULL); elapsed_secs = now - last_checkpoint_time; if (elapsed_secs >= CheckPointTimeout) continue; /* no sleep for us ... */ cur_timeout = CheckPointTimeout - elapsed_secs; if (XLogArchiveTimeout > 0 && !RecoveryInProgress()) { elapsed_secs = now - last_xlog_switch_time; if (elapsed_secs >= XLogArchiveTimeout) continue; /* no sleep for us ... */ cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs); } rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, cur_timeout * 1000L /* convert to ms */, WAIT_EVENT_CHECKPOINTER_MAIN); /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (rc & WL_POSTMASTER_DEATH) exit(1); } }
void ContQuerySchedulerMain(int argc, char *argv[]) { sigjmp_buf local_sigjmp_buf; List *dbs = NIL; /* we are a postmaster subprocess now */ IsUnderPostmaster = true; am_cont_scheduler = true; /* reset MyProcPid */ MyProcPid = getpid(); MyPMChildSlot = AssignPostmasterChildSlot(); /* record Start Time for logging */ MyStartTime = time(NULL); /* Identify myself via ps */ init_ps_display("continuous query scheduler process", "", "", ""); ereport(LOG, (errmsg("continuous query scheduler started"))); if (PostAuthDelay) pg_usleep(PostAuthDelay * 1000000L); SetProcessingMode(InitProcessing); /* * If possible, make this process a group leader, so that the postmaster * can signal any child processes too. This is only for consistency sake, we * never fork the scheduler process. Instead dynamic bgworkers are used. */ #ifdef HAVE_SETSID if (setsid() < 0) elog(FATAL, "setsid() failed: %m"); #endif /* * Set up signal handlers. We operate on databases much like a regular * backend, so we use the same signal handling. See equivalent code in * tcop/postgres.c. */ pqsignal(SIGHUP, sighup_handler); pqsignal(SIGINT, sigint_handler); pqsignal(SIGTERM, sigterm_handler); pqsignal(SIGQUIT, quickdie); InitializeTimeouts(); /* establishes SIGALRM handler */ pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGUSR2, sigusr2_handler); pqsignal(SIGFPE, FloatExceptionHandler); pqsignal(SIGCHLD, SIG_DFL); #define BACKTRACE_SEGFAULTS #ifdef BACKTRACE_SEGFAULTS pqsignal(SIGSEGV, debug_segfault); #endif /* Early initialization */ BaseInit(); /* * Create a per-backend PGPROC struct in shared memory, except in the * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do * this before we can use LWLocks (and in the EXEC_BACKEND case we already * had to do some stuff with LWLocks). */ #ifndef EXEC_BACKEND InitProcess(); #endif InitPostgres(NULL, InvalidOid, NULL, NULL); SetProcessingMode(NormalProcessing); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. */ ContQuerySchedulerMemCxt = AllocSetContextCreate(TopMemoryContext, "ContQuerySchedulerCtx", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(ContQuerySchedulerMemCxt); /* * If an exception is encountered, processing resumes here. * * This code is a stripped down version of PostgresMain error recovery. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevents interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Forget any pending QueryCancel or timeout request */ disable_all_timeouts(false); QueryCancelPending = false; /* second to avoid race condition */ /* Report the error to the server log */ EmitErrorReport(); /* Abort the current transaction in order to recover */ AbortCurrentTransaction(); /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(ContQuerySchedulerMemCxt); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(ContQuerySchedulerMemCxt); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. We don't want to be * filling the error logs as fast as we can. */ pg_usleep(1000000L); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* must unblock signals before calling rebuild_database_list */ PG_SETMASK(&UnBlockSig); ContQuerySchedulerShmem->scheduler_pid = MyProcPid; dbs = get_database_list(); /* Loop forever */ for (;;) { ListCell *lc; int rc; foreach(lc, dbs) { DatabaseEntry *db_entry = lfirst(lc); bool found; ContQueryProcGroup *grp = hash_search(ContQuerySchedulerShmem->proc_table, &db_entry->oid, HASH_ENTER, &found); /* If we don't have an entry for this dboid, initialize a new one and fire off bg procs */ if (!found) { grp->db_oid = db_entry->oid; namestrcpy(&grp->db_name, NameStr(db_entry->name)); start_group(grp); } } /* Allow sinval catchup interrupts while sleeping */ EnableCatchupInterrupt(); /* * Wait until naptime expires or we get some type of signal (all the * signal handlers will wake us by calling SetLatch). */ rc = WaitLatch(&MyProc->procLatch, WL_LATCH_SET | WL_POSTMASTER_DEATH, 0); ResetLatch(&MyProc->procLatch); DisableCatchupInterrupt(); /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (rc & WL_POSTMASTER_DEATH) proc_exit(1); /* the normal shutdown case */ if (got_SIGTERM) break; /* update config? */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); /* update tuning parameters, so that they can be read downstream by background processes */ update_tuning_params(); } /* terminate a proc group? */ if (got_SIGUSR2) { HASH_SEQ_STATUS status; ContQueryProcGroup *grp; got_SIGUSR2 = false; hash_seq_init(&status, ContQuerySchedulerShmem->proc_table); while ((grp = (ContQueryProcGroup *) hash_seq_search(&status)) != NULL) { ListCell *lc; if (!grp->terminate) continue; foreach(lc, dbs) { DatabaseEntry *entry = lfirst(lc); if (entry->oid == grp->db_oid) { dbs = list_delete(dbs, entry); break; } } terminate_group(grp); } }
/* * Main entry point for bgwriter process * * This is invoked from BootstrapMain, which has already created the basic * execution environment, but not enabled signals yet. */ void BackgroundWriterMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; BgWriterShmem->bgwriter_pid = MyProcPid; am_bg_writer = true; /* * If possible, make this process a group leader, so that the postmaster * can signal any child processes too. (bgwriter probably never has any * child processes, but for consistency we make all postmaster child * processes do this.) */ #ifdef HAVE_SETSID if (setsid() < 0) elog(FATAL, "setsid() failed: %m"); #endif /* * Properly accept or ignore signals the postmaster might send us * * Note: we deliberately ignore SIGTERM, because during a standard Unix * system shutdown cycle, init will SIGTERM all processes at once. We * want to wait for the backends to exit, whereupon the postmaster will * tell us it's okay to shut down (via SIGUSR2). * * SIGUSR1 is presently unused; keep it spare in case someday we want this * process to participate in sinval messaging. */ pqsignal(SIGHUP, BgSigHupHandler); /* set flag to read config file */ pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */ pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */ pqsignal(SIGQUIT, bg_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, SIG_IGN); /* reserve for sinval */ pqsignal(SIGUSR2, ReqShutdownHandler); /* request shutdown */ /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ #ifdef HAVE_SIGPROCMASK sigdelset(&BlockSig, SIGQUIT); #else BlockSig &= ~(sigmask(SIGQUIT)); #endif /* * Initialize so that first time-driven event happens at the correct time. */ last_checkpoint_time = last_xlog_switch_time = time(NULL); /* * Create a resource owner to keep track of our resources (currently only * buffer pins). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer"); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ bgwriter_context = AllocSetContextCreate(TopMemoryContext, "Background Writer", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(bgwriter_context); /* * If an exception is encountered, processing resumes here. * * See notes in postgres.c about the design of this coding. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in bgwriter, but we do have LWLocks, buffers, and temp files. */ LWLockReleaseAll(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_Files(); AtEOXact_HashTables(false); /* Warn any waiting backends that the checkpoint failed. */ if (ckpt_active) { /* use volatile pointer to prevent code rearrangement */ volatile BgWriterShmemStruct *bgs = BgWriterShmem; SpinLockAcquire(&bgs->ckpt_lck); bgs->ckpt_failed++; bgs->ckpt_done = bgs->ckpt_started; SpinLockRelease(&bgs->ckpt_lck); ckpt_active = false; } /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(bgwriter_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(bgwriter_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. */ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. */ smgrcloseall(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Loop forever */ for (;;) { bool do_checkpoint = false; int flags = 0; time_t now; int elapsed_secs; /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive(true)) exit(1); /* * Process any requests or signals received recently. */ AbsorbFsyncRequests(); if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } if (checkpoint_requested) { checkpoint_requested = false; do_checkpoint = true; BgWriterStats.m_requested_checkpoints++; } if (shutdown_requested) { /* * From here on, elog(ERROR) should end with exit(1), not send * control back to the sigsetjmp block above */ ExitOnAnyError = true; /* Close down the database */ ShutdownXLOG(0, 0); DumpFreeSpaceMap(0, 0); /* Normal exit from the bgwriter is here */ proc_exit(0); /* done */ } /* * Force a checkpoint if too much time has elapsed since the last one. * Note that we count a timed checkpoint in stats only when this * occurs without an external request, but we set the CAUSE_TIME flag * bit even if there is also an external request. */ now = time(NULL); elapsed_secs = now - last_checkpoint_time; if (elapsed_secs >= CheckPointTimeout) { if (!do_checkpoint) BgWriterStats.m_timed_checkpoints++; do_checkpoint = true; flags |= CHECKPOINT_CAUSE_TIME; } /* * Do a checkpoint if requested, otherwise do one cycle of * dirty-buffer writing. */ if (do_checkpoint) { /* use volatile pointer to prevent code rearrangement */ volatile BgWriterShmemStruct *bgs = BgWriterShmem; /* * Atomically fetch the request flags to figure out what kind of a * checkpoint we should perform, and increase the started-counter * to acknowledge that we've started a new checkpoint. */ SpinLockAcquire(&bgs->ckpt_lck); flags |= bgs->ckpt_flags; bgs->ckpt_flags = 0; bgs->ckpt_started++; SpinLockRelease(&bgs->ckpt_lck); /* * We will warn if (a) too soon since last checkpoint (whatever * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag * since the last checkpoint start. Note in particular that this * implementation will not generate warnings caused by * CheckPointTimeout < CheckPointWarning. */ if ((flags & CHECKPOINT_CAUSE_XLOG) && elapsed_secs < CheckPointWarning) ereport(LOG, (errmsg("checkpoints are occurring too frequently (%d seconds apart)", elapsed_secs), errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); /* * Initialize bgwriter-private variables used during checkpoint. */ ckpt_active = true; ckpt_start_recptr = GetInsertRecPtr(); ckpt_start_time = now; ckpt_cached_elapsed = 0; /* * Do the checkpoint. */ CreateCheckPoint(flags); /* * After any checkpoint, close all smgr files. This is so we * won't hang onto smgr references to deleted files indefinitely. */ smgrcloseall(); /* * Indicate checkpoint completion to any waiting backends. */ SpinLockAcquire(&bgs->ckpt_lck); bgs->ckpt_done = bgs->ckpt_started; SpinLockRelease(&bgs->ckpt_lck); ckpt_active = false; /* * Note we record the checkpoint start time not end time as * last_checkpoint_time. This is so that time-driven checkpoints * happen at a predictable spacing. */ last_checkpoint_time = now; } else BgBufferSync(); /* Check for archive_timeout and switch xlog files if necessary. */ CheckArchiveTimeout(); /* Nap for the configured time. */ BgWriterNap(); } }
pid_t wd_child(int fork_wait_time) { int sock; volatile int fd; int rtn; pid_t pid = 0; sigjmp_buf local_sigjmp_buf; pid = fork(); if (pid != 0) { if (pid == -1) ereport(PANIC, (errmsg("failed to fork a watchdog process"))); return pid; } on_exit_reset(); processType = PT_WATCHDOG; if (fork_wait_time > 0) { sleep(fork_wait_time); } POOL_SETMASK(&UnBlockSig); signal(SIGTERM, wd_child_exit); signal(SIGINT, wd_child_exit); signal(SIGQUIT, wd_child_exit); signal(SIGCHLD, SIG_IGN); signal(SIGHUP, SIG_IGN); signal(SIGUSR1, SIG_IGN); signal(SIGUSR2, SIG_IGN); signal(SIGPIPE, SIG_IGN); signal(SIGALRM, SIG_IGN); init_ps_display("", "", "", ""); if (WD_List == NULL) { /* memory allocate is not ready */ wd_child_exit(15); } /* Create per loop iteration memory context */ ProcessLoopContext = AllocSetContextCreate(TopMemoryContext, "wd_child_main_loop", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(TopMemoryContext); sock = wd_create_recv_socket(WD_MYSELF->wd_port); if (sock < 0) { /* socket create failed */ wd_child_exit(15); } set_ps_display("watchdog", false); if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ if(fd > 0) close(fd); error_context_stack = NULL; EmitErrorReport(); MemoryContextSwitchTo(TopMemoryContext); FlushErrorState(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* child loop */ for(;;) { MemoryContextSwitchTo(ProcessLoopContext); MemoryContextResetAndDeleteChildren(ProcessLoopContext); fd = -1; WdPacket buf; fd = wd_accept(sock); if (fd < 0) { continue; } rtn = wd_recv_packet(fd, &buf); if (rtn == WD_OK) { wd_send_response(fd, &buf); } close(fd); } return pid; }
/* * A tuple in the heap is being inserted. To keep a brin index up to date, * we need to obtain the relevant index tuple and compare its stored values * with those of the new tuple. If the tuple values are not consistent with * the summary tuple, we need to update the index tuple. * * If autosummarization is enabled, check if we need to summarize the previous * page range. * * If the range is not currently summarized (i.e. the revmap returns NULL for * it), there's nothing to do for this tuple. */ bool brininsert(Relation idxRel, Datum *values, bool *nulls, ItemPointer heaptid, Relation heapRel, IndexUniqueCheck checkUnique, IndexInfo *indexInfo) { BlockNumber pagesPerRange; BlockNumber origHeapBlk; BlockNumber heapBlk; BrinDesc *bdesc = (BrinDesc *) indexInfo->ii_AmCache; BrinRevmap *revmap; Buffer buf = InvalidBuffer; MemoryContext tupcxt = NULL; MemoryContext oldcxt = CurrentMemoryContext; bool autosummarize = BrinGetAutoSummarize(idxRel); revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL); /* * origHeapBlk is the block number where the insertion occurred. heapBlk * is the first block in the corresponding page range. */ origHeapBlk = ItemPointerGetBlockNumber(heaptid); heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange; for (;;) { bool need_insert = false; OffsetNumber off; BrinTuple *brtup; BrinMemTuple *dtup; int keyno; CHECK_FOR_INTERRUPTS(); /* * If auto-summarization is enabled and we just inserted the first * tuple into the first block of a new non-first page range, request a * summarization run of the previous range. */ if (autosummarize && heapBlk > 0 && heapBlk == origHeapBlk && ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber) { BlockNumber lastPageRange = heapBlk - 1; BrinTuple *lastPageTuple; lastPageTuple = brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off, NULL, BUFFER_LOCK_SHARE, NULL); if (!lastPageTuple) { bool recorded; recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange, RelationGetRelid(idxRel), lastPageRange); if (!recorded) ereport(LOG, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded", RelationGetRelationName(idxRel), lastPageRange))); } else LockBuffer(buf, BUFFER_LOCK_UNLOCK); } brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, BUFFER_LOCK_SHARE, NULL); /* if range is unsummarized, there's nothing to do */ if (!brtup) break; /* First time through in this statement? */ if (bdesc == NULL) { MemoryContextSwitchTo(indexInfo->ii_Context); bdesc = brin_build_desc(idxRel); indexInfo->ii_AmCache = (void *) bdesc; MemoryContextSwitchTo(oldcxt); } /* First time through in this brininsert call? */ if (tupcxt == NULL) { tupcxt = AllocSetContextCreate(CurrentMemoryContext, "brininsert cxt", ALLOCSET_DEFAULT_SIZES); MemoryContextSwitchTo(tupcxt); } dtup = brin_deform_tuple(bdesc, brtup, NULL); /* * Compare the key values of the new tuple to the stored index values; * our deformed tuple will get updated if the new tuple doesn't fit * the original range (note this means we can't break out of the loop * early). Make a note of whether this happens, so that we know to * insert the modified tuple later. */ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { Datum result; BrinValues *bval; FmgrInfo *addValue; bval = &dtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); result = FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); /* if that returned true, we need to insert the updated tuple */ need_insert |= DatumGetBool(result); } if (!need_insert) { /* * The tuple is consistent with the new values, so there's nothing * to do. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else { Page page = BufferGetPage(buf); ItemId lp = PageGetItemId(page, off); Size origsz; BrinTuple *origtup; Size newsz; BrinTuple *newtup; bool samepage; /* * Make a copy of the old tuple, so that we can compare it after * re-acquiring the lock. */ origsz = ItemIdGetLength(lp); origtup = brin_copy_tuple(brtup, origsz, NULL, NULL); /* * Before releasing the lock, check if we can attempt a same-page * update. Another process could insert a tuple concurrently in * the same page though, so downstream we must be prepared to cope * if this turns out to not be possible after all. */ newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); samepage = brin_can_do_samepage_update(buf, origsz, newsz); LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* * Try to update the tuple. If this doesn't work for whatever * reason, we need to restart from the top; the revmap might be * pointing at a different tuple for this block now, so we need to * recompute to ensure both our new heap tuple and the other * inserter's are covered by the combined tuple. It might be that * we don't need to update at all. */ if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, buf, off, origtup, origsz, newtup, newsz, samepage)) { /* no luck; start over */ MemoryContextResetAndDeleteChildren(tupcxt); continue; } } /* success! */ break; } brinRevmapTerminate(revmap); if (BufferIsValid(buf)) ReleaseBuffer(buf); MemoryContextSwitchTo(oldcxt); if (tupcxt != NULL) MemoryContextDelete(tupcxt); return false; }
/* * Fetch dictionary cache entry */ TSDictionaryCacheEntry * lookup_ts_dictionary_cache(Oid dictId) { TSDictionaryCacheEntry *entry; if (TSDictionaryCacheHash == NULL) { /* First time through: initialize the hash table */ HASHCTL ctl; MemSet(&ctl, 0, sizeof(ctl)); ctl.keysize = sizeof(Oid); ctl.entrysize = sizeof(TSDictionaryCacheEntry); TSDictionaryCacheHash = hash_create("Tsearch dictionary cache", 8, &ctl, HASH_ELEM | HASH_BLOBS); /* Flush cache on pg_ts_dict and pg_ts_template changes */ CacheRegisterSyscacheCallback(TSDICTOID, InvalidateTSCacheCallBack, PointerGetDatum(TSDictionaryCacheHash)); CacheRegisterSyscacheCallback(TSTEMPLATEOID, InvalidateTSCacheCallBack, PointerGetDatum(TSDictionaryCacheHash)); /* Also make sure CacheMemoryContext exists */ if (!CacheMemoryContext) CreateCacheMemoryContext(); } /* Check single-entry cache */ if (lastUsedDictionary && lastUsedDictionary->dictId == dictId && lastUsedDictionary->isvalid) return lastUsedDictionary; /* Try to look up an existing entry */ entry = (TSDictionaryCacheEntry *) hash_search(TSDictionaryCacheHash, (void *) &dictId, HASH_FIND, NULL); if (entry == NULL || !entry->isvalid) { /* * If we didn't find one, we want to make one. But first look up the * object to be sure the OID is real. */ HeapTuple tpdict, tptmpl; Form_pg_ts_dict dict; Form_pg_ts_template ctemplate; MemoryContext saveCtx; tpdict = SearchSysCache1(TSDICTOID, ObjectIdGetDatum(dictId)); if (!HeapTupleIsValid(tpdict)) elog(ERROR, "cache lookup failed for text search dictionary %u", dictId); dict = (Form_pg_ts_dict) GETSTRUCT(tpdict); /* * Sanity checks */ if (!OidIsValid(dict->dicttemplate)) elog(ERROR, "text search dictionary %u has no ctemplate", dictId); /* * Retrieve dictionary's ctemplate */ tptmpl = SearchSysCache1(TSTEMPLATEOID, ObjectIdGetDatum(dict->dicttemplate)); if (!HeapTupleIsValid(tptmpl)) elog(ERROR, "cache lookup failed for text search ctemplate %u", dict->dicttemplate); ctemplate = (Form_pg_ts_template) GETSTRUCT(tptmpl); /* * Sanity checks */ if (!OidIsValid(ctemplate->tmpllexize)) elog(ERROR, "text search ctemplate %u has no lexize method", ctemplate->tmpllexize); if (entry == NULL) { bool found; /* Now make the cache entry */ entry = (TSDictionaryCacheEntry *) hash_search(TSDictionaryCacheHash, (void *) &dictId, HASH_ENTER, &found); Assert(!found); /* it wasn't there a moment ago */ /* Create private___ memory context the first time through */ saveCtx = AllocSetContextCreate(CacheMemoryContext, NameStr(dict->dictname), ALLOCSET_SMALL_MINSIZE, ALLOCSET_SMALL_INITSIZE, ALLOCSET_SMALL_MAXSIZE); } else { /* Clear the existing entry's private___ context */ saveCtx = entry->dictCtx; MemoryContextResetAndDeleteChildren(saveCtx); } MemSet(entry, 0, sizeof(TSDictionaryCacheEntry)); entry->dictId = dictId; entry->dictCtx = saveCtx; entry->lexizeOid = ctemplate->tmpllexize; if (OidIsValid(ctemplate->tmplinit)) { List *dictoptions; Datum opt; bool isnull; MemoryContext oldcontext; /* * Init method runs in dictionary's private___ memory context, and we * make sure the options are stored there too */ oldcontext = MemoryContextSwitchTo(entry->dictCtx); opt = SysCacheGetAttr(TSDICTOID, tpdict, Anum_pg_ts_dict_dictinitoption, &isnull); if (isnull) dictoptions = NIL; else dictoptions = deserialize_deflist(opt); entry->dictData = DatumGetPointer(OidFunctionCall1(ctemplate->tmplinit, PointerGetDatum(dictoptions))); MemoryContextSwitchTo(oldcontext); } ReleaseSysCache(tptmpl); ReleaseSysCache(tpdict); fmgr_info_cxt(entry->lexizeOid, &entry->lexize, entry->dictCtx); entry->isvalid = true; } lastUsedDictionary = entry; return entry; }
/* * Main entry point for walwriter process * * This is invoked from AuxiliaryProcessMain, which has already created the * basic execution environment, but not enabled signals yet. */ void WalWriterMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext walwriter_context; int left_till_hibernate; bool hibernating; /* * Properly accept or ignore signals the postmaster might send us * * We have no particular use for SIGINT at the moment, but seems * reasonable to treat like SIGTERM. */ pqsignal(SIGHUP, WalSigHupHandler); /* set flag to read config file */ pqsignal(SIGINT, WalShutdownHandler); /* request shutdown */ pqsignal(SIGTERM, WalShutdownHandler); /* request shutdown */ pqsignal(SIGQUIT, wal_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, walwriter_sigusr1_handler); pqsignal(SIGUSR2, SIG_IGN); /* not used */ /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ sigdelset(&BlockSig, SIGQUIT); /* * Create a resource owner to keep track of our resources (not clear that * we need this, but may as well have one). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Writer"); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ walwriter_context = AllocSetContextCreate(TopMemoryContext, "Wal Writer", ALLOCSET_DEFAULT_SIZES); MemoryContextSwitchTo(walwriter_context); /* * If an exception is encountered, processing resumes here. * * This code is heavily based on bgwriter.c, q.v. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in walwriter, but we do have LWLocks, and perhaps buffers? */ LWLockReleaseAll(); ConditionVariableCancelSleep(); pgstat_report_wait_end(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(walwriter_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(walwriter_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. */ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. */ smgrcloseall(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Reset hibernation state after any error. */ left_till_hibernate = LOOPS_UNTIL_HIBERNATE; hibernating = false; SetWalWriterSleeping(false); /* * Advertise our latch that backends can use to wake us up while we're * sleeping. */ ProcGlobal->walwriterLatch = &MyProc->procLatch; /* * Loop forever */ for (;;) { long cur_timeout; int rc; /* * Advertise whether we might hibernate in this cycle. We do this * before resetting the latch to ensure that any async commits will * see the flag set if they might possibly need to wake us up, and * that we won't miss any signal they send us. (If we discover work * to do in the last cycle before we would hibernate, the global flag * will be set unnecessarily, but little harm is done.) But avoid * touching the global flag if it doesn't need to change. */ if (hibernating != (left_till_hibernate <= 1)) { hibernating = (left_till_hibernate <= 1); SetWalWriterSleeping(hibernating); } /* Clear any already-pending wakeups */ ResetLatch(MyLatch); /* * Process any requests or signals received recently. */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } if (shutdown_requested) { /* Normal exit from the walwriter is here */ proc_exit(0); /* done */ } /* * Do what we're here for; then, if XLogBackgroundFlush() found useful * work to do, reset hibernation counter. */ if (XLogBackgroundFlush()) left_till_hibernate = LOOPS_UNTIL_HIBERNATE; else if (left_till_hibernate > 0) left_till_hibernate--; /* * Sleep until we are signaled or WalWriterDelay has elapsed. If we * haven't done anything useful for quite some time, lengthen the * sleep time so as to reduce the server's idle power consumption. */ if (left_till_hibernate > 0) cur_timeout = WalWriterDelay; /* in ms */ else cur_timeout = WalWriterDelay * HIBERNATE_FACTOR; rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, cur_timeout, WAIT_EVENT_WAL_WRITER_MAIN); /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (rc & WL_POSTMASTER_DEATH) exit(1); } }
/* * Main entry point for bgwriter process * * This is invoked from BootstrapMain, which has already created the basic * execution environment, but not enabled signals yet. */ void BackgroundWriterMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; am_bg_writer = true; /* * If possible, make this process a group leader, so that the postmaster * can signal any child processes too. (bgwriter probably never has any * child processes, but for consistency we make all postmaster child * processes do this.) */ #ifdef HAVE_SETSID if (setsid() < 0) elog(FATAL, "setsid() failed: %m"); #endif /* * Properly accept or ignore signals the postmaster might send us * * SIGUSR1 is presently unused; keep it spare in case someday we want this * process to participate in ProcSignal signalling. */ pqsignal(SIGHUP, BgSigHupHandler); /* set flag to read config file */ pqsignal(SIGINT, SIG_IGN); /* as of 9.2 no longer requests checkpoint */ pqsignal(SIGTERM, ReqShutdownHandler); /* shutdown */ pqsignal(SIGQUIT, bg_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, SIG_IGN); /* reserve for ProcSignal */ pqsignal(SIGUSR2, SIG_IGN); /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ sigdelset(&BlockSig, SIGQUIT); /* * Create a resource owner to keep track of our resources (currently only * buffer pins). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer"); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ bgwriter_context = AllocSetContextCreate(TopMemoryContext, "Background Writer", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(bgwriter_context); /* * If an exception is encountered, processing resumes here. * * See notes in postgres.c about the design of this coding. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in bgwriter, but we do have LWLocks, buffers, and temp files. */ LWLockReleaseAll(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_Files(); AtEOXact_HashTables(false); /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(bgwriter_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(bgwriter_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. */ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. */ smgrcloseall(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Use the recovery target timeline ID during recovery */ if (RecoveryInProgress()) ThisTimeLineID = GetRecoveryTargetTLI(); /* * Loop forever */ for (;;) { /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive()) exit(1); if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); /* update global shmem state for sync rep */ } if (shutdown_requested) { /* * From here on, elog(ERROR) should end with exit(1), not send * control back to the sigsetjmp block above */ ExitOnAnyError = true; /* Normal exit from the bgwriter is here */ proc_exit(0); /* done */ } /* * Do one cycle of dirty-buffer writing. */ BgBufferSync(); /* Nap for the configured time. */ BgWriterNap(); } }
/* * Main entry point for bgwriter process * * This is invoked from AuxiliaryProcessMain, which has already created the * basic execution environment, but not enabled signals yet. */ void BackgroundWriterMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; bool prev_hibernate; /* * Properly accept or ignore signals the postmaster might send us. * * bgwriter doesn't participate in ProcSignal signalling, but a SIGUSR1 * handler is still needed for latch wakeups. */ pqsignal(SIGHUP, BgSigHupHandler); /* set flag to read config file */ pqsignal(SIGINT, SIG_IGN); pqsignal(SIGTERM, ReqShutdownHandler); /* shutdown */ pqsignal(SIGQUIT, bg_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, bgwriter_sigusr1_handler); pqsignal(SIGUSR2, SIG_IGN); /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ sigdelset(&BlockSig, SIGQUIT); /* * Create a resource owner to keep track of our resources (currently only * buffer pins). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer"); /* * We just started, assume there has been either a shutdown or * end-of-recovery snapshot. */ last_snapshot_ts = GetCurrentTimestamp(); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ bgwriter_context = AllocSetContextCreate(TopMemoryContext, "Background Writer", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(bgwriter_context); /* * If an exception is encountered, processing resumes here. * * See notes in postgres.c about the design of this coding. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in bgwriter, but we do have LWLocks, buffers, and temp files. */ LWLockReleaseAll(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(bgwriter_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(bgwriter_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. */ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. */ smgrcloseall(); /* Report wait end here, when there is no further possibility of wait */ pgstat_report_wait_end(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Reset hibernation state after any error. */ prev_hibernate = false; /* * Loop forever */ for (;;) { bool can_hibernate; int rc; /* Clear any already-pending wakeups */ ResetLatch(MyLatch); if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } if (shutdown_requested) { /* * From here on, elog(ERROR) should end with exit(1), not send * control back to the sigsetjmp block above */ ExitOnAnyError = true; /* Normal exit from the bgwriter is here */ proc_exit(0); /* done */ } /* * Do one cycle of dirty-buffer writing. */ can_hibernate = BgBufferSync(); /* * Send off activity statistics to the stats collector */ pgstat_send_bgwriter(); if (FirstCallSinceLastCheckpoint()) { /* * After any checkpoint, close all smgr files. This is so we * won't hang onto smgr references to deleted files indefinitely. */ smgrcloseall(); } /* * Log a new xl_running_xacts every now and then so replication can * get into a consistent state faster (think of suboverflowed * snapshots) and clean up resources (locks, KnownXids*) more * frequently. The costs of this are relatively low, so doing it 4 * times (LOG_SNAPSHOT_INTERVAL_MS) a minute seems fine. * * We assume the interval for writing xl_running_xacts is * significantly bigger than BgWriterDelay, so we don't complicate the * overall timeout handling but just assume we're going to get called * often enough even if hibernation mode is active. It's not that * important that log_snap_interval_ms is met strictly. To make sure * we're not waking the disk up unnecessarily on an idle system we * check whether there has been any WAL inserted since the last time * we've logged a running xacts. * * We do this logging in the bgwriter as its the only process that is * run regularly and returns to its mainloop all the time. E.g. * Checkpointer, when active, is barely ever in its mainloop and thus * makes it hard to log regularly. */ if (XLogStandbyInfoActive() && !RecoveryInProgress()) { TimestampTz timeout = 0; TimestampTz now = GetCurrentTimestamp(); timeout = TimestampTzPlusMilliseconds(last_snapshot_ts, LOG_SNAPSHOT_INTERVAL_MS); /* * only log if enough time has passed and some xlog record has * been inserted. */ if (now >= timeout && last_snapshot_lsn != GetXLogInsertRecPtr()) { last_snapshot_lsn = LogStandbySnapshot(); last_snapshot_ts = now; } } /* * Sleep until we are signaled or BgWriterDelay has elapsed. * * Note: the feedback control loop in BgBufferSync() expects that we * will call it every BgWriterDelay msec. While it's not critical for * correctness that that be exact, the feedback loop might misbehave * if we stray too far from that. Hence, avoid loading this process * down with latch events that are likely to happen frequently during * normal operation. */ rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, BgWriterDelay /* ms */ ); /* * If no latch event and BgBufferSync says nothing's happening, extend * the sleep in "hibernation" mode, where we sleep for much longer * than bgwriter_delay says. Fewer wakeups save electricity. When a * backend starts using buffers again, it will wake us up by setting * our latch. Because the extra sleep will persist only as long as no * buffer allocations happen, this should not distort the behavior of * BgBufferSync's control loop too badly; essentially, it will think * that the system-wide idle interval didn't exist. * * There is a race condition here, in that a backend might allocate a * buffer between the time BgBufferSync saw the alloc count as zero * and the time we call StrategyNotifyBgWriter. While it's not * critical that we not hibernate anyway, we try to reduce the odds of * that by only hibernating when BgBufferSync says nothing's happening * for two consecutive cycles. Also, we mitigate any possible * consequences of a missed wakeup by not hibernating forever. */ if (rc == WL_TIMEOUT && can_hibernate && prev_hibernate) { /* Ask for notification at next buffer allocation */ StrategyNotifyBgWriter(MyProc->pgprocno); /* Sleep ... */ rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, BgWriterDelay * HIBERNATE_FACTOR); /* Reset the notification request in case we timed out */ StrategyNotifyBgWriter(-1); } /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (rc & WL_POSTMASTER_DEATH) exit(1); prev_hibernate = can_hibernate; } }
/* fork lifecheck process*/ static pid_t fork_a_lifecheck(int fork_wait_time) { pid_t pid; sigjmp_buf local_sigjmp_buf; pid = fork(); if (pid != 0) { if (pid == -1) ereport(ERROR, (errmsg("failed to fork a lifecheck process"))); return pid; } on_exit_reset(); processType = PT_LIFECHECK; if (fork_wait_time > 0) { sleep(fork_wait_time); } POOL_SETMASK(&UnBlockSig); init_ps_display("", "", "", ""); signal(SIGTERM, wd_exit); signal(SIGINT, wd_exit); signal(SIGQUIT, wd_exit); signal(SIGCHLD, SIG_DFL); signal(SIGHUP, SIG_IGN); signal(SIGPIPE, SIG_IGN); /* Create per loop iteration memory context */ ProcessLoopContext = AllocSetContextCreate(TopMemoryContext, "wd_lifecheck_main_loop", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(TopMemoryContext); set_ps_display("lifecheck",false); /* wait until ready to go */ while (WD_OK != is_wd_lifecheck_ready()) { sleep(pool_config->wd_interval * 10); } ereport(LOG, (errmsg("watchdog: lifecheck started"))); if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; EmitErrorReport(); MemoryContextSwitchTo(TopMemoryContext); FlushErrorState(); sleep(pool_config->wd_heartbeat_keepalive); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* watchdog loop */ for (;;) { MemoryContextSwitchTo(ProcessLoopContext); MemoryContextResetAndDeleteChildren(ProcessLoopContext); /* pgpool life check */ wd_lifecheck(); sleep(pool_config->wd_interval); } return pid; }
/* * A tuple in the heap is being inserted. To keep a brin index up to date, * we need to obtain the relevant index tuple and compare its stored values * with those of the new tuple. If the tuple values are not consistent with * the summary tuple, we need to update the index tuple. * * If the range is not currently summarized (i.e. the revmap returns NULL for * it), there's nothing to do. */ Datum brininsert(PG_FUNCTION_ARGS) { Relation idxRel = (Relation) PG_GETARG_POINTER(0); Datum *values = (Datum *) PG_GETARG_POINTER(1); bool *nulls = (bool *) PG_GETARG_POINTER(2); ItemPointer heaptid = (ItemPointer) PG_GETARG_POINTER(3); /* we ignore the rest of our arguments */ BlockNumber pagesPerRange; BrinDesc *bdesc = NULL; BrinRevmap *revmap; Buffer buf = InvalidBuffer; MemoryContext tupcxt = NULL; MemoryContext oldcxt = NULL; revmap = brinRevmapInitialize(idxRel, &pagesPerRange); for (;;) { bool need_insert = false; OffsetNumber off; BrinTuple *brtup; BrinMemTuple *dtup; BlockNumber heapBlk; int keyno; #ifdef USE_ASSERT_CHECKING BrinTuple *tmptup; BrinMemTuple *tmpdtup; Size tmpsiz; #endif CHECK_FOR_INTERRUPTS(); heapBlk = ItemPointerGetBlockNumber(heaptid); /* normalize the block number to be the first block in the range */ heapBlk = (heapBlk / pagesPerRange) * pagesPerRange; brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, BUFFER_LOCK_SHARE); /* if range is unsummarized, there's nothing to do */ if (!brtup) break; /* First time through? */ if (bdesc == NULL) { bdesc = brin_build_desc(idxRel); tupcxt = AllocSetContextCreate(CurrentMemoryContext, "brininsert cxt", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); oldcxt = MemoryContextSwitchTo(tupcxt); } dtup = brin_deform_tuple(bdesc, brtup); #ifdef USE_ASSERT_CHECKING { /* * When assertions are enabled, we use this as an opportunity to * test the "union" method, which would otherwise be used very * rarely: first create a placeholder tuple, and addValue the * value we just got into it. Then union the existing index tuple * with the updated placeholder tuple. The tuple resulting from * that union should be identical to the one resulting from the * regular operation (straight addValue) below. * * Here we create the tuple to compare with; the actual comparison * is below. */ tmptup = brin_form_placeholder_tuple(bdesc, heapBlk, &tmpsiz); tmpdtup = brin_deform_tuple(bdesc, tmptup); for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { BrinValues *bval; FmgrInfo *addValue; bval = &tmpdtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); } union_tuples(bdesc, tmpdtup, brtup); tmpdtup->bt_placeholder = dtup->bt_placeholder; tmptup = brin_form_tuple(bdesc, heapBlk, tmpdtup, &tmpsiz); } #endif /* * Compare the key values of the new tuple to the stored index values; * our deformed tuple will get updated if the new tuple doesn't fit * the original range (note this means we can't break out of the loop * early). Make a note of whether this happens, so that we know to * insert the modified tuple later. */ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { Datum result; BrinValues *bval; FmgrInfo *addValue; bval = &dtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); result = FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); /* if that returned true, we need to insert the updated tuple */ need_insert |= DatumGetBool(result); } #ifdef USE_ASSERT_CHECKING { /* * Now we can compare the tuple produced by the union function * with the one from plain addValue. */ BrinTuple *cmptup; Size cmpsz; cmptup = brin_form_tuple(bdesc, heapBlk, dtup, &cmpsz); Assert(brin_tuples_equal(tmptup, tmpsiz, cmptup, cmpsz)); } #endif if (!need_insert) { /* * The tuple is consistent with the new values, so there's nothing * to do. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else { Page page = BufferGetPage(buf); ItemId lp = PageGetItemId(page, off); Size origsz; BrinTuple *origtup; Size newsz; BrinTuple *newtup; bool samepage; /* * Make a copy of the old tuple, so that we can compare it after * re-acquiring the lock. */ origsz = ItemIdGetLength(lp); origtup = brin_copy_tuple(brtup, origsz); /* * Before releasing the lock, check if we can attempt a same-page * update. Another process could insert a tuple concurrently in * the same page though, so downstream we must be prepared to cope * if this turns out to not be possible after all. */ newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); samepage = brin_can_do_samepage_update(buf, origsz, newsz); LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* * Try to update the tuple. If this doesn't work for whatever * reason, we need to restart from the top; the revmap might be * pointing at a different tuple for this block now, so we need to * recompute to ensure both our new heap tuple and the other * inserter's are covered by the combined tuple. It might be that * we don't need to update at all. */ if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, buf, off, origtup, origsz, newtup, newsz, samepage)) { /* no luck; start over */ MemoryContextResetAndDeleteChildren(tupcxt); continue; } } /* success! */ break; } brinRevmapTerminate(revmap); if (BufferIsValid(buf)) ReleaseBuffer(buf); if (bdesc != NULL) { brin_free_desc(bdesc); MemoryContextSwitchTo(oldcxt); MemoryContextDelete(tupcxt); } return BoolGetDatum(false); }
/* * Clean up SPI state at subtransaction commit or abort. * * During commit, there shouldn't be any unclosed entries remaining from * the current subtransaction; we emit a warning if any are found. */ void AtEOSubXact_SPI(bool isCommit, SubTransactionId mySubid) { bool found = false; while (_SPI_connected >= 0) { _SPI_connection *connection = &(_SPI_stack[_SPI_connected]); if (connection->connectSubid != mySubid) break; /* couldn't be any underneath it either */ found = true; /* * Release procedure memory explicitly (see note in SPI_connect) */ if (connection->execCxt) { MemoryContextDelete(connection->execCxt); connection->execCxt = NULL; } if (connection->procCxt) { MemoryContextDelete(connection->procCxt); connection->procCxt = NULL; } /* * Pop the stack entry and reset global variables. Unlike * SPI_finish(), we don't risk switching to memory contexts that might * be already gone. */ _SPI_connected--; _SPI_curid = _SPI_connected; if (_SPI_connected == -1) _SPI_current = NULL; else _SPI_current = &(_SPI_stack[_SPI_connected]); SPI_processed = 0; SPI_lastoid = InvalidOid; SPI_tuptable = NULL; } if (found && isCommit) ereport(WARNING, (errcode(ERRCODE_WARNING), errmsg("subtransaction left non-empty SPI stack"), errhint("Check for missing \"SPI_finish\" calls."))); /* * If we are aborting a subtransaction and there is an open SPI context * surrounding the subxact, clean up to prevent memory leakage. */ if (_SPI_current && !isCommit) { /* free Executor memory the same as _SPI_end_call would do */ MemoryContextResetAndDeleteChildren(_SPI_current->execCxt); /* throw away any partially created tuple-table */ SPI_freetuptable(_SPI_current->tuptable); _SPI_current->tuptable = NULL; } }
/* fork heartbeat receiver child */ pid_t wd_hb_receiver(int fork_wait_time, WdHbIf *hb_if) { int sock; pid_t pid = 0; WdHbPacket pkt; struct timeval tv; char from[WD_MAX_HOST_NAMELEN]; int from_pgpool_port; char buf[(MD5_PASSWD_LEN+1)*2]; char pack_str[WD_MAX_PACKET_STRING]; int pack_str_len; sigjmp_buf local_sigjmp_buf; WdInfo * p; pid = fork(); if (pid != 0) { if (pid == -1) ereport(PANIC, (errmsg("failed to fork a heartbeat receiver child"))); return pid; } on_exit_reset(); processType = PT_HB_RECEIVER; if (fork_wait_time > 0) { sleep(fork_wait_time); } POOL_SETMASK(&UnBlockSig); signal(SIGTERM, hb_receiver_exit); signal(SIGINT, hb_receiver_exit); signal(SIGQUIT, hb_receiver_exit); signal(SIGCHLD, SIG_IGN); signal(SIGHUP, SIG_IGN); signal(SIGUSR1, SIG_IGN); signal(SIGUSR2, SIG_IGN); signal(SIGPIPE, SIG_IGN); signal(SIGALRM, SIG_IGN); init_ps_display("", "", "", ""); /* Create per loop iteration memory context */ ProcessLoopContext = AllocSetContextCreate(TopMemoryContext, "wdhb_hb_receiver", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(TopMemoryContext); sock = wd_create_hb_recv_socket(hb_if); set_ps_display("heartbeat receiver", false); if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; EmitErrorReport(); MemoryContextSwitchTo(TopMemoryContext); FlushErrorState(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; for(;;) { MemoryContextSwitchTo(ProcessLoopContext); MemoryContextResetAndDeleteChildren(ProcessLoopContext); /* receive heartbeat signal */ wd_hb_recv(sock, &pkt); /* authentication */ if (strlen(pool_config->wd_authkey)) { /* calculate hash from packet */ pack_str_len = packet_to_string_hb(&pkt, pack_str, sizeof(pack_str)); wd_calc_hash(pack_str, pack_str_len, buf); if (strcmp(pkt.hash, buf)) ereport(ERROR, (errmsg("watchdog heartbeat receive"), errdetail("authentication failed"))); } /* get current time */ gettimeofday(&tv, NULL); /* who send this packet? */ strlcpy(from, pkt.from, sizeof(from)); from_pgpool_port = pkt.from_pgpool_port; p = WD_List; while (p->status != WD_END) { if (!strcmp(p->hostname, from) && p->pgpool_port == from_pgpool_port) { /* ignore the packet from down pgpool */ if (pkt.status == WD_DOWN) { ereport(DEBUG1, (errmsg("watchdog heartbeat: received heartbeat signal from \"%s:%d\" whose status is down. ignored", from, from_pgpool_port))); break; } /* this is the first packet or the latest packet */ if (!WD_TIME_ISSET(p->hb_send_time) || WD_TIME_BEFORE(p->hb_send_time, pkt.send_time)) { ereport(DEBUG1, (errmsg("watchdog heartbeat: received heartbeat signal from \"%s:%d\"", from, from_pgpool_port))); p->hb_send_time = pkt.send_time; p->hb_last_recv_time = tv; } else { ereport(DEBUG1, (errmsg("watchdog heartbeat: received heartbeat signal is older than the latest, ignored"))); } break; } p++; } } return pid; }