/*
 * GetReplicationTransferLatency
 *
 * Report, in milliseconds, the gap between the send time and the receipt
 * time of the last message recorded in the shared walreceiver state.
 *
 * The figure is a plain difference of two timestamps taken on different
 * servers, so it also absorbs any clock-setting or timezone difference
 * between them.
 */
int
GetReplicationTransferLatency(void)
{
	WalRcvData *walrcv = WalRcv;
	TimestampTz sentAt;
	TimestampTz receivedAt;
	long		diffSecs = 0;
	int			diffUsecs = 0;

	/* Copy both timestamps out while holding the spinlock. */
	SpinLockAcquire(&walrcv->mutex);
	sentAt = walrcv->lastMsgSendTime;
	receivedAt = walrcv->lastMsgReceiptTime;
	SpinLockRelease(&walrcv->mutex);

	TimestampDifference(sentAt, receivedAt, &diffSecs, &diffUsecs);

	/* Fold seconds and microseconds into whole milliseconds. */
	return ((int) diffSecs * 1000) + (diffUsecs / 1000);
}
/* * Returns the replication apply delay in ms */ int GetReplicationApplyDelay(void) { /* use volatile pointer to prevent code rearrangement */ volatile WalRcvData *walrcv = WalRcv; XLogRecPtr receivePtr; XLogRecPtr replayPtr; long secs; int usecs; SpinLockAcquire(&walrcv->mutex); receivePtr = walrcv->receivedUpto; SpinLockRelease(&walrcv->mutex); replayPtr = GetXLogReplayRecPtr(NULL); if (XLByteLE(receivePtr, replayPtr)) return 0; TimestampDifference(GetCurrentChunkReplayStartTime(), GetCurrentTimestamp(), &secs, &usecs); return (((int) secs * 1000) + (usecs / 1000)); }
/* * Returns the replication apply delay in ms or -1 * if the apply delay info is not available */ int GetReplicationApplyDelay(void) { WalRcvData *walrcv = WalRcv; XLogRecPtr receivePtr; XLogRecPtr replayPtr; long secs; int usecs; TimestampTz chunkReplayStartTime; SpinLockAcquire(&walrcv->mutex); receivePtr = walrcv->receivedUpto; SpinLockRelease(&walrcv->mutex); replayPtr = GetXLogReplayRecPtr(NULL); if (receivePtr == replayPtr) return 0; chunkReplayStartTime = GetCurrentChunkReplayStartTime(); if (chunkReplayStartTime == 0) return -1; TimestampDifference(chunkReplayStartTime, GetCurrentTimestamp(), &secs, &usecs); return (((int) secs * 1000) + (usecs / 1000)); }
/*
 * dispatchCommand
 *
 * Send query_text (query_text_len bytes) asynchronously over the libpq
 * connection of the segment described by dispatchResult.
 *
 * On a bad connection, or if the send call reports failure, stillRunning
 * is cleared and an ERROR is raised.  On success both stillRunning and
 * hasDispatched are set.  When DEBUG1 logging is enabled, a send that took
 * longer than a millisecond is logged.
 */
static void
dispatchCommand(CdbDispatchResult * dispatchResult,
				const char *query_text,
				int query_text_len)
{
	SegmentDatabaseDescriptor *segdbDesc = dispatchResult->segdbDesc;
	TimestampTz sendStart = 0;
	long		elapsedSecs;
	int			elapsedUsecs;

	/* Only pay for the timestamp when the timing message can be emitted. */
	if (DEBUG1 >= log_min_messages)
		sendStart = GetCurrentTimestamp();

	if (PQisBusy(segdbDesc->conn))
		elog(LOG, "Trying to send to busy connection %s: asyncStatus %d",
			 segdbDesc->whoami, segdbDesc->conn->asyncStatus);

	if (cdbconn_isBadConnection(segdbDesc))
	{
		char	   *libpqMsg = PQerrorMessage(segdbDesc->conn);

		dispatchResult->stillRunning = false;
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
				 errmsg("Connection lost before dispatch to segment %s: %s",
						segdbDesc->whoami,
						libpqMsg ? libpqMsg : "unknown error")));
	}

	/*
	 * Hand the command to libpq without waiting for the result.
	 */
	if (!PQsendGpQuery_shared(segdbDesc->conn, (char *) query_text, query_text_len))
	{
		char	   *libpqMsg = PQerrorMessage(segdbDesc->conn);

		dispatchResult->stillRunning = false;
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
				 errmsg("Command could not be dispatch to segment %s: %s",
						segdbDesc->whoami,
						libpqMsg ? libpqMsg : "unknown error")));
	}

	if (DEBUG1 >= log_min_messages)
	{
		TimestampDifference(sendStart, GetCurrentTimestamp(),
							&elapsedSecs, &elapsedUsecs);

		/* Report only sends slower than one millisecond. */
		if (elapsedSecs != 0 || elapsedUsecs > 1000)
			elog(LOG, "time for PQsendGpQuery_shared %ld.%06d",
				 elapsedSecs, elapsedUsecs);
	}

	/*
	 * Mark the QE as dispatched and still running so that it keeps being
	 * monitored for a lost connection or any other libpq error.
	 */
	dispatchResult->stillRunning = true;
	dispatchResult->hasDispatched = true;

	ELOG_DISPATCHER_DEBUG("Command dispatched to QE (%s)", segdbDesc->whoami);
}
/* * Get next block number or InvalidBlockNumber when we're done. * * Uses linear probing algorithm for picking next block. */ Datum tsm_system_time_nextblock(PG_FUNCTION_ARGS) { TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; sampler->lb = (sampler->lb + sampler->step) % sampler->nblocks; sampler->doneblocks++; /* All blocks have been read, we're done */ if (sampler->doneblocks > sampler->nblocks) PG_RETURN_UINT32(InvalidBlockNumber); /* * Update the estimations for time limit at least 10 times per estimated * number of returned blocks to handle variations in block read speed. */ if (sampler->doneblocks % Max(sampler->estblocks/10, 1) == 0) { TimestampTz now = GetCurrentTimestamp(); long secs; int usecs; int usecs_remaining; int time_per_block; TimestampDifference(sampler->start_time, now, &secs, &usecs); usecs += (int) secs * 1000000; time_per_block = usecs / sampler->doneblocks; /* No time left, end. */ TimestampDifference(now, sampler->end_time, &secs, &usecs); if (secs <= 0 && usecs <= 0) PG_RETURN_UINT32(InvalidBlockNumber); /* Remaining microseconds */ usecs_remaining = usecs + (int) secs * 1000000; /* Recalculate estimated returned number of blocks */ if (time_per_block < usecs_remaining && time_per_block > 0) sampler->estblocks = sampler->time * time_per_block; } PG_RETURN_UINT32(sampler->lb); }
/* * Check for statement timeout. If the timeout time has come, * trigger a query-cancel interrupt; if not, reschedule the SIGALRM * interrupt to occur at the right time. * * Returns true if okay, false if failed to set the interrupt. */ static bool CheckStatementTimeout(void) { TimestampTz now; if (!statement_timeout_active) return true; /* do nothing if not active */ /* QD takes care of timeouts for QE. */ if (Gp_role == GP_ROLE_EXECUTE) return true; now = GetCurrentTimestamp(); if (now >= statement_fin_time) { /* Time to die */ statement_timeout_active = false; cancel_from_timeout = true; elog(LOG,"Issuing cancel signal (SIGINT) to my self (pid = %d) for statement timeout.", MyProcPid); #ifdef HAVE_SETSID /* try to signal whole process group */ kill(-MyProcPid, SIGINT); #endif kill(MyProcPid, SIGINT); } else { /* Not time yet, so (re)schedule the interrupt */ long secs; int usecs; struct itimerval timeval; TimestampDifference(now, statement_fin_time, &secs, &usecs); /* * It's possible that the difference is less than a microsecond; * ensure we don't cancel, rather than set, the interrupt. */ if (secs == 0 && usecs == 0) usecs = 1; MemSet(&timeval, 0, sizeof(struct itimerval)); timeval.it_value.tv_sec = secs; timeval.it_value.tv_usec = usecs; if (setitimer(ITIMER_REAL, &timeval, NULL)) return false; } return true; }
/*
 * dispatchCommand
 *
 * Helper for thread_DispatchCommand: send query_text (query_text_len bytes)
 * asynchronously over the segment's libpq connection.
 *
 * NOTE: this runs in a thread, so the same rule applies as for
 * thread_DispatchCommand: absolutely no elog'ing.  Diagnostics go through
 * write_log() and cdbdisp_appendMessage() instead.
 *
 * If the send fails, the error is recorded on dispatchResult, the
 * connection is closed and stillRunning is cleared.  hasDispatched is set
 * in every case.
 */
static void
dispatchCommand(CdbDispatchResult * dispatchResult,
				const char *query_text,
				int query_text_len)
{
	SegmentDatabaseDescriptor *segdbDesc = dispatchResult->segdbDesc;
	PGconn	   *conn = segdbDesc->conn;
	TimestampTz sendStart = 0;

	/* Only pay for the timestamp when the timing message can be emitted. */
	if (DEBUG1 >= log_min_messages)
		sendStart = GetCurrentTimestamp();

	/*
	 * Hand the command to libpq without waiting for the result.
	 */
	if (!PQsendGpQuery_shared(conn, (char *) query_text, query_text_len))
	{
		char	   *libpqMsg = PQerrorMessage(segdbDesc->conn);

		if (DEBUG3 >= log_min_messages)
			write_log("PQsendMPPQuery_shared error %s %s",
					  segdbDesc->whoami, libpqMsg ? libpqMsg : "");

		/* Record the failure on the dispatch result. */
		cdbdisp_appendMessage(dispatchResult, LOG,
							  ERRCODE_GP_INTERCONNECTION_ERROR,
							  "Command could not be sent to segment db %s; %s",
							  segdbDesc->whoami, libpqMsg ? libpqMsg : "");

		/* Drop the connection and stop tracking this QE as running. */
		PQfinish(conn);
		segdbDesc->conn = NULL;
		dispatchResult->stillRunning = false;
	}

	if (DEBUG1 >= log_min_messages)
	{
		long		elapsedSecs;
		int			elapsedUsecs;

		TimestampDifference(sendStart, GetCurrentTimestamp(),
							&elapsedSecs, &elapsedUsecs);

		/* Report only sends slower than one millisecond. */
		if (elapsedSecs != 0 || elapsedUsecs > 1000)
			write_log("time for PQsendGpQuery_shared %ld.%06d",
					  elapsedSecs, elapsedUsecs);
	}

	/*
	 * Flag the QE as dispatched whether or not the send succeeded, so that
	 * it keeps being monitored for a lost connection or any other error
	 * libpq might have in store for us.
	 */
	dispatchResult->hasDispatched = true;
}
/*
 * calcCpuUsage
 *
 * Append one `"<segindex>":<percent>` entry to str, where <segindex> is
 * GpIdentity.segindex and <percent> is the value returned by
 * ResGroupOps_ConvertCpuUsageToPercent() for the usage delta and the
 * elapsed time between the two samples.
 *
 * str				output buffer to append to
 * usageBegin		CPU usage counter at the first sample
 * timestampBegin	time of the first sample
 * usageEnd			CPU usage counter at the second sample
 * timestampEnd		time of the second sample
 */
static void
calcCpuUsage(StringInfoData *str,
			 int64 usageBegin, TimestampTz timestampBegin,
			 int64 usageEnd, TimestampTz timestampEnd)
{
	int64		duration;
	long		secs;
	int			usecs;
	int64		usage;

	usage = usageEnd - usageBegin;

	TimestampDifference(timestampBegin, timestampEnd, &secs, &usecs);

	/*
	 * Elapsed time in microseconds.  Widen before multiplying: "long" may
	 * be only 32 bits wide, in which case secs * 1000000 would overflow
	 * after ~2147 seconds even though the destination is 64 bits.
	 */
	duration = (int64) secs * 1000000 + usecs;

	appendStringInfo(str, "\"%d\":%.2f",
					 GpIdentity.segindex,
					 ResGroupOps_ConvertCpuUsageToPercent(usage, duration));
}
/*
 * schedule_alarm
 *
 * Arm the SIGALRM interval timer for the nearest pending timeout
 * (active_timeouts[0]).  Does nothing when no timeout is active.
 *
 * "now" is the current time as obtained by the caller, or a close-enough
 * approximation.  Failure to set the timer is reported at FATAL level.
 */
static void
schedule_alarm(TimestampTz now)
{
	struct itimerval timer;
	long		remainSecs;
	int			remainUsecs;

	if (num_active_timeouts <= 0)
		return;

	MemSet(&timer, 0, sizeof(struct itimerval));

	/* How long until the nearest pending timeout is due? */
	TimestampDifference(now, active_timeouts[0]->fin_time,
						&remainSecs, &remainUsecs);

	/*
	 * A zero interval would cancel the timer instead of setting it, so
	 * round a sub-microsecond difference up to one microsecond.
	 */
	if (remainSecs == 0 && remainUsecs == 0)
		remainUsecs = 1;

	timer.it_value.tv_sec = remainSecs;
	timer.it_value.tv_usec = remainUsecs;

	/*
	 * The signal handler has to be enabled before setitimer() is called.
	 * In the opposite order there would be a race: the interrupt could
	 * arrive before alarm_enabled is set, and the handler would then fail
	 * to do anything.
	 *
	 * disable_alarm() does not reset the timer, so an interrupt set
	 * earlier may still fire between enable_alarm() and setitimer().
	 * That is harmless; one of two things happens:
	 *
	 * 1. The handler finds nothing due yet (the nearest timeout is still
	 * in the future).  It re-sets the timer and returns, and we then
	 * overwrite the timer with our own value.  The timer fires slightly
	 * later than intended, but only by the time the handler spent doing
	 * nothing useful.
	 *
	 * 2. The handler runs and removes one or more timeout events.  On its
	 * return the queue is either empty or its front event is later than
	 * the one examined above, so the value we write is too soon (give or
	 * take the handler's run time) and causes a useless interrupt.  The
	 * handler then re-sets the timer and everything still works.
	 *
	 * Both cases are very unlikely (the window is narrow), so it is not
	 * worth spending mainline cycles to avoid an occasional wasted
	 * interrupt.
	 */
	enable_alarm();

	/* Finally, program the timer itself. */
	if (setitimer(ITIMER_REAL, &timer, NULL) != 0)
		elog(FATAL, "could not enable SIGALRM timer: %m");
}
/*
 *	lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its relpages and reltuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 *
 *		onerel		the heap relation to vacuum
 *		vacstmt		the VACUUM statement; its options and freeze ages are
 *					consulted here
 *		bstrategy	buffer access strategy, stored in vac_strategy
 *
 *		Besides the work on the relation itself, this sets the file-level
 *		variables elevel and vac_strategy, and has vacuum_set_xid_limits()
 *		fill in OldestXmin and FreezeLimit.  When running in an autovacuum
 *		worker with Log_autovacuum_min_duration >= 0, it may also emit a
 *		LOG message summarizing the run.
 */
void
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
				BufferAccessStrategy bstrategy)
{
	LVRelStats *vacrelstats;
	Relation   *Irel;
	int			nindexes;
	BlockNumber possibly_freeable;
	PGRUsage	ru0;
	TimestampTz starttime = 0;
	long		secs;
	int			usecs;
	double		read_rate,
				write_rate;
	bool		scan_all;
	TransactionId freezeTableLimit;
	BlockNumber new_rel_pages;
	double		new_rel_tuples;
	BlockNumber new_rel_allvisible;
	TransactionId new_frozen_xid;

	/* measure elapsed time iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
	{
		pg_rusage_init(&ru0);
		starttime = GetCurrentTimestamp();
	}

	/* VERBOSE raises the message level used for progress reports */
	if (vacstmt->options & VACOPT_VERBOSE)
		elevel = INFO;
	else
		elevel = DEBUG2;

	vac_strategy = bstrategy;

	vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age,
						  onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit, &freezeTableLimit);
	/* scan every page when relfrozenxid is at or before freezeTableLimit */
	scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
											 freezeTableLimit);

	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

	/* remember the pre-vacuum statistics; they may be reused below */
	vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
	vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples;
	vacrelstats->num_index_scans = 0;

	/* Open all indexes of the relation */
	vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
	vacrelstats->hasindex = (nindexes > 0);

	/* Do the vacuuming */
	lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all);

	/* Done with indexes */
	vac_close_indexes(nindexes, Irel, NoLock);

	/*
	 * Optionally truncate the relation.
	 *
	 * Don't even think about it unless we have a shot at releasing a goodly
	 * number of pages.  Otherwise, the time taken isn't worth it.
	 */
	possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
	if (possibly_freeable > 0 &&
		(possibly_freeable >= REL_TRUNCATE_MINIMUM ||
		 possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION))
		lazy_truncate_heap(onerel, vacrelstats);

	/* Vacuum the Free Space Map */
	FreeSpaceMapVacuum(onerel);

	/*
	 * Update statistics in pg_class.
	 *
	 * A corner case here is that if we scanned no pages at all because every
	 * page is all-visible, we should not update relpages/reltuples, because
	 * we have no new information to contribute.  In particular this keeps
	 * us from replacing relpages=reltuples=0 (which means "unknown tuple
	 * density") with nonzero relpages and reltuples=0 (which means "zero
	 * tuple density") unless there's some actual evidence for the latter.
	 *
	 * We do update relallvisible even in the corner case, since if the
	 * table is all-visible we'd definitely like to know that.  But clamp
	 * the value to be not more than what we're setting relpages to.
	 *
	 * Also, don't change relfrozenxid if we skipped any pages, since then
	 * we don't know for certain that all tuples have a newer xmin.
	 */
	new_rel_pages = vacrelstats->rel_pages;
	new_rel_tuples = vacrelstats->new_rel_tuples;
	if (vacrelstats->scanned_pages == 0 && new_rel_pages > 0)
	{
		/* corner case described above: keep the old statistics */
		new_rel_pages = vacrelstats->old_rel_pages;
		new_rel_tuples = vacrelstats->old_rel_tuples;
	}

	new_rel_allvisible = visibilitymap_count(onerel);
	if (new_rel_allvisible > new_rel_pages)
		new_rel_allvisible = new_rel_pages;

	new_frozen_xid = FreezeLimit;
	if (vacrelstats->scanned_pages < vacrelstats->rel_pages)
		new_frozen_xid = InvalidTransactionId;

	vac_update_relstats(onerel,
						new_rel_pages,
						new_rel_tuples,
						new_rel_allvisible,
						vacrelstats->hasindex,
						new_frozen_xid);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel),
						 onerel->rd_rel->relisshared,
						 new_rel_tuples);

	/* and log the action if appropriate */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
	{
		TimestampTz endtime = GetCurrentTimestamp();

		/* a setting of 0 logs every run; otherwise only sufficiently long ones */
		if (Log_autovacuum_min_duration == 0 ||
			TimestampDifferenceExceeds(starttime, endtime,
									   Log_autovacuum_min_duration))
		{
			TimestampDifference(starttime, endtime, &secs, &usecs);

			/* rates stay at 0 when no time elapsed, avoiding division by zero */
			read_rate = 0;
			write_rate = 0;
			if ((secs > 0) || (usecs > 0))
			{
				/* blocks -> MiB, divided by elapsed seconds */
				read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
					(secs + usecs / 1000000.0);
				write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
					(secs + usecs / 1000000.0);
			}
			ereport(LOG,
					(errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
							"pages: %d removed, %d remain\n"
							"tuples: %.0f removed, %.0f remain\n"
							"buffer usage: %d hits, %d misses, %d dirtied\n"
							"avg read rate: %.3f MiB/s, avg write rate: %.3f MiB/s\n"
							"system usage: %s",
							get_database_name(MyDatabaseId),
							get_namespace_name(RelationGetNamespace(onerel)),
							RelationGetRelationName(onerel),
							vacrelstats->num_index_scans,
							vacrelstats->pages_removed,
							vacrelstats->rel_pages,
							vacrelstats->tuples_deleted,
							vacrelstats->new_rel_tuples,
							VacuumPageHit,
							VacuumPageMiss,
							VacuumPageDirty,
							read_rate,write_rate,
							pg_rusage_show(&ru0))));
		}
	}
}
/*
 * Main loop of walsender process
 *
 * Each iteration resets the latch, handles postmaster death, SIGHUP and
 * shutdown requests, processes client replies, sends and flushes WAL, and
 * then sleeps on the latch/socket when there is nothing more to send right
 * now or output is still pending.
 *
 * The loop is left only on a flush failure or on replication timeout; both
 * paths end in proc_exit(0).  The trailing "return 1" only keeps the
 * compiler quiet.
 */
static int
WalSndLoop(void)
{
	char	   *output_message;
	bool		caughtup = false;

	/*
	 * Allocate buffer that will be used for each output message.  We do this
	 * just once to reduce palloc overhead.  The buffer must be made large
	 * enough for maximum-sized messages.
	 */
	output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);

	/*
	 * Allocate buffer that will be used for processing reply messages.  As
	 * above, do this just once to reduce palloc overhead.
	 */
	initStringInfo(&reply_message);

	/* Initialize the last reply timestamp */
	last_reply_timestamp = GetCurrentTimestamp();

	/* Loop forever, unless we get an error */
	for (;;)
	{
		/* Clear any already-pending wakeups */
		ResetLatch(&MyWalSnd->latch);

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive())
			exit(1);

		/* Process any requests or signals received recently */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
			SyncRepInitConfig();
		}

		/* Normal exit from the walsender is here */
		if (walsender_shutdown_requested)
		{
			/* Inform the standby that XLOG streaming is done */
			pq_puttextmessage('C', "COPY 0");
			pq_flush();

			proc_exit(0);
		}

		/* Check for input from the client */
		ProcessRepliesIfAny();

		/*
		 * If we don't have any pending data in the output buffer, try to send
		 * some more.  If there is some, we don't bother to call XLogSend
		 * again until we've flushed it ... but we'd better assume we are not
		 * caught up.
		 */
		if (!pq_is_send_pending())
			XLogSend(output_message, &caughtup);
		else
			caughtup = false;

		/* Try to flush pending output to the client */
		if (pq_flush_if_writable() != 0)
			break;

		/* If nothing remains to be sent right now ... */
		if (caughtup && !pq_is_send_pending())
		{
			/*
			 * If we're in catchup state, move to streaming.  This is an
			 * important state change for users to know about, since before
			 * this point data loss might occur if the primary dies and we
			 * need to failover to the standby. The state change is also
			 * important for synchronous replication, since commits that
			 * started to wait at that point might wait for some time.
			 */
			if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
			{
				ereport(DEBUG1,
						(errmsg("standby \"%s\" has now caught up with primary",
								application_name)));
				WalSndSetState(WALSNDSTATE_STREAMING);
			}

			/*
			 * When SIGUSR2 arrives, we send any outstanding logs up to the
			 * shutdown checkpoint record (i.e., the latest record) and exit.
			 * This may be a normal termination at shutdown, or a promotion,
			 * the walsender is not sure which.
			 */
			if (walsender_ready_to_stop)
			{
				/* ... let's just be real sure we're caught up ... */
				XLogSend(output_message, &caughtup);
				if (caughtup && !pq_is_send_pending())
				{
					/* the shutdown branch at the top of the loop does the exit */
					walsender_shutdown_requested = true;
					continue;	/* don't want to wait more */
				}
			}
		}

		/*
		 * We don't block if not caught up, unless there is unsent data
		 * pending in which case we'd better block until the socket is
		 * write-ready.  This test is only needed for the case where XLogSend
		 * loaded a subset of the available data but then pq_flush_if_writable
		 * flushed it all --- we should immediately try to send more.
		 */
		if (caughtup || pq_is_send_pending())
		{
			TimestampTz finish_time = 0;
			long		sleeptime = -1; /* passed unchanged when replication_timeout is disabled */
			int			wakeEvents;

			wakeEvents = WL_LATCH_SET | WL_POSTMASTER_DEATH |
				WL_SOCKET_READABLE;
			/* also wake up when the socket can take the pending output */
			if (pq_is_send_pending())
				wakeEvents |= WL_SOCKET_WRITEABLE;

			/* Determine time until replication timeout */
			if (replication_timeout > 0)
			{
				long		secs;
				int			usecs;

				finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
														  replication_timeout);
				TimestampDifference(GetCurrentTimestamp(),
									finish_time, &secs, &usecs);
				/* convert the remaining time to milliseconds */
				sleeptime = secs * 1000 + usecs / 1000;
				/* Avoid Assert in WaitLatchOrSocket if timeout is past */
				if (sleeptime < 0)
					sleeptime = 0;
				wakeEvents |= WL_TIMEOUT;
			}

			/* Sleep until something happens or replication timeout */
			WaitLatchOrSocket(&MyWalSnd->latch, wakeEvents,
							  MyProcPort->sock, sleeptime);

			/*
			 * Check for replication timeout.  Note we ignore the corner case
			 * possibility that the client replied just as we reached the
			 * timeout ... he's supposed to reply *before* that.
			 */
			if (replication_timeout > 0 &&
				GetCurrentTimestamp() >= finish_time)
			{
				/*
				 * Since typically expiration of replication timeout means
				 * communication problem, we don't send the error message to
				 * the standby.
				 */
				ereport(COMMERROR,
						(errmsg("terminating walsender process due to replication timeout")));
				break;
			}
		}
	}

	/*
	 * Get here on send failure.  Clean up and exit.
	 *
	 * Reset whereToSendOutput to prevent ereport from attempting to send any
	 * more messages to the standby.
	 */
	if (whereToSendOutput == DestRemote)
		whereToSendOutput = DestNone;

	proc_exit(0);
	return 1;					/* keep the compiler quiet */
}
/*
 * ProcSleep -- put a process to sleep on the specified lock
 *
 * Caller must have set MyProc->heldLocks to reflect locks already held
 * on the lockable object by this process (under all XIDs).
 *
 * The lock table's partition lock must be held at entry, and will be held
 * at exit.
 *
 * Parameters:
 *	locallock		the local lock entry being waited for; supplies the lock
 *					mode, the shared LOCK and PROCLOCK, and the hash code
 *	lockMethodTable	lock method whose conflictTab is consulted
 *
 * Result: STATUS_OK if we acquired the lock, STATUS_ERROR if not (deadlock).
 *
 * ASSUME: that no one will fiddle with the queue until after
 *		we release the partition lock.
 *
 * NOTES: The process queue is now a priority queue for locking.
 */
int
ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
{
	LOCKMODE	lockmode = locallock->tag.mode;
	LOCK	   *lock = locallock->lock;
	PROCLOCK   *proclock = locallock->proclock;
	uint32		hashcode = locallock->hashcode;
	LWLock	   *partitionLock = LockHashPartitionLock(hashcode);
	PROC_QUEUE *waitQueue = &(lock->waitProcs);
	LOCKMASK	myHeldLocks = MyProc->heldLocks;
	bool		early_deadlock = false;
	bool		allow_autovacuum_cancel = true;
	int			myWaitStatus;
	PGPROC	   *proc;
	int			i;

	/*
	 * Determine where to add myself in the wait queue.
	 *
	 * Normally I should go at the end of the queue.  However, if I already
	 * hold locks that conflict with the request of any previous waiter, put
	 * myself in the queue just in front of the first such waiter. This is not
	 * a necessary step, since deadlock detection would move me to before that
	 * waiter anyway; but it's relatively cheap to detect such a conflict
	 * immediately, and avoid delaying till deadlock timeout.
	 *
	 * Special case: if I find I should go in front of some waiter, check to
	 * see if I conflict with already-held locks or the requests before that
	 * waiter.  If not, then just grant myself the requested lock immediately.
	 * This is the same as the test for immediate grant in LockAcquire, except
	 * we are only considering the part of the wait queue before my insertion
	 * point.
	 */
	if (myHeldLocks != 0)
	{
		/* union of the lock modes requested by waiters examined so far */
		LOCKMASK	aheadRequests = 0;

		proc = (PGPROC *) waitQueue->links.next;
		for (i = 0; i < waitQueue->size; i++)
		{
			/* Must he wait for me? */
			if (lockMethodTable->conflictTab[proc->waitLockMode] & myHeldLocks)
			{
				/* Must I wait for him ? */
				if (lockMethodTable->conflictTab[lockmode] & proc->heldLocks)
				{
					/*
					 * Yes, so we have a deadlock.  Easiest way to clean up
					 * correctly is to call RemoveFromWaitQueue(), but we
					 * can't do that until we are *on* the wait queue. So, set
					 * a flag to check below, and break out of loop.  Also,
					 * record deadlock info for later message.
					 */
					RememberSimpleDeadLock(MyProc, lockmode, lock, proc);
					early_deadlock = true;
					break;
				}
				/* I must go before this waiter.  Check special case. */
				if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 &&
					LockCheckConflicts(lockMethodTable,
									   lockmode,
									   lock,
									   proclock) == STATUS_OK)
				{
					/* Skip the wait and just grant myself the lock. */
					GrantLock(lock, proclock, lockmode);
					GrantAwaitedLock();
					return STATUS_OK;
				}
				/* Break out of loop to put myself before him */
				break;
			}
			/* Nope, so advance to next waiter */
			aheadRequests |= LOCKBIT_ON(proc->waitLockMode);
			proc = (PGPROC *) proc->links.next;
		}

		/*
		 * If we fall out of loop normally, proc points to waitQueue head, so
		 * we will insert at tail of queue as desired.
		 */
	}
	else
	{
		/* I hold no locks, so I can't push in front of anyone. */
		proc = (PGPROC *) &(waitQueue->links);
	}

	/*
	 * Insert self into queue, ahead of the given proc (or at tail of queue).
	 */
	SHMQueueInsertBefore(&(proc->links), &(MyProc->links));
	waitQueue->size++;

	lock->waitMask |= LOCKBIT_ON(lockmode);

	/* Set up wait information in PGPROC object, too */
	MyProc->waitLock = lock;
	MyProc->waitProcLock = proclock;
	MyProc->waitLockMode = lockmode;

	MyProc->waitStatus = STATUS_WAITING;

	/*
	 * If we detected deadlock, give up without waiting.  This must agree with
	 * CheckDeadLock's recovery code, except that we shouldn't release the
	 * semaphore since we haven't tried to lock it yet.
	 */
	if (early_deadlock)
	{
		RemoveFromWaitQueue(MyProc, hashcode);
		return STATUS_ERROR;
	}

	/* mark that we are waiting for a lock */
	lockAwaited = locallock;

	/*
	 * Release the lock table's partition lock.
	 *
	 * NOTE: this may also cause us to exit critical-section state, possibly
	 * allowing a cancel/die interrupt to be accepted. This is OK because we
	 * have recorded the fact that we are waiting for a lock, and so
	 * LockErrorCleanup will clean up if cancel/die happens.
	 */
	LWLockRelease(partitionLock);

	/*
	 * Also, now that we will successfully clean up after an ereport, it's
	 * safe to check to see if there's a buffer pin deadlock against the
	 * Startup process.  Of course, that's only necessary if we're doing Hot
	 * Standby and are not the Startup process ourselves.
	 */
	if (RecoveryInProgress() && !InRecovery)
		CheckRecoveryConflictDeadlock();

	/* Reset deadlock_state before enabling the timeout handler */
	deadlock_state = DS_NOT_YET_CHECKED;
	got_deadlock_timeout = false;

	/*
	 * Set timer so we can wake up after awhile and check for a deadlock. If a
	 * deadlock is detected, the handler releases the process's semaphore and
	 * sets MyProc->waitStatus = STATUS_ERROR, allowing us to know that we
	 * must report failure rather than success.
	 *
	 * By delaying the check until we've waited for a bit, we can avoid
	 * running the rather expensive deadlock-check code in most cases.
	 *
	 * If LockTimeout is set, also enable the timeout for that.  We can save a
	 * few cycles by enabling both timeout sources in one call.
	 */
	if (LockTimeout > 0)
	{
		EnableTimeoutParams timeouts[2];

		timeouts[0].id = DEADLOCK_TIMEOUT;
		timeouts[0].type = TMPARAM_AFTER;
		timeouts[0].delay_ms = DeadlockTimeout;
		timeouts[1].id = LOCK_TIMEOUT;
		timeouts[1].type = TMPARAM_AFTER;
		timeouts[1].delay_ms = LockTimeout;
		enable_timeouts(timeouts, 2);
	}
	else
		enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout);

	/*
	 * If somebody wakes us between LWLockRelease and WaitLatch, the latch
	 * will not wait. But a set latch does not necessarily mean that the lock
	 * is free now, as there are many other sources for latch sets than
	 * somebody releasing the lock.
	 *
	 * We process interrupts whenever the latch has been set, so cancel/die
	 * interrupts are processed quickly. This means we must not mind losing
	 * control to a cancel/die interrupt here.  We don't, because we have no
	 * shared-state-change work to do after being granted the lock (the
	 * grantor did it all).  We do have to worry about canceling the deadlock
	 * timeout and updating the locallock table, but if we lose control to an
	 * error, LockErrorCleanup will fix that up.
	 */
	do
	{
		WaitLatch(MyLatch, WL_LATCH_SET, 0);
		ResetLatch(MyLatch);
		/* check for deadlocks first, as that's probably log-worthy */
		if (got_deadlock_timeout)
		{
			CheckDeadLock();
			got_deadlock_timeout = false;
		}
		CHECK_FOR_INTERRUPTS();

		/*
		 * waitStatus could change from STATUS_WAITING to something else
		 * asynchronously.  Read it just once per loop to prevent surprising
		 * behavior (such as missing log messages).
		 */
		myWaitStatus = *((volatile int *) &MyProc->waitStatus);

		/*
		 * If we are not deadlocked, but are waiting on an autovacuum-induced
		 * task, send a signal to interrupt it.
		 */
		if (deadlock_state == DS_BLOCKED_BY_AUTOVACUUM && allow_autovacuum_cancel)
		{
			PGPROC	   *autovac = GetBlockingAutoVacuumPgproc();
			PGXACT	   *autovac_pgxact = &ProcGlobal->allPgXact[autovac->pgprocno];

			LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

			/*
			 * Only do it if the worker is not working to protect against Xid
			 * wraparound.
			 */
			if ((autovac_pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) &&
				!(autovac_pgxact->vacuumFlags & PROC_VACUUM_FOR_WRAPAROUND))
			{
				/* copy the pid while ProcArrayLock is still held */
				int			pid = autovac->pid;
				StringInfoData locktagbuf;
				StringInfoData logbuf;	/* errdetail for server log */

				initStringInfo(&locktagbuf);
				initStringInfo(&logbuf);
				DescribeLockTag(&locktagbuf, &lock->tag);
				appendStringInfo(&logbuf,
								 _("Process %d waits for %s on %s."),
								 MyProcPid,
								 GetLockmodeName(lock->tag.locktag_lockmethodid,
												 lockmode),
								 locktagbuf.data);

				/* release lock as quickly as possible */
				LWLockRelease(ProcArrayLock);

				ereport(LOG,
						(errmsg("sending cancel to blocking autovacuum PID %d",
								pid),
						 errdetail_log("%s", logbuf.data)));

				pfree(logbuf.data);
				pfree(locktagbuf.data);

				/* send the autovacuum worker Back to Old Kent Road */
				if (kill(pid, SIGINT) < 0)
				{
					/* Just a warning to allow multiple callers */
					ereport(WARNING,
							(errmsg("could not send signal to process %d: %m",
									pid)));
				}
			}
			else
				LWLockRelease(ProcArrayLock);

			/* prevent signal from being resent more than once */
			allow_autovacuum_cancel = false;
		}

		/*
		 * If awoken after the deadlock check interrupt has run, and
		 * log_lock_waits is on, then report about the wait.
		 */
		if (log_lock_waits && deadlock_state != DS_NOT_YET_CHECKED)
		{
			StringInfoData buf,
						lock_waiters_sbuf,
						lock_holders_sbuf;
			const char *modename;
			long		secs;
			int			usecs;
			long		msecs;
			SHM_QUEUE  *procLocks;
			/* note: this declaration shadows the function-level proclock */
			PROCLOCK   *proclock;
			bool		first_holder = true,
						first_waiter = true;
			int			lockHoldersNum = 0;

			initStringInfo(&buf);
			initStringInfo(&lock_waiters_sbuf);
			initStringInfo(&lock_holders_sbuf);

			DescribeLockTag(&buf, &locallock->tag.lock);
			modename = GetLockmodeName(locallock->tag.lock.locktag_lockmethodid,
									   lockmode);
			TimestampDifference(get_timeout_start_time(DEADLOCK_TIMEOUT),
								GetCurrentTimestamp(),
								&secs, &usecs);
			/* split into whole milliseconds plus the leftover microseconds */
			msecs = secs * 1000 + usecs / 1000;
			usecs = usecs % 1000;

			/*
			 * we loop over the lock's procLocks to gather a list of all
			 * holders and waiters. Thus we will be able to provide more
			 * detailed information for lock debugging purposes.
			 *
			 * lock->procLocks contains all processes which hold or wait for
			 * this lock.
			 */

			LWLockAcquire(partitionLock, LW_SHARED);

			procLocks = &(lock->procLocks);
			proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
												 offsetof(PROCLOCK, lockLink));

			while (proclock)
			{
				/*
				 * we are a waiter if myProc->waitProcLock == proclock; we are
				 * a holder if it is NULL or something different
				 */
				if (proclock->tag.myProc->waitProcLock == proclock)
				{
					if (first_waiter)
					{
						appendStringInfo(&lock_waiters_sbuf, "%d",
										 proclock->tag.myProc->pid);
						first_waiter = false;
					}
					else
						appendStringInfo(&lock_waiters_sbuf, ", %d",
										 proclock->tag.myProc->pid);
				}
				else
				{
					if (first_holder)
					{
						appendStringInfo(&lock_holders_sbuf, "%d",
										 proclock->tag.myProc->pid);
						first_holder = false;
					}
					else
						appendStringInfo(&lock_holders_sbuf, ", %d",
										 proclock->tag.myProc->pid);

					/* the holder count selects singular vs. plural detail text */
					lockHoldersNum++;
				}

				proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink,
													 offsetof(PROCLOCK, lockLink));
			}

			LWLockRelease(partitionLock);

			if (deadlock_state == DS_SOFT_DEADLOCK)
				ereport(LOG,
						(errmsg("process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms",
								MyProcPid, modename, buf.data, msecs, usecs),
						 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
											   "Processes holding the lock: %s. Wait queue: %s.",
											   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
			else if (deadlock_state == DS_HARD_DEADLOCK)
			{
				/*
				 * This message is a bit redundant with the error that will be
				 * reported subsequently, but in some cases the error report
				 * might not make it to the log (eg, if it's caught by an
				 * exception handler), and we want to ensure all long-wait
				 * events get logged.
				 */
				ereport(LOG,
						(errmsg("process %d detected deadlock while waiting for %s on %s after %ld.%03d ms",
								MyProcPid, modename, buf.data, msecs, usecs),
						 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
											   "Processes holding the lock: %s. Wait queue: %s.",
											   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
			}

			if (myWaitStatus == STATUS_WAITING)
				ereport(LOG,
						(errmsg("process %d still waiting for %s on %s after %ld.%03d ms",
								MyProcPid, modename, buf.data, msecs, usecs),
						 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
											   "Processes holding the lock: %s. Wait queue: %s.",
											   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
			else if (myWaitStatus == STATUS_OK)
				ereport(LOG,
						(errmsg("process %d acquired %s on %s after %ld.%03d ms",
								MyProcPid, modename, buf.data, msecs, usecs)));
			else
			{
				Assert(myWaitStatus == STATUS_ERROR);

				/*
				 * Currently, the deadlock checker always kicks its own
				 * process, which means that we'll only see STATUS_ERROR when
				 * deadlock_state == DS_HARD_DEADLOCK, and there's no need to
				 * print redundant messages.  But for completeness and
				 * future-proofing, print a message if it looks like someone
				 * else kicked us off the lock.
				 */
				if (deadlock_state != DS_HARD_DEADLOCK)
					ereport(LOG,
							(errmsg("process %d failed to acquire %s on %s after %ld.%03d ms",
									MyProcPid, modename, buf.data, msecs, usecs),
							 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
												   "Processes holding the lock: %s. Wait queue: %s.",
												   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
			}

			/*
			 * At this point we might still need to wait for the lock. Reset
			 * state so we don't print the above messages again.
			 */
			deadlock_state = DS_NO_DEADLOCK;

			pfree(buf.data);
			pfree(lock_holders_sbuf.data);
			pfree(lock_waiters_sbuf.data);
		}
	} while (myWaitStatus == STATUS_WAITING);

	/*
	 * Disable the timers, if they are still running.  As in LockErrorCleanup,
	 * we must preserve the LOCK_TIMEOUT indicator flag: if a lock timeout has
	 * already caused QueryCancelPending to become set, we want the cancel to
	 * be reported as a lock timeout, not a user cancel.
	 */
	if (LockTimeout > 0)
	{
		DisableTimeoutParams timeouts[2];

		timeouts[0].id = DEADLOCK_TIMEOUT;
		timeouts[0].keep_indicator = false;
		timeouts[1].id = LOCK_TIMEOUT;
		timeouts[1].keep_indicator = true;
		disable_timeouts(timeouts, 2);
	}
	else
		disable_timeout(DEADLOCK_TIMEOUT, false);

	/*
	 * Re-acquire the lock table's partition lock.  We have to do this to hold
	 * off cancel/die interrupts before we can mess with lockAwaited (else we
	 * might have a missed or duplicated locallock update).
	 */
	LWLockAcquire(partitionLock, LW_EXCLUSIVE);

	/*
	 * We no longer want LockErrorCleanup to do anything.
	 */
	lockAwaited = NULL;

	/*
	 * If we got the lock, be sure to remember it in the locallock table.
	 */
	if (MyProc->waitStatus == STATUS_OK)
		GrantAwaitedLock();

	/*
	 * We don't have to do anything else, because the awaker did all the
	 * necessary update of the lock table and MyProc.
	 */
	return MyProc->waitStatus;
}
/* Main loop of walsender process */ static int WalSndLoop(void) { char *output_message; bool caughtup = false; /* * Allocate buffer that will be used for each output message. We do this * just once to reduce palloc overhead. The buffer must be made large * enough for maximum-sized messages. */ output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE); /* * Allocate buffer that will be used for processing reply messages. As * above, do this just once to reduce palloc overhead. */ initStringInfo(&reply_message); /* Initialize the last reply timestamp */ last_reply_timestamp = GetCurrentTimestamp(); /* Loop forever, unless we get an error */ for (;;) { /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive(true)) exit(1); /* Process any requests or signals received recently */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); SyncRepInitConfig(); } /* Normal exit from the walsender is here */ if (walsender_shutdown_requested) { /* Inform the standby that XLOG streaming was done */ pq_puttextmessage('C', "COPY 0"); pq_flush(); proc_exit(0); } /* * If we don't have any pending data in the output buffer, try to send * some more. */ if (!pq_is_send_pending()) { XLogSend(output_message, &caughtup); /* * Even if we wrote all the WAL that was available when we started * sending, more might have arrived while we were sending this * batch. We had the latch set while sending, so we have not * received any signals from that time. Let's arm the latch again, * and after that check that we're still up-to-date. */ if (caughtup && !pq_is_send_pending()) { ResetLatch(&MyWalSnd->latch); XLogSend(output_message, &caughtup); } } /* Flush pending output to the client */ if (pq_flush_if_writable() != 0) break; /* * When SIGUSR2 arrives, we send any outstanding logs up to the * shutdown checkpoint record (i.e., the latest record) and exit. 
*/ if (walsender_ready_to_stop && !pq_is_send_pending()) { XLogSend(output_message, &caughtup); ProcessRepliesIfAny(); if (caughtup && !pq_is_send_pending()) walsender_shutdown_requested = true; } if ((caughtup || pq_is_send_pending()) && !got_SIGHUP && !walsender_shutdown_requested) { TimestampTz finish_time = 0; long sleeptime; /* Reschedule replication timeout */ if (replication_timeout > 0) { long secs; int usecs; finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp, replication_timeout); TimestampDifference(GetCurrentTimestamp(), finish_time, &secs, &usecs); sleeptime = secs * 1000 + usecs / 1000; if (WalSndDelay < sleeptime) sleeptime = WalSndDelay; } else { /* * XXX: Without timeout, we don't really need the periodic * wakeups anymore, WaitLatchOrSocket should reliably wake up * as soon as something interesting happens. */ sleeptime = WalSndDelay; } /* Sleep */ WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock, true, pq_is_send_pending(), sleeptime); /* Check for replication timeout */ if (replication_timeout > 0 && GetCurrentTimestamp() >= finish_time) { /* * Since typically expiration of replication timeout means * communication problem, we don't send the error message to * the standby. */ ereport(COMMERROR, (errmsg("terminating walsender process due to replication timeout"))); break; } } /* * If we're in catchup state, see if its time to move to streaming. * This is an important state change for users, since before this * point data loss might occur if the primary dies and we need to * failover to the standby. The state change is also important for * synchronous replication, since commits that started to wait at that * point might wait for some time. */ if (MyWalSnd->state == WALSNDSTATE_CATCHUP && caughtup) { ereport(DEBUG1, (errmsg("standby \"%s\" has now caught up with primary", application_name))); WalSndSetState(WALSNDSTATE_STREAMING); } ProcessRepliesIfAny(); } /* * Get here on send failure. Clean up and exit. 
* * Reset whereToSendOutput to prevent ereport from attempting to send any * more messages to the standby. */ if (whereToSendOutput == DestRemote) whereToSendOutput = DestNone; proc_exit(0); return 1; /* keep the compiler quiet */ }
/*
 * auto_stats
 *
 * Decides whether to run an automatic ANALYZE on a relation after a command
 * modified it, and issues the ANALYZE if so.  The decision is driven by the
 * autostats policy: gp_autostats_mode_in_functions when inFunction is true,
 * gp_autostats_mode otherwise.  The following modes are currently supported:
 *   none        : no automatic analyzes are issued. simply return.
 *   on_change   : if the number of modified tuples > gp_onchange_threshold,
 *                 then an automatic analyze is issued.
 *   on_no_stats : if the operation is a ctas/insert-select and there are no
 *                 stats on the modified table, an automatic analyze is issued.
 *
 * Parameters:
 *   cmdType     - kind of command that modified the relation
 *   relationOid - relation that was modified
 *   ntuples     - number of tuples the command modified
 *   inFunction  - selects which of the two policy settings applies
 *
 * Nothing is done unless Gp_role is GP_ROLE_DISPATCH, relationOid is valid
 * and rel_is_partitioned() is false for it.
 */
void
auto_stats(AutoStatsCmdType cmdType, Oid relationOid, uint64 ntuples, bool inFunction)
{
	TimestampTz start;
	bool		policyCheck = false;
	GpAutoStatsModeValue actual_gp_autostats_mode;
	const char *autostats_mode;

	/* Taken up front so that the reported duration covers the policy check. */
	start = GetCurrentTimestamp();

	if (Gp_role != GP_ROLE_DISPATCH || relationOid == InvalidOid || rel_is_partitioned(relationOid))
	{
		return;
	}

	Assert(relationOid != InvalidOid);
	Assert(cmdType >= 0 && cmdType <= AUTOSTATS_CMDTYPE_SENTINEL);		/* it is a valid command
																		 * as per auto-stats */

	/*
	 * Pick the policy that applies and, together with it, the matching
	 * display string.  Both log messages below must name the mode that was
	 * actually consulted, not always the top-level one.
	 */
	if (inFunction)
	{
		actual_gp_autostats_mode = gp_autostats_mode_in_functions;
		autostats_mode = gpvars_show_gp_autostats_mode_in_functions();
	}
	else
	{
		actual_gp_autostats_mode = gp_autostats_mode;
		autostats_mode = gpvars_show_gp_autostats_mode();
	}

	switch (actual_gp_autostats_mode)
	{
		case GP_AUTOSTATS_ON_CHANGE:
			policyCheck = autostats_on_change_check(cmdType, ntuples);
			break;
		case GP_AUTOSTATS_ON_NO_STATS:
			policyCheck = autostats_on_no_stats_check(cmdType, relationOid);
			break;
		default:
			Assert(actual_gp_autostats_mode == GP_AUTOSTATS_NONE);
			policyCheck = false;
			break;
	}

	if (!policyCheck)
	{
		/* Oids are unsigned, hence %u rather than %d. */
		elog(DEBUG3, "In mode %s, command %s on (dboid,tableoid)=(%u,%u) modifying " UINT64_FORMAT " tuples did not issue Auto-ANALYZE.",
			 autostats_mode,
			 autostats_cmdtype_to_string(cmdType),
			 MyDatabaseId,
			 relationOid,
			 ntuples);

		return;
	}

	if (log_autostats)
	{
		elog(LOG, "In mode %s, command %s on (dboid,tableoid)=(%u,%u) modifying " UINT64_FORMAT " tuples caused Auto-ANALYZE.",
			 autostats_mode,
			 autostats_cmdtype_to_string(cmdType),
			 MyDatabaseId,
			 relationOid,
			 ntuples);
	}

	autostats_issue_analyze(relationOid);

	if (log_duration)
	{
		long		secs;
		int			usecs;
		int			msecs;

		TimestampDifference(start, GetCurrentTimestamp(), &secs, &usecs);
		msecs = usecs / 1000;
		/* whole milliseconds, then the sub-millisecond remainder as 3 digits */
		elog(LOG, "duration: %ld.%03d ms Auto-ANALYZE", secs * 1000 + msecs, usecs % 1000);
	}
}
/* * Perform garbage collection (if required) of file * @param map_path path to file map file (*.map). */ static bool cfs_gc_file(char* map_path) { int md = open(map_path, O_RDWR|PG_BINARY, 0); FileMap* map; uint32 physSize; uint32 usedSize; uint32 virtSize; int suf = strlen(map_path)-4; int fd = -1, fd2 = -1, md2 = -1; bool succeed = true; if (md < 0) { elog(LOG, "Failed to open map file %s: %m", map_path); return false; } map = cfs_mmap(md); if (map == MAP_FAILED) { elog(LOG, "Failed to map file %s: %m", map_path); close(md); return false; } usedSize = pg_atomic_read_u32(&map->usedSize); physSize = pg_atomic_read_u32(&map->physSize); virtSize = pg_atomic_read_u32(&map->virtSize); if ((physSize - usedSize)*100 > physSize*cfs_gc_threshold) /* do we need to perform defragmentation? */ { long delay = CFS_LOCK_MIN_TIMEOUT; char* file_path = (char*)palloc(suf+1); char* map_bck_path = (char*)palloc(suf+10); char* file_bck_path = (char*)palloc(suf+5); FileMap* newMap = (FileMap*)palloc0(sizeof(FileMap)); uint32 newSize = 0; inode_t** inodes = (inode_t**)palloc(RELSEG_SIZE*sizeof(inode_t*)); bool remove_backups = true; int n_pages = virtSize / BLCKSZ; TimestampTz startTime, endTime; long secs; int usecs; int i; startTime = GetCurrentTimestamp(); memcpy(file_path, map_path, suf); file_path[suf] = '\0'; strcat(strcpy(map_bck_path, map_path), ".bck"); strcat(strcpy(file_bck_path, file_path), ".bck"); while (true) { uint32 access_count = 0; if (pg_atomic_compare_exchange_u32(&map->lock, &access_count, CFS_GC_LOCK)) { break; } if (access_count >= CFS_GC_LOCK) { /* Uhhh... looks like last GC was interrupted. 
* Try to recover file */ if (access(file_bck_path, R_OK) != 0) { /* There is no backup file: new map should be constructed */ md2 = open(map_bck_path, O_RDWR|PG_BINARY, 0); if (md2 >= 0) { /* Recover map */ if (!cfs_read_file(md2, newMap, sizeof(FileMap))) { elog(LOG, "Failed to read file %s: %m", map_bck_path); goto Cleanup; } close(md2); md2 = -1; newSize = pg_atomic_read_u32(&newMap->usedSize); remove_backups = false; goto ReplaceMap; } } else { /* Presence of backup file means that we still have unchanged data and map files. * Just remove backup files, grab lock and continue processing */ unlink(file_bck_path); unlink(map_bck_path); break; } } pg_usleep(delay); if (delay < CFS_LOCK_MAX_TIMEOUT) { delay *= 2; } } md2 = open(map_bck_path, O_CREAT|O_RDWR|PG_BINARY|O_TRUNC, 0600); if (md2 < 0) { goto Cleanup; } for (i = 0; i < n_pages; i++) { newMap->inodes[i] = map->inodes[i]; inodes[i] = &newMap->inodes[i]; } /* sort inodes by offset to improve read locality */ qsort(inodes, n_pages, sizeof(inode_t*), cfs_cmp_page_offs); fd = open(file_path, O_RDWR|PG_BINARY, 0); if (fd < 0) { goto Cleanup; } fd2 = open(file_bck_path, O_CREAT|O_RDWR|PG_BINARY|O_TRUNC, 0600); if (fd2 < 0) { goto Cleanup; } for (i = 0; i < n_pages; i++) { int size = CFS_INODE_SIZE(*inodes[i]); if (size != 0) { char block[BLCKSZ]; off_t rc PG_USED_FOR_ASSERTS_ONLY; uint32 offs = CFS_INODE_OFFS(*inodes[i]); Assert(size <= BLCKSZ); rc = lseek(fd, offs, SEEK_SET); Assert(rc == offs); if (!cfs_read_file(fd, block, size)) { elog(LOG, "Failed to read file %s: %m", file_path); goto Cleanup; } if (!cfs_write_file(fd2, block, size)) { elog(LOG, "Failed to write file %s: %m", file_bck_path); goto Cleanup; } offs = newSize; newSize += size; *inodes[i] = CFS_INODE(size, offs); } } pg_atomic_write_u32(&map->usedSize, newSize); if (close(fd) < 0) { elog(LOG, "Failed to close file %s: %m", file_path); goto Cleanup; } fd = -1; /* Persist copy of data file */ if (pg_fsync(fd2) < 0) { elog(LOG, "Failed to sync file 
%s: %m", file_bck_path); goto Cleanup; } if (close(fd2) < 0) { elog(LOG, "Failed to close file %s: %m", file_bck_path); goto Cleanup; } fd2 = -1; /* Persist copy of map file */ if (!cfs_write_file(md2, &newMap, sizeof(newMap))) { elog(LOG, "Failed to write file %s: %m", map_bck_path); goto Cleanup; } if (pg_fsync(md2) < 0) { elog(LOG, "Failed to sync file %s: %m", map_bck_path); goto Cleanup; } if (close(md2) < 0) { elog(LOG, "Failed to close file %s: %m", map_bck_path); goto Cleanup; } md2 = -1; /* Persist map with CFS_GC_LOCK set: in case of crash we will know that map may be changed by GC */ if (cfs_msync(map) < 0) { elog(LOG, "Failed to sync map %s: %m", map_path); goto Cleanup; } if (pg_fsync(md) < 0) { elog(LOG, "Failed to sync file %s: %m", map_path); goto Cleanup; } /* * Now all information necessary for recovery is stored. * We are ready to replace existed file with defragmented one. * Use rename and rely on file system to provide atomicity of this operation. */ remove_backups = false; if (rename(file_bck_path, file_path) < 0) { elog(LOG, "Failed to rename file %s: %m", file_path); goto Cleanup; } ReplaceMap: /* At this moment defragmented file version is stored. We can perfrom in-place update of map. 
* If crash happens at this point, map can be recovered from backup file */ memcpy(map->inodes, newMap->inodes, n_pages * sizeof(inode_t)); pg_atomic_write_u32(&map->usedSize, newSize); pg_atomic_write_u32(&map->physSize, newSize); map->generation += 1; /* force all backends to reopen the file */ /* Before removing backup files and releasing locks we need to flush updated map file */ if (cfs_msync(map) < 0) { elog(LOG, "Failed to sync map %s: %m", map_path); goto Cleanup; } if (pg_fsync(md) < 0) { elog(LOG, "Failed to sync file %s: %m", map_path); Cleanup: if (fd >= 0) close(fd); if (fd2 >= 0) close(fd2); if (md2 >= 0) close(md2); if (remove_backups) { unlink(file_bck_path); unlink(map_bck_path); remove_backups = false; } succeed = false; } else { remove_backups = true; /* now backups are not need any more */ } pg_atomic_fetch_sub_u32(&map->lock, CFS_GC_LOCK); /* release lock */ /* remove map backup file */ if (remove_backups && unlink(map_bck_path)) { elog(LOG, "Failed to unlink file %s: %m", map_bck_path); succeed = false; } endTime = GetCurrentTimestamp(); TimestampDifference(startTime, endTime, &secs, &usecs); elog(LOG, "%d: defragment file %s: old size %d, new size %d, logical size %d, used %d, compression ratio %f, time %ld usec", MyProcPid, file_path, physSize, newSize, virtSize, usedSize, (double)virtSize/newSize, secs*USECS_PER_SEC + usecs); pfree(file_path); pfree(file_bck_path); pfree(map_bck_path); pfree(inodes); pfree(newMap); if (cfs_gc_delay != 0) { int rc = WaitLatch(MyLatch, WL_TIMEOUT | WL_POSTMASTER_DEATH, cfs_gc_delay /* ms */ ); if (rc & WL_POSTMASTER_DEATH) { exit(1); } } } else if (cfs_state->max_iterations == 1) { elog(LOG, "%d: file %.*s: physical size %d, logical size %d, used %d, compression ratio %f", MyProcPid, suf, map_path, physSize, virtSize, usedSize, (double)virtSize/physSize); } if (cfs_munmap(map) < 0) { elog(LOG, "Failed to unmap file %s: %m", map_path); succeed = false; } if (close(md) < 0) { elog(LOG, "Failed to close file %s: 
%m", map_path); succeed = false; } return succeed; }
/* * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup() * to resolve conflicts with other backends holding buffer pins. * * We either resolve conflicts immediately or set a SIGALRM to wake us at * the limit of our patience. The sleep in LockBufferForCleanup() is * performed here, for code clarity. * * Resolve conflict by sending a SIGUSR1 reason to all backends to check if * they hold one of the buffer pins that is blocking Startup process. If so, * backends will take an appropriate error action, ERROR or FATAL. * * We also check for deadlocks before we wait, though applications that cause * these will be extremely rare. Deadlocks occur because if queries * wait on a lock, that must be behind an AccessExclusiveLock, which can only * be cleared if the Startup process replays a transaction completion record. * If Startup process is also waiting then that is a deadlock. The deadlock * can occur if the query is waiting and then the Startup sleeps, or if * Startup is sleeping and the query waits on a lock. We protect against * only the former sequence here, the latter sequence is checked prior to * the query sleeping, in CheckRecoveryConflictDeadlock(). */ void ResolveRecoveryConflictWithBufferPin(void) { bool sig_alarm_enabled = false; Assert(InHotStandby); if (MaxStandbyDelay == 0) { /* * We don't want to wait, so just tell everybody holding the pin to * get out of town. */ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); } else if (MaxStandbyDelay < 0) { /* * Send out a request to check for buffer pin deadlocks before we * wait. This is fairly cheap, so no need to wait for deadlock timeout * before trying to send it out. */ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); } else { TimestampTz then = GetLatestXLogTime(); TimestampTz now = GetCurrentTimestamp(); /* Are we past max_standby_delay? 
*/ if (TimestampDifferenceExceeds(then, now, MaxStandbyDelay)) { /* * We're already behind, so clear a path as quickly as possible. */ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); } else { TimestampTz fin_time; /* Expected wake-up time by timer */ long timer_delay_secs; /* Amount of time we set timer * for */ int timer_delay_usecs; /* * Send out a request to check for buffer pin deadlocks before we * wait. This is fairly cheap, so no need to wait for deadlock * timeout before trying to send it out. */ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); /* * How much longer we should wait? */ fin_time = TimestampTzPlusMilliseconds(then, MaxStandbyDelay); TimestampDifference(now, fin_time, &timer_delay_secs, &timer_delay_usecs); /* * It's possible that the difference is less than a microsecond; * ensure we don't cancel, rather than set, the interrupt. */ if (timer_delay_secs == 0 && timer_delay_usecs == 0) timer_delay_usecs = 1; if (enable_standby_sig_alarm(timer_delay_secs, timer_delay_usecs, fin_time)) sig_alarm_enabled = true; else elog(FATAL, "could not set timer for process wakeup"); } } /* Wait to be signaled by UnpinBuffer() */ ProcWaitForSignal(); if (sig_alarm_enabled) { if (!disable_standby_sig_alarm()) elog(FATAL, "could not disable timer for process wakeup"); } }