/*
 * Insert-or-update of a single entry in a Local MDVSN component.
 *
 * The hashtable slot for entry->key is created when missing; in both cases
 * the slot is then overwritten with a byte copy of *entry.
 *
 * 	local_mdvsn: the Local MDVSN component to modify. Subtransactions each
 * 		have their own component.
 * 	entry: the mdver_entry to store. The hashtable keeps its own copy.
 * 	local: true if the message was generated locally (coming from CVQ),
 * 		false if it is coming from the global queue SVQ. Not consulted
 * 		by this function.
 */
void
mdver_local_mdvsn_add(mdver_local_mdvsn *local_mdvsn, mdver_entry *entry, bool local)
{
	Assert(NULL != local_mdvsn);
	Assert(NULL != local_mdvsn->htable);
	Assert(NULL != entry);

	bool		already_present = false;
	mdver_entry *slot = (mdver_entry *) hash_search(local_mdvsn->htable,
													&entry->key,
													HASH_ENTER,
													&already_present);

	Assert(NULL != slot);

#ifdef MD_VERSIONING_INSTRUMENTATION
	if (!already_present)
	{
		ereport(gp_mdversioning_loglevel,
				(errmsg("local_mdvsn_add: inserting entry Oid=%d [" UINT64_FORMAT ", " UINT64_FORMAT "]",
						slot->key,
						entry->ddl_version, entry->dml_version),
				 errprintstack(false)));
	}
	else
	{
		/* Log the versions being replaced next to the incoming ones */
		ereport(gp_mdversioning_loglevel,
				(errmsg("local_mdvsn_add: updating entry Oid=%d [" UINT64_FORMAT ", " UINT64_FORMAT "] --> [" UINT64_FORMAT ", " UINT64_FORMAT "]",
						slot->key,
						slot->ddl_version, slot->dml_version,
						entry->ddl_version, entry->dml_version),
				 errprintstack(false)));
	}
#endif

	/* Overwrite the slot (new or existing) with the caller's entry */
	memcpy(slot, entry, sizeof(*slot));
}
/**
 * Resolve a varlena datum to a pointer to its payload bytes and their length,
 * fetching external data and decompressing when the header says so.
 *
 * 	d: the datum; a NULL pointer raises an ERROR.
 * 	datastart: out, start of the payload bytes. For short-header values
 * 		this pointer may be unaligned.
 * 	len: out, payload length in bytes.
 * 	tofree: out, a palloc'd buffer the caller should pfree once done with
 * 		the payload, or NULL when nothing was allocated here.
 *
 * If this function is changed then update varattrib_untoast_len as well
 */
void
varattrib_untoast_ptr_len(Datum d, char **datastart, int *len, void **tofree)
{
	if (DatumGetPointer(d) == NULL)
	{
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg(" Unable to detoast datum "),
				 errprintstack(true)));
	}

	varattrib  *attr = (varattrib *) DatumGetPointer(d);

	/* -1 marks "length not determined yet" until one of the cases fills it */
	*len = -1;
	*tofree = NULL;

	/* Decided once on the incoming header, before any fetch replaces attr */
	bool		was_extended = VARATT_IS_EXTENDED(attr);

	if (was_extended && VARATT_IS_EXTERNAL(attr))
	{
		/* The fetched copy is palloc'd, so remember it for release */
		attr = (varattrib *) toast_fetch_datum((struct varlena *) attr);
		*tofree = attr;
	}

	if (was_extended)
	{
		if (VARATT_IS_COMPRESSED(attr))
		{
			PGLZ_Header *packed = (PGLZ_Header *) attr;

			attr = (varattrib *) palloc(PGLZ_RAW_SIZE(packed) + VARHDRSZ);
			SET_VARSIZE(attr, PGLZ_RAW_SIZE(packed) + VARHDRSZ);
			pglz_decompress(packed, VARDATA(attr));

			/*
			 * A non-NULL *tofree here is the buffer obtained from
			 * toast_fetch_datum; it is no longer needed once decompressed.
			 */
			if (*tofree)
				pfree(*tofree);
			*tofree = attr;
		}
		else if (VARATT_IS_SHORT(attr))
		{
			/* Warning! Return unaligned pointer! */
			*datastart = VARDATA_SHORT(attr);
			*len = VARSIZE_SHORT(attr) - VARHDRSZ_SHORT;
			attr = NULL;
		}
	}

	/* Regular header: plain values, fetched values, decompressed values */
	if (*len == -1)
	{
		*datastart = VARDATA(attr);
		*len = VARSIZE(attr) - VARHDRSZ;
	}

	Assert(*len >= 0);
}
/**
 * Compute the payload length, in bytes, of a varlena datum as it would be
 * after fetching external data and decompressing. No decompression is
 * performed; for compressed values the raw size is read from the header.
 *
 * 	d: the datum; a NULL pointer raises an ERROR.
 *
 * Returns the payload length.
 *
 * If this function is changed then update varattrib_untoast_ptr_len as well
 */
int
varattrib_untoast_len(Datum d)
{
	if (DatumGetPointer(d) == NULL)
	{
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg(" Unable to detoast datum "),
				 errprintstack(true)));
	}

	varattrib  *attr = (varattrib *) DatumGetPointer(d);
	void	   *fetched = NULL;
	int			result = -1;	/* -1 means "not determined yet" */

	/* Decided once on the incoming header, before any fetch replaces attr */
	bool		was_extended = VARATT_IS_EXTENDED(attr);

	if (was_extended && VARATT_IS_EXTERNAL(attr))
	{
		/* The fetched copy is palloc'd; it is released before returning */
		attr = (varattrib *) toast_fetch_datum((struct varlena *) attr);
		fetched = attr;
	}

	if (was_extended)
	{
		if (VARATT_IS_COMPRESSED(attr))
		{
			PGLZ_Header *packed = (PGLZ_Header *) attr;

			result = PGLZ_RAW_SIZE(packed);
		}
		else if (VARATT_IS_SHORT(attr))
		{
			result = VARSIZE_SHORT(attr) - VARHDRSZ_SHORT;
		}
	}

	/* Regular header */
	if (result == -1)
	{
		result = VARSIZE(attr) - VARHDRSZ;
	}

	if (fetched)
		pfree(fetched);

	Assert(result >= 0);
	return result;
}
/* * Starts a runaway cleanup by triggering an ERROR if the VMEM tracker is active * and a commit is not already in progress. Otherwise, it marks the process as clean */ void RunawayCleaner_StartCleanup() { /* * Cleanup can be attempted from multiple places, such as before deactivating * a process (if a pending runaway event) or periodically from CHECK_FOR_INTERRUPTS * (indirectly via RedZoneHandler_DetectRunaway). We don't carry multiple cleanup * for a single runaway event. Every time we *start* a cleanup process, we set the * beginCleanupRunawayVersion to the runaway version for which we started cleaning * up. Later on, if we reenter this method (e.g., another CHECK_FOR_INTERRUPTS() * during cleanup), we can observe that the cleanup already started from this runaway * event, and therefore we skip duplicate cleanup */ if (RunawayCleaner_ShouldStartRunawayCleanup()) { Assert(beginCleanupRunawayVersion < *latestRunawayVersion); Assert(endCleanupRunawayVersion < *latestRunawayVersion); /* We don't want to cleanup multiple times for same runaway event */ beginCleanupRunawayVersion = *latestRunawayVersion; if (CritSectionCount == 0 && InterruptHoldoffCount == 0 && vmemTrackerInited && gp_command_count > 0 /* Cleaning up QEs that are not executing a valid command may cause the QD to get stuck [MPP-24950] */ && /* Super user is terminated only when it's the primary runaway consumer (i.e., the top consumer) */ (!superuser() || MySessionState->runawayStatus == RunawayStatus_PrimaryRunawaySession)) { #ifdef FAULT_INJECTOR FaultInjector_InjectFaultIfSet( RunawayCleanup, DDLNotSpecified, "", // databaseName ""); // tableName #endif ereport(ERROR, (errmsg("Canceling query because of high VMEM usage. 
Used: %dMB, available %dMB, red zone: %dMB", VmemTracker_ConvertVmemChunksToMB(MySessionState->sessionVmem), VmemTracker_GetAvailableVmemMB(), RedZoneHandler_GetRedZoneLimitMB()), errprintstack(true))); } /* * If we cannot error out because of a critical section or because we are a super user * or for some other reason (such as the QE is not running any valid command, i.e., * gp_command_count is not positive) simply declare this process as clean */ RunawayCleaner_RunawayCleanupDoneForProcess(true /* ignoredCleanup */); } }
/*
 * increment_command_count
 *	  Increment gp_command_count. If the new command count is 0 or a negative
 *	  number, reset it to 1.
 *
 * The next value is computed with unsigned arithmetic so that incrementing
 * the largest positive count never performs a signed overflow (undefined
 * behavior in C); that case simply restarts the counter at 1. When
 * gp_cancel_query_print_log is set, the transition is logged with the value
 * that is actually stored.
 */
void
increment_command_count(void)
{
	/* Largest value representable as a positive int, without <limits.h> */
	const unsigned int max_positive = ~0U >> 1;

	/* Unsigned addition wraps by definition, so this is always well-defined */
	unsigned int bumped = (unsigned int) gp_command_count + 1U;

	/*
	 * bumped == 0 means the old count was -1; bumped > max_positive means the
	 * result would be negative as an int (old count was negative or was the
	 * maximum int). Both cases restart at 1.
	 */
	int			next_count = 1;

	if (bumped != 0 && bumped <= max_positive)
	{
		next_count = (int) bumped;
	}

	if (gp_cancel_query_print_log)
	{
		ereport(LOG,
				(errmsg("Incrementing command count from %d to %d",
						gp_command_count, next_count),
				 errprintstack(true)));
	}

	gp_command_count = next_count;
}
/*
 * Version request handling once a NUKE event has been seen.
 *
 * When a backend asks for the most recent version of an object, the Local
 * MDVSN cache has no version for it, and a NUKE event has been encountered in
 * the current transaction, a brand-new version pair is generated and handed
 * back. A versioning event describing the new version is produced as well.
 *
 * 	key: The key of the looked-up object
 * 	ddl_version: out, receives the newly generated ddl version
 * 	dml_version: out, receives the newly generated dml version
 */
static void
mdver_request_after_nuke(Oid key, uint64 *ddl_version, uint64 *dml_version)
{
	Assert(NULL != ddl_version);
	Assert(NULL != dml_version);

	/* Draw two fresh global versions, ddl first */
	*ddl_version = mdver_next_global_version();
	*dml_version = mdver_next_global_version();

	mdver_event *ev = (mdver_event *) palloc0(sizeof(mdver_event));

	ev->key = key;
	ev->new_ddl_version = *ddl_version;
	ev->new_dml_version = *dml_version;

#ifdef MD_VERSIONING_INSTRUMENTATION
	/* Add my current process id as the originating backend pid */
	ev->backend_pid = MyProcPid;
#endif

	/*
	 * The "old" versions come from the Global MDVSN when it knows the key,
	 * and stay invalid otherwise.
	 */
	mdver_entry *global_entry = mdver_glob_mdvsn_find(key);

	ev->old_ddl_version = (NULL != global_entry) ? global_entry->ddl_version : INVALID_MD_VERSION;
	ev->old_dml_version = (NULL != global_entry) ? global_entry->dml_version : INVALID_MD_VERSION;

	CacheAddVersioningEvent(ev);

#ifdef MD_VERSIONING_INSTRUMENTATION
	char	   *ev_text = mdver_event_str(ev);

	ereport(gp_mdversioning_loglevel,
			(errmsg("mdver_consume_after_nuke: generated new VE %s",
					ev_text),
			 errprintstack(false)));
	pfree(ev_text);
#endif

	/* A copy of the event was queued above, so our own copy can go */
	pfree(ev);
}
/*
 * Find the entry for a key in one specific Local MDVSN component. Parent
 * Local MDVSN components are not consulted.
 *
 * 	local_mdvsn: the component to search. Subtransactions each have their
 * 		own Local MDVSN component.
 * 	key: The key of the object to look for
 *
 * Returns the entry if found, NULL otherwise. The outcome of every lookup is
 * reported at gp_mdversioning_loglevel.
 */
mdver_entry *
mdver_local_mdvsn_find(mdver_local_mdvsn *local_mdvsn, Oid key)
{
	Assert(NULL != local_mdvsn);
	Assert(InvalidOid != key);

	mdver_entry *hit = (mdver_entry *) hash_search(local_mdvsn->htable,
												   &key,
												   HASH_FIND,
												   NULL /* foundPtr */ );

	const char *found_text = (NULL != hit) ? "yes" : "no";
	const char *nuke_text = local_mdvsn->nuke_happened ? "yes" : "no";

	ereport(gp_mdversioning_loglevel,
			(errmsg("LocalMDVSN find for key=%d. Found=%s, nukeHappened=%s",
					key, found_text, nuke_text),
			 errprintstack(true)));

	return hit;
}
/* * Workfile-manager specific function to clean up before releasing a * workfile set from the cache. * */ static void workfile_mgr_cleanup_set(const void *resource) { workfile_set *work_set = (workfile_set *) resource; ereport(gp_workfile_caching_loglevel, (errmsg("workfile mgr cleanup deleting set: key=0x%0xd, size=" INT64_FORMAT " in_progress_size=" INT64_FORMAT " path=%s", work_set->key, work_set->size, work_set->in_progress_size, work_set->path), errprintstack(true))); workfile_mgr_delete_set_directory(work_set->path); /* * The most accurate size of a workset is recorded in work_set->in_progress_size. * work_set->size is only updated when we close a file, so it lags behind */ Assert(work_set->in_progress_size >= work_set->size); int64 size_to_delete = work_set->in_progress_size; elog(gp_workfile_caching_loglevel, "Subtracting " INT64_FORMAT " from workfile diskspace", size_to_delete); /* * When subtracting the size of this workset from our accounting, * only update the per-query counter if we created the workset. * In that case, the state is ACQUIRED, otherwise is CACHED or DELETED */ CacheEntry *cacheEntry = CACHE_ENTRY_HEADER(resource); bool update_query_space = (cacheEntry->state == CACHE_ENTRY_ACQUIRED); WorkfileDiskspace_Commit(0, size_to_delete, update_query_space); }
/* * Workfile-manager specific function to clean up before releasing a * workfile set from the cache. * */ static void workfile_mgr_cleanup_set(const void *resource) { workfile_set *work_set = (workfile_set *) resource; /* * We have to make this callback function return cleanly ALL the * time. It shouldn't throw an exception. * We must try to clean up as much as we can in the callback, and * then never be called again. * This means holding interrupts, catching and handling all exceptions. */ if (work_set->on_disk) { ereport(gp_workfile_caching_loglevel, (errmsg("workfile mgr cleanup deleting set: key=0x%0xd, size=" INT64_FORMAT " in_progress_size=" INT64_FORMAT " path=%s", work_set->key, work_set->size, work_set->in_progress_size, work_set->path), errprintstack(true))); Assert(NULL == work_set->set_plan); PG_TRY(); { #ifdef FAULT_INJECTOR FaultInjector_InjectFaultIfSet( WorkfileCleanupSet, DDLNotSpecified, "", /* databaseName */ "" /* tableName */ ); #endif /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); workfile_mgr_delete_set_directory(work_set->path); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); } PG_CATCH(); { elog(LOG, "Cleaning up workfile set directory path=%s failed. Proceeding", work_set->path); /* We're not re-throwing the error. Otherwise we'll end up having * to clean up again, probably failing again. */ } PG_END_TRY(); /* * The most accurate size of a workset is recorded in work_set->in_progress_size. * work_set->size is only updated when we close a file, so it lags behind */ Assert(work_set->in_progress_size >= work_set->size); int64 size_to_delete = work_set->in_progress_size; elog(gp_workfile_caching_loglevel, "Subtracting " INT64_FORMAT " from workfile diskspace", size_to_delete); /* * When subtracting the size of this workset from our accounting, * only update the per-query counter if we created the workset. 
* In that case, the state is ACQUIRED, otherwise is CACHED or DELETED */ CacheEntry *cacheEntry = CACHE_ENTRY_HEADER(resource); bool update_query_space = (cacheEntry->state == CACHE_ENTRY_ACQUIRED); WorkfileDiskspace_Commit(0, size_to_delete, update_query_space); } else { /* Non-physical workfile set, we need to free up the plan memory */ if (NULL != work_set->set_plan->serialized_plan) { pfree(work_set->set_plan->serialized_plan); } if (NULL != work_set->set_plan) { pfree(work_set->set_plan); } } }
/* * Grabs one entry in the sessionStateArray for current session. * If the current session already has an entry, it just returns the * pointer to the previously grabbed entry. */ static SessionState* SessionState_Acquire(int sessionId) { LWLockAcquire(SessionStateLock, LW_EXCLUSIVE); SessionState *cur = AllSessionStateEntries->usedList; while (cur != NULL && cur->sessionId != sessionId) { Assert(INVALID_SESSION_ID != cur->sessionId); cur = cur->next; } if (NULL == cur && NULL == AllSessionStateEntries->freeList) { LWLockRelease(SessionStateLock); ereport(FATAL, (errcode(ERRCODE_TOO_MANY_CONNECTIONS), errmsg("Too many sessions."), errdetail("Could not acquire resources for additional sessions."), errhint("Disconnect some sessions and try again."))); } SessionState *acquired = cur; /* * Nothing was acquired for this session from any other processes. Therefore, * acquire a new entry, and reset its properties. */ if (NULL == acquired) { acquired = AllSessionStateEntries->freeList; Assert(INVALID_SESSION_ID == acquired->sessionId && acquired->runawayStatus == RunawayStatus_NotRunaway && 0 == acquired->pinCount && CLEANUP_COUNTDOWN_BEFORE_RUNAWAY == acquired->cleanupCountdown && 0 == acquired->activeProcessCount && 0 == acquired->sessionVmem && 0 == acquired->spinLock && 0 == acquired->sessionVmemRunaway && 0 == acquired->commandCountRunaway && !acquired->isModifiedSessionId); AllSessionStateEntries->freeList = acquired->next; acquired->next = AllSessionStateEntries->usedList; AllSessionStateEntries->usedList = acquired; AllSessionStateEntries->numSession++; Assert(AllSessionStateEntries->numSession <= AllSessionStateEntries->maxSession); acquired->sessionId = sessionId; acquired->runawayStatus = RunawayStatus_NotRunaway; acquired->sessionVmemRunaway = 0; acquired->commandCountRunaway = 0; acquired->pinCount = 0; acquired->sessionVmem = 0; acquired->cleanupCountdown = CLEANUP_COUNTDOWN_BEFORE_RUNAWAY; acquired->activeProcessCount = 0; acquired->idle_start = 0; 
acquired->resGroupSlot = NULL; #ifdef USE_ASSERT_CHECKING acquired->isModifiedSessionId = false; #endif /* * Make sure that the lock is reset to released. Note: this doesn't * have a matching SpinLockAcquire. We are just resetting the lock * as part of initialization */ SpinLockRelease(&acquired->spinLock); } Assert(NULL != acquired); int pinCount = pg_atomic_add_fetch_u32((pg_atomic_uint32 *) &acquired->pinCount, 1); ereport(gp_sessionstate_loglevel, (errmsg("SessionState_Acquire: pinCount: %d, activeProcessCount: %d", pinCount, acquired->activeProcessCount), errprintstack(true))); LWLockRelease(SessionStateLock); return acquired; }
/*
 * Drop one pin from a SessionState entry. When the last pin goes away, the
 * entry is reset and moved from the used list back to the free list so that
 * it can be reused.
 *
 * Does nothing when the session state was never initialized. Otherwise the
 * work is done under SessionStateLock held exclusively.
 */
static void
SessionState_Release(SessionState *acquired)
{
	if (!sessionStateInited)
	{
		Assert(NULL == acquired);
		return;
	}

	Assert(NULL != acquired);
	Assert(0 < acquired->pinCount);
	Assert(acquired->sessionId == gp_session_id || acquired->isModifiedSessionId);

	LWLockAcquire(SessionStateLock, LW_EXCLUSIVE);

	Assert(!isProcessActive);
	Assert(acquired->activeProcessCount < acquired->pinCount);

	int			pinCount = pg_atomic_sub_fetch_u32((pg_atomic_uint32 *) &acquired->pinCount, 1);

	ereport(gp_sessionstate_loglevel,
			(errmsg("SessionState_Release: pinCount: %d, activeProcessCount: %d",
					pinCount, acquired->activeProcessCount),
			 errprintstack(true)));

	/* Before this point the process should have been deactivated */
	Assert(acquired->activeProcessCount <= acquired->pinCount);
	Assert(0 <= acquired->pinCount);

	if (0 != acquired->pinCount)
	{
		/* Still pinned by someone else; nothing more to do */
		LWLockRelease(SessionStateLock);
		return;
	}

	/* Last pin released: finish any runaway cleanup, then wipe the entry */
	RunawayCleaner_RunawayCleanupDoneForSession();

	acquired->sessionId = INVALID_SESSION_ID;

	Assert(acquired->runawayStatus == RunawayStatus_NotRunaway);
	Assert(CLEANUP_COUNTDOWN_BEFORE_RUNAWAY == acquired->cleanupCountdown);
	Assert(0 == acquired->activeProcessCount);

	acquired->activeProcessCount = 0;
	acquired->sessionVmem = 0;
	acquired->runawayStatus = RunawayStatus_NotRunaway;
	acquired->sessionVmemRunaway = 0;
	acquired->commandCountRunaway = 0;
	acquired->cleanupCountdown = CLEANUP_COUNTDOWN_BEFORE_RUNAWAY;
	acquired->idle_start = 0;
	acquired->resGroupSlot = NULL;

#ifdef USE_ASSERT_CHECKING
	acquired->isModifiedSessionId = false;
#endif

	/* Locate the entry in the used list, remembering its predecessor */
	SessionState *before = NULL;
	SessionState *node;

	for (node = AllSessionStateEntries->usedList;
		 node != NULL && node != acquired;
		 node = node->next)
	{
		before = node;
	}

	Assert(node == acquired);

	if (NULL != before)
	{
		before->next = node->next;
	}
	else
	{
		/* The entry is at the head of the used list */
		Assert(AllSessionStateEntries->usedList == acquired);
		AllSessionStateEntries->usedList = acquired->next;
	}

	/* Push the entry onto the free list */
	acquired->next = AllSessionStateEntries->freeList;
	AllSessionStateEntries->freeList = acquired;

	AllSessionStateEntries->numSession--;
	Assert(AllSessionStateEntries->numSession >= 0);

	LWLockRelease(SessionStateLock);
}
/*
 * Entry point for the Local Versioning Event Handler. This gets called
 * for every message that is executed locally at a backend.
 *
 * Depending on the event, the current Local MDVSN is nuked, gets the new
 * version added/updated (locally generated events), or is reconciled with
 * the event (remote events for keys it already tracks). Remote events for
 * keys it does not track are ignored.
 */
extern void
mdver_localhandler_new_event(SharedInvalidationMessage *msg)
{
	Assert(NULL != msg);
	Assert(SHAREDVERSIONINGMSG_ID == msg->id);

#ifdef MD_VERSIONING_INSTRUMENTATION
	char	   *event_text = mdver_event_str(&msg->ve.verEvent);

	ereport(gp_mdversioning_loglevel,
			(errmsg("LocalExecuteVE: got %s event %s",
					msg->ve.local ? "LOCAL" : "REMOTE",
					event_text),
			 errprintstack(false)));
	pfree(event_text);
#endif

	/*
	 * There are some cases where we don't have a transInvalInfo structure,
	 * and thus we don't have a Local MDVSN. For example:
	 *  - an auxiliary process (fts prober comes to mind) that queries
	 *    catalog tables directly using heap functions (no transaction)
	 *  - updating persistent tables during transaction commit
	 *    (transInvalInfo has already been reset).
	 *  - bootstrap
	 *
	 * In other cases, we simply don't have a Local MDVSN since we don't
	 * cache versions:
	 *  - a QE process running on the master or segments will have a
	 *    syscache, but not a Metadata Version cache
	 *
	 * In those cases we don't care about versioning, so skip adding to
	 * local MDVSN.
	 */
	mdver_local_mdvsn *local_mdvsn = GetCurrentLocalMDVSN();

	if (NULL == local_mdvsn)
	{
		return;
	}

	mdver_event *event = &msg->ve.verEvent;

	if (mdver_is_nuke_event(event))
	{
		elog(gp_mdversioning_loglevel, "Local VE Handler: Received NUKE event");
		mdver_local_mdvsn_nuke(local_mdvsn);
		return;
	}

	if (!msg->ve.local)
	{
		/*
		 * An event coming from the global queue (GVQ). Only objects we
		 * already track locally are of interest.
		 *
		 * TODO gcaragea 5/27/2014: For subtransactions, keep all messages
		 * (MPP-22935)
		 */
		mdver_entry *tracked = mdver_local_mdvsn_find(local_mdvsn, event->key);

		if (NULL == tracked)
		{
			elog(gp_mdversioning_loglevel,
				 "Local VE Handler: Ignoring remote event for object not of interest key=%d",
				 event->key);
			return;
		}

		/*
		 * A VE came from SVQ for a key that we already have locally. Need
		 * to reconcile and record.
		 */
		mdver_localhandler_reconcile(event, tracked);
		return;
	}

	/*
	 * Locally generated event, we must add or update the version in the
	 * Local MDVSN.
	 */
	mdver_entry entry;

	entry.key = event->key;

	/* FIXME gcaragea 7/4/2014: Can we assert anything here? */
	entry.ddl_version = event->new_ddl_version;
	entry.dml_version = event->new_dml_version;

	mdver_local_mdvsn_add(local_mdvsn, &entry, msg->ve.local);
}
void ProcNewMppSessionId(int *newSessionId) { Assert(newSessionId != NULL); *newSessionId = MyProc->mppSessionId = gp_atomic_add_32(&ProcGlobal->mppLocalProcessCounter, 1); /* * Make sure that our SessionState entry correctly records our * new session id. */ if (NULL != MySessionState) { /* This should not happen outside of dispatcher on the master */ Assert(GpIdentity.segindex == MASTER_CONTENT_ID && Gp_role == GP_ROLE_DISPATCH); ereport(gp_sessionstate_loglevel, (errmsg("ProcNewMppSessionId: changing session id (old: %d, new: %d), pinCount: %d, activeProcessCount: %d", MySessionState->sessionId, *newSessionId, MySessionState->pinCount, MySessionState->activeProcessCount), errprintstack(true))); #ifdef USE_ASSERT_CHECKING MySessionState->isModifiedSessionId = true; #endif MySessionState->sessionId = *newSessionId; } }