/* Returns true if the current process should start a runaway cleanup */
static bool
RunawayCleaner_ShouldStartRunawayCleanup()
{
    if (NULL != MySessionState && MySessionState->runawayStatus != RunawayStatus_NotRunaway &&
        beginCleanupRunawayVersion != *latestRunawayVersion)
    {
        AssertImply(isProcessActive, activationVersion >= deactivationVersion);
        AssertImply(!isProcessActive, deactivationVersion >= activationVersion);

        /*
         * We are marked as runaway. Therefore, if the runaway event happened before deactivation,
         * we must have a version counter increment
         */
        AssertImply(*latestRunawayVersion < deactivationVersion && !isProcessActive,
                    activationVersion < deactivationVersion);

        if (isProcessActive && *latestRunawayVersion > activationVersion)
        {
            /* Active process and the runaway event came after the activation */
            return true;
        }
        else if (!isProcessActive && *latestRunawayVersion < deactivationVersion &&
                 *latestRunawayVersion > activationVersion)
        {
            /*
             * The process is deactivated, but there is a pending runaway event before
             * the deactivation for which this process never cleaned up
             */
            return true;
        }
    }

    return false;
}
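/*
 * Illustrative sketch, not part of the original source: a cleanup entry point
 * would typically consult RunawayCleaner_ShouldStartRunawayCleanup() before
 * doing any work and then record which runaway event it is cleaning up for.
 * The function name below is hypothetical; in the real code the caller is
 * RunawayCleaner_StartCleanup().
 */
static void
RunawayCleaner_StartCleanupSketch(void)
{
    if (!RunawayCleaner_ShouldStartRunawayCleanup())
    {
        /* Not marked runaway, or this runaway event was already handled */
        return;
    }

    /* Remember the runaway event we are about to clean up for */
    beginCleanupRunawayVersion = *latestRunawayVersion;

    /* ...actual vmem cleanup (e.g., elog(ERROR, ...)) would follow here... */
}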
/* ----------
 * toast_delete -
 *
 *  Cascaded delete toast-entries on DELETE
 * ----------
 */
void
toast_delete(Relation rel, HeapTuple oldtup, MemTupleBinding *pbind)
{
    TupleDesc   tupleDesc;
    Form_pg_attribute *att;
    int         numAttrs;
    int         i;
    Datum       toast_values[MaxHeapAttributeNumber];
    bool        toast_isnull[MaxHeapAttributeNumber];

    bool        ismemtuple = is_heaptuple_memtuple(oldtup);

    AssertImply(ismemtuple, pbind);
    AssertImply(!ismemtuple, !pbind);

    /*
     * We should only ever be called for tuples of plain relations ---
     * recursing on a toast rel is bad news.
     */
    Assert(rel->rd_rel->relkind == RELKIND_RELATION);

    /*
     * Get the tuple descriptor and break down the tuple into fields.
     *
     * NOTE: it's debatable whether to use heap_deform_tuple() here or just
     * heap_getattr() only the varlena columns. The latter could win if there
     * are few varlena columns and many non-varlena ones. However,
     * heap_deform_tuple costs only O(N) while the heap_getattr way would cost
     * O(N^2) if there are many varlena columns, so it seems better to err on
     * the side of linear cost. (We won't even be here unless there's at
     * least one varlena column, by the way.)
     */
    tupleDesc = rel->rd_att;
    att = tupleDesc->attrs;
    numAttrs = tupleDesc->natts;

    Assert(numAttrs <= MaxHeapAttributeNumber);

    if (ismemtuple)
        memtuple_deform((MemTuple) oldtup, pbind, toast_values, toast_isnull);
    else
        heap_deform_tuple(oldtup, tupleDesc, toast_values, toast_isnull);

    /*
     * Check for external stored attributes and delete them from the secondary
     * relation.
     */
    for (i = 0; i < numAttrs; i++)
    {
        if (att[i]->attlen == -1)
        {
            Datum       value = toast_values[i];

            if (!toast_isnull[i] && VARATT_IS_EXTERNAL_D(value))
                toast_delete_datum(rel, value);
        }
    }
}
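/*
 * Illustrative sketch, not part of the original source: toast_delete() is
 * invoked from the heap DELETE path once the old tuple has been fetched.
 * For a plain HeapTuple (not a MemTuple) the binding argument must be NULL,
 * per the AssertImply checks above. The wrapper name is hypothetical.
 */
static void
delete_tuple_sketch(Relation rel, HeapTuple oldtup)
{
    /* Only plain heap relations are expected by toast_delete() */
    if (rel->rd_rel->relkind == RELKIND_RELATION)
        toast_delete(rel, oldtup, NULL /* pbind: NULL for plain heap tuples */);
}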
/*
 * Looks up an entry with a given key in the hashtable.
 * Returns pointer to the entry if found, NULL otherwise.
 *
 * This function is synchronized. Returned entry is AddRef'ed and needs to
 * be released.
 */
void *
SyncHTLookup(SyncHT *syncHT, void *key)
{
    Assert(NULL != syncHT);
    Assert(NULL != key);

    LWLockId    partitionLock = SyncHTPartLockId(syncHT, key);

    LWLockAcquire(partitionLock, LW_SHARED);

    bool        existing = false;
    void       *entry = hash_search(syncHT->ht, key, HASH_FIND, &existing);

    AssertImply(entry != NULL, existing);

    /* AddRef the entry if found */
    if (entry != NULL)
    {
        SyncHTAddRef(syncHT, entry);
    }

    LWLockRelease(partitionLock);

    return entry;
}
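/*
 * Illustrative sketch, not part of the original source: because SyncHTLookup()
 * returns an AddRef'ed entry, every successful lookup must be paired with a
 * release. The release function is assumed to be SyncHTRelease(); the caller
 * shape below is hypothetical.
 */
static void
lookup_and_release_sketch(SyncHT *syncHT, void *key)
{
    void       *entry = SyncHTLookup(syncHT, key);

    if (entry != NULL)
    {
        /* ...read fields of the entry here, under the reference we hold... */

        /* Drop the reference taken by SyncHTLookup() */
        SyncHTRelease(syncHT, entry);
    }
}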
/*
 * Get the next value. On success, a non-negative value is returned and *out is populated with the value
 * that was on the top of the heap.
 *
 * If this is an array-backed heap then *out is inserted into the heap. If it's a reader-backed heap then
 * *out is ignored on input.
 */
int
mkheap_putAndGet(MKHeap *mkheap, MKEntry *out)
{
    int         ret = 0;

    Assert(out);

    /*
     * Fetch from the appropriate source.
     *
     * Note that these two cases don't behave the same in terms of how *out is treated.
     * mkheap_putAndGet_reader should be called mkheap_get_reader -- it never puts the input value.
     * mkheap_putAndGet_impl will put *out if it's not empty, and then do the get.
     */
    if (mkheap->nreader > 0)
        ret = mkheap_putAndGet_reader(mkheap, out);
    else
        ret = mkheap_putAndGet_impl(mkheap, out);

    /* check: underlying call must have enforced uniqueness */
    AssertImply(mkheap->mkctxt->enforceUnique, ret != 0);

    /* free *out */
    if (mkheap->mkctxt->cpfr)
        (mkheap->mkctxt->cpfr)(out, NULL, mkheap->mkctxt->lvctxt + mke_get_lv(out));

    return ret;
}
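/*
 * Illustrative sketch, not part of the original source: draining a multi-key
 * heap with mkheap_putAndGet(). It assumes the documented convention that a
 * negative return value means the heap is exhausted. process_entry() is a
 * hypothetical consumer, and zeroing the input entry is only a stand-in for
 * however the real code marks an entry as "empty" before the first call.
 */
static void
drain_heap_sketch(MKHeap *mkheap)
{
    MKEntry     e;

    /* Start from a zeroed entry so that nothing is put into the heap
     * (the real code may use an explicit "set empty" helper instead). */
    memset(&e, 0, sizeof(e));

    while (mkheap_putAndGet(mkheap, &e) >= 0)
    {
        /* e now holds what was on top of the heap */
        process_entry(&e);      /* hypothetical consumer */
    }
}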
/*
 * Close temporary files and delete their underlying files.
 *
 * isProcExit: if true, this is being called as the backend process is
 * exiting. If that's the case, we should remove all temporary files; if
 * that's not the case, we are being called for transaction commit/abort
 * and should only remove transaction-local temp files. In either case,
 * also clean up "allocated" stdio files and dirs.
 */
static void
CleanupTempFiles(bool isProcExit)
{
    Index       i;

    if (SizeVfdCache > 0)
    {
        Assert(FileIsNotOpen(0));   /* Make sure ring not corrupted */
        for (i = 1; i < SizeVfdCache; i++)
        {
            unsigned short fdstate = VfdCache[i].fdstate;

            /*
             * If we're in the process of exiting a backend process, close
             * all temporary files. Otherwise, only close temporary files
             * local to the current transaction.
             */
            if ((fdstate & FD_CLOSE_AT_EOXACT) ||
                (isProcExit && (fdstate & FD_TEMPORARY)))
            {
                AssertImply((fdstate & FD_TEMPORARY),
                            VfdCache[i].fileName != NULL);
                FileClose(i);
            }
        }
    }

    workfile_mgr_cleanup();

    while (numAllocatedDescs > 0)
        FreeDesc(&allocatedDescs[0]);
}
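/*
 * Illustrative sketch, not part of the original source: the close/keep
 * decision above depends only on the vfd state bits and the isProcExit flag.
 * This helper merely restates that predicate so the two cases (end of
 * transaction vs. backend exit) are easy to see; it does not exist in the
 * real file.
 */
static bool
should_close_vfd_sketch(unsigned short fdstate, bool isProcExit)
{
    /* Transaction-local temp files are closed at every commit/abort */
    if (fdstate & FD_CLOSE_AT_EOXACT)
        return true;

    /* At backend exit, every temporary file is closed and removed */
    if (isProcExit && (fdstate & FD_TEMPORARY))
        return true;

    return false;
}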
/*
 * Create a new file set
 *   type is the WorkFileType for the files: BUFFILE or BFZ
 *   can_be_reused: if set to false, then we don't insert this set into the cache,
 *     since the caller is telling us there is no point. This can happen for
 *     example when spilling during index creation.
 *   ps is the PlanState for the subtree rooted at the operator
 */
workfile_set *
workfile_mgr_create_set(enum ExecWorkFileType type, bool can_be_reused, PlanState *ps)
{
    Assert(NULL != workfile_mgr_cache);

    Plan       *plan = NULL;

    if (ps != NULL)
    {
        plan = ps->plan;
    }

    AssertImply(can_be_reused, plan != NULL);

    NodeTag     node_type = T_Invalid;

    if (ps != NULL)
    {
        node_type = ps->type;
    }
    char       *dir_path = create_workset_directory(node_type, currentSliceId);

    if (!workfile_sets_resowner_callback_registered)
    {
        RegisterResourceReleaseCallback(workfile_set_free_callback, NULL);
        workfile_sets_resowner_callback_registered = true;
    }

    /* Create parameter info for the populate function */
    workset_info set_info;

    set_info.file_type = type;
    set_info.nodeType = node_type;
    set_info.dir_path = dir_path;
    set_info.session_start_time = GetCurrentTimestamp();
    set_info.operator_work_mem = get_operator_work_mem(ps);

    CacheEntry *newEntry = Cache_AcquireEntry(workfile_mgr_cache, &set_info);

    if (NULL == newEntry)
    {
        /* Clean up the directory we created. */
        workfile_mgr_delete_set_directory(dir_path);

        /* Could not acquire another entry from the cache - we filled it up */
        ereport(ERROR,
                (errmsg("could not create workfile manager entry: exceeded number of concurrent spilling queries")));
    }

    /* Path has now been copied to the workfile_set. We can free it */
    pfree(dir_path);

    /* Complete initialization of the entry with post-acquire actions */
    Assert(NULL != newEntry);

    workfile_set *work_set = CACHE_ENTRY_PAYLOAD(newEntry);

    Assert(work_set != NULL);

    elog(gp_workfile_caching_loglevel,
         "new spill file set. key=0x%x prefix=%s opMemKB=" INT64_FORMAT,
         work_set->key, work_set->path, work_set->metadata.operator_work_mem);

    return work_set;
}
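/*
 * Illustrative sketch, not part of the original source: creating a workfile
 * set for an operator that spills without a PlanState (for example while
 * building an index). Because plan is NULL here, can_be_reused must be
 * false, matching the AssertImply(can_be_reused, plan != NULL) check above.
 */
static workfile_set *
create_standalone_set_sketch(void)
{
    return workfile_mgr_create_set(BUFFILE,
                                   false /* can_be_reused */,
                                   NULL  /* ps: no PlanState available */);
}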
/*
 * Marks the current process as idle; i.e., it is no longer able to respond
 * to a runaway cleanup. However, before it returns from this method, it
 * would trigger one last runaway cleanup for a pre-deactivation era runaway
 * event, if necessary.
 */
void
IdleTracker_DeactivateProcess()
{
    if (NULL != MySessionState)
    {
        /*
         * Verify that deactivation during proc_exit_inprogress is protected in
         * critical section or the interrupt is disabled so that we don't attempt
         * any runaway cleanup
         */
        AssertImply(proc_exit_inprogress,
                    CritSectionCount > 0 || InterruptHoldoffCount > 0);

        /*
         * When an idle process receives a SIGTERM signal, the signal handler
         * die() calls the cleanup directly, so we get here for an idle process.
         * Instead of re-activating it forcefully, just special case it
         * and don't do anything during process exit for already inactive processes.
         */
        if (proc_exit_inprogress && !isProcessActive)
        {
            Assert(deactivationVersion >= activationVersion);
            return;
        }

        Assert(isProcessActive);
        Assert(deactivationVersion <= activationVersion);

        /* No new runaway event can come in */
        SpinLockAcquire(&MySessionState->spinLock);

        Assert(MySessionState->activeProcessCount <= MySessionState->pinCount);
        /* No atomic update necessary as the update is protected by spin lock */
        MySessionState->activeProcessCount -= 1;
        Assert(0 <= MySessionState->activeProcessCount);
        MySessionState->idle_start = GetCurrentTimestamp();

        isProcessActive = false;

        /* Save the point where we reduced the activeProcessCount */
        deactivationVersion = *CurrentVersion;

        /*
         * Release spinLock as we no longer contend for isRunaway.
         */
        SpinLockRelease(&MySessionState->spinLock);

        /*
         * We are still deactivated (i.e., activeProcessCount is decremented). If an ERROR is indeed thrown
         * from the VmemTracker_StartCleanupIfRunaway, the VmemTracker_RunawayCleanupDoneForProcess()
         * method would reactivate this process.
         */
        RunawayCleaner_StartCleanup();

        /* At this point the process must be clean, unless we don't have a runaway event before deactivation */
        Assert(*latestRunawayVersion > deactivationVersion ||
               !RunawayCleaner_IsCleanupInProgress());
    }
    /* At this point the process is ready to be blocked in ReadCommand() */
}
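/*
 * Illustrative sketch, not part of the original source: the deactivation call
 * above is meant to bracket the idle wait in the command loop, paired with a
 * corresponding activation call when a new command arrives. The counterpart
 * name IdleTracker_ActivateProcess() is assumed here, and ReadCommandSketch()
 * is a hypothetical stand-in for the real ReadCommand() wait.
 */
static void
command_loop_sketch(void)
{
    for (;;)
    {
        /* Going idle: no longer able to respond to runaway cleanup requests */
        IdleTracker_DeactivateProcess();

        /* Block waiting for the next client command */
        ReadCommandSketch();

        /* Back to work: eligible again for runaway cleanup duties */
        IdleTracker_ActivateProcess();
    }
}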
/*
 * Updating accounting of size when closing a temporary file we created
 */
static void
adjust_size_temp_file_new(workfile_set *work_set, int64 size)
{
#if USE_ASSERT_CHECKING
    bool        isCached = (NULL != work_set) &&
        Cache_IsCached(CACHE_ENTRY_HEADER(work_set));
#endif
    Assert(!isCached);
    AssertImply((NULL != work_set), work_set->size == 0);
    AssertImply((NULL != work_set), work_set->in_progress_size >= size);

    if (NULL != work_set)
    {
        work_set->in_progress_size -= size;
    }

    WorkfileDiskspace_Commit(0 /* commit_bytes */, size, true /* update_query_size */);
    elog(gp_workfile_caching_loglevel,
         "closed and deleted temp file, subtracted size " INT64_FORMAT " from disk space",
         size);

    /* About to physically delete a file we created. Update the per-query file count as well */
    WorkfileQueryspace_SubtractWorkfile(1 /* nFiles */);
}
/*
 * makeCdbSreh
 *
 * Allocate and initialize a Single Row Error Handling state object.
 * Pass in the only known parameters (all coming from the SQL stmt);
 * the other variables are set later on, when they are known.
 */
CdbSreh *
makeCdbSreh(bool is_keep, bool reusing_existing_errtable,
            int rejectlimit, bool is_limit_in_rows,
            RangeVar *errortable, char *filename, char *relname,
            bool log_to_file)
{
    CdbSreh    *h;

    h = palloc(sizeof(CdbSreh));

    h->errmsg = NULL;
    h->rawdata = NULL;
    h->linenumber = 0;
    h->processed = 0;
    h->relname = relname;
    h->rejectlimit = rejectlimit;
    h->is_limit_in_rows = is_limit_in_rows;
    h->rejectcount = 0;
    h->is_server_enc = false;
    h->is_keep = is_keep;
    h->should_drop = false;     /* we'll decide later */
    h->reusing_errtbl = reusing_existing_errtable;
    h->cdbcopy = NULL;
    h->errtbl = NULL;
    h->lastsegid = 0;
    h->consec_csv_err = 0;
    AssertImply(log_to_file, errortable == NULL);
    h->log_to_file = log_to_file;

    snprintf(h->filename, sizeof(h->filename), "%s",
             filename ? filename : "<stdin>");

    /* error table was specified; open it (and create it first if necessary) */
    if (errortable)
        OpenErrorTable(h, errortable);

    /*
     * Create a temporary memory context that we can reset once per row to
     * recover palloc'd memory. This avoids any problems with leaks inside
     * datatype input routines, and should be faster than retail pfree's
     * anyway.
     */
    h->badrowcontext = AllocSetContextCreate(CurrentMemoryContext,
                                             "SrehMemCtxt",
                                             ALLOCSET_DEFAULT_MINSIZE,
                                             ALLOCSET_DEFAULT_INITSIZE,
                                             ALLOCSET_DEFAULT_MAXSIZE);

    return h;
}
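/*
 * Illustrative sketch, not part of the original source: constructing a
 * single-row error handling state for a load that logs rejected rows to
 * file rather than to an error table. Note the constraint checked by
 * AssertImply(log_to_file, errortable == NULL): when logging to file, no
 * error table may be passed. The literal values are hypothetical.
 */
static CdbSreh *
make_sreh_sketch(void)
{
    return makeCdbSreh(false,       /* is_keep */
                       false,       /* reusing_existing_errtable */
                       100,         /* rejectlimit */
                       true,        /* is_limit_in_rows */
                       NULL,        /* errortable: must be NULL when logging to file */
                       "<stdin>",   /* filename */
                       "my_table",  /* relname (hypothetical) */
                       true);       /* log_to_file */
}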
/*
 * InitScanStateInternal
 *   Initialize ScanState common variables for various Scan node.
 */
void
InitScanStateInternal(ScanState *scanState, Plan *plan, EState *estate,
                      int eflags, bool initCurrentRelation)
{
    Assert(IsA(plan, SeqScan) ||
           IsA(plan, AppendOnlyScan) ||
           IsA(plan, ParquetScan) ||
           IsA(plan, TableScan) ||
           IsA(plan, DynamicTableScan) ||
           IsA(plan, BitmapTableScan));

    PlanState *planState = &scanState->ps;

    planState->plan = plan;
    planState->state = estate;

    /* Create expression evaluation context */
    ExecAssignExprContext(estate, planState);

    /* Initialize tuple table slot */
    ExecInitResultTupleSlot(estate, planState);
    ExecInitScanTupleSlot(estate, scanState);

    /*
     * For dynamic table scan, we do not initialize expression states; instead
     * we wait until the first partition, and initialize the expression state
     * at that time. Also, for dynamic table scan, we do not need to open the
     * parent partition relation.
     */
    if (initCurrentRelation)
    {
        InitScanStateRelationDetails(scanState, plan, estate);
    }

    /* Initialize result tuple type. */
    ExecAssignResultTypeFromTL(planState);

    /*
     * If eflag contains EXEC_FLAG_REWIND or EXEC_FLAG_BACKWARD or EXEC_FLAG_MARK,
     * then this node is not eager free safe.
     */
    scanState->ps.delayEagerFree =
        ((eflags & (EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)) != 0);

    /* Currently, only SeqScan supports Mark/Restore. */
    AssertImply((eflags & EXEC_FLAG_MARK) != 0, IsA(plan, SeqScan));
}
/*
 * Create a multi-key heap from an array of entries
 *
 *   entries: the values to convert to a heap. This array will be under mkheap's ownership
 *   alloc_sz: the allocation size of entries: that is, how much room the array has.
 *   cnt: the number of elements in entries which should be used to build the heap
 *   mkctxt: description of the heap to build
 *
 * If alloc_sz is zero then entries must be NULL
 */
MKHeap *
mkheap_from_array(MKEntry *entries, int alloc_sz, int cnt, MKContext *mkctxt)
{
    MKHeap     *heap = (MKHeap *) palloc(sizeof(MKHeap));

    Assert(mkctxt);
    Assert(alloc_sz >= cnt);

    AssertEquivalent(entries != NULL, cnt > 0);
    AssertEquivalent(!entries, cnt == 0);

    heap->mkctxt = mkctxt;
    heap->lvtops = palloc0(mkctxt->total_lv * sizeof(MKEntry));

    heap->readers = NULL;
    heap->nreader = 0;

    AssertImply(alloc_sz == 0, !entries);
    Assert(cnt >= 0 && cnt <= alloc_sz);

    heap->p = entries;
    heap->alloc_size = alloc_sz;
    heap->count = cnt;
    heap->maxentry = cnt;

#ifdef USE_ASSERT_CHECKING
    {
        int         i;

        for (i = 0; i < cnt; ++i)
        {
            Assert(mke_get_lv(entries + i) == 0);
            Assert(mke_get_reader(entries + i) == 0);
        }
    }
#endif

    /*
     * note: see NOTE ON UNIQUENESS CHECKING at the top of this file for
     * information about why we don't check uniqueness here
     */
    mk_prepare_array(entries, 0, cnt - 1, 0, mkctxt);
    mkheap_heapify(heap, true);
    return heap;
}
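/*
 * Illustrative sketch, not part of the original source: building a heap from
 * a freshly allocated entry array. Ownership of the array passes to the heap,
 * so the caller does not pfree it afterwards. How individual MKEntry values
 * are filled in is left out (fill_entry_sketch() is a hypothetical helper);
 * the empty-heap case follows the rule that alloc_sz == 0 requires
 * entries == NULL.
 */
static MKHeap *
build_heap_sketch(MKContext *mkctxt, int n)
{
    MKEntry    *entries;
    int         i;

    if (n == 0)
        return mkheap_from_array(NULL, 0, 0, mkctxt);

    entries = (MKEntry *) palloc0(n * sizeof(MKEntry));
    for (i = 0; i < n; i++)
        fill_entry_sketch(entries + i, i);      /* hypothetical */

    /* All n slots are allocated and all n are used to build the heap */
    return mkheap_from_array(entries, n, n, mkctxt);
}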
/*
 * For a new workfile, sets the capabilities flags according to
 * the known underlying file type capabilities and the method the file was created
 */
static void
ExecWorkFile_SetFlags(ExecWorkFile *workfile, bool delOnClose, bool created)
{
    Assert(workfile != NULL);

    /* Assert that only the creator of a file can delete it on close */
    AssertImply(delOnClose, created);

    switch (workfile->fileType)
    {
        case BUFFILE:
            workfile->flags |= EXEC_WORKFILE_RANDOM_ACCESS;
            break;

        case BFZ:
            workfile->flags |= EXEC_WORKFILE_SUSPENDABLE;
            break;

        default:
            insist_log(false, "invalid work file type: %d", workfile->fileType);
    }

    if (delOnClose)
    {
        workfile->flags |= EXEC_WORKFILE_DEL_ON_CLOSE;
    }

    if (created)
    {
        workfile->flags |= EXEC_WORKFILE_CREATED;
        elog(gp_workfile_caching_loglevel,
             "Created workfile %s, delOnClose = %d",
             ExecWorkFile_GetFileName(workfile), delOnClose);
    }
    else
    {
        elog(gp_workfile_caching_loglevel,
             "Opened existing workfile %s, delOnClose = %d",
             ExecWorkFile_GetFileName(workfile), delOnClose);
    }

    if ((gp_workfile_limit_per_query > 0) || (gp_workfile_limit_per_segment > 0))
    {
        workfile->flags |= EXEC_WORKFILE_LIMIT_SIZE;
    }
}
/*
 * Open a temporary file that will (optionally) disappear when we close it.
 *
 * If 'makenameunique' is true, this function generates a file name which
 * should be unique to this particular OpenTemporaryFile() request and
 * distinct from any others in concurrent use on the same host. As a
 * convenience for monitoring and debugging, the given 'fileName' string
 * and 'extentseqnum' are embedded in the file name.
 *
 * If 'makenameunique' is false, then 'fileName' and 'extentseqnum' identify a
 * new or existing temporary file which other processes also could open and
 * share.
 *
 * If 'create' is true, a new file is created. If successful, a valid vfd
 * index (>0) is returned; otherwise an error is thrown.
 *
 * If 'create' is false, an existing file is opened. If successful, a valid
 * vfd index (>0) is returned. If the file does not exist or cannot be
 * opened, an invalid vfd index (<= 0) is returned.
 *
 * If 'delOnClose' is true, then the file is removed when you call
 * FileClose(); or when the process exits; or (provided 'closeAtEOXact' is
 * true) when the transaction ends.
 *
 * If 'closeAtEOXact' is true, the vfd is closed automatically at end of
 * transaction unless you have called FileClose() to close it before then.
 * If 'closeAtEOXact' is false, the vfd state is not changed at end of
 * transaction.
 *
 * In most cases, you don't want temporary files to outlive the transaction
 * that created them, so you should specify 'true' for both 'delOnClose' and
 * 'closeAtEOXact'.
 */
File
OpenTemporaryFile(const char *fileName,
                  int extentseqnum,
                  bool makenameunique,
                  bool create,
                  bool delOnClose,
                  bool closeAtEOXact)
{
    char        tempfilepath[MAXPGPATH];

    Assert(fileName);
    AssertImply(makenameunique, create && delOnClose);

    char        tempfileprefix[MAXPGPATH];
    int         len = GetTempFilePrefix(tempfileprefix, MAXPGPATH, fileName);

    insist_log(len <= MAXPGPATH - 1, "could not generate temporary file name");

    if (makenameunique)
    {
        /*
         * Generate a tempfile name that should be unique within the current
         * database instance.
         */
        snprintf(tempfilepath, sizeof(tempfilepath),
                 "%s_%d_%04d.%ld",
                 tempfileprefix,
                 MyProcPid,
                 extentseqnum,
                 tempFileCounter++);
    }
    else
    {
        snprintf(tempfilepath, sizeof(tempfilepath),
                 "%s.%04d",
                 tempfileprefix,
                 extentseqnum);
    }

    return OpenNamedFile(tempfilepath, create, delOnClose, closeAtEOXact);
}   /* OpenTemporaryFile */
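/*
 * Illustrative sketch, not part of the original source: the common case
 * described in the header comment above -- a private, uniquely named temp
 * file that is deleted on close and cleaned up at end of transaction. Note
 * the constraint AssertImply(makenameunique, create && delOnClose): a unique
 * name only makes sense for a file we create and delete ourselves. The
 * "sort" prefix is hypothetical.
 */
static File
open_private_temp_file_sketch(void)
{
    return OpenTemporaryFile("sort",    /* fileName: embedded in the temp file name */
                             0,         /* extentseqnum */
                             true,      /* makenameunique */
                             true,      /* create */
                             true,      /* delOnClose */
                             true);     /* closeAtEOXact */
}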
/*
 * DynamicScan_InitNextPartition
 *   Prepares the next partition for scanning by calling various
 *   helper methods to open relation, map dropped attributes,
 *   initialize expressions etc.
 */
static bool
DynamicScan_InitNextPartition(ScanState *scanState,
                              PartitionInitMethod *partitionInitMethod,
                              PartitionEndMethod *partitionEndMethod,
                              PartitionReScanMethod *partitionReScanMethod)
{
    Assert(isDynamicScan((Scan *) scanState->ps.plan));
    AssertImply(scanState->scan_state != SCAN_INIT,
                NULL != scanState->ss_currentRelation);

    Scan       *scan = (Scan *) scanState->ps.plan;
    DynamicTableScanInfo *partitionInfo = scanState->ps.state->dynamicTableScanInfo;

    Assert(partitionInfo->numScans >= scan->partIndex);

    int32       numSelectors = list_nth_int(partitionInfo->numSelectorsPerScanId,
                                            scan->partIndex);

    Oid         newOid = DynamicScan_AdvanceIterator(scanState, numSelectors);

    if (!OidIsValid(newOid))
    {
        return false;
    }

    Relation    oldRelation = NULL;
    Relation    newRelation = NULL;

    DynamicScan_ObtainRelations(scanState, newOid, &oldRelation, &newRelation);

    /* Either we have a new relation or this is the first relation */
    if (oldRelation != newRelation || NULL == scanState->ss_currentRelation)
    {
        AttrNumber *attMap = DynamicScan_MapRelationColumns(scanState,
                                                            oldRelation,
                                                            newRelation);

        DynamicScan_RemapExpression(scanState, attMap,
                                    (Node *) scanState->ps.plan->qual);
        DynamicScan_RemapExpression(scanState, attMap,
                                    (Node *) scanState->ps.plan->targetlist);

        /*
         * We only initialize expression if this is the first partition
         * or if the column mapping changes between two partitions.
         * Otherwise, we reuse the previously initialized expression.
         */
        bool        initExpressions = (NULL != attMap || SCAN_INIT == scanState->scan_state);

        if (newRelation != oldRelation)
        {
            /* Close the old relation */
            DynamicScan_CleanupOneRelation(scanState, oldRelation, partitionEndMethod);
        }

        DynamicScan_UpdateScanStateForNewPart(scanState, newRelation);

        if (initExpressions)
        {
            DynamicScan_InitExpr(scanState);
        }

        partitionInitMethod(scanState, attMap);

        if (NULL != attMap)
        {
            pfree(attMap);
            attMap = NULL;
        }
    }
    else
    {
        /* Rescan of the same part */
        partitionReScanMethod(scanState);
    }

    /* Collect number of partitions scanned in EXPLAIN ANALYZE */
    if (NULL != scanState->ps.instrument)
    {
        Instrumentation *instr = scanState->ps.instrument;

        instr->numPartScanned++;
    }

    return true;
}
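/*
 * Illustrative sketch, not part of the original source: the boolean return
 * value of DynamicScan_InitNextPartition() drives the outer partition loop,
 * with false meaning the partition iterator is exhausted. The helper
 * scan_one_partition_sketch() and the three callback arguments are
 * placeholders for whatever the concrete scan node supplies.
 */
static void
scan_all_partitions_sketch(ScanState *scanState,
                           PartitionInitMethod *initMethod,
                           PartitionEndMethod *endMethod,
                           PartitionReScanMethod *reScanMethod)
{
    while (DynamicScan_InitNextPartition(scanState, initMethod, endMethod, reScanMethod))
    {
        /* Fetch tuples from the partition that was just prepared */
        scan_one_partition_sketch(scanState);   /* hypothetical */
    }
}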
/*
 * Create a new file set
 *   type is the WorkFileType for the files: BUFFILE or BFZ
 *   can_be_reused: if set to false, then we don't insert this set into the cache,
 *     since the caller is telling us there is no point. This can happen for
 *     example when spilling during index creation.
 *   ps is the PlanState for the subtree rooted at the operator
 *   snapshot contains snapshot information for the current transaction
 */
workfile_set *
workfile_mgr_create_set(enum ExecWorkFileType type, bool can_be_reused,
                        PlanState *ps, workfile_set_snapshot snapshot)
{
    Assert(NULL != workfile_mgr_cache);

    Plan       *plan = NULL;

    if (ps != NULL)
    {
        plan = ps->plan;
    }

    AssertImply(can_be_reused, plan != NULL);

    NodeTag     node_type = T_Invalid;

    if (ps != NULL)
    {
        node_type = ps->type;
    }
    char       *dir_path = create_workset_directory(node_type, currentSliceId);

    /* Create parameter info for the populate function */
    workset_info set_info;

    set_info.file_type = type;
    set_info.snapshot = snapshot;
    set_info.nodeType = node_type;
    set_info.can_be_reused = can_be_reused && workfile_mgr_is_reusable(ps);
    set_info.dir_path = dir_path;
    set_info.session_start_time = GetCurrentTimestamp();
    set_info.operator_work_mem = get_operator_work_mem(ps);
    set_info.on_disk = true;

    CacheEntry *newEntry = NULL;

    PG_TRY();
    {
        newEntry = acquire_entry_retry(workfile_mgr_cache, &set_info);
    }
    PG_CATCH();
    {
        /* Failed to acquire new entry, cache full. Clean up the directory we created. */
        workfile_mgr_delete_set_directory(dir_path);
        PG_RE_THROW();
    }
    PG_END_TRY();

    /* Path has now been copied to the workfile_set. We can free it */
    pfree(dir_path);

    /* Complete initialization of the entry with post-acquire actions */
    Assert(NULL != newEntry);

    workfile_set *work_set = CACHE_ENTRY_PAYLOAD(newEntry);

    Assert(work_set != NULL);

    if (work_set->can_be_reused)
    {
        Assert(plan != NULL);
        Assert(nodeTag(plan) >= T_Plan && nodeTag(plan) < T_PlanInvalItem);

        workfile_set_plan *s_plan = workfile_mgr_serialize_plan(ps);

        work_set->key = workfile_mgr_hash_key(s_plan);
        workfile_mgr_save_plan(work_set, s_plan);
        workfile_mgr_free_plan(s_plan);
    }

    elog(gp_workfile_caching_loglevel,
         "new spill file set. key=0x%x can_be_reused=%d prefix=%s opMemKB=" INT64_FORMAT,
         work_set->key, work_set->can_be_reused, work_set->path,
         work_set->metadata.operator_work_mem);

    return work_set;
}
/**
 * BackoffSweeper() looks at all the backend structures to determine if any
 * backends are not making progress. This is done by inspecting the lastchecked
 * time. It also calculates the total weight of all 'active' backends to
 * re-calculate the target CPU usage per backend process. If it finds that a
 * backend is trying to request more CPU resources than the maximum CPU that it
 * can get (such a backend is called a 'pegger'), it assigns maxCPU to it.
 *
 * For example:
 * Let Qi be the ith query statement, Ri be the target CPU usage for Qi,
 * Wi be the statement weight for Qi, W be the total statements weight.
 * For simplicity, let's assume every statement only has 1 backend per segment.
 *
 * Let there be 4 active queries with weights {1,100,10,1000} with K=3 CPUs
 * available per segment to share. The maximum CPU that a backend can get is
 * maxCPU = 1.0. The total active statements weight is
 * W (activeWeight) = 1 + 100 + 10 + 1000 = 1111.
 * The following algorithm determines that Q4 is a pegger, because
 * K * W4 / W > maxCPU, which is 3000/1111 > 1.0, so we assign R4 = 1.0.
 * Now K becomes 2.0, W becomes 111.
 * It restarts from the beginning and determines that Q2 is now a pegger as
 * well, because K * W2 / W > maxCPU, which is 200/111 > 1.0, so we assign
 * R2 = 1.0. Now there is only 1 CPU left and no peggers left. We continue
 * to distribute the remaining 1 CPU to the other backends according to their
 * weight, so we assign the target CPU ratios R1=1/11 and R3=10/11. The final
 * target CPU assignments are {0.09,1.0,0.91,1.0}.
 *
 * If there are multiple backends within a segment running for the query Qi,
 * the target CPU ratio Ri for query Qi is divided equally among all the
 * active backends belonging to the query.
 */
void
BackoffSweeper()
{
    int         i = 0;

    /* The overall weight of active statements */
    volatile double activeWeight = 0.0;
    int         numActiveBackends = 0;
    int         numActiveStatements = 0;

    /* The overall weight of active and inactive statements */
    int         totalStatementWeight = 0;
    int         numValidBackends = 0;
    int         numStatements = 0;

    struct timeval currentTime;

    if (gettimeofday(&currentTime, NULL) < 0)
    {
        elog(ERROR, "Unable to execute gettimeofday(). Please disable query prioritization.");
    }

    Assert(backoffSingleton->sweeperInProgress == false);
    backoffSingleton->sweeperInProgress = true;

    TRACE_POSTGRESQL_BACKOFF_GLOBALCHECK();

    /* Reset status for all the backend entries */
    for (i = 0; i < backoffSingleton->numEntries; i++)
    {
        BackoffBackendSharedEntry *se = getBackoffEntryRW(i);

        se->isActive = false;
        se->numFollowersActive = 0;
        se->backoff = true;
    }

    /*
     * Mark backends that are active. Count of active group members is
     * maintained at their group leader.
     */
    for (i = 0; i < backoffSingleton->numEntries; i++)
    {
        BackoffBackendSharedEntry *se = getBackoffEntryRW(i);

        if (isValid(&se->statementId))
        {
            Assert(se->weight > 0);
            if (TIMEVAL_DIFF_USEC(currentTime, se->lastCheckTime)
                < gp_resqueue_priority_inactivity_timeout * 1000.0)
            {
                /*
                 * This is an active backend. Need to maintain count at group
                 * leader
                 */
                BackoffBackendSharedEntry *gl = getBackoffEntryRW(se->groupLeaderIndex);

                if (gl->numFollowersActive == 0)
                {
                    activeWeight += se->weight;
                    numActiveStatements++;
                }
                gl->numFollowersActive++;
                numActiveBackends++;
                se->isActive = true;
            }
            if (isGroupLeader(i))
            {
                totalStatementWeight += se->weight;
                numStatements++;
            }
            numValidBackends++;
        }
    }

    /* Sanity checks */
    Assert(numActiveBackends <= numValidBackends);
    Assert(numValidBackends >= numStatements);

    /**
     * Under certain conditions, we want to avoid backoff. Cases are:
     * 1. A statement just entered or exited
     * 2. A statement's weight changed due to user intervention via gp_adjust_priority()
     * 3. There is no active backend
     * 4. There is exactly one statement
     * 5. Total number of valid backends <= number of procs per segment
     * Cases 1 and 2 are approximated by checking if total statement weight changed since the last sweeper loop.
     */
    if (backoffSingleton->lastTotalStatementWeight != totalStatementWeight
        || numActiveBackends == 0
        || numStatements == 1
        || numValidBackends <= numProcsPerSegment())
    {
        /* Write to targets */
        for (i = 0; i < backoffSingleton->numEntries; i++)
        {
            BackoffBackendSharedEntry *se = getBackoffEntryRW(i);

            se->backoff = false;
            se->earlyBackoffExit = true;
            se->targetUsage = 1.0;
        }
    }
    else
    {
        /**
         * There are multiple statements with active backends.
         *
         * Let 'found' be true if we find a backend is trying to
         * request more CPU resources than the maximum CPU that it can
         * get. No matter how high the priority of a query process, it
         * can utilize at most a single CPU at a time.
         */
        bool        found = true;
        int         numIterations = 0;
        double      CPUAvailable = numProcsPerSegment();
        double      maxCPU = Min(1.0, numProcsPerSegment());    /* Maximum CPU that a
                                                                 * backend can get */

        Assert(maxCPU > 0.0);

        if (gp_debug_resqueue_priority)
        {
            elog(LOG, "before allocation: active backends = %d, active weight = %f, cpu available = %f",
                 numActiveBackends, activeWeight, CPUAvailable);
        }

        while (found)
        {
            found = false;

            /**
             * We try to find one or more backends that deserve maxCPU.
             */
            for (i = 0; i < backoffSingleton->numEntries; i++)
            {
                BackoffBackendSharedEntry *se = getBackoffEntryRW(i);

                if (se->isActive && se->backoff)
                {
                    double      targetCPU = 0.0;
                    const BackoffBackendSharedEntry *gl = getBackoffEntryRO(se->groupLeaderIndex);

                    Assert(gl->numFollowersActive > 0);

                    if (activeWeight <= 0.0)
                    {
                        /*
                         * There is a race condition here:
                         * Backends A, B and C belong to the same statement and have a
                         * weight of 100000.
                         *
                         * Timestamp1: backend A's leader is A, backend B's leader is B,
                         * and backend C's leader is also B.
                         *
                         * Timestamp2: the sweeper calculates activeWeight as 200000.
                         *
                         * Timestamp3: backend B changes its leader to A.
                         *
                         * Timestamp4: the sweeper tries to find the backends that deserve
                         * maxCPU; if backends A, B and C all deserve maxCPU, then
                         * activeWeight = 200000 - 100000/1 - 100000/1 - 100000/2,
                         * which is less than zero.
                         *
                         * We can stop sweeping on such a race condition because the
                         * current backoff mechanism does not require accurate control.
                         */
                        backoffSingleton->sweeperInProgress = false;
                        elog(LOG, "activeWeight underflow!");
                        return;
                    }

                    Assert(activeWeight > 0.0);
                    Assert(se->weight > 0.0);

                    targetCPU = (CPUAvailable) * (se->weight) / activeWeight / gl->numFollowersActive;

                    /**
                     * Some statements may be weighed so heavily that they are allocated the maximum cpu ratio.
                     */
                    if (targetCPU >= maxCPU)
                    {
                        Assert(numProcsPerSegment() >= 1.0);    /* This can only happen
                                                                 * when there is more
                                                                 * than one proc */
                        se->targetUsage = maxCPU;
                        se->backoff = false;
                        activeWeight -= (se->weight / gl->numFollowersActive);
                        CPUAvailable -= maxCPU;
                        found = true;
                    }
                }
            }
            numIterations++;
            AssertImply(found, (numIterations <= floor(numProcsPerSegment())));
            Assert(numIterations <= ceil(numProcsPerSegment()));
        }

        if (gp_debug_resqueue_priority)
        {
            elog(LOG, "after heavy backends: active backends = %d, active weight = %f, cpu available = %f",
                 numActiveBackends, activeWeight, CPUAvailable);
        }

        /**
         * Distribute whatever CPU is available among the rest.
         */
        for (i = 0; i < backoffSingleton->numEntries; i++)
        {
            BackoffBackendSharedEntry *se = getBackoffEntryRW(i);

            if (se->isActive && se->backoff)
            {
                const BackoffBackendSharedEntry *gl = getBackoffEntryRO(se->groupLeaderIndex);

                Assert(activeWeight > 0.0);
                Assert(gl->numFollowersActive > 0);
                Assert(se->weight > 0.0);

                se->targetUsage = (CPUAvailable) * (se->weight) / activeWeight / gl->numFollowersActive;
            }
        }
    }

    backoffSingleton->lastTotalStatementWeight = totalStatementWeight;
    backoffSingleton->sweeperInProgress = false;

    if (gp_debug_resqueue_priority)
    {
        StringInfoData str;

        initStringInfo(&str);
        appendStringInfo(&str, "num active statements: %d ", numActiveStatements);
        appendStringInfo(&str, "num active backends: %d ", numActiveBackends);
        appendStringInfo(&str, "targetusages: ");
        for (i = 0; i < MaxBackends; i++)
        {
            const BackoffBackendSharedEntry *se = getBackoffEntryRO(i);

            if (se->isActive)
                appendStringInfo(&str, "(%d,%f)", i, se->targetUsage);
        }
        elog(LOG, "%s", (const char *) str.data);
        pfree(str.data);
    }
}
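/*
 * Illustrative sketch, not part of the original source: a self-contained
 * restatement of the "pegger" allocation described in the header comment of
 * BackoffSweeper(), using the documented example of weights {1, 100, 10, 1000}
 * and K = 3 CPUs with one backend per statement. It only makes the arithmetic
 * concrete; it does not touch any shared backoff state.
 */
static void
backoff_allocation_example(void)
{
    double      weights[4] = {1.0, 100.0, 10.0, 1000.0};
    double      target[4] = {0.0, 0.0, 0.0, 0.0};
    bool        pegged[4] = {false, false, false, false};
    double      cpuAvailable = 3.0;     /* K */
    double      maxCPU = 1.0;
    double      activeWeight = 1111.0;  /* 1 + 100 + 10 + 1000 */
    bool        found = true;
    int         i;

    /* Repeatedly peg statements whose proportional share exceeds one CPU */
    while (found)
    {
        found = false;
        for (i = 0; i < 4; i++)
        {
            if (!pegged[i] && cpuAvailable * weights[i] / activeWeight >= maxCPU)
            {
                target[i] = maxCPU;     /* Q4 on the first pass, Q2 on the second */
                pegged[i] = true;
                activeWeight -= weights[i];
                cpuAvailable -= maxCPU;
                found = true;
            }
        }
    }

    /* Distribute the remaining CPU proportionally: R1 = 1/11, R3 = 10/11 */
    for (i = 0; i < 4; i++)
    {
        if (!pegged[i])
            target[i] = cpuAvailable * weights[i] / activeWeight;
    }

    /* target is now {0.09..., 1.0, 0.90..., 1.0}, matching the comment above */
}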
HeapTuple toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, MemTupleBinding *pbind, int toast_tuple_target, bool isFrozen) { HeapTuple result_tuple; TupleDesc tupleDesc; Form_pg_attribute *att; int numAttrs; int i; bool need_change = false; bool need_free = false; bool need_delold = false; bool has_nulls = false; Size maxDataLen; char toast_action[MaxHeapAttributeNumber]; bool toast_isnull[MaxHeapAttributeNumber]; bool toast_oldisnull[MaxHeapAttributeNumber]; Datum toast_values[MaxHeapAttributeNumber]; Datum toast_oldvalues[MaxHeapAttributeNumber]; int32 toast_sizes[MaxHeapAttributeNumber]; bool toast_free[MaxHeapAttributeNumber]; bool toast_delold[MaxHeapAttributeNumber]; bool ismemtuple = is_heaptuple_memtuple(newtup); AssertImply(ismemtuple, pbind); AssertImply(!ismemtuple, !pbind); AssertImply(ismemtuple && oldtup, is_heaptuple_memtuple(oldtup)); Assert(toast_tuple_target > 0); /* * We should only ever be called for tuples of plain relations --- * recursing on a toast rel is bad news. */ //Assert(rel->rd_rel->relkind == RELKIND_RELATION); if (rel->rd_rel->relkind != RELKIND_RELATION) elog(LOG,"Why are we toasting a non-relation! %c ",rel->rd_rel->relkind); /* * Get the tuple descriptor and break down the tuple(s) into fields. */ tupleDesc = rel->rd_att; att = tupleDesc->attrs; numAttrs = tupleDesc->natts; Assert(numAttrs <= MaxHeapAttributeNumber); if(ismemtuple) memtuple_deform((MemTuple) newtup, pbind, toast_values, toast_isnull); else heap_deform_tuple(newtup, tupleDesc, toast_values, toast_isnull); if (oldtup != NULL) { if(ismemtuple) memtuple_deform((MemTuple) oldtup, pbind, toast_oldvalues, toast_oldisnull); else heap_deform_tuple(oldtup, tupleDesc, toast_oldvalues, toast_oldisnull); } /* ---------- * Then collect information about the values given * * NOTE: toast_action[i] can have these values: * ' ' default handling * 'p' already processed --- don't touch it * 'x' incompressible, but OK to move off * * NOTE: toast_sizes[i] is only made valid for varlena attributes with * toast_action[i] different from 'p'. * ---------- */ memset(toast_action, ' ', numAttrs * sizeof(char)); memset(toast_free, 0, numAttrs * sizeof(bool)); memset(toast_delold, 0, numAttrs * sizeof(bool)); for (i = 0; i < numAttrs; i++) { varattrib *old_value; varattrib *new_value; if (oldtup != NULL) { /* * For UPDATE get the old and new values of this attribute */ old_value = (varattrib *) DatumGetPointer(toast_oldvalues[i]); new_value = (varattrib *) DatumGetPointer(toast_values[i]); /* * If the old value is an external stored one, check if it has * changed so we have to delete it later. */ if (att[i]->attlen == -1 && !toast_oldisnull[i] && VARATT_IS_EXTERNAL(old_value)) { if (toast_isnull[i] || !VARATT_IS_EXTERNAL(new_value) || memcmp((char *) old_value, (char *) new_value, VARSIZE_EXTERNAL(old_value)) != 0) { /* * The old external stored value isn't needed any more * after the update */ toast_delold[i] = true; need_delold = true; } else { /* * This attribute isn't changed by this update so we reuse * the original reference to the old value in the new * tuple. */ toast_action[i] = 'p'; continue; } } } else { /* * For INSERT simply get the new value */ new_value = (varattrib *) DatumGetPointer(toast_values[i]); } /* * Handle NULL attributes */ if (toast_isnull[i]) { toast_action[i] = 'p'; has_nulls = true; continue; } /* * Now look at varlena attributes */ if (att[i]->attlen == -1) { /* * If the table's attribute says PLAIN always, force it so. 
*/ if (att[i]->attstorage == 'p') toast_action[i] = 'p'; /* * We took care of UPDATE above, so any external value we find * still in the tuple must be someone else's we cannot reuse. * Fetch it back (without decompression, unless we are forcing * PLAIN storage). If necessary, we'll push it out as a new * external value below. */ if (VARATT_IS_EXTERNAL(new_value)) { if (att[i]->attstorage == 'p') new_value = (varattrib *)heap_tuple_untoast_attr((struct varlena *)new_value); else new_value = (varattrib *)heap_tuple_fetch_attr((struct varlena *)new_value); toast_values[i] = PointerGetDatum(new_value); toast_free[i] = true; need_change = true; need_free = true; } /* * Remember the size of this attribute */ toast_sizes[i] = VARSIZE_ANY(new_value); } else { /* * Not a varlena attribute, plain storage always */ toast_action[i] = 'p'; } } /* ---------- * Compress and/or save external until data fits into target length * * 1: Inline compress attributes with attstorage 'x', and store very * large attributes with attstorage 'x' or 'e' external immediately * 2: Store attributes with attstorage 'x' or 'e' external * 3: Inline compress attributes with attstorage 'm' * 4: Store attributes with attstorage 'm' external * ---------- */ if(!ismemtuple) { /* compute header overhead --- this should match heap_form_tuple() */ maxDataLen = offsetof(HeapTupleHeaderData, t_bits); if (has_nulls) maxDataLen += BITMAPLEN(numAttrs); if (newtup->t_data->t_infomask & HEAP_HASOID) maxDataLen += sizeof(Oid); maxDataLen = MAXALIGN(maxDataLen); Assert(maxDataLen == newtup->t_data->t_hoff); /* now convert to a limit on the tuple data size */ maxDataLen = toast_tuple_target - maxDataLen; } else maxDataLen = toast_tuple_target; /* * Look for attributes with attstorage 'x' to compress. Also find large * attributes with attstorage 'x' or 'e', and store them external. */ while (compute_dest_tuplen(tupleDesc, pbind, has_nulls, toast_values, toast_isnull) > maxDataLen) { int biggest_attno = -1; int32 biggest_size = MAXALIGN(sizeof(varattrib)); Datum old_value; Datum new_value; /* * Search for the biggest yet unprocessed internal attribute */ for (i = 0; i < numAttrs; i++) { if (toast_action[i] != ' ') continue; if (VARATT_IS_EXTERNAL_D(toast_values[i])) continue; if (VARATT_IS_COMPRESSED_D(toast_values[i])) continue; if (att[i]->attstorage != 'x') continue; if (toast_sizes[i] > biggest_size) { biggest_attno = i; biggest_size = toast_sizes[i]; } } if (biggest_attno < 0) break; /* * Attempt to compress it inline, if it has attstorage 'x' */ i = biggest_attno; old_value = toast_values[i]; new_value = toast_compress_datum(old_value); if (DatumGetPointer(new_value) != NULL) { /* successful compression */ if (toast_free[i]) pfree(DatumGetPointer(old_value)); toast_values[i] = new_value; toast_free[i] = true; toast_sizes[i] = VARSIZE_D(toast_values[i]); need_change = true; need_free = true; } else { /* * incompressible data, ignore on subsequent compression passes */ toast_action[i] = 'x'; } } /* * Second we look for attributes of attstorage 'x' or 'e' that are still * inline. 
*/ while (compute_dest_tuplen(tupleDesc, pbind, has_nulls, toast_values, toast_isnull) > maxDataLen && rel->rd_rel->reltoastrelid != InvalidOid) { int biggest_attno = -1; int32 biggest_size = MAXALIGN(sizeof(varattrib)); Datum old_value; /*------ * Search for the biggest yet inlined attribute with * attstorage equals 'x' or 'e' *------ */ for (i = 0; i < numAttrs; i++) { if (toast_action[i] == 'p') continue; if (VARATT_IS_EXTERNAL_D(toast_values[i])) continue; if (att[i]->attstorage != 'x' && att[i]->attstorage != 'e') continue; if (toast_sizes[i] > biggest_size) { biggest_attno = i; biggest_size = toast_sizes[i]; } } if (biggest_attno < 0) break; /* * Store this external */ i = biggest_attno; old_value = toast_values[i]; toast_action[i] = 'p'; toast_values[i] = toast_save_datum(rel, toast_values[i], isFrozen); if (toast_free[i]) pfree(DatumGetPointer(old_value)); toast_free[i] = true; need_change = true; need_free = true; } /* * Round 3 - this time we take attributes with storage 'm' into * compression */ while (compute_dest_tuplen(tupleDesc, pbind, has_nulls, toast_values, toast_isnull) > maxDataLen) { int biggest_attno = -1; int32 biggest_size = MAXALIGN(sizeof(varattrib)); Datum old_value; Datum new_value; /* * Search for the biggest yet uncompressed internal attribute */ for (i = 0; i < numAttrs; i++) { if (toast_action[i] != ' ') continue; if (VARATT_IS_EXTERNAL_D(toast_values[i])) continue; /* can't happen, toast_action would be 'p' */ if (VARATT_IS_COMPRESSED_D(toast_values[i])) continue; if (att[i]->attstorage != 'm') continue; if (toast_sizes[i] > biggest_size) { biggest_attno = i; biggest_size = toast_sizes[i]; } } if (biggest_attno < 0) break; /* * Attempt to compress it inline */ i = biggest_attno; old_value = toast_values[i]; new_value = toast_compress_datum(old_value); if (DatumGetPointer(new_value) != NULL) { /* successful compression */ if (toast_free[i]) pfree(DatumGetPointer(old_value)); toast_values[i] = new_value; toast_free[i] = true; toast_sizes[i] = VARSIZE_D(toast_values[i]); need_change = true; need_free = true; } else { /* incompressible, ignore on subsequent compression passes */ toast_action[i] = 'x'; } } /* * Finally we store attributes of type 'm' external, if possible. */ while (compute_dest_tuplen(tupleDesc, pbind, has_nulls, toast_values, toast_isnull) > maxDataLen && rel->rd_rel->reltoastrelid != InvalidOid) { int biggest_attno = -1; int32 biggest_size = MAXALIGN(sizeof(varattrib)); Datum old_value; /*-------- * Search for the biggest yet inlined attribute with * attstorage = 'm' *-------- */ for (i = 0; i < numAttrs; i++) { if (toast_action[i] == 'p') continue; if (VARATT_IS_EXTERNAL_D(toast_values[i])) continue; /* can't happen, toast_action would be 'p' */ if (att[i]->attstorage != 'm') continue; if (toast_sizes[i] > biggest_size) { biggest_attno = i; biggest_size = toast_sizes[i]; } } if (biggest_attno < 0) break; /* * Store this external */ i = biggest_attno; old_value = toast_values[i]; toast_action[i] = 'p'; toast_values[i] = toast_save_datum(rel, toast_values[i], isFrozen); if (toast_free[i]) pfree(DatumGetPointer(old_value)); toast_free[i] = true; need_change = true; need_free = true; } /* XXX Maybe we should check here for any compressed inline attributes that * didn't save enough to warrant keeping. In particular attributes whose * rawsize is < 128 bytes and didn't save at least 3 bytes... or even maybe * more given alignment issues */ /* * In the case we toasted any values, we need to build a new heap tuple * with the changed values. 
*/ if (need_change) { if(ismemtuple) result_tuple = (HeapTuple) memtuple_form_to(pbind, toast_values, toast_isnull, NULL, NULL, false); else { HeapTupleHeader olddata = newtup->t_data; HeapTupleHeader new_data; int32 new_len; /* * Calculate the new size of the tuple. Header size should not * change, but data size might. */ new_len = offsetof(HeapTupleHeaderData, t_bits); if (has_nulls) new_len += BITMAPLEN(numAttrs); if (olddata->t_infomask & HEAP_HASOID) new_len += sizeof(Oid); new_len = MAXALIGN(new_len); Assert(new_len == olddata->t_hoff); new_len += heap_compute_data_size(tupleDesc, toast_values, toast_isnull); /* * Allocate and zero the space needed, and fill HeapTupleData fields. */ result_tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + new_len); result_tuple->t_len = new_len; result_tuple->t_self = newtup->t_self; new_data = (HeapTupleHeader) ((char *) result_tuple + HEAPTUPLESIZE); result_tuple->t_data = new_data; /* * Put the existing tuple header and the changed values into place */ memcpy(new_data, olddata, olddata->t_hoff); heap_fill_tuple(tupleDesc, toast_values, toast_isnull, (char *) new_data + olddata->t_hoff, &(new_data->t_infomask), has_nulls ? new_data->t_bits : NULL); } } else result_tuple = newtup; /* * Free allocated temp values */ if (need_free) for (i = 0; i < numAttrs; i++) if (toast_free[i]) pfree(DatumGetPointer(toast_values[i])); /* * Delete external values from the old tuple */ if (need_delold) for (i = 0; i < numAttrs; i++) if (toast_delold[i]) toast_delete_datum(rel, toast_oldvalues[i]); return result_tuple; }
/*
 * Finds and notifies the top vmem consuming session.
 */
static void
RedZoneHandler_FlagTopConsumer()
{
    if (!vmemTrackerInited)
    {
        return;
    }

    Assert(NULL != MySessionState);

    bool        success = compare_and_swap_32((uint32 *) isRunawayDetector, 0, 1);

    /* If successful then this process must be the runaway detector */
    AssertImply(success, 1 == *isRunawayDetector);

    /*
     * Someone already determined the runaway query, so nothing to do. This
     * will also prevent re-entry to this method by a cleaning session.
     */
    if (!success)
    {
        return;
    }

    /*
     * Grabbing a shared lock prevents others from modifying the SessionState
     * data structure, therefore ensuring that we don't flag someone
     * who was already dying. A shared lock is enough as we access the
     * data structure in a read-only manner.
     */
    LWLockAcquire(SessionStateLock, LW_SHARED);

    int32       maxVmem = 0;
    int32       maxActiveVmem = 0;
    SessionState *maxActiveVmemSessionState = NULL;
    SessionState *maxVmemSessionState = NULL;
    SessionState *curSessionState = AllSessionStateEntries->usedList;

    while (curSessionState != NULL)
    {
        Assert(INVALID_SESSION_ID != curSessionState->sessionId);

        int32       curVmem = curSessionState->sessionVmem;

        Assert(maxActiveVmem <= maxVmem);

        if (curVmem > maxActiveVmem)
        {
            if (curVmem > maxVmem)
            {
                maxVmemSessionState = curSessionState;
                maxVmem = curVmem;
            }

            /*
             * Only consider sessions with at least 1 active process. As we
             * are *not* grabbing locks, this does not guarantee that by the
             * time we finish walking all sessions the chosen session will
             * still have an active process.
             */
            if (curSessionState->activeProcessCount > 0)
            {
                maxActiveVmemSessionState = curSessionState;
                maxActiveVmem = curVmem;
            }
        }

        curSessionState = curSessionState->next;
    }

    if (NULL != maxActiveVmemSessionState)
    {
        SpinLockAcquire(&maxActiveVmemSessionState->spinLock);

        /*
         * Now that we have grabbed the lock, make sure we have at least 1
         * active process before flagging this session for termination.
         */
        if (0 < maxActiveVmemSessionState->activeProcessCount)
        {
            /*
             * First update the runaway event detection version so that
             * an active process of the runaway session is forced to clean up before
             * it deactivates. As we grabbed the spin lock, no process of the runaway
             * session can deactivate unless we release the lock. The other sessions
             * don't care what global runaway version they observe as the runaway
             * event is not pertinent to them.
             *
             * We don't need any lock here as the runaway detector is singleton,
             * and only the detector can update this variable.
             */
            *latestRunawayVersion = *CurrentVersion + 1;

            /*
             * Make sure that the runaway event version is not shared with any other
             * processes, and not shared with any other deactivation/reactivation version
             */
            *CurrentVersion = *CurrentVersion + 2;

            Assert(CLEANUP_COUNTDOWN_BEFORE_RUNAWAY == maxActiveVmemSessionState->cleanupCountdown);

            /*
             * Determine how many processes need to clean up to mark the session clean.
             */
            maxActiveVmemSessionState->cleanupCountdown = maxActiveVmemSessionState->activeProcessCount;

            if (maxActiveVmemSessionState == maxVmemSessionState)
            {
                /* Finally signal the runaway process for cleanup */
                maxActiveVmemSessionState->runawayStatus = RunawayStatus_PrimaryRunawaySession;
            }
            else
            {
                maxActiveVmemSessionState->runawayStatus = RunawayStatus_SecondaryRunawaySession;
            }

            /* Save the amount of vmem the session was holding when it was flagged as runaway */
            maxActiveVmemSessionState->sessionVmemRunaway = maxActiveVmemSessionState->sessionVmem;

            /* Save the command count currently running in the runaway session */
            maxActiveVmemSessionState->commandCountRunaway = gp_command_count;
        }
        else
        {
            /*
             * Failed to find any viable runaway session. Reset the runaway detector flag
             * for another round of runaway determination at a later time. As we couldn't
             * find any runaway session, the CurrentVersion is not changed.
             */
            *isRunawayDetector = 0;
        }

        SpinLockRelease(&maxActiveVmemSessionState->spinLock);
    }
    else
    {
        /*
         * No active session to mark as runaway. So, re-enable the runaway detection process.
         */
        *isRunawayDetector = 0;
    }

    LWLockRelease(SessionStateLock);
}