/* Returns true if the current process should start a runaway cleanup */
static bool
RunawayCleaner_ShouldStartRunawayCleanup()
{
	if (NULL != MySessionState && MySessionState->runawayStatus != RunawayStatus_NotRunaway &&
			beginCleanupRunawayVersion != *latestRunawayVersion)
	{
		AssertImply(isProcessActive, activationVersion >= deactivationVersion);
		AssertImply(!isProcessActive, deactivationVersion >= activationVersion);

		/*
		 * We are marked as runaway. Therefore, if the runaway event happened before deactivation,
		 * we must have a version counter increment
		 */
		AssertImply(*latestRunawayVersion < deactivationVersion && !isProcessActive, activationVersion < deactivationVersion);

		if (isProcessActive && *latestRunawayVersion > activationVersion)
		{
			/* Active process and the runaway event came after the activation */
			return true;
		}
		else if (!isProcessActive && *latestRunawayVersion < deactivationVersion &&
				*latestRunawayVersion > activationVersion)
		{
			/*
			 * The process is deactivated, but there is a pending runaway event before
			 * the deactivation for which this process never cleaned up
			 */
			return true;
		}
	}

	return false;
}
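
The following is a minimal, self-contained sketch of the decision above, using made-up version numbers; it models only the activation/deactivation ordering, not the MySessionState and beginCleanupRunawayVersion guards.

/*
 * Toy model of RunawayCleaner_ShouldStartRunawayCleanup()'s version check
 * (illustrative values only).
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static bool
should_start_cleanup(bool isActive, int activationVer, int deactivationVer,
					 int runawayVer)
{
	if (isActive)
		return runawayVer > activationVer;

	/* Deactivated: only a pending event from before the deactivation counts */
	return runawayVer > activationVer && runawayVer < deactivationVer;
}

int
main(void)
{
	/* Active process, runaway event published after activation: clean up */
	assert(should_start_cleanup(true, 5, 3, 7));

	/* Active process, stale event from before activation: nothing to do */
	assert(!should_start_cleanup(true, 5, 3, 4));

	/* Deactivated process with a pending pre-deactivation event: clean up */
	assert(should_start_cleanup(false, 5, 8, 6));

	printf("version ordering behaves as described\n");
	return 0;
}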
Example #2
/* ----------
 * toast_delete -
 *
 *	Cascaded delete toast-entries on DELETE
 * ----------
 */
void
toast_delete(Relation rel, HeapTuple oldtup, MemTupleBinding *pbind)
{
	TupleDesc	tupleDesc;
	Form_pg_attribute *att;
	int			numAttrs;
	int			i;
	Datum		toast_values[MaxHeapAttributeNumber];
	bool		toast_isnull[MaxHeapAttributeNumber];
	bool 		ismemtuple = is_heaptuple_memtuple(oldtup);
	
	AssertImply(ismemtuple, pbind);
	AssertImply(!ismemtuple, !pbind);

	/*
	 * We should only ever be called for tuples of plain relations ---
	 * recursing on a toast rel is bad news.
	 */
	Assert(rel->rd_rel->relkind == RELKIND_RELATION);

	/*
	 * Get the tuple descriptor and break down the tuple into fields.
	 *
	 * NOTE: it's debatable whether to use heap_deform_tuple() here or just
	 * heap_getattr() only the varlena columns.  The latter could win if there
	 * are few varlena columns and many non-varlena ones. However,
	 * heap_deform_tuple costs only O(N) while the heap_getattr way would cost
	 * O(N^2) if there are many varlena columns, so it seems better to err on
	 * the side of linear cost.  (We won't even be here unless there's at
	 * least one varlena column, by the way.)
	 */
	tupleDesc = rel->rd_att;
	att = tupleDesc->attrs;
	numAttrs = tupleDesc->natts;

	Assert(numAttrs <= MaxHeapAttributeNumber);

	if(ismemtuple)
		memtuple_deform((MemTuple) oldtup, pbind, toast_values, toast_isnull);
	else
		heap_deform_tuple(oldtup, tupleDesc, toast_values, toast_isnull);

	/*
	 * Check for external stored attributes and delete them from the secondary
	 * relation.
	 */
	for (i = 0; i < numAttrs; i++)
	{
		if (att[i]->attlen == -1)
		{
			Datum		value = toast_values[i];

			if (!toast_isnull[i] && VARATT_IS_EXTERNAL_D(value))
				toast_delete_datum(rel, value);
		}
	}
}
Example #3
/*
 * Looks up an entry with a given key in the hashtable.
 * Returns pointer to the entry if found, NULL otherwise.
 *
 * This function is synchronized. Returned entry is AddRef'ed and needs to
 * be released.
 */
void *
SyncHTLookup(SyncHT *syncHT, void *key)
{
	Assert(NULL != syncHT);
	Assert(NULL != key);

	LWLockId partitionLock = SyncHTPartLockId(syncHT, key);

	LWLockAcquire(partitionLock, LW_SHARED);

	bool existing = false;
	void *entry = hash_search(syncHT->ht, key, HASH_FIND, &existing);

	AssertImply(entry != NULL, existing);

	/* AddRef the entry if found */
	if (entry != NULL)
	{
		SyncHTAddRef(syncHT, entry);
	}

	LWLockRelease(partitionLock);

	return entry;
}
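
A hypothetical caller sketch (not part of the original source): it assumes a release routine, called SyncHTRelease() here purely for illustration, that drops the reference taken inside SyncHTLookup().

static void
example_lookup(SyncHT *syncHT, void *key)
{
	void	   *entry = SyncHTLookup(syncHT, key);

	if (entry != NULL)
	{
		/* ... read fields of the AddRef'ed entry ... */

		/* Drop the reference taken by SyncHTLookup(); hypothetical name */
		SyncHTRelease(syncHT, entry);
	}
}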
/*
 * Get the next value.  On success, a non-negative value is returned and *out is populated with the value
 *   that was on the top of the heap.
 *
 * If this is an array-backed heap, then *out is inserted into the heap.  If it's a reader-backed heap, then
 *    *out is ignored on input.
 */
int mkheap_putAndGet(MKHeap *mkheap, MKEntry *out)
{
	int ret = 0;
	Assert(out);

	/*
	 * fetch from appropriate source
	 *
	 * note that these two cases don't behave the same in terms of how *out is treated.
	 *    mkheap_putAndGet_reader should be called mkheap_get_reader -- it never puts the input value
	 *    mkheap_putAndGet_impl will put *out if it's not empty, and then do the get.
	 */
	if (mkheap->nreader > 0)
		ret = mkheap_putAndGet_reader(mkheap, out);
	else
		ret = mkheap_putAndGet_impl(mkheap, out);

	/* check: underlying call must have enforced uniqueness */
	AssertImply(mkheap->mkctxt->enforceUnique, ret != 0);

	/* free *out */
	if (mkheap->mkctxt->cpfr)
		(mkheap->mkctxt->cpfr)(out, NULL, mkheap->mkctxt->lvctxt + mke_get_lv(out));
	return ret;
}
Example #5
/*
 * Close temporary files and delete their underlying files.
 *
 * isProcExit: if true, this is being called as the backend process is
 * exiting. If that's the case, we should remove all temporary files; if
 * that's not the case, we are being called for transaction commit/abort
 * and should only remove transaction-local temp files.  In either case,
 * also clean up "allocated" stdio files and dirs.
 */
static void
CleanupTempFiles(bool isProcExit)
{
	Index		i;

	if (SizeVfdCache > 0)
	{
		Assert(FileIsNotOpen(0));		/* Make sure ring not corrupted */
		for (i = 1; i < SizeVfdCache; i++)
		{
			unsigned short fdstate = VfdCache[i].fdstate;

			/*
			 * If we're in the process of exiting a backend process, close
			 * all temporary files. Otherwise, only close temporary files
			 * local to the current transaction.
			 */
			if ((fdstate & FD_CLOSE_AT_EOXACT) ||
				(isProcExit && (fdstate & FD_TEMPORARY)))
			{
				AssertImply((fdstate & FD_TEMPORARY), VfdCache[i].fileName != NULL);
				FileClose(i);
			}
		}
	}

	workfile_mgr_cleanup();

	while (numAllocatedDescs > 0)
		FreeDesc(&allocatedDescs[0]);
}
Example #6
/*
 * Create a new file set
 *   type is the WorkFileType for the files: BUFFILE or BFZ
 *   can_be_reused: if set to false, then we don't insert this set into the cache,
 *     since the caller is telling us there is no point. This can happen for
 *     example when spilling during index creation.
 *   ps is the PlanState for the subtree rooted at the operator
 *
 */
workfile_set *
workfile_mgr_create_set(enum ExecWorkFileType type, bool can_be_reused, PlanState *ps)
{
	Assert(NULL != workfile_mgr_cache);

	Plan *plan = NULL;
	if (ps != NULL)
	{
		plan = ps->plan;
	}

	AssertImply(can_be_reused, plan != NULL);

	NodeTag node_type = T_Invalid;
	if (ps != NULL)
	{
		node_type = ps->type;
	}
	char *dir_path = create_workset_directory(node_type, currentSliceId);


	if (!workfile_sets_resowner_callback_registered)
	{
		RegisterResourceReleaseCallback(workfile_set_free_callback, NULL);
		workfile_sets_resowner_callback_registered = true;
	}

	/* Create parameter info for the populate function */
	workset_info set_info;
	set_info.file_type = type;
	set_info.nodeType = node_type;
	set_info.dir_path = dir_path;
	set_info.session_start_time = GetCurrentTimestamp();
	set_info.operator_work_mem = get_operator_work_mem(ps);

	CacheEntry *newEntry = Cache_AcquireEntry(workfile_mgr_cache, &set_info);

	if (NULL == newEntry)
	{
		/* Clean up the directory we created. */
		workfile_mgr_delete_set_directory(dir_path);

		/* Could not acquire another entry from the cache - we filled it up */
		ereport(ERROR,
				(errmsg("could not create workfile manager entry: exceeded number of concurrent spilling queries")));
	}

	/* Path has now been copied to the workfile_set. We can free it */
	pfree(dir_path);

	/* Complete initialization of the entry with post-acquire actions */
	Assert(NULL != newEntry);
	workfile_set *work_set = CACHE_ENTRY_PAYLOAD(newEntry);
	Assert(work_set != NULL);

	elog(gp_workfile_caching_loglevel, "new spill file set. key=0x%x prefix=%s opMemKB=" INT64_FORMAT,
			work_set->key, work_set->path, work_set->metadata.operator_work_mem);

	return work_set;
}
Example #7
/*
 * Marks the current process as idle; i.e., it is no longer able to respond
 * to a runaway cleanup. However, before returning, this method triggers one
 * last runaway cleanup for a pre-deactivation era runaway event, if
 * necessary.
 */
void
IdleTracker_DeactivateProcess()
{
	if (NULL != MySessionState)
	{
		/*
		 * Verify that deactivation during proc_exit_inprogress is protected in
		 * critical section or the interrupt is disabled so that we don't attempt
		 * any runaway cleanup
		 */
		AssertImply(proc_exit_inprogress, CritSectionCount > 0 || InterruptHoldoffCount > 0);

		/*
		 * When an idle process receives a SIGTERM, the signal handler die()
		 * calls the cleanup directly, so we can get here for an idle process.
		 * Instead of forcefully re-activating it, just special-case it and do
		 * nothing during process exit for already-inactive processes.
		 */
		if (proc_exit_inprogress && ! isProcessActive)
		{
			Assert(deactivationVersion >= activationVersion);
			return;
		}

		Assert(isProcessActive);
		Assert(deactivationVersion <= activationVersion);

		/* No new runaway event can come in */
		SpinLockAcquire(&MySessionState->spinLock);

		Assert(MySessionState->activeProcessCount <= MySessionState->pinCount);
		/* No atomic update necessary as the update is protected by spin lock */
		MySessionState->activeProcessCount -= 1;
		Assert(0 <= MySessionState->activeProcessCount);
		MySessionState->idle_start = GetCurrentTimestamp();
		isProcessActive = false;

		/* Save the point where we reduced the activeProcessCount */
		deactivationVersion = *CurrentVersion;
		/*
		 * Release spinLock as we no longer contend for isRunaway.
		 */
		SpinLockRelease(&MySessionState->spinLock);

		/*
		 * We are still deactivated (i.e., activeProcessCount is decremented). If an ERROR is indeed thrown
		 * from the VmemTracker_StartCleanupIfRunaway, the VmemTracker_RunawayCleanupDoneForProcess()
		 * method would reactivate this process.
		 */
		RunawayCleaner_StartCleanup();

		/* At this point the process must be clean, unless the runaway event arrived after the deactivation */
		Assert(*latestRunawayVersion > deactivationVersion ||
				!RunawayCleaner_IsCleanupInProgress());
	}

	/* At this point the process is ready to be blocked in ReadCommand() */
}
/*
 * Updating accounting of size when closing a temporary file we created
 */
static void
adjust_size_temp_file_new(workfile_set *work_set, int64 size)
{
#if USE_ASSERT_CHECKING
	bool isCached = (NULL != work_set) && Cache_IsCached(CACHE_ENTRY_HEADER(work_set));
#endif
	Assert(!isCached);
	AssertImply((NULL != work_set), work_set->size == 0);
	AssertImply((NULL != work_set), work_set->in_progress_size >= size);

	if (NULL != work_set)
	{
		work_set->in_progress_size -= size;
	}

	WorkfileDiskspace_Commit(0, size, true /* update_query_size */);
	elog(gp_workfile_caching_loglevel, "closed and deleted temp file, subtracted size " INT64_FORMAT " from disk space", size);
}
Example #9
/*
 * Updating accounting of size when closing a temporary file we created
 */
static void
adjust_size_temp_file_new(workfile_set *work_set, int64 size)
{
#if USE_ASSERT_CHECKING
	bool isCached = (NULL != work_set) && Cache_IsCached(CACHE_ENTRY_HEADER(work_set));
#endif
	Assert(!isCached);
	AssertImply((NULL != work_set), work_set->size == 0);
	AssertImply((NULL != work_set), work_set->in_progress_size >= size);

	if (NULL != work_set)
	{
		work_set->in_progress_size -= size;
	}

	WorkfileDiskspace_Commit(0 /* commit_bytes */, size, true /* update_query_size */);
	elog(gp_workfile_caching_loglevel, "closed and deleted temp file, subtracted size " INT64_FORMAT " from disk space", size);

	/* About to physically delete a file we created. Update the per-query file count as well */
	WorkfileQueryspace_SubtractWorkfile(1 /* nFiles */);
}
Example #10
/*
 * makeCdbSreh
 *
 * Allocate and initialize a Single Row Error Handling state object.
 * Pass in the only known parameters (both we get from the SQL stmt),
 * the other variables are set later on, when they are known.
 */
CdbSreh *
makeCdbSreh(bool is_keep, bool reusing_existing_errtable,
			int rejectlimit, bool is_limit_in_rows, 
			RangeVar *errortable, char *filename, char *relname,
			bool log_to_file)
{
	CdbSreh	*h;

	h = palloc(sizeof(CdbSreh));
	
	h->errmsg = NULL;
	h->rawdata = NULL;
	h->linenumber = 0;
	h->processed = 0;
	h->relname = relname;
	h->rejectlimit = rejectlimit;
	h->is_limit_in_rows = is_limit_in_rows;
	h->rejectcount = 0;
	h->is_server_enc = false;
	h->is_keep = is_keep;
	h->should_drop = false; /* we'll decide later */
	h->reusing_errtbl = reusing_existing_errtable;
	h->cdbcopy = NULL;
	h->errtbl = NULL;
	h->lastsegid = 0;
	h->consec_csv_err = 0;
	AssertImply(log_to_file, errortable == NULL);
	h->log_to_file = log_to_file;

	snprintf(h->filename, sizeof(h->filename),
			 "%s", filename ? filename : "<stdin>");

	/* If an error table was specified, open it (and create it first if necessary) */
	if(errortable)
		OpenErrorTable(h, errortable);
	
	/*
	 * Create a temporary memory context that we can reset once per row to
	 * recover palloc'd memory.  This avoids any problems with leaks inside
	 * datatype input routines, and should be faster than retail pfree's
	 * anyway.
	 */
	h->badrowcontext = AllocSetContextCreate(CurrentMemoryContext,
											   "SrehMemCtxt",
											   ALLOCSET_DEFAULT_MINSIZE,
											   ALLOCSET_DEFAULT_INITSIZE,
											   ALLOCSET_DEFAULT_MAXSIZE);
	
	return h;
}
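
An illustrative call only (the argument values are made up). Note the constraint enforced by the AssertImply above: when log_to_file is true, errortable must be NULL.

	CdbSreh    *sreh = makeCdbSreh(false,		/* is_keep */
								   false,		/* reusing_existing_errtable */
								   100,			/* rejectlimit */
								   true,		/* is_limit_in_rows */
								   NULL,		/* errortable: must be NULL when logging to file */
								   "data.csv",	/* filename */
								   "my_table",	/* relname */
								   true);		/* log_to_file */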
Example #11
/*
 * InitScanStateInternal
 *   Initialize ScanState common variables for various Scan node.
 */
void
InitScanStateInternal(ScanState *scanState, Plan *plan, EState *estate,
		int eflags, bool initCurrentRelation)
{
	Assert(IsA(plan, SeqScan) ||
		   IsA(plan, AppendOnlyScan) ||
		   IsA(plan, ParquetScan) ||
		   IsA(plan, TableScan) ||
		   IsA(plan, DynamicTableScan) ||
		   IsA(plan, BitmapTableScan));

	PlanState *planState = &scanState->ps;

	planState->plan = plan;
	planState->state = estate;

	/* Create expression evaluation context */
	ExecAssignExprContext(estate, planState);
	
	/* Initialize tuple table slot */
	ExecInitResultTupleSlot(estate, planState);
	ExecInitScanTupleSlot(estate, scanState);
	
	/*
	 * For dynamic table scan, we do not initialize expression states; instead
	 * we wait until the first partition, and initialize the expression state
	 * at that time. Also, for dynamic table scan, we do not need to open the
	 * parent partition relation.
	 */
	if (initCurrentRelation)
	{
		InitScanStateRelationDetails(scanState, plan, estate);
	}

	/* Initialize result tuple type. */
	ExecAssignResultTypeFromTL(planState);

	/*
	 * If eflags contains EXEC_FLAG_REWIND, EXEC_FLAG_BACKWARD, or EXEC_FLAG_MARK,
	 * then this node is not eager-free safe.
	 */
	scanState->ps.delayEagerFree =
		((eflags & (EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)) != 0);

	/* Currently, only SeqScan supports Mark/Restore. */
	AssertImply((eflags & EXEC_FLAG_MARK) != 0, IsA(plan, SeqScan));

}
Example #12
/*
 * Create a multi-key heap from an array of entries
 *
 * entries: the values to convert to a heap.  This array will be under mkheap's ownership
 * alloc_sz: the allocation size of entries: that is, how much room the array has.
 * cnt: the number of elements in entries which should be used to build the heap
 * mkctxt: description of the heap to build
 *
 * If alloc_sz is zero then entries must be NULL
 */
MKHeap *
mkheap_from_array(MKEntry *entries, int alloc_sz, int cnt, MKContext *mkctxt)
{
	MKHeap	   *heap = (MKHeap *) palloc(sizeof(MKHeap));

	Assert(mkctxt);
	Assert(alloc_sz >= cnt);
	AssertEquivalent(entries != NULL, cnt > 0);
	AssertEquivalent(!entries, cnt == 0);

	heap->mkctxt = mkctxt;
	heap->lvtops = palloc0(mkctxt->total_lv * sizeof(MKEntry));

	heap->readers = NULL;
	heap->nreader = 0;

	AssertImply(alloc_sz == 0, !entries);
	Assert(cnt >= 0 && cnt <= alloc_sz);

	heap->p = entries;
	heap->alloc_size = alloc_sz;
	heap->count = cnt;
	heap->maxentry = cnt;

#ifdef USE_ASSERT_CHECKING
	{
		int			i;

		for (i = 0; i < cnt; ++i)
		{
			Assert(mke_get_lv(entries + i) == 0);
			Assert(mke_get_reader(entries + i) == 0);
		}
	}
#endif

	/*
	 * note: see NOTE ON UNIQUENESS CHECKING  at the top of this file for
	 * information about why we don't check uniqueness here
	 */

	mk_prepare_array(entries, 0, cnt - 1, 0, mkctxt);
	mkheap_heapify(heap, true);
	return heap;
}
Example #13
/*
 * For a new workfile, sets the capabilities flags according to
 * the known underlying file type capabilities and the method the file was created
 */
static void
ExecWorkFile_SetFlags(ExecWorkFile *workfile, bool delOnClose, bool created)
{
	Assert(workfile != NULL);
	/* Assert that only the creator of a file can delete it on close */
	AssertImply(delOnClose, created);

	switch(workfile->fileType)
	{

	case BUFFILE:
		workfile->flags |= EXEC_WORKFILE_RANDOM_ACCESS;
		break;
	case BFZ:
		workfile->flags |= EXEC_WORKFILE_SUSPENDABLE;
		break;
	default:
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	if (delOnClose)
	{
		workfile->flags |= EXEC_WORKFILE_DEL_ON_CLOSE;
	}

	if (created)
	{
		workfile->flags |= EXEC_WORKFILE_CREATED;
		elog(gp_workfile_caching_loglevel, "Created workfile %s, delOnClose = %d",
				ExecWorkFile_GetFileName(workfile), delOnClose);
	}
	else
	{
		elog(gp_workfile_caching_loglevel, "Opened existing workfile %s, delOnClose = %d",
				ExecWorkFile_GetFileName(workfile), delOnClose);
	}

	if ((gp_workfile_limit_per_query > 0) || (gp_workfile_limit_per_segment > 0))
	{
		workfile->flags |= EXEC_WORKFILE_LIMIT_SIZE;
	}

}
Example #14
/*
 * Open a temporary file that will (optionally) disappear when we close it.
 *
 * If 'makenameunique' is true, this function generates a file name which
 * should be unique to this particular OpenTemporaryFile() request and
 * distinct from any others in concurrent use on the same host.  As a
 * convenience for monitoring and debugging, the given 'fileName' string
 * and 'extentseqnum' are embedded in the file name.
 *
 * If 'makenameunique' is false, then 'fileName' and 'extentseqnum' identify a
 * new or existing temporary file which other processes also could open and
 * share.
 *
 * If 'create' is true, a new file is created.  If successful, a valid vfd
 * index (>0) is returned; otherwise an error is thrown.
 *
 * If 'create' is false, an existing file is opened.  If successful, a valid
 * vfd index (>0) is returned.  If the file does not exist or cannot be
 * opened, an invalid vfd index (<= 0) is returned.
 *
 * If 'delOnClose' is true, then the file is removed when you call
 * FileClose(); or when the process exits; or (provided 'closeAtEOXact' is
 * true) when the transaction ends.
 *
 * If 'closeAtEOXact' is true, the vfd is closed automatically at end of
 * transaction unless you have called FileClose() to close it before then.
 * If 'closeAtEOXact' is false, the vfd state is not changed at end of
 * transaction.
 *
 * In most cases, you don't want temporary files to outlive the transaction
 * that created them, so you should specify 'true' for both 'delOnClose' and
 * 'closeAtEOXact'.
 */
File
OpenTemporaryFile(const char   *fileName,
                  int           extentseqnum,
                  bool          makenameunique,
                  bool          create,
                  bool          delOnClose,
                  bool          closeAtEOXact)
{
	char		tempfilepath[MAXPGPATH];
	char		tempfileprefix[MAXPGPATH];
	int			len;

	Assert(fileName);
	AssertImply(makenameunique, create && delOnClose);

	len = GetTempFilePrefix(tempfileprefix, MAXPGPATH, fileName);
	insist_log(len <= MAXPGPATH - 1, "could not generate temporary file name");

	if (makenameunique)
	{
		/*
		 * Generate a tempfile name that should be unique within the current
		 * database instance.
		 */
		snprintf(tempfilepath, sizeof(tempfilepath),
				 "%s_%d_%04d.%ld",
				 tempfileprefix,
				 MyProcPid,
				 extentseqnum,
				 tempFileCounter++);
	}
	else
	{
		snprintf(tempfilepath, sizeof(tempfilepath),
				 "%s.%04d",
				 tempfileprefix,
				 extentseqnum);
	}

	return OpenNamedFile(tempfilepath, create, delOnClose, closeAtEOXact);
}	/* OpenTemporaryFile */
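
An illustrative call only (the name hint and extent number are made up). As the comment above recommends, a transaction-local scratch file passes true for both delOnClose and closeAtEOXact, so the file disappears no later than end of transaction.

	File		tmpfile = OpenTemporaryFile("sort_run",	/* fileName (name hint) */
											0,			/* extentseqnum */
											true,		/* makenameunique */
											true,		/* create */
											true,		/* delOnClose */
											true);		/* closeAtEOXact */

	/* ... FileWrite()/FileRead() against tmpfile ... */

	FileClose(tmpfile);		/* also removes the file, because delOnClose was true */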
Example #15
/*
 * DynamicScan_InitNextPartition
 *		Prepares the next partition for scanning by calling various
 *		helper methods to open relation, map dropped attributes,
 *		initialize expressions etc.
 */
static bool
DynamicScan_InitNextPartition(ScanState *scanState, PartitionInitMethod *partitionInitMethod, PartitionEndMethod *partitionEndMethod, PartitionReScanMethod *partitionReScanMethod)
{
	Assert(isDynamicScan((Scan *)scanState->ps.plan));
	AssertImply(scanState->scan_state != SCAN_INIT, NULL != scanState->ss_currentRelation);

	Scan *scan = (Scan *)scanState->ps.plan;
	DynamicTableScanInfo *partitionInfo = scanState->ps.state->dynamicTableScanInfo;
	Assert(partitionInfo->numScans >= scan->partIndex);
	int32 numSelectors = list_nth_int(partitionInfo->numSelectorsPerScanId, scan->partIndex);

	Oid newOid = DynamicScan_AdvanceIterator(scanState, numSelectors);

	if (!OidIsValid(newOid))
	{
		return false;
	}

	Relation oldRelation = NULL;
	Relation newRelation = NULL;

	DynamicScan_ObtainRelations(scanState, newOid, &oldRelation, &newRelation);
	/* Either we have a new relation or this is the first relation */
	if (oldRelation != newRelation || NULL == scanState->ss_currentRelation)
	{
		AttrNumber *attMap = DynamicScan_MapRelationColumns(scanState, oldRelation, newRelation);

		DynamicScan_RemapExpression(scanState, attMap, (Node*)scanState->ps.plan->qual);
		DynamicScan_RemapExpression(scanState, attMap, (Node*)scanState->ps.plan->targetlist);

		/*
		 * We only initialize expression if this is the first partition
		 * or if the column mapping changes between two partitions.
		 * Otherwise, we reuse the previously initialized expression.
		 */
		bool initExpressions = (NULL != attMap || SCAN_INIT == scanState->scan_state);

		if (newRelation != oldRelation)
		{
			/* Close the old relation */
			DynamicScan_CleanupOneRelation(scanState, oldRelation, partitionEndMethod);
		}

		DynamicScan_UpdateScanStateForNewPart(scanState, newRelation);

		if (initExpressions)
		{
			DynamicScan_InitExpr(scanState);
		}

		partitionInitMethod(scanState, attMap);

		if (NULL != attMap)
		{
			pfree(attMap);
			attMap = NULL;
		}
	}
	else
	{
		/* Rescan of the same part */
		partitionReScanMethod(scanState);
	}

	/* Collect number of partitions scanned in EXPLAIN ANALYZE */
	if(NULL != scanState->ps.instrument)
	{
		Instrumentation *instr = scanState->ps.instrument;
		instr->numPartScanned ++;
	}

	return true;
}
Example #16
/*
 * Create a new file set
 *   type is the WorkFileType for the files: BUFFILE or BFZ
 *   can_be_reused: if set to false, then we don't insert this set into the cache,
 *     since the caller is telling us there is no point. This can happen for
 *     example when spilling during index creation.
 *   ps is the PlanState for the subtree rooted at the operator
 *   snapshot contains snapshot information for the current transaction
 *
 */
workfile_set *
workfile_mgr_create_set(enum ExecWorkFileType type, bool can_be_reused, PlanState *ps, workfile_set_snapshot snapshot)
{
	Assert(NULL != workfile_mgr_cache);

	Plan *plan = NULL;
	if (ps != NULL)
	{
		plan = ps->plan;
	}

	AssertImply(can_be_reused, plan != NULL);

	NodeTag node_type = T_Invalid;
	if (ps != NULL)
	{
		node_type = ps->type;
	}
	char *dir_path = create_workset_directory(node_type, currentSliceId);

	/* Create parameter info for the populate function */
	workset_info set_info;
	set_info.file_type = type;
	set_info.snapshot = snapshot;
	set_info.nodeType = node_type;
	set_info.can_be_reused = can_be_reused && workfile_mgr_is_reusable(ps);
	set_info.dir_path = dir_path;
	set_info.session_start_time = GetCurrentTimestamp();
	set_info.operator_work_mem = get_operator_work_mem(ps);
	set_info.on_disk = true;

	CacheEntry *newEntry = NULL;

	PG_TRY();
	{
		newEntry = acquire_entry_retry(workfile_mgr_cache, &set_info);
	}
	PG_CATCH();
	{
		/* Failed to acquire new entry, cache full. Clean up the directory we created. */
		workfile_mgr_delete_set_directory(dir_path);
		PG_RE_THROW();
	}
	PG_END_TRY();

	/* Path has now been copied to the workfile_set. We can free it */
	pfree(dir_path);

	/* Complete initialization of the entry with post-acquire actions */
	Assert(NULL != newEntry);
	workfile_set *work_set = CACHE_ENTRY_PAYLOAD(newEntry);
	Assert(work_set != NULL);
	if (work_set->can_be_reused)
	{
		Assert(plan != NULL);
		Assert(nodeTag(plan) >= T_Plan && nodeTag(plan) < T_PlanInvalItem);

		workfile_set_plan *s_plan = workfile_mgr_serialize_plan(ps);
		work_set->key = workfile_mgr_hash_key(s_plan);
		workfile_mgr_save_plan(work_set, s_plan);
		workfile_mgr_free_plan(s_plan);
	}

	elog(gp_workfile_caching_loglevel, "new spill file set. key=0x%x can_be_reused=%d prefix=%s opMemKB=" INT64_FORMAT,
			work_set->key, work_set->can_be_reused, work_set->path, work_set->metadata.operator_work_mem);

	return work_set;
}
Example #17
/**
 * BackoffSweeper() looks at all the backend structures to determine if any
 * backends are not making progress. This is done by inspecting the lastchecked
 * time.  It also calculates the total weight of all 'active' backends to
 * re-calculate the target CPU usage per backend process. If it finds that a
 * backend is trying to request more CPU resources than the maximum CPU that it
 * can get (such a backend is called a 'pegger'), it assigns maxCPU to it.
 *
 * For example:
 * Let Qi be the ith query statement, Ri be the target CPU usage for Qi,
 * Wi be the statement weight for Qi, W be the total statements weight.
 * For simplicity, let's assume every statement only has 1 backend per segment.
 *
 * Let there be 4 active queries with weights {1,100,10,1000} with K=3 CPUs
 * available per segment to share. The maximum CPU that a backend can get is
 * maxCPU = 1.0. The total active statements weight is
 * W (activeWeight) = 1 + 100 + 10 + 1000 = 1111.
 * The following algorithm determines that Q4 is a pegger, because
 * K * W4 / W > maxCPU, which is 3000/1111 > 1.0, so we assign R4 = 1.0.
 * Now K becomes 2.0, W becomes 111.
 * It restarts from the beginning and determines that Q2 is now a pegger as
 * well, because K * W2 / W > maxCPU, which is 200/111 > 1.0, we assign
 * R2 = 1.0. Now there is only 1 CPU left and no peggers left. We continue
 * to distribute the left 1 CPU to other backends according to their weight,
 * so we assign the target CPU ratio of R1=1/11 and R3=10/11. The final
 * target CPU assignments are {0.09,1.0,0.91,1.0}.
 *
 * If there are multiple backends within a segment running for the query Qi,
 * the target CPU ratio Ri for query Qi is divided equally among all the
 * active backends belonging to the query.
 */
void
BackoffSweeper()
{
	int			i = 0;

	/* The overall weight of active statements */
	volatile double activeWeight = 0.0;
	int			numActiveBackends = 0;
	int			numActiveStatements = 0;

	/* The overall weight of active and inactive statements */
	int			totalStatementWeight = 0;
	int			numValidBackends = 0;
	int			numStatements = 0;

	struct timeval currentTime;

	if (gettimeofday(&currentTime, NULL) < 0)
	{
		elog(ERROR, "Unable to execute gettimeofday(). Please disable query prioritization.");
	}

	Assert(backoffSingleton->sweeperInProgress == false);

	backoffSingleton->sweeperInProgress = true;

	TRACE_POSTGRESQL_BACKOFF_GLOBALCHECK();

	/* Reset status for all the backend entries */
	for (i = 0; i < backoffSingleton->numEntries; i++)
	{
		BackoffBackendSharedEntry *se = getBackoffEntryRW(i);

		se->isActive = false;
		se->numFollowersActive = 0;
		se->backoff = true;
	}

	/*
	 * Mark backends that are active. Count of active group members is
	 * maintained at their group leader.
	 */
	for (i = 0; i < backoffSingleton->numEntries; i++)
	{
		BackoffBackendSharedEntry *se = getBackoffEntryRW(i);

		if (isValid(&se->statementId))
		{
			Assert(se->weight > 0);
			if (TIMEVAL_DIFF_USEC(currentTime, se->lastCheckTime)
				< gp_resqueue_priority_inactivity_timeout * 1000.0)
			{
				/*
				 * This is an active backend. Need to maintain count at group
				 * leader
				 */
				BackoffBackendSharedEntry *gl = getBackoffEntryRW(se->groupLeaderIndex);

				if (gl->numFollowersActive == 0)
				{
					activeWeight += se->weight;
					numActiveStatements++;
				}
				gl->numFollowersActive++;
				numActiveBackends++;
				se->isActive = true;
			}
			if (isGroupLeader(i))
			{
				totalStatementWeight += se->weight;
				numStatements++;
			}
			numValidBackends++;
		}
	}

	/* Sanity checks */
	Assert(numActiveBackends <= numValidBackends);
	Assert(numValidBackends >= numStatements);

	/**
	 * Under certain conditions, we want to avoid backoff. Cases are:
	 * 1. A statement just entered or exited
	 * 2. A statement's weight changed due to user intervention via gp_adjust_priority()
	 * 3. There is no active backend
	 * 4. There is exactly one statement
	 * 5. Total number of valid backends <= number of procs per segment
	 * Cases 1 and 2 are approximated by checking whether the total statement weight changed since the last sweeper loop.
	 */
	if (backoffSingleton->lastTotalStatementWeight != totalStatementWeight
		|| numActiveBackends == 0
		|| numStatements == 1
		|| numValidBackends <= numProcsPerSegment())
	{
		/* Write to targets */
		for (i = 0; i < backoffSingleton->numEntries; i++)
		{
			BackoffBackendSharedEntry *se = getBackoffEntryRW(i);

			se->backoff = false;
			se->earlyBackoffExit = true;
			se->targetUsage = 1.0;
		}
	}
	else
	{
		/**
		 * There are multiple statements with active backends.
		 *
		 * Let 'found' be true if we find a backend that is trying to
		 * request more CPU resources than the maximum CPU that it can
		 * get. No matter how high the priority of a query process, it
		 * can utilize at most a single CPU at a time.
		 */
		bool		found = true;
		int			numIterations = 0;
		double		CPUAvailable = numProcsPerSegment();
		double		maxCPU = Min(1.0, numProcsPerSegment());	/* Maximum CPU that a
																 * backend can get */

		Assert(maxCPU > 0.0);

		if (gp_debug_resqueue_priority)
		{
			elog(LOG, "before allocation: active backends = %d, active weight = %f, cpu available = %f", numActiveBackends, activeWeight, CPUAvailable);
		}

		while (found)
		{
			found = false;

			/**
			 * We try to find one or more backends that deserve maxCPU.
			 */
			for (i = 0; i < backoffSingleton->numEntries; i++)
			{
				BackoffBackendSharedEntry *se = getBackoffEntryRW(i);

				if (se->isActive
					&& se->backoff)
				{
					double		targetCPU = 0.0;
					const BackoffBackendSharedEntry *gl = getBackoffEntryRO(se->groupLeaderIndex);

					Assert(gl->numFollowersActive > 0);

					if (activeWeight <= 0.0)
					{
						/*
						 * There is a race condition here:
						 * Backends A, B, and C belong to the same statement and
						 * each has a weight of 100000.
						 *
						 * Timestamp1: backend A's leader is A, backend B's leader is B,
						 * and backend C's leader is also B.
						 *
						 * Timestamp2: the sweeper calculates activeWeight as 200000.
						 *
						 * Timestamp3: backend B changes its leader to A.
						 *
						 * Timestamp4: the sweeper tries to find the backends that
						 * deserve maxCPU; if backends A, B, and C all deserve maxCPU,
						 * then activeWeight = 200000 - 100000/1 - 100000/1 - 100000/2,
						 * which is less than zero.
						 *
						 * We can stop sweeping on such a race condition because the
						 * current backoff mechanism does not require accurate control.
						 */
						backoffSingleton->sweeperInProgress = false;
						elog(LOG, "activeWeight underflow!");
						return;
					}

					Assert(activeWeight > 0.0);
					Assert(se->weight > 0.0);

					targetCPU = (CPUAvailable) * (se->weight) / activeWeight / gl->numFollowersActive;

					/**
					 * Some statements may be weighed so heavily that they are allocated the maximum cpu ratio.
					 */
					if (targetCPU >= maxCPU)
					{
						Assert(numProcsPerSegment() >= 1.0);	/* This can only happen
																 * when there is more
																 * than one proc */
						se->targetUsage = maxCPU;
						se->backoff = false;
						activeWeight -= (se->weight / gl->numFollowersActive);

						CPUAvailable -= maxCPU;
						found = true;
					}
				}
			}
			numIterations++;
			AssertImply(found, (numIterations <= floor(numProcsPerSegment())));
			Assert(numIterations <= ceil(numProcsPerSegment()));
		}

		if (gp_debug_resqueue_priority)
		{
			elog(LOG, "after heavy backends: active backends = %d, active weight = %f, cpu available = %f", numActiveBackends, activeWeight, CPUAvailable);
		}

		/**
		 * Distribute whatever is the CPU available among the rest.
		 */
		for (i = 0; i < backoffSingleton->numEntries; i++)
		{
			BackoffBackendSharedEntry *se = getBackoffEntryRW(i);

			if (se->isActive
				&& se->backoff)
			{
				const BackoffBackendSharedEntry *gl = getBackoffEntryRO(se->groupLeaderIndex);

				Assert(activeWeight > 0.0);
				Assert(gl->numFollowersActive > 0);
				Assert(se->weight > 0.0);
				se->targetUsage = (CPUAvailable) * (se->weight) / activeWeight / gl->numFollowersActive;
			}
		}
	}


	backoffSingleton->lastTotalStatementWeight = totalStatementWeight;
	backoffSingleton->sweeperInProgress = false;

	if (gp_debug_resqueue_priority)
	{
		StringInfoData str;

		initStringInfo(&str);
		appendStringInfo(&str, "num active statements: %d ", numActiveStatements);
		appendStringInfo(&str, "num active backends: %d ", numActiveBackends);
		appendStringInfo(&str, "targetusages: ");
		for (i = 0; i < MaxBackends; i++)
		{
			const BackoffBackendSharedEntry *se = getBackoffEntryRO(i);

			if (se->isActive)
				appendStringInfo(&str, "(%d,%f)", i, se->targetUsage);
		}
		elog(LOG, "%s", (const char *) str.data);
		pfree(str.data);
	}

}
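
The pegger arithmetic from the header comment can be reproduced with the standalone model below (illustration only: one backend per statement, fixed weights {1, 100, 10, 1000} and K = 3 CPUs; it does not touch the real shared-memory bookkeeping). It prints the target ratios {0.09, 1.00, 0.91, 1.00} described above.

#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	double		weights[4] = {1, 100, 10, 1000};
	double		target[4] = {0, 0, 0, 0};
	bool		assigned[4] = {false, false, false, false};
	double		cpuAvailable = 3.0;		/* K */
	double		maxCPU = 1.0;
	double		activeWeight = 1111.0;	/* 1 + 100 + 10 + 1000 */
	bool		found = true;

	/* Repeatedly peel off "peggers" that deserve the full maxCPU */
	while (found)
	{
		found = false;
		for (int i = 0; i < 4; i++)
		{
			if (!assigned[i] &&
				cpuAvailable * weights[i] / activeWeight >= maxCPU)
			{
				target[i] = maxCPU;
				assigned[i] = true;
				activeWeight -= weights[i];
				cpuAvailable -= maxCPU;
				found = true;
			}
		}
	}

	/* Distribute the remaining CPU among the rest according to weight */
	for (int i = 0; i < 4; i++)
	{
		if (!assigned[i])
			target[i] = cpuAvailable * weights[i] / activeWeight;
	}

	for (int i = 0; i < 4; i++)
		printf("Q%d target CPU ratio = %.2f\n", i + 1, target[i]);
	return 0;
}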
Example #18
HeapTuple
toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, 
					   MemTupleBinding *pbind, int toast_tuple_target,
					   bool isFrozen)
{
	HeapTuple	result_tuple;
	TupleDesc	tupleDesc;
	Form_pg_attribute *att;
	int			numAttrs;
	int			i;

	bool		need_change = false;
	bool		need_free = false;
	bool		need_delold = false;
	bool		has_nulls = false;

	Size		maxDataLen;

	char		toast_action[MaxHeapAttributeNumber];
	bool		toast_isnull[MaxHeapAttributeNumber];
	bool		toast_oldisnull[MaxHeapAttributeNumber];
	Datum		toast_values[MaxHeapAttributeNumber];
	Datum		toast_oldvalues[MaxHeapAttributeNumber];
	int32		toast_sizes[MaxHeapAttributeNumber];
	bool		toast_free[MaxHeapAttributeNumber];
	bool		toast_delold[MaxHeapAttributeNumber];

	bool 		ismemtuple = is_heaptuple_memtuple(newtup);

	AssertImply(ismemtuple, pbind);
	AssertImply(!ismemtuple, !pbind);
	AssertImply(ismemtuple && oldtup, is_heaptuple_memtuple(oldtup));
	Assert(toast_tuple_target > 0);
	
	/*
	 * We should only ever be called for tuples of plain relations ---
	 * recursing on a toast rel is bad news.
	 */
	//Assert(rel->rd_rel->relkind == RELKIND_RELATION);
	if (rel->rd_rel->relkind != RELKIND_RELATION)
		elog(LOG,"Why are we toasting a non-relation! %c ",rel->rd_rel->relkind);

	/*
	 * Get the tuple descriptor and break down the tuple(s) into fields.
	 */
	tupleDesc = rel->rd_att;
	att = tupleDesc->attrs;
	numAttrs = tupleDesc->natts;

	Assert(numAttrs <= MaxHeapAttributeNumber);

	if(ismemtuple)
		memtuple_deform((MemTuple) newtup, pbind, toast_values, toast_isnull);
	else
		heap_deform_tuple(newtup, tupleDesc, toast_values, toast_isnull);

	if (oldtup != NULL)
	{
		if(ismemtuple)
			memtuple_deform((MemTuple) oldtup, pbind, toast_oldvalues, toast_oldisnull);
		else
			heap_deform_tuple(oldtup, tupleDesc, toast_oldvalues, toast_oldisnull);
	}
	/* ----------
	 * Then collect information about the values given
	 *
	 * NOTE: toast_action[i] can have these values:
	 *		' '		default handling
	 *		'p'		already processed --- don't touch it
	 *		'x'		incompressible, but OK to move off
	 *
	 * NOTE: toast_sizes[i] is only made valid for varlena attributes with
	 *		toast_action[i] different from 'p'.
	 * ----------
	 */
	memset(toast_action, ' ', numAttrs * sizeof(char));
	memset(toast_free, 0, numAttrs * sizeof(bool));
	memset(toast_delold, 0, numAttrs * sizeof(bool));

	for (i = 0; i < numAttrs; i++)
	{
		varattrib *old_value;
		varattrib *new_value;

		if (oldtup != NULL)
		{
			/*
			 * For UPDATE get the old and new values of this attribute
			 */
			old_value = (varattrib *) DatumGetPointer(toast_oldvalues[i]);
			new_value = (varattrib *) DatumGetPointer(toast_values[i]);

			/*
			 * If the old value is an external stored one, check if it has
			 * changed so we have to delete it later.
			 */
			if (att[i]->attlen == -1 && !toast_oldisnull[i] &&
				VARATT_IS_EXTERNAL(old_value))
			{
				if (toast_isnull[i] || !VARATT_IS_EXTERNAL(new_value) ||
					memcmp((char *) old_value, (char *) new_value,
						   VARSIZE_EXTERNAL(old_value)) != 0)
				{
					/*
					 * The old external stored value isn't needed any more
					 * after the update
					 */
					toast_delold[i] = true;
					need_delold = true;
				}
				else
				{
					/*
					 * This attribute isn't changed by this update so we reuse
					 * the original reference to the old value in the new
					 * tuple.
					 */
					toast_action[i] = 'p';
					continue;
				}
			}
		}
		else
		{
			/*
			 * For INSERT simply get the new value
			 */
			new_value = (varattrib *) DatumGetPointer(toast_values[i]);
		}

		/*
		 * Handle NULL attributes
		 */
		if (toast_isnull[i])
		{
			toast_action[i] = 'p';
			has_nulls = true;
			continue;
		}

		/*
		 * Now look at varlena attributes
		 */
		if (att[i]->attlen == -1)
		{
			/*
			 * If the table's attribute says PLAIN always, force it so.
			 */
			if (att[i]->attstorage == 'p')
				toast_action[i] = 'p';

			/*
			 * We took care of UPDATE above, so any external value we find
			 * still in the tuple must be someone else's we cannot reuse.
			 * Fetch it back (without decompression, unless we are forcing
			 * PLAIN storage).	If necessary, we'll push it out as a new
			 * external value below.
			 */
			if (VARATT_IS_EXTERNAL(new_value))
			{
				if (att[i]->attstorage == 'p')
					new_value = (varattrib *)heap_tuple_untoast_attr((struct varlena *)new_value);
				else
					new_value = (varattrib *)heap_tuple_fetch_attr((struct varlena *)new_value);
				toast_values[i] = PointerGetDatum(new_value);
				toast_free[i] = true;
				need_change = true;
				need_free = true;
			}

			/*
			 * Remember the size of this attribute
			 */
			toast_sizes[i] = VARSIZE_ANY(new_value);
		}
		else
		{
			/*
			 * Not a varlena attribute, plain storage always
			 */
			toast_action[i] = 'p';
		}
	}

	/* ----------
	 * Compress and/or save external until data fits into target length
	 *
	 *	1: Inline compress attributes with attstorage 'x', and store very
	 *	   large attributes with attstorage 'x' or 'e' external immediately
	 *	2: Store attributes with attstorage 'x' or 'e' external
	 *	3: Inline compress attributes with attstorage 'm'
	 *	4: Store attributes with attstorage 'm' external
	 * ----------
	 */

	if(!ismemtuple)
	{
		/* compute header overhead --- this should match heap_form_tuple() */
		maxDataLen = offsetof(HeapTupleHeaderData, t_bits);
		if (has_nulls)
			maxDataLen += BITMAPLEN(numAttrs);
		if (newtup->t_data->t_infomask & HEAP_HASOID)
			maxDataLen += sizeof(Oid);
		maxDataLen = MAXALIGN(maxDataLen);
		Assert(maxDataLen == newtup->t_data->t_hoff);
		/* now convert to a limit on the tuple data size */
		maxDataLen = toast_tuple_target - maxDataLen;
	}
	else
		maxDataLen = toast_tuple_target;

	/*
	 * Look for attributes with attstorage 'x' to compress.  Also find large
	 * attributes with attstorage 'x' or 'e', and store them external.
	 */
	while (compute_dest_tuplen(tupleDesc, pbind, has_nulls, toast_values, toast_isnull) > maxDataLen)
	{
		int			biggest_attno = -1;
		int32		biggest_size = MAXALIGN(sizeof(varattrib));
		Datum		old_value;
		Datum		new_value;

		/*
		 * Search for the biggest yet unprocessed internal attribute
		 */
		for (i = 0; i < numAttrs; i++)
		{
			if (toast_action[i] != ' ')
				continue;
			if (VARATT_IS_EXTERNAL_D(toast_values[i]))
				continue;
			if (VARATT_IS_COMPRESSED_D(toast_values[i]))
				continue;
			if (att[i]->attstorage != 'x')
				continue;
			if (toast_sizes[i] > biggest_size)
			{
				biggest_attno = i;
				biggest_size = toast_sizes[i];
			}
		}

		if (biggest_attno < 0)
			break;

		/*
		 * Attempt to compress it inline, if it has attstorage 'x'
		 */
		i = biggest_attno;
		old_value = toast_values[i];
		new_value = toast_compress_datum(old_value);

		if (DatumGetPointer(new_value) != NULL)
		{
			/* successful compression */
			if (toast_free[i])
				pfree(DatumGetPointer(old_value));
			toast_values[i] = new_value;
			toast_free[i] = true;
			toast_sizes[i] = VARSIZE_D(toast_values[i]);
			need_change = true;
			need_free = true;
		}
		else
		{
			/*
			 * incompressible data, ignore on subsequent compression passes
			 */
			toast_action[i] = 'x';
		}
	}

	/*
	 * Second we look for attributes of attstorage 'x' or 'e' that are still
	 * inline.
	 */
	while (compute_dest_tuplen(tupleDesc, pbind, has_nulls, toast_values, toast_isnull) > maxDataLen &&
		   rel->rd_rel->reltoastrelid != InvalidOid)
	{
		int			biggest_attno = -1;
		int32		biggest_size = MAXALIGN(sizeof(varattrib));
		Datum		old_value;

		/*------
		 * Search for the biggest yet inlined attribute with
		 * attstorage equals 'x' or 'e'
		 *------
		 */
		for (i = 0; i < numAttrs; i++)
		{
			if (toast_action[i] == 'p')
				continue;
			if (VARATT_IS_EXTERNAL_D(toast_values[i]))
				continue;
			if (att[i]->attstorage != 'x' && att[i]->attstorage != 'e')
				continue;
			if (toast_sizes[i] > biggest_size)
			{
				biggest_attno = i;
				biggest_size = toast_sizes[i];
			}
		}

		if (biggest_attno < 0)
			break;

		/*
		 * Store this external
		 */
		i = biggest_attno;
		old_value = toast_values[i];
		toast_action[i] = 'p';
		toast_values[i] = toast_save_datum(rel, toast_values[i], isFrozen);
		if (toast_free[i])
			pfree(DatumGetPointer(old_value));
		toast_free[i] = true;

		need_change = true;
		need_free = true;
	}

	/*
	 * Round 3 - this time we also consider attributes with storage 'm' for
	 * compression
	 */
	while (compute_dest_tuplen(tupleDesc, pbind, has_nulls, toast_values, toast_isnull) > maxDataLen)
	{
		int			biggest_attno = -1;
		int32		biggest_size = MAXALIGN(sizeof(varattrib));
		Datum		old_value;
		Datum		new_value;

		/*
		 * Search for the biggest yet uncompressed internal attribute
		 */
		for (i = 0; i < numAttrs; i++)
		{
			if (toast_action[i] != ' ')
				continue;
			if (VARATT_IS_EXTERNAL_D(toast_values[i]))
				continue;		/* can't happen, toast_action would be 'p' */
			if (VARATT_IS_COMPRESSED_D(toast_values[i]))
				continue;
			if (att[i]->attstorage != 'm')
				continue;
			if (toast_sizes[i] > biggest_size)
			{
				biggest_attno = i;
				biggest_size = toast_sizes[i];
			}
		}

		if (biggest_attno < 0)
			break;

		/*
		 * Attempt to compress it inline
		 */
		i = biggest_attno;
		old_value = toast_values[i];
		new_value = toast_compress_datum(old_value);

		if (DatumGetPointer(new_value) != NULL)
		{
			/* successful compression */
			if (toast_free[i])
				pfree(DatumGetPointer(old_value));
			toast_values[i] = new_value;
			toast_free[i] = true;
			toast_sizes[i] = VARSIZE_D(toast_values[i]);
			need_change = true;
			need_free = true;
		}
		else
		{
			/* incompressible, ignore on subsequent compression passes */
			toast_action[i] = 'x';
		}
	}

	/*
	 * Finally we store attributes of type 'm' external, if possible.
	 */
	while (compute_dest_tuplen(tupleDesc, pbind, has_nulls, toast_values, toast_isnull) > maxDataLen &&
		   rel->rd_rel->reltoastrelid != InvalidOid)
	{
		int			biggest_attno = -1;
		int32		biggest_size = MAXALIGN(sizeof(varattrib));
		Datum		old_value;

		/*--------
		 * Search for the biggest yet inlined attribute with
		 * attstorage = 'm'
		 *--------
		 */
		for (i = 0; i < numAttrs; i++)
		{
			if (toast_action[i] == 'p')
				continue;
			if (VARATT_IS_EXTERNAL_D(toast_values[i]))
				continue;		/* can't happen, toast_action would be 'p' */
			if (att[i]->attstorage != 'm')
				continue;
			if (toast_sizes[i] > biggest_size)
			{
				biggest_attno = i;
				biggest_size = toast_sizes[i];
			}
		}

		if (biggest_attno < 0)
			break;

		/*
		 * Store this external
		 */
		i = biggest_attno;
		old_value = toast_values[i];
		toast_action[i] = 'p';
		toast_values[i] = toast_save_datum(rel, toast_values[i], isFrozen);
		if (toast_free[i])
			pfree(DatumGetPointer(old_value));
		toast_free[i] = true;

		need_change = true;
		need_free = true;
	}

	/* XXX Maybe we should check here for any compressed inline attributes that
	 * didn't save enough to warrant keeping. In particular attributes whose
	 * rawsize is < 128 bytes and didn't save at least 3 bytes... or even maybe
	 * more given alignment issues 
	 */

	/*
	 * In the case we toasted any values, we need to build a new heap tuple
	 * with the changed values.
	 */
	if (need_change)
	{
		if(ismemtuple)
			result_tuple = (HeapTuple) memtuple_form_to(pbind, toast_values, toast_isnull, NULL, NULL, false);
		else
		{
			HeapTupleHeader olddata = newtup->t_data;
			HeapTupleHeader new_data;
			int32		new_len;

			/*
			 * Calculate the new size of the tuple.  Header size should not
			 * change, but data size might.
			 */
			new_len = offsetof(HeapTupleHeaderData, t_bits);
			if (has_nulls)
				new_len += BITMAPLEN(numAttrs);
			if (olddata->t_infomask & HEAP_HASOID)
				new_len += sizeof(Oid);
			new_len = MAXALIGN(new_len);
			Assert(new_len == olddata->t_hoff);
			new_len += heap_compute_data_size(tupleDesc,
					toast_values, toast_isnull);

			/*
			 * Allocate and zero the space needed, and fill HeapTupleData fields.
			 */
			result_tuple = (HeapTuple) palloc0(HEAPTUPLESIZE + new_len);
			result_tuple->t_len = new_len;
			result_tuple->t_self = newtup->t_self;
			new_data = (HeapTupleHeader) ((char *) result_tuple + HEAPTUPLESIZE);
			result_tuple->t_data = new_data;

			/*
			 * Put the existing tuple header and the changed values into place
			 */
			memcpy(new_data, olddata, olddata->t_hoff);

			heap_fill_tuple(tupleDesc,
					toast_values,
					toast_isnull,
					(char *) new_data + olddata->t_hoff,
					&(new_data->t_infomask),
					has_nulls ? new_data->t_bits : NULL);
		}
	}
	else
		result_tuple = newtup;

	/*
	 * Free allocated temp values
	 */
	if (need_free)
		for (i = 0; i < numAttrs; i++)
			if (toast_free[i])
				pfree(DatumGetPointer(toast_values[i]));

	/*
	 * Delete external values from the old tuple
	 */
	if (need_delold)
		for (i = 0; i < numAttrs; i++)
			if (toast_delold[i])
				toast_delete_datum(rel, toast_oldvalues[i]);

	return result_tuple;
}
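
Each of the four passes above follows the same shape: recompute the projected tuple size, pick the largest still-eligible attribute, shrink or externalize it, and repeat until the tuple fits. The toy model below isolates that loop with made-up attribute sizes and simulated 2:1 compression (illustration only; the real code additionally moves attributes external and distinguishes attstorage classes).

#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	int			sizes[4] = {40, 2000, 300, 8000};	/* bytes, hypothetical */
	bool		done[4] = {false, false, false, false};
	int			maxDataLen = 2000;					/* stand-in for toast_tuple_target */

	for (;;)
	{
		int			total = 0;
		int			biggest_attno = -1;
		int			biggest_size = 0;

		for (int i = 0; i < 4; i++)
			total += sizes[i];
		if (total <= maxDataLen)
			break;				/* tuple now fits inline */

		/* Search for the biggest yet unprocessed attribute */
		for (int i = 0; i < 4; i++)
		{
			if (!done[i] && sizes[i] > biggest_size)
			{
				biggest_attno = i;
				biggest_size = sizes[i];
			}
		}
		if (biggest_attno < 0)
			break;				/* nothing left for this pass; a later pass stores data external */

		/* Simulate compressing the biggest attribute in place */
		sizes[biggest_attno] /= 2;
		done[biggest_attno] = true;
		printf("shrank attribute %d to %d bytes\n", biggest_attno, sizes[biggest_attno]);
	}
	return 0;
}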
Example #19
/*
 * Finds and notifies the top vmem consuming session.
 */
static void
RedZoneHandler_FlagTopConsumer()
{
	if (!vmemTrackerInited)
	{
		return;
	}

	Assert(NULL != MySessionState);

	bool success = compare_and_swap_32((uint32*) isRunawayDetector, 0, 1);

	/* If successful then this process must be the runaway detector */
	AssertImply(success, 1 == *isRunawayDetector);

	/*
	 * Someone already determined the runaway query, so nothing to do. This
	 * will also prevent re-entry to this method by a cleaning session.
	 */
	if (!success)
	{
		return;
	}

	/*
	 * Grabbing a shared lock prevents others to modify the SessionState
	 * data structure, therefore ensuring that we don't flag someone
	 * who was already dying. A shared lock is enough as we access the
	 * data structure in a read-only manner.
	 */
	LWLockAcquire(SessionStateLock, LW_SHARED);

	int32 maxVmem = 0;
	int32 maxActiveVmem = 0;
	SessionState *maxActiveVmemSessionState = NULL;
	SessionState *maxVmemSessionState = NULL;

	SessionState *curSessionState = AllSessionStateEntries->usedList;

	while (curSessionState != NULL)
	{
		Assert(INVALID_SESSION_ID != curSessionState->sessionId);

		int32 curVmem = curSessionState->sessionVmem;

		Assert(maxActiveVmem <= maxVmem);

		if (curVmem > maxActiveVmem)
		{
			if (curVmem > maxVmem)
			{
				maxVmemSessionState = curSessionState;
				maxVmem = curVmem;
			}

			/*
			 * Only consider sessions with at least 1 active process. As we
			 * are *not* grabbing locks, this does not guarantee that by the
			 * time we finish walking all sessions the chosen session will
			 * still have an active process.
			 */
			if  (curSessionState->activeProcessCount > 0)
			{
				maxActiveVmemSessionState = curSessionState;
				maxActiveVmem = curVmem;
			}
		}

		curSessionState = curSessionState->next;
	}

	if (NULL != maxActiveVmemSessionState)
	{
		SpinLockAcquire(&maxActiveVmemSessionState->spinLock);

		/*
		 * Now that we have grabbed the lock, make sure we have at least 1 active process
		 * before flagging this session for termination
		 */
		if (0 < maxActiveVmemSessionState->activeProcessCount)
		{
			/*
			 * First update the runaway event detection version so that
			 * an active process of the runaway session is forced to clean up before
			 * it deactivates. As we grabbed the spin lock, no process of the runaway
			 * session can deactivate unless we release the lock. The other sessions
			 * don't care what global runaway version they observe as the runaway
			 * event is not pertinent to them.
			 *
			 * We don't need any lock here as the runaway detector is singleton,
			 * and only the detector can update this variable.
			 */
			*latestRunawayVersion = *CurrentVersion + 1;
			/*
			 * Make sure that the runaway event version is not shared with any other
			 * processes, and not shared with any other deactivation/reactivation version
			 */
			*CurrentVersion = *CurrentVersion + 2;

			Assert(CLEANUP_COUNTDOWN_BEFORE_RUNAWAY == maxActiveVmemSessionState->cleanupCountdown);
			/*
			 * Determine how many processes need to clean up to mark the session clean.
			 */
			maxActiveVmemSessionState->cleanupCountdown = maxActiveVmemSessionState->activeProcessCount;

			if (maxActiveVmemSessionState == maxVmemSessionState)
			{
				/* Finally signal the runaway process for cleanup */
				maxActiveVmemSessionState->runawayStatus = RunawayStatus_PrimaryRunawaySession;
			}
			else
			{
				maxActiveVmemSessionState->runawayStatus = RunawayStatus_SecondaryRunawaySession;
			}

			/* Save the amount of vmem session was holding when it was flagged as runaway */
			maxActiveVmemSessionState->sessionVmemRunaway = maxActiveVmemSessionState->sessionVmem;

			/* Save the command count currently running in the runaway session */
			maxActiveVmemSessionState->commandCountRunaway = gp_command_count;
		}
		else
		{
			/*
			 * Failed to find any viable runaway session. Reset runaway detector flag
			 * for another round of runaway determination at a later time. As we couldn't
			 * find any runaway session, the CurrentVersion is not changed.
			 */
			*isRunawayDetector = 0;
		}

		SpinLockRelease(&maxActiveVmemSessionState->spinLock);
	}
	else
	{
		/*
		 * No active session to mark as runaway. So, reenable the runaway detection process
		 */
		*isRunawayDetector = 0;
	}

	LWLockRelease(SessionStateLock);
}
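
The single-detector election at the top of this function (compare_and_swap_32 on *isRunawayDetector) can be modeled in isolation as follows (illustration only, using C11 atomics rather than the tracker's own primitive): exactly one caller wins the 0 -> 1 swap and proceeds; everyone else backs off until the flag is reset.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint runaway_detector_flag;

static bool
try_become_detector(void)
{
	unsigned int expected = 0;

	/* Succeeds for exactly one caller; the flag stays 1 until reset */
	return atomic_compare_exchange_strong(&runaway_detector_flag, &expected, 1);
}

int
main(void)
{
	printf("first attempt wins:   %d\n", try_become_detector());	/* 1 */
	printf("second attempt loses: %d\n", try_become_detector());	/* 0 */

	/* Mirrors "*isRunawayDetector = 0" when no runaway session is found */
	atomic_store(&runaway_detector_flag, 0);
	printf("after reset, wins again: %d\n", try_become_detector());	/* 1 */
	return 0;
}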