Example #1
/*
 * initNextTableToScan
 *   Find the next table to scan and initiate the scan if the previous table
 * is finished.
 *
 * If scanning on the current table is not finished, or a new table is found,
 * this function returns true.
 * If no more tables are found, this function returns false.
 */
static bool
initNextTableToScan(DynamicTableScanState *node)
{
	ScanState *scanState = (ScanState *)node;

	if (scanState->scan_state == SCAN_INIT ||
		scanState->scan_state == SCAN_DONE)
	{
		Oid *pid = hash_seq_search(&node->pidStatus);
		if (pid == NULL)
		{
			node->shouldCallHashSeqTerm = false;
			return false;
		}
		
		/* Collect number of partitions scanned in EXPLAIN ANALYZE */
		if (NULL != scanState->ps.instrument)
		{
			Instrumentation *instr = scanState->ps.instrument;
			instr->numPartScanned++;
		}

		/*
		 * Inside ExecInitScanTupleSlot() we set the tuple table slot's oid
		 * to the range table entry's relid, which for a partitioned table is
		 * always the parent table's oid. In queries where we need to read
		 * table oids (MPP-20736) we use the tuple table slot's saved oid
		 * (refer to slot_getsysattr()). This wrongly returns the parent oid
		 * instead of the partition oid. Therefore, we need to update our
		 * tuple table slot's oid to reflect the partition oid.
		 */
		scanState->ss_ScanTupleSlot->tts_tableOid = *pid;

		scanState->ss_currentRelation = OpenScanRelationByOid(*pid);
		Relation lastScannedRel = OpenScanRelationByOid(node->lastRelOid);
		TupleDesc lastTupDesc = RelationGetDescr(lastScannedRel);
		CloseScanRelation(lastScannedRel);

		TupleDesc partTupDesc = RelationGetDescr(scanState->ss_currentRelation);

		ExecAssignScanType(scanState, partTupDesc);

		AttrNumber	*attMap;

		attMap = varattnos_map(lastTupDesc, partTupDesc);

		/* If attribute remapping is not necessary, then do not change the varattno */
		if (attMap)
		{
			change_varattnos_of_a_varno((Node*)scanState->ps.plan->qual, attMap, node->scanrelid);
			change_varattnos_of_a_varno((Node*)scanState->ps.plan->targetlist, attMap, node->scanrelid);

			/*
			 * Now that the varattno mapping has been changed, change the relation that
			 * the new varnos correspond to
			 */
			node->lastRelOid = *pid;
		}

		/*
		 * For the very first partition, the targetlist of planstate is set to null. So, we must
		 * initialize quals and targetlist, regardless of remapping requirements. For later
		 * partitions, we only initialize quals and targetlist if a column re-mapping is necessary.
		 */
		if (attMap || node->firstPartition)
		{
			node->firstPartition = false;
			MemoryContextReset(node->partitionMemoryContext);
			MemoryContext oldCxt = MemoryContextSwitchTo(node->partitionMemoryContext);

			/* Initialize child expressions */
			scanState->ps.qual = (List *)ExecInitExpr((Expr *)scanState->ps.plan->qual, (PlanState*)scanState);
			scanState->ps.targetlist = (List *)ExecInitExpr((Expr *)scanState->ps.plan->targetlist, (PlanState*)scanState);

			MemoryContextSwitchTo(oldCxt);
		}

		if (attMap)
		{
			pfree(attMap);
		}

		ExecAssignScanProjectionInfo(scanState);
		
		scanState->tableType = getTableType(scanState->ss_currentRelation);
		BeginTableScanRelation(scanState);
	}

	return true;
}
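One detail above deserves spelling out: a dynahash sequential scan must either run until hash_seq_search() returns NULL (which terminates it implicitly) or be shut down explicitly with hash_seq_term(); that is what the shouldCallHashSeqTerm flag tracks. A minimal sketch of that contract, using a hypothetical table of Oids:

#include "postgres.h"
#include "utils/hsearch.h"

/*
 * Sketch only: "candidates" and "wanted" are hypothetical.  The point
 * is the termination rule for dynahash scans: a scan abandoned before
 * hash_seq_search() returns NULL must call hash_seq_term(), otherwise
 * the table is left frozen against expansion.
 */
static Oid
find_first_match(HTAB *candidates, Oid wanted)
{
	HASH_SEQ_STATUS status;
	Oid		   *oid;

	hash_seq_init(&status, candidates);
	while ((oid = (Oid *) hash_seq_search(&status)) != NULL)
	{
		if (*oid == wanted)
		{
			/* Abandoning the scan early: terminate it explicitly */
			hash_seq_term(&status);
			return *oid;
		}
	}

	/* hash_seq_search() returned NULL: the scan is already terminated */
	return InvalidOid;
}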
Example #2
/*
 * DatabaseInfo_SortRelArray()
 *   Builds the sorted RelArray structure based on a RelHash
 */
static void
DatabaseInfo_SortRelArray(
	DatabaseInfo		*info, 
	HTAB				*dbInfoRelHashTable,
	int					 count)
{
	HASH_SEQ_STATUS		  iterateStatus;
	DbInfoRel			**dbInfoRelPtrArray;
	int					  d;

	/* This function will populate the dbInfoRelArray */
	Assert(info->dbInfoRelArray == NULL);

	/* Construct an array of pointers by scanning through the hash table */
	dbInfoRelPtrArray = (DbInfoRel**) palloc(sizeof(DbInfoRel*) * count);
	hash_seq_init(&iterateStatus, dbInfoRelHashTable);
	for (d = 0; d < count; d++)
	{
		dbInfoRelPtrArray[d] = (DbInfoRel*) hash_seq_search(&iterateStatus);

		/* should have as many entries in the hash scan as "count" */
		if (dbInfoRelPtrArray[d] == NULL)
			elog(ERROR, "insufficient #/entries in dbInfoRelHashTable");
	}

	/* double check that the hash contained the right number of elements */
	if (hash_seq_search(&iterateStatus) != NULL)
		elog(ERROR, "too many entries in dbInfoRelHashTable");
	
	/* sort the pointer array */
	qsort(dbInfoRelPtrArray,
		  count, 
		  sizeof(DbInfoRel*),
		  DbInfoRelPtrArray_Compare);
		   
	/*
	 * Finally convert the sorted pointer array into a sorted record array.
	 */
	info->dbInfoRelArray = (DbInfoRel*) palloc(sizeof(DbInfoRel)*count);
	for (d = 0; d < count; d++)
	{
		info->dbInfoRelArray[d] = *(dbInfoRelPtrArray[d]);
		
		/*
		 * For each record in the array we have three lists:
		 *   - gpRelationNodes
		 *   - appendOnlyCatalogSegmentInfo
		 *   - physicalSegmentFiles 
		 *
		 * All three of which need to be sorted on segmentFileNum otherwise
		 * we will not be able to merge the lists correctly.
		 *
		 * XXX - this seems like a bad design, it seems like we have three
		 * sources of information on the same thing, which should be able
		 * to be satisfied with a single Hash rather than trying to keep 
		 * around three different lists and have code spread throughout the
		 * source trying to deal with merging the lists.
		 */
		if (info->dbInfoRelArray[d].gpRelationNodes)
			qsort(info->dbInfoRelArray[d].gpRelationNodes,
				  info->dbInfoRelArray[d].gpRelationNodesCount,
				  sizeof(DbInfoGpRelationNode),
				  DbInfoGpRelationNode_Compare);
		if (info->dbInfoRelArray[d].appendOnlyCatalogSegmentInfo)
			qsort(info->dbInfoRelArray[d].appendOnlyCatalogSegmentInfo,
				  info->dbInfoRelArray[d].appendOnlyCatalogSegmentInfoCount,
				  sizeof(DbInfoAppendOnlyCatalogSegmentInfo),
				  DbInfoAppendOnlyCatalogSegmentInfo_Compare);
		if (info->dbInfoRelArray[d].physicalSegmentFiles)
			qsort(info->dbInfoRelArray[d].physicalSegmentFiles,
				  info->dbInfoRelArray[d].physicalSegmentFilesCount,
				  sizeof(DbInfoSegmentFile),
				  DbInfoSegmentFile_Compare);
	}
	info->dbInfoRelArrayCount = count;

	/* Release the temporary pointer array and return */
	pfree(dbInfoRelPtrArray);
	return;
}
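The DbInfoRelPtrArray_Compare comparator is not shown here. A hypothetical sketch of its shape, assuming the sort key is a relationOid field (an assumption; the real struct may sort on other fields):

/*
 * Hypothetical comparator for the pointer array above; "relationOid"
 * is an assumed sort key.  qsort() passes pointers to the array slots,
 * which here hold DbInfoRel pointers, hence the double indirection.
 */
static int
DbInfoRelPtrArray_Compare(const void *a, const void *b)
{
	const DbInfoRel *relA = *(DbInfoRel **) a;
	const DbInfoRel *relB = *(DbInfoRel **) b;

	if (relA->relationOid < relB->relationOid)
		return -1;
	if (relA->relationOid > relB->relationOid)
		return 1;
	return 0;
}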
Example #3
/*
 *	compute_tsvector_stats() -- compute statistics for a tsvector column
 *
 *	This function computes statistics that are useful for determining @@
 *	operations' selectivity, along with the fraction of non-null rows and
 *	average width.
 *
 *	Instead of finding the most common values, as we do for most datatypes,
 *	we're looking for the most common lexemes. This is more useful, because
 *	there most probably won't be any two rows with the same tsvector and thus
 *	the notion of a MCV is a bit bogus with this datatype. With a list of the
 *	most common lexemes we can do a better job at figuring out @@ selectivity.
 *
 *	For the same reasons we assume that tsvector columns are unique when
 *	determining the number of distinct values.
 *
 *	The algorithm used is Lossy Counting, as proposed in the paper "Approximate
 *	frequency counts over data streams" by G. S. Manku and R. Motwani, in
 *	Proceedings of the 28th International Conference on Very Large Data Bases,
 *	Hong Kong, China, August 2002, section 4.2. The paper is available at
 *	http://www.vldb.org/conf/2002/S10P03.pdf
 *
 *	The Lossy Counting (aka LC) algorithm goes like this:
 *	Let s be the threshold frequency for an item (the minimum frequency we
 *	are interested in) and epsilon the error margin for the frequency. Let D
 *	be a set of triples (e, f, delta), where e is an element value, f is that
 *	element's frequency (actually, its current occurrence count) and delta is
 *	the maximum error in f. We start with D empty and process the elements in
 *	batches of size w. (The batch size is also known as "bucket size" and is
 *	equal to 1/epsilon.) Let the current batch number be b_current, starting
 *	with 1. For each element e we either increment its f count, if it's
 *	already in D, or insert a new triple into D with values (e, 1, b_current
 *	- 1). After processing each batch we prune D, by removing from it all
 *	elements with f + delta <= b_current.  After the algorithm finishes we
 *	suppress all elements from D that do not satisfy f >= (s - epsilon) * N,
 *	where N is the total number of elements in the input.  We emit the
 *	remaining elements with estimated frequency f/N.  The LC paper proves
 *	that this algorithm finds all elements with true frequency at least s,
 *	and that no frequency is overestimated or is underestimated by more than
 *	epsilon.  Furthermore, given reasonable assumptions about the input
 *	distribution, the required table size is no more than about 7 times w.
 *
 *	We set s to be the estimated frequency of the K'th word in a natural
 *	language's frequency table, where K is the target number of entries in
 *	the MCELEM array plus an arbitrary constant, meant to reflect the fact
 *	that the most common words in any language would usually be stopwords
 *	so we will not actually see them in the input.  We assume that the
 *	distribution of word frequencies (including the stopwords) follows Zipf's
 *	law with an exponent of 1.
 *
 *	Assuming Zipfian distribution, the frequency of the K'th word is equal
 *	to 1/(K * H(W)) where H(n) is 1/2 + 1/3 + ... + 1/n and W is the number of
 *	words in the language.  Putting W as one million, we get roughly 0.07/K.
 *	Assuming top 10 words are stopwords gives s = 0.07/(K + 10).  We set
 *	epsilon = s/10, which gives bucket width w = (K + 10)/0.007 and
 *	maximum expected hashtable size of about 1000 * (K + 10).
 *
 *	Note: in the above discussion, s, epsilon, and f/N are in terms of a
 *	lexeme's frequency as a fraction of all lexemes seen in the input.
 *	However, what we actually want to store in the finished pg_statistic
 *	entry is each lexeme's frequency as a fraction of all rows that it occurs
 *	in.  Assuming that the input tsvectors are correctly constructed, no
 *	lexeme occurs more than once per tsvector, so the final count f is a
 *	correct estimate of the number of input tsvectors it occurs in, and we
 *	need only change the divisor from N to nonnull_cnt to get the number we
 *	want.
 */
static void
compute_tsvector_stats(VacAttrStats *stats,
					   AnalyzeAttrFetchFunc fetchfunc,
					   int samplerows,
					   double totalrows)
{
	int			num_mcelem;
	int			null_cnt = 0;
	double		total_width = 0;

	/* This is D from the LC algorithm. */
	HTAB	   *lexemes_tab;
	HASHCTL		hash_ctl;
	HASH_SEQ_STATUS scan_status;

	/* This is the current bucket number from the LC algorithm */
	int			b_current;

	/* This is 'w' from the LC algorithm */
	int			bucket_width;
	int			vector_no,
				lexeme_no;
	LexemeHashKey hash_key;
	TrackItem  *item;

	/*
	 * We want statistics_target * 10 lexemes in the MCELEM array.  This
	 * multiplier is pretty arbitrary, but is meant to reflect the fact that
	 * the number of individual lexeme values tracked in pg_statistic ought to
	 * be more than the number of values for a simple scalar column.
	 */
	num_mcelem = stats->attr->attstattarget * 10;

	/*
	 * We set bucket width equal to (num_mcelem + 10) / 0.007 as per the
	 * comment above.
	 */
	bucket_width = (num_mcelem + 10) * 1000 / 7;

	/*
	 * Create the hashtable. It will be in local memory, so we don't need to
	 * worry about overflowing the initial size. Also we don't need to pay any
	 * attention to locking and memory management.
	 */
	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
	hash_ctl.keysize = sizeof(LexemeHashKey);
	hash_ctl.entrysize = sizeof(TrackItem);
	hash_ctl.hash = lexeme_hash;
	hash_ctl.match = lexeme_match;
	hash_ctl.hcxt = CurrentMemoryContext;
	lexemes_tab = hash_create("Analyzed lexemes table",
							  num_mcelem,
							  &hash_ctl,
							  HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);

	/* Initialize counters. */
	b_current = 1;
	lexeme_no = 0;

	/* Loop over the tsvectors. */
	for (vector_no = 0; vector_no < samplerows; vector_no++)
	{
		Datum		value;
		bool		isnull;
		TSVector	vector;
		WordEntry  *curentryptr;
		char	   *lexemesptr;
		int			j;

		vacuum_delay_point();

		value = fetchfunc(stats, vector_no, &isnull);

		/*
		 * Check for null/nonnull.
		 */
		if (isnull)
		{
			null_cnt++;
			continue;
		}

		/*
		 * Add up widths for average-width calculation.  Since it's a
		 * tsvector, we know it's varlena.  As in the regular
		 * compute_minimal_stats function, we use the toasted width for this
		 * calculation.
		 */
		total_width += VARSIZE_ANY(DatumGetPointer(value));

		/*
		 * Now detoast the tsvector if needed.
		 */
		vector = DatumGetTSVector(value);

		/*
		 * We loop through the lexemes in the tsvector and add them to our
		 * tracking hashtable.
		 */
		lexemesptr = STRPTR(vector);
		curentryptr = ARRPTR(vector);
		for (j = 0; j < vector->size; j++)
		{
			bool		found;

			/*
			 * Construct a hash key.  The key points into the (detoasted)
			 * tsvector value at this point, but if a new entry is created, we
			 * make a copy of it.  This way we can free the tsvector value
			 * once we've processed all its lexemes.
			 */
			hash_key.lexeme = lexemesptr + curentryptr->pos;
			hash_key.length = curentryptr->len;

			/* Lookup current lexeme in hashtable, adding it if new */
			item = (TrackItem *) hash_search(lexemes_tab,
											 (const void *) &hash_key,
											 HASH_ENTER, &found);

			if (found)
			{
				/* The lexeme is already on the tracking list */
				item->frequency++;
			}
			else
			{
				/* Initialize new tracking list element */
				item->frequency = 1;
				item->delta = b_current - 1;

				item->key.lexeme = palloc(hash_key.length);
				memcpy(item->key.lexeme, hash_key.lexeme, hash_key.length);
			}

			/* lexeme_no is the number of elements processed (ie N) */
			lexeme_no++;

			/* We prune the D structure after processing each bucket */
			if (lexeme_no % bucket_width == 0)
			{
				prune_lexemes_hashtable(lexemes_tab, b_current);
				b_current++;
			}

			/* Advance to the next WordEntry in the tsvector */
			curentryptr++;
		}

		/* If the vector was toasted, free the detoasted copy. */
		if (TSVectorGetDatum(vector) != value)
			pfree(vector);
	}

	/* We can only compute real stats if we found some non-null values. */
	if (null_cnt < samplerows)
	{
		int			nonnull_cnt = samplerows - null_cnt;
		int			i;
		TrackItem **sort_table;
		int			track_len;
		int			cutoff_freq;
		int			minfreq,
					maxfreq;

		stats->stats_valid = true;
		/* Do the simple null-frac and average width stats */
		stats->stanullfrac = (double) null_cnt / (double) samplerows;
		stats->stawidth = total_width / (double) nonnull_cnt;

		/* Assume it's a unique column (see notes above) */
		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);

		/*
		 * Construct an array of the interesting hashtable items, that is,
		 * those meeting the cutoff frequency (s - epsilon)*N.  Also identify
		 * the minimum and maximum frequencies among these items.
		 *
		 * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
		 * frequency is 9*N / bucket_width.
		 */
		cutoff_freq = 9 * lexeme_no / bucket_width;

		i = hash_get_num_entries(lexemes_tab);	/* surely enough space */
		sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);

		hash_seq_init(&scan_status, lexemes_tab);
		track_len = 0;
		minfreq = lexeme_no;
		maxfreq = 0;
		while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
		{
			if (item->frequency > cutoff_freq)
			{
				sort_table[track_len++] = item;
				minfreq = Min(minfreq, item->frequency);
				maxfreq = Max(maxfreq, item->frequency);
			}
		}
		Assert(track_len <= i);

		/* emit some statistics for debug purposes */
		elog(DEBUG3, "tsvector_stats: target # mces = %d, bucket width = %d, "
			 "# lexemes = %d, hashtable size = %d, usable entries = %d",
			 num_mcelem, bucket_width, lexeme_no, i, track_len);

		/*
		 * If we obtained more lexemes than we really want, get rid of those
		 * with least frequencies.  The easiest way is to qsort the array into
		 * descending frequency order and truncate the array.
		 */
		if (num_mcelem < track_len)
		{
			qsort(sort_table, track_len, sizeof(TrackItem *),
				  trackitem_compare_frequencies_desc);
			/* reset minfreq to the smallest frequency we're keeping */
			minfreq = sort_table[num_mcelem - 1]->frequency;
		}
		else
			num_mcelem = track_len;

		/* Generate MCELEM slot entry */
		if (num_mcelem > 0)
		{
			MemoryContext old_context;
			Datum	   *mcelem_values;
			float4	   *mcelem_freqs;

			/*
			 * We want to store statistics sorted on the lexeme value using
			 * first length, then byte-for-byte comparison. The reason for
			 * doing length comparison first is that we don't care about the
			 * ordering so long as it's consistent, and comparing lengths
			 * first gives us a chance to avoid a strncmp() call.
			 *
			 * This is different from what we do with scalar statistics --
			 * they get sorted on frequencies. The rationale is that we
			 * usually search through most common elements looking for a
			 * specific value, so we can grab its frequency.  When values are
			 * presorted we can employ binary search for that.  See
			 * ts_selfuncs.c for a real usage scenario.
			 */
			qsort(sort_table, num_mcelem, sizeof(TrackItem *),
				  trackitem_compare_lexemes);

			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(stats->anl_context);

			/*
			 * We sorted statistics on the lexeme value, but we want to be
			 * able to find out the minimal and maximal frequency without
			 * going through all the values.  We keep those two extra
			 * frequencies in two extra cells in mcelem_freqs.
			 *
			 * (Note: the MCELEM statistics slot definition allows for a third
			 * extra number containing the frequency of nulls, but we don't
			 * create that for a tsvector column, since null elements aren't
			 * possible.)
			 */
			mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
			mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));

			/*
			 * See comments above about use of nonnull_cnt as the divisor for
			 * the final frequency estimates.
			 */
			for (i = 0; i < num_mcelem; i++)
			{
				TrackItem  *item = sort_table[i];

				mcelem_values[i] =
					PointerGetDatum(cstring_to_text_with_len(item->key.lexeme,
															 item->key.length));
				mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt;
			}
			mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
			mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt;
			MemoryContextSwitchTo(old_context);

			stats->stakind[0] = STATISTIC_KIND_MCELEM;
			stats->staop[0] = TextEqualOperator;
			stats->stanumbers[0] = mcelem_freqs;
			/* See above comment about two extra frequency fields */
			stats->numnumbers[0] = num_mcelem + 2;
			stats->stavalues[0] = mcelem_values;
			stats->numvalues[0] = num_mcelem;
			/* We are storing text values */
			stats->statypid[0] = TEXTOID;
			stats->statyplen[0] = -1;	/* typlen, -1 for varlena */
			stats->statypbyval[0] = false;
			stats->statypalign[0] = 'i';
		}
	}
	else
	{
		/* We found only nulls; assume the column is entirely null */
		stats->stats_valid = true;
		stats->stanullfrac = 1.0;
		stats->stawidth = 0;	/* "unknown" */
		stats->stadistinct = 0.0;	/* "unknown" */
	}

	/*
	 * We don't need to bother cleaning up any of our temporary palloc's. The
	 * hashtable should also go away, as it used a child memory context.
	 */
}
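The Lossy Counting loop described in the header comment is straightforward to demonstrate outside the backend. A toy, self-contained sketch over an int stream, with a small linear-scan array standing in for the hashtable D (the capacity, bucket width, and stream are arbitrary assumptions):

#include <stdio.h>

#define CAPACITY 64			/* toy bound on |D|; the paper bounds it near 7*w */

typedef struct { int e, f, delta; int used; } Triple;

static Triple D[CAPACITY];

/* Process one element against D for the current bucket b_current. */
static void
lc_add(int e, int b_current)
{
	int i, free_slot = -1;

	for (i = 0; i < CAPACITY; i++)
	{
		if (D[i].used && D[i].e == e) { D[i].f++; return; }
		if (!D[i].used && free_slot < 0) free_slot = i;
	}
	if (free_slot >= 0)
		D[free_slot] = (Triple) { e, 1, b_current - 1, 1 };
}

/* After each bucket: drop triples with f + delta <= b_current. */
static void
lc_prune(int b_current)
{
	int i;

	for (i = 0; i < CAPACITY; i++)
		if (D[i].used && D[i].f + D[i].delta <= b_current)
			D[i].used = 0;
}

int
main(void)
{
	/* w = 4 means epsilon = 1/4; s would be chosen above epsilon. */
	int w = 4, b_current = 1, n = 0;
	int stream[] = {1, 1, 2, 3, 1, 4, 1, 5, 2, 1, 6, 1};
	int N = (int) (sizeof(stream) / sizeof(stream[0]));
	int i;

	for (i = 0; i < N; i++)
	{
		lc_add(stream[i], b_current);
		if (++n % w == 0)
		{
			lc_prune(b_current);
			b_current++;
		}
	}
	for (i = 0; i < CAPACITY; i++)
		if (D[i].used)
			printf("element %d: f=%d (f/N=%.2f)\n",
				   D[i].e, D[i].f, (double) D[i].f / N);
	return 0;
}

With w = 4 the twelve-element stream is pruned after each bucket, and only the genuinely frequent element (value 1, true frequency 0.5) survives to be printed.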
Example #4
int
FaultInjector_SetFaultInjection(
						   FaultInjectorEntry_s	*entry)
{
	int		status = STATUS_OK;
	bool	isRemoved = FALSE;
	
	getFileRepRoleAndState(&fileRepRole, &segmentState, &dataState, NULL, NULL);

	switch (entry->faultInjectorType) {
		case FaultInjectorTypeReset:
		{
			HASH_SEQ_STATUS			hash_status;
			FaultInjectorEntry_s	*entryLocal;
			
			if (entry->faultInjectorIdentifier == FaultInjectorIdAll) 
			{
				hash_seq_init(&hash_status, faultInjectorShmem->hash);
				
				LockAcquire();
				
				while ((entryLocal = (FaultInjectorEntry_s *) hash_seq_search(&hash_status)) != NULL) {
					isRemoved = FaultInjector_RemoveHashEntry(entryLocal->faultInjectorIdentifier);
					if (isRemoved == TRUE) {
						faultInjectorShmem->faultInjectorSlots--;
					}					
				}
				Assert(faultInjectorShmem->faultInjectorSlots == 0);
				LockRelease();
					
			} else {
				
				LockAcquire();	
				isRemoved = FaultInjector_RemoveHashEntry(entry->faultInjectorIdentifier);
				if (isRemoved == TRUE) {
					faultInjectorShmem->faultInjectorSlots--;
				}
				LockRelease();
			}
				
			if (isRemoved == FALSE) {
				ereport(DEBUG1,
						(errmsg("LOG(fault injector): could not remove fault injection from hash"
								"identifier:'%s' ",
								FaultInjectorIdentifierEnumToString[entry->faultInjectorIdentifier])));
			}			
			
			break;
		}
		case FaultInjectorTypeStatus:
		{	
			HASH_SEQ_STATUS			hash_status;
			FaultInjectorEntry_s	*entryLocal;
			bool					found = FALSE;
			
			if (faultInjectorShmem->hash == NULL) {
				status = STATUS_ERROR;
				break;
			} 
			snprintf(entry->bufOutput, sizeof(entry->bufOutput), "Success: ");
			
			if (entry->faultInjectorIdentifier == ChangeTrackingCompactingReport)
			{
				snprintf(entry->bufOutput, sizeof(entry->bufOutput), 
						 "Success: compacting in progress %s",
						 "false");
				break;
			}
			
			hash_seq_init(&hash_status, faultInjectorShmem->hash);
			
			while ((entryLocal = (FaultInjectorEntry_s *) hash_seq_search(&hash_status)) != NULL) {
				ereport(LOG,
					(errmsg("fault injector status: "
							"fault name:'%s' "
							"fault type:'%s' "
							"ddl statement:'%s' "
							"database name:'%s' "
							"table name:'%s' "
							"occurrence:'%d' "
							"sleep time:'%d' "
							"fault injection state:'%s' ",
							FaultInjectorIdentifierEnumToString[entryLocal->faultInjectorIdentifier],
							FaultInjectorTypeEnumToString[entryLocal->faultInjectorType],
							FaultInjectorDDLEnumToString[entryLocal->ddlStatement],
							entryLocal->databaseName,
							entryLocal->tableName,
							entryLocal->occurrence,
							entryLocal->sleepTime,
							FaultInjectorStateEnumToString[entryLocal->faultInjectorState])));
				
				if (entry->faultInjectorIdentifier == entryLocal->faultInjectorIdentifier ||
					entry->faultInjectorIdentifier == FaultInjectorIdAll) {
						snprintf(entry->bufOutput, sizeof(entry->bufOutput), 
								 "%s \n"
								 "fault name:'%s' "
								 "fault type:'%s' "
								 "ddl statement:'%s' "
								 "database name:'%s' "
								 "table name:'%s' "
								 "occurrence:'%d' "
								 "sleep time:'%d' "
								 "fault injection state:'%s' ",
								 entry->bufOutput,
								 FaultInjectorIdentifierEnumToString[entryLocal->faultInjectorIdentifier],
								 FaultInjectorTypeEnumToString[entryLocal->faultInjectorType],
								 FaultInjectorDDLEnumToString[entryLocal->ddlStatement],
								 entryLocal->databaseName,
								 entryLocal->tableName,
								 entryLocal->occurrence,
								 entryLocal->sleepTime,
								 FaultInjectorStateEnumToString[entryLocal->faultInjectorState]);		
						found = TRUE;
				}
			}
			if (found == FALSE) {
				snprintf(entry->bufOutput, sizeof(entry->bufOutput), "Failure: "
						 "fault name:'%s' not set",
						 FaultInjectorIdentifierEnumToString[entry->faultInjectorIdentifier]);
			}
			break;
		}
		case FaultInjectorTypeResume:
			ereport(LOG, 
					(errmsg("fault triggered, fault name:'%s' fault type:'%s' ",
							FaultInjectorIdentifierEnumToString[entry->faultInjectorIdentifier],
							FaultInjectorTypeEnumToString[entry->faultInjectorType])));	
			
			FaultInjector_UpdateHashEntry(entry);	
			
			break;
		default: 
			
			status = FaultInjector_NewHashEntry(entry);
			break;
	}
	return status;
}
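The reset branch above deletes the entry that hash_seq_search() just returned and keeps scanning. dynahash supports exactly that case: the most recently returned entry may be removed mid-scan, but removing any other entry concurrently is unsafe. A sketch of the pattern with a hypothetical entry type and match predicate:

#include "postgres.h"
#include "utils/hsearch.h"

/*
 * Sketch: remove all matching entries from a dynahash table.
 * "EntryWithKey" and "key_matches" are hypothetical.  Deleting the
 * entry most recently returned by hash_seq_search() is safe; deleting
 * any other entry while the scan is open is not.
 */
typedef struct EntryWithKey { Oid key; } EntryWithKey;

static long
remove_matching(HTAB *table, bool (*key_matches) (Oid))
{
	HASH_SEQ_STATUS status;
	EntryWithKey *entry;
	long		nremoved = 0;

	hash_seq_init(&status, table);
	while ((entry = (EntryWithKey *) hash_seq_search(&status)) != NULL)
	{
		if (key_matches(entry->key))
		{
			/* Safe: we are removing the entry the scan just returned */
			if (hash_search(table, &entry->key, HASH_REMOVE, NULL) == NULL)
				elog(ERROR, "hash table corrupted");
			nremoved++;
		}
	}
	return nremoved;
}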
Example #5
/*
 * purge_dropped_db_segments
 */
static void
purge_dropped_db_segments(bool force)
{
	static TimestampTz last_purge_time = 0;
	List *db_oids;
	List *dbs_to_remove = NIL;
	HASH_SEQ_STATUS status;
	broker_db_meta *db_meta;

	if (!force && !TimestampDifferenceExceeds(last_purge_time, GetCurrentTimestamp(), 10 * 1000)) /* 10s */
		return;

	db_oids = get_database_oids();

	LWLockAcquire(IPCMessageBrokerIndexLock, LW_SHARED);

	hash_seq_init(&status, broker_meta->db_meta_hash);
	while ((db_meta = (broker_db_meta *) hash_seq_search(&status)) != NULL)
	{
		bool found = false;
		ListCell *lc;

		foreach(lc, db_oids)
		{
			if (lfirst_oid(lc) == db_meta->dbid)
			{
				found = true;
				break;
			}
		}

		if (!found)
			dbs_to_remove = lappend_oid(dbs_to_remove, db_meta->dbid);
	}

	LWLockRelease(IPCMessageBrokerIndexLock);

	if (list_length(dbs_to_remove))
	{
		ListCell *lc;

		LWLockAcquire(IPCMessageBrokerIndexLock, LW_EXCLUSIVE);

		foreach(lc, dbs_to_remove)
		{
			Oid dbid = lfirst_oid(lc);
			bool found;

			db_meta = hash_search(broker_meta->db_meta_hash, &dbid, HASH_FIND, &found);
			Assert(found);

			Assert(db_meta->handle > 0);

			/* detach from main db segment */
			if (db_meta->segment)
				dsm_detach(db_meta->segment);

			if (db_meta->lqueues)
			{
				int i;

				for (i = 0; i < continuous_query_num_workers; i++)
				{
					local_queue *local_buf = &db_meta->lqueues[i];
					if (local_buf->slots)
						list_free_deep(local_buf->slots);
				}

				pfree(db_meta->lqueues);
			}

			hash_search(broker_meta->db_meta_hash, &dbid, HASH_REMOVE, &found);
			Assert(found);
		}

		mark_unused_locks_as_free(db_oids);

		LWLockRelease(IPCMessageBrokerIndexLock);
	}
	last_purge_time = GetCurrentTimestamp();
}
Example #6
/*
 * compute_array_stats() -- compute statistics for an array column
 *
 * This function computes statistics useful for determining selectivity of
 * the array operators <@, &&, and @>.  It is invoked by ANALYZE via the
 * compute_stats hook after sample rows have been collected.
 *
 * We also invoke the standard compute_stats function, which will compute
 * "scalar" statistics relevant to the btree-style array comparison operators.
 * However, exact duplicates of an entire array may be rare despite many
 * arrays sharing individual elements.  This especially afflicts long arrays,
 * which are also liable to lack all scalar statistics due to the low
 * WIDTH_THRESHOLD used in analyze.c.  So, in addition to the standard stats,
 * we find the most common array elements and compute a histogram of distinct
 * element counts.
 *
 * The algorithm used is Lossy Counting, as proposed in the paper "Approximate
 * frequency counts over data streams" by G. S. Manku and R. Motwani, in
 * Proceedings of the 28th International Conference on Very Large Data Bases,
 * Hong Kong, China, August 2002, section 4.2. The paper is available at
 * http://www.vldb.org/conf/2002/S10P03.pdf
 *
 * The Lossy Counting (aka LC) algorithm goes like this:
 * Let s be the threshold frequency for an item (the minimum frequency we
 * are interested in) and epsilon the error margin for the frequency. Let D
 * be a set of triples (e, f, delta), where e is an element value, f is that
 * element's frequency (actually, its current occurrence count) and delta is
 * the maximum error in f. We start with D empty and process the elements in
 * batches of size w. (The batch size is also known as "bucket size" and is
 * equal to 1/epsilon.) Let the current batch number be b_current, starting
 * with 1. For each element e we either increment its f count, if it's
 * already in D, or insert a new triple into D with values (e, 1, b_current
 * - 1). After processing each batch we prune D, by removing from it all
 * elements with f + delta <= b_current.  After the algorithm finishes we
 * suppress all elements from D that do not satisfy f >= (s - epsilon) * N,
 * where N is the total number of elements in the input.  We emit the
 * remaining elements with estimated frequency f/N.  The LC paper proves
 * that this algorithm finds all elements with true frequency at least s,
 * and that no frequency is overestimated or is underestimated by more than
 * epsilon.  Furthermore, given reasonable assumptions about the input
 * distribution, the required table size is no more than about 7 times w.
 *
 * In the absence of a principled basis for other particular values, we
 * follow ts_typanalyze() and use parameters s = 0.07/K, epsilon = s/10.
 * But we leave out the correction for stopwords, which do not apply to
 * arrays.  These parameters give bucket width w = K/0.007 and maximum
 * expected hashtable size of about 1000 * K.
 *
 * Elements may repeat within an array.  Since duplicates do not change the
 * behavior of <@, && or @>, we want to count each element only once per
 * array.  Therefore, we store in the finished pg_statistic entry each
 * element's frequency as the fraction of all non-null rows that contain it.
 * We divide the raw counts by nonnull_cnt to get those figures.
 */
static void
compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
					int samplerows, double totalrows)
{
	ArrayAnalyzeExtraData *extra_data;
	int			num_mcelem;
	int			null_cnt = 0;
	int			null_elem_cnt = 0;
	int			analyzed_rows = 0;

	/* This is D from the LC algorithm. */
	HTAB	   *elements_tab;
	HASHCTL		elem_hash_ctl;
	HASH_SEQ_STATUS scan_status;

	/* This is the current bucket number from the LC algorithm */
	int			b_current;

	/* This is 'w' from the LC algorithm */
	int			bucket_width;
	int			array_no;
	int64		element_no;
	TrackItem  *item;
	int			slot_idx;
	HTAB	   *count_tab;
	HASHCTL		count_hash_ctl;
	DECountItem *count_item;

	extra_data = (ArrayAnalyzeExtraData *) stats->extra_data;

	/*
	 * Invoke analyze.c's standard analysis function to create scalar-style
	 * stats for the column.  It will expect its own extra_data pointer, so
	 * temporarily install that.
	 */
	stats->extra_data = extra_data->std_extra_data;
	(*extra_data->std_compute_stats) (stats, fetchfunc, samplerows, totalrows);
	stats->extra_data = extra_data;

	/*
	 * Set up static pointer for use by subroutines.  We wait till here in
	 * case std_compute_stats somehow recursively invokes us (probably not
	 * possible, but ...)
	 */
	array_extra_data = extra_data;

	/*
	 * We want statistics_target * 10 elements in the MCELEM array. This
	 * multiplier is pretty arbitrary, but is meant to reflect the fact that
	 * the number of individual elements tracked in pg_statistic ought to be
	 * more than the number of values for a simple scalar column.
	 */
	num_mcelem = stats->attr->attstattarget * 10;

	/*
	 * We set bucket width equal to num_mcelem / 0.007 as per the comment
	 * above.
	 */
	bucket_width = num_mcelem * 1000 / 7;

	/*
	 * Create the hashtable. It will be in local memory, so we don't need to
	 * worry about overflowing the initial size. Also we don't need to pay any
	 * attention to locking and memory management.
	 */
	MemSet(&elem_hash_ctl, 0, sizeof(elem_hash_ctl));
	elem_hash_ctl.keysize = sizeof(Datum);
	elem_hash_ctl.entrysize = sizeof(TrackItem);
	elem_hash_ctl.hash = element_hash;
	elem_hash_ctl.match = element_match;
	elem_hash_ctl.hcxt = CurrentMemoryContext;
	elements_tab = hash_create("Analyzed elements table",
							   num_mcelem,
							   &elem_hash_ctl,
					HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);

	/* hashtable for array distinct elements counts */
	MemSet(&count_hash_ctl, 0, sizeof(count_hash_ctl));
	count_hash_ctl.keysize = sizeof(int);
	count_hash_ctl.entrysize = sizeof(DECountItem);
	count_hash_ctl.hcxt = CurrentMemoryContext;
	count_tab = hash_create("Array distinct element count table",
							64,
							&count_hash_ctl,
							HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

	/* Initialize counters. */
	b_current = 1;
	element_no = 0;

	/* Loop over the arrays. */
	for (array_no = 0; array_no < samplerows; array_no++)
	{
		Datum		value;
		bool		isnull;
		ArrayType  *array;
		int			num_elems;
		Datum	   *elem_values;
		bool	   *elem_nulls;
		bool		null_present;
		int			j;
		int64		prev_element_no = element_no;
		int			distinct_count;
		bool		count_item_found;

		vacuum_delay_point();

		value = fetchfunc(stats, array_no, &isnull);
		if (isnull)
		{
			/* array is null, just count that */
			null_cnt++;
			continue;
		}

		/* Skip too-large values. */
		if (toast_raw_datum_size(value) > ARRAY_WIDTH_THRESHOLD)
			continue;
		else
			analyzed_rows++;

		/*
		 * Now detoast the array if needed, and deconstruct into datums.
		 */
		array = DatumGetArrayTypeP(value);

		Assert(ARR_ELEMTYPE(array) == extra_data->type_id);
		deconstruct_array(array,
						  extra_data->type_id,
						  extra_data->typlen,
						  extra_data->typbyval,
						  extra_data->typalign,
						  &elem_values, &elem_nulls, &num_elems);

		/*
		 * We loop through the elements in the array and add them to our
		 * tracking hashtable.
		 */
		null_present = false;
		for (j = 0; j < num_elems; j++)
		{
			Datum		elem_value;
			bool		found;

			/* No null element processing other than flag setting here */
			if (elem_nulls[j])
			{
				null_present = true;
				continue;
			}

			/* Lookup current element in hashtable, adding it if new */
			elem_value = elem_values[j];
			item = (TrackItem *) hash_search(elements_tab,
											 (const void *) &elem_value,
											 HASH_ENTER, &found);

			if (found)
			{
				/* The element value is already on the tracking list */

				/*
				 * The operators we assist ignore duplicate array elements, so
				 * count a given distinct element only once per array.
				 */
				if (item->last_container == array_no)
					continue;

				item->frequency++;
				item->last_container = array_no;
			}
			else
			{
				/* Initialize new tracking list element */

				/*
				 * If element type is pass-by-reference, we must copy it into
				 * palloc'd space, so that we can release the array below. (We
				 * do this so that the space needed for element values is
				 * limited by the size of the hashtable; if we kept all the
				 * array values around, it could be much more.)
				 */
				item->key = datumCopy(elem_value,
									  extra_data->typbyval,
									  extra_data->typlen);

				item->frequency = 1;
				item->delta = b_current - 1;
				item->last_container = array_no;
			}

			/* element_no is the number of elements processed (ie N) */
			element_no++;

			/* We prune the D structure after processing each bucket */
			if (element_no % bucket_width == 0)
			{
				prune_element_hashtable(elements_tab, b_current);
				b_current++;
			}
		}

		/* Count null element presence once per array. */
		if (null_present)
			null_elem_cnt++;

		/* Update frequency of the particular array distinct element count. */
		distinct_count = (int) (element_no - prev_element_no);
		count_item = (DECountItem *) hash_search(count_tab, &distinct_count,
												 HASH_ENTER,
												 &count_item_found);

		if (count_item_found)
			count_item->frequency++;
		else
			count_item->frequency = 1;

		/* Free memory allocated while detoasting. */
		if (PointerGetDatum(array) != value)
			pfree(array);
		pfree(elem_values);
		pfree(elem_nulls);
	}

	/* Skip pg_statistic slots occupied by standard statistics */
	slot_idx = 0;
	while (slot_idx < STATISTIC_NUM_SLOTS && stats->stakind[slot_idx] != 0)
		slot_idx++;
	if (slot_idx > STATISTIC_NUM_SLOTS - 2)
		elog(ERROR, "insufficient pg_statistic slots for array stats");

	/* We can only compute real stats if we found some non-null values. */
	if (analyzed_rows > 0)
	{
		int			nonnull_cnt = analyzed_rows;
		int			count_items_count;
		int			i;
		TrackItem **sort_table;
		int			track_len;
		int64		cutoff_freq;
		int64		minfreq,
					maxfreq;

		/*
		 * We assume the standard stats code already took care of setting
		 * stats_valid, stanullfrac, stawidth, stadistinct.  We'd have to
		 * re-compute those values if we wanted to not store the standard
		 * stats.
		 */

		/*
		 * Construct an array of the interesting hashtable items, that is,
		 * those meeting the cutoff frequency (s - epsilon)*N.  Also identify
		 * the minimum and maximum frequencies among these items.
		 *
		 * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
		 * frequency is 9*N / bucket_width.
		 */
		cutoff_freq = 9 * element_no / bucket_width;

		i = hash_get_num_entries(elements_tab); /* surely enough space */
		sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);

		hash_seq_init(&scan_status, elements_tab);
		track_len = 0;
		minfreq = element_no;
		maxfreq = 0;
		while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
		{
			if (item->frequency > cutoff_freq)
			{
				sort_table[track_len++] = item;
				minfreq = Min(minfreq, item->frequency);
				maxfreq = Max(maxfreq, item->frequency);
			}
		}
		Assert(track_len <= i);

		/* emit some statistics for debug purposes */
		elog(DEBUG3, "compute_array_stats: target # mces = %d, "
			 "bucket width = %d, "
			 "# elements = " INT64_FORMAT ", hashtable size = %d, "
			 "usable entries = %d",
			 num_mcelem, bucket_width, element_no, i, track_len);

		/*
		 * If we obtained more elements than we really want, get rid of those
		 * with least frequencies.  The easiest way is to qsort the array into
		 * descending frequency order and truncate the array.
		 */
		if (num_mcelem < track_len)
		{
			qsort(sort_table, track_len, sizeof(TrackItem *),
				  trackitem_compare_frequencies_desc);
			/* reset minfreq to the smallest frequency we're keeping */
			minfreq = sort_table[num_mcelem - 1]->frequency;
		}
		else
			num_mcelem = track_len;

		/* Generate MCELEM slot entry */
		if (num_mcelem > 0)
		{
			MemoryContext old_context;
			Datum	   *mcelem_values;
			float4	   *mcelem_freqs;

			/*
			 * We want to store statistics sorted on the element value using
			 * the element type's default comparison function.  This permits
			 * fast binary searches in selectivity estimation functions.
			 */
			qsort(sort_table, num_mcelem, sizeof(TrackItem *),
				  trackitem_compare_element);

			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(stats->anl_context);

			/*
			 * We sorted statistics on the element value, but we want to be
			 * able to find the minimal and maximal frequencies without going
			 * through all the values.  We also want the frequency of null
			 * elements.  Store these three values at the end of mcelem_freqs.
			 */
			mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
			mcelem_freqs = (float4 *) palloc((num_mcelem + 3) * sizeof(float4));

			/*
			 * See comments above about use of nonnull_cnt as the divisor for
			 * the final frequency estimates.
			 */
			for (i = 0; i < num_mcelem; i++)
			{
				TrackItem  *item = sort_table[i];

				mcelem_values[i] = datumCopy(item->key,
											 extra_data->typbyval,
											 extra_data->typlen);
				mcelem_freqs[i] = (double) item->frequency /
					(double) nonnull_cnt;
			}
			mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
			mcelem_freqs[i++] = (double) maxfreq / (double) nonnull_cnt;
			mcelem_freqs[i++] = (double) null_elem_cnt / (double) nonnull_cnt;

			MemoryContextSwitchTo(old_context);

			stats->stakind[slot_idx] = STATISTIC_KIND_MCELEM;
			stats->staop[slot_idx] = extra_data->eq_opr;
			stats->stanumbers[slot_idx] = mcelem_freqs;
			/* See above comment about extra stanumber entries */
			stats->numnumbers[slot_idx] = num_mcelem + 3;
			stats->stavalues[slot_idx] = mcelem_values;
			stats->numvalues[slot_idx] = num_mcelem;
			/* We are storing values of element type */
			stats->statypid[slot_idx] = extra_data->type_id;
			stats->statyplen[slot_idx] = extra_data->typlen;
			stats->statypbyval[slot_idx] = extra_data->typbyval;
			stats->statypalign[slot_idx] = extra_data->typalign;
			slot_idx++;
		}

		/* Generate DECHIST slot entry */
		count_items_count = hash_get_num_entries(count_tab);
		if (count_items_count > 0)
		{
			int			num_hist = stats->attr->attstattarget;
			DECountItem **sorted_count_items;
			int			j;
			int			delta;
			int64		frac;
			float4	   *hist;

			/* num_hist must be at least 2 for the loop below to work */
			num_hist = Max(num_hist, 2);

			/*
			 * Create an array of DECountItem pointers, and sort them into
			 * increasing count order.
			 */
			sorted_count_items = (DECountItem **)
				palloc(sizeof(DECountItem *) * count_items_count);
			hash_seq_init(&scan_status, count_tab);
			j = 0;
			while ((count_item = (DECountItem *) hash_seq_search(&scan_status)) != NULL)
			{
				sorted_count_items[j++] = count_item;
			}
			qsort(sorted_count_items, count_items_count,
				  sizeof(DECountItem *), countitem_compare_count);

			/*
			 * Prepare to fill stanumbers with the histogram, followed by the
			 * average count.  This array must be stored in anl_context.
			 */
			hist = (float4 *)
				MemoryContextAlloc(stats->anl_context,
								   sizeof(float4) * (num_hist + 1));
			hist[num_hist] = (double) element_no / (double) nonnull_cnt;

			/*----------
			 * Construct the histogram of distinct-element counts (DECs).
			 *
			 * The object of this loop is to copy the min and max DECs to
			 * hist[0] and hist[num_hist - 1], along with evenly-spaced DECs
			 * in between (where "evenly-spaced" is with reference to the
			 * whole input population of arrays).  If we had a complete sorted
			 * array of DECs, one per analyzed row, the i'th hist value would
			 * come from DECs[i * (analyzed_rows - 1) / (num_hist - 1)]
			 * (compare the histogram-making loop in compute_scalar_stats()).
			 * But instead of that we have the sorted_count_items[] array,
			 * which holds unique DEC values with their frequencies (that is,
			 * a run-length-compressed version of the full array).  So we
			 * control advancing through sorted_count_items[] with the
			 * variable "frac", which is defined as (x - y) * (num_hist - 1),
			 * where x is the index in the notional DECs array corresponding
			 * to the start of the next sorted_count_items[] element's run,
			 * and y is the index in DECs from which we should take the next
			 * histogram value.  We have to advance whenever x <= y, that is
			 * frac <= 0.  The x component is the sum of the frequencies seen
			 * so far (up through the current sorted_count_items[] element),
			 * and of course y * (num_hist - 1) = i * (analyzed_rows - 1),
			 * per the subscript calculation above.  (The subscript calculation
			 * implies dropping any fractional part of y; in this formulation
			 * that's handled by not advancing until frac reaches 1.)
			 *
			 * Even though frac has a bounded range, it could overflow int32
			 * when working with very large statistics targets, so we do that
			 * math in int64.
			 *----------
			 */
			delta = analyzed_rows - 1;
			j = 0;				/* current index in sorted_count_items */
			/* Initialize frac for sorted_count_items[0]; y is initially 0 */
			frac = (int64) sorted_count_items[0]->frequency * (num_hist - 1);
			for (i = 0; i < num_hist; i++)
			{
				while (frac <= 0)
				{
					/* Advance, and update x component of frac */
					j++;
					frac += (int64) sorted_count_items[j]->frequency * (num_hist - 1);
				}
				hist[i] = sorted_count_items[j]->count;
				frac -= delta;	/* update y for upcoming i increment */
			}
			Assert(j == count_items_count - 1);

			stats->stakind[slot_idx] = STATISTIC_KIND_DECHIST;
			stats->staop[slot_idx] = extra_data->eq_opr;
			stats->stanumbers[slot_idx] = hist;
			stats->numnumbers[slot_idx] = num_hist + 1;
			slot_idx++;
		}
	}

	/*
	 * We don't need to bother cleaning up any of our temporary palloc's. The
	 * hashtable should also go away, as it used a child memory context.
	 */
}
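The frac bookkeeping in the DECHIST loop is easier to follow on concrete numbers. A self-contained sketch, assuming ten analyzed rows whose sorted distinct-element counts are {1,1,1,2,2,3,3,3,3,5}, run-length-compressed into (count, frequency) pairs:

#include <stdio.h>

/* Run-length-compressed distinct-element counts: (count, frequency). */
typedef struct { int count; int frequency; } CountItem;

int
main(void)
{
	/* 10 analyzed rows: DECs = {1,1,1,2,2,3,3,3,3,5} in sorted order */
	CountItem	items[] = {{1, 3}, {2, 2}, {3, 4}, {5, 1}};
	int			num_hist = 4;
	int			analyzed_rows = 10;
	int			delta = analyzed_rows - 1;
	int			j = 0;
	long long	frac = (long long) items[0].frequency * (num_hist - 1);
	int			i;

	for (i = 0; i < num_hist; i++)
	{
		while (frac <= 0)
		{
			/* Advance to the next run, updating the x component */
			j++;
			frac += (long long) items[j].frequency * (num_hist - 1);
		}
		/* hist[i] = DECs[i * (analyzed_rows - 1) / (num_hist - 1)] */
		printf("hist[%d] = %d\n", i, items[j].count);
		frac -= delta;			/* update y for the upcoming i increment */
	}
	return 0;
}

This prints 1, 2, 3, 5, which are exactly DECs[0], DECs[3], DECs[6], and DECs[9], i.e. DECs[i * (analyzed_rows - 1) / (num_hist - 1)] for i = 0..3, matching the subscript rule stated in the comment.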
Example #7
/*
 * pgfdw_xact_callback --- cleanup at main-transaction end.
 */
static void
pgfdw_xact_callback(XactEvent event, void *arg)
{
    HASH_SEQ_STATUS scan;
    ConnCacheEntry *entry;

    /* Quick exit if no connections were touched in this transaction. */
    if (!xact_got_connection)
        return;

    /*
     * Scan all connection cache entries to find open remote transactions, and
     * close them.
     */
    hash_seq_init(&scan, ConnectionHash);
    while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
    {
        Jresult   *res;

        /* Ignore cache entry if no open connection right now */
        if (entry->conn == NULL)
            continue;

        /* If it has an open remote transaction, try to close it */
        if (entry->xact_depth > 0)
        {
            elog(DEBUG3, "closing remote transaction on connection %p",
                 entry->conn);

            switch (event)
            {
                case XACT_EVENT_PRE_COMMIT:
                    /* Commit all remote transactions during pre-commit */
                    do_sql_command(entry->conn, "COMMIT TRANSACTION");

                    /*
                     * If there were any errors in subtransactions, and we
                     * made prepared statements, do a DEALLOCATE ALL to make
                     * sure we get rid of all prepared statements. This is
                     * annoying and not terribly bulletproof, but it's
                     * probably not worth trying harder.
                     *
                     * DEALLOCATE ALL only exists in 8.3 and later, so this
                     * constrains how old a server jdbc2_fdw can
                     * communicate with.  We intentionally ignore errors in
                     * the DEALLOCATE, so that we can hobble along to some
                     * extent with older servers (leaking prepared statements
                     * as we go; but we don't really support update operations
                     * pre-8.3 anyway).
                     */
                    if (entry->have_prep_stmt && entry->have_error)
                    {
                        res = JQexec(entry->conn, "DEALLOCATE ALL");
                        JQclear(res);
                    }
                    entry->have_prep_stmt = false;
                    entry->have_error = false;
                    break;
                case XACT_EVENT_PRE_PREPARE:

                    /*
                     * We disallow remote transactions that modified anything,
                     * since it's not very reasonable to hold them open until
                     * the prepared transaction is committed.  For the moment,
                     * throw error unconditionally; later we might allow
                     * read-only cases.  Note that the error will cause us to
                     * come right back here with event == XACT_EVENT_ABORT, so
                     * we'll clean up the connection state at that point.
                     */
                    ereport(ERROR,
                            (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                             errmsg("cannot prepare a transaction that modified remote tables")));
                    break;
                case XACT_EVENT_COMMIT:
                case XACT_EVENT_PREPARE:
                    /* Pre-commit should have closed the open transaction */
                    elog(ERROR, "missed cleaning up connection during pre-commit");
                    break;
                case XACT_EVENT_ABORT:
                    /* Assume we might have lost track of prepared statements */
                    entry->have_error = true;
                    /* If we're aborting, abort all remote transactions too */
                    res = JQexec(entry->conn, "ABORT TRANSACTION");
                    /* Note: can't throw ERROR, it would be infinite loop */
                    if (JQresultStatus(res) != PGRES_COMMAND_OK)
                        pgfdw_report_error(WARNING, res, entry->conn, true,
                                           "ABORT TRANSACTION");
                    else
                    {
                        JQclear(res);
                        /* As above, make sure to clear any prepared stmts */
                        if (entry->have_prep_stmt && entry->have_error)
                        {
                            res = JQexec(entry->conn, "DEALLOCATE ALL");
                            JQclear(res);
                        }
                        entry->have_prep_stmt = false;
                        entry->have_error = false;
                    }
                    break;
            }
        }

        /* Reset state to show we're out of a transaction */
        entry->xact_depth = 0;

        /*
         * If the connection isn't in a good idle state, discard it to
         * recover. Next GetConnection will open a new connection.
         */
        if (JQstatus(entry->conn) != CONNECTION_OK ||
            JQtransactionStatus(entry->conn) != PQTRANS_IDLE)
        {
            elog(DEBUG3, "discarding connection %p", entry->conn);
            JQfinish(entry->conn);
            entry->conn = NULL;
        }
    }

    /*
     * Regardless of the event type, we can now mark ourselves as out of the
     * transaction.  (Note: if we are here during PRE_COMMIT or PRE_PREPARE,
     * this saves a useless scan of the hashtable during COMMIT or PREPARE.)
     */
    xact_got_connection = false;

    /* Also reset cursor numbering for next transaction */
    cursor_number = 0;
}
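This callback only fires if it was registered with the transaction system. In postgres_fdw-style FDWs registration happens once, typically when the connection hash is first built; the guard variable below is an assumption about placement, but RegisterXactCallback() and RegisterSubXactCallback() are the standard backend API:

#include "postgres.h"
#include "access/xact.h"

/*
 * One-time registration, e.g. from the first GetConnection() call.
 * Exactly where the FDW does this is an assumption; the "registered
 * once" guard is the usual idiom.
 */
static bool xact_callbacks_registered = false;

static void
register_fdw_xact_callbacks(void)
{
	if (!xact_callbacks_registered)
	{
		RegisterXactCallback(pgfdw_xact_callback, NULL);
		RegisterSubXactCallback(pgfdw_subxact_callback, NULL);
		xact_callbacks_registered = true;
	}
}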
Example #8
/*
 * pgfdw_subxact_callback --- cleanup at subtransaction end.
 */
static void
pgfdw_subxact_callback(SubXactEvent event, SubTransactionId mySubid,
                       SubTransactionId parentSubid, void *arg)
{
    HASH_SEQ_STATUS scan;
    ConnCacheEntry *entry;
    int         curlevel;

    /* Nothing to do at subxact start, nor after commit. */
    if (!(event == SUBXACT_EVENT_PRE_COMMIT_SUB ||
          event == SUBXACT_EVENT_ABORT_SUB))
        return;

    /* Quick exit if no connections were touched in this transaction. */
    if (!xact_got_connection)
        return;

    /*
     * Scan all connection cache entries to find open remote subtransactions
     * of the current level, and close them.
     */
    curlevel = GetCurrentTransactionNestLevel();
    hash_seq_init(&scan, ConnectionHash);
    while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
    {
        Jresult   *res;
        char        sql[100];

        /*
         * We only care about connections with open remote subtransactions of
         * the current level.
         */
        if (entry->conn == NULL || entry->xact_depth < curlevel)
            continue;

        if (entry->xact_depth > curlevel)
            elog(ERROR, "missed cleaning up remote subtransaction at level %d",
                 entry->xact_depth);

        if (event == SUBXACT_EVENT_PRE_COMMIT_SUB)
        {
            /* Commit all remote subtransactions during pre-commit */
            snprintf(sql, sizeof(sql), "RELEASE SAVEPOINT s%d", curlevel);
            do_sql_command(entry->conn, sql);
        }
        else
        {
            /* Assume we might have lost track of prepared statements */
            entry->have_error = true;
            /* Rollback all remote subtransactions during abort */
            snprintf(sql, sizeof(sql),
                     "ROLLBACK TO SAVEPOINT s%d; RELEASE SAVEPOINT s%d",
                     curlevel, curlevel);
            res = JQexec(entry->conn, sql);
            if (JQresultStatus(res) != PGRES_COMMAND_OK)
                pgfdw_report_error(WARNING, res, entry->conn, true, sql);
            else
                JQclear(res);
        }

        /* OK, we're outta that level of subtransaction */
        entry->xact_depth--;
    }
}
Example #9
void
ContQuerySchedulerMain(int argc, char *argv[])
{
	sigjmp_buf local_sigjmp_buf;
	List *dbs = NIL;

	/* we are a postmaster subprocess now */
	IsUnderPostmaster = true;
	am_cont_scheduler = true;

	/* reset MyProcPid */
	MyProcPid = getpid();
	MyPMChildSlot = AssignPostmasterChildSlot();

	/* record Start Time for logging */
	MyStartTime = time(NULL);

	/* Identify myself via ps */
	init_ps_display("continuous query scheduler process", "", "", "");

	ereport(LOG, (errmsg("continuous query scheduler started")));

	if (PostAuthDelay)
		pg_usleep(PostAuthDelay * 1000000L);

	SetProcessingMode(InitProcessing);

	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too. This is only for consistency's
	 * sake; we never fork the scheduler process, dynamic bgworkers are used
	 * instead.
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
	 * Set up signal handlers.  We operate on databases much like a regular
	 * backend, so we use the same signal handling.  See equivalent code in
	 * tcop/postgres.c.
	 */
	pqsignal(SIGHUP, sighup_handler);
	pqsignal(SIGINT, sigint_handler);
	pqsignal(SIGTERM, sigterm_handler);

	pqsignal(SIGQUIT, quickdie);
	InitializeTimeouts(); /* establishes SIGALRM handler */

	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
	pqsignal(SIGUSR2, sigusr2_handler);
	pqsignal(SIGFPE, FloatExceptionHandler);
	pqsignal(SIGCHLD, SIG_DFL);

#define BACKTRACE_SEGFAULTS
#ifdef BACKTRACE_SEGFAULTS
	pqsignal(SIGSEGV, debug_segfault);
#endif

	/* Early initialization */
	BaseInit();

	/*
	 * Create a per-backend PGPROC struct in shared memory, except in the
	 * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
	 * this before we can use LWLocks (and in the EXEC_BACKEND case we already
	 * had to do some stuff with LWLocks).
	 */
#ifndef EXEC_BACKEND
	InitProcess();
#endif

	InitPostgres(NULL, InvalidOid, NULL, NULL);

	SetProcessingMode(NormalProcessing);

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.
	 */
	ContQuerySchedulerMemCxt = AllocSetContextCreate(TopMemoryContext,
			"ContQuerySchedulerCtx",
			ALLOCSET_DEFAULT_MINSIZE,
			ALLOCSET_DEFAULT_INITSIZE,
			ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(ContQuerySchedulerMemCxt);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * This code is a stripped down version of PostgresMain error recovery.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevents interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Forget any pending QueryCancel or timeout request */
		disable_all_timeouts(false);
		QueryCancelPending = false; /* second to avoid race condition */

		/* Report the error to the server log */
		EmitErrorReport();

		/* Abort the current transaction in order to recover */
		AbortCurrentTransaction();

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(ContQuerySchedulerMemCxt);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(ContQuerySchedulerMemCxt);

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  We don't want to be
		 * filling the error logs as fast as we can.
		 */
		pg_usleep(1000000L);
	}
	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/* must unblock signals before calling rebuild_database_list */
	PG_SETMASK(&UnBlockSig);

	ContQuerySchedulerShmem->scheduler_pid = MyProcPid;

	dbs = get_database_list();

	/* Loop forever */
	for (;;)
	{
		ListCell *lc;
		int rc;

		foreach(lc, dbs)
		{
			DatabaseEntry *db_entry = lfirst(lc);
			bool found;
			ContQueryProcGroup *grp = hash_search(ContQuerySchedulerShmem->proc_table, &db_entry->oid, HASH_ENTER, &found);

			/* If we don't have an entry for this dboid, initialize a new one and fire off bg procs */
			if (!found)
			{
				grp->db_oid = db_entry->oid;
				namestrcpy(&grp->db_name, NameStr(db_entry->name));

				start_group(grp);
			}
		}

		/* Allow sinval catchup interrupts while sleeping */
		EnableCatchupInterrupt();

		/*
		 * Wait until we get some type of signal (all the signal handlers
		 * will wake us by calling SetLatch).
		 */
		rc = WaitLatch(&MyProc->procLatch, WL_LATCH_SET | WL_POSTMASTER_DEATH, 0);

		ResetLatch(&MyProc->procLatch);

		DisableCatchupInterrupt();

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (rc & WL_POSTMASTER_DEATH)
			proc_exit(1);

		/* the normal shutdown case */
		if (got_SIGTERM)
			break;

		/* update config? */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);

			/* update tuning parameters, so that they can be read downstream by background processes */
			update_tuning_params();
		}

		/* terminate a proc group? */
		if (got_SIGUSR2)
		{
			HASH_SEQ_STATUS status;
			ContQueryProcGroup *grp;

			got_SIGUSR2 = false;

			hash_seq_init(&status, ContQuerySchedulerShmem->proc_table);
			while ((grp = (ContQueryProcGroup *) hash_seq_search(&status)) != NULL)
			{
				ListCell *lc;

				if (!grp->terminate)
					continue;

				foreach(lc, dbs)
				{
					DatabaseEntry *entry = lfirst(lc);
					if (entry->oid == grp->db_oid)
					{
						dbs = list_delete(dbs, entry);
						break;
					}
				}

				terminate_group(grp);
			}
		}
	}

	/*
	 * Normal exit: SIGTERM broke us out of the loop.  The original shutdown
	 * tail is not shown in this excerpt; proc_exit() stands in for it here
	 * so the function is complete.
	 */
	proc_exit(0);
}
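/*
 * For context, a hedged sketch (not part of the original source) of the
 * signal handlers that the WaitLatch() comment above relies on: each handler
 * just sets its flag and pokes the process latch so the main loop wakes up
 * promptly.  The handler name is an assumption; the flag and latch are the
 * ones used in the loop above.
 */
static void
cont_scheduler_sigterm_handler(SIGNAL_ARGS)
{
	int			save_errno = errno;

	got_SIGTERM = true;

	/* Wake up the WaitLatch() call in the main loop */
	if (MyProc)
		SetLatch(&MyProc->procLatch);

	errno = save_errno;
}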
Example #10
0
/*
 * pgfdw_subxact_callback --- cleanup at subtransaction end.
 */
static void
pgfdw_subxact_callback(SubXactEvent event, SubTransactionId mySubid,
					   SubTransactionId parentSubid, void *arg)
{
	HASH_SEQ_STATUS scan;
	ConnCacheEntry *entry;
	int			curlevel;

	/* Nothing to do at subxact start, nor after commit. */
	if (!(event == SUBXACT_EVENT_PRE_COMMIT_SUB ||
		  event == SUBXACT_EVENT_ABORT_SUB))
		return;

	/* Quick exit if no connections were touched in this transaction. */
	if (!xact_got_connection)
		return;

	/*
	 * Scan all connection cache entries to find open remote subtransactions
	 * of the current level, and close them.
	 */
	curlevel = GetCurrentTransactionNestLevel();
	hash_seq_init(&scan, ConnectionHash);
	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
	{
		PGresult   *res;
		char		sql[100];

		/*
		 * We only care about connections with open remote subtransactions of
		 * the current level.
		 */
		if (entry->conn == NULL || entry->xact_depth < curlevel)
			continue;

		if (entry->xact_depth > curlevel)
			elog(ERROR, "missed cleaning up remote subtransaction at level %d",
				 entry->xact_depth);

		if (event == SUBXACT_EVENT_PRE_COMMIT_SUB)
		{
			/* Commit all remote subtransactions during pre-commit */
			snprintf(sql, sizeof(sql), "RELEASE SAVEPOINT s%d", curlevel);
			do_sql_command(entry->conn, sql);
		}
		else
		{
			/* Assume we might have lost track of prepared statements */
			entry->have_error = true;

			/*
			 * If a command has been submitted to the remote server by using
			 * an asynchronous execution function, the command might not have
			 * yet completed.  Check to see if a command is still being
			 * processed by the remote server, and if so, request cancellation
			 * of the command.
			 */
			if (PQtransactionStatus(entry->conn) == PQTRANS_ACTIVE)
			{
				PGcancel   *cancel;
				char		errbuf[256];

				if ((cancel = PQgetCancel(entry->conn)))
				{
					if (!PQcancel(cancel, errbuf, sizeof(errbuf)))
						ereport(WARNING,
								(errcode(ERRCODE_CONNECTION_FAILURE),
								 errmsg("could not send cancel request: %s",
										errbuf)));
					PQfreeCancel(cancel);
				}
			}

			/* Rollback all remote subtransactions during abort */
			snprintf(sql, sizeof(sql),
					 "ROLLBACK TO SAVEPOINT s%d; RELEASE SAVEPOINT s%d",
					 curlevel, curlevel);
			res = PQexec(entry->conn, sql);
			if (PQresultStatus(res) != PGRES_COMMAND_OK)
				pgfdw_report_error(WARNING, res, entry->conn, true, sql);
			else
				PQclear(res);
		}

		/* OK, we're outta that level of subtransaction */
		entry->xact_depth--;
	}
}
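/*
 * For context, a hedged sketch (not part of the original source) of the
 * opening side of the remote subtransactions closed above.  postgres_fdw
 * issues "SAVEPOINT s%d" named after the local nesting level; this sketch
 * assumes the do_sql_command() helper and the ConnCacheEntry fields already
 * used in this example.
 */
static void
begin_remote_subxact_sketch(ConnCacheEntry *entry)
{
	int			curlevel = GetCurrentTransactionNestLevel();
	char		sql[100];

	/* Open one remote savepoint per missing nesting level */
	while (entry->xact_depth < curlevel)
	{
		entry->xact_depth++;
		snprintf(sql, sizeof(sql), "SAVEPOINT s%d", entry->xact_depth);
		do_sql_command(entry->conn, sql);
	}
}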
Example #11
0
/*
 *	mdsync() -- Sync previous writes to stable storage.
 */
void
mdsync(void)
{
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	int			absorb_counter;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		elog(ERROR, "cannot sync without a pendingOpsTable");

	/*
	 * If we are in the bgwriter, the sync had better include all fsync
	 * requests that were queued by backends up to this point.	The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync().  We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have.  To forestall wraparound, any time the
	 * previous mdsync() failed to complete, run through the table and
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
	if (mdsync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	mdsync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	mdsync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		/*
		 * If the entry is new then don't process it this time.  Note that
		 * "continue" bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == mdsync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

		/*
		 * If fsync is off then we don't have to bother opening the file at
		 * all.  (We delay checking until this point so that changing fsync on
		 * the fly behaves sensibly.)  Also, if the entry is marked canceled,
		 * fall through to delete it.
		 */
		if (enableFsync && !entry->canceled)
		{
			int			failures;

			/*
			 * If in bgwriter, we want to absorb pending requests every so
			 * often to prevent overflow of the fsync request queue.  It is
			 * unspecified whether newly-added entries will be visited by
			 * hash_seq_search, but we don't care since we don't need to
			 * process them anyway.
			 */
			if (--absorb_counter <= 0)
			{
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;
			}

			/*
			 * The fsync table could contain requests to fsync segments that
			 * have been deleted (unlinked) by the time we get to them. Rather
			 * than just hoping an ENOENT (or EACCES on Windows) error can be
			 * ignored, what we do on error is absorb pending requests and
			 * then retry.	Since mdunlink() queues a "revoke" message before
			 * actually unlinking, the fsync request is guaranteed to be
			 * marked canceled after the absorb if it really was this case.
			 * DROP DATABASE likewise has to tell us to forget fsync requests
			 * before it starts deletions.
			 */
			for (failures = 0;; failures++)		/* loop exits at "break" */
			{
				SMgrRelation reln;
				MdfdVec    *seg;
				char	   *path;

				/*
				 * Find or create an smgr hash entry for this relation. This
				 * may seem a bit unclean -- md calling smgr?  But it's really
				 * the best solution.  It ensures that the open file reference
				 * isn't permanently leaked if we get an error here. (You may
				 * say "but an unreferenced SMgrRelation is still a leak!" Not
				 * really, because the only case in which a checkpoint is done
				 * by a process that isn't about to shut down is in the
				 * bgwriter, and it will periodically do smgrcloseall(). This
				 * fact justifies our not closing the reln in the success path
				 * either, which is a good thing since in non-bgwriter cases
				 * we couldn't safely do that.)  Furthermore, in many cases
				 * the relation will have been dirtied through this same smgr
				 * relation, and so we can save a file open/close cycle.
				 */
				reln = smgropen(entry->tag.rnode.node,
								entry->tag.rnode.backend);

				/*
				 * It is possible that the relation has been dropped or
				 * truncated since the fsync request was entered.  Therefore,
				 * allow ENOENT, but only if we didn't fail already on this
				 * file.  This applies both during _mdfd_getseg() and during
				 * FileSync, since fd.c might have closed the file behind our
				 * back.
				 */
				seg = _mdfd_getseg(reln, entry->tag.forknum,
							  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
								   false, EXTENSION_RETURN_NULL);
				if (seg != NULL &&
					FileSync(seg->mdfd_vfd) >= 0)
					break;		/* success; break out of retry loop */

				/*
				 * XXX is there any point in allowing more than one retry?
				 * Don't see one at the moment, but easy to change the test
				 * here if so.
				 */
				path = _mdfd_segpath(reln, entry->tag.forknum,
									 entry->tag.segno);
				if (!FILE_POSSIBLY_DELETED(errno) ||
					failures > 0)
					ereport(ERROR,
							(errcode_for_file_access(),
						   errmsg("could not fsync file \"%s\": %m", path)));
				else
					ereport(DEBUG1,
							(errcode_for_file_access(),
					   errmsg("could not fsync file \"%s\" but retrying: %m",
							  path)));
				pfree(path);

				/*
				 * Absorb incoming requests and check to see if canceled.
				 */
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;		/* might as well... */

				if (entry->canceled)
					break;
			}					/* end retry loop */
		}

		/*
		 * If we get here, either we fsync'd successfully, or we don't have to
		 * because enableFsync is off, or the entry is (now) marked canceled.
		 * Okay to delete it.
		 */
		if (hash_search(pendingOpsTable, &entry->tag,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}							/* end loop over hashtable entries */

	/* Flag successful completion of mdsync */
	mdsync_in_progress = false;
}
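/*
 * For context, a hedged sketch (not part of the original source) of the
 * enqueue side of the cycle-counter scheme mdsync() depends on: every entry
 * added to pendingOpsTable is stamped with the current mdsync_cycle_ctr, so
 * requests that arrive after a sync cycle has started are deferred to the
 * next cycle.  The tag type name and key handling are assumptions.
 */
static void
remember_fsync_request_sketch(PendingOperationTag *tag)
{
	PendingOperationEntry *entry;
	bool		found;

	entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
												  tag, HASH_ENTER, &found);
	if (!found)
	{
		/* New request: stamp with the current cycle so mdsync() skips it */
		entry->cycle_ctr = mdsync_cycle_ctr;
		entry->canceled = false;
	}
}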
Example #12
0
/*
 * Activate a standby master by removing the reference to the dead master
 * and changing our dbid to the old master's dbid
 */
void
PersistentFilespace_ActivateStandby(int16 oldmaster, int16 newmaster)
{
	HASH_SEQ_STATUS hstat;
	FilespaceDirEntry fde;

	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	if (Persistent_BeforePersistenceWork())
		elog(ERROR, "persistent table changes forbidden");

	hash_seq_init(&hstat, persistentFilespaceSharedHashTable);

	PersistentFilespace_VerifyInitScan();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	/*
	 * We release FilespaceHashLock in the middle of the loop and re-acquire
	 * it after doing persistent table change.  This is needed to prevent
	 * holding the lock for any purpose other than to protect the filespace
	 * shared hash table.  Not releasing this lock could result in file I/O
	 * and potential deadlock due to other LW locks being acquired in the
	 * process.  Releasing the lock this way is safe because we are still
	 * holding PersistentObjLock in exclusive mode.  Any change to the
	 * filespace shared hash table is also protected by PersistentObjLock.
	 */
	WRITE_FILESPACE_HASH_LOCK;
	while ((fde = hash_seq_search(&hstat)) != NULL)
	{
		Oid			filespace = fde->key.filespaceOid;
		PersistentFileSysObjName fsObjName;
		ItemPointerData persistentTid;
		int64		persistentSerialNum = fde->persistentSerialNum;

		ItemPointerCopy(&fde->persistentTid, &persistentTid);

		PersistentFileSysObjName_SetFilespaceDir(&fsObjName, filespace);

		if (fde->dbId1 == oldmaster)
		{
			fde->dbId1 = InvalidDbid;
			fde->dbId2 = newmaster;

			/* Copy standby filespace location into new master location */
			PersistentFilespace_BlankPadCopyLocation(
													 fde->locationBlankPadded2,
													 fde->locationBlankPadded1);

			PersistentFilespace_BlankPadCopyLocation(
													 fde->locationBlankPadded1,
													 "");
		}
		else if (fde->dbId2 == oldmaster)
		{
			fde->dbId2 = InvalidDbid;
			fde->dbId1 = newmaster;

			/* Copy standby filespace location into new master location */
			PersistentFilespace_BlankPadCopyLocation(
													 fde->locationBlankPadded1,
													 fde->locationBlankPadded2);

			PersistentFilespace_BlankPadCopyLocation(
													 fde->locationBlankPadded2,
													 "");
		}
		WRITE_FILESPACE_HASH_UNLOCK;

		PersistentFileSysObj_ActivateStandby(&fsObjName,
											 &persistentTid,
											 persistentSerialNum,
											 oldmaster,
											 newmaster,
											  /* flushToXlog */ false);
		WRITE_FILESPACE_HASH_LOCK;
	}
	WRITE_FILESPACE_HASH_UNLOCK;

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;
}
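/*
 * For context, a hedged sketch (not part of the original source) distilling
 * the lock hand-off pattern the comment above describes: copy what you need
 * out of the shared entry while holding the hash lock, drop the lock around
 * the blocking call, then retake it before resuming the scan.  As the
 * comment notes, this is only safe because an outer lock (PersistentObjLock)
 * keeps the hash table stable; do_blocking_io() is a hypothetical
 * placeholder.
 */
static void
scan_with_lock_handoff_sketch(void)
{
	HASH_SEQ_STATUS hstat;
	FilespaceDirEntry fde;

	hash_seq_init(&hstat, persistentFilespaceSharedHashTable);

	WRITE_FILESPACE_HASH_LOCK;
	while ((fde = hash_seq_search(&hstat)) != NULL)
	{
		int64		serialNum = fde->persistentSerialNum;	/* local copy */

		WRITE_FILESPACE_HASH_UNLOCK;
		do_blocking_io(serialNum);	/* file I/O happens with the lock dropped */
		WRITE_FILESPACE_HASH_LOCK;
	}
	WRITE_FILESPACE_HASH_UNLOCK;
}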