Beispiel #1
0
/*
 * Allocate a new hashtable entry.
 * caller must hold an exclusive lock on pgss->lock
 *
 * Note: despite needing exclusive lock, it's not an error for the target
 * entry to already exist.	This is because pgss_store releases and
 * reacquires lock after failing to find a match; so someone else could
 * have made the entry while we waited to get exclusive lock.
 */
static pgssEntry *
entry_alloc(pgssHashKey *key)
{
	pgssEntry  *entry;
	bool		found;

	/* Caller must have clipped query properly */
	Assert(key->query_len < pgss->query_size);

	/* Make space if needed */
	while (hash_get_num_entries(pgss_hash) >= pgss_max)
		entry_dealloc();

	/* Find or create an entry with desired hash code */
	entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &found);

	if (!found)
	{
		/* New entry, initialize it */

		/* dynahash tried to copy the key for us, but must fix query_ptr */
		entry->key.query_ptr = entry->query;
		/* reset the statistics */
		memset(&entry->counters, 0, sizeof(Counters));
		entry->counters.usage = USAGE_INIT;
		/* re-initialize the mutex each time ... we assume no one using it */
		SpinLockInit(&entry->mutex);
		/* ... and don't forget the query text */
		memcpy(entry->query, key->query_ptr, key->query_len);
		entry->query[key->query_len] = '\0';
	}

	return entry;
}
/*
 * Deallocate least used entries.
 * Caller must hold an exclusive lock on pgss->lock.
 */
static void
entry_dealloc(void)
{
	HASH_SEQ_STATUS hash_seq;
	pgssEntry **entries;
	pgssEntry  *entry;
	int			nvictims;
	int			i;

	/* Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them. */

	entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));

	i = 0;
	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		entries[i++] = entry;
		entry->counters.usage *= USAGE_DECREASE_FACTOR;
	}

	qsort(entries, i, sizeof(pgssEntry *), entry_cmp);
	nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
	nvictims = Min(nvictims, i);

	for (i = 0; i < nvictims; i++)
	{
		hash_search(pgss_hash, &entries[i]->key, HASH_REMOVE, NULL);
	}

	pfree(entries);
}
Beispiel #3
0
/* Are there any unresolved references to invalid pages? */
bool
XLogHaveInvalidPages(void)
{
	if (invalid_page_tab != NULL &&
		hash_get_num_entries(invalid_page_tab) > 0)
		return true;
	return false;
}
Beispiel #4
0
bool
Persistent_PostDTMRecv_IsHashFull(void)
{
	Insist(PT_PostDTMRecv_Info);
	Insist(PT_PostDTMRecv_Info->postDTMRecv_dbTblSpc_Hash);

	return (hash_get_num_entries(PT_PostDTMRecv_Info->postDTMRecv_dbTblSpc_Hash) == PT_MAX_NUM_POSTDTMRECV_DB);
}
Beispiel #5
0
/*
 * Returns the number of elements in the hashtable.
 *
 * This function is not synchronized.
 */
long
SyncHTNumEntries(SyncHT *syncHT)
{
	Assert(NULL != syncHT);
	Assert(NULL != syncHT->ht);

	return hash_get_num_entries(syncHT->ht);
}
Beispiel #6
0
/*
 * shmem_shutdown hook: Dump statistics into file.
 *
 * Note: we don't bother with acquiring lock, because there should be no
 * other processes running when this is called.
 */
static void
pgss_shmem_shutdown(int code, Datum arg)
{
	FILE	   *file;
	HASH_SEQ_STATUS hash_seq;
	int32		num_entries;
	pgssEntry  *entry;

	/* Don't try to dump during a crash. */
	if (code)
		return;

	/* Safety check ... shouldn't get here unless shmem is set up. */
	if (!pgss || !pgss_hash)
		return;

	/* Don't dump if told not to. */
	if (!pgss_save)
		return;

	file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_W);
	if (file == NULL)
		goto error;

	if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1)
		goto error;
	num_entries = hash_get_num_entries(pgss_hash);
	if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
		goto error;

	hash_seq_init(&hash_seq, pgss_hash);
	while ((entry = hash_seq_search(&hash_seq)) != NULL)
	{
		int			len = entry->key.query_len;

		if (fwrite(entry, offsetof(pgssEntry, mutex), 1, file) != 1 ||
			fwrite(entry->query, 1, len, file) != len)
			goto error;
	}

	if (FreeFile(file))
	{
		file = NULL;
		goto error;
	}

	return;

error:
	ereport(LOG,
			(errcode_for_file_access(),
			 errmsg("could not write pg_stat_statement file \"%s\": %m",
					PGSS_DUMP_FILE)));
	if (file)
		FreeFile(file);
	unlink(PGSS_DUMP_FILE);
}
Beispiel #7
0
/*
 * FindRandomNodeNotInList finds a random node from the shared hash that is not
 * a member of the current node list. The caller is responsible for making the
 * necessary node count checks to ensure that such a node exists.
 *
 * Note that this function has a selection bias towards nodes whose positions in
 * the shared hash are sequentially adjacent to the positions of nodes that are
 * in the current node list. This bias follows from our decision to first pick a
 * random node in the hash, and if that node is a member of the current list, to
 * simply iterate to the next node in the hash. Overall, this approach trades in
 * some selection bias for simplicity in design and for bounded execution time.
 */
static WorkerNode *
FindRandomNodeNotInList(HTAB *WorkerNodesHash, List *currentNodeList)
{
	WorkerNode *workerNode = NULL;
	HASH_SEQ_STATUS status;
	uint32 workerNodeCount = 0;
	uint32 currentNodeCount PG_USED_FOR_ASSERTS_ONLY = 0;
	bool lookForWorkerNode = true;
	uint32 workerPosition = 0;
	uint32 workerIndex = 0;

	workerNodeCount = hash_get_num_entries(WorkerNodesHash);
	currentNodeCount = list_length(currentNodeList);
	Assert(workerNodeCount > currentNodeCount);

	/*
	 * We determine a random position within the worker hash between [1, N],
	 * assuming that the number of elements in the hash is N. We then get to
	 * this random position by iterating over the worker hash. Please note that
	 * the random seed has already been set by the postmaster when starting up.
	 */
	workerPosition = (random() % workerNodeCount) + 1;
	hash_seq_init(&status, WorkerNodesHash);

	for (workerIndex = 0; workerIndex < workerPosition; workerIndex++)
	{
		workerNode = (WorkerNode *) hash_seq_search(&status);
	}

	while (lookForWorkerNode)
	{
		bool listMember = ListMember(currentNodeList, workerNode);

		if (workerNode->inWorkerFile && !listMember)
		{
			lookForWorkerNode = false;
		}
		else
		{
			/* iterate to the next worker node in the hash */
			workerNode = (WorkerNode *) hash_seq_search(&status);

			/* reached end of hash; start from the beginning */
			if (workerNode == NULL)
			{
				hash_seq_init(&status, WorkerNodesHash);
				workerNode = (WorkerNode *) hash_seq_search(&status);
			}
		}
	}

	/* we stopped scanning before completion; therefore clean up scan */
	hash_seq_term(&status);

	return workerNode;
}
Beispiel #8
0
/*
 * create and populate the crosstab tuplestore using the provided source query
 */
static Tuplestorestate *
get_crosstab_tuplestore(char *sql,
						HTAB *crosstab_hash,
						TupleDesc tupdesc,
						MemoryContext per_query_ctx,
						bool randomAccess)
{
	Tuplestorestate *tupstore;
	int			num_categories = hash_get_num_entries(crosstab_hash);
	AttInMetadata *attinmeta = TupleDescGetAttInMetadata(tupdesc);
	char	  **values;
	HeapTuple	tuple;
	int			ret;
	int			proc;

	/* initialize our tuplestore (while still in query context!) */
	tupstore = tuplestore_begin_heap(randomAccess, false, work_mem);

	/* Connect to SPI manager */
	if ((ret = SPI_connect()) < 0)
		/* internal error */
		elog(ERROR, "get_crosstab_tuplestore: SPI_connect returned %d", ret);

	/* Now retrieve the crosstab source rows */
	ret = SPI_execute(sql, true, 0);
	proc = SPI_processed;

	/* Check for qualifying tuples */
	if ((ret == SPI_OK_SELECT) && (proc > 0))
	{
		SPITupleTable *spi_tuptable = SPI_tuptable;
		TupleDesc	spi_tupdesc = spi_tuptable->tupdesc;
		int			ncols = spi_tupdesc->natts;
		char	   *rowid;
		char	   *lastrowid = NULL;
		bool		firstpass = true;
		int			i,
					j;
		int			result_ncols;

		if (num_categories == 0)
		{
			/* no qualifying category tuples */
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("provided \"categories\" SQL must " \
							"return 1 column of at least one row")));
		}

		/*
		 * The provided SQL query must always return at least three columns:
		 *
		 * 1. rowname	the label for each row - column 1 in the final result
		 * 2. category	the label for each value-column in the final result 3.
		 * value	 the values used to populate the value-columns
		 *
		 * If there are more than three columns, the last two are taken as
		 * "category" and "values". The first column is taken as "rowname".
		 * Additional columns (2 thru N-2) are assumed the same for the same
		 * "rowname", and are copied into the result tuple from the first time
		 * we encounter a particular rowname.
		 */
		if (ncols < 3)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("invalid source data SQL statement"),
					 errdetail("The provided SQL must return 3 " \
							   " columns; rowid, category, and values.")));

		result_ncols = (ncols - 2) + num_categories;

		/* Recheck to make sure we tuple descriptor still looks reasonable */
		if (tupdesc->natts != result_ncols)
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("invalid return type"),
					 errdetail("Query-specified return " \
							   "tuple has %d columns but crosstab " \
							   "returns %d.", tupdesc->natts, result_ncols)));

		/* allocate space */
		values = (char **) palloc(result_ncols * sizeof(char *));

		/* and make sure it's clear */
		memset(values, '\0', result_ncols * sizeof(char *));

		for (i = 0; i < proc; i++)
		{
			HeapTuple	spi_tuple;
			crosstab_cat_desc *catdesc;
			char	   *catname;

			/* get the next sql result tuple */
			spi_tuple = spi_tuptable->vals[i];

			/* get the rowid from the current sql result tuple */
			rowid = SPI_getvalue(spi_tuple, spi_tupdesc, 1);

			/*
			 * if we're on a new output row, grab the column values up to
			 * column N-2 now
			 */
			if (firstpass || !xstreq(lastrowid, rowid))
			{
				/*
				 * a new row means we need to flush the old one first, unless
				 * we're on the very first row
				 */
				if (!firstpass)
				{
					/* rowid changed, flush the previous output row */
					tuple = BuildTupleFromCStrings(attinmeta, values);

					tuplestore_puttuple(tupstore, tuple);

					for (j = 0; j < result_ncols; j++)
						xpfree(values[j]);
				}

				values[0] = rowid;
				for (j = 1; j < ncols - 2; j++)
					values[j] = SPI_getvalue(spi_tuple, spi_tupdesc, j + 1);

				/* we're no longer on the first pass */
				firstpass = false;
			}

			/* look up the category and fill in the appropriate column */
			catname = SPI_getvalue(spi_tuple, spi_tupdesc, ncols - 1);

			if (catname != NULL)
			{
				crosstab_HashTableLookup(crosstab_hash, catname, catdesc);

				if (catdesc)
					values[catdesc->attidx + ncols - 2] =
						SPI_getvalue(spi_tuple, spi_tupdesc, ncols);
			}

			xpfree(lastrowid);
			xpstrdup(lastrowid, rowid);
		}

		/* flush the last output row */
		tuple = BuildTupleFromCStrings(attinmeta, values);

		tuplestore_puttuple(tupstore, tuple);
	}

	if (SPI_finish() != SPI_OK_FINISH)
		/* internal error */
		elog(ERROR, "get_crosstab_tuplestore: SPI_finish() failed");

	tuplestore_donestoring(tupstore);

	return tupstore;
}
Beispiel #9
0
/*
 *	compute_tsvector_stats() -- compute statistics for a tsvector column
 *
 *	This functions computes statistics that are useful for determining @@
 *	operations' selectivity, along with the fraction of non-null rows and
 *	average width.
 *
 *	Instead of finding the most common values, as we do for most datatypes,
 *	we're looking for the most common lexemes. This is more useful, because
 *	there most probably won't be any two rows with the same tsvector and thus
 *	the notion of a MCV is a bit bogus with this datatype. With a list of the
 *	most common lexemes we can do a better job at figuring out @@ selectivity.
 *
 *	For the same reasons we assume that tsvector columns are unique when
 *	determining the number of distinct values.
 *
 *	The algorithm used is Lossy Counting, as proposed in the paper "Approximate
 *	frequency counts over data streams" by G. S. Manku and R. Motwani, in
 *	Proceedings of the 28th International Conference on Very Large Data Bases,
 *	Hong Kong, China, August 2002, section 4.2. The paper is available at
 *	http://www.vldb.org/conf/2002/S10P03.pdf
 *
 *	The Lossy Counting (aka LC) algorithm goes like this:
 *	Let s be the threshold frequency for an item (the minimum frequency we
 *	are interested in) and epsilon the error margin for the frequency. Let D
 *	be a set of triples (e, f, delta), where e is an element value, f is that
 *	element's frequency (actually, its current occurrence count) and delta is
 *	the maximum error in f. We start with D empty and process the elements in
 *	batches of size w. (The batch size is also known as "bucket size" and is
 *	equal to 1/epsilon.) Let the current batch number be b_current, starting
 *	with 1. For each element e we either increment its f count, if it's
 *	already in D, or insert a new triple into D with values (e, 1, b_current
 *	- 1). After processing each batch we prune D, by removing from it all
 *	elements with f + delta <= b_current.  After the algorithm finishes we
 *	suppress all elements from D that do not satisfy f >= (s - epsilon) * N,
 *	where N is the total number of elements in the input.  We emit the
 *	remaining elements with estimated frequency f/N.  The LC paper proves
 *	that this algorithm finds all elements with true frequency at least s,
 *	and that no frequency is overestimated or is underestimated by more than
 *	epsilon.  Furthermore, given reasonable assumptions about the input
 *	distribution, the required table size is no more than about 7 times w.
 *
 *	We set s to be the estimated frequency of the K'th word in a natural
 *	language's frequency table, where K is the target number of entries in
 *	the MCELEM array plus an arbitrary constant, meant to reflect the fact
 *	that the most common words in any language would usually be stopwords
 *	so we will not actually see them in the input.  We assume that the
 *	distribution of word frequencies (including the stopwords) follows Zipf's
 *	law with an exponent of 1.
 *
 *	Assuming Zipfian distribution, the frequency of the K'th word is equal
 *	to 1/(K * H(W)) where H(n) is 1/2 + 1/3 + ... + 1/n and W is the number of
 *	words in the language.  Putting W as one million, we get roughly 0.07/K.
 *	Assuming top 10 words are stopwords gives s = 0.07/(K + 10).  We set
 *	epsilon = s/10, which gives bucket width w = (K + 10)/0.007 and
 *	maximum expected hashtable size of about 1000 * (K + 10).
 *
 *	Note: in the above discussion, s, epsilon, and f/N are in terms of a
 *	lexeme's frequency as a fraction of all lexemes seen in the input.
 *	However, what we actually want to store in the finished pg_statistic
 *	entry is each lexeme's frequency as a fraction of all rows that it occurs
 *	in.  Assuming that the input tsvectors are correctly constructed, no
 *	lexeme occurs more than once per tsvector, so the final count f is a
 *	correct estimate of the number of input tsvectors it occurs in, and we
 *	need only change the divisor from N to nonnull_cnt to get the number we
 *	want.
 */
static void
compute_tsvector_stats(VacAttrStats *stats,
					   AnalyzeAttrFetchFunc fetchfunc,
					   int samplerows,
					   double totalrows)
{
	int			num_mcelem;
	int			null_cnt = 0;
	double		total_width = 0;

	/* This is D from the LC algorithm. */
	HTAB	   *lexemes_tab;
	HASHCTL		hash_ctl;
	HASH_SEQ_STATUS scan_status;

	/* This is the current bucket number from the LC algorithm */
	int			b_current;

	/* This is 'w' from the LC algorithm */
	int			bucket_width;
	int			vector_no,
				lexeme_no;
	LexemeHashKey hash_key;
	TrackItem  *item;

	/*
	 * We want statistics_target * 10 lexemes in the MCELEM array.  This
	 * multiplier is pretty arbitrary, but is meant to reflect the fact that
	 * the number of individual lexeme values tracked in pg_statistic ought to
	 * be more than the number of values for a simple scalar column.
	 */
	num_mcelem = stats->attr->attstattarget * 10;

	/*
	 * We set bucket width equal to (num_mcelem + 10) / 0.007 as per the
	 * comment above.
	 */
	bucket_width = (num_mcelem + 10) * 1000 / 7;

	/*
	 * Create the hashtable. It will be in local memory, so we don't need to
	 * worry about overflowing the initial size. Also we don't need to pay any
	 * attention to locking and memory management.
	 */
	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
	hash_ctl.keysize = sizeof(LexemeHashKey);
	hash_ctl.entrysize = sizeof(TrackItem);
	hash_ctl.hash = lexeme_hash;
	hash_ctl.match = lexeme_match;
	hash_ctl.hcxt = CurrentMemoryContext;
	lexemes_tab = hash_create("Analyzed lexemes table",
							  num_mcelem,
							  &hash_ctl,
							  HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);

	/* Initialize counters. */
	b_current = 1;
	lexeme_no = 0;

	/* Loop over the tsvectors. */
	for (vector_no = 0; vector_no < samplerows; vector_no++)
	{
		Datum		value;
		bool		isnull;
		TSVector	vector;
		WordEntry  *curentryptr;
		char	   *lexemesptr;
		int			j;

		vacuum_delay_point();

		value = fetchfunc(stats, vector_no, &isnull);

		/*
		 * Check for null/nonnull.
		 */
		if (isnull)
		{
			null_cnt++;
			continue;
		}

		/*
		 * Add up widths for average-width calculation.  Since it's a
		 * tsvector, we know it's varlena.  As in the regular
		 * compute_minimal_stats function, we use the toasted width for this
		 * calculation.
		 */
		total_width += VARSIZE_ANY(DatumGetPointer(value));

		/*
		 * Now detoast the tsvector if needed.
		 */
		vector = DatumGetTSVector(value);

		/*
		 * We loop through the lexemes in the tsvector and add them to our
		 * tracking hashtable.
		 */
		lexemesptr = STRPTR(vector);
		curentryptr = ARRPTR(vector);
		for (j = 0; j < vector->size; j++)
		{
			bool		found;

			/*
			 * Construct a hash key.  The key points into the (detoasted)
			 * tsvector value at this point, but if a new entry is created, we
			 * make a copy of it.  This way we can free the tsvector value
			 * once we've processed all its lexemes.
			 */
			hash_key.lexeme = lexemesptr + curentryptr->pos;
			hash_key.length = curentryptr->len;

			/* Lookup current lexeme in hashtable, adding it if new */
			item = (TrackItem *) hash_search(lexemes_tab,
											 (const void *) &hash_key,
											 HASH_ENTER, &found);

			if (found)
			{
				/* The lexeme is already on the tracking list */
				item->frequency++;
			}
			else
			{
				/* Initialize new tracking list element */
				item->frequency = 1;
				item->delta = b_current - 1;

				item->key.lexeme = palloc(hash_key.length);
				memcpy(item->key.lexeme, hash_key.lexeme, hash_key.length);
			}

			/* lexeme_no is the number of elements processed (ie N) */
			lexeme_no++;

			/* We prune the D structure after processing each bucket */
			if (lexeme_no % bucket_width == 0)
			{
				prune_lexemes_hashtable(lexemes_tab, b_current);
				b_current++;
			}

			/* Advance to the next WordEntry in the tsvector */
			curentryptr++;
		}

		/* If the vector was toasted, free the detoasted copy. */
		if (TSVectorGetDatum(vector) != value)
			pfree(vector);
	}

	/* We can only compute real stats if we found some non-null values. */
	if (null_cnt < samplerows)
	{
		int			nonnull_cnt = samplerows - null_cnt;
		int			i;
		TrackItem **sort_table;
		int			track_len;
		int			cutoff_freq;
		int			minfreq,
					maxfreq;

		stats->stats_valid = true;
		/* Do the simple null-frac and average width stats */
		stats->stanullfrac = (double) null_cnt / (double) samplerows;
		stats->stawidth = total_width / (double) nonnull_cnt;

		/* Assume it's a unique column (see notes above) */
		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);

		/*
		 * Construct an array of the interesting hashtable items, that is,
		 * those meeting the cutoff frequency (s - epsilon)*N.  Also identify
		 * the minimum and maximum frequencies among these items.
		 *
		 * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
		 * frequency is 9*N / bucket_width.
		 */
		cutoff_freq = 9 * lexeme_no / bucket_width;

		i = hash_get_num_entries(lexemes_tab);	/* surely enough space */
		sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);

		hash_seq_init(&scan_status, lexemes_tab);
		track_len = 0;
		minfreq = lexeme_no;
		maxfreq = 0;
		while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
		{
			if (item->frequency > cutoff_freq)
			{
				sort_table[track_len++] = item;
				minfreq = Min(minfreq, item->frequency);
				maxfreq = Max(maxfreq, item->frequency);
			}
		}
		Assert(track_len <= i);

		/* emit some statistics for debug purposes */
		elog(DEBUG3, "tsvector_stats: target # mces = %d, bucket width = %d, "
			 "# lexemes = %d, hashtable size = %d, usable entries = %d",
			 num_mcelem, bucket_width, lexeme_no, i, track_len);

		/*
		 * If we obtained more lexemes than we really want, get rid of those
		 * with least frequencies.  The easiest way is to qsort the array into
		 * descending frequency order and truncate the array.
		 */
		if (num_mcelem < track_len)
		{
			qsort(sort_table, track_len, sizeof(TrackItem *),
				  trackitem_compare_frequencies_desc);
			/* reset minfreq to the smallest frequency we're keeping */
			minfreq = sort_table[num_mcelem - 1]->frequency;
		}
		else
			num_mcelem = track_len;

		/* Generate MCELEM slot entry */
		if (num_mcelem > 0)
		{
			MemoryContext old_context;
			Datum	   *mcelem_values;
			float4	   *mcelem_freqs;

			/*
			 * We want to store statistics sorted on the lexeme value using
			 * first length, then byte-for-byte comparison. The reason for
			 * doing length comparison first is that we don't care about the
			 * ordering so long as it's consistent, and comparing lengths
			 * first gives us a chance to avoid a strncmp() call.
			 *
			 * This is different from what we do with scalar statistics --
			 * they get sorted on frequencies. The rationale is that we
			 * usually search through most common elements looking for a
			 * specific value, so we can grab its frequency.  When values are
			 * presorted we can employ binary search for that.  See
			 * ts_selfuncs.c for a real usage scenario.
			 */
			qsort(sort_table, num_mcelem, sizeof(TrackItem *),
				  trackitem_compare_lexemes);

			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(stats->anl_context);

			/*
			 * We sorted statistics on the lexeme value, but we want to be
			 * able to find out the minimal and maximal frequency without
			 * going through all the values.  We keep those two extra
			 * frequencies in two extra cells in mcelem_freqs.
			 *
			 * (Note: the MCELEM statistics slot definition allows for a third
			 * extra number containing the frequency of nulls, but we don't
			 * create that for a tsvector column, since null elements aren't
			 * possible.)
			 */
			mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
			mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));

			/*
			 * See comments above about use of nonnull_cnt as the divisor for
			 * the final frequency estimates.
			 */
			for (i = 0; i < num_mcelem; i++)
			{
				TrackItem  *item = sort_table[i];

				mcelem_values[i] =
					PointerGetDatum(cstring_to_text_with_len(item->key.lexeme,
															 item->key.length));
				mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt;
			}
			mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
			mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt;
			MemoryContextSwitchTo(old_context);

			stats->stakind[0] = STATISTIC_KIND_MCELEM;
			stats->staop[0] = TextEqualOperator;
			stats->stacoll[0] = DEFAULT_COLLATION_OID;
			stats->stanumbers[0] = mcelem_freqs;
			/* See above comment about two extra frequency fields */
			stats->numnumbers[0] = num_mcelem + 2;
			stats->stavalues[0] = mcelem_values;
			stats->numvalues[0] = num_mcelem;
			/* We are storing text values */
			stats->statypid[0] = TEXTOID;
			stats->statyplen[0] = -1;	/* typlen, -1 for varlena */
			stats->statypbyval[0] = false;
			stats->statypalign[0] = 'i';
		}
	}
	else
	{
		/* We found only nulls; assume the column is entirely null */
		stats->stats_valid = true;
		stats->stanullfrac = 1.0;
		stats->stawidth = 0;	/* "unknown" */
		stats->stadistinct = 0.0;	/* "unknown" */
	}

	/*
	 * We don't need to bother cleaning up any of our temporary palloc's. The
	 * hashtable should also go away, as it used a child memory context.
	 */
}
Beispiel #10
0
/*
 * SQL function json_populate_record
 *
 * set fields in a record from the argument json
 *
 * Code adapted shamelessly from hstore's populate_record
 * which is in turn partly adapted from record_out.
 *
 * The json is decomposed into a hash table, in which each
 * field in the record is then looked up by name.
 */
Datum
json_populate_record(PG_FUNCTION_ARGS)
{
	Oid			argtype = get_fn_expr_argtype(fcinfo->flinfo, 0);
	text	   *json = PG_GETARG_TEXT_P(1);
	bool		use_json_as_text = PG_GETARG_BOOL(2);
	HTAB	   *json_hash;
	HeapTupleHeader rec;
	Oid			tupType;
	int32		tupTypmod;
	TupleDesc	tupdesc;
	HeapTupleData tuple;
	HeapTuple	rettuple;
	RecordIOData *my_extra;
	int			ncolumns;
	int			i;
	Datum	   *values;
	bool	   *nulls;
	char		fname[NAMEDATALEN];
	JsonHashEntry hashentry;


	if (!type_is_rowtype(argtype))
		ereport(ERROR,
				(errcode(ERRCODE_DATATYPE_MISMATCH),
				 errmsg("first argument must be a rowtype")));

	if (PG_ARGISNULL(0))
	{
		if (PG_ARGISNULL(1))
			PG_RETURN_NULL();

		rec = NULL;

		/*
		 * have no tuple to look at, so the only source of type info is the
		 * argtype. The lookup_rowtype_tupdesc call below will error out if we
		 * don't have a known composite type oid here.
		 */
		tupType = argtype;
		tupTypmod = -1;
	}
	else
	{
		rec = PG_GETARG_HEAPTUPLEHEADER(0);

		if (PG_ARGISNULL(1))
			PG_RETURN_POINTER(rec);

		/* Extract type info from the tuple itself */
		tupType = HeapTupleHeaderGetTypeId(rec);
		tupTypmod = HeapTupleHeaderGetTypMod(rec);
	}

	json_hash = get_json_object_as_hash(json, "json_populate_record", use_json_as_text);

	/*
	 * if the input json is empty, we can only skip the rest if we were passed
	 * in a non-null record, since otherwise there may be issues with domain
	 * nulls.
	 */
	if (hash_get_num_entries(json_hash) == 0 && rec)
		PG_RETURN_POINTER(rec);


	tupdesc = lookup_rowtype_tupdesc(tupType, tupTypmod);
	ncolumns = tupdesc->natts;

	if (rec)
	{
		/* Build a temporary HeapTuple control structure */
		tuple.t_len = HeapTupleHeaderGetDatumLength(rec);
		ItemPointerSetInvalid(&(tuple.t_self));
		tuple.t_data = rec;
	}

	/*
	 * We arrange to look up the needed I/O info just once per series of
	 * calls, assuming the record type doesn't change underneath us.
	 */
	my_extra = (RecordIOData *) fcinfo->flinfo->fn_extra;
	if (my_extra == NULL ||
		my_extra->ncolumns != ncolumns)
	{
		fcinfo->flinfo->fn_extra =
			MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
							   sizeof(RecordIOData) - sizeof(ColumnIOData)
							   + ncolumns * sizeof(ColumnIOData));
		my_extra = (RecordIOData *) fcinfo->flinfo->fn_extra;
		my_extra->record_type = InvalidOid;
		my_extra->record_typmod = 0;
	}

	if (my_extra->record_type != tupType ||
		my_extra->record_typmod != tupTypmod)
	{
		MemSet(my_extra, 0,
			   sizeof(RecordIOData) - sizeof(ColumnIOData)
			   + ncolumns * sizeof(ColumnIOData));
		my_extra->record_type = tupType;
		my_extra->record_typmod = tupTypmod;
		my_extra->ncolumns = ncolumns;
	}

	values = (Datum *) palloc(ncolumns * sizeof(Datum));
	nulls = (bool *) palloc(ncolumns * sizeof(bool));

	if (rec)
	{
		/* Break down the tuple into fields */
		heap_deform_tuple(&tuple, tupdesc, values, nulls);
	}
	else
	{
		for (i = 0; i < ncolumns; ++i)
		{
			values[i] = (Datum) 0;
			nulls[i] = true;
		}
	}

	for (i = 0; i < ncolumns; ++i)
	{
		ColumnIOData *column_info = &my_extra->columns[i];
		Oid			column_type = tupdesc->attrs[i]->atttypid;
		char	   *value;

		/* Ignore dropped columns in datatype */
		if (tupdesc->attrs[i]->attisdropped)
		{
			nulls[i] = true;
			continue;
		}

		memset(fname, 0, NAMEDATALEN);
		strncpy(fname, NameStr(tupdesc->attrs[i]->attname), NAMEDATALEN);
		hashentry = hash_search(json_hash, fname, HASH_FIND, NULL);

		/*
		 * we can't just skip here if the key wasn't found since we might have
		 * a domain to deal with. If we were passed in a non-null record
		 * datum, we assume that the existing values are valid (if they're
		 * not, then it's not our fault), but if we were passed in a null,
		 * then every field which we don't populate needs to be run through
		 * the input function just in case it's a domain type.
		 */
		if (hashentry == NULL && rec)
			continue;

		/*
		 * Prepare to convert the column value from text
		 */
		if (column_info->column_type != column_type)
		{
			getTypeInputInfo(column_type,
							 &column_info->typiofunc,
							 &column_info->typioparam);
			fmgr_info_cxt(column_info->typiofunc, &column_info->proc,
						  fcinfo->flinfo->fn_mcxt);
			column_info->column_type = column_type;
		}
		if (hashentry == NULL || hashentry->isnull)
		{
			/*
			 * need InputFunctionCall to happen even for nulls, so that domain
			 * checks are done
			 */
			values[i] = InputFunctionCall(&column_info->proc, NULL,
										  column_info->typioparam,
										  tupdesc->attrs[i]->atttypmod);
			nulls[i] = true;
		}
		else
		{
			value = hashentry->val;

			values[i] = InputFunctionCall(&column_info->proc, value,
										  column_info->typioparam,
										  tupdesc->attrs[i]->atttypmod);
			nulls[i] = false;
		}
	}

	rettuple = heap_form_tuple(tupdesc, values, nulls);

	ReleaseTupleDesc(tupdesc);

	PG_RETURN_DATUM(HeapTupleGetDatum(rettuple));
}
Beispiel #11
0
/* Process one per-dbspace directory for ResetUnloggedRelations */
static void
ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
{
	DIR		   *dbspace_dir;
	struct dirent *de;
	char		rm_path[MAXPGPATH];

	/* Caller must specify at least one operation. */
	Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0);

	/*
	 * Cleanup is a two-pass operation.  First, we go through and identify all
	 * the files with init forks.  Then, we go through again and nuke
	 * everything with the same OID except the init fork.
	 */
	if ((op & UNLOGGED_RELATION_CLEANUP) != 0)
	{
		HTAB	   *hash = NULL;
		HASHCTL		ctl;

		/* Open the directory. */
		dbspace_dir = AllocateDir(dbspacedirname);
		if (dbspace_dir == NULL)
		{
			elog(LOG,
				 "could not open dbspace directory \"%s\": %m",
				 dbspacedirname);
			return;
		}

		/*
		 * It's possible that someone could create a ton of unlogged relations
		 * in the same database & tablespace, so we'd better use a hash table
		 * rather than an array or linked list to keep track of which files
		 * need to be reset.  Otherwise, this cleanup operation would be
		 * O(n^2).
		 */
		ctl.keysize = sizeof(unlogged_relation_entry);
		ctl.entrysize = sizeof(unlogged_relation_entry);
		hash = hash_create("unlogged hash", 32, &ctl, HASH_ELEM);

		/* Scan the directory. */
		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
		{
			ForkNumber	forkNum;
			int			oidchars;
			unlogged_relation_entry ent;

			/* Skip anything that doesn't look like a relation data file. */
			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
													 &forkNum))
				continue;

			/* Also skip it unless this is the init fork. */
			if (forkNum != INIT_FORKNUM)
				continue;

			/*
			 * Put the OID portion of the name into the hash table, if it
			 * isn't already.
			 */
			memset(ent.oid, 0, sizeof(ent.oid));
			memcpy(ent.oid, de->d_name, oidchars);
			hash_search(hash, &ent, HASH_ENTER, NULL);
		}

		/* Done with the first pass. */
		FreeDir(dbspace_dir);

		/*
		 * If we didn't find any init forks, there's no point in continuing;
		 * we can bail out now.
		 */
		if (hash_get_num_entries(hash) == 0)
		{
			hash_destroy(hash);
			return;
		}

		/*
		 * Now, make a second pass and remove anything that matches. First,
		 * reopen the directory.
		 */
		dbspace_dir = AllocateDir(dbspacedirname);
		if (dbspace_dir == NULL)
		{
			elog(LOG,
				 "could not open dbspace directory \"%s\": %m",
				 dbspacedirname);
			hash_destroy(hash);
			return;
		}

		/* Scan the directory. */
		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
		{
			ForkNumber	forkNum;
			int			oidchars;
			bool		found;
			unlogged_relation_entry ent;

			/* Skip anything that doesn't look like a relation data file. */
			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
													 &forkNum))
				continue;

			/* We never remove the init fork. */
			if (forkNum == INIT_FORKNUM)
				continue;

			/*
			 * See whether the OID portion of the name shows up in the hash
			 * table.
			 */
			memset(ent.oid, 0, sizeof(ent.oid));
			memcpy(ent.oid, de->d_name, oidchars);
			hash_search(hash, &ent, HASH_FIND, &found);

			/* If so, nuke it! */
			if (found)
			{
				snprintf(rm_path, sizeof(rm_path), "%s/%s",
						 dbspacedirname, de->d_name);

				/*
				 * It's tempting to actually throw an error here, but since
				 * this code gets run during database startup, that could
				 * result in the database failing to start.  (XXX Should we do
				 * it anyway?)
				 */
				if (unlink(rm_path))
					elog(LOG, "could not unlink file \"%s\": %m", rm_path);
				else
					elog(DEBUG2, "unlinked file \"%s\"", rm_path);
			}
		}

		/* Cleanup is complete. */
		FreeDir(dbspace_dir);
		hash_destroy(hash);
	}

	/*
	 * Initialization happens after cleanup is complete: we copy each init
	 * fork file to the corresponding main fork file.  Note that if we are
	 * asked to do both cleanup and init, we may never get here: if the
	 * cleanup code determines that there are no init forks in this dbspace,
	 * it will return before we get to this point.
	 */
	if ((op & UNLOGGED_RELATION_INIT) != 0)
	{
		/* Open the directory. */
		dbspace_dir = AllocateDir(dbspacedirname);
		if (dbspace_dir == NULL)
		{
			/* we just saw this directory, so it really ought to be there */
			elog(LOG,
				 "could not open dbspace directory \"%s\": %m",
				 dbspacedirname);
			return;
		}

		/* Scan the directory. */
		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
		{
			ForkNumber	forkNum;
			int			oidchars;
			char		oidbuf[OIDCHARS + 1];
			char		srcpath[MAXPGPATH];
			char		dstpath[MAXPGPATH];

			/* Skip anything that doesn't look like a relation data file. */
			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
													 &forkNum))
				continue;

			/* Also skip it unless this is the init fork. */
			if (forkNum != INIT_FORKNUM)
				continue;

			/* Construct source pathname. */
			snprintf(srcpath, sizeof(srcpath), "%s/%s",
					 dbspacedirname, de->d_name);

			/* Construct destination pathname. */
			memcpy(oidbuf, de->d_name, oidchars);
			oidbuf[oidchars] = '\0';
			snprintf(dstpath, sizeof(dstpath), "%s/%s%s",
					 dbspacedirname, oidbuf, de->d_name + oidchars + 1 +
					 strlen(forkNames[INIT_FORKNUM]));

			/* OK, we're ready to perform the actual copy. */
			elog(DEBUG2, "copying %s to %s", srcpath, dstpath);
			copy_file(srcpath, dstpath);
		}

		/* Done with the first pass. */
		FreeDir(dbspace_dir);
	}
}
/*
 *	compute_tsvector_stats() -- compute statistics for a tsvector column
 *
 *	This functions computes statistics that are useful for determining @@
 *	operations' selectivity, along with the fraction of non-null rows and
 *	average width.
 *
 *	Instead of finding the most common values, as we do for most datatypes,
 *	we're looking for the most common lexemes. This is more useful, because
 *	there most probably won't be any two rows with the same tsvector and thus
 *	the notion of a MCV is a bit bogus with this datatype. With a list of the
 *	most common lexemes we can do a better job at figuring out @@ selectivity.
 *
 *	For the same reasons we assume that tsvector columns are unique when
 *	determining the number of distinct values.
 *
 *	The algorithm used is Lossy Counting, as proposed in the paper "Approximate
 *	frequency counts over data streams" by G. S. Manku and R. Motwani, in
 *	Proceedings of the 28th International Conference on Very Large Data Bases,
 *	Hong Kong, China, August 2002, section 4.2. The paper is available at
 *	http://www.vldb.org/conf/2002/S10P03.pdf
 *
 *	The Lossy Counting (aka LC) algorithm goes like this:
 *	Let D be a set of triples (e, f, d), where e is an element value, f is
 *	that element's frequency (occurrence count) and d is the maximum error in
 *	f.	We start with D empty and process the elements in batches of size
 *	w. (The batch size is also known as "bucket size".) Let the current batch
 *	number be b_current, starting with 1. For each element e we either
 *	increment its f count, if it's already in D, or insert a new triple into D
 *	with values (e, 1, b_current - 1). After processing each batch we prune D,
 *	by removing from it all elements with f + d <= b_current. Finally, we
 *	gather elements with largest f.  The LC paper proves error bounds on f
 *	dependent on the batch size w, and shows that the required table size
 *	is no more than a few times w.
 *
 *	We use a hashtable for the D structure and a bucket width of
 *	statistics_target * 10, where 10 is an arbitrarily chosen constant,
 *	meant to approximate the number of lexemes in a single tsvector.
 */
static void
compute_tsvector_stats(VacAttrStats *stats,
					   AnalyzeAttrFetchFunc fetchfunc,
					   int samplerows,
					   double totalrows)
{
	int			num_mcelem;
	int			null_cnt = 0;
	double		total_width = 0;

	/* This is D from the LC algorithm. */
	HTAB	   *lexemes_tab;
	HASHCTL		hash_ctl;
	HASH_SEQ_STATUS scan_status;

	/* This is the current bucket number from the LC algorithm */
	int			b_current;

	/* This is 'w' from the LC algorithm */
	int			bucket_width;
	int			vector_no,
				lexeme_no;
	LexemeHashKey hash_key;
	TrackItem  *item;

	/* We want statistics_target * 10 lexemes in the MCELEM array */
	num_mcelem = stats->attr->attstattarget * 10;

	/*
	 * We set bucket width equal to the target number of result lexemes. This
	 * is probably about right but perhaps might need to be scaled up or down
	 * a bit?
	 */
	bucket_width = num_mcelem;

	/*
	 * Create the hashtable. It will be in local memory, so we don't need to
	 * worry about initial size too much. Also we don't need to pay any
	 * attention to locking and memory management.
	 */
	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
	hash_ctl.keysize = sizeof(LexemeHashKey);
	hash_ctl.entrysize = sizeof(TrackItem);
	hash_ctl.hash = lexeme_hash;
	hash_ctl.match = lexeme_match;
	hash_ctl.hcxt = CurrentMemoryContext;
	lexemes_tab = hash_create("Analyzed lexemes table",
							  bucket_width * 4,
							  &hash_ctl,
					HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);

	/* Initialize counters. */
	b_current = 1;
	lexeme_no = 1;

	/* Loop over the tsvectors. */
	for (vector_no = 0; vector_no < samplerows; vector_no++)
	{
		Datum		value;
		bool		isnull;
		TSVector	vector;
		WordEntry  *curentryptr;
		char	   *lexemesptr;
		int			j;

		vacuum_delay_point();

		value = fetchfunc(stats, vector_no, &isnull);

		/*
		 * Check for null/nonnull.
		 */
		if (isnull)
		{
			null_cnt++;
			continue;
		}

		/*
		 * Add up widths for average-width calculation.  Since it's a
		 * tsvector, we know it's varlena.  As in the regular
		 * compute_minimal_stats function, we use the toasted width for this
		 * calculation.
		 */
		total_width += VARSIZE_ANY(DatumGetPointer(value));

		/*
		 * Now detoast the tsvector if needed.
		 */
		vector = DatumGetTSVector(value);

		/*
		 * We loop through the lexemes in the tsvector and add them to our
		 * tracking hashtable.	Note: the hashtable entries will point into
		 * the (detoasted) tsvector value, therefore we cannot free that
		 * storage until we're done.
		 */
		lexemesptr = STRPTR(vector);
		curentryptr = ARRPTR(vector);
		for (j = 0; j < vector->size; j++)
		{
			bool		found;

			/* Construct a hash key */
			hash_key.lexeme = lexemesptr + curentryptr->pos;
			hash_key.length = curentryptr->len;

			/* Lookup current lexeme in hashtable, adding it if new */
			item = (TrackItem *) hash_search(lexemes_tab,
											 (const void *) &hash_key,
											 HASH_ENTER, &found);

			if (found)
			{
				/* The lexeme is already on the tracking list */
				item->frequency++;
			}
			else
			{
				/* Initialize new tracking list element */
				item->frequency = 1;
				item->delta = b_current - 1;
			}

			/* We prune the D structure after processing each bucket */
			if (lexeme_no % bucket_width == 0)
			{
				prune_lexemes_hashtable(lexemes_tab, b_current);
				b_current++;
			}

			/* Advance to the next WordEntry in the tsvector */
			lexeme_no++;
			curentryptr++;
		}
	}

	/* We can only compute real stats if we found some non-null values. */
	if (null_cnt < samplerows)
	{
		int			nonnull_cnt = samplerows - null_cnt;
		int			i;
		TrackItem **sort_table;
		int			track_len;
		int			minfreq,
					maxfreq;

		stats->stats_valid = true;
		/* Do the simple null-frac and average width stats */
		stats->stanullfrac = (double) null_cnt / (double) samplerows;
		stats->stawidth = total_width / (double) nonnull_cnt;

		/* Assume it's a unique column (see notes above) */
		stats->stadistinct = -1.0;

		/*
		 * Determine the top-N lexemes by simply copying pointers from the
		 * hashtable into an array and applying qsort()
		 */
		track_len = hash_get_num_entries(lexemes_tab);

		sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * track_len);

		hash_seq_init(&scan_status, lexemes_tab);
		i = 0;
		while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
		{
			sort_table[i++] = item;
		}
		Assert(i == track_len);

		qsort(sort_table, track_len, sizeof(TrackItem *),
			  trackitem_compare_frequencies_desc);

		/* Suppress any single-occurrence items */
		while (track_len > 0)
		{
			if (sort_table[track_len - 1]->frequency > 1)
				break;
			track_len--;
		}

		/* Determine the number of most common lexemes to be stored */
		if (num_mcelem > track_len)
			num_mcelem = track_len;

		/* Generate MCELEM slot entry */
		if (num_mcelem > 0)
		{
			MemoryContext old_context;
			Datum	   *mcelem_values;
			float4	   *mcelem_freqs;

			/* Grab the minimal and maximal frequencies that will get stored */
			minfreq = sort_table[num_mcelem - 1]->frequency;
			maxfreq = sort_table[0]->frequency;

			/*
			 * We want to store statistics sorted on the lexeme value using
			 * first length, then byte-for-byte comparison. The reason for
			 * doing length comparison first is that we don't care about the
			 * ordering so long as it's consistent, and comparing lengths
			 * first gives us a chance to avoid a strncmp() call.
			 *
			 * This is different from what we do with scalar statistics --
			 * they get sorted on frequencies. The rationale is that we
			 * usually search through most common elements looking for a
			 * specific value, so we can grab its frequency.  When values are
			 * presorted we can employ binary search for that.	See
			 * ts_selfuncs.c for a real usage scenario.
			 */
			qsort(sort_table, num_mcelem, sizeof(TrackItem *),
				  trackitem_compare_lexemes);

			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(stats->anl_context);

			/*
			 * We sorted statistics on the lexeme value, but we want to be
			 * able to find out the minimal and maximal frequency without
			 * going through all the values.  We keep those two extra
			 * frequencies in two extra cells in mcelem_freqs.
			 */
			mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
			mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));

			for (i = 0; i < num_mcelem; i++)
			{
				TrackItem  *item = sort_table[i];

				mcelem_values[i] =
					PointerGetDatum(cstring_to_text_with_len(item->key.lexeme,
														  item->key.length));
				mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt;
			}
			mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
			mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt;
			MemoryContextSwitchTo(old_context);

			stats->stakind[0] = STATISTIC_KIND_MCELEM;
			stats->staop[0] = TextEqualOperator;
			stats->stanumbers[0] = mcelem_freqs;
			/* See above comment about two extra frequency fields */
			stats->numnumbers[0] = num_mcelem + 2;
			stats->stavalues[0] = mcelem_values;
			stats->numvalues[0] = num_mcelem;
			/* We are storing text values */
			stats->statypid[0] = TEXTOID;
			stats->statyplen[0] = -1;	/* typlen, -1 for varlena */
			stats->statypbyval[0] = false;
			stats->statypalign[0] = 'i';
		}
	}
	else
	{
		/* We found only nulls; assume the column is entirely null */
		stats->stats_valid = true;
		stats->stanullfrac = 1.0;
		stats->stawidth = 0;	/* "unknown" */
		stats->stadistinct = 0.0;		/* "unknown" */
	}

	/*
	 * We don't need to bother cleaning up any of our temporary palloc's. The
	 * hashtable should also go away, as it used a child memory context.
	 */
}
Beispiel #13
0
/*
 * Process one per-dbspace directory for ResetUnloggedRelations
 */
static void
ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
{
	DIR		   *dbspace_dir;
	struct dirent *de;
	char		rm_path[MAXPGPATH * 2];

	/* Caller must specify at least one operation. */
	Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0);

	/*
	 * Cleanup is a two-pass operation.  First, we go through and identify all
	 * the files with init forks.  Then, we go through again and nuke
	 * everything with the same OID except the init fork.
	 */
	if ((op & UNLOGGED_RELATION_CLEANUP) != 0)
	{
		HTAB	   *hash;
		HASHCTL		ctl;

		/*
		 * It's possible that someone could create a ton of unlogged relations
		 * in the same database & tablespace, so we'd better use a hash table
		 * rather than an array or linked list to keep track of which files
		 * need to be reset.  Otherwise, this cleanup operation would be
		 * O(n^2).
		 */
		memset(&ctl, 0, sizeof(ctl));
		ctl.keysize = sizeof(unlogged_relation_entry);
		ctl.entrysize = sizeof(unlogged_relation_entry);
		hash = hash_create("unlogged hash", 32, &ctl, HASH_ELEM);

		/* Scan the directory. */
		dbspace_dir = AllocateDir(dbspacedirname);
		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
		{
			ForkNumber	forkNum;
			int			oidchars;
			unlogged_relation_entry ent;

			/* Skip anything that doesn't look like a relation data file. */
			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
													 &forkNum))
				continue;

			/* Also skip it unless this is the init fork. */
			if (forkNum != INIT_FORKNUM)
				continue;

			/*
			 * Put the OID portion of the name into the hash table, if it
			 * isn't already.
			 */
			memset(ent.oid, 0, sizeof(ent.oid));
			memcpy(ent.oid, de->d_name, oidchars);
			hash_search(hash, &ent, HASH_ENTER, NULL);
		}

		/* Done with the first pass. */
		FreeDir(dbspace_dir);

		/*
		 * If we didn't find any init forks, there's no point in continuing;
		 * we can bail out now.
		 */
		if (hash_get_num_entries(hash) == 0)
		{
			hash_destroy(hash);
			return;
		}

		/*
		 * Now, make a second pass and remove anything that matches.
		 */
		dbspace_dir = AllocateDir(dbspacedirname);
		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
		{
			ForkNumber	forkNum;
			int			oidchars;
			bool		found;
			unlogged_relation_entry ent;

			/* Skip anything that doesn't look like a relation data file. */
			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
													 &forkNum))
				continue;

			/* We never remove the init fork. */
			if (forkNum == INIT_FORKNUM)
				continue;

			/*
			 * See whether the OID portion of the name shows up in the hash
			 * table.
			 */
			memset(ent.oid, 0, sizeof(ent.oid));
			memcpy(ent.oid, de->d_name, oidchars);
			hash_search(hash, &ent, HASH_FIND, &found);

			/* If so, nuke it! */
			if (found)
			{
				snprintf(rm_path, sizeof(rm_path), "%s/%s",
						 dbspacedirname, de->d_name);
				if (unlink(rm_path) < 0)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not remove file \"%s\": %m",
									rm_path)));
				else
					elog(DEBUG2, "unlinked file \"%s\"", rm_path);
			}
		}

		/* Cleanup is complete. */
		FreeDir(dbspace_dir);
		hash_destroy(hash);
	}

	/*
	 * Initialization happens after cleanup is complete: we copy each init
	 * fork file to the corresponding main fork file.  Note that if we are
	 * asked to do both cleanup and init, we may never get here: if the
	 * cleanup code determines that there are no init forks in this dbspace,
	 * it will return before we get to this point.
	 */
	if ((op & UNLOGGED_RELATION_INIT) != 0)
	{
		/* Scan the directory. */
		dbspace_dir = AllocateDir(dbspacedirname);
		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
		{
			ForkNumber	forkNum;
			int			oidchars;
			char		oidbuf[OIDCHARS + 1];
			char		srcpath[MAXPGPATH * 2];
			char		dstpath[MAXPGPATH];

			/* Skip anything that doesn't look like a relation data file. */
			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
													 &forkNum))
				continue;

			/* Also skip it unless this is the init fork. */
			if (forkNum != INIT_FORKNUM)
				continue;

			/* Construct source pathname. */
			snprintf(srcpath, sizeof(srcpath), "%s/%s",
					 dbspacedirname, de->d_name);

			/* Construct destination pathname. */
			memcpy(oidbuf, de->d_name, oidchars);
			oidbuf[oidchars] = '\0';
			snprintf(dstpath, sizeof(dstpath), "%s/%s%s",
					 dbspacedirname, oidbuf, de->d_name + oidchars + 1 +
					 strlen(forkNames[INIT_FORKNUM]));

			/* OK, we're ready to perform the actual copy. */
			elog(DEBUG2, "copying %s to %s", srcpath, dstpath);
			copy_file(srcpath, dstpath);
		}

		FreeDir(dbspace_dir);

		/*
		 * copy_file() above has already called pg_flush_data() on the files
		 * it created. Now we need to fsync those files, because a checkpoint
		 * won't do it for us while we're in recovery. We do this in a
		 * separate pass to allow the kernel to perform all the flushes
		 * (especially the metadata ones) at once.
		 */
		dbspace_dir = AllocateDir(dbspacedirname);
		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
		{
			ForkNumber	forkNum;
			int			oidchars;
			char		oidbuf[OIDCHARS + 1];
			char		mainpath[MAXPGPATH];

			/* Skip anything that doesn't look like a relation data file. */
			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
													 &forkNum))
				continue;

			/* Also skip it unless this is the init fork. */
			if (forkNum != INIT_FORKNUM)
				continue;

			/* Construct main fork pathname. */
			memcpy(oidbuf, de->d_name, oidchars);
			oidbuf[oidchars] = '\0';
			snprintf(mainpath, sizeof(mainpath), "%s/%s%s",
					 dbspacedirname, oidbuf, de->d_name + oidchars + 1 +
					 strlen(forkNames[INIT_FORKNUM]));

			fsync_fname(mainpath, false);
		}

		FreeDir(dbspace_dir);

		/*
		 * Lastly, fsync the database directory itself, ensuring the
		 * filesystem remembers the file creations and deletions we've done.
		 * We don't bother with this during a call that does only
		 * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we
		 * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step
		 * too at the next startup attempt.
		 */
		fsync_fname(dbspacedirname, true);
	}
}
/*
 * compute_array_stats() -- compute statistics for an array column
 *
 * This function computes statistics useful for determining selectivity of
 * the array operators <@, &&, and @>.  It is invoked by ANALYZE via the
 * compute_stats hook after sample rows have been collected.
 *
 * We also invoke the standard compute_stats function, which will compute
 * "scalar" statistics relevant to the btree-style array comparison operators.
 * However, exact duplicates of an entire array may be rare despite many
 * arrays sharing individual elements.  This especially afflicts long arrays,
 * which are also liable to lack all scalar statistics due to the low
 * WIDTH_THRESHOLD used in analyze.c.  So, in addition to the standard stats,
 * we find the most common array elements and compute a histogram of distinct
 * element counts.
 *
 * The algorithm used is Lossy Counting, as proposed in the paper "Approximate
 * frequency counts over data streams" by G. S. Manku and R. Motwani, in
 * Proceedings of the 28th International Conference on Very Large Data Bases,
 * Hong Kong, China, August 2002, section 4.2. The paper is available at
 * http://www.vldb.org/conf/2002/S10P03.pdf
 *
 * The Lossy Counting (aka LC) algorithm goes like this:
 * Let s be the threshold frequency for an item (the minimum frequency we
 * are interested in) and epsilon the error margin for the frequency. Let D
 * be a set of triples (e, f, delta), where e is an element value, f is that
 * element's frequency (actually, its current occurrence count) and delta is
 * the maximum error in f. We start with D empty and process the elements in
 * batches of size w. (The batch size is also known as "bucket size" and is
 * equal to 1/epsilon.) Let the current batch number be b_current, starting
 * with 1. For each element e we either increment its f count, if it's
 * already in D, or insert a new___ triple into D with values (e, 1, b_current
 * - 1). After processing each batch we prune D, by removing from it all
 * elements with f + delta <= b_current.  After the algorithm finishes we
 * suppress all elements from D that do not satisfy f >= (s - epsilon) * N,
 * where N is the total number of elements in the input.  We emit the
 * remaining elements with estimated frequency f/N.  The LC paper proves
 * that this algorithm finds all elements with true frequency at least s,
 * and that no frequency is overestimated or is underestimated by more than
 * epsilon.  Furthermore, given reasonable assumptions about the input
 * distribution, the required table size is no more than about 7 times w.
 *
 * In the absence of a principled basis for other particular values, we
 * follow ts_typanalyze() and use parameters s = 0.07/K, epsilon = s/10.
 * But we leave out the correction for stopwords, which do not apply to
 * arrays.  These parameters give bucket width w = K/0.007 and maximum
 * expected hashtable size of about 1000 * K.
 *
 * Elements may repeat within an array.  Since duplicates do not change the
 * behavior of <@, && or @>, we want to count each element only once per
 * array.  Therefore, we store in the finished pg_statistic entry each
 * element's frequency as the fraction of all non-null rows that contain it.
 * We divide the raw counts by nonnull_cnt to get those figures.
 */
static void
compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
					int samplerows, double totalrows)
{
	ArrayAnalyzeExtraData *extra_data;
	int			num_mcelem;
	int			null_cnt = 0;
	int			null_elem_cnt = 0;
	int			analyzed_rows = 0;

	/* This is D from the LC algorithm. */
	HTAB	   *elements_tab;
	HASHCTL		elem_hash_ctl;
	HASH_SEQ_STATUS scan_status;

	/* This is the current bucket number from the LC algorithm */
	int			b_current;

	/* This is 'w' from the LC algorithm */
	int			bucket_width;
	int			array_no;
	int64		element_no;
	TrackItem  *item;
	int			slot_idx;
	HTAB	   *count_tab;
	HASHCTL		count_hash_ctl;
	DECountItem *count_item;

	extra_data = (ArrayAnalyzeExtraData *) stats->extra_data;

	/*
	 * Invoke analyze.c's standard analysis function to create scalar-style
	 * stats for the column.  It will expect its own extra_data pointer, so
	 * temporarily install that.
	 */
	stats->extra_data = extra_data->std_extra_data;
	(*extra_data->std_compute_stats) (stats, fetchfunc, samplerows, totalrows);
	stats->extra_data = extra_data;

	/*
	 * Set up static pointer for use by subroutines.  We wait till here in
	 * case std_compute_stats somehow recursively invokes us (probably not
	 * possible, but ...)
	 */
	array_extra_data = extra_data;

	/*
	 * We want statistics_target * 10 elements in the MCELEM array. This
	 * multiplier is pretty arbitrary, but is meant to reflect the fact that
	 * the number of individual elements tracked in pg_statistic ought to be
	 * more than the number of values for a simple scalar column.
	 */
	num_mcelem = stats->attr->attstattarget * 10;

	/*
	 * We set bucket width equal to num_mcelem / 0.007 as per the comment
	 * above.
	 */
	bucket_width = num_mcelem * 1000 / 7;

	/*
	 * Create the hashtable. It will be in local memory, so we don't need to
	 * worry about overflowing the initial size. Also we don't need to pay any
	 * attention to locking and memory management.
	 */
	MemSet(&elem_hash_ctl, 0, sizeof(elem_hash_ctl));
	elem_hash_ctl.keysize = sizeof(Datum);
	elem_hash_ctl.entrysize = sizeof(TrackItem);
	elem_hash_ctl.hash = element_hash;
	elem_hash_ctl.match = element_match;
	elem_hash_ctl.hcxt = CurrentMemoryContext;
	elements_tab = hash_create("Analyzed elements table",
							   num_mcelem,
							   &elem_hash_ctl,
					HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);

	/* hashtable for array distinct elements counts */
	MemSet(&count_hash_ctl, 0, sizeof(count_hash_ctl));
	count_hash_ctl.keysize = sizeof(int);
	count_hash_ctl.entrysize = sizeof(DECountItem);
	count_hash_ctl.hcxt = CurrentMemoryContext;
	count_tab = hash_create("Array distinct element count table",
							64,
							&count_hash_ctl,
							HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

	/* Initialize counters. */
	b_current = 1;
	element_no = 0;

	/* Loop over the arrays. */
	for (array_no = 0; array_no < samplerows; array_no++)
	{
		Datum		value;
		bool		isnull;
		ArrayType  *array;
		int			num_elems;
		Datum	   *elem_values;
		bool	   *elem_nulls;
		bool		null_present;
		int			j;
		int64		prev_element_no = element_no;
		int			distinct_count;
		bool		count_item_found;

		vacuum_delay_point();

		value = fetchfunc(stats, array_no, &isnull);
		if (isnull)
		{
			/* array is null, just count that */
			null_cnt++;
			continue;
		}

		/* Skip too-large values. */
		if (toast_raw_datum_size(value) > ARRAY_WIDTH_THRESHOLD)
			continue;
		else
			analyzed_rows++;

		/*
		 * Now detoast the array if needed, and deconstruct into datums.
		 */
		array = DatumGetArrayTypeP(value);

		Assert(ARR_ELEMTYPE(array) == extra_data->type_id);
		deconstruct_array(array,
						  extra_data->type_id,
						  extra_data->typlen,
						  extra_data->typbyval,
						  extra_data->typalign,
						  &elem_values, &elem_nulls, &num_elems);

		/*
		 * We loop through the elements in the array and add them to our
		 * tracking hashtable.
		 */
		null_present = false;
		for (j = 0; j < num_elems; j++)
		{
			Datum		elem_value;
			bool		found;

			/* No null element processing other than flag setting here */
			if (elem_nulls[j])
			{
				null_present = true;
				continue;
			}

			/* Lookup current element in hashtable, adding it if new___ */
			elem_value = elem_values[j];
			item = (TrackItem *) hash_search(elements_tab,
											 (const void *) &elem_value,
											 HASH_ENTER, &found);

			if (found)
			{
				/* The element value is already on the tracking list */

				/*
				 * The operators we assist ignore duplicate array elements, so
				 * count a given distinct element only once per array.
				 */
				if (item->last_container == array_no)
					continue;

				item->frequency++;
				item->last_container = array_no;
			}
			else
			{
				/* Initialize new___ tracking list element */

				/*
				 * If element type is pass-by-reference, we must copy it into
				 * palloc'd space, so that we can release the array below. (We
				 * do this so that the space needed for element values is
				 * limited by the size of the hashtable; if we kept all the
				 * array values around, it could be much more.)
				 */
				item->key = datumCopy(elem_value,
									  extra_data->typbyval,
									  extra_data->typlen);

				item->frequency = 1;
				item->delta = b_current - 1;
				item->last_container = array_no;
			}

			/* element_no is the number of elements processed (ie N) */
			element_no++;

			/* We prune the D structure after processing each bucket */
			if (element_no % bucket_width == 0)
			{
				prune_element_hashtable(elements_tab, b_current);
				b_current++;
			}
		}

		/* Count null element presence once per array. */
		if (null_present)
			null_elem_cnt++;

		/* Update frequency of the particular array distinct element count. */
		distinct_count = (int) (element_no - prev_element_no);
		count_item = (DECountItem *) hash_search(count_tab, &distinct_count,
												 HASH_ENTER,
												 &count_item_found);

		if (count_item_found)
			count_item->frequency++;
		else
			count_item->frequency = 1;

		/* Free memory allocated while detoasting. */
		if (PointerGetDatum(array) != value)
			pfree(array);
		pfree(elem_values);
		pfree(elem_nulls);
	}

	/* Skip pg_statistic slots occupied by standard statistics */
	slot_idx = 0;
	while (slot_idx < STATISTIC_NUM_SLOTS && stats->stakind[slot_idx] != 0)
		slot_idx++;
	if (slot_idx > STATISTIC_NUM_SLOTS - 2)
		elog(ERROR, "insufficient pg_statistic slots for array stats");

	/* We can only compute real stats if we found some non-null values. */
	if (analyzed_rows > 0)
	{
		int			nonnull_cnt = analyzed_rows;
		int			count_items_count;
		int			i;
		TrackItem **sort_table;
		int			track_len;
		int64		cutoff_freq;
		int64		minfreq,
					maxfreq;

		/*
		 * We assume the standard stats code already took care of setting
		 * stats_valid, stanullfrac, stawidth, stadistinct.  We'd have to
		 * re-compute those values if we wanted to not store the standard
		 * stats.
		 */

		/*
		 * Construct an array of the interesting hashtable items, that is,
		 * those meeting the cutoff frequency (s - epsilon)*N.  Also identify
		 * the minimum and maximum frequencies among these items.
		 *
		 * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
		 * frequency is 9*N / bucket_width.
		 */
		cutoff_freq = 9 * element_no / bucket_width;

		i = hash_get_num_entries(elements_tab); /* surely enough space */
		sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);

		hash_seq_init(&scan_status, elements_tab);
		track_len = 0;
		minfreq = element_no;
		maxfreq = 0;
		while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
		{
			if (item->frequency > cutoff_freq)
			{
				sort_table[track_len++] = item;
				minfreq = Min(minfreq, item->frequency);
				maxfreq = Max(maxfreq, item->frequency);
			}
		}
		Assert(track_len <= i);

		/* emit some statistics for debug purposes */
		elog(DEBUG3, "compute_array_stats: target # mces = %d, "
			 "bucket width = %d, "
			 "# elements = " INT64_FORMAT ", hashtable size = %d, "
			 "usable entries = %d",
			 num_mcelem, bucket_width, element_no, i, track_len);

		/*
		 * If we obtained more elements than we really want, get rid of those
		 * with least frequencies.  The easiest way is to qsort the array into
		 * descending frequency order and truncate the array.
		 */
		if (num_mcelem < track_len)
		{
			qsort(sort_table, track_len, sizeof(TrackItem *),
				  trackitem_compare_frequencies_desc);
			/* reset minfreq to the smallest frequency we're keeping */
			minfreq = sort_table[num_mcelem - 1]->frequency;
		}
		else
			num_mcelem = track_len;

		/* Generate MCELEM slot entry */
		if (num_mcelem > 0)
		{
			MemoryContext old_context;
			Datum	   *mcelem_values;
			float4	   *mcelem_freqs;

			/*
			 * We want to store statistics sorted on the element value using
			 * the element type's default comparison function.  This permits
			 * fast binary searches in selectivity estimation functions.
			 */
			qsort(sort_table, num_mcelem, sizeof(TrackItem *),
				  trackitem_compare_element);

			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(stats->anl_context);

			/*
			 * We sorted statistics on the element value, but we want to be
			 * able to find the minimal and maximal frequencies without going
			 * through all the values.  We also want the frequency of null
			 * elements.  Store these three values at the end of mcelem_freqs.
			 */
			mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
			mcelem_freqs = (float4 *) palloc((num_mcelem + 3) * sizeof(float4));

			/*
			 * See comments above about use of nonnull_cnt as the divisor for
			 * the final frequency estimates.
			 */
			for (i = 0; i < num_mcelem; i++)
			{
				TrackItem  *item = sort_table[i];

				mcelem_values[i] = datumCopy(item->key,
											 extra_data->typbyval,
											 extra_data->typlen);
				mcelem_freqs[i] = (double) item->frequency /
					(double) nonnull_cnt;
			}
			mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
			mcelem_freqs[i++] = (double) maxfreq / (double) nonnull_cnt;
			mcelem_freqs[i++] = (double) null_elem_cnt / (double) nonnull_cnt;

			MemoryContextSwitchTo(old_context);

			stats->stakind[slot_idx] = STATISTIC_KIND_MCELEM;
			stats->staop[slot_idx] = extra_data->eq_opr;
			stats->stanumbers[slot_idx] = mcelem_freqs;
			/* See above comment about extra stanumber entries */
			stats->numnumbers[slot_idx] = num_mcelem + 3;
			stats->stavalues[slot_idx] = mcelem_values;
			stats->numvalues[slot_idx] = num_mcelem;
			/* We are storing values of element type */
			stats->statypid[slot_idx] = extra_data->type_id;
			stats->statyplen[slot_idx] = extra_data->typlen;
			stats->statypbyval[slot_idx] = extra_data->typbyval;
			stats->statypalign[slot_idx] = extra_data->typalign;
			slot_idx++;
		}

		/* Generate DECHIST slot entry */
		count_items_count = hash_get_num_entries(count_tab);
		if (count_items_count > 0)
		{
			int			num_hist = stats->attr->attstattarget;
			DECountItem **sorted_count_items;
			int			j;
			int			delta;
			int64		frac;
			float4	   *hist;

			/* num_hist must be at least 2 for the loop below to work */
			num_hist = Max(num_hist, 2);

			/*
			 * Create an array of DECountItem pointers, and sort them into
			 * increasing count order.
			 */
			sorted_count_items = (DECountItem **)
				palloc(sizeof(DECountItem *) * count_items_count);
			hash_seq_init(&scan_status, count_tab);
			j = 0;
			while ((count_item = (DECountItem *) hash_seq_search(&scan_status)) != NULL)
			{
				sorted_count_items[j++] = count_item;
			}
			qsort(sorted_count_items, count_items_count,
				  sizeof(DECountItem *), countitem_compare_count);

			/*
			 * Prepare to fill stanumbers with the histogram, followed by the
			 * average count.  This array must be stored in anl_context.
			 */
			hist = (float4 *)
				MemoryContextAlloc(stats->anl_context,
								   sizeof(float4) * (num_hist + 1));
			hist[num_hist] = (double) element_no / (double) nonnull_cnt;

			/*----------
			 * Construct the histogram of distinct-element counts (DECs).
			 *
			 * The object of this loop is to copy the min and max DECs to
			 * hist[0] and hist[num_hist - 1], along with evenly-spaced DECs
			 * in between (where "evenly-spaced" is with reference to the
			 * whole input population of arrays).  If we had a complete sorted
			 * array of DECs, one per analyzed row, the i'th hist value would
			 * come from DECs[i * (analyzed_rows - 1) / (num_hist - 1)]
			 * (compare the histogram-making loop in compute_scalar_stats()).
			 * But instead of that we have the sorted_count_items[] array,
			 * which holds unique DEC values with their frequencies (that is,
			 * a run-length-compressed version of the full array).  So we
			 * control advancing through sorted_count_items[] with the
			 * variable "frac", which is defined as (x - y) * (num_hist - 1),
			 * where x is the index in the notional DECs array corresponding
			 * to the start of the next sorted_count_items[] element's run,
			 * and y is the index in DECs from which we should take the next
			 * histogram value.  We have to advance whenever x <= y, that is
			 * frac <= 0.  The x component is the sum of the frequencies seen
			 * so far (up through the current sorted_count_items[] element),
			 * and of course y * (num_hist - 1) = i * (analyzed_rows - 1),
			 * per the subscript calculation above.  (The subscript calculation
			 * implies dropping any fractional part of y; in this formulation
			 * that's handled by not advancing until frac reaches 1.)
			 *
			 * Even though frac has a bounded range, it could overflow int32
			 * when working with very large statistics targets, so we do that
			 * math in int64.
			 *----------
			 */
			delta = analyzed_rows - 1;
			j = 0;				/* current index in sorted_count_items */
			/* Initialize frac for sorted_count_items[0]; y is initially 0 */
			frac = (int64) sorted_count_items[0]->frequency * (num_hist - 1);
			for (i = 0; i < num_hist; i++)
			{
				while (frac <= 0)
				{
					/* Advance, and update x component of frac */
					j++;
					frac += (int64) sorted_count_items[j]->frequency * (num_hist - 1);
				}
				hist[i] = sorted_count_items[j]->count;
				frac -= delta;	/* update y for upcoming i increment */
			}
			Assert(j == count_items_count - 1);

			stats->stakind[slot_idx] = STATISTIC_KIND_DECHIST;
			stats->staop[slot_idx] = extra_data->eq_opr;
			stats->stanumbers[slot_idx] = hist;
			stats->numnumbers[slot_idx] = num_hist + 1;
			slot_idx++;
		}
	}

	/*
	 * We don't need to bother cleaning up any of our temporary palloc's. The
	 * hashtable should also go away, as it used a child memory context.
	 */
}