Ejemplo n.º 1
0
/* =============================================================================
 * element_mapCompare
 *
 * For use in MAP_T
 * =============================================================================
 */
long
element_mapCompare (const pair_t* aPtr, const pair_t* bPtr)
{
    element_t* aElementPtr = (element_t*)(aPtr->firstPtr);
    element_t* bElementPtr = (element_t*)(bPtr->firstPtr);

    return element_compare(aElementPtr, bElementPtr);
}
Ejemplo n.º 2
0
/* =============================================================================
 * element_listCompare
 *
 * For use in list_t
 * =============================================================================
 */
long
element_listCompare (const void* aPtr, const void* bPtr)
{
    element_t* aElementPtr = (element_t*)aPtr;
    element_t* bElementPtr = (element_t*)bPtr;

    return element_compare(aElementPtr, bElementPtr);
}
Ejemplo n.º 3
0
/*
 * qsort() comparator for sorting TrackItems by element values
 */
static int
trackitem_compare_element(const void *e1, const void *e2)
{
	const TrackItem *const * t1 = (const TrackItem *const *) e1;
	const TrackItem *const * t2 = (const TrackItem *const *) e2;

	return element_compare(&(*t1)->key, &(*t2)->key);
}
Ejemplo n.º 4
0
/*
 * Estimate selectivity of "column <@ const" based on most common element
 * statistics.
 *
 * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
 * the array column's MCELEM statistics slot, or are NULL/0 if stats are
 * not available.  array_data (of length nitems) is the constant's elements.
 * hist (of length nhist) is from the array column's DECHIST statistics slot,
 * or is NULL/0 if those stats are not available.
 *
 * Both the mcelem and array_data arrays are assumed presorted according
 * to the element type's cmpfunc.  Null elements are not present.
 *
 * Independent element occurrence would imply a particular distribution of
 * distinct element counts among matching rows.  Real data usually falsifies
 * that assumption.  For example, in a set of 11-element integer arrays having
 * elements in the range [0..10], element occurrences are typically not
 * independent.  If they were, a sufficiently-large set would include all
 * distinct element counts 0 through 11.  We correct for this using the
 * histogram of distinct element counts.
 *
 * In the "column @> const" and "column && const" cases, we usually have a
 * "const" with low number of elements (otherwise we have selectivity close
 * to 0 or 1 respectively).  That's why the effect of dependence related
 * to distinct element count distribution is negligible there.  In the
 * "column <@ const" case, number of elements is usually high (otherwise we
 * have selectivity close to 0).  That's why we should do a correction with
 * the array distinct element count distribution here.
 *
 * Using the histogram of distinct element counts produces a different
 * distribution law than independent occurrences of elements.  This
 * distribution law can be described as follows:
 *
 * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
 *	  (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
 *
 * where:
 * o1, o2, ..., on - occurrences of elements 1, 2, ..., n
 *		(1 - occurrence, 0 - no occurrence) in row
 * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
 *		(scalar values in [0..1]) according to collected statistics
 * m = o1 + o2 + ... + on = total number of distinct elements in row
 * hist[m] - histogram data for occurrence of m elements.
 * ind[m] - probability of m occurrences from n events assuming their
 *	  probabilities to be equal to frequencies of array elements.
 *
 * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
 * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m
 */
static Selectivity
mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
							 float4 *numbers, int nnumbers,
							 Datum *array_data, int nitems,
							 float4 *hist, int nhist,
							 Oid operator, FmgrInfo *cmpfunc)
{
	int			mcelem_index,
				i,
				unique_nitems = 0;
	float		selec,
				minfreq,
				nullelem_freq;
	float	   *dist,
			   *mcelem_dist,
			   *hist_part;
	float		avg_count,
				mult,
				rest;
	float	   *elem_selec;

	/*
	 * There should be three more Numbers than Values in the MCELEM slot,
	 * because the last three cells should hold minimal and maximal frequency
	 * among the non-null elements, and then the frequency of null elements.
	 * Punt if not right, because we can't do much without the element freqs.
	 */
	if (numbers == NULL || nnumbers != nmcelem + 3)
		return DEFAULT_CONTAIN_SEL;

	/* Can't do much without a count histogram, either */
	if (hist == NULL || nhist < 3)
		return DEFAULT_CONTAIN_SEL;

	/*
	 * Grab some of the summary statistics that compute_array_stats() stores:
	 * lowest frequency, frequency of null elements, and average distinct
	 * element count.
	 */
	minfreq = numbers[nmcelem];
	nullelem_freq = numbers[nmcelem + 2];
	avg_count = hist[nhist - 1];

	/*
	 * "rest" will be the sum of the frequencies of all elements not
	 * represented in MCELEM.  The average distinct element count is the sum
	 * of the frequencies of *all* elements.  Begin with that; we will proceed
	 * to subtract the MCELEM frequencies.
	 */
	rest = avg_count;

	/*
	 * mult is a multiplier representing estimate of probability that each
	 * mcelem that is not present in constant doesn't occur.
	 */
	mult = 1.0f;

	/*
	 * elem_selec is array of estimated frequencies for elements in the
	 * constant.
	 */
	elem_selec = (float *) palloc(sizeof(float) * nitems);

	/* Scan mcelem and array in parallel. */
	mcelem_index = 0;
	for (i = 0; i < nitems; i++)
	{
		bool		match = false;

		/* Ignore any duplicates in the array data. */
		if (i > 0 &&
			element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
			continue;

		/*
		 * Iterate over MCELEM until we find an entry greater than or equal to
		 * this element of the constant.  Update "rest" and "mult" for mcelem
		 * entries skipped over.
		 */
		while (mcelem_index < nmcelem)
		{
			int			cmp = element_compare(&mcelem[mcelem_index],
											  &array_data[i],
											  cmpfunc);

			if (cmp < 0)
			{
				mult *= (1.0f - numbers[mcelem_index]);
				rest -= numbers[mcelem_index];
				mcelem_index++;
			}
			else
			{
				if (cmp == 0)
					match = true;	/* mcelem is found */
				break;
			}
		}

		if (match)
		{
			/* MCELEM matches the array item. */
			elem_selec[unique_nitems] = numbers[mcelem_index];
			/* "rest" is decremented for all mcelems, matched or not */
			rest -= numbers[mcelem_index];
			mcelem_index++;
		}
		else
		{
			/*
			 * The element is not in MCELEM.  Punt, but assume that the
			 * selectivity cannot be more than minfreq / 2.
			 */
			elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
											minfreq / 2);
		}

		unique_nitems++;
	}

	/*
	 * If we handled all constant elements without exhausting the MCELEM
	 * array, finish walking it to complete calculation of "rest" and "mult".
	 */
	while (mcelem_index < nmcelem)
	{
		mult *= (1.0f - numbers[mcelem_index]);
		rest -= numbers[mcelem_index];
		mcelem_index++;
	}

	/*
	 * The presence of many distinct rare elements materially decreases
	 * selectivity.  Use the Poisson distribution to estimate the probability
	 * of a column value having zero occurrences of such elements.  See above
	 * for the definition of "rest".
	 */
	mult *= exp(-rest);

	/*----------
	 * Using the distinct element count histogram requires
	 *		O(unique_nitems * (nmcelem + unique_nitems))
	 * operations.  Beyond a certain computational cost threshold, it's
	 * reasonable to sacrifice accuracy for decreased planning time.  We limit
	 * the number of operations to EFFORT * nmcelem; since nmcelem is limited
	 * by the column's statistics target, the work done is user-controllable.
	 *
	 * If the number of operations would be too large, we can reduce it
	 * without losing all accuracy by reducing unique_nitems and considering
	 * only the most-common elements of the constant array.  To make the
	 * results exactly match what we would have gotten with only those
	 * elements to start with, we'd have to remove any discarded elements'
	 * frequencies from "mult", but since this is only an approximation
	 * anyway, we don't bother with that.  Therefore it's sufficient to qsort
	 * elem_selec[] and take the largest elements.  (They will no longer match
	 * up with the elements of array_data[], but we don't care.)
	 *----------
	 */
#define EFFORT 100

	if ((nmcelem + unique_nitems) > 0 &&
		unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
	{
		/*
		 * Use the quadratic formula to solve for largest allowable N.  We
		 * have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
		 */
		double		b = (double) nmcelem;
		int			n;

		n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);

		/* Sort, then take just the first n elements */
		qsort(elem_selec, unique_nitems, sizeof(float),
			  float_compare_desc);
		unique_nitems = n;
	}

	/*
	 * Calculate probabilities of each distinct element count for both mcelems
	 * and constant elements.  At this point, assume independent element
	 * occurrence.
	 */
	dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
	mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);

	/* ignore hist[nhist-1], which is the average not a histogram member */
	hist_part = calc_hist(hist, nhist - 1, unique_nitems);

	selec = 0.0f;
	for (i = 0; i <= unique_nitems; i++)
	{
		/*
		 * mult * dist[i] / mcelem_dist[i] gives us probability of qual
		 * matching from assumption of independent element occurrence with the
		 * condition that distinct element count = i.
		 */
		if (mcelem_dist[i] > 0)
			selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
	}

	pfree(dist);
	pfree(mcelem_dist);
	pfree(hist_part);
	pfree(elem_selec);

	/* Take into account occurrence of NULL element. */
	selec *= (1.0f - nullelem_freq);

	CLAMP_PROBABILITY(selec);

	return selec;
}
Ejemplo n.º 5
0
/*
 * Estimate selectivity of "column @> const" and "column && const" based on
 * most common element statistics.  This estimation assumes element
 * occurrences are independent.
 *
 * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
 * the array column's MCELEM statistics slot, or are NULL/0 if stats are
 * not available.  array_data (of length nitems) is the constant's elements.
 *
 * Both the mcelem and array_data arrays are assumed presorted according
 * to the element type's cmpfunc.  Null elements are not present.
 *
 * TODO: this estimate probably could be improved by using the distinct
 * elements count histogram.  For example, excepting the special case of
 * "column @> '{}'", we can multiply the calculated selectivity by the
 * fraction of nonempty arrays in the column.
 */
static Selectivity
mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
								   float4 *numbers, int nnumbers,
								   Datum *array_data, int nitems,
								   Oid operator, FmgrInfo *cmpfunc)
{
	Selectivity selec,
				elem_selec;
	int			mcelem_index,
				i;
	bool		use_bsearch;
	float4		minfreq;

	/*
	 * There should be three more Numbers than Values, because the last three
	 * cells should hold minimal and maximal frequency among the non-null
	 * elements, and then the frequency of null elements.  Ignore the Numbers
	 * if not right.
	 */
	if (nnumbers != nmcelem + 3)
	{
		numbers = NULL;
		nnumbers = 0;
	}

	if (numbers)
	{
		/* Grab the lowest observed frequency */
		minfreq = numbers[nmcelem];
	}
	else
	{
		/* Without statistics make some default assumptions */
		minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
	}

	/* Decide whether it is faster to use binary search or not. */
	if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
		use_bsearch = true;
	else
		use_bsearch = false;

	if (operator == OID_ARRAY_CONTAINS_OP)
	{
		/*
		 * Initial selectivity for "column @> const" query is 1.0, and it will
		 * be decreased with each element of constant array.
		 */
		selec = 1.0;
	}
	else
	{
		/*
		 * Initial selectivity for "column && const" query is 0.0, and it will
		 * be increased with each element of constant array.
		 */
		selec = 0.0;
	}

	/* Scan mcelem and array in parallel. */
	mcelem_index = 0;
	for (i = 0; i < nitems; i++)
	{
		bool		match = false;

		/* Ignore any duplicates in the array data. */
		if (i > 0 &&
			element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
			continue;

		/* Find the smallest MCELEM >= this array item. */
		if (use_bsearch)
		{
			match = find_next_mcelem(mcelem, nmcelem, array_data[i],
									 &mcelem_index, cmpfunc);
		}
		else
		{
			while (mcelem_index < nmcelem)
			{
				int			cmp = element_compare(&mcelem[mcelem_index],
												  &array_data[i],
												  cmpfunc);

				if (cmp < 0)
					mcelem_index++;
				else
				{
					if (cmp == 0)
						match = true;	/* mcelem is found */
					break;
				}
			}
		}

		if (match && numbers)
		{
			/* MCELEM matches the array item; use its frequency. */
			elem_selec = numbers[mcelem_index];
			mcelem_index++;
		}
		else
		{
			/*
			 * The element is not in MCELEM.  Punt, but assume that the
			 * selectivity cannot be more than minfreq / 2.
			 */
			elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
		}

		/*
		 * Update overall selectivity using the current element's selectivity
		 * and an assumption of element occurrence independence.
		 */
		if (operator == OID_ARRAY_CONTAINS_OP)
			selec *= elem_selec;
		else
			selec = selec + elem_selec - selec * elem_selec;

		/* Clamp intermediate results to stay sane despite roundoff error */
		CLAMP_PROBABILITY(selec);
	}

	return selec;
}
Ejemplo n.º 6
0
/*
 * Matching function for elements, to be used in hashtable lookups.
 */
static int
element_match(const void *key1, const void *key2, Size keysize)
{
	/* The keysize parameter is superfluous here */
	return element_compare(key1, key2);
}