/* ============================================================================= * element_mapCompare * * For use in MAP_T * ============================================================================= */ long element_mapCompare (const pair_t* aPtr, const pair_t* bPtr) { element_t* aElementPtr = (element_t*)(aPtr->firstPtr); element_t* bElementPtr = (element_t*)(bPtr->firstPtr); return element_compare(aElementPtr, bElementPtr); }
/* ============================================================================= * element_listCompare * * For use in list_t * ============================================================================= */ long element_listCompare (const void* aPtr, const void* bPtr) { element_t* aElementPtr = (element_t*)aPtr; element_t* bElementPtr = (element_t*)bPtr; return element_compare(aElementPtr, bElementPtr); }
/*
 * qsort() comparator that orders TrackItems by their element values.
 *
 * The array being sorted holds pointers to TrackItems, so each argument is a
 * pointer to such a pointer; the items' key fields are compared with
 * element_compare.
 */
static int
trackitem_compare_element(const void *e1, const void *e2)
{
    const TrackItem *lhs = *(const TrackItem *const *) e1;
    const TrackItem *rhs = *(const TrackItem *const *) e2;

    return element_compare(&lhs->key, &rhs->key);
}
/* * Estimate selectivity of "column <@ const" based on most common element * statistics. * * mcelem (of length nmcelem) and numbers (of length nnumbers) are from * the array column's MCELEM statistics slot, or are NULL/0 if stats are * not available. array_data (of length nitems) is the constant's elements. * hist (of length nhist) is from the array column's DECHIST statistics slot, * or is NULL/0 if those stats are not available. * * Both the mcelem and array_data arrays are assumed presorted according * to the element type's cmpfunc. Null elements are not present. * * Independent element occurrence would imply a particular distribution of * distinct element counts among matching rows. Real data usually falsifies * that assumption. For example, in a set of 11-element integer arrays having * elements in the range [0..10], element occurrences are typically not * independent. If they were, a sufficiently-large set would include all * distinct element counts 0 through 11. We correct for this using the * histogram of distinct element counts. * * In the "column @> const" and "column && const" cases, we usually have a * "const" with low number of elements (otherwise we have selectivity close * to 0 or 1 respectively). That's why the effect of dependence related * to distinct element count distribution is negligible there. In the * "column <@ const" case, number of elements is usually high (otherwise we * have selectivity close to 0). That's why we should do a correction with * the array distinct element count distribution here. * * Using the histogram of distinct element counts produces a different * distribution law than independent occurrences of elements. This * distribution law can be described as follows: * * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * * (1 - f2)^(1 - o2) * ... 
* fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m] * * where: * o1, o2, ..., on - occurrences of elements 1, 2, ..., n * (1 - occurrence, 0 - no occurrence) in row * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n * (scalar values in [0..1]) according to collected statistics * m = o1 + o2 + ... + on = total number of distinct elements in row * hist[m] - histogram data for occurrence of m elements. * ind[m] - probability of m occurrences from n events assuming their * probabilities to be equal to frequencies of array elements. * * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) * * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m */ static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, Datum *array_data, int nitems, float4 *hist, int nhist, Oid operator, FmgrInfo *cmpfunc) { int mcelem_index, i, unique_nitems = 0; float selec, minfreq, nullelem_freq; float *dist, *mcelem_dist, *hist_part; float avg_count, mult, rest; float *elem_selec; /* * There should be three more Numbers than Values in the MCELEM slot, * because the last three cells should hold minimal and maximal frequency * among the non-null elements, and then the frequency of null elements. * Punt if not right, because we can't do much without the element freqs. */ if (numbers == NULL || nnumbers != nmcelem + 3) return DEFAULT_CONTAIN_SEL; /* Can't do much without a count histogram, either */ if (hist == NULL || nhist < 3) return DEFAULT_CONTAIN_SEL; /* * Grab some of the summary statistics that compute_array_stats() stores: * lowest frequency, frequency of null elements, and average distinct * element count. */ minfreq = numbers[nmcelem]; nullelem_freq = numbers[nmcelem + 2]; avg_count = hist[nhist - 1]; /* * "rest" will be the sum of the frequencies of all elements not * represented in MCELEM. The average distinct element count is the sum * of the frequencies of *all* elements. 
Begin with that; we will proceed * to subtract the MCELEM frequencies. */ rest = avg_count; /* * mult is a multiplier representing estimate of probability that each * mcelem that is not present in constant doesn't occur. */ mult = 1.0f; /* * elem_selec is array of estimated frequencies for elements in the * constant. */ elem_selec = (float *) palloc(sizeof(float) * nitems); /* Scan mcelem and array in parallel. */ mcelem_index = 0; for (i = 0; i < nitems; i++) { bool match = false; /* Ignore any duplicates in the array data. */ if (i > 0 && element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0) continue; /* * Iterate over MCELEM until we find an entry greater than or equal to * this element of the constant. Update "rest" and "mult" for mcelem * entries skipped over. */ while (mcelem_index < nmcelem) { int cmp = element_compare(&mcelem[mcelem_index], &array_data[i], cmpfunc); if (cmp < 0) { mult *= (1.0f - numbers[mcelem_index]); rest -= numbers[mcelem_index]; mcelem_index++; } else { if (cmp == 0) match = true; /* mcelem is found */ break; } } if (match) { /* MCELEM matches the array item. */ elem_selec[unique_nitems] = numbers[mcelem_index]; /* "rest" is decremented for all mcelems, matched or not */ rest -= numbers[mcelem_index]; mcelem_index++; } else { /* * The element is not in MCELEM. Punt, but assume that the * selectivity cannot be more than minfreq / 2. */ elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL, minfreq / 2); } unique_nitems++; } /* * If we handled all constant elements without exhausting the MCELEM * array, finish walking it to complete calculation of "rest" and "mult". */ while (mcelem_index < nmcelem) { mult *= (1.0f - numbers[mcelem_index]); rest -= numbers[mcelem_index]; mcelem_index++; } /* * The presence of many distinct rare elements materially decreases * selectivity. Use the Poisson distribution to estimate the probability * of a column value having zero occurrences of such elements. 
See above * for the definition of "rest". */ mult *= exp(-rest); /*---------- * Using the distinct element count histogram requires * O(unique_nitems * (nmcelem + unique_nitems)) * operations. Beyond a certain computational cost threshold, it's * reasonable to sacrifice accuracy for decreased planning time. We limit * the number of operations to EFFORT * nmcelem; since nmcelem is limited * by the column's statistics target, the work done is user-controllable. * * If the number of operations would be too large, we can reduce it * without losing all accuracy by reducing unique_nitems and considering * only the most-common elements of the constant array. To make the * results exactly match what we would have gotten with only those * elements to start with, we'd have to remove any discarded elements' * frequencies from "mult", but since this is only an approximation * anyway, we don't bother with that. Therefore it's sufficient to qsort * elem_selec[] and take the largest elements. (They will no longer match * up with the elements of array_data[], but we don't care.) *---------- */ #define EFFORT 100 if ((nmcelem + unique_nitems) > 0 && unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems)) { /* * Use the quadratic formula to solve for largest allowable N. We * have A = 1, B = nmcelem, C = - EFFORT * nmcelem. */ double b = (double) nmcelem; int n; n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2); /* Sort, then take just the first n elements */ qsort(elem_selec, unique_nitems, sizeof(float), float_compare_desc); unique_nitems = n; } /* * Calculate probabilities of each distinct element count for both mcelems * and constant elements. At this point, assume independent element * occurrence. 
*/ dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f); mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest); /* ignore hist[nhist-1], which is the average not a histogram member */ hist_part = calc_hist(hist, nhist - 1, unique_nitems); selec = 0.0f; for (i = 0; i <= unique_nitems; i++) { /* * mult * dist[i] / mcelem_dist[i] gives us probability of qual * matching from assumption of independent element occurrence with the * condition that distinct element count = i. */ if (mcelem_dist[i] > 0) selec += hist_part[i] * mult * dist[i] / mcelem_dist[i]; } pfree(dist); pfree(mcelem_dist); pfree(hist_part); pfree(elem_selec); /* Take into account occurrence of NULL element. */ selec *= (1.0f - nullelem_freq); CLAMP_PROBABILITY(selec); return selec; }
/* * Estimate selectivity of "column @> const" and "column && const" based on * most common element statistics. This estimation assumes element * occurrences are independent. * * mcelem (of length nmcelem) and numbers (of length nnumbers) are from * the array column's MCELEM statistics slot, or are NULL/0 if stats are * not available. array_data (of length nitems) is the constant's elements. * * Both the mcelem and array_data arrays are assumed presorted according * to the element type's cmpfunc. Null elements are not present. * * TODO: this estimate probably could be improved by using the distinct * elements count histogram. For example, excepting the special case of * "column @> '{}'", we can multiply the calculated selectivity by the * fraction of nonempty arrays in the column. */ static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, Datum *array_data, int nitems, Oid operator, FmgrInfo *cmpfunc) { Selectivity selec, elem_selec; int mcelem_index, i; bool use_bsearch; float4 minfreq; /* * There should be three more Numbers than Values, because the last three * cells should hold minimal and maximal frequency among the non-null * elements, and then the frequency of null elements. Ignore the Numbers * if not right. */ if (nnumbers != nmcelem + 3) { numbers = NULL; nnumbers = 0; } if (numbers) { /* Grab the lowest observed frequency */ minfreq = numbers[nmcelem]; } else { /* Without statistics make some default assumptions */ minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL; } /* Decide whether it is faster to use binary search or not. */ if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems) use_bsearch = true; else use_bsearch = false; if (operator == OID_ARRAY_CONTAINS_OP) { /* * Initial selectivity for "column @> const" query is 1.0, and it will * be decreased with each element of constant array. 
*/ selec = 1.0; } else { /* * Initial selectivity for "column && const" query is 0.0, and it will * be increased with each element of constant array. */ selec = 0.0; } /* Scan mcelem and array in parallel. */ mcelem_index = 0; for (i = 0; i < nitems; i++) { bool match = false; /* Ignore any duplicates in the array data. */ if (i > 0 && element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0) continue; /* Find the smallest MCELEM >= this array item. */ if (use_bsearch) { match = find_next_mcelem(mcelem, nmcelem, array_data[i], &mcelem_index, cmpfunc); } else { while (mcelem_index < nmcelem) { int cmp = element_compare(&mcelem[mcelem_index], &array_data[i], cmpfunc); if (cmp < 0) mcelem_index++; else { if (cmp == 0) match = true; /* mcelem is found */ break; } } } if (match && numbers) { /* MCELEM matches the array item; use its frequency. */ elem_selec = numbers[mcelem_index]; mcelem_index++; } else { /* * The element is not in MCELEM. Punt, but assume that the * selectivity cannot be more than minfreq / 2. */ elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2); } /* * Update overall selectivity using the current element's selectivity * and an assumption of element occurrence independence. */ if (operator == OID_ARRAY_CONTAINS_OP) selec *= elem_selec; else selec = selec + elem_selec - selec * elem_selec; /* Clamp intermediate results to stay sane despite roundoff error */ CLAMP_PROBABILITY(selec); } return selec; }
/*
 * Matching function for elements, to be used in hashtable lookups.
 *
 * The two keys are handed to element_compare and its result is returned
 * as-is.  keysize is part of the signature but is not consulted.
 */
static int
element_match(const void *key1, const void *key2, Size keysize)
{
    (void) keysize;             /* superfluous here */

    return element_compare(key1, key2);
}