/* * Allocate a new hashtable entry. * caller must hold an exclusive lock on pgss->lock * * Note: despite needing exclusive lock, it's not an error for the target * entry to already exist. This is because pgss_store releases and * reacquires lock after failing to find a match; so someone else could * have made the entry while we waited to get exclusive lock. */ static pgssEntry * entry_alloc(pgssHashKey *key) { pgssEntry *entry; bool found; /* Caller must have clipped query properly */ Assert(key->query_len < pgss->query_size); /* Make space if needed */ while (hash_get_num_entries(pgss_hash) >= pgss_max) entry_dealloc(); /* Find or create an entry with desired hash code */ entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &found); if (!found) { /* New entry, initialize it */ /* dynahash tried to copy the key for us, but must fix query_ptr */ entry->key.query_ptr = entry->query; /* reset the statistics */ memset(&entry->counters, 0, sizeof(Counters)); entry->counters.usage = USAGE_INIT; /* re-initialize the mutex each time ... we assume no one using it */ SpinLockInit(&entry->mutex); /* ... and don't forget the query text */ memcpy(entry->query, key->query_ptr, key->query_len); entry->query[key->query_len] = '\0'; } return entry; }
/* * Deallocate least used entries. * Caller must hold an exclusive lock on pgss->lock. */ static void entry_dealloc(void) { HASH_SEQ_STATUS hash_seq; pgssEntry **entries; pgssEntry *entry; int nvictims; int i; /* Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them. */ entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *)); i = 0; hash_seq_init(&hash_seq, pgss_hash); while ((entry = hash_seq_search(&hash_seq)) != NULL) { entries[i++] = entry; entry->counters.usage *= USAGE_DECREASE_FACTOR; } qsort(entries, i, sizeof(pgssEntry *), entry_cmp); nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100); nvictims = Min(nvictims, i); for (i = 0; i < nvictims; i++) { hash_search(pgss_hash, &entries[i]->key, HASH_REMOVE, NULL); } pfree(entries); }
/* Are there any unresolved references to invalid pages? */ bool XLogHaveInvalidPages(void) { if (invalid_page_tab != NULL && hash_get_num_entries(invalid_page_tab) > 0) return true; return false; }
bool Persistent_PostDTMRecv_IsHashFull(void) { Insist(PT_PostDTMRecv_Info); Insist(PT_PostDTMRecv_Info->postDTMRecv_dbTblSpc_Hash); return (hash_get_num_entries(PT_PostDTMRecv_Info->postDTMRecv_dbTblSpc_Hash) == PT_MAX_NUM_POSTDTMRECV_DB); }
/* * Returns the number of elements in the hashtable. * * This function is not synchronized. */ long SyncHTNumEntries(SyncHT *syncHT) { Assert(NULL != syncHT); Assert(NULL != syncHT->ht); return hash_get_num_entries(syncHT->ht); }
/* * shmem_shutdown hook: Dump statistics into file. * * Note: we don't bother with acquiring lock, because there should be no * other processes running when this is called. */ static void pgss_shmem_shutdown(int code, Datum arg) { FILE *file; HASH_SEQ_STATUS hash_seq; int32 num_entries; pgssEntry *entry; /* Don't try to dump during a crash. */ if (code) return; /* Safety check ... shouldn't get here unless shmem is set up. */ if (!pgss || !pgss_hash) return; /* Don't dump if told not to. */ if (!pgss_save) return; file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_W); if (file == NULL) goto error; if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1) goto error; num_entries = hash_get_num_entries(pgss_hash); if (fwrite(&num_entries, sizeof(int32), 1, file) != 1) goto error; hash_seq_init(&hash_seq, pgss_hash); while ((entry = hash_seq_search(&hash_seq)) != NULL) { int len = entry->key.query_len; if (fwrite(entry, offsetof(pgssEntry, mutex), 1, file) != 1 || fwrite(entry->query, 1, len, file) != len) goto error; } if (FreeFile(file)) { file = NULL; goto error; } return; error: ereport(LOG, (errcode_for_file_access(), errmsg("could not write pg_stat_statement file \"%s\": %m", PGSS_DUMP_FILE))); if (file) FreeFile(file); unlink(PGSS_DUMP_FILE); }
/* * FindRandomNodeNotInList finds a random node from the shared hash that is not * a member of the current node list. The caller is responsible for making the * necessary node count checks to ensure that such a node exists. * * Note that this function has a selection bias towards nodes whose positions in * the shared hash are sequentially adjacent to the positions of nodes that are * in the current node list. This bias follows from our decision to first pick a * random node in the hash, and if that node is a member of the current list, to * simply iterate to the next node in the hash. Overall, this approach trades in * some selection bias for simplicity in design and for bounded execution time. */ static WorkerNode * FindRandomNodeNotInList(HTAB *WorkerNodesHash, List *currentNodeList) { WorkerNode *workerNode = NULL; HASH_SEQ_STATUS status; uint32 workerNodeCount = 0; uint32 currentNodeCount PG_USED_FOR_ASSERTS_ONLY = 0; bool lookForWorkerNode = true; uint32 workerPosition = 0; uint32 workerIndex = 0; workerNodeCount = hash_get_num_entries(WorkerNodesHash); currentNodeCount = list_length(currentNodeList); Assert(workerNodeCount > currentNodeCount); /* * We determine a random position within the worker hash between [1, N], * assuming that the number of elements in the hash is N. We then get to * this random position by iterating over the worker hash. Please note that * the random seed has already been set by the postmaster when starting up. */ workerPosition = (random() % workerNodeCount) + 1; hash_seq_init(&status, WorkerNodesHash); for (workerIndex = 0; workerIndex < workerPosition; workerIndex++) { workerNode = (WorkerNode *) hash_seq_search(&status); } while (lookForWorkerNode) { bool listMember = ListMember(currentNodeList, workerNode); if (workerNode->inWorkerFile && !listMember) { lookForWorkerNode = false; } else { /* iterate to the next worker node in the hash */ workerNode = (WorkerNode *) hash_seq_search(&status); /* reached end of hash; start from the beginning */ if (workerNode == NULL) { hash_seq_init(&status, WorkerNodesHash); workerNode = (WorkerNode *) hash_seq_search(&status); } } } /* we stopped scanning before completion; therefore clean up scan */ hash_seq_term(&status); return workerNode; }
/* * create and populate the crosstab tuplestore using the provided source query */ static Tuplestorestate * get_crosstab_tuplestore(char *sql, HTAB *crosstab_hash, TupleDesc tupdesc, MemoryContext per_query_ctx, bool randomAccess) { Tuplestorestate *tupstore; int num_categories = hash_get_num_entries(crosstab_hash); AttInMetadata *attinmeta = TupleDescGetAttInMetadata(tupdesc); char **values; HeapTuple tuple; int ret; int proc; /* initialize our tuplestore (while still in query context!) */ tupstore = tuplestore_begin_heap(randomAccess, false, work_mem); /* Connect to SPI manager */ if ((ret = SPI_connect()) < 0) /* internal error */ elog(ERROR, "get_crosstab_tuplestore: SPI_connect returned %d", ret); /* Now retrieve the crosstab source rows */ ret = SPI_execute(sql, true, 0); proc = SPI_processed; /* Check for qualifying tuples */ if ((ret == SPI_OK_SELECT) && (proc > 0)) { SPITupleTable *spi_tuptable = SPI_tuptable; TupleDesc spi_tupdesc = spi_tuptable->tupdesc; int ncols = spi_tupdesc->natts; char *rowid; char *lastrowid = NULL; bool firstpass = true; int i, j; int result_ncols; if (num_categories == 0) { /* no qualifying category tuples */ ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("provided \"categories\" SQL must " \ "return 1 column of at least one row"))); } /* * The provided SQL query must always return at least three columns: * * 1. rowname the label for each row - column 1 in the final result * 2. category the label for each value-column in the final result 3. * value the values used to populate the value-columns * * If there are more than three columns, the last two are taken as * "category" and "values". The first column is taken as "rowname". * Additional columns (2 thru N-2) are assumed the same for the same * "rowname", and are copied into the result tuple from the first time * we encounter a particular rowname. */ if (ncols < 3) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid source data SQL statement"), errdetail("The provided SQL must return 3 " \ " columns; rowid, category, and values."))); result_ncols = (ncols - 2) + num_categories; /* Recheck to make sure we tuple descriptor still looks reasonable */ if (tupdesc->natts != result_ncols) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("invalid return type"), errdetail("Query-specified return " \ "tuple has %d columns but crosstab " \ "returns %d.", tupdesc->natts, result_ncols))); /* allocate space */ values = (char **) palloc(result_ncols * sizeof(char *)); /* and make sure it's clear */ memset(values, '\0', result_ncols * sizeof(char *)); for (i = 0; i < proc; i++) { HeapTuple spi_tuple; crosstab_cat_desc *catdesc; char *catname; /* get the next sql result tuple */ spi_tuple = spi_tuptable->vals[i]; /* get the rowid from the current sql result tuple */ rowid = SPI_getvalue(spi_tuple, spi_tupdesc, 1); /* * if we're on a new output row, grab the column values up to * column N-2 now */ if (firstpass || !xstreq(lastrowid, rowid)) { /* * a new row means we need to flush the old one first, unless * we're on the very first row */ if (!firstpass) { /* rowid changed, flush the previous output row */ tuple = BuildTupleFromCStrings(attinmeta, values); tuplestore_puttuple(tupstore, tuple); for (j = 0; j < result_ncols; j++) xpfree(values[j]); } values[0] = rowid; for (j = 1; j < ncols - 2; j++) values[j] = SPI_getvalue(spi_tuple, spi_tupdesc, j + 1); /* we're no longer on the first pass */ firstpass = false; } /* look up the category and fill in the appropriate column */ catname = SPI_getvalue(spi_tuple, spi_tupdesc, ncols - 1); if (catname != NULL) { crosstab_HashTableLookup(crosstab_hash, catname, catdesc); if (catdesc) values[catdesc->attidx + ncols - 2] = SPI_getvalue(spi_tuple, spi_tupdesc, ncols); } xpfree(lastrowid); xpstrdup(lastrowid, rowid); } /* flush the last output row */ tuple = BuildTupleFromCStrings(attinmeta, values); tuplestore_puttuple(tupstore, tuple); } if (SPI_finish() != SPI_OK_FINISH) /* internal error */ elog(ERROR, "get_crosstab_tuplestore: SPI_finish() failed"); tuplestore_donestoring(tupstore); return tupstore; }
/* * compute_tsvector_stats() -- compute statistics for a tsvector column * * This functions computes statistics that are useful for determining @@ * operations' selectivity, along with the fraction of non-null rows and * average width. * * Instead of finding the most common values, as we do for most datatypes, * we're looking for the most common lexemes. This is more useful, because * there most probably won't be any two rows with the same tsvector and thus * the notion of a MCV is a bit bogus with this datatype. With a list of the * most common lexemes we can do a better job at figuring out @@ selectivity. * * For the same reasons we assume that tsvector columns are unique when * determining the number of distinct values. * * The algorithm used is Lossy Counting, as proposed in the paper "Approximate * frequency counts over data streams" by G. S. Manku and R. Motwani, in * Proceedings of the 28th International Conference on Very Large Data Bases, * Hong Kong, China, August 2002, section 4.2. The paper is available at * http://www.vldb.org/conf/2002/S10P03.pdf * * The Lossy Counting (aka LC) algorithm goes like this: * Let s be the threshold frequency for an item (the minimum frequency we * are interested in) and epsilon the error margin for the frequency. Let D * be a set of triples (e, f, delta), where e is an element value, f is that * element's frequency (actually, its current occurrence count) and delta is * the maximum error in f. We start with D empty and process the elements in * batches of size w. (The batch size is also known as "bucket size" and is * equal to 1/epsilon.) Let the current batch number be b_current, starting * with 1. For each element e we either increment its f count, if it's * already in D, or insert a new triple into D with values (e, 1, b_current * - 1). After processing each batch we prune D, by removing from it all * elements with f + delta <= b_current. After the algorithm finishes we * suppress all elements from D that do not satisfy f >= (s - epsilon) * N, * where N is the total number of elements in the input. We emit the * remaining elements with estimated frequency f/N. The LC paper proves * that this algorithm finds all elements with true frequency at least s, * and that no frequency is overestimated or is underestimated by more than * epsilon. Furthermore, given reasonable assumptions about the input * distribution, the required table size is no more than about 7 times w. * * We set s to be the estimated frequency of the K'th word in a natural * language's frequency table, where K is the target number of entries in * the MCELEM array plus an arbitrary constant, meant to reflect the fact * that the most common words in any language would usually be stopwords * so we will not actually see them in the input. We assume that the * distribution of word frequencies (including the stopwords) follows Zipf's * law with an exponent of 1. * * Assuming Zipfian distribution, the frequency of the K'th word is equal * to 1/(K * H(W)) where H(n) is 1/2 + 1/3 + ... + 1/n and W is the number of * words in the language. Putting W as one million, we get roughly 0.07/K. * Assuming top 10 words are stopwords gives s = 0.07/(K + 10). We set * epsilon = s/10, which gives bucket width w = (K + 10)/0.007 and * maximum expected hashtable size of about 1000 * (K + 10). * * Note: in the above discussion, s, epsilon, and f/N are in terms of a * lexeme's frequency as a fraction of all lexemes seen in the input. * However, what we actually want to store in the finished pg_statistic * entry is each lexeme's frequency as a fraction of all rows that it occurs * in. Assuming that the input tsvectors are correctly constructed, no * lexeme occurs more than once per tsvector, so the final count f is a * correct estimate of the number of input tsvectors it occurs in, and we * need only change the divisor from N to nonnull_cnt to get the number we * want. */ static void compute_tsvector_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows) { int num_mcelem; int null_cnt = 0; double total_width = 0; /* This is D from the LC algorithm. */ HTAB *lexemes_tab; HASHCTL hash_ctl; HASH_SEQ_STATUS scan_status; /* This is the current bucket number from the LC algorithm */ int b_current; /* This is 'w' from the LC algorithm */ int bucket_width; int vector_no, lexeme_no; LexemeHashKey hash_key; TrackItem *item; /* * We want statistics_target * 10 lexemes in the MCELEM array. This * multiplier is pretty arbitrary, but is meant to reflect the fact that * the number of individual lexeme values tracked in pg_statistic ought to * be more than the number of values for a simple scalar column. */ num_mcelem = stats->attr->attstattarget * 10; /* * We set bucket width equal to (num_mcelem + 10) / 0.007 as per the * comment above. */ bucket_width = (num_mcelem + 10) * 1000 / 7; /* * Create the hashtable. It will be in local memory, so we don't need to * worry about overflowing the initial size. Also we don't need to pay any * attention to locking and memory management. */ MemSet(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(LexemeHashKey); hash_ctl.entrysize = sizeof(TrackItem); hash_ctl.hash = lexeme_hash; hash_ctl.match = lexeme_match; hash_ctl.hcxt = CurrentMemoryContext; lexemes_tab = hash_create("Analyzed lexemes table", num_mcelem, &hash_ctl, HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT); /* Initialize counters. */ b_current = 1; lexeme_no = 0; /* Loop over the tsvectors. */ for (vector_no = 0; vector_no < samplerows; vector_no++) { Datum value; bool isnull; TSVector vector; WordEntry *curentryptr; char *lexemesptr; int j; vacuum_delay_point(); value = fetchfunc(stats, vector_no, &isnull); /* * Check for null/nonnull. */ if (isnull) { null_cnt++; continue; } /* * Add up widths for average-width calculation. Since it's a * tsvector, we know it's varlena. As in the regular * compute_minimal_stats function, we use the toasted width for this * calculation. */ total_width += VARSIZE_ANY(DatumGetPointer(value)); /* * Now detoast the tsvector if needed. */ vector = DatumGetTSVector(value); /* * We loop through the lexemes in the tsvector and add them to our * tracking hashtable. */ lexemesptr = STRPTR(vector); curentryptr = ARRPTR(vector); for (j = 0; j < vector->size; j++) { bool found; /* * Construct a hash key. The key points into the (detoasted) * tsvector value at this point, but if a new entry is created, we * make a copy of it. This way we can free the tsvector value * once we've processed all its lexemes. */ hash_key.lexeme = lexemesptr + curentryptr->pos; hash_key.length = curentryptr->len; /* Lookup current lexeme in hashtable, adding it if new */ item = (TrackItem *) hash_search(lexemes_tab, (const void *) &hash_key, HASH_ENTER, &found); if (found) { /* The lexeme is already on the tracking list */ item->frequency++; } else { /* Initialize new tracking list element */ item->frequency = 1; item->delta = b_current - 1; item->key.lexeme = palloc(hash_key.length); memcpy(item->key.lexeme, hash_key.lexeme, hash_key.length); } /* lexeme_no is the number of elements processed (ie N) */ lexeme_no++; /* We prune the D structure after processing each bucket */ if (lexeme_no % bucket_width == 0) { prune_lexemes_hashtable(lexemes_tab, b_current); b_current++; } /* Advance to the next WordEntry in the tsvector */ curentryptr++; } /* If the vector was toasted, free the detoasted copy. */ if (TSVectorGetDatum(vector) != value) pfree(vector); } /* We can only compute real stats if we found some non-null values. */ if (null_cnt < samplerows) { int nonnull_cnt = samplerows - null_cnt; int i; TrackItem **sort_table; int track_len; int cutoff_freq; int minfreq, maxfreq; stats->stats_valid = true; /* Do the simple null-frac and average width stats */ stats->stanullfrac = (double) null_cnt / (double) samplerows; stats->stawidth = total_width / (double) nonnull_cnt; /* Assume it's a unique column (see notes above) */ stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac); /* * Construct an array of the interesting hashtable items, that is, * those meeting the cutoff frequency (s - epsilon)*N. Also identify * the minimum and maximum frequencies among these items. * * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff * frequency is 9*N / bucket_width. */ cutoff_freq = 9 * lexeme_no / bucket_width; i = hash_get_num_entries(lexemes_tab); /* surely enough space */ sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i); hash_seq_init(&scan_status, lexemes_tab); track_len = 0; minfreq = lexeme_no; maxfreq = 0; while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL) { if (item->frequency > cutoff_freq) { sort_table[track_len++] = item; minfreq = Min(minfreq, item->frequency); maxfreq = Max(maxfreq, item->frequency); } } Assert(track_len <= i); /* emit some statistics for debug purposes */ elog(DEBUG3, "tsvector_stats: target # mces = %d, bucket width = %d, " "# lexemes = %d, hashtable size = %d, usable entries = %d", num_mcelem, bucket_width, lexeme_no, i, track_len); /* * If we obtained more lexemes than we really want, get rid of those * with least frequencies. The easiest way is to qsort the array into * descending frequency order and truncate the array. */ if (num_mcelem < track_len) { qsort(sort_table, track_len, sizeof(TrackItem *), trackitem_compare_frequencies_desc); /* reset minfreq to the smallest frequency we're keeping */ minfreq = sort_table[num_mcelem - 1]->frequency; } else num_mcelem = track_len; /* Generate MCELEM slot entry */ if (num_mcelem > 0) { MemoryContext old_context; Datum *mcelem_values; float4 *mcelem_freqs; /* * We want to store statistics sorted on the lexeme value using * first length, then byte-for-byte comparison. The reason for * doing length comparison first is that we don't care about the * ordering so long as it's consistent, and comparing lengths * first gives us a chance to avoid a strncmp() call. * * This is different from what we do with scalar statistics -- * they get sorted on frequencies. The rationale is that we * usually search through most common elements looking for a * specific value, so we can grab its frequency. When values are * presorted we can employ binary search for that. See * ts_selfuncs.c for a real usage scenario. */ qsort(sort_table, num_mcelem, sizeof(TrackItem *), trackitem_compare_lexemes); /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); /* * We sorted statistics on the lexeme value, but we want to be * able to find out the minimal and maximal frequency without * going through all the values. We keep those two extra * frequencies in two extra cells in mcelem_freqs. * * (Note: the MCELEM statistics slot definition allows for a third * extra number containing the frequency of nulls, but we don't * create that for a tsvector column, since null elements aren't * possible.) */ mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum)); mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4)); /* * See comments above about use of nonnull_cnt as the divisor for * the final frequency estimates. */ for (i = 0; i < num_mcelem; i++) { TrackItem *item = sort_table[i]; mcelem_values[i] = PointerGetDatum(cstring_to_text_with_len(item->key.lexeme, item->key.length)); mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt; } mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt; mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt; MemoryContextSwitchTo(old_context); stats->stakind[0] = STATISTIC_KIND_MCELEM; stats->staop[0] = TextEqualOperator; stats->stacoll[0] = DEFAULT_COLLATION_OID; stats->stanumbers[0] = mcelem_freqs; /* See above comment about two extra frequency fields */ stats->numnumbers[0] = num_mcelem + 2; stats->stavalues[0] = mcelem_values; stats->numvalues[0] = num_mcelem; /* We are storing text values */ stats->statypid[0] = TEXTOID; stats->statyplen[0] = -1; /* typlen, -1 for varlena */ stats->statypbyval[0] = false; stats->statypalign[0] = 'i'; } } else { /* We found only nulls; assume the column is entirely null */ stats->stats_valid = true; stats->stanullfrac = 1.0; stats->stawidth = 0; /* "unknown" */ stats->stadistinct = 0.0; /* "unknown" */ } /* * We don't need to bother cleaning up any of our temporary palloc's. The * hashtable should also go away, as it used a child memory context. */ }
/* * SQL function json_populate_record * * set fields in a record from the argument json * * Code adapted shamelessly from hstore's populate_record * which is in turn partly adapted from record_out. * * The json is decomposed into a hash table, in which each * field in the record is then looked up by name. */ Datum json_populate_record(PG_FUNCTION_ARGS) { Oid argtype = get_fn_expr_argtype(fcinfo->flinfo, 0); text *json = PG_GETARG_TEXT_P(1); bool use_json_as_text = PG_GETARG_BOOL(2); HTAB *json_hash; HeapTupleHeader rec; Oid tupType; int32 tupTypmod; TupleDesc tupdesc; HeapTupleData tuple; HeapTuple rettuple; RecordIOData *my_extra; int ncolumns; int i; Datum *values; bool *nulls; char fname[NAMEDATALEN]; JsonHashEntry hashentry; if (!type_is_rowtype(argtype)) ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("first argument must be a rowtype"))); if (PG_ARGISNULL(0)) { if (PG_ARGISNULL(1)) PG_RETURN_NULL(); rec = NULL; /* * have no tuple to look at, so the only source of type info is the * argtype. The lookup_rowtype_tupdesc call below will error out if we * don't have a known composite type oid here. */ tupType = argtype; tupTypmod = -1; } else { rec = PG_GETARG_HEAPTUPLEHEADER(0); if (PG_ARGISNULL(1)) PG_RETURN_POINTER(rec); /* Extract type info from the tuple itself */ tupType = HeapTupleHeaderGetTypeId(rec); tupTypmod = HeapTupleHeaderGetTypMod(rec); } json_hash = get_json_object_as_hash(json, "json_populate_record", use_json_as_text); /* * if the input json is empty, we can only skip the rest if we were passed * in a non-null record, since otherwise there may be issues with domain * nulls. */ if (hash_get_num_entries(json_hash) == 0 && rec) PG_RETURN_POINTER(rec); tupdesc = lookup_rowtype_tupdesc(tupType, tupTypmod); ncolumns = tupdesc->natts; if (rec) { /* Build a temporary HeapTuple control structure */ tuple.t_len = HeapTupleHeaderGetDatumLength(rec); ItemPointerSetInvalid(&(tuple.t_self)); tuple.t_data = rec; } /* * We arrange to look up the needed I/O info just once per series of * calls, assuming the record type doesn't change underneath us. */ my_extra = (RecordIOData *) fcinfo->flinfo->fn_extra; if (my_extra == NULL || my_extra->ncolumns != ncolumns) { fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, sizeof(RecordIOData) - sizeof(ColumnIOData) + ncolumns * sizeof(ColumnIOData)); my_extra = (RecordIOData *) fcinfo->flinfo->fn_extra; my_extra->record_type = InvalidOid; my_extra->record_typmod = 0; } if (my_extra->record_type != tupType || my_extra->record_typmod != tupTypmod) { MemSet(my_extra, 0, sizeof(RecordIOData) - sizeof(ColumnIOData) + ncolumns * sizeof(ColumnIOData)); my_extra->record_type = tupType; my_extra->record_typmod = tupTypmod; my_extra->ncolumns = ncolumns; } values = (Datum *) palloc(ncolumns * sizeof(Datum)); nulls = (bool *) palloc(ncolumns * sizeof(bool)); if (rec) { /* Break down the tuple into fields */ heap_deform_tuple(&tuple, tupdesc, values, nulls); } else { for (i = 0; i < ncolumns; ++i) { values[i] = (Datum) 0; nulls[i] = true; } } for (i = 0; i < ncolumns; ++i) { ColumnIOData *column_info = &my_extra->columns[i]; Oid column_type = tupdesc->attrs[i]->atttypid; char *value; /* Ignore dropped columns in datatype */ if (tupdesc->attrs[i]->attisdropped) { nulls[i] = true; continue; } memset(fname, 0, NAMEDATALEN); strncpy(fname, NameStr(tupdesc->attrs[i]->attname), NAMEDATALEN); hashentry = hash_search(json_hash, fname, HASH_FIND, NULL); /* * we can't just skip here if the key wasn't found since we might have * a domain to deal with. If we were passed in a non-null record * datum, we assume that the existing values are valid (if they're * not, then it's not our fault), but if we were passed in a null, * then every field which we don't populate needs to be run through * the input function just in case it's a domain type. */ if (hashentry == NULL && rec) continue; /* * Prepare to convert the column value from text */ if (column_info->column_type != column_type) { getTypeInputInfo(column_type, &column_info->typiofunc, &column_info->typioparam); fmgr_info_cxt(column_info->typiofunc, &column_info->proc, fcinfo->flinfo->fn_mcxt); column_info->column_type = column_type; } if (hashentry == NULL || hashentry->isnull) { /* * need InputFunctionCall to happen even for nulls, so that domain * checks are done */ values[i] = InputFunctionCall(&column_info->proc, NULL, column_info->typioparam, tupdesc->attrs[i]->atttypmod); nulls[i] = true; } else { value = hashentry->val; values[i] = InputFunctionCall(&column_info->proc, value, column_info->typioparam, tupdesc->attrs[i]->atttypmod); nulls[i] = false; } } rettuple = heap_form_tuple(tupdesc, values, nulls); ReleaseTupleDesc(tupdesc); PG_RETURN_DATUM(HeapTupleGetDatum(rettuple)); }
/* Process one per-dbspace directory for ResetUnloggedRelations */ static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) { DIR *dbspace_dir; struct dirent *de; char rm_path[MAXPGPATH]; /* Caller must specify at least one operation. */ Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0); /* * Cleanup is a two-pass operation. First, we go through and identify all * the files with init forks. Then, we go through again and nuke * everything with the same OID except the init fork. */ if ((op & UNLOGGED_RELATION_CLEANUP) != 0) { HTAB *hash = NULL; HASHCTL ctl; /* Open the directory. */ dbspace_dir = AllocateDir(dbspacedirname); if (dbspace_dir == NULL) { elog(LOG, "could not open dbspace directory \"%s\": %m", dbspacedirname); return; } /* * It's possible that someone could create a ton of unlogged relations * in the same database & tablespace, so we'd better use a hash table * rather than an array or linked list to keep track of which files * need to be reset. Otherwise, this cleanup operation would be * O(n^2). */ ctl.keysize = sizeof(unlogged_relation_entry); ctl.entrysize = sizeof(unlogged_relation_entry); hash = hash_create("unlogged hash", 32, &ctl, HASH_ELEM); /* Scan the directory. */ while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { ForkNumber forkNum; int oidchars; unlogged_relation_entry ent; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, &forkNum)) continue; /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; /* * Put the OID portion of the name into the hash table, if it * isn't already. */ memset(ent.oid, 0, sizeof(ent.oid)); memcpy(ent.oid, de->d_name, oidchars); hash_search(hash, &ent, HASH_ENTER, NULL); } /* Done with the first pass. */ FreeDir(dbspace_dir); /* * If we didn't find any init forks, there's no point in continuing; * we can bail out now. */ if (hash_get_num_entries(hash) == 0) { hash_destroy(hash); return; } /* * Now, make a second pass and remove anything that matches. First, * reopen the directory. */ dbspace_dir = AllocateDir(dbspacedirname); if (dbspace_dir == NULL) { elog(LOG, "could not open dbspace directory \"%s\": %m", dbspacedirname); hash_destroy(hash); return; } /* Scan the directory. */ while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { ForkNumber forkNum; int oidchars; bool found; unlogged_relation_entry ent; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, &forkNum)) continue; /* We never remove the init fork. */ if (forkNum == INIT_FORKNUM) continue; /* * See whether the OID portion of the name shows up in the hash * table. */ memset(ent.oid, 0, sizeof(ent.oid)); memcpy(ent.oid, de->d_name, oidchars); hash_search(hash, &ent, HASH_FIND, &found); /* If so, nuke it! */ if (found) { snprintf(rm_path, sizeof(rm_path), "%s/%s", dbspacedirname, de->d_name); /* * It's tempting to actually throw an error here, but since * this code gets run during database startup, that could * result in the database failing to start. (XXX Should we do * it anyway?) */ if (unlink(rm_path)) elog(LOG, "could not unlink file \"%s\": %m", rm_path); else elog(DEBUG2, "unlinked file \"%s\"", rm_path); } } /* Cleanup is complete. */ FreeDir(dbspace_dir); hash_destroy(hash); } /* * Initialization happens after cleanup is complete: we copy each init * fork file to the corresponding main fork file. Note that if we are * asked to do both cleanup and init, we may never get here: if the * cleanup code determines that there are no init forks in this dbspace, * it will return before we get to this point. */ if ((op & UNLOGGED_RELATION_INIT) != 0) { /* Open the directory. */ dbspace_dir = AllocateDir(dbspacedirname); if (dbspace_dir == NULL) { /* we just saw this directory, so it really ought to be there */ elog(LOG, "could not open dbspace directory \"%s\": %m", dbspacedirname); return; } /* Scan the directory. */ while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { ForkNumber forkNum; int oidchars; char oidbuf[OIDCHARS + 1]; char srcpath[MAXPGPATH]; char dstpath[MAXPGPATH]; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, &forkNum)) continue; /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; /* Construct source pathname. */ snprintf(srcpath, sizeof(srcpath), "%s/%s", dbspacedirname, de->d_name); /* Construct destination pathname. */ memcpy(oidbuf, de->d_name, oidchars); oidbuf[oidchars] = '\0'; snprintf(dstpath, sizeof(dstpath), "%s/%s%s", dbspacedirname, oidbuf, de->d_name + oidchars + 1 + strlen(forkNames[INIT_FORKNUM])); /* OK, we're ready to perform the actual copy. */ elog(DEBUG2, "copying %s to %s", srcpath, dstpath); copy_file(srcpath, dstpath); } /* Done with the first pass. */ FreeDir(dbspace_dir); } }
/* * compute_tsvector_stats() -- compute statistics for a tsvector column * * This functions computes statistics that are useful for determining @@ * operations' selectivity, along with the fraction of non-null rows and * average width. * * Instead of finding the most common values, as we do for most datatypes, * we're looking for the most common lexemes. This is more useful, because * there most probably won't be any two rows with the same tsvector and thus * the notion of a MCV is a bit bogus with this datatype. With a list of the * most common lexemes we can do a better job at figuring out @@ selectivity. * * For the same reasons we assume that tsvector columns are unique when * determining the number of distinct values. * * The algorithm used is Lossy Counting, as proposed in the paper "Approximate * frequency counts over data streams" by G. S. Manku and R. Motwani, in * Proceedings of the 28th International Conference on Very Large Data Bases, * Hong Kong, China, August 2002, section 4.2. The paper is available at * http://www.vldb.org/conf/2002/S10P03.pdf * * The Lossy Counting (aka LC) algorithm goes like this: * Let D be a set of triples (e, f, d), where e is an element value, f is * that element's frequency (occurrence count) and d is the maximum error in * f. We start with D empty and process the elements in batches of size * w. (The batch size is also known as "bucket size".) Let the current batch * number be b_current, starting with 1. For each element e we either * increment its f count, if it's already in D, or insert a new triple into D * with values (e, 1, b_current - 1). After processing each batch we prune D, * by removing from it all elements with f + d <= b_current. Finally, we * gather elements with largest f. The LC paper proves error bounds on f * dependent on the batch size w, and shows that the required table size * is no more than a few times w. * * We use a hashtable for the D structure and a bucket width of * statistics_target * 10, where 10 is an arbitrarily chosen constant, * meant to approximate the number of lexemes in a single tsvector. */ static void compute_tsvector_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows) { int num_mcelem; int null_cnt = 0; double total_width = 0; /* This is D from the LC algorithm. */ HTAB *lexemes_tab; HASHCTL hash_ctl; HASH_SEQ_STATUS scan_status; /* This is the current bucket number from the LC algorithm */ int b_current; /* This is 'w' from the LC algorithm */ int bucket_width; int vector_no, lexeme_no; LexemeHashKey hash_key; TrackItem *item; /* We want statistics_target * 10 lexemes in the MCELEM array */ num_mcelem = stats->attr->attstattarget * 10; /* * We set bucket width equal to the target number of result lexemes. This * is probably about right but perhaps might need to be scaled up or down * a bit? */ bucket_width = num_mcelem; /* * Create the hashtable. It will be in local memory, so we don't need to * worry about initial size too much. Also we don't need to pay any * attention to locking and memory management. */ MemSet(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(LexemeHashKey); hash_ctl.entrysize = sizeof(TrackItem); hash_ctl.hash = lexeme_hash; hash_ctl.match = lexeme_match; hash_ctl.hcxt = CurrentMemoryContext; lexemes_tab = hash_create("Analyzed lexemes table", bucket_width * 4, &hash_ctl, HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT); /* Initialize counters. */ b_current = 1; lexeme_no = 1; /* Loop over the tsvectors. */ for (vector_no = 0; vector_no < samplerows; vector_no++) { Datum value; bool isnull; TSVector vector; WordEntry *curentryptr; char *lexemesptr; int j; vacuum_delay_point(); value = fetchfunc(stats, vector_no, &isnull); /* * Check for null/nonnull. */ if (isnull) { null_cnt++; continue; } /* * Add up widths for average-width calculation. Since it's a * tsvector, we know it's varlena. As in the regular * compute_minimal_stats function, we use the toasted width for this * calculation. */ total_width += VARSIZE_ANY(DatumGetPointer(value)); /* * Now detoast the tsvector if needed. */ vector = DatumGetTSVector(value); /* * We loop through the lexemes in the tsvector and add them to our * tracking hashtable. Note: the hashtable entries will point into * the (detoasted) tsvector value, therefore we cannot free that * storage until we're done. */ lexemesptr = STRPTR(vector); curentryptr = ARRPTR(vector); for (j = 0; j < vector->size; j++) { bool found; /* Construct a hash key */ hash_key.lexeme = lexemesptr + curentryptr->pos; hash_key.length = curentryptr->len; /* Lookup current lexeme in hashtable, adding it if new */ item = (TrackItem *) hash_search(lexemes_tab, (const void *) &hash_key, HASH_ENTER, &found); if (found) { /* The lexeme is already on the tracking list */ item->frequency++; } else { /* Initialize new tracking list element */ item->frequency = 1; item->delta = b_current - 1; } /* We prune the D structure after processing each bucket */ if (lexeme_no % bucket_width == 0) { prune_lexemes_hashtable(lexemes_tab, b_current); b_current++; } /* Advance to the next WordEntry in the tsvector */ lexeme_no++; curentryptr++; } } /* We can only compute real stats if we found some non-null values. */ if (null_cnt < samplerows) { int nonnull_cnt = samplerows - null_cnt; int i; TrackItem **sort_table; int track_len; int minfreq, maxfreq; stats->stats_valid = true; /* Do the simple null-frac and average width stats */ stats->stanullfrac = (double) null_cnt / (double) samplerows; stats->stawidth = total_width / (double) nonnull_cnt; /* Assume it's a unique column (see notes above) */ stats->stadistinct = -1.0; /* * Determine the top-N lexemes by simply copying pointers from the * hashtable into an array and applying qsort() */ track_len = hash_get_num_entries(lexemes_tab); sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * track_len); hash_seq_init(&scan_status, lexemes_tab); i = 0; while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL) { sort_table[i++] = item; } Assert(i == track_len); qsort(sort_table, track_len, sizeof(TrackItem *), trackitem_compare_frequencies_desc); /* Suppress any single-occurrence items */ while (track_len > 0) { if (sort_table[track_len - 1]->frequency > 1) break; track_len--; } /* Determine the number of most common lexemes to be stored */ if (num_mcelem > track_len) num_mcelem = track_len; /* Generate MCELEM slot entry */ if (num_mcelem > 0) { MemoryContext old_context; Datum *mcelem_values; float4 *mcelem_freqs; /* Grab the minimal and maximal frequencies that will get stored */ minfreq = sort_table[num_mcelem - 1]->frequency; maxfreq = sort_table[0]->frequency; /* * We want to store statistics sorted on the lexeme value using * first length, then byte-for-byte comparison. The reason for * doing length comparison first is that we don't care about the * ordering so long as it's consistent, and comparing lengths * first gives us a chance to avoid a strncmp() call. * * This is different from what we do with scalar statistics -- * they get sorted on frequencies. The rationale is that we * usually search through most common elements looking for a * specific value, so we can grab its frequency. When values are * presorted we can employ binary search for that. See * ts_selfuncs.c for a real usage scenario. */ qsort(sort_table, num_mcelem, sizeof(TrackItem *), trackitem_compare_lexemes); /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); /* * We sorted statistics on the lexeme value, but we want to be * able to find out the minimal and maximal frequency without * going through all the values. We keep those two extra * frequencies in two extra cells in mcelem_freqs. */ mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum)); mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4)); for (i = 0; i < num_mcelem; i++) { TrackItem *item = sort_table[i]; mcelem_values[i] = PointerGetDatum(cstring_to_text_with_len(item->key.lexeme, item->key.length)); mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt; } mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt; mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt; MemoryContextSwitchTo(old_context); stats->stakind[0] = STATISTIC_KIND_MCELEM; stats->staop[0] = TextEqualOperator; stats->stanumbers[0] = mcelem_freqs; /* See above comment about two extra frequency fields */ stats->numnumbers[0] = num_mcelem + 2; stats->stavalues[0] = mcelem_values; stats->numvalues[0] = num_mcelem; /* We are storing text values */ stats->statypid[0] = TEXTOID; stats->statyplen[0] = -1; /* typlen, -1 for varlena */ stats->statypbyval[0] = false; stats->statypalign[0] = 'i'; } } else { /* We found only nulls; assume the column is entirely null */ stats->stats_valid = true; stats->stanullfrac = 1.0; stats->stawidth = 0; /* "unknown" */ stats->stadistinct = 0.0; /* "unknown" */ } /* * We don't need to bother cleaning up any of our temporary palloc's. The * hashtable should also go away, as it used a child memory context. */ }
/* * Process one per-dbspace directory for ResetUnloggedRelations */ static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) { DIR *dbspace_dir; struct dirent *de; char rm_path[MAXPGPATH * 2]; /* Caller must specify at least one operation. */ Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0); /* * Cleanup is a two-pass operation. First, we go through and identify all * the files with init forks. Then, we go through again and nuke * everything with the same OID except the init fork. */ if ((op & UNLOGGED_RELATION_CLEANUP) != 0) { HTAB *hash; HASHCTL ctl; /* * It's possible that someone could create a ton of unlogged relations * in the same database & tablespace, so we'd better use a hash table * rather than an array or linked list to keep track of which files * need to be reset. Otherwise, this cleanup operation would be * O(n^2). */ memset(&ctl, 0, sizeof(ctl)); ctl.keysize = sizeof(unlogged_relation_entry); ctl.entrysize = sizeof(unlogged_relation_entry); hash = hash_create("unlogged hash", 32, &ctl, HASH_ELEM); /* Scan the directory. */ dbspace_dir = AllocateDir(dbspacedirname); while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { ForkNumber forkNum; int oidchars; unlogged_relation_entry ent; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, &forkNum)) continue; /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; /* * Put the OID portion of the name into the hash table, if it * isn't already. */ memset(ent.oid, 0, sizeof(ent.oid)); memcpy(ent.oid, de->d_name, oidchars); hash_search(hash, &ent, HASH_ENTER, NULL); } /* Done with the first pass. */ FreeDir(dbspace_dir); /* * If we didn't find any init forks, there's no point in continuing; * we can bail out now. */ if (hash_get_num_entries(hash) == 0) { hash_destroy(hash); return; } /* * Now, make a second pass and remove anything that matches. */ dbspace_dir = AllocateDir(dbspacedirname); while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { ForkNumber forkNum; int oidchars; bool found; unlogged_relation_entry ent; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, &forkNum)) continue; /* We never remove the init fork. */ if (forkNum == INIT_FORKNUM) continue; /* * See whether the OID portion of the name shows up in the hash * table. */ memset(ent.oid, 0, sizeof(ent.oid)); memcpy(ent.oid, de->d_name, oidchars); hash_search(hash, &ent, HASH_FIND, &found); /* If so, nuke it! */ if (found) { snprintf(rm_path, sizeof(rm_path), "%s/%s", dbspacedirname, de->d_name); if (unlink(rm_path) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", rm_path))); else elog(DEBUG2, "unlinked file \"%s\"", rm_path); } } /* Cleanup is complete. */ FreeDir(dbspace_dir); hash_destroy(hash); } /* * Initialization happens after cleanup is complete: we copy each init * fork file to the corresponding main fork file. Note that if we are * asked to do both cleanup and init, we may never get here: if the * cleanup code determines that there are no init forks in this dbspace, * it will return before we get to this point. */ if ((op & UNLOGGED_RELATION_INIT) != 0) { /* Scan the directory. */ dbspace_dir = AllocateDir(dbspacedirname); while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { ForkNumber forkNum; int oidchars; char oidbuf[OIDCHARS + 1]; char srcpath[MAXPGPATH * 2]; char dstpath[MAXPGPATH]; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, &forkNum)) continue; /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; /* Construct source pathname. */ snprintf(srcpath, sizeof(srcpath), "%s/%s", dbspacedirname, de->d_name); /* Construct destination pathname. */ memcpy(oidbuf, de->d_name, oidchars); oidbuf[oidchars] = '\0'; snprintf(dstpath, sizeof(dstpath), "%s/%s%s", dbspacedirname, oidbuf, de->d_name + oidchars + 1 + strlen(forkNames[INIT_FORKNUM])); /* OK, we're ready to perform the actual copy. */ elog(DEBUG2, "copying %s to %s", srcpath, dstpath); copy_file(srcpath, dstpath); } FreeDir(dbspace_dir); /* * copy_file() above has already called pg_flush_data() on the files * it created. Now we need to fsync those files, because a checkpoint * won't do it for us while we're in recovery. We do this in a * separate pass to allow the kernel to perform all the flushes * (especially the metadata ones) at once. */ dbspace_dir = AllocateDir(dbspacedirname); while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) { ForkNumber forkNum; int oidchars; char oidbuf[OIDCHARS + 1]; char mainpath[MAXPGPATH]; /* Skip anything that doesn't look like a relation data file. */ if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, &forkNum)) continue; /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; /* Construct main fork pathname. */ memcpy(oidbuf, de->d_name, oidchars); oidbuf[oidchars] = '\0'; snprintf(mainpath, sizeof(mainpath), "%s/%s%s", dbspacedirname, oidbuf, de->d_name + oidchars + 1 + strlen(forkNames[INIT_FORKNUM])); fsync_fname(mainpath, false); } FreeDir(dbspace_dir); /* * Lastly, fsync the database directory itself, ensuring the * filesystem remembers the file creations and deletions we've done. * We don't bother with this during a call that does only * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step * too at the next startup attempt. */ fsync_fname(dbspacedirname, true); } }
/* * compute_array_stats() -- compute statistics for an array column * * This function computes statistics useful for determining selectivity of * the array operators <@, &&, and @>. It is invoked by ANALYZE via the * compute_stats hook after sample rows have been collected. * * We also invoke the standard compute_stats function, which will compute * "scalar" statistics relevant to the btree-style array comparison operators. * However, exact duplicates of an entire array may be rare despite many * arrays sharing individual elements. This especially afflicts long arrays, * which are also liable to lack all scalar statistics due to the low * WIDTH_THRESHOLD used in analyze.c. So, in addition to the standard stats, * we find the most common array elements and compute a histogram of distinct * element counts. * * The algorithm used is Lossy Counting, as proposed in the paper "Approximate * frequency counts over data streams" by G. S. Manku and R. Motwani, in * Proceedings of the 28th International Conference on Very Large Data Bases, * Hong Kong, China, August 2002, section 4.2. The paper is available at * http://www.vldb.org/conf/2002/S10P03.pdf * * The Lossy Counting (aka LC) algorithm goes like this: * Let s be the threshold frequency for an item (the minimum frequency we * are interested in) and epsilon the error margin for the frequency. Let D * be a set of triples (e, f, delta), where e is an element value, f is that * element's frequency (actually, its current occurrence count) and delta is * the maximum error in f. We start with D empty and process the elements in * batches of size w. (The batch size is also known as "bucket size" and is * equal to 1/epsilon.) Let the current batch number be b_current, starting * with 1. For each element e we either increment its f count, if it's * already in D, or insert a new___ triple into D with values (e, 1, b_current * - 1). After processing each batch we prune D, by removing from it all * elements with f + delta <= b_current. After the algorithm finishes we * suppress all elements from D that do not satisfy f >= (s - epsilon) * N, * where N is the total number of elements in the input. We emit the * remaining elements with estimated frequency f/N. The LC paper proves * that this algorithm finds all elements with true frequency at least s, * and that no frequency is overestimated or is underestimated by more than * epsilon. Furthermore, given reasonable assumptions about the input * distribution, the required table size is no more than about 7 times w. * * In the absence of a principled basis for other particular values, we * follow ts_typanalyze() and use parameters s = 0.07/K, epsilon = s/10. * But we leave out the correction for stopwords, which do not apply to * arrays. These parameters give bucket width w = K/0.007 and maximum * expected hashtable size of about 1000 * K. * * Elements may repeat within an array. Since duplicates do not change the * behavior of <@, && or @>, we want to count each element only once per * array. Therefore, we store in the finished pg_statistic entry each * element's frequency as the fraction of all non-null rows that contain it. * We divide the raw counts by nonnull_cnt to get those figures. */ static void compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows) { ArrayAnalyzeExtraData *extra_data; int num_mcelem; int null_cnt = 0; int null_elem_cnt = 0; int analyzed_rows = 0; /* This is D from the LC algorithm. */ HTAB *elements_tab; HASHCTL elem_hash_ctl; HASH_SEQ_STATUS scan_status; /* This is the current bucket number from the LC algorithm */ int b_current; /* This is 'w' from the LC algorithm */ int bucket_width; int array_no; int64 element_no; TrackItem *item; int slot_idx; HTAB *count_tab; HASHCTL count_hash_ctl; DECountItem *count_item; extra_data = (ArrayAnalyzeExtraData *) stats->extra_data; /* * Invoke analyze.c's standard analysis function to create scalar-style * stats for the column. It will expect its own extra_data pointer, so * temporarily install that. */ stats->extra_data = extra_data->std_extra_data; (*extra_data->std_compute_stats) (stats, fetchfunc, samplerows, totalrows); stats->extra_data = extra_data; /* * Set up static pointer for use by subroutines. We wait till here in * case std_compute_stats somehow recursively invokes us (probably not * possible, but ...) */ array_extra_data = extra_data; /* * We want statistics_target * 10 elements in the MCELEM array. This * multiplier is pretty arbitrary, but is meant to reflect the fact that * the number of individual elements tracked in pg_statistic ought to be * more than the number of values for a simple scalar column. */ num_mcelem = stats->attr->attstattarget * 10; /* * We set bucket width equal to num_mcelem / 0.007 as per the comment * above. */ bucket_width = num_mcelem * 1000 / 7; /* * Create the hashtable. It will be in local memory, so we don't need to * worry about overflowing the initial size. Also we don't need to pay any * attention to locking and memory management. */ MemSet(&elem_hash_ctl, 0, sizeof(elem_hash_ctl)); elem_hash_ctl.keysize = sizeof(Datum); elem_hash_ctl.entrysize = sizeof(TrackItem); elem_hash_ctl.hash = element_hash; elem_hash_ctl.match = element_match; elem_hash_ctl.hcxt = CurrentMemoryContext; elements_tab = hash_create("Analyzed elements table", num_mcelem, &elem_hash_ctl, HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT); /* hashtable for array distinct elements counts */ MemSet(&count_hash_ctl, 0, sizeof(count_hash_ctl)); count_hash_ctl.keysize = sizeof(int); count_hash_ctl.entrysize = sizeof(DECountItem); count_hash_ctl.hcxt = CurrentMemoryContext; count_tab = hash_create("Array distinct element count table", 64, &count_hash_ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); /* Initialize counters. */ b_current = 1; element_no = 0; /* Loop over the arrays. */ for (array_no = 0; array_no < samplerows; array_no++) { Datum value; bool isnull; ArrayType *array; int num_elems; Datum *elem_values; bool *elem_nulls; bool null_present; int j; int64 prev_element_no = element_no; int distinct_count; bool count_item_found; vacuum_delay_point(); value = fetchfunc(stats, array_no, &isnull); if (isnull) { /* array is null, just count that */ null_cnt++; continue; } /* Skip too-large values. */ if (toast_raw_datum_size(value) > ARRAY_WIDTH_THRESHOLD) continue; else analyzed_rows++; /* * Now detoast the array if needed, and deconstruct into datums. */ array = DatumGetArrayTypeP(value); Assert(ARR_ELEMTYPE(array) == extra_data->type_id); deconstruct_array(array, extra_data->type_id, extra_data->typlen, extra_data->typbyval, extra_data->typalign, &elem_values, &elem_nulls, &num_elems); /* * We loop through the elements in the array and add them to our * tracking hashtable. */ null_present = false; for (j = 0; j < num_elems; j++) { Datum elem_value; bool found; /* No null element processing other than flag setting here */ if (elem_nulls[j]) { null_present = true; continue; } /* Lookup current element in hashtable, adding it if new___ */ elem_value = elem_values[j]; item = (TrackItem *) hash_search(elements_tab, (const void *) &elem_value, HASH_ENTER, &found); if (found) { /* The element value is already on the tracking list */ /* * The operators we assist ignore duplicate array elements, so * count a given distinct element only once per array. */ if (item->last_container == array_no) continue; item->frequency++; item->last_container = array_no; } else { /* Initialize new___ tracking list element */ /* * If element type is pass-by-reference, we must copy it into * palloc'd space, so that we can release the array below. (We * do this so that the space needed for element values is * limited by the size of the hashtable; if we kept all the * array values around, it could be much more.) */ item->key = datumCopy(elem_value, extra_data->typbyval, extra_data->typlen); item->frequency = 1; item->delta = b_current - 1; item->last_container = array_no; } /* element_no is the number of elements processed (ie N) */ element_no++; /* We prune the D structure after processing each bucket */ if (element_no % bucket_width == 0) { prune_element_hashtable(elements_tab, b_current); b_current++; } } /* Count null element presence once per array. */ if (null_present) null_elem_cnt++; /* Update frequency of the particular array distinct element count. */ distinct_count = (int) (element_no - prev_element_no); count_item = (DECountItem *) hash_search(count_tab, &distinct_count, HASH_ENTER, &count_item_found); if (count_item_found) count_item->frequency++; else count_item->frequency = 1; /* Free memory allocated while detoasting. */ if (PointerGetDatum(array) != value) pfree(array); pfree(elem_values); pfree(elem_nulls); } /* Skip pg_statistic slots occupied by standard statistics */ slot_idx = 0; while (slot_idx < STATISTIC_NUM_SLOTS && stats->stakind[slot_idx] != 0) slot_idx++; if (slot_idx > STATISTIC_NUM_SLOTS - 2) elog(ERROR, "insufficient pg_statistic slots for array stats"); /* We can only compute real stats if we found some non-null values. */ if (analyzed_rows > 0) { int nonnull_cnt = analyzed_rows; int count_items_count; int i; TrackItem **sort_table; int track_len; int64 cutoff_freq; int64 minfreq, maxfreq; /* * We assume the standard stats code already took care of setting * stats_valid, stanullfrac, stawidth, stadistinct. We'd have to * re-compute those values if we wanted to not store the standard * stats. */ /* * Construct an array of the interesting hashtable items, that is, * those meeting the cutoff frequency (s - epsilon)*N. Also identify * the minimum and maximum frequencies among these items. * * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff * frequency is 9*N / bucket_width. */ cutoff_freq = 9 * element_no / bucket_width; i = hash_get_num_entries(elements_tab); /* surely enough space */ sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i); hash_seq_init(&scan_status, elements_tab); track_len = 0; minfreq = element_no; maxfreq = 0; while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL) { if (item->frequency > cutoff_freq) { sort_table[track_len++] = item; minfreq = Min(minfreq, item->frequency); maxfreq = Max(maxfreq, item->frequency); } } Assert(track_len <= i); /* emit some statistics for debug purposes */ elog(DEBUG3, "compute_array_stats: target # mces = %d, " "bucket width = %d, " "# elements = " INT64_FORMAT ", hashtable size = %d, " "usable entries = %d", num_mcelem, bucket_width, element_no, i, track_len); /* * If we obtained more elements than we really want, get rid of those * with least frequencies. The easiest way is to qsort the array into * descending frequency order and truncate the array. */ if (num_mcelem < track_len) { qsort(sort_table, track_len, sizeof(TrackItem *), trackitem_compare_frequencies_desc); /* reset minfreq to the smallest frequency we're keeping */ minfreq = sort_table[num_mcelem - 1]->frequency; } else num_mcelem = track_len; /* Generate MCELEM slot entry */ if (num_mcelem > 0) { MemoryContext old_context; Datum *mcelem_values; float4 *mcelem_freqs; /* * We want to store statistics sorted on the element value using * the element type's default comparison function. This permits * fast binary searches in selectivity estimation functions. */ qsort(sort_table, num_mcelem, sizeof(TrackItem *), trackitem_compare_element); /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); /* * We sorted statistics on the element value, but we want to be * able to find the minimal and maximal frequencies without going * through all the values. We also want the frequency of null * elements. Store these three values at the end of mcelem_freqs. */ mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum)); mcelem_freqs = (float4 *) palloc((num_mcelem + 3) * sizeof(float4)); /* * See comments above about use of nonnull_cnt as the divisor for * the final frequency estimates. */ for (i = 0; i < num_mcelem; i++) { TrackItem *item = sort_table[i]; mcelem_values[i] = datumCopy(item->key, extra_data->typbyval, extra_data->typlen); mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt; } mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt; mcelem_freqs[i++] = (double) maxfreq / (double) nonnull_cnt; mcelem_freqs[i++] = (double) null_elem_cnt / (double) nonnull_cnt; MemoryContextSwitchTo(old_context); stats->stakind[slot_idx] = STATISTIC_KIND_MCELEM; stats->staop[slot_idx] = extra_data->eq_opr; stats->stanumbers[slot_idx] = mcelem_freqs; /* See above comment about extra stanumber entries */ stats->numnumbers[slot_idx] = num_mcelem + 3; stats->stavalues[slot_idx] = mcelem_values; stats->numvalues[slot_idx] = num_mcelem; /* We are storing values of element type */ stats->statypid[slot_idx] = extra_data->type_id; stats->statyplen[slot_idx] = extra_data->typlen; stats->statypbyval[slot_idx] = extra_data->typbyval; stats->statypalign[slot_idx] = extra_data->typalign; slot_idx++; } /* Generate DECHIST slot entry */ count_items_count = hash_get_num_entries(count_tab); if (count_items_count > 0) { int num_hist = stats->attr->attstattarget; DECountItem **sorted_count_items; int j; int delta; int64 frac; float4 *hist; /* num_hist must be at least 2 for the loop below to work */ num_hist = Max(num_hist, 2); /* * Create an array of DECountItem pointers, and sort them into * increasing count order. */ sorted_count_items = (DECountItem **) palloc(sizeof(DECountItem *) * count_items_count); hash_seq_init(&scan_status, count_tab); j = 0; while ((count_item = (DECountItem *) hash_seq_search(&scan_status)) != NULL) { sorted_count_items[j++] = count_item; } qsort(sorted_count_items, count_items_count, sizeof(DECountItem *), countitem_compare_count); /* * Prepare to fill stanumbers with the histogram, followed by the * average count. This array must be stored in anl_context. */ hist = (float4 *) MemoryContextAlloc(stats->anl_context, sizeof(float4) * (num_hist + 1)); hist[num_hist] = (double) element_no / (double) nonnull_cnt; /*---------- * Construct the histogram of distinct-element counts (DECs). * * The object of this loop is to copy the min and max DECs to * hist[0] and hist[num_hist - 1], along with evenly-spaced DECs * in between (where "evenly-spaced" is with reference to the * whole input population of arrays). If we had a complete sorted * array of DECs, one per analyzed row, the i'th hist value would * come from DECs[i * (analyzed_rows - 1) / (num_hist - 1)] * (compare the histogram-making loop in compute_scalar_stats()). * But instead of that we have the sorted_count_items[] array, * which holds unique DEC values with their frequencies (that is, * a run-length-compressed version of the full array). So we * control advancing through sorted_count_items[] with the * variable "frac", which is defined as (x - y) * (num_hist - 1), * where x is the index in the notional DECs array corresponding * to the start of the next sorted_count_items[] element's run, * and y is the index in DECs from which we should take the next * histogram value. We have to advance whenever x <= y, that is * frac <= 0. The x component is the sum of the frequencies seen * so far (up through the current sorted_count_items[] element), * and of course y * (num_hist - 1) = i * (analyzed_rows - 1), * per the subscript calculation above. (The subscript calculation * implies dropping any fractional part of y; in this formulation * that's handled by not advancing until frac reaches 1.) * * Even though frac has a bounded range, it could overflow int32 * when working with very large statistics targets, so we do that * math in int64. *---------- */ delta = analyzed_rows - 1; j = 0; /* current index in sorted_count_items */ /* Initialize frac for sorted_count_items[0]; y is initially 0 */ frac = (int64) sorted_count_items[0]->frequency * (num_hist - 1); for (i = 0; i < num_hist; i++) { while (frac <= 0) { /* Advance, and update x component of frac */ j++; frac += (int64) sorted_count_items[j]->frequency * (num_hist - 1); } hist[i] = sorted_count_items[j]->count; frac -= delta; /* update y for upcoming i increment */ } Assert(j == count_items_count - 1); stats->stakind[slot_idx] = STATISTIC_KIND_DECHIST; stats->staop[slot_idx] = extra_data->eq_opr; stats->stanumbers[slot_idx] = hist; stats->numnumbers[slot_idx] = num_hist + 1; slot_idx++; } } /* * We don't need to bother cleaning up any of our temporary palloc's. The * hashtable should also go away, as it used a child memory context. */ }