/*
 * initNextTableToScan
 *   Find the next table to scan and initiate the scan if the previous table
 *   is finished.
 *
 * If scanning on the current table is not finished, or a new table is found,
 * this function returns true.
 * If no more table is found, this function returns false.
 *
 * The candidate partitions come from node->pidStatus, an in-progress
 * hash_seq scan over partition Oids; when the scan is exhausted we clear
 * shouldCallHashSeqTerm because dynahash auto-terminates a completed scan.
 */
static bool
initNextTableToScan(DynamicTableScanState *node)
{
	ScanState *scanState = (ScanState *)node;

	/* Only pick a new partition when no scan is currently in flight. */
	if (scanState->scan_state == SCAN_INIT ||
		scanState->scan_state == SCAN_DONE)
	{
		Oid *pid = hash_seq_search(&node->pidStatus);

		if (pid == NULL)
		{
			/* Scan completed by itself; no explicit hash_seq_term needed. */
			node->shouldCallHashSeqTerm = false;
			return false;
		}

		/* Collect number of partitions scanned in EXPLAIN ANALYZE */
		if (NULL != scanState->ps.instrument)
		{
			Instrumentation *instr = scanState->ps.instrument;

			instr->numPartScanned++;
		}

		/*
		 * Inside ExecInitScanTupleSlot() we set the tuple table slot's oid
		 * to range table entry's relid, which for partitioned table always set
		 * to parent table's oid. In queries where we need to read table oids
		 * (MPP-20736) we use the tuple table slot's saved oid (refer to
		 * slot_getsysattr()). This wrongly returns parent oid, instead of
		 * partition oid. Therefore, to return correct partition oid, we need
		 * to update our tuple table slot's oid to reflect the partition oid.
		 */
		scanState->ss_ScanTupleSlot->tts_tableOid = *pid;

		scanState->ss_currentRelation = OpenScanRelationByOid(*pid);

		/*
		 * Fetch the descriptor of the previously scanned partition so we can
		 * compute an attribute-number mapping against the new one.
		 *
		 * NOTE(review): lastTupDesc points into the relcache entry of a
		 * relation we close immediately below; this relies on the relcache
		 * entry staying valid until varattnos_map() is done — confirm no
		 * cache invalidation can occur in between.
		 */
		Relation lastScannedRel = OpenScanRelationByOid(node->lastRelOid);
		TupleDesc lastTupDesc = RelationGetDescr(lastScannedRel);
		CloseScanRelation(lastScannedRel);

		TupleDesc partTupDesc = RelationGetDescr(scanState->ss_currentRelation);

		ExecAssignScanType(scanState, partTupDesc);

		AttrNumber *attMap;

		attMap = varattnos_map(lastTupDesc, partTupDesc);

		/* If attribute remapping is not necessary, then do not change the varattno */
		if (attMap)
		{
			/* Rewrite plan quals/targetlist to the new partition's attnos. */
			change_varattnos_of_a_varno((Node*)scanState->ps.plan->qual,
										attMap, node->scanrelid);
			change_varattnos_of_a_varno((Node*)scanState->ps.plan->targetlist,
										attMap, node->scanrelid);

			/*
			 * Now that the varattno mapping has been changed, change the
			 * relation that the new varnos correspond to.
			 */
			node->lastRelOid = *pid;
		}

		/*
		 * For the very first partition, the targetlist of planstate is set to
		 * null. So, we must initialize quals and targetlist, regardless of
		 * remapping requirements. For later partitions, we only initialize
		 * quals and targetlist if a column re-mapping is necessary.
		 */
		if (attMap || node->firstPartition)
		{
			node->firstPartition = false;

			/*
			 * Per-partition expression state lives in its own context so it
			 * can be bulk-freed before initializing the next partition.
			 */
			MemoryContextReset(node->partitionMemoryContext);
			MemoryContext oldCxt = MemoryContextSwitchTo(node->partitionMemoryContext);

			/* Initialize child expressions */
			scanState->ps.qual = (List *)ExecInitExpr((Expr *)scanState->ps.plan->qual,
													  (PlanState*)scanState);
			scanState->ps.targetlist = (List *)ExecInitExpr((Expr *)scanState->ps.plan->targetlist,
															(PlanState*)scanState);

			MemoryContextSwitchTo(oldCxt);
		}

		if (attMap)
		{
			pfree(attMap);
		}

		ExecAssignScanProjectionInfo(scanState);

		scanState->tableType = getTableType(scanState->ss_currentRelation);
		BeginTableScanRelation(scanState);
	}

	return true;
}
/*
 * DatabaseInfo_SortRelArray()
 *	  Flatten the per-database relation hash into info->dbInfoRelArray as a
 *	  sorted record array, and sort each record's per-segment-file lists.
 *
 * The hash is required to hold exactly "count" entries; any mismatch is
 * reported with elog(ERROR).
 */
static void DatabaseInfo_SortRelArray(
	DatabaseInfo 		*info,

	HTAB 				*dbInfoRelHashTable,

	int					count)
{
	HASH_SEQ_STATUS scan;
	DbInfoRel **ptrs;
	int			i;

	/* We are the ones who populate dbInfoRelArray; it must be empty here. */
	Assert(info->dbInfoRelArray == NULL);

	/* Gather a pointer to every hash entry into a temporary array. */
	ptrs = (DbInfoRel **) palloc(sizeof(DbInfoRel *) * count);
	hash_seq_init(&scan, dbInfoRelHashTable);
	i = 0;
	while (i < count)
	{
		ptrs[i] = (DbInfoRel *) hash_seq_search(&scan);

		/* should have as many entries in the hash scan as "count" */
		if (ptrs[i] == NULL)
			elog(ERROR, "insufficient #/entries in dbInfoRelHashTable");
		i++;
	}

	/* double check that the hash contained the right number of elements */
	if (hash_seq_search(&scan) != NULL)
		elog(ERROR, "too many entries in dbInfoRelHashTable");

	/* Order the pointers, then materialize the sorted record array. */
	qsort(ptrs, count, sizeof(DbInfoRel *), DbInfoRelPtrArray_Compare);

	info->dbInfoRelArray = (DbInfoRel *) palloc(sizeof(DbInfoRel) * count);
	for (i = 0; i < count; i++)
	{
		DbInfoRel  *rel;

		info->dbInfoRelArray[i] = *(ptrs[i]);
		rel = &info->dbInfoRelArray[i];

		/*
		 * Each record carries three lists (gpRelationNodes,
		 * appendOnlyCatalogSegmentInfo, physicalSegmentFiles); all of them
		 * must be ordered by segment file number or later merging of the
		 * lists will not work.
		 *
		 * XXX - this seems like a bad design: three sources of information
		 * on the same thing that could likely be a single hash instead of
		 * three lists merged all over the code.
		 */
		if (rel->gpRelationNodes)
			qsort(rel->gpRelationNodes,
				  rel->gpRelationNodesCount,
				  sizeof(DbInfoGpRelationNode),
				  DbInfoGpRelationNode_Compare);

		if (rel->appendOnlyCatalogSegmentInfo)
			qsort(rel->appendOnlyCatalogSegmentInfo,
				  rel->appendOnlyCatalogSegmentInfoCount,
				  sizeof(DbInfoAppendOnlyCatalogSegmentInfo),
				  DbInfoAppendOnlyCatalogSegmentInfo_Compare);

		if (rel->physicalSegmentFiles)
			qsort(rel->physicalSegmentFiles,
				  rel->physicalSegmentFilesCount,
				  sizeof(DbInfoSegmentFile),
				  DbInfoSegmentFile_Compare);
	}

	info->dbInfoRelArrayCount = count;

	/* Done with the temporary pointer array. */
	pfree(ptrs);
}
/*
 * compute_tsvector_stats() -- compute statistics for a tsvector column
 *
 * This functions computes statistics that are useful for determining @@
 * operations' selectivity, along with the fraction of non-null rows and
 * average width.
 *
 * Instead of finding the most common values, as we do for most datatypes,
 * we're looking for the most common lexemes. This is more useful, because
 * there most probably won't be any two rows with the same tsvector and thus
 * the notion of a MCV is a bit bogus with this datatype. With a list of the
 * most common lexemes we can do a better job at figuring out @@ selectivity.
 *
 * For the same reasons we assume that tsvector columns are unique when
 * determining the number of distinct values.
 *
 * The algorithm used is Lossy Counting, as proposed in the paper "Approximate
 * frequency counts over data streams" by G. S. Manku and R. Motwani, in
 * Proceedings of the 28th International Conference on Very Large Data Bases,
 * Hong Kong, China, August 2002, section 4.2. The paper is available at
 * http://www.vldb.org/conf/2002/S10P03.pdf
 *
 * The Lossy Counting (aka LC) algorithm goes like this:
 * Let s be the threshold frequency for an item (the minimum frequency we
 * are interested in) and epsilon the error margin for the frequency. Let D
 * be a set of triples (e, f, delta), where e is an element value, f is that
 * element's frequency (actually, its current occurrence count) and delta is
 * the maximum error in f. We start with D empty and process the elements in
 * batches of size w. (The batch size is also known as "bucket size" and is
 * equal to 1/epsilon.) Let the current batch number be b_current, starting
 * with 1. For each element e we either increment its f count, if it's
 * already in D, or insert a new triple into D with values (e, 1, b_current
 * - 1). After processing each batch we prune D, by removing from it all
 * elements with f + delta <= b_current. After the algorithm finishes we
 * suppress all elements from D that do not satisfy f >= (s - epsilon) * N,
 * where N is the total number of elements in the input. We emit the
 * remaining elements with estimated frequency f/N. The LC paper proves
 * that this algorithm finds all elements with true frequency at least s,
 * and that no frequency is overestimated or is underestimated by more than
 * epsilon. Furthermore, given reasonable assumptions about the input
 * distribution, the required table size is no more than about 7 times w.
 *
 * We set s to be the estimated frequency of the K'th word in a natural
 * language's frequency table, where K is the target number of entries in
 * the MCELEM array plus an arbitrary constant, meant to reflect the fact
 * that the most common words in any language would usually be stopwords
 * so we will not actually see them in the input. We assume that the
 * distribution of word frequencies (including the stopwords) follows Zipf's
 * law with an exponent of 1.
 *
 * Assuming Zipfian distribution, the frequency of the K'th word is equal
 * to 1/(K * H(W)) where H(n) is 1/2 + 1/3 + ... + 1/n and W is the number of
 * words in the language. Putting W as one million, we get roughly 0.07/K.
 * Assuming top 10 words are stopwords gives s = 0.07/(K + 10). We set
 * epsilon = s/10, which gives bucket width w = (K + 10)/0.007 and
 * maximum expected hashtable size of about 1000 * (K + 10).
 *
 * Note: in the above discussion, s, epsilon, and f/N are in terms of a
 * lexeme's frequency as a fraction of all lexemes seen in the input.
 * However, what we actually want to store in the finished pg_statistic
 * entry is each lexeme's frequency as a fraction of all rows that it occurs
 * in. Assuming that the input tsvectors are correctly constructed, no
 * lexeme occurs more than once per tsvector, so the final count f is a
 * correct estimate of the number of input tsvectors it occurs in, and we
 * need only change the divisor from N to nonnull_cnt to get the number we
 * want.
 */
static void
compute_tsvector_stats(VacAttrStats *stats,
					   AnalyzeAttrFetchFunc fetchfunc,
					   int samplerows,
					   double totalrows)
{
	int			num_mcelem;
	int			null_cnt = 0;
	double		total_width = 0;

	/* This is D from the LC algorithm. */
	HTAB	   *lexemes_tab;
	HASHCTL		hash_ctl;
	HASH_SEQ_STATUS scan_status;

	/* This is the current bucket number from the LC algorithm */
	int			b_current;

	/* This is 'w' from the LC algorithm */
	int			bucket_width;
	int			vector_no,
				lexeme_no;
	LexemeHashKey hash_key;
	TrackItem  *item;

	/*
	 * We want statistics_target * 10 lexemes in the MCELEM array. This
	 * multiplier is pretty arbitrary, but is meant to reflect the fact that
	 * the number of individual lexeme values tracked in pg_statistic ought
	 * to be more than the number of values for a simple scalar column.
	 */
	num_mcelem = stats->attr->attstattarget * 10;

	/*
	 * We set bucket width equal to (num_mcelem + 10) / 0.007 as per the
	 * comment above.
	 */
	bucket_width = (num_mcelem + 10) * 1000 / 7;

	/*
	 * Create the hashtable. It will be in local memory, so we don't need to
	 * worry about overflowing the initial size. Also we don't need to pay
	 * any attention to locking and memory management.
	 */
	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
	hash_ctl.keysize = sizeof(LexemeHashKey);
	hash_ctl.entrysize = sizeof(TrackItem);
	hash_ctl.hash = lexeme_hash;
	hash_ctl.match = lexeme_match;
	hash_ctl.hcxt = CurrentMemoryContext;
	lexemes_tab = hash_create("Analyzed lexemes table",
							  num_mcelem,
							  &hash_ctl,
							  HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);

	/* Initialize counters. */
	b_current = 1;
	lexeme_no = 0;

	/* Loop over the tsvectors. */
	for (vector_no = 0; vector_no < samplerows; vector_no++)
	{
		Datum		value;
		bool		isnull;
		TSVector	vector;
		WordEntry  *curentryptr;
		char	   *lexemesptr;
		int			j;

		vacuum_delay_point();

		value = fetchfunc(stats, vector_no, &isnull);

		/*
		 * Check for null/nonnull.
		 */
		if (isnull)
		{
			null_cnt++;
			continue;
		}

		/*
		 * Add up widths for average-width calculation. Since it's a
		 * tsvector, we know it's varlena. As in the regular
		 * compute_minimal_stats function, we use the toasted width for this
		 * calculation.
		 */
		total_width += VARSIZE_ANY(DatumGetPointer(value));

		/*
		 * Now detoast the tsvector if needed.
		 */
		vector = DatumGetTSVector(value);

		/*
		 * We loop through the lexemes in the tsvector and add them to our
		 * tracking hashtable.
		 */
		lexemesptr = STRPTR(vector);
		curentryptr = ARRPTR(vector);
		for (j = 0; j < vector->size; j++)
		{
			bool		found;

			/*
			 * Construct a hash key. The key points into the (detoasted)
			 * tsvector value at this point, but if a new entry is created,
			 * we make a copy of it. This way we can free the tsvector value
			 * once we've processed all its lexemes.
			 */
			hash_key.lexeme = lexemesptr + curentryptr->pos;
			hash_key.length = curentryptr->len;

			/* Lookup current lexeme in hashtable, adding it if new */
			item = (TrackItem *) hash_search(lexemes_tab,
											 (const void *) &hash_key,
											 HASH_ENTER, &found);

			if (found)
			{
				/* The lexeme is already on the tracking list */
				item->frequency++;
			}
			else
			{
				/* Initialize new tracking list element */
				item->frequency = 1;
				item->delta = b_current - 1;

				/* Copy the lexeme so it survives freeing the tsvector. */
				item->key.lexeme = palloc(hash_key.length);
				memcpy(item->key.lexeme, hash_key.lexeme, hash_key.length);
			}

			/* lexeme_no is the number of elements processed (ie N) */
			lexeme_no++;

			/* We prune the D structure after processing each bucket */
			if (lexeme_no % bucket_width == 0)
			{
				prune_lexemes_hashtable(lexemes_tab, b_current);
				b_current++;
			}

			/* Advance to the next WordEntry in the tsvector */
			curentryptr++;
		}

		/* If the vector was toasted, free the detoasted copy. */
		if (TSVectorGetDatum(vector) != value)
			pfree(vector);
	}

	/* We can only compute real stats if we found some non-null values. */
	if (null_cnt < samplerows)
	{
		int			nonnull_cnt = samplerows - null_cnt;
		int			i;
		TrackItem **sort_table;
		int			track_len;
		int			cutoff_freq;
		int			minfreq,
					maxfreq;

		stats->stats_valid = true;
		/* Do the simple null-frac and average width stats */
		stats->stanullfrac = (double) null_cnt / (double) samplerows;
		stats->stawidth = total_width / (double) nonnull_cnt;

		/* Assume it's a unique column (see notes above) */
		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);

		/*
		 * Construct an array of the interesting hashtable items, that is,
		 * those meeting the cutoff frequency (s - epsilon)*N. Also identify
		 * the minimum and maximum frequencies among these items.
		 *
		 * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
		 * frequency is 9*N / bucket_width.
		 */
		cutoff_freq = 9 * lexeme_no / bucket_width;

		i = hash_get_num_entries(lexemes_tab);	/* surely enough space */
		sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);

		hash_seq_init(&scan_status, lexemes_tab);
		track_len = 0;
		minfreq = lexeme_no;
		maxfreq = 0;
		while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
		{
			if (item->frequency > cutoff_freq)
			{
				sort_table[track_len++] = item;
				minfreq = Min(minfreq, item->frequency);
				maxfreq = Max(maxfreq, item->frequency);
			}
		}
		Assert(track_len <= i);

		/* emit some statistics for debug purposes */
		elog(DEBUG3, "tsvector_stats: target # mces = %d, bucket width = %d, "
			 "# lexemes = %d, hashtable size = %d, usable entries = %d",
			 num_mcelem, bucket_width, lexeme_no, i, track_len);

		/*
		 * If we obtained more lexemes than we really want, get rid of those
		 * with least frequencies. The easiest way is to qsort the array into
		 * descending frequency order and truncate the array.
		 */
		if (num_mcelem < track_len)
		{
			qsort(sort_table, track_len, sizeof(TrackItem *),
				  trackitem_compare_frequencies_desc);
			/* reset minfreq to the smallest frequency we're keeping */
			minfreq = sort_table[num_mcelem - 1]->frequency;
		}
		else
			num_mcelem = track_len;

		/* Generate MCELEM slot entry */
		if (num_mcelem > 0)
		{
			MemoryContext old_context;
			Datum	   *mcelem_values;
			float4	   *mcelem_freqs;

			/*
			 * We want to store statistics sorted on the lexeme value using
			 * first length, then byte-for-byte comparison. The reason for
			 * doing length comparison first is that we don't care about the
			 * ordering so long as it's consistent, and comparing lengths
			 * first gives us a chance to avoid a strncmp() call.
			 *
			 * This is different from what we do with scalar statistics --
			 * they get sorted on frequencies. The rationale is that we
			 * usually search through most common elements looking for a
			 * specific value, so we can grab its frequency. When values are
			 * presorted we can employ binary search for that. See
			 * ts_selfuncs.c for a real usage scenario.
			 */
			qsort(sort_table, num_mcelem, sizeof(TrackItem *),
				  trackitem_compare_lexemes);

			/* Must copy the target values into anl_context */
			old_context = MemoryContextSwitchTo(stats->anl_context);

			/*
			 * We sorted statistics on the lexeme value, but we want to be
			 * able to find out the minimal and maximal frequency without
			 * going through all the values. We keep those two extra
			 * frequencies in two extra cells in mcelem_freqs.
			 *
			 * (Note: the MCELEM statistics slot definition allows for a
			 * third extra number containing the frequency of nulls, but we
			 * don't create that for a tsvector column, since null elements
			 * aren't possible.)
			 */
			mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
			mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));

			/*
			 * See comments above about use of nonnull_cnt as the divisor
			 * for the final frequency estimates.
			 */
			for (i = 0; i < num_mcelem; i++)
			{
				TrackItem  *item = sort_table[i];

				mcelem_values[i] =
					PointerGetDatum(cstring_to_text_with_len(item->key.lexeme,
															 item->key.length));
				mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt;
			}
			mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
			mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt;
			MemoryContextSwitchTo(old_context);

			stats->stakind[0] = STATISTIC_KIND_MCELEM;
			stats->staop[0] = TextEqualOperator;
			stats->stanumbers[0] = mcelem_freqs;
			/* See above comment about two extra frequency fields */
			stats->numnumbers[0] = num_mcelem + 2;
			stats->stavalues[0] = mcelem_values;
			stats->numvalues[0] = num_mcelem;
			/* We are storing text values */
			stats->statypid[0] = TEXTOID;
			stats->statyplen[0] = -1;	/* typlen, -1 for varlena */
			stats->statypbyval[0] = false;
			stats->statypalign[0] = 'i';
		}
	}
	else
	{
		/* We found only nulls; assume the column is entirely null */
		stats->stats_valid = true;
		stats->stanullfrac = 1.0;
		stats->stawidth = 0;	/* "unknown" */
		stats->stadistinct = 0.0;	/* "unknown" */
	}

	/*
	 * We don't need to bother cleaning up any of our temporary palloc's. The
	 * hashtable should also go away, as it used a child memory context.
	 */
}
/*
 * FaultInjector_SetFaultInjection
 *		Dispatch a fault-injector request based on entry->faultInjectorType.
 *
 *		- FaultInjectorTypeReset: remove one entry (or all entries, for
 *		  FaultInjectorIdAll) from the shared-memory fault hash.
 *		- FaultInjectorTypeStatus: report the state of one or all faults
 *		  into entry->bufOutput.
 *		- FaultInjectorTypeResume: poke an existing entry to resume.
 *		- anything else: insert a new fault entry.
 *
 * Returns STATUS_OK on success, STATUS_ERROR when the status request finds
 * no hash table (other failures are reported via the per-case paths).
 *
 * Fixes vs. previous revision:
 *	- The DEBUG1 "could not remove" message was missing a space between the
 *	  concatenated literals ("...from hashidentifier:..."); added it.
 *	- The status branch called snprintf() with entry->bufOutput as both the
 *	  destination and a "%s" source, which is undefined behavior (overlapping
 *	  copy).  Rewritten to append at the current end of the buffer instead;
 *	  the emitted text is unchanged.
 */
int
FaultInjector_SetFaultInjection(
			FaultInjectorEntry_s	*entry)
{
	int		status = STATUS_OK;
	bool	isRemoved = FALSE;

	getFileRepRoleAndState(&fileRepRole, &segmentState, &dataState, NULL, NULL);

	switch (entry->faultInjectorType) {
		case FaultInjectorTypeReset:
		{
			HASH_SEQ_STATUS			hash_status;
			FaultInjectorEntry_s	*entryLocal;

			if (entry->faultInjectorIdentifier == FaultInjectorIdAll)
			{
				/*
				 * Remove every fault entry.  dynahash permits deleting the
				 * entry just returned by hash_seq_search().
				 *
				 * NOTE(review): hash_seq_init() runs before LockAcquire();
				 * confirm no concurrent writer can mutate the hash between
				 * the two calls.
				 */
				hash_seq_init(&hash_status, faultInjectorShmem->hash);

				LockAcquire();

				while ((entryLocal = (FaultInjectorEntry_s *) hash_seq_search(&hash_status)) != NULL)
				{
					isRemoved = FaultInjector_RemoveHashEntry(entryLocal->faultInjectorIdentifier);
					if (isRemoved == TRUE) {
						faultInjectorShmem->faultInjectorSlots--;
					}
				}
				Assert(faultInjectorShmem->faultInjectorSlots == 0);
				LockRelease();
			}
			else
			{
				/* Remove just the one requested fault entry. */
				LockAcquire();
				isRemoved = FaultInjector_RemoveHashEntry(entry->faultInjectorIdentifier);
				if (isRemoved == TRUE) {
					faultInjectorShmem->faultInjectorSlots--;
				}
				LockRelease();
			}

			if (isRemoved == FALSE)
			{
				/* Not an error: the fault may simply never have been set. */
				ereport(DEBUG1,
						(errmsg("LOG(fault injector): could not remove fault injection from hash "
								"identifier:'%s' ",
								FaultInjectorIdentifierEnumToString[entry->faultInjectorIdentifier])));
			}

			break;
		}
		case FaultInjectorTypeStatus:
		{
			HASH_SEQ_STATUS			hash_status;
			FaultInjectorEntry_s	*entryLocal;
			bool					found = FALSE;

			if (faultInjectorShmem->hash == NULL)
			{
				status = STATUS_ERROR;
				break;
			}

			snprintf(entry->bufOutput, sizeof(entry->bufOutput), "Success: ");

			if (entry->faultInjectorIdentifier == ChangeTrackingCompactingReport)
			{
				snprintf(entry->bufOutput, sizeof(entry->bufOutput),
						 "Success: compacting in progress %s", "false");
				break;
			}

			hash_seq_init(&hash_status, faultInjectorShmem->hash);

			while ((entryLocal = (FaultInjectorEntry_s *) hash_seq_search(&hash_status)) != NULL)
			{
				/* Log every entry we walk past, matched or not. */
				ereport(LOG,
						(errmsg("fault injector status: "
								"fault name:'%s' "
								"fault type:'%s' "
								"ddl statement:'%s' "
								"database name:'%s' "
								"table name:'%s' "
								"occurrence:'%d' "
								"sleep time:'%d' "
								"fault injection state:'%s' ",
								FaultInjectorIdentifierEnumToString[entryLocal->faultInjectorIdentifier],
								FaultInjectorTypeEnumToString[entryLocal->faultInjectorType],
								FaultInjectorDDLEnumToString[entryLocal->ddlStatement],
								entryLocal->databaseName,
								entryLocal->tableName,
								entryLocal->occurrence,
								entryLocal->sleepTime,
								FaultInjectorStateEnumToString[entryLocal->faultInjectorState])));

				if (entry->faultInjectorIdentifier == entryLocal->faultInjectorIdentifier ||
					entry->faultInjectorIdentifier == FaultInjectorIdAll)
				{
					/*
					 * Append this entry's description to bufOutput.  Write
					 * at the current end of the string rather than passing
					 * bufOutput to itself as a "%s" argument (overlapping
					 * snprintf source/destination is undefined behavior).
					 */
					size_t	len = strlen(entry->bufOutput);

					snprintf(entry->bufOutput + len,
							 sizeof(entry->bufOutput) - len,
							 " \n"
							 "fault name:'%s' "
							 "fault type:'%s' "
							 "ddl statement:'%s' "
							 "database name:'%s' "
							 "table name:'%s' "
							 "occurrence:'%d' "
							 "sleep time:'%d' "
							 "fault injection state:'%s' ",
							 FaultInjectorIdentifierEnumToString[entryLocal->faultInjectorIdentifier],
							 FaultInjectorTypeEnumToString[entryLocal->faultInjectorType],
							 FaultInjectorDDLEnumToString[entryLocal->ddlStatement],
							 entryLocal->databaseName,
							 entryLocal->tableName,
							 entryLocal->occurrence,
							 entryLocal->sleepTime,
							 FaultInjectorStateEnumToString[entryLocal->faultInjectorState]);

					found = TRUE;
				}
			}

			if (found == FALSE)
			{
				snprintf(entry->bufOutput, sizeof(entry->bufOutput),
						 "Failure: "
						 "fault name:'%s' not set",
						 FaultInjectorIdentifierEnumToString[entry->faultInjectorIdentifier]);
			}
			break;
		}
		case FaultInjectorTypeResume:
			ereport(LOG,
					(errmsg("fault triggered, fault name:'%s' fault type:'%s' ",
							FaultInjectorIdentifierEnumToString[entry->faultInjectorIdentifier],
							FaultInjectorTypeEnumToString[entry->faultInjectorType])));

			FaultInjector_UpdateHashEntry(entry);
			break;

		default:
			/* Any other type means "install this fault". */
			status = FaultInjector_NewHashEntry(entry);
			break;
	}
	return status;
}
/* * purge_dropped_db_segments */ static void purge_dropped_db_segments(bool force) { static TimestampTz last_purge_time = 0; List *db_oids; List *dbs_to_remove = NIL; HASH_SEQ_STATUS status; broker_db_meta *db_meta; if (!force && !TimestampDifferenceExceeds(last_purge_time, GetCurrentTimestamp(), 10 * 1000)) /* 10s */ return; db_oids = get_database_oids(); LWLockAcquire(IPCMessageBrokerIndexLock, LW_SHARED); hash_seq_init(&status, broker_meta->db_meta_hash); while ((db_meta = (broker_db_meta *) hash_seq_search(&status)) != NULL) { bool found = false; ListCell *lc; foreach(lc, db_oids) { if (lfirst_oid(lc) == db_meta->dbid) { found = true; break; } } if (!found) dbs_to_remove = lappend_oid(dbs_to_remove, db_meta->dbid); } LWLockRelease(IPCMessageBrokerIndexLock); if (list_length(dbs_to_remove)) { ListCell *lc; LWLockAcquire(IPCMessageBrokerIndexLock, LW_EXCLUSIVE); foreach(lc, dbs_to_remove) { Oid dbid = lfirst_oid(lc); bool found; db_meta = hash_search(broker_meta->db_meta_hash, &dbid, HASH_FIND, &found); Assert(found); Assert(db_meta->handle > 0); /* detach from main db segment */ if (db_meta->segment) dsm_detach(db_meta->segment); if (db_meta->lqueues) { int i; for (i = 0; i < continuous_query_num_workers; i++) { local_queue *local_buf = &db_meta->lqueues[i]; if (local_buf->slots) list_free_deep(local_buf->slots); } pfree(db_meta->lqueues); } hash_search(broker_meta->db_meta_hash, &dbid, HASH_REMOVE, &found); Assert(found); } mark_unused_locks_as_free(db_oids); LWLockRelease(IPCMessageBrokerIndexLock); }
/* * compute_array_stats() -- compute statistics for an array column * * This function computes statistics useful for determining selectivity of * the array operators <@, &&, and @>. It is invoked by ANALYZE via the * compute_stats hook after sample rows have been collected. * * We also invoke the standard compute_stats function, which will compute * "scalar" statistics relevant to the btree-style array comparison operators. * However, exact duplicates of an entire array may be rare despite many * arrays sharing individual elements. This especially afflicts long arrays, * which are also liable to lack all scalar statistics due to the low * WIDTH_THRESHOLD used in analyze.c. So, in addition to the standard stats, * we find the most common array elements and compute a histogram of distinct * element counts. * * The algorithm used is Lossy Counting, as proposed in the paper "Approximate * frequency counts over data streams" by G. S. Manku and R. Motwani, in * Proceedings of the 28th International Conference on Very Large Data Bases, * Hong Kong, China, August 2002, section 4.2. The paper is available at * http://www.vldb.org/conf/2002/S10P03.pdf * * The Lossy Counting (aka LC) algorithm goes like this: * Let s be the threshold frequency for an item (the minimum frequency we * are interested in) and epsilon the error margin for the frequency. Let D * be a set of triples (e, f, delta), where e is an element value, f is that * element's frequency (actually, its current occurrence count) and delta is * the maximum error in f. We start with D empty and process the elements in * batches of size w. (The batch size is also known as "bucket size" and is * equal to 1/epsilon.) Let the current batch number be b_current, starting * with 1. For each element e we either increment its f count, if it's * already in D, or insert a new___ triple into D with values (e, 1, b_current * - 1). 
After processing each batch we prune D, by removing from it all * elements with f + delta <= b_current. After the algorithm finishes we * suppress all elements from D that do not satisfy f >= (s - epsilon) * N, * where N is the total number of elements in the input. We emit the * remaining elements with estimated frequency f/N. The LC paper proves * that this algorithm finds all elements with true frequency at least s, * and that no frequency is overestimated or is underestimated by more than * epsilon. Furthermore, given reasonable assumptions about the input * distribution, the required table size is no more than about 7 times w. * * In the absence of a principled basis for other particular values, we * follow ts_typanalyze() and use parameters s = 0.07/K, epsilon = s/10. * But we leave out the correction for stopwords, which do not apply to * arrays. These parameters give bucket width w = K/0.007 and maximum * expected hashtable size of about 1000 * K. * * Elements may repeat within an array. Since duplicates do not change the * behavior of <@, && or @>, we want to count each element only once per * array. Therefore, we store in the finished pg_statistic entry each * element's frequency as the fraction of all non-null rows that contain it. * We divide the raw counts by nonnull_cnt to get those figures. */ static void compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows) { ArrayAnalyzeExtraData *extra_data; int num_mcelem; int null_cnt = 0; int null_elem_cnt = 0; int analyzed_rows = 0; /* This is D from the LC algorithm. 
*/ HTAB *elements_tab; HASHCTL elem_hash_ctl; HASH_SEQ_STATUS scan_status; /* This is the current bucket number from the LC algorithm */ int b_current; /* This is 'w' from the LC algorithm */ int bucket_width; int array_no; int64 element_no; TrackItem *item; int slot_idx; HTAB *count_tab; HASHCTL count_hash_ctl; DECountItem *count_item; extra_data = (ArrayAnalyzeExtraData *) stats->extra_data; /* * Invoke analyze.c's standard analysis function to create scalar-style * stats for the column. It will expect its own extra_data pointer, so * temporarily install that. */ stats->extra_data = extra_data->std_extra_data; (*extra_data->std_compute_stats) (stats, fetchfunc, samplerows, totalrows); stats->extra_data = extra_data; /* * Set up static pointer for use by subroutines. We wait till here in * case std_compute_stats somehow recursively invokes us (probably not * possible, but ...) */ array_extra_data = extra_data; /* * We want statistics_target * 10 elements in the MCELEM array. This * multiplier is pretty arbitrary, but is meant to reflect the fact that * the number of individual elements tracked in pg_statistic ought to be * more than the number of values for a simple scalar column. */ num_mcelem = stats->attr->attstattarget * 10; /* * We set bucket width equal to num_mcelem / 0.007 as per the comment * above. */ bucket_width = num_mcelem * 1000 / 7; /* * Create the hashtable. It will be in local memory, so we don't need to * worry about overflowing the initial size. Also we don't need to pay any * attention to locking and memory management. 
*/ MemSet(&elem_hash_ctl, 0, sizeof(elem_hash_ctl)); elem_hash_ctl.keysize = sizeof(Datum); elem_hash_ctl.entrysize = sizeof(TrackItem); elem_hash_ctl.hash = element_hash; elem_hash_ctl.match = element_match; elem_hash_ctl.hcxt = CurrentMemoryContext; elements_tab = hash_create("Analyzed elements table", num_mcelem, &elem_hash_ctl, HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT); /* hashtable for array distinct elements counts */ MemSet(&count_hash_ctl, 0, sizeof(count_hash_ctl)); count_hash_ctl.keysize = sizeof(int); count_hash_ctl.entrysize = sizeof(DECountItem); count_hash_ctl.hcxt = CurrentMemoryContext; count_tab = hash_create("Array distinct element count table", 64, &count_hash_ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); /* Initialize counters. */ b_current = 1; element_no = 0; /* Loop over the arrays. */ for (array_no = 0; array_no < samplerows; array_no++) { Datum value; bool isnull; ArrayType *array; int num_elems; Datum *elem_values; bool *elem_nulls; bool null_present; int j; int64 prev_element_no = element_no; int distinct_count; bool count_item_found; vacuum_delay_point(); value = fetchfunc(stats, array_no, &isnull); if (isnull) { /* array is null, just count that */ null_cnt++; continue; } /* Skip too-large values. */ if (toast_raw_datum_size(value) > ARRAY_WIDTH_THRESHOLD) continue; else analyzed_rows++; /* * Now detoast the array if needed, and deconstruct into datums. */ array = DatumGetArrayTypeP(value); Assert(ARR_ELEMTYPE(array) == extra_data->type_id); deconstruct_array(array, extra_data->type_id, extra_data->typlen, extra_data->typbyval, extra_data->typalign, &elem_values, &elem_nulls, &num_elems); /* * We loop through the elements in the array and add them to our * tracking hashtable. 
*/ null_present = false; for (j = 0; j < num_elems; j++) { Datum elem_value; bool found; /* No null element processing other than flag setting here */ if (elem_nulls[j]) { null_present = true; continue; } /* Lookup current element in hashtable, adding it if new___ */ elem_value = elem_values[j]; item = (TrackItem *) hash_search(elements_tab, (const void *) &elem_value, HASH_ENTER, &found); if (found) { /* The element value is already on the tracking list */ /* * The operators we assist ignore duplicate array elements, so * count a given distinct element only once per array. */ if (item->last_container == array_no) continue; item->frequency++; item->last_container = array_no; } else { /* Initialize new___ tracking list element */ /* * If element type is pass-by-reference, we must copy it into * palloc'd space, so that we can release the array below. (We * do this so that the space needed for element values is * limited by the size of the hashtable; if we kept all the * array values around, it could be much more.) */ item->key = datumCopy(elem_value, extra_data->typbyval, extra_data->typlen); item->frequency = 1; item->delta = b_current - 1; item->last_container = array_no; } /* element_no is the number of elements processed (ie N) */ element_no++; /* We prune the D structure after processing each bucket */ if (element_no % bucket_width == 0) { prune_element_hashtable(elements_tab, b_current); b_current++; } } /* Count null element presence once per array. */ if (null_present) null_elem_cnt++; /* Update frequency of the particular array distinct element count. */ distinct_count = (int) (element_no - prev_element_no); count_item = (DECountItem *) hash_search(count_tab, &distinct_count, HASH_ENTER, &count_item_found); if (count_item_found) count_item->frequency++; else count_item->frequency = 1; /* Free memory allocated while detoasting. 
*/ if (PointerGetDatum(array) != value) pfree(array); pfree(elem_values); pfree(elem_nulls); } /* Skip pg_statistic slots occupied by standard statistics */ slot_idx = 0; while (slot_idx < STATISTIC_NUM_SLOTS && stats->stakind[slot_idx] != 0) slot_idx++; if (slot_idx > STATISTIC_NUM_SLOTS - 2) elog(ERROR, "insufficient pg_statistic slots for array stats"); /* We can only compute real stats if we found some non-null values. */ if (analyzed_rows > 0) { int nonnull_cnt = analyzed_rows; int count_items_count; int i; TrackItem **sort_table; int track_len; int64 cutoff_freq; int64 minfreq, maxfreq; /* * We assume the standard stats code already took care of setting * stats_valid, stanullfrac, stawidth, stadistinct. We'd have to * re-compute those values if we wanted to not store the standard * stats. */ /* * Construct an array of the interesting hashtable items, that is, * those meeting the cutoff frequency (s - epsilon)*N. Also identify * the minimum and maximum frequencies among these items. * * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff * frequency is 9*N / bucket_width. */ cutoff_freq = 9 * element_no / bucket_width; i = hash_get_num_entries(elements_tab); /* surely enough space */ sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i); hash_seq_init(&scan_status, elements_tab); track_len = 0; minfreq = element_no; maxfreq = 0; while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL) { if (item->frequency > cutoff_freq) { sort_table[track_len++] = item; minfreq = Min(minfreq, item->frequency); maxfreq = Max(maxfreq, item->frequency); } } Assert(track_len <= i); /* emit some statistics for debug purposes */ elog(DEBUG3, "compute_array_stats: target # mces = %d, " "bucket width = %d, " "# elements = " INT64_FORMAT ", hashtable size = %d, " "usable entries = %d", num_mcelem, bucket_width, element_no, i, track_len); /* * If we obtained more elements than we really want, get rid of those * with least frequencies. 
The easiest way is to qsort the array into * descending frequency order and truncate the array. */ if (num_mcelem < track_len) { qsort(sort_table, track_len, sizeof(TrackItem *), trackitem_compare_frequencies_desc); /* reset minfreq to the smallest frequency we're keeping */ minfreq = sort_table[num_mcelem - 1]->frequency; } else num_mcelem = track_len; /* Generate MCELEM slot entry */ if (num_mcelem > 0) { MemoryContext old_context; Datum *mcelem_values; float4 *mcelem_freqs; /* * We want to store statistics sorted on the element value using * the element type's default comparison function. This permits * fast binary searches in selectivity estimation functions. */ qsort(sort_table, num_mcelem, sizeof(TrackItem *), trackitem_compare_element); /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); /* * We sorted statistics on the element value, but we want to be * able to find the minimal and maximal frequencies without going * through all the values. We also want the frequency of null * elements. Store these three values at the end of mcelem_freqs. */ mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum)); mcelem_freqs = (float4 *) palloc((num_mcelem + 3) * sizeof(float4)); /* * See comments above about use of nonnull_cnt as the divisor for * the final frequency estimates. 
*/ for (i = 0; i < num_mcelem; i++) { TrackItem *item = sort_table[i]; mcelem_values[i] = datumCopy(item->key, extra_data->typbyval, extra_data->typlen); mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt; } mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt; mcelem_freqs[i++] = (double) maxfreq / (double) nonnull_cnt; mcelem_freqs[i++] = (double) null_elem_cnt / (double) nonnull_cnt; MemoryContextSwitchTo(old_context); stats->stakind[slot_idx] = STATISTIC_KIND_MCELEM; stats->staop[slot_idx] = extra_data->eq_opr; stats->stanumbers[slot_idx] = mcelem_freqs; /* See above comment about extra stanumber entries */ stats->numnumbers[slot_idx] = num_mcelem + 3; stats->stavalues[slot_idx] = mcelem_values; stats->numvalues[slot_idx] = num_mcelem; /* We are storing values of element type */ stats->statypid[slot_idx] = extra_data->type_id; stats->statyplen[slot_idx] = extra_data->typlen; stats->statypbyval[slot_idx] = extra_data->typbyval; stats->statypalign[slot_idx] = extra_data->typalign; slot_idx++; } /* Generate DECHIST slot entry */ count_items_count = hash_get_num_entries(count_tab); if (count_items_count > 0) { int num_hist = stats->attr->attstattarget; DECountItem **sorted_count_items; int j; int delta; int64 frac; float4 *hist; /* num_hist must be at least 2 for the loop below to work */ num_hist = Max(num_hist, 2); /* * Create an array of DECountItem pointers, and sort them into * increasing count order. */ sorted_count_items = (DECountItem **) palloc(sizeof(DECountItem *) * count_items_count); hash_seq_init(&scan_status, count_tab); j = 0; while ((count_item = (DECountItem *) hash_seq_search(&scan_status)) != NULL) { sorted_count_items[j++] = count_item; } qsort(sorted_count_items, count_items_count, sizeof(DECountItem *), countitem_compare_count); /* * Prepare to fill stanumbers with the histogram, followed by the * average count. This array must be stored in anl_context. 
*/ hist = (float4 *) MemoryContextAlloc(stats->anl_context, sizeof(float4) * (num_hist + 1)); hist[num_hist] = (double) element_no / (double) nonnull_cnt; /*---------- * Construct the histogram of distinct-element counts (DECs). * * The object of this loop is to copy the min and max DECs to * hist[0] and hist[num_hist - 1], along with evenly-spaced DECs * in between (where "evenly-spaced" is with reference to the * whole input population of arrays). If we had a complete sorted * array of DECs, one per analyzed row, the i'th hist value would * come from DECs[i * (analyzed_rows - 1) / (num_hist - 1)] * (compare the histogram-making loop in compute_scalar_stats()). * But instead of that we have the sorted_count_items[] array, * which holds unique DEC values with their frequencies (that is, * a run-length-compressed version of the full array). So we * control advancing through sorted_count_items[] with the * variable "frac", which is defined as (x - y) * (num_hist - 1), * where x is the index in the notional DECs array corresponding * to the start of the next sorted_count_items[] element's run, * and y is the index in DECs from which we should take the next * histogram value. We have to advance whenever x <= y, that is * frac <= 0. The x component is the sum of the frequencies seen * so far (up through the current sorted_count_items[] element), * and of course y * (num_hist - 1) = i * (analyzed_rows - 1), * per the subscript calculation above. (The subscript calculation * implies dropping any fractional part of y; in this formulation * that's handled by not advancing until frac reaches 1.) * * Even though frac has a bounded range, it could overflow int32 * when working with very large statistics targets, so we do that * math in int64. 
*---------- */ delta = analyzed_rows - 1; j = 0; /* current index in sorted_count_items */ /* Initialize frac for sorted_count_items[0]; y is initially 0 */ frac = (int64) sorted_count_items[0]->frequency * (num_hist - 1); for (i = 0; i < num_hist; i++) { while (frac <= 0) { /* Advance, and update x component of frac */ j++; frac += (int64) sorted_count_items[j]->frequency * (num_hist - 1); } hist[i] = sorted_count_items[j]->count; frac -= delta; /* update y for upcoming i increment */ } Assert(j == count_items_count - 1); stats->stakind[slot_idx] = STATISTIC_KIND_DECHIST; stats->staop[slot_idx] = extra_data->eq_opr; stats->stanumbers[slot_idx] = hist; stats->numnumbers[slot_idx] = num_hist + 1; slot_idx++; } } /* * We don't need to bother cleaning up any of our temporary palloc's. The * hashtable should also go away, as it used a child memory context. */ }
/*
 * pgfdw_xact_callback --- cleanup at main-transaction end.
 *
 * Registered as a transaction callback; closes any remote transactions left
 * open on cached connections, resets per-connection transaction state, and
 * discards connections that are no longer in a clean idle state.
 *
 * event: which phase of local transaction end we are in (pre-commit, abort,
 *        prepare, etc.) -- drives what is sent to the remote side.
 * arg:   unused callback payload.
 */
static void
pgfdw_xact_callback(XactEvent event, void *arg)
{
	HASH_SEQ_STATUS scan;
	ConnCacheEntry *entry;

	/* Quick exit if no connections were touched in this transaction. */
	if (!xact_got_connection)
		return;

	/*
	 * Scan all connection cache entries to find open remote transactions, and
	 * close them.
	 */
	hash_seq_init(&scan, ConnectionHash);
	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
	{
		Jresult    *res;

		/* Ignore cache entry if no open connection right now */
		if (entry->conn == NULL)
			continue;

		/* If it has an open remote transaction, try to close it */
		if (entry->xact_depth > 0)
		{
			elog(DEBUG3, "closing remote transaction on connection %p",
				 entry->conn);

			switch (event)
			{
				case XACT_EVENT_PRE_COMMIT:
					/* Commit all remote transactions during pre-commit */
					do_sql_command(entry->conn, "COMMIT TRANSACTION");

					/*
					 * If there were any errors in subtransactions, and we
					 * made prepared statements, do a DEALLOCATE ALL to make
					 * sure we get rid of all prepared statements. This is
					 * annoying and not terribly bulletproof, but it's
					 * probably not worth trying harder.
					 *
					 * DEALLOCATE ALL only exists in 8.3 and later, so this
					 * constrains how old a server jdbc2_fdw can
					 * communicate with. We intentionally ignore errors in
					 * the DEALLOCATE, so that we can hobble along to some
					 * extent with older servers (leaking prepared statements
					 * as we go; but we don't really support update operations
					 * pre-8.3 anyway).
					 */
					if (entry->have_prep_stmt && entry->have_error)
					{
						res = JQexec(entry->conn, "DEALLOCATE ALL");
						JQclear(res);
					}
					entry->have_prep_stmt = false;
					entry->have_error = false;
					break;
				case XACT_EVENT_PRE_PREPARE:

					/*
					 * We disallow remote transactions that modified anything,
					 * since it's not very reasonable to hold them open until
					 * the prepared transaction is committed. For the moment,
					 * throw error unconditionally; later we might allow
					 * read-only cases. Note that the error will cause us to
					 * come right back here with event == XACT_EVENT_ABORT, so
					 * we'll clean up the connection state at that point.
					 */
					ereport(ERROR,
							(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
							 errmsg("cannot prepare a transaction that modified remote tables")));
					break;
				case XACT_EVENT_COMMIT:
				case XACT_EVENT_PREPARE:
					/* Pre-commit should have closed the open transaction */
					elog(ERROR, "missed cleaning up connection during pre-commit");
					break;
				case XACT_EVENT_ABORT:
					/* Assume we might have lost track of prepared statements */
					entry->have_error = true;
					/* If we're aborting, abort all remote transactions too */
					res = JQexec(entry->conn, "ABORT TRANSACTION");
					/* Note: can't throw ERROR, it would be infinite loop */
					if (JQresultStatus(res) != PGRES_COMMAND_OK)
						/*
						 * NOTE(review): the 'true' argument presumably tells
						 * pgfdw_report_error to clear res -- confirm, else
						 * this branch leaks the result.
						 */
						pgfdw_report_error(WARNING, res, entry->conn, true,
										   "ABORT TRANSACTION");
					else
					{
						JQclear(res);

						/* As above, make sure to clear any prepared stmts */
						if (entry->have_prep_stmt && entry->have_error)
						{
							res = JQexec(entry->conn, "DEALLOCATE ALL");
							JQclear(res);
						}
						entry->have_prep_stmt = false;
						entry->have_error = false;
					}
					break;
			}
		}

		/* Reset state to show we're out of a transaction */
		entry->xact_depth = 0;

		/*
		 * If the connection isn't in a good idle state, discard it to
		 * recover. Next GetConnection will open a new connection.
		 */
		if (JQstatus(entry->conn) != CONNECTION_OK ||
			JQtransactionStatus(entry->conn) != PQTRANS_IDLE)
		{
			elog(DEBUG3, "discarding connection %p", entry->conn);
			JQfinish(entry->conn);
			entry->conn = NULL;
		}
	}

	/*
	 * Regardless of the event type, we can now mark ourselves as out of the
	 * transaction. (Note: if we are here during PRE_COMMIT or PRE_PREPARE,
	 * this saves a useless scan of the hashtable during COMMIT or PREPARE.)
	 */
	xact_got_connection = false;

	/* Also reset cursor numbering for next transaction */
	cursor_number = 0;
}
/*
 * pgfdw_subxact_callback --- cleanup at subtransaction end.
 *
 * On subtransaction pre-commit, releases the matching remote savepoint;
 * on subtransaction abort, rolls back to (and releases) that savepoint.
 * Either way, each affected cached connection's xact_depth is decremented.
 */
static void
pgfdw_subxact_callback(SubXactEvent event, SubTransactionId mySubid,
					   SubTransactionId parentSubid, void *arg)
{
	HASH_SEQ_STATUS hss;
	ConnCacheEntry *cache_entry;
	int			nest_level;

	/* We act only when a subtransaction is about to commit, or has aborted. */
	if (event != SUBXACT_EVENT_PRE_COMMIT_SUB &&
		event != SUBXACT_EVENT_ABORT_SUB)
		return;

	/* Nothing to do unless this transaction actually touched a connection. */
	if (!xact_got_connection)
		return;

	nest_level = GetCurrentTransactionNestLevel();

	/*
	 * Walk the connection cache, closing every remote subtransaction that is
	 * open at the current nesting level.
	 */
	hash_seq_init(&hss, ConnectionHash);
	while ((cache_entry = (ConnCacheEntry *) hash_seq_search(&hss)))
	{
		Jresult    *result;
		char		command[100];

		/* Skip closed connections and shallower subtransaction depths. */
		if (cache_entry->conn == NULL || cache_entry->xact_depth < nest_level)
			continue;

		/* A depth beyond the current level means an earlier cleanup was missed. */
		if (cache_entry->xact_depth > nest_level)
			elog(ERROR, "missed cleaning up remote subtransaction at level %d",
				 cache_entry->xact_depth);

		if (event == SUBXACT_EVENT_PRE_COMMIT_SUB)
		{
			/* Commit all remote subtransactions during pre-commit */
			snprintf(command, sizeof(command),
					 "RELEASE SAVEPOINT s%d", nest_level);
			do_sql_command(cache_entry->conn, command);
		}
		else
		{
			/* Assume we might have lost track of prepared statements */
			cache_entry->have_error = true;

			/* Rollback all remote subtransactions during abort */
			snprintf(command, sizeof(command),
					 "ROLLBACK TO SAVEPOINT s%d; RELEASE SAVEPOINT s%d",
					 nest_level, nest_level);
			result = JQexec(cache_entry->conn, command);
			if (JQresultStatus(result) != PGRES_COMMAND_OK)
				pgfdw_report_error(WARNING, result, cache_entry->conn, true,
								   command);
			else
				JQclear(result);
		}

		/* OK, we're outta that level of subtransaction */
		cache_entry->xact_depth--;
	}
}
void ContQuerySchedulerMain(int argc, char *argv[]) { sigjmp_buf local_sigjmp_buf; List *dbs = NIL; /* we are a postmaster subprocess now */ IsUnderPostmaster = true; am_cont_scheduler = true; /* reset MyProcPid */ MyProcPid = getpid(); MyPMChildSlot = AssignPostmasterChildSlot(); /* record Start Time for logging */ MyStartTime = time(NULL); /* Identify myself via ps */ init_ps_display("continuous query scheduler process", "", "", ""); ereport(LOG, (errmsg("continuous query scheduler started"))); if (PostAuthDelay) pg_usleep(PostAuthDelay * 1000000L); SetProcessingMode(InitProcessing); /* * If possible, make this process a group leader, so that the postmaster * can signal any child processes too. This is only for consistency sake, we * never fork the scheduler process. Instead dynamic bgworkers are used. */ #ifdef HAVE_SETSID if (setsid() < 0) elog(FATAL, "setsid() failed: %m"); #endif /* * Set up signal handlers. We operate on databases much like a regular * backend, so we use the same signal handling. See equivalent code in * tcop/postgres.c. */ pqsignal(SIGHUP, sighup_handler); pqsignal(SIGINT, sigint_handler); pqsignal(SIGTERM, sigterm_handler); pqsignal(SIGQUIT, quickdie); InitializeTimeouts(); /* establishes SIGALRM handler */ pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGUSR2, sigusr2_handler); pqsignal(SIGFPE, FloatExceptionHandler); pqsignal(SIGCHLD, SIG_DFL); #define BACKTRACE_SEGFAULTS #ifdef BACKTRACE_SEGFAULTS pqsignal(SIGSEGV, debug_segfault); #endif /* Early initialization */ BaseInit(); /* * Create a per-backend PGPROC struct in shared memory, except in the * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do * this before we can use LWLocks (and in the EXEC_BACKEND case we already * had to do some stuff with LWLocks). 
*/ #ifndef EXEC_BACKEND InitProcess(); #endif InitPostgres(NULL, InvalidOid, NULL, NULL); SetProcessingMode(NormalProcessing); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. */ ContQuerySchedulerMemCxt = AllocSetContextCreate(TopMemoryContext, "ContQuerySchedulerCtx", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(ContQuerySchedulerMemCxt); /* * If an exception is encountered, processing resumes here. * * This code is a stripped down version of PostgresMain error recovery. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevents interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Forget any pending QueryCancel or timeout request */ disable_all_timeouts(false); QueryCancelPending = false; /* second to avoid race condition */ /* Report the error to the server log */ EmitErrorReport(); /* Abort the current transaction in order to recover */ AbortCurrentTransaction(); /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(ContQuerySchedulerMemCxt); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(ContQuerySchedulerMemCxt); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. We don't want to be * filling the error logs as fast as we can. 
*/ pg_usleep(1000000L); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* must unblock signals before calling rebuild_database_list */ PG_SETMASK(&UnBlockSig); ContQuerySchedulerShmem->scheduler_pid = MyProcPid; dbs = get_database_list(); /* Loop forever */ for (;;) { ListCell *lc; int rc; foreach(lc, dbs) { DatabaseEntry *db_entry = lfirst(lc); bool found; ContQueryProcGroup *grp = hash_search(ContQuerySchedulerShmem->proc_table, &db_entry->oid, HASH_ENTER, &found); /* If we don't have an entry for this dboid, initialize a new one and fire off bg procs */ if (!found) { grp->db_oid = db_entry->oid; namestrcpy(&grp->db_name, NameStr(db_entry->name)); start_group(grp); } } /* Allow sinval catchup interrupts while sleeping */ EnableCatchupInterrupt(); /* * Wait until naptime expires or we get some type of signal (all the * signal handlers will wake us by calling SetLatch). */ rc = WaitLatch(&MyProc->procLatch, WL_LATCH_SET | WL_POSTMASTER_DEATH, 0); ResetLatch(&MyProc->procLatch); DisableCatchupInterrupt(); /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (rc & WL_POSTMASTER_DEATH) proc_exit(1); /* the normal shutdown case */ if (got_SIGTERM) break; /* update config? */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); /* update tuning parameters, so that they can be read downstream by background processes */ update_tuning_params(); } /* terminate a proc group? */ if (got_SIGUSR2) { HASH_SEQ_STATUS status; ContQueryProcGroup *grp; got_SIGUSR2 = false; hash_seq_init(&status, ContQuerySchedulerShmem->proc_table); while ((grp = (ContQueryProcGroup *) hash_seq_search(&status)) != NULL) { ListCell *lc; if (!grp->terminate) continue; foreach(lc, dbs) { DatabaseEntry *entry = lfirst(lc); if (entry->oid == grp->db_oid) { dbs = list_delete(dbs, entry); break; } } terminate_group(grp); } }
/*
 * pgfdw_subxact_callback --- cleanup at subtransaction end.
 *
 * On subtransaction pre-commit, releases the remote savepoint for the
 * current nesting level; on abort, cancels any still-running remote command
 * and rolls back to that savepoint. Decrements xact_depth on each affected
 * cached connection.
 *
 * mySubid/parentSubid/arg: unused here; present to match the callback API.
 */
static void
pgfdw_subxact_callback(SubXactEvent event, SubTransactionId mySubid,
					   SubTransactionId parentSubid, void *arg)
{
	HASH_SEQ_STATUS scan;
	ConnCacheEntry *entry;
	int			curlevel;

	/* Nothing to do at subxact start, nor after commit. */
	if (!(event == SUBXACT_EVENT_PRE_COMMIT_SUB ||
		  event == SUBXACT_EVENT_ABORT_SUB))
		return;

	/* Quick exit if no connections were touched in this transaction. */
	if (!xact_got_connection)
		return;

	/*
	 * Scan all connection cache entries to find open remote subtransactions
	 * of the current level, and close them.
	 */
	curlevel = GetCurrentTransactionNestLevel();
	hash_seq_init(&scan, ConnectionHash);
	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
	{
		PGresult   *res;
		char		sql[100];

		/*
		 * We only care about connections with open remote subtransactions of
		 * the current level.
		 */
		if (entry->conn == NULL || entry->xact_depth < curlevel)
			continue;

		/* Depth greater than current level means a missed earlier cleanup. */
		if (entry->xact_depth > curlevel)
			elog(ERROR, "missed cleaning up remote subtransaction at level %d",
				 entry->xact_depth);

		if (event == SUBXACT_EVENT_PRE_COMMIT_SUB)
		{
			/* Commit all remote subtransactions during pre-commit */
			snprintf(sql, sizeof(sql), "RELEASE SAVEPOINT s%d", curlevel);
			do_sql_command(entry->conn, sql);
		}
		else
		{
			/* Assume we might have lost track of prepared statements */
			entry->have_error = true;

			/*
			 * If a command has been submitted to the remote server by using
			 * an asynchronous execution function, the command might not have
			 * yet completed. Check to see if a command is still being
			 * processed by the remote server, and if so, request cancellation
			 * of the command.
			 */
			if (PQtransactionStatus(entry->conn) == PQTRANS_ACTIVE)
			{
				PGcancel   *cancel;
				char		errbuf[256];

				if ((cancel = PQgetCancel(entry->conn)))
				{
					/* Cancel failure is only worth a warning; we roll back anyway. */
					if (!PQcancel(cancel, errbuf, sizeof(errbuf)))
						ereport(WARNING,
								(errcode(ERRCODE_CONNECTION_FAILURE),
								 errmsg("could not send cancel request: %s",
										errbuf)));
					PQfreeCancel(cancel);
				}
			}

			/* Rollback all remote subtransactions during abort */
			snprintf(sql, sizeof(sql),
					 "ROLLBACK TO SAVEPOINT s%d; RELEASE SAVEPOINT s%d",
					 curlevel, curlevel);
			res = PQexec(entry->conn, sql);
			if (PQresultStatus(res) != PGRES_COMMAND_OK)
				/*
				 * NOTE(review): pgfdw_report_error's 'true' argument is
				 * presumed to free res -- confirm against its definition.
				 */
				pgfdw_report_error(WARNING, res, entry->conn, true, sql);
			else
				PQclear(res);
		}

		/* OK, we're outta that level of subtransaction */
		entry->xact_depth--;
	}
}
/*
 * mdsync() -- Sync previous writes to stable storage.
 *
 * Processes every pending fsync request recorded in pendingOpsTable,
 * fsync'ing each affected segment file and removing the table entry on
 * success. Entries added after this call starts are deliberately skipped
 * (they belong to the next checkpoint cycle). Errors other than
 * possibly-deleted files are raised via ereport(ERROR), leaving
 * mdsync_in_progress set so the next call can repair stale cycle counters.
 */
void
mdsync(void)
{
	/* Tracks whether a previous mdsync() attempt failed partway through. */
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	int			absorb_counter;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		elog(ERROR, "cannot sync without a pendingOpsTable");

	/*
	 * If we are in the bgwriter, the sync had better include all fsync
	 * requests that were queued by backends up to this point. The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync(). We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead. We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of mdsync_cycle_ctr. However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again. Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have. To forestall wraparound, any time the
	 * previous mdsync() failed to complete, run through the table and
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
	if (mdsync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	mdsync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	mdsync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		/*
		 * If the entry is new then don't process it this time. Note that
		 * "continue" bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == mdsync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

		/*
		 * If fsync is off then we don't have to bother opening the file at
		 * all. (We delay checking until this point so that changing fsync on
		 * the fly behaves sensibly.) Also, if the entry is marked canceled,
		 * fall through to delete it.
		 */
		if (enableFsync && !entry->canceled)
		{
			int			failures;

			/*
			 * If in bgwriter, we want to absorb pending requests every so
			 * often to prevent overflow of the fsync request queue. It is
			 * unspecified whether newly-added entries will be visited by
			 * hash_seq_search, but we don't care since we don't need to
			 * process them anyway.
			 */
			if (--absorb_counter <= 0)
			{
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;
			}

			/*
			 * The fsync table could contain requests to fsync segments that
			 * have been deleted (unlinked) by the time we get to them. Rather
			 * than just hoping an ENOENT (or EACCES on Windows) error can be
			 * ignored, what we do on error is absorb pending requests and
			 * then retry. Since mdunlink() queues a "revoke" message before
			 * actually unlinking, the fsync request is guaranteed to be
			 * marked canceled after the absorb if it really was this case.
			 * DROP DATABASE likewise has to tell us to forget fsync requests
			 * before it starts deletions.
			 */
			for (failures = 0;; failures++)		/* loop exits at "break" */
			{
				SMgrRelation reln;
				MdfdVec    *seg;
				char	   *path;

				/*
				 * Find or create an smgr hash entry for this relation. This
				 * may seem a bit unclean -- md calling smgr? But it's really
				 * the best solution. It ensures that the open file reference
				 * isn't permanently leaked if we get an error here. (You may
				 * say "but an unreferenced SMgrRelation is still a leak!" Not
				 * really, because the only case in which a checkpoint is done
				 * by a process that isn't about to shut down is in the
				 * bgwriter, and it will periodically do smgrcloseall(). This
				 * fact justifies our not closing the reln in the success path
				 * either, which is a good thing since in non-bgwriter cases
				 * we couldn't safely do that.) Furthermore, in many cases
				 * the relation will have been dirtied through this same smgr
				 * relation, and so we can save a file open/close cycle.
				 */
				reln = smgropen(entry->tag.rnode.node,
								entry->tag.rnode.backend);

				/*
				 * It is possible that the relation has been dropped or
				 * truncated since the fsync request was entered. Therefore,
				 * allow ENOENT, but only if we didn't fail already on this
				 * file. This applies both during _mdfd_getseg() and during
				 * FileSync, since fd.c might have closed the file behind our
				 * back.
				 */
				seg = _mdfd_getseg(reln, entry->tag.forknum,
								   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
								   false, EXTENSION_RETURN_NULL);
				if (seg != NULL &&
					FileSync(seg->mdfd_vfd) >= 0)
					break;		/* success; break out of retry loop */

				/*
				 * XXX is there any point in allowing more than one retry?
				 * Don't see one at the moment, but easy to change the test
				 * here if so.
				 */
				path = _mdfd_segpath(reln, entry->tag.forknum, entry->tag.segno);
				if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not fsync file \"%s\": %m", path)));
				else
					ereport(DEBUG1,
							(errcode_for_file_access(),
							 errmsg("could not fsync file \"%s\" but retrying: %m",
									path)));
				pfree(path);

				/*
				 * Absorb incoming requests and check to see if canceled.
				 */
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;		/* might as well... */

				if (entry->canceled)
					break;
			}					/* end retry loop */
		}

		/*
		 * If we get here, either we fsync'd successfully, or we don't have to
		 * because enableFsync is off, or the entry is (now) marked canceled.
		 * Okay to delete it.
		 */
		if (hash_search(pendingOpsTable, &entry->tag,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}							/* end loop over hashtable entries */

	/* Flag successful completion of mdsync */
	mdsync_in_progress = false;
}
/*
 * Activate a standby master by removing reference to the dead master
 * and changing our dbid to the old master's dbid.
 *
 * oldmaster: dbid of the dead master whose entries are being retired.
 * newmaster: dbid of the standby being promoted in its place.
 *
 * Fix vs. prior revision: the shared filespace hash entries were being
 * mutated, and PersistentFileSysObj_ActivateStandby() invoked, without
 * holding the filespace hash lock; and the call used pointers into the
 * shared entry. We now take WRITE_FILESPACE_HASH_LOCK, copy the TID and
 * serial number into locals, and drop the hash lock around the persistent
 * table change (which can do file I/O and acquire other LW locks) to avoid
 * deadlock. This is safe because PersistentObjLock is still held in
 * exclusive mode, and any change to the filespace shared hash table is also
 * protected by PersistentObjLock.
 */
void
PersistentFilespace_ActivateStandby(int16 oldmaster, int16 newmaster)
{
	HASH_SEQ_STATUS hstat;
	FilespaceDirEntry fde;
	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	/* Persistent table changes are not allowed before persistence startup. */
	if (Persistent_BeforePersistenceWork())
		elog(ERROR, "persistent table changes forbidden");

	hash_seq_init(&hstat, persistentFilespaceSharedHashTable);

	PersistentFilespace_VerifyInitScan();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	/*
	 * Hold the filespace hash lock only while touching the shared hash
	 * entry; release it across PersistentFileSysObj_ActivateStandby(),
	 * which may perform file I/O and take other LW locks.
	 */
	WRITE_FILESPACE_HASH_LOCK;
	while ((fde = hash_seq_search(&hstat)) != NULL)
	{
		Oid			filespace = fde->key.filespaceOid;
		PersistentFileSysObjName fsObjName;
		ItemPointerData persistentTid;
		int64		persistentSerialNum = fde->persistentSerialNum;

		/* Snapshot identifying fields before the hash lock is released. */
		ItemPointerCopy(&fde->persistentTid, &persistentTid);

		PersistentFileSysObjName_SetFilespaceDir(&fsObjName, filespace);

		if (fde->dbId1 == oldmaster)
		{
			fde->dbId1 = InvalidDbid;
			fde->dbId2 = newmaster;

			/* Copy standby filespace location into new master location */
			PersistentFilespace_BlankPadCopyLocation(
												fde->locationBlankPadded2,
												fde->locationBlankPadded1);

			PersistentFilespace_BlankPadCopyLocation(
												fde->locationBlankPadded1,
												"");
		}
		else if (fde->dbId2 == oldmaster)
		{
			fde->dbId2 = InvalidDbid;
			fde->dbId1 = newmaster;

			/* Copy standby filespace location into new master location */
			PersistentFilespace_BlankPadCopyLocation(
												fde->locationBlankPadded1,
												fde->locationBlankPadded2);

			PersistentFilespace_BlankPadCopyLocation(
												fde->locationBlankPadded2,
												"");
		}

		WRITE_FILESPACE_HASH_UNLOCK;
		PersistentFileSysObj_ActivateStandby(&fsObjName,
											 &persistentTid,
											 persistentSerialNum,
											 oldmaster,
											 newmaster,
											 /* flushToXlog */ false);
		WRITE_FILESPACE_HASH_LOCK;
	}
	WRITE_FILESPACE_HASH_UNLOCK;

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;
}
/*
 * Activate a standby master by removing reference to the dead master
 * and changing our dbid to the old master's dbid.
 *
 * oldmaster: dbid of the dead master whose entries are being retired.
 * newmaster: dbid of the standby being promoted in its place.
 */
void
PersistentFilespace_ActivateStandby(int16 oldmaster, int16 newmaster)
{
	HASH_SEQ_STATUS hstat;
	FilespaceDirEntry fde;
	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	/* Persistent table changes are not allowed before persistence startup. */
	if (Persistent_BeforePersistenceWork())
		elog(ERROR, "persistent table changes forbidden");

	hash_seq_init(&hstat, persistentFilespaceSharedHashTable);

	PersistentFilespace_VerifyInitScan();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	/*
	 * We release FilespaceHashLock in the middle of the loop and re-acquire
	 * it after doing persistent table change. This is needed to prevent
	 * holding the lock for any purpose other than to protect the filespace
	 * shared hash table. Not releasing this lock could result in file I/O
	 * and potential deadlock due to other LW locks being acquired in the
	 * process. Releasing the lock this way is safe because we are still
	 * holding PersistentObjLock in exclusive mode. Any change to the
	 * filespace shared hash table is also protected by PersistentObjLock.
	 */
	WRITE_FILESPACE_HASH_LOCK;
	while ((fde = hash_seq_search(&hstat)) != NULL)
	{
		Oid			filespace = fde->key.filespaceOid;
		PersistentFileSysObjName fsObjName;
		ItemPointerData persistentTid;
		int64		persistentSerialNum = fde->persistentSerialNum;

		/* Copy out of the shared entry before the hash lock is dropped. */
		ItemPointerCopy(&fde->persistentTid, &persistentTid);

		PersistentFileSysObjName_SetFilespaceDir(&fsObjName, filespace);

		if (fde->dbId1 == oldmaster)
		{
			fde->dbId1 = InvalidDbid;
			fde->dbId2 = newmaster;

			/* Copy standby filespace location into new master location */
			PersistentFilespace_BlankPadCopyLocation(
												fde->locationBlankPadded2,
												fde->locationBlankPadded1);

			PersistentFilespace_BlankPadCopyLocation(
												fde->locationBlankPadded1,
												"");
		}
		else if (fde->dbId2 == oldmaster)
		{
			fde->dbId2 = InvalidDbid;
			fde->dbId1 = newmaster;

			/* Copy standby filespace location into new master location */
			PersistentFilespace_BlankPadCopyLocation(
												fde->locationBlankPadded1,
												fde->locationBlankPadded2);

			PersistentFilespace_BlankPadCopyLocation(
												fde->locationBlankPadded2,
												"");
		}

		/* Drop the hash lock around the persistent table change (see above). */
		WRITE_FILESPACE_HASH_UNLOCK;
		PersistentFileSysObj_ActivateStandby(&fsObjName,
											 &persistentTid,
											 persistentSerialNum,
											 oldmaster,
											 newmaster,
											 /* flushToXlog */ false);
		WRITE_FILESPACE_HASH_LOCK;
	}
	WRITE_FILESPACE_HASH_UNLOCK;

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;
}