static void ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno) { Buffer rootBuffer = InvalidBuffer; DataPageDeleteStack root, *ptr, *tmp; if (ginVacuumPostingTreeLeaves(gvs, rootBlkno, TRUE, &rootBuffer) == FALSE) { Assert(rootBuffer == InvalidBuffer); return; } memset(&root, 0, sizeof(DataPageDeleteStack)); root.leftBlkno = InvalidBlockNumber; root.isRoot = TRUE; vacuum_delay_point(); ginScanToDelete(gvs, rootBlkno, TRUE, &root, InvalidOffsetNumber); ptr = root.child; while (ptr) { tmp = ptr->child; pfree(ptr); ptr = tmp; } UnlockReleaseBuffer(rootBuffer); }
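The tail of ginVacuumPostingTree releases the delete-stack chain by saving each node's child pointer before pfree'ing the node, so a freed node is never dereferenced. A minimal self-contained sketch of that idiom in plain C (the Node type and free_chain name are made up for illustration, not PostgreSQL code):

#include <stdlib.h>

/* stand-in for the DataPageDeleteStack nodes freed above */
typedef struct Node
{
	int			blkno;
	struct Node *child;
} Node;

static void
free_chain(Node *ptr)
{
	while (ptr)
	{
		Node	   *tmp = ptr->child;	/* save the link before freeing */

		free(ptr);
		ptr = tmp;
	}
}

int
main(void)
{
	Node	   *head = NULL;

	/* build a three-element chain, then release it */
	for (int i = 0; i < 3; i++)
	{
		Node	   *n = malloc(sizeof(Node));

		n->blkno = i;
		n->child = head;
		head = n;
	}
	free_chain(head);
	return 0;
}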
/* * Post-VACUUM cleanup. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) { Relation index = info->index; BlockNumber npages, blkno; BlockNumber totFreePages; if (info->analyze_only) return stats; if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* * Iterate over the pages: insert deleted pages into FSM and collect * statistics. */ npages = RelationGetNumberOfBlocks(index); totFreePages = 0; for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST); if (BloomPageIsDeleted(page)) { RecordFreeIndexPage(index, blkno); totFreePages++; } else { stats->num_index_tuples += BloomPageGetMaxOffset(page); stats->estimated_count += BloomPageGetMaxOffset(page); } UnlockReleaseBuffer(buffer); } IndexFreeSpaceMapVacuum(info->index); stats->pages_free = totFreePages; stats->num_pages = RelationGetNumberOfBlocks(index); return stats; }
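The cleanup scan above is essentially a classify-and-tally pass over every block: deleted pages go back to the free space map, live pages contribute to the tuple counts. A stand-alone sketch of the same shape, assuming a toy in-memory page array instead of the buffer manager (ToyPage and its fields are invented names):

#include <stdio.h>

/* toy page descriptor; the real code reads each block with
 * ReadBufferExtended and checks BloomPageIsDeleted */
typedef struct ToyPage
{
	int			deleted;		/* page has been marked deleted */
	int			ntuples;		/* tuples stored on the page */
} ToyPage;

int
main(void)
{
	ToyPage		pages[] = {{0, 40}, {1, 0}, {0, 17}, {1, 0}};
	int			npages = sizeof(pages) / sizeof(pages[0]);
	long		free_pages = 0;
	long		index_tuples = 0;

	for (int blkno = 0; blkno < npages; blkno++)
	{
		if (pages[blkno].deleted)
			free_pages++;		/* would call RecordFreeIndexPage() */
		else
			index_tuples += pages[blkno].ntuples;
	}
	printf("free pages: %ld, index tuples: %ld\n", free_pages, index_tuples);
	return 0;
}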
/* * lazy_vacuum_heap() -- second pass over the heap * * This routine marks dead tuples as unused and compacts out free * space on their pages. Pages not having dead tuples recorded from * lazy_scan_heap are not visited at all. * * Note: the reason for doing this as a second pass is we cannot remove * the tuples until we've removed their index entries, and we want to * process index entry removal in batches as large as possible. */ static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) { MIRROREDLOCK_BUFMGR_DECLARE; int tupindex; int npages; PGRUsage ru0; pg_rusage_init(&ru0); npages = 0; tupindex = 0; /* Fetch gp_persistent_relation_node information that will be added to XLOG record. */ RelationFetchGpRelationNodeForXLog(onerel); while (tupindex < vacrelstats->num_dead_tuples) { BlockNumber tblk; Buffer buf; Page page; vacuum_delay_point(); tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); /* -------- MirroredLock ---------- */ MIRROREDLOCK_BUFMGR_LOCK; buf = ReadBufferWithStrategy(onerel, tblk, vac_strategy); LockBufferForCleanup(buf); tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats); /* Now that we've compacted the page, record its available space */ page = BufferGetPage(buf); lazy_record_free_space(vacrelstats, tblk, PageGetHeapFreeSpace(page)); UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ npages++; } ereport(elevel, (errmsg("\"%s\": removed %d row versions in %d pages", RelationGetRelationName(onerel), tupindex, npages), errdetail("%s.", pg_rusage_show(&ru0)))); }
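lazy_vacuum_page() consumes every dead-tuple TID that falls on the block it was handed and returns the advanced index, which is why the outer loop above only ever looks at dead_tuples[tupindex]. A small sketch of that per-block batching over a block-ordered TID array (TID and vacuum_one_block are illustrative names, not the real helpers):

#include <stdio.h>

typedef struct TID
{
	unsigned	block;
	unsigned	offset;
} TID;

/* Consume all dead TIDs on 'block' starting at 'tupindex'; return the index
 * of the first TID on a later block, mirroring how lazy_vacuum_page()
 * advances tupindex for its caller. */
static int
vacuum_one_block(const TID *dead, int ndead, int tupindex, unsigned block)
{
	while (tupindex < ndead && dead[tupindex].block == block)
	{
		printf("  unused: (%u,%u)\n", dead[tupindex].block, dead[tupindex].offset);
		tupindex++;
	}
	return tupindex;
}

int
main(void)
{
	/* block-ordered dead-tuple list, as the first heap pass would produce */
	TID			dead[] = {{2, 1}, {2, 5}, {7, 3}, {9, 2}, {9, 4}};
	int			ndead = sizeof(dead) / sizeof(dead[0]);
	int			tupindex = 0;
	int			npages = 0;

	while (tupindex < ndead)
	{
		unsigned	block = dead[tupindex].block;

		printf("page %u:\n", block);
		tupindex = vacuum_one_block(dead, ndead, tupindex, block);
		npages++;
	}
	printf("visited %d pages for %d dead tuples\n", npages, ndead);
	return 0;
}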
/* * lazy_vacuum_heap() -- second pass over the heap * * This routine marks dead tuples as unused and compacts out free * space on their pages. Pages not having dead tuples recorded from * lazy_scan_heap are not visited at all. * * Note: the reason for doing this as a second pass is we cannot remove * the tuples until we've removed their index entries, and we want to * process index entry removal in batches as large as possible. */ static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) { int tupindex; int npages; PGRUsage ru0; pg_rusage_init(&ru0); npages = 0; tupindex = 0; while (tupindex < vacrelstats->num_dead_tuples) { BlockNumber tblk; Buffer buf; Page page; Size freespace; vacuum_delay_point(); tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]); buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL, vac_strategy); if (!ConditionalLockBufferForCleanup(buf)) { ReleaseBuffer(buf); ++tupindex; continue; } tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats); /* Now that we've compacted the page, record its available space */ page = BufferGetPage(buf); freespace = PageGetHeapFreeSpace(page); UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, tblk, freespace); npages++; } ereport(elevel, (errmsg("\"%s\": removed %d row versions in %d pages", RelationGetRelationName(onerel), tupindex, npages), errdetail("%s.", pg_rusage_show(&ru0)))); }
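ConditionalLockBufferForCleanup lets this version skip a page that another backend currently has pinned instead of blocking on it. A rough analogue using pthread_mutex_trylock, assuming one mutex per "page" (purely illustrative; buffer cleanup locks are not implemented this way):

#include <pthread.h>
#include <stdio.h>

int
main(void)
{
	pthread_mutex_t page_locks[4];

	for (int i = 0; i < 4; i++)
		pthread_mutex_init(&page_locks[i], NULL);

	/* simulate another backend holding page 2 */
	pthread_mutex_lock(&page_locks[2]);

	for (int blk = 0; blk < 4; blk++)
	{
		/* trylock plays the role of ConditionalLockBufferForCleanup:
		 * if the page is unavailable, skip it and move on */
		if (pthread_mutex_trylock(&page_locks[blk]) != 0)
		{
			printf("page %d busy, skipped\n", blk);
			continue;
		}
		printf("page %d vacuumed\n", blk);
		pthread_mutex_unlock(&page_locks[blk]);
	}

	pthread_mutex_unlock(&page_locks[2]);
	return 0;
}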
/* * Helper function to perform deletion of index entries from a bucket. * * This function expects that the caller has acquired a cleanup lock on the * primary bucket page, and will return with a write lock again held on the * primary bucket page. The lock won't necessarily be held continuously, * though, because we'll release it when visiting overflow pages. * * It would be very bad if this function cleaned a page while some other * backend was in the midst of scanning it, because hashgettuple assumes * that the next valid TID will be greater than or equal to the current * valid TID. There can't be any concurrent scans in progress when we first * enter this function because of the cleanup lock we hold on the primary * bucket page, but as soon as we release that lock, there might be. We * handle that by conspiring to prevent those scans from passing our cleanup * scan. To do that, we lock the next page in the bucket chain before * releasing the lock on the previous page. (This type of lock chaining is * not ideal, so we might want to look for a better solution at some point.) * * We need to retain a pin on the primary bucket to ensure that no concurrent * split can start. */ void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, uint32 maxbucket, uint32 highmask, uint32 lowmask, double *tuples_removed, double *num_index_tuples, bool split_cleanup, IndexBulkDeleteCallback callback, void *callback_state) { BlockNumber blkno; Buffer buf; Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket; bool bucket_dirty = false; blkno = bucket_blkno; buf = bucket_buf; if (split_cleanup) new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket, lowmask, maxbucket); /* Scan each page in bucket */ for (;;) { HashPageOpaque opaque; OffsetNumber offno; OffsetNumber maxoffno; Buffer next_buf; Page page; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; bool retain_pin = false; bool clear_dead_marking = false; vacuum_delay_point(); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { ItemPointer htup; IndexTuple itup; Bucket bucket; bool kill_tuple = false; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); /* * To remove the dead tuples, we strictly want to rely on results * of callback function. refer btvacuumpage for detailed reason. */ if (callback && callback(htup, callback_state)) { kill_tuple = true; if (tuples_removed) *tuples_removed += 1; } else if (split_cleanup) { /* delete the tuples that are moved by split. */ bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); /* mark the item for deletion */ if (bucket != cur_bucket) { /* * We expect tuples to either belong to current bucket or * new_bucket. This is ensured because we don't allow * further splits from bucket that contains garbage. See * comments in _hash_expandtable. 
*/ Assert(bucket == new_bucket); kill_tuple = true; } } if (kill_tuple) { /* mark the item for deletion */ deletable[ndeletable++] = offno; } else { /* we're keeping it, so count it */ if (num_index_tuples) *num_index_tuples += 1; } } /* retain the pin on primary bucket page till end of bucket scan */ if (blkno == bucket_blkno) retain_pin = true; else retain_pin = false; blkno = opaque->hasho_nextblkno; /* * Apply deletions, advance to next page and write page if needed. */ if (ndeletable > 0) { /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); PageIndexMultiDelete(page, deletable, ndeletable); bucket_dirty = true; /* * Let us mark the page as clean if vacuum removes the DEAD tuples * from an index page. We do this by clearing * LH_PAGE_HAS_DEAD_TUPLES flag. */ if (tuples_removed && *tuples_removed > 0 && H_HAS_DEAD_TUPLES(opaque)) { opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; clear_dead_marking = true; } MarkBufferDirty(buf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_delete xlrec; XLogRecPtr recptr; xlrec.clear_dead_marking = clear_dead_marking; xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashDelete); /* * bucket buffer needs to be registered to ensure that we can * acquire a cleanup lock on it during replay. */ if (!xlrec.is_primary_bucket_page) XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); XLogRegisterBuffer(1, buf, REGBUF_STANDARD); XLogRegisterBufData(1, (char *) deletable, ndeletable * sizeof(OffsetNumber)); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE); PageSetLSN(BufferGetPage(buf), recptr); } END_CRIT_SECTION(); } /* bail out if there are no more pages to scan. */ if (!BlockNumberIsValid(blkno)) break; next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); /* * release the lock on previous page after acquiring the lock on next * page */ if (retain_pin) LockBuffer(buf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, buf); buf = next_buf; } /* * lock the bucket page to clear the garbage flag and squeeze the bucket. * if the current buffer is same as bucket buffer, then we already have * lock on bucket page. */ if (buf != bucket_buf) { _hash_relbuf(rel, buf); LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE); } /* * Clear the garbage flag from bucket after deleting the tuples that are * moved by split. We purposefully clear the flag before squeeze bucket, * so that after restart, vacuum shouldn't again try to delete the moved * by split tuples. */ if (split_cleanup) { HashPageOpaque bucket_opaque; Page page; page = BufferGetPage(bucket_buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; MarkBufferDirty(bucket_buf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP); PageSetLSN(page, recptr); } END_CRIT_SECTION(); } /* * If we have deleted anything, try to compact free space. For squeezing * the bucket, we must have a cleanup lock, else it can impact the * ordering of tuples for a scan that has started before it. */ if (bucket_dirty && IsBufferCleanupOK(bucket_buf)) _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, bstrategy); else LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK); }
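The lock-chaining discipline described in the header comment, taking the lock on the next page before releasing the lock on the current one so a concurrent scan can never overtake the cleanup, can be shown on an ordinary linked list with POSIX mutexes. A hedged sketch (Node and chained_walk are made-up names):

#include <pthread.h>
#include <stdio.h>

typedef struct Node
{
	pthread_mutex_t lock;
	int			value;
	struct Node *next;
} Node;

/* hand-over-hand traversal: acquire next, then drop current */
static void
chained_walk(Node *head)
{
	pthread_mutex_lock(&head->lock);
	for (Node *cur = head; cur != NULL;)
	{
		Node	   *next = cur->next;

		if (next)
			pthread_mutex_lock(&next->lock);	/* lock next first ... */
		printf("visiting %d\n", cur->value);
		pthread_mutex_unlock(&cur->lock);		/* ... then release current */
		cur = next;
	}
}

int
main(void)
{
	Node		nodes[3];

	for (int i = 0; i < 3; i++)
	{
		pthread_mutex_init(&nodes[i].lock, NULL);
		nodes[i].value = i + 1;
		nodes[i].next = (i < 2) ? &nodes[i + 1] : NULL;
	}
	chained_walk(&nodes[0]);
	return 0;
}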
/* * Move tuples from pending pages into regular GIN structure. * * This can be called concurrently by multiple backends, so it must cope. * On first glance it looks completely not concurrent-safe and not crash-safe * either. The reason it's okay is that multiple insertion of the same entry * is detected and treated as a no-op by gininsert.c. If we crash after * posting entries to the main index and before removing them from the * pending list, it's okay because when we redo the posting later on, nothing * bad will happen. Likewise, if two backends simultaneously try to post * a pending entry into the main index, one will succeed and one will do * nothing. We try to notice when someone else is a little bit ahead of * us in the process, but that's just to avoid wasting cycles. Only the * action of removing a page from the pending list really needs exclusive * lock. * * vac_delay indicates that ginInsertCleanup is called from vacuum process, * so call vacuum_delay_point() periodically. * If stats isn't null, we count deleted pending pages into the counts. */ void ginInsertCleanup(GinState *ginstate, bool vac_delay, IndexBulkDeleteResult *stats) { Relation index = ginstate->index; Buffer metabuffer, buffer; Page metapage, page; GinMetaPageData *metadata; MemoryContext opCtx, oldCtx; BuildAccumulator accum; KeyArray datums; BlockNumber blkno; metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); LockBuffer(metabuffer, GIN_SHARE); metapage = BufferGetPage(metabuffer); metadata = GinPageGetMeta(metapage); if (metadata->head == InvalidBlockNumber) { /* Nothing to do */ UnlockReleaseBuffer(metabuffer); return; } /* * Read and lock head of pending list */ blkno = metadata->head; buffer = ReadBuffer(index, blkno); LockBuffer(buffer, GIN_SHARE); page = BufferGetPage(buffer); LockBuffer(metabuffer, GIN_UNLOCK); /* * Initialize. All temporary space will be in opCtx */ opCtx = AllocSetContextCreate(CurrentMemoryContext, "GIN insert cleanup temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); oldCtx = MemoryContextSwitchTo(opCtx); initKeyArray(&datums, 128); ginInitBA(&accum); accum.ginstate = ginstate; /* * At the top of this loop, we have pin and lock on the current page of * the pending list. However, we'll release that before exiting the loop. * Note we also have pin but not lock on the metapage. */ for (;;) { if (GinPageIsDeleted(page)) { /* another cleanup process is running concurrently */ UnlockReleaseBuffer(buffer); break; } /* * read page's datums into accum */ processPendingPage(&accum, &datums, page, FirstOffsetNumber); if (vac_delay) vacuum_delay_point(); /* * Is it time to flush memory to disk? Flush if we are at the end of * the pending list, or if we have a full row and memory is getting * full. * * XXX using up maintenance_work_mem here is probably unreasonably * much, since vacuum might already be using that much. */ if (GinPageGetOpaque(page)->rightlink == InvalidBlockNumber || (GinPageHasFullRow(page) && (accum.allocatedMemory >= maintenance_work_mem * 1024L))) { ItemPointerData *list; uint32 nlist; Datum key; GinNullCategory category; OffsetNumber maxoff, attnum; /* * Unlock current page to increase performance. Changes of page * will be checked later by comparing maxoff after completion of * memory flush. */ maxoff = PageGetMaxOffsetNumber(page); LockBuffer(buffer, GIN_UNLOCK); /* * Moving collected data into regular structure can take * significant amount of time - so, run it without locking pending * list. 
*/ ginBeginBAScan(&accum); while ((list = ginGetBAEntry(&accum, &attnum, &key, &category, &nlist)) != NULL) { ginEntryInsert(ginstate, attnum, key, category, list, nlist, NULL); if (vac_delay) vacuum_delay_point(); } /* * Lock the whole list to remove pages */ LockBuffer(metabuffer, GIN_EXCLUSIVE); LockBuffer(buffer, GIN_SHARE); if (GinPageIsDeleted(page)) { /* another cleanup process is running concurrently */ UnlockReleaseBuffer(buffer); LockBuffer(metabuffer, GIN_UNLOCK); break; } /* * While we left the page unlocked, more stuff might have gotten * added to it. If so, process those entries immediately. There * shouldn't be very many, so we don't worry about the fact that * we're doing this with exclusive lock. Insertion algorithm * guarantees that inserted row(s) will not continue on next page. * NOTE: intentionally no vacuum_delay_point in this loop. */ if (PageGetMaxOffsetNumber(page) != maxoff) { ginInitBA(&accum); processPendingPage(&accum, &datums, page, maxoff + 1); ginBeginBAScan(&accum); while ((list = ginGetBAEntry(&accum, &attnum, &key, &category, &nlist)) != NULL) ginEntryInsert(ginstate, attnum, key, category, list, nlist, NULL); } /* * Remember next page - it will become the new list head */ blkno = GinPageGetOpaque(page)->rightlink; UnlockReleaseBuffer(buffer); /* shiftList will do exclusive * locking */ /* * remove read pages from pending list, at this point all * content of read pages is in regular structure */ if (shiftList(index, metabuffer, blkno, stats)) { /* another cleanup process is running concurrently */ LockBuffer(metabuffer, GIN_UNLOCK); break; } Assert(blkno == metadata->head); LockBuffer(metabuffer, GIN_UNLOCK); /* * if we removed the whole pending list just exit */ if (blkno == InvalidBlockNumber) break; /* * release memory used so far and reinit state */ MemoryContextReset(opCtx); initKeyArray(&datums, datums.maxvalues); ginInitBA(&accum); } else { blkno = GinPageGetOpaque(page)->rightlink; UnlockReleaseBuffer(buffer); } /* * Read next page in pending list */ CHECK_FOR_INTERRUPTS(); buffer = ReadBuffer(index, blkno); LockBuffer(buffer, GIN_SHARE); page = BufferGetPage(buffer); } ReleaseBuffer(metabuffer); /* Clean up temporary space */ MemoryContextSwitchTo(oldCtx); MemoryContextDelete(opCtx); }
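The flush branch above batches pending entries in memory and only pushes them into the main structure when the list ends or the memory budget is exceeded. A tiny stand-alone sketch of that accumulate-and-flush pattern, with an arbitrary 64-byte budget standing in for maintenance_work_mem and made-up item data:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUDGET_BYTES 64

static char accum[BUDGET_BYTES];
static size_t used = 0;

static void
flush(void)
{
	if (used == 0)
		return;
	printf("flushing %zu bytes\n", used);	/* stands in for ginEntryInsert */
	used = 0;
}

static void
add_item(const char *item)
{
	size_t		len = strlen(item) + 1;

	if (used + len > BUDGET_BYTES)
		flush();				/* memory is "full": push the batch out first */
	memcpy(accum + used, item, len);
	used += len;
}

int
main(void)
{
	const char *items[] = {"alpha", "beta", "gamma", "delta", "epsilon",
						   "zeta", "eta", "theta", "iota", "kappa"};

	for (size_t i = 0; i < sizeof(items) / sizeof(items[0]); i++)
		add_item(items[i]);
	flush();					/* end of list: flush whatever remains */
	return 0;
}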
/* * compute_tsvector_stats() -- compute statistics for a tsvector column * * This functions computes statistics that are useful for determining @@ * operations' selectivity, along with the fraction of non-null rows and * average width. * * Instead of finding the most common values, as we do for most datatypes, * we're looking for the most common lexemes. This is more useful, because * there most probably won't be any two rows with the same tsvector and thus * the notion of a MCV is a bit bogus with this datatype. With a list of the * most common lexemes we can do a better job at figuring out @@ selectivity. * * For the same reasons we assume that tsvector columns are unique when * determining the number of distinct values. * * The algorithm used is Lossy Counting, as proposed in the paper "Approximate * frequency counts over data streams" by G. S. Manku and R. Motwani, in * Proceedings of the 28th International Conference on Very Large Data Bases, * Hong Kong, China, August 2002, section 4.2. The paper is available at * http://www.vldb.org/conf/2002/S10P03.pdf * * The Lossy Counting (aka LC) algorithm goes like this: * Let s be the threshold frequency for an item (the minimum frequency we * are interested in) and epsilon the error margin for the frequency. Let D * be a set of triples (e, f, delta), where e is an element value, f is that * element's frequency (actually, its current occurrence count) and delta is * the maximum error in f. We start with D empty and process the elements in * batches of size w. (The batch size is also known as "bucket size" and is * equal to 1/epsilon.) Let the current batch number be b_current, starting * with 1. For each element e we either increment its f count, if it's * already in D, or insert a new triple into D with values (e, 1, b_current * - 1). After processing each batch we prune D, by removing from it all * elements with f + delta <= b_current. After the algorithm finishes we * suppress all elements from D that do not satisfy f >= (s - epsilon) * N, * where N is the total number of elements in the input. We emit the * remaining elements with estimated frequency f/N. The LC paper proves * that this algorithm finds all elements with true frequency at least s, * and that no frequency is overestimated or is underestimated by more than * epsilon. Furthermore, given reasonable assumptions about the input * distribution, the required table size is no more than about 7 times w. * * We set s to be the estimated frequency of the K'th word in a natural * language's frequency table, where K is the target number of entries in * the MCELEM array plus an arbitrary constant, meant to reflect the fact * that the most common words in any language would usually be stopwords * so we will not actually see them in the input. We assume that the * distribution of word frequencies (including the stopwords) follows Zipf's * law with an exponent of 1. * * Assuming Zipfian distribution, the frequency of the K'th word is equal * to 1/(K * H(W)) where H(n) is 1/2 + 1/3 + ... + 1/n and W is the number of * words in the language. Putting W as one million, we get roughly 0.07/K. * Assuming top 10 words are stopwords gives s = 0.07/(K + 10). We set * epsilon = s/10, which gives bucket width w = (K + 10)/0.007 and * maximum expected hashtable size of about 1000 * (K + 10). * * Note: in the above discussion, s, epsilon, and f/N are in terms of a * lexeme's frequency as a fraction of all lexemes seen in the input. 
* However, what we actually want to store in the finished pg_statistic * entry is each lexeme's frequency as a fraction of all rows that it occurs * in. Assuming that the input tsvectors are correctly constructed, no * lexeme occurs more than once per tsvector, so the final count f is a * correct estimate of the number of input tsvectors it occurs in, and we * need only change the divisor from N to nonnull_cnt to get the number we * want. */ static void compute_tsvector_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows) { int num_mcelem; int null_cnt = 0; double total_width = 0; /* This is D from the LC algorithm. */ HTAB *lexemes_tab; HASHCTL hash_ctl; HASH_SEQ_STATUS scan_status; /* This is the current bucket number from the LC algorithm */ int b_current; /* This is 'w' from the LC algorithm */ int bucket_width; int vector_no, lexeme_no; LexemeHashKey hash_key; TrackItem *item; /* * We want statistics_target * 10 lexemes in the MCELEM array. This * multiplier is pretty arbitrary, but is meant to reflect the fact that * the number of individual lexeme values tracked in pg_statistic ought to * be more than the number of values for a simple scalar column. */ num_mcelem = stats->attr->attstattarget * 10; /* * We set bucket width equal to (num_mcelem + 10) / 0.007 as per the * comment above. */ bucket_width = (num_mcelem + 10) * 1000 / 7; /* * Create the hashtable. It will be in local memory, so we don't need to * worry about overflowing the initial size. Also we don't need to pay any * attention to locking and memory management. */ MemSet(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(LexemeHashKey); hash_ctl.entrysize = sizeof(TrackItem); hash_ctl.hash = lexeme_hash; hash_ctl.match = lexeme_match; hash_ctl.hcxt = CurrentMemoryContext; lexemes_tab = hash_create("Analyzed lexemes table", num_mcelem, &hash_ctl, HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT); /* Initialize counters. */ b_current = 1; lexeme_no = 0; /* Loop over the tsvectors. */ for (vector_no = 0; vector_no < samplerows; vector_no++) { Datum value; bool isnull; TSVector vector; WordEntry *curentryptr; char *lexemesptr; int j; vacuum_delay_point(); value = fetchfunc(stats, vector_no, &isnull); /* * Check for null/nonnull. */ if (isnull) { null_cnt++; continue; } /* * Add up widths for average-width calculation. Since it's a * tsvector, we know it's varlena. As in the regular * compute_minimal_stats function, we use the toasted width for this * calculation. */ total_width += VARSIZE_ANY(DatumGetPointer(value)); /* * Now detoast the tsvector if needed. */ vector = DatumGetTSVector(value); /* * We loop through the lexemes in the tsvector and add them to our * tracking hashtable. */ lexemesptr = STRPTR(vector); curentryptr = ARRPTR(vector); for (j = 0; j < vector->size; j++) { bool found; /* * Construct a hash key. The key points into the (detoasted) * tsvector value at this point, but if a new entry is created, we * make a copy of it. This way we can free the tsvector value * once we've processed all its lexemes. 
*/ hash_key.lexeme = lexemesptr + curentryptr->pos; hash_key.length = curentryptr->len; /* Lookup current lexeme in hashtable, adding it if new */ item = (TrackItem *) hash_search(lexemes_tab, (const void *) &hash_key, HASH_ENTER, &found); if (found) { /* The lexeme is already on the tracking list */ item->frequency++; } else { /* Initialize new tracking list element */ item->frequency = 1; item->delta = b_current - 1; item->key.lexeme = palloc(hash_key.length); memcpy(item->key.lexeme, hash_key.lexeme, hash_key.length); } /* lexeme_no is the number of elements processed (ie N) */ lexeme_no++; /* We prune the D structure after processing each bucket */ if (lexeme_no % bucket_width == 0) { prune_lexemes_hashtable(lexemes_tab, b_current); b_current++; } /* Advance to the next WordEntry in the tsvector */ curentryptr++; } /* If the vector was toasted, free the detoasted copy. */ if (TSVectorGetDatum(vector) != value) pfree(vector); } /* We can only compute real stats if we found some non-null values. */ if (null_cnt < samplerows) { int nonnull_cnt = samplerows - null_cnt; int i; TrackItem **sort_table; int track_len; int cutoff_freq; int minfreq, maxfreq; stats->stats_valid = true; /* Do the simple null-frac and average width stats */ stats->stanullfrac = (double) null_cnt / (double) samplerows; stats->stawidth = total_width / (double) nonnull_cnt; /* Assume it's a unique column (see notes above) */ stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac); /* * Construct an array of the interesting hashtable items, that is, * those meeting the cutoff frequency (s - epsilon)*N. Also identify * the minimum and maximum frequencies among these items. * * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff * frequency is 9*N / bucket_width. */ cutoff_freq = 9 * lexeme_no / bucket_width; i = hash_get_num_entries(lexemes_tab); /* surely enough space */ sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i); hash_seq_init(&scan_status, lexemes_tab); track_len = 0; minfreq = lexeme_no; maxfreq = 0; while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL) { if (item->frequency > cutoff_freq) { sort_table[track_len++] = item; minfreq = Min(minfreq, item->frequency); maxfreq = Max(maxfreq, item->frequency); } } Assert(track_len <= i); /* emit some statistics for debug purposes */ elog(DEBUG3, "tsvector_stats: target # mces = %d, bucket width = %d, " "# lexemes = %d, hashtable size = %d, usable entries = %d", num_mcelem, bucket_width, lexeme_no, i, track_len); /* * If we obtained more lexemes than we really want, get rid of those * with least frequencies. The easiest way is to qsort the array into * descending frequency order and truncate the array. */ if (num_mcelem < track_len) { qsort(sort_table, track_len, sizeof(TrackItem *), trackitem_compare_frequencies_desc); /* reset minfreq to the smallest frequency we're keeping */ minfreq = sort_table[num_mcelem - 1]->frequency; } else num_mcelem = track_len; /* Generate MCELEM slot entry */ if (num_mcelem > 0) { MemoryContext old_context; Datum *mcelem_values; float4 *mcelem_freqs; /* * We want to store statistics sorted on the lexeme value using * first length, then byte-for-byte comparison. The reason for * doing length comparison first is that we don't care about the * ordering so long as it's consistent, and comparing lengths * first gives us a chance to avoid a strncmp() call. * * This is different from what we do with scalar statistics -- * they get sorted on frequencies. 
The rationale is that we * usually search through most common elements looking for a * specific value, so we can grab its frequency. When values are * presorted we can employ binary search for that. See * ts_selfuncs.c for a real usage scenario. */ qsort(sort_table, num_mcelem, sizeof(TrackItem *), trackitem_compare_lexemes); /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); /* * We sorted statistics on the lexeme value, but we want to be * able to find out the minimal and maximal frequency without * going through all the values. We keep those two extra * frequencies in two extra cells in mcelem_freqs. * * (Note: the MCELEM statistics slot definition allows for a third * extra number containing the frequency of nulls, but we don't * create that for a tsvector column, since null elements aren't * possible.) */ mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum)); mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4)); /* * See comments above about use of nonnull_cnt as the divisor for * the final frequency estimates. */ for (i = 0; i < num_mcelem; i++) { TrackItem *item = sort_table[i]; mcelem_values[i] = PointerGetDatum(cstring_to_text_with_len(item->key.lexeme, item->key.length)); mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt; } mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt; mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt; MemoryContextSwitchTo(old_context); stats->stakind[0] = STATISTIC_KIND_MCELEM; stats->staop[0] = TextEqualOperator; stats->stacoll[0] = DEFAULT_COLLATION_OID; stats->stanumbers[0] = mcelem_freqs; /* See above comment about two extra frequency fields */ stats->numnumbers[0] = num_mcelem + 2; stats->stavalues[0] = mcelem_values; stats->numvalues[0] = num_mcelem; /* We are storing text values */ stats->statypid[0] = TEXTOID; stats->statyplen[0] = -1; /* typlen, -1 for varlena */ stats->statypbyval[0] = false; stats->statypalign[0] = 'i'; } } else { /* We found only nulls; assume the column is entirely null */ stats->stats_valid = true; stats->stanullfrac = 1.0; stats->stawidth = 0; /* "unknown" */ stats->stadistinct = 0.0; /* "unknown" */ } /* * We don't need to bother cleaning up any of our temporary palloc's. The * hashtable should also go away, as it used a child memory context. */ }
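The Lossy Counting procedure described in the header comment can be exercised end to end on a toy stream. A self-contained sketch that keeps the (f, delta) pair per element in a fixed array instead of a hashtable; the stream, MAXVAL and BUCKET_WIDTH are arbitrary, and only the insert and prune rules follow the algorithm above:

#include <stdio.h>

#define MAXVAL 16
#define BUCKET_WIDTH 4			/* w = 1/epsilon */

typedef struct Triple
{
	int			present;
	int			f;				/* occurrence count */
	int			delta;			/* maximum error in f */
} Triple;

int
main(void)
{
	Triple		d[MAXVAL] = {{0}};
	int			stream[] = {1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 5};
	int			n = sizeof(stream) / sizeof(stream[0]);
	int			b_current = 1;

	for (int i = 0; i < n; i++)
	{
		int			e = stream[i];

		if (d[e].present)
			d[e].f++;
		else
		{
			d[e].present = 1;
			d[e].f = 1;
			d[e].delta = b_current - 1;
		}

		/* prune D at the end of each bucket of BUCKET_WIDTH elements */
		if ((i + 1) % BUCKET_WIDTH == 0)
		{
			for (int v = 0; v < MAXVAL; v++)
				if (d[v].present && d[v].f + d[v].delta <= b_current)
					d[v].present = 0;
			b_current++;
		}
	}

	for (int v = 0; v < MAXVAL; v++)
		if (d[v].present)
			printf("value %d: f=%d delta=%d (est. freq %.2f)\n",
				   v, d[v].f, d[v].delta, (double) d[v].f / n);
	return 0;
}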
/* * Assumes that the segment file lock is already held. * Assumes that the segment file should be compacted. */ static bool AOCSSegmentFileFullCompaction(Relation aorel, AOCSInsertDesc insertDesc, AOCSFileSegInfo *fsinfo, Snapshot snapshot) { const char *relname; AppendOnlyVisimap visiMap; AOCSScanDesc scanDesc; TupleDesc tupDesc; TupleTableSlot *slot; int compact_segno; int64 movedTupleCount = 0; ResultRelInfo *resultRelInfo; MemTupleBinding *mt_bind; EState *estate; bool *proj; int i; AOTupleId *aoTupleId; int64 tupleCount = 0; int64 tuplePerPage = INT_MAX; Assert(Gp_role == GP_ROLE_EXECUTE || Gp_role == GP_ROLE_UTILITY); Assert(RelationIsAoCols(aorel)); Assert(insertDesc); compact_segno = fsinfo->segno; if (fsinfo->varblockcount > 0) { tuplePerPage = fsinfo->total_tupcount / fsinfo->varblockcount; } relname = RelationGetRelationName(aorel); AppendOnlyVisimap_Init(&visiMap, aorel->rd_appendonly->visimaprelid, aorel->rd_appendonly->visimapidxid, ShareLock, snapshot); elogif(Debug_appendonly_print_compaction, LOG, "Compact AO segfile %d, relation %s", compact_segno, relname); proj = palloc0(sizeof(bool) * RelationGetNumberOfAttributes(aorel)); for (i = 0; i < RelationGetNumberOfAttributes(aorel); ++i) { proj[i] = true; } scanDesc = aocs_beginrangescan(aorel, snapshot, snapshot, &compact_segno, 1, NULL, proj); tupDesc = RelationGetDescr(aorel); slot = MakeSingleTupleTableSlot(tupDesc); mt_bind = create_memtuple_binding(tupDesc); /* * We need a ResultRelInfo and an EState so we can use the regular * executor's index-entry-making machinery. */ estate = CreateExecutorState(); resultRelInfo = makeNode(ResultRelInfo); resultRelInfo->ri_RangeTableIndex = 1; /* dummy */ resultRelInfo->ri_RelationDesc = aorel; resultRelInfo->ri_TrigDesc = NULL; /* we don't fire triggers */ ExecOpenIndices(resultRelInfo); estate->es_result_relations = resultRelInfo; estate->es_num_result_relations = 1; estate->es_result_relation_info = resultRelInfo; while (aocs_getnext(scanDesc, ForwardScanDirection, slot)) { CHECK_FOR_INTERRUPTS(); aoTupleId = (AOTupleId *) slot_get_ctid(slot); if (AppendOnlyVisimap_IsVisible(&scanDesc->visibilityMap, aoTupleId)) { AOCSMoveTuple(slot, insertDesc, resultRelInfo, estate); movedTupleCount++; } else { /* Tuple is invisible and needs to be dropped */ AppendOnlyThrowAwayTuple(aorel, slot, mt_bind); } /* * Check for vacuum delay point after approximately a var block */ tupleCount++; if (VacuumCostActive && tupleCount % tuplePerPage == 0) { vacuum_delay_point(); } } SetAOCSFileSegInfoState(aorel, compact_segno, AOSEG_STATE_AWAITING_DROP); AppendOnlyVisimap_DeleteSegmentFile(&visiMap, compact_segno); /* Delete all mini pages of the segment files if block directory exists */ if (OidIsValid(aorel->rd_appendonly->blkdirrelid)) { AppendOnlyBlockDirectory_DeleteSegmentFile(aorel, snapshot, compact_segno, 0); } elogif(Debug_appendonly_print_compaction, LOG, "Finished compaction: " "AO segfile %d, relation %s, moved tuple count " INT64_FORMAT, compact_segno, relname, movedTupleCount); AppendOnlyVisimap_Finish(&visiMap, NoLock); ExecCloseIndices(resultRelInfo); FreeExecutorState(estate); ExecDropSingleTupleTableSlot(slot); destroy_memtuple_binding(mt_bind); aocs_endscan(scanDesc); pfree(proj); return true; }
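The tupleCount % tuplePerPage test above throttles how often the (possibly sleeping) vacuum_delay_point() is reached to roughly once per varblock rather than once per tuple. The same rate-limited checkpoint pattern in isolation, with a stub delay_point() and made-up numbers:

#include <stdio.h>

static void
delay_point(void)
{
	/* in the real code this is vacuum_delay_point(), which may sleep */
	printf("  -- delay point reached\n");
}

int
main(void)
{
	long		tuple_count = 0;
	long		tuples_per_page = 5;	/* derived from segment stats upstream */
	long		total = 17;

	for (long i = 0; i < total; i++)
	{
		/* process one tuple here ... */
		tuple_count++;
		if (tuple_count % tuples_per_page == 0)
			delay_point();
	}
	printf("processed %ld tuples\n", tuple_count);
	return 0;
}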
/* * Process the pending-TID list between pages of the main scan */ static void spgprocesspending(spgBulkDeleteState *bds) { Relation index = bds->info->index; spgVacPendingItem *pitem; spgVacPendingItem *nitem; BlockNumber blkno; Buffer buffer; Page page; for (pitem = bds->pendingList; pitem != NULL; pitem = pitem->next) { if (pitem->done) continue; /* ignore already-done items */ /* call vacuum_delay_point while not holding any buffer lock */ vacuum_delay_point(); /* examine the referenced page */ blkno = ItemPointerGetBlockNumber(&pitem->tid); buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, bds->info->strategy); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (PageIsNew(page) || SpGistPageIsDeleted(page)) { /* Probably shouldn't happen, but ignore it */ } else if (SpGistPageIsLeaf(page)) { if (SpGistBlockIsRoot(blkno)) { /* this should definitely not happen */ elog(ERROR, "redirection leads to root page of index \"%s\"", RelationGetRelationName(index)); } /* deal with any deletable tuples */ vacuumLeafPage(bds, index, buffer, true); /* might as well do this while we are here */ vacuumRedirectAndPlaceholder(index, buffer); SpGistSetLastUsedPage(index, buffer); /* * We can mark as done not only this item, but any later ones * pointing at the same page, since we vacuumed the whole page. */ pitem->done = true; for (nitem = pitem->next; nitem != NULL; nitem = nitem->next) { if (ItemPointerGetBlockNumber(&nitem->tid) == blkno) nitem->done = true; } } else { /* * On an inner page, visit the referenced inner tuple and add all * its downlinks to the pending list. We might have pending items * for more than one inner tuple on the same page (in fact this is * pretty likely given the way space allocation works), so get * them all while we are here. */ for (nitem = pitem; nitem != NULL; nitem = nitem->next) { if (nitem->done) continue; if (ItemPointerGetBlockNumber(&nitem->tid) == blkno) { OffsetNumber offset; SpGistInnerTuple innerTuple; offset = ItemPointerGetOffsetNumber(&nitem->tid); innerTuple = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, offset)); if (innerTuple->tupstate == SPGIST_LIVE) { SpGistNodeTuple node; int i; SGITITERATE(innerTuple, i, node) { if (ItemPointerIsValid(&node->t_tid)) spgAddPendingTID(bds, &node->t_tid); } } else if (innerTuple->tupstate == SPGIST_REDIRECT) { /* transfer attention to redirect point */ spgAddPendingTID(bds, &((SpGistDeadTuple) innerTuple)->pointer); } else elog(ERROR, "unexpected SPGiST tuple state: %d", innerTuple->tupstate); nitem->done = true; } }
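spgprocesspending walks its pending list without ever unlinking entries mid-scan: items are only flagged done, and visiting one item may append more items (via spgAddPendingTID) that the same loop reaches later. A minimal worklist sketch of that pattern (Pending and add_pending are invented names):

#include <stdio.h>
#include <stdlib.h>

typedef struct Pending
{
	int			target;			/* e.g. a block number to visit */
	int			done;
	struct Pending *next;
} Pending;

static Pending *head = NULL;
static Pending **tail = &head;

/* append a new work item at the end of the list */
static void
add_pending(int target)
{
	Pending    *p = calloc(1, sizeof(Pending));

	p->target = target;
	*tail = p;
	tail = &p->next;
}

int
main(void)
{
	add_pending(10);
	add_pending(20);

	for (Pending *item = head; item != NULL; item = item->next)
	{
		if (item->done)
			continue;			/* ignore already-done items */
		printf("visiting %d\n", item->target);
		if (item->target == 10)
			add_pending(30);	/* processing may enqueue more work */
		item->done = 1;
	}

	/* release the list */
	while (head)
	{
		Pending    *next = head->next;

		free(head);
		head = next;
	}
	return 0;
}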
/* * Move tuples from pending pages into regular GIN structure. * * On first glance it looks completely not crash-safe. But if we crash * after posting entries to the main index and before removing them from the * pending list, it's okay because when we redo the posting later on, nothing * bad will happen. * * fill_fsm indicates that ginInsertCleanup should add deleted pages * to FSM otherwise caller is responsible to put deleted pages into * FSM. * * If stats isn't null, we count deleted pending pages into the counts. */ void ginInsertCleanup(GinState *ginstate, bool full_clean, bool fill_fsm, IndexBulkDeleteResult *stats) { Relation index = ginstate->index; Buffer metabuffer, buffer; Page metapage, page; GinMetaPageData *metadata; MemoryContext opCtx, oldCtx; BuildAccumulator accum; KeyArray datums; BlockNumber blkno, blknoFinish; bool cleanupFinish = false; bool fsm_vac = false; Size workMemory; bool inVacuum = (stats == NULL); /* * We would like to prevent concurrent cleanup process. For that we will * lock metapage in exclusive mode using LockPage() call. Nobody other * will use that lock for metapage, so we keep possibility of concurrent * insertion into pending list */ if (inVacuum) { /* * We are called from [auto]vacuum/analyze or gin_clean_pending_list() * and we would like to wait concurrent cleanup to finish. */ LockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock); workMemory = (IsAutoVacuumWorkerProcess() && autovacuum_work_mem != -1) ? autovacuum_work_mem : maintenance_work_mem; } else { /* * We are called from regular insert and if we see concurrent cleanup * just exit in hope that concurrent process will clean up pending * list. */ if (!ConditionalLockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock)) return; workMemory = work_mem; } metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); LockBuffer(metabuffer, GIN_SHARE); metapage = BufferGetPage(metabuffer); metadata = GinPageGetMeta(metapage); if (metadata->head == InvalidBlockNumber) { /* Nothing to do */ UnlockReleaseBuffer(metabuffer); UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock); return; } /* * Remember a tail page to prevent infinite cleanup if other backends add * new tuples faster than we can cleanup. */ blknoFinish = metadata->tail; /* * Read and lock head of pending list */ blkno = metadata->head; buffer = ReadBuffer(index, blkno); LockBuffer(buffer, GIN_SHARE); page = BufferGetPage(buffer); LockBuffer(metabuffer, GIN_UNLOCK); /* * Initialize. All temporary space will be in opCtx */ opCtx = AllocSetContextCreate(CurrentMemoryContext, "GIN insert cleanup temporary context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); oldCtx = MemoryContextSwitchTo(opCtx); initKeyArray(&datums, 128); ginInitBA(&accum); accum.ginstate = ginstate; /* * At the top of this loop, we have pin and lock on the current page of * the pending list. However, we'll release that before exiting the loop. * Note we also have pin but not lock on the metapage. */ for (;;) { Assert(!GinPageIsDeleted(page)); /* * Are we walk through the page which as we remember was a tail when * we start our cleanup? But if caller asks us to clean up whole * pending list then ignore old tail, we will work until list becomes * empty. */ if (blkno == blknoFinish && full_clean == false) cleanupFinish = true; /* * read page's datums into accum */ processPendingPage(&accum, &datums, page, FirstOffsetNumber); vacuum_delay_point(); /* * Is it time to flush memory to disk? 
Flush if we are at the end of * the pending list, or if we have a full row and memory is getting * full. */ if (GinPageGetOpaque(page)->rightlink == InvalidBlockNumber || (GinPageHasFullRow(page) && (accum.allocatedMemory >= workMemory * 1024L))) { ItemPointerData *list; uint32 nlist; Datum key; GinNullCategory category; OffsetNumber maxoff, attnum; /* * Unlock current page to increase performance. Changes of page * will be checked later by comparing maxoff after completion of * memory flush. */ maxoff = PageGetMaxOffsetNumber(page); LockBuffer(buffer, GIN_UNLOCK); /* * Moving collected data into regular structure can take * significant amount of time - so, run it without locking pending * list. */ ginBeginBAScan(&accum); while ((list = ginGetBAEntry(&accum, &attnum, &key, &category, &nlist)) != NULL) { ginEntryInsert(ginstate, attnum, key, category, list, nlist, NULL); vacuum_delay_point(); } /* * Lock the whole list to remove pages */ LockBuffer(metabuffer, GIN_EXCLUSIVE); LockBuffer(buffer, GIN_SHARE); Assert(!GinPageIsDeleted(page)); /* * While we left the page unlocked, more stuff might have gotten * added to it. If so, process those entries immediately. There * shouldn't be very many, so we don't worry about the fact that * we're doing this with exclusive lock. Insertion algorithm * guarantees that inserted row(s) will not continue on next page. * NOTE: intentionally no vacuum_delay_point in this loop. */ if (PageGetMaxOffsetNumber(page) != maxoff) { ginInitBA(&accum); processPendingPage(&accum, &datums, page, maxoff + 1); ginBeginBAScan(&accum); while ((list = ginGetBAEntry(&accum, &attnum, &key, &category, &nlist)) != NULL) ginEntryInsert(ginstate, attnum, key, category, list, nlist, NULL); } /* * Remember next page - it will become the new list head */ blkno = GinPageGetOpaque(page)->rightlink; UnlockReleaseBuffer(buffer); /* shiftList will do exclusive * locking */ /* * remove read pages from pending list, at this point all content * of read pages is in regular structure */ shiftList(index, metabuffer, blkno, fill_fsm, stats); /* At this point, some pending pages have been freed up */ fsm_vac = true; Assert(blkno == metadata->head); LockBuffer(metabuffer, GIN_UNLOCK); /* * if we removed the whole pending list or we cleanup tail (which * we remembered on start our cleanup process) then just exit */ if (blkno == InvalidBlockNumber || cleanupFinish) break; /* * release memory used so far and reinit state */ MemoryContextReset(opCtx); initKeyArray(&datums, datums.maxvalues); ginInitBA(&accum); } else { blkno = GinPageGetOpaque(page)->rightlink; UnlockReleaseBuffer(buffer); } /* * Read next page in pending list */ vacuum_delay_point(); buffer = ReadBuffer(index, blkno); LockBuffer(buffer, GIN_SHARE); page = BufferGetPage(buffer); } UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock); ReleaseBuffer(metabuffer); /* * As pending list pages can have a high churn rate, it is desirable to * recycle them immediately to the FreeSpace Map when ordinary backends * clean the list. */ if (fsm_vac && fill_fsm) IndexFreeSpaceMapVacuum(index); /* Clean up temporary space */ MemoryContextSwitchTo(oldCtx); MemoryContextDelete(opCtx); }
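The maxoff snapshot above is an optimistic-concurrency trick: remember how full the page was, drop the lock for the slow bulk insert, then re-lock and process only the entries that arrived in the meantime. A stripped-down sketch with a plain array standing in for the page and the locking elided:

#include <stdio.h>

#define PAGE_CAP 8

static int	page[PAGE_CAP];
static int	nitems = 0;

static void
process(int from, int to)
{
	for (int i = from; i < to; i++)
		printf("inserting entry %d into main structure\n", page[i]);
}

int
main(void)
{
	/* initial contents of the pending page */
	for (int i = 0; i < 4; i++)
		page[nitems++] = i;

	int			maxoff = nitems;	/* snapshot taken while "locked" */

	/* lock released here; the slow bulk insert runs without it */
	process(0, maxoff);

	/* meanwhile a concurrent backend appended two more entries */
	page[nitems++] = 100;
	page[nitems++] = 101;

	/* lock re-acquired: pick up only what appeared after the snapshot */
	if (nitems != maxoff)
		process(maxoff, nitems);
	return 0;
}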
/* * btvacuumpage --- VACUUM one page * * This processes a single page for btvacuumscan(). In some cases we * must go back and re-examine previously-scanned pages; this routine * recurses when necessary to handle that case. * * blkno is the page to process. orig_blkno is the highest block number * reached by the outer btvacuumscan loop (the same as blkno, unless we * are recursing to re-examine a previous page). */ static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) { IndexVacuumInfo *info = vstate->info; IndexBulkDeleteResult *stats = vstate->stats; IndexBulkDeleteCallback callback = vstate->callback; void *callback_state = vstate->callback_state; Relation rel = info->index; bool delete_now; BlockNumber recurse_to; Buffer buf; Page page; BTPageOpaque opaque = NULL; restart: delete_now = false; recurse_to = P_NONE; /* call vacuum_delay_point while not holding any buffer lock */ vacuum_delay_point(); /* * We can't use _bt_getbuf() here because it always applies * _bt_checkpage(), which will barf on an all-zero page. We want to * recycle all-zero pages, not fail. Also, we want to use a nondefault * buffer access strategy. */ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buf, BT_READ); page = BufferGetPage(buf); if (!PageIsNew(page)) { _bt_checkpage(rel, buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); } /* * If we are recursing, the only case we want to do anything with is a * live leaf page having the current vacuum cycle ID. Any other state * implies we already saw the page (eg, deleted it as being empty). */ if (blkno != orig_blkno) { if (_bt_page_recyclable(page) || P_IGNORE(opaque) || !P_ISLEAF(opaque) || opaque->btpo_cycleid != vstate->cycleid) { _bt_relbuf(rel, buf); return; } } /* Page is valid, see what to do with it */ if (_bt_page_recyclable(page)) { /* Okay to recycle this page */ RecordFreeIndexPage(rel, blkno); vstate->totFreePages++; stats->pages_deleted++; } else if (P_ISDELETED(opaque)) { /* Already deleted, but can't recycle yet */ stats->pages_deleted++; } else if (P_ISHALFDEAD(opaque)) { /* Half-dead, try to delete */ delete_now = true; } else if (P_ISLEAF(opaque)) { OffsetNumber deletable[MaxOffsetNumber]; int ndeletable; OffsetNumber offnum, minoff, maxoff; /* * Trade in the initial read lock for a super-exclusive write lock on * this page. We must get such a lock on every leaf page over the * course of the vacuum scan, whether or not it actually contains any * deletable tuples --- see nbtree/README. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); /* * Remember highest leaf page number we've taken cleanup lock on; see * notes in btvacuumscan */ if (blkno > vstate->lastBlockLocked) vstate->lastBlockLocked = blkno; /* * Check whether we need to recurse back to earlier pages. What we * are concerned about is a page split that happened since we started * the vacuum scan. If the split moved some tuples to a lower page * then we might have missed 'em. If so, set up for tail recursion. * (Must do this before possibly clearing btpo_cycleid below!) */ if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid && !(opaque->btpo_flags & BTP_SPLIT_END) && !P_RIGHTMOST(opaque) && opaque->btpo_next < orig_blkno) recurse_to = opaque->btpo_next; /* * Scan over all items to see which ones need deleted according to the * callback function. 
*/ ndeletable = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) { for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { IndexTuple itup; ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); htup = &(itup->t_tid); /* * During Hot Standby we currently assume that * XLOG_BTREE_VACUUM records do not produce conflicts. That is * only true as long as the callback function depends only * upon whether the index tuple refers to heap tuples removed * in the initial heap scan. When vacuum starts it derives a * value of OldestXmin. Backends taking later snapshots could * have a RecentGlobalXmin with a later xid than the vacuum's * OldestXmin, so it is possible that row versions deleted * after OldestXmin could be marked as killed by other * backends. The callback function *could* look at the index * tuple state in isolation and decide to delete the index * tuple, though currently it does not. If it ever did, we * would need to reconsider whether XLOG_BTREE_VACUUM records * should cause conflicts. If they did cause conflicts they * would be fairly harsh conflicts, since we haven't yet * worked out a way to pass a useful value for * latestRemovedXid on the XLOG_BTREE_VACUUM records. This * applies to *any* type of index that marks index tuples as * killed. */ if (callback(htup, callback_state)) deletable[ndeletable++] = offnum; } } /* * Apply any needed deletes. We issue just one _bt_delitems_vacuum() * call per page, so as to minimize WAL traffic. */ if (ndeletable > 0) { /* * Notice that the issued XLOG_BTREE_VACUUM WAL record includes * all information to the replay code to allow it to get a cleanup * lock on all pages between the previous lastBlockVacuumed and * this page. This ensures that WAL replay locks all leaf pages at * some point, which is important should non-MVCC scans be * requested. This is currently unused on standby, but we record * it anyway, so that the WAL contains the required information. * * Since we can visit leaf pages out-of-order when recursing, * replay might end up locking such pages an extra time, but it * doesn't seem worth the amount of bookkeeping it'd take to avoid * that. */ _bt_delitems_vacuum(rel, buf, deletable, ndeletable, vstate->lastBlockVacuumed); /* * Remember highest leaf page number we've issued a * XLOG_BTREE_VACUUM WAL record for. */ if (blkno > vstate->lastBlockVacuumed) vstate->lastBlockVacuumed = blkno; stats->tuples_removed += ndeletable; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); } else { /* * If the page has been split during this vacuum cycle, it seems * worth expending a write to clear btpo_cycleid even if we don't * have any deletions to do. (If we do, _bt_delitems_vacuum takes * care of this.) This ensures we won't process the page again. * * We treat this like a hint-bit update because there's no need to * WAL-log it. */ if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid) { opaque->btpo_cycleid = 0; MarkBufferDirtyHint(buf, true); } } /* * If it's now empty, try to delete; else count the live tuples. We * don't delete when recursing, though, to avoid putting entries into * freePages out-of-order (doesn't seem worth any extra code to handle * the case). 
*/ if (minoff > maxoff) delete_now = (blkno == orig_blkno); else stats->num_index_tuples += maxoff - minoff + 1; } if (delete_now) { MemoryContext oldcontext; int ndel; /* Run pagedel in a temp context to avoid memory leakage */ MemoryContextReset(vstate->pagedelcontext); oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); ndel = _bt_pagedel(rel, buf); /* count only this page, else may double-count parent */ if (ndel) stats->pages_deleted++; MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ } else _bt_relbuf(rel, buf); /* * This is really tail recursion, but if the compiler is too stupid to * optimize it as such, we'd eat an uncomfortably large amount of stack * space per recursion level (due to the deletable[] array). A failure is * improbable since the number of levels isn't likely to be large ... but * just in case, let's hand-optimize into a loop. */ if (recurse_to != P_NONE) { blkno = recurse_to; goto restart; } }
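The goto restart at the bottom is a hand-written tail-call optimization: re-entering the same frame avoids stacking another deletable[MaxOffsetNumber] array per recursion level. A toy illustration of the transform (the page chain and scratch buffer are made up):

#include <stdio.h>

#define NPAGES 5

/* -1 means "no further page to recurse to" */
static int	next_page[NPAGES] = {2, -1, 4, -1, -1};

static void
visit(int blkno)
{
	char		scratch[1024];	/* stands in for deletable[MaxOffsetNumber] */

restart:
	scratch[0] = 0;
	printf("vacuuming page %d (scratch at %p)\n", blkno, (void *) scratch);

	if (next_page[blkno] >= 0)
	{
		/* the tail call visit(next_page[blkno]) rewritten as a jump:
		 * the same stack frame (and scratch buffer) is reused */
		blkno = next_page[blkno];
		goto restart;
	}
}

int
main(void)
{
	visit(0);					/* visits 0, 2, 4 without growing the stack */
	return 0;
}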
/* * compute_tsvector_stats() -- compute statistics for a tsvector column * * This functions computes statistics that are useful for determining @@ * operations' selectivity, along with the fraction of non-null rows and * average width. * * Instead of finding the most common values, as we do for most datatypes, * we're looking for the most common lexemes. This is more useful, because * there most probably won't be any two rows with the same tsvector and thus * the notion of a MCV is a bit bogus with this datatype. With a list of the * most common lexemes we can do a better job at figuring out @@ selectivity. * * For the same reasons we assume that tsvector columns are unique when * determining the number of distinct values. * * The algorithm used is Lossy Counting, as proposed in the paper "Approximate * frequency counts over data streams" by G. S. Manku and R. Motwani, in * Proceedings of the 28th International Conference on Very Large Data Bases, * Hong Kong, China, August 2002, section 4.2. The paper is available at * http://www.vldb.org/conf/2002/S10P03.pdf * * The Lossy Counting (aka LC) algorithm goes like this: * Let D be a set of triples (e, f, d), where e is an element value, f is * that element's frequency (occurrence count) and d is the maximum error in * f. We start with D empty and process the elements in batches of size * w. (The batch size is also known as "bucket size".) Let the current batch * number be b_current, starting with 1. For each element e we either * increment its f count, if it's already in D, or insert a new triple into D * with values (e, 1, b_current - 1). After processing each batch we prune D, * by removing from it all elements with f + d <= b_current. Finally, we * gather elements with largest f. The LC paper proves error bounds on f * dependent on the batch size w, and shows that the required table size * is no more than a few times w. * * We use a hashtable for the D structure and a bucket width of * statistics_target * 10, where 10 is an arbitrarily chosen constant, * meant to approximate the number of lexemes in a single tsvector. */ static void compute_tsvector_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows) { int num_mcelem; int null_cnt = 0; double total_width = 0; /* This is D from the LC algorithm. */ HTAB *lexemes_tab; HASHCTL hash_ctl; HASH_SEQ_STATUS scan_status; /* This is the current bucket number from the LC algorithm */ int b_current; /* This is 'w' from the LC algorithm */ int bucket_width; int vector_no, lexeme_no; LexemeHashKey hash_key; TrackItem *item; /* We want statistics_target * 10 lexemes in the MCELEM array */ num_mcelem = stats->attr->attstattarget * 10; /* * We set bucket width equal to the target number of result lexemes. This * is probably about right but perhaps might need to be scaled up or down * a bit? */ bucket_width = num_mcelem; /* * Create the hashtable. It will be in local memory, so we don't need to * worry about initial size too much. Also we don't need to pay any * attention to locking and memory management. */ MemSet(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(LexemeHashKey); hash_ctl.entrysize = sizeof(TrackItem); hash_ctl.hash = lexeme_hash; hash_ctl.match = lexeme_match; hash_ctl.hcxt = CurrentMemoryContext; lexemes_tab = hash_create("Analyzed lexemes table", bucket_width * 4, &hash_ctl, HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT); /* Initialize counters. */ b_current = 1; lexeme_no = 1; /* Loop over the tsvectors. 
*/ for (vector_no = 0; vector_no < samplerows; vector_no++) { Datum value; bool isnull; TSVector vector; WordEntry *curentryptr; char *lexemesptr; int j; vacuum_delay_point(); value = fetchfunc(stats, vector_no, &isnull); /* * Check for null/nonnull. */ if (isnull) { null_cnt++; continue; } /* * Add up widths for average-width calculation. Since it's a * tsvector, we know it's varlena. As in the regular * compute_minimal_stats function, we use the toasted width for this * calculation. */ total_width += VARSIZE_ANY(DatumGetPointer(value)); /* * Now detoast the tsvector if needed. */ vector = DatumGetTSVector(value); /* * We loop through the lexemes in the tsvector and add them to our * tracking hashtable. Note: the hashtable entries will point into * the (detoasted) tsvector value, therefore we cannot free that * storage until we're done. */ lexemesptr = STRPTR(vector); curentryptr = ARRPTR(vector); for (j = 0; j < vector->size; j++) { bool found; /* Construct a hash key */ hash_key.lexeme = lexemesptr + curentryptr->pos; hash_key.length = curentryptr->len; /* Lookup current lexeme in hashtable, adding it if new */ item = (TrackItem *) hash_search(lexemes_tab, (const void *) &hash_key, HASH_ENTER, &found); if (found) { /* The lexeme is already on the tracking list */ item->frequency++; } else { /* Initialize new tracking list element */ item->frequency = 1; item->delta = b_current - 1; } /* We prune the D structure after processing each bucket */ if (lexeme_no % bucket_width == 0) { prune_lexemes_hashtable(lexemes_tab, b_current); b_current++; } /* Advance to the next WordEntry in the tsvector */ lexeme_no++; curentryptr++; } } /* We can only compute real stats if we found some non-null values. */ if (null_cnt < samplerows) { int nonnull_cnt = samplerows - null_cnt; int i; TrackItem **sort_table; int track_len; int minfreq, maxfreq; stats->stats_valid = true; /* Do the simple null-frac and average width stats */ stats->stanullfrac = (double) null_cnt / (double) samplerows; stats->stawidth = total_width / (double) nonnull_cnt; /* Assume it's a unique column (see notes above) */ stats->stadistinct = -1.0; /* * Determine the top-N lexemes by simply copying pointers from the * hashtable into an array and applying qsort() */ track_len = hash_get_num_entries(lexemes_tab); sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * track_len); hash_seq_init(&scan_status, lexemes_tab); i = 0; while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL) { sort_table[i++] = item; } Assert(i == track_len); qsort(sort_table, track_len, sizeof(TrackItem *), trackitem_compare_frequencies_desc); /* Suppress any single-occurrence items */ while (track_len > 0) { if (sort_table[track_len - 1]->frequency > 1) break; track_len--; } /* Determine the number of most common lexemes to be stored */ if (num_mcelem > track_len) num_mcelem = track_len; /* Generate MCELEM slot entry */ if (num_mcelem > 0) { MemoryContext old_context; Datum *mcelem_values; float4 *mcelem_freqs; /* Grab the minimal and maximal frequencies that will get stored */ minfreq = sort_table[num_mcelem - 1]->frequency; maxfreq = sort_table[0]->frequency; /* * We want to store statistics sorted on the lexeme value using * first length, then byte-for-byte comparison. The reason for * doing length comparison first is that we don't care about the * ordering so long as it's consistent, and comparing lengths * first gives us a chance to avoid a strncmp() call. 
* * This is different from what we do with scalar statistics -- * they get sorted on frequencies. The rationale is that we * usually search through most common elements looking for a * specific value, so we can grab its frequency. When values are * presorted we can employ binary search for that. See * ts_selfuncs.c for a real usage scenario. */ qsort(sort_table, num_mcelem, sizeof(TrackItem *), trackitem_compare_lexemes); /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); /* * We sorted statistics on the lexeme value, but we want to be * able to find out the minimal and maximal frequency without * going through all the values. We keep those two extra * frequencies in two extra cells in mcelem_freqs. */ mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum)); mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4)); for (i = 0; i < num_mcelem; i++) { TrackItem *item = sort_table[i]; mcelem_values[i] = PointerGetDatum(cstring_to_text_with_len(item->key.lexeme, item->key.length)); mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt; } mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt; mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt; MemoryContextSwitchTo(old_context); stats->stakind[0] = STATISTIC_KIND_MCELEM; stats->staop[0] = TextEqualOperator; stats->stanumbers[0] = mcelem_freqs; /* See above comment about two extra frequency fields */ stats->numnumbers[0] = num_mcelem + 2; stats->stavalues[0] = mcelem_values; stats->numvalues[0] = num_mcelem; /* We are storing text values */ stats->statypid[0] = TEXTOID; stats->statyplen[0] = -1; /* typlen, -1 for varlena */ stats->statypbyval[0] = false; stats->statypalign[0] = 'i'; } } else { /* We found only nulls; assume the column is entirely null */ stats->stats_valid = true; stats->stanullfrac = 1.0; stats->stawidth = 0; /* "unknown" */ stats->stadistinct = 0.0; /* "unknown" */ } /* * We don't need to bother cleaning up any of our temporary palloc's. The * hashtable should also go away, as it used a child memory context. */ }
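/*
 * Illustrative sketch only: a minimal, self-contained version of the Lossy
 * Counting loop described in the comment above, run over a stream of ints
 * rather than lexemes and using a flat array instead of the server's
 * dynahash table.  The names here (lc_item, lc_lookup_or_insert, lc_prune,
 * MAX_ITEMS, W) are invented for this example and are not part of the
 * PostgreSQL sources.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct lc_item
{
    int  value;   /* e: the element */
    int  f;       /* its observed frequency */
    int  delta;   /* maximum possible undercount (b_current - 1 at insert) */
    int  used;
} lc_item;

#define MAX_ITEMS 1024
#define W         100          /* bucket width, like statistics_target * 10 */

static lc_item lc_table[MAX_ITEMS];

static lc_item *
lc_lookup_or_insert(int value, int b_current)
{
    int i;

    for (i = 0; i < MAX_ITEMS; i++)
        if (lc_table[i].used && lc_table[i].value == value)
            return &lc_table[i];
    for (i = 0; i < MAX_ITEMS; i++)
        if (!lc_table[i].used)
        {
            lc_table[i].used = 1;
            lc_table[i].value = value;
            lc_table[i].f = 0;
            lc_table[i].delta = b_current - 1;
            return &lc_table[i];
        }
    return NULL;                /* table full; a real implementation grows it */
}

static void
lc_prune(int b_current)
{
    int i;

    /* drop every tracked element whose f + d <= b_current */
    for (i = 0; i < MAX_ITEMS; i++)
        if (lc_table[i].used && lc_table[i].f + lc_table[i].delta <= b_current)
            lc_table[i].used = 0;
}

int
main(void)
{
    int b_current = 1;
    int element_no;

    for (element_no = 1; element_no <= 100000; element_no++)
    {
        int      e = rand() % 500;      /* stand-in for the next lexeme */
        lc_item *it = lc_lookup_or_insert(e, b_current);

        if (it)
            it->f++;

        /* prune D and start a new bucket at each bucket boundary */
        if (element_no % W == 0)
        {
            lc_prune(b_current);
            b_current++;
        }
    }

    /* whatever is still tracked with a large f is a candidate MCE */
    for (int i = 0; i < MAX_ITEMS; i++)
        if (lc_table[i].used && lc_table[i].f > 1)
            printf("value %d seen ~%d times (undercount <= %d)\n",
                   lc_table[i].value, lc_table[i].f, lc_table[i].delta);
    return 0;
}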
Datum ginvacuumcleanup(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); Relation index = info->index; bool needLock; BlockNumber npages, blkno; BlockNumber totFreePages; GinState ginstate; GinStatsData idxStat; /* * In an autovacuum analyze, we want to clean up pending insertions. * Otherwise, an ANALYZE-only call is a no-op. */ if (info->analyze_only) { if (IsAutoVacuumWorkerProcess()) { initGinState(&ginstate, index); ginInsertCleanup(&ginstate, true, stats); } PG_RETURN_POINTER(stats); } /* * Set up all-zero stats and cleanup pending inserts if ginbulkdelete * wasn't called */ if (stats == NULL) { stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); initGinState(&ginstate, index); ginInsertCleanup(&ginstate, true, stats); } memset(&idxStat, 0, sizeof(idxStat)); /* * XXX we always report the heap tuple count as the number of index * entries. This is bogus if the index is partial, but it's real hard to * tell how many distinct heap entries are referenced by a GIN index. */ stats->num_index_tuples = info->num_heap_tuples; stats->estimated_count = info->estimated_count; /* * Need lock unless it's local to this backend. */ needLock = !RELATION_IS_LOCAL(index); if (needLock) LockRelationForExtension(index, ExclusiveLock); npages = RelationGetNumberOfBlocks(index); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); totFreePages = 0; for (blkno = GIN_ROOT_BLKNO; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIN_SHARE); page = (Page) BufferGetPage(buffer); if (GinPageIsDeleted(page)) { Assert(blkno != GIN_ROOT_BLKNO); RecordFreeIndexPage(index, blkno); totFreePages++; } else if (GinPageIsData(page)) { idxStat.nDataPages++; } else if (!GinPageIsList(page)) { idxStat.nEntryPages++; if (GinPageIsLeaf(page)) idxStat.nEntries += PageGetMaxOffsetNumber(page); } UnlockReleaseBuffer(buffer); } /* Update the metapage with accurate page and entry counts */ idxStat.nTotalPages = npages; ginUpdateStats(info->index, &idxStat); /* Finally, vacuum the FSM */ IndexFreeSpaceMapVacuum(info->index); stats->pages_free = totFreePages; if (needLock) LockRelationForExtension(index, ExclusiveLock); stats->num_pages = RelationGetNumberOfBlocks(index); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); PG_RETURN_POINTER(stats); }
/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ Datum rtbulkdelete(PG_FUNCTION_ARGS) { Relation rel = (Relation) PG_GETARG_POINTER(0); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1); void *callback_state = (void *) PG_GETARG_POINTER(2); IndexBulkDeleteResult *result; BlockNumber num_pages; double tuples_removed; double num_index_tuples; IndexScanDesc iscan; tuples_removed = 0; num_index_tuples = 0; /* * Since rtree is not marked "amconcurrent" in pg_am, caller should have * acquired exclusive lock on index relation. We need no locking here. */ /* * XXX generic implementation --- should be improved! */ /* walk through the entire index */ iscan = index_beginscan(NULL, rel, SnapshotAny, 0, NULL); /* including killed tuples */ iscan->ignore_killed_tuples = false; while (index_getnext_indexitem(iscan, ForwardScanDirection)) { vacuum_delay_point(); if (callback(&iscan->xs_ctup.t_self, callback_state)) { ItemPointerData indextup = iscan->currentItemData; BlockNumber blkno; OffsetNumber offnum; Buffer buf; Page page; blkno = ItemPointerGetBlockNumber(&indextup); offnum = ItemPointerGetOffsetNumber(&indextup); /* adjust any scans that will be affected by this deletion */ /* (namely, my own scan) */ rtadjscans(rel, RTOP_DEL, blkno, offnum); /* delete the index tuple */ buf = ReadBuffer(rel, blkno); page = BufferGetPage(buf); PageIndexTupleDelete(page, offnum); WriteBuffer(buf); tuples_removed += 1; } else num_index_tuples += 1; } index_endscan(iscan); /* return statistics */ num_pages = RelationGetNumberOfBlocks(rel); result = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); result->num_pages = num_pages; result->num_index_tuples = num_index_tuples; result->tuples_removed = tuples_removed; PG_RETURN_POINTER(result); }
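/*
 * Illustrative sketch only: the callback contract shared by the bulk-delete
 * functions in this file, reduced to plain C.  The scan hands each heap TID
 * to a caller-supplied predicate; entries for which it returns true are
 * removed, the rest are counted as surviving index tuples.  The names tid,
 * bulkdelete_callback, dead_tids, and bulk_delete are simplified stand-ins
 * invented for this example, not the real PostgreSQL types.
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct { unsigned block; unsigned short offset; } tid;

typedef bool (*bulkdelete_callback) (const tid *heap_tid, void *state);

/* state for the sample callback: a list of heap TIDs known to be dead */
typedef struct { const tid *dead; int ndead; } dead_tids;

static bool
sample_callback(const tid *heap_tid, void *state)
{
    dead_tids *dt = (dead_tids *) state;

    for (int i = 0; i < dt->ndead; i++)
        if (dt->dead[i].block == heap_tid->block &&
            dt->dead[i].offset == heap_tid->offset)
            return true;        /* this heap tuple is being vacuumed away */
    return false;
}

/* walk the "index entries" (just TIDs here) and apply the callback to each */
static void
bulk_delete(const tid *entries, int nentries,
            bulkdelete_callback callback, void *state,
            double *tuples_removed, double *num_index_tuples)
{
    for (int i = 0; i < nentries; i++)
    {
        if (callback(&entries[i], state))
            *tuples_removed += 1;       /* real code deletes the index tuple */
        else
            *num_index_tuples += 1;
    }
}

int
main(void)
{
    tid       entries[] = {{1, 1}, {1, 2}, {2, 1}, {3, 7}};
    tid       dead[] = {{1, 2}, {3, 7}};
    dead_tids state = {dead, 2};
    double    removed = 0, kept = 0;

    bulk_delete(entries, 4, sample_callback, &state, &removed, &kept);
    printf("removed %.0f, kept %.0f\n", removed, kept);
    return 0;
}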
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, List *updated_stats) { MIRROREDLOCK_BUFMGR_DECLARE; BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; int reindex_count = 1; PGRUsage ru0; /* Fetch gp_persistent_relation_node information that will be added to XLOG record. */ RelationFetchGpRelationNodeForXLog(onerel); pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->nonempty_pages = 0; lazy_space_alloc(vacrelstats, nblocks); for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; vacuum_delay_point(); /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); reindex_count++; /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* Forget the now-vacuumed tuples, and press on */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } /* -------- MirroredLock ---------- */ MIRROREDLOCK_BUFMGR_LOCK; buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ LockBufferForCleanup(buf); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. 
*/ LockBuffer(buf, BUFFER_LOCK_UNLOCK); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); /* -------- MirroredLock ---------- */ MIRROREDLOCK_BUFMGR_LOCK; LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); /* must record in xlog so that changetracking will know about this change */ log_heap_newpage(onerel, page, blkno); empty_pages++; lazy_record_free_space(vacrelstats, blkno, PageGetHeapFreeSpace(page)); } MarkBufferDirty(buf); UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ continue; } if (PageIsEmpty(page)) { empty_pages++; lazy_record_free_space(vacrelstats, blkno, PageGetHeapFreeSpace(page)); UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, false); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(onerel, tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. */ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. 
*/ nkeep += 1; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); tups_vacuumed += 1; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, &FreezeLimit, InvalidBuffer, false)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); /* no XLOG for temp tables, though */ if (!onerel->rd_istemp) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* Forget the now-vacuumed tuples, and press on */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) { lazy_record_free_space(vacrelstats, blkno, PageGetHeapFreeSpace(page)); } /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ } /* save stats for use later */ vacrelstats->rel_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); reindex_count++; /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats, updated_stats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages contain useful free space.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, vacrelstats->tot_free_pages, empty_pages, pg_rusage_show(&ru0)))); }
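/*
 * Illustrative sketch only: the batching logic lazy_scan_heap() applies to
 * its dead-tuple TID array, with the index and heap passes reduced to stub
 * functions.  MAX_DEAD, MAX_PER_PAGE, scan_page_for_dead_tuples, and the
 * stub names are invented for this example; the real limits come from
 * maintenance_work_mem and MaxHeapTuplesPerPage.
 */
#include <stdio.h>

#define MAX_DEAD      32
#define MAX_PER_PAGE  8         /* stand-in for MaxHeapTuplesPerPage */

static int dead[MAX_DEAD];
static int num_dead = 0;
static int index_scans = 0;

/* pretend each page contributes a few dead tuple ids */
static int
scan_page_for_dead_tuples(int blkno)
{
    int found = blkno % 3;      /* arbitrary: 0..2 dead tuples per page */

    for (int i = 0; i < found && num_dead < MAX_DEAD; i++)
        dead[num_dead++] = blkno * 100 + i;
    return found;
}

static void vacuum_indexes(void) { /* remove index entries for dead[] */ }
static void vacuum_heap(void)    { /* mark dead[] slots unused in the heap */ }

int
main(void)
{
    for (int blkno = 0; blkno < 100; blkno++)
    {
        /*
         * If we are close to overrunning the dead-TID array, run a full
         * index + heap vacuum cycle before scanning this page, just as the
         * real code does before reading the next buffer.
         */
        if (MAX_DEAD - num_dead < MAX_PER_PAGE && num_dead > 0)
        {
            vacuum_indexes();
            vacuum_heap();
            num_dead = 0;
            index_scans++;
        }
        scan_page_for_dead_tuples(blkno);
    }

    /* final cycle for whatever is left over */
    if (num_dead > 0)
    {
        vacuum_indexes();
        vacuum_heap();
        index_scans++;
    }
    printf("needed %d index scan cycles\n", index_scans);
    return 0;
}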
Datum ginvacuumcleanup(PG_FUNCTION_ARGS) { MIRROREDLOCK_BUFMGR_DECLARE; IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); Relation index = info->index; bool needLock; BlockNumber npages, blkno; BlockNumber totFreePages, nFreePages, *freePages, maxFreePages; BlockNumber lastBlock = GIN_ROOT_BLKNO, lastFilledBlock = GIN_ROOT_BLKNO; /* Set up all-zero stats if ginbulkdelete wasn't called */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* * XXX we always report the heap tuple count as the number of index * entries. This is bogus if the index is partial, but it's real hard to * tell how many distinct heap entries are referenced by a GIN index. */ stats->num_index_tuples = info->num_heap_tuples; /* * If vacuum full, we already have exclusive lock on the index. Otherwise, * need lock unless it's local to this backend. */ if (info->vacuum_full) needLock = false; else needLock = !RELATION_IS_LOCAL(index); if (needLock) LockRelationForExtension(index, ExclusiveLock); npages = RelationGetNumberOfBlocks(index); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); maxFreePages = npages; if (maxFreePages > MaxFSMPages) maxFreePages = MaxFSMPages; totFreePages = nFreePages = 0; freePages = (BlockNumber *) palloc(sizeof(BlockNumber) * maxFreePages); for (blkno = GIN_ROOT_BLKNO + 1; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); // -------- MirroredLock ---------- MIRROREDLOCK_BUFMGR_LOCK; buffer = ReadBuffer(index, blkno); LockBuffer(buffer, GIN_SHARE); page = (Page) BufferGetPage(buffer); if (GinPageIsDeleted(page)) { if (nFreePages < maxFreePages) freePages[nFreePages++] = blkno; totFreePages++; } else lastFilledBlock = blkno; UnlockReleaseBuffer(buffer); MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- } lastBlock = npages - 1; if (info->vacuum_full && nFreePages > 0) { /* try to truncate index */ int i; for (i = 0; i < nFreePages; i++) if (freePages[i] >= lastFilledBlock) { totFreePages = nFreePages = i; break; } if (lastBlock > lastFilledBlock) RelationTruncate( index, lastFilledBlock + 1, /* markPersistentAsPhysicallyTruncated */ true); stats->pages_removed = lastBlock - lastFilledBlock; } RecordIndexFreeSpace(&index->rd_node, totFreePages, nFreePages, freePages); stats->pages_free = totFreePages; if (needLock) LockRelationForExtension(index, ExclusiveLock); stats->num_pages = RelationGetNumberOfBlocks(index); if (needLock) UnlockRelationForExtension(index, ExclusiveLock); PG_RETURN_POINTER(stats); }
Datum ginbulkdelete(PG_FUNCTION_ARGS) { MIRROREDLOCK_BUFMGR_DECLARE; IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(3); Relation index = info->index; BlockNumber blkno = GIN_ROOT_BLKNO; GinVacuumState gvs; Buffer buffer; BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))]; uint32 nRoot; /* first time through? */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* we'll re-count the tuples each time */ stats->num_index_tuples = 0; gvs.index = index; gvs.result = stats; gvs.callback = callback; gvs.callback_state = callback_state; initGinState(&gvs.ginstate, index); // -------- MirroredLock ---------- MIRROREDLOCK_BUFMGR_LOCK; buffer = ReadBuffer(index, blkno); /* find leaf page */ for (;;) { Page page = BufferGetPage(buffer); IndexTuple itup; LockBuffer(buffer, GIN_SHARE); Assert(!GinPageIsData(page)); if (GinPageIsLeaf(page)) { LockBuffer(buffer, GIN_UNLOCK); LockBuffer(buffer, GIN_EXCLUSIVE); if (blkno == GIN_ROOT_BLKNO && !GinPageIsLeaf(page)) { LockBuffer(buffer, GIN_UNLOCK); continue; /* check it one more */ } break; } Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber)); blkno = GinItemPointerGetBlockNumber(&(itup)->t_tid); Assert(blkno != InvalidBlockNumber); LockBuffer(buffer, GIN_UNLOCK); buffer = ReleaseAndReadBuffer(buffer, index, blkno); } /* right now we found leftmost page in entry's BTree */ for (;;) { Page page = BufferGetPage(buffer); Page resPage; uint32 i; Assert(!GinPageIsData(page)); resPage = ginVacuumEntryPage(&gvs, buffer, rootOfPostingTree, &nRoot); blkno = GinPageGetOpaque(page)->rightlink; if (resPage) { START_CRIT_SECTION(); PageRestoreTempPage(resPage, page); MarkBufferDirty(buffer); xlogVacuumPage(gvs.index, buffer); UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); } else { UnlockReleaseBuffer(buffer); } vacuum_delay_point(); for (i = 0; i < nRoot; i++) { ginVacuumPostingTree(&gvs, rootOfPostingTree[i]); vacuum_delay_point(); } if (blkno == InvalidBlockNumber) /* rightmost page */ break; buffer = ReadBuffer(index, blkno); LockBuffer(buffer, GIN_EXCLUSIVE); } MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- PG_RETURN_POINTER(gvs.result); }
/* * Helper function to perform deletion of index entries from a bucket. * * This function expects that the caller has acquired a cleanup lock on the * primary bucket page, and will return with a write lock again held on the * primary bucket page. The lock won't necessarily be held continuously, * though, because we'll release it when visiting overflow pages. * * It would be very bad if this function cleaned a page while some other * backend was in the midst of scanning it, because hashgettuple assumes * that the next valid TID will be greater than or equal to the current * valid TID. There can't be any concurrent scans in progress when we first * enter this function because of the cleanup lock we hold on the primary * bucket page, but as soon as we release that lock, there might be. We * handle that by conspiring to prevent those scans from passing our cleanup * scan. To do that, we lock the next page in the bucket chain before * releasing the lock on the previous page. (This type of lock chaining is * not ideal, so we might want to look for a better solution at some point.) * * We need to retain a pin on the primary bucket to ensure that no concurrent * split can start. */ void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, uint32 maxbucket, uint32 highmask, uint32 lowmask, double *tuples_removed, double *num_index_tuples, bool split_cleanup, IndexBulkDeleteCallback callback, void *callback_state) { BlockNumber blkno; Buffer buf; Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket; bool bucket_dirty = false; blkno = bucket_blkno; buf = bucket_buf; if (split_cleanup) new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket, lowmask, maxbucket); /* Scan each page in bucket */ for (;;) { HashPageOpaque opaque; OffsetNumber offno; OffsetNumber maxoffno; Buffer next_buf; Page page; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; bool retain_pin = false; bool curr_page_dirty = false; vacuum_delay_point(); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { ItemPointer htup; IndexTuple itup; Bucket bucket; bool kill_tuple = false; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); /* * To remove the dead tuples, we strictly want to rely on results * of callback function. refer btvacuumpage for detailed reason. */ if (callback && callback(htup, callback_state)) { kill_tuple = true; if (tuples_removed) *tuples_removed += 1; } else if (split_cleanup) { /* delete the tuples that are moved by split. */ bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); /* mark the item for deletion */ if (bucket != cur_bucket) { /* * We expect tuples to either belong to curent bucket or * new_bucket. This is ensured because we don't allow * further splits from bucket that contains garbage. See * comments in _hash_expandtable. 
*/ Assert(bucket == new_bucket); kill_tuple = true; } } if (kill_tuple) { /* mark the item for deletion */ deletable[ndeletable++] = offno; } else { /* we're keeping it, so count it */ if (num_index_tuples) *num_index_tuples += 1; } } /* retain the pin on primary bucket page till end of bucket scan */ if (blkno == bucket_blkno) retain_pin = true; else retain_pin = false; blkno = opaque->hasho_nextblkno; /* * Apply deletions, advance to next page and write page if needed. */ if (ndeletable > 0) { PageIndexMultiDelete(page, deletable, ndeletable); bucket_dirty = true; curr_page_dirty = true; } /* bail out if there are no more pages to scan. */ if (!BlockNumberIsValid(blkno)) break; next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); /* * release the lock on previous page after acquiring the lock on next * page */ if (curr_page_dirty) { if (retain_pin) _hash_chgbufaccess(rel, buf, HASH_WRITE, HASH_NOLOCK); else _hash_wrtbuf(rel, buf); curr_page_dirty = false; } else if (retain_pin) _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK); else _hash_relbuf(rel, buf); buf = next_buf; } /* * lock the bucket page to clear the garbage flag and squeeze the bucket. * if the current buffer is same as bucket buffer, then we already have * lock on bucket page. */ if (buf != bucket_buf) { _hash_relbuf(rel, buf); _hash_chgbufaccess(rel, bucket_buf, HASH_NOLOCK, HASH_WRITE); } /* * Clear the garbage flag from bucket after deleting the tuples that are * moved by split. We purposefully clear the flag before squeeze bucket, * so that after restart, vacuum shouldn't again try to delete the moved * by split tuples. */ if (split_cleanup) { HashPageOpaque bucket_opaque; Page page; page = BufferGetPage(bucket_buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; } /* * If we have deleted anything, try to compact free space. For squeezing * the bucket, we must have a cleanup lock, else it can impact the * ordering of tuples for a scan that has started before it. */ if (bucket_dirty && IsBufferCleanupOK(bucket_buf)) _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, bstrategy); else _hash_chgbufaccess(rel, bucket_buf, HASH_WRITE, HASH_NOLOCK); }
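/*
 * Illustrative sketch only: the "lock chaining" order hashbucketcleanup()
 * relies on, shown on a linked list of pages guarded by pthread mutexes
 * instead of buffer locks.  Acquiring the next page's lock before releasing
 * the current one means a concurrent scan walking the same chain can never
 * overtake the cleanup scan.  The page struct and walk_chain() are invented
 * for this example.
 */
#include <pthread.h>
#include <stdio.h>

typedef struct page
{
    pthread_mutex_t lock;
    int             pageno;
    struct page    *next;
} page;

static page p2 = {PTHREAD_MUTEX_INITIALIZER, 2, NULL};
static page p1 = {PTHREAD_MUTEX_INITIALIZER, 1, &p2};
static page p0 = {PTHREAD_MUTEX_INITIALIZER, 0, &p1};

static void
walk_chain(page *bucket_page)
{
    page *cur = bucket_page;

    pthread_mutex_lock(&cur->lock);
    while (cur)
    {
        page *next = cur->next;

        printf("cleaning page %d\n", cur->pageno);

        /* lock the next page in the chain *before* letting go of this one */
        if (next)
            pthread_mutex_lock(&next->lock);
        pthread_mutex_unlock(&cur->lock);
        cur = next;
    }
}

int
main(void)
{
    walk_chain(&p0);
    return 0;
}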
/* * Bulk deletion of all index entries pointing to a set of heap tuples and * check invalid tuples left after upgrade. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation rel = info->index; GistBDItem *stack, *ptr; /* first time through? */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* we'll re-count the tuples each time */ stats->estimated_count = false; stats->num_index_tuples = 0; stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; while (stack) { Buffer buffer; Page page; OffsetNumber i, maxoff; IndexTuple idxtuple; ItemId iid; buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); gistcheckpage(rel, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { OffsetNumber todelete[MaxOffsetNumber]; int ntodelete = 0; LockBuffer(buffer, GIST_UNLOCK); LockBuffer(buffer, GIST_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page)) { /* only the root can become non-leaf during relock */ UnlockReleaseBuffer(buffer); /* one more check */ continue; } /* * check for split proceeded after look at parent, we should check * it after relock */ pushStackIfSplited(page, stack); /* * Remove deletable tuples from page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); if (callback(&(idxtuple->t_tid), callback_state)) todelete[ntodelete++] = i; else stats->num_index_tuples += 1; } stats->tuples_removed += ntodelete; if (ntodelete) { START_CRIT_SECTION(); MarkBufferDirty(buffer); PageIndexMultiDelete(page, todelete, ntodelete); GistMarkTuplesDeleted(page); if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; recptr = gistXLogUpdate(buffer, todelete, ntodelete, NULL, 0, InvalidBuffer); PageSetLSN(page, recptr); } else PageSetLSN(page, gistGetFakeLSN(rel)); END_CRIT_SECTION(); } } else { /* check for split proceeded after look at parent */ pushStackIfSplited(page, stack); maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); ptr = (GistBDItem *) palloc(sizeof(GistBDItem)); ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); ptr->parentlsn = BufferGetLSNAtomic(buffer); ptr->next = stack->next; stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) ereport(LOG, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(rel)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); } } UnlockReleaseBuffer(buffer); ptr = stack->next; pfree(stack); stack = ptr; vacuum_delay_point(); } return stats; }
/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation rel = info->index; double tuples_removed; double num_index_tuples; double orig_ntuples; Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; Buffer metabuf; HashMetaPage metap; HashMetaPageData local_metapage; tuples_removed = 0; num_index_tuples = 0; /* * Read the metapage to fetch original bucket and tuple counts. Also, we * keep a copy of the last-seen metapage so that we can use its * hashm_spares[] values to compute bucket page addresses. This is a bit * hokey but perfectly safe, since the interesting entries in the spares * array cannot change under us; and it beats rereading the metapage for * each bucket. */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); orig_maxbucket = metap->hashm_maxbucket; orig_ntuples = metap->hashm_ntuples; memcpy(&local_metapage, metap, sizeof(local_metapage)); _hash_relbuf(rel, metabuf); /* Scan the buckets that we know exist */ cur_bucket = 0; cur_maxbucket = orig_maxbucket; loop_top: while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; BlockNumber blkno; bool bucket_dirty = false; /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket); /* Exclusive-lock the bucket so we can shrink it */ _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE); /* Shouldn't have any active scans locally, either */ if (_hash_has_active_scan(rel, cur_bucket)) elog(ERROR, "hash index has active scan during VACUUM"); /* Scan each page in bucket */ blkno = bucket_blkno; while (BlockNumberIsValid(blkno)) { Buffer buf; Page page; HashPageOpaque opaque; OffsetNumber offno; OffsetNumber maxoffno; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; vacuum_delay_point(); buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, info->strategy); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == cur_bucket); /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { IndexTuple itup; ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); if (callback(htup, callback_state)) { /* mark the item for deletion */ deletable[ndeletable++] = offno; tuples_removed += 1; } else num_index_tuples += 1; } /* * Apply deletions and write page if needed, advance to next page. 
*/ blkno = opaque->hasho_nextblkno; if (ndeletable > 0) { PageIndexMultiDelete(page, deletable, ndeletable); _hash_wrtbuf(rel, buf); bucket_dirty = true; } else _hash_relbuf(rel, buf); } /* If we deleted anything, try to compact free space */ if (bucket_dirty) _hash_squeezebucket(rel, cur_bucket, bucket_blkno, info->strategy); /* Release bucket lock */ _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE); /* Advance to next bucket */ cur_bucket++; } /* Write-lock metapage and check for split since we started */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); if (cur_maxbucket != metap->hashm_maxbucket) { /* There's been a split, so process the additional bucket(s) */ cur_maxbucket = metap->hashm_maxbucket; memcpy(&local_metapage, metap, sizeof(local_metapage)); _hash_relbuf(rel, metabuf); goto loop_top; } /* Okay, we're really done. Update tuple count in metapage. */ if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) { /* * No one has split or inserted anything since start of scan, so * believe our count as gospel. */ metap->hashm_ntuples = num_index_tuples; } else { /* * Otherwise, our count is untrustworthy since we may have * double-scanned tuples in split buckets. Proceed by dead-reckoning. * (Note: we still return estimated_count = false, because using this * count is better than not updating reltuples at all.) */ if (metap->hashm_ntuples > tuples_removed) metap->hashm_ntuples -= tuples_removed; else metap->hashm_ntuples = 0; num_index_tuples = metap->hashm_ntuples; } _hash_wrtbuf(rel, metabuf); /* return statistics */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); stats->estimated_count = false; stats->num_index_tuples = num_index_tuples; stats->tuples_removed += tuples_removed; /* hashvacuumcleanup will fill in num_pages */ return stats; }
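/*
 * Illustrative sketch only: the two ways hashbulkdelete() reconciles the
 * metapage tuple count after the scan, depending on whether a split or
 * insert happened while we were scanning.  The names meta_counts and
 * reconcile_ntuples are invented for this example.
 */
#include <stdio.h>

typedef struct
{
    unsigned maxbucket;
    double   ntuples;
} meta_counts;

static double
reconcile_ntuples(meta_counts orig, meta_counts *current,
                  double counted_live, double tuples_removed)
{
    if (orig.maxbucket == current->maxbucket &&
        orig.ntuples == current->ntuples)
    {
        /* nothing changed under us: our own count is trustworthy */
        current->ntuples = counted_live;
    }
    else
    {
        /*
         * A split or insert happened; our count may include double-scanned
         * tuples, so dead-reckon from the metapage value instead.
         */
        if (current->ntuples > tuples_removed)
            current->ntuples -= tuples_removed;
        else
            current->ntuples = 0;
    }
    return current->ntuples;
}

int
main(void)
{
    meta_counts orig = {8, 1000};
    meta_counts now = {16, 1250};       /* a split and some inserts intervened */

    printf("new ntuples = %.0f\n",
           reconcile_ntuples(orig, &now, 940, 60));
    return 0;
}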
/* * VACUUM cleanup: update FSM */ IndexBulkDeleteResult * gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) { Relation rel = info->index; BlockNumber npages, blkno; BlockNumber totFreePages; bool needLock; /* No-op in ANALYZE ONLY mode */ if (info->analyze_only) return stats; /* Set up all-zero stats if gistbulkdelete wasn't called */ if (stats == NULL) { stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* use heap's tuple count */ stats->num_index_tuples = info->num_heap_tuples; stats->estimated_count = info->estimated_count; /* * XXX the above is wrong if index is partial. Would it be OK to just * return NULL, or is there work we must do below? */ } /* * Need lock unless it's local to this backend. */ needLock = !RELATION_IS_LOCAL(rel); /* try to find deleted pages */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); npages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); totFreePages = 0; for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); page = (Page) BufferGetPage(buffer); if (PageIsNew(page) || GistPageIsDeleted(page)) { totFreePages++; RecordFreeIndexPage(rel, blkno); } UnlockReleaseBuffer(buffer); } /* Finally, vacuum the FSM */ IndexFreeSpaceMapVacuum(info->index); /* return statistics */ stats->pages_free = totFreePages; if (needLock) LockRelationForExtension(rel, ExclusiveLock); stats->num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); return stats; }
/* * Scan through posting tree, delete empty tuples from leaf pages. * Also, this function collects empty subtrees (those whose leaves are all * empty). For the parents of such subtrees a cleanup lock is taken, and then * we call ginScanToDelete. This is done for every inner page that points to * an empty subtree. */ static bool ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot) { Buffer buffer; Page page; bool hasVoidPage = FALSE; MemoryContext oldCxt; buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, RBM_NORMAL, gvs->strategy); page = BufferGetPage(buffer); ginTraverseLock(buffer, false); Assert(GinPageIsData(page)); if (GinPageIsLeaf(page)) { oldCxt = MemoryContextSwitchTo(gvs->tmpCxt); ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs); MemoryContextSwitchTo(oldCxt); MemoryContextReset(gvs->tmpCxt); /* if the root is a leaf page, no further processing is needed */ if (GinDataLeafPageIsEmpty(page)) hasVoidPage = TRUE; UnlockReleaseBuffer(buffer); return hasVoidPage; } else { OffsetNumber i; bool hasEmptyChild = FALSE; bool hasNonEmptyChild = FALSE; OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; BlockNumber *children = palloc(sizeof(BlockNumber) * (maxoff + 1)); /* * Read all the children's block numbers. It is not clear this is safe * if there are many concurrent vacuums. */ for (i = FirstOffsetNumber; i <= maxoff; i++) { PostingItem *pitem = GinDataPageGetPostingItem(page, i); children[i] = PostingItemGetBlockNumber(pitem); } UnlockReleaseBuffer(buffer); for (i = FirstOffsetNumber; i <= maxoff; i++) { if (ginVacuumPostingTreeLeaves(gvs, children[i], FALSE)) hasEmptyChild = TRUE; else hasNonEmptyChild = TRUE; } pfree(children); vacuum_delay_point(); /* * The whole subtree is empty - just return TRUE so the parent does the * cleanup, unless we are the root and there is no level above us to do * it. */ if (hasEmptyChild && !hasNonEmptyChild && !isRoot) return TRUE; if (hasEmptyChild) { DataPageDeleteStack root, *ptr, *tmp; buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, RBM_NORMAL, gvs->strategy); LockBufferForCleanup(buffer); memset(&root, 0, sizeof(DataPageDeleteStack)); root.leftBlkno = InvalidBlockNumber; root.isRoot = TRUE; ginScanToDelete(gvs, blkno, TRUE, &root, InvalidOffsetNumber); ptr = root.child; while (ptr) { tmp = ptr->child; pfree(ptr); ptr = tmp; } UnlockReleaseBuffer(buffer); } /* Here we have deleted all empty subtrees */ return FALSE; } }
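/*
 * Illustrative sketch only: the bottom-up "is this whole subtree empty?"
 * recursion used above, on a plain in-memory n-ary tree instead of GIN
 * posting-tree pages.  node, subtree_is_empty, and cleanup_here are names
 * invented for this example.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_CHILDREN 4

typedef struct node
{
    bool         leaf_empty;            /* only meaningful for leaves */
    int          nchildren;             /* 0 means this is a leaf */
    struct node *children[MAX_CHILDREN];
} node;

static void
cleanup_here(node *n)
{
    printf("cleanup under node %p\n", (void *) n);
}

/*
 * Returns true if every leaf below n is empty, in which case the caller is
 * expected to handle the deletion; a node that has both empty and non-empty
 * children (or is the root) does the cleanup itself.
 */
static bool
subtree_is_empty(node *n, bool is_root)
{
    if (n->nchildren == 0)
        return n->leaf_empty;

    bool has_empty_child = false;
    bool has_nonempty_child = false;

    for (int i = 0; i < n->nchildren; i++)
    {
        if (subtree_is_empty(n->children[i], false))
            has_empty_child = true;
        else
            has_nonempty_child = true;
    }

    if (has_empty_child && !has_nonempty_child && !is_root)
        return true;            /* let the parent clean up the whole subtree */

    if (has_empty_child)
        cleanup_here(n);        /* partial emptiness: handle it at this level */

    return false;
}

int
main(void)
{
    node e1 = {true, 0, {0}};
    node e2 = {true, 0, {0}};
    node f1 = {false, 0, {0}};
    node inner = {false, 2, {&e1, &e2}};
    node root = {false, 2, {&inner, &f1}};

    subtree_is_empty(&root, true);
    return 0;
}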
Datum ginbulkdelete(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(3); Relation index = info->index; BlockNumber blkno = GIN_ROOT_BLKNO; GinVacuumState gvs; Buffer buffer; BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))]; uint32 nRoot; gvs.index = index; gvs.callback = callback; gvs.callback_state = callback_state; gvs.strategy = info->strategy; initGinState(&gvs.ginstate, index); /* first time through? */ if (stats == NULL) { /* Yes, so initialize stats to zeroes */ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* and cleanup any pending inserts */ ginInsertCleanup(&gvs.ginstate, true, stats); } /* we'll re-count the tuples each time */ stats->num_index_tuples = 0; gvs.result = stats; buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); /* find leaf page */ for (;;) { Page page = BufferGetPage(buffer); IndexTuple itup; LockBuffer(buffer, GIN_SHARE); Assert(!GinPageIsData(page)); if (GinPageIsLeaf(page)) { LockBuffer(buffer, GIN_UNLOCK); LockBuffer(buffer, GIN_EXCLUSIVE); if (blkno == GIN_ROOT_BLKNO && !GinPageIsLeaf(page)) { LockBuffer(buffer, GIN_UNLOCK); continue; /* check it one more */ } break; } Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber)); blkno = GinGetDownlink(itup); Assert(blkno != InvalidBlockNumber); UnlockReleaseBuffer(buffer); buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); } /* right now we found leftmost page in entry's BTree */ for (;;) { Page page = BufferGetPage(buffer); Page resPage; uint32 i; Assert(!GinPageIsData(page)); resPage = ginVacuumEntryPage(&gvs, buffer, rootOfPostingTree, &nRoot); blkno = GinPageGetOpaque(page)->rightlink; if (resPage) { START_CRIT_SECTION(); PageRestoreTempPage(resPage, page); MarkBufferDirty(buffer); xlogVacuumPage(gvs.index, buffer); UnlockReleaseBuffer(buffer); END_CRIT_SECTION(); } else { UnlockReleaseBuffer(buffer); } vacuum_delay_point(); for (i = 0; i < nRoot; i++) { ginVacuumPostingTree(&gvs, rootOfPostingTree[i]); vacuum_delay_point(); } if (blkno == InvalidBlockNumber) /* rightmost page */ break; buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIN_EXCLUSIVE); } PG_RETURN_POINTER(gvs.result); }
static ArrayTuple gistVacuumUpdate(GistVacuum *gv, BlockNumber blkno, bool needunion) { ArrayTuple res = {NULL, 0, false}; Buffer buffer; Page page, tempPage = NULL; OffsetNumber i, maxoff; ItemId iid; int lenaddon = 4, curlenaddon = 0, nOffToDelete = 0, nBlkToDelete = 0; IndexTuple idxtuple, *addon = NULL; bool needwrite = false; OffsetNumber offToDelete[MaxOffsetNumber]; BlockNumber blkToDelete[MaxOffsetNumber]; ItemPointerData *completed = NULL; int ncompleted = 0, lencompleted = 16; vacuum_delay_point(); buffer = ReadBufferWithStrategy(gv->index, blkno, gv->strategy); LockBuffer(buffer, GIST_EXCLUSIVE); gistcheckpage(gv->index, buffer); page = (Page) BufferGetPage(buffer); maxoff = PageGetMaxOffsetNumber(page); if (GistPageIsLeaf(page)) { if (GistTuplesDeleted(page)) needunion = needwrite = true; } else { completed = (ItemPointerData *) palloc(sizeof(ItemPointerData) * lencompleted); addon = (IndexTuple *) palloc(sizeof(IndexTuple) * lenaddon); /* get copy of page to work */ tempPage = GistPageGetCopyPage(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { ArrayTuple chldtuple; bool needchildunion; iid = PageGetItemId(tempPage, i); idxtuple = (IndexTuple) PageGetItem(tempPage, iid); needchildunion = (GistTupleIsInvalid(idxtuple)) ? true : false; if (needchildunion) elog(DEBUG2, "gistVacuumUpdate: need union for block %u", ItemPointerGetBlockNumber(&(idxtuple->t_tid))); chldtuple = gistVacuumUpdate(gv, ItemPointerGetBlockNumber(&(idxtuple->t_tid)), needchildunion); if (chldtuple.ituplen || chldtuple.emptypage) { /* update tuple or/and inserts new */ if (chldtuple.emptypage) blkToDelete[nBlkToDelete++] = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); offToDelete[nOffToDelete++] = i; PageIndexTupleDelete(tempPage, i); i--; maxoff--; needwrite = needunion = true; if (chldtuple.ituplen) { Assert(chldtuple.emptypage == false); while (curlenaddon + chldtuple.ituplen >= lenaddon) { lenaddon *= 2; addon = (IndexTuple *) repalloc(addon, sizeof(IndexTuple) * lenaddon); } memcpy(addon + curlenaddon, chldtuple.itup, chldtuple.ituplen * sizeof(IndexTuple)); curlenaddon += chldtuple.ituplen; if (chldtuple.ituplen > 1) { /* * child was split, so we need mark completion * insert(split) */ int j; while (ncompleted + chldtuple.ituplen > lencompleted) { lencompleted *= 2; completed = (ItemPointerData *) repalloc(completed, sizeof(ItemPointerData) * lencompleted); } for (j = 0; j < chldtuple.ituplen; j++) { ItemPointerCopy(&(chldtuple.itup[j]->t_tid), completed + ncompleted); ncompleted++; } } pfree(chldtuple.itup); } } } Assert(maxoff == PageGetMaxOffsetNumber(tempPage)); if (curlenaddon) { /* insert updated tuples */ if (gistnospace(tempPage, addon, curlenaddon, InvalidOffsetNumber, 0)) { /* there is no space on page to insert tuples */ res = vacuumSplitPage(gv, tempPage, buffer, addon, curlenaddon); tempPage = NULL; /* vacuumSplitPage() free tempPage */ needwrite = needunion = false; /* gistSplit already forms * unions and writes pages */ } else /* enough free space */ gistfillbuffer(gv->index, tempPage, addon, curlenaddon, InvalidOffsetNumber); } } /* * If page is empty, we should remove pointer to it before deleting page * (except root) */ if (blkno != GIST_ROOT_BLKNO && (PageIsEmpty(page) || (tempPage && PageIsEmpty(tempPage)))) { /* * New version of page is empty, so leave it unchanged, upper call * will mark our page as deleted. In case of page split we never will * be here... 
* * If the page was empty, it can't have become non-empty during processing. */ res.emptypage = true; UnlockReleaseBuffer(buffer); } else { /* write the page and remove its children if necessary */ START_CRIT_SECTION(); if (tempPage && needwrite) { PageRestoreTempPage(tempPage, page); tempPage = NULL; } /* Empty index */ if (PageIsEmpty(page) && blkno == GIST_ROOT_BLKNO) { needwrite = true; GistPageSetLeaf(page); } if (needwrite) { MarkBufferDirty(buffer); GistClearTuplesDeleted(page); if (!gv->index->rd_istemp) { XLogRecData *rdata; XLogRecPtr recptr; char *xlinfo; rdata = formUpdateRdata(gv->index->rd_node, buffer, offToDelete, nOffToDelete, addon, curlenaddon, NULL); xlinfo = rdata->next->data; recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); pfree(xlinfo); pfree(rdata); } else PageSetLSN(page, XLogRecPtrForTemp); } END_CRIT_SECTION(); if (needunion && !PageIsEmpty(page)) { res.itup = (IndexTuple *) palloc(sizeof(IndexTuple)); res.ituplen = 1; res.itup[0] = PageMakeUnionKey(gv, buffer); } UnlockReleaseBuffer(buffer); /* delete empty children; we no longer hold any links to those subtrees */ for (i = 0; i < nBlkToDelete; i++) gistDeleteSubtree(gv, blkToDelete[i]); if (ncompleted && !gv->index->rd_istemp) gistxlogInsertCompletion(gv->index->rd_node, completed, ncompleted); } for (i = 0; i < curlenaddon; i++) pfree(addon[i]); if (addon) pfree(addon); if (completed) pfree(completed); if (tempPage) pfree(tempPage); return res; }
Datum gistvacuumcleanup(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1); Relation rel = info->index; BlockNumber npages, blkno; BlockNumber totFreePages, nFreePages, *freePages, maxFreePages; BlockNumber lastBlock = GIST_ROOT_BLKNO, lastFilledBlock = GIST_ROOT_BLKNO; bool needLock; /* Set up all-zero stats if gistbulkdelete wasn't called */ if (stats == NULL) { stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult)); /* use heap's tuple count */ Assert(info->num_heap_tuples >= 0); stats->std.num_index_tuples = info->num_heap_tuples; /* * XXX the above is wrong if index is partial. Would it be OK to just * return NULL, or is there work we must do below? */ } /* gistVacuumUpdate may cause hard work */ if (info->vacuum_full) { GistVacuum gv; ArrayTuple res; /* note: vacuum.c already acquired AccessExclusiveLock on index */ gv.index = rel; initGISTstate(&(gv.giststate), rel); gv.opCtx = createTempGistContext(); gv.result = stats; gv.strategy = info->strategy; /* walk through the entire index for update tuples */ res = gistVacuumUpdate(&gv, GIST_ROOT_BLKNO, false); /* cleanup */ if (res.itup) { int i; for (i = 0; i < res.ituplen; i++) pfree(res.itup[i]); pfree(res.itup); } freeGISTstate(&(gv.giststate)); MemoryContextDelete(gv.opCtx); } else if (stats->needFullVacuum) ereport(NOTICE, (errmsg("index \"%s\" needs VACUUM FULL or REINDEX to finish crash recovery", RelationGetRelationName(rel)))); /* * If vacuum full, we already have exclusive lock on the index. Otherwise, * need lock unless it's local to this backend. */ if (info->vacuum_full) needLock = false; else needLock = !RELATION_IS_LOCAL(rel); /* try to find deleted pages */ if (needLock) LockRelationForExtension(rel, ExclusiveLock); npages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); maxFreePages = npages; if (maxFreePages > MaxFSMPages) maxFreePages = MaxFSMPages; totFreePages = nFreePages = 0; freePages = (BlockNumber *) palloc(sizeof(BlockNumber) * maxFreePages); for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++) { Buffer buffer; Page page; vacuum_delay_point(); buffer = ReadBufferWithStrategy(rel, blkno, info->strategy); LockBuffer(buffer, GIST_SHARE); page = (Page) BufferGetPage(buffer); if (PageIsNew(page) || GistPageIsDeleted(page)) { if (nFreePages < maxFreePages) freePages[nFreePages++] = blkno; totFreePages++; } else lastFilledBlock = blkno; UnlockReleaseBuffer(buffer); } lastBlock = npages - 1; if (info->vacuum_full && nFreePages > 0) { /* try to truncate index */ int i; for (i = 0; i < nFreePages; i++) if (freePages[i] >= lastFilledBlock) { totFreePages = nFreePages = i; break; } if (lastBlock > lastFilledBlock) RelationTruncate(rel, lastFilledBlock + 1); stats->std.pages_removed = lastBlock - lastFilledBlock; } RecordIndexFreeSpace(&rel->rd_node, totFreePages, nFreePages, freePages); pfree(freePages); /* return statistics */ stats->std.pages_free = totFreePages; if (needLock) LockRelationForExtension(rel, ExclusiveLock); stats->std.num_pages = RelationGetNumberOfBlocks(rel); if (needLock) UnlockRelationForExtension(rel, ExclusiveLock); PG_RETURN_POINTER(stats); }
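/*
 * Illustrative sketch only: the tail-truncation arithmetic used by the
 * vacuum-full paths above -- find the last non-free page, forget any
 * free-list entries at or beyond it, and shrink the relation down to just
 * past that page.  ROOT_BLKNO, page_is_free, shrink_index, and truncate_to
 * are stand-ins invented for this example.
 */
#include <stdbool.h>
#include <stdio.h>

#define ROOT_BLKNO 0

static void
truncate_to(unsigned nblocks)
{
    printf("truncate to %u blocks\n", nblocks);
}

static unsigned
shrink_index(const bool *page_is_free, unsigned npages,
             unsigned *free_list, unsigned *nfree)
{
    unsigned last_filled = ROOT_BLKNO;
    unsigned last_block = npages - 1;

    /* classify pages exactly like the main cleanup loop does */
    for (unsigned blkno = ROOT_BLKNO + 1; blkno < npages; blkno++)
        if (!page_is_free[blkno])
            last_filled = blkno;

    /* free-list entries beyond the new end of the index are meaningless */
    for (unsigned i = 0; i < *nfree; i++)
        if (free_list[i] >= last_filled)
        {
            *nfree = i;
            break;
        }

    if (last_block > last_filled)
        truncate_to(last_filled + 1);

    return last_block - last_filled;    /* pages removed */
}

int
main(void)
{
    bool     freep[8] = {false, false, true, false, true, true, true, true};
    unsigned free_list[8] = {2, 4, 5, 6, 7};
    unsigned nfree = 5;

    unsigned removed = shrink_index(freep, 8, free_list, &nfree);

    printf("removed %u trailing pages, %u free pages kept\n", removed, nfree);
    return 0;
}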
/* * Bulk deletion of all index entries pointing to a set of heap tuples and * check invalid tuples after crash recovery. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ Datum gistbulkdelete(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(3); Relation rel = info->index; GistBDItem *stack, *ptr; /* first time through? */ if (stats == NULL) stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult)); /* we'll re-count the tuples each time */ stats->std.num_index_tuples = 0; stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; while (stack) { Buffer buffer = ReadBufferWithStrategy(rel, stack->blkno, info->strategy); Page page; OffsetNumber i, maxoff; IndexTuple idxtuple; ItemId iid; LockBuffer(buffer, GIST_SHARE); gistcheckpage(rel, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { OffsetNumber todelete[MaxOffsetNumber]; int ntodelete = 0; LockBuffer(buffer, GIST_UNLOCK); LockBuffer(buffer, GIST_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page)) { /* only the root can become non-leaf during relock */ UnlockReleaseBuffer(buffer); /* one more check */ continue; } /* * check for split proceeded after look at parent, we should check * it after relock */ pushStackIfSplited(page, stack); /* * Remove deletable tuples from page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); if (callback(&(idxtuple->t_tid), callback_state)) { todelete[ntodelete] = i - ntodelete; ntodelete++; stats->std.tuples_removed += 1; } else stats->std.num_index_tuples += 1; } if (ntodelete) { START_CRIT_SECTION(); MarkBufferDirty(buffer); for (i = 0; i < ntodelete; i++) PageIndexTupleDelete(page, todelete[i]); GistMarkTuplesDeleted(page); if (!rel->rd_istemp) { XLogRecData *rdata; XLogRecPtr recptr; gistxlogPageUpdate *xlinfo; rdata = formUpdateRdata(rel->rd_node, buffer, todelete, ntodelete, NULL, 0, NULL); xlinfo = (gistxlogPageUpdate *) rdata->next->data; recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); pfree(xlinfo); pfree(rdata); } else PageSetLSN(page, XLogRecPtrForTemp); END_CRIT_SECTION(); } } else { /* check for split proceeded after look at parent */ pushStackIfSplited(page, stack); maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); ptr = (GistBDItem *) palloc(sizeof(GistBDItem)); ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); ptr->parentlsn = PageGetLSN(page); ptr->next = stack->next; stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) stats->needFullVacuum = true; } } UnlockReleaseBuffer(buffer); ptr = stack->next; pfree(stack); stack = ptr; vacuum_delay_point(); } PG_RETURN_POINTER(stats); }
/* * btvacuumpage --- VACUUM one page * * This processes a single page for btvacuumscan(). In some cases we * must go back and re-examine previously-scanned pages; this routine * recurses when necessary to handle that case. * * blkno is the page to process. orig_blkno is the highest block number * reached by the outer btvacuumscan loop (the same as blkno, unless we * are recursing to re-examine a previous page). */ static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) { MIRROREDLOCK_BUFMGR_DECLARE; IndexVacuumInfo *info = vstate->info; IndexBulkDeleteResult *stats = vstate->stats; IndexBulkDeleteCallback callback = vstate->callback; void *callback_state = vstate->callback_state; Relation rel = info->index; bool delete_now; BlockNumber recurse_to; Buffer buf; Page page; BTPageOpaque opaque; restart: delete_now = false; recurse_to = P_NONE; /* call vacuum_delay_point while not holding any buffer lock */ vacuum_delay_point(); /* * We can't use _bt_getbuf() here because it always applies * _bt_checkpage(), which will barf on an all-zero page. We want to * recycle all-zero pages, not fail. Also, we want to use a nondefault * buffer access strategy. */ // -------- MirroredLock ---------- MIRROREDLOCK_BUFMGR_LOCK; buf = ReadBufferWithStrategy(rel, blkno, info->strategy); LockBuffer(buf, BT_READ); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!PageIsNew(page)) _bt_checkpage(rel, buf); /* * If we are recursing, the only case we want to do anything with is a * live leaf page having the current vacuum cycle ID. Any other state * implies we already saw the page (eg, deleted it as being empty). In * particular, we don't want to risk adding it to freePages twice. */ if (blkno != orig_blkno) { if (_bt_page_recyclable(page) || P_IGNORE(opaque) || !P_ISLEAF(opaque) || opaque->btpo_cycleid != vstate->cycleid) { _bt_relbuf(rel, buf); MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- return; } } /* Page is valid, see what to do with it */ if (_bt_page_recyclable(page)) { /* Okay to recycle this page */ if (vstate->nFreePages < vstate->maxFreePages) vstate->freePages[vstate->nFreePages++] = blkno; vstate->totFreePages++; stats->pages_deleted++; } else if (P_ISDELETED(opaque)) { /* Already deleted, but can't recycle yet */ stats->pages_deleted++; } else if (P_ISHALFDEAD(opaque)) { /* Half-dead, try to delete */ delete_now = true; } else if (P_ISLEAF(opaque)) { OffsetNumber deletable[MaxOffsetNumber]; int ndeletable; OffsetNumber offnum, minoff, maxoff; /* * Trade in the initial read lock for a super-exclusive write lock on * this page. We must get such a lock on every leaf page over the * course of the vacuum scan, whether or not it actually contains any * deletable tuples --- see nbtree/README. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); /* * Check whether we need to recurse back to earlier pages. What we * are concerned about is a page split that happened since we started * the vacuum scan. If the split moved some tuples to a lower page * then we might have missed 'em. If so, set up for tail recursion. * (Must do this before possibly clearing btpo_cycleid below!) */ if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid && !(opaque->btpo_flags & BTP_SPLIT_END) && !P_RIGHTMOST(opaque) && opaque->btpo_next < orig_blkno) recurse_to = opaque->btpo_next; /* * Scan over all items to see which ones need deleted according to the * callback function. 
*/ ndeletable = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) { for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { IndexTuple itup; ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); htup = &(itup->t_tid); if (callback(htup, callback_state)) deletable[ndeletable++] = offnum; } } /* * Apply any needed deletes. We issue just one _bt_delitems() call * per page, so as to minimize WAL traffic. */ if (ndeletable > 0) { _bt_delitems(rel, buf, deletable, ndeletable, true); stats->tuples_removed += ndeletable; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); } else { /* * If the page has been split during this vacuum cycle, it seems * worth expending a write to clear btpo_cycleid even if we don't * have any deletions to do. (If we do, _bt_delitems takes care * of this.) This ensures we won't process the page again. * * We treat this like a hint-bit update because there's no need to * WAL-log it. */ if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid) { opaque->btpo_cycleid = 0; SetBufferCommitInfoNeedsSave(buf); } } /* * If it's now empty, try to delete; else count the live tuples. We * don't delete when recursing, though, to avoid putting entries into * freePages out-of-order (doesn't seem worth any extra code to handle * the case). */ if (minoff > maxoff) delete_now = (blkno == orig_blkno); else stats->num_index_tuples += maxoff - minoff + 1; } if (delete_now) { MemoryContext oldcontext; int ndel; /* Run pagedel in a temp context to avoid memory leakage */ MemoryContextReset(vstate->pagedelcontext); oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); ndel = _bt_pagedel(rel, buf, NULL, info->vacuum_full); /* count only this page, else may double-count parent */ if (ndel) stats->pages_deleted++; /* * During VACUUM FULL it's okay to recycle deleted pages immediately, * since there can be no other transactions scanning the index. Note * that we will only recycle the current page and not any parent pages * that _bt_pagedel might have recursed to; this seems reasonable in * the name of simplicity. (Trying to do otherwise would mean we'd * have to sort the list of recyclable pages we're building.) */ if (ndel && info->vacuum_full) { if (vstate->nFreePages < vstate->maxFreePages) vstate->freePages[vstate->nFreePages++] = blkno; vstate->totFreePages++; } MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ } else _bt_relbuf(rel, buf); MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- /* * This is really tail recursion, but if the compiler is too stupid to * optimize it as such, we'd eat an uncomfortably large amount of stack * space per recursion level (due to the deletable[] array). A failure is * improbable since the number of levels isn't likely to be large ... but * just in case, let's hand-optimize into a loop. */ if (recurse_to != P_NONE) { blkno = recurse_to; goto restart; } }
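The closing goto is worth a second look: btvacuumpage keeps the large deletable[] array in its stack frame, so letting the "revisit an earlier page" case recurse for real could burn a page-sized chunk of stack per level. The fragment below is a minimal, self-contained illustration of the same hand-optimized tail call; the helper and constants are invented for the sketch, not PostgreSQL APIs.

#include <stdio.h>

#define BIG 1024            /* stands in for the MaxOffsetNumber-sized array */
#define NO_MORE_WORK (-1)

/* Hypothetical helper: processes one unit of work and returns the next unit
 * it spawned, or NO_MORE_WORK when nothing further is needed. */
static int process_one(int item, char *scratch)
{
    scratch[0] = (char) item;       /* pretend to use the big buffer */
    return (item > 0) ? item - 1 : NO_MORE_WORK;
}

/* Tail recursion written as a loop: the large scratch buffer lives in a
 * single stack frame no matter how many times we "recurse". */
static void process(int item)
{
    char scratch[BIG];
    int  next;

restart:
    next = process_one(item, scratch);
    if (next != NO_MORE_WORK)
    {
        item = next;
        goto restart;           /* would otherwise be: process(next); return; */
    }
}

int main(void)
{
    process(10000);
    printf("done\n");
    return 0;
}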
/* * Process one page during a bulkdelete scan */ static void spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno) { Relation index = bds->info->index; Buffer buffer; Page page; /* call vacuum_delay_point while not holding any buffer lock */ vacuum_delay_point(); buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, bds->info->strategy); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (PageIsNew(page)) { /* * We found an all-zero page, which could happen if the database * crashed just after extending the file. Initialize and recycle it. */ SpGistInitBuffer(buffer, 0); SpGistPageSetDeleted(page); /* We don't bother to WAL-log this action; easy to redo */ MarkBufferDirty(buffer); } else if (SpGistPageIsDeleted(page)) { /* nothing to do */ } else if (SpGistPageIsLeaf(page)) { if (SpGistBlockIsRoot(blkno)) { vacuumLeafRoot(bds, index, buffer); /* no need for vacuumRedirectAndPlaceholder */ } else { vacuumLeafPage(bds, index, buffer, false); vacuumRedirectAndPlaceholder(index, buffer); } } else { /* inner page */ vacuumRedirectAndPlaceholder(index, buffer); } /* * The root pages must never be deleted, nor marked as available in FSM, * because we don't want them ever returned by a search for a place to put * a new tuple. Otherwise, check for empty/deletable page, and make sure * FSM knows about it. */ if (!SpGistBlockIsRoot(blkno)) { /* If page is now empty, mark it deleted */ if (PageIsEmpty(page) && !SpGistPageIsDeleted(page)) { SpGistPageSetDeleted(page); /* We don't bother to WAL-log this action; easy to redo */ MarkBufferDirty(buffer); } if (SpGistPageIsDeleted(page)) { RecordFreeIndexPage(index, blkno); bds->stats->pages_deleted++; } else bds->lastFilledBlock = blkno; } SpGistSetLastUsedPage(index, buffer); UnlockReleaseBuffer(buffer); }
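Stripped of buffer management, spgvacuumpage is a dispatch on the page's state followed by a single "may this page go to the FSM?" decision that always excludes root pages. The following standalone model condenses that decision tree; the enum, struct, and helper names are invented for illustration and are not the SP-GiST API.

#include <stdbool.h>
#include <stdio.h>

typedef enum { PAGE_NEW, PAGE_DELETED, PAGE_LEAF, PAGE_INNER } PageKind;

typedef struct
{
    PageKind kind;
    bool     is_root;
    bool     empty;      /* true once every tuple on the page is gone */
} MockPage;

/* Model of the per-page disposition: returns true if the page should be
 * reported to the free space map as reusable. */
static bool vacuum_one_page(MockPage *page)
{
    switch (page->kind)
    {
        case PAGE_NEW:
            /* all-zero page left by a crash during relation extension:
             * initialize it and immediately treat it as deleted/reusable */
            page->kind = PAGE_DELETED;
            break;
        case PAGE_DELETED:
            /* nothing to do; it may already be known to the FSM */
            break;
        case PAGE_LEAF:
        case PAGE_INNER:
            /* the real work (leaf vacuuming, redirect/placeholder cleanup)
             * happens here; the page may become empty as a result */
            if (page->empty && !page->is_root)
                page->kind = PAGE_DELETED;
            break;
    }

    /* Root pages are never handed to the FSM: a later insertion must not
     * be told to reuse them as ordinary data pages. */
    return (!page->is_root && page->kind == PAGE_DELETED);
}

int main(void)
{
    MockPage p = {PAGE_LEAF, false, true};
    printf("reusable: %d\n", vacuum_one_page(&p));   /* prints: reusable: 1 */
    return 0;
}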
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, bool scan_all) { BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; BlockNumber next_not_all_visible_block; bool skipping_all_visible_blocks; pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->scanned_pages = 0; vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); /* * We want to skip pages that don't require vacuuming according to the * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD * consecutive pages. Since we're reading sequentially, the OS should be * doing readahead for us, so there's no gain in skipping a page now and * then; that's likely to disable readahead and so be counterproductive. * Also, skipping even a single page means that we can't update * relfrozenxid, so we only want to do it if we can skip a goodly number * of pages. * * Before entering the main loop, establish the invariant that * next_not_all_visible_block is the next block number >= blkno that's not * all-visible according to the visibility map, or nblocks if there's no * such block. Also, we set up the skipping_all_visible_blocks flag, * which is needed because we need hysteresis in the decision: once we've * started skipping blocks, we may as well skip everything up to the next * not-all-visible block. * * Note: if scan_all is true, we won't actually skip any pages; but we * maintain next_not_all_visible_block anyway, so as to set up the * all_visible_according_to_vm flag correctly for each page. 
*/ for (next_not_all_visible_block = 0; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; bool all_visible_according_to_vm; bool all_visible; bool has_dead_tuples; if (blkno == next_not_all_visible_block) { /* Time to advance next_not_all_visible_block */ for (next_not_all_visible_block++; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } /* * We know we can't skip the current block. But set up * skipping_all_visible_blocks to do the right thing at the * following blocks. */ if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; all_visible_according_to_vm = false; } else { /* Current block is all-visible */ if (skipping_all_visible_blocks && !scan_all) continue; all_visible_according_to_vm = true; } vacuum_delay_point(); vacrelstats->scanned_pages++; /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ LockBufferForCleanup(buf); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. 
*/ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; } freespace = PageGetHeapFreeSpace(page); MarkBufferDirty(buf); UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } if (PageIsEmpty(page)) { empty_pages++; freespace = PageGetHeapFreeSpace(page); if (!PageIsAllVisible(page)) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, &vacrelstats->latestRemovedXid); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ all_visible = true; has_dead_tuples = false; nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); all_visible = false; continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. 
*/ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ all_visible = false; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); /* * Is the tuple definitely visible to all transactions? * * NB: Like with per-tuple hint bits, we can't set the * PD_ALL_VISIBLE flag if the inserter committed * asynchronously. See SetHintBits for more info. Check * that the HEAP_XMIN_COMMITTED hint bit is set because of * that. */ if (all_visible) { TransactionId xmin; if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) { all_visible = false; break; } /* * The inserter definitely committed. But is it old * enough that everyone sees it as committed? */ xmin = HeapTupleHeaderGetXmin(tuple.t_data); if (!TransactionIdPrecedes(xmin, OldestXmin)) { all_visible = false; break; } } break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. */ nkeep += 1; all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, &vacrelstats->latestRemovedXid); tups_vacuumed += 1; has_dead_tuples = true; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, FreezeLimit, InvalidBuffer)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); if (RelationNeedsWAL(onerel)) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } freespace = PageGetHeapFreeSpace(page); /* Update the all-visible flag on the page */ if (!PageIsAllVisible(page) && all_visible) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } /* * It's possible for the value returned by GetOldestXmin() to move * backwards, so it's not wrong for us to see tuples that appear to * not be visible to everyone yet, while PD_ALL_VISIBLE is already * set. The real safe xmin value never moves backwards, but * GetOldestXmin() is conservative and sometimes returns a value * that's unnecessarily small, so if we see that contradiction it just * means that the tuples that we think are not visible to everyone yet * actually are, and the PD_ALL_VISIBLE flag is correct. 
* * There should never be dead tuples on a page with PD_ALL_VISIBLE * set, however. */ else if (PageIsAllVisible(page) && has_dead_tuples) { elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", relname, blkno); PageClearAllVisible(page); SetBufferCommitInfoNeedsSave(buf); /* * Normally, we would drop the lock on the heap page before * updating the visibility map, but since this case shouldn't * happen anyway, don't worry about that. */ visibilitymap_clear(onerel, blkno); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm && all_visible) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) RecordPageWithFreeSpace(onerel, blkno, freespace); } /* save stats for use later */ vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* now we can compute the new value for pg_class.reltuples */ vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, nblocks, vacrelstats->scanned_pages, num_tuples); /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Release the pin on the visibility map page */ if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, vacrelstats->scanned_pages, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, empty_pages, pg_rusage_show(&ru0)))); }
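The visibility-map skipping at the top of the block loop is stateful: pages are only skipped once a run of at least SKIP_PAGES_THRESHOLD all-visible pages lies ahead, and next_not_all_visible_block always points at the next page that genuinely needs a visit. The standalone model below reproduces just that decision rule so the hysteresis can be traced in isolation; a boolean array stands in for visibilitymap_test(), the threshold value is illustrative, and the scan_all case is omitted.

#include <stdbool.h>
#include <stdio.h>

#define SKIP_PAGES_THRESHOLD 4          /* illustrative; not the real value */
#define NBLOCKS 16

/* Stand-in for visibilitymap_test(): true if the block is all-visible. */
static bool all_visible[NBLOCKS];

int main(void)
{
    int  blkno;
    int  next_not_all_visible_block;
    bool skipping_all_visible_blocks;

    /* mark blocks 2..9 and 12..13 as all-visible */
    for (blkno = 2; blkno <= 9; blkno++)
        all_visible[blkno] = true;
    for (blkno = 12; blkno <= 13; blkno++)
        all_visible[blkno] = true;

    /* establish the invariant before the main loop */
    for (next_not_all_visible_block = 0;
         next_not_all_visible_block < NBLOCKS;
         next_not_all_visible_block++)
        if (!all_visible[next_not_all_visible_block])
            break;
    skipping_all_visible_blocks = (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD);

    for (blkno = 0; blkno < NBLOCKS; blkno++)
    {
        if (blkno == next_not_all_visible_block)
        {
            /* advance to the next block that is not all-visible */
            for (next_not_all_visible_block++;
                 next_not_all_visible_block < NBLOCKS;
                 next_not_all_visible_block++)
                if (!all_visible[next_not_all_visible_block])
                    break;
            /* only start skipping if the upcoming all-visible run is long enough */
            skipping_all_visible_blocks =
                (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD);
        }
        else if (skipping_all_visible_blocks)
        {
            printf("block %2d: skipped\n", blkno);
            continue;
        }
        printf("block %2d: scanned\n", blkno);
    }
    return 0;
}

With this input, blocks 2-9 form an all-visible run longer than the threshold and are skipped, while the two-block run at 12-13 is scanned anyway; that asymmetry is the hysteresis the comment describes.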
/*
 * VACUUM cleanup: update FSM
 */
Datum
gistvacuumcleanup(PG_FUNCTION_ARGS)
{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    GistBulkDeleteResult *stats = (GistBulkDeleteResult *) PG_GETARG_POINTER(1);
    Relation    rel = info->index;
    BlockNumber npages,
                blkno;
    BlockNumber totFreePages;
    BlockNumber lastBlock = GIST_ROOT_BLKNO,
                lastFilledBlock = GIST_ROOT_BLKNO;
    bool        needLock;

    /* No-op in ANALYZE ONLY mode */
    if (info->analyze_only)
        PG_RETURN_POINTER(stats);

    /* Set up all-zero stats if gistbulkdelete wasn't called */
    if (stats == NULL)
    {
        stats = (GistBulkDeleteResult *) palloc0(sizeof(GistBulkDeleteResult));
        /* use heap's tuple count */
        stats->std.num_index_tuples = info->num_heap_tuples;
        stats->std.estimated_count = info->estimated_count;

        /*
         * XXX the above is wrong if index is partial. Would it be OK to just
         * return NULL, or is there work we must do below?
         */
    }

    /* needFullVacuum is the flag gistbulkdelete sets when it finds invalid tuples */
    if (stats->needFullVacuum)
        ereport(NOTICE,
                (errmsg("index \"%s\" needs VACUUM FULL or REINDEX to finish crash recovery",
                        RelationGetRelationName(rel))));

    /*
     * Need lock unless it's local to this backend.
     */
    needLock = !RELATION_IS_LOCAL(rel);

    /* try to find deleted pages */
    if (needLock)
        LockRelationForExtension(rel, ExclusiveLock);
    npages = RelationGetNumberOfBlocks(rel);
    if (needLock)
        UnlockRelationForExtension(rel, ExclusiveLock);

    totFreePages = 0;
    for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++)
    {
        Buffer      buffer;
        Page        page;

        vacuum_delay_point();

        buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                                    info->strategy);
        LockBuffer(buffer, GIST_SHARE);
        page = (Page) BufferGetPage(buffer);

        if (PageIsNew(page) || GistPageIsDeleted(page))
        {
            totFreePages++;
            RecordFreeIndexPage(rel, blkno);
        }
        else
            lastFilledBlock = blkno;
        UnlockReleaseBuffer(buffer);
    }
    lastBlock = npages - 1;

    /* Finally, vacuum the FSM */
    IndexFreeSpaceMapVacuum(info->index);

    /* return statistics */
    stats->std.pages_free = totFreePages;
    if (needLock)
        LockRelationForExtension(rel, ExclusiveLock);
    stats->std.num_pages = RelationGetNumberOfBlocks(rel);
    if (needLock)
        UnlockRelationForExtension(rel, ExclusiveLock);

    PG_RETURN_POINTER(stats);
}
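The page scan in gistvacuumcleanup is pure accounting: every page after the root that is new or deleted is reported to the FSM and counted toward pages_free, while lastFilledBlock remembers the highest block still in use, which is the information a truncating variant of this cleanup would need. A condensed standalone model of that accounting follows; the page states and names are invented for the sketch.

#include <stdio.h>

#define ROOT_BLKNO 0
#define NPAGES 10

typedef enum { PAGE_IN_USE, PAGE_NEW, PAGE_DELETED } PageState;

int main(void)
{
    /* mock relation: pages 4, 7, 8 and 9 are recyclable */
    PageState pages[NPAGES] = {
        PAGE_IN_USE, PAGE_IN_USE, PAGE_IN_USE, PAGE_IN_USE, PAGE_DELETED,
        PAGE_IN_USE, PAGE_IN_USE, PAGE_NEW, PAGE_DELETED, PAGE_DELETED
    };
    int totFreePages = 0;
    int lastFilledBlock = ROOT_BLKNO;
    int blkno;

    /* scan everything after the root, exactly as the cleanup loop does */
    for (blkno = ROOT_BLKNO + 1; blkno < NPAGES; blkno++)
    {
        if (pages[blkno] == PAGE_NEW || pages[blkno] == PAGE_DELETED)
            totFreePages++;             /* would also be recorded in the FSM */
        else
            lastFilledBlock = blkno;
    }

    printf("num_pages  = %d\n", NPAGES);            /* 10 */
    printf("pages_free = %d\n", totFreePages);      /* 4  */
    /* trailing free pages after the last filled block could, in principle,
     * be given back to the OS by truncating the relation */
    printf("truncatable tail = %d\n", NPAGES - (lastFilledBlock + 1));  /* 3 */
    return 0;
}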