/*
 * IsSystemRelation
 *		True iff the relation is a system catalog relation.
 *
 * NB: TOAST relations are considered system relations by this test
 * for compatibility with the old IsSystemRelationName function.
 * This is appropriate in many places but not all. Where it's not,
 * also check IsToastRelation.
 *
 * We now just test if the relation is in the system catalog namespace;
 * so it's no longer necessary to forbid user relations from having
 * names starting with pg_.
 */
bool
IsSystemRelation(Relation relation)
{
	return IsSystemNamespace(RelationGetNamespace(relation)) ||
		IsToastNamespace(RelationGetNamespace(relation)) ||
		IsAoSegmentNamespace(RelationGetNamespace(relation));
}
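/*
 * Hedged usage sketch (not part of the original file): the header comment
 * above warns that TOAST relations also pass IsSystemRelation(). Where that
 * is not wanted, a caller can combine the tests as below; the helper name is
 * hypothetical.
 */
static bool
IsSystemCatalogProper(Relation relation)
{
	/* a system catalog proper, excluding TOAST support relations */
	return IsSystemRelation(relation) && !IsToastRelation(relation);
}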
/*
 * DropErrorTable
 *
 * Drop the error table from the database. This function is called from
 * destroyCdbSreh when an auto-generated error table was not used in the
 * COPY operation, provided KEEP was not specified.
 */
static void
DropErrorTable(CdbSreh *cdbsreh)
{
	StringInfoData dropstmt;
	RangeVar   *errtbl_rv;

	Insist(Gp_role == GP_ROLE_DISPATCH);

	ereport(NOTICE,
			(errcode(ERRCODE_SUCCESSFUL_COMPLETION),
			 errmsg("Dropping the auto-generated unused error table"),
			 errhint("Use KEEP in LOG INTO clause to force keeping the error table alive")));

	initStringInfo(&dropstmt);

	appendStringInfo(&dropstmt, "DROP TABLE %s.%s",
					 quote_identifier(get_namespace_name(RelationGetNamespace(cdbsreh->errtbl))),
					 quote_identifier(RelationGetRelationName(cdbsreh->errtbl)));

	errtbl_rv = makeRangeVar(get_namespace_name(RelationGetNamespace(cdbsreh->errtbl)),
							 RelationGetRelationName(cdbsreh->errtbl), -1);

	/* DROP the relation on the QD */
	RemoveRelation(errtbl_rv, DROP_RESTRICT, NULL, RELKIND_RELATION);

	/* dispatch the DROP to the QEs */
	CdbDoCommand(dropstmt.data, false, /* no txn */ false);

	pfree(dropstmt.data);
}
/*
 * AppendOnlySegmentFileTruncateToEOF()
 *
 * Assumes that the segment file lock is already held.
 *
 * Truncates the given segment file to its logical EOF.
 */
static void
AppendOnlySegmentFileTruncateToEOF(Relation aorel, FileSegInfo *fsinfo)
{
	const char *relname = RelationGetRelationName(aorel);
	MirroredAppendOnlyOpen mirroredOpened;
	int32		fileSegNo;
	char		filenamepath[MAXPGPATH];
	int			segno;
	int64		segeof;

	Assert(fsinfo);
	Assert(RelationIsAoRows(aorel));

	segno = fsinfo->segno;
	segeof = (int64) fsinfo->eof;

	/* Open and truncate the relation segfile beyond its eof */
	MakeAOSegmentFileName(aorel, segno, -1, &fileSegNo, filenamepath);

	elogif(Debug_appendonly_print_compaction, LOG,
		   "Opening AO relation \"%s.%s\", relation id %u, relfilenode %u (physical segment file #%d, logical EOF " INT64_FORMAT ")",
		   get_namespace_name(RelationGetNamespace(aorel)),
		   relname, aorel->rd_id, aorel->rd_node.relNode,
		   segno, segeof);

	if (OpenAOSegmentFile(aorel, filenamepath, fileSegNo, segeof, &mirroredOpened))
	{
		TruncateAOSegmentFile(&mirroredOpened, aorel, segeof, ERROR);
		CloseAOSegmentFile(&mirroredOpened);

		elogif(Debug_appendonly_print_compaction, LOG,
			   "Successfully truncated AO ROW relation \"%s.%s\", relation id %u, relfilenode %u (physical segment file #%d, logical EOF " INT64_FORMAT ")",
			   get_namespace_name(RelationGetNamespace(aorel)),
			   relname, aorel->rd_id, aorel->rd_node.relNode,
			   segno, segeof);
	}
	else
	{
		elogif(Debug_appendonly_print_compaction, LOG,
			   "No gp_relation_node entry for AO ROW relation \"%s.%s\", relation id %u, relfilenode %u (physical segment file #%d, logical EOF " INT64_FORMAT ")",
			   get_namespace_name(RelationGetNamespace(aorel)),
			   relname, aorel->rd_id, aorel->rd_node.relNode,
			   segno, segeof);
	}
}
/*
 * Get the max size of the relation across segments
 */
int64
cdbRelMaxSegSize(Relation rel)
{
	int64		size = 0;
	int			i;
	CdbPgResults cdb_pgresults = {NULL, 0};
	StringInfoData buffer;
	char	   *schemaName;
	char	   *relName;

	/*
	 * Let's ask the QEs for the size of the relation.
	 */
	initStringInfo(&buffer);

	schemaName = get_namespace_name(RelationGetNamespace(rel));
	if (schemaName == NULL)
		elog(ERROR, "cache lookup failed for namespace %u",
			 RelationGetNamespace(rel));
	relName = RelationGetRelationName(rel);

	/*
	 * Safer to pass names than OIDs, in case they get out of sync between QD
	 * and QE, as might happen with a toast table or index.
	 */
	appendStringInfo(&buffer, "select pg_relation_size('%s.%s')",
					 quote_identifier(schemaName),
					 quote_identifier(relName));

	CdbDispatchCommand(buffer.data, DF_WITH_SNAPSHOT, &cdb_pgresults);

	for (i = 0; i < cdb_pgresults.numResults; i++)
	{
		struct pg_result *pgresult = cdb_pgresults.pg_results[i];

		if (PQresultStatus(pgresult) != PGRES_TUPLES_OK)
		{
			cdbdisp_clearCdbPgResults(&cdb_pgresults);
			elog(ERROR, "cdbRelMaxSegSize: resultStatus not tuples_Ok: %s %s",
				 PQresStatus(PQresultStatus(pgresult)),
				 PQresultErrorMessage(pgresult));
		}
		else
		{
			int64		tempsize = 0;

			Assert(PQntuples(pgresult) == 1);
			(void) scanint8(PQgetvalue(pgresult, 0, 0), false, &tempsize);
			if (tempsize > size)
				size = tempsize;
		}
	}

	pfree(buffer.data);
	cdbdisp_clearCdbPgResults(&cdb_pgresults);

	return size;
}
/*
 * OpenErrorTable
 *
 * Open the error table for this operation, and perform all necessary checks.
 */
void
OpenErrorTable(CdbSreh *cdbsreh, RangeVar *errortable)
{
	AclResult	aclresult;
	AclMode		required_access = ACL_INSERT;
	Oid			relOid;

	/* Open and lock the error relation, using the appropriate lock type. */
	cdbsreh->errtbl = heap_openrv(errortable, RowExclusiveLock);

	relOid = RelationGetRelid(cdbsreh->errtbl);

	/* Check relation permissions. */
	aclresult = pg_class_aclcheck(relOid, GetUserId(), required_access);
	if (aclresult != ACLCHECK_OK)
		aclcheck_error(aclresult, ACL_KIND_CLASS,
					   RelationGetRelationName(cdbsreh->errtbl));

	/* check read-only transaction */
	if (XactReadOnly &&
		!isTempNamespace(RelationGetNamespace(cdbsreh->errtbl)))
		ereport(ERROR,
				(errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION),
				 errmsg("transaction is read-only")));

	/* make sure this is a regular relation */
	if (cdbsreh->errtbl->rd_rel->relkind != RELKIND_RELATION)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("\"%s\" exists in the database but is not a table",
						RelationGetRelationName(cdbsreh->errtbl))));
}
/*
 * btinsert() -- insert an index tuple into a btree.
 *
 * Descend the tree recursively, find the appropriate location for our
 * new tuple, and put it there.
 */
Datum
btinsert(PG_FUNCTION_ARGS)
{
	MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE;

	Relation	rel = (Relation) PG_GETARG_POINTER(0);
	Datum	   *values = (Datum *) PG_GETARG_POINTER(1);
	bool	   *isnull = (bool *) PG_GETARG_POINTER(2);
	ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
	Relation	heapRel = (Relation) PG_GETARG_POINTER(4);
	bool		checkUnique = PG_GETARG_BOOL(5);
	IndexTuple	itup;

	MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER;

	if (checkUnique &&
		((gp_indexcheck_insert == INDEX_CHECK_ALL && RelationIsHeap(heapRel)) ||
		 (gp_indexcheck_insert == INDEX_CHECK_SYSTEM &&
		  PG_CATALOG_NAMESPACE == RelationGetNamespace(heapRel))))
	{
		_bt_validate_tid(rel, ht_ctid);
	}

	/* generate an index tuple */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	itup->t_tid = *ht_ctid;

	_bt_doinsert(rel, itup, checkUnique, heapRel);

	pfree(itup);

	MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT;

	PG_RETURN_BOOL(true);
}
/*---------------------------------------------------------------------------
 * This cluster code allows for clustering multiple tables at once. Because
 * of this, we cannot just run everything in a single transaction, or we
 * would be forced to acquire exclusive locks on all the tables being
 * clustered simultaneously --- very likely leading to deadlock.
 *
 * To solve this we follow a similar strategy to the VACUUM code,
 * clustering each relation in a separate transaction. For this to work,
 * we need to:
 *	- provide a separate memory context so that we can pass information in
 *	  a way that survives across transactions
 *	- start a new transaction every time a new relation is clustered
 *	- check the validity of the information on to-be-clustered relations,
 *	  as someone might have deleted a relation behind our back, or
 *	  clustered one on a different index
 *	- end the transaction
 *
 * The single-relation case does not have any such overhead.
 *
 * We also allow a relation to be specified without an index. In that case,
 * the indisclustered bit will be looked up, and an ERROR will be thrown
 * if there is no index with the bit set.
 *---------------------------------------------------------------------------
 */
void
cluster(ClusterStmt *stmt, bool isTopLevel)
{
	if (stmt->relation != NULL)
	{
		/* This is the single-relation case. */
		Oid			tableOid,
					indexOid = InvalidOid;
		Relation	rel;
		RelToCluster rvtc;

		/* Find and lock the table */
		rel = heap_openrv(stmt->relation, AccessExclusiveLock);

		tableOid = RelationGetRelid(rel);

		/* Check permissions */
		if (!pg_class_ownercheck(tableOid, GetUserId()))
			aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS,
						   RelationGetRelationName(rel));

		/*
		 * Reject clustering a remote temp table ... their local buffer
		 * manager is not going to cope.
		 */
		if (isOtherTempNamespace(RelationGetNamespace(rel)))
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("cannot cluster temporary tables of other sessions")));

		if (stmt->indexname == NULL)
		{
			ListCell   *index;

			/* We need to find the index that has indisclustered set. */
			foreach(index, RelationGetIndexList(rel))
			{
				HeapTuple	idxtuple;
				Form_pg_index indexForm;

				indexOid = lfirst_oid(index);
				idxtuple = SearchSysCache(INDEXRELID,
										  ObjectIdGetDatum(indexOid),
										  0, 0, 0);
				if (!HeapTupleIsValid(idxtuple))
					elog(ERROR, "cache lookup failed for index %u", indexOid);
				indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
				if (indexForm->indisclustered)
				{
					ReleaseSysCache(idxtuple);
					break;
				}
				ReleaseSysCache(idxtuple);
				indexOid = InvalidOid;
			}
/*
 * GetNewOid
 *		Generate a new OID that is unique within the given relation.
 *
 * Caller must have a suitable lock on the relation.
 *
 * Uniqueness is promised only if the relation has a unique index on OID.
 * This is true for all system catalogs that have OIDs, but might not be
 * true for user tables. Note that we are effectively assuming that the
 * table has a relatively small number of entries (much less than 2^32)
 * and there aren't very long runs of consecutive existing OIDs. Again,
 * this is reasonable for system catalogs but less so for user tables.
 *
 * Since the OID is not immediately inserted into the table, there is a
 * race condition here; but a problem could occur only if someone else
 * managed to cycle through 2^32 OIDs and generate the same OID before we
 * finish inserting our row. This seems unlikely to be a problem. Note
 * that if we had to *commit* the row to end the race condition, the risk
 * would be rather higher; therefore we use SnapshotDirty in the test,
 * so that we will see uncommitted rows.
 */
Oid
GetNewOid(Relation relation)
{
	Oid			newOid;
	Oid			oidIndex;
	Relation	indexrel;

	/* If relation doesn't have OIDs at all, caller is confused */
	Assert(relation->rd_rel->relhasoids);

	/* In bootstrap mode, we don't have any indexes to use */
	if (IsBootstrapProcessingMode())
		return GetNewObjectId();

	/* The relcache will cache the identity of the OID index for us */
	oidIndex = RelationGetOidIndex(relation);

	/* If no OID index, just hand back the next OID counter value */
	if (!OidIsValid(oidIndex))
	{
		Oid			result;

		/*
		 * System catalogs that have OIDs should *always* have a unique OID
		 * index; we should only take this path for user tables. Give a
		 * warning if it looks like somebody forgot an index.
		 */
		if (IsSystemRelation(relation))
			elog(WARNING, "generating possibly-non-unique OID for \"%s\"",
				 RelationGetRelationName(relation));

		result = GetNewObjectId();

		if (IsSystemNamespace(RelationGetNamespace(relation)))
		{
			if (Gp_role == GP_ROLE_EXECUTE)
			{
				elog(DEBUG1, "Allocating Oid %u on relid %u %s in EXECUTE mode",
					 result, relation->rd_id, RelationGetRelationName(relation));
			}
			if (Gp_role == GP_ROLE_DISPATCH)
			{
				elog(DEBUG5, "Allocating Oid %u on relid %u %s in DISPATCH mode",
					 result, relation->rd_id, RelationGetRelationName(relation));
			}
		}
		return result;
	}

	/* Otherwise, use the index to find a nonconflicting OID */
	indexrel = index_open(oidIndex, AccessShareLock);
	newOid = GetNewOidWithIndex(relation, indexrel);
	index_close(indexrel, AccessShareLock);

	return newOid;
}
/*
 * GetNewOidWithIndex
 *		Guts of GetNewOid: use the supplied index
 *
 * This is exported separately because there are cases where we want to use
 * an index that will not be recognized by RelationGetOidIndex: TOAST tables
 * and pg_largeobject have indexes that are usable, but have multiple columns
 * and are on ordinary columns rather than a true OID column. This code
 * will work anyway, so long as the OID is the index's first column.
 *
 * Caller must have a suitable lock on the relation.
 */
Oid
GetNewOidWithIndex(Relation relation, Relation indexrel)
{
	Oid			newOid;
	IndexScanDesc scan;
	ScanKeyData key;
	bool		collides;

	/* Generate new OIDs until we find one not in the table */
	do
	{
		CHECK_FOR_INTERRUPTS();

		newOid = GetNewObjectId();

		ScanKeyInit(&key,
					(AttrNumber) 1,
					BTEqualStrategyNumber, F_OIDEQ,
					ObjectIdGetDatum(newOid));

		/* see notes above about using SnapshotDirty */
		scan = index_beginscan(relation, indexrel,
							   SnapshotDirty, 1, &key);

		collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection));

		index_endscan(scan);
	} while (collides);

	if (IsSystemNamespace(RelationGetNamespace(relation)))
	{
		if (Gp_role == GP_ROLE_EXECUTE)
		{
			if (relation->rd_id != 2604 /* pg_attrdef */ &&
				relation->rd_id != 2606 /* pg_constraint */ &&
				relation->rd_id != 2615 /* pg_namespace */ )
				elog(DEBUG1, "Allocating Oid %u with index on relid %u %s in EXECUTE mode",
					 newOid, relation->rd_id, RelationGetRelationName(relation));
			else
				elog(DEBUG4, "Allocating Oid %u with index on relid %u %s in EXECUTE mode",
					 newOid, relation->rd_id, RelationGetRelationName(relation));
		}
		if (Gp_role == GP_ROLE_DISPATCH)
		{
			elog(DEBUG5, "Allocating Oid %u with index on relid %u %s in DISPATCH mode",
				 newOid, relation->rd_id, RelationGetRelationName(relation));
		}
	}

	return newOid;
}
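/*
 * Hedged usage sketch (not part of the original file): the comment above
 * notes that GetNewOidWithIndex() is exported for indexes that
 * RelationGetOidIndex would not recognize, such as a TOAST table's
 * (chunk_id, chunk_seq) index. A caller along those lines, with a
 * hypothetical helper name, could be:
 */
static Oid
get_new_chunk_id(Relation toastrel, Oid toastidxid)
{
	Relation	toastidx = index_open(toastidxid, RowExclusiveLock);
	Oid			chunk_id;

	/* works because the OID is the first column of the TOAST index */
	chunk_id = GetNewOidWithIndex(toastrel, toastidx);

	index_close(toastidx, RowExclusiveLock);
	return chunk_id;
}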
/*
 * Write relation description to the output stream.
 */
void
logicalrep_write_rel(StringInfo out, Relation rel)
{
	char	   *relname;

	pq_sendbyte(out, 'R');		/* sending RELATION */

	/* use Oid as relation identifier */
	pq_sendint(out, RelationGetRelid(rel), 4);

	/* send qualified relation name */
	logicalrep_write_namespace(out, RelationGetNamespace(rel));
	relname = RelationGetRelationName(rel);
	pq_sendstring(out, relname);

	/* send replica identity */
	pq_sendbyte(out, rel->rd_rel->relreplident);

	/* send the attribute info */
	logicalrep_write_attrs(out, rel);
}
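/*
 * Hedged sketch of the matching read path (not part of the original file):
 * the field order must mirror logicalrep_write_rel() exactly. The
 * LogicalRepRelation fields and the logicalrep_read_namespace() /
 * logicalrep_read_attrs() counterparts are assumptions here.
 */
static void
logicalrep_read_rel_sketch(StringInfo in, LogicalRepRelation *rel)
{
	/* the 'R' message-type byte is assumed consumed by the dispatcher */
	rel->remoteid = pq_getmsgint(in, 4);	/* relation OID */
	rel->nspname = pstrdup(logicalrep_read_namespace(in));	/* schema */
	rel->relname = pstrdup(pq_getmsgstring(in));	/* relation name */
	rel->replident = pq_getmsgbyte(in);		/* replica identity */
	logicalrep_read_attrs(in, rel);			/* attribute info */
}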
/*
 * OIDs for catalog objects are normally allocated in the master, and
 * executor nodes should just use the OIDs passed by the master. But
 * there are some exceptions.
 */
static bool
RelationNeedsSynchronizedOIDs(Relation relation)
{
	if (IsSystemNamespace(RelationGetNamespace(relation)))
	{
		switch (RelationGetRelid(relation))
		{
				/*
				 * pg_largeobject is more like a user table, and has
				 * different contents in each segment and master.
				 */
			case LargeObjectRelationId:
				return false;

				/*
				 * We don't currently synchronize the OIDs of these catalogs.
				 * It's a bit sketchy that we don't, but we get away with it
				 * because these OIDs don't appear in any of the Node structs
				 * that are dispatched from master to segments. (Except for
				 * the OIDs, the contents of these tables should be in sync.)
				 */
			case RewriteRelationId:
			case TriggerRelationId:
			case AccessMethodOperatorRelationId:
			case AccessMethodProcedureRelationId:
				return false;
		}

		/*
		 * All other system catalogs are assumed to need synchronized OIDs.
		 */
		return true;
	}
	return false;
}
Datum
bt_page_items(PG_FUNCTION_ARGS)
{
	text	   *relname = PG_GETARG_TEXT_P(0);
	uint32		blkno = PG_GETARG_UINT32(1);
	RangeVar   *relrv;
	Datum		result;
	char	   *values[BTPAGEITEMS_NCOLUMNS];
	BTPageOpaque opaque;
	HeapTuple	tuple;
	ItemId		id;
	FuncCallContext *fctx;
	MemoryContext mctx;
	struct user_args *uargs = NULL;

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 (errmsg("must be superuser to use pgstattuple functions"))));

	if (SRF_IS_FIRSTCALL())
	{
		fctx = SRF_FIRSTCALL_INIT();
		mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);

		uargs = palloc(sizeof(struct user_args));

		uargs->tupd = RelationNameGetTupleDesc(BTPAGEITEMS_TYPE);
		uargs->offset = FirstOffsetNumber;

		relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
		uargs->rel = relation_openrv(relrv, AccessShareLock);

		if (!IS_INDEX(uargs->rel) || !IS_BTREE(uargs->rel))
			elog(ERROR, "bt_page_items() can be used only on a b-tree index.");

		/*
		 * Reject attempts to read non-local temporary relations; we would be
		 * likely to get wrong data since we have no visibility into the
		 * owning session's local buffers.
		 */
		if (isOtherTempNamespace(RelationGetNamespace(uargs->rel)))
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("cannot access temporary tables of other sessions")));

		if (blkno == 0)
			elog(ERROR, "Block 0 is a meta page.");

		CHECK_RELATION_BLOCK_RANGE(uargs->rel, blkno);

		uargs->buffer = ReadBuffer(uargs->rel, blkno);
		uargs->page = BufferGetPage(uargs->buffer);

		opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);

		if (P_ISDELETED(opaque))
			elog(NOTICE, "bt_page_items(): this page is deleted.");

		fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
		fctx->user_fctx = uargs;

		MemoryContextSwitchTo(mctx);
	}

	fctx = SRF_PERCALL_SETUP();
	uargs = fctx->user_fctx;

	if (fctx->call_cntr < fctx->max_calls)
	{
		IndexTuple	itup;

		id = PageGetItemId(uargs->page, uargs->offset);

		if (!ItemIdIsValid(id))
			elog(ERROR, "Invalid ItemId.");

		itup = (IndexTuple) PageGetItem(uargs->page, id);

		{
			int			j = 0;

			/* use a distinct name to avoid shadowing the blkno parameter */
			BlockNumber itup_blkno = BlockIdGetBlockNumber(&(itup->t_tid.ip_blkid));

			values[j] = palloc(32);
			snprintf(values[j++], 32, "%d", uargs->offset);
			values[j] = palloc(32);
			snprintf(values[j++], 32, "(%u,%u)", itup_blkno, itup->t_tid.ip_posid);
			values[j] = palloc(32);
			snprintf(values[j++], 32, "%d", (int) IndexTupleSize(itup));
			values[j] = palloc(32);
			snprintf(values[j++], 32, "%c", IndexTupleHasNulls(itup) ? 't' : 'f');
			values[j] = palloc(32);
			snprintf(values[j++], 32, "%c", IndexTupleHasVarwidths(itup) ? 't' : 'f');

			{
				int			off;
				char	   *dump;
				char	   *ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info);

				dump = palloc(IndexTupleSize(itup) * 3);
				memset(dump, 0, IndexTupleSize(itup) * 3);

				for (off = 0;
					 off < IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info);
					 off++)
				{
					if (dump[0] == '\0')
						sprintf(dump, "%02x", *(ptr + off) & 0xff);
					else
					{
						char		buf[4];

						sprintf(buf, " %02x", *(ptr + off) & 0xff);
						strcat(dump, buf);
					}
				}
				values[j] = dump;
			}

			tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(uargs->tupd), values);
			result = TupleGetDatum(TupleDescGetSlot(uargs->tupd), tuple);
		}

		uargs->offset = uargs->offset + 1;

		SRF_RETURN_NEXT(fctx, result);
	}
	else
	{
		ReleaseBuffer(uargs->buffer);
		relation_close(uargs->rel, AccessShareLock);

		SRF_RETURN_DONE(fctx);
	}
}
/* * ExecRefreshMatView -- execute a REFRESH MATERIALIZED VIEW command * * This refreshes the materialized view by creating a new table and swapping * the relfilenodes of the new table and the old materialized view, so the OID * of the original materialized view is preserved. Thus we do not lose GRANT * nor references to this materialized view. * * If WITH NO DATA was specified, this is effectively like a TRUNCATE; * otherwise it is like a TRUNCATE followed by an INSERT using the SELECT * statement associated with the materialized view. The statement node's * skipData field shows whether the clause was used. * * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading * the new heap, it's better to create the indexes afterwards than to fill them * incrementally while we load. * * The matview's "populated" state is changed based on whether the contents * reflect the result set of the materialized view's query. */ ObjectAddress ExecRefreshMatView(RefreshMatViewStmt *stmt, const char *queryString, ParamListInfo params, char *completionTag) { Oid matviewOid; Relation matviewRel; RewriteRule *rule; List *actions; Query *dataQuery; Oid tableSpace; Oid relowner; Oid OIDNewHeap; DestReceiver *dest; uint64 processed = 0; bool concurrent; LOCKMODE lockmode; char relpersistence; Oid save_userid; int save_sec_context; int save_nestlevel; ObjectAddress address; /* Determine strength of lock needed. */ concurrent = stmt->concurrent; lockmode = concurrent ? ExclusiveLock : AccessExclusiveLock; /* * Get a lock until end of transaction. */ matviewOid = RangeVarGetRelidExtended(stmt->relation, lockmode, 0, RangeVarCallbackOwnsTable, NULL); matviewRel = table_open(matviewOid, NoLock); /* Make sure it is a materialized view. */ if (matviewRel->rd_rel->relkind != RELKIND_MATVIEW) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("\"%s\" is not a materialized view", RelationGetRelationName(matviewRel)))); /* Check that CONCURRENTLY is not specified if not populated. */ if (concurrent && !RelationIsPopulated(matviewRel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("CONCURRENTLY cannot be used when the materialized view is not populated"))); /* Check that conflicting options have not been specified. */ if (concurrent && stmt->skipData) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("CONCURRENTLY and WITH NO DATA options cannot be used together"))); /* * Check that everything is correct for a refresh. Problems at this point * are internal errors, so elog is sufficient. */ if (matviewRel->rd_rel->relhasrules == false || matviewRel->rd_rules->numLocks < 1) elog(ERROR, "materialized view \"%s\" is missing rewrite information", RelationGetRelationName(matviewRel)); if (matviewRel->rd_rules->numLocks > 1) elog(ERROR, "materialized view \"%s\" has too many rules", RelationGetRelationName(matviewRel)); rule = matviewRel->rd_rules->rules[0]; if (rule->event != CMD_SELECT || !(rule->isInstead)) elog(ERROR, "the rule for materialized view \"%s\" is not a SELECT INSTEAD OF rule", RelationGetRelationName(matviewRel)); actions = rule->actions; if (list_length(actions) != 1) elog(ERROR, "the rule for materialized view \"%s\" is not a single action", RelationGetRelationName(matviewRel)); /* * Check that there is a unique index with no WHERE clause on one or more * columns of the materialized view if CONCURRENTLY is specified. 
*/ if (concurrent) { List *indexoidlist = RelationGetIndexList(matviewRel); ListCell *indexoidscan; bool hasUniqueIndex = false; foreach(indexoidscan, indexoidlist) { Oid indexoid = lfirst_oid(indexoidscan); Relation indexRel; indexRel = index_open(indexoid, AccessShareLock); hasUniqueIndex = is_usable_unique_index(indexRel); index_close(indexRel, AccessShareLock); if (hasUniqueIndex) break; } list_free(indexoidlist); if (!hasUniqueIndex) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot refresh materialized view \"%s\" concurrently", quote_qualified_identifier(get_namespace_name(RelationGetNamespace(matviewRel)), RelationGetRelationName(matviewRel))), errhint("Create a unique index with no WHERE clause on one or more columns of the materialized view."))); }
/* * DefineIndex * Creates a new index. * * 'heapRelation': the relation the index will apply to. * 'indexRelationName': the name for the new index, or NULL to indicate * that a nonconflicting default name should be picked. * 'indexRelationId': normally InvalidOid, but during bootstrap can be * nonzero to specify a preselected OID for the index. * 'accessMethodName': name of the AM to use. * 'tableSpaceName': name of the tablespace to create the index in. * NULL specifies using the appropriate default. * 'attributeList': a list of IndexElem specifying columns and expressions * to index on. * 'predicate': the partial-index condition, or NULL if none. * 'rangetable': needed to interpret the predicate. * 'options': reloptions from WITH (in list-of-DefElem form). * 'unique': make the index enforce uniqueness. * 'primary': mark the index as a primary key in the catalogs. * 'isconstraint': index is for a PRIMARY KEY or UNIQUE constraint, * so build a pg_constraint entry for it. * 'is_alter_table': this is due to an ALTER rather than a CREATE operation. * 'check_rights': check for CREATE rights in the namespace. (This should * be true except when ALTER is deleting/recreating an index.) * 'skip_build': make the catalog entries but leave the index file empty; * it will be filled later. * 'quiet': suppress the NOTICE chatter ordinarily provided for constraints. * 'concurrent': avoid blocking writers to the table while building. */ void DefineIndex(RangeVar *heapRelation, char *indexRelationName, Oid indexRelationId, char *accessMethodName, char *tableSpaceName, List *attributeList, Expr *predicate, List *rangetable, List *options, bool unique, bool primary, bool isconstraint, bool is_alter_table, bool check_rights, bool skip_build, bool quiet, bool concurrent) { Oid *classObjectId; Oid accessMethodId; Oid relationId; Oid namespaceId; Oid tablespaceId; Relation rel; HeapTuple tuple; Form_pg_am accessMethodForm; RegProcedure amoptions; Datum reloptions; IndexInfo *indexInfo; int numberOfAttributes; List *old_xact_list; ListCell *lc; uint32 ixcnt; LockRelId heaprelid; LOCKTAG heaplocktag; Snapshot snapshot; Relation pg_index; HeapTuple indexTuple; Form_pg_index indexForm; /* * count attributes in index */ numberOfAttributes = list_length(attributeList); if (numberOfAttributes <= 0) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("must specify at least one column"))); if (numberOfAttributes > INDEX_MAX_KEYS) ereport(ERROR, (errcode(ERRCODE_TOO_MANY_COLUMNS), errmsg("cannot use more than %d columns in an index", INDEX_MAX_KEYS))); /* * Open heap relation, acquire a suitable lock on it, remember its OID * * Only SELECT ... FOR UPDATE/SHARE are allowed while doing a standard * index build; but for concurrent builds we allow INSERT/UPDATE/DELETE * (but not VACUUM). */ rel = heap_openrv(heapRelation, (concurrent ? ShareUpdateExclusiveLock : ShareLock)); relationId = RelationGetRelid(rel); namespaceId = RelationGetNamespace(rel); /* Note: during bootstrap may see uncataloged relation */ if (rel->rd_rel->relkind != RELKIND_RELATION && rel->rd_rel->relkind != RELKIND_UNCATALOGED) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not a table", heapRelation->relname))); /* * Don't try to CREATE INDEX on temp tables of other backends. */ if (isOtherTempNamespace(namespaceId)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot create indexes on temporary tables of other sessions"))); /* * Verify we (still) have CREATE rights in the rel's namespace. 
* (Presumably we did when the rel was created, but maybe not anymore.) * Skip check if caller doesn't want it. Also skip check if * bootstrapping, since permissions machinery may not be working yet. */ if (check_rights && !IsBootstrapProcessingMode()) { AclResult aclresult; aclresult = pg_namespace_aclcheck(namespaceId, GetUserId(), ACL_CREATE); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_NAMESPACE, get_namespace_name(namespaceId)); } /* * Select tablespace to use. If not specified, use default_tablespace * (which may in turn default to database's default). */ if (tableSpaceName) { tablespaceId = get_tablespace_oid(tableSpaceName); if (!OidIsValid(tablespaceId)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("tablespace \"%s\" does not exist", tableSpaceName))); } else { tablespaceId = GetDefaultTablespace(); /* note InvalidOid is OK in this case */ } /* Check permissions except when using database's default */ if (OidIsValid(tablespaceId)) { AclResult aclresult; aclresult = pg_tablespace_aclcheck(tablespaceId, GetUserId(), ACL_CREATE); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_TABLESPACE, get_tablespace_name(tablespaceId)); } /* * Force shared indexes into the pg_global tablespace. This is a bit of a * hack but seems simpler than marking them in the BKI commands. */ if (rel->rd_rel->relisshared) tablespaceId = GLOBALTABLESPACE_OID; /* * Select name for index if caller didn't specify */ if (indexRelationName == NULL) { if (primary) indexRelationName = ChooseRelationName(RelationGetRelationName(rel), NULL, "pkey", namespaceId); else { IndexElem *iparam = (IndexElem *) linitial(attributeList); indexRelationName = ChooseRelationName(RelationGetRelationName(rel), iparam->name, "key", namespaceId); } } /* * look up the access method, verify it can handle the requested features */ tuple = SearchSysCache(AMNAME, PointerGetDatum(accessMethodName), 0, 0, 0); if (!HeapTupleIsValid(tuple)) { /* * Hack to provide more-or-less-transparent updating of old RTREE * indexes to GIST: if RTREE is requested and not found, use GIST. */ if (strcmp(accessMethodName, "rtree") == 0) { ereport(NOTICE, (errmsg("substituting access method \"gist\" for obsolete method \"rtree\""))); accessMethodName = "gist"; tuple = SearchSysCache(AMNAME, PointerGetDatum(accessMethodName), 0, 0, 0); } if (!HeapTupleIsValid(tuple)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("access method \"%s\" does not exist", accessMethodName))); } accessMethodId = HeapTupleGetOid(tuple); accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); if (unique && !accessMethodForm->amcanunique) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("access method \"%s\" does not support unique indexes", accessMethodName))); if (numberOfAttributes > 1 && !accessMethodForm->amcanmulticol) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("access method \"%s\" does not support multicolumn indexes", accessMethodName))); amoptions = accessMethodForm->amoptions; ReleaseSysCache(tuple); /* * If a range table was created then check that only the base rel is * mentioned. */ if (rangetable != NIL) { if (list_length(rangetable) != 1 || getrelid(1, rangetable) != relationId) ereport(ERROR, (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), errmsg("index expressions and predicates may refer only to the table being indexed"))); } /* * Validate predicate, if given */ if (predicate) CheckPredicate(predicate); /* * Extra checks when creating a PRIMARY KEY index. 
*/ if (primary) { List *cmds; ListCell *keys; /* * If ALTER TABLE, check that there isn't already a PRIMARY KEY. In * CREATE TABLE, we have faith that the parser rejected multiple pkey * clauses; and CREATE INDEX doesn't have a way to say PRIMARY KEY, so * it's no problem either. */ if (is_alter_table && relationHasPrimaryKey(rel)) { ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), errmsg("multiple primary keys for table \"%s\" are not allowed", RelationGetRelationName(rel)))); } /* * Check that all of the attributes in a primary key are marked as not * null, otherwise attempt to ALTER TABLE .. SET NOT NULL */ cmds = NIL; foreach(keys, attributeList) { IndexElem *key = (IndexElem *) lfirst(keys); HeapTuple atttuple; if (!key->name) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("primary keys cannot be expressions"))); /* System attributes are never null, so no problem */ if (SystemAttributeByName(key->name, rel->rd_rel->relhasoids)) continue; atttuple = SearchSysCacheAttName(relationId, key->name); if (HeapTupleIsValid(atttuple)) { if (!((Form_pg_attribute) GETSTRUCT(atttuple))->attnotnull) { /* Add a subcommand to make this one NOT NULL */ AlterTableCmd *cmd = makeNode(AlterTableCmd); cmd->subtype = AT_SetNotNull; cmd->name = key->name; cmds = lappend(cmds, cmd); } ReleaseSysCache(atttuple); } else { /* * This shouldn't happen during CREATE TABLE, but can happen * during ALTER TABLE. Keep message in sync with * transformIndexConstraints() in parser/analyze.c. */ ereport(ERROR, (errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("column \"%s\" named in key does not exist", key->name))); } } /* * XXX: Shouldn't the ALTER TABLE .. SET NOT NULL cascade to child * tables? Currently, since the PRIMARY KEY itself doesn't cascade, * we don't cascade the notnull constraint(s) either; but this is * pretty debatable. * * XXX: possible future improvement: when being called from ALTER * TABLE, it would be more efficient to merge this with the outer * ALTER TABLE, so as to avoid two scans. But that seems to * complicate DefineIndex's API unduly. */ if (cmds) AlterTableInternal(relationId, cmds, false); }
/*
 * Post vacuum, iterate over all entries in the index and check that the
 * heap TID of each entry exists and is not dead. For specific system
 * tables, also ensure that the key in the index entry matches the
 * corresponding attribute in the heap tuple.
 */
void
_bt_validate_vacuum(Relation irel, Relation hrel, TransactionId oldest_xmin)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	BlockNumber blkno;
	BlockNumber num_pages;
	Buffer		ibuf = InvalidBuffer;
	Buffer		hbuf = InvalidBuffer;
	Page		ipage;
	BTPageOpaque opaque;
	IndexTuple	itup;
	HeapTupleData htup;
	OffsetNumber maxoff,
				minoff,
				offnum;
	Oid			ioid,
				hoid;
	bool		isnull;

	blkno = BTREE_METAPAGE + 1;
	num_pages = RelationGetNumberOfBlocks(irel);

	elog(LOG, "btvalidatevacuum: index %s, heap %s",
		 RelationGetRelationName(irel), RelationGetRelationName(hrel));

	MIRROREDLOCK_BUFMGR_LOCK;
	for (; blkno < num_pages; blkno++)
	{
		ibuf = ReadBuffer(irel, blkno);
		ipage = BufferGetPage(ibuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(ipage);
		if (!PageIsNew(ipage))
			_bt_checkpage(irel, ibuf);
		if (P_ISLEAF(opaque))
		{
			minoff = P_FIRSTDATAKEY(opaque);
			maxoff = PageGetMaxOffsetNumber(ipage);
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itup = (IndexTuple) PageGetItem(ipage,
												PageGetItemId(ipage, offnum));
				ItemPointerCopy(&itup->t_tid, &htup.t_self);

				/*
				 * TODO: construct a tid bitmap based on index tids and fetch
				 * heap tids in order afterwards. That will also allow
				 * validating if a heap tid appears twice in a unique index.
				 */
				if (!heap_release_fetch(hrel, SnapshotAny, &htup, &hbuf,
										true, NULL))
				{
					elog(ERROR, "btvalidatevacuum: tid (%d,%d) from index %s "
						 "not found in heap %s",
						 ItemPointerGetBlockNumber(&itup->t_tid),
						 ItemPointerGetOffsetNumber(&itup->t_tid),
						 RelationGetRelationName(irel),
						 RelationGetRelationName(hrel));
				}
				switch (HeapTupleSatisfiesVacuum(hrel, htup.t_data,
												 oldest_xmin, hbuf))
				{
					case HEAPTUPLE_RECENTLY_DEAD:
					case HEAPTUPLE_LIVE:
					case HEAPTUPLE_INSERT_IN_PROGRESS:
					case HEAPTUPLE_DELETE_IN_PROGRESS:
						/* these tuples are considered alive by vacuum */
						break;
					case HEAPTUPLE_DEAD:
						elog(ERROR, "btvalidatevacuum: vacuum did not remove "
							 "dead tuple (%d,%d) from heap %s and index %s",
							 ItemPointerGetBlockNumber(&itup->t_tid),
							 ItemPointerGetOffsetNumber(&itup->t_tid),
							 RelationGetRelationName(hrel),
							 RelationGetRelationName(irel));
						break;
					default:
						elog(ERROR, "btvalidatevacuum: invalid visibility");
						break;
				}
				switch (RelationGetRelid(irel))
				{
					case DatabaseOidIndexId:
					case TypeOidIndexId:
					case ClassOidIndexId:
					case ConstraintOidIndexId:
						hoid = HeapTupleGetOid(&htup);
						ioid = index_getattr(itup, 1, RelationGetDescr(irel),
											 &isnull);
						if (hoid != ioid)
						{
							elog(ERROR,
								 "btvalidatevacuum: index oid(%u) != heap oid(%u)"
								 " tuple (%d,%d) index %s",
								 ioid, hoid,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						break;
					case GpRelationNodeOidIndexId:
						hoid = heap_getattr(&htup, 1, RelationGetDescr(hrel),
											&isnull);
						ioid = index_getattr(itup, 1, RelationGetDescr(irel),
											 &isnull);
						if (hoid != ioid)
						{
							elog(ERROR,
								 "btvalidatevacuum: index oid(%u) != heap oid(%u)"
								 " tuple (%d,%d) index %s",
								 ioid, hoid,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						int4		hsegno = heap_getattr(&htup, 2,
														  RelationGetDescr(hrel),
														  &isnull);
						int4		isegno = index_getattr(itup, 2,
														   RelationGetDescr(irel),
														   &isnull);

						if (isegno != hsegno)
						{
							elog(ERROR,
								 "btvalidatevacuum: index segno(%d) != heap segno(%d)"
								 " tuple (%d,%d) index %s",
								 isegno, hsegno,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						break;
					default:
						break;
				}
				if (RelationGetNamespace(irel) == PG_AOSEGMENT_NAMESPACE)
				{
					int4		isegno = index_getattr(itup, 1,
													   RelationGetDescr(irel),
													   &isnull);
					int4		hsegno = heap_getattr(&htup, 1,
													  RelationGetDescr(hrel),
													  &isnull);

					if (isegno != hsegno)
					{
						elog(ERROR,
							 "btvalidatevacuum: index segno(%d) != heap segno(%d)"
							 " tuple (%d,%d) index %s",
							 isegno, hsegno,
							 ItemPointerGetBlockNumber(&itup->t_tid),
							 ItemPointerGetOffsetNumber(&itup->t_tid),
							 RelationGetRelationName(irel));
					}
				}
			}
		}
		if (BufferIsValid(ibuf))
			ReleaseBuffer(ibuf);
	}
	if (BufferIsValid(hbuf))
		ReleaseBuffer(hbuf);
	MIRROREDLOCK_BUFMGR_UNLOCK;
}
/* * lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation * * This routine vacuums a single heap, cleans out its indexes, and * updates its relpages and reltuples statistics. * * At entry, we have already established a transaction and opened * and locked the relation. */ void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, BufferAccessStrategy bstrategy) { LVRelStats *vacrelstats; Relation *Irel; int nindexes; BlockNumber possibly_freeable; PGRUsage ru0; TimestampTz starttime = 0; long secs; int usecs; double read_rate, write_rate; bool scan_all; TransactionId freezeTableLimit; BlockNumber new_rel_pages; double new_rel_tuples; BlockNumber new_rel_allvisible; TransactionId new_frozen_xid; /* measure elapsed time iff autovacuum logging requires it */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0) { pg_rusage_init(&ru0); starttime = GetCurrentTimestamp(); } if (vacstmt->options & VACOPT_VERBOSE) elevel = INFO; else elevel = DEBUG2; vac_strategy = bstrategy; vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age, onerel->rd_rel->relisshared, &OldestXmin, &FreezeLimit, &freezeTableLimit); scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid, freezeTableLimit); vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); vacrelstats->old_rel_pages = onerel->rd_rel->relpages; vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples; vacrelstats->num_index_scans = 0; /* Open all indexes of the relation */ vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); vacrelstats->hasindex = (nindexes > 0); /* Do the vacuuming */ lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all); /* Done with indexes */ vac_close_indexes(nindexes, Irel, NoLock); /* * Optionally truncate the relation. * * Don't even think about it unless we have a shot at releasing a goodly * number of pages. Otherwise, the time taken isn't worth it. */ possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages; if (possibly_freeable > 0 && (possibly_freeable >= REL_TRUNCATE_MINIMUM || possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)) lazy_truncate_heap(onerel, vacrelstats); /* Vacuum the Free Space Map */ FreeSpaceMapVacuum(onerel); /* * Update statistics in pg_class. * * A corner case here is that if we scanned no pages at all because every * page is all-visible, we should not update relpages/reltuples, because * we have no new information to contribute. In particular this keeps * us from replacing relpages=reltuples=0 (which means "unknown tuple * density") with nonzero relpages and reltuples=0 (which means "zero * tuple density") unless there's some actual evidence for the latter. * * We do update relallvisible even in the corner case, since if the * table is all-visible we'd definitely like to know that. But clamp * the value to be not more than what we're setting relpages to. * * Also, don't change relfrozenxid if we skipped any pages, since then * we don't know for certain that all tuples have a newer xmin. 
*/ new_rel_pages = vacrelstats->rel_pages; new_rel_tuples = vacrelstats->new_rel_tuples; if (vacrelstats->scanned_pages == 0 && new_rel_pages > 0) { new_rel_pages = vacrelstats->old_rel_pages; new_rel_tuples = vacrelstats->old_rel_tuples; } new_rel_allvisible = visibilitymap_count(onerel); if (new_rel_allvisible > new_rel_pages) new_rel_allvisible = new_rel_pages; new_frozen_xid = FreezeLimit; if (vacrelstats->scanned_pages < vacrelstats->rel_pages) new_frozen_xid = InvalidTransactionId; vac_update_relstats(onerel, new_rel_pages, new_rel_tuples, new_rel_allvisible, vacrelstats->hasindex, new_frozen_xid); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, new_rel_tuples); /* and log the action if appropriate */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0) { TimestampTz endtime = GetCurrentTimestamp(); if (Log_autovacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, endtime, Log_autovacuum_min_duration)) { TimestampDifference(starttime, endtime, &secs, &usecs); read_rate = 0; write_rate = 0; if ((secs > 0) || (usecs > 0)) { read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) / (secs + usecs / 1000000.0); write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) / (secs + usecs / 1000000.0); } ereport(LOG, (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n" "pages: %d removed, %d remain\n" "tuples: %.0f removed, %.0f remain\n" "buffer usage: %d hits, %d misses, %d dirtied\n" "avg read rate: %.3f MiB/s, avg write rate: %.3f MiB/s\n" "system usage: %s", get_database_name(MyDatabaseId), get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel), vacrelstats->num_index_scans, vacrelstats->pages_removed, vacrelstats->rel_pages, vacrelstats->tuples_deleted, vacrelstats->new_rel_tuples, VacuumPageHit, VacuumPageMiss, VacuumPageDirty, read_rate,write_rate, pg_rusage_show(&ru0)))); } } }
/* * lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation * * This routine vacuums a single heap, cleans out its indexes, and * updates its relpages and reltuples statistics. * * At entry, we have already established a transaction and opened * and locked the relation. */ void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, BufferAccessStrategy bstrategy) { LVRelStats *vacrelstats; Relation *Irel; int nindexes; BlockNumber possibly_freeable; PGRUsage ru0; TimestampTz starttime = 0; bool scan_all; TransactionId freezeTableLimit; pg_rusage_init(&ru0); /* measure elapsed time iff autovacuum logging requires it */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration > 0) starttime = GetCurrentTimestamp(); if (vacstmt->options & VACOPT_VERBOSE) elevel = INFO; else elevel = DEBUG2; vac_strategy = bstrategy; vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age, onerel->rd_rel->relisshared, &OldestXmin, &FreezeLimit, &freezeTableLimit); scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid, freezeTableLimit); vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples; vacrelstats->num_index_scans = 0; /* Open all indexes of the relation */ vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel); vacrelstats->hasindex = (nindexes > 0); /* Do the vacuuming */ lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all); /* Done with indexes */ vac_close_indexes(nindexes, Irel, NoLock); /* * Optionally truncate the relation. * * Don't even think about it unless we have a shot at releasing a goodly * number of pages. Otherwise, the time taken isn't worth it. */ possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages; if (possibly_freeable > 0 && (possibly_freeable >= REL_TRUNCATE_MINIMUM || possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)) lazy_truncate_heap(onerel, vacrelstats); /* Vacuum the Free Space Map */ FreeSpaceMapVacuum(onerel); /* * Update statistics in pg_class. But don't change relfrozenxid if we * skipped any pages. */ vac_update_relstats(onerel, vacrelstats->rel_pages, vacrelstats->new_rel_tuples, vacrelstats->hasindex, (vacrelstats->scanned_pages < vacrelstats->rel_pages) ? InvalidTransactionId : FreezeLimit); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, vacrelstats->new_rel_tuples); /* and log the action if appropriate */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0) { if (Log_autovacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(), Log_autovacuum_min_duration)) ereport(LOG, (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n" "pages: %d removed, %d remain\n" "tuples: %.0f removed, %.0f remain\n" "system usage: %s", get_database_name(MyDatabaseId), get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel), vacrelstats->num_index_scans, vacrelstats->pages_removed, vacrelstats->rel_pages, vacrelstats->tuples_deleted, vacrelstats->new_rel_tuples, pg_rusage_show(&ru0)))); } }
/*
 * vacuum_appendonly_rel() -- vacuum an append-only relation
 *
 * This procedure will be what gets executed both for VACUUM
 * and VACUUM FULL (and also ANALYZE or any other thing that
 * needs the pg_class stats updated).
 *
 * The function can either compact append-only segment files or just
 * truncate a segment file to its existing eof.
 *
 * Afterwards, the reltuples and relpages information in pg_class
 * are updated. reltuples is the same as the "pg_aoseg_<oid>:tupcount"
 * column, and we simulate relpages by subdividing the eof value
 * ("pg_aoseg_<oid>:eof") over the defined page size.
 *
 * There are txn ids, hint bits, free space, dead tuples, etc., but
 * these are all irrelevant in the append-only relation context.
 */
void
vacuum_appendonly_rel(Relation aorel, VacuumStmt *vacstmt)
{
	char	   *relname;
	PGRUsage	ru0;

	Assert(RelationIsAoRows(aorel) || RelationIsAoCols(aorel));
	Assert(!vacuumStatement_IsInAppendOnlyCleanupPhase(vacstmt));

	pg_rusage_init(&ru0);
	relname = RelationGetRelationName(aorel);
	ereport(elevel,
			(errmsg("vacuuming \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(aorel)),
					relname)));

	if (Gp_role == GP_ROLE_DISPATCH)
	{
		return;
	}
	Assert(list_length(vacstmt->appendonly_compaction_insert_segno) <= 1);
	if (vacstmt->appendonly_compaction_insert_segno == NULL)
	{
		elogif(Debug_appendonly_print_compaction, LOG,
			   "Vacuum drop phase %s", RelationGetRelationName(aorel));

		if (RelationIsAoRows(aorel))
		{
			AppendOnlyDrop(aorel, vacstmt->appendonly_compaction_segno);
		}
		else
		{
			Assert(RelationIsAoCols(aorel));
			AOCSDrop(aorel, vacstmt->appendonly_compaction_segno);
		}
	}
	else
	{
		int			insert_segno = linitial_int(vacstmt->appendonly_compaction_insert_segno);

		if (insert_segno == APPENDONLY_COMPACTION_SEGNO_INVALID)
		{
			elogif(Debug_appendonly_print_compaction, LOG,
				   "Vacuum pseudo-compaction phase %s",
				   RelationGetRelationName(aorel));
		}
		else
		{
			elogif(Debug_appendonly_print_compaction, LOG,
				   "Vacuum compaction phase %s",
				   RelationGetRelationName(aorel));
			if (RelationIsAoRows(aorel))
			{
				AppendOnlyCompact(aorel,
								  vacstmt->appendonly_compaction_segno,
								  insert_segno, vacstmt->full);
			}
			else
			{
				Assert(RelationIsAoCols(aorel));
				AOCSCompact(aorel,
							vacstmt->appendonly_compaction_segno,
							insert_segno, vacstmt->full);
			}
		}
	}
}
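/*
 * Sketch of the relpages simulation described in the header comment above
 * (an illustration, not code from the original file; the helper name is
 * hypothetical): the summed segfile EOF is divided by the block size,
 * rounding up.
 */
static BlockNumber
aorel_simulated_relpages(int64 total_eof)
{
	/* e.g. total_eof = 100000 with BLCKSZ = 32768 yields 4 pages */
	return (BlockNumber) ((total_eof + BLCKSZ - 1) / BLCKSZ);
}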
desc = lfirst(lc); if (desc->indexRelid == indexRel->rd_id) return desc; } heapRel = relation_open(indexRel->rd_index->indrelid, AccessShareLock); desc = palloc0(sizeof(ZDBIndexDescriptor)); /* these all come from the actual index */ desc->indexRelid = RelationGetRelid(indexRel); desc->heapRelid = RelationGetRelid(heapRel); desc->isShadow = ZDBIndexOptionsGetShadow(indexRel) != NULL; desc->logit = false; desc->databaseName = pstrdup(get_database_name(MyDatabaseId)); desc->schemaName = pstrdup(get_namespace_name(RelationGetNamespace(heapRel))); desc->tableName = pstrdup(RelationGetRelationName(heapRel)); desc->options = ZDBIndexOptionsGetOptions(indexRel) == NULL ? NULL : pstrdup(ZDBIndexOptionsGetOptions(indexRel)); desc->searchPreference = ZDBIndexOptionsGetSearchPreference(indexRel); desc->refreshInterval = ZDBIndexOptionsGetRefreshInterval(indexRel); desc->bulk_concurrency = ZDBIndexOptionsGetBulkConcurrency(indexRel); desc->batch_size = ZDBIndexOptionsGetBatchSize(indexRel); desc->ignoreVisibility = ZDBIndexOptionsGetIgnoreVisibility(indexRel); desc->fieldLists = ZDBIndexOptionsGetFieldLists(indexRel); desc->alwaysResolveJoins = ZDBIndexOptionsAlwaysResolveJoins(indexRel); heapTupDesc = RelationGetDescr(heapRel); for (i = 0; i < heapTupDesc->natts; i++) { if (heapTupDesc->attrs[i]->atttypid == JSONOID) { desc->hasJson = true;
/* * refresh_by_match_merge * * Refresh a materialized view with transactional semantics, while allowing * concurrent reads. * * This is called after a new version of the data has been created in a * temporary table. It performs a full outer join against the old version of * the data, producing "diff" results. This join cannot work if there are any * duplicated rows in either the old or new versions, in the sense that every * column would compare as equal between the two rows. It does work correctly * in the face of rows which have at least one NULL value, with all non-NULL * columns equal. The behavior of NULLs on equality tests and on UNIQUE * indexes turns out to be quite convenient here; the tests we need to make * are consistent with default behavior. If there is at least one UNIQUE * index on the materialized view, we have exactly the guarantee we need. * * The temporary table used to hold the diff results contains just the TID of * the old record (if matched) and the ROW from the new table as a single * column of complex record type (if matched). * * Once we have the diff table, we perform set-based DELETE and INSERT * operations against the materialized view, and discard both temporary * tables. * * Everything from the generation of the new data to applying the differences * takes place under cover of an ExclusiveLock, since it seems as though we * would want to prohibit not only concurrent REFRESH operations, but also * incremental maintenance. It also doesn't seem reasonable or safe to allow * SELECT FOR UPDATE or SELECT FOR SHARE on rows being updated or deleted by * this command. */ static void refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner, int save_sec_context) { StringInfoData querybuf; Relation matviewRel; Relation tempRel; char *matviewname; char *tempname; char *diffname; TupleDesc tupdesc; bool foundUniqueIndex; List *indexoidlist; ListCell *indexoidscan; int16 relnatts; bool *usedForQual; initStringInfo(&querybuf); matviewRel = heap_open(matviewOid, NoLock); matviewname = quote_qualified_identifier(get_namespace_name(RelationGetNamespace(matviewRel)), RelationGetRelationName(matviewRel)); tempRel = heap_open(tempOid, NoLock); tempname = quote_qualified_identifier(get_namespace_name(RelationGetNamespace(tempRel)), RelationGetRelationName(tempRel)); diffname = make_temptable_name_n(tempname, 2); relnatts = matviewRel->rd_rel->relnatts; usedForQual = (bool *) palloc0(sizeof(bool) * relnatts); /* Open SPI context. */ if (SPI_connect() != SPI_OK_CONNECT) elog(ERROR, "SPI_connect failed"); /* Analyze the temp table with the new contents. */ appendStringInfo(&querybuf, "ANALYZE %s", tempname); if (SPI_exec(querybuf.data, 0) != SPI_OK_UTILITY) elog(ERROR, "SPI_exec failed: %s", querybuf.data); /* * We need to ensure that there are not duplicate rows without NULLs in * the new data set before we can count on the "diff" results. Check for * that in a way that allows showing the first duplicated row found. Even * after we pass this test, a unique index on the materialized view may * find a duplicate key problem. 
*/ resetStringInfo(&querybuf); appendStringInfo(&querybuf, "SELECT newdata FROM %s newdata " "WHERE newdata IS NOT NULL AND EXISTS " "(SELECT * FROM %s newdata2 WHERE newdata2 IS NOT NULL " "AND newdata2 OPERATOR(pg_catalog.*=) newdata " "AND newdata2.ctid OPERATOR(pg_catalog.<>) " "newdata.ctid) LIMIT 1", tempname, tempname); if (SPI_execute(querybuf.data, false, 1) != SPI_OK_SELECT) elog(ERROR, "SPI_exec failed: %s", querybuf.data); if (SPI_processed > 0) { ereport(ERROR, (errcode(ERRCODE_CARDINALITY_VIOLATION), errmsg("new data for \"%s\" contains duplicate rows without any null columns", RelationGetRelationName(matviewRel)), errdetail("Row: %s", SPI_getvalue(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1)))); } SetUserIdAndSecContext(relowner, save_sec_context | SECURITY_LOCAL_USERID_CHANGE); /* Start building the query for creating the diff table. */ resetStringInfo(&querybuf); appendStringInfo(&querybuf, "CREATE TEMP TABLE %s AS " "SELECT mv.ctid AS tid, newdata " "FROM %s mv FULL JOIN %s newdata ON (", diffname, matviewname, tempname); /* * Get the list of index OIDs for the table from the relcache, and look up * each one in the pg_index syscache. We will test for equality on all * columns present in all unique indexes which only reference columns and * include all rows. */ tupdesc = matviewRel->rd_att; foundUniqueIndex = false; indexoidlist = RelationGetIndexList(matviewRel); foreach(indexoidscan, indexoidlist) { Oid indexoid = lfirst_oid(indexoidscan); Relation indexRel; Form_pg_index indexStruct; indexRel = index_open(indexoid, RowExclusiveLock); indexStruct = indexRel->rd_index; /* * We're only interested if it is unique, valid, contains no * expressions, and is not partial. */ if (indexStruct->indisunique && IndexIsValid(indexStruct) && RelationGetIndexExpressions(indexRel) == NIL && RelationGetIndexPredicate(indexRel) == NIL) { int numatts = indexStruct->indnatts; int i; /* Add quals for all columns from this index. */ for (i = 0; i < numatts; i++) { int attnum = indexStruct->indkey.values[i]; Oid type; Oid op; const char *colname; /* * Only include the column once regardless of how many times * it shows up in how many indexes. */ if (usedForQual[attnum - 1]) continue; usedForQual[attnum - 1] = true; /* * Actually add the qual, ANDed with any others. */ if (foundUniqueIndex) appendStringInfoString(&querybuf, " AND "); colname = quote_identifier(NameStr((tupdesc->attrs[attnum - 1])->attname)); appendStringInfo(&querybuf, "newdata.%s ", colname); type = attnumTypeId(matviewRel, attnum); op = lookup_type_cache(type, TYPECACHE_EQ_OPR)->eq_opr; mv_GenerateOper(&querybuf, op); appendStringInfo(&querybuf, " mv.%s", colname); foundUniqueIndex = true; } } /* Keep the locks, since we're about to run DML which needs them. */ index_close(indexRel, NoLock); }
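/*
 * For illustration (not from the original file): for a hypothetical matview
 * mv(a int, b text) with a unique index on (a), the diff statement being
 * assembled above starts out roughly as
 *
 *     CREATE TEMP TABLE <diffname> AS
 *     SELECT mv.ctid AS tid, newdata
 *     FROM <matviewname> mv FULL JOIN <tempname> newdata
 *       ON (newdata.a OPERATOR(pg_catalog.=) mv.a ...
 *
 * with the ON clause completed, and a filter added, by code beyond this
 * snippet. In the resulting diff table, a row with only tid set identifies
 * a row to DELETE from the matview, and a row with only newdata set
 * identifies a row to INSERT.
 */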
/* * lazy_truncate_heap - try to truncate off any empty pages at the end */ static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats) { BlockNumber old_rel_pages = vacrelstats->rel_pages; BlockNumber new_rel_pages; PGRUsage ru0; int lock_retry; pg_rusage_init(&ru0); /* * Loop until no more truncating can be done. */ do { /* * We need full exclusive lock on the relation in order to do * truncation. If we can't get it, give up rather than waiting --- we * don't want to block other backends, and we don't want to deadlock * (which is quite possible considering we already hold a lower-grade * lock). */ vacrelstats->lock_waiter_detected = false; lock_retry = 0; while (true) { if (ConditionalLockRelation(onerel, AccessExclusiveLock)) break; /* * Check for interrupts while trying to (re-)acquire the exclusive * lock. */ CHECK_FOR_INTERRUPTS(); if (++lock_retry > (AUTOVACUUM_TRUNCATE_LOCK_TIMEOUT / AUTOVACUUM_TRUNCATE_LOCK_WAIT_INTERVAL)) { /* * We failed to establish the lock in the specified number of * retries. This means we give up truncating. Suppress the * ANALYZE step. Doing an ANALYZE at this point will reset the * dead_tuple_count in the stats collector, so we will not get * called by the autovacuum launcher again to do the truncate. */ vacrelstats->lock_waiter_detected = true; ereport(LOG, (errmsg("automatic vacuum of table \"%s.%s.%s\": " "could not (re)acquire exclusive " "lock for truncate scan", get_database_name(MyDatabaseId), get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel)))); return; } pg_usleep(AUTOVACUUM_TRUNCATE_LOCK_WAIT_INTERVAL); } /* * Now that we have exclusive lock, look to see if the rel has grown * whilst we were vacuuming with non-exclusive lock. If so, give up; * the newly added pages presumably contain non-deletable tuples. */ new_rel_pages = RelationGetNumberOfBlocks(onerel); if (new_rel_pages != old_rel_pages) { /* * Note: we intentionally don't update vacrelstats->rel_pages with * the new rel size here. If we did, it would amount to assuming * that the new pages are empty, which is unlikely. Leaving the * numbers alone amounts to assuming that the new pages have the * same tuple density as existing ones, which is less unlikely. */ UnlockRelation(onerel, AccessExclusiveLock); return; } /* * Scan backwards from the end to verify that the end pages actually * contain no tuples. This is *necessary*, not optional, because * other backends could have added tuples to these pages whilst we * were vacuuming. */ new_rel_pages = count_nondeletable_pages(onerel, vacrelstats); if (new_rel_pages >= old_rel_pages) { /* can't do anything after all */ UnlockRelation(onerel, AccessExclusiveLock); return; } /* * Okay to truncate. */ RelationTruncate(onerel, new_rel_pages); /* * We can release the exclusive lock as soon as we have truncated. * Other backends can't safely access the relation until they have * processed the smgr invalidation that smgrtruncate sent out ... but * that should happen as part of standard invalidation processing once * they acquire lock on the relation. */ UnlockRelation(onerel, AccessExclusiveLock); /* * Update statistics. Here, it *is* correct to adjust rel_pages * without also touching reltuples, since the tuple count wasn't * changed by the truncation. 
*/ vacrelstats->pages_removed += old_rel_pages - new_rel_pages; vacrelstats->rel_pages = new_rel_pages; ereport(elevel, (errmsg("\"%s\": truncated %u to %u pages", RelationGetRelationName(onerel), old_rel_pages, new_rel_pages), errdetail("%s.", pg_rusage_show(&ru0)))); old_rel_pages = new_rel_pages; } while (new_rel_pages > vacrelstats->nonempty_pages && vacrelstats->lock_waiter_detected); }
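/*
 * Illustrative sketch of the bounded retry loop in lazy_truncate_heap()
 * above, as a standalone C program.  try_lock() is a hypothetical stand-in
 * for ConditionalLockRelation(); the timeout and interval values are
 * assumptions, not necessarily this tree's settings.
 */
#include <stdbool.h>
#include <unistd.h>

#define LOCK_TIMEOUT_MS			5000	/* assumed total budget */
#define LOCK_WAIT_INTERVAL_MS	50		/* assumed sleep between attempts */

static bool
try_lock(void)
{
	return true;				/* stub: a real caller attempts the lock */
}

static bool
acquire_with_timeout(void)
{
	int			retries = 0;

	while (!try_lock())
	{
		/* give up rather than block or deadlock other backends */
		if (++retries > LOCK_TIMEOUT_MS / LOCK_WAIT_INTERVAL_MS)
			return false;
		usleep(LOCK_WAIT_INTERVAL_MS * 1000);
	}
	return true;				/* lock acquired within budget */
}

int
main(void)
{
	return acquire_with_timeout() ? 0 : 1;
}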
/* * IsToastRelation * True iff relation is a TOAST support relation (or index). */ bool IsToastRelation(Relation relation) { return IsToastNamespace(RelationGetNamespace(relation)); }
Datum
spgstat(PG_FUNCTION_ARGS)
{
	text	   *name = PG_GETARG_TEXT_P(0);
	char	   *relname = text_to_cstring(name);
	RangeVar   *relvar;
	Relation	index;
	List	   *relname_list;
	Oid			relOid;
	BlockNumber blkno = SPGIST_HEAD_BLKNO;
	BlockNumber totalPages = 0,
				innerPages = 0,
				emptyPages = 0;
	double		usedSpace = 0.0;
	char		res[1024];
	int			bufferSize = -1;
	int64		innerTuples = 0,
				leafTuples = 0;

	relname_list = stringToQualifiedNameList(relname);
	relvar = makeRangeVarFromNameList(relname_list);
	relOid = RangeVarGetRelid(relvar, false);

	index = index_open(relOid, AccessExclusiveLock);

	if (index->rd_am == NULL)
		elog(ERROR, "relation \"%s.%s\" is not an index",
			 get_namespace_name(RelationGetNamespace(index)),
			 RelationGetRelationName(index));

	totalPages = RelationGetNumberOfBlocks(index);

	for (blkno = SPGIST_HEAD_BLKNO; blkno < totalPages; blkno++)
	{
		Buffer		buffer;
		Page		page;

		buffer = ReadBuffer(index, blkno);
		LockBuffer(buffer, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buffer);

		if (SpGistPageIsLeaf(page))
		{
			leafTuples += SpGistPageGetMaxOffset(page);
		}
		else
		{
			innerPages++;
			innerTuples += SpGistPageGetMaxOffset(page);
		}

		if (bufferSize < 0)
			bufferSize = BufferGetPageSize(buffer) -
				MAXALIGN(sizeof(SpGistPageOpaqueData)) -
				SizeOfPageHeaderData;

		usedSpace += bufferSize - (PageGetFreeSpace(page) + sizeof(ItemIdData));

		if (PageGetFreeSpace(page) + sizeof(ItemIdData) == bufferSize)
			emptyPages++;

		UnlockReleaseBuffer(buffer);
	}

	index_close(index, AccessExclusiveLock);

	totalPages--;				/* discount the metapage */

	snprintf(res, sizeof(res),
			 "totalPages: %u\n"
			 "innerPages: %u\n"
			 "leafPages: %u\n"
			 "emptyPages: %u\n"
			 "usedSpace: %.2f kbytes\n"
			 "freeSpace: %.2f kbytes\n"
			 "fillRatio: %.2f%%\n"
			 "leafTuples: " INT64_FORMAT "\n"
			 "innerTuples: " INT64_FORMAT,
			 totalPages, innerPages, totalPages - innerPages, emptyPages,
			 usedSpace / 1024.0,
			 (((double) bufferSize) * ((double) totalPages) - usedSpace) / 1024.0,
			 100.0 * (usedSpace / (((double) bufferSize) * ((double) totalPages))),
			 leafTuples, innerTuples);

	PG_RETURN_TEXT_P(CStringGetTextDatum(res));
}
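/*
 * Worked example of the report arithmetic above (values assumed; real
 * bufferSize is the page size minus header and SP-GiST opaque data, taken
 * as a round 8192 here for arithmetic's sake): with 100 data pages and
 * usedSpace = 614400 bytes, freeSpace is (8192*100 - 614400)/1024 = 200
 * kbytes and fillRatio is 75.00%.
 */
static double
spgist_fill_ratio(double used_space, double buffer_size, double total_pages)
{
	/* same formula as the snprintf above, expressed as a percentage */
	return 100.0 * used_space / (buffer_size * total_pages);
}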
/* ------------------------------------------------
 * bt_metap()
 *
 * Get btree meta-page information
 *
 * Usage: SELECT * FROM bt_metap('t1_pkey')
 * ------------------------------------------------
 */
Datum
bt_metap(PG_FUNCTION_ARGS)
{
	text	   *relname = PG_GETARG_TEXT_P(0);
	Buffer		buffer;
	Relation	rel;
	RangeVar   *relrv;
	Datum		result;

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 (errmsg("must be superuser to use pgstattuple functions"))));

	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
	rel = relation_openrv(relrv, AccessShareLock);

	if (!IS_INDEX(rel) || !IS_BTREE(rel))
		elog(ERROR, "bt_metap() can be used only on a b-tree index");

	/*
	 * Reject attempts to read non-local temporary relations; we would be
	 * likely to get wrong data since we have no visibility into the owning
	 * session's local buffers.
	 */
	if (isOtherTempNamespace(RelationGetNamespace(rel)))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary tables of other sessions")));

	buffer = ReadBuffer(rel, 0);

	{
		BTMetaPageData *metad;
		TupleDesc	tupleDesc;
		int			j;
		char	   *values[BTMETAP_NCOLUMNS];
		HeapTuple	tuple;
		Page		page = BufferGetPage(buffer);

		metad = BTPageGetMeta(page);

		tupleDesc = RelationNameGetTupleDesc(BTMETAP_TYPE);

		j = 0;
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%d", metad->btm_magic);
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%d", metad->btm_version);
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%d", metad->btm_root);
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%d", metad->btm_level);
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%d", metad->btm_fastroot);
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%d", metad->btm_fastlevel);

		tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
									   values);

		result = TupleGetDatum(TupleDescGetSlot(tupleDesc), tuple);
	}

	ReleaseBuffer(buffer);

	relation_close(rel, AccessShareLock);

	PG_RETURN_DATUM(result);
}
/*
 * SPI_getnspname
 *		Return a palloc'd copy of the name of the namespace (schema)
 *		containing the given relation.
 */
char *
SPI_getnspname(Relation rel)
{
	return get_namespace_name(RelationGetNamespace(rel));
}
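/*
 * Minimal usage sketch (a hypothetical helper, not part of the SPI API):
 * pairing SPI_getnspname() with SPI_getrelname() to build a quoted,
 * schema-qualified name.  quote_qualified_identifier() is declared in
 * utils/builtins.h.
 */
static char *
spi_qualified_relname(Relation rel)
{
	return quote_qualified_identifier(SPI_getnspname(rel),
									  SPI_getrelname(rel));
}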
/* * IsAoSegmentRelation * True iff relation is an AO segment support relation (or index). */ bool IsAoSegmentRelation(Relation relation) { return IsAoSegmentNamespace(RelationGetNamespace(relation)); }
/*
 *	lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its relpages and reltuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 *
 *		The return value indicates whether this function has held off
 *		interrupts -- caller must RESUME_INTERRUPTS() after commit if true.
 */
bool
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
				BufferAccessStrategy bstrategy, List *updated_stats)
{
	LVRelStats *vacrelstats;
	Relation   *Irel;
	int			nindexes;
	BlockNumber possibly_freeable;
	PGRUsage	ru0;
	TimestampTz starttime = 0;
	bool		heldoff = false;

	pg_rusage_init(&ru0);

	/* measure elapsed time iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration > 0)
		starttime = GetCurrentTimestamp();

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG2;

	if (Gp_role == GP_ROLE_DISPATCH)
		elevel = DEBUG2;		/* vacuum and analyze messages aren't
								 * interesting from the QD */

#ifdef FAULT_INJECTOR
	if (vacuumStatement_IsInAppendOnlyDropPhase(vacstmt))
	{
		FaultInjector_InjectFaultIfSet(
			CompactionBeforeSegmentFileDropPhase,
			DDLNotSpecified,
			"",		// databaseName
			"");	// tableName
	}
	if (vacuumStatement_IsInAppendOnlyCleanupPhase(vacstmt))
	{
		FaultInjector_InjectFaultIfSet(
			CompactionBeforeCleanupPhase,
			DDLNotSpecified,
			"",		// databaseName
			"");	// tableName
	}
#endif

	/*
	 * MPP-23647.  Update xid limits for heap as well as appendonly
	 * relations.  This allows setting relfrozenxid to a correct value for an
	 * appendonly (AO/CO) table.
	 */
	vac_strategy = bstrategy;
	vacuum_set_xid_limits(vacstmt->freeze_min_age, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	/*
	 * Execute the various vacuum operations.  Appendonly tables are treated
	 * differently.
	 */
	if (RelationIsAoRows(onerel) || RelationIsAoCols(onerel))
	{
		lazy_vacuum_aorel(onerel, vacstmt, updated_stats);
		return false;
	}

	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

	/* heap relation */

	/* Set threshold for interesting free space = average request size */
	/* XXX should we scale it up or down?  Adjust vacuum.c too, if so */
	vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node);

	vacrelstats->num_index_scans = 0;

	/* Open all indexes of the relation */
	vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
	vacrelstats->hasindex = (nindexes > 0);

	/* Do the vacuuming */
	lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, updated_stats);

	/* Done with indexes */
	vac_close_indexes(nindexes, Irel, NoLock);

	/*
	 * Optionally truncate the relation.
	 *
	 * Don't even think about it unless we have a shot at releasing a goodly
	 * number of pages.  Otherwise, the time taken isn't worth it.
	 *
	 * Note that after we've truncated the heap, it's too late to abort the
	 * transaction; doing so would lose the sinval messages needed to tell
	 * the other backends about the table being shrunk.  We prevent interrupts
	 * in that case; caller is responsible for re-enabling them after
	 * committing the transaction.
*/ possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages; if (possibly_freeable > 0 && (possibly_freeable >= REL_TRUNCATE_MINIMUM || possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)) { HOLD_INTERRUPTS(); heldoff = true; lazy_truncate_heap(onerel, vacrelstats); } /* Update shared free space map with final free space info */ lazy_update_fsm(onerel, vacrelstats); if (vacrelstats->tot_free_pages > MaxFSMPages) ereport(WARNING, (errmsg("relation \"%s.%s\" contains more than \"max_fsm_pages\" pages with useful free space", get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel)), /* Only suggest VACUUM FULL if > 20% free */ (vacrelstats->tot_free_pages > vacrelstats->rel_pages * 0.20) ? errhint("Consider using VACUUM FULL on this relation or increasing the configuration parameter \"max_fsm_pages\".") : errhint("Consider increasing the configuration parameter \"max_fsm_pages\"."))); /* Update statistics in pg_class */ vac_update_relstats_from_list(onerel, vacrelstats->rel_pages, vacrelstats->rel_tuples, vacrelstats->hasindex, FreezeLimit, updated_stats); /* report results to the stats collector, too */ pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, true /*vacrelstats->scanned_all*/, vacstmt->analyze, vacrelstats->rel_tuples); if (gp_indexcheck_vacuum == INDEX_CHECK_ALL || (gp_indexcheck_vacuum == INDEX_CHECK_SYSTEM && PG_CATALOG_NAMESPACE == RelationGetNamespace(onerel))) { int i; for (i = 0; i < nindexes; i++) { if (Irel[i]->rd_rel->relam == BTREE_AM_OID) _bt_validate_vacuum(Irel[i], onerel, OldestXmin); } } /* and log the action if appropriate */ if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0) { if (Log_autovacuum_min_duration == 0 || TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(), Log_autovacuum_min_duration)) ereport(LOG, (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n" "pages: %d removed, %d remain\n" "tuples: %.0f removed, %.0f remain\n" "system usage: %s", get_database_name(MyDatabaseId), get_namespace_name(RelationGetNamespace(onerel)), RelationGetRelationName(onerel), vacrelstats->num_index_scans, vacrelstats->pages_removed, vacrelstats->rel_pages, vacrelstats->tuples_deleted, vacrelstats->rel_tuples, pg_rusage_show(&ru0)))); } return heldoff; }
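/*
 * Sketch of the "goodly number of pages" test above.  In stock PostgreSQL,
 * REL_TRUNCATE_MINIMUM is 1000 and REL_TRUNCATE_FRACTION is 16; the values
 * in this tree may differ.  Example: rel_pages = 10000 and nonempty_pages
 * = 9500 gives possibly_freeable = 500, which fails both tests (500 < 1000
 * and 500 < 10000/16 = 625), so truncation would be skipped.
 */
static int
should_try_truncate(unsigned int rel_pages, unsigned int nonempty_pages)
{
	unsigned int possibly_freeable = rel_pages - nonempty_pages;

	return possibly_freeable > 0 &&
		(possibly_freeable >= 1000 ||			/* REL_TRUNCATE_MINIMUM (assumed) */
		 possibly_freeable >= rel_pages / 16);	/* REL_TRUNCATE_FRACTION (assumed) */
}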
/*
 * table_log()
 *
 * Trigger function for logging table changes.
 *
 * Parameters (all optional, in trigger-argument order):
 *   - log table name
 *   - write session user flag (1 to log the session user)
 *   - log schema name
 * Returns:
 *   - trigger data (for Pg)
 */
Datum
table_log(PG_FUNCTION_ARGS)
{
	TriggerData *trigdata = (TriggerData *) fcinfo->context;
	int			ret;
	char		query[250 + NAMEDATALEN];	/* for building the qualified log
											 * table name; 250 chars plus one
											 * identifier should be enough */
	int			number_columns = 0;		/* number of columns in the original
										 * table */
	int			number_columns_log = 0;	/* number of columns in the log
										 * table */
	char	   *orig_schema;
	char	   *log_schema;
	char	   *log_table;
	int			use_session_user = 0;	/* should we write the current
										 * (session) user to the log table? */

	/*
	 * Some checks first...
	 */

#ifdef TABLE_LOG_DEBUG
	elog(NOTICE, "start table_log()");
#endif /* TABLE_LOG_DEBUG */

	/* called by trigger manager? */
	if (!CALLED_AS_TRIGGER(fcinfo))
	{
		elog(ERROR, "table_log: not fired by trigger manager");
	}

	/* must only be called for ROW triggers */
	if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
	{
		elog(ERROR, "table_log: can't process STATEMENT events");
	}

	/* must only be called AFTER the event */
	if (TRIGGER_FIRED_BEFORE(trigdata->tg_event))
	{
		elog(ERROR, "table_log: must be fired after event");
	}

	/* now connect to SPI manager */
	ret = SPI_connect();
	if (ret != SPI_OK_CONNECT)
	{
		elog(ERROR, "table_log: SPI_connect returned %d", ret);
	}

	/* get the schema name of the table, in case we need it later */
	orig_schema = get_namespace_name(RelationGetNamespace(trigdata->tg_relation));

#ifdef TABLE_LOG_DEBUG
	elog(NOTICE, "prechecks done, now getting original table attributes");
#endif /* TABLE_LOG_DEBUG */

	number_columns = count_columns(trigdata->tg_relation->rd_att);
	if (number_columns < 1)
	{
		elog(ERROR, "table_log: number of columns in table is < 1, can this happen?");
	}

#ifdef TABLE_LOG_DEBUG
	elog(NOTICE, "number of columns in orig table: %i", number_columns);
#endif /* TABLE_LOG_DEBUG */

	if (trigdata->tg_trigger->tgnargs > 3)
	{
		elog(ERROR, "table_log: too many arguments to trigger");
	}

	/* name of the log schema */
	if (trigdata->tg_trigger->tgnargs > 2)
	{
		/* a log schema argument is given: use it */
		log_schema = trigdata->tg_trigger->tgargs[2];
	}
	else
	{
		/* if not, use orig_schema */
		log_schema = orig_schema;
	}

	/* should we write the current user?
 */
	if (trigdata->tg_trigger->tgnargs > 1)
	{
		/* a second argument is given: use it, if it is 1 */
		if (atoi(trigdata->tg_trigger->tgargs[1]) == 1)
		{
			use_session_user = 1;
#ifdef TABLE_LOG_DEBUG
			elog(NOTICE, "will write session user to 'trigger_user'");
#endif /* TABLE_LOG_DEBUG */
		}
	}

	/* name of the log table */
	if (trigdata->tg_trigger->tgnargs > 0)
	{
		/* a log table argument is given: use it */
		log_table = (char *) palloc((strlen(trigdata->tg_trigger->tgargs[0]) + 2) * sizeof(char));
		sprintf(log_table, "%s", trigdata->tg_trigger->tgargs[0]);
	}
	else
	{
		/* if not, use 'table name' + '_log' */
		log_table = (char *) palloc((strlen(do_quote_ident(SPI_getrelname(trigdata->tg_relation))) + 5) * sizeof(char));
		sprintf(log_table, "%s_log", SPI_getrelname(trigdata->tg_relation));
	}

#ifdef TABLE_LOG_DEBUG
	elog(NOTICE, "log table: %s", log_table);
#endif /* TABLE_LOG_DEBUG */

#ifdef TABLE_LOG_DEBUG
	elog(NOTICE, "now check if the log table exists");
#endif /* TABLE_LOG_DEBUG */

	/* get the number of columns in the log table */
	snprintf(query, sizeof(query), "%s.%s", do_quote_ident(log_schema), do_quote_ident(log_table));
	number_columns_log = count_columns(RelationNameGetTupleDesc(query));
	if (number_columns_log < 1)
	{
		elog(ERROR, "could not get number of columns in relation %s", log_table);
	}

#ifdef TABLE_LOG_DEBUG
	elog(NOTICE, "number of columns in log table: %i", number_columns_log);
#endif /* TABLE_LOG_DEBUG */

	/*
	 * The log table must have 3 (or now 4) more columns than our table,
	 * plus 1 more if we should write the session user.
	 */
	if (use_session_user == 0)
	{
		/* without session user */
		if (number_columns_log != number_columns + 3 &&
			number_columns_log != number_columns + 4)
		{
			elog(ERROR, "number of columns in relation %s(%d) does not match columns in %s(%d)",
				 SPI_getrelname(trigdata->tg_relation), number_columns,
				 log_table, number_columns_log);
		}
	}
	else
	{
		/* with session user */
		if (number_columns_log != number_columns + 3 + 1 &&
			number_columns_log != number_columns + 4 + 1)
		{
			elog(ERROR, "number of columns in relation %s(%d) does not match columns in %s(%d)",
				 SPI_getrelname(trigdata->tg_relation), number_columns,
				 log_table, number_columns_log);
		}
	}

#ifdef TABLE_LOG_DEBUG
	elog(NOTICE, "log table OK");
#endif /* TABLE_LOG_DEBUG */

	/* For each column in key ...
*/ #ifdef TABLE_LOG_DEBUG elog(NOTICE, "copy data ..."); #endif /* TABLE_LOG_DEBUG */ if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event)) { /* trigger called from INSERT */ #ifdef TABLE_LOG_DEBUG elog(NOTICE, "mode: INSERT -> new"); #endif /* TABLE_LOG_DEBUG */ __table_log(trigdata, "INSERT", "new", trigdata->tg_trigtuple, number_columns, log_table, use_session_user, log_schema); } else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event)) { /* trigger called from UPDATE */ #ifdef TABLE_LOG_DEBUG elog(NOTICE, "mode: UPDATE -> old"); #endif /* TABLE_LOG_DEBUG */ __table_log(trigdata, "UPDATE", "old", trigdata->tg_trigtuple, number_columns, log_table, use_session_user, log_schema); #ifdef TABLE_LOG_DEBUG elog(NOTICE, "mode: UPDATE -> new"); #endif /* TABLE_LOG_DEBUG */ __table_log(trigdata, "UPDATE", "new", trigdata->tg_newtuple, number_columns, log_table, use_session_user, log_schema); } else if (TRIGGER_FIRED_BY_DELETE(trigdata->tg_event)) { /* trigger called from DELETE */ #ifdef TABLE_LOG_DEBUG elog(NOTICE, "mode: DELETE -> old"); #endif /* TABLE_LOG_DEBUG */ __table_log(trigdata, "DELETE", "old", trigdata->tg_trigtuple, number_columns, log_table, use_session_user, log_schema); } else { elog(ERROR, "trigger fired by unknown event"); } #ifdef TABLE_LOG_DEBUG elog(NOTICE, "cleanup, trigger done"); #endif /* TABLE_LOG_DEBUG */ /* clean up */ pfree(log_table); /* close SPI connection */ SPI_finish(); /* return trigger data */ return PointerGetDatum(trigdata->tg_trigtuple); }
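/*
 * Usage sketch (the log-table column names are assumptions inferred from
 * the "+3 (or now 4)" column-count check above; consult the module's
 * documentation for the authoritative layout):
 *
 *   CREATE TABLE t_log AS SELECT * FROM t LIMIT 0;
 *   ALTER TABLE t_log
 *     ADD COLUMN trigger_mode text,
 *     ADD COLUMN trigger_tuple text,
 *     ADD COLUMN trigger_changed timestamptz;
 *     -- a 4th extra column is also accepted; add trigger_user text
 *     -- as well if the session-user flag is passed to the trigger
 *   CREATE TRIGGER t_log_trig
 *     AFTER INSERT OR UPDATE OR DELETE ON t
 *     FOR EACH ROW EXECUTE PROCEDURE table_log();
 */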
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, bool scan_all) { BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; BlockNumber next_not_all_visible_block; bool skipping_all_visible_blocks; pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->scanned_pages = 0; vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); /* * We want to skip pages that don't require vacuuming according to the * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD * consecutive pages. Since we're reading sequentially, the OS should be * doing readahead for us, so there's no gain in skipping a page now and * then; that's likely to disable readahead and so be counterproductive. * Also, skipping even a single page means that we can't update * relfrozenxid, so we only want to do it if we can skip a goodly number * of pages. * * Before entering the main loop, establish the invariant that * next_not_all_visible_block is the next block number >= blkno that's not * all-visible according to the visibility map, or nblocks if there's no * such block. Also, we set up the skipping_all_visible_blocks flag, * which is needed because we need hysteresis in the decision: once we've * started skipping blocks, we may as well skip everything up to the next * not-all-visible block. * * Note: if scan_all is true, we won't actually skip any pages; but we * maintain next_not_all_visible_block anyway, so as to set up the * all_visible_according_to_vm flag correctly for each page. 
*/ for (next_not_all_visible_block = 0; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; bool all_visible_according_to_vm; bool all_visible; bool has_dead_tuples; if (blkno == next_not_all_visible_block) { /* Time to advance next_not_all_visible_block */ for (next_not_all_visible_block++; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } /* * We know we can't skip the current block. But set up * skipping_all_visible_blocks to do the right thing at the * following blocks. */ if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; all_visible_according_to_vm = false; } else { /* Current block is all-visible */ if (skipping_all_visible_blocks && !scan_all) continue; all_visible_according_to_vm = true; } vacuum_delay_point(); vacrelstats->scanned_pages++; /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ LockBufferForCleanup(buf); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. 
*/ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; } freespace = PageGetHeapFreeSpace(page); MarkBufferDirty(buf); UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } if (PageIsEmpty(page)) { empty_pages++; freespace = PageGetHeapFreeSpace(page); if (!PageIsAllVisible(page)) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, &vacrelstats->latestRemovedXid); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ all_visible = true; has_dead_tuples = false; nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); all_visible = false; continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. 
*/ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ all_visible = false; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); /* * Is the tuple definitely visible to all transactions? * * NB: Like with per-tuple hint bits, we can't set the * PD_ALL_VISIBLE flag if the inserter committed * asynchronously. See SetHintBits for more info. Check * that the HEAP_XMIN_COMMITTED hint bit is set because of * that. */ if (all_visible) { TransactionId xmin; if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) { all_visible = false; break; } /* * The inserter definitely committed. But is it old * enough that everyone sees it as committed? */ xmin = HeapTupleHeaderGetXmin(tuple.t_data); if (!TransactionIdPrecedes(xmin, OldestXmin)) { all_visible = false; break; } } break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. */ nkeep += 1; all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, &vacrelstats->latestRemovedXid); tups_vacuumed += 1; has_dead_tuples = true; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, FreezeLimit, InvalidBuffer)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); if (RelationNeedsWAL(onerel)) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } freespace = PageGetHeapFreeSpace(page); /* Update the all-visible flag on the page */ if (!PageIsAllVisible(page) && all_visible) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } /* * It's possible for the value returned by GetOldestXmin() to move * backwards, so it's not wrong for us to see tuples that appear to * not be visible to everyone yet, while PD_ALL_VISIBLE is already * set. The real safe xmin value never moves backwards, but * GetOldestXmin() is conservative and sometimes returns a value * that's unnecessarily small, so if we see that contradiction it just * means that the tuples that we think are not visible to everyone yet * actually are, and the PD_ALL_VISIBLE flag is correct. 
* * There should never be dead tuples on a page with PD_ALL_VISIBLE * set, however. */ else if (PageIsAllVisible(page) && has_dead_tuples) { elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", relname, blkno); PageClearAllVisible(page); SetBufferCommitInfoNeedsSave(buf); /* * Normally, we would drop the lock on the heap page before * updating the visibility map, but since this case shouldn't * happen anyway, don't worry about that. */ visibilitymap_clear(onerel, blkno); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm && all_visible) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) RecordPageWithFreeSpace(onerel, blkno, freespace); } /* save stats for use later */ vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* now we can compute the new value for pg_class.reltuples */ vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, nblocks, vacrelstats->scanned_pages, num_tuples); /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Release the pin on the visibility map page */ if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, vacrelstats->scanned_pages, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, empty_pages, pg_rusage_show(&ru0)))); }
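/*
 * Self-contained sketch of the visibility-map skipping policy used by the
 * lazy_scan_heap() above.  SKIP_PAGES_THRESHOLD is 32 in stock PostgreSQL
 * (assumed here); runs of all-visible pages shorter than that are read
 * anyway so as not to defeat OS readahead.  all_visible[] stands in for
 * visibilitymap_test().
 */
#include <stdbool.h>

#define SKIP_PAGES_THRESHOLD 32	/* assumed; matches stock PostgreSQL */

unsigned int
blocks_actually_read(const bool *all_visible, unsigned int nblocks,
					 bool scan_all)
{
	unsigned int next_nav = 0;	/* next not-all-visible block */
	unsigned int blkno;
	unsigned int nread = 0;
	bool		skipping;

	/* establish the invariant before entering the main loop */
	while (next_nav < nblocks && all_visible[next_nav])
		next_nav++;
	skipping = (next_nav >= SKIP_PAGES_THRESHOLD);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		if (blkno == next_nav)
		{
			/* advance next_nav past the following all-visible run */
			next_nav++;
			while (next_nav < nblocks && all_visible[next_nav])
				next_nav++;
			skipping = (next_nav - blkno > SKIP_PAGES_THRESHOLD);
			/* the current block is not all-visible, so read it below */
		}
		else if (skipping && !scan_all)
			continue;			/* skipped via the visibility map */
		nread++;
	}
	return nread;
}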
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, List *updated_stats) { MIRROREDLOCK_BUFMGR_DECLARE; BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; int reindex_count = 1; PGRUsage ru0; /* Fetch gp_persistent_relation_node information that will be added to XLOG record. */ RelationFetchGpRelationNodeForXLog(onerel); pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->nonempty_pages = 0; lazy_space_alloc(vacrelstats, nblocks); for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; vacuum_delay_point(); /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); reindex_count++; /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* Forget the now-vacuumed tuples, and press on */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } /* -------- MirroredLock ---------- */ MIRROREDLOCK_BUFMGR_LOCK; buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ LockBufferForCleanup(buf); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. 
*/ LockBuffer(buf, BUFFER_LOCK_UNLOCK); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); /* -------- MirroredLock ---------- */ MIRROREDLOCK_BUFMGR_LOCK; LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); /* must record in xlog so that changetracking will know about this change */ log_heap_newpage(onerel, page, blkno); empty_pages++; lazy_record_free_space(vacrelstats, blkno, PageGetHeapFreeSpace(page)); } MarkBufferDirty(buf); UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ continue; } if (PageIsEmpty(page)) { empty_pages++; lazy_record_free_space(vacrelstats, blkno, PageGetHeapFreeSpace(page)); UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, false); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(onerel, tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. */ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. 
*/ nkeep += 1; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); tups_vacuumed += 1; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, &FreezeLimit, InvalidBuffer, false)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); /* no XLOG for temp tables, though */ if (!onerel->rd_istemp) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* Forget the now-vacuumed tuples, and press on */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) { lazy_record_free_space(vacrelstats, blkno, PageGetHeapFreeSpace(page)); } /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ } /* save stats for use later */ vacrelstats->rel_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); reindex_count++; /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats, updated_stats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages contain useful free space.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, vacrelstats->tot_free_pages, empty_pages, pg_rusage_show(&ru0)))); }
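/*
 * Sketch of the dead-TID batching policy shared by both lazy_scan_heap()
 * variants above: before scanning each page, if the remaining slots in the
 * dead-tuple array could not hold a worst-case page full of dead tuples,
 * run an index-and-heap vacuuming cycle now and start a fresh batch.  The
 * arguments mirror the callers' state (e.g. MaxHeapTuplesPerPage for
 * max_per_page).
 */
static int
need_vacuum_cycle(long max_dead_tuples, long num_dead_tuples,
				  long max_per_page)
{
	return (max_dead_tuples - num_dead_tuples) < max_per_page &&
		num_dead_tuples > 0;
}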