/*
 * create_ctas_internal
 *
 * Build the catalog definition of the relation that CREATE TABLE AS or
 * CREATE MATERIALIZED VIEW is going to fill.  The caller passes the column
 * list as ColumnDef nodes in attrList; the relation name, reloptions,
 * ON COMMIT action and tablespace are taken from "into".
 *
 * Returns the ObjectAddress of the relation that was defined.
 */
static ObjectAddress
create_ctas_internal(List *attrList, IntoClause *into)
{
	static char *validnsps[] = HEAP_RELOPT_NAMESPACES;
	CreateStmt *stmt;
	ObjectAddress relAddr;
	Datum		toastOpts;
	bool		forMatview;
	char		targetKind;

	/* A non-NULL viewQuery means we are building a materialized view */
	forMatview = (into->viewQuery != NULL);
	if (forMatview)
		targetKind = RELKIND_MATVIEW;
	else
		targetKind = RELKIND_RELATION;

	/*
	 * Fake up a CREATE TABLE parse tree describing the target relation, so
	 * that DefineRelation can do the actual work.
	 */
	stmt = makeNode(CreateStmt);
	stmt->relation = into->rel;
	stmt->tableElts = attrList;
	stmt->inhRelations = NIL;
	stmt->ofTypename = NULL;
	stmt->constraints = NIL;
	stmt->options = into->options;
	stmt->oncommit = into->onCommit;
	stmt->tablespacename = into->tableSpaceName;
	stmt->if_not_exists = false;

	/*
	 * Define the relation.  An already-existing view makes this error out,
	 * so no separate "replace is false" complaint is needed here.
	 */
	relAddr = DefineRelation(stmt, targetKind, InvalidOid, NULL, NULL);

	/*
	 * Advance the command counter before dealing with TOAST (presumably so
	 * that the relation just defined is visible to the steps below).
	 * NewRelationCreateToastTable itself ends with a
	 * CommandCounterIncrement(), which makes any TOAST table it creates
	 * visible for insertion.
	 */
	CommandCounterIncrement();

	/* Extract and validate the "toast." reloptions, then add TOAST if needed */
	toastOpts = transformRelOptions((Datum) 0,
									stmt->options,
									"toast",
									validnsps,
									true,
									false);
	(void) heap_reloptions(RELKIND_TOASTVALUE, toastOpts, true);
	NewRelationCreateToastTable(relAddr.objectId, toastOpts);

	/* A materialized view additionally needs its defining query stored */
	if (forMatview)
	{
		/* StoreViewQuery scribbles on its input tree, so hand it a copy */
		Query	   *viewCopy = (Query *) copyObject(into->viewQuery);

		StoreViewQuery(relAddr.objectId, viewCopy, false);
		CommandCounterIncrement();
	}

	return relAddr;
}
/* * Create append-only auxiliary relations for target relation rel. * Returns true if they are newly created. If pg_appendonly has already * known those tables, don't create them and returns false. */ bool CreateAOAuxiliaryTable( Relation rel, const char *auxiliaryNamePrefix, char relkind, Oid aoauxiliaryOid, Oid aoauxiliaryIndexOid, Oid *aoauxiliaryComptypeOid, TupleDesc tupledesc, IndexInfo *indexInfo, Oid *classObjectId, int16 *coloptions) { char aoauxiliary_relname[NAMEDATALEN]; char aoauxiliary_idxname[NAMEDATALEN]; bool shared_relation; Oid relOid, aoauxiliary_relid, aoauxiliary_idxid; ObjectAddress baseobject; ObjectAddress aoauxiliaryobject; Assert(RelationIsValid(rel)); Assert(RelationIsAoRows(rel) || RelationIsAoCols(rel)); Assert(auxiliaryNamePrefix); Assert(tupledesc); Assert(indexInfo); Assert(classObjectId); shared_relation = rel->rd_rel->relisshared; /* * We cannot allow creating an auxiliary table for a shared relation * after initdb (because there's no way to let other databases know * this visibility map. */ if (shared_relation && !IsBootstrapProcessingMode()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("shared tables cannot have append-only auxiliary relations after initdb"))); relOid = RelationGetRelid(rel); switch(relkind) { case RELKIND_AOVISIMAP: GetAppendOnlyEntryAuxOids(relOid, SnapshotNow, NULL, NULL, NULL, NULL, &aoauxiliary_relid, &aoauxiliary_idxid); break; case RELKIND_AOBLOCKDIR: GetAppendOnlyEntryAuxOids(relOid, SnapshotNow, NULL, NULL, &aoauxiliary_relid, &aoauxiliary_idxid, NULL, NULL); break; case RELKIND_AOSEGMENTS: GetAppendOnlyEntryAuxOids(relOid, SnapshotNow, &aoauxiliary_relid, &aoauxiliary_idxid, NULL, NULL, NULL, NULL); break; default: elog(ERROR, "unsupported auxiliary relkind '%c'", relkind); } /* * Does it have the auxiliary relation? 
*/ if (OidIsValid(aoauxiliary_relid)) { return false; } snprintf(aoauxiliary_relname, sizeof(aoauxiliary_relname), "%s_%u", auxiliaryNamePrefix, relOid); snprintf(aoauxiliary_idxname, sizeof(aoauxiliary_idxname), "%s_%u_index", auxiliaryNamePrefix, relOid); /* * We place auxiliary relation in the pg_aoseg namespace * even if its master relation is a temp table. There cannot be * any naming collision, and the auxiliary relation will be * destroyed when its master is, so there is no need to handle * the aovisimap relation as temp. */ aoauxiliary_relid = heap_create_with_catalog(aoauxiliary_relname, PG_AOSEGMENT_NAMESPACE, rel->rd_rel->reltablespace, aoauxiliaryOid, rel->rd_rel->relowner, tupledesc, /* relam */ InvalidOid, relkind, RELSTORAGE_HEAP, shared_relation, true, /* bufferPoolBulkLoad */ false, 0, ONCOMMIT_NOOP, NULL, /* GP Policy */ (Datum) 0, true, /* valid_opts */ false, aoauxiliaryComptypeOid, /* persistentTid */ NULL, /* persistentSerialNum */ NULL); /* Make this table visible, else index creation will fail */ CommandCounterIncrement(); aoauxiliary_idxid = index_create(aoauxiliaryOid, aoauxiliary_idxname, aoauxiliaryIndexOid, indexInfo, BTREE_AM_OID, rel->rd_rel->reltablespace, classObjectId, coloptions, (Datum) 0, true, false, (Oid *) NULL, true, false, false, NULL); /* Unlock target table -- no one can see it */ UnlockRelationOid(aoauxiliaryOid, ShareLock); /* Unlock the index -- no one can see it anyway */ UnlockRelationOid(aoauxiliaryIndexOid, AccessExclusiveLock); /* * Store the auxiliary table's OID in the parent relation's pg_appendonly row. * TODO (How to generalize this?) 
*/ switch (relkind) { case RELKIND_AOVISIMAP: UpdateAppendOnlyEntryAuxOids(relOid, InvalidOid, InvalidOid, InvalidOid, InvalidOid, aoauxiliary_relid, aoauxiliary_idxid); break; case RELKIND_AOBLOCKDIR: UpdateAppendOnlyEntryAuxOids(relOid, InvalidOid, InvalidOid, aoauxiliary_relid, aoauxiliary_idxid, InvalidOid, InvalidOid); break; case RELKIND_AOSEGMENTS: UpdateAppendOnlyEntryAuxOids(relOid, aoauxiliary_relid, aoauxiliary_idxid, InvalidOid, InvalidOid, InvalidOid, InvalidOid); break; default: elog(ERROR, "unsupported auxiliary relkind '%c'", relkind); } /* * Register dependency from the auxiliary table to the master, so that the * aoseg table will be deleted if the master is. */ baseobject.classId = RelationRelationId; baseobject.objectId = relOid; baseobject.objectSubId = 0; aoauxiliaryobject.classId = RelationRelationId; aoauxiliaryobject.objectId = aoauxiliaryOid; aoauxiliaryobject.objectSubId = 0; recordDependencyOn(&aoauxiliaryobject, &baseobject, DEPENDENCY_INTERNAL); /* * Make changes visible */ CommandCounterIncrement(); return true; }
/*
 * OperatorShellMake
 *		Insert a placeholder ("shell") pg_operator row for an operator that
 *		has been referenced but not yet defined.
 *
 * The row carries only the name, namespace, owner, kind and input types;
 * every other OID column, including oprcode, is InvalidOid, which is what
 * marks the entry as a shell.  Raises an error if operatorName is not a
 * valid operator name.  Returns the OID of the new row.
 */
static Oid
OperatorShellMake(const char *operatorName,
				  Oid operatorNamespace,
				  Oid leftTypeId,
				  Oid rightTypeId)
{
	Relation	operRel;
	HeapTuple	shellTup;
	Oid			shellOid;
	NameData	nameBuf;
	Datum		values[Natts_pg_operator];
	bool		nulls[Natts_pg_operator];
	char		kind;
	int			col;

	/* Reject names that cannot be operators */
	if (!validOperatorName(operatorName))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_NAME),
				 errmsg("\"%s\" is not a valid operator name",
						operatorName)));

	/* Start from an all-zero, all-not-null row image */
	for (col = 0; col < Natts_pg_operator; col++)
	{
		values[col] = (Datum) NULL; /* redundant, but safe */
		nulls[col] = false;
	}

	/*
	 * Work out oprkind: no left argument means 'l', a left but no right
	 * argument means 'r', and both arguments mean 'b'.
	 */
	if (!leftTypeId)
		kind = 'l';
	else if (!rightTypeId)
		kind = 'r';
	else
		kind = 'b';

	/*
	 * Fill the columns in catalog order.  Leaving oprcode as InvalidOid is
	 * what identifies the row as a shell.
	 */
	col = 0;
	namestrcpy(&nameBuf, operatorName);
	values[col++] = NameGetDatum(&nameBuf);				/* oprname */
	values[col++] = ObjectIdGetDatum(operatorNamespace);	/* oprnamespace */
	values[col++] = ObjectIdGetDatum(GetUserId());		/* oprowner */
	values[col++] = CharGetDatum(kind);					/* oprkind */
	values[col++] = BoolGetDatum(false);				/* oprcanmerge */
	values[col++] = BoolGetDatum(false);				/* oprcanhash */
	values[col++] = ObjectIdGetDatum(leftTypeId);		/* oprleft */
	values[col++] = ObjectIdGetDatum(rightTypeId);		/* oprright */
	values[col++] = ObjectIdGetDatum(InvalidOid);		/* oprresult */
	values[col++] = ObjectIdGetDatum(InvalidOid);		/* oprcom */
	values[col++] = ObjectIdGetDatum(InvalidOid);		/* oprnegate */
	values[col++] = ObjectIdGetDatum(InvalidOid);		/* oprcode */
	values[col++] = ObjectIdGetDatum(InvalidOid);		/* oprrest */
	values[col++] = ObjectIdGetDatum(InvalidOid);		/* oprjoin */

	/* Open pg_operator and build the tuple against its descriptor */
	operRel = heap_open(OperatorRelationId, RowExclusiveLock);
	shellTup = heap_form_tuple(operRel->rd_att, values, nulls);

	/* Insert the shell row and keep the catalog indexes in step */
	shellOid = simple_heap_insert(operRel, shellTup);
	CatalogUpdateIndexes(operRel, shellTup);

	/* Record the dependencies of the new entry */
	makeOperatorDependencies(shellTup);

	heap_freetuple(shellTup);

	/* Post creation hook for the new shell operator */
	InvokeObjectAccessHook(OAT_POST_CREATE,
						   OperatorRelationId, shellOid, 0);

	/* Later lookups and updates in this command must be able to see the row */
	CommandCounterIncrement();

	heap_close(operRel, RowExclusiveLock);

	return shellOid;
}
/*
 * OperatorUpd
 *
 *	Back-link an operator's commutator and negator to it.
 *
 *	baseId is the operator being defined; commId and negId are its
 *	commutator and negator.  For each of those that exists, if its own
 *	oprcom (respectively oprnegate) field is still unset, it is set to
 *	baseId.  Fields that are already set are left alone.  This lets users
 *	define two operators that are the commutator or negator of each other.
 */
static void
OperatorUpd(Oid baseId, Oid commId, Oid negId)
{
	Relation	operRel;
	HeapTuple	optup;
	Datum		values[Natts_pg_operator];
	bool		nulls[Natts_pg_operator];
	bool		replaces[Natts_pg_operator];
	int			col;

	/* Start with "replace nothing" */
	for (col = 0; col < Natts_pg_operator; col++)
	{
		values[col] = (Datum) 0;
		nulls[col] = false;
		replaces[col] = false;
	}

	/*
	 * A self-commutator operator means we are about to update the very
	 * tuple that was just inserted, so it has to be made visible first.
	 */
	CommandCounterIncrement();

	operRel = heap_open(OperatorRelationId, RowExclusiveLock);

	optup = SearchSysCacheCopy1(OPEROID, ObjectIdGetDatum(commId));

	/*
	 * Commutator and negator are one and the same operator: fix up both
	 * links with a single update.  XXX this is probably useless code --- it
	 * is doubtful that it ever makes sense for the two to coincide.
	 */
	if (commId == negId)
	{
		if (HeapTupleIsValid(optup))
		{
			Form_pg_operator opform = (Form_pg_operator) GETSTRUCT(optup);
			bool		setNegate = !OidIsValid(opform->oprnegate);
			bool		setCom = !OidIsValid(opform->oprcom);

			if (setNegate)
			{
				values[Anum_pg_operator_oprnegate - 1] = ObjectIdGetDatum(baseId);
				replaces[Anum_pg_operator_oprnegate - 1] = true;
			}

			if (setCom)
			{
				values[Anum_pg_operator_oprcom - 1] = ObjectIdGetDatum(baseId);
				replaces[Anum_pg_operator_oprcom - 1] = true;
			}

			if (setNegate || setCom)
			{
				optup = heap_modify_tuple(optup,
										  RelationGetDescr(operRel),
										  values,
										  nulls,
										  replaces);

				simple_heap_update(operRel, &optup->t_self, optup);

				CatalogUpdateIndexes(operRel, optup);
			}
		}

		heap_close(operRel, RowExclusiveLock);

		return;
	}

	/* Distinct commutator and negator: handle the commutator first */
	if (HeapTupleIsValid(optup))
	{
		Form_pg_operator commForm = (Form_pg_operator) GETSTRUCT(optup);

		if (!OidIsValid(commForm->oprcom))
		{
			values[Anum_pg_operator_oprcom - 1] = ObjectIdGetDatum(baseId);
			replaces[Anum_pg_operator_oprcom - 1] = true;

			optup = heap_modify_tuple(optup,
									  RelationGetDescr(operRel),
									  values,
									  nulls,
									  replaces);

			simple_heap_update(operRel, &optup->t_self, optup);

			CatalogUpdateIndexes(operRel, optup);

			/* reset, so the negator update below touches only oprnegate */
			values[Anum_pg_operator_oprcom - 1] = (Datum) NULL;
			replaces[Anum_pg_operator_oprcom - 1] = false;
		}
	}

	/* ... and then the negator */
	optup = SearchSysCacheCopy1(OPEROID, ObjectIdGetDatum(negId));

	if (HeapTupleIsValid(optup))
	{
		Form_pg_operator negForm = (Form_pg_operator) GETSTRUCT(optup);

		if (!OidIsValid(negForm->oprnegate))
		{
			values[Anum_pg_operator_oprnegate - 1] = ObjectIdGetDatum(baseId);
			replaces[Anum_pg_operator_oprnegate - 1] = true;

			optup = heap_modify_tuple(optup,
									  RelationGetDescr(operRel),
									  values,
									  nulls,
									  replaces);

			simple_heap_update(operRel, &optup->t_self, optup);

			CatalogUpdateIndexes(operRel, optup);
		}
	}

	heap_close(operRel, RowExclusiveLock);
}
/*
 * inv_write
 *
 * Write nbytes bytes from buf into the large object described by obj_desc,
 * starting at obj_desc->offset.  The data is stored page by page: a page
 * that already exists is read, merged with the new bytes and updated, while
 * a missing page is inserted.  Any gap in front of the written bytes within
 * a page is zero-filled.
 *
 * obj_desc->offset is advanced by the number of bytes written.  Returns that
 * number, or 0 without doing anything if nbytes <= 0.  Raises an error if
 * the write would end beyond MAX_LARGE_OBJECT_SIZE.
 */
int
inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes)
{
	int			nwritten = 0;	/* bytes of buf consumed so far */
	int			n;				/* bytes to copy into the current page */
	int			off;			/* write position within the current page */
	int			len;			/* valid data length of the current page */
	int32		pageno = (int32) (obj_desc->offset / LOBLKSIZE);
	ScanKeyData skey[2];
	SysScanDesc sd;
	HeapTuple	oldtuple;
	Form_pg_largeobject olddata;
	bool		neednextpage;
	bytea	   *datafield;
	bool		pfreeit;
	struct
	{
		bytea		hdr;
		char		data[LOBLKSIZE];	/* make struct big enough */
		int32		align_it;	/* ensure struct is aligned well enough */
	}			workbuf;
	char	   *workb = VARDATA(&workbuf.hdr);
	HeapTuple	newtup;
	Datum		values[Natts_pg_largeobject];
	bool		nulls[Natts_pg_largeobject];
	bool		replace[Natts_pg_largeobject];
	CatalogIndexState indstate;

	Assert(PointerIsValid(obj_desc));
	Assert(buf != NULL);

	/* enforce writability because snapshot is probably wrong otherwise */
	Assert(obj_desc->flags & IFS_WRLOCK);

	/* nothing to write */
	if (nbytes <= 0)
		return 0;

	/* this addition can't overflow because nbytes is only int32 */
	if ((nbytes + obj_desc->offset) > MAX_LARGE_OBJECT_SIZE)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid large object write request size: %d",
						nbytes)));

	open_lo_relation();

	indstate = CatalogOpenIndexes(lo_heap_r);

	/*
	 * Scan the pages of this large object in index order, starting at the
	 * page that contains the current write offset.
	 */
	ScanKeyInit(&skey[0],
				Anum_pg_largeobject_loid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(obj_desc->id));

	ScanKeyInit(&skey[1],
				Anum_pg_largeobject_pageno,
				BTGreaterEqualStrategyNumber, F_INT4GE,
				Int32GetDatum(pageno));

	sd = systable_beginscan_ordered(lo_heap_r, lo_index_r,
									obj_desc->snapshot, 2, skey);

	oldtuple = NULL;
	olddata = NULL;
	neednextpage = true;

	/* Each iteration writes (part of) exactly one page, numbered pageno */
	while (nwritten < nbytes)
	{
		/*
		 * If possible, get next pre-existing page of the LO.  We expect the
		 * indexscan will deliver these in order --- but there may be holes.
		 */
		if (neednextpage)
		{
			if ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL)
			{
				if (HeapTupleHasNulls(oldtuple))	/* paranoia */
					elog(ERROR, "null field found in pg_largeobject");
				olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple);
				Assert(olddata->pageno >= pageno);
			}
			neednextpage = false;
		}

		/*
		 * If we have a pre-existing page, see if it is the page we want to
		 * write, or a later one.
		 */
		if (olddata != NULL && olddata->pageno == pageno)
		{
			/*
			 * Update an existing page with fresh data.
			 *
			 * First, load old data into workbuf
			 */
			getdatafield(olddata, &datafield, &len, &pfreeit);
			memcpy(workb, VARDATA(datafield), len);
			if (pfreeit)
				pfree(datafield);

			/*
			 * Fill any hole between the end of the old data and the write
			 * position with zeroes
			 */
			off = (int) (obj_desc->offset % LOBLKSIZE);
			if (off > len)
				MemSet(workb + len, 0, off - len);

			/*
			 * Insert appropriate portion of new data: whatever fits in the
			 * rest of the page, but no more than what is left to write
			 */
			n = LOBLKSIZE - off;
			n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
			memcpy(workb + off, buf + nwritten, n);
			nwritten += n;
			obj_desc->offset += n;
			off += n;
			/* compute valid length of new page */
			len = (len >= off) ? len : off;
			SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ);

			/*
			 * Form and insert updated tuple; only the data column is
			 * replaced
			 */
			memset(values, 0, sizeof(values));
			memset(nulls, false, sizeof(nulls));
			memset(replace, false, sizeof(replace));
			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
			replace[Anum_pg_largeobject_data - 1] = true;
			newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r),
									   values, nulls, replace);
			simple_heap_update(lo_heap_r, &newtup->t_self, newtup);
			CatalogIndexInsert(indstate, newtup);
			heap_freetuple(newtup);

			/*
			 * We're done with this old page.
			 */
			oldtuple = NULL;
			olddata = NULL;
			neednextpage = true;
		}
		else
		{
			/*
			 * Write a brand new page.  A later pre-existing page that was
			 * already fetched stays in oldtuple/olddata (neednextpage
			 * remains false), so a following iteration can still match it.
			 *
			 * First, fill any hole
			 */
			off = (int) (obj_desc->offset % LOBLKSIZE);
			if (off > 0)
				MemSet(workb, 0, off);

			/*
			 * Insert appropriate portion of new data
			 */
			n = LOBLKSIZE - off;
			n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
			memcpy(workb + off, buf + nwritten, n);
			nwritten += n;
			obj_desc->offset += n;
			/* compute valid length of new page */
			len = off + n;
			SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ);

			/*
			 * Form and insert new tuple
			 */
			memset(values, 0, sizeof(values));
			memset(nulls, false, sizeof(nulls));
			values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
			values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
			newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls);
			simple_heap_insert(lo_heap_r, newtup);
			CatalogIndexInsert(indstate, newtup);
			heap_freetuple(newtup);
		}
		/* the next iteration, if any, continues on the following page */
		pageno++;
	}

	systable_endscan_ordered(sd);

	CatalogCloseIndexes(indstate);

	/*
	 * Advance command counter so that my tuple updates will be seen by later
	 * large-object operations in this transaction.
	 */
	CommandCounterIncrement();

	return nwritten;
}
/*
 * inv_truncate
 *
 * Set the length of the large object described by obj_desc to len bytes.
 *
 * The page containing the truncation point is rewritten so that it ends
 * exactly at len (zero-padded if it was shorter); if that page does not
 * exist, a new one is inserted to mark the end of data.  All pages beyond
 * it are deleted.  Raises an error if len is negative or greater than
 * MAX_LARGE_OBJECT_SIZE.
 */
void
inv_truncate(LargeObjectDesc *obj_desc, int64 len)
{
	int32		pageno = (int32) (len / LOBLKSIZE);
	int32		cutOff;
	ScanKeyData keys[2];
	SysScanDesc scan;
	HeapTuple	pageTup;
	Form_pg_largeobject pageForm;
	bool		foundAny;
	bool		onCutPage;
	struct
	{
		bytea		hdr;
		char		data[LOBLKSIZE];	/* make struct big enough */
		int32		align_it;	/* ensure struct is aligned well enough */
	}			pagebuf;
	char	   *pagedata = VARDATA(&pagebuf.hdr);
	HeapTuple	freshTup;
	Datum		values[Natts_pg_largeobject];
	bool		nulls[Natts_pg_largeobject];
	bool		replace[Natts_pg_largeobject];
	CatalogIndexState idxState;

	Assert(PointerIsValid(obj_desc));

	/* enforce writability because snapshot is probably wrong otherwise */
	Assert(obj_desc->flags & IFS_WRLOCK);

	/*
	 * errmsg_internal is used because INT64_FORMAT should not show up in a
	 * translatable string; doing better is not worth the trouble.
	 */
	if (len < 0 || len > MAX_LARGE_OBJECT_SIZE)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg_internal("invalid large object truncation target: " INT64_FORMAT,
								 len)));

	open_lo_relation();

	idxState = CatalogOpenIndexes(lo_heap_r);

	/* Scan every page of this LO whose number is >= the target page */
	ScanKeyInit(&keys[0],
				Anum_pg_largeobject_loid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(obj_desc->id));

	ScanKeyInit(&keys[1],
				Anum_pg_largeobject_pageno,
				BTGreaterEqualStrategyNumber, F_INT4GE,
				Int32GetDatum(pageno));

	scan = systable_beginscan_ordered(lo_heap_r, lo_index_r,
									  obj_desc->snapshot, 2, keys);

	/*
	 * Fetch the first such page, if there is one.  The truncation point may
	 * lie beyond the end of the LO, or inside a hole.
	 */
	pageForm = NULL;
	pageTup = systable_getnext_ordered(scan, ForwardScanDirection);
	if (pageTup != NULL)
	{
		if (HeapTupleHasNulls(pageTup)) /* paranoia */
			elog(ERROR, "null field found in pg_largeobject");
		pageForm = (Form_pg_largeobject) GETSTRUCT(pageTup);
		Assert(pageForm->pageno >= pageno);
	}

	foundAny = (pageForm != NULL);
	onCutPage = (foundAny && pageForm->pageno == pageno);

	/* Number of bytes that remain valid within the target page */
	cutOff = len % LOBLKSIZE;

	if (onCutPage)
	{
		/*
		 * The page holding the truncation point exists: cut its data down
		 * (or zero-extend it) to cutOff bytes.
		 */
		bytea	   *datafield;
		int			pagelen;
		bool		pfreeit;

		/* Bring the old page contents into the work buffer */
		getdatafield(pageForm, &datafield, &pagelen, &pfreeit);
		memcpy(pagedata, VARDATA(datafield), pagelen);
		if (pfreeit)
			pfree(datafield);

		/* Zero-fill if the old data stops short of the truncation point */
		if (cutOff > pagelen)
			MemSet(pagedata + pagelen, 0, cutOff - pagelen);

		/* The page now ends exactly at the truncation point */
		SET_VARSIZE(&pagebuf.hdr, cutOff + VARHDRSZ);

		/* Replace only the data column of the existing tuple */
		memset(values, 0, sizeof(values));
		memset(nulls, false, sizeof(nulls));
		memset(replace, false, sizeof(replace));
		values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&pagebuf);
		replace[Anum_pg_largeobject_data - 1] = true;
		freshTup = heap_modify_tuple(pageTup, RelationGetDescr(lo_heap_r),
									 values, nulls, replace);
		simple_heap_update(lo_heap_r, &freshTup->t_self, freshTup);
		CatalogIndexInsert(idxState, freshTup);
		heap_freetuple(freshTup);
	}
	else
	{
		/*
		 * The truncation point is in a hole.  If the scan returned a page
		 * at all, it lies after the truncation point; remove it now,
		 * because the cleanup loop at the end will not see it again.
		 */
		if (foundAny)
		{
			Assert(pageForm->pageno > pageno);
			simple_heap_delete(lo_heap_r, &pageTup->t_self);
		}

		/* Create a page of zeroes reaching up to the truncation point */
		if (cutOff > 0)
			MemSet(pagedata, 0, cutOff);

		SET_VARSIZE(&pagebuf.hdr, cutOff + VARHDRSZ);

		/* Build and insert the new end-of-data tuple */
		memset(values, 0, sizeof(values));
		memset(nulls, false, sizeof(nulls));
		values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
		values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
		values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&pagebuf);
		freshTup = heap_form_tuple(lo_heap_r->rd_att, values, nulls);
		simple_heap_insert(lo_heap_r, freshTup);
		CatalogIndexInsert(idxState, freshTup);
		heap_freetuple(freshTup);
	}

	/*
	 * Remove every remaining page past the truncation point.  When the
	 * initial fetch came back empty there cannot be any.
	 */
	if (foundAny)
	{
		for (;;)
		{
			pageTup = systable_getnext_ordered(scan, ForwardScanDirection);
			if (pageTup == NULL)
				break;
			simple_heap_delete(lo_heap_r, &pageTup->t_self);
		}
	}

	systable_endscan_ordered(scan);

	CatalogCloseIndexes(idxState);

	/*
	 * Bump the command counter so that later large-object operations in
	 * this transaction see the tuple changes made here.
	 */
	CommandCounterIncrement();
}
/* * create_toast_table --- internal workhorse * * rel is already opened and locked * toastOid and toastIndexOid are normally InvalidOid, but during * bootstrap they can be nonzero to specify hand-assigned OIDs */ static bool create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, Datum reloptions, LOCKMODE lockmode, bool check) { Oid relOid = RelationGetRelid(rel); HeapTuple reltup; TupleDesc tupdesc; bool shared_relation; bool mapped_relation; Relation toast_rel; Relation class_rel; Oid toast_relid; Oid toast_typid = InvalidOid; Oid namespaceid; char toast_relname[NAMEDATALEN]; char toast_idxname[NAMEDATALEN]; IndexInfo *indexInfo; Oid collationObjectId[2]; Oid classObjectId[2]; int16 coloptions[2]; ObjectAddress baseobject, toastobject; /* * Toast table is shared if and only if its parent is. * * We cannot allow toasting a shared relation after initdb (because * there's no way to mark it toasted in other databases' pg_class). */ shared_relation = rel->rd_rel->relisshared; if (shared_relation && !IsBootstrapProcessingMode()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("shared tables cannot be toasted after initdb"))); /* It's mapped if and only if its parent is, too */ mapped_relation = RelationIsMapped(rel); /* * Is it already toasted? */ if (rel->rd_rel->reltoastrelid != InvalidOid) return false; /* * Check to see whether the table actually needs a TOAST table. */ if (!IsBinaryUpgrade) { /* Normal mode, normal check */ if (!needs_toast_table(rel)) return false; } else { /* * In binary-upgrade mode, create a TOAST table if and only if * pg_upgrade told us to (ie, a TOAST table OID has been provided). * * This indicates that the old cluster had a TOAST table for the * current table. We must create a TOAST table to receive the old * TOAST file, even if the table seems not to need one. 
* * Contrariwise, if the old cluster did not have a TOAST table, we * should be able to get along without one even if the new version's * needs_toast_table rules suggest we should have one. There is a lot * of daylight between where we will create a TOAST table and where * one is really necessary to avoid failures, so small cross-version * differences in the when-to-create heuristic shouldn't be a problem. * If we tried to create a TOAST table anyway, we would have the * problem that it might take up an OID that will conflict with some * old-cluster table we haven't seen yet. */ if (!OidIsValid(binary_upgrade_next_toast_pg_class_oid) || !OidIsValid(binary_upgrade_next_toast_pg_type_oid)) return false; } /* * If requested check lockmode is sufficient. This is a cross check in * case of errors or conflicting decisions in earlier code. */ if (check && lockmode != AccessExclusiveLock) elog(ERROR, "AccessExclusiveLock required to add toast table."); /* * Create the toast table and its index */ snprintf(toast_relname, sizeof(toast_relname), "pg_toast_%u", relOid); snprintf(toast_idxname, sizeof(toast_idxname), "pg_toast_%u_index", relOid); /* this is pretty painful... need a tuple descriptor */ tupdesc = CreateTemplateTupleDesc(3, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "chunk_id", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "chunk_seq", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "chunk_data", BYTEAOID, -1, 0); /* * Ensure that the toast table doesn't itself get toasted, or we'll be * toast :-(. This is essential for chunk_data because type bytea is * toastable; hit the other two just to be sure. */ TupleDescAttr(tupdesc, 0)->attstorage = 'p'; TupleDescAttr(tupdesc, 1)->attstorage = 'p'; TupleDescAttr(tupdesc, 2)->attstorage = 'p'; /* * Toast tables for regular relations go in pg_toast; those for temp * relations go into the per-backend temp-toast-table namespace. 
*/ if (isTempOrTempToastNamespace(rel->rd_rel->relnamespace)) namespaceid = GetTempToastNamespace(); else namespaceid = PG_TOAST_NAMESPACE; /* * Use binary-upgrade override for pg_type.oid, if supplied. We might be * in the post-schema-restore phase where we are doing ALTER TABLE to * create TOAST tables that didn't exist in the old cluster. */ if (IsBinaryUpgrade && OidIsValid(binary_upgrade_next_toast_pg_type_oid)) { toast_typid = binary_upgrade_next_toast_pg_type_oid; binary_upgrade_next_toast_pg_type_oid = InvalidOid; } toast_relid = heap_create_with_catalog(toast_relname, namespaceid, rel->rd_rel->reltablespace, toastOid, toast_typid, InvalidOid, rel->rd_rel->relowner, tupdesc, NIL, RELKIND_TOASTVALUE, rel->rd_rel->relpersistence, shared_relation, mapped_relation, true, 0, ONCOMMIT_NOOP, reloptions, false, true, true, InvalidOid, NULL); Assert(toast_relid != InvalidOid); /* make the toast relation visible, else heap_open will fail */ CommandCounterIncrement(); /* ShareLock is not really needed here, but take it anyway */ toast_rel = heap_open(toast_relid, ShareLock); /* * Create unique index on chunk_id, chunk_seq. * * NOTE: the normal TOAST access routines could actually function with a * single-column index on chunk_id only. However, the slice access * routines use both columns for faster access to an individual chunk. In * addition, we want it to be unique as a check against the possibility of * duplicate TOAST chunk OIDs. The index might also be a little more * efficient this way, since btree isn't all that happy with large numbers * of equal keys. 
*/ indexInfo = makeNode(IndexInfo); indexInfo->ii_NumIndexAttrs = 2; indexInfo->ii_KeyAttrNumbers[0] = 1; indexInfo->ii_KeyAttrNumbers[1] = 2; indexInfo->ii_Expressions = NIL; indexInfo->ii_ExpressionsState = NIL; indexInfo->ii_Predicate = NIL; indexInfo->ii_PredicateState = NULL; indexInfo->ii_ExclusionOps = NULL; indexInfo->ii_ExclusionProcs = NULL; indexInfo->ii_ExclusionStrats = NULL; indexInfo->ii_Unique = true; indexInfo->ii_ReadyForInserts = true; indexInfo->ii_Concurrent = false; indexInfo->ii_BrokenHotChain = false; indexInfo->ii_ParallelWorkers = 0; indexInfo->ii_Am = BTREE_AM_OID; indexInfo->ii_AmCache = NULL; indexInfo->ii_Context = CurrentMemoryContext; collationObjectId[0] = InvalidOid; collationObjectId[1] = InvalidOid; classObjectId[0] = OID_BTREE_OPS_OID; classObjectId[1] = INT4_BTREE_OPS_OID; coloptions[0] = 0; coloptions[1] = 0; index_create(toast_rel, toast_idxname, toastIndexOid, InvalidOid, InvalidOid, InvalidOid, indexInfo, list_make2("chunk_id", "chunk_seq"), BTREE_AM_OID, rel->rd_rel->reltablespace, collationObjectId, classObjectId, coloptions, (Datum) 0, INDEX_CREATE_IS_PRIMARY, 0, true, true, NULL); heap_close(toast_rel, NoLock); /* * Store the toast table's OID in the parent relation's pg_class row */ class_rel = heap_open(RelationRelationId, RowExclusiveLock); reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relOid)); if (!HeapTupleIsValid(reltup)) elog(ERROR, "cache lookup failed for relation %u", relOid); ((Form_pg_class) GETSTRUCT(reltup))->reltoastrelid = toast_relid; if (!IsBootstrapProcessingMode()) { /* normal case, use a transactional update */ CatalogTupleUpdate(class_rel, &reltup->t_self, reltup); } else { /* While bootstrapping, we cannot UPDATE, so overwrite in-place */ heap_inplace_update(class_rel, reltup); } heap_freetuple(reltup); heap_close(class_rel, RowExclusiveLock); /* * Register dependency from the toast table to the master, so that the * toast table will be deleted if the master is. 
Skip this in bootstrap * mode. */ if (!IsBootstrapProcessingMode()) { baseobject.classId = RelationRelationId; baseobject.objectId = relOid; baseobject.objectSubId = 0; toastobject.classId = RelationRelationId; toastobject.objectId = toast_relid; toastobject.objectSubId = 0; recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL); } /* * Make changes visible */ CommandCounterIncrement(); return true; }
/* * reindex_relation - This routine is used to recreate all indexes * of a relation (and its toast relation too, if any). * * Returns true if any indexes were rebuilt. */ bool reindex_relation(Oid relid) { Relation rel; Oid toast_relid; bool is_pg_class; bool result; List *indexIds, *doneIndexes, *indexId; /* * Ensure to hold an exclusive lock throughout the transaction. The * lock could perhaps be less intensive (in the non-overwrite case) * but for now it's AccessExclusiveLock for simplicity. */ rel = heap_open(relid, AccessExclusiveLock); toast_relid = rel->rd_rel->reltoastrelid; /* * Get the list of index OIDs for this relation. (We trust to the * relcache to get this with a sequential scan if ignoring system * indexes.) */ indexIds = RelationGetIndexList(rel); /* * reindex_index will attempt to update the pg_class rows for the * relation and index. If we are processing pg_class itself, we * want to make sure that the updates do not try to insert index * entries into indexes we have not processed yet. (When we are * trying to recover from corrupted indexes, that could easily * cause a crash.) We can accomplish this because CatalogUpdateIndexes * will use the relcache's index list to know which indexes to update. * We just force the index list to be only the stuff we've processed. * * It is okay to not insert entries into the indexes we have not * processed yet because all of this is transaction-safe. If we fail * partway through, the updated rows are dead and it doesn't matter * whether they have index entries. Also, a new pg_class index will * be created with an entry for its own pg_class row because we do * setNewRelfilenode() before we do index_build(). */ is_pg_class = (RelationGetRelid(rel) == RelOid_pg_class); doneIndexes = NIL; /* Reindex all the indexes. 
*/ foreach(indexId, indexIds) { Oid indexOid = lfirsto(indexId); if (is_pg_class) RelationSetIndexList(rel, doneIndexes); reindex_index(indexOid); CommandCounterIncrement(); if (is_pg_class) doneIndexes = lappendo(doneIndexes, indexOid); }
/*
 * reinit_existing_lov_relations
 *
 * Re-indexing path helper: assign new relfilenodes to an already existing
 * LOV heap and LOV index, then write a fresh btree metapage into the index.
 *
 * lovHeapRelid and lovIndexRelid are the OIDs of the existing relations.
 */
static void
reinit_existing_lov_relations(Oid lovHeapRelid, Oid lovIndexRelid)
{
	Relation	heapRel;
	Relation	indexRel;
	Buffer		metaBuf;
	Page		metaPage;

	heapRel = heap_open(lovHeapRelid, AccessExclusiveLock);
	indexRel = index_open(lovIndexRelid, AccessExclusiveLock);

	setNewRelfilenode(heapRel, RecentXmin);
	setNewRelfilenode(indexRel, RecentXmin);

	/*
	 * After creating the new relfilenode for a btree index, this is not
	 * a btree anymore. We create the new metapage for this btree.
	 */
	metaBuf = _bt_getbuf(indexRel, P_NEW, BT_WRITE);
	Assert(BTREE_METAPAGE == BufferGetBlockNumber(metaBuf));
	metaPage = BufferGetPage(metaBuf);
	_bt_initmetapage(metaPage, P_NONE, 0);

	/* XLOG the metapage, unless the index is a temp relation */
	if (!indexRel->rd_istemp)
	{
		/*
		 * Fetch gp_persistent_relation_node information that will be added
		 * to the XLOG record.
		 */
		RelationFetchGpRelationNodeForXLog(indexRel);
		log_newpage_rel(indexRel,
						BufferGetBlockNumber(metaBuf),
						MAIN_FORKNUM,
						metaPage);
	}

	/* The access method's cached data is not valid anymore; drop it. */
	if (indexRel->rd_amcache)
	{
		pfree(indexRel->rd_amcache);
		indexRel->rd_amcache = NULL;
	}

	MarkBufferDirty(metaBuf);
	_bt_relbuf(indexRel, metaBuf);

	/* Close both relations without releasing the locks taken above. */
	index_close(indexRel, NoLock);
	heap_close(heapRel, NoLock);
}

/*
 * _bitmap_create_lov_heapandindex
 *
 * Create (or, when they already exist, re-initialize) the LOV heap and the
 * btree index on it that belong to the bitmap index "rel".
 *
 * rel          the bitmap index relation; must not be NULL
 * lovHeapOid   output: OID of the LOV heap
 * lovIndexOid  output: OID of the LOV index
 *
 * The relations are named "pg_bm_<relid>" and "pg_bm_<relid>_index" and live
 * in PG_BITMAPINDEX_NAMESPACE.
 */
void
_bitmap_create_lov_heapandindex(Relation rel, Oid *lovHeapOid, Oid *lovIndexOid)
{
	char		heapName[NAMEDATALEN];
	char		indexName[NAMEDATALEN];
	Oid			lovHeapRelid;
	Oid			lovIndexRelid;
	TupleDesc	lovDesc;
	IndexInfo  *lovIndexInfo;
	ObjectAddress lovHeapAddr;
	ObjectAddress parentAddr;
	Oid		   *opclasses;
	int16	   *indoptions;
	int			numKeys;
	int			keyno;

	Assert(rel != NULL);

	/* Derive the names of the LOV heap and LOV index from rel's OID. */
	snprintf(heapName, sizeof(heapName),
			 "pg_bm_%u", RelationGetRelid(rel));
	snprintf(indexName, sizeof(indexName),
			 "pg_bm_%u_index", RelationGetRelid(rel));

	lovHeapRelid = get_relname_relid(heapName, PG_BITMAPINDEX_NAMESPACE);

	/*
	 * If the LOV heap already exists, then this is happening during
	 * re-indexing. We allocate new relfilenodes for lov heap and lov index.
	 *
	 * XXX Each segment db may have different relfilenodes for lov heap and
	 * lov index, which should not be an issue now. Ideally, we would like
	 * each segment db use the same oids.
	 */
	if (OidIsValid(lovHeapRelid))
	{
		*lovHeapOid = lovHeapRelid;

		lovIndexRelid = get_relname_relid(indexName, PG_BITMAPINDEX_NAMESPACE);
		Assert(OidIsValid(lovIndexRelid));
		*lovIndexOid = lovIndexRelid;

		reinit_existing_lov_relations(lovHeapRelid, lovIndexRelid);
		return;
	}

	/*
	 * First-time creation: make a new empty heap to store all attribute
	 * values with their corresponding block number and offset in LOV.
	 */
	lovDesc = _bitmap_create_lov_heapTupleDesc(rel);

	Assert(rel->rd_rel != NULL);

	lovHeapRelid = heap_create_with_catalog(heapName,
											PG_BITMAPINDEX_NAMESPACE,
											rel->rd_rel->reltablespace,
											InvalidOid,
											rel->rd_rel->relowner,
											lovDesc,
											NIL,
											/* relam */ InvalidOid,
											RELKIND_RELATION,
											RELSTORAGE_HEAP,
											rel->rd_rel->relisshared,
											false,
											/* bufferPoolBulkLoad */ false,
											0,
											ONCOMMIT_NOOP,
											NULL /* GP Policy */,
											(Datum)0,
											true,
											/* valid_opts */ true,
											/* persistentTid */ NULL,
											/* persistentSerialNum */ NULL);
	*lovHeapOid = lovHeapRelid;

	/*
	 * We must bump the command counter to make the newly-created relation
	 * tuple visible for opening.
	 */
	CommandCounterIncrement();

	/* Record an internal dependency of the LOV heap on the bitmap index. */
	lovHeapAddr.classId = RelationRelationId;
	lovHeapAddr.objectId = lovHeapRelid;
	lovHeapAddr.objectSubId = 0;

	parentAddr.classId = RelationRelationId;
	parentAddr.objectId = RelationGetRelid(rel);
	parentAddr.objectSubId = 0;

	recordDependencyOn(&lovHeapAddr, &parentAddr, DEPENDENCY_INTERNAL);

	/*
	 * Create a unique btree index on the newly-created heap. Its key is every
	 * LOV heap column except the last two.
	 */
	numKeys = lovDesc->natts - 2;

	lovIndexInfo = makeNode(IndexInfo);
	lovIndexInfo->ii_NumIndexAttrs = numKeys;
	lovIndexInfo->ii_Expressions = NIL;
	lovIndexInfo->ii_ExpressionsState = NIL;
	lovIndexInfo->ii_Predicate = make_ands_implicit(NULL);
	lovIndexInfo->ii_PredicateState = NIL;
	lovIndexInfo->ii_Unique = true;

	opclasses = (Oid *) palloc(numKeys * sizeof(Oid));
	indoptions = (int16 *) palloc(numKeys * sizeof(int16));

	/* Key column N of the index is heap column N, with its default btree opclass. */
	for (keyno = 0; keyno < numKeys; keyno++)
	{
		Oid			keyType = lovDesc->attrs[keyno]->atttypid;

		lovIndexInfo->ii_KeyAttrNumbers[keyno] = keyno + 1;
		opclasses[keyno] = GetDefaultOpClass(keyType, BTREE_AM_OID);
		indoptions[keyno] = 0;
	}

	lovIndexRelid = index_create(lovHeapRelid,
								 indexName,
								 InvalidOid,
								 lovIndexInfo,
								 BTREE_AM_OID,
								 rel->rd_rel->reltablespace,
								 opclasses,
								 indoptions,
								 0,
								 false,
								 false,
								 true,
								 false,
								 false,
								 NULL);
	*lovIndexOid = lovIndexRelid;
}
/*
 * FetchRegularTable fetches the given table's data using the copy out command.
 * The function then fetches the DDL commands necessary to create this table's
 * replica, and locally applies these DDL commands. Last, the function copies
 * the fetched table data into the created table; and on success, returns true.
 * On failure due to connectivity issues with remote node, the function returns
 * false. On other types of failures, the function errors out.
 *
 * nodeName and nodePort identify the remote node; tableName is the name of
 * the table to fetch (the shard id is extracted from it).
 */
static bool
FetchRegularTable(const char *nodeName, uint32 nodePort, const char *tableName)
{
	StringInfo localFilePath = NULL;
	StringInfo remoteCopyCommand = NULL;
	List *ddlCommandList = NIL;
	ListCell *ddlCommandCell = NULL;
	CopyStmt *localCopyCommand = NULL;
	RangeVar *localTable = NULL;
	uint64 shardId = 0;
	bool received = false;
	StringInfo queryString = NULL;
	const char *tableOwner = NULL;
	Oid tableOwnerId = InvalidOid;
	Oid savedUserId = InvalidOid;
	int savedSecurityContext = 0;
	List *tableNameList = NIL;

	/* copy remote table's data to this node in an idempotent manner */
	shardId = ExtractShardId(tableName);

	/* local destination: base/<job cache dir>/<table file prefix><shard id> */
	localFilePath = makeStringInfo();
	appendStringInfo(localFilePath, "base/%s/%s" UINT64_FORMAT,
					 PG_JOB_CACHE_DIR, TABLE_FILE_PREFIX, shardId);

	remoteCopyCommand = makeStringInfo();
	appendStringInfo(remoteCopyCommand, COPY_OUT_COMMAND, tableName);

	received = ReceiveRegularFile(nodeName, nodePort, remoteCopyCommand, localFilePath);
	if (!received)
	{
		return false;
	}

	/* fetch the remote table's owner name and look up the matching role OID */
	tableOwner = RemoteTableOwner(nodeName, nodePort, tableName);
	if (tableOwner == NULL)
	{
		return false;
	}
	tableOwnerId = get_role_oid(tableOwner, false);

	/* fetch the ddl commands needed to create the table */
	ddlCommandList = TableDDLCommandList(nodeName, nodePort, tableName);
	if (ddlCommandList == NIL)
	{
		return false;
	}

	/*
	 * Apply DDL commands against the database. Note that on failure from here
	 * on, we immediately error out instead of returning false. Have to do
	 * this as the table's owner to ensure the local table is created with
	 * compatible permissions.
	 */
	GetUserIdAndSecContext(&savedUserId, &savedSecurityContext);
	SetUserIdAndSecContext(tableOwnerId, SECURITY_LOCAL_USERID_CHANGE);

	foreach(ddlCommandCell, ddlCommandList)
	{
		StringInfo ddlCommand = (StringInfo) lfirst(ddlCommandCell);
		Node *ddlCommandNode = ParseTreeNode(ddlCommand->data);

		ProcessUtility(ddlCommandNode, ddlCommand->data, PROCESS_UTILITY_TOPLEVEL,
					   NULL, None_Receiver, NULL);

		/* bump the command counter after each applied DDL command */
		CommandCounterIncrement();
	}
/*
 * intorel_startup --- executor startup
 *
 * Builds the target relation for CREATE TABLE AS / CREATE MATERIALIZED VIEW
 * from the query's result descriptor, checks that inserting into it is
 * allowed, and fills in the receiver's private state for later routines.
 *
 * self      the DR_intorel receiver; its "into" field must already be set
 * operation not used by this routine
 * typeinfo  descriptor of the tuples the query will produce
 */
static void
intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
{
	static char *validnsps[] = HEAP_RELOPT_NAMESPACES;

	DR_intorel *receiver = (DR_intorel *) self;
	IntoClause *into = receiver->into;
	bool		forMatview;
	char		targetKind;
	CreateStmt *createStmt;
	ObjectAddress targetAddr;
	Relation	targetRel;
	RangeTblEntry *permEntry;
	Datum		toastOpts;
	ListCell   *nameCell;
	int			colno;

	Assert(into != NULL);		/* else somebody forgot to set it */

	/* A view query in the INTO clause means we are making a matview. */
	forMatview = (into->viewQuery != NULL);
	if (forMatview)
		targetKind = RELKIND_MATVIEW;
	else
		targetKind = RELKIND_RELATION;

	/*
	 * Fake up a CREATE TABLE parsetree for the target relation; it is handed
	 * to DefineRelation once the column list has been filled in.
	 */
	createStmt = makeNode(CreateStmt);
	createStmt->relation = into->rel;
	createStmt->tableElts = NIL;	/* columns are appended below */
	createStmt->inhRelations = NIL;
	createStmt->ofTypename = NULL;
	createStmt->constraints = NIL;
	createStmt->options = into->options;
	createStmt->oncommit = into->onCommit;
	createStmt->tablespacename = into->tableSpaceName;
	createStmt->if_not_exists = false;

	/*
	 * One ColumnDef per result column, carrying the already-resolved type,
	 * typmod and collation. Names from the CREATE TABLE AS column list, as
	 * far as it reaches, replace the query-derived names; a list that is
	 * longer than the result is rejected after the loop.
	 */
	nameCell = list_head(into->colNames);
	for (colno = 0; colno < typeinfo->natts; colno++)
	{
		Form_pg_attribute attr = typeinfo->attrs[colno];
		ColumnDef  *column = makeNode(ColumnDef);
		TypeName   *colType = makeNode(TypeName);

		/* the type, taken verbatim from the result descriptor */
		colType->names = NIL;
		colType->typeOid = attr->atttypid;
		colType->setof = false;
		colType->pct_type = false;
		colType->typmods = NIL;
		colType->typemod = attr->atttypmod;
		colType->arrayBounds = NIL;
		colType->location = -1;

		/* the column name: user-supplied if available, else from the query */
		if (nameCell == NULL)
			column->colname = NameStr(attr->attname);
		else
		{
			column->colname = strVal(lfirst(nameCell));
			nameCell = lnext(nameCell);
		}

		column->typeName = colType;
		column->inhcount = 0;
		column->is_local = true;
		column->is_not_null = false;
		column->is_from_type = false;
		column->storage = 0;
		column->raw_default = NULL;
		column->cooked_default = NULL;
		column->collClause = NULL;
		column->collOid = attr->attcollation;
		column->constraints = NIL;
		column->fdwoptions = NIL;
		column->location = -1;

		/*
		 * A collatable type whose collation could not be resolved must be
		 * caught here: DefineRelation would adopt the type's default
		 * collation rather than complaining.
		 */
		if (type_is_collatable(colType->typeOid) == false ||
			OidIsValid(column->collOid))
		{
			/* collation is fine, or the type does not need one */
		}
		else
			ereport(ERROR,
					(errcode(ERRCODE_INDETERMINATE_COLLATION),
					 errmsg("no collation was derived for column \"%s\" with collatable type %s",
							column->colname,
							format_type_be(colType->typeOid)),
					 errhint("Use the COLLATE clause to set the collation explicitly.")));

		createStmt->tableElts = lappend(createStmt->tableElts, column);
	}

	/* leftover names mean the user listed more columns than the query has */
	if (nameCell != NULL)
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 errmsg("too many column names were specified")));

	/*
	 * Actually create the target table
	 */
	targetAddr = DefineRelation(createStmt, targetKind, InvalidOid, NULL);

	/*
	 * If necessary, create a TOAST table for the target table. Note that
	 * NewRelationCreateToastTable ends with CommandCounterIncrement(), so
	 * that the TOAST table will be visible for insertion.
	 */
	CommandCounterIncrement();

	/* parse and validate reloptions for the toast table */
	toastOpts = transformRelOptions((Datum) 0,
									createStmt->options,
									"toast",
									validnsps,
									true,
									false);

	(void) heap_reloptions(RELKIND_TOASTVALUE, toastOpts, true);

	NewRelationCreateToastTable(targetAddr.objectId, toastOpts);

	/* A materialized view also needs its defining query stored. */
	if (forMatview)
	{
		/* StoreViewQuery scribbles on tree, so hand it a private copy */
		Query	   *viewCopy = (Query *) copyObject(into->viewQuery);

		StoreViewQuery(targetAddr.objectId, viewCopy, false);
		CommandCounterIncrement();
	}

	/*
	 * Finally we can open the target table
	 */
	targetRel = heap_open(targetAddr.objectId, AccessExclusiveLock);

	/*
	 * Check INSERT permission on the constructed table, covering every
	 * column of it.
	 *
	 * XXX: It would arguably make sense to skip this check if into->skipData
	 * is true.
	 */
	permEntry = makeNode(RangeTblEntry);
	permEntry->rtekind = RTE_RELATION;
	permEntry->relid = targetAddr.objectId;
	permEntry->relkind = targetKind;
	permEntry->requiredPerms = ACL_INSERT;

	colno = 1;
	while (colno <= targetRel->rd_att->natts)
	{
		permEntry->insertedCols =
			bms_add_member(permEntry->insertedCols,
						   colno - FirstLowInvalidHeapAttributeNumber);
		colno++;
	}

	ExecCheckRTPerms(list_make1(permEntry), true);

	/*
	 * Make sure the constructed table does not have RLS enabled.
	 *
	 * check_enable_rls() will ereport(ERROR) itself if the user has requested
	 * something invalid, and otherwise will return RLS_ENABLED if RLS should
	 * be enabled here. We don't actually support that currently, so throw
	 * our own ereport(ERROR) if that happens.
	 */
	if (check_enable_rls(targetAddr.objectId, InvalidOid, false) == RLS_ENABLED)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 (errmsg("policies not yet implemented for this command"))));

	/*
	 * Tentatively mark the target as populated, if it's a matview and we're
	 * going to fill it; otherwise, no change needed.
	 */
	if (forMatview && !into->skipData)
		SetMatViewPopulatedState(targetRel, true);

	/*
	 * Fill private fields of the receiver for use by later routines
	 */
	receiver->rel = targetRel;
	receiver->output_cid = GetCurrentCommandId(true);

	/* and remember the new relation's address for ExecCreateTableAs */
	CreateAsReladdr = targetAddr;

	/*
	 * We can skip WAL-logging the insertions, unless PITR or streaming
	 * replication is in use. We can skip the FSM in any case.
	 */
	if (XLogIsNeeded())
		receiver->hi_options = HEAP_INSERT_SKIP_FSM | 0;
	else
		receiver->hi_options = HEAP_INSERT_SKIP_FSM | HEAP_INSERT_SKIP_WAL;
	receiver->bistate = GetBulkInsertState();

	/* Not using WAL requires smgr_targblock be initially invalid */
	Assert(RelationGetTargetBlock(targetRel) == InvalidBlockNumber);
}
/* * create_toast_table --- internal workhorse * * rel is already opened and locked * toastOid and toastIndexOid are normally InvalidOid, but during * bootstrap they can be nonzero to specify hand-assigned OIDs */ static bool create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, Datum reloptions) { Oid relOid = RelationGetRelid(rel); HeapTuple reltup; TupleDesc tupdesc; bool shared_relation; bool mapped_relation; Relation toast_rel; Relation class_rel; Oid toast_relid; Oid toast_typid = InvalidOid; Oid namespaceid; char toast_relname[NAMEDATALEN]; char toast_idxname[NAMEDATALEN]; IndexInfo *indexInfo; Oid collationObjectId[2]; Oid classObjectId[2]; int16 coloptions[2]; ObjectAddress baseobject, toastobject; /* * Toast table is shared if and only if its parent is. * * We cannot allow toasting a shared relation after initdb (because * there's no way to mark it toasted in other databases' pg_class). */ shared_relation = rel->rd_rel->relisshared; if (shared_relation && !IsBootstrapProcessingMode()) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("shared tables cannot be toasted after initdb"))); /* It's mapped if and only if its parent is, too */ mapped_relation = RelationIsMapped(rel); /* * Is it already toasted? */ if (rel->rd_rel->reltoastrelid != InvalidOid) return false; /* * Check to see whether the table actually needs a TOAST table. * * If an update-in-place toast relfilenode is specified, force toast file * creation even if it seems not to need one. */ if (!needs_toast_table(rel) && (!IsBinaryUpgrade || !OidIsValid(binary_upgrade_next_toast_pg_class_oid))) return false; /* * Create the toast table and its index */ snprintf(toast_relname, sizeof(toast_relname), "pg_toast_%u", relOid); snprintf(toast_idxname, sizeof(toast_idxname), "pg_toast_%u_index", relOid); /* this is pretty painful... 
need a tuple descriptor */ tupdesc = CreateTemplateTupleDesc(3, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "chunk_id", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "chunk_seq", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "chunk_data", BYTEAOID, -1, 0); /* * Ensure that the toast table doesn't itself get toasted, or we'll be * toast :-(. This is essential for chunk_data because type bytea is * toastable; hit the other two just to be sure. */ tupdesc->attrs[0]->attstorage = 'p'; tupdesc->attrs[1]->attstorage = 'p'; tupdesc->attrs[2]->attstorage = 'p'; /* * Toast tables for regular relations go in pg_toast; those for temp * relations go into the per-backend temp-toast-table namespace. */ if (isTempOrToastNamespace(rel->rd_rel->relnamespace)) namespaceid = GetTempToastNamespace(); else namespaceid = PG_TOAST_NAMESPACE; /* Use binary-upgrade override for pg_type.oid, if supplied. */ if (IsBinaryUpgrade && OidIsValid(binary_upgrade_next_toast_pg_type_oid)) { toast_typid = binary_upgrade_next_toast_pg_type_oid; binary_upgrade_next_toast_pg_type_oid = InvalidOid; } toast_relid = heap_create_with_catalog(toast_relname, namespaceid, rel->rd_rel->reltablespace, toastOid, toast_typid, InvalidOid, rel->rd_rel->relowner, tupdesc, NIL, RELKIND_TOASTVALUE, rel->rd_rel->relpersistence, shared_relation, mapped_relation, true, 0, ONCOMMIT_NOOP, reloptions, false, true, true); Assert(toast_relid != InvalidOid); /* make the toast relation visible, else heap_open will fail */ CommandCounterIncrement(); /* ShareLock is not really needed here, but take it anyway */ toast_rel = heap_open(toast_relid, ShareLock); /* * Create unique index on chunk_id, chunk_seq. * * NOTE: the normal TOAST access routines could actually function with a * single-column index on chunk_id only. However, the slice access * routines use both columns for faster access to an individual chunk. 
In * addition, we want it to be unique as a check against the possibility of * duplicate TOAST chunk OIDs. The index might also be a little more * efficient this way, since btree isn't all that happy with large numbers * of equal keys. */ indexInfo = makeNode(IndexInfo); indexInfo->ii_NumIndexAttrs = 2; indexInfo->ii_KeyAttrNumbers[0] = 1; indexInfo->ii_KeyAttrNumbers[1] = 2; indexInfo->ii_Expressions = NIL; indexInfo->ii_ExpressionsState = NIL; indexInfo->ii_Predicate = NIL; indexInfo->ii_PredicateState = NIL; indexInfo->ii_ExclusionOps = NULL; indexInfo->ii_ExclusionProcs = NULL; indexInfo->ii_ExclusionStrats = NULL; indexInfo->ii_Unique = true; indexInfo->ii_ReadyForInserts = true; indexInfo->ii_Concurrent = false; indexInfo->ii_BrokenHotChain = false; collationObjectId[0] = InvalidOid; collationObjectId[1] = InvalidOid; classObjectId[0] = OID_BTREE_OPS_OID; classObjectId[1] = INT4_BTREE_OPS_OID; coloptions[0] = 0; coloptions[1] = 0; index_create(toast_rel, toast_idxname, toastIndexOid, InvalidOid, indexInfo, list_make2("chunk_id", "chunk_seq"), BTREE_AM_OID, rel->rd_rel->reltablespace, collationObjectId, classObjectId, coloptions, (Datum) 0, true, false, false, false, true, false, false, true); heap_close(toast_rel, NoLock); /* * Store the toast table's OID in the parent relation's pg_class row */ class_rel = heap_open(RelationRelationId, RowExclusiveLock); reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relOid)); if (!HeapTupleIsValid(reltup)) elog(ERROR, "cache lookup failed for relation %u", relOid); ((Form_pg_class) GETSTRUCT(reltup))->reltoastrelid = toast_relid; if (!IsBootstrapProcessingMode()) { /* normal case, use a transactional update */ simple_heap_update(class_rel, &reltup->t_self, reltup); /* Keep catalog indexes current */ CatalogUpdateIndexes(class_rel, reltup); } else { /* While bootstrapping, we cannot UPDATE, so overwrite in-place */ heap_inplace_update(class_rel, reltup); } heap_freetuple(reltup); heap_close(class_rel, 
RowExclusiveLock); /* * Register dependency from the toast table to the master, so that the * toast table will be deleted if the master is. Skip this in bootstrap * mode. */ if (!IsBootstrapProcessingMode()) { baseobject.classId = RelationRelationId; baseobject.objectId = relOid; baseobject.objectSubId = 0; toastobject.classId = RelationRelationId; toastobject.objectId = toast_relid; toastobject.objectSubId = 0; recordDependencyOn(&toastobject, &baseobject, DEPENDENCY_INTERNAL); } /* * Make changes visible */ CommandCounterIncrement(); return true; }
/* ---------------------------------------------------------------- * index_create * * Returns OID of the created index. * ---------------------------------------------------------------- */ Oid index_create(Oid heapRelationId, const char *indexRelationName, IndexInfo *indexInfo, Oid accessMethodObjectId, Oid *classObjectId, bool primary, bool isconstraint, bool allow_system_table_mods) { Relation heapRelation; Relation indexRelation; TupleDesc indexTupDesc; bool shared_relation; Oid namespaceId; Oid indexoid; int i; /* * Only SELECT ... FOR UPDATE are allowed while doing this */ heapRelation = heap_open(heapRelationId, ShareLock); /* * The index will be in the same namespace as its parent table, and is * shared across databases if and only if the parent is. */ namespaceId = RelationGetNamespace(heapRelation); shared_relation = heapRelation->rd_rel->relisshared; /* * check parameters */ if (indexInfo->ii_NumIndexAttrs < 1) elog(ERROR, "must index at least one column"); if (!allow_system_table_mods && IsSystemRelation(heapRelation) && IsNormalProcessingMode()) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("user-defined indexes on system catalog tables are not supported"))); /* * We cannot allow indexing a shared relation after initdb (because * there's no way to make the entry in other databases' pg_class). * Unfortunately we can't distinguish initdb from a manually started * standalone backend. However, we can at least prevent this mistake * under normal multi-user operation. 
*/ if (shared_relation && IsUnderPostmaster) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("shared indexes cannot be created after initdb"))); if (get_relname_relid(indexRelationName, namespaceId)) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_TABLE), errmsg("relation \"%s\" already exists", indexRelationName))); /* * construct tuple descriptor for index tuples */ indexTupDesc = ConstructTupleDescriptor(heapRelation, indexInfo, classObjectId); /* * create the index relation's relcache entry and physical disk file. * (If we fail further down, it's the smgr's responsibility to remove * the disk file again.) */ indexRelation = heap_create(indexRelationName, namespaceId, indexTupDesc, shared_relation, true, allow_system_table_mods); /* Fetch the relation OID assigned by heap_create */ indexoid = RelationGetRelid(indexRelation); /* * Obtain exclusive lock on it. Although no other backends can see it * until we commit, this prevents deadlock-risk complaints from lock * manager in cases such as CLUSTER. */ LockRelation(indexRelation, AccessExclusiveLock); /* * Fill in fields of the index's pg_class entry that are not set * correctly by heap_create. * * XXX should have a cleaner way to create cataloged indexes */ indexRelation->rd_rel->relowner = GetUserId(); indexRelation->rd_rel->relam = accessMethodObjectId; indexRelation->rd_rel->relkind = RELKIND_INDEX; indexRelation->rd_rel->relhasoids = false; /* * store index's pg_class entry */ UpdateRelationRelation(indexRelation); /* * now update the object id's of all the attribute tuple forms in the * index relation's tuple descriptor */ InitializeAttributeOids(indexRelation, indexInfo->ii_NumIndexAttrs, indexoid); /* * append ATTRIBUTE tuples for the index */ AppendAttributeTuples(indexRelation, indexInfo->ii_NumIndexAttrs); /* ---------------- * update pg_index * (append INDEX tuple) * * Note that this stows away a representation of "predicate". 
* (Or, could define a rule to maintain the predicate) --Nels, Feb '92 * ---------------- */ UpdateIndexRelation(indexoid, heapRelationId, indexInfo, classObjectId, primary); /* * Register constraint and dependencies for the index. * * If the index is from a CONSTRAINT clause, construct a pg_constraint * entry. The index is then linked to the constraint, which in turn * is linked to the table. If it's not a CONSTRAINT, make the * dependency directly on the table. * * We don't need a dependency on the namespace, because there'll be an * indirect dependency via our parent table. * * During bootstrap we can't register any dependencies, and we don't try * to make a constraint either. */ if (!IsBootstrapProcessingMode()) { ObjectAddress myself, referenced; myself.classId = RelOid_pg_class; myself.objectId = indexoid; myself.objectSubId = 0; if (isconstraint) { char constraintType; Oid conOid; if (primary) constraintType = CONSTRAINT_PRIMARY; else if (indexInfo->ii_Unique) constraintType = CONSTRAINT_UNIQUE; else { elog(ERROR, "constraint must be PRIMARY or UNIQUE"); constraintType = 0; /* keep compiler quiet */ } /* Shouldn't have any expressions */ if (indexInfo->ii_Expressions) elog(ERROR, "constraints can't have index expressions"); conOid = CreateConstraintEntry(indexRelationName, namespaceId, constraintType, false, /* isDeferrable */ false, /* isDeferred */ heapRelationId, indexInfo->ii_KeyAttrNumbers, indexInfo->ii_NumIndexAttrs, InvalidOid, /* no domain */ InvalidOid, /* no foreign key */ NULL, 0, ' ', ' ', ' ', InvalidOid, /* no associated index */ NULL, /* no check constraint */ NULL, NULL); referenced.classId = get_system_catalog_relid(ConstraintRelationName); referenced.objectId = conOid; referenced.objectSubId = 0; recordDependencyOn(&myself, &referenced, DEPENDENCY_INTERNAL); } else { /* Create auto dependencies on simply-referenced columns */ for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) { if (indexInfo->ii_KeyAttrNumbers[i] != 0) { referenced.classId = 
RelOid_pg_class; referenced.objectId = heapRelationId; referenced.objectSubId = indexInfo->ii_KeyAttrNumbers[i]; recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO); } } } /* Store dependency on operator classes */ referenced.classId = get_system_catalog_relid(OperatorClassRelationName); for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) { referenced.objectId = classObjectId[i]; referenced.objectSubId = 0; recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL); } /* Store dependencies on anything mentioned in index expressions */ if (indexInfo->ii_Expressions) { recordDependencyOnSingleRelExpr(&myself, (Node *) indexInfo->ii_Expressions, heapRelationId, DEPENDENCY_NORMAL, DEPENDENCY_AUTO); } /* Store dependencies on anything mentioned in predicate */ if (indexInfo->ii_Predicate) { recordDependencyOnSingleRelExpr(&myself, (Node *) indexInfo->ii_Predicate, heapRelationId, DEPENDENCY_NORMAL, DEPENDENCY_AUTO); } } /* * Advance the command counter so that we can see the newly-entered * catalog tuples for the index. */ CommandCounterIncrement(); /* * In bootstrap mode, we have to fill in the index strategy structure * with information from the catalogs. If we aren't bootstrapping, * then the relcache entry has already been rebuilt thanks to sinval * update during CommandCounterIncrement. */ if (IsBootstrapProcessingMode()) RelationInitIndexAccessInfo(indexRelation); else Assert(indexRelation->rd_indexcxt != NULL); /* * If this is bootstrap (initdb) time, then we don't actually fill in * the index yet. We'll be creating more indexes and classes later, * so we delay filling them in until just before we're done with * bootstrapping. Otherwise, we call the routine that constructs the * index. * * In normal processing mode, the heap and index relations are closed by * index_build() --- but we continue to hold the ShareLock on the heap * and the exclusive lock on the index that we acquired above, until * end of transaction. 
*/ if (IsBootstrapProcessingMode()) { index_register(heapRelationId, indexoid, indexInfo); /* XXX shouldn't we close the heap and index rels here? */ } else index_build(heapRelation, indexRelation, indexInfo); return indexoid; }
/*
 * CREATE SCHEMA
 *
 * stmt is the CREATE SCHEMA parse node (schema name, optional authorization
 * role, if_not_exists flag, and the embedded commands). queryString is
 * handed unchanged to ProcessUtility for each embedded command.
 */
void
CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString)
{
	const char *schemaName = stmt->schemaname;
	const char *authId = stmt->authid;
	Oid namespaceId;
	OverrideSearchPath *overridePath;
	List *parsetree_list;
	ListCell *parsetree_item;
	Oid owner_uid;
	Oid saved_uid;
	int save_sec_context;
	AclResult aclresult;

	/* Remember the current user and security context. */
	GetUserIdAndSecContext(&saved_uid, &save_sec_context);

	/*
	 * Who is supposed to own the new schema?
	 */
	if (authId)
		owner_uid = get_role_oid(authId, false);
	else
		owner_uid = saved_uid;

	/*
	 * To create a schema, must have schema-create privilege on the current
	 * database and must be able to become the target role (this does not
	 * imply that the target role itself must have create-schema privilege).
	 * The latter provision guards against "giveaway" attacks. Note that a
	 * superuser will always have both of these privileges a fortiori.
	 */
	aclresult = pg_database_aclcheck(MyDatabaseId, saved_uid, ACL_CREATE);
	if (aclresult != ACLCHECK_OK)
		aclcheck_error(aclresult, ACL_KIND_DATABASE,
					   get_database_name(MyDatabaseId));

	check_is_member_of_role(saved_uid, owner_uid);

	/* Additional check to protect reserved schema names */
	if (!allowSystemTableMods && IsReservedName(schemaName))
		ereport(ERROR,
				(errcode(ERRCODE_RESERVED_NAME),
				 errmsg("unacceptable schema name \"%s\"", schemaName),
				 errdetail("The prefix \"pg_\" is reserved for system schemas.")));

	/*
	 * If if_not_exists was given and the schema already exists, bail out.
	 * (Note: we needn't check this when not if_not_exists, because
	 * NamespaceCreate will complain anyway.) We could do this before making
	 * the permissions checks, but since CREATE TABLE IF NOT EXISTS makes its
	 * creation-permission check first, we do likewise.
	 */
	if (stmt->if_not_exists &&
		SearchSysCacheExists1(NAMESPACENAME, PointerGetDatum(schemaName)))
	{
		ereport(NOTICE,
				(errcode(ERRCODE_DUPLICATE_SCHEMA),
				 errmsg("schema \"%s\" already exists, skipping",
						schemaName)));
		return;
	}

	/*
	 * If the requested authorization is different from the current user,
	 * temporarily set the current user so that the object(s) will be created
	 * with the correct ownership.
	 *
	 * (The setting will be restored at the end of this routine, or in case of
	 * error, transaction abort will clean things up.)
	 */
	if (saved_uid != owner_uid)
		SetUserIdAndSecContext(owner_uid,
							   save_sec_context | SECURITY_LOCAL_USERID_CHANGE);

	/* Create the schema's namespace */
	namespaceId = NamespaceCreate(schemaName, owner_uid, false);

	/* Advance cmd counter to make the namespace visible */
	CommandCounterIncrement();

	/*
	 * Temporarily make the new namespace be the front of the search path, as
	 * well as the default creation target namespace. This will be undone at
	 * the end of this routine, or upon error.
	 */
	overridePath = GetOverrideSearchPath(CurrentMemoryContext);
	overridePath->schemas = lcons_oid(namespaceId, overridePath->schemas);
	/* XXX should we clear overridePath->useTemp? */
	PushOverrideSearchPath(overridePath);

	/*
	 * Examine the list of commands embedded in the CREATE SCHEMA command, and
	 * reorganize them into a sequentially executable order with no forward
	 * references. Note that the result is still a list of raw parsetrees ---
	 * we cannot, in general, run parse analysis on one statement until we
	 * have actually executed the prior ones.
	 */
	parsetree_list = transformCreateSchemaStmt(stmt);

	/*
	 * Execute each command contained in the CREATE SCHEMA. Since the grammar
	 * allows only utility commands in CREATE SCHEMA, there is no need to pass
	 * them through parse_analyze() or the rewriter; we can just hand them
	 * straight to ProcessUtility.
	 */
	foreach(parsetree_item, parsetree_list)
	{
		/* NB: this local deliberately-or-not shadows the "stmt" parameter */
		Node *stmt = (Node *) lfirst(parsetree_item);

		/* do this step */
		ProcessUtility(stmt,
					   queryString,
					   NULL,
					   None_Receiver,
					   NULL,
					   PROCESS_UTILITY_SUBCOMMAND);

		/* make sure later steps can see the object created here */
		CommandCounterIncrement();
	}
/** * @fn Datum reorg_swap(PG_FUNCTION_ARGS) * @brief Swapping relfilenode of tables and relation ids of toast tables * and toast indexes. * * reorg_swap(oid, relname) * * TODO: remove useless CommandCounterIncrement(). * * @param oid Oid of table of target. * @retval None. */ Datum reorg_swap(PG_FUNCTION_ARGS) { Oid oid = PG_GETARG_OID(0); const char *relname = get_quoted_relname(oid); const char *nspname = get_quoted_nspname(oid); Oid argtypes[1] = { OIDOID }; bool nulls[1] = { 0 }; Datum values[1]; SPITupleTable *tuptable; TupleDesc desc; HeapTuple tuple; uint32 records; uint32 i; Oid reltoastrelid1; Oid reltoastidxid1; Oid oid2; Oid reltoastrelid2; Oid reltoastidxid2; Oid owner1; Oid owner2; /* authority check */ must_be_superuser("reorg_swap"); /* connect to SPI manager */ reorg_init(); /* swap relfilenode and dependencies for tables. */ values[0] = ObjectIdGetDatum(oid); execute_with_args(SPI_OK_SELECT, "SELECT X.reltoastrelid, TX.reltoastidxid, X.relowner," " Y.oid, Y.reltoastrelid, TY.reltoastidxid, Y.relowner" " FROM pg_catalog.pg_class X LEFT JOIN pg_catalog.pg_class TX" " ON X.reltoastrelid = TX.oid," " pg_catalog.pg_class Y LEFT JOIN pg_catalog.pg_class TY" " ON Y.reltoastrelid = TY.oid" " WHERE X.oid = $1" " AND Y.oid = ('reorg.table_' || X.oid)::regclass", 1, argtypes, values, nulls); tuptable = SPI_tuptable; desc = tuptable->tupdesc; records = SPI_processed; if (records == 0) elog(ERROR, "reorg_swap : no swap target"); tuple = tuptable->vals[0]; reltoastrelid1 = getoid(tuple, desc, 1); reltoastidxid1 = getoid(tuple, desc, 2); owner1 = getoid(tuple, desc, 3); oid2 = getoid(tuple, desc, 4); reltoastrelid2 = getoid(tuple, desc, 5); reltoastidxid2 = getoid(tuple, desc, 6); owner2 = getoid(tuple, desc, 7); /* change owner of new relation to original owner */ if (owner1 != owner2) { ATExecChangeOwner(oid2, owner1, true, AccessExclusiveLock); CommandCounterIncrement(); } /* swap tables. 
*/ swap_heap_or_index_files(oid, oid2); CommandCounterIncrement(); /* swap indexes. */ values[0] = ObjectIdGetDatum(oid); execute_with_args(SPI_OK_SELECT, "SELECT X.oid, Y.oid" " FROM pg_catalog.pg_index I," " pg_catalog.pg_class X," " pg_catalog.pg_class Y" " WHERE I.indrelid = $1" " AND I.indexrelid = X.oid" " AND I.indisvalid" " AND Y.oid = ('reorg.index_' || X.oid)::regclass", 1, argtypes, values, nulls); tuptable = SPI_tuptable; desc = tuptable->tupdesc; records = SPI_processed; for (i = 0; i < records; i++) { Oid idx1, idx2; tuple = tuptable->vals[i]; idx1 = getoid(tuple, desc, 1); idx2 = getoid(tuple, desc, 2); swap_heap_or_index_files(idx1, idx2); CommandCounterIncrement(); } /* swap names for toast tables and toast indexes */ if (reltoastrelid1 == InvalidOid) { if (reltoastidxid1 != InvalidOid || reltoastrelid2 != InvalidOid || reltoastidxid2 != InvalidOid) elog(ERROR, "reorg_swap : unexpected toast relations (T1=%u, I1=%u, T2=%u, I2=%u", reltoastrelid1, reltoastidxid1, reltoastrelid2, reltoastidxid2); /* do nothing */ } else if (reltoastrelid2 == InvalidOid) { char name[NAMEDATALEN]; if (reltoastidxid1 == InvalidOid || reltoastidxid2 != InvalidOid) elog(ERROR, "reorg_swap : unexpected toast relations (T1=%u, I1=%u, T2=%u, I2=%u", reltoastrelid1, reltoastidxid1, reltoastrelid2, reltoastidxid2); /* rename X to Y */ snprintf(name, NAMEDATALEN, "pg_toast_%u", oid2); RENAME_REL(reltoastrelid1, name); snprintf(name, NAMEDATALEN, "pg_toast_%u_index", oid2); RENAME_REL(reltoastidxid1, name); CommandCounterIncrement(); } else if (reltoastrelid1 != InvalidOid) { char name[NAMEDATALEN]; int pid = getpid(); /* rename X to TEMP */ snprintf(name, NAMEDATALEN, "pg_toast_pid%d", pid); RENAME_REL(reltoastrelid1, name); snprintf(name, NAMEDATALEN, "pg_toast_pid%d_index", pid); RENAME_REL(reltoastidxid1, name); CommandCounterIncrement(); /* rename Y to X */ snprintf(name, NAMEDATALEN, "pg_toast_%u", oid); RENAME_REL(reltoastrelid2, name); snprintf(name, NAMEDATALEN, 
"pg_toast_%u_index", oid); RENAME_REL(reltoastidxid2, name); CommandCounterIncrement(); /* rename TEMP to Y */ snprintf(name, NAMEDATALEN, "pg_toast_%u", oid2); RENAME_REL(reltoastrelid1, name); snprintf(name, NAMEDATALEN, "pg_toast_%u_index", oid2); RENAME_REL(reltoastidxid1, name); CommandCounterIncrement(); } /* drop reorg trigger */ execute_with_format( SPI_OK_UTILITY, "DROP TRIGGER IF EXISTS z_reorg_trigger ON %s.%s CASCADE", nspname, relname); SPI_finish(); PG_RETURN_VOID(); }
/*
 * create_aoblkdir_table
 *
 * Create the block directory relation "pg_aoblkdir_<relOid>" and its btree
 * index "pg_aoblkdir_<relOid>_index" for the given append-only row relation,
 * record their OIDs in the parent's pg_appendonly entry, and register an
 * internal dependency of the block directory on the parent.
 *
 * rel is already opened and exclusive-locked.
 * comptypeOid is InvalidOid.
 *
 * aoblkdirOid and aoblkdirIndexOid are passed through to
 * heap_create_with_catalog() and index_create() as the OIDs for the new
 * relation and index.
 *
 * Returns false without doing anything when rel is not an append-only row
 * relation or already has a block directory; returns true once the block
 * directory has been created.  Raises an ERROR for a shared relation outside
 * bootstrap mode.
 */
static bool
create_aoblkdir_table(Relation rel, Oid aoblkdirOid,
					  Oid aoblkdirIndexOid, Oid *comptypeOid)
{
	Oid			relOid = RelationGetRelid(rel);
	Oid			aoblkdir_relid;
	Oid			aoblkdir_idxid;
	bool		shared_relation = rel->rd_rel->relisshared;
	char		aoblkdir_relname[NAMEDATALEN];
	char		aoblkdir_idxname[NAMEDATALEN];
	TupleDesc	tupdesc;
	IndexInfo  *indexInfo;
	Oid			classObjectId[3];
	ObjectAddress baseobject;
	ObjectAddress aoblkdirobject;
	Oid			tablespaceOid = ChooseTablespaceForLimitedObject(rel->rd_rel->reltablespace);

	if (!RelationIsAoRows(rel))
		return false;

	/*
	 * We cannot allow creating a block directory for a shared relation
	 * after initdb (because there's no way to let other databases know
	 * this block directory).
	 */
	if (shared_relation && !IsBootstrapProcessingMode())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("shared tables cannot have block directory after initdb")));

	GetAppendOnlyEntryAuxOids(relOid, SnapshotNow, NULL,NULL, &aoblkdir_relid, &aoblkdir_idxid);

	/*
	 * Does it have a block directory?
	 */
	if (aoblkdir_relid != InvalidOid)
	{
		return false;
	}

	snprintf(aoblkdir_relname, sizeof(aoblkdir_relname),
			 "pg_aoblkdir_%u", relOid);
	snprintf(aoblkdir_idxname, sizeof(aoblkdir_idxname),
			 "pg_aoblkdir_%u_index", relOid);

	/* Create a tuple descriptor */
	tupdesc = CreateTemplateTupleDesc(4, false);

	TupleDescInitEntry(tupdesc, (AttrNumber) 1,
					   "segno",
					   INT4OID,
					   -1, 0);
	TupleDescInitEntry(tupdesc, (AttrNumber) 2,
					   "columngroup_no",
					   INT4OID,
					   -1, 0);
	TupleDescInitEntry(tupdesc, (AttrNumber) 3,
					   "first_row_no",
					   INT8OID,
					   -1, 0);
	TupleDescInitEntry(tupdesc, (AttrNumber) 4,
					   "minipage",
					   VARBITOID,
					   -1, 0);

	/*
	 * We don't want any toast columns here, so force plain storage on all
	 * four attributes -- including the varbit "minipage" column (index 3).
	 */
	tupdesc->attrs[0]->attstorage = 'p';
	tupdesc->attrs[1]->attstorage = 'p';
	tupdesc->attrs[2]->attstorage = 'p';
	tupdesc->attrs[3]->attstorage = 'p';

	/*
	 * We place aoblkdir relation in the pg_aoseg namespace
	 * even if its master relation is a temp table. There cannot be
	 * any naming collision, and the aoblkdir relation will be
	 * destroyed when its master is, so there is no need to handle
	 * the aoblkdir relation as temp.
	 */
	aoblkdir_relid = heap_create_with_catalog(aoblkdir_relname,
											  PG_AOSEGMENT_NAMESPACE,
											  tablespaceOid,
											  aoblkdirOid,
											  rel->rd_rel->relowner,
											  tupdesc,
											  /* relam */ InvalidOid,
											  RELKIND_AOBLOCKDIR,
											  RELSTORAGE_HEAP,
											  shared_relation,
											  true,
											  /* bufferPoolBulkLoad */ false,
											  0,
											  ONCOMMIT_NOOP,
											  NULL, /* GP Policy */
											  (Datum) 0,
											  true,
											  comptypeOid,
											  /* persistentTid */ NULL,
											  /* persistentSerialNum */ NULL);

	/* Make this table visible, else index creation will fail */
	CommandCounterIncrement();

	/*
	 * Create index on segno, columngroup_no, first_row_no.
	 */
	indexInfo = makeNode(IndexInfo);
	indexInfo->ii_NumIndexAttrs = 3;
	indexInfo->ii_KeyAttrNumbers[0] = 1;
	indexInfo->ii_KeyAttrNumbers[1] = 2;
	indexInfo->ii_KeyAttrNumbers[2] = 3;
	indexInfo->ii_Expressions = NIL;
	indexInfo->ii_ExpressionsState = NIL;
	indexInfo->ii_Predicate = NIL;
	indexInfo->ii_PredicateState = NIL;
	indexInfo->ii_Unique = false;
	indexInfo->ii_Concurrent = false;

	/* operator classes match the key column types: int4, int4, int8 */
	classObjectId[0] = INT4_BTREE_OPS_OID;
	classObjectId[1] = INT4_BTREE_OPS_OID;
	classObjectId[2] = INT8_BTREE_OPS_OID;

	aoblkdir_idxid = index_create(aoblkdirOid, aoblkdir_idxname, aoblkdirIndexOid,
								  indexInfo,
								  BTREE_AM_OID,
								  tablespaceOid,
								  classObjectId, (Datum) 0,
								  true, false, (Oid *) NULL, true, false, false, NULL);

	/* Unlock target table -- no one can see it */
	UnlockRelationOid(aoblkdirOid, ShareLock);
	/* Unlock the index -- no one can see it anyway */
	UnlockRelationOid(aoblkdirIndexOid, AccessExclusiveLock);

	/*
	 * Store the aoblkdir table's OID in the parent relation's pg_appendonly row.
	 */
	UpdateAppendOnlyEntryAuxOids(relOid, InvalidOid, InvalidOid,
								 aoblkdir_relid, aoblkdir_idxid);

	/*
	 * Register dependency from the aoblkdir table to the master, so that the
	 * aoblkdir table will be deleted if the master is.
	 */
	baseobject.classId = RelationRelationId;
	baseobject.objectId = relOid;
	baseobject.objectSubId = 0;
	aoblkdirobject.classId = RelationRelationId;
	aoblkdirobject.objectId = aoblkdirOid;
	aoblkdirobject.objectSubId = 0;

	recordDependencyOn(&aoblkdirobject, &baseobject, DEPENDENCY_INTERNAL);

	/*
	 * Make changes visible
	 */
	CommandCounterIncrement();

	return true;
}
/* ----------------
 *		UpdateStats
 *
 * Refresh the relpages and reltuples columns of the pg_class row that
 * describes relation "relid" (a table or an index).  relpages is taken from
 * the relation's current number of blocks; reltuples comes from the caller,
 * except that a count of zero is replaced by an estimate.  VACUUM does not
 * go through this routine.
 * ----------------
 */
void
UpdateStats(Oid relid, double reltuples)
{
	Relation	target_rel;
	Relation	class_rel;
	HeapTuple	class_tup;
	Form_pg_class class_form;
	HeapScanDesc class_scan = NULL;
	BlockNumber relpages;
	bool		overwrite_in_place;
	int32		new_relpages;
	float4		new_reltuples;

	/*
	 * The same code path serves heap and index relations.  An index's
	 * pg_class row was inserted earlier in this very transaction, so advance
	 * the command counter first to be sure that row is visible to us.
	 */
	CommandCounterIncrement();

	/*
	 * The counter bump also flushed stale cache entries for the relations
	 * involved, so reopening the target now is safe.  It has to be open for
	 * us to learn its block count.  (The lock taken here is most likely
	 * superfluous.)
	 */
	target_rel = relation_open(relid, ShareLock);

	/*
	 * Locate the pg_class row.  The usual method is: fetch a syscache copy,
	 * edit it, heap_update it.  Bootstrap mode cannot use heap_update, so
	 * there we scan for the row and scribble on it directly.
	 *
	 * The same shortcut is required while pg_class itself is being
	 * reindexed, since the index under construction may not currently be
	 * among those CatalogUpdateIndexes maintains (see reindex_relation).
	 * Such in-place changes are not WAL-logged and may vanish in a crash;
	 * VACUUM behaves the same way, so that is considered acceptable.
	 */
	class_rel = heap_openr(RelationRelationName, RowExclusiveLock);

	overwrite_in_place = IsBootstrapProcessingMode() ||
		ReindexIsProcessingHeap(RelationGetRelid(class_rel));

	if (overwrite_in_place)
	{
		ScanKeyData key[1];

		ScanKeyEntryInitialize(&key[0], 0,
							   ObjectIdAttributeNumber,
							   F_OIDEQ,
							   ObjectIdGetDatum(relid));

		class_scan = heap_beginscan(class_rel, SnapshotNow, 1, key);
		class_tup = heap_getnext(class_scan, ForwardScanDirection);
	}
	else
	{
		class_tup = SearchSysCacheCopy(RELOID,
									   ObjectIdGetDatum(relid),
									   0, 0, 0);
	}

	if (!HeapTupleIsValid(class_tup))
		elog(ERROR, "could not find tuple for relation %u", relid);
	class_form = (Form_pg_class) GETSTRUCT(class_tup);

	/*
	 * Work out the numbers to store.
	 *
	 * A tuple count of zero is not trusted; a made-up estimate is stored
	 * instead.  Otherwise the familiar sequence "CREATE TABLE; CREATE INDEX;
	 * insert data" would leave zero-size statistics in place until the next
	 * VACUUM, and the optimizer plans very poorly for a table it believes
	 * empty but which is in fact large.  Compare CREATE TABLE in heap.c.
	 *
	 * Bootstrap reaches this branch as well: bootstrap.c hands in
	 * reltuples = 0 after loading a table, so the tuple count has to be
	 * derived from the actual page count.
	 */
	relpages = RelationGetNumberOfBlocks(target_rel);

	if (reltuples == 0)
	{
		if (relpages == 0)
		{
			/* Brand-new relation: same placeholder values as heap.c */
			reltuples = 1000;
			relpages = 10;
		}
		else if (relpages <= 2 &&
				 target_rel->rd_rel->relkind == RELKIND_INDEX)
		{
			/* Index holding no entries: keep the placeholder tuple count */
			reltuples = 1000;
		}
		else
		{
			reltuples = ((double) relpages) *
				NTUPLES_PER_PAGE(target_rel->rd_rel->relnatts);
		}
	}

	/* Narrow once to the types used for the pg_class columns. */
	new_relpages = (int32) relpages;
	new_reltuples = (float4) reltuples;

	/*
	 * Touch pg_class only when a value actually differs.  Skipping a no-op
	 * update is more than a small speedup: it narrows the window in which
	 * concurrent CREATE INDEX commands can collide.
	 */
	if (class_form->relpages != new_relpages ||
		class_form->reltuples != new_reltuples)
	{
		if (!overwrite_in_place)
		{
			/* Normal operation: go through a regular catalog update. */
			class_form->relpages = new_relpages;
			class_form->reltuples = new_reltuples;
			simple_heap_update(class_rel, &class_tup->t_self, class_tup);
			CatalogUpdateIndexes(class_rel, class_tup);
		}
		else
		{
			/* Bootstrap / reindex: modify the row where it sits in the buffer. */
			LockBuffer(class_scan->rs_cbuf, BUFFER_LOCK_EXCLUSIVE);
			class_form->relpages = new_relpages;
			class_form->reltuples = new_reltuples;
			LockBuffer(class_scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
			WriteNoReleaseBuffer(class_scan->rs_cbuf);
			if (!IsBootstrapProcessingMode())
				CacheInvalidateHeapTuple(class_rel, class_tup);
		}
	}

	/* A scan owns its tuple; a syscache copy must be freed by us. */
	if (class_scan != NULL)
		heap_endscan(class_scan);
	else
		heap_freetuple(class_tup);

	/*
	 * This ought to be unnecessary, yet we do it: patch the open relation
	 * descriptor so the cached entry carries the new values right away.
	 * (XXX possibly obsolete -- the relcache gets corrected at the next
	 * CommandCounterIncrement anyway.)
	 */
	target_rel->rd_rel->relpages = new_relpages;
	target_rel->rd_rel->reltuples = new_reltuples;

	heap_close(class_rel, RowExclusiveLock);
	relation_close(target_rel, NoLock);
}