/*
 * GetParquetTotalBytes
 *
 * Compute the total number of bytes used by a parquet table by adding up the
 * "eof" column of every row in the table's segment-file catalog relation
 * (the relation identified by the append-only entry's segrelid).
 *
 * In hawq the master keeps the info for all segment files in that catalog,
 * so the sum covers the whole table.
 *
 * parentrel				- the parquet table being measured
 * parquetMetaDataSnapshot	- snapshot used to look up the append-only entry
 *							  and to scan the segment-file catalog
 *
 * Returns the sum of all eof values.  Asserts that we are not running in the
 * executor role.
 */
int64
GetParquetTotalBytes(Relation parentrel, Snapshot parquetMetaDataSnapshot)
{
	AppendOnlyEntry *entry;
	Relation	segrel;
	TupleDesc	segdesc;
	SysScanDesc scan;
	HeapTuple	segtup;
	int64		total_bytes = 0;

	entry = GetAppendOnlyEntry(RelationGetRelid(parentrel),
							   parquetMetaDataSnapshot);

	segrel = heap_open(entry->segrelid, AccessShareLock);
	segdesc = RelationGetDescr(segrel);

	Assert(Gp_role != GP_ROLE_EXECUTE);

	/* Unkeyed scan without an index: visit every segment-file row. */
	scan = systable_beginscan(segrel, InvalidOid, FALSE,
							  parquetMetaDataSnapshot, 0, NULL);

	for (segtup = systable_getnext(scan);
		 HeapTupleIsValid(segtup);
		 segtup = systable_getnext(scan))
	{
		Datum		eof_datum;
		bool		eof_is_null;

		eof_datum = fastgetattr(segtup, Anum_pg_parquetseg_eof,
								segdesc, &eof_is_null);
		Assert(!eof_is_null);

		/* eof is read as a float8 Datum; accumulate it as an integer */
		total_bytes += (int64) DatumGetFloat8(eof_datum);

		CHECK_FOR_INTERRUPTS();
	}

	systable_endscan(scan);
	heap_close(segrel, AccessShareLock);
	pfree(entry);

	return total_bytes;
}
/*
 * Map a relation's (tablespace, filenode) to a relation's oid and cache the
 * result.
 *
 * reltablespace - tablespace oid; MyDatabaseTableSpace is normalized to 0
 *				   because that is how pg_class stores it
 * relfilenode	 - filenode to look up
 *
 * Returns InvalidOid if no relation matching the criteria could be found.
 * Negative results are cached as well.
 *
 * Raises ERROR if pg_class contains more than one match, or if the cache
 * unexpectedly already holds the key when the result is entered.
 */
Oid
RelidByRelfilenode(Oid reltablespace, Oid relfilenode)
{
	RelfilenodeMapKey key;
	RelfilenodeMapEntry *entry;
	bool		found;
	SysScanDesc scandesc;
	Relation	relation;
	HeapTuple	ntp;
	ScanKeyData skey[2];
	Oid			relid;

	if (RelfilenodeMapHash == NULL)
		InitializeRelfilenodeMap();

	/* pg_class will show 0 when the value is actually MyDatabaseTableSpace */
	if (reltablespace == MyDatabaseTableSpace)
		reltablespace = 0;

	MemSet(&key, 0, sizeof(key));
	key.reltablespace = reltablespace;
	key.relfilenode = relfilenode;

	/*
	 * Only probe the cache here; do NOT create the entry yet.  Entering it
	 * before the lookup is unsafe: opening pg_class below can process cache
	 * invalidations that delete the freshly created entry, leaving us with a
	 * dangling pointer, and an ERROR during the lookup would leave behind an
	 * entry whose relid was never set.
	 */
	entry = hash_search(RelfilenodeMapHash, (void *) &key, HASH_FIND, &found);

	if (found)
		return entry->relid;

	/* ok, no previous cache entry, do it the hard way */

	/* start from a negative result */
	relid = InvalidOid;

	if (reltablespace == GLOBALTABLESPACE_OID)
	{
		/* shared table: consult the relation mapper */
		relid = RelationMapFilenodeToOid(relfilenode, true);
	}
	else
	{
		/* check plain relations by looking in pg_class */
		relation = heap_open(RelationRelationId, AccessShareLock);

		/* copy scankey to local copy, it will be modified during the scan */
		memcpy(skey, relfilenode_skey, sizeof(skey));

		/* set scan arguments */
		skey[0].sk_argument = ObjectIdGetDatum(reltablespace);
		skey[1].sk_argument = ObjectIdGetDatum(relfilenode);

		scandesc = systable_beginscan(relation,
									  ClassTblspcRelfilenodeIndexId,
									  true,
									  NULL,
									  2,
									  skey);

		found = false;

		while (HeapTupleIsValid(ntp = systable_getnext(scandesc)))
		{
			bool		isnull PG_USED_FOR_ASSERTS_ONLY;

			if (found)
				elog(ERROR,
					 "unexpected duplicate for tablespace %u, relfilenode %u",
					 reltablespace, relfilenode);
			found = true;

#ifdef USE_ASSERT_CHECKING
			if (assert_enabled)
			{
				Oid			check;

				check = fastgetattr(ntp, Anum_pg_class_reltablespace,
									RelationGetDescr(relation),
									&isnull);
				Assert(!isnull && check == reltablespace);

				check = fastgetattr(ntp, Anum_pg_class_relfilenode,
									RelationGetDescr(relation),
									&isnull);
				Assert(!isnull && check == relfilenode);
			}
#endif
			relid = HeapTupleGetOid(ntp);
		}

		systable_endscan(scandesc);
		heap_close(relation, AccessShareLock);

		/* check for tables that are mapped but not shared */
		if (!found)
			relid = RelationMapFilenodeToOid(relfilenode, false);
	}

	/*
	 * Enter the (possibly negative) result into the cache only now that all
	 * catalog access is done, so nothing can invalidate the entry between
	 * creating it and filling it in.
	 */
	entry = hash_search(RelfilenodeMapHash, (void *) &key, HASH_ENTER, &found);
	if (found)
		elog(ERROR, "corrupted hashtable");
	entry->relid = relid;

	return relid;
}
/* ----------
 * toast_fetch_datum_slice -
 *
 *	Reconstruct a segment of a Datum from the chunks saved
 *	in the toast relation
 *
 *	attr		- external (out-of-line) toast pointer datum; must not be
 *				  compressed
 *	sliceoffset - byte offset of the first byte wanted; an offset at or past
 *				  the end of the value yields an empty result
 *	length		- number of bytes wanted; a negative value, or one reaching
 *				  past the end of the value, means "through the end"
 *
 *	Returns a freshly palloc'd varlena holding the requested bytes.
 *	Raises ERROR if the stored chunks are missing, out of sequence, or of
 *	unexpected size.
 * ----------
 */
static struct varlena *
toast_fetch_datum_slice(struct varlena * attr, int32 sliceoffset, int32 length)
{
	Relation	toastrel;
	Relation	toastidx;
	ScanKeyData toastkey[3];
	int			nscankeys;
	SysScanDesc toastscan;
	HeapTuple	ttup;
	TupleDesc	toasttupDesc;
	struct varlena *result;
	struct varatt_external toast_pointer;
	int32		attrsize;
	int32		residx;
	int32		nextidx;
	int			numchunks;
	int			startchunk;
	int			endchunk;
	int32		startoffset;
	int32		endoffset;
	int			totalchunks;
	Pointer		chunk;
	bool		isnull;
	char	   *chunkdata;
	int32		chunksize;
	int32		chcpystrt;
	int32		chcpyend;

	Assert(VARATT_IS_EXTERNAL(attr));

	/* Must copy to access aligned fields */
	VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);

	/*
	 * It's nonsense to fetch slices of a compressed datum -- this isn't lo_*
	 * we can't return a compressed datum which is meaningful to toast later
	 */
	Assert(!VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer));

	attrsize = toast_pointer.va_extsize;
	totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;

	if (sliceoffset >= attrsize)
	{
		sliceoffset = 0;
		length = 0;
	}

	/*
	 * Clamp the request to the end of the value.  The comparison is written
	 * as "length > attrsize - sliceoffset" rather than
	 * "sliceoffset + length > attrsize" because the addition can overflow
	 * int32 for a very large length, which would skip the clamp and leave us
	 * with a bogus allocation size.  (Assumes sliceoffset is non-negative;
	 * that is the callers' responsibility and is not checked here.)
	 */
	if (length < 0 || length > attrsize - sliceoffset)
		length = attrsize - sliceoffset;

	result = (struct varlena *) palloc(length + VARHDRSZ);

	if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
		SET_VARSIZE_COMPRESSED(result, length + VARHDRSZ);
	else
		SET_VARSIZE(result, length + VARHDRSZ);

	if (length == 0)
		return result;			/* Can save a lot of work at this point! */

	startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE;
	endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE;
	numchunks = (endchunk - startchunk) + 1;

	startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE;
	endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE;

	/*
	 * Open the toast relation and its index
	 */
	toastrel = heap_open(toast_pointer.va_toastrelid, AccessShareLock);
	toasttupDesc = toastrel->rd_att;
	toastidx = index_open(toastrel->rd_rel->reltoastidxid, AccessShareLock);

	/*
	 * Setup a scan key to fetch from the index. This is either two keys or
	 * three depending on the number of chunks.
	 */
	ScanKeyInit(&toastkey[0],
				(AttrNumber) 1,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(toast_pointer.va_valueid));

	/*
	 * Use equality condition for one chunk, a range condition otherwise:
	 */
	if (numchunks == 1)
	{
		ScanKeyInit(&toastkey[1],
					(AttrNumber) 2,
					BTEqualStrategyNumber, F_INT4EQ,
					Int32GetDatum(startchunk));
		nscankeys = 2;
	}
	else
	{
		ScanKeyInit(&toastkey[1],
					(AttrNumber) 2,
					BTGreaterEqualStrategyNumber, F_INT4GE,
					Int32GetDatum(startchunk));
		ScanKeyInit(&toastkey[2],
					(AttrNumber) 2,
					BTLessEqualStrategyNumber, F_INT4LE,
					Int32GetDatum(endchunk));
		nscankeys = 3;
	}

	/*
	 * Read the chunks by index
	 *
	 * The index is on (valueid, chunkidx) so they will come in order
	 */
	nextidx = startchunk;

	toastscan = systable_beginscan_ordered(toastrel, toastidx,
										   SnapshotToast, nscankeys, toastkey);
	while ((ttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL)
	{
		/*
		 * Have a chunk, extract the sequence number and the data
		 */
		residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull));
		Assert(!isnull);
		chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull));
		Assert(!isnull);
		if (!VARATT_IS_EXTENDED(chunk))
		{
			chunksize = VARSIZE(chunk) - VARHDRSZ;
			chunkdata = VARDATA(chunk);
		}
		else if (VARATT_IS_SHORT(chunk))
		{
			/* could happen due to heap_form_tuple doing its thing */
			chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
			chunkdata = VARDATA_SHORT(chunk);
		}
		else
		{
			/* should never happen */
			elog(ERROR, "found toasted toast chunk for toast value %u in %s",
				 toast_pointer.va_valueid,
				 RelationGetRelationName(toastrel));
			chunksize = 0;		/* keep compiler quiet */
			chunkdata = NULL;
		}

		/*
		 * Some checks on the data we've found
		 */
		if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk))
			elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u in %s",
				 residx, nextidx,
				 toast_pointer.va_valueid,
				 RelationGetRelationName(toastrel));
		if (residx < totalchunks - 1)
		{
			/* every chunk but the last one must be completely full */
			if (chunksize != TOAST_MAX_CHUNK_SIZE)
				elog(ERROR, "unexpected chunk size %d (expected %d) in chunk %d of %d for toast value %u in %s when fetching slice",
					 chunksize, (int) TOAST_MAX_CHUNK_SIZE,
					 residx, totalchunks,
					 toast_pointer.va_valueid,
					 RelationGetRelationName(toastrel));
		}
		else if (residx == totalchunks - 1)
		{
			/* the last chunk must hold exactly the remainder of the value */
			if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize)
				elog(ERROR, "unexpected chunk size %d (expected %d) in final chunk %d for toast value %u in %s when fetching slice",
					 chunksize,
					 (int) (attrsize - residx * TOAST_MAX_CHUNK_SIZE),
					 residx,
					 toast_pointer.va_valueid,
					 RelationGetRelationName(toastrel));
		}
		else
			elog(ERROR, "unexpected chunk number %d (out of range %d..%d) for toast value %u in %s",
				 residx,
				 0, totalchunks - 1,
				 toast_pointer.va_valueid,
				 RelationGetRelationName(toastrel));

		/*
		 * Copy the data into proper place in our result
		 */
		chcpystrt = 0;
		chcpyend = chunksize - 1;
		if (residx == startchunk)
			chcpystrt = startoffset;
		if (residx == endchunk)
			chcpyend = endoffset;

		memcpy(VARDATA(result) +
			   (residx * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt,
			   chunkdata + chcpystrt,
			   (chcpyend - chcpystrt) + 1);

		nextidx++;
	}

	/*
	 * Final checks that we successfully fetched the datum
	 */
	if (nextidx != (endchunk + 1))
		elog(ERROR, "missing chunk number %d for toast value %u in %s",
			 nextidx,
			 toast_pointer.va_valueid,
			 RelationGetRelationName(toastrel));

	/*
	 * End scan and close relations
	 */
	systable_endscan_ordered(toastscan);
	index_close(toastidx, AccessShareLock);
	heap_close(toastrel, AccessShareLock);

	return result;
}
/* ---------- * toast_fetch_datum - * * Reconstruct an in memory Datum from the chunks saved * in the toast relation * ---------- */ static struct varlena * toast_fetch_datum(struct varlena * attr) { Relation toastrel; Relation toastidx; ScanKeyData toastkey; SysScanDesc toastscan; HeapTuple ttup; TupleDesc toasttupDesc; struct varlena *result; struct varatt_external toast_pointer; int32 ressize; int32 residx, nextidx; int32 numchunks; Pointer chunk; bool isnull; char *chunkdata; int32 chunksize; /* Must copy to access aligned fields */ VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr); ressize = toast_pointer.va_extsize; numchunks = ((ressize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; result = (struct varlena *) palloc(ressize + VARHDRSZ); if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer)) SET_VARSIZE_COMPRESSED(result, ressize + VARHDRSZ); else SET_VARSIZE(result, ressize + VARHDRSZ); /* * Open the toast relation and its index */ toastrel = heap_open(toast_pointer.va_toastrelid, AccessShareLock); toasttupDesc = toastrel->rd_att; toastidx = index_open(toastrel->rd_rel->reltoastidxid, AccessShareLock); /* * Setup a scan key to fetch from the index by va_valueid */ ScanKeyInit(&toastkey, (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(toast_pointer.va_valueid)); /* * Read the chunks by index * * Note that because the index is actually on (valueid, chunkidx) we will * see the chunks in chunkidx order, even though we didn't explicitly ask * for it. 
*/ nextidx = 0; toastscan = systable_beginscan_ordered(toastrel, toastidx, SnapshotToast, 1, &toastkey); while ((ttup = systable_getnext_ordered(toastscan, ForwardScanDirection)) != NULL) { /* * Have a chunk, extract the sequence number and the data */ residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull)); Assert(!isnull); chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull)); Assert(!isnull); if (!VARATT_IS_EXTENDED(chunk)) { chunksize = VARSIZE(chunk) - VARHDRSZ; chunkdata = VARDATA(chunk); } else if (VARATT_IS_SHORT(chunk)) { /* could happen due to heap_form_tuple doing its thing */ chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; chunkdata = VARDATA_SHORT(chunk); } else { /* should never happen */ elog(ERROR, "found toasted toast chunk for toast value %u in %s", toast_pointer.va_valueid, RelationGetRelationName(toastrel)); chunksize = 0; /* keep compiler quiet */ chunkdata = NULL; } /* * Some checks on the data we've found */ if (residx != nextidx) elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u in %s", residx, nextidx, toast_pointer.va_valueid, RelationGetRelationName(toastrel)); if (residx < numchunks - 1) { if (chunksize != TOAST_MAX_CHUNK_SIZE) elog(ERROR, "unexpected chunk size %d (expected %d) in chunk %d of %d for toast value %u in %s", chunksize, (int) TOAST_MAX_CHUNK_SIZE, residx, numchunks, toast_pointer.va_valueid, RelationGetRelationName(toastrel)); } else if (residx == numchunks - 1) { if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != ressize) elog(ERROR, "unexpected chunk size %d (expected %d) in final chunk %d for toast value %u in %s", chunksize, (int) (ressize - residx * TOAST_MAX_CHUNK_SIZE), residx, toast_pointer.va_valueid, RelationGetRelationName(toastrel)); } else elog(ERROR, "unexpected chunk number %d (out of range %d..%d) for toast value %u in %s", residx, 0, numchunks - 1, toast_pointer.va_valueid, RelationGetRelationName(toastrel)); /* * Copy the data into proper place in our 
result */ memcpy(VARDATA(result) + residx * TOAST_MAX_CHUNK_SIZE, chunkdata, chunksize); nextidx++; } /* * Final checks that we successfully fetched the datum */ if (nextidx != numchunks) elog(ERROR, "missing chunk number %d for toast value %u in %s", nextidx, toast_pointer.va_valueid, RelationGetRelationName(toastrel)); /* * End scan and close relations */ systable_endscan_ordered(toastscan); index_close(toastidx, AccessShareLock); heap_close(toastrel, AccessShareLock); return result; }
/*
 * CloneForeignKeyConstraints
 *		Clone foreign keys from a partitioned table to a newly acquired
 *		partition.
 *
 * relationId is a partition of parentId, so we can be certain that it has the
 * same columns with the same datatypes.  The columns may be in different
 * order, though.
 *
 * parentId	  - the partitioned table whose foreign keys are cloned; the
 *				caller is expected to hold a lock on it already
 * relationId - the partition that receives the clones
 * cloned	  - if not NULL, *cloned is appended ClonedConstraint elements
 *				describing what was created
 *
 * If the partition is itself partitioned, the function recurses into each of
 * its partitions.  Raises ERROR if a pg_constraint array column is null or
 * malformed.
 */
void
CloneForeignKeyConstraints(Oid parentId, Oid relationId, List **cloned)
{
	Relation	pg_constraint;
	Relation	parentRel;
	Relation	rel;
	ScanKeyData key;
	SysScanDesc scan;
	TupleDesc	tupdesc;
	HeapTuple	tuple;
	AttrNumber *attmap;

	parentRel = heap_open(parentId, NoLock);	/* already got lock */
	/* see ATAddForeignKeyConstraint about lock level */
	rel = heap_open(relationId, AccessExclusiveLock);

	pg_constraint = heap_open(ConstraintRelationId, RowShareLock);
	tupdesc = RelationGetDescr(pg_constraint);

	/*
	 * The constraint key may differ, if the columns in the partition are
	 * different.  This map is used to convert them.
	 */
	attmap = convert_tuples_by_name_map(RelationGetDescr(rel),
										RelationGetDescr(parentRel),
										gettext_noop("could not convert row type"));

	/* Scan every constraint that belongs to the parent table. */
	ScanKeyInit(&key,
				Anum_pg_constraint_conrelid, BTEqualStrategyNumber,
				F_OIDEQ, ObjectIdGetDatum(parentId));
	scan = systable_beginscan(pg_constraint, ConstraintRelidIndexId, true,
							  NULL, 1, &key);

	while ((tuple = systable_getnext(scan)) != NULL)
	{
		Form_pg_constraint constrForm = (Form_pg_constraint) GETSTRUCT(tuple);
		AttrNumber	conkey[INDEX_MAX_KEYS];
		AttrNumber	mapped_conkey[INDEX_MAX_KEYS];
		AttrNumber	confkey[INDEX_MAX_KEYS];
		Oid			conpfeqop[INDEX_MAX_KEYS];
		Oid			conppeqop[INDEX_MAX_KEYS];
		Oid			conffeqop[INDEX_MAX_KEYS];
		Constraint *fkconstraint;
		ClonedConstraint *newc;
		Oid			constrOid;
		ObjectAddress parentAddr,
					childAddr;
		int			nelem;
		int			i;
		ArrayType  *arr;
		Datum		datum;
		bool		isnull;

		/* only foreign keys */
		if (constrForm->contype != CONSTRAINT_FOREIGN)
			continue;

		ObjectAddressSet(parentAddr, ConstraintRelationId,
						 HeapTupleGetOid(tuple));

		/* referencing columns, translated to the partition's attnums */
		datum = fastgetattr(tuple, Anum_pg_constraint_conkey,
							tupdesc, &isnull);
		if (isnull)
			elog(ERROR, "null conkey");
		arr = DatumGetArrayTypeP(datum);
		nelem = ARR_DIMS(arr)[0];
		if (ARR_NDIM(arr) != 1 ||
			nelem < 1 ||
			nelem > INDEX_MAX_KEYS ||
			ARR_HASNULL(arr) ||
			ARR_ELEMTYPE(arr) != INT2OID)
			elog(ERROR, "conkey is not a 1-D smallint array");
		memcpy(conkey, ARR_DATA_PTR(arr), nelem * sizeof(AttrNumber));

		for (i = 0; i < nelem; i++)
			mapped_conkey[i] = attmap[conkey[i] - 1];

		/* referenced columns */
		datum = fastgetattr(tuple, Anum_pg_constraint_confkey,
							tupdesc, &isnull);
		if (isnull)
			elog(ERROR, "null confkey");
		arr = DatumGetArrayTypeP(datum);
		nelem = ARR_DIMS(arr)[0];
		if (ARR_NDIM(arr) != 1 ||
			nelem < 1 ||
			nelem > INDEX_MAX_KEYS ||
			ARR_HASNULL(arr) ||
			ARR_ELEMTYPE(arr) != INT2OID)
			elog(ERROR, "confkey is not a 1-D smallint array");
		memcpy(confkey, ARR_DATA_PTR(arr), nelem * sizeof(AttrNumber));

		/*
		 * conpfeqop operators.  (This column used to be fetched twice in a
		 * row; once is enough.)
		 */
		datum = fastgetattr(tuple, Anum_pg_constraint_conpfeqop,
							tupdesc, &isnull);
		if (isnull)
			elog(ERROR, "null conpfeqop");
		arr = DatumGetArrayTypeP(datum);
		nelem = ARR_DIMS(arr)[0];
		if (ARR_NDIM(arr) != 1 ||
			nelem < 1 ||
			nelem > INDEX_MAX_KEYS ||
			ARR_HASNULL(arr) ||
			ARR_ELEMTYPE(arr) != OIDOID)
			elog(ERROR, "conpfeqop is not a 1-D OID array");
		memcpy(conpfeqop, ARR_DATA_PTR(arr), nelem * sizeof(Oid));

		/* conppeqop operators */
		datum = fastgetattr(tuple, Anum_pg_constraint_conppeqop,
							tupdesc, &isnull);
		if (isnull)
			elog(ERROR, "null conppeqop");
		arr = DatumGetArrayTypeP(datum);
		nelem = ARR_DIMS(arr)[0];
		if (ARR_NDIM(arr) != 1 ||
			nelem < 1 ||
			nelem > INDEX_MAX_KEYS ||
			ARR_HASNULL(arr) ||
			ARR_ELEMTYPE(arr) != OIDOID)
			elog(ERROR, "conppeqop is not a 1-D OID array");
		memcpy(conppeqop, ARR_DATA_PTR(arr), nelem * sizeof(Oid));

		/* conffeqop operators */
		datum = fastgetattr(tuple, Anum_pg_constraint_conffeqop,
							tupdesc, &isnull);
		if (isnull)
			elog(ERROR, "null conffeqop");
		arr = DatumGetArrayTypeP(datum);
		nelem = ARR_DIMS(arr)[0];
		if (ARR_NDIM(arr) != 1 ||
			nelem < 1 ||
			nelem > INDEX_MAX_KEYS ||
			ARR_HASNULL(arr) ||
			ARR_ELEMTYPE(arr) != OIDOID)
			elog(ERROR, "conffeqop is not a 1-D OID array");
		memcpy(conffeqop, ARR_DATA_PTR(arr), nelem * sizeof(Oid));

		/* Create the partition's copy, recording the parent constraint. */
		constrOid =
			CreateConstraintEntry(NameStr(constrForm->conname),
								  constrForm->connamespace,
								  CONSTRAINT_FOREIGN,
								  constrForm->condeferrable,
								  constrForm->condeferred,
								  constrForm->convalidated,
								  HeapTupleGetOid(tuple),
								  relationId,
								  mapped_conkey,
								  nelem,
								  nelem,
								  InvalidOid,	/* not a domain constraint */
								  constrForm->conindid, /* same index */
								  constrForm->confrelid,	/* same foreign rel */
								  confkey,
								  conpfeqop,
								  conppeqop,
								  conffeqop,
								  nelem,
								  constrForm->confupdtype,
								  constrForm->confdeltype,
								  constrForm->confmatchtype,
								  NULL,
								  NULL,
								  NULL,
								  NULL,
								  false,
								  1, false, true);

		/* Make the clone depend on the constraint it was cloned from. */
		ObjectAddressSet(childAddr, ConstraintRelationId, constrOid);
		recordDependencyOn(&childAddr, &parentAddr, DEPENDENCY_INTERNAL_AUTO);

		fkconstraint = makeNode(Constraint);
		/* for now this is all we need */
		fkconstraint->fk_upd_action = constrForm->confupdtype;
		fkconstraint->fk_del_action = constrForm->confdeltype;
		fkconstraint->deferrable = constrForm->condeferrable;
		fkconstraint->initdeferred = constrForm->condeferred;

		createForeignKeyTriggers(rel, constrForm->confrelid, fkconstraint,
								 constrOid, constrForm->conindid, false);

		if (cloned)
		{
			/*
			 * Feed back caller about the constraints we created, so that they
			 * can set up constraint verification.
			 */
			newc = palloc(sizeof(ClonedConstraint));
			newc->relid = relationId;
			newc->refrelid = constrForm->confrelid;
			newc->conindid = constrForm->conindid;
			newc->conid = constrOid;
			newc->constraint = fkconstraint;

			*cloned = lappend(*cloned, newc);
		}
	}
	systable_endscan(scan);

	pfree(attmap);

	/* A partitioned partition passes the constraints on to its children. */
	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
	{
		PartitionDesc partdesc = RelationGetPartitionDesc(rel);
		int			i;

		for (i = 0; i < partdesc->nparts; i++)
			CloneForeignKeyConstraints(RelationGetRelid(rel),
									   partdesc->oids[i],
									   cloned);
	}

	heap_close(rel, NoLock);	/* keep lock till commit */
	heap_close(parentRel, NoLock);
	heap_close(pg_constraint, RowShareLock);
}
/*
 * Translate a relation's (tablespace, filenode) pair into the relation's oid,
 * remembering the answer in the relfilenode map cache.
 *
 * InvalidOid is returned (and cached) when no relation matches.
 */
Oid
RelidByRelfilenode(Oid reltablespace, Oid relfilenode)
{
	RelfilenodeMapKey lookup;
	RelfilenodeMapEntry *cached;
	bool		hit;
	Oid			result = InvalidOid;	/* negative answer until proven otherwise */

	if (RelfilenodeMapHash == NULL)
		InitializeRelfilenodeMap();

	/* pg_class stores 0 for relations living in MyDatabaseTableSpace */
	if (reltablespace == MyDatabaseTableSpace)
		reltablespace = 0;

	MemSet(&lookup, 0, sizeof(lookup));
	lookup.reltablespace = reltablespace;
	lookup.relfilenode = relfilenode;

	/*
	 * Fast path: answer from the cache when we can.  Negative matches are
	 * cached as well, so a hit may legitimately yield InvalidOid.  Lookups
	 * of invalid values are not expected to be frequent, but caching them is
	 * basically free.
	 */
	cached = hash_search(RelfilenodeMapHash, (void *) &lookup, HASH_FIND, &hit);
	if (hit)
		return cached->relid;

	/* Cache miss: do it the hard way. */
	if (reltablespace != GLOBALTABLESPACE_OID)
	{
		/*
		 * Not a shared table: either a plain relation found through
		 * pg_class, or a non-shared nailed one (e.g. pg_class itself) that
		 * only the relmapper knows about.
		 */
		Relation	pg_class_rel;
		SysScanDesc scan;
		HeapTuple	classtup;
		ScanKeyData keys[2];
		bool		matched = false;

		pg_class_rel = heap_open(RelationRelationId, AccessShareLock);

		/* work on a private copy of the scankey, the scan modifies it */
		memcpy(keys, relfilenode_skey, sizeof(keys));
		keys[0].sk_argument = ObjectIdGetDatum(reltablespace);
		keys[1].sk_argument = ObjectIdGetDatum(relfilenode);

		scan = systable_beginscan(pg_class_rel,
								  ClassTblspcRelfilenodeIndexId,
								  true,
								  NULL,
								  2,
								  keys);

		while (HeapTupleIsValid(classtup = systable_getnext(scan)))
		{
			/* a second match means the catalog is inconsistent */
			if (matched)
				elog(ERROR,
					 "unexpected duplicate for tablespace %u, relfilenode %u",
					 reltablespace, relfilenode);
			matched = true;

#ifdef USE_ASSERT_CHECKING
			if (assert_enabled)
			{
				bool		isnull;
				Oid			check;

				check = fastgetattr(classtup, Anum_pg_class_reltablespace,
									RelationGetDescr(pg_class_rel),
									&isnull);
				Assert(!isnull && check == reltablespace);

				check = fastgetattr(classtup, Anum_pg_class_relfilenode,
									RelationGetDescr(pg_class_rel),
									&isnull);
				Assert(!isnull && check == relfilenode);
			}
#endif
			result = HeapTupleGetOid(classtup);
		}

		systable_endscan(scan);
		heap_close(pg_class_rel, AccessShareLock);

		/* nothing in pg_class: maybe it is mapped but not shared */
		if (!matched)
			result = RelationMapFilenodeToOid(relfilenode, false);
	}
	else
	{
		/* Shared table: the relmapper has the answer. */
		result = RelationMapFilenodeToOid(relfilenode, true);
	}

	/*
	 * The cache entry is created only at this point on purpose: opening
	 * pg_class may have run cache invalidations, and those would have
	 * removed an entry made before the lookup.
	 */
	cached = hash_search(RelfilenodeMapHash, (void *) &lookup, HASH_ENTER, &hit);
	if (hit)
		elog(ERROR, "corrupted hashtable");
	cached->relid = result;

	return result;
}
/* ----------
 * toast_fetch_datum_slice -
 *
 *	Reconstruct a segment of a Datum from the chunks saved
 *	in the toast relation
 *
 *	attr		- external toast pointer datum
 *	sliceoffset - byte offset of the first byte wanted; an offset at or past
 *				  the end of the value yields an empty result
 *	length		- number of bytes wanted; a negative value, or one reaching
 *				  past the end of the value, means "through the end"
 *
 *	Returns a freshly palloc'd varlena holding the requested bytes.
 *	Raises ERROR if the stored chunks are missing, out of sequence, or of
 *	unexpected size.
 * ----------
 */
static struct varlena *
toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, int32 length)
{
	Relation	toastrel;
	Relation	toastidx;
	ScanKeyData toastkey[3];
	int			nscankeys;
	IndexScanDesc toastscan;
	HeapTuple	ttup;
	TupleDesc	toasttupDesc;
	varattrib  *result;
	int32		attrsize;
	int32		residx;
	int32		nextidx;
	int			numchunks;
	int			startchunk;
	int			endchunk;
	int32		startoffset;
	int32		endoffset;
	int			totalchunks;
	Pointer		chunk;
	bool		isnull;
	int32		chunksize;
	char	   *chunkdata;
	int32		chcpystrt;
	int32		chcpyend;

	attrsize = ((varattrib *)attr)->va_external.va_extsize;
	totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;

	if (sliceoffset >= attrsize)
	{
		sliceoffset = 0;
		length = 0;
	}

	/*
	 * Clamp the request to the end of the value.  Comparing against
	 * "attrsize - sliceoffset" instead of computing "sliceoffset + length"
	 * avoids an int32 overflow for very large lengths, which would skip the
	 * clamp and leave a bogus allocation size.  (Assumes sliceoffset is
	 * non-negative; that is the callers' responsibility and is not checked
	 * here.)
	 */
	if (length < 0 || length > attrsize - sliceoffset)
		length = attrsize - sliceoffset;

	result = (varattrib *) palloc(length + VARHDRSZ);
	SET_VARSIZE(result, length + VARHDRSZ);

	if (VARATT_EXTERNAL_IS_COMPRESSED(attr))
		VARATT_SET_COMPRESSED(result);

	if (length == 0)
		return (struct varlena *)result;	/* Can save a lot of work at this point! */

	startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE;
	endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE;
	numchunks = (endchunk - startchunk) + 1;

	startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE;
	endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE;

	/*
	 * Open the toast relation and its index
	 */
	toastrel = heap_open(((varattrib *)attr)->va_external.va_toastrelid, AccessShareLock);
	toasttupDesc = toastrel->rd_att;
	toastidx = index_open(toastrel->rd_rel->reltoastidxid, AccessShareLock);

	/*
	 * Setup a scan key to fetch from the index. This is either two keys or
	 * three depending on the number of chunks.
	 */
	ScanKeyInit(&toastkey[0],
				(AttrNumber) 1,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(((varattrib *)attr)->va_external.va_valueid));

	/*
	 * Use equality condition for one chunk, a range condition otherwise:
	 */
	if (numchunks == 1)
	{
		ScanKeyInit(&toastkey[1],
					(AttrNumber) 2,
					BTEqualStrategyNumber, F_INT4EQ,
					Int32GetDatum(startchunk));
		nscankeys = 2;
	}
	else
	{
		ScanKeyInit(&toastkey[1],
					(AttrNumber) 2,
					BTGreaterEqualStrategyNumber, F_INT4GE,
					Int32GetDatum(startchunk));
		ScanKeyInit(&toastkey[2],
					(AttrNumber) 2,
					BTLessEqualStrategyNumber, F_INT4LE,
					Int32GetDatum(endchunk));
		nscankeys = 3;
	}

	/*
	 * Read the chunks by index
	 *
	 * The index is on (valueid, chunkidx) so they will come in order
	 */
	nextidx = startchunk;

	toastscan = index_beginscan(toastrel, toastidx,
								SnapshotToast, nscankeys, toastkey);
	while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL)
	{
		/*
		 * Have a chunk, extract the sequence number and the data
		 */
		residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull));
		Assert(!isnull);
		chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull));
		Assert(!isnull);

		/*
		 * Work out both the payload size and where the payload starts.  A
		 * short-header chunk keeps its data right after the short header, so
		 * reading it through VARDATA() would start at the wrong byte.
		 */
		if (VARATT_IS_SHORT((varattrib *)chunk))
		{
			chunksize = VARSIZE_SHORT((varattrib *)chunk) - VARHDRSZ_SHORT;
			chunkdata = (char *) VARDATA_SHORT((varattrib *)chunk);
		}
		else if (!VARATT_IS_EXTENDED((varattrib *)chunk))
		{
			chunksize = VARSIZE((varattrib *)chunk) - VARHDRSZ;
			chunkdata = (char *) VARDATA((varattrib *)chunk);
		}
		else
		{
			elog(ERROR, "found toasted toast chunk?");
			chunksize = 0;		/* shut compiler up */
			chunkdata = NULL;
		}

		/*
		 * Some checks on the data we've found
		 */
		if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk))
			elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u",
				 residx, nextidx,
				 ((varattrib *)attr)->va_external.va_valueid);
		if (residx < totalchunks - 1)
		{
			/* every chunk but the last one must be completely full */
			if (chunksize != TOAST_MAX_CHUNK_SIZE)
				elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u of %d when fetching slice (expected %d)",
					 chunksize, residx,
					 ((varattrib *)attr)->va_external.va_valueid,
					 totalchunks-1,
					 (int)TOAST_MAX_CHUNK_SIZE);
		}
		else if (residx == totalchunks-1)
		{
			/* the last chunk must hold exactly the remainder of the value */
			if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize)
				elog(ERROR, "unexpected chunk size %d in chunk %d for final toast value %u when fetching slice (expected %d)",
					 chunksize, residx,
					 ((varattrib *)attr)->va_external.va_valueid,
					 attrsize - residx * (int)TOAST_MAX_CHUNK_SIZE);
		}
		else
		{
			elog(ERROR, "unexpected chunk");
		}

		/*
		 * Copy the data into proper place in our result
		 */
		chcpystrt = 0;
		chcpyend = chunksize - 1;
		if (residx == startchunk)
			chcpystrt = startoffset;
		if (residx == endchunk)
			chcpyend = endoffset;

		memcpy(((char *) VARDATA(result)) +
			   (residx * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt,
			   chunkdata + chcpystrt,
			   (chcpyend - chcpystrt) + 1);

		nextidx++;
	}

	/*
	 * Final checks that we successfully fetched the datum
	 */
	if (nextidx != (endchunk + 1))
		elog(ERROR, "missing chunk number %d for toast value %u",
			 nextidx,
			 ((varattrib *)attr)->va_external.va_valueid);

	/*
	 * End scan and close relations
	 */
	index_endscan(toastscan);
	index_close(toastidx, AccessShareLock);
	heap_close(toastrel, AccessShareLock);

	return (struct varlena *)result;
}
/* ---------- * toast_fetch_datum - * * Reconstruct an in memory Datum from the chunks saved * in the toast relation * ---------- */ static struct varlena * toast_fetch_datum(struct varlena *attr) { Relation toastrel; Relation toastidx; ScanKeyData toastkey; IndexScanDesc toastscan; HeapTuple ttup; TupleDesc toasttupDesc; varattrib *result; int32 ressize; int32 residx, nextidx; int32 numchunks; Pointer chunk; bool isnull; int32 chunksize; void *chunkdata; ressize = ((varattrib *)attr)->va_external.va_extsize; numchunks = ((ressize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; result = (varattrib *) palloc(ressize + VARHDRSZ); SET_VARSIZE(result, ressize + VARHDRSZ); if (VARATT_EXTERNAL_IS_COMPRESSED(attr)) VARATT_SET_COMPRESSED(result); /* * Open the toast relation and its index */ toastrel = heap_open(((varattrib *)attr)->va_external.va_toastrelid, AccessShareLock); toasttupDesc = toastrel->rd_att; toastidx = index_open(toastrel->rd_rel->reltoastidxid, AccessShareLock); /* * Setup a scan key to fetch from the index by va_valueid */ ScanKeyInit(&toastkey, (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(((varattrib *)attr)->va_external.va_valueid)); /* * Read the chunks by index * * Note that because the index is actually on (valueid, chunkidx) we will * see the chunks in chunkidx order, even though we didn't explicitly ask * for it. 
*/ nextidx = 0; toastscan = index_beginscan(toastrel, toastidx, SnapshotToast, 1, &toastkey); while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL) { /* * Have a chunk, extract the sequence number and the data */ residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull)); Assert(!isnull); chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull)); Assert(!isnull); if (VARATT_IS_SHORT(chunk)) { chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; chunkdata = VARDATA_SHORT(chunk); } else if (!VARATT_IS_EXTENDED(chunk)) { chunksize = VARSIZE(chunk) - VARHDRSZ; chunkdata = VARDATA(chunk); } else { elog(ERROR, "found toasted toast chunk?"); chunksize = 0; /* shut compiler up */ chunkdata = NULL; } /* * Some checks on the data we've found */ if (residx != nextidx) elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u", residx, nextidx, ((varattrib *)attr)->va_external.va_valueid); if (residx < numchunks - 1) { if (chunksize != TOAST_MAX_CHUNK_SIZE) elog(ERROR, "unexpected chunk size %d in chunk %d of %d for toast value %u (expected %d)", chunksize, residx, ((varattrib *)attr)->va_external.va_valueid, numchunks-1, (int)TOAST_MAX_CHUNK_SIZE); } else if (residx == numchunks-1) { if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != ressize) elog(ERROR, "unexpected chunk size %d in final chunk %d for toast value %u (expected %d)", chunksize, residx, ((varattrib *)attr)->va_external.va_valueid, ressize - residx*(int)TOAST_MAX_CHUNK_SIZE); } else elog(ERROR, "unexpected chunk number %d for toast value %u (expected in %d..%d)", residx, ((varattrib *)attr)->va_external.va_valueid, 0, numchunks-1); /* * Copy the data into proper place in our result */ memcpy(((char *) VARDATA(result)) + residx * TOAST_MAX_CHUNK_SIZE, chunkdata, chunksize); nextidx++; } /* * Final checks that we successfully fetched the datum */ if (nextidx != numchunks) elog(ERROR, "missing chunk number %d for toast value %u", nextidx, ((varattrib 
*)attr)->va_external.va_valueid); /* * End scan and close relations */ index_endscan(toastscan); index_close(toastidx, AccessShareLock); heap_close(toastrel, AccessShareLock); return (struct varlena *)result; }
/*
 * GetParquetFileSegInfo
 *
 * Get the catalog entry for one segment file of a parquet relation from the
 * segment-file catalog relation (aoEntry->segrelid) that belongs to it.
 *
 * If a caller intends to append to this file segment entry they must
 * already hold a relation Append-Only segment file (transaction-scope) lock
 * (tag LOCKTAG_RELATION_APPENDONLY_SEGMENT_FILE) in order to guarantee
 * stability of the catalog information on this segment file and exclusive
 * right to append data to the segment file.
 *
 * parentrel				- the parquet table; only used for error text
 * aoEntry					- supplies the catalog relation and index oids
 * parquetMetaDataSnapshot	- currently unused, see the note at the scan
 * segno					- segment file number to look up
 *
 * Returns a palloc'd ParquetFileSegInfo, or NULL when the segment file has
 * no catalog row.  Raises ERROR when eof or tupcount is NULL, or when eof is
 * negative.
 */
ParquetFileSegInfo *
GetParquetFileSegInfo(Relation parentrel, AppendOnlyEntry *aoEntry,
					  Snapshot parquetMetaDataSnapshot, int segno)
{
	Relation	pg_parquetseg_rel;
	TupleDesc	pg_parquetseg_dsc;
	HeapTuple	tuple;
	ScanKeyData key[1];
	SysScanDesc parquetscan;
	Datum		eof,
				eof_uncompressed,
				tupcount;
	bool		isNull;
	bool		indexOK;
	Oid			indexid;
	ParquetFileSegInfo *fsinfo;

	/*
	 * Check the pg_paqseg relation to be certain the parquet table segment
	 * file is there.
	 */
	pg_parquetseg_rel = heap_open(aoEntry->segrelid, AccessShareLock);
	pg_parquetseg_dsc = RelationGetDescr(pg_parquetseg_rel);

	/* In the executor role the catalog is scanned without its index. */
	if (Gp_role == GP_ROLE_EXECUTE)
	{
		indexOK = FALSE;
		indexid = InvalidOid;
	}
	else
	{
		indexOK = TRUE;
		indexid = aoEntry->segidxid;
	}

	/*
	 * Setup a scan key to fetch from the index by segno.
	 */
	ScanKeyInit(&key[0], (AttrNumber) Anum_pg_parquetseg_segno,
				BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(segno));

	/*
	 * NOTE(review): the scan uses SnapshotNow and ignores the
	 * parquetMetaDataSnapshot argument.  Whether that is intentional cannot
	 * be told from this function alone -- confirm before changing it.
	 */
	parquetscan = systable_beginscan(pg_parquetseg_rel, indexid, indexOK,
									 SnapshotNow, 1, &key[0]);

	tuple = systable_getnext(parquetscan);

	if (!HeapTupleIsValid(tuple))
	{
		/* This segment file does not have an entry. */
		systable_endscan(parquetscan);
		heap_close(pg_parquetseg_rel, AccessShareLock);
		return NULL;
	}

	/* Take a private copy so the scan can be ended right away. */
	tuple = heap_copytuple(tuple);

	systable_endscan(parquetscan);

	Assert(HeapTupleIsValid(tuple));

	fsinfo = (ParquetFileSegInfo *) palloc0(sizeof(ParquetFileSegInfo));

	/* get the eof */
	eof = fastgetattr(tuple, Anum_pg_parquetseg_eof, pg_parquetseg_dsc,
					  &isNull);

	if (isNull)
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_OBJECT),
				 errmsg("got invalid eof value: NULL")));

	/* get the tupcount */
	tupcount = fastgetattr(tuple, Anum_pg_parquetseg_tupcount,
						   pg_parquetseg_dsc, &isNull);

	if (isNull)
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_OBJECT),
				 errmsg("got invalid tupcount value: NULL")));

	/* get the uncompressed eof */
	eof_uncompressed = fastgetattr(tuple, Anum_pg_parquetseg_eofuncompressed,
								   pg_parquetseg_dsc, &isNull);

	/*
	 * Note that the local Datum "eof_uncompressed" is distinct from the
	 * field fsinfo->eof_uncompressed; the Datum is only converted when the
	 * column is not NULL.
	 */
	if (isNull)
	{
		/*
		 * NULL is allowed. Tables that were created before the release of the
		 * eof_uncompressed catalog column will have a NULL instead of a value.
		 */
		fsinfo->eof_uncompressed = InvalidUncompressedEof;
	}
	else
	{
		fsinfo->eof_uncompressed = (int64) DatumGetFloat8(eof_uncompressed);
	}

	fsinfo->segno = segno;
	fsinfo->eof = (int64) DatumGetFloat8(eof);
	fsinfo->tupcount = (int64) DatumGetFloat8(tupcount);

	ItemPointerSetInvalid(&fsinfo->sequence_tid);

	/*
	 * Every Datum has been converted to a plain value by now, so nothing
	 * points into the copied tuple any more; release it instead of leaking
	 * it into the caller's memory context.
	 */
	heap_freetuple(tuple);

	if (fsinfo->eof < 0)
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR),
				 errmsg("invalid eof " INT64_FORMAT " for relation %s",
						fsinfo->eof, RelationGetRelationName(parentrel))));

	/* Finish up scan and close appendonly catalog. */
	heap_close(pg_parquetseg_rel, AccessShareLock);

	return fsinfo;
}
/*
 * UpdateParquetFileSegInfo
 *
 * Rewrite the catalog row for segment file 'segno' of a parquet table:
 * eof and eof_uncompressed are replaced with the supplied values, and
 * tuples_added is added to the stored tuple count.
 *
 * Must not be called in the executor role.  Outside the dispatch role the
 * caller must already hold the transaction-scope write lock on the segment
 * file; otherwise ERROR is raised.  ERROR is also raised when there is no
 * catalog row for 'segno' or when its tuple count column is NULL.
 */
void
UpdateParquetFileSegInfo(Relation parentrel, AppendOnlyEntry *aoEntry,
		int segno, int64 eof, int64 eof_uncompressed, int64 tuples_added)
{
	const int	eofCol = Anum_pg_parquetseg_eof - 1;
	const int	tupcountCol = Anum_pg_parquetseg_tupcount - 1;
	const int	eofUncompressedCol = Anum_pg_parquetseg_eofuncompressed - 1;

	Relation	segrel;
	TupleDesc	segdesc;
	ScanKeyData segnoKey[1];
	SysScanDesc segscan;
	HeapTuple	oldtup;
	HeapTuple	newtup;
	Datum		oldTupcount;
	Datum		newTupcount;
	Datum	   *values;
	bool	   *nulls;
	bool	   *replace;
	bool		tupcountIsNull;

	/*
	 * Overflow sanity check.  tuples_added is deliberately not checked: it
	 * may arrive as a negative diff from gp_update_ao_master_stats.
	 */
	Assert(eof >= 0);

	Insist(Gp_role != GP_ROLE_EXECUTE);

	elog(DEBUG3, "UpdateParquetFileSegInfo called. segno = %d", segno);

	/*
	 * Verify we already have the write-lock: re-requesting it must report
	 * that it is already held.
	 */
	if (Gp_role != GP_ROLE_DISPATCH &&
		LockRelationAppendOnlySegmentFile(&parentrel->rd_node,
										  segno,
										  AccessExclusiveLock,
										  /* dontWait */ false) != LOCKACQUIRE_ALREADY_HELD)
	{
		elog(ERROR, "Should already have the (transaction-scope) write-lock on Parquet segment file #%d, "
			 "relation %s", segno, RelationGetRelationName(parentrel));
	}

	/* Open the segment catalog relation for update. */
	segrel = heap_open(aoEntry->segrelid, RowExclusiveLock);
	segdesc = segrel->rd_att;

	/* Locate the row for this segno through the segno index. */
	ScanKeyInit(&segnoKey[0],
				(AttrNumber) Anum_pg_parquetseg_segno,
				BTEqualStrategyNumber,
				F_INT4EQ,
				Int32GetDatum(segno));

	segscan = systable_beginscan(segrel, aoEntry->segidxid, TRUE,
								 SnapshotNow, 1, &segnoKey[0]);

	oldtup = systable_getnext(segscan);
	if (!HeapTupleIsValid(oldtup))
	{
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_OBJECT),
				 errmsg("parquet table \"%s\" file segment \"%d\" entry "
						"does not exist",
						RelationGetRelationName(parentrel), segno)));
	}

	/* Zeroed arrays: no column is replaced or set NULL unless marked below. */
	values = palloc0(sizeof(Datum) * segdesc->natts);
	nulls = palloc0(sizeof(bool) * segdesc->natts);
	replace = palloc0(sizeof(bool) * segdesc->natts);

	/* Read the stored tuple count so the delta can be added to it. */
	oldTupcount = fastgetattr(oldtup, Anum_pg_parquetseg_tupcount,
							  segdesc, &tupcountIsNull);
	if (tupcountIsNull)
	{
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_OBJECT),
				 errmsg("got invalid pg_aoseg filetupcount value: NULL")));
	}

	newTupcount = DirectFunctionCall2(float8pl,
									  oldTupcount,
									  Float8GetDatum((float8)tuples_added));

	/* Fill in the three replaced columns. */
	values[eofCol] = Float8GetDatum((float8)eof);
	replace[eofCol] = true;

	values[tupcountCol] = newTupcount;
	replace[tupcountCol] = true;

	values[eofUncompressedCol] = Float8GetDatum((float8)eof_uncompressed);
	replace[eofUncompressedCol] = true;

	/* Form the new row version, store it, and update the indexes. */
	newtup = heap_modify_tuple(oldtup, segdesc, values, nulls, replace);

	simple_heap_update(segrel, &oldtup->t_self, newtup);

	CatalogUpdateIndexes(segrel, newtup);

	heap_freetuple(newtup);

	/* Finish up scan and release everything acquired above. */
	systable_endscan(segscan);
	heap_close(segrel, RowExclusiveLock);

	pfree(values);
	pfree(nulls);
	pfree(replace);
}
/*
 *		CatalogCacheFlushRelation
 *
 *	Drop every catalog cache entry that pertains to relation 'relId'.
 *
 *	RelationFlushRelation() calls this when a relation is going away: a
 *	DROP TABLE, a temp table dropped at end of transaction, or a table
 *	created in the current transaction that is dropped because of abort.
 *
 *	relId may also name one of the cacheable system tables.  Those are never
 *	dropped, but they can be flushed from the relcache (VACUUM does this,
 *	for example), and then every entry that came from that table has to go.
 *	Forcing CatalogCacheInitializeCache to run again for such caches was
 *	tried once and abandoned, because a flush arriving while cache entries
 *	are being loaded causes all kinds of trouble.  Instead cc_tupdesc is
 *	copied out of the relcache, so the relcache need not keep a tupdesc for
 *	us; this assumes the tupdesc of a cacheable system table never changes.
 */
void
CatalogCacheFlushRelation(Oid relId)
{
	CatCache   *cc;

	CACHE2_elog(DEBUG2, "CatalogCacheFlushRelation called for %u", relId);

	for (cc = CacheHdr->ch_caches; cc != NULL; cc = cc->cc_next)
	{
		int			bucketno;

		/* A cache that was never initialized must be empty: nothing to do */
		if (cc->cc_tupdesc == NULL)
			continue;

		/* Cache built over the target relation itself: discard all of it */
		if (cc->cc_tupdesc->attrs[0]->attrelid == relId)
		{
			ResetCatalogCache(cc);
			continue;
		}

		/* Cache whose tuples are not tied to any relation: leave it alone */
		if (cc->cc_reloidattr == 0)
			continue;

		/* Otherwise inspect each entry and drop the ones that match relId */
		for (bucketno = 0; bucketno < cc->cc_nbuckets; bucketno++)
		{
			Dlelem	   *cur = DLGetHead(&cc->cc_bucket[bucketno]);

			while (cur != NULL)
			{
				CatCTup    *entry = (CatCTup *) DLE_VAL(cur);
				Oid			entryRelid;

				/* Step to the successor now; the entry may be removed below */
				cur = DLGetSucc(cur);

				/*
				 * A negative entry is never treated as belonging to a rel,
				 * even when the rel appears in its lookup key.
				 */
				if (entry->negative)
					continue;

				if (cc->cc_reloidattr != ObjectIdAttributeNumber)
				{
					bool		attIsNull;

					entryRelid = DatumGetObjectId(fastgetattr(&entry->tuple,
															  cc->cc_reloidattr,
															  cc->cc_tupdesc,
															  &attIsNull));
					Assert(!attIsNull);
				}
				else
					entryRelid = HeapTupleGetOid(&entry->tuple);

				if (entryRelid != relId)
					continue;

				/* Still referenced: only mark it dead; otherwise remove it */
				if (entry->refcount > 0)
					entry->dead = true;
				else
					CatCacheRemoveCTup(cc, entry);
#ifdef CATCACHE_STATS
				cc->cc_invals++;
#endif
			}
		}
	}

	CACHE1_elog(DEBUG2, "end of CatalogCacheFlushRelation call");
}