/* * btinsert() -- insert an index tuple into a btree. * * Descend the tree recursively, find the appropriate location for our * new tuple, and put it there. */ Datum btinsert(PG_FUNCTION_ARGS) { MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_DECLARE; Relation rel = (Relation) PG_GETARG_POINTER(0); Datum *values = (Datum *) PG_GETARG_POINTER(1); bool *isnull = (bool *) PG_GETARG_POINTER(2); ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3); Relation heapRel = (Relation) PG_GETARG_POINTER(4); bool checkUnique = PG_GETARG_BOOL(5); IndexTuple itup; MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_ENTER; if (checkUnique && ( (gp_indexcheck_insert == INDEX_CHECK_ALL && RelationIsHeap(heapRel)) || (gp_indexcheck_insert == INDEX_CHECK_SYSTEM && PG_CATALOG_NAMESPACE == RelationGetNamespace(heapRel)))) { _bt_validate_tid(rel, ht_ctid); } /* generate an index tuple */ itup = index_form_tuple(RelationGetDescr(rel), values, isnull); itup->t_tid = *ht_ctid; _bt_doinsert(rel, itup, checkUnique, heapRel); pfree(itup); MIRROREDLOCK_BUFMGR_VERIFY_NO_LOCK_LEAK_EXIT; PG_RETURN_BOOL(true); }
/** * Given the oid of a relation, this method calculates reltuples, relpages. This only looks up * local information (on master or segments). It produces meaningful values for AO and * heap tables and returns [0.0,0.0] for all other relations. * Input: * relationoid * Output: * array of two values [reltuples,relpages] */ Datum gp_statistics_estimate_reltuples_relpages_oid(PG_FUNCTION_ARGS) { float4 relpages = 0.0; float4 reltuples = 0.0; Oid relOid = PG_GETARG_OID(0); Datum values[2]; ArrayType *result; Relation rel = try_relation_open(relOid, AccessShareLock, false); if (rel != NULL) { if (rel->rd_rel->relkind == RELKIND_RELATION) { if (RelationIsHeap(rel)) { gp_statistics_estimate_reltuples_relpages_heap(rel, &reltuples, &relpages); } else if (RelationIsAoRows(rel)) { gp_statistics_estimate_reltuples_relpages_ao_rows(rel, &reltuples, &relpages); } else if (RelationIsAoCols(rel)) { gp_statistics_estimate_reltuples_relpages_ao_cs(rel, &reltuples, &relpages); } } else if (rel->rd_rel->relkind == RELKIND_INDEX) { reltuples = 1.0; relpages = RelationGetNumberOfBlocks(rel); } else { /** * Should we silently return [0.0,0.0] or error out? Currently, we choose option 1. */ } relation_close(rel, AccessShareLock); } else { /** * Should we silently return [0.0,0.0] or error out? Currently, we choose option 1. */ } values[0] = Float4GetDatum(reltuples); values[1] = Float4GetDatum(relpages); result = construct_array(values, 2, FLOAT4OID, sizeof(float4), true, 'i'); PG_RETURN_ARRAYTYPE_P(result); }
/*
 * calculate size of (one fork of) a relation
 *
 * Stat every segment file belonging to the relation and sum the sizes.
 * The obviously better way is to use glob, but for whatever reason glob
 * is extremely slow if there are lots of relations in the database, so
 * we enumerate the candidate file names ourselves instead.
 *
 * Note: we can safely apply this to temp tables of other sessions, so there
 * is no check here or at the call sites for that.
 */
static int64
calculate_relation_size(Relation rel, ForkNumber forknum)
{
	int64		size = 0;
	char	   *basepath;
	char		segpath[MAXPGPATH];
	unsigned int segno;

	basepath = relpathbackend(rel->rd_node, rel->rd_backend, forknum);

	if (RelationIsHeap(rel))
	{
		/*
		 * Ordinary relation, including heap and index: the files are named
		 * basepath, basepath.1, basepath.2, ...  There are no holes, so we
		 * can stop at the first segment file that does not exist.
		 */
		for (segno = 0;; segno++)
		{
			struct stat fst;

			CHECK_FOR_INTERRUPTS();

			if (segno == 0)
				snprintf(segpath, MAXPGPATH, "%s", basepath);
			else
				snprintf(segpath, MAXPGPATH, "%s.%u", basepath, segno);

			if (stat(segpath, &fst) < 0)
			{
				/* ENOENT means we ran past the last segment file. */
				if (errno == ENOENT)
					break;

				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not stat file %s: %m", segpath)));
			}

			size += fst.st_size;
		}
	}
	else if (forknum == MAIN_FORKNUM)
	{
		/* AO tables don't have any extra forks. */
		if (RelationIsAoRows(rel))
			size = GetAOTotalBytes(rel, GetActiveSnapshot());
		else if (RelationIsAoCols(rel))
			size = GetAOCSTotalBytes(rel, GetActiveSnapshot(), true);
	}

	/* RELSTORAGE_VIRTUAL has no space usage */
	return size;
}
/*
 * IsErrorTable
 *
 * Returns true if relid is used as an error table, which has dependent object
 * that is an external table.  Though it's not great we didn't have a clear
 * definition of Error Table, it satisfies the current requirements.
 */
bool
IsErrorTable(Relation rel)
{
	cqContext  *depScan;
	cqContext  *extScan;
	cqContext	extScanData;
	Relation	extrel;
	HeapTuple	deptup;
	bool		found = false;

	/* fast path: only heap relations can ever be error tables */
	if (!RelationIsHeap(rel))
		return false;

	/*
	 * Otherwise, go deeper and play with pg_depend...
	 */
	depScan = caql_beginscan(
			NULL,
			cql("SELECT * FROM pg_depend "
				" WHERE refclassid = :1 "
				" AND refobjid = :2 "
				" AND refobjsubid = :3 ",
				ObjectIdGetDatum(RelationRelationId),
				ObjectIdGetDatum(RelationGetRelid(rel)),
				Int32GetDatum(0)));

	extrel = relation_open(ExtTableRelationId, AccessShareLock);
	extScan = caql_addrel(cqclr(&extScanData), extrel);

	/*
	 * For each pg_depend entry referencing this relation, look up the
	 * external table's error-table oid and compare it against ours.
	 */
	while (HeapTupleIsValid(deptup = caql_getnext(depScan)))
	{
		Form_pg_depend dep = (Form_pg_depend) GETSTRUCT(deptup);
		Oid			fmterrtbl;

		fmterrtbl = caql_getoid(
				extScan,
				cql("SELECT fmterrtbl FROM pg_exttable "
					" WHERE reloid = :1",
					ObjectIdGetDatum(dep->objid)));

		if (RelationGetRelid(rel) == fmterrtbl)
		{
			found = true;
			break;
		}
	}

	relation_close(extrel, AccessShareLock);
	caql_endscan(depScan);

	return found;
}
/*
 * calculate size of a relation
 *
 * Stat every file that belongs to the relation and sum the sizes.
 * The obviously better way is to use glob, but for whatever reason glob
 * is extremely slow if there are lots of relations in the database, so
 * we enumerate the candidate file names ourselves instead.
 */
int64
calculate_relation_size(Relation rel)
{
	int64		totalsize = 0;
	char	   *basepath;
	char		segpath[MAXPGPATH];
	struct stat fst;
	int			segno;

	basepath = relpath(rel->rd_node);

	if (RelationIsHeap(rel))
	{
		/*
		 * Ordinary relation, including heap and index: the files are named
		 * basepath, or basepath.%d.  There are no holes, so we can stop at
		 * the first file that does not exist.
		 */
		for (segno = 0;; segno++)
		{
			if (segno == 0)
				snprintf(segpath, MAXPGPATH, "%s", basepath);
			else
				snprintf(segpath, MAXPGPATH, "%s.%d", basepath, segno);

			if (stat(segpath, &fst) < 0)
			{
				/* ENOENT means we ran past the last segment file. */
				if (errno == ENOENT)
					break;

				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not stat file %s: %m", segpath)));
			}

			totalsize += fst.st_size;
		}
	}
	else if (RelationIsAoRows(rel))
		totalsize = GetAOTotalBytes(rel, SnapshotNow);
	else if (RelationIsParquet(rel))
		totalsize = GetParquetTotalBytes(rel, SnapshotNow);

	/* RELSTORAGE_VIRTUAL has no space usage */
	return totalsize;
}
/*
 * getTableType
 * 		Return the table type for a given relation.
 */
int
getTableType(Relation rel)
{
	Assert(rel != NULL && rel->rd_rel != NULL);

	if (RelationIsHeap(rel))
		return TableTypeHeap;
	else if (RelationIsAoRows(rel))
		return TableTypeAppendOnly;
	else if (RelationIsParquet(rel))
		return TableTypeParquet;

	/* Unknown storage format: report it and bail out. */
	elog(ERROR, "undefined table type for storage format: %c",
		 rel->rd_rel->relstorage);

	return TableTypeInvalid;	/* not reached; keeps the compiler quiet */
}
/* ----------------------------------------------------------------
 *		ExecInsert
 *
 *		INSERTs have to add the tuple into
 *		the base relation and insert appropriate tuples into the
 *		index relations.
 *		Insert can be part of an update operation when
 *		there is a preceding SplitUpdate node.
 *
 *		slot    -- tuple to insert (in the plan's output descriptor)
 *		dest    -- unused here; kept for the executor's DML call convention
 *		estate  -- executor state (result relation, counters, snapshot)
 *		planGen -- PLANGEN_PLANNER or PLANGEN_OPTIMIZER; triggers and
 *		           constraints only fire for planner-generated plans
 *		isUpdate -- true when this insert is the insert-half of a
 *		           SplitUpdate-based UPDATE
 * ----------------------------------------------------------------
 */
void
ExecInsert(TupleTableSlot *slot,
		   DestReceiver *dest,
		   EState *estate,
		   PlanGenerator planGen,
		   bool isUpdate)
{
	void	   *tuple = NULL;	/* HeapTuple or MemTuple depending on storage */
	ResultRelInfo *resultRelInfo = NULL;
	Relation	resultRelationDesc = NULL;
	Oid			newId = InvalidOid;
	TupleTableSlot *partslot = NULL;

	AOTupleId	aoTupleId = AOTUPLEID_INIT;

	bool		rel_is_heap = false;
	bool		rel_is_aorows = false;
	bool		rel_is_external = false;
	bool		rel_is_parquet = false;

	/*
	 * get information on the (current) result relation
	 */
	if (estate->es_result_partitions)
	{
		/* Route the tuple to its target partition's ResultRelInfo. */
		resultRelInfo = slot_get_partition(slot, estate);
		estate->es_result_relation_info = resultRelInfo;

		if (NULL != resultRelInfo->ri_parquetSendBack)
		{
			/*
			 * The Parquet part we are about to insert into
			 * has sendBack information. This means we're inserting into the
			 * part twice, which is not supported. Error out (GPSQL-2291)
			 */
			Assert(gp_parquet_insert_sort);
			ereport(ERROR, (errcode(ERRCODE_CDB_FEATURE_NOT_YET),
					errmsg("Cannot insert out-of-order tuples in parquet partitions"),
					errhint("Sort the data on the partitioning key(s) before inserting"),
					errOmitLocation(true)));
		}

		/*
		 * Check if we need to close the last parquet partition we
		 * inserted into (GPSQL-2291).
		 */
		Oid new_part_oid = resultRelInfo->ri_RelationDesc->rd_id;
		if (gp_parquet_insert_sort &&
				PLANGEN_OPTIMIZER == planGen &&
				InvalidOid != estate->es_last_parq_part &&
				new_part_oid != estate->es_last_parq_part)
		{
			Assert(NULL != estate->es_partition_state->result_partition_hash);

			/* Find the ResultRelInfo of the previously-open parquet part. */
			ResultPartHashEntry *entry =
					hash_search(estate->es_partition_state->result_partition_hash,
								&estate->es_last_parq_part,
								HASH_FIND,
								NULL /* found */);

			Assert(NULL != entry);
			Assert(entry->offset < estate->es_num_result_relations);

			ResultRelInfo *oldResultRelInfo = & estate->es_result_relations[entry->offset];

			elog(DEBUG1, "Switching from old part oid=%d name=[%s] to new part oid=%d name=[%s]",
					estate->es_last_parq_part,
					oldResultRelInfo->ri_RelationDesc->rd_rel->relname.data,
					new_part_oid,
					resultRelInfo->ri_RelationDesc->rd_rel->relname.data);

			/*
			 * We are opening a new partition, and the last partition we
			 * inserted into was a Parquet part. Let's close the old
			 * parquet insert descriptor to free the memory before
			 * opening the new one.
			 */
			ParquetInsertDescData *oldInsertDesc = oldResultRelInfo->ri_parquetInsertDesc;

			/*
			 * We need to preserve the "sendback" information that needs to be
			 * sent back to the QD process from this part.
			 * Compute it here, and store it for later use.
			 */
			QueryContextDispatchingSendBack sendback =
					CreateQueryContextDispatchingSendBack(1);
			sendback->relid = RelationGetRelid(oldResultRelInfo->ri_RelationDesc);
			oldInsertDesc->sendback = sendback;
			parquet_insert_finish(oldInsertDesc);

			/* Store the sendback information in the resultRelInfo for this part */
			oldResultRelInfo->ri_parquetSendBack = sendback;

			/* Record in the resultRelInfo that we closed the parquet insert descriptor */
			oldResultRelInfo->ri_parquetInsertDesc = NULL;

			/* Reset the last parquet part Oid, it's now closed */
			estate->es_last_parq_part = InvalidOid;
		}
	}
	else
	{
		resultRelInfo = estate->es_result_relation_info;
	}

	/* No RETURNING projection is expected on this path. */
	Assert (!resultRelInfo->ri_projectReturning);

	resultRelationDesc = resultRelInfo->ri_RelationDesc;

	rel_is_heap = RelationIsHeap(resultRelationDesc);
	rel_is_aorows = RelationIsAoRows(resultRelationDesc);
	rel_is_external = RelationIsExternal(resultRelationDesc);
	rel_is_parquet = RelationIsParquet(resultRelationDesc);

	/* Validate that insert is not part of an non-allowed update operation. */
	if (isUpdate && (rel_is_aorows || rel_is_parquet))
	{
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("Append-only tables are not updatable. Operation not permitted."),
				 errOmitLocation(true)));
	}

	/* Convert the slot's tuple into the target partition's physical layout. */
	partslot = reconstructMatchingTupleSlot(slot, resultRelInfo);
	if (rel_is_heap || rel_is_external)
	{
		tuple = ExecFetchSlotHeapTuple(partslot);
	}
	else if (rel_is_aorows)
	{
		tuple = ExecFetchSlotMemTuple(partslot, false);
	}
	else if (rel_is_parquet)
	{
		/* parquet_insert() below works directly from the slot. */
		tuple = NULL;
	}

	Assert( partslot != NULL );
	Assert( rel_is_parquet || (tuple != NULL));

	/* Execute triggers in Planner-generated plans */
	if (planGen == PLANGEN_PLANNER)
	{
		/* BEFORE ROW INSERT Triggers */
		if (resultRelInfo->ri_TrigDesc &&
			resultRelInfo->ri_TrigDesc->n_before_row[TRIGGER_EVENT_INSERT] > 0)
		{
			HeapTuple	newtuple;

			/* NYI */
			if(rel_is_parquet)
				elog(ERROR, "triggers are not supported on tables that use column-oriented storage");

			newtuple = ExecBRInsertTriggers(estate, resultRelInfo, tuple);

			if (newtuple == NULL)	/* "do nothing" */
			{
				return;
			}

			if (newtuple != tuple)	/* modified by Trigger(s) */
			{
				/*
				 * Put the modified tuple into a slot for convenience of routines
				 * below.  We assume the tuple was allocated in per-tuple memory
				 * context, and therefore will go away by itself. The tuple table
				 * slot should not try to clear it.
				 */
				TupleTableSlot *newslot = estate->es_trig_tuple_slot;

				if (newslot->tts_tupleDescriptor != partslot->tts_tupleDescriptor)
					ExecSetSlotDescriptor(newslot, partslot->tts_tupleDescriptor);
				ExecStoreGenericTuple(newtuple, newslot, false);
				newslot->tts_tableOid = partslot->tts_tableOid; /* for constraints */
				tuple = newtuple;
				partslot = newslot;
			}
		}
	}

	/*
	 * Check the constraints of the tuple
	 */
	if (resultRelationDesc->rd_att->constr &&
			planGen == PLANGEN_PLANNER)
	{
		ExecConstraints(resultRelInfo, partslot, estate);
	}

	/*
	 * insert the tuple
	 *
	 * Note: heap_insert returns the tid (location) of the new tuple in the
	 * t_self field.
	 *
	 * NOTE: for append-only relations we use the append-only access methods.
	 */
	if (rel_is_aorows)
	{
		/* Lazily create the AO insert descriptor on first use. */
		if (resultRelInfo->ri_aoInsertDesc == NULL)
		{
			ResultRelSegFileInfo *segfileinfo = NULL;
			/* Set the pre-assigned fileseg number to insert into */
			ResultRelInfoSetSegFileInfo(resultRelInfo, estate->es_result_segfileinfos);
			segfileinfo = (ResultRelSegFileInfo *)list_nth(resultRelInfo->ri_aosegfileinfos, GetQEIndex());
			resultRelInfo->ri_aoInsertDesc =
				appendonly_insert_init(resultRelationDesc,
									   segfileinfo);
		}

		appendonly_insert(resultRelInfo->ri_aoInsertDesc, tuple, &newId, &aoTupleId);
	}
	else if (rel_is_external)
	{
		/* Writable external table */
		if (resultRelInfo->ri_extInsertDesc == NULL)
			resultRelInfo->ri_extInsertDesc = external_insert_init(
					resultRelationDesc, 0);

		newId = external_insert(resultRelInfo->ri_extInsertDesc, tuple);
	}
	else if(rel_is_parquet)
	{
		/* If there is no parquet insert descriptor, create it now. */
		if (resultRelInfo->ri_parquetInsertDesc == NULL)
		{
			ResultRelSegFileInfo *segfileinfo = NULL;
			ResultRelInfoSetSegFileInfo(resultRelInfo, estate->es_result_segfileinfos);
			segfileinfo = (ResultRelSegFileInfo *)list_nth(resultRelInfo->ri_aosegfileinfos, GetQEIndex());
			resultRelInfo->ri_parquetInsertDesc =
				parquet_insert_init(resultRelationDesc, segfileinfo);

			/*
			 * Just opened a new parquet partition for insert. Save the Oid
			 * in estate, so that we can close it when switching to a
			 * new partition (GPSQL-2291)
			 */
			elog(DEBUG1, "Saving es_last_parq_part. Old=%d, new=%d", estate->es_last_parq_part, resultRelationDesc->rd_id);
			estate->es_last_parq_part = resultRelationDesc->rd_id;
		}

		newId = parquet_insert(resultRelInfo->ri_parquetInsertDesc, partslot);
	}
	else
	{
		Insist(rel_is_heap);

		newId = heap_insert(resultRelationDesc,
							tuple,
							estate->es_snapshot->curcid,
							true, true, GetCurrentTransactionId());
	}

	/* Bump per-query and per-relation counters; remember the assigned Oid. */
	IncrAppended();
	(estate->es_processed)++;
	(resultRelInfo->ri_aoprocessed)++;
	estate->es_lastoid = newId;

	partslot->tts_tableOid = RelationGetRelid(resultRelationDesc);

	if (rel_is_aorows || rel_is_parquet)
	{
		/* NOTE: Current version does not support index upon parquet table. */
		/*
		 * insert index entries for AO Row-Store tuple
		 */
		if (resultRelInfo->ri_NumIndices > 0 && !rel_is_parquet)
			ExecInsertIndexTuples(partslot, (ItemPointer)&aoTupleId, estate, false);
	}
	else
	{
		/* Use parttuple for index update in case this is an indexed heap table. */
		TupleTableSlot *xslot = partslot;
		void *xtuple = tuple;

		setLastTid(&(((HeapTuple) xtuple)->t_self));

		/*
		 * insert index entries for tuple
		 */
		if (resultRelInfo->ri_NumIndices > 0)
			ExecInsertIndexTuples(xslot, &(((HeapTuple) xtuple)->t_self), estate, false);
	}

	if (planGen == PLANGEN_PLANNER)
	{
		/* AFTER ROW INSERT Triggers */
		ExecARInsertTriggers(estate, resultRelInfo, tuple);
	}
}
/**
 * This method estimates the number of tuples and pages in a heaptable relation. Getting the number of blocks is straightforward.
 * Estimating the number of tuples is a little trickier. There are two factors that complicate this:
 * 	1. Tuples may be of variable length.
 * 	2. There may be dead tuples lying around.
 * To do this, it chooses a certain number of blocks (as determined by a guc) randomly. The process of choosing is not strictly
 * uniformly random since we have a target number of blocks in mind. We start processing blocks in order and choose an block
 * with a probability p determined by the ratio of target to total blocks. It is possible that we get really unlucky and reject
 * a large number of blocks up front. We compensate for this by increasing p dynamically. Thus, we are guaranteed to choose the target number
 * of blocks. We read all heaptuples from these blocks and keep count of number of live tuples. We scale up this count to
 * estimate reltuples. Relpages is an exact value.
 *
 * Input:
 * 	rel - Relation. Must be a heaptable.
 *
 * Output:
 * 	reltuples - estimated number of tuples in relation.
 * 	relpages  - exact number of pages.
 */
static void gp_statistics_estimate_reltuples_relpages_heap(Relation rel, float4 *reltuples, float4 *relpages)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	float4		nrowsseen = 0;		/* # rows seen (including dead rows) */
	float4		nrowsdead = 0;		/* # rows dead */
	float4		totalEmptyPages = 0;	/* # of empty pages with only dead rows */
	float4		totalSamplePages = 0;	/* # of pages sampled */

	BlockNumber nblockstotal = 0;	/* nblocks in relation */
	BlockNumber nblockstarget = (BlockNumber) gp_statistics_blocks_target;	/* target sample size (GUC) */
	BlockNumber nblocksseen = 0;	/* # of blocks actually sampled */
	int			j = 0; /* counter */

	/**
	 * Ensure that the right kind of relation with the right kind of storage is passed to us.
	 */
	Assert(rel->rd_rel->relkind == RELKIND_RELATION);
	Assert(RelationIsHeap(rel));

	nblockstotal = RelationGetNumberOfBlocks(rel);

	if (nblockstotal == 0 || nblockstarget == 0)
	{
		/**
		 * If there are no blocks, there cannot be tuples.
		 */
		*reltuples = 0.0;
		*relpages = 0.0;
		return;
	}

	/* One pass over all blocks; each block is sampled with dynamic probability. */
	for (j = 0; j < nblockstotal; j++)
	{
		/**
		 * Threshold is dynamically adjusted based on how many blocks we need to examine and how many blocks
		 * are left.  This guarantees we end up choosing ~nblockstarget blocks.
		 */
		double threshold = ((double) nblockstarget - nblocksseen) / ((double) nblockstotal - j);

		/**
		 * Random dice thrown to determine if current block is chosen.
		 */
		double diceValue = ((double) random()) / ((double) MAX_RANDOM_VALUE);

		if (threshold >= 1.0 || diceValue <= threshold)
		{
			totalSamplePages++;

			/**
			 * Block j shall be examined!
			 */
			BlockNumber targblock = j;
			Buffer		targbuffer;
			Page		targpage;
			OffsetNumber targoffset,
						maxoffset;

			/**
			 * Check for cancellations.
			 */
			CHECK_FOR_INTERRUPTS();

			/*
			 * We must maintain a pin on the target page's buffer to ensure that
			 * the maxoffset value stays good (else concurrent VACUUM might delete
			 * tuples out from under us).  Hence, pin the page until we are done
			 * looking at it.  We don't maintain a lock on the page, so tuples
			 * could get added to it, but we ignore such tuples.
			 */

			// -------- MirroredLock ----------
			MIRROREDLOCK_BUFMGR_LOCK;

			targbuffer = ReadBuffer(rel, targblock);
			LockBuffer(targbuffer, BUFFER_LOCK_SHARE);
			targpage = BufferGetPage(targbuffer);
			maxoffset = PageGetMaxOffsetNumber(targpage);

			/* Figure out overall nrowsdead/nrowsseen ratio */
			/* Figure out # of empty pages based on page level #rowsseen and #rowsdead. */
			float4		pageRowsSeen = 0.0;
			float4		pageRowsDead = 0.0;

			/* Inner loop over all tuples on the selected block. */
			for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; targoffset++)
			{
				ItemId		itemid;

				itemid = PageGetItemId(targpage, targoffset);
				nrowsseen++;
				pageRowsSeen++;

				/* Non-normal line pointers (unused/redirect/dead) count as dead. */
				if (!ItemIdIsNormal(itemid))
				{
					nrowsdead += 1;
					pageRowsDead++;
				}
				else
				{
					HeapTupleData targtuple;

					ItemPointerSet(&targtuple.t_self, targblock, targoffset);
					targtuple.t_data = (HeapTupleHeader) PageGetItem(targpage, itemid);
					targtuple.t_len = ItemIdGetLength(itemid);

					/* Tuples invisible under SnapshotNow are counted as dead. */
					if (!HeapTupleSatisfiesVisibility(rel, &targtuple, SnapshotNow, targbuffer))
					{
						nrowsdead += 1;
						pageRowsDead++;
					}
				}
			}

			/* Now release the pin on the page */
			UnlockReleaseBuffer(targbuffer);

			MIRROREDLOCK_BUFMGR_UNLOCK;
			// -------- MirroredLock ----------

			/* detect empty pages: pageRowsSeen == pageRowsDead, also log the nrowsseen (total) and nrowsdead (total) */
			if (pageRowsSeen == pageRowsDead && pageRowsSeen > 0)
			{
				totalEmptyPages++;
			}

			nblocksseen++;
		}
	}

	Assert(nblocksseen > 0);

	/**
	 * To calculate reltuples, scale up the number of live rows per block seen to the total number
	 * of blocks.
	 */
	*reltuples = ceil((nrowsseen - nrowsdead) * nblockstotal / nblocksseen);
	*relpages = nblockstotal;

	if (totalSamplePages * 0.5 <= totalEmptyPages && totalSamplePages != 0)
	{
		/*
		 * LOG empty pages of bloated table for each segments.
		 */
		elog(DEBUG1, "ANALYZE detected 50%% or more empty pages (%f empty out of %f pages), please run VACUUM FULL for accurate estimation.", totalEmptyPages, totalSamplePages);
	}

	return;
}
/*
 * ValidateErrorTableMetaData
 *
 * This function gets called if a user wants an already existing table to be
 * used as an error table for some COPY or external table operation with SREH.
 * In here we verify that the metadata of the user selected table matches the
 * predefined requirement for an error table: an ordinary, heap-stored,
 * randomly-distributed, non-partitioned table with no constraints and
 * exactly the expected column layout.  ereports ERROR on any violation.
 */
void
ValidateErrorTableMetaData(Relation rel)
{
	TupleDesc	tupDesc = RelationGetDescr(rel);
	Form_pg_attribute *attr = tupDesc->attrs;
	TupleConstr *constr = tupDesc->constr;
	char	   *relname = RelationGetRelationName(rel);
	int			attr_count = tupDesc->natts;

	/*
	 * Verify it is an ordinary table.
	 */
	if (rel->rd_rel->relkind != RELKIND_RELATION)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("error table \"%s\" is not a table", relname)));

	/*
	 * Verify it is a heap table.
	 */
	if (!RelationIsHeap(rel))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
				 errmsg("error table \"%s\" is not a heap table", relname),
				 errhint("Use a relation with heap storage type for error table")));

	/*
	 * Verify number of attributes match
	 */
	if (attr_count != NUM_ERRORTABLE_ATTR)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
				 errmsg("Relation \"%s\" already exists and is not of a valid "
						"error table format (expected %d attributes, found %d)",
						relname, NUM_ERRORTABLE_ATTR, attr_count)));

	/*
	 * Verify this table is randomly-distributed.
	 */
	if (!GpPolicyIsRandomly(rel->rd_cdbpolicy))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
				 errmsg("error table \"%s\" is not randomly distributed", relname),
				 /* fixed typo: "realtion" -> "relation" */
				 errhint("Use a randomly-distributed relation for error table")));

	/*
	 * Verify this table is not a partitioned or child partition table.
	 */
	if (rel_is_partitioned(RelationGetRelid(rel)) ||
		rel_is_child_partition(RelationGetRelid(rel)))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
				 errmsg("error table \"%s\" is a partitioned table", relname),
				 /* fixed grammar: "Either root or child partition are not allowed" */
				 errdetail("Neither root nor child partitions are allowed as an error table")));

	/*
	 * Verify this table has no constraints.
	 *
	 * This errors out with DEFAULTs, CHECKs or NOT NULLs.
	 *
	 * Unique constraint is not allowed in randomly-distributed table
	 * so we don't check it here.
	 *
	 * We never insert NULL values in some attributes, but allowing
	 * NOT NULL for error table columns doesn't give much convenience
	 * to the user anyway.  Likewise, DEFAULT values are not of interest
	 * mostly, but it does not give much value to user, either.
	 */
	if (constr)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
				 /* fixed typo: "If appears" -> "It appears" */
				 errmsg("Relation \"%s\" already exists and is not of a valid "
						"error table format. It appears to have constraints "
						"defined.", relname)));

	/*
	 * run through each attribute at a time and verify it's what we expect
	 */
	VerifyErrorTableAttr(attr, errtable_cmdtime, "cmdtime", TIMESTAMPTZOID, relname);
	VerifyErrorTableAttr(attr, errtable_relname, "relname", TEXTOID, relname);
	VerifyErrorTableAttr(attr, errtable_filename, "filename", TEXTOID, relname);
	VerifyErrorTableAttr(attr, errtable_linenum, "linenum", INT4OID, relname);
	VerifyErrorTableAttr(attr, errtable_bytenum, "bytenum", INT4OID, relname);
	VerifyErrorTableAttr(attr, errtable_errmsg, "errmsg", TEXTOID, relname);
	VerifyErrorTableAttr(attr, errtable_rawdata, "rawdata", TEXTOID, relname);
	VerifyErrorTableAttr(attr, errtable_rawbytes, "rawbytes", BYTEAOID, relname);
}
/* ----------------------------------------------------------------
 *		ExecUpdate
 *
 *		note: we can't run UPDATE queries with transactions
 *		off because UPDATEs are actually INSERTs and our
 *		scan will mistakenly loop forever, updating the tuple
 *		it just inserted.. This should be fixed but until it
 *		is, we don't want to get stuck in an infinite loop
 *		which corrupts your database..
 *
 *		slot     -- new tuple values (plan output descriptor)
 *		tupleid  -- TID (heap) or AOTupleId (append-only) of the old tuple
 *		planSlot -- unused here; kept for the executor's DML call convention
 *		dest     -- unused here; kept for the executor's DML call convention
 *		estate   -- executor state (result relation, snapshot, counters)
 * ----------------------------------------------------------------
 */
void
ExecUpdate(TupleTableSlot *slot,
		   ItemPointer tupleid,
		   TupleTableSlot *planSlot,
		   DestReceiver *dest,
		   EState *estate)
{
	void	   *tuple;			/* HeapTuple or MemTuple depending on storage */
	ResultRelInfo *resultRelInfo;
	Relation	resultRelationDesc;
	HTSU_Result result;
	ItemPointerData update_ctid;
	TransactionId update_xmax;
	AOTupleId	aoTupleId = AOTUPLEID_INIT;
	TupleTableSlot *partslot = NULL;

	/*
	 * abort the operation if not running transactions
	 */
	if (IsBootstrapProcessingMode())
		elog(ERROR, "cannot UPDATE during bootstrap");

	/*
	 * get information on the (current) result relation
	 */
	resultRelInfo = estate->es_result_relation_info;
	resultRelationDesc = resultRelInfo->ri_RelationDesc;

	bool		rel_is_heap = RelationIsHeap(resultRelationDesc);
	bool		rel_is_aorows = RelationIsAoRows(resultRelationDesc);
	bool		rel_is_aocols = RelationIsAoCols(resultRelationDesc);
	bool		rel_is_external = RelationIsExternal(resultRelationDesc);

	/*
	 * get the heap tuple out of the tuple table slot, making sure we have a
	 * writable copy
	 */
	if (rel_is_heap)
	{
		partslot = slot;
		tuple = ExecFetchSlotHeapTuple(partslot);
	}
	else if (rel_is_aorows || rel_is_aocols)
	{
		/*
		 * It is necessary to reconstruct a logically compatible tuple to
		 * a phyiscally compatible tuple.  The slot's tuple descriptor comes
		 * from the projection target list, which doesn't indicate dropped
		 * columns, and MemTuple cannot deal with cases without converting
		 * the target list back into the original relation's tuple desc.
		 */
		partslot = reconstructMatchingTupleSlot(slot, resultRelInfo);

		/*
		 * We directly inline toasted columns here as update with toasted columns
		 * would create two references to the same toasted value.
		 */
		tuple = ExecFetchSlotMemTuple(partslot, true);
	}
	else if (rel_is_external)
	{
		/* Updating an external partition of a partitioned table is not supported. */
		if (estate->es_result_partitions &&
			estate->es_result_partitions->part->parrelid != 0)
		{
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("Update external partitions not supported.")));
			return;
		}
		else
		{
			partslot = slot;
			tuple = ExecFetchSlotHeapTuple(partslot);
		}
	}
	else
	{
		Insist(false);
	}

	/* see if this update would move the tuple to a different partition */
	if (estate->es_result_partitions)
		checkPartitionUpdate(estate, partslot, resultRelInfo);

	/* BEFORE ROW UPDATE Triggers */
	if (resultRelInfo->ri_TrigDesc &&
		resultRelInfo->ri_TrigDesc->n_before_row[TRIGGER_EVENT_UPDATE] > 0)
	{
		HeapTuple	newtuple;

		newtuple = ExecBRUpdateTriggers(estate, resultRelInfo,
										tupleid, tuple,
										estate->es_snapshot->curcid);

		if (newtuple == NULL)	/* "do nothing" */
			return;

		if (newtuple != tuple)	/* modified by Trigger(s) */
		{
			/*
			 * Put the modified tuple into a slot for convenience of routines
			 * below.  We assume the tuple was allocated in per-tuple memory
			 * context, and therefore will go away by itself. The tuple table
			 * slot should not try to clear it.
			 */
			TupleTableSlot *newslot = estate->es_trig_tuple_slot;

			if (newslot->tts_tupleDescriptor != partslot->tts_tupleDescriptor)
				ExecSetSlotDescriptor(newslot, partslot->tts_tupleDescriptor);
			ExecStoreGenericTuple(newtuple, newslot, false);
			newslot->tts_tableOid = partslot->tts_tableOid; /* for constraints */
			partslot = newslot;
			tuple = newtuple;
		}
	}

	/*
	 * Check the constraints of the tuple
	 *
	 * If we generate a new candidate tuple after EvalPlanQual testing, we
	 * must loop back here and recheck constraints.  (We don't need to redo
	 * triggers, however.  If there are any BEFORE triggers then trigger.c
	 * will have done heap_lock_tuple to lock the correct tuple, so there's no
	 * need to do them again.)
	 */
lreplace:;
	if (resultRelationDesc->rd_att->constr)
		ExecConstraints(resultRelInfo, partslot, estate);

	if (!GpPersistent_IsPersistentRelation(resultRelationDesc->rd_id))
	{
		/*
		 * Normal UPDATE path.
		 */

		/*
		 * replace the heap tuple
		 *
		 * Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check that
		 * the row to be updated is visible to that snapshot, and throw a can't-
		 * serialize error if not.  This is a special-case behavior needed for
		 * referential integrity updates in serializable transactions.
		 */
		if (rel_is_heap)
		{
			result = heap_update(resultRelationDesc, tupleid, tuple,
								 &update_ctid, &update_xmax,
								 estate->es_snapshot->curcid,
								 estate->es_crosscheck_snapshot,
								 true /* wait for commit */ );
		}
		else if (rel_is_aorows)
		{
			if (IsXactIsoLevelSerializable)
			{
				ereport(ERROR,
						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						 errmsg("Updates on append-only tables are not supported in serializable transactions.")));
			}

			/* Lazily create the AO update descriptor on first use. */
			if (resultRelInfo->ri_updateDesc == NULL)
			{
				ResultRelInfoSetSegno(resultRelInfo, estate->es_result_aosegnos);
				resultRelInfo->ri_updateDesc = (AppendOnlyUpdateDesc)
					appendonly_update_init(resultRelationDesc, ActiveSnapshot, resultRelInfo->ri_aosegno);
			}
			result = appendonly_update(resultRelInfo->ri_updateDesc,
									   tuple, (AOTupleId *) tupleid, &aoTupleId);
		}
		else if (rel_is_aocols)
		{
			if (IsXactIsoLevelSerializable)
			{
				ereport(ERROR,
						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						 errmsg("Updates on append-only tables are not supported in serializable transactions.")));
			}

			if (resultRelInfo->ri_updateDesc == NULL)
			{
				ResultRelInfoSetSegno(resultRelInfo, estate->es_result_aosegnos);
				resultRelInfo->ri_updateDesc = (AppendOnlyUpdateDesc)
					aocs_update_init(resultRelationDesc, resultRelInfo->ri_aosegno);
			}
			result = aocs_update(resultRelInfo->ri_updateDesc,
								 partslot, (AOTupleId *) tupleid, &aoTupleId);
		}
		else
		{
			Assert(!"We should not be here");
		}

		switch (result)
		{
			case HeapTupleSelfUpdated:
				/* already deleted by self; nothing to do */
				return;

			case HeapTupleMayBeUpdated:
				break;

			case HeapTupleUpdated:
				if (IsXactIsoLevelSerializable)
					ereport(ERROR,
							(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
							 errmsg("could not serialize access due to concurrent update")));
				else if (!ItemPointerEquals(tupleid, &update_ctid))
				{
					TupleTableSlot *epqslot;

					/* Re-evaluate plan quals against the updated row version. */
					epqslot = EvalPlanQual(estate,
										   resultRelInfo->ri_RangeTableIndex,
										   &update_ctid,
										   update_xmax,
										   estate->es_snapshot->curcid);
					if (!TupIsNull(epqslot))
					{
						*tupleid = update_ctid;
						partslot = ExecFilterJunk(estate->es_junkFilter, epqslot);
						tuple = ExecFetchSlotHeapTuple(partslot);
						goto lreplace;
					}
				}
				/* tuple already deleted; nothing to do */
				return;

			default:
				elog(ERROR, "unrecognized heap_update status: %u", result);
				return;
		}
	}
	else
	{
		HeapTuple	persistentTuple;

		/*
		 * Persistent metadata path.
		 */
		persistentTuple = heap_copytuple(tuple);
		persistentTuple->t_self = *tupleid;

		frozen_heap_inplace_update(resultRelationDesc, persistentTuple);

		heap_freetuple(persistentTuple);
	}

	IncrReplaced();
	(estate->es_processed)++;
	(resultRelInfo->ri_aoprocessed)++;

	/*
	 * Note: instead of having to update the old index tuples associated with
	 * the heap tuple, all we do is form and insert new index tuples. This is
	 * because UPDATEs are actually DELETEs and INSERTs, and index tuple
	 * deletion is done later by VACUUM (see notes in ExecDelete).  All we do
	 * here is insert new index tuples.  -cim 9/27/89
	 */

	/*
	 * insert index entries for tuple
	 *
	 * Note: heap_update returns the tid (location) of the new tuple in the
	 * t_self field.
	 */
	if (rel_is_aorows || rel_is_aocols)
	{
		if (resultRelInfo->ri_NumIndices > 0)
			ExecInsertIndexTuples(partslot, (ItemPointer)&aoTupleId, estate, false);
	}
	else
	{
		if (resultRelInfo->ri_NumIndices > 0)
			ExecInsertIndexTuples(partslot, &(((HeapTuple) tuple)->t_self), estate, false);
	}

	/* AFTER ROW UPDATE Triggers */
	ExecARUpdateTriggers(estate, resultRelInfo, tupleid, tuple);
}
/* ----------------------------------------------------------------
 *		ExecDelete
 *
 *		DELETE is like UPDATE, except that we delete the tuple and no
 *		index modifications are needed.
 *		DELETE can be part of an update operation when
 *		there is a preceding SplitUpdate node.
 *
 *		tupleid  - ctid (heap) or AO tuple id (append-only) of the victim row
 *		planSlot - output slot of the plan; used to route to the correct
 *				   partition under PLANGEN_OPTIMIZER
 *		dest     - unused here (kept for call-signature symmetry)
 *		planGen  - planner vs. ORCA; triggers fire only for planner plans
 *		isUpdate - true when this delete is the delete-half of a SplitUpdate
 * ----------------------------------------------------------------
 */
void
ExecDelete(ItemPointer tupleid,
		   TupleTableSlot *planSlot,
		   DestReceiver *dest,
		   EState *estate,
		   PlanGenerator planGen,
		   bool isUpdate)
{
	ResultRelInfo *resultRelInfo;
	Relation	resultRelationDesc;
	HTSU_Result result;
	ItemPointerData update_ctid;
	TransactionId update_xmax;

	/*
	 * Get information on the (current) result relation.
	 *
	 * For ORCA-generated plans on a partitioned table, the target leaf
	 * partition must be computed per tuple; planner plans arrive with
	 * es_result_relation_info already pointing at the right relation.
	 */
	if (estate->es_result_partitions && planGen == PLANGEN_OPTIMIZER)
	{
		Assert(estate->es_result_partitions->part->parrelid);

#ifdef USE_ASSERT_CHECKING
		Oid parent = estate->es_result_partitions->part->parrelid;
#endif

		/* Obtain part for current tuple. */
		resultRelInfo = slot_get_partition(planSlot, estate);
		estate->es_result_relation_info = resultRelInfo;

#ifdef USE_ASSERT_CHECKING
		Oid part = RelationGetRelid(resultRelInfo->ri_RelationDesc);
#endif

		/* Routing must land on a leaf partition, never the root itself. */
		Assert(parent != part);
	}
	else
	{
		resultRelInfo = estate->es_result_relation_info;
	}
	resultRelationDesc = resultRelInfo->ri_RelationDesc;

	/* DELETE ... RETURNING is not handled by this path. */
	Assert (!resultRelInfo->ri_projectReturning);

	if (planGen == PLANGEN_PLANNER)
	{
		/* BEFORE ROW DELETE Triggers */
		if (resultRelInfo->ri_TrigDesc &&
			resultRelInfo->ri_TrigDesc->n_before_row[TRIGGER_EVENT_DELETE] > 0)
		{
			bool		dodelete;

			dodelete = ExecBRDeleteTriggers(estate, resultRelInfo, tupleid,
											estate->es_snapshot->curcid);

			if (!dodelete)			/* "do nothing" */
				return;
		}
	}

	bool isHeapTable = RelationIsHeap(resultRelationDesc);
	bool isAORowsTable = RelationIsAoRows(resultRelationDesc);
	bool isAOColsTable = RelationIsAoCols(resultRelationDesc);
	bool isExternalTable = RelationIsExternal(resultRelationDesc);

	/* External leaf partitions are read-only for DML. */
	if (isExternalTable && estate->es_result_partitions &&
		estate->es_result_partitions->part->parrelid != 0)
	{
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("Delete from external partitions not supported.")));
		return;
	}

	/*
	 * delete the tuple
	 *
	 * Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check that
	 * the row to be deleted is visible to that snapshot, and throw a can't-
	 * serialize error if not. This is a special-case behavior needed for
	 * referential integrity updates in serializable transactions.
	 */
ldelete:;	/* re-entered after a successful EvalPlanQual recheck */
	if (isHeapTable)
	{
		result = heap_delete(resultRelationDesc, tupleid,
							 &update_ctid, &update_xmax,
							 estate->es_snapshot->curcid,
							 estate->es_crosscheck_snapshot,
							 true /* wait for commit */ );
	}
	else if (isAORowsTable)
	{
		/*
		 * AO visibility machinery cannot honor serializable semantics;
		 * reject up front with a message matching the originating command.
		 */
		if (IsXactIsoLevelSerializable)
		{
			if (!isUpdate)
				ereport(ERROR,
						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						 errmsg("Deletes on append-only tables are not supported in serializable transactions.")));
			else
				ereport(ERROR,
						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						 errmsg("Updates on append-only tables are not supported in serializable transactions.")));
		}

		/* Lazily create one delete descriptor per result relation. */
		if (resultRelInfo->ri_deleteDesc == NULL)
		{
			resultRelInfo->ri_deleteDesc =
				appendonly_delete_init(resultRelationDesc, ActiveSnapshot);
		}

		AOTupleId* aoTupleId = (AOTupleId*)tupleid;
		result = appendonly_delete(resultRelInfo->ri_deleteDesc, aoTupleId);
	}
	else if (isAOColsTable)
	{
		/* Same restrictions as the AO row-store branch above. */
		if (IsXactIsoLevelSerializable)
		{
			if (!isUpdate)
				ereport(ERROR,
						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						 errmsg("Deletes on append-only tables are not supported in serializable transactions.")));
			else
				ereport(ERROR,
						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						 errmsg("Updates on append-only tables are not supported in serializable transactions.")));
		}

		if (resultRelInfo->ri_deleteDesc == NULL)
		{
			resultRelInfo->ri_deleteDesc =
				aocs_delete_init(resultRelationDesc);
		}

		AOTupleId* aoTupleId = (AOTupleId*)tupleid;
		result = aocs_delete(resultRelInfo->ri_deleteDesc, aoTupleId);
	}
	else
	{
		/* No other storage type should reach the delete path. */
		Insist(0);
	}

	switch (result)
	{
		case HeapTupleSelfUpdated:
			/* already deleted by self; nothing to do */

			/*
			 * In a scenario in which R(a,b) and S(a,b) have
			 *
			 *	R		S
			 *	________ ________
			 *	(1, 1)	 (1, 2)
			 *			 (1, 7)
			 *
			 * An update query such as:
			 *
			 *	UPDATE R SET a = S.b FROM S WHERE R.b = S.a;
			 *
			 * will have a non-deterministic output. The tuple in R
			 * can be updated to (2,1) or (7,1). Since the introduction
			 * of SplitUpdate, these queries will send multiple requests
			 * to delete the same tuple. Therefore, in order to avoid a
			 * non-deterministic output, an error is reported in such
			 * scenario.
			 */
			if (isUpdate)
			{
				ereport(ERROR,
						(errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION ),
						 errmsg("multiple updates to a row by the same query is not allowed")));
			}

			return;

		case HeapTupleMayBeUpdated:
			break;

		case HeapTupleUpdated:
			/* Concurrent update: either fail (serializable) or chase the chain. */
			if (IsXactIsoLevelSerializable)
				ereport(ERROR,
						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
						 errmsg("could not serialize access due to concurrent update")));
			else if (!ItemPointerEquals(tupleid, &update_ctid))
			{
				TupleTableSlot *epqslot;

				/* Re-evaluate quals against the newest tuple version. */
				epqslot = EvalPlanQual(estate,
									   resultRelInfo->ri_RangeTableIndex,
									   &update_ctid,
									   update_xmax,
									   estate->es_snapshot->curcid);
				if (!TupIsNull(epqslot))
				{
					/* Still qualifies: retry the delete on the new version. */
					*tupleid = update_ctid;
					goto ldelete;
				}
			}
			/* tuple already deleted; nothing to do */
			return;

		default:
			elog(ERROR, "unrecognized heap_delete status: %u", result);
			return;
	}

	/*
	 * Bump statistics only for a genuine DELETE; when this is the
	 * delete-half of a SplitUpdate, the update side does the counting.
	 */
	if (!isUpdate)
	{
		IncrDeleted();
		(estate->es_processed)++;
		/*
		 * To notify master if tuples deleted or not, to update mod_count.
		 */
		(resultRelInfo->ri_aoprocessed)++;
	}

	/*
	 * Note: Normally one would think that we have to delete index tuples
	 * associated with the heap tuple now...
	 *
	 * ... but in POSTGRES, we have no need to do this because VACUUM will
	 * take care of it later. We can't delete index tuples immediately
	 * anyway, since the tuple is still visible to other transactions.
	 */

	if (planGen == PLANGEN_PLANNER)
	{
		/* AFTER ROW DELETE Triggers */
		ExecARDeleteTriggers(estate, resultRelInfo, tupleid);
	}
}
/* ----------------------------------------------------------------
 *		ExecInsert
 *
 *		INSERTs have to add the tuple into
 *		the base relation and insert appropriate tuples into the
 *		index relations.
 *		Insert can be part of an update operation when
 *		there is a preceding SplitUpdate node.
 *
 *		slot     - tuple to insert, in the plan's tuple descriptor
 *		dest     - unused here (kept for call-signature symmetry)
 *		planGen  - planner vs. ORCA; triggers/constraints fire only for
 *				   planner-generated plans
 *		isUpdate - true when this insert is the insert-half of a SplitUpdate
 * ----------------------------------------------------------------
 */
void
ExecInsert(TupleTableSlot *slot,
		   DestReceiver *dest,
		   EState *estate,
		   PlanGenerator planGen,
		   bool isUpdate)
{
	void	   *tuple = NULL;	/* HeapTuple or MemTuple, depending on storage */
	ResultRelInfo *resultRelInfo = NULL;
	Relation	resultRelationDesc = NULL;
	Oid			newId = InvalidOid;
	TupleTableSlot *partslot = NULL;	/* slot in the target part's descriptor */

	AOTupleId	aoTupleId = AOTUPLEID_INIT;	/* tid assigned by AO insert, for indexes */

	bool		rel_is_heap = false;
	bool		rel_is_aorows = false;
	bool		rel_is_aocols = false;
	bool		rel_is_external = false;

	/*
	 * get information on the (current) result relation
	 */
	if (estate->es_result_partitions)
	{
		/* Route this tuple to its leaf partition. */
		resultRelInfo = slot_get_partition(slot, estate);

		/* Check whether the user provided the correct leaf part only if required */
		if (!dml_ignore_target_partition_check)
		{
			Assert(NULL != estate->es_result_partitions->part &&
				   NULL != resultRelInfo->ri_RelationDesc);

			List *resultRelations = estate->es_plannedstmt->resultRelations;

			/*
			 * Only inheritance can generate multiple result relations and
			 * inheritance is not compatible with partitions. As we are
			 * inserting in partitioned table, we should not have more than
			 * one resultRelation
			 */
			Assert(list_length(resultRelations) == 1);

			/* We only have one resultRelations entry where the user originally intended to insert */
			int rteIdxForUserRel = linitial_int(resultRelations);
			Assert (rteIdxForUserRel > 0);
			Oid userProvidedRel = InvalidOid;

			if (1 == rteIdxForUserRel)
			{
				/* Optimization for typical case: read RTE 1 straight off the list head. */
				userProvidedRel = ((RangeTblEntry *) estate->es_plannedstmt->rtable->head->data.ptr_value)->relid;
			}
			else
			{
				userProvidedRel = getrelid(rteIdxForUserRel, estate->es_plannedstmt->rtable);
			}

			/*
			 * Error out if user provides a leaf partition that does not match
			 * with our calculated partition. The user's target is valid when
			 * it is either the partition root or the computed leaf itself.
			 */
			if (userProvidedRel != estate->es_result_partitions->part->parrelid &&
				userProvidedRel != resultRelInfo->ri_RelationDesc->rd_id)
			{
				ereport(ERROR,
						(errcode(ERRCODE_CHECK_VIOLATION),
						 errmsg("Trying to insert row into wrong partition"),
						 errdetail("Expected partition: %s, provided partition: %s",
								   resultRelInfo->ri_RelationDesc->rd_rel->relname.data,
								   estate->es_result_relation_info->ri_RelationDesc->rd_rel->relname.data)));
			}
		}
		estate->es_result_relation_info = resultRelInfo;
	}
	else
	{
		resultRelInfo = estate->es_result_relation_info;
	}

	/* INSERT ... RETURNING is not handled by this path. */
	Assert (!resultRelInfo->ri_projectReturning);

	resultRelationDesc = resultRelInfo->ri_RelationDesc;

	rel_is_heap = RelationIsHeap(resultRelationDesc);
	rel_is_aocols = RelationIsAoCols(resultRelationDesc);
	rel_is_aorows = RelationIsAoRows(resultRelationDesc);
	rel_is_external = RelationIsExternal(resultRelationDesc);

	/*
	 * Convert the tuple into the target part's layout, then materialize it
	 * in the physical form the storage layer expects: HeapTuple for heap
	 * (and writable external) tables, MemTuple for append-only tables.
	 */
	partslot = reconstructMatchingTupleSlot(slot, resultRelInfo);
	if (rel_is_heap)
	{
		tuple = ExecFetchSlotHeapTuple(partslot);
	}
	else if (rel_is_aorows)
	{
		tuple = ExecFetchSlotMemTuple(partslot, false);
	}
	else if (rel_is_external)
	{
		/* External leaf partitions are read-only for DML. */
		if (estate->es_result_partitions &&
			estate->es_result_partitions->part->parrelid != 0)
		{
			ereport(ERROR,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("Insert into external partitions not supported.")));
			return;
		}
		else
		{
			tuple = ExecFetchSlotHeapTuple(partslot);
		}
	}
	else
	{
		Assert(rel_is_aocols);
		tuple = ExecFetchSlotMemTuple(partslot, true);
	}

	Assert(partslot != NULL && tuple != NULL);

	/* Execute triggers in Planner-generated plans */
	if (planGen == PLANGEN_PLANNER)
	{
		/* BEFORE ROW INSERT Triggers */
		if (resultRelInfo->ri_TrigDesc &&
			resultRelInfo->ri_TrigDesc->n_before_row[TRIGGER_EVENT_INSERT] > 0)
		{
			HeapTuple	newtuple;

			/* NYI */
			if(rel_is_aocols)
				elog(ERROR, "triggers are not supported on tables that use column-oriented storage");

			newtuple = ExecBRInsertTriggers(estate, resultRelInfo, tuple);

			if (newtuple == NULL)	/* "do nothing" */
			{
				return;
			}

			if (newtuple != tuple)	/* modified by Trigger(s) */
			{
				/*
				 * Put the modified tuple into a slot for convenience of routines
				 * below. We assume the tuple was allocated in per-tuple memory
				 * context, and therefore will go away by itself. The tuple table
				 * slot should not try to clear it.
				 */
				TupleTableSlot *newslot = estate->es_trig_tuple_slot;

				if (newslot->tts_tupleDescriptor != partslot->tts_tupleDescriptor)
					ExecSetSlotDescriptor(newslot, partslot->tts_tupleDescriptor);
				ExecStoreGenericTuple(newtuple, newslot, false);
				newslot->tts_tableOid = partslot->tts_tableOid;	/* for constraints */
				/* From here on, operate on the trigger-modified tuple/slot. */
				tuple = newtuple;
				partslot = newslot;
			}
		}
	}

	/*
	 * Check the constraints of the tuple
	 */
	if (resultRelationDesc->rd_att->constr &&
		planGen == PLANGEN_PLANNER)
	{
		ExecConstraints(resultRelInfo, partslot, estate);
	}

	/*
	 * insert the tuple
	 *
	 * Note: heap_insert returns the tid (location) of the new tuple in the
	 * t_self field.
	 *
	 * NOTE: for append-only relations we use the append-only access methods.
	 */
	if (rel_is_aorows)
	{
		/* Lazily create one insert descriptor per result relation. */
		if (resultRelInfo->ri_aoInsertDesc == NULL)
		{
			/* Set the pre-assigned fileseg number to insert into */
			ResultRelInfoSetSegno(resultRelInfo, estate->es_result_aosegnos);

			resultRelInfo->ri_aoInsertDesc =
				appendonly_insert_init(resultRelationDesc,
									   ActiveSnapshot,
									   resultRelInfo->ri_aosegno,
									   false);
		}

		appendonly_insert(resultRelInfo->ri_aoInsertDesc, tuple, &newId, &aoTupleId);
	}
	else if (rel_is_aocols)
	{
		if (resultRelInfo->ri_aocsInsertDesc == NULL)
		{
			ResultRelInfoSetSegno(resultRelInfo, estate->es_result_aosegnos);
			resultRelInfo->ri_aocsInsertDesc = aocs_insert_init(resultRelationDesc,
																resultRelInfo->ri_aosegno, false);
		}

		newId = aocs_insert(resultRelInfo->ri_aocsInsertDesc, partslot);
		/* aocs_insert stores the assigned tid in the slot's ctid; copy it out. */
		aoTupleId = *((AOTupleId*)slot_get_ctid(partslot));
	}
	else if (rel_is_external)
	{
		/* Writable external table */
		if (resultRelInfo->ri_extInsertDesc == NULL)
			resultRelInfo->ri_extInsertDesc = external_insert_init(resultRelationDesc);

		newId = external_insert(resultRelInfo->ri_extInsertDesc, tuple);
	}
	else
	{
		Insist(rel_is_heap);

		newId = heap_insert(resultRelationDesc,
							tuple,
							estate->es_snapshot->curcid,
							true, true, GetCurrentTransactionId());
	}

	IncrAppended();
	(estate->es_processed)++;
	/* ri_aoprocessed feeds the mod_count reported back to the master. */
	(resultRelInfo->ri_aoprocessed)++;
	estate->es_lastoid = newId;

	partslot->tts_tableOid = RelationGetRelid(resultRelationDesc);

	if (rel_is_aorows || rel_is_aocols)
	{
		/*
		 * insert index entries for the append-only tuple, keyed by the
		 * AO tuple id assigned above
		 */
		if (resultRelInfo->ri_NumIndices > 0)
			ExecInsertIndexTuples(partslot, (ItemPointer)&aoTupleId, estate, false);
	}
	else
	{
		/* Use parttuple for index update in case this is an indexed heap table. */
		TupleTableSlot *xslot = partslot;
		void	   *xtuple = tuple;

		setLastTid(&(((HeapTuple) xtuple)->t_self));

		/*
		 * insert index entries for tuple
		 */
		if (resultRelInfo->ri_NumIndices > 0)
			ExecInsertIndexTuples(xslot, &(((HeapTuple) xtuple)->t_self), estate, false);
	}

	if (planGen == PLANGEN_PLANNER)
	{
		/* AFTER ROW INSERT Triggers */
		ExecARInsertTriggers(estate, resultRelInfo, tuple);
	}
}