/*
 * CheckHashPartitionedTable verifies that the given distributed table is hash
 * partitioned. The partition method is fetched through PartitionMethod(); any
 * value other than DISTRIBUTE_BY_HASH is reported as an error that includes
 * the offending partition type character.
 */
static void
CheckHashPartitionedTable(Oid distributedTableId)
{
	char tablePartitionMethod = PartitionMethod(distributedTableId);

	/* hash partitioned tables are the only accepted case */
	if (tablePartitionMethod == DISTRIBUTE_BY_HASH)
	{
		return;
	}

	ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					errmsg("unsupported table partition type: %c",
						   tablePartitionMethod)));
}
/*
 * ConstraintIsAForeignKeyToReferenceTable scans pg_constraint for foreign key
 * constraints and returns true if a foreign key that is defined on the given
 * relationId and is named constraintName references a reference table, i.e.
 * a table whose partition method is DISTRIBUTE_BY_NONE.
 *
 * The catalog scan is only filtered on contype (no index is used); the
 * constraint name and the owning relation are checked for each tuple.
 */
bool
ConstraintIsAForeignKeyToReferenceTable(char *constraintName, Oid relationId)
{
	Relation pgConstraint = NULL;
	SysScanDesc scanDescriptor = NULL;
	ScanKeyData scanKey[1];
	int scanKeyCount = 1;
	HeapTuple heapTuple = NULL;
	bool foreignKeyToReferenceTable = false;

	pgConstraint = heap_open(ConstraintRelationId, AccessShareLock);

	ScanKeyInit(&scanKey[0], Anum_pg_constraint_contype, BTEqualStrategyNumber,
				F_CHAREQ, CharGetDatum(CONSTRAINT_FOREIGN));
	scanDescriptor = systable_beginscan(pgConstraint, InvalidOid, false,
										NULL, scanKeyCount, scanKey);

	for (heapTuple = systable_getnext(scanDescriptor);
		 HeapTupleIsValid(heapTuple);
		 heapTuple = systable_getnext(scanDescriptor))
	{
		Oid referencedTableId = InvalidOid;
		Form_pg_constraint constraintForm = (Form_pg_constraint) GETSTRUCT(heapTuple);

		/*
		 * The name stored in the catalog tuple must not reuse the parameter's
		 * identifier; otherwise the comparison below would compare a string
		 * with itself and every foreign key of the relation would match.
		 */
		char *tupleConstraintName = (constraintForm->conname).data;

		/* skip foreign keys with another name or on another relation */
		if (strncmp(tupleConstraintName, constraintName, NAMEDATALEN) != 0 ||
			constraintForm->conrelid != relationId)
		{
			continue;
		}

		referencedTableId = constraintForm->confrelid;

		Assert(IsDistributedTable(referencedTableId));

		if (PartitionMethod(referencedTableId) == DISTRIBUTE_BY_NONE)
		{
			foreignKeyToReferenceTable = true;
			break;
		}
	}

	/* clean up scan and close system catalog */
	systable_endscan(scanDescriptor);
	heap_close(pgConstraint, AccessShareLock);

	return foreignKeyToReferenceTable;
}
/*
 * HasForeignKeyToReferenceTable scans pg_constraint for the constraints that
 * are defined on the given relationId and returns true if at least one of
 * them is a foreign key whose referenced table is a reference table, i.e. a
 * distributed table with partition method DISTRIBUTE_BY_NONE.
 *
 * Constraints that are not foreign keys, and foreign keys whose referenced
 * table is not a distributed table, are skipped.
 */
bool
HasForeignKeyToReferenceTable(Oid relationId)
{
	Relation pgConstraint = NULL;
	SysScanDesc scanDescriptor = NULL;
	ScanKeyData scanKey[1];
	int scanKeyCount = 1;
	HeapTuple heapTuple = NULL;
	bool hasForeignKeyToReferenceTable = false;

	pgConstraint = heap_open(ConstraintRelationId, AccessShareLock);

	ScanKeyInit(&scanKey[0], Anum_pg_constraint_conrelid, BTEqualStrategyNumber,
				F_OIDEQ, relationId);
	scanDescriptor = systable_beginscan(pgConstraint, ConstraintRelidIndexId, true,
										NULL, scanKeyCount, scanKey);

	/*
	 * The next tuple is fetched in the loop header so that every "continue"
	 * below moves on to the next constraint. Skipping a tuple without
	 * fetching the next one would examine the same tuple forever.
	 */
	for (heapTuple = systable_getnext(scanDescriptor);
		 HeapTupleIsValid(heapTuple);
		 heapTuple = systable_getnext(scanDescriptor))
	{
		Oid referencedTableId = InvalidOid;
		Form_pg_constraint constraintForm = (Form_pg_constraint) GETSTRUCT(heapTuple);

		if (constraintForm->contype != CONSTRAINT_FOREIGN)
		{
			continue;
		}

		referencedTableId = constraintForm->confrelid;

		/* foreign keys to non-distributed tables are not of interest */
		if (!IsDistributedTable(referencedTableId))
		{
			continue;
		}

		if (PartitionMethod(referencedTableId) == DISTRIBUTE_BY_NONE)
		{
			hasForeignKeyToReferenceTable = true;
			break;
		}
	}

	/*
	 * Clean up scan and close system catalog. NoLock closes the catalog
	 * without releasing the AccessShareLock acquired above.
	 */
	systable_endscan(scanDescriptor);
	heap_close(pgConstraint, NoLock);

	return hasForeignKeyToReferenceTable;
}
/*
 * ColumnAppearsInForeignKeyToReferenceTable returns true if the given column
 * of the given relation takes part in a foreign key constraint whose
 * referenced table is a reference table. Both directions are considered: the
 * relation may be the referencing side (conkey is inspected) or the
 * referenced side (confkey is inspected) of the constraint.
 *
 * All foreign key tuples of pg_constraint are visited; the ones that do not
 * involve relationId are ignored.
 */
bool
ColumnAppearsInForeignKeyToReferenceTable(char *columnName, Oid relationId)
{
	ScanKeyData foreignKeyScanKey[1];
	int foreignKeyScanKeyCount = 1;
	Relation constraintCatalog = NULL;
	SysScanDesc constraintScan = NULL;
	HeapTuple constraintTuple = NULL;
	bool columnFound = false;

	constraintCatalog = heap_open(ConstraintRelationId, AccessShareLock);

	ScanKeyInit(&foreignKeyScanKey[0], Anum_pg_constraint_contype,
				BTEqualStrategyNumber, F_CHAREQ, CharGetDatum(CONSTRAINT_FOREIGN));
	constraintScan = systable_beginscan(constraintCatalog, InvalidOid, false, NULL,
										foreignKeyScanKeyCount, foreignKeyScanKey);

	for (constraintTuple = systable_getnext(constraintScan);
		 HeapTupleIsValid(constraintTuple);
		 constraintTuple = systable_getnext(constraintScan))
	{
		Form_pg_constraint foreignKeyForm =
			(Form_pg_constraint) GETSTRUCT(constraintTuple);
		Oid referencedRelationId = foreignKeyForm->confrelid;
		Oid referencingRelationId = foreignKeyForm->conrelid;
		int keyAttributeNumber = 0;

		/* decide which side of the constraint holds the relation's columns */
		if (referencedRelationId == relationId)
		{
			keyAttributeNumber = Anum_pg_constraint_confkey;
		}
		else if (referencingRelationId == relationId)
		{
			keyAttributeNumber = Anum_pg_constraint_conkey;
		}
		else
		{
			/* the constraint is neither from nor to the given relation */
			continue;
		}

		/*
		 * Only constraints that point to a reference table matter. A foreign
		 * constraint from a distributed table to a local table cannot exist,
		 * hence the assertion.
		 */
		Assert(IsDistributedTable(referencedRelationId));
		if (PartitionMethod(referencedRelationId) != DISTRIBUTE_BY_NONE)
		{
			continue;
		}

		if (HeapTupleOfForeignConstraintIncludesColumn(constraintTuple, relationId,
													   keyAttributeNumber, columnName))
		{
			columnFound = true;
			break;
		}
	}

	/* finish the scan and close the catalog */
	systable_endscan(constraintScan);
	heap_close(constraintCatalog, AccessShareLock);

	return columnFound;
}
/*
 * ErrorIfUnsupportedForeignConstraint runs checks related to foreign constraints and
 * errors out if it is not possible to create one of the foreign constraints in a
 * distributed environment. Every foreign key defined on the given relation is
 * checked independently of the others.
 *
 * To support foreign constraints, we require that;
 * - If referencing and referenced tables are hash-distributed
 *		- Referencing and referenced tables are co-located.
 *		- Foreign constraint is defined over distribution column.
 *		- ON DELETE/UPDATE SET NULL, ON DELETE/UPDATE SET DEFAULT and ON UPDATE CASCADE
 *		  options are not used.
 *		- Replication factors of referencing and referenced table are 1.
 * - If referenced table is a reference table
 *		- ON DELETE/UPDATE SET NULL, ON DELETE/UPDATE SET DEFAULT and ON UPDATE CASCADE
 *		  options are not used on the distribution key of the referencing column.
 * - If referencing table is a reference table, error out
 *
 * Parameters:
 *   relation           - the referencing relation whose constraints are checked
 *   distributionMethod - partition method the relation has or is about to get
 *   distributionColumn - distribution column of the referencing relation
 *   colocationId       - colocation id of the referencing relation
 *
 * The function returns normally only if all foreign keys are supported.
 */
void
ErrorIfUnsupportedForeignConstraint(Relation relation, char distributionMethod,
									Var *distributionColumn, uint32 colocationId)
{
	Relation pgConstraint = NULL;
	SysScanDesc scanDescriptor = NULL;
	ScanKeyData scanKey[1];
	int scanKeyCount = 1;
	HeapTuple heapTuple = NULL;

	Oid referencingTableId = relation->rd_id;
	Oid referencedTableId = InvalidOid;
	uint32 referencedTableColocationId = INVALID_COLOCATION_ID;
	Var *referencedTablePartitionColumn = NULL;

	Datum referencingColumnsDatum = 0;
	Datum *referencingColumnArray = NULL;
	int referencingColumnCount = 0;
	Datum referencedColumnsDatum = 0;
	Datum *referencedColumnArray = NULL;
	int referencedColumnCount = 0;
	bool isNull = false;
	int attrIdx = 0;
	bool selfReferencingTable = false;

	pgConstraint = heap_open(ConstraintRelationId, AccessShareLock);
	ScanKeyInit(&scanKey[0], Anum_pg_constraint_conrelid, BTEqualStrategyNumber,
				F_OIDEQ, relation->rd_id);
	scanDescriptor = systable_beginscan(pgConstraint, ConstraintRelidIndexId, true,
										NULL, scanKeyCount, scanKey);

	for (heapTuple = systable_getnext(scanDescriptor);
		 HeapTupleIsValid(heapTuple);
		 heapTuple = systable_getnext(scanDescriptor))
	{
		Form_pg_constraint constraintForm = (Form_pg_constraint) GETSTRUCT(heapTuple);
		bool singleReplicatedTable = true;

		/*
		 * These flags describe the constraint that is currently inspected, so
		 * they must start from scratch for every constraint. Otherwise a
		 * property of one foreign key (e.g. that it references a reference
		 * table) would leak into the checks of the following foreign keys.
		 */
		bool foreignConstraintOnPartitionColumn = false;
		bool referencedTableIsAReferenceTable = false;
		bool referencingColumnsIncludeDistKey = false;

		if (constraintForm->contype != CONSTRAINT_FOREIGN)
		{
			continue;
		}

		/*
		 * We should make this check in this loop because the error message will only
		 * be given if the table has a foreign constraint and the table is a reference
		 * table.
		 */
		if (distributionMethod == DISTRIBUTE_BY_NONE)
		{
			ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
							errmsg("cannot create foreign key constraint because "
								   "reference tables are not supported as the "
								   "referencing table of a foreign constraint"),
							errdetail("Reference tables are only supported as the "
									  "referenced table of a foreign key when the "
									  "referencing table is a hash distributed "
									  "table")));
		}

		referencedTableId = constraintForm->confrelid;
		selfReferencingTable = referencingTableId == referencedTableId;

		/*
		 * Some checks are not meaningful if foreign key references the table itself.
		 * Therefore we will skip those checks.
		 */
		if (!selfReferencingTable)
		{
			if (!IsDistributedTable(referencedTableId))
			{
				ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
								errmsg("cannot create foreign key constraint"),
								errdetail("Referenced table must be a distributed "
										  "table.")));
			}

			/*
			 * PartitionMethod errors out when it is called for non-distributed
			 * tables. This is why we make this check under !selfReferencingTable
			 * and after !IsDistributedTable(referencedTableId).
			 */
			if (PartitionMethod(referencedTableId) == DISTRIBUTE_BY_NONE)
			{
				referencedTableIsAReferenceTable = true;
			}

			/*
			 * To enforce foreign constraints, tables must be co-located unless a
			 * reference table is referenced.
			 */
			referencedTableColocationId = TableColocationId(referencedTableId);
			if (colocationId == INVALID_COLOCATION_ID ||
				(colocationId != referencedTableColocationId &&
				 !referencedTableIsAReferenceTable))
			{
				ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
								errmsg("cannot create foreign key constraint since "
									   "relations are not colocated or not referencing "
									   "a reference table"),
								errdetail(
									"A distributed table can only have foreign keys "
									"if it is referencing another colocated hash "
									"distributed table or a reference table")));
			}

			referencedTablePartitionColumn = DistPartitionKey(referencedTableId);
		}
		else
		{
			/*
			 * The table references itself, so the distribution column of the
			 * referenced side is the distribution column of the referencing
			 * side.
			 */
			referencedTablePartitionColumn = distributionColumn;
		}

		/*
		 * Column attributes are not available in Form_pg_constraint, therefore we need
		 * to find them in the system catalog. After finding them, we iterate over column
		 * attributes together because partition column must be at the same place in both
		 * referencing and referenced side of the foreign key constraint
		 */
		referencingColumnsDatum = SysCacheGetAttr(CONSTROID, heapTuple,
												  Anum_pg_constraint_conkey, &isNull);
		referencedColumnsDatum = SysCacheGetAttr(CONSTROID, heapTuple,
												 Anum_pg_constraint_confkey, &isNull);

		deconstruct_array(DatumGetArrayTypeP(referencingColumnsDatum), INT2OID, 2, true,
						  's', &referencingColumnArray, NULL, &referencingColumnCount);
		deconstruct_array(DatumGetArrayTypeP(referencedColumnsDatum), INT2OID, 2, true,
						  's', &referencedColumnArray, NULL, &referencedColumnCount);

		Assert(referencingColumnCount == referencedColumnCount);

		for (attrIdx = 0; attrIdx < referencingColumnCount; ++attrIdx)
		{
			AttrNumber referencingAttrNo = DatumGetInt16(referencingColumnArray[attrIdx]);
			AttrNumber referencedAttrNo = DatumGetInt16(referencedColumnArray[attrIdx]);

			/*
			 * The reference table test comes first so that the partition
			 * column of the referenced table is only dereferenced when the
			 * referenced table is not a reference table.
			 */
			if (distributionColumn->varattno == referencingAttrNo &&
				(!referencedTableIsAReferenceTable &&
				 referencedTablePartitionColumn->varattno == referencedAttrNo))
			{
				foreignConstraintOnPartitionColumn = true;
			}

			if (distributionColumn->varattno == referencingAttrNo)
			{
				referencingColumnsIncludeDistKey = true;
			}
		}

		/*
		 * If columns in the foreign key includes the distribution key from the
		 * referencing side, we do not allow update/delete operations through
		 * foreign key constraints (e.g. ... ON UPDATE SET NULL)
		 */
		if (referencingColumnsIncludeDistKey)
		{
			/*
			 * ON DELETE SET NULL and ON DELETE SET DEFAULT is not supported. Because we do
			 * not want to set partition column to NULL or default value.
			 */
			if (constraintForm->confdeltype == FKCONSTR_ACTION_SETNULL ||
				constraintForm->confdeltype == FKCONSTR_ACTION_SETDEFAULT)
			{
				ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
								errmsg("cannot create foreign key constraint"),
								errdetail("SET NULL or SET DEFAULT is not supported"
										  " in ON DELETE operation when distribution "
										  "key is included in the foreign key constraint")));
			}

			/*
			 * ON UPDATE SET NULL, ON UPDATE SET DEFAULT and UPDATE CASCADE is not supported.
			 * Because we do not want to set partition column to NULL or default value. Also
			 * cascading update operation would require re-partitioning. Updating partition
			 * column value is not allowed anyway even outside of foreign key concept.
			 */
			if (constraintForm->confupdtype == FKCONSTR_ACTION_SETNULL ||
				constraintForm->confupdtype == FKCONSTR_ACTION_SETDEFAULT ||
				constraintForm->confupdtype == FKCONSTR_ACTION_CASCADE)
			{
				ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
								errmsg("cannot create foreign key constraint"),
								errdetail("SET NULL, SET DEFAULT or CASCADE is not "
										  "supported in ON UPDATE operation when "
										  "distribution key included in the foreign "
										  "constraint.")));
			}
		}

		/*
		 * if tables are hash-distributed and colocated, we need to make sure that
		 * the distribution key is included in foreign constraint.
		 */
		if (!referencedTableIsAReferenceTable && !foreignConstraintOnPartitionColumn)
		{
			ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
							errmsg("cannot create foreign key constraint"),
							errdetail("Foreign keys are supported in two cases, "
									  "either in between two colocated tables including "
									  "partition column in the same ordinal in the both "
									  "tables or from distributed to reference tables")));
		}

		/*
		 * We do not allow to create foreign constraints if shard replication factor is
		 * greater than 1. Because in our current design, multiple replicas may cause
		 * locking problems and inconsistent shard contents.
		 *
		 * Note that we allow referenced table to be a reference table (e.g., not a
		 * single replicated table). This is allowed since (a) we are sure that
		 * placements always be in the same state (b) executors are aware of reference
		 * tables and handle concurrency related issues accordingly.
		 */
		if (IsDistributedTable(referencingTableId))
		{
			/* check whether ALTER TABLE command is applied over single replicated table */
			if (!SingleReplicatedTable(referencingTableId))
			{
				singleReplicatedTable = false;
			}
		}
		else
		{
			Assert(distributionMethod == DISTRIBUTE_BY_HASH);

			/* check whether creating single replicated table with foreign constraint */
			if (ShardReplicationFactor > 1)
			{
				singleReplicatedTable = false;
			}
		}

		if (!singleReplicatedTable)
		{
			ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
							errmsg("cannot create foreign key constraint"),
							errdetail("Citus Community Edition currently supports "
									  "foreign key constraints only for "
									  "\"citus.shard_replication_factor = 1\"."),
							errhint("Please change \"citus.shard_replication_factor to "
									"1\". To learn more about using foreign keys with "
									"other replication factors, please contact us at "
									"https://citusdata.com/about/contact_us.")));
		}
	}

	/* clean up scan and close system catalog */
	systable_endscan(scanDescriptor);
	heap_close(pgConstraint, AccessShareLock);
}
/* * master_create_empty_shard creates an empty shard for the given distributed * table. For this, the function first gets a list of candidate nodes, connects * to these nodes, and issues DDL commands on the nodes to create empty shard * placements. The function then updates metadata on the master node to make * this shard (and its placements) visible. */ Datum master_create_empty_shard(PG_FUNCTION_ARGS) { text *relationNameText = PG_GETARG_TEXT_P(0); char *relationName = text_to_cstring(relationNameText); Datum shardIdDatum = 0; int64 shardId = INVALID_SHARD_ID; List *ddlEventList = NULL; uint32 attemptableNodeCount = 0; uint32 liveNodeCount = 0; uint32 candidateNodeCount = 0; List *candidateNodeList = NIL; text *nullMinValue = NULL; text *nullMaxValue = NULL; char partitionMethod = 0; char storageType = SHARD_STORAGE_TABLE; Oid relationId = ResolveRelationId(relationNameText); char *relationOwner = TableOwner(relationId); EnsureTablePermissions(relationId, ACL_INSERT); CheckDistributedTable(relationId); if (CStoreTable(relationId)) { storageType = SHARD_STORAGE_COLUMNAR; } partitionMethod = PartitionMethod(relationId); if (partitionMethod == DISTRIBUTE_BY_HASH) { ereport(ERROR, (errmsg("relation \"%s\" is a hash partitioned table", relationName), errdetail("We currently don't support creating shards " "on hash-partitioned tables"))); } /* generate new and unique shardId from sequence */ shardIdDatum = master_get_new_shardid(NULL); shardId = DatumGetInt64(shardIdDatum); /* get table DDL commands to replay on the worker node */ ddlEventList = GetTableDDLEvents(relationId); /* if enough live nodes, add an extra candidate node as backup */ attemptableNodeCount = ShardReplicationFactor; liveNodeCount = WorkerGetLiveNodeCount(); if (liveNodeCount > ShardReplicationFactor) { attemptableNodeCount = ShardReplicationFactor + 1; } /* first retrieve a list of random nodes for shard placements */ while (candidateNodeCount < attemptableNodeCount) { WorkerNode *candidateNode = 
WorkerGetCandidateNode(candidateNodeList); if (candidateNode == NULL) { ereport(ERROR, (errmsg("could only find %u of %u possible nodes", candidateNodeCount, attemptableNodeCount))); } candidateNodeList = lappend(candidateNodeList, candidateNode); candidateNodeCount++; } CreateShardPlacements(shardId, ddlEventList, relationOwner, candidateNodeList, 0, ShardReplicationFactor); InsertShardRow(relationId, shardId, storageType, nullMinValue, nullMaxValue); PG_RETURN_INT64(shardId); }
/* * master_append_table_to_shard appends the given table's contents to the given * shard, and updates shard metadata on the master node. If the function fails * to append table data to all shard placements, it doesn't update any metadata * and errors out. Else if the function fails to append table data to some of * the shard placements, it marks those placements as invalid. These invalid * placements will get cleaned up during shard rebalancing. */ Datum master_append_table_to_shard(PG_FUNCTION_ARGS) { uint64 shardId = PG_GETARG_INT64(0); text *sourceTableNameText = PG_GETARG_TEXT_P(1); text *sourceNodeNameText = PG_GETARG_TEXT_P(2); uint32 sourceNodePort = PG_GETARG_UINT32(3); char *sourceTableName = text_to_cstring(sourceTableNameText); char *sourceNodeName = text_to_cstring(sourceNodeNameText); char *shardName = NULL; List *shardPlacementList = NIL; List *succeededPlacementList = NIL; List *failedPlacementList = NIL; ListCell *shardPlacementCell = NULL; ListCell *failedPlacementCell = NULL; uint64 newShardSize = 0; uint64 shardMaxSizeInBytes = 0; float4 shardFillLevel = 0.0; char partitionMethod = 0; ShardInterval *shardInterval = LoadShardInterval(shardId); Oid relationId = shardInterval->relationId; bool cstoreTable = CStoreTable(relationId); char storageType = shardInterval->storageType; EnsureTablePermissions(relationId, ACL_INSERT); if (storageType != SHARD_STORAGE_TABLE && !cstoreTable) { ereport(ERROR, (errmsg("cannot append to shardId " UINT64_FORMAT, shardId), errdetail("The underlying shard is not a regular table"))); } partitionMethod = PartitionMethod(relationId); if (partitionMethod == DISTRIBUTE_BY_HASH) { ereport(ERROR, (errmsg("cannot append to shardId " UINT64_FORMAT, shardId), errdetail("We currently don't support appending to shards " "in hash-partitioned tables"))); } /* * We lock on the shardId, but do not unlock. When the function returns, and * the transaction for this function commits, this lock will automatically * be released. 
This ensures appends to a shard happen in a serial manner. */ LockShardResource(shardId, AccessExclusiveLock); /* if shard doesn't have an alias, extend regular table name */ shardName = LoadShardAlias(relationId, shardId); if (shardName == NULL) { shardName = get_rel_name(relationId); AppendShardIdToName(&shardName, shardId); } shardPlacementList = FinalizedShardPlacementList(shardId); if (shardPlacementList == NIL) { ereport(ERROR, (errmsg("could not find any shard placements for shardId " UINT64_FORMAT, shardId), errhint("Try running master_create_empty_shard() first"))); } /* issue command to append table to each shard placement */ foreach(shardPlacementCell, shardPlacementList) { ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(shardPlacementCell); char *workerName = shardPlacement->nodeName; uint32 workerPort = shardPlacement->nodePort; List *queryResultList = NIL; StringInfo workerAppendQuery = makeStringInfo(); appendStringInfo(workerAppendQuery, WORKER_APPEND_TABLE_TO_SHARD, quote_literal_cstr(shardName), quote_literal_cstr(sourceTableName), quote_literal_cstr(sourceNodeName), sourceNodePort); /* inserting data should be performed by the current user */ queryResultList = ExecuteRemoteQuery(workerName, workerPort, NULL, workerAppendQuery); if (queryResultList != NIL) { succeededPlacementList = lappend(succeededPlacementList, shardPlacement); } else { failedPlacementList = lappend(failedPlacementList, shardPlacement); } }
/* * master_create_empty_shard creates an empty shard for the given distributed * table. For this, the function first gets a list of candidate nodes, connects * to these nodes, and issues DDL commands on the nodes to create empty shard * placements. The function then updates metadata on the master node to make * this shard (and its placements) visible. */ Datum master_create_empty_shard(PG_FUNCTION_ARGS) { text *relationNameText = PG_GETARG_TEXT_P(0); char *relationName = text_to_cstring(relationNameText); List *workerNodeList = WorkerNodeList(); Datum shardIdDatum = 0; int64 shardId = INVALID_SHARD_ID; List *ddlEventList = NULL; uint32 attemptableNodeCount = 0; uint32 liveNodeCount = 0; uint32 candidateNodeIndex = 0; List *candidateNodeList = NIL; text *nullMinValue = NULL; text *nullMaxValue = NULL; char partitionMethod = 0; char storageType = SHARD_STORAGE_TABLE; Oid relationId = ResolveRelationId(relationNameText); char relationKind = get_rel_relkind(relationId); char *relationOwner = TableOwner(relationId); EnsureTablePermissions(relationId, ACL_INSERT); CheckDistributedTable(relationId); /* * We check whether the table is a foreign table or not. If it is, we set * storage type as foreign also. Only exception is if foreign table is a * foreign cstore table, in this case we set storage type as columnar. * * i.e. While setting storage type, columnar has priority over foreign. 
*/ if (relationKind == RELKIND_FOREIGN_TABLE) { bool cstoreTable = cstoreTable = CStoreTable(relationId); if (cstoreTable) { storageType = SHARD_STORAGE_COLUMNAR; } else { storageType = SHARD_STORAGE_FOREIGN; } } partitionMethod = PartitionMethod(relationId); if (partitionMethod == DISTRIBUTE_BY_HASH) { ereport(ERROR, (errmsg("relation \"%s\" is a hash partitioned table", relationName), errdetail("We currently don't support creating shards " "on hash-partitioned tables"))); } /* generate new and unique shardId from sequence */ shardIdDatum = master_get_new_shardid(NULL); shardId = DatumGetInt64(shardIdDatum); /* get table DDL commands to replay on the worker node */ ddlEventList = GetTableDDLEvents(relationId); /* if enough live nodes, add an extra candidate node as backup */ attemptableNodeCount = ShardReplicationFactor; liveNodeCount = WorkerGetLiveNodeCount(); if (liveNodeCount > ShardReplicationFactor) { attemptableNodeCount = ShardReplicationFactor + 1; } /* first retrieve a list of random nodes for shard placements */ while (candidateNodeIndex < attemptableNodeCount) { WorkerNode *candidateNode = NULL; if (ShardPlacementPolicy == SHARD_PLACEMENT_LOCAL_NODE_FIRST) { candidateNode = WorkerGetLocalFirstCandidateNode(candidateNodeList); } else if (ShardPlacementPolicy == SHARD_PLACEMENT_ROUND_ROBIN) { candidateNode = WorkerGetRoundRobinCandidateNode(workerNodeList, shardId, candidateNodeIndex); } else if (ShardPlacementPolicy == SHARD_PLACEMENT_RANDOM) { candidateNode = WorkerGetRandomCandidateNode(candidateNodeList); } else { ereport(ERROR, (errmsg("unrecognized shard placement policy"))); } if (candidateNode == NULL) { ereport(ERROR, (errmsg("could only find %u of %u possible nodes", candidateNodeIndex, attemptableNodeCount))); } candidateNodeList = lappend(candidateNodeList, candidateNode); candidateNodeIndex++; } CreateShardPlacements(relationId, shardId, ddlEventList, relationOwner, candidateNodeList, 0, ShardReplicationFactor); InsertShardRow(relationId, 
shardId, storageType, nullMinValue, nullMaxValue); PG_RETURN_INT64(shardId); }
/* * master_apply_delete_command takes in a delete command, finds shards that * match the criteria defined in the delete command, drops the found shards from * the worker nodes, and updates the corresponding metadata on the master node. * This function drops a shard if and only if all rows in the shard satisfy * the conditions in the delete command. Note that this function only accepts * conditions on the partition key and if no condition is provided then all * shards are deleted. * * We mark shard placements that we couldn't drop as to be deleted later. If a * shard satisfies the given conditions, we delete it from shard metadata table * even though related shard placements are not deleted. */ Datum master_apply_delete_command(PG_FUNCTION_ARGS) { text *queryText = PG_GETARG_TEXT_P(0); char *queryString = text_to_cstring(queryText); char *relationName = NULL; char *schemaName = NULL; Oid relationId = InvalidOid; List *shardIntervalList = NIL; List *deletableShardIntervalList = NIL; List *queryTreeList = NIL; Query *deleteQuery = NULL; Node *whereClause = NULL; Node *deleteCriteria = NULL; Node *queryTreeNode = NULL; DeleteStmt *deleteStatement = NULL; int droppedShardCount = 0; LOCKMODE lockMode = 0; char partitionMethod = 0; bool failOK = false; #if (PG_VERSION_NUM >= 100000) RawStmt *rawStmt = (RawStmt *) ParseTreeRawStmt(queryString); queryTreeNode = rawStmt->stmt; #else queryTreeNode = ParseTreeNode(queryString); #endif EnsureCoordinator(); CheckCitusVersion(ERROR); if (!IsA(queryTreeNode, DeleteStmt)) { ereport(ERROR, (errmsg("query \"%s\" is not a delete statement", queryString))); } deleteStatement = (DeleteStmt *) queryTreeNode; schemaName = deleteStatement->relation->schemaname; relationName = deleteStatement->relation->relname; /* * We take an exclusive lock while dropping shards to prevent concurrent * writes. We don't want to block SELECTs, which means queries might fail * if they access a shard that has just been dropped. 
*/ lockMode = ExclusiveLock; relationId = RangeVarGetRelid(deleteStatement->relation, lockMode, failOK); /* schema-prefix if it is not specified already */ if (schemaName == NULL) { Oid schemaId = get_rel_namespace(relationId); schemaName = get_namespace_name(schemaId); } CheckDistributedTable(relationId); EnsureTablePermissions(relationId, ACL_DELETE); #if (PG_VERSION_NUM >= 100000) queryTreeList = pg_analyze_and_rewrite(rawStmt, queryString, NULL, 0, NULL); #else queryTreeList = pg_analyze_and_rewrite(queryTreeNode, queryString, NULL, 0); #endif deleteQuery = (Query *) linitial(queryTreeList); CheckTableCount(deleteQuery); /* get where clause and flatten it */ whereClause = (Node *) deleteQuery->jointree->quals; deleteCriteria = eval_const_expressions(NULL, whereClause); partitionMethod = PartitionMethod(relationId); if (partitionMethod == DISTRIBUTE_BY_HASH) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot delete from hash distributed table with this " "command"), errdetail("Delete statements on hash-partitioned tables " "are not supported with master_apply_delete_command."), errhint("Use master_modify_multiple_shards command instead."))); } else if (partitionMethod == DISTRIBUTE_BY_NONE) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot delete from distributed table"), errdetail("Delete statements on reference tables " "are not supported."))); } CheckDeleteCriteria(deleteCriteria); CheckPartitionColumn(relationId, deleteCriteria); shardIntervalList = LoadShardIntervalList(relationId); /* drop all shards if where clause is not present */ if (deleteCriteria == NULL) { deletableShardIntervalList = shardIntervalList; ereport(DEBUG2, (errmsg("dropping all shards for \"%s\"", relationName))); } else { deletableShardIntervalList = ShardsMatchingDeleteCriteria(relationId, shardIntervalList, deleteCriteria); } droppedShardCount = DropShards(relationId, schemaName, relationName, deletableShardIntervalList); 
PG_RETURN_INT32(droppedShardCount); }
/*
 * CitusCopyFrom implements the COPY table_name FROM ... for hash-partitioned
 * and range-partitioned tables.
 *
 * The function reads rows from the COPY data source one at a time, looks up
 * the shard interval that matches the row's partition column value, and
 * forwards the row in binary COPY format to every connection registered for
 * that shard. Connections for a shard are opened the first time a row is
 * routed to it. Once the input is exhausted, the binary footers are sent, the
 * remote COPY is ended and, when CopyTransactionManager is
 * TRANSACTION_MANAGER_2PC, the remote transactions are prepared. The remote
 * transactions are committed only after the PG_TRY block has completed
 * without an error.
 *
 * copyStatement is the COPY statement being executed; its relation, filename,
 * is_program, attlist and options fields are read here.
 *
 * completionTag, when not NULL, receives the string "COPY <row count>",
 * written with a bound of COMPLETION_TAG_BUFSIZE bytes.
 *
 * Errors are raised with ereport(ERROR) when:
 *  - a non-superuser copies from a file or an external program,
 *  - the table is neither hash- nor range-partitioned,
 *  - the table has no shards,
 *  - a row has a NULL value in the partition column,
 *  - no shard interval is found for a partition column value.
 * Any error raised inside the PG_TRY block ends the remote COPY as failed,
 * aborts the remote transactions, closes the connections, and is re-thrown.
 */
void
CitusCopyFrom(CopyStmt *copyStatement, char *completionTag)
{
	/* no lock is requested here; the relation is locked by heap_open below */
	Oid tableId = RangeVarGetRelid(copyStatement->relation, NoLock, false);
	char *relationName = get_rel_name(tableId);
	Relation distributedRelation = NULL;
	char partitionMethod = '\0';
	Var *partitionColumn = NULL;
	TupleDesc tupleDescriptor = NULL;
	uint32 columnCount = 0;
	Datum *columnValues = NULL;
	bool *columnNulls = NULL;
	TypeCacheEntry *typeEntry = NULL;
	FmgrInfo *hashFunction = NULL;
	FmgrInfo *compareFunction = NULL;

	int shardCount = 0;
	List *shardIntervalList = NULL;
	ShardInterval **shardIntervalCache = NULL;
	bool useBinarySearch = false;

	HTAB *shardConnectionHash = NULL;
	ShardConnections *shardConnections = NULL;
	List *connectionList = NIL;

	EState *executorState = NULL;
	MemoryContext executorTupleContext = NULL;
	ExprContext *executorExpressionContext = NULL;

	CopyState copyState = NULL;
	CopyOutState copyOutState = NULL;
	FmgrInfo *columnOutputFunctions = NULL;
	uint64 processedRowCount = 0;

	/* disallow COPY to/from file or program except for superusers */
	if (copyStatement->filename != NULL && !superuser())
	{
		if (copyStatement->is_program)
		{
			ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
							errmsg("must be superuser to COPY to or from an external program"),
							errhint("Anyone can COPY to stdout or from stdin. "
									"psql's \\copy command also works for anyone.")));
		}
		else
		{
			ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
							errmsg("must be superuser to COPY to or from a file"),
							errhint("Anyone can COPY to stdout or from stdin. "
									"psql's \\copy command also works for anyone.")));
		}
	}

	/* only hash- and range-partitioned tables are handled by this function */
	partitionColumn = PartitionColumn(tableId, 0);
	partitionMethod = PartitionMethod(tableId);
	if (partitionMethod != DISTRIBUTE_BY_RANGE && partitionMethod != DISTRIBUTE_BY_HASH)
	{
		ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						errmsg("COPY is only supported for hash- and "
							   "range-partitioned tables")));
	}

	/*
	 * resolve hash function for partition column
	 *
	 * NOTE(review): the lookup is done for range-partitioned tables as well;
	 * whether FindShardInterval uses the hash function in that case is not
	 * visible from here.
	 */
	typeEntry = lookup_type_cache(partitionColumn->vartype, TYPECACHE_HASH_PROC_FINFO);
	hashFunction = &(typeEntry->hash_proc_finfo);

	/* resolve compare function for shard intervals */
	compareFunction = ShardIntervalCompareFunction(partitionColumn, partitionMethod);

	/*
	 * allocate column values and nulls arrays, one entry per attribute in the
	 * tuple descriptor; the same arrays are reused for every input row
	 */
	distributedRelation = heap_open(tableId, RowExclusiveLock);
	tupleDescriptor = RelationGetDescr(distributedRelation);
	columnCount = tupleDescriptor->natts;
	columnValues = palloc0(columnCount * sizeof(Datum));
	columnNulls = palloc0(columnCount * sizeof(bool));

	/* load the list of shards and verify that we have shards to copy into */
	shardIntervalList = LoadShardIntervalList(tableId);
	if (shardIntervalList == NIL)
	{
		/* the two branches differ only in the hint given for hash tables */
		if (partitionMethod == DISTRIBUTE_BY_HASH)
		{
			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg("could not find any shards into which to copy"),
							errdetail("No shards exist for distributed table \"%s\".",
									  relationName),
							errhint("Run master_create_worker_shards to create shards "
									"and try again.")));
		}
		else
		{
			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg("could not find any shards into which to copy"),
							errdetail("No shards exist for distributed table \"%s\".",
									  relationName)));
		}
	}

	/* prevent concurrent placement changes and non-commutative DML statements */
	LockAllShards(shardIntervalList);

	/* initialize the shard interval cache */
	shardCount = list_length(shardIntervalList);
	shardIntervalCache = SortedShardIntervalArray(shardIntervalList);

	/*
	 * determine whether to use binary search: it is used for every table
	 * except a hash-partitioned one with a uniform hash distribution
	 */
	if (partitionMethod != DISTRIBUTE_BY_HASH ||
		!IsUniformHashDistribution(shardIntervalCache, shardCount))
	{
		useBinarySearch = true;
	}

	/* initialize copy state to read from COPY data source */
	copyState = BeginCopyFrom(distributedRelation,
							  copyStatement->filename,
							  copyStatement->is_program,
							  copyStatement->attlist,
							  copyStatement->options);

	/* rows are parsed in the executor state's per-tuple memory context */
	executorState = CreateExecutorState();
	executorTupleContext = GetPerTupleMemoryContext(executorState);
	executorExpressionContext = GetPerTupleExprContext(executorState);

	/*
	 * Set up the state used to serialize rows for the shard placements. Rows
	 * are always sent in binary format here.
	 *
	 * NOTE(review): presumably this requires every column type to have a
	 * binary output function; confirm against ColumnOutputFunctions.
	 */
	copyOutState = (CopyOutState) palloc0(sizeof(CopyOutStateData));
	copyOutState->binary = true;
	copyOutState->fe_msgbuf = makeStringInfo();
	copyOutState->rowcontext = executorTupleContext;

	columnOutputFunctions = ColumnOutputFunctions(tupleDescriptor, copyOutState->binary);

	/*
	 * Create a mapping of shard id to a connection for each of its placements.
	 * The hash should be initialized before the PG_TRY, since it is used in
	 * PG_CATCH. Otherwise, it may be undefined in the PG_CATCH (see sigsetjmp
	 * documentation).
	 */
	shardConnectionHash = CreateShardConnectionHash();

	/* we use a PG_TRY block to roll back on errors (e.g. in NextCopyFrom) */
	PG_TRY();
	{
		ErrorContextCallback errorCallback;

		/* set up callback to identify error line number */
		errorCallback.callback = CopyFromErrorCallback;
		errorCallback.arg = (void *) copyState;
		errorCallback.previous = error_context_stack;
		error_context_stack = &errorCallback;

		/* ensure transactions have unique names on worker nodes */
		InitializeDistributedTransaction();

		/* process one input row per iteration until the input is exhausted */
		while (true)
		{
			bool nextRowFound = false;
			Datum partitionColumnValue = 0;
			ShardInterval *shardInterval = NULL;
			int64 shardId = 0;
			bool shardConnectionsFound = false;
			MemoryContext oldContext = NULL;

			/* reset the per-tuple context before parsing the next row */
			ResetPerTupleExprContext(executorState);

			oldContext = MemoryContextSwitchTo(executorTupleContext);

			/* parse a row from the input */
			nextRowFound = NextCopyFrom(copyState, executorExpressionContext,
										columnValues, columnNulls, NULL);

			if (!nextRowFound)
			{
				/* end of input: restore the caller's memory context first */
				MemoryContextSwitchTo(oldContext);
				break;
			}

			CHECK_FOR_INTERRUPTS();

			/*
			 * find the partition column value; varattno is turned into a
			 * zero-based index into the column arrays
			 */
			if (columnNulls[partitionColumn->varattno - 1])
			{
				ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
								errmsg("cannot copy row with NULL value "
									   "in partition column")));
			}

			partitionColumnValue = columnValues[partitionColumn->varattno - 1];

			/* find the shard interval and id for the partition column value */
			shardInterval = FindShardInterval(partitionColumnValue, shardIntervalCache,
											  shardCount, partitionMethod,
											  compareFunction, hashFunction,
											  useBinarySearch);
			if (shardInterval == NULL)
			{
				ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
								errmsg("could not find shard for partition column "
									   "value")));
			}

			shardId = shardInterval->shardId;

			/*
			 * leave the per-tuple context before touching the connection
			 * hash, so the calls below do not run in memory that is reset on
			 * the next iteration
			 */
			MemoryContextSwitchTo(oldContext);

			/* get existing connections to the shard placements, if any */
			shardConnections = GetShardConnections(shardConnectionHash,
												   shardId,
												   &shardConnectionsFound);
			if (!shardConnectionsFound)
			{
				/* open connections and initiate COPY on shard placements */
				OpenCopyTransactions(copyStatement, shardConnections);

				/* send binary headers to shard placements */
				resetStringInfo(copyOutState->fe_msgbuf);
				AppendCopyBinaryHeaders(copyOutState);
				SendCopyDataToAll(copyOutState->fe_msgbuf,
								  shardConnections->connectionList);
			}

			/* replicate row to shard placements */
			resetStringInfo(copyOutState->fe_msgbuf);
			AppendCopyRowData(columnValues, columnNulls, tupleDescriptor,
							  copyOutState, columnOutputFunctions);
			SendCopyDataToAll(copyOutState->fe_msgbuf, shardConnections->connectionList);

			processedRowCount += 1;
		}

		/*
		 * collect the connections of all shards that received rows; this list
		 * is also used for the commit after the PG_TRY block
		 */
		connectionList = ConnectionList(shardConnectionHash);

		/* send binary footers to all shard placements */
		resetStringInfo(copyOutState->fe_msgbuf);
		AppendCopyBinaryFooters(copyOutState);
		SendCopyDataToAll(copyOutState->fe_msgbuf, connectionList);

		/* all lines have been copied, stop showing line number in errors */
		error_context_stack = errorCallback.previous;

		/* close the COPY input on all shard placements */
		EndRemoteCopy(connectionList, true);

		/* with two-phase commit, prepare the remote transactions first */
		if (CopyTransactionManager == TRANSACTION_MANAGER_2PC)
		{
			PrepareRemoteTransactions(connectionList);
		}

		/* NoLock: the lock taken by heap_open is not released at this point */
		EndCopyFrom(copyState);
		heap_close(distributedRelation, NoLock);

		/* check for cancellation one last time before committing */
		CHECK_FOR_INTERRUPTS();
	}
	PG_CATCH();
	{
		List *abortConnectionList = NIL;

		/* roll back all transactions */
		abortConnectionList = ConnectionList(shardConnectionHash);
		EndRemoteCopy(abortConnectionList, false);
		AbortRemoteTransactions(abortConnectionList);
		CloseConnections(abortConnectionList);

		PG_RE_THROW();
	}
	PG_END_TRY();

	/*
	 * Ready to commit the transaction, this code is below the PG_TRY block because
	 * we do not want any of the transactions rolled back if a failure occurs. Instead,
	 * they should be rolled forward.
	 *
	 * This point is reached only when the PG_TRY block finished without an
	 * error, since the PG_CATCH block always re-throws; connectionList was
	 * therefore assigned inside the PG_TRY block.
	 */
	CommitRemoteTransactions(connectionList);
	CloseConnections(connectionList);

	/* report the number of copied rows in the command completion tag */
	if (completionTag != NULL)
	{
		snprintf(completionTag, COMPLETION_TAG_BUFSIZE,
				 "COPY " UINT64_FORMAT, processedRowCount);
	}
}