/* * master_create_empty_shard creates an empty shard for the given distributed * table. For this, the function first gets a list of candidate nodes, connects * to these nodes, and issues DDL commands on the nodes to create empty shard * placements. The function then updates metadata on the master node to make * this shard (and its placements) visible. */ Datum master_create_empty_shard(PG_FUNCTION_ARGS) { text *relationNameText = PG_GETARG_TEXT_P(0); char *relationName = text_to_cstring(relationNameText); List *workerNodeList = WorkerNodeList(); Datum shardIdDatum = 0; int64 shardId = INVALID_SHARD_ID; List *ddlEventList = NULL; uint32 attemptableNodeCount = 0; uint32 liveNodeCount = 0; uint32 candidateNodeIndex = 0; List *candidateNodeList = NIL; text *nullMinValue = NULL; text *nullMaxValue = NULL; char partitionMethod = 0; char storageType = SHARD_STORAGE_TABLE; Oid relationId = ResolveRelationId(relationNameText); char relationKind = get_rel_relkind(relationId); char *relationOwner = TableOwner(relationId); EnsureTablePermissions(relationId, ACL_INSERT); CheckDistributedTable(relationId); /* * We check whether the table is a foreign table or not. If it is, we set * storage type as foreign also. Only exception is if foreign table is a * foreign cstore table, in this case we set storage type as columnar. * * i.e. While setting storage type, columnar has priority over foreign. 
*/ if (relationKind == RELKIND_FOREIGN_TABLE) { bool cstoreTable = cstoreTable = CStoreTable(relationId); if (cstoreTable) { storageType = SHARD_STORAGE_COLUMNAR; } else { storageType = SHARD_STORAGE_FOREIGN; } } partitionMethod = PartitionMethod(relationId); if (partitionMethod == DISTRIBUTE_BY_HASH) { ereport(ERROR, (errmsg("relation \"%s\" is a hash partitioned table", relationName), errdetail("We currently don't support creating shards " "on hash-partitioned tables"))); } /* generate new and unique shardId from sequence */ shardIdDatum = master_get_new_shardid(NULL); shardId = DatumGetInt64(shardIdDatum); /* get table DDL commands to replay on the worker node */ ddlEventList = GetTableDDLEvents(relationId); /* if enough live nodes, add an extra candidate node as backup */ attemptableNodeCount = ShardReplicationFactor; liveNodeCount = WorkerGetLiveNodeCount(); if (liveNodeCount > ShardReplicationFactor) { attemptableNodeCount = ShardReplicationFactor + 1; } /* first retrieve a list of random nodes for shard placements */ while (candidateNodeIndex < attemptableNodeCount) { WorkerNode *candidateNode = NULL; if (ShardPlacementPolicy == SHARD_PLACEMENT_LOCAL_NODE_FIRST) { candidateNode = WorkerGetLocalFirstCandidateNode(candidateNodeList); } else if (ShardPlacementPolicy == SHARD_PLACEMENT_ROUND_ROBIN) { candidateNode = WorkerGetRoundRobinCandidateNode(workerNodeList, shardId, candidateNodeIndex); } else if (ShardPlacementPolicy == SHARD_PLACEMENT_RANDOM) { candidateNode = WorkerGetRandomCandidateNode(candidateNodeList); } else { ereport(ERROR, (errmsg("unrecognized shard placement policy"))); } if (candidateNode == NULL) { ereport(ERROR, (errmsg("could only find %u of %u possible nodes", candidateNodeIndex, attemptableNodeCount))); } candidateNodeList = lappend(candidateNodeList, candidateNode); candidateNodeIndex++; } CreateShardPlacements(relationId, shardId, ddlEventList, relationOwner, candidateNodeList, 0, ShardReplicationFactor); InsertShardRow(relationId, 
shardId, storageType, nullMinValue, nullMaxValue); PG_RETURN_INT64(shardId); }
/* * master_create_empty_shard creates an empty shard for the given distributed * table. For this, the function first gets a list of candidate nodes, connects * to these nodes, and issues DDL commands on the nodes to create empty shard * placements. The function then updates metadata on the master node to make * this shard (and its placements) visible. */ Datum master_create_empty_shard(PG_FUNCTION_ARGS) { text *relationNameText = PG_GETARG_TEXT_P(0); char *relationName = text_to_cstring(relationNameText); Datum shardIdDatum = 0; int64 shardId = INVALID_SHARD_ID; List *ddlEventList = NULL; uint32 attemptableNodeCount = 0; uint32 liveNodeCount = 0; uint32 candidateNodeCount = 0; List *candidateNodeList = NIL; text *nullMinValue = NULL; text *nullMaxValue = NULL; char partitionMethod = 0; char storageType = SHARD_STORAGE_TABLE; Oid relationId = ResolveRelationId(relationNameText); char *relationOwner = TableOwner(relationId); EnsureTablePermissions(relationId, ACL_INSERT); CheckDistributedTable(relationId); if (CStoreTable(relationId)) { storageType = SHARD_STORAGE_COLUMNAR; } partitionMethod = PartitionMethod(relationId); if (partitionMethod == DISTRIBUTE_BY_HASH) { ereport(ERROR, (errmsg("relation \"%s\" is a hash partitioned table", relationName), errdetail("We currently don't support creating shards " "on hash-partitioned tables"))); } /* generate new and unique shardId from sequence */ shardIdDatum = master_get_new_shardid(NULL); shardId = DatumGetInt64(shardIdDatum); /* get table DDL commands to replay on the worker node */ ddlEventList = GetTableDDLEvents(relationId); /* if enough live nodes, add an extra candidate node as backup */ attemptableNodeCount = ShardReplicationFactor; liveNodeCount = WorkerGetLiveNodeCount(); if (liveNodeCount > ShardReplicationFactor) { attemptableNodeCount = ShardReplicationFactor + 1; } /* first retrieve a list of random nodes for shard placements */ while (candidateNodeCount < attemptableNodeCount) { WorkerNode *candidateNode = 
WorkerGetCandidateNode(candidateNodeList); if (candidateNode == NULL) { ereport(ERROR, (errmsg("could only find %u of %u possible nodes", candidateNodeCount, attemptableNodeCount))); } candidateNodeList = lappend(candidateNodeList, candidateNode); candidateNodeCount++; } CreateShardPlacements(shardId, ddlEventList, relationOwner, candidateNodeList, 0, ShardReplicationFactor); InsertShardRow(relationId, shardId, storageType, nullMinValue, nullMaxValue); PG_RETURN_INT64(shardId); }
/*
 * master_apply_delete_command takes in a delete command, finds shards that
 * match the criteria defined in the delete command, drops the found shards from
 * the worker nodes, and updates the corresponding metadata on the master node.
 * This function drops a shard if and only if all rows in the shard satisfy
 * the conditions in the delete command. Note that this function only accepts
 * conditions on the partition key and if no condition is provided then all
 * shards are deleted.
 *
 * We mark shard placements that we couldn't drop as to be deleted later. If a
 * shard satisfies the given conditions, we delete it from shard metadata table
 * even though related shard placements are not deleted.
 *
 * Argument 0: the DELETE statement as text. Returns the number of shards
 * dropped (int32). Errors out for non-DELETE input, hash-distributed tables,
 * and reference tables.
 */
Datum
master_apply_delete_command(PG_FUNCTION_ARGS)
{
	text *queryText = PG_GETARG_TEXT_P(0);
	char *queryString = text_to_cstring(queryText);
	char *relationName = NULL;
	char *schemaName = NULL;
	Oid relationId = InvalidOid;
	List *shardIntervalList = NIL;
	List *deletableShardIntervalList = NIL;
	List *queryTreeList = NIL;
	Query *deleteQuery = NULL;
	Node *whereClause = NULL;
	Node *deleteCriteria = NULL;
	Node *queryTreeNode = NULL;
	DeleteStmt *deleteStatement = NULL;
	int droppedShardCount = 0;
	LOCKMODE lockMode = 0;
	char partitionMethod = 0;
	bool failOK = false;

	/* PostgreSQL 10 wrapped raw parse trees in RawStmt; branch on version */
#if (PG_VERSION_NUM >= 100000)
	RawStmt *rawStmt = (RawStmt *) ParseTreeRawStmt(queryString);
	queryTreeNode = rawStmt->stmt;
#else
	queryTreeNode = ParseTreeNode(queryString);
#endif

	EnsureCoordinator();
	CheckCitusVersion(ERROR);

	/* only accept a single DELETE statement as input */
	if (!IsA(queryTreeNode, DeleteStmt))
	{
		ereport(ERROR, (errmsg("query \"%s\" is not a delete statement",
							   queryString)));
	}

	deleteStatement = (DeleteStmt *) queryTreeNode;

	schemaName = deleteStatement->relation->schemaname;
	relationName = deleteStatement->relation->relname;

	/*
	 * We take an exclusive lock while dropping shards to prevent concurrent
	 * writes. We don't want to block SELECTs, which means queries might fail
	 * if they access a shard that has just been dropped.
	 */
	lockMode = ExclusiveLock;

	relationId = RangeVarGetRelid(deleteStatement->relation, lockMode, failOK);

	/* schema-prefix if it is not specified already */
	if (schemaName == NULL)
	{
		Oid schemaId = get_rel_namespace(relationId);
		schemaName = get_namespace_name(schemaId);
	}

	CheckDistributedTable(relationId);
	EnsureTablePermissions(relationId, ACL_DELETE);

	/* run the statement through analysis/rewrite to get a Query tree */
#if (PG_VERSION_NUM >= 100000)
	queryTreeList = pg_analyze_and_rewrite(rawStmt, queryString, NULL, 0, NULL);
#else
	queryTreeList = pg_analyze_and_rewrite(queryTreeNode, queryString, NULL, 0);
#endif
	deleteQuery = (Query *) linitial(queryTreeList);
	CheckTableCount(deleteQuery);

	/* get where clause and flatten it */
	whereClause = (Node *) deleteQuery->jointree->quals;
	deleteCriteria = eval_const_expressions(NULL, whereClause);

	/* shard-dropping deletes are only supported for append/range partitioning */
	partitionMethod = PartitionMethod(relationId);
	if (partitionMethod == DISTRIBUTE_BY_HASH)
	{
		ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						errmsg("cannot delete from hash distributed table with this "
							   "command"),
						errdetail("Delete statements on hash-partitioned tables "
								  "are not supported with master_apply_delete_command."),
						errhint("Use master_modify_multiple_shards command instead.")));
	}
	else if (partitionMethod == DISTRIBUTE_BY_NONE)
	{
		ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						errmsg("cannot delete from distributed table"),
						errdetail("Delete statements on reference tables "
								  "are not supported.")));
	}

	/* criteria must reference only the partition column */
	CheckDeleteCriteria(deleteCriteria);
	CheckPartitionColumn(relationId, deleteCriteria);

	shardIntervalList = LoadShardIntervalList(relationId);

	/* drop all shards if where clause is not present */
	if (deleteCriteria == NULL)
	{
		deletableShardIntervalList = shardIntervalList;
		ereport(DEBUG2, (errmsg("dropping all shards for \"%s\"", relationName)));
	}
	else
	{
		deletableShardIntervalList = ShardsMatchingDeleteCriteria(relationId,
																  shardIntervalList,
																  deleteCriteria);
	}

	droppedShardCount = DropShards(relationId, schemaName, relationName,
								   deletableShardIntervalList);

	PG_RETURN_INT32(droppedShardCount);
}