/*
 * master_create_empty_shard creates an empty shard for the given distributed
 * table. For this, the function first gets a list of candidate nodes, connects
 * to these nodes, and issues DDL commands on the nodes to create empty shard
 * placements. The function then updates metadata on the master node to make
 * this shard (and its placements) visible.
 */
Datum
master_create_empty_shard(PG_FUNCTION_ARGS)
{
	text *relationNameText = PG_GETARG_TEXT_P(0);
	char *relationName = text_to_cstring(relationNameText);
	List *workerNodeList = WorkerNodeList();
	Datum shardIdDatum = 0;
	int64 shardId = INVALID_SHARD_ID;
	List *ddlEventList = NULL;
	uint32 attemptableNodeCount = 0;
	uint32 liveNodeCount = 0;
	uint32 candidateNodeIndex = 0;
	List *candidateNodeList = NIL;
	text *nullMinValue = NULL;
	text *nullMaxValue = NULL;
	char partitionMethod = 0;
	char storageType = SHARD_STORAGE_TABLE;

	Oid relationId = ResolveRelationId(relationNameText);
	char relationKind = get_rel_relkind(relationId);
	char *relationOwner = TableOwner(relationId);

	EnsureTablePermissions(relationId, ACL_INSERT);
	CheckDistributedTable(relationId);

	/*
	 * If the table is a foreign table, we set the storage type to foreign as
	 * well. The only exception is a foreign cstore table, for which we set
	 * the storage type to columnar.
	 *
	 * That is, when setting the storage type, columnar takes priority over
	 * foreign.
	 */
	if (relationKind == RELKIND_FOREIGN_TABLE)
	{
		bool cstoreTable = CStoreTable(relationId);
		if (cstoreTable)
		{
			storageType = SHARD_STORAGE_COLUMNAR;
		}
		else
		{
			storageType = SHARD_STORAGE_FOREIGN;
		}
	}

	partitionMethod = PartitionMethod(relationId);
	if (partitionMethod == DISTRIBUTE_BY_HASH)
	{
		ereport(ERROR, (errmsg("relation \"%s\" is a hash partitioned table",
							   relationName),
						errdetail("We currently don't support creating shards "
								  "on hash-partitioned tables")));
	}

	/* generate new and unique shardId from sequence */
	shardIdDatum = master_get_new_shardid(NULL);
	shardId = DatumGetInt64(shardIdDatum);

	/* get table DDL commands to replay on the worker node */
	ddlEventList = GetTableDDLEvents(relationId);

	/* if enough live nodes, add an extra candidate node as backup */
	attemptableNodeCount = ShardReplicationFactor;
	liveNodeCount = WorkerGetLiveNodeCount();
	if (liveNodeCount > ShardReplicationFactor)
	{
		attemptableNodeCount = ShardReplicationFactor + 1;
	}

	/* first retrieve a list of random nodes for shard placements */
	while (candidateNodeIndex < attemptableNodeCount)
	{
		WorkerNode *candidateNode = NULL;

		if (ShardPlacementPolicy == SHARD_PLACEMENT_LOCAL_NODE_FIRST)
		{
			candidateNode = WorkerGetLocalFirstCandidateNode(candidateNodeList);
		}
		else if (ShardPlacementPolicy == SHARD_PLACEMENT_ROUND_ROBIN)
		{
			candidateNode = WorkerGetRoundRobinCandidateNode(workerNodeList, shardId,
															 candidateNodeIndex);
		}
		else if (ShardPlacementPolicy == SHARD_PLACEMENT_RANDOM)
		{
			candidateNode = WorkerGetRandomCandidateNode(candidateNodeList);
		}
		else
		{
			ereport(ERROR, (errmsg("unrecognized shard placement policy")));
		}

		if (candidateNode == NULL)
		{
			ereport(ERROR, (errmsg("could only find %u of %u possible nodes",
								   candidateNodeIndex, attemptableNodeCount)));
		}

		candidateNodeList = lappend(candidateNodeList, candidateNode);
		candidateNodeIndex++;
	}

	CreateShardPlacements(relationId, shardId, ddlEventList, relationOwner,
						  candidateNodeList, 0, ShardReplicationFactor);

	InsertShardRow(relationId, shardId, storageType, nullMinValue, nullMaxValue);

	PG_RETURN_INT64(shardId);
}
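/*
 * Hedged usage sketch, not part of the original file: besides being called
 * from SQL, master_create_empty_shard can be invoked from other backend code
 * through PostgreSQL's fmgr direct-call interface. The wrapper name
 * CreateEmptyShardFor is hypothetical; DirectFunctionCall1 (fmgr.h) and
 * CStringGetTextDatum (builtins.h) are standard helpers. Any error raised
 * inside the UDF propagates to the caller via ereport as usual.
 */
static int64
CreateEmptyShardFor(const char *relationName)
{
	Datum relationNameDatum = CStringGetTextDatum(relationName);

	/* call the UDF above directly, passing the relation name as text */
	Datum shardIdDatum = DirectFunctionCall1(master_create_empty_shard,
											 relationNameDatum);

	return DatumGetInt64(shardIdDatum);
}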
/*
 * master_get_local_first_candidate_nodes returns a set of candidate host names
 * and port numbers on which to place new shards. The function makes sure to
 * always allocate the first candidate node as the node the caller is connecting
 * from; and allocates additional nodes until the shard replication factor is
 * met. The function errors if the caller's remote node name is not found in the
 * membership list, or if the number of available nodes falls short of the
 * replication factor.
 */
Datum
master_get_local_first_candidate_nodes(PG_FUNCTION_ARGS)
{
	FuncCallContext *functionContext = NULL;
	uint32 desiredNodeCount = 0;
	uint32 currentNodeCount = 0;

	if (SRF_IS_FIRSTCALL())
	{
		MemoryContext oldContext = NULL;
		TupleDesc tupleDescriptor = NULL;
		uint32 liveNodeCount = 0;
		bool hasOid = false;

		/* create a function context for cross-call persistence */
		functionContext = SRF_FIRSTCALL_INIT();

		/* switch to memory context appropriate for multiple function calls */
		oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);

		functionContext->user_fctx = NIL;
		functionContext->max_calls = ShardReplicationFactor;

		/* if enough live nodes, return an extra candidate node as backup */
		liveNodeCount = WorkerGetLiveNodeCount();
		if (liveNodeCount > ShardReplicationFactor)
		{
			functionContext->max_calls = ShardReplicationFactor + 1;
		}

		/*
		 * This tuple descriptor must match the output parameters declared for
		 * the function in pg_proc.
		 */
		tupleDescriptor = CreateTemplateTupleDesc(CANDIDATE_NODE_FIELDS, hasOid);
		TupleDescInitEntry(tupleDescriptor, (AttrNumber) 1, "node_name",
						   TEXTOID, -1, 0);
		TupleDescInitEntry(tupleDescriptor, (AttrNumber) 2, "node_port",
						   INT8OID, -1, 0);

		functionContext->tuple_desc = BlessTupleDesc(tupleDescriptor);

		MemoryContextSwitchTo(oldContext);
	}

	functionContext = SRF_PERCALL_SETUP();
	desiredNodeCount = functionContext->max_calls;
	currentNodeCount = functionContext->call_cntr;

	if (currentNodeCount < desiredNodeCount)
	{
		MemoryContext oldContext = NULL;
		List *currentNodeList = NIL;
		WorkerNode *candidateNode = NULL;
		Datum candidateDatum = 0;

		/* switch to memory context appropriate for multiple function calls */
		oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);

		currentNodeList = functionContext->user_fctx;
		candidateNode = WorkerGetLocalFirstCandidateNode(currentNodeList);

		if (candidateNode == NULL)
		{
			ereport(ERROR, (errmsg("could only find %u of %u required nodes",
								   currentNodeCount, desiredNodeCount)));
		}

		currentNodeList = lappend(currentNodeList, candidateNode);
		functionContext->user_fctx = currentNodeList;

		MemoryContextSwitchTo(oldContext);

		candidateDatum = WorkerNodeGetDatum(candidateNode, functionContext->tuple_desc);

		SRF_RETURN_NEXT(functionContext, candidateDatum);
	}
	else
	{
		SRF_RETURN_DONE(functionContext);
	}
}
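/*
 * Minimal sketch of the set-returning-function (SRF) protocol used above,
 * stripped of the worker-node logic; it is not part of the original file.
 * The function name srf_count_to is hypothetical and would need a matching
 * CREATE FUNCTION ... RETURNS SETOF integer declaration. It returns the
 * integers 1..n, one per call, following the same first-call / per-call
 * structure as master_get_local_first_candidate_nodes.
 */
#include "postgres.h"
#include "funcapi.h"

PG_FUNCTION_INFO_V1(srf_count_to);

Datum
srf_count_to(PG_FUNCTION_ARGS)
{
	FuncCallContext *functionContext = NULL;

	if (SRF_IS_FIRSTCALL())
	{
		int32 callCount = PG_GETARG_INT32(0);

		/* cross-call state (here just max_calls) is set up exactly once */
		functionContext = SRF_FIRSTCALL_INIT();
		functionContext->max_calls = (callCount > 0) ? callCount : 0;
	}

	/* restore the persistent state on every call */
	functionContext = SRF_PERCALL_SETUP();

	if (functionContext->call_cntr < functionContext->max_calls)
	{
		/* call_cntr is zero-based, so the next value to emit is call_cntr + 1 */
		SRF_RETURN_NEXT(functionContext,
						Int32GetDatum((int32) (functionContext->call_cntr + 1)));
	}
	else
	{
		SRF_RETURN_DONE(functionContext);
	}
}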
/*
 * WorkerGetCandidateNode takes in a list of worker nodes, and then allocates a
 * new worker node. The allocation is performed according to the following
 * policy: if the list is empty, a random node is allocated; if the list has one
 * node (or an odd number of nodes), the new node is allocated on a different
 * rack than the first node; and if the list has two nodes (or an even number of
 * nodes), the new node is allocated on the same rack as the first node, but is
 * different from all the nodes in the list. This node allocation policy ensures
 * that shard locality is maintained within a rack, but no single rack failure
 * can result in data loss.
 *
 * Note that the function returns null if the worker membership list does not
 * contain enough nodes to allocate a new worker node.
 */
WorkerNode *
WorkerGetCandidateNode(List *currentNodeList)
{
	WorkerNode *workerNode = NULL;
	bool wantSameRack = false;
	uint32 tryCount = WORKER_RACK_TRIES;
	uint32 tryIndex = 0;

	/*
	 * We check if the shard has already been placed on all nodes known to us.
	 * This check is rather defensive, and has the drawback of performing a full
	 * scan over the worker node hash for determining the number of live nodes.
	 */
	uint32 currentNodeCount = list_length(currentNodeList);
	uint32 liveNodeCount = WorkerGetLiveNodeCount();
	if (currentNodeCount >= liveNodeCount)
	{
		return NULL;
	}

	/* if current node list is empty, randomly pick one node and return */
	if (currentNodeCount == 0)
	{
		workerNode = FindRandomNodeNotInList(WorkerNodesHash, NIL);
		return workerNode;
	}

	/*
	 * If the current list has an odd number of nodes (1, 3, 5, etc), we want to
	 * place the shard on a different rack than the first node's rack.
	 * Otherwise, we want to place the shard on the same rack as the first node.
	 */
	if (OddNumber(currentNodeCount))
	{
		wantSameRack = false;
	}
	else
	{
		wantSameRack = true;
	}

	/*
	 * We try to find a worker node that fits our rack-aware placement strategy.
	 * If, after a predefined number of tries, we still cannot find such a node,
	 * we simply give up and return the last worker node we found.
	 */
	for (tryIndex = 0; tryIndex < tryCount; tryIndex++)
	{
		WorkerNode *firstNode = (WorkerNode *) linitial(currentNodeList);
		char *firstRack = firstNode->workerRack;
		char *workerRack = NULL;
		bool sameRack = false;

		workerNode = FindRandomNodeNotInList(WorkerNodesHash, currentNodeList);
		workerRack = workerNode->workerRack;

		sameRack = (strncmp(workerRack, firstRack, WORKER_LENGTH) == 0);
		if ((sameRack && wantSameRack) || (!sameRack && !wantSameRack))
		{
			break;
		}
	}

	return workerNode;
}
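/*
 * Illustration only, not from the original file: the enum and helper below
 * (both hypothetical) spell out the rack alternation that
 * WorkerGetCandidateNode derives from OddNumber(currentNodeCount), assuming
 * OddNumber tests (number % 2 == 1). For replication factor three, the
 * placements land on any rack, then a different rack, then the first node's
 * rack again: two copies share a rack for locality while one sits elsewhere,
 * so no single rack failure can take out every placement.
 */
typedef enum DesiredRackPlacement
{
	RACK_PLACEMENT_ANY,       /* empty list: any rack is acceptable */
	RACK_PLACEMENT_DIFFERENT, /* odd list length: avoid the first node's rack */
	RACK_PLACEMENT_SAME       /* even list length: prefer the first node's rack */
} DesiredRackPlacement;

static DesiredRackPlacement
DesiredRackForNodeCount(uint32 currentNodeCount)
{
	if (currentNodeCount == 0)
	{
		return RACK_PLACEMENT_ANY;
	}
	else if ((currentNodeCount % 2) == 1)
	{
		return RACK_PLACEMENT_DIFFERENT;
	}
	else
	{
		return RACK_PLACEMENT_SAME;
	}
}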