/*
 * master_create_empty_shard creates an empty shard for the given distributed
 * table. For this, the function first gets a list of candidate nodes, connects
 * to these nodes, and issues DDL commands on the nodes to create empty shard
 * placements. The function then updates metadata on the master node to make
 * this shard (and its placements) visible.
 */
Datum
master_create_empty_shard(PG_FUNCTION_ARGS)
{
	text *relationNameText = PG_GETARG_TEXT_P(0);
	char *relationName = text_to_cstring(relationNameText);
	Datum shardIdDatum = 0;
	int64 shardId = INVALID_SHARD_ID;
	List *ddlEventList = NIL;
	uint32 attemptableNodeCount = 0;
	uint32 liveNodeCount = 0;

	uint32 candidateNodeCount = 0;
	List *candidateNodeList = NIL;
	text *nullMinValue = NULL;
	text *nullMaxValue = NULL;
	char partitionMethod = 0;
	char storageType = SHARD_STORAGE_TABLE;

	Oid relationId = ResolveRelationId(relationNameText);
	char *relationOwner = TableOwner(relationId);

	EnsureTablePermissions(relationId, ACL_INSERT);
	CheckDistributedTable(relationId);

	if (CStoreTable(relationId))
	{
		storageType = SHARD_STORAGE_COLUMNAR;
	}

	partitionMethod = PartitionMethod(relationId);
	if (partitionMethod == DISTRIBUTE_BY_HASH)
	{
		ereport(ERROR, (errmsg("relation \"%s\" is a hash partitioned table",
							   relationName),
						errdetail("We currently don't support creating shards "
								  "on hash-partitioned tables")));
	}

	/* generate new and unique shardId from sequence */
	shardIdDatum = master_get_new_shardid(NULL);
	shardId = DatumGetInt64(shardIdDatum);

	/* get table DDL commands to replay on the worker node */
	ddlEventList = GetTableDDLEvents(relationId);

	/* if enough live nodes, add an extra candidate node as backup */
	attemptableNodeCount = ShardReplicationFactor;
	liveNodeCount = WorkerGetLiveNodeCount();
	if (liveNodeCount > ShardReplicationFactor)
	{
		attemptableNodeCount = ShardReplicationFactor + 1;
	}

	/* first retrieve a list of random nodes for shard placements */
	while (candidateNodeCount < attemptableNodeCount)
	{
		WorkerNode *candidateNode = WorkerGetCandidateNode(candidateNodeList);
		if (candidateNode == NULL)
		{
			ereport(ERROR, (errmsg("could only find %u of %u possible nodes",
								   candidateNodeCount, attemptableNodeCount)));
		}

		candidateNodeList = lappend(candidateNodeList, candidateNode);
		candidateNodeCount++;
	}

	CreateShardPlacements(shardId, ddlEventList, relationOwner,
						  candidateNodeList, 0, ShardReplicationFactor);

	InsertShardRow(relationId, shardId, storageType, nullMinValue, nullMaxValue);

	PG_RETURN_INT64(shardId);
}
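
A minimal usage sketch for the example above, not part of the original: it invokes the UDF from other backend C code through PostgreSQL's fmgr interface. The wrapper name CreateEmptyShardFor and the relation name "events" are hypothetical placeholders.

#include "postgres.h"
#include "fmgr.h"
#include "utils/builtins.h"

/* assumed to be declared by the extension alongside its PG_FUNCTION_INFO_V1 entry */
extern Datum master_create_empty_shard(PG_FUNCTION_ARGS);

/* create one empty shard for the named distributed table and return its shard id */
static int64
CreateEmptyShardFor(const char *relationName)
{
	Datum relationNameDatum = CStringGetTextDatum(relationName);
	Datum shardIdDatum = DirectFunctionCall1(master_create_empty_shard,
											 relationNameDatum);

	return DatumGetInt64(shardIdDatum);
}

A caller would then write, for example, int64 shardId = CreateEmptyShardFor("events"); the UDF itself performs the permission checks and metadata updates shown above.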
Example 2
/*
 * master_create_empty_shard creates an empty shard for the given distributed
 * table. For this, the function first gets a list of candidate nodes, connects
 * to these nodes, and issues DDL commands on the nodes to create empty shard
 * placements. The function then updates metadata on the master node to make
 * this shard (and its placements) visible.
 */
Datum
master_create_empty_shard(PG_FUNCTION_ARGS)
{
	text *relationNameText = PG_GETARG_TEXT_P(0);
	char *relationName = text_to_cstring(relationNameText);
	List *workerNodeList = WorkerNodeList();
	Datum shardIdDatum = 0;
	int64 shardId = INVALID_SHARD_ID;
	List *ddlEventList = NIL;
	uint32 attemptableNodeCount = 0;
	uint32 liveNodeCount = 0;

	uint32 candidateNodeIndex = 0;
	List *candidateNodeList = NIL;
	text *nullMinValue = NULL;
	text *nullMaxValue = NULL;
	char partitionMethod = 0;
	char storageType = SHARD_STORAGE_TABLE;

	Oid relationId = ResolveRelationId(relationNameText);
	char relationKind = get_rel_relkind(relationId);
	char *relationOwner = TableOwner(relationId);

	EnsureTablePermissions(relationId, ACL_INSERT);
	CheckDistributedTable(relationId);

	/*
	 * We check whether the table is a foreign table. If it is, we set the
	 * storage type to foreign as well. The only exception is a foreign cstore
	 * table, in which case we set the storage type to columnar. In other
	 * words, when setting the storage type, columnar takes priority over
	 * foreign.
	 */
	if (relationKind == RELKIND_FOREIGN_TABLE)
	{
		bool cstoreTable = CStoreTable(relationId);
		if (cstoreTable)
		{
			storageType = SHARD_STORAGE_COLUMNAR;
		}
		else
		{
			storageType = SHARD_STORAGE_FOREIGN;
		}
	}

	partitionMethod = PartitionMethod(relationId);
	if (partitionMethod == DISTRIBUTE_BY_HASH)
	{
		ereport(ERROR, (errmsg("relation \"%s\" is a hash partitioned table",
							   relationName),
						errdetail("We currently don't support creating shards "
								  "on hash-partitioned tables")));
	}

	/* generate new and unique shardId from sequence */
	shardIdDatum = master_get_new_shardid(NULL);
	shardId = DatumGetInt64(shardIdDatum);

	/* get table DDL commands to replay on the worker node */
	ddlEventList = GetTableDDLEvents(relationId);

	/* if enough live nodes, add an extra candidate node as backup */
	attemptableNodeCount = ShardReplicationFactor;
	liveNodeCount = WorkerGetLiveNodeCount();
	if (liveNodeCount > ShardReplicationFactor)
	{
		attemptableNodeCount = ShardReplicationFactor + 1;
	}

	/* first retrieve a list of random nodes for shard placements */
	while (candidateNodeIndex < attemptableNodeCount)
	{
		WorkerNode *candidateNode = NULL;

		if (ShardPlacementPolicy == SHARD_PLACEMENT_LOCAL_NODE_FIRST)
		{
			candidateNode = WorkerGetLocalFirstCandidateNode(candidateNodeList);
		}
		else if (ShardPlacementPolicy == SHARD_PLACEMENT_ROUND_ROBIN)
		{
			candidateNode = WorkerGetRoundRobinCandidateNode(workerNodeList, shardId,
															 candidateNodeIndex);
		}
		else if (ShardPlacementPolicy == SHARD_PLACEMENT_RANDOM)
		{
			candidateNode = WorkerGetRandomCandidateNode(candidateNodeList);
		}
		else
		{
			ereport(ERROR, (errmsg("unrecognized shard placement policy")));
		}

		if (candidateNode == NULL)
		{
			ereport(ERROR, (errmsg("could only find %u of %u possible nodes",
								   candidateNodeIndex, attemptableNodeCount)));
		}

		candidateNodeList = lappend(candidateNodeList, candidateNode);
		candidateNodeIndex++;
	}

	CreateShardPlacements(relationId, shardId, ddlEventList, relationOwner,
						  candidateNodeList, 0, ShardReplicationFactor);

	InsertShardRow(relationId, shardId, storageType, nullMinValue, nullMaxValue);

	PG_RETURN_INT64(shardId);
}
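
The round-robin branch above delegates to WorkerGetRoundRobinCandidateNode(). The following is only a plausible sketch of that helper, assuming it offsets into the worker list by shard id and placement index; the actual implementation may differ, and the WorkerNode typedef is an incomplete stand-in for the extension's struct.

#include "postgres.h"
#include "nodes/pg_list.h"

/* only a pointer to the extension's worker-node struct is needed here */
typedef struct WorkerNode WorkerNode;

/* pick the worker at position (shardId + placementIndex) modulo the list length */
static WorkerNode *
WorkerGetRoundRobinCandidateNode(List *workerNodeList, uint64 shardId,
								 uint32 placementIndex)
{
	uint32 workerNodeCount = list_length(workerNodeList);
	WorkerNode *candidateNode = NULL;

	if (placementIndex < workerNodeCount)
	{
		uint32 candidateNodeIndex = (shardId + placementIndex) % workerNodeCount;
		candidateNode = (WorkerNode *) list_nth(workerNodeList, candidateNodeIndex);
	}

	return candidateNode;
}

Offsetting by the shard id keeps successive placements of the same shard on distinct workers while spreading different shards across the cluster.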
Example 3
/*
 * master_get_local_first_candidate_nodes returns a set of candidate host names
 * and port numbers on which to place new shards. The function makes sure to
 * always allocate the first candidate node as the node the caller is connecting
 * from; and allocates additional nodes until the shard replication factor is
 * met. The function errors if the caller's remote node name is not found in the
 * membership list, or if the number of available nodes falls short of the
 * replication factor.
 */
Datum
master_get_local_first_candidate_nodes(PG_FUNCTION_ARGS)
{
	FuncCallContext *functionContext = NULL;
	uint32 desiredNodeCount = 0;
	uint32 currentNodeCount = 0;

	if (SRF_IS_FIRSTCALL())
	{
		MemoryContext oldContext = NULL;
		TupleDesc tupleDescriptor = NULL;
		uint32 liveNodeCount = 0;
		bool hasOid = false;

		/* create a function context for cross-call persistence */
		functionContext = SRF_FIRSTCALL_INIT();

		/* switch to memory context appropriate for multiple function calls */
		oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);

		functionContext->user_fctx = NIL;
		functionContext->max_calls = ShardReplicationFactor;

		/* if enough live nodes, return an extra candidate node as backup */
		liveNodeCount = WorkerGetLiveNodeCount();
		if (liveNodeCount > ShardReplicationFactor)
		{
			functionContext->max_calls = ShardReplicationFactor + 1;
		}

		/*
		 * This tuple descriptor must match the output parameters declared for
		 * the function in pg_proc.
		 */
		tupleDescriptor = CreateTemplateTupleDesc(CANDIDATE_NODE_FIELDS, hasOid);
		TupleDescInitEntry(tupleDescriptor, (AttrNumber) 1, "node_name",
						   TEXTOID, -1, 0);
		TupleDescInitEntry(tupleDescriptor, (AttrNumber) 2, "node_port",
						   INT8OID, -1, 0);

		functionContext->tuple_desc = BlessTupleDesc(tupleDescriptor);

		MemoryContextSwitchTo(oldContext);
	}

	functionContext = SRF_PERCALL_SETUP();
	desiredNodeCount = functionContext->max_calls;
	currentNodeCount = functionContext->call_cntr;

	if (currentNodeCount < desiredNodeCount)
	{
		MemoryContext oldContext = NULL;
		List *currentNodeList = NIL;
		WorkerNode *candidateNode = NULL;
		Datum candidateDatum = 0;

		/* switch to memory context appropriate for multiple function calls */
		oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);
		currentNodeList = functionContext->user_fctx;

		candidateNode = WorkerGetLocalFirstCandidateNode(currentNodeList);
		if (candidateNode == NULL)
		{
			ereport(ERROR, (errmsg("could only find %u of %u required nodes",
								   currentNodeCount, desiredNodeCount)));
		}

		currentNodeList = lappend(currentNodeList, candidateNode);
		functionContext->user_fctx = currentNodeList;

		MemoryContextSwitchTo(oldContext);

		candidateDatum = WorkerNodeGetDatum(candidateNode, functionContext->tuple_desc);

		SRF_RETURN_NEXT(functionContext, candidateDatum);
	}
	else
	{
		SRF_RETURN_DONE(functionContext);
	}
}
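
The per-call branch hands each candidate to WorkerNodeGetDatum() before returning it. A hedged sketch of such a helper is shown below; it assumes the two-column descriptor built above (node_name as text, node_port as int8) and a minimal WorkerNode shape with workerName and workerPort fields, which is an assumption about the extension's struct rather than its actual definition.

#include "postgres.h"
#include "access/htup_details.h"
#include "funcapi.h"
#include "utils/builtins.h"

/* assumed to match the tuple descriptor above: node_name, node_port */
#define CANDIDATE_NODE_FIELDS 2

/* assumed minimal shape; the real struct also carries workerRack and more */
typedef struct WorkerNode
{
	char workerName[256];	/* WORKER_LENGTH in the real code */
	uint32 workerPort;
} WorkerNode;

/* pack a worker node's name and port into a composite datum for SRF_RETURN_NEXT */
static Datum
WorkerNodeGetDatum(WorkerNode *workerNode, TupleDesc tupleDescriptor)
{
	Datum values[CANDIDATE_NODE_FIELDS];
	bool isNulls[CANDIDATE_NODE_FIELDS];
	HeapTuple workerNodeTuple = NULL;

	memset(values, 0, sizeof(values));
	memset(isNulls, false, sizeof(isNulls));

	values[0] = CStringGetTextDatum(workerNode->workerName);
	values[1] = Int64GetDatum((int64) workerNode->workerPort);

	workerNodeTuple = heap_form_tuple(tupleDescriptor, values, isNulls);

	return HeapTupleGetDatum(workerNodeTuple);
}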
Example 4
/*
 * WorkerGetCandidateNode takes in a list of worker nodes, and then allocates a
 * new worker node. The allocation is performed according to the following
 * policy: if the list is empty, a random node is allocated; if the list has one
 * node (or an odd number of nodes), the new node is allocated on a different
 * rack than the first node; and if the list has two nodes (or an even number of
 * nodes), the new node is allocated on the same rack as the first node, but is
 * different from all the nodes in the list. This node allocation policy ensures
 * that shard locality is maintained within a rack, but no single rack failure
 * can result in data loss.
 *
 * Note that the function returns null if the worker membership list does not
 * contain enough nodes to allocate a new worker node.
 */
WorkerNode *
WorkerGetCandidateNode(List *currentNodeList)
{
	WorkerNode *workerNode = NULL;
	bool wantSameRack = false;
	uint32 tryCount = WORKER_RACK_TRIES;
	uint32 tryIndex = 0;

	/*
	 * We check if the shard has already been placed on all nodes known to us.
	 * This check is rather defensive, and has the drawback of performing a full
	 * scan over the worker node hash for determining the number of live nodes.
	 */
	uint32 currentNodeCount = list_length(currentNodeList);
	uint32 liveNodeCount = WorkerGetLiveNodeCount();
	if (currentNodeCount >= liveNodeCount)
	{
		return NULL;
	}

	/* if current node list is empty, randomly pick one node and return */
	if (currentNodeCount == 0)
	{
		workerNode = FindRandomNodeNotInList(WorkerNodesHash, NIL);
		return workerNode;
	}

	/*
	 * If the current list has an odd number of nodes (1, 3, 5, etc), we want to
	 * place the shard on a different rack than the first node's rack.
	 * Otherwise, we want to place the shard on the same rack as the first node.
	 */
	if (OddNumber(currentNodeCount))
	{
		wantSameRack = false;
	}
	else
	{
		wantSameRack = true;
	}

	/*
	 * We try to find a worker node that fits our rack-aware placement strategy.
	 * If after a predefined number of tries, we still cannot find such a node,
	 * we simply give up and return the last worker node we found.
	 */
	for (tryIndex = 0; tryIndex < tryCount; tryIndex++)
	{
		WorkerNode *firstNode = (WorkerNode *) linitial(currentNodeList);
		char *firstRack = firstNode->workerRack;
		char *workerRack = NULL;
		bool sameRack = false;

		workerNode = FindRandomNodeNotInList(WorkerNodesHash, currentNodeList);
		workerRack = workerNode->workerRack;

		sameRack = (strncmp(workerRack, firstRack, WORKER_LENGTH) == 0);
		if ((sameRack && wantSameRack) || (!sameRack && !wantSameRack))
		{
			break;
		}
	}

	return workerNode;
}
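
WorkerGetCandidateNode() relies on FindRandomNodeNotInList() for the actual random pick. The original helper is not shown in this example; the code below is only one way it could be written, assuming the hash stores WorkerNode entries and that membership in currentNodeList can be tested by pointer identity (the real helper presumably compares node name and port instead).

#include "postgres.h"
#include "nodes/pg_list.h"
#include "utils/hsearch.h"

/* only a pointer to the extension's worker-node struct is needed here */
typedef struct WorkerNode WorkerNode;

/*
 * Sketch: walk the worker node hash, collect entries that are not already in
 * currentNodeList, and return one of them uniformly at random (or NULL if the
 * list already covers every known node).
 */
static WorkerNode *
FindRandomNodeNotInList(HTAB *workerNodesHash, List *currentNodeList)
{
	HASH_SEQ_STATUS status;
	WorkerNode *workerNode = NULL;
	List *eligibleNodeList = NIL;

	hash_seq_init(&status, workerNodesHash);
	while ((workerNode = (WorkerNode *) hash_seq_search(&status)) != NULL)
	{
		if (!list_member_ptr(currentNodeList, workerNode))
		{
			eligibleNodeList = lappend(eligibleNodeList, workerNode);
		}
	}

	if (eligibleNodeList == NIL)
	{
		return NULL;
	}

	return (WorkerNode *) list_nth(eligibleNodeList,
								   random() % list_length(eligibleNodeList));
}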