/*
 * table_ddl_command_array returns an array of strings, each of which is a DDL
 * command required to recreate a table (specified by OID).
 */
Datum
table_ddl_command_array(PG_FUNCTION_ARGS)
{
	Oid distributedTableId = PG_GETARG_OID(0);
	ArrayType *ddlCommandArrayType = NULL;
	List *ddlCommandList = TableDDLCommandList(distributedTableId);
	int ddlCommandCount = list_length(ddlCommandList);
	Datum *ddlCommandDatumArray = palloc0(ddlCommandCount * sizeof(Datum));

	ListCell *ddlCommandCell = NULL;
	int ddlCommandIndex = 0;
	Oid ddlCommandTypeId = TEXTOID;

	foreach(ddlCommandCell, ddlCommandList)
	{
		char *ddlCommand = (char *) lfirst(ddlCommandCell);
		Datum ddlCommandDatum = CStringGetTextDatum(ddlCommand);

		ddlCommandDatumArray[ddlCommandIndex] = ddlCommandDatum;
		ddlCommandIndex++;
	}
Exemple #2
0
/*
 * master_create_worker_shards creates empty shards for the given table based
 * on the specified number of initial shards. The function first gets a list of
 * candidate nodes and issues DDL commands on the nodes to create empty shard
 * placements on those nodes. The function then updates metadata on the master
 * node to make this shard (and its placements) visible. Note that the function
 * assumes the table is hash partitioned and calculates the min/max hash token
 * ranges for each shard, giving them an equal split of the hash space.
 */
Datum
master_create_worker_shards(PG_FUNCTION_ARGS)
{
	text *tableNameText = PG_GETARG_TEXT_P(0);
	int32 shardCount = PG_GETARG_INT32(1);
	int32 replicationFactor = PG_GETARG_INT32(2);

	Oid distributedTableId = ResolveRelationId(tableNameText);
	char relationKind = get_rel_relkind(distributedTableId);
	char *tableName = text_to_cstring(tableNameText);
	char shardStorageType = '\0';
	int32 shardIndex = 0;
	List *workerNodeList = NIL;
	List *ddlCommandList = NIL;
	int32 workerNodeCount = 0;
	uint32 placementAttemptCount = 0;
	uint32 hashTokenIncrement = 0;
	List *existingShardList = NIL;

	/* make sure table is hash partitioned */
	CheckHashPartitionedTable(distributedTableId);

	/* validate that shards haven't already been created for this table */
	existingShardList = LoadShardIntervalList(distributedTableId);
	if (existingShardList != NIL)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("table \"%s\" has already had shards created for it",
							   tableName)));
	}

	/* make sure that at least one shard is specified */
	if (shardCount <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("shardCount must be positive")));
	}

	/* make sure that at least one replica is specified */
	if (replicationFactor <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replicationFactor must be positive")));
	}

	/* calculate the split of the hash space */
	hashTokenIncrement = UINT_MAX / shardCount;

	/* load and sort the worker node list for deterministic placement */
	workerNodeList = ParseWorkerNodeFile(WORKER_LIST_FILENAME);
	workerNodeList = SortList(workerNodeList, CompareWorkerNodes);

	/* make sure we don't process cancel signals until all shards are created */
	HOLD_INTERRUPTS();

	/* retrieve the DDL commands for the table */
	ddlCommandList = TableDDLCommandList(distributedTableId);

	workerNodeCount = list_length(workerNodeList);
	if (replicationFactor > workerNodeCount)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replicationFactor (%d) exceeds number of worker nodes "
							   "(%d)", replicationFactor, workerNodeCount),
						errhint("Add more worker nodes or try again with a lower "
								"replication factor.")));
	}

	/* if we have enough nodes, add an extra placement attempt for backup */
	placementAttemptCount = (uint32) replicationFactor;
	if (workerNodeCount > replicationFactor)
	{
		placementAttemptCount++;
	}

	/* set shard storage type according to relation type */
	if (relationKind == RELKIND_FOREIGN_TABLE)
	{
		shardStorageType = SHARD_STORAGE_FOREIGN;
	}
	else
	{
		shardStorageType = SHARD_STORAGE_TABLE;
	}

	for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
	{
		uint64 shardId = NextSequenceId(SHARD_ID_SEQUENCE_NAME);
		int32 placementCount = 0;
		uint32 placementIndex = 0;
		uint32 roundRobinNodeIndex = shardIndex % workerNodeCount;

		List *extendedDDLCommands = ExtendedDDLCommandList(distributedTableId, shardId,
														   ddlCommandList);

		/* initialize the hash token space for this shard */
		text *minHashTokenText = NULL;
		text *maxHashTokenText = NULL;
		int32 shardMinHashToken = INT_MIN + (shardIndex * hashTokenIncrement);
		int32 shardMaxHashToken = shardMinHashToken + hashTokenIncrement - 1;

		/* if we are at the last shard, make sure the max token value is INT_MAX */
		if (shardIndex == (shardCount - 1))
		{
			shardMaxHashToken = INT_MAX;
		}

		for (placementIndex = 0; placementIndex < placementAttemptCount; placementIndex++)
		{
			int32 candidateNodeIndex =
				(roundRobinNodeIndex + placementIndex) % workerNodeCount;
			WorkerNode *candidateNode = (WorkerNode *) list_nth(workerNodeList,
																candidateNodeIndex);
			char *nodeName = candidateNode->nodeName;
			uint32 nodePort = candidateNode->nodePort;

			bool created = ExecuteRemoteCommandList(nodeName, nodePort,
													extendedDDLCommands);
			if (created)
			{
				uint64 shardPlacementId = 0;
				ShardState shardState = STATE_FINALIZED;


				shardPlacementId = NextSequenceId(SHARD_PLACEMENT_ID_SEQUENCE_NAME);
				InsertShardPlacementRow(shardPlacementId, shardId, shardState,
										nodeName, nodePort);
				placementCount++;
			}
			else
			{
				ereport(WARNING, (errmsg("could not create shard on \"%s:%u\"",
										 nodeName, nodePort)));
			}

			if (placementCount >= replicationFactor)
			{
				break;
			}
		}

		/* check if we created enough shard replicas */
		if (placementCount < replicationFactor)
		{
			ereport(ERROR, (errmsg("could not satisfy specified replication factor"),
							errdetail("Created %d shard replicas, less than the "
									  "requested replication factor of %d.",
									  placementCount, replicationFactor)));
		}

		/* insert the shard metadata row along with its min/max values */
		minHashTokenText = IntegerToText(shardMinHashToken);
		maxHashTokenText = IntegerToText(shardMaxHashToken);
		InsertShardRow(distributedTableId, shardId, shardStorageType,
					   minHashTokenText, maxHashTokenText);
	}

	if (QueryCancelPending)
	{
		ereport(WARNING, (errmsg("cancel requests are ignored during shard creation")));
		QueryCancelPending = false;
	}

	RESUME_INTERRUPTS();

	PG_RETURN_VOID();
}
/*
 * FetchRegularTable fetches the given table's data using the copy out command.
 * The function then fetches the DDL commands necessary to create this table's
 * replica, and locally applies these DDL commands. Last, the function copies
 * the fetched table data into the created table; and on success, returns true.
 * On failure due to connectivity issues with remote node, the function returns
 * false. On other types of failures, the function errors out.
 */
static bool
FetchRegularTable(const char *nodeName, uint32 nodePort, const char *tableName)
{
	StringInfo localFilePath = NULL;
	StringInfo remoteCopyCommand = NULL;
	List *ddlCommandList = NIL;
	ListCell *ddlCommandCell = NULL;
	CopyStmt *localCopyCommand = NULL;
	RangeVar *localTable = NULL;
	uint64 shardId = 0;
	bool received = false;
	StringInfo queryString = NULL;
	const char *tableOwner = NULL;
	Oid tableOwnerId = InvalidOid;
	Oid savedUserId = InvalidOid;
	int savedSecurityContext = 0;
	List *tableNameList = NIL;

	/* copy remote table's data to this node in an idempotent manner */
	shardId = ExtractShardId(tableName);
	localFilePath = makeStringInfo();
	appendStringInfo(localFilePath, "base/%s/%s" UINT64_FORMAT,
					 PG_JOB_CACHE_DIR, TABLE_FILE_PREFIX, shardId);

	remoteCopyCommand = makeStringInfo();
	appendStringInfo(remoteCopyCommand, COPY_OUT_COMMAND, tableName);

	received = ReceiveRegularFile(nodeName, nodePort, remoteCopyCommand, localFilePath);
	if (!received)
	{
		return false;
	}

	/* fetch the ddl commands needed to create the table */
	tableOwner = RemoteTableOwner(nodeName, nodePort, tableName);
	if (tableOwner == NULL)
	{
		return false;
	}
	tableOwnerId = get_role_oid(tableOwner, false);

	/* fetch the ddl commands needed to create the table */
	ddlCommandList = TableDDLCommandList(nodeName, nodePort, tableName);
	if (ddlCommandList == NIL)
	{
		return false;
	}

	/*
	 * Apply DDL commands against the database. Note that on failure from here
	 * on, we immediately error out instead of returning false.  Have to do
	 * this as the table's owner to ensure the local table is created with
	 * compatible permissions.
	 */
	GetUserIdAndSecContext(&savedUserId, &savedSecurityContext);
	SetUserIdAndSecContext(tableOwnerId, SECURITY_LOCAL_USERID_CHANGE);

	foreach(ddlCommandCell, ddlCommandList)
	{
		StringInfo ddlCommand = (StringInfo) lfirst(ddlCommandCell);
		Node *ddlCommandNode = ParseTreeNode(ddlCommand->data);

		ProcessUtility(ddlCommandNode, ddlCommand->data, PROCESS_UTILITY_TOPLEVEL,
					   NULL, None_Receiver, NULL);
		CommandCounterIncrement();
	}