/*
 * ShardLength finds shard placements for the given shardId, extracts the length
 * of a finalized shard, and returns the shard's length. This function errors
 * out if we cannot find any finalized shard placements for the given shardId.
 */
uint64
ShardLength(uint64 shardId)
{
	uint64 shardLength = 0;

	List *shardPlacementList = FinalizedShardPlacementList(shardId);
	if (shardPlacementList == NIL)
	{
		ereport(ERROR, (errmsg("could not find length of shard " UINT64_FORMAT, shardId),
						errdetail("Could not find any shard placements for the shard.")));
	}
	else
	{
		ShardPlacement *shardPlacement = (ShardPlacement *) linitial(shardPlacementList);
		shardLength = shardPlacement->shardLength;
	}

	return shardLength;
}
/*
 * master_append_table_to_shard appends the given table's contents to the given
 * shard, and updates shard metadata on the master node. If the function fails
 * to append table data to all shard placements, it doesn't update any metadata
 * and errors out. Else if the function fails to append table data to some of
 * the shard placements, it marks those placements as invalid. These invalid
 * placements will get cleaned up during shard rebalancing.
 */
Datum
master_append_table_to_shard(PG_FUNCTION_ARGS)
{
	uint64 shardId = PG_GETARG_INT64(0);
	text *sourceTableNameText = PG_GETARG_TEXT_P(1);
	text *sourceNodeNameText = PG_GETARG_TEXT_P(2);
	uint32 sourceNodePort = PG_GETARG_UINT32(3);
	char *sourceTableName = text_to_cstring(sourceTableNameText);
	char *sourceNodeName = text_to_cstring(sourceNodeNameText);

	char *shardName = NULL;
	List *shardPlacementList = NIL;
	List *succeededPlacementList = NIL;
	List *failedPlacementList = NIL;
	ListCell *shardPlacementCell = NULL;
	ListCell *failedPlacementCell = NULL;
	uint64 newShardSize = 0;
	uint64 shardMaxSizeInBytes = 0;
	float4 shardFillLevel = 0.0;
	char partitionMethod = 0;

	ShardInterval *shardInterval = LoadShardInterval(shardId);
	Oid relationId = shardInterval->relationId;
	bool cstoreTable = CStoreTable(relationId);

	char storageType = shardInterval->storageType;

	EnsureTablePermissions(relationId, ACL_INSERT);

	if (storageType != SHARD_STORAGE_TABLE && !cstoreTable)
	{
		ereport(ERROR, (errmsg("cannot append to shardId " UINT64_FORMAT, shardId),
						errdetail("The underlying shard is not a regular table")));
	}

	partitionMethod = PartitionMethod(relationId);
	if (partitionMethod == DISTRIBUTE_BY_HASH)
	{
		ereport(ERROR, (errmsg("cannot append to shardId " UINT64_FORMAT, shardId),
						errdetail("We currently don't support appending to shards "
								  "in hash-partitioned tables")));
	}

	/*
	 * We lock on the shardId, but do not unlock. When the function returns, and
	 * the transaction for this function commits, this lock will automatically
	 * be released. This ensures appends to a shard happen in a serial manner.
	 */
	LockShardResource(shardId, AccessExclusiveLock);

	/* if shard doesn't have an alias, extend regular table name */
	shardName = LoadShardAlias(relationId, shardId);
	if (shardName == NULL)
	{
		shardName = get_rel_name(relationId);
		AppendShardIdToName(&shardName, shardId);
	}

	shardPlacementList = FinalizedShardPlacementList(shardId);
	if (shardPlacementList == NIL)
	{
		ereport(ERROR, (errmsg("could not find any shard placements for shardId "
							   UINT64_FORMAT, shardId),
						errhint("Try running master_create_empty_shard() first")));
	}

	/* issue command to append table to each shard placement */
	foreach(shardPlacementCell, shardPlacementList)
	{
		ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(shardPlacementCell);
		char *workerName = shardPlacement->nodeName;
		uint32 workerPort = shardPlacement->nodePort;
		List *queryResultList = NIL;

		StringInfo workerAppendQuery = makeStringInfo();
		appendStringInfo(workerAppendQuery, WORKER_APPEND_TABLE_TO_SHARD,
						 quote_literal_cstr(shardName),
						 quote_literal_cstr(sourceTableName),
						 quote_literal_cstr(sourceNodeName), sourceNodePort);

		/* inserting data should be performed by the current user */
		queryResultList = ExecuteRemoteQuery(workerName, workerPort, NULL,
											 workerAppendQuery);
		if (queryResultList != NIL)
		{
			succeededPlacementList = lappend(succeededPlacementList, shardPlacement);
		}
		else
		{
			failedPlacementList = lappend(failedPlacementList, shardPlacement);
		}
	}
Example #3
0
/*
 * OpenTransactionsToAllShardPlacements opens connections to all placements
 * using the provided shard identifier list and returns it as a shard ID ->
 * ShardConnections hash. connectionFlags can be used to specify whether
 * the command is FOR_DML or FOR_DDL.
 */
HTAB *
OpenTransactionsToAllShardPlacements(List *shardIntervalList, int connectionFlags)
{
	HTAB *shardConnectionHash = NULL;
	ListCell *shardIntervalCell = NULL;
	List *newConnectionList = NIL;

	shardConnectionHash = CreateShardConnectionHash(CurrentMemoryContext);

	/* open connections to shards which don't have connections yet */
	foreach(shardIntervalCell, shardIntervalList)
	{
		ShardInterval *shardInterval = (ShardInterval *) lfirst(shardIntervalCell);
		uint64 shardId = shardInterval->shardId;
		ShardConnections *shardConnections = NULL;
		bool shardConnectionsFound = false;
		List *shardPlacementList = NIL;
		ListCell *placementCell = NULL;

		shardConnections = GetShardHashConnections(shardConnectionHash, shardId,
												   &shardConnectionsFound);
		if (shardConnectionsFound)
		{
			continue;
		}

		shardPlacementList = FinalizedShardPlacementList(shardId);
		if (shardPlacementList == NIL)
		{
			/* going to have to have some placements to do any work */
			ereport(ERROR, (errmsg("could not find any shard placements for the shard "
								   UINT64_FORMAT, shardId)));
		}

		foreach(placementCell, shardPlacementList)
		{
			ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(placementCell);
			MultiConnection *connection = NULL;

			WorkerNode *workerNode = FindWorkerNode(shardPlacement->nodeName,
													shardPlacement->nodePort);
			if (workerNode == NULL)
			{
				ereport(ERROR, (errmsg("could not find worker node %s:%d",
									   shardPlacement->nodeName,
									   shardPlacement->nodePort)));
			}

			connection = StartPlacementConnection(connectionFlags,
												  shardPlacement,
												  NULL);

			ClaimConnectionExclusively(connection);

			shardConnections->connectionList = lappend(shardConnections->connectionList,
													   connection);

			newConnectionList = lappend(newConnectionList, connection);

			/*
			 * Every individual failure should cause entire distributed
			 * transaction to fail.
			 */
			MarkRemoteTransactionCritical(connection);
		}
/*
 * OpenTransactionsForAllTasks opens a connection for each task,
 * taking into account which shards are read and modified by the task
 * to select the appopriate connection, or error out if no appropriate
 * connection can be found. The set of connections is returned as an
 * anchor shard ID -> ShardConnections hash.
 */
HTAB *
OpenTransactionsForAllTasks(List *taskList, int connectionFlags)
{
	HTAB *shardConnectionHash = NULL;
	ListCell *taskCell = NULL;
	List *newConnectionList = NIL;

	shardConnectionHash = CreateShardConnectionHash(CurrentMemoryContext);

	connectionFlags |= CONNECTION_PER_PLACEMENT;

	/* open connections to shards which don't have connections yet */
	foreach(taskCell, taskList)
	{
		Task *task = (Task *) lfirst(taskCell);
		ShardPlacementAccessType accessType = PLACEMENT_ACCESS_SELECT;
		uint64 shardId = task->anchorShardId;
		ShardConnections *shardConnections = NULL;
		bool shardConnectionsFound = false;
		List *shardPlacementList = NIL;
		ListCell *placementCell = NULL;

		shardConnections = GetShardHashConnections(shardConnectionHash, shardId,
												   &shardConnectionsFound);
		if (shardConnectionsFound)
		{
			continue;
		}

		shardPlacementList = FinalizedShardPlacementList(shardId);
		if (shardPlacementList == NIL)
		{
			/* going to have to have some placements to do any work */
			ereport(ERROR, (errmsg("could not find any shard placements for the shard "
								   UINT64_FORMAT, shardId)));
		}

		if (task->taskType == MODIFY_TASK)
		{
			accessType = PLACEMENT_ACCESS_DML;
		}
		else
		{
			/* can only open connections for DDL and DML commands */
			Assert(task->taskType == DDL_TASK || VACUUM_ANALYZE_TASK);

			accessType = PLACEMENT_ACCESS_DDL;
		}

		foreach(placementCell, shardPlacementList)
		{
			ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(placementCell);
			ShardPlacementAccess placementModification;
			List *placementAccessList = NIL;
			MultiConnection *connection = NULL;

			WorkerNode *workerNode = FindWorkerNode(shardPlacement->nodeName,
													shardPlacement->nodePort);
			if (workerNode == NULL)
			{
				ereport(ERROR, (errmsg("could not find worker node %s:%d",
									   shardPlacement->nodeName,
									   shardPlacement->nodePort)));
			}

			/* add placement access for modification */
			placementModification.placement = shardPlacement;
			placementModification.accessType = accessType;

			placementAccessList = lappend(placementAccessList, &placementModification);

			if (accessType == PLACEMENT_ACCESS_DDL)
			{
				List *placementDDLList = BuildPlacementDDLList(shardPlacement->groupId,
															   task->relationShardList);

				/*
				 * All relations appearing inter-shard DDL commands should be marked
				 * with DDL access.
				 */
				placementAccessList = list_concat(placementAccessList, placementDDLList);
			}
			else
			{
				List *placementSelectList =
					BuildPlacementSelectList(shardPlacement->groupId,
											 task->relationShardList);

				/* add additional placement accesses for subselects (e.g. INSERT .. SELECT) */
				placementAccessList =
					list_concat(placementAccessList, placementSelectList);
			}

			/*
			 * Find a connection that sees preceding writes and cannot self-deadlock,
			 * or error out if no such connection exists.
			 */
			connection = StartPlacementListConnection(connectionFlags,
													  placementAccessList, NULL);

			ClaimConnectionExclusively(connection);

			shardConnections->connectionList = lappend(shardConnections->connectionList,
													   connection);

			newConnectionList = lappend(newConnectionList, connection);

			/*
			 * Every individual failure should cause entire distributed
			 * transaction to fail.
			 */
			MarkRemoteTransactionCritical(connection);
		}