Example #1
/*
 * RecoverPreparedTransactions recovers any pending prepared
 * transactions started by this node on other nodes.
 */
static int
RecoverPreparedTransactions(void)
{
	List *workerList = NIL;
	ListCell *workerNodeCell = NULL;
	int recoveredTransactionCount = 0;

	/*
	 * We block here if metadata transactions are ongoing, since we
	 * mustn't commit/abort their prepared transactions under their
	 * feet. We also prevent concurrent recovery.
	 */
	LockRelationOid(DistTransactionRelationId(), ExclusiveLock);

	workerList = WorkerNodeList();

	foreach(workerNodeCell, workerList)
	{
		WorkerNode *workerNode = (WorkerNode *) lfirst(workerNodeCell);

		recoveredTransactionCount += RecoverWorkerTransactions(workerNode);
	}

	return recoveredTransactionCount;
}
Example #2
/*
 * master_get_active_worker_nodes returns a set of active worker host names and
 * port numbers in deterministic order. Currently we assume that all worker
 * nodes in pg_worker_list.conf are active.
 */
Datum
master_get_active_worker_nodes(PG_FUNCTION_ARGS)
{
	FuncCallContext *functionContext = NULL;
	uint32 workerNodeIndex = 0;
	uint32 workerNodeCount = 0;

	if (SRF_IS_FIRSTCALL())
	{
		MemoryContext oldContext = NULL;
		List *workerNodeList = NIL;
		uint32 workerNodeCount = 0;
		TupleDesc tupleDescriptor = NULL;
		bool hasOid = false;

		/* create a function context for cross-call persistence */
		functionContext = SRF_FIRSTCALL_INIT();

		/* switch to memory context appropriate for multiple function calls */
		oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);

		workerNodeList = WorkerNodeList();
		workerNodeCount = (uint32) list_length(workerNodeList);

		functionContext->user_fctx = workerNodeList;
		functionContext->max_calls = workerNodeCount;

		/*
		 * This tuple descriptor must match the output parameters declared for
		 * the function in pg_proc.
		 */
		tupleDescriptor = CreateTemplateTupleDesc(WORKER_NODE_FIELDS, hasOid);
		TupleDescInitEntry(tupleDescriptor, (AttrNumber) 1, "node_name",
						   TEXTOID, -1, 0);
		TupleDescInitEntry(tupleDescriptor, (AttrNumber) 2, "node_port",
						   INT8OID, -1, 0);

		functionContext->tuple_desc = BlessTupleDesc(tupleDescriptor);

		MemoryContextSwitchTo(oldContext);
	}

	functionContext = SRF_PERCALL_SETUP();
	workerNodeIndex = functionContext->call_cntr;
	workerNodeCount = functionContext->max_calls;

	if (workerNodeIndex < workerNodeCount)
	{
		List *workerNodeList = functionContext->user_fctx;
		WorkerNode *workerNode = list_nth(workerNodeList, workerNodeIndex);

		Datum workerNodeDatum = WorkerNodeGetDatum(workerNode,
												   functionContext->tuple_desc);

		SRF_RETURN_NEXT(functionContext, workerNodeDatum);
	}
	else
	{
		SRF_RETURN_DONE(functionContext);
	}
}
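Example #2 converts each worker node into a composite datum through WorkerNodeGetDatum, whose body is not shown above. A minimal sketch of such a helper, assuming the WorkerNode struct exposes workerName and workerPort fields matching the tuple descriptor, might look like this:

static Datum
WorkerNodeGetDatum(WorkerNode *workerNode, TupleDesc tupleDescriptor)
{
	Datum values[WORKER_NODE_FIELDS];
	bool isNulls[WORKER_NODE_FIELDS];
	HeapTuple workerNodeTuple = NULL;

	memset(values, 0, sizeof(values));
	memset(isNulls, false, sizeof(isNulls));

	/* the order and types must match the tuple descriptor built above */
	values[0] = CStringGetTextDatum(workerNode->workerName);
	values[1] = Int64GetDatum((int64) workerNode->workerPort);

	workerNodeTuple = heap_form_tuple(tupleDescriptor, values, isNulls);

	return HeapTupleGetDatum(workerNodeTuple);
}

Usage mirrors the call sites above: the blessed tuple descriptor built during the first call is reused for every returned row.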
Example #3
/*
 * master_get_round_robin_candidate_nodes returns a set of candidate host names
 * and port numbers on which to place new shards. The function uses the round
 * robin policy to choose the nodes and tries to ensure that there is an even
 * distribution of shards across the worker nodes. This function errors out if
 * the number of available nodes falls short of the replication factor.
 */
Datum
master_get_round_robin_candidate_nodes(PG_FUNCTION_ARGS)
{
	uint64 shardId = PG_GETARG_INT64(0);
	FuncCallContext *functionContext = NULL;
	uint32 desiredNodeCount = 0;
	uint32 currentNodeCount = 0;

	if (SRF_IS_FIRSTCALL())
	{
		MemoryContext oldContext = NULL;
		TupleDesc tupleDescriptor = NULL;
		List *workerNodeList = NIL;
		TypeFuncClass resultTypeClass = 0;
		uint32 workerNodeCount = 0;

		/* create a function context for cross-call persistence */
		functionContext = SRF_FIRSTCALL_INIT();

		/* switch to memory context appropriate for multiple function calls */
		oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);

		/* get the worker node list and sort it for determinism */
		workerNodeList = WorkerNodeList();
		workerNodeList = SortList(workerNodeList, CompareWorkerNodes);

		functionContext->user_fctx = workerNodeList;
		functionContext->max_calls = ShardReplicationFactor;

		/* if we have enough live nodes, return an extra candidate node as backup */
		workerNodeCount = (uint32) list_length(workerNodeList);
		if (workerNodeCount > ShardReplicationFactor)
		{
			functionContext->max_calls = ShardReplicationFactor + 1;
		}

		/* create tuple descriptor for return value */
		resultTypeClass = get_call_result_type(fcinfo, NULL, &tupleDescriptor);
		if (resultTypeClass != TYPEFUNC_COMPOSITE)
		{
			ereport(ERROR, (errmsg("return type must be a row type")));
		}

		functionContext->tuple_desc = tupleDescriptor;

		MemoryContextSwitchTo(oldContext);
	}

	functionContext = SRF_PERCALL_SETUP();
	desiredNodeCount = functionContext->max_calls;
	currentNodeCount = functionContext->call_cntr;

	if (currentNodeCount < desiredNodeCount)
	{
		List *workerNodeList = functionContext->user_fctx;
		WorkerNode *candidateNode = NULL;
		Datum candidateDatum = 0;

		candidateNode = WorkerGetRoundRobinCandidateNode(workerNodeList, shardId,
														 currentNodeCount);
		if (candidateNode == NULL)
		{
			ereport(ERROR, (errmsg("could only find %u of %u required nodes",
								   currentNodeCount, desiredNodeCount)));
		}

		candidateDatum = WorkerNodeGetDatum(candidateNode, functionContext->tuple_desc);

		SRF_RETURN_NEXT(functionContext, candidateDatum);
	}
	else
	{
		SRF_RETURN_DONE(functionContext);
	}
}
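The round-robin policy itself is not shown in full here; the idea is to rotate the starting worker by the shard identifier so consecutive shards begin their placements on different nodes. A sketch of WorkerGetRoundRobinCandidateNode along those lines (an illustration, not necessarily the exact implementation):

WorkerNode *
WorkerGetRoundRobinCandidateNode(List *workerNodeList, uint64 shardId,
								 uint32 placementIndex)
{
	uint32 workerNodeCount = (uint32) list_length(workerNodeList);
	WorkerNode *candidateNode = NULL;

	if (placementIndex < workerNodeCount)
	{
		/* rotate the starting node by shardId to spread shards evenly */
		uint32 candidateNodeIndex =
			(uint32) ((shardId + placementIndex) % workerNodeCount);

		candidateNode = (WorkerNode *) list_nth(workerNodeList,
												(int) candidateNodeIndex);
	}

	return candidateNode;
}

Returning NULL once placementIndex reaches the node count is what lets the callers in Examples #3 and #5 report "could only find %u of %u" nodes when fewer live nodes exist than requested placements.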
Example #4
/*
 * JobExecutorType selects the executor type for the given multiPlan using the task
 * executor type config value. The function then checks if the given multiPlan needs
 * more resources than those provided to it by other config values, and issues
 * warnings accordingly. If the selected executor type cannot execute the given
 * multiPlan, the function errors out.
 */
MultiExecutorType
JobExecutorType(MultiPlan *multiPlan)
{
	Job *job = multiPlan->workerJob;
	List *workerTaskList = job->taskList;
	List *workerNodeList = WorkerNodeList();
	int taskCount = list_length(workerTaskList);
	int workerNodeCount = list_length(workerNodeList);
	double tasksPerNode = taskCount / ((double) workerNodeCount);
	int dependedJobCount = list_length(job->dependedJobList);
	MultiExecutorType executorType = TaskExecutorType;
	bool routerExecutablePlan = multiPlan->routerExecutable;

	/* check whether we can switch to the router executor */
	if (routerExecutablePlan)
	{
		ereport(DEBUG2, (errmsg("Plan is router executable")));
		return MULTI_EXECUTOR_ROUTER;
	}

	if (executorType == MULTI_EXECUTOR_REAL_TIME)
	{
		double reasonableConnectionCount = 0;

		/* if we need to open too many connections per worker, warn the user */
		if (tasksPerNode >= MaxConnections)
		{
			ereport(WARNING, (errmsg("this query uses more connections than the "
									 "configured max_connections limit"),
							  errhint("Consider increasing max_connections or setting "
									  "citus.task_executor_type to "
									  "\"task-tracker\".")));
		}

		/*
		 * If we need to open too many outgoing connections, warn the user.
		 * The real-time executor caps the number of tasks it starts by the same limit,
		 * but we still issue this warning because it degrades performance.
		 */
		reasonableConnectionCount = MaxMasterConnectionCount();
		if (taskCount >= reasonableConnectionCount)
		{
			ereport(WARNING, (errmsg("this query uses more file descriptors than the "
									 "configured max_files_per_process limit"),
							  errhint("Consider increasing max_files_per_process or "
									  "setting citus.task_executor_type to "
									  "\"task-tracker\".")));
		}

		/* if we have repartition jobs with real time executor, error out */
		if (dependedJobCount > 0)
		{
			ereport(ERROR, (errmsg("cannot use real time executor with repartition jobs"),
							errhint("Set citus.task_executor_type to "
									"\"task-tracker\".")));
		}
	}
	else
	{
		/* if we have more tasks per node than what can be tracked, warn the user */
		if (tasksPerNode >= MaxTrackedTasksPerNode)
		{
			ereport(WARNING, (errmsg("this query assigns more tasks per node than the "
									 "configured max_tracked_tasks_per_node limit")));
		}
	}

	return executorType;
}
Example #5
/*
 * master_create_empty_shard creates an empty shard for the given distributed
 * table. For this, the function first gets a list of candidate nodes, connects
 * to these nodes, and issues DDL commands on the nodes to create empty shard
 * placements. The function then updates metadata on the master node to make
 * this shard (and its placements) visible.
 */
Datum
master_create_empty_shard(PG_FUNCTION_ARGS)
{
	text *relationNameText = PG_GETARG_TEXT_P(0);
	char *relationName = text_to_cstring(relationNameText);
	List *workerNodeList = WorkerNodeList();
	Datum shardIdDatum = 0;
	int64 shardId = INVALID_SHARD_ID;
	List *ddlEventList = NIL;
	uint32 attemptableNodeCount = 0;
	uint32 liveNodeCount = 0;

	uint32 candidateNodeIndex = 0;
	List *candidateNodeList = NIL;
	text *nullMinValue = NULL;
	text *nullMaxValue = NULL;
	char partitionMethod = 0;
	char storageType = SHARD_STORAGE_TABLE;

	Oid relationId = ResolveRelationId(relationNameText);
	char relationKind = get_rel_relkind(relationId);
	char *relationOwner = TableOwner(relationId);

	EnsureTablePermissions(relationId, ACL_INSERT);
	CheckDistributedTable(relationId);

	/*
	 * If the relation is a foreign table, we also set the shard storage type
	 * to foreign. The only exception is a cstore foreign table, for which we
	 * use the columnar storage type instead. In other words, when setting the
	 * storage type, columnar takes priority over foreign.
	 */
	if (relationKind == RELKIND_FOREIGN_TABLE)
	{
		bool cstoreTable = CStoreTable(relationId);
		if (cstoreTable)
		{
			storageType = SHARD_STORAGE_COLUMNAR;
		}
		else
		{
			storageType = SHARD_STORAGE_FOREIGN;
		}
	}

	partitionMethod = PartitionMethod(relationId);
	if (partitionMethod == DISTRIBUTE_BY_HASH)
	{
		ereport(ERROR, (errmsg("relation \"%s\" is a hash partitioned table",
							   relationName),
						errdetail("We currently don't support creating shards "
								  "on hash-partitioned tables")));
	}

	/* generate new and unique shardId from sequence */
	shardIdDatum = master_get_new_shardid(NULL);
	shardId = DatumGetInt64(shardIdDatum);

	/* get table DDL commands to replay on the worker node */
	ddlEventList = GetTableDDLEvents(relationId);

	/* if we have enough live nodes, add an extra candidate node as backup */
	attemptableNodeCount = ShardReplicationFactor;
	liveNodeCount = WorkerGetLiveNodeCount();
	if (liveNodeCount > ShardReplicationFactor)
	{
		attemptableNodeCount = ShardReplicationFactor + 1;
	}

	/* first retrieve a list of random nodes for shard placements */
	while (candidateNodeIndex < attemptableNodeCount)
	{
		WorkerNode *candidateNode = NULL;

		if (ShardPlacementPolicy == SHARD_PLACEMENT_LOCAL_NODE_FIRST)
		{
			candidateNode = WorkerGetLocalFirstCandidateNode(candidateNodeList);
		}
		else if (ShardPlacementPolicy == SHARD_PLACEMENT_ROUND_ROBIN)
		{
			candidateNode = WorkerGetRoundRobinCandidateNode(workerNodeList, shardId,
															 candidateNodeIndex);
		}
		else if (ShardPlacementPolicy == SHARD_PLACEMENT_RANDOM)
		{
			candidateNode = WorkerGetRandomCandidateNode(candidateNodeList);
		}
		else
		{
			ereport(ERROR, (errmsg("unrecognized shard placement policy")));
		}

		if (candidateNode == NULL)
		{
			ereport(ERROR, (errmsg("could only find %u of %u possible nodes",
								   candidateNodeIndex, attemptableNodeCount)));
		}

		candidateNodeList = lappend(candidateNodeList, candidateNode);
		candidateNodeIndex++;
	}

	CreateShardPlacements(relationId, shardId, ddlEventList, relationOwner,
						  candidateNodeList, 0, ShardReplicationFactor);

	InsertShardRow(relationId, shardId, storageType, nullMinValue, nullMaxValue);

	PG_RETURN_INT64(shardId);
}
Example #6
/*
 * master_create_worker_shards creates empty shards for the given table based
 * on the specified number of initial shards. The function first gets a list of
 * candidate nodes and issues DDL commands on the nodes to create empty shard
 * placements on those nodes. The function then updates metadata on the master
 * node to make this shard (and its placements) visible. Note that the function
 * assumes the table is hash partitioned and calculates the min/max hash token
 * ranges for each shard, giving them an equal split of the hash space.
 */
Datum
master_create_worker_shards(PG_FUNCTION_ARGS)
{
	text *tableNameText = PG_GETARG_TEXT_P(0);
	int32 shardCount = PG_GETARG_INT32(1);
	int32 replicationFactor = PG_GETARG_INT32(2);

	Oid distributedTableId = ResolveRelationId(tableNameText);
	char relationKind = get_rel_relkind(distributedTableId);
	char *tableName = text_to_cstring(tableNameText);
	char *relationOwner = NULL;
	char shardStorageType = '\0';
	List *workerNodeList = NIL;
	List *ddlCommandList = NIL;
	int32 workerNodeCount = 0;
	uint32 placementAttemptCount = 0;
	uint64 hashTokenIncrement = 0;
	List *existingShardList = NIL;
	int64 shardIndex = 0;

	/* make sure table is hash partitioned */
	CheckHashPartitionedTable(distributedTableId);

	/*
	 * In contrast to append/range-partitioned tables, it makes sense to
	 * require ownership privileges here: shards for hash-partitioned tables
	 * are only created once, not continually during ingest as for the other
	 * partitioning types.
	 */
	EnsureTableOwner(distributedTableId);

	/* we plan to add shards: get an exclusive metadata lock */
	LockRelationDistributionMetadata(distributedTableId, ExclusiveLock);

	relationOwner = TableOwner(distributedTableId);

	/* validate that shards haven't already been created for this table */
	existingShardList = LoadShardList(distributedTableId);
	if (existingShardList != NIL)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("table \"%s\" has already had shards created for it",
							   tableName)));
	}

	/* make sure that at least one shard is specified */
	if (shardCount <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("shard_count must be positive")));
	}

	/* make sure that at least one replica is specified */
	if (replicationFactor <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replication_factor must be positive")));
	}

	/* calculate the split of the hash space */
	hashTokenIncrement = HASH_TOKEN_COUNT / shardCount;

	/* load and sort the worker node list for deterministic placement */
	workerNodeList = WorkerNodeList();
	workerNodeList = SortList(workerNodeList, CompareWorkerNodes);

	/* make sure we don't process cancel signals until all shards are created */
	HOLD_INTERRUPTS();

	/* retrieve the DDL commands for the table */
	ddlCommandList = GetTableDDLEvents(distributedTableId);

	workerNodeCount = list_length(workerNodeList);
	if (replicationFactor > workerNodeCount)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replication_factor (%d) exceeds number of worker nodes "
							   "(%d)", replicationFactor, workerNodeCount),
						errhint("Add more worker nodes or try again with a lower "
								"replication factor.")));
	}

	/* if we have enough nodes, add an extra placement attempt for backup */
	placementAttemptCount = (uint32) replicationFactor;
	if (workerNodeCount > replicationFactor)
	{
		placementAttemptCount++;
	}

	/* set shard storage type according to relation type */
	if (relationKind == RELKIND_FOREIGN_TABLE)
	{
		bool cstoreTable = CStoreTable(distributedTableId);
		if (cstoreTable)
		{
			shardStorageType = SHARD_STORAGE_COLUMNAR;
		}
		else
		{
			shardStorageType = SHARD_STORAGE_FOREIGN;
		}
	}
	else
	{
		shardStorageType = SHARD_STORAGE_TABLE;
	}

	for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
	{
		uint32 roundRobinNodeIndex = shardIndex % workerNodeCount;

		/* initialize the hash token space for this shard */
		text *minHashTokenText = NULL;
		text *maxHashTokenText = NULL;
		int32 shardMinHashToken = INT32_MIN + (shardIndex * hashTokenIncrement);
		int32 shardMaxHashToken = shardMinHashToken + (hashTokenIncrement - 1);
		Datum shardIdDatum = master_get_new_shardid(NULL);
		int64 shardId = DatumGetInt64(shardIdDatum);

		/* if we are at the last shard, make sure the max token value is INT32_MAX */
		if (shardIndex == (shardCount - 1))
		{
			shardMaxHashToken = INT32_MAX;
		}

		/* insert the shard metadata row along with its min/max values */
		minHashTokenText = IntegerToText(shardMinHashToken);
		maxHashTokenText = IntegerToText(shardMaxHashToken);

		/*
		 * Grabbing the shard metadata lock isn't technically necessary since
		 * we already hold an exclusive lock on the partition table, but we'll
		 * acquire it for the sake of completeness. As we're adding new active
		 * placements, the mode must be exclusive.
		 */
		LockShardDistributionMetadata(shardId, ExclusiveLock);

		CreateShardPlacements(shardId, ddlCommandList, relationOwner, workerNodeList,
							  roundRobinNodeIndex, replicationFactor);

		InsertShardRow(distributedTableId, shardId, shardStorageType,
					   minHashTokenText, maxHashTokenText);
	}

	if (QueryCancelPending)
	{
		ereport(WARNING, (errmsg("cancel requests are ignored during shard creation")));
		QueryCancelPending = false;
	}

	RESUME_INTERRUPTS();

	PG_RETURN_VOID();
}
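To make the hash-range arithmetic in Example #6 concrete, the standalone sketch below (plain C, not Citus code) splits the 32-bit hash space the same way for a hypothetical shard count of 4; the HASH_TOKEN_COUNT value is assumed to cover the full 2^32 token space, as the division above implies.

#include <stdint.h>
#include <stdio.h>

/* assumed: the token space covers all 2^32 possible 32-bit hash values */
#define HASH_TOKEN_COUNT UINT64_C(4294967296)

int
main(void)
{
	int64_t shardCount = 4;		/* hypothetical shard_count argument */
	int64_t hashTokenIncrement = (int64_t) (HASH_TOKEN_COUNT / shardCount);
	int64_t shardIndex = 0;

	for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
	{
		int32_t shardMinHashToken =
			(int32_t) (INT32_MIN + shardIndex * hashTokenIncrement);
		int32_t shardMaxHashToken =
			(int32_t) (shardMinHashToken + hashTokenIncrement - 1);

		/* the last shard always ends at INT32_MAX, absorbing any remainder */
		if (shardIndex == shardCount - 1)
		{
			shardMaxHashToken = INT32_MAX;
		}

		printf("shard %lld: [%d, %d]\n", (long long) shardIndex,
			   shardMinHashToken, shardMaxHashToken);
	}

	return 0;
}

With four shards this prints the contiguous ranges [-2147483648, -1073741825], [-1073741824, -1], [0, 1073741823], and [1073741824, 2147483647], which together cover the full int32 range with no gaps or overlaps.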