/*
 * RecoverPreparedTransactions recovers any pending prepared transactions
 * started by this node on other nodes.
 */
static int
RecoverPreparedTransactions(void)
{
	List *workerList = NIL;
	ListCell *workerNodeCell = NULL;
	int recoveredTransactionCount = 0;

	/*
	 * We block here if metadata transactions are ongoing, since we mustn't
	 * commit/abort their prepared transactions under their feet. We also
	 * prevent concurrent recovery.
	 */
	LockRelationOid(DistTransactionRelationId(), ExclusiveLock);

	workerList = WorkerNodeList();

	foreach(workerNodeCell, workerList)
	{
		WorkerNode *workerNode = (WorkerNode *) lfirst(workerNodeCell);

		recoveredTransactionCount += RecoverWorkerTransactions(workerNode);
	}

	return recoveredTransactionCount;
}
/*
 * master_get_active_worker_nodes returns a set of active worker host names and
 * port numbers in deterministic order. Currently we assume that all worker
 * nodes in pg_worker_list.conf are active.
 */
Datum
master_get_active_worker_nodes(PG_FUNCTION_ARGS)
{
	FuncCallContext *functionContext = NULL;
	uint32 workerNodeIndex = 0;
	uint32 workerNodeCount = 0;

	if (SRF_IS_FIRSTCALL())
	{
		MemoryContext oldContext = NULL;
		List *workerNodeList = NIL;
		uint32 workerNodeCount = 0;
		TupleDesc tupleDescriptor = NULL;
		bool hasOid = false;

		/* create a function context for cross-call persistence */
		functionContext = SRF_FIRSTCALL_INIT();

		/* switch to memory context appropriate for multiple function calls */
		oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);

		workerNodeList = WorkerNodeList();
		workerNodeCount = (uint32) list_length(workerNodeList);

		functionContext->user_fctx = workerNodeList;
		functionContext->max_calls = workerNodeCount;

		/*
		 * This tuple descriptor must match the output parameters declared for
		 * the function in pg_proc.
		 */
		tupleDescriptor = CreateTemplateTupleDesc(WORKER_NODE_FIELDS, hasOid);
		TupleDescInitEntry(tupleDescriptor, (AttrNumber) 1, "node_name",
						   TEXTOID, -1, 0);
		TupleDescInitEntry(tupleDescriptor, (AttrNumber) 2, "node_port",
						   INT8OID, -1, 0);

		functionContext->tuple_desc = BlessTupleDesc(tupleDescriptor);

		MemoryContextSwitchTo(oldContext);
	}

	functionContext = SRF_PERCALL_SETUP();
	workerNodeIndex = functionContext->call_cntr;
	workerNodeCount = functionContext->max_calls;

	if (workerNodeIndex < workerNodeCount)
	{
		List *workerNodeList = functionContext->user_fctx;
		WorkerNode *workerNode = list_nth(workerNodeList, workerNodeIndex);

		Datum workerNodeDatum = WorkerNodeGetDatum(workerNode,
												   functionContext->tuple_desc);

		SRF_RETURN_NEXT(functionContext, workerNodeDatum);
	}
	else
	{
		SRF_RETURN_DONE(functionContext);
	}
}
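/*
 * Illustrative use from SQL (not part of this file): since the function above
 * is a set-returning UDF, it can be called as
 *
 *   SELECT * FROM master_get_active_worker_nodes();
 *
 * and returns one (node_name, node_port) row per configured worker. The
 * actual host names and ports depend entirely on the cluster at hand.
 */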
/*
 * master_get_round_robin_candidate_nodes returns a set of candidate host names
 * and port numbers on which to place new shards. The function uses the round
 * robin policy to choose the nodes and tries to ensure that there is an even
 * distribution of shards across the worker nodes. This function errors out if
 * the number of available nodes falls short of the replication factor.
 */
Datum
master_get_round_robin_candidate_nodes(PG_FUNCTION_ARGS)
{
	uint64 shardId = PG_GETARG_INT64(0);
	FuncCallContext *functionContext = NULL;
	uint32 desiredNodeCount = 0;
	uint32 currentNodeCount = 0;

	if (SRF_IS_FIRSTCALL())
	{
		MemoryContext oldContext = NULL;
		TupleDesc tupleDescriptor = NULL;
		List *workerNodeList = NIL;
		TypeFuncClass resultTypeClass = 0;
		uint32 workerNodeCount = 0;

		/* create a function context for cross-call persistence */
		functionContext = SRF_FIRSTCALL_INIT();

		/* switch to memory context appropriate for multiple function calls */
		oldContext = MemoryContextSwitchTo(functionContext->multi_call_memory_ctx);

		/* get the worker node list and sort it for determinism */
		workerNodeList = WorkerNodeList();
		workerNodeList = SortList(workerNodeList, CompareWorkerNodes);

		functionContext->user_fctx = workerNodeList;
		functionContext->max_calls = ShardReplicationFactor;

		/* if we have enough live nodes, return an extra candidate node as backup */
		workerNodeCount = (uint32) list_length(workerNodeList);
		if (workerNodeCount > ShardReplicationFactor)
		{
			functionContext->max_calls = ShardReplicationFactor + 1;
		}

		/* create tuple descriptor for return value */
		resultTypeClass = get_call_result_type(fcinfo, NULL, &tupleDescriptor);
		if (resultTypeClass != TYPEFUNC_COMPOSITE)
		{
			ereport(ERROR, (errmsg("return type must be a row type")));
		}

		functionContext->tuple_desc = tupleDescriptor;

		MemoryContextSwitchTo(oldContext);
	}

	functionContext = SRF_PERCALL_SETUP();
	desiredNodeCount = functionContext->max_calls;
	currentNodeCount = functionContext->call_cntr;

	if (currentNodeCount < desiredNodeCount)
	{
		List *workerNodeList = functionContext->user_fctx;
		WorkerNode *candidateNode = NULL;
		Datum candidateDatum = 0;

		candidateNode = WorkerGetRoundRobinCandidateNode(workerNodeList, shardId,
														 currentNodeCount);
		if (candidateNode == NULL)
		{
			ereport(ERROR, (errmsg("could only find %u of %u required nodes",
								   currentNodeCount, desiredNodeCount)));
		}

		candidateDatum = WorkerNodeGetDatum(candidateNode,
											functionContext->tuple_desc);

		SRF_RETURN_NEXT(functionContext, candidateDatum);
	}
	else
	{
		SRF_RETURN_DONE(functionContext);
	}
}
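/*
 * The helper WorkerGetRoundRobinCandidateNode is not shown in this excerpt.
 * Below is a rough, standalone sketch of the kind of index arithmetic a
 * round-robin policy like the one described above typically uses, assuming
 * the worker list is sorted and the shard id serves as the starting offset;
 * it is not the extension's actual helper.
 */
#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-in for round-robin worker selection */
static uint32_t
RoundRobinWorkerIndex(uint64_t shardId, uint32_t placementIndex,
					  uint32_t workerNodeCount)
{
	/* start at (shardId mod workerNodeCount), advance one worker per placement */
	return (uint32_t) ((shardId + placementIndex) % workerNodeCount);
}

int
main(void)
{
	uint64_t shardId = 102008;		/* example shard id */
	uint32_t workerNodeCount = 3;	/* example cluster size */

	for (uint32_t placementIndex = 0; placementIndex < workerNodeCount;
		 placementIndex++)
	{
		printf("placement %u -> worker index %u\n", placementIndex,
			   RoundRobinWorkerIndex(shardId, placementIndex, workerNodeCount));
	}

	return 0;
}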
/*
 * JobExecutorType selects the executor type for the given multiPlan using the
 * task executor type config value. The function then checks if the given
 * multiPlan needs more resources than those provided to it by other config
 * values, and issues warnings accordingly. If the selected executor type
 * cannot execute the given multiPlan, the function errors out.
 */
MultiExecutorType
JobExecutorType(MultiPlan *multiPlan)
{
	Job *job = multiPlan->workerJob;
	List *workerTaskList = job->taskList;
	List *workerNodeList = WorkerNodeList();
	int taskCount = list_length(workerTaskList);
	int workerNodeCount = list_length(workerNodeList);
	double tasksPerNode = taskCount / ((double) workerNodeCount);
	int dependedJobCount = list_length(job->dependedJobList);
	MultiExecutorType executorType = TaskExecutorType;
	bool routerExecutablePlan = multiPlan->routerExecutable;

	/* check whether we can switch to the router executor */
	if (routerExecutablePlan)
	{
		ereport(DEBUG2, (errmsg("Plan is router executable")));
		return MULTI_EXECUTOR_ROUTER;
	}

	if (executorType == MULTI_EXECUTOR_REAL_TIME)
	{
		double reasonableConnectionCount = 0;

		/* if we need to open too many connections per worker, warn the user */
		if (tasksPerNode >= MaxConnections)
		{
			ereport(WARNING, (errmsg("this query uses more connections than the "
									 "configured max_connections limit"),
							  errhint("Consider increasing max_connections or "
									  "setting citus.task_executor_type to "
									  "\"task-tracker\".")));
		}

		/*
		 * If we need to open too many outgoing connections, warn the user.
		 * The real-time executor caps the number of tasks it starts by the
		 * same limit, but we still issue this warning because it degrades
		 * performance.
		 */
		reasonableConnectionCount = MaxMasterConnectionCount();
		if (taskCount >= reasonableConnectionCount)
		{
			ereport(WARNING, (errmsg("this query uses more file descriptors than "
									 "the configured max_files_per_process limit"),
							  errhint("Consider increasing max_files_per_process "
									  "or setting citus.task_executor_type to "
									  "\"task-tracker\".")));
		}

		/* if we have repartition jobs with the real-time executor, error out */
		if (dependedJobCount > 0)
		{
			ereport(ERROR, (errmsg("cannot use real time executor with "
								   "repartition jobs"),
							errhint("Set citus.task_executor_type to "
									"\"task-tracker\".")));
		}
	}
	else
	{
		/* if we have more tasks per node than what can be tracked, warn the user */
		if (tasksPerNode >= MaxTrackedTasksPerNode)
		{
			ereport(WARNING, (errmsg("this query assigns more tasks per node than "
									 "the configured "
									 "max_tracked_tasks_per_node limit")));
		}
	}

	return executorType;
}
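/*
 * For a concrete sense of the thresholds above: with, say, 8 worker nodes,
 * max_connections = 100, and a real-time query producing 900 tasks,
 * tasksPerNode is 112.5, so the first warning fires; and if
 * MaxMasterConnectionCount() returned, say, 700, the second warning would
 * fire as well. These numbers are purely illustrative.
 */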
/*
 * master_create_empty_shard creates an empty shard for the given distributed
 * table. For this, the function first gets a list of candidate nodes, connects
 * to these nodes, and issues DDL commands on the nodes to create empty shard
 * placements. The function then updates metadata on the master node to make
 * this shard (and its placements) visible.
 */
Datum
master_create_empty_shard(PG_FUNCTION_ARGS)
{
	text *relationNameText = PG_GETARG_TEXT_P(0);
	char *relationName = text_to_cstring(relationNameText);
	List *workerNodeList = WorkerNodeList();
	Datum shardIdDatum = 0;
	int64 shardId = INVALID_SHARD_ID;
	List *ddlEventList = NIL;
	uint32 attemptableNodeCount = 0;
	uint32 liveNodeCount = 0;
	uint32 candidateNodeIndex = 0;
	List *candidateNodeList = NIL;
	text *nullMinValue = NULL;
	text *nullMaxValue = NULL;
	char partitionMethod = 0;
	char storageType = SHARD_STORAGE_TABLE;

	Oid relationId = ResolveRelationId(relationNameText);
	char relationKind = get_rel_relkind(relationId);
	char *relationOwner = TableOwner(relationId);

	EnsureTablePermissions(relationId, ACL_INSERT);
	CheckDistributedTable(relationId);

	/*
	 * We check whether the table is a foreign table or not. If it is, we set
	 * the storage type as foreign as well. The only exception is a foreign
	 * cstore table, in which case we set the storage type as columnar.
	 *
	 * That is, when setting the storage type, columnar takes priority over
	 * foreign.
	 */
	if (relationKind == RELKIND_FOREIGN_TABLE)
	{
		bool cstoreTable = CStoreTable(relationId);
		if (cstoreTable)
		{
			storageType = SHARD_STORAGE_COLUMNAR;
		}
		else
		{
			storageType = SHARD_STORAGE_FOREIGN;
		}
	}

	partitionMethod = PartitionMethod(relationId);
	if (partitionMethod == DISTRIBUTE_BY_HASH)
	{
		ereport(ERROR, (errmsg("relation \"%s\" is a hash partitioned table",
							   relationName),
						errdetail("We currently don't support creating shards "
								  "on hash-partitioned tables")));
	}

	/* generate a new and unique shardId from the sequence */
	shardIdDatum = master_get_new_shardid(NULL);
	shardId = DatumGetInt64(shardIdDatum);

	/* get table DDL commands to replay on the worker node */
	ddlEventList = GetTableDDLEvents(relationId);

	/* if we have enough live nodes, add an extra candidate node as backup */
	attemptableNodeCount = ShardReplicationFactor;
	liveNodeCount = WorkerGetLiveNodeCount();
	if (liveNodeCount > ShardReplicationFactor)
	{
		attemptableNodeCount = ShardReplicationFactor + 1;
	}

	/* first retrieve a list of random nodes for shard placements */
	while (candidateNodeIndex < attemptableNodeCount)
	{
		WorkerNode *candidateNode = NULL;

		if (ShardPlacementPolicy == SHARD_PLACEMENT_LOCAL_NODE_FIRST)
		{
			candidateNode = WorkerGetLocalFirstCandidateNode(candidateNodeList);
		}
		else if (ShardPlacementPolicy == SHARD_PLACEMENT_ROUND_ROBIN)
		{
			candidateNode = WorkerGetRoundRobinCandidateNode(workerNodeList, shardId,
															 candidateNodeIndex);
		}
		else if (ShardPlacementPolicy == SHARD_PLACEMENT_RANDOM)
		{
			candidateNode = WorkerGetRandomCandidateNode(candidateNodeList);
		}
		else
		{
			ereport(ERROR, (errmsg("unrecognized shard placement policy")));
		}

		if (candidateNode == NULL)
		{
			ereport(ERROR, (errmsg("could only find %u of %u possible nodes",
								   candidateNodeIndex, attemptableNodeCount)));
		}

		candidateNodeList = lappend(candidateNodeList, candidateNode);
		candidateNodeIndex++;
	}

	CreateShardPlacements(relationId, shardId, ddlEventList, relationOwner,
						  candidateNodeList, 0, ShardReplicationFactor);

	InsertShardRow(relationId, shardId, storageType, nullMinValue, nullMaxValue);

	PG_RETURN_INT64(shardId);
}
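/*
 * Illustrative use from SQL (not part of this file): for an append- or
 * range-partitioned table, a new empty shard would typically be created with
 *
 *   SELECT master_create_empty_shard('table_name');
 *
 * which returns the new shard id; data is then loaded into that shard in a
 * separate step. The table name here is a placeholder.
 */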
/*
 * master_create_worker_shards creates empty shards for the given table based
 * on the specified number of initial shards. The function first gets a list of
 * candidate nodes and issues DDL commands on the nodes to create empty shard
 * placements on those nodes. The function then updates metadata on the master
 * node to make this shard (and its placements) visible. Note that the function
 * assumes the table is hash partitioned and calculates the min/max hash token
 * ranges for each shard, giving them an equal split of the hash space.
 */
Datum
master_create_worker_shards(PG_FUNCTION_ARGS)
{
	text *tableNameText = PG_GETARG_TEXT_P(0);
	int32 shardCount = PG_GETARG_INT32(1);
	int32 replicationFactor = PG_GETARG_INT32(2);

	Oid distributedTableId = ResolveRelationId(tableNameText);
	char relationKind = get_rel_relkind(distributedTableId);
	char *tableName = text_to_cstring(tableNameText);
	char *relationOwner = NULL;
	char shardStorageType = '\0';
	List *workerNodeList = NIL;
	List *ddlCommandList = NIL;
	int32 workerNodeCount = 0;
	uint32 placementAttemptCount = 0;
	uint64 hashTokenIncrement = 0;
	List *existingShardList = NIL;
	int64 shardIndex = 0;

	/* make sure table is hash partitioned */
	CheckHashPartitionedTable(distributedTableId);

	/*
	 * In contrast to append/range partitioned tables it makes more sense to
	 * require ownership privileges - shards for hash-partitioned tables are
	 * only created once, not continually during ingest as for the other
	 * partitioning types.
	 */
	EnsureTableOwner(distributedTableId);

	/* we plan to add shards: get an exclusive metadata lock */
	LockRelationDistributionMetadata(distributedTableId, ExclusiveLock);

	relationOwner = TableOwner(distributedTableId);

	/* validate that shards haven't already been created for this table */
	existingShardList = LoadShardList(distributedTableId);
	if (existingShardList != NIL)
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("table \"%s\" has already had shards created "
							   "for it", tableName)));
	}

	/* make sure that at least one shard is specified */
	if (shardCount <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("shard_count must be positive")));
	}

	/* make sure that at least one replica is specified */
	if (replicationFactor <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replication_factor must be positive")));
	}

	/* calculate the split of the hash space */
	hashTokenIncrement = HASH_TOKEN_COUNT / shardCount;

	/* load and sort the worker node list for deterministic placement */
	workerNodeList = WorkerNodeList();
	workerNodeList = SortList(workerNodeList, CompareWorkerNodes);

	/* make sure we don't process cancel signals until all shards are created */
	HOLD_INTERRUPTS();

	/* retrieve the DDL commands for the table */
	ddlCommandList = GetTableDDLEvents(distributedTableId);

	workerNodeCount = list_length(workerNodeList);
	if (replicationFactor > workerNodeCount)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replication_factor (%d) exceeds number of "
							   "worker nodes (%d)", replicationFactor,
							   workerNodeCount),
						errhint("Add more worker nodes or try again with a "
								"lower replication factor.")));
	}

	/* if we have enough nodes, add an extra placement attempt for backup */
	placementAttemptCount = (uint32) replicationFactor;
	if (workerNodeCount > replicationFactor)
	{
		placementAttemptCount++;
	}

	/* set shard storage type according to relation type */
	if (relationKind == RELKIND_FOREIGN_TABLE)
	{
		bool cstoreTable = CStoreTable(distributedTableId);
		if (cstoreTable)
		{
			shardStorageType = SHARD_STORAGE_COLUMNAR;
		}
		else
		{
			shardStorageType = SHARD_STORAGE_FOREIGN;
		}
	}
	else
	{
		shardStorageType = SHARD_STORAGE_TABLE;
	}

	for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
	{
		uint32 roundRobinNodeIndex = shardIndex % workerNodeCount;

		/* initialize the hash token space for this shard */
		text *minHashTokenText = NULL;
		text *maxHashTokenText = NULL;
		int32 shardMinHashToken = INT32_MIN + (shardIndex * hashTokenIncrement);
		int32 shardMaxHashToken = shardMinHashToken + (hashTokenIncrement - 1);
		Datum shardIdDatum = master_get_new_shardid(NULL);
		int64 shardId = DatumGetInt64(shardIdDatum);

		/* if we are at the last shard, make sure the max token value is INT32_MAX */
		if (shardIndex == (shardCount - 1))
		{
			shardMaxHashToken = INT32_MAX;
		}

		/* insert the shard metadata row along with its min/max values */
		minHashTokenText = IntegerToText(shardMinHashToken);
		maxHashTokenText = IntegerToText(shardMaxHashToken);

		/*
		 * Grabbing the shard metadata lock isn't technically necessary since
		 * we already hold an exclusive lock on the partition table, but we'll
		 * acquire it for the sake of completeness. As we're adding new active
		 * placements, the mode must be exclusive.
		 */
		LockShardDistributionMetadata(shardId, ExclusiveLock);

		CreateShardPlacements(shardId, ddlCommandList, relationOwner,
							  workerNodeList, roundRobinNodeIndex,
							  replicationFactor);

		InsertShardRow(distributedTableId, shardId, shardStorageType,
					   minHashTokenText, maxHashTokenText);
	}

	if (QueryCancelPending)
	{
		ereport(WARNING, (errmsg("cancel requests are ignored during shard "
								 "creation")));
		QueryCancelPending = false;
	}

	RESUME_INTERRUPTS();

	PG_RETURN_VOID();
}
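/*
 * A minimal standalone sketch (separate from the function above) of how the
 * per-shard hash token ranges split the int32 hash space, assuming
 * HASH_TOKEN_COUNT is 2^32. For example, shardCount = 4 yields the ranges
 * [-2147483648, -1073741825], [-1073741824, -1], [0, 1073741823], and
 * [1073741824, 2147483647].
 */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

int
main(void)
{
	const int64_t hashTokenCount = INT64_C(4294967296);	/* 2^32, assumed value */
	int32_t shardCount = 4;
	int64_t hashTokenIncrement = hashTokenCount / shardCount;

	for (int32_t shardIndex = 0; shardIndex < shardCount; shardIndex++)
	{
		int64_t shardMinHashToken = (int64_t) INT32_MIN +
									shardIndex * hashTokenIncrement;
		int64_t shardMaxHashToken = shardMinHashToken + (hashTokenIncrement - 1);

		/* the last shard absorbs any rounding remainder up to INT32_MAX */
		if (shardIndex == shardCount - 1)
		{
			shardMaxHashToken = INT32_MAX;
		}

		printf("shard %" PRId32 ": [%" PRId64 ", %" PRId64 "]\n",
			   shardIndex, shardMinHashToken, shardMaxHashToken);
	}

	return 0;
}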