/*
 * FixedJoinOrderList returns a list of join order nodes for the query in the
 * order specified by the user. This is used to handle join trees that contain
 * OUTER joins. The regular JoinOrderList currently assumes that all joins are
 * inner-joins and can thus be arbitrarily reordered, which is not the case
 * for OUTER joins. At some point we should merge these two functions.
 */
List *
FixedJoinOrderList(FromExpr *fromExpr, List *tableEntryList)
{
	List *joinList = NIL;
	ListCell *joinCell = NULL;
	List *joinWhereClauseList = NIL;
	List *joinOrderList = NIL;
	List *joinedTableList = NIL;
	JoinOrderNode *firstJoinNode = NULL;
	JoinOrderNode *currentJoinNode = NULL;
	ListCell *tableEntryCell = NULL;

	foreach(tableEntryCell, tableEntryList)
	{
		TableEntry *rangeTableEntry = (TableEntry *) lfirst(tableEntryCell);
		Oid relationId = rangeTableEntry->relationId;
		DistTableCacheEntry *cacheEntry = DistributedTableCacheEntry(relationId);

		if (cacheEntry->partitionMethod != DISTRIBUTE_BY_NONE &&
			cacheEntry->hasUninitializedShardInterval)
		{
			ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
							errmsg("cannot perform distributed planning on this query"),
							errdetail("Shards of relations in outer join queries must "
									  "have shard min/max values.")));
		}
	}

/*
 * LoadShardList reads the list of shards for the given relationId from
 * pg_dist_shard, and returns the list of found shardIds.
 */
List *
LoadShardList(Oid relationId)
{
	DistTableCacheEntry *cacheEntry = DistributedTableCacheEntry(relationId);
	List *shardList = NIL;
	int i = 0;

	for (i = 0; i < cacheEntry->shardIntervalArrayLength; i++)
	{
		ShardInterval *currentShardInterval = &cacheEntry->shardIntervalArray[i];
		uint64 *shardIdPointer = AllocateUint64(currentShardInterval->shardId);

		shardList = lappend(shardList, shardIdPointer);
	}

	return shardList;
}

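/*
 * Illustrative sketch (not part of the original source): one way a caller
 * might consume the shardId list returned by LoadShardList. The helper name
 * CountShardsForRelation and the DEBUG1 message are hypothetical.
 */
static int
CountShardsForRelation(Oid relationId)
{
	List *shardIdList = LoadShardList(relationId);
	int shardIdCount = list_length(shardIdList);

	elog(DEBUG1, "relation %u has %d shards", relationId, shardIdCount);

	return shardIdCount;
}
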
/*
 * LoadShardIntervalList returns a list of shard intervals for a given
 * distributed table. The function returns an empty list if no shards can be
 * found for the given relation.
 */
List *
LoadShardIntervalList(Oid relationId)
{
	DistTableCacheEntry *cacheEntry = DistributedTableCacheEntry(relationId);
	List *shardList = NIL;
	int i = 0;

	for (i = 0; i < cacheEntry->shardIntervalArrayLength; i++)
	{
		ShardInterval *newShardInterval = NULL;
		newShardInterval = (ShardInterval *) palloc0(sizeof(ShardInterval));

		CopyShardInterval(&cacheEntry->shardIntervalArray[i], newShardInterval);

		shardList = lappend(shardList, newShardInterval);
	}

	return shardList;
}

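/*
 * Illustrative sketch (not part of the original source): since
 * LoadShardIntervalList returns palloc'd copies, callers can walk the list
 * without touching the metadata cache entries directly. The helper name
 * LogShardIntervals is hypothetical.
 */
static void
LogShardIntervals(Oid relationId)
{
	List *shardIntervalList = LoadShardIntervalList(relationId);
	ListCell *shardIntervalCell = NULL;

	foreach(shardIntervalCell, shardIntervalList)
	{
		ShardInterval *shardInterval = (ShardInterval *) lfirst(shardIntervalCell);

		elog(DEBUG1, "found shard " UINT64_FORMAT, shardInterval->shardId);
	}
}
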
/*
 * CreateShardsWithRoundRobinPolicy creates empty shards for the given table
 * based on the specified number of initial shards. The function first updates
 * metadata on the coordinator node to make this shard (and its placements)
 * visible. Note that the function assumes the table is hash partitioned and
 * calculates the min/max hash token ranges for each shard, giving them an
 * equal split of the hash space. Finally, the function creates empty shard
 * placements on worker nodes.
 */
void
CreateShardsWithRoundRobinPolicy(Oid distributedTableId, int32 shardCount,
								 int32 replicationFactor,
								 bool useExclusiveConnections)
{
	char shardStorageType = 0;
	List *workerNodeList = NIL;
	int32 workerNodeCount = 0;
	uint32 placementAttemptCount = 0;
	uint64 hashTokenIncrement = 0;
	List *existingShardList = NIL;
	int64 shardIndex = 0;
	DistTableCacheEntry *cacheEntry = DistributedTableCacheEntry(distributedTableId);
	bool colocatedShard = false;
	List *insertedShardPlacements = NIL;

	/* make sure table is hash partitioned */
	CheckHashPartitionedTable(distributedTableId);

	/*
	 * In contrast to append/range partitioned tables it makes more sense to
	 * require ownership privileges - shards for hash-partitioned tables are
	 * only created once, not continually during ingest as for the other
	 * partitioning types.
	 */
	EnsureTableOwner(distributedTableId);

	/* we plan to add shards: get an exclusive lock on relation oid */
	LockRelationOid(distributedTableId, ExclusiveLock);

	/* validate that shards haven't already been created for this table */
	existingShardList = LoadShardList(distributedTableId);
	if (existingShardList != NIL)
	{
		char *tableName = get_rel_name(distributedTableId);
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("table \"%s\" has already had shards created for it",
							   tableName)));
	}

	/* make sure that at least one shard is specified */
	if (shardCount <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("shard_count must be positive")));
	}

	/* make sure that at least one replica is specified */
	if (replicationFactor <= 0)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replication_factor must be positive")));
	}

	/* make sure that RF=1 if the table is streaming replicated */
	if (cacheEntry->replicationModel == REPLICATION_MODEL_STREAMING &&
		replicationFactor > 1)
	{
		char *relationName = get_rel_name(cacheEntry->relationId);
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("using replication factor %d with the streaming "
							   "replication model is not supported",
							   replicationFactor),
						errdetail("The table %s is marked as streaming replicated and "
								  "the shard replication factor of streaming replicated "
								  "tables must be 1.", relationName),
						errhint("Use replication factor 1.")));
	}

	/* calculate the split of the hash space */
	hashTokenIncrement = HASH_TOKEN_COUNT / shardCount;

	/* don't allow concurrent node list changes that require an exclusive lock */
	LockRelationOid(DistNodeRelationId(), RowShareLock);

	/* load and sort the worker node list for deterministic placement */
	workerNodeList = ActivePrimaryNodeList();
	workerNodeList = SortList(workerNodeList, CompareWorkerNodes);

	/* make sure we don't process cancel signals until all shards are created */
	HOLD_INTERRUPTS();

	workerNodeCount = list_length(workerNodeList);
	if (replicationFactor > workerNodeCount)
	{
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("replication_factor (%d) exceeds number of worker nodes "
							   "(%d)", replicationFactor, workerNodeCount),
						errhint("Add more worker nodes or try again with a lower "
								"replication factor.")));
	}

	/* if we have enough nodes, add an extra placement attempt for backup */
	placementAttemptCount = (uint32) replicationFactor;
	if (workerNodeCount > replicationFactor)
	{
		placementAttemptCount++;
	}

	/* set shard storage type according to relation type */
	shardStorageType = ShardStorageType(distributedTableId);

	for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
	{
		uint32 roundRobinNodeIndex = shardIndex % workerNodeCount;

		/* initialize the hash token space for this shard */
		text *minHashTokenText = NULL;
		text *maxHashTokenText = NULL;
		int32 shardMinHashToken = INT32_MIN + (shardIndex * hashTokenIncrement);
		int32 shardMaxHashToken = shardMinHashToken + (hashTokenIncrement - 1);
		uint64 shardId = GetNextShardId();
		List *currentInsertedShardPlacements = NIL;

		/* if we are at the last shard, make sure the max token value is INT32_MAX */
		if (shardIndex == (shardCount - 1))
		{
			shardMaxHashToken = INT32_MAX;
		}

		/* insert the shard metadata row along with its min/max values */
		minHashTokenText = IntegerToText(shardMinHashToken);
		maxHashTokenText = IntegerToText(shardMaxHashToken);

		/*
		 * Grabbing the shard metadata lock isn't technically necessary since
		 * we already hold an exclusive lock on the partition table, but we'll
		 * acquire it for the sake of completeness. As we're adding new active
		 * placements, the mode must be exclusive.
		 */
		LockShardDistributionMetadata(shardId, ExclusiveLock);

		InsertShardRow(distributedTableId, shardId, shardStorageType,
					   minHashTokenText, maxHashTokenText);

		currentInsertedShardPlacements = InsertShardPlacementRows(distributedTableId,
																  shardId,
																  workerNodeList,
																  roundRobinNodeIndex,
																  replicationFactor);
		insertedShardPlacements = list_concat(insertedShardPlacements,
											  currentInsertedShardPlacements);
	}

	CreateShardsOnWorkers(distributedTableId, insertedShardPlacements,
						  useExclusiveConnections, colocatedShard);

	if (QueryCancelPending)
	{
		ereport(WARNING, (errmsg("cancel requests are ignored during shard creation")));
		QueryCancelPending = false;
	}

	RESUME_INTERRUPTS();
}

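/*
 * Illustrative sketch (not part of the original source): how the hash token
 * split above plays out. With HASH_TOKEN_COUNT = 2^32 and shardCount = 4, the
 * increment is 2^30 = 1073741824, giving the ranges
 * [-2147483648, -1073741825], [-1073741824, -1], [0, 1073741823] and
 * [1073741824, 2147483647]; the last shard's max is pinned to INT32_MAX so
 * the whole int32 hash space is covered. The helper name PrintHashTokenRanges
 * is hypothetical.
 */
static void
PrintHashTokenRanges(int32 shardCount)
{
	uint64 hashTokenIncrement = HASH_TOKEN_COUNT / shardCount;
	int64 shardIndex = 0;

	for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
	{
		int32 shardMinHashToken = INT32_MIN + (shardIndex * hashTokenIncrement);
		int32 shardMaxHashToken = shardMinHashToken + (hashTokenIncrement - 1);

		if (shardIndex == (shardCount - 1))
		{
			shardMaxHashToken = INT32_MAX;
		}

		elog(DEBUG1, "shard %ld covers [%d, %d]",
			 (long) shardIndex, shardMinHashToken, shardMaxHashToken);
	}
}
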
/*
 * master_get_table_metadata takes in a relation name, and returns partition
 * related metadata for the relation. These metadata are grouped and returned
 * in a tuple, and are used by the caller when creating new shards. The
 * function errors out if the given relation does not exist, or is not
 * partitioned.
 */
Datum
master_get_table_metadata(PG_FUNCTION_ARGS)
{
	text *relationName = PG_GETARG_TEXT_P(0);
	Oid relationId = ResolveRelationId(relationName);

	DistTableCacheEntry *partitionEntry = NULL;
	TypeFuncClass resultTypeClass = 0;
	Datum partitionKeyExpr = 0;
	Datum partitionKey = 0;
	Datum metadataDatum = 0;
	HeapTuple metadataTuple = NULL;
	TupleDesc metadataDescriptor = NULL;
	uint64 shardMaxSizeInBytes = 0;
	char relationType = 0;
	char storageType = 0;

	Datum values[TABLE_METADATA_FIELDS];
	bool isNulls[TABLE_METADATA_FIELDS];

	/* find partition tuple for partitioned relation */
	partitionEntry = DistributedTableCacheEntry(relationId);

	/* create tuple descriptor for return value */
	resultTypeClass = get_call_result_type(fcinfo, NULL, &metadataDescriptor);
	if (resultTypeClass != TYPEFUNC_COMPOSITE)
	{
		ereport(ERROR, (errmsg("return type must be a row type")));
	}

	/* get decompiled expression tree for partition key */
	partitionKeyExpr =
		PointerGetDatum(cstring_to_text(partitionEntry->partitionKeyString));
	partitionKey = DirectFunctionCall2(pg_get_expr, partitionKeyExpr,
									   ObjectIdGetDatum(relationId));

	/* form heap tuple for table metadata */
	memset(values, 0, sizeof(values));
	memset(isNulls, false, sizeof(isNulls));

	shardMaxSizeInBytes = (int64) ShardMaxSize * 1024L;

	/* get storage type */
	relationType = get_rel_relkind(relationId);
	if (relationType == RELKIND_RELATION)
	{
		storageType = SHARD_STORAGE_TABLE;
	}
	else if (relationType == RELKIND_FOREIGN_TABLE)
	{
		bool cstoreTable = CStoreTable(relationId);
		if (cstoreTable)
		{
			storageType = SHARD_STORAGE_COLUMNAR;
		}
		else
		{
			storageType = SHARD_STORAGE_FOREIGN;
		}
	}

	values[0] = ObjectIdGetDatum(relationId);
	values[1] = storageType;
	values[2] = partitionEntry->partitionMethod;
	values[3] = partitionKey;
	values[4] = Int32GetDatum(ShardReplicationFactor);
	values[5] = Int64GetDatum(shardMaxSizeInBytes);
	values[6] = Int32GetDatum(ShardPlacementPolicy);

	metadataTuple = heap_form_tuple(metadataDescriptor, values, isNulls);
	metadataDatum = HeapTupleGetDatum(metadataTuple);

	PG_RETURN_DATUM(metadataDatum);
}

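/*
 * Illustrative usage note (not part of the original source): this UDF is
 * intended to be invoked from SQL, for example
 *
 *   SELECT * FROM master_get_table_metadata('my_distributed_table');
 *
 * which returns a single row containing the relation id, storage type,
 * partition method, partition key, replication factor, maximum shard size in
 * bytes, and placement policy. The table name my_distributed_table is
 * hypothetical.
 */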