/*
 * AcquireExecutorShardLock: acquire shard lock needed for execution of
 * a single task within a distributed plan.
 */
static void
AcquireExecutorShardLock(Task *task, LOCKMODE lockMode)
{
	int64 shardId = task->anchorShardId;

	LockShardResource(shardId, lockMode);
}
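/*
 * Example (hypothetical caller, not part of the original file): an executor
 * routine might acquire the shard lock before running a modification task.
 * ShareUpdateExclusiveLock is an illustrative lock mode; the appropriate
 * choice depends on whether the task's writes are commutative.
 */
static void
ExecuteShardModifyTask(Task *task)
{
	/* serialise against other non-commutative writes on the same shard */
	AcquireExecutorShardLock(task, ShareUpdateExclusiveLock);

	/* ... execute the task's query against the shard placement ... */
}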
/*
 * lock_shard_resources allows shard resources to be locked
 * remotely to serialise non-commutative writes on shards.
 *
 * This function does not sort the shard ID array; to avoid deadlocks,
 * callers must ensure a consistent locking order.
 */
Datum
lock_shard_resources(PG_FUNCTION_ARGS)
{
	LOCKMODE lockMode = IntToLockMode(PG_GETARG_INT32(0));
	ArrayType *shardIdArrayObject = PG_GETARG_ARRAYTYPE_P(1);
	Datum *shardIdArrayDatum = NULL;
	int shardIdCount = 0;
	int shardIdIndex = 0;

	if (ARR_NDIM(shardIdArrayObject) == 0)
	{
		ereport(ERROR, (errmsg("no locks specified")));
	}

	/* we don't want random users to block writes */
	EnsureSuperUser();

	shardIdCount = ArrayObjectCount(shardIdArrayObject);
	shardIdArrayDatum = DeconstructArrayObject(shardIdArrayObject);

	for (shardIdIndex = 0; shardIdIndex < shardIdCount; shardIdIndex++)
	{
		int64 shardId = DatumGetInt64(shardIdArrayDatum[shardIdIndex]);

		LockShardResource(shardId, lockMode);
	}

	PG_RETURN_VOID();
}
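/*
 * Usage sketch, assuming the SQL-level wrapper for this UDF takes an int
 * lock mode and a bigint[] of shard IDs, and that IntToLockMode follows
 * PostgreSQL's standard lock-mode numbering (7 = ExclusiveLock). The shard
 * IDs are passed in ascending order so that all callers lock in the same
 * order:
 *
 *   SELECT lock_shard_resources(7, ARRAY[102008, 102009, 102010]);
 */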
/*
 * LockShards takes shared locks on the metadata and the data of all shards in
 * shardIntervalList. This prevents concurrent placement changes and concurrent
 * DML statements that require an exclusive lock.
 */
void
LockShards(List *shardIntervalList, LOCKMODE lockMode)
{
	ListCell *shardIntervalCell = NULL;

	/* lock shards in order of shard id to prevent deadlock */
	shardIntervalList = SortList(shardIntervalList, CompareShardIntervalsById);

	foreach(shardIntervalCell, shardIntervalList)
	{
		ShardInterval *shardInterval = (ShardInterval *) lfirst(shardIntervalCell);
		int64 shardId = shardInterval->shardId;

		/* prevent concurrent changes to number of placements */
		LockShardDistributionMetadata(shardId, lockMode);

		/* prevent concurrent update/delete statements */
		LockShardResource(shardId, lockMode);
	}
}
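/*
 * Example (hypothetical caller): a multi-shard command could lock every
 * shard of a relation up front. LoadShardIntervalList is assumed to return
 * all shard intervals of the relation; LockShards then sorts and locks them
 * in a deadlock-free order.
 */
static void
LockAllShardsOfRelation(Oid relationId, LOCKMODE lockMode)
{
	List *shardIntervalList = LoadShardIntervalList(relationId);

	LockShards(shardIntervalList, lockMode);
}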
/*
 * master_append_table_to_shard appends the given table's contents to the given
 * shard, and updates shard metadata on the master node. If the function fails
 * to append table data to all shard placements, it doesn't update any metadata
 * and errors out. Else if the function fails to append table data to some of
 * the shard placements, it marks those placements as invalid. These invalid
 * placements will get cleaned up during shard rebalancing.
 */
Datum
master_append_table_to_shard(PG_FUNCTION_ARGS)
{
	uint64 shardId = PG_GETARG_INT64(0);
	text *sourceTableNameText = PG_GETARG_TEXT_P(1);
	text *sourceNodeNameText = PG_GETARG_TEXT_P(2);
	uint32 sourceNodePort = PG_GETARG_UINT32(3);

	char *sourceTableName = text_to_cstring(sourceTableNameText);
	char *sourceNodeName = text_to_cstring(sourceNodeNameText);

	char *shardName = NULL;
	List *shardPlacementList = NIL;
	List *succeededPlacementList = NIL;
	List *failedPlacementList = NIL;
	ListCell *shardPlacementCell = NULL;
	ListCell *failedPlacementCell = NULL;
	uint64 newShardSize = 0;
	uint64 shardMaxSizeInBytes = 0;
	float4 shardFillLevel = 0.0;
	char partitionMethod = 0;

	ShardInterval *shardInterval = LoadShardInterval(shardId);
	Oid relationId = shardInterval->relationId;
	bool cstoreTable = CStoreTable(relationId);
	char storageType = shardInterval->storageType;

	EnsureTablePermissions(relationId, ACL_INSERT);

	if (storageType != SHARD_STORAGE_TABLE && !cstoreTable)
	{
		ereport(ERROR, (errmsg("cannot append to shardId " UINT64_FORMAT, shardId),
						errdetail("The underlying shard is not a regular table")));
	}

	partitionMethod = PartitionMethod(relationId);
	if (partitionMethod == DISTRIBUTE_BY_HASH)
	{
		ereport(ERROR, (errmsg("cannot append to shardId " UINT64_FORMAT, shardId),
						errdetail("We currently don't support appending to shards "
								  "in hash-partitioned tables")));
	}

	/*
	 * We lock on the shardId, but do not unlock. When the function returns, and
	 * the transaction for this function commits, this lock will automatically
	 * be released. This ensures appends to a shard happen in a serial manner.
	 */
	LockShardResource(shardId, AccessExclusiveLock);

	/* if shard doesn't have an alias, extend regular table name */
	shardName = LoadShardAlias(relationId, shardId);
	if (shardName == NULL)
	{
		shardName = get_rel_name(relationId);
		AppendShardIdToName(&shardName, shardId);
	}

	shardPlacementList = FinalizedShardPlacementList(shardId);
	if (shardPlacementList == NIL)
	{
		ereport(ERROR, (errmsg("could not find any shard placements for shardId "
							   UINT64_FORMAT, shardId),
						errhint("Try running master_create_empty_shard() first")));
	}

	/* issue command to append table to each shard placement */
	foreach(shardPlacementCell, shardPlacementList)
	{
		ShardPlacement *shardPlacement = (ShardPlacement *) lfirst(shardPlacementCell);
		char *workerName = shardPlacement->nodeName;
		uint32 workerPort = shardPlacement->nodePort;
		List *queryResultList = NIL;

		StringInfo workerAppendQuery = makeStringInfo();
		appendStringInfo(workerAppendQuery, WORKER_APPEND_TABLE_TO_SHARD,
						 quote_literal_cstr(shardName),
						 quote_literal_cstr(sourceTableName),
						 quote_literal_cstr(sourceNodeName), sourceNodePort);

		/* inserting data should be performed by the current user */
		queryResultList = ExecuteRemoteQuery(workerName, workerPort, NULL,
											 workerAppendQuery);
		if (queryResultList != NIL)
		{
			succeededPlacementList = lappend(succeededPlacementList, shardPlacement);
		}
		else
		{
			failedPlacementList = lappend(failedPlacementList, shardPlacement);
		}
	}

	/* before updating metadata, check that we appended to at least one placement */
	if (succeededPlacementList == NIL)
	{
		ereport(ERROR, (errmsg("could not append table to any shard placement")));
	}

	/*
	 * Mark failed placements as invalid so they get cleaned up during shard
	 * rebalancing, per the header comment. The metadata helpers used below
	 * (DeleteShardPlacementRow, InsertShardPlacementRow, UpdateShardStatistics)
	 * are assumed to match the surrounding codebase.
	 */
	foreach(failedPlacementCell, failedPlacementList)
	{
		ShardPlacement *failedPlacement = (ShardPlacement *) lfirst(failedPlacementCell);

		DeleteShardPlacementRow(failedPlacement->shardId, failedPlacement->nodeName,
								failedPlacement->nodePort);
		InsertShardPlacementRow(failedPlacement->shardId, FILE_INACTIVE,
								failedPlacement->shardLength,
								failedPlacement->nodeName, failedPlacement->nodePort);
	}

	/* update shard statistics and calculate the shard's fill level */
	newShardSize = UpdateShardStatistics(shardId);
	shardMaxSizeInBytes = (uint64) ShardMaxSize * 1024L;
	shardFillLevel = ((float4) newShardSize) / ((float4) shardMaxSizeInBytes);

	PG_RETURN_FLOAT4(shardFillLevel);
}
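/*
 * Usage sketch: append a staging table's contents to shard 102009, pulling
 * the data from the node that holds the staging table (host name and port
 * below are illustrative). The returned fill level tells the caller how
 * full the shard is relative to the configured shard max size:
 *
 *   SELECT master_append_table_to_shard(102009, 'events_staging',
 *                                       'node-100.example.com', 5432);
 */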
/*
 * FetchTableCommon executes common logic that wraps around the actual data
 * fetching function. This common logic includes ensuring that only one process
 * tries to fetch this table at any given time, and that data fetch operations
 * are retried in case of node failures.
 */
static void
FetchTableCommon(text *tableNameText, uint64 remoteTableSize,
				 ArrayType *nodeNameObject, ArrayType *nodePortObject,
				 bool (*FetchTableFunction)(const char *, uint32, const char *))
{
	uint64 shardId = INVALID_SHARD_ID;
	Oid relationId = InvalidOid;
	List *relationNameList = NIL;
	RangeVar *relation = NULL;
	uint32 nodeIndex = 0;
	bool tableFetched = false;

	char *tableName = text_to_cstring(tableNameText);

	Datum *nodeNameArray = DeconstructArrayObject(nodeNameObject);
	Datum *nodePortArray = DeconstructArrayObject(nodePortObject);

	int32 nodeNameCount = ArrayObjectCount(nodeNameObject);
	int32 nodePortCount = ArrayObjectCount(nodePortObject);

	/* we should have the same number of node names and port numbers */
	if (nodeNameCount != nodePortCount)
	{
		ereport(ERROR, (errmsg("node name array size: %d and node port array size: %d"
							   " do not match", nodeNameCount, nodePortCount)));
	}

	/*
	 * We lock on the shardId, but do not unlock. When the function returns, and
	 * the transaction for this function commits, this lock will automatically
	 * be released. This ensures that concurrent caching commands will see the
	 * newly created table when they acquire the lock (in read committed mode).
	 */
	shardId = ExtractShardId(tableName);
	LockShardResource(shardId, AccessExclusiveLock);

	relationNameList = textToQualifiedNameList(tableNameText);
	relation = makeRangeVarFromNameList(relationNameList);
	relationId = RangeVarGetRelid(relation, NoLock, true);

	/* check if we already fetched the table */
	if (relationId != InvalidOid)
	{
		uint64 localTableSize = 0;

		if (!ExpireCachedShards)
		{
			return;
		}

		/*
		 * Check if the cached shard has the same size on disk as it has on
		 * the placement (i.e. is up to date).
		 *
		 * Note 1: performing updates or deletes on the original shard leads to
		 * inconsistent sizes between different databases, in which case the
		 * data would be fetched every time, or worse, the placement would get
		 * into a deadlock when it tries to fetch from itself while holding the
		 * lock. Therefore, this option is disabled by default.
		 *
		 * Note 2: when appending data to a shard, the size on disk only
		 * increases when a new page is added (the next 8kB block).
		 */
		localTableSize = LocalTableSize(relationId);

		if (remoteTableSize > localTableSize)
		{
			/* table is not up to date, drop the table */
			ObjectAddress tableObject = { InvalidOid, InvalidOid, 0 };

			tableObject.classId = RelationRelationId;
			tableObject.objectId = relationId;
			tableObject.objectSubId = 0;

			performDeletion(&tableObject, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
		}
		else
		{
			/* table is up to date */
			return;
		}
	}

	/* loop until we fetch the table or try all nodes */
	while (!tableFetched && (nodeIndex < nodeNameCount))
	{
		Datum nodeNameDatum = nodeNameArray[nodeIndex];
		Datum nodePortDatum = nodePortArray[nodeIndex];
		char *nodeName = TextDatumGetCString(nodeNameDatum);
		uint32 nodePort = DatumGetUInt32(nodePortDatum);

		tableFetched = (*FetchTableFunction)(nodeName, nodePort, tableName);

		nodeIndex++;
	}

	/* error out if we tried all nodes and could not fetch the table */
	if (!tableFetched)
	{
		ereport(ERROR, (errmsg("could not fetch relation: \"%s\"", tableName)));
	}
}
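/*
 * Sketch of a fetch UDF built on top of FetchTableCommon, assuming a
 * FetchRegularTable helper with the (nodeName, nodePort, tableName)
 * signature that the function pointer above expects.
 */
Datum
worker_fetch_regular_table(PG_FUNCTION_ARGS)
{
	text *tableNameText = PG_GETARG_TEXT_P(0);
	uint64 remoteTableSize = PG_GETARG_INT64(1);
	ArrayType *nodeNameObject = PG_GETARG_ARRAYTYPE_P(2);
	ArrayType *nodePortObject = PG_GETARG_ARRAYTYPE_P(3);

	FetchTableCommon(tableNameText, remoteTableSize, nodeNameObject,
					 nodePortObject, &FetchRegularTable);

	PG_RETURN_VOID();
}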