/* * master_copy_shard_placement implements a user-facing UDF to copy data from * a healthy (source) node to an inactive (target) node. To accomplish this it * entirely recreates the table structure before copying all data. During this * time all modifications are paused to the shard. After successful repair, the * inactive placement is marked healthy and modifications may continue. If the * repair fails at any point, this function throws an error, leaving the node * in an unhealthy state. */ Datum master_copy_shard_placement(PG_FUNCTION_ARGS) { int64 shardId = PG_GETARG_INT64(0); text *sourceNodeName = PG_GETARG_TEXT_P(1); int32 sourceNodePort = PG_GETARG_INT32(2); text *targetNodeName = PG_GETARG_TEXT_P(3); int32 targetNodePort = PG_GETARG_INT32(4); ShardInterval *shardInterval = LoadShardInterval(shardId); Oid distributedTableId = shardInterval->relationId; List *shardPlacementList = NIL; ShardPlacement *sourcePlacement = NULL; ShardPlacement *targetPlacement = NULL; List *ddlCommandList = NIL; bool recreated = false; bool dataCopied = false; /* * By taking an exclusive lock on the shard, we both stop all modifications * (INSERT, UPDATE, or DELETE) and prevent concurrent repair operations from * being able to operate on this shard. 
*/ LockShard(shardId, ExclusiveLock); shardPlacementList = LoadShardPlacementList(shardId); sourcePlacement = SearchShardPlacementInList(shardPlacementList, sourceNodeName, sourceNodePort); if (sourcePlacement->shardState != STATE_FINALIZED) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("source placement must be in finalized state"))); } targetPlacement = SearchShardPlacementInList(shardPlacementList, targetNodeName, targetNodePort); if (targetPlacement->shardState != STATE_INACTIVE) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("target placement must be in inactive state"))); } /* retrieve the DDL commands for the table and run them */ ddlCommandList = RecreateTableDDLCommandList(distributedTableId, shardId); recreated = ExecuteRemoteCommandList(targetPlacement->nodeName, targetPlacement->nodePort, ddlCommandList); if (!recreated) { ereport(ERROR, (errmsg("could not recreate shard table"), errhint("Consult recent messages in the server logs for " "details."))); } HOLD_INTERRUPTS(); dataCopied = CopyDataFromFinalizedPlacement(distributedTableId, shardId, sourcePlacement, targetPlacement); if (!dataCopied) { ereport(ERROR, (errmsg("could not copy shard data"), errhint("Consult recent messages in the server logs for " "details."))); } /* the placement is repaired, so return to finalized state */ DeleteShardPlacementRow(targetPlacement->id); InsertShardPlacementRow(targetPlacement->id, targetPlacement->shardId, STATE_FINALIZED, targetPlacement->nodeName, targetPlacement->nodePort); RESUME_INTERRUPTS(); PG_RETURN_VOID(); }
/*
 * DropShards drops all given shards in a relation. The id, name and schema
 * for the relation are explicitly provided, since this function may be
 * called when the table is already dropped.
 *
 * We mark shard placements that we couldn't drop as to be deleted later, but
 * we do delete the shard metadata.
 */
static int
DropShards(Oid relationId, char *schemaName, char *relationName,
		   List *deletableShardIntervalList)
{
	ListCell *shardIntervalCell = NULL;
	int droppedShardCount = 0;

	BeginOrContinueCoordinatedTransaction();

	/* At this point we intentionally decided to not use 2PC for reference tables */
	if (MultiShardCommitProtocol == COMMIT_PROTOCOL_2PC)
	{
		CoordinatedTransactionUse2PC();
	}

	foreach(shardIntervalCell, deletableShardIntervalList)
	{
		List *shardPlacementList = NIL;
		ListCell *shardPlacementCell = NULL;
		ShardInterval *shardInterval = (ShardInterval *) lfirst(shardIntervalCell);
		uint64 shardId = shardInterval->shardId;
		char *quotedShardName = NULL;

		/* copy the base name so the shard id suffix doesn't clobber the input */
		char *shardRelationName = pstrdup(relationName);

		Assert(shardInterval->relationId == relationId);

		/* Build shard relation name. */
		AppendShardIdToName(&shardRelationName, shardId);
		quotedShardName = quote_qualified_identifier(schemaName, shardRelationName);

		shardPlacementList = ShardPlacementList(shardId);
		foreach(shardPlacementCell, shardPlacementList)
		{
			ShardPlacement *shardPlacement =
				(ShardPlacement *) lfirst(shardPlacementCell);
			char *workerName = shardPlacement->nodeName;
			uint32 workerPort = shardPlacement->nodePort;
			StringInfo workerDropQuery = makeStringInfo();
			MultiConnection *connection = NULL;
			uint32 connectionFlags = FOR_DDL;
			char storageType = shardInterval->storageType;

			/* pick the DROP statement that matches the shard's storage type */
			if (storageType == SHARD_STORAGE_TABLE)
			{
				appendStringInfo(workerDropQuery, DROP_REGULAR_TABLE_COMMAND,
								 quotedShardName);
			}
			else if (storageType == SHARD_STORAGE_COLUMNAR ||
					 storageType == SHARD_STORAGE_FOREIGN)
			{
				appendStringInfo(workerDropQuery, DROP_FOREIGN_TABLE_COMMAND,
								 quotedShardName);
			}

			connection = GetPlacementConnection(connectionFlags, shardPlacement,
												NULL);

			RemoteTransactionBeginIfNecessary(connection);

			/*
			 * If we can't reach the worker, don't fail the whole operation:
			 * warn, flag the placement for deferred deletion, and move on to
			 * the next placement.
			 */
			if (PQstatus(connection->pgConn) != CONNECTION_OK)
			{
				uint64 placementId = shardPlacement->placementId;

				ereport(WARNING, (errmsg("could not connect to shard \"%s\" on node "
										 "\"%s:%u\"", shardRelationName, workerName,
										 workerPort),
								  errdetail("Marking this shard placement for "
											"deletion")));

				UpdateShardPlacementState(placementId, FILE_TO_DELETE);
				continue;
			}

			/* drop must succeed on a reachable worker, or the transaction fails */
			MarkRemoteTransactionCritical(connection);

			ExecuteCriticalRemoteCommand(connection, workerDropQuery->data);

			DeleteShardPlacementRow(shardPlacement->placementId);
		}