/* * lock_shard_resources allows shard resources to be locked * remotely to serialise non-commutative writes on shards. * * This function does not sort the array to avoid deadlock, callers * must ensure a consistent order. */ Datum lock_shard_resources(PG_FUNCTION_ARGS) { LOCKMODE lockMode = IntToLockMode(PG_GETARG_INT32(0)); ArrayType *shardIdArrayObject = PG_GETARG_ARRAYTYPE_P(1); Datum *shardIdArrayDatum = NULL; int shardIdCount = 0; int shardIdIndex = 0; if (ARR_NDIM(shardIdArrayObject) == 0) { ereport(ERROR, (errmsg("no locks specified"))); } /* we don't want random users to block writes */ EnsureSuperUser(); shardIdCount = ArrayObjectCount(shardIdArrayObject); shardIdArrayDatum = DeconstructArrayObject(shardIdArrayObject); for (shardIdIndex = 0; shardIdIndex < shardIdCount; shardIdIndex++) { int64 shardId = DatumGetInt64(shardIdArrayDatum[shardIdIndex]); LockShardResource(shardId, lockMode); } PG_RETURN_VOID(); }
/*
 * ArrayObjectToCStringList converts each element of a single dimensional
 * array object to a C string (via TextDatumGetCString) and returns the
 * strings as a list, preserving array order.
 */
static List *
ArrayObjectToCStringList(ArrayType *arrayObject)
{
	List *resultList = NIL;
	Datum *elementArray = DeconstructArrayObject(arrayObject);
	int32 elementCount = ArrayObjectCount(arrayObject);
	int32 elementIndex = 0;

	while (elementIndex < elementCount)
	{
		resultList = lappend(resultList,
							 TextDatumGetCString(elementArray[elementIndex]));
		elementIndex++;
	}

	/* the resulting list is expected to hold at least one string */
	Assert(resultList != NIL);

	return resultList;
}
/* * FetchTableCommon executes common logic that wraps around the actual data * fetching function. This common logic includes ensuring that only one process * tries to fetch this table at any given time, and that data fetch operations * are retried in case of node failures. */ static void FetchTableCommon(text *tableNameText, uint64 remoteTableSize, ArrayType *nodeNameObject, ArrayType *nodePortObject, bool (*FetchTableFunction)(const char *, uint32, const char *)) { uint64 shardId = INVALID_SHARD_ID; Oid relationId = InvalidOid; List *relationNameList = NIL; RangeVar *relation = NULL; uint32 nodeIndex = 0; bool tableFetched = false; char *tableName = text_to_cstring(tableNameText); Datum *nodeNameArray = DeconstructArrayObject(nodeNameObject); Datum *nodePortArray = DeconstructArrayObject(nodePortObject); int32 nodeNameCount = ArrayObjectCount(nodeNameObject); int32 nodePortCount = ArrayObjectCount(nodePortObject); /* we should have the same number of node names and port numbers */ if (nodeNameCount != nodePortCount) { ereport(ERROR, (errmsg("node name array size: %d and node port array size: %d" " do not match", nodeNameCount, nodePortCount))); } /* * We lock on the shardId, but do not unlock. When the function returns, and * the transaction for this function commits, this lock will automatically * be released. This ensures that concurrent caching commands will see the * newly created table when they acquire the lock (in read committed mode). */ shardId = ExtractShardId(tableName); LockShardResource(shardId, AccessExclusiveLock); relationNameList = textToQualifiedNameList(tableNameText); relation = makeRangeVarFromNameList(relationNameList); relationId = RangeVarGetRelid(relation, NoLock, true); /* check if we already fetched the table */ if (relationId != InvalidOid) { uint64 localTableSize = 0; if (!ExpireCachedShards) { return; } /* * Check if the cached shard has the same size on disk as it has as on * the placement (is up to date). 
* * Note 1: performing updates or deletes on the original shard leads to * inconsistent sizes between different databases in which case the data * would be fetched every time, or worse, the placement would get into * a deadlock when it tries to fetch from itself while holding the lock. * Therefore, this option is disabled by default. * * Note 2: when appending data to a shard, the size on disk only * increases when a new page is added (the next 8kB block). */ localTableSize = LocalTableSize(relationId); if (remoteTableSize > localTableSize) { /* table is not up to date, drop the table */ ObjectAddress tableObject = { InvalidOid, InvalidOid, 0 }; tableObject.classId = RelationRelationId; tableObject.objectId = relationId; tableObject.objectSubId = 0; performDeletion(&tableObject, DROP_RESTRICT, PERFORM_DELETION_INTERNAL); } else { /* table is up to date */ return; } } /* loop until we fetch the table or try all nodes */ while (!tableFetched && (nodeIndex < nodeNameCount)) { Datum nodeNameDatum = nodeNameArray[nodeIndex]; Datum nodePortDatum = nodePortArray[nodeIndex]; char *nodeName = TextDatumGetCString(nodeNameDatum); uint32 nodePort = DatumGetUInt32(nodePortDatum); tableFetched = (*FetchTableFunction)(nodeName, nodePort, tableName); nodeIndex++; } /* error out if we tried all nodes and could not fetch the table */ if (!tableFetched) { ereport(ERROR, (errmsg("could not fetch relation: \"%s\"", tableName))); } }