/* * lock_shard_resources allows shard resources to be locked * remotely to serialise non-commutative writes on shards. * * This function does not sort the array to avoid deadlock, callers * must ensure a consistent order. */ Datum lock_shard_resources(PG_FUNCTION_ARGS) { LOCKMODE lockMode = IntToLockMode(PG_GETARG_INT32(0)); ArrayType *shardIdArrayObject = PG_GETARG_ARRAYTYPE_P(1); Datum *shardIdArrayDatum = NULL; int shardIdCount = 0; int shardIdIndex = 0; if (ARR_NDIM(shardIdArrayObject) == 0) { ereport(ERROR, (errmsg("no locks specified"))); } /* we don't want random users to block writes */ EnsureSuperUser(); shardIdCount = ArrayObjectCount(shardIdArrayObject); shardIdArrayDatum = DeconstructArrayObject(shardIdArrayObject); for (shardIdIndex = 0; shardIdIndex < shardIdCount; shardIdIndex++) { int64 shardId = DatumGetInt64(shardIdArrayDatum[shardIdIndex]); LockShardResource(shardId, lockMode); } PG_RETURN_VOID(); }
/* * worker_merge_files_into_table creates a task table within the job's schema, * which should have already been created by the task tracker protocol, and * copies files in its task directory into this table. If the schema doesn't * exist, the function defaults to the 'public' schema. Note that, unlike * partitioning functions, this function is not always idempotent. On success, * the function creates the table and loads data, and subsequent calls to the * function error out because the table already exist. On failure, the task * table creation commands are rolled back, and the function can be called * again. */ Datum worker_merge_files_into_table(PG_FUNCTION_ARGS) { uint64 jobId = PG_GETARG_INT64(0); uint32 taskId = PG_GETARG_UINT32(1); ArrayType *columnNameObject = PG_GETARG_ARRAYTYPE_P(2); ArrayType *columnTypeObject = PG_GETARG_ARRAYTYPE_P(3); StringInfo jobSchemaName = JobSchemaName(jobId); StringInfo taskTableName = TaskTableName(taskId); StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId); bool schemaExists = false; List *columnNameList = NIL; List *columnTypeList = NIL; /* we should have the same number of column names and types */ int32 columnNameCount = ArrayObjectCount(columnNameObject); int32 columnTypeCount = ArrayObjectCount(columnTypeObject); if (columnNameCount != columnTypeCount) { ereport(ERROR, (errmsg("column name array size: %d and type array size: %d" " do not match", columnNameCount, columnTypeCount))); } /* * If the schema for the job isn't already created by the task tracker * protocol, we fall to using the default 'public' schema. 
*/ schemaExists = JobSchemaExists(jobSchemaName); if (!schemaExists) { resetStringInfo(jobSchemaName); appendStringInfoString(jobSchemaName, "public"); } /* create the task table and copy files into the table */ columnNameList = ArrayObjectToCStringList(columnNameObject); columnTypeList = ArrayObjectToCStringList(columnTypeObject); CreateTaskTable(jobSchemaName, taskTableName, columnNameList, columnTypeList); CopyTaskFilesFromDirectory(jobSchemaName, taskTableName, taskDirectoryName); PG_RETURN_VOID(); }
/*
 * ArrayObjectToCStringList converts a single dimensional array object into a
 * list of cstrings. Each element is converted with TextDatumGetCString and
 * appended to the list, so the list keeps the array's element order. The
 * resulting list is asserted to be non-empty.
 */
static List *
ArrayObjectToCStringList(ArrayType *arrayObject)
{
	List *resultList = NIL;
	Datum *elements = DeconstructArrayObject(arrayObject);
	int32 elementCount = ArrayObjectCount(arrayObject);
	int32 position = 0;

	while (position < elementCount)
	{
		char *elementString = TextDatumGetCString(elements[position]);

		resultList = lappend(resultList, elementString);
		position++;
	}

	Assert(resultList != NIL);

	return resultList;
}
/* * FetchTableCommon executes common logic that wraps around the actual data * fetching function. This common logic includes ensuring that only one process * tries to fetch this table at any given time, and that data fetch operations * are retried in case of node failures. */ static void FetchTableCommon(text *tableNameText, uint64 remoteTableSize, ArrayType *nodeNameObject, ArrayType *nodePortObject, bool (*FetchTableFunction)(const char *, uint32, const char *)) { uint64 shardId = INVALID_SHARD_ID; Oid relationId = InvalidOid; List *relationNameList = NIL; RangeVar *relation = NULL; uint32 nodeIndex = 0; bool tableFetched = false; char *tableName = text_to_cstring(tableNameText); Datum *nodeNameArray = DeconstructArrayObject(nodeNameObject); Datum *nodePortArray = DeconstructArrayObject(nodePortObject); int32 nodeNameCount = ArrayObjectCount(nodeNameObject); int32 nodePortCount = ArrayObjectCount(nodePortObject); /* we should have the same number of node names and port numbers */ if (nodeNameCount != nodePortCount) { ereport(ERROR, (errmsg("node name array size: %d and node port array size: %d" " do not match", nodeNameCount, nodePortCount))); } /* * We lock on the shardId, but do not unlock. When the function returns, and * the transaction for this function commits, this lock will automatically * be released. This ensures that concurrent caching commands will see the * newly created table when they acquire the lock (in read committed mode). */ shardId = ExtractShardId(tableName); LockShardResource(shardId, AccessExclusiveLock); relationNameList = textToQualifiedNameList(tableNameText); relation = makeRangeVarFromNameList(relationNameList); relationId = RangeVarGetRelid(relation, NoLock, true); /* check if we already fetched the table */ if (relationId != InvalidOid) { uint64 localTableSize = 0; if (!ExpireCachedShards) { return; } /* * Check if the cached shard has the same size on disk as it has as on * the placement (is up to date). 
* * Note 1: performing updates or deletes on the original shard leads to * inconsistent sizes between different databases in which case the data * would be fetched every time, or worse, the placement would get into * a deadlock when it tries to fetch from itself while holding the lock. * Therefore, this option is disabled by default. * * Note 2: when appending data to a shard, the size on disk only * increases when a new page is added (the next 8kB block). */ localTableSize = LocalTableSize(relationId); if (remoteTableSize > localTableSize) { /* table is not up to date, drop the table */ ObjectAddress tableObject = { InvalidOid, InvalidOid, 0 }; tableObject.classId = RelationRelationId; tableObject.objectId = relationId; tableObject.objectSubId = 0; performDeletion(&tableObject, DROP_RESTRICT, PERFORM_DELETION_INTERNAL); } else { /* table is up to date */ return; } } /* loop until we fetch the table or try all nodes */ while (!tableFetched && (nodeIndex < nodeNameCount)) { Datum nodeNameDatum = nodeNameArray[nodeIndex]; Datum nodePortDatum = nodePortArray[nodeIndex]; char *nodeName = TextDatumGetCString(nodeNameDatum); uint32 nodePort = DatumGetUInt32(nodePortDatum); tableFetched = (*FetchTableFunction)(nodeName, nodePort, tableName); nodeIndex++; } /* error out if we tried all nodes and could not fetch the table */ if (!tableFetched) { ereport(ERROR, (errmsg("could not fetch relation: \"%s\"", tableName))); } }