/*
 * lock_shard_resources allows shard resources to be locked
 * remotely to serialise non-commutative writes on shards.
 *
 * This function does not sort the shard ID array; to avoid deadlocks,
 * callers must ensure a consistent lock acquisition order.
 */
Datum
lock_shard_resources(PG_FUNCTION_ARGS)
{
	LOCKMODE lockMode = IntToLockMode(PG_GETARG_INT32(0));
	ArrayType *shardIdArrayObject = PG_GETARG_ARRAYTYPE_P(1);
	Datum *shardIdArrayDatum = NULL;
	int shardIdCount = 0;
	int shardIdIndex = 0;

	if (ARR_NDIM(shardIdArrayObject) == 0)
	{
		ereport(ERROR, (errmsg("no locks specified")));
	}

	/* we don't want random users to block writes */
	EnsureSuperUser();

	shardIdCount = ArrayObjectCount(shardIdArrayObject);
	shardIdArrayDatum = DeconstructArrayObject(shardIdArrayObject);

	for (shardIdIndex = 0; shardIdIndex < shardIdCount; shardIdIndex++)
	{
		int64 shardId = DatumGetInt64(shardIdArrayDatum[shardIdIndex]);

		LockShardResource(shardId, lockMode);
	}

	PG_RETURN_VOID();
}
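
/*
 * A minimal caller-side sketch, not part of the source above: because
 * lock_shard_resources does not sort the shard ID array, a caller can avoid
 * deadlocks by acquiring the locks in a deterministic order, for example by
 * sorting the shard IDs ascending first. CompareShardIds and
 * LockShardIdsInOrder are hypothetical helpers used only for illustration.
 */
static int
CompareShardIds(const void *leftElement, const void *rightElement)
{
	int64 leftShardId = *((const int64 *) leftElement);
	int64 rightShardId = *((const int64 *) rightElement);

	if (leftShardId < rightShardId)
	{
		return -1;
	}
	else if (leftShardId > rightShardId)
	{
		return 1;
	}

	return 0;
}

static void
LockShardIdsInOrder(int64 *shardIdArray, int shardIdCount, LOCKMODE lockMode)
{
	int shardIdIndex = 0;

	/* sort ascending so that every caller acquires locks in the same order */
	qsort(shardIdArray, shardIdCount, sizeof(int64), CompareShardIds);

	for (shardIdIndex = 0; shardIdIndex < shardIdCount; shardIdIndex++)
	{
		LockShardResource(shardIdArray[shardIdIndex], lockMode);
	}
}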
/*
 * worker_merge_files_into_table creates a task table within the job's schema,
 * which should have already been created by the task tracker protocol, and
 * copies files in its task directory into this table. If the schema doesn't
 * exist, the function defaults to the 'public' schema. Note that, unlike
 * partitioning functions, this function is not always idempotent. On success,
 * the function creates the table and loads data, and subsequent calls to the
 * function error out because the table already exists. On failure, the task
 * table creation commands are rolled back, and the function can be called
 * again.
 */
Datum
worker_merge_files_into_table(PG_FUNCTION_ARGS)
{
	uint64 jobId = PG_GETARG_INT64(0);
	uint32 taskId = PG_GETARG_UINT32(1);
	ArrayType *columnNameObject = PG_GETARG_ARRAYTYPE_P(2);
	ArrayType *columnTypeObject = PG_GETARG_ARRAYTYPE_P(3);

	StringInfo jobSchemaName = JobSchemaName(jobId);
	StringInfo taskTableName = TaskTableName(taskId);
	StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId);
	bool schemaExists = false;
	List *columnNameList = NIL;
	List *columnTypeList = NIL;

	/* we should have the same number of column names and types */
	int32 columnNameCount = ArrayObjectCount(columnNameObject);
	int32 columnTypeCount = ArrayObjectCount(columnTypeObject);
	if (columnNameCount != columnTypeCount)
	{
		ereport(ERROR, (errmsg("column name array size: %d and type array size: %d"
							   " do not match", columnNameCount, columnTypeCount)));
	}

	/*
	 * If the schema for the job hasn't already been created by the task tracker
	 * protocol, we fall back to using the default 'public' schema.
	 */
	schemaExists = JobSchemaExists(jobSchemaName);
	if (!schemaExists)
	{
		resetStringInfo(jobSchemaName);
		appendStringInfoString(jobSchemaName, "public");
	}

	/* create the task table and copy files into the table */
	columnNameList = ArrayObjectToCStringList(columnNameObject);
	columnTypeList = ArrayObjectToCStringList(columnTypeObject);

	CreateTaskTable(jobSchemaName, taskTableName, columnNameList, columnTypeList);

	CopyTaskFilesFromDirectory(jobSchemaName, taskTableName, taskDirectoryName);

	PG_RETURN_VOID();
}
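
/*
 * JobSchemaExists is referenced above but not shown in this listing. A
 * minimal sketch of how such a check could be written against the PostgreSQL
 * catalog, using get_namespace_oid() (from catalog/namespace.h) with
 * missing_ok set to true so a missing schema is reported as false rather
 * than raised as an error. This is an illustrative assumption, not
 * necessarily the actual implementation.
 */
static bool
JobSchemaExistsSketch(StringInfo schemaName)
{
	Oid schemaOid = get_namespace_oid(schemaName->data, true);

	return OidIsValid(schemaOid);
}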
/* Creates a list of cstrings from a single-dimensional array object. */
static List *
ArrayObjectToCStringList(ArrayType *arrayObject)
{
	List *cstringList = NIL;
	Datum *datumArray = DeconstructArrayObject(arrayObject);
	int32 arraySize = ArrayObjectCount(arrayObject);

	int32 arrayIndex = 0;
	for (arrayIndex = 0; arrayIndex < arraySize; arrayIndex++)
	{
		Datum datum = datumArray[arrayIndex];
		char *cstring = TextDatumGetCString(datum);

		cstringList = lappend(cstringList, cstring);
	}

	Assert(cstringList != NIL);
	return cstringList;
}
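
/*
 * DeconstructArrayObject and ArrayObjectCount are helpers used throughout
 * this file but not shown here. A plausible sketch of how they could wrap
 * PostgreSQL's array utilities, deconstruct_array() and ArrayGetNItems()
 * from utils/array.h plus get_typlenbyvalalign() from utils/lsyscache.h;
 * the real helpers may differ, for instance in how they treat null elements.
 */
static Datum *
DeconstructArrayObjectSketch(ArrayType *arrayObject)
{
	Oid elementType = ARR_ELEMTYPE(arrayObject);
	int16 typeLength = 0;
	bool typeByValue = false;
	char typeAlignment = 0;
	Datum *datumArray = NULL;
	bool *nullArray = NULL;
	int datumCount = 0;

	get_typlenbyvalalign(elementType, &typeLength, &typeByValue, &typeAlignment);

	deconstruct_array(arrayObject, elementType, typeLength, typeByValue,
					  typeAlignment, &datumArray, &nullArray, &datumCount);

	return datumArray;
}

static int32
ArrayObjectCountSketch(ArrayType *arrayObject)
{
	return ArrayGetNItems(ARR_NDIM(arrayObject), ARR_DIMS(arrayObject));
}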
/*
 * FetchTableCommon executes common logic that wraps around the actual data
 * fetching function. This common logic includes ensuring that only one process
 * tries to fetch this table at any given time, and that data fetch operations
 * are retried in case of node failures.
 */
static void
FetchTableCommon(text *tableNameText, uint64 remoteTableSize,
				 ArrayType *nodeNameObject, ArrayType *nodePortObject,
				 bool (*FetchTableFunction)(const char *, uint32, const char *))
{
	uint64 shardId = INVALID_SHARD_ID;
	Oid relationId = InvalidOid;
	List *relationNameList = NIL;
	RangeVar *relation = NULL;
	uint32 nodeIndex = 0;
	bool tableFetched = false;
	char *tableName = text_to_cstring(tableNameText);

	Datum *nodeNameArray = DeconstructArrayObject(nodeNameObject);
	Datum *nodePortArray = DeconstructArrayObject(nodePortObject);
	int32 nodeNameCount = ArrayObjectCount(nodeNameObject);
	int32 nodePortCount = ArrayObjectCount(nodePortObject);

	/* we should have the same number of node names and port numbers */
	if (nodeNameCount != nodePortCount)
	{
		ereport(ERROR, (errmsg("node name array size: %d and node port array size: %d"
							   " do not match", nodeNameCount, nodePortCount)));
	}

	/*
	 * We lock on the shardId, but do not unlock. When the function returns, and
	 * the transaction for this function commits, this lock will automatically
	 * be released. This ensures that concurrent caching commands will see the
	 * newly created table when they acquire the lock (in read committed mode).
	 */
	shardId = ExtractShardId(tableName);
	LockShardResource(shardId, AccessExclusiveLock);

	relationNameList = textToQualifiedNameList(tableNameText);
	relation = makeRangeVarFromNameList(relationNameList);
	relationId = RangeVarGetRelid(relation, NoLock, true);

	/* check if we already fetched the table */
	if (relationId != InvalidOid)
	{
		uint64 localTableSize = 0;

		if (!ExpireCachedShards)
		{
			return;
		}

		/*
		 * Check if the cached shard has the same size on disk as it has on
		 * the placement (i.e. whether it is up to date).
		 *
		 * Note 1: performing updates or deletes on the original shard leads to
		 * inconsistent sizes between different databases in which case the data
		 * would be fetched every time, or worse, the placement would get into
		 * a deadlock when it tries to fetch from itself while holding the lock.
		 * Therefore, this option is disabled by default.
		 *
		 * Note 2: when appending data to a shard, the size on disk only
		 * increases when a new page is added (the next 8kB block).
		 */
		localTableSize = LocalTableSize(relationId);

		if (remoteTableSize > localTableSize)
		{
			/* table is not up to date, drop the table */
			ObjectAddress tableObject = { InvalidOid, InvalidOid, 0 };

			tableObject.classId = RelationRelationId;
			tableObject.objectId = relationId;
			tableObject.objectSubId = 0;

			performDeletion(&tableObject, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
		}
		else
		{
			/* table is up to date */
			return;
		}
	}

	/* loop until we fetch the table or try all nodes */
	while (!tableFetched && (nodeIndex < nodeNameCount))
	{
		Datum nodeNameDatum = nodeNameArray[nodeIndex];
		Datum nodePortDatum = nodePortArray[nodeIndex];
		char *nodeName = TextDatumGetCString(nodeNameDatum);
		uint32 nodePort = DatumGetUInt32(nodePortDatum);

		tableFetched = (*FetchTableFunction)(nodeName, nodePort, tableName);

		nodeIndex++;
	}

	/* error out if we tried all nodes and could not fetch the table */
	if (!tableFetched)
	{
		ereport(ERROR, (errmsg("could not fetch relation: \"%s\"", tableName)));
	}
}
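
/*
 * ExtractShardId is referenced above but not shown. Shard tables in this
 * code base are conventionally named <table name>_<shard ID>, so a minimal
 * sketch could parse the digits after the last underscore as below. The
 * parsing and error handling in the real helper may differ; this version
 * uses the standard strtoull() for illustration.
 */
static uint64
ExtractShardIdSketch(const char *tableName)
{
	const char *shardIdString = strrchr(tableName, '_');
	char *parseEnd = NULL;
	uint64 shardId = 0;

	if (shardIdString == NULL)
	{
		ereport(ERROR, (errmsg("could not extract shard id from table name "
							   "\"%s\"", tableName)));
	}

	/* skip the underscore and parse the remaining digits */
	shardId = strtoull(shardIdString + 1, &parseEnd, 10);
	if (parseEnd == shardIdString + 1 || *parseEnd != '\0')
	{
		ereport(ERROR, (errmsg("could not extract shard id from table name "
							   "\"%s\"", tableName)));
	}

	return shardId;
}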