/*
 * worker_fetch_partition_file fetches a partition file from the remote node.
 * The function assumes an upstream compute task depends on this partition file,
 * and therefore directly fetches the file into the upstream task's directory.
 */
Datum
worker_fetch_partition_file(PG_FUNCTION_ARGS)
{
	uint64 jobId = PG_GETARG_INT64(0);
	uint32 partitionTaskId = PG_GETARG_UINT32(1);
	uint32 partitionFileId = PG_GETARG_UINT32(2);
	uint32 upstreamTaskId = PG_GETARG_UINT32(3);
	text *nodeNameText = PG_GETARG_TEXT_P(4);
	uint32 nodePort = PG_GETARG_UINT32(5);
	char *nodeName = NULL;

	/* remote filename is <jobId>/<partitionTaskId>/<partitionFileId> */
	StringInfo remoteDirectoryName = TaskDirectoryName(jobId, partitionTaskId);
	StringInfo remoteFilename = PartitionFilename(remoteDirectoryName, partitionFileId);

	/* local filename is <jobId>/<upstreamTaskId>/<partitionTaskId> */
	StringInfo taskDirectoryName = TaskDirectoryName(jobId, upstreamTaskId);
	StringInfo taskFilename = TaskFilename(taskDirectoryName, partitionTaskId);

	/*
	 * If this is the first call to fetch a file for the upstream task, the
	 * task directory does not yet exist. We then lock and create the directory.
	 */
	bool taskDirectoryExists = DirectoryExists(taskDirectoryName);
	if (!taskDirectoryExists)
	{
		InitTaskDirectory(jobId, upstreamTaskId);
	}

	nodeName = text_to_cstring(nodeNameText);
	FetchRegularFile(nodeName, nodePort, remoteFilename, taskFilename);

	PG_RETURN_VOID();
}
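
/*
 * Usage sketch (illustrative, not from the original source): assuming a
 * CREATE FUNCTION declaration that mirrors the C argument list above, a
 * caller could fetch partition file 3 of partition task 2 in job 1001 into
 * upstream task 4's directory with:
 *
 *   SELECT worker_fetch_partition_file(1001, 2, 3, 4, 'worker-1', 5432);
 *
 * All argument values here are hypothetical.
 */
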
/*
 * worker_merge_files_into_table creates a task table within the job's schema,
 * which should have already been created by the task tracker protocol, and
 * copies files in its task directory into this table. If the schema doesn't
 * exist, the function defaults to the 'public' schema. Note that, unlike
 * partitioning functions, this function is not always idempotent. On success,
 * the function creates the table and loads data, and subsequent calls to the
 * function error out because the table already exists. On failure, the task
 * table creation commands are rolled back, and the function can be called
 * again.
 */
Datum
worker_merge_files_into_table(PG_FUNCTION_ARGS)
{
	uint64 jobId = PG_GETARG_INT64(0);
	uint32 taskId = PG_GETARG_UINT32(1);
	ArrayType *columnNameObject = PG_GETARG_ARRAYTYPE_P(2);
	ArrayType *columnTypeObject = PG_GETARG_ARRAYTYPE_P(3);

	StringInfo jobSchemaName = JobSchemaName(jobId);
	StringInfo taskTableName = TaskTableName(taskId);
	StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId);
	bool schemaExists = false;
	List *columnNameList = NIL;
	List *columnTypeList = NIL;

	/* we should have the same number of column names and types */
	int32 columnNameCount = ArrayObjectCount(columnNameObject);
	int32 columnTypeCount = ArrayObjectCount(columnTypeObject);
	if (columnNameCount != columnTypeCount)
	{
		ereport(ERROR, (errmsg("column name array size: %d and type array size: %d"
							   " do not match", columnNameCount, columnTypeCount)));
	}

	/*
	 * If the schema for the job isn't already created by the task tracker
 * protocol, we fall back to using the default 'public' schema.
	 */
	schemaExists = JobSchemaExists(jobSchemaName);
	if (!schemaExists)
	{
		resetStringInfo(jobSchemaName);
		appendStringInfoString(jobSchemaName, "public");
	}

	/* create the task table and copy files into the table */
	columnNameList = ArrayObjectToCStringList(columnNameObject);
	columnTypeList = ArrayObjectToCStringList(columnTypeObject);

	CreateTaskTable(jobSchemaName, taskTableName, columnNameList, columnTypeList);

	CopyTaskFilesFromDirectory(jobSchemaName, taskTableName, taskDirectoryName);

	PG_RETURN_VOID();
}
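
/*
 * Usage sketch (illustrative, not from the original source): the column
 * names and types arrive as parallel text arrays, so a hypothetical call
 * merging the files of task 2 in job 1001 into a two-column table might be:
 *
 *   SELECT worker_merge_files_into_table(1001, 2,
 *              ARRAY['id', 'value'], ARRAY['bigint', 'text']);
 *
 * All argument values here are hypothetical.
 */
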
/*
 * worker_merge_files_and_run_query creates a merge task table within the job's
 * schema, which should have already been created by the task tracker protocol.
 * It copies files in its task directory into this table. Then, it runs the
 * final query to create the job's result table.
 *
 * Note that we follow a different approach here to create the task table for
 * merge files than worker_merge_files_into_table() does. In the future, we
 * should unify these two approaches; creating and using a directory_fdw
 * extension would make sense for this purpose. We could then merge files,
 * with or without a query, through directory_fdw.
 */
Datum
worker_merge_files_and_run_query(PG_FUNCTION_ARGS)
{
	uint64 jobId = PG_GETARG_INT64(0);
	uint32 taskId = PG_GETARG_UINT32(1);
	text *createMergeTableQueryText = PG_GETARG_TEXT_P(2);
	text *createIntermediateTableQueryText = PG_GETARG_TEXT_P(3);

	const char *createMergeTableQuery = text_to_cstring(createMergeTableQueryText);
	const char *createIntermediateTableQuery =
		text_to_cstring(createIntermediateTableQueryText);

	StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId);
	StringInfo jobSchemaName = JobSchemaName(jobId);
	StringInfo intermediateTableName = TaskTableName(taskId);
	StringInfo mergeTableName = makeStringInfo();
	StringInfo setSearchPathString = makeStringInfo();
	bool schemaExists = false;
	int connected = 0;
	int setSearchPathResult = 0;
	int createMergeTableResult = 0;
	int createIntermediateTableResult = 0;
	int finished = 0;

	/*
	 * If the schema for the job isn't already created by the task tracker
	 * protocol, we fall back to using the default 'public' schema.
	 */
	schemaExists = JobSchemaExists(jobSchemaName);
	if (!schemaExists)
	{
		resetStringInfo(jobSchemaName);
		appendStringInfoString(jobSchemaName, "public");
	}

	appendStringInfo(setSearchPathString, SET_SEARCH_PATH_COMMAND, jobSchemaName->data);

	/* Add "public" to search path to access UDFs in public schema */
	appendStringInfo(setSearchPathString, ",public");
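
	/*
	 * For example, for job 1001 the assembled command would read
	 * "SET search_path TO job_1001,public", assuming SET_SEARCH_PATH_COMMAND
	 * expands as "SET search_path TO %s" and JobSchemaName() yields names of
	 * the form job_<jobId>; both formats are assumptions for illustration.
	 */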

	connected = SPI_connect();
	if (connected != SPI_OK_CONNECT)
	{
		ereport(ERROR, (errmsg("could not connect to SPI manager")));
	}

	setSearchPathResult = SPI_exec(setSearchPathString->data, 0);
	if (setSearchPathResult < 0)
	{
		ereport(ERROR, (errmsg("execution was not successful \"%s\"",
							   setSearchPathString->data)));
	}

	createMergeTableResult = SPI_exec(createMergeTableQuery, 0);
	if (createMergeTableResult < 0)
	{
		ereport(ERROR, (errmsg("execution was not successful \"%s\"",
							   createMergeTableQuery)));
	}

	appendStringInfo(mergeTableName, "%s%s", intermediateTableName->data,
					 MERGE_TABLE_SUFFIX);
	CopyTaskFilesFromDirectory(jobSchemaName, mergeTableName, taskDirectoryName);

	createIntermediateTableResult = SPI_exec(createIntermediateTableQuery, 0);
	if (createIntermediateTableResult < 0)
	{
		ereport(ERROR, (errmsg("execution was not successful \"%s\"",
							   createIntermediateTableQuery)));
	}

	finished = SPI_finish();
	if (finished != SPI_OK_FINISH)
	{
		ereport(ERROR, (errmsg("could not disconnect from SPI manager")));
	}

	PG_RETURN_VOID();
}