/* * worker_merge_files_into_table creates a task table within the job's schema, * which should have already been created by the task tracker protocol, and * copies files in its task directory into this table. If the schema doesn't * exist, the function defaults to the 'public' schema. Note that, unlike * partitioning functions, this function is not always idempotent. On success, * the function creates the table and loads data, and subsequent calls to the * function error out because the table already exist. On failure, the task * table creation commands are rolled back, and the function can be called * again. */ Datum worker_merge_files_into_table(PG_FUNCTION_ARGS) { uint64 jobId = PG_GETARG_INT64(0); uint32 taskId = PG_GETARG_UINT32(1); ArrayType *columnNameObject = PG_GETARG_ARRAYTYPE_P(2); ArrayType *columnTypeObject = PG_GETARG_ARRAYTYPE_P(3); StringInfo jobSchemaName = JobSchemaName(jobId); StringInfo taskTableName = TaskTableName(taskId); StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId); bool schemaExists = false; List *columnNameList = NIL; List *columnTypeList = NIL; /* we should have the same number of column names and types */ int32 columnNameCount = ArrayObjectCount(columnNameObject); int32 columnTypeCount = ArrayObjectCount(columnTypeObject); if (columnNameCount != columnTypeCount) { ereport(ERROR, (errmsg("column name array size: %d and type array size: %d" " do not match", columnNameCount, columnTypeCount))); } /* * If the schema for the job isn't already created by the task tracker * protocol, we fall to using the default 'public' schema. */ schemaExists = JobSchemaExists(jobSchemaName); if (!schemaExists) { resetStringInfo(jobSchemaName); appendStringInfoString(jobSchemaName, "public"); } /* create the task table and copy files into the table */ columnNameList = ArrayObjectToCStringList(columnNameObject); columnTypeList = ArrayObjectToCStringList(columnTypeObject); CreateTaskTable(jobSchemaName, taskTableName, columnNameList, columnTypeList); CopyTaskFilesFromDirectory(jobSchemaName, taskTableName, taskDirectoryName); PG_RETURN_VOID(); }
/* * task_tracker_assign_task creates a new task in the shared hash or updates an * already existing task. The function also creates a schema for the job if it * doesn't already exist. */ Datum task_tracker_assign_task(PG_FUNCTION_ARGS) { uint64 jobId = PG_GETARG_INT64(0); uint32 taskId = PG_GETARG_UINT32(1); text *taskCallStringText = PG_GETARG_TEXT_P(2); StringInfo jobSchemaName = JobSchemaName(jobId); bool schemaExists = false; WorkerTask *workerTask = NULL; char *taskCallString = text_to_cstring(taskCallStringText); uint32 taskCallStringLength = strlen(taskCallString); /* check that we have a running task tracker on this host */ bool taskTrackerRunning = TaskTrackerRunning(); if (!taskTrackerRunning) { ereport(ERROR, (errcode(ERRCODE_CANNOT_CONNECT_NOW), errmsg("the task tracker has been disabled or shut down"))); } /* check that we have enough space in our shared hash for this string */ if (taskCallStringLength >= TASK_CALL_STRING_SIZE) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("task call string exceeds maximum assignable length"))); } /* * If the schema does not exist, we create it. However, the schema does not * become visible to other processes until the transaction commits, and we * therefore do not release the resource lock in this case. Otherwise, the * schema is already visible, and we immediately release the resource lock. */ LockJobResource(jobId, AccessExclusiveLock); schemaExists = JobSchemaExists(jobSchemaName); if (!schemaExists) { /* lock gets automatically released upon return from this function */ CreateJobSchema(jobSchemaName); } else { UnlockJobResource(jobId, AccessExclusiveLock); } LWLockAcquire(&WorkerTasksSharedState->taskHashLock, LW_EXCLUSIVE); /* check if we already have the task in our shared hash */ workerTask = WorkerTasksHashFind(jobId, taskId); if (workerTask == NULL) { CreateTask(jobId, taskId, taskCallString); } else { UpdateTask(workerTask, taskCallString); } LWLockRelease(&WorkerTasksSharedState->taskHashLock); PG_RETURN_VOID(); }
/* * worker_merge_files_and_run_query creates a merge task table within the job's * schema, which should have already been created by the task tracker protocol. * It copies files in its task directory into this table. Then it runs final * query to create result table of the job. * * Note that here we followed a different approach to create a task table for merge * files than worker_merge_files_into_table(). In future we should unify these * two approaches. For this purpose creating a directory_fdw extension and using * it would make sense. Then we can merge files with a query or without query * through directory_fdw. */ Datum worker_merge_files_and_run_query(PG_FUNCTION_ARGS) { uint64 jobId = PG_GETARG_INT64(0); uint32 taskId = PG_GETARG_UINT32(1); text *createMergeTableQueryText = PG_GETARG_TEXT_P(2); text *createIntermediateTableQueryText = PG_GETARG_TEXT_P(3); const char *createMergeTableQuery = text_to_cstring(createMergeTableQueryText); const char *createIntermediateTableQuery = text_to_cstring(createIntermediateTableQueryText); StringInfo taskDirectoryName = TaskDirectoryName(jobId, taskId); StringInfo jobSchemaName = JobSchemaName(jobId); StringInfo intermediateTableName = TaskTableName(taskId); StringInfo mergeTableName = makeStringInfo(); StringInfo setSearchPathString = makeStringInfo(); bool schemaExists = false; int connected = 0; int setSearchPathResult = 0; int createMergeTableResult = 0; int createIntermediateTableResult = 0; int finished = 0; /* * If the schema for the job isn't already created by the task tracker * protocol, we fall to using the default 'public' schema. */ schemaExists = JobSchemaExists(jobSchemaName); if (!schemaExists) { resetStringInfo(jobSchemaName); appendStringInfoString(jobSchemaName, "public"); } appendStringInfo(setSearchPathString, SET_SEARCH_PATH_COMMAND, jobSchemaName->data); /* Add "public" to search path to access UDFs in public schema */ appendStringInfo(setSearchPathString, ",public"); connected = SPI_connect(); if (connected != SPI_OK_CONNECT) { ereport(ERROR, (errmsg("could not connect to SPI manager"))); } setSearchPathResult = SPI_exec(setSearchPathString->data, 0); if (setSearchPathResult < 0) { ereport(ERROR, (errmsg("execution was not successful \"%s\"", setSearchPathString->data))); } createMergeTableResult = SPI_exec(createMergeTableQuery, 0); if (createMergeTableResult < 0) { ereport(ERROR, (errmsg("execution was not successful \"%s\"", createMergeTableQuery))); } appendStringInfo(mergeTableName, "%s%s", intermediateTableName->data, MERGE_TABLE_SUFFIX); CopyTaskFilesFromDirectory(jobSchemaName, mergeTableName, taskDirectoryName); createIntermediateTableResult = SPI_exec(createIntermediateTableQuery, 0); if (createIntermediateTableResult < 0) { ereport(ERROR, (errmsg("execution was not successful \"%s\"", createIntermediateTableQuery))); } finished = SPI_finish(); if (finished != SPI_OK_FINISH) { ereport(ERROR, (errmsg("could not disconnect from SPI manager"))); } PG_RETURN_VOID(); }