/*
 * MakeTextPartitionExpression builds a filter expression on the partition
 * column of the given distributed table. When value is not NULL, the result is
 * the equality expression produced by MakeOpExpression with its right-hand
 * constant overwritten by the given value. When value is NULL, the result is
 * an IS NULL test on the partition column.
 */
static Expr *
MakeTextPartitionExpression(Oid distributedTableId, text *value)
{
	Var *column = PartitionColumn(distributedTableId);
	OpExpr *equalsOp = NULL;
	Const *valueConst = NULL;

	if (value == NULL)
	{
		/* nothing to compare against, so test the column for NULL instead */
		NullTest *isNullTest = makeNode(NullTest);

		isNullTest->nulltesttype = IS_NULL;
		isNullTest->arg = (Expr *) column;

		return (Expr *) isNullTest;
	}

	/* start from the equality expression and fill in its right operand */
	equalsOp = MakeOpExpression(column, BTEqualStrategyNumber);
	valueConst = (Const *) get_rightop((Expr *) equalsOp);

	/* the constant holds a non-null value that is not passed by value */
	valueConst->constbyval = false;
	valueConst->constisnull = false;
	valueConst->constvalue = (Datum) value;

	return (Expr *) equalsOp;
}
/* * debug_equality_expression returns the textual representation of an equality * expression generated by a call to MakeOpExpression. */ Datum debug_equality_expression(PG_FUNCTION_ARGS) { Oid distributedTableId = PG_GETARG_OID(0); Var *partitionColumn = PartitionColumn(distributedTableId); OpExpr *equalityExpression = MakeOpExpression(partitionColumn, BTEqualStrategyNumber); PG_RETURN_CSTRING(nodeToString(equalityExpression)); }
/* * PruneShardList prunes shards from given list based on the selection criteria, * and returns remaining shards in another list. */ List * PruneShardList(Oid relationId, List *whereClauseList, List *shardIntervalList) { List *remainingShardList = NIL; ListCell *shardIntervalCell = NULL; List *restrictInfoList = NIL; Node *baseConstraint = NULL; Var *partitionColumn = PartitionColumn(relationId); char partitionMethod = PartitionType(relationId); /* build the filter clause list for the partition method */ if (partitionMethod == DISTRIBUTE_BY_HASH) { Node *hashedNode = HashableClauseMutator((Node *) whereClauseList, partitionColumn); List *hashedClauseList = (List *) hashedNode; restrictInfoList = BuildRestrictInfoList(hashedClauseList); } else { restrictInfoList = BuildRestrictInfoList(whereClauseList); } /* override the partition column for hash partitioning */ if (partitionMethod == DISTRIBUTE_BY_HASH) { partitionColumn = MakeInt4Column(); } /* build the base expression for constraint */ baseConstraint = BuildBaseConstraint(partitionColumn); /* walk over shard list and check if shards can be pruned */ foreach(shardIntervalCell, shardIntervalList) { ShardInterval *shardInterval = lfirst(shardIntervalCell); List *constraintList = NIL; bool shardPruned = false; /* set the min/max values in the base constraint */ UpdateConstraint(baseConstraint, shardInterval); constraintList = list_make1(baseConstraint); shardPruned = predicate_refuted_by(constraintList, restrictInfoList); if (shardPruned) { ereport(DEBUG2, (errmsg("predicate pruning for shardId " UINT64_FORMAT, shardInterval->id))); } else { remainingShardList = lappend(remainingShardList, &(shardInterval->id)); } }
/*
 * CitusCopyFrom implements the COPY table_name FROM ... for hash-partitioned
 * and range-partitioned tables.
 *
 * Each input row is parsed with NextCopyFrom, matched to a shard interval by
 * its partition column value, and sent in binary COPY format to every
 * connection in that shard's connection list. A row whose partition column is
 * NULL, or for which no shard interval is found, raises an error. If an error
 * is raised inside the PG_TRY block, the remote COPY commands and transactions
 * are aborted and the error is re-thrown; otherwise the remote transactions
 * are committed after the PG_TRY block.
 *
 * copyStatement is the COPY statement to execute. completionTag, when not
 * NULL, receives the string "COPY " followed by the number of processed rows.
 */
void
CitusCopyFrom(CopyStmt *copyStatement, char *completionTag)
{
	Oid tableId = RangeVarGetRelid(copyStatement->relation, NoLock, false);
	char *relationName = get_rel_name(tableId);
	Relation distributedRelation = NULL;
	char partitionMethod = '\0';
	Var *partitionColumn = NULL;
	TupleDesc tupleDescriptor = NULL;
	uint32 columnCount = 0;
	Datum *columnValues = NULL;
	bool *columnNulls = NULL;
	TypeCacheEntry *typeEntry = NULL;
	FmgrInfo *hashFunction = NULL;
	FmgrInfo *compareFunction = NULL;
	int shardCount = 0;
	List *shardIntervalList = NULL;
	ShardInterval **shardIntervalCache = NULL;
	bool useBinarySearch = false;
	HTAB *shardConnectionHash = NULL;
	ShardConnections *shardConnections = NULL;
	List *connectionList = NIL;
	EState *executorState = NULL;
	MemoryContext executorTupleContext = NULL;
	ExprContext *executorExpressionContext = NULL;
	CopyState copyState = NULL;
	CopyOutState copyOutState = NULL;
	FmgrInfo *columnOutputFunctions = NULL;
	uint64 processedRowCount = 0;

	/* disallow COPY to/from file or program except for superusers */
	if (copyStatement->filename != NULL && !superuser())
	{
		if (copyStatement->is_program)
		{
			ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
							errmsg("must be superuser to COPY to or from an external program"),
							errhint("Anyone can COPY to stdout or from stdin. "
									"psql's \\copy command also works for anyone.")));
		}
		else
		{
			ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
							errmsg("must be superuser to COPY to or from a file"),
							errhint("Anyone can COPY to stdout or from stdin. "
									"psql's \\copy command also works for anyone.")));
		}
	}

	/* only hash- and range-partitioned tables are accepted below */
	partitionColumn = PartitionColumn(tableId, 0);
	partitionMethod = PartitionMethod(tableId);
	if (partitionMethod != DISTRIBUTE_BY_RANGE && partitionMethod != DISTRIBUTE_BY_HASH)
	{
		ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
						errmsg("COPY is only supported for hash- and "
							   "range-partitioned tables")));
	}

	/* resolve hash function for partition column */
	typeEntry = lookup_type_cache(partitionColumn->vartype, TYPECACHE_HASH_PROC_FINFO);
	hashFunction = &(typeEntry->hash_proc_finfo);

	/* resolve compare function for shard intervals */
	compareFunction = ShardIntervalCompareFunction(partitionColumn, partitionMethod);

	/* allocate column values and nulls arrays, one entry per table attribute */
	distributedRelation = heap_open(tableId, RowExclusiveLock);
	tupleDescriptor = RelationGetDescr(distributedRelation);
	columnCount = tupleDescriptor->natts;
	columnValues = palloc0(columnCount * sizeof(Datum));
	columnNulls = palloc0(columnCount * sizeof(bool));

	/* load the list of shards and verify that we have shards to copy into */
	shardIntervalList = LoadShardIntervalList(tableId);
	if (shardIntervalList == NIL)
	{
		/* same error either way; hash-partitioned tables also get a hint */
		if (partitionMethod == DISTRIBUTE_BY_HASH)
		{
			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg("could not find any shards into which to copy"),
							errdetail("No shards exist for distributed table \"%s\".",
									  relationName),
							errhint("Run master_create_worker_shards to create shards "
									"and try again.")));
		}
		else
		{
			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg("could not find any shards into which to copy"),
							errdetail("No shards exist for distributed table \"%s\".",
									  relationName)));
		}
	}

	/* prevent concurrent placement changes and non-commutative DML statements */
	LockAllShards(shardIntervalList);

	/* initialize the shard interval cache */
	shardCount = list_length(shardIntervalList);
	shardIntervalCache = SortedShardIntervalArray(shardIntervalList);

	/*
	 * determine whether to use binary search: it is used for every table
	 * except a hash-partitioned one with a uniform hash distribution
	 */
	if (partitionMethod != DISTRIBUTE_BY_HASH ||
		!IsUniformHashDistribution(shardIntervalCache, shardCount))
	{
		useBinarySearch = true;
	}

	/* initialize copy state to read from COPY data source */
	copyState = BeginCopyFrom(distributedRelation,
							  copyStatement->filename,
							  copyStatement->is_program,
							  copyStatement->attlist,
							  copyStatement->options);

	/* the executor state supplies the per-tuple memory and expression contexts */
	executorState = CreateExecutorState();
	executorTupleContext = GetPerTupleMemoryContext(executorState);
	executorExpressionContext = GetPerTupleExprContext(executorState);

	/* rows are forwarded to the shard placements in binary format */
	copyOutState = (CopyOutState) palloc0(sizeof(CopyOutStateData));
	copyOutState->binary = true;
	copyOutState->fe_msgbuf = makeStringInfo();
	copyOutState->rowcontext = executorTupleContext;

	columnOutputFunctions = ColumnOutputFunctions(tupleDescriptor, copyOutState->binary);

	/*
	 * Create a mapping of shard id to a connection for each of its placements.
	 * The hash should be initialized before the PG_TRY, since it is used in
	 * PG_CATCH. Otherwise, it may be undefined in the PG_CATCH (see sigsetjmp
	 * documentation).
	 */
	shardConnectionHash = CreateShardConnectionHash();

	/* we use a PG_TRY block to roll back on errors (e.g. in NextCopyFrom) */
	PG_TRY();
	{
		ErrorContextCallback errorCallback;

		/* set up callback to identify error line number */
		errorCallback.callback = CopyFromErrorCallback;
		errorCallback.arg = (void *) copyState;
		errorCallback.previous = error_context_stack;
		error_context_stack = &errorCallback;

		/* ensure transactions have unique names on worker nodes */
		InitializeDistributedTransaction();

		/* process input rows one at a time until NextCopyFrom finds no more */
		while (true)
		{
			bool nextRowFound = false;
			Datum partitionColumnValue = 0;
			ShardInterval *shardInterval = NULL;
			int64 shardId = 0;
			bool shardConnectionsFound = false;
			MemoryContext oldContext = NULL;

			/* reset the per-tuple context, then parse and route the row in it */
			ResetPerTupleExprContext(executorState);

			oldContext = MemoryContextSwitchTo(executorTupleContext);

			/* parse a row from the input */
			nextRowFound = NextCopyFrom(copyState, executorExpressionContext,
										columnValues, columnNulls, NULL);

			if (!nextRowFound)
			{
				/* restore the previous memory context before leaving the loop */
				MemoryContextSwitchTo(oldContext);
				break;
			}

			CHECK_FOR_INTERRUPTS();

			/*
			 * find the partition column value; varattno - 1 turns the column's
			 * attribute number into an index into the values/nulls arrays
			 */
			if (columnNulls[partitionColumn->varattno - 1])
			{
				ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
								errmsg("cannot copy row with NULL value "
									   "in partition column")));
			}

			partitionColumnValue = columnValues[partitionColumn->varattno - 1];

			/* find the shard interval and id for the partition column value */
			shardInterval = FindShardInterval(partitionColumnValue, shardIntervalCache,
											  shardCount, partitionMethod,
											  compareFunction, hashFunction,
											  useBinarySearch);
			if (shardInterval == NULL)
			{
				ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
								errmsg("could not find shard for partition column "
									   "value")));
			}

			shardId = shardInterval->shardId;

			/* the connection lookup below runs in the previous memory context */
			MemoryContextSwitchTo(oldContext);

			/* get existing connections to the shard placements, if any */
			shardConnections = GetShardConnections(shardConnectionHash,
												   shardId,
												   &shardConnectionsFound);
			if (!shardConnectionsFound)
			{
				/* open connections and initiate COPY on shard placements */
				OpenCopyTransactions(copyStatement, shardConnections);

				/* send binary headers to shard placements */
				resetStringInfo(copyOutState->fe_msgbuf);
				AppendCopyBinaryHeaders(copyOutState);
				SendCopyDataToAll(copyOutState->fe_msgbuf,
								  shardConnections->connectionList);
			}

			/* replicate row to shard placements */
			resetStringInfo(copyOutState->fe_msgbuf);
			AppendCopyRowData(columnValues, columnNulls, tupleDescriptor,
							  copyOutState, columnOutputFunctions);
			SendCopyDataToAll(copyOutState->fe_msgbuf, shardConnections->connectionList);

			processedRowCount += 1;
		}

		/* collect the connections held in the shard connection hash */
		connectionList = ConnectionList(shardConnectionHash);

		/* send binary footers to all shard placements */
		resetStringInfo(copyOutState->fe_msgbuf);
		AppendCopyBinaryFooters(copyOutState);
		SendCopyDataToAll(copyOutState->fe_msgbuf, connectionList);

		/* all lines have been copied, stop showing line number in errors */
		error_context_stack = errorCallback.previous;

		/* close the COPY input on all shard placements */
		EndRemoteCopy(connectionList, true);

		/* with the 2PC transaction manager, prepare before the commit below */
		if (CopyTransactionManager == TRANSACTION_MANAGER_2PC)
		{
			PrepareRemoteTransactions(connectionList);
		}

		EndCopyFrom(copyState);
		heap_close(distributedRelation, NoLock);

		/* check for cancellation one last time before committing */
		CHECK_FOR_INTERRUPTS();
	}
	PG_CATCH();
	{
		List *abortConnectionList = NIL;

		/* roll back all transactions */
		abortConnectionList = ConnectionList(shardConnectionHash);
		EndRemoteCopy(abortConnectionList, false);
		AbortRemoteTransactions(abortConnectionList);
		CloseConnections(abortConnectionList);

		PG_RE_THROW();
	}
	PG_END_TRY();

	/*
	 * Ready to commit the transaction, this code is below the PG_TRY block because
	 * we do not want any of the transactions rolled back if a failure occurs. Instead,
	 * they should be rolled forward.
	 */
	CommitRemoteTransactions(connectionList);
	CloseConnections(connectionList);

	/* report the processed row count to the caller, if a tag buffer was given */
	if (completionTag != NULL)
	{
		snprintf(completionTag, COMPLETION_TAG_BUFSIZE,
				 "COPY " UINT64_FORMAT, processedRowCount);
	}
}
/* * PruneShardList prunes shards from given list based on the selection criteria, * and returns remaining shards in another list. */ List * PruneShardList(Oid relationId, List *whereClauseList, List *shardIntervalList) { List *remainingShardList = NIL; ListCell *shardIntervalCell = NULL; List *restrictInfoList = NIL; Node *baseConstraint = NULL; Var *partitionColumn = PartitionColumn(relationId); char partitionMethod = PartitionType(relationId); /* build the filter clause list for the partition method */ switch (partitionMethod) { case APPEND_PARTITION_TYPE: case RANGE_PARTITION_TYPE: { restrictInfoList = BuildRestrictInfoList(whereClauseList); break; } case HASH_PARTITION_TYPE: { Node *hashedNode = HashableClauseMutator((Node *) whereClauseList, partitionColumn); List *hashedClauseList = (List *) hashedNode; restrictInfoList = BuildRestrictInfoList(hashedClauseList); /* override the partition column for hash partitioning */ partitionColumn = MakeInt4Column(); break; } default: { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("unsupported table partition type: %c", partitionMethod))); } } /* build the base expression for constraint */ baseConstraint = BuildBaseConstraint(partitionColumn); /* walk over shard list and check if shards can be pruned */ foreach(shardIntervalCell, shardIntervalList) { ShardInterval *shardInterval = lfirst(shardIntervalCell); List *constraintList = NIL; bool shardPruned = false; /* set the min/max values in the base constraint */ UpdateConstraint(baseConstraint, shardInterval); constraintList = list_make1(baseConstraint); shardPruned = predicate_refuted_by(constraintList, restrictInfoList); if (shardPruned) { ereport(DEBUG2, (errmsg("predicate pruning for shard with ID " INT64_FORMAT, shardInterval->id))); } else { remainingShardList = lappend(remainingShardList, &(shardInterval->id)); } }