/* * CopyIntoCStoreTable handles a "COPY cstore_table FROM" statement. This * function uses the COPY command's functions to read and parse rows from * the data source specified in the COPY statement. The function then writes * each row to the file specified in the cstore foreign table options. Finally, * the function returns the number of copied rows. */ static uint64 CopyIntoCStoreTable(const CopyStmt *copyStatement, const char *queryString) { uint64 processedRowCount = 0; Relation relation = NULL; Oid relationId = InvalidOid; TupleDesc tupleDescriptor = NULL; uint32 columnCount = 0; CopyState copyState = NULL; bool nextRowFound = true; Datum *columnValues = NULL; bool *columnNulls = NULL; TableWriteState *writeState = NULL; CStoreFdwOptions *cstoreFdwOptions = NULL; MemoryContext tupleContext = NULL; List *columnNameList = copyStatement->attlist; if (columnNameList != NULL) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("copy column list is not supported"))); } /* * We disallow copy from file or program except to superusers. These checks * are based on the checks in DoCopy() function of copy.c. */ if (copyStatement->filename != NULL && !superuser()) { if (copyStatement->is_program) { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to COPY to or from a program"), errhint("Anyone can COPY to stdout or from stdin. " "psql's \\copy command also works for anyone."))); } else { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to COPY to or from a file"), errhint("Anyone can COPY to stdout or from stdin. " "psql's \\copy command also works for anyone."))); } } Assert(copyStatement->relation != NULL); /* * Open and lock the relation. We acquire ExclusiveLock to allow concurrent * reads, but block concurrent writes. */ relation = heap_openrv(copyStatement->relation, ExclusiveLock); relationId = RelationGetRelid(relation); /* allocate column values and nulls arrays */ tupleDescriptor = RelationGetDescr(relation); columnCount = tupleDescriptor->natts; columnValues = palloc0(columnCount * sizeof(Datum)); columnNulls = palloc0(columnCount * sizeof(bool)); cstoreFdwOptions = CStoreGetOptions(relationId); /* * We create a new memory context called tuple context, and read and write * each row's values within this memory context. After each read and write, * we reset the memory context. That way, we immediately release memory * allocated for each row, and don't bloat memory usage with large input * files. */ tupleContext = AllocSetContextCreate(CurrentMemoryContext, "CStore COPY Row Memory Context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); /* init state to read from COPY data source */ copyState = BeginCopyFrom(relation, copyStatement->filename, copyStatement->is_program, NIL, copyStatement->options); /* init state to write to the cstore file */ writeState = CStoreBeginWrite(cstoreFdwOptions->filename, cstoreFdwOptions->compressionType, cstoreFdwOptions->stripeRowCount, cstoreFdwOptions->blockRowCount, tupleDescriptor); while (nextRowFound) { /* read the next row in tupleContext */ MemoryContext oldContext = MemoryContextSwitchTo(tupleContext); nextRowFound = NextCopyFrom(copyState, NULL, columnValues, columnNulls, NULL); MemoryContextSwitchTo(oldContext); /* write the row to the cstore file */ if (nextRowFound) { CStoreWriteRow(writeState, columnValues, columnNulls); processedRowCount++; } MemoryContextReset(tupleContext); } /* end read/write sessions and close the relation */ EndCopyFrom(copyState); CStoreEndWrite(writeState); heap_close(relation, ExclusiveLock); return processedRowCount; }
/* * CitusCopyFrom implements the COPY table_name FROM ... for hash-partitioned * and range-partitioned tables. */ void CitusCopyFrom(CopyStmt *copyStatement, char *completionTag) { Oid tableId = RangeVarGetRelid(copyStatement->relation, NoLock, false); char *relationName = get_rel_name(tableId); Relation distributedRelation = NULL; char partitionMethod = '\0'; Var *partitionColumn = NULL; TupleDesc tupleDescriptor = NULL; uint32 columnCount = 0; Datum *columnValues = NULL; bool *columnNulls = NULL; TypeCacheEntry *typeEntry = NULL; FmgrInfo *hashFunction = NULL; FmgrInfo *compareFunction = NULL; int shardCount = 0; List *shardIntervalList = NULL; ShardInterval **shardIntervalCache = NULL; bool useBinarySearch = false; HTAB *shardConnectionHash = NULL; ShardConnections *shardConnections = NULL; List *connectionList = NIL; EState *executorState = NULL; MemoryContext executorTupleContext = NULL; ExprContext *executorExpressionContext = NULL; CopyState copyState = NULL; CopyOutState copyOutState = NULL; FmgrInfo *columnOutputFunctions = NULL; uint64 processedRowCount = 0; /* disallow COPY to/from file or program except for superusers */ if (copyStatement->filename != NULL && !superuser()) { if (copyStatement->is_program) { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to COPY to or from an external program"), errhint("Anyone can COPY to stdout or from stdin. " "psql's \\copy command also works for anyone."))); } else { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to COPY to or from a file"), errhint("Anyone can COPY to stdout or from stdin. " "psql's \\copy command also works for anyone."))); } } partitionColumn = PartitionColumn(tableId, 0); partitionMethod = PartitionMethod(tableId); if (partitionMethod != DISTRIBUTE_BY_RANGE && partitionMethod != DISTRIBUTE_BY_HASH) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("COPY is only supported for hash- and " "range-partitioned tables"))); } /* resolve hash function for partition column */ typeEntry = lookup_type_cache(partitionColumn->vartype, TYPECACHE_HASH_PROC_FINFO); hashFunction = &(typeEntry->hash_proc_finfo); /* resolve compare function for shard intervals */ compareFunction = ShardIntervalCompareFunction(partitionColumn, partitionMethod); /* allocate column values and nulls arrays */ distributedRelation = heap_open(tableId, RowExclusiveLock); tupleDescriptor = RelationGetDescr(distributedRelation); columnCount = tupleDescriptor->natts; columnValues = palloc0(columnCount * sizeof(Datum)); columnNulls = palloc0(columnCount * sizeof(bool)); /* load the list of shards and verify that we have shards to copy into */ shardIntervalList = LoadShardIntervalList(tableId); if (shardIntervalList == NIL) { if (partitionMethod == DISTRIBUTE_BY_HASH) { ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("could not find any shards into which to copy"), errdetail("No shards exist for distributed table \"%s\".", relationName), errhint("Run master_create_worker_shards to create shards " "and try again."))); } else { ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("could not find any shards into which to copy"), errdetail("No shards exist for distributed table \"%s\".", relationName))); } } /* prevent concurrent placement changes and non-commutative DML statements */ LockAllShards(shardIntervalList); /* initialize the shard interval cache */ shardCount = list_length(shardIntervalList); shardIntervalCache = SortedShardIntervalArray(shardIntervalList); /* determine whether to use binary search */ if (partitionMethod != DISTRIBUTE_BY_HASH || !IsUniformHashDistribution(shardIntervalCache, shardCount)) { useBinarySearch = true; } /* initialize copy state to read from COPY data source */ copyState = BeginCopyFrom(distributedRelation, copyStatement->filename, copyStatement->is_program, copyStatement->attlist, copyStatement->options); executorState = CreateExecutorState(); executorTupleContext = GetPerTupleMemoryContext(executorState); executorExpressionContext = GetPerTupleExprContext(executorState); copyOutState = (CopyOutState) palloc0(sizeof(CopyOutStateData)); copyOutState->binary = true; copyOutState->fe_msgbuf = makeStringInfo(); copyOutState->rowcontext = executorTupleContext; columnOutputFunctions = ColumnOutputFunctions(tupleDescriptor, copyOutState->binary); /* * Create a mapping of shard id to a connection for each of its placements. * The hash should be initialized before the PG_TRY, since it is used and * PG_CATCH. Otherwise, it may be undefined in the PG_CATCH (see sigsetjmp * documentation). */ shardConnectionHash = CreateShardConnectionHash(); /* we use a PG_TRY block to roll back on errors (e.g. in NextCopyFrom) */ PG_TRY(); { ErrorContextCallback errorCallback; /* set up callback to identify error line number */ errorCallback.callback = CopyFromErrorCallback; errorCallback.arg = (void *) copyState; errorCallback.previous = error_context_stack; error_context_stack = &errorCallback; /* ensure transactions have unique names on worker nodes */ InitializeDistributedTransaction(); while (true) { bool nextRowFound = false; Datum partitionColumnValue = 0; ShardInterval *shardInterval = NULL; int64 shardId = 0; bool shardConnectionsFound = false; MemoryContext oldContext = NULL; ResetPerTupleExprContext(executorState); oldContext = MemoryContextSwitchTo(executorTupleContext); /* parse a row from the input */ nextRowFound = NextCopyFrom(copyState, executorExpressionContext, columnValues, columnNulls, NULL); if (!nextRowFound) { MemoryContextSwitchTo(oldContext); break; } CHECK_FOR_INTERRUPTS(); /* find the partition column value */ if (columnNulls[partitionColumn->varattno - 1]) { ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("cannot copy row with NULL value " "in partition column"))); } partitionColumnValue = columnValues[partitionColumn->varattno - 1]; /* find the shard interval and id for the partition column value */ shardInterval = FindShardInterval(partitionColumnValue, shardIntervalCache, shardCount, partitionMethod, compareFunction, hashFunction, useBinarySearch); if (shardInterval == NULL) { ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("could not find shard for partition column " "value"))); } shardId = shardInterval->shardId; MemoryContextSwitchTo(oldContext); /* get existing connections to the shard placements, if any */ shardConnections = GetShardConnections(shardConnectionHash, shardId, &shardConnectionsFound); if (!shardConnectionsFound) { /* open connections and initiate COPY on shard placements */ OpenCopyTransactions(copyStatement, shardConnections); /* send binary headers to shard placements */ resetStringInfo(copyOutState->fe_msgbuf); AppendCopyBinaryHeaders(copyOutState); SendCopyDataToAll(copyOutState->fe_msgbuf, shardConnections->connectionList); } /* replicate row to shard placements */ resetStringInfo(copyOutState->fe_msgbuf); AppendCopyRowData(columnValues, columnNulls, tupleDescriptor, copyOutState, columnOutputFunctions); SendCopyDataToAll(copyOutState->fe_msgbuf, shardConnections->connectionList); processedRowCount += 1; } connectionList = ConnectionList(shardConnectionHash); /* send binary footers to all shard placements */ resetStringInfo(copyOutState->fe_msgbuf); AppendCopyBinaryFooters(copyOutState); SendCopyDataToAll(copyOutState->fe_msgbuf, connectionList); /* all lines have been copied, stop showing line number in errors */ error_context_stack = errorCallback.previous; /* close the COPY input on all shard placements */ EndRemoteCopy(connectionList, true); if (CopyTransactionManager == TRANSACTION_MANAGER_2PC) { PrepareRemoteTransactions(connectionList); } EndCopyFrom(copyState); heap_close(distributedRelation, NoLock); /* check for cancellation one last time before committing */ CHECK_FOR_INTERRUPTS(); } PG_CATCH(); { List *abortConnectionList = NIL; /* roll back all transactions */ abortConnectionList = ConnectionList(shardConnectionHash); EndRemoteCopy(abortConnectionList, false); AbortRemoteTransactions(abortConnectionList); CloseConnections(abortConnectionList); PG_RE_THROW(); } PG_END_TRY(); /* * Ready to commit the transaction, this code is below the PG_TRY block because * we do not want any of the transactions rolled back if a failure occurs. Instead, * they should be rolled forward. */ CommitRemoteTransactions(connectionList); CloseConnections(connectionList); if (completionTag != NULL) { snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "COPY " UINT64_FORMAT, processedRowCount); } }