/*
 * AppendOnlySegmentFileTruncateToEOF()
 *
 * Assumes that the segment file lock is already held.
 *
 * Truncates the given segment file to its logical EOF.
 */
static void
AppendOnlySegmentFileTruncateToEOF(Relation aorel, FileSegInfo *fsinfo)
{
	const char *relname;
	MirroredAppendOnlyOpen mirroredOpened;
	int32		fileSegNo;
	char		filenamepath[MAXPGPATH];
	int			segno;
	int64		segeof;

	Assert(fsinfo);
	Assert(RelationIsAoRows(aorel));

	segno = fsinfo->segno;
	relname = RelationGetRelationName(aorel);
	segeof = (int64) fsinfo->eof;

	/* Open and truncate the relation segfile beyond its eof */
	MakeAOSegmentFileName(aorel, segno, -1, &fileSegNo, filenamepath);

	elogif(Debug_appendonly_print_compaction, LOG,
		   "Opening AO relation \"%s.%s\", relation id %u, relfilenode %u (physical segment file #%d, logical EOF " INT64_FORMAT ")",
		   get_namespace_name(RelationGetNamespace(aorel)),
		   relname,
		   aorel->rd_id,
		   aorel->rd_node.relNode,
		   segno,
		   segeof);

	if (OpenAOSegmentFile(aorel, filenamepath, fileSegNo, segeof, &mirroredOpened))
	{
		TruncateAOSegmentFile(&mirroredOpened, aorel, segeof, ERROR);
		CloseAOSegmentFile(&mirroredOpened);

		/* Fixed typo in log message: "AO ROL" -> "AO ROW". */
		elogif(Debug_appendonly_print_compaction, LOG,
			   "Successfully truncated AO ROW relation \"%s.%s\", relation id %u, relfilenode %u (physical segment file #%d, logical EOF " INT64_FORMAT ")",
			   get_namespace_name(RelationGetNamespace(aorel)),
			   relname,
			   aorel->rd_id,
			   aorel->rd_node.relNode,
			   segno,
			   segeof);
	}
	else
	{
		elogif(Debug_appendonly_print_compaction, LOG,
			   "No gp_relation_node entry for AO ROW relation \"%s.%s\", relation id %u, relfilenode %u (physical segment file #%d, logical EOF " INT64_FORMAT ")",
			   get_namespace_name(RelationGetNamespace(aorel)),
			   relname,
			   aorel->rd_id,
			   aorel->rd_node.relNode,
			   segno,
			   segeof);
	}
}
/** * Finds the visibility map entry tuple for a given * segmentFileNum and firstRowNum. * * Note: The firstRowNum needs to be a valid firstRowNum. It is * especially not the tuple id of the append-only tuple checked, updated, * or deleted. * * Returns true if there is such a tuple and * the tuple is used as current tuple. * Otherwise false is returned. * * Assumes that the store data structure has been initialized, but not finished. */ bool AppendOnlyVisimapStore_Find( AppendOnlyVisimapStore* visiMapStore, int32 segmentFileNum, int64 firstRowNum, AppendOnlyVisimapEntry* visiMapEntry) { ScanKey scanKeys; IndexScanDesc indexScan; Assert(visiMapStore); Assert(visiMapEntry); Assert(RelationIsValid(visiMapStore->visimapRelation)); Assert(RelationIsValid(visiMapStore->visimapIndex)); elogif (Debug_appendonly_print_visimap, LOG, "Append-only visi map store: Load entry: " "(segFileNum, firstRowNum) = (%u, " INT64_FORMAT ")", segmentFileNum, firstRowNum); scanKeys = visiMapStore->scanKeys; scanKeys[0].sk_argument = Int32GetDatum(segmentFileNum); scanKeys[1].sk_argument = Int64GetDatum(firstRowNum); indexScan = AppendOnlyVisimapStore_BeginScan( visiMapStore, APPENDONLY_VISIMAP_INDEX_SCAN_KEY_NUM, scanKeys); if (!AppendOnlyVisimapStore_GetNext( visiMapStore, indexScan, BackwardScanDirection, visiMapEntry, &visiMapEntry->tupleTid)) { elogif(Debug_appendonly_print_visimap, LOG, "Append-only visi map store: Visimap entry does not exist: " "(segFileNum, firstRowNum) = (%u, " INT64_FORMAT ")", segmentFileNum, firstRowNum); // failed to lookup row AppendOnlyVisimapStore_EndScan(visiMapStore, indexScan); return false; } AppendOnlyVisimapStore_EndScan(visiMapStore, indexScan); return true; }
/*
 * Stamp the firstRowNum value into an already-built block header.
 *
 * The firstRowNum is stored immediately after the fixed-size header
 * (regular or long form), and after the header/block checksums when
 * checksums are in use.
 */
static void
AppendOnlyStorageFormat_AddFirstRowNum(uint8 *headerPtr,
									   bool usingChecksums,
									   int64 firstRowNum)
{
	AOSmallContentHeader *blockHeader = (AOSmallContentHeader *) headerPtr;
	int32		offsetToFirstRowNum;

	if (!AoHeader_IsLong(AOSmallContentHeaderGet_headerKind(blockHeader)))
	{
		offsetToFirstRowNum = AoHeader_RegularSize;
	}
	else
	{
		offsetToFirstRowNum = AoHeader_LongSize;
	}

	if (usingChecksums)
		offsetToFirstRowNum += 2 * sizeof(pg_crc32);	/* Header and Block checksums. */

	/*
	 * Use memcpy rather than dereferencing a casted int64 pointer: the
	 * computed offset is not guaranteed to be 8-byte aligned, and an
	 * unaligned int64 store is undefined behavior on strict-alignment
	 * platforms.
	 */
	memcpy(&headerPtr[offsetToFirstRowNum], &firstRowNum, sizeof(int64));

	elogif(Debug_appendonly_print_storage_headers, LOG,
		   "Append-Only storage first row number header result: block_bytes_0_3 0x%X, block_bytes_4_7 0x%X, "
		   "firstRowNum " INT64_FORMAT,
		   blockHeader->smallcontent_bytes_0_3,
		   blockHeader->smallcontent_bytes_4_7,
		   firstRowNum);
}
/*
 * Do open the next segment file to read, but don't do error processing.
 *
 * This routine is responsible for seeking to the proper location given
 * the logical EOF.
 *
 * filePathName - name of the segment file to open.
 */
static File
AppendOnlyStorageRead_DoOpenFile(AppendOnlyStorageRead *storageRead,
								 char *filePathName)
{
	int			openFlags = O_RDONLY | PG_BINARY;

	/* File mode is S_IRUSR 00400 user has read permission */
	int			openMode = 0400;

	Assert(storageRead != NULL);
	Assert(storageRead->isActive);
	Assert(filePathName != NULL);

	elogif(Debug_appendonly_print_read_block, LOG,
		   "Append-Only storage read: opening table '%s', segment file '%s', fileFlags 0x%x, fileMode 0x%x",
		   storageRead->relationName,
		   storageRead->segmentFileName,
		   openFlags,
		   openMode);

	/*
	 * Open the file for read and hand the descriptor straight back to the
	 * caller, which does the error handling.
	 */
	return PathNameOpenFile(filePathName, openFlags, openMode);
}
/*
 * Re-insert one AOCS tuple during compaction and keep its indexes in sync.
 * The tuple's old location is taken from the slot's ctid; its new location
 * is whatever aocs_insert_values assigns.
 */
static void
AOCSMoveTuple(TupleTableSlot *slot,
			  AOCSInsertDesc insertDesc,
			  ResultRelInfo *resultRelInfo,
			  EState *estate)
{
	AOTupleId	newTid;
	AOTupleId  *origTid;

	Assert(resultRelInfo);
	Assert(slot);
	Assert(estate);

	origTid = (AOTupleId *) slot_get_ctid(slot);

	/* Materialize every attribute before re-inserting the tuple. */
	slot_getallattrs(slot);

	(void) aocs_insert_values(insertDesc,
							  slot_get_values(slot),
							  slot_get_isnull(slot),
							  &newTid);

	/* insert index' tuples if needed */
	if (resultRelInfo->ri_NumIndices > 0)
	{
		ExecInsertIndexTuples(slot, (ItemPointer) &newTid, estate);
		ResetPerTupleExprContext(estate);
	}

	elogif(Debug_appendonly_print_compaction, DEBUG5,
		   "Compaction: Moved tuple (%d," INT64_FORMAT ") -> (%d," INT64_FORMAT ")",
		   AOTupleIdGet_segmentFileNum(origTid),
		   AOTupleIdGet_rowNum(origTid),
		   AOTupleIdGet_segmentFileNum(&newTid),
		   AOTupleIdGet_rowNum(&newTid));
}
/*
 * Fills in the relation statistics for an append-only relation.
 *
 * This information is used to update the reltuples and relpages information
 * in pg_class. reltuples is the same as "pg_aoseg_<oid>:tupcount"
 * column and we simulate relpages by subdividing the eof value
 * ("pg_aoseg_<oid>:eof") over the defined page size.
 */
void
vacuum_appendonly_fill_stats(Relation aorel, Snapshot snapshot,
							 BlockNumber *rel_pages, double *rel_tuples,
							 bool *relhasindex)
{
	AppendOnlyVisimap visiMap;
	FileSegTotals *segTotals;
	BlockNumber pageCount;
	double		liveTuples;
	double		byteTotal;
	int64		hiddenTuples;
	char	   *relname;

	Assert(RelationIsAoRows(aorel) || RelationIsAoCols(aorel));

	relname = RelationGetRelationName(aorel);

	/* get updated statistics from the pg_aoseg table */
	if (RelationIsAoRows(aorel))
	{
		segTotals = GetSegFilesTotals(aorel, snapshot);
	}
	else
	{
		Assert(RelationIsAoCols(aorel));
		segTotals = GetAOCSSSegFilesTotals(aorel, snapshot);
	}

	/* calculate the values we care about */
	byteTotal = (double) segTotals->totalbytes;
	liveTuples = (double) segTotals->totaltuples;
	pageCount = (uint32) RelationGuessNumberOfBlocks(byteTotal);

	/* Subtract tuples hidden by the visibility map from the visible count. */
	AppendOnlyVisimap_Init(&visiMap,
						   aorel->rd_appendonly->visimaprelid,
						   aorel->rd_appendonly->visimapidxid,
						   AccessShareLock,
						   snapshot);
	hiddenTuples = AppendOnlyVisimap_GetRelationHiddenTupleCount(&visiMap);
	liveTuples -= hiddenTuples;
	Assert(liveTuples > -1.0);
	AppendOnlyVisimap_Finish(&visiMap, AccessShareLock);

	elogif(Debug_appendonly_print_compaction, LOG,
		   "Gather statistics after vacuum for append-only relation %s: "
		   "page count %d, tuple count %f",
		   relname,
		   pageCount,
		   liveTuples);

	*rel_pages = pageCount;
	*rel_tuples = liveTuples;
	*relhasindex = aorel->rd_rel->relhasindex;

	ereport(elevel,
			(errmsg("\"%s\": found %.0f rows in %u pages.",
					relname, liveTuples, pageCount)));
	pfree(segTotals);
}
/*
 * Drops a segment file.
 *
 */
static void
AppendOnlyCompaction_DropSegmentFile(Relation aorel, int segno)
{
	int64		serialNum;
	ItemPointerData tid;

	/* Without a gp_relation_node entry there is nothing to drop. */
	if (!ReadGpRelationNode(aorel->rd_node.relNode,
							segno,
							&tid,
							&serialNum))
		return;

	elogif(Debug_appendonly_print_compaction, LOG,
		   "Drop segment file: segno %d", segno);

	/* Schedule the physical file drop, then remove its catalog entry. */
	MirroredFileSysObj_ScheduleDropAppendOnlyFile(&aorel->rd_node,
												  segno,
												  RelationGetRelationName(aorel),
												  &tid,
												  serialNum);

	DeleteGpRelationNodeTuple(aorel, segno);
}
/*
 * Discard one append-only tuple during compaction: reclaim any toasted
 * datums it owns, then log the discarded tuple id.
 */
void
AppendOnlyThrowAwayTuple(Relation rel,
						 MemTuple tuple,
						 TupleTableSlot *slot,
						 MemTupleBinding *mt_bind)
{
	AOTupleId  *tupleId;

	Assert(slot);
	Assert(mt_bind);

	tupleId = (AOTupleId *) slot_get_ctid(slot);

	/* Deform every attribute of the tuple. */
	slot_getallattrs(slot);

	/* Toasted datums must be reclaimed before the tuple is thrown away. */
	if (MemTupleHasExternal(tuple, mt_bind))
		toast_delete(rel, (HeapTuple) tuple, mt_bind);

	elogif(Debug_appendonly_print_compaction, DEBUG5,
		   "Compaction: Throw away tuple (%d," INT64_FORMAT ")",
		   AOTupleIdGet_segmentFileNum(tupleId),
		   AOTupleIdGet_rowNum(tupleId));
}
/*
 * Stores the visibility map entry.
 *
 * The entry/tuple is invalidated after this function call.
 *
 * Assumes that a valid visimap entry is passed.
 * Assumes that the entry corresponds to the latest tuple
 * returned by AppendOnlyVisimapStore_find.
 *
 * Should not be called twice in the same command.
 */
void
AppendOnlyVisimapStore_Store(AppendOnlyVisimapStore *visiMapStore,
							 AppendOnlyVisimapEntry *visiMapEntry)
{
	Datum		values[Natts_pg_aovisimap];
	bool		nulls[Natts_pg_aovisimap];
	Relation	visimapRel;
	HeapTuple	newTuple;
	MemoryContext savedContext;

	Assert(visiMapStore);
	Assert(visiMapEntry);

	elogif(Debug_appendonly_print_visimap, LOG,
		   "Append-only visi map store: Store visimap entry: "
		   "(segFileNum, firstRowNum) = (%u, " INT64_FORMAT ")",
		   visiMapEntry->segmentFileNum, visiMapEntry->firstRowNum);

	savedContext = MemoryContextSwitchTo(visiMapStore->memoryContext);

	AppendOnlyVisimapEntry_Write(visiMapEntry, values, nulls);

	visimapRel = visiMapStore->visimapRelation;
	newTuple = heap_form_tuple(RelationGetDescr(visimapRel), values, nulls);

	/*
	 * Write out the visimap entry to the relation. A valid tid means the
	 * entry was previously looked up, so we overwrite that row; otherwise
	 * a new row is inserted.
	 */
	if (ItemPointerIsValid(&visiMapEntry->tupleTid))
		simple_heap_update(visimapRel, &visiMapEntry->tupleTid, newTuple);
	else
		simple_heap_insert(visimapRel, newTuple);

	CatalogUpdateIndexes(visimapRel, newTuple);

	heap_freetuple(newTuple);
	MemoryContextSwitchTo(savedContext);

	/* Invalidate the data after storing it. */
	ItemPointerSetInvalid(&visiMapEntry->tupleTid);
}
/**
 * Drops a segment file.
 *
 */
static void
AOCSCompaction_DropSegmentFile(Relation aorel, int segno)
{
	ItemPointerData persistentTid;
	int64		persistentSerialNum;
	int			pseudoSegNo;
	int			col;

	Assert(RelationIsAoCols(aorel));

	/*
	 * Column-oriented AO: each column keeps its data in a separate physical
	 * file, addressed by a per-column "pseudo" segment number derived from
	 * the logical segno.
	 */
	for (col = 0; col < RelationGetNumberOfAttributes(aorel); col++)
	{
		pseudoSegNo = (col * AOTupleId_MultiplierSegmentFileNum) + segno;

		if (!ReadGpRelationNode(
								aorel->rd_rel->reltablespace,
								aorel->rd_rel->relfilenode,
								pseudoSegNo,
								&persistentTid,
								&persistentSerialNum))
		{
			/* There is nothing to drop */

			/*
			 * NOTE(review): this returns instead of continuing, so any
			 * columns after the first one lacking a gp_relation_node entry
			 * are never visited. Presumably all columns of a segment are in
			 * the same state — TODO confirm.
			 */
			return;
		}

		elogif(Debug_appendonly_print_compaction, LOG,
			   "Drop segment file: segno %d", pseudoSegNo);

		/* Schedule the physical drop, then remove the catalog entry. */
		MirroredFileSysObj_ScheduleDropAppendOnlyFile(
													  &aorel->rd_node,
													  pseudoSegNo,
													  RelationGetRelationName(aorel),
													  &persistentTid,
													  persistentSerialNum);

		DeleteGpRelationNodeTuple(aorel, pseudoSegNo);
	}
}
/*
 * Deletes all visibility map information from a given
 * segment file.
 */
void
AppendOnlyVisimapStore_DeleteSegmentFile(AppendOnlyVisimapStore *visiMapStore,
										 int segmentFileNum)
{
	IndexScanDesc scan;
	ItemPointerData tupleTid;
	ScanKeyData segnoKey;

	Assert(visiMapStore);
	Assert(RelationIsValid(visiMapStore->visimapRelation));
	Assert(RelationIsValid(visiMapStore->visimapIndex));

	elogif(Debug_appendonly_print_visimap, LOG,
		   "Append-only visi map store: Delete segment file: "
		   "(segFileNum) = (%u)", segmentFileNum);

	/* Match every visimap tuple belonging to the given segment file. */
	ScanKeyInit(&segnoKey,
				Anum_pg_aovisimap_segno, /* segno */
				BTEqualStrategyNumber,
				F_INT4EQ,
				Int32GetDatum(segmentFileNum));

	scan = AppendOnlyVisimapStore_BeginScan(visiMapStore, 1, &segnoKey);

	while (AppendOnlyVisimapStore_GetNext(visiMapStore,
										  scan,
										  ForwardScanDirection,
										  NULL,
										  &tupleTid))
	{
		simple_heap_delete(visiMapStore->visimapRelation, &tupleTid);
	}

	AppendOnlyVisimapStore_EndScan(visiMapStore, scan);
}
/*
 * Re-insert one append-only row tuple during compaction, preserving its
 * OID and keeping any indexes in sync with the tuple's new location.
 */
static void
AppendOnlyMoveTuple(MemTuple tuple,
					TupleTableSlot *slot,
					MemTupleBinding *mt_bind,
					AppendOnlyInsertDesc insertDesc,
					ResultRelInfo *resultRelInfo,
					EState *estate)
{
	AOTupleId	newTid;
	AOTupleId  *origTid;
	Oid			tupleOid;

	Assert(resultRelInfo);
	Assert(slot);
	Assert(mt_bind);
	Assert(estate);

	origTid = (AOTupleId *) slot_get_ctid(slot);

	/* Deform every attribute so index expressions can see the values. */
	slot_getallattrs(slot);

	tupleOid = MemTupleGetOid(tuple, mt_bind);
	appendonly_insert(insertDesc, tuple, &tupleOid, &newTid);

	/* insert index' tuples if needed */
	if (resultRelInfo->ri_NumIndices > 0)
	{
		ExecInsertIndexTuples(slot, (ItemPointer) &newTid, estate, true);
		ResetPerTupleExprContext(estate);
	}

	elogif(Debug_appendonly_print_compaction, DEBUG5,
		   "Compaction: Moved tuple (%d," INT64_FORMAT ") -> (%d," INT64_FORMAT ")",
		   AOTupleIdGet_segmentFileNum(origTid),
		   AOTupleIdGet_rowNum(origTid),
		   AOTupleIdGet_segmentFileNum(&newTid),
		   AOTupleIdGet_rowNum(&newTid));
}
/*
 * Drops a segment file.
 *
 * Actually, we just truncate the segfile to 0 bytes, to reclaim the space.
 * Before GPDB 6, we used to remove the file, but with WAL replication, we
 * no longer have a convenient function to remove a single segment of a
 * relation. An empty file is as almost as good as a non-existent file. If
 * the relation is dropped later, the code in mdunlink() will remove all
 * segments, including any empty ones we've left behind.
 */
static void
AOCSCompaction_DropSegmentFile(Relation aorel, int segno)
{
	int			attnum;

	Assert(RelationIsAoCols(aorel));

	for (attnum = 0; attnum < RelationGetNumberOfAttributes(aorel); attnum++)
	{
		char		segPath[MAXPGPATH];
		int			pseudoSegNo;
		File		segFile;

		/* Open and truncate the relation segfile */
		MakeAOSegmentFileName(aorel, segno, attnum, &pseudoSegNo, segPath);

		elogif(Debug_appendonly_print_compaction, LOG,
			   "Drop segment file: "
			   "segno %d",
			   pseudoSegNo);

		segFile = OpenAOSegmentFile(aorel, segPath, pseudoSegNo, 0);
		if (segFile < 0)
		{
			/*
			 * The file we were about to drop/truncate didn't exist. That's normal,
			 * for example, if a column is added with ALTER TABLE ADD COLUMN.
			 */
			elog(DEBUG1, "could not truncate segfile %s, because it does not exist", segPath);
			continue;
		}

		TruncateAOSegmentFile(segFile, aorel, pseudoSegNo, 0);
		CloseAOSegmentFile(segFile);
	}
}
/*
 * Actually do a base backup for the specified tablespaces.
 *
 * This is split out mainly to avoid complaints about "variable might be
 * clobbered by longjmp" from stupider versions of gcc.
 */
static void
perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
{
	XLogRecPtr	startptr;
	XLogRecPtr	endptr;
	char	   *labelfile;

	/* Take the starting checkpoint and obtain the backup label contents. */
	startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &labelfile);

	Assert(!XLogRecPtrIsInvalid(startptr));

	/*
	 * NOTE(review): the negated condition looks inverted — other call sites
	 * in this code log when debug_basebackup is enabled, not disabled.
	 * Confirm whether this should be elogif(debug_basebackup, ...).
	 */
	elogif(!debug_basebackup, LOG,
		   "basebackup perform -- "
		   "Basebackup start xlog location = %X/%X",
		   startptr.xlogid, startptr.xrecoff);

	/*
	 * Set xlogCleanUpTo so that checkpoint process knows
	 * which old xlog files should not be cleaned
	 */
	WalSndSetXLogCleanUpTo(startptr);

	SIMPLE_FAULT_INJECTOR(BaseBackupPostCreateCheckpoint);

	/* Tell the client where the backup starts in the WAL stream. */
	SendXlogRecPtrResult(startptr);

	PG_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
	{
		List	   *filespaces = NIL;
		ListCell   *lc;

		/* Collect information about all filespaces, including pg_system */
		filespaces = get_filespaces_to_send(opt);

		/* Send filespace header */
		SendBackupHeader(filespaces);

		/* Send off our filespaces one by one */
		foreach(lc, filespaces)
		{
			filespaceinfo *fi = (filespaceinfo *) lfirst(lc);
			StringInfoData buf;

			/* Send CopyOutResponse message */
			pq_beginmessage(&buf, 'H');
			pq_sendbyte(&buf, 0);		/* overall format */
			pq_sendint(&buf, 0, 2);		/* natts */
			pq_endmessage(&buf);

			/* In the main tar, include the backup_label first. */
			if (fi->primary_path == NULL)
				sendFileWithContent(BACKUP_LABEL_FILE, labelfile);

			/* Stream the directory contents; "." means the main data dir. */
			sendDir(fi->primary_path == NULL ? "." : fi->primary_path,
					fi->primary_path == NULL ? 1 : strlen(fi->primary_path),
					opt->exclude,
					false);

			/* In the main tar, include pg_control last. */
			if (fi->primary_path == NULL)
			{
				struct stat statbuf;

				if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
				{
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not stat control file \"%s\": %m",
									XLOG_CONTROL_FILE)));
				}
				sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf);

				elogif(debug_basebackup, LOG,
					   "basebackup perform -- Sent file %s.", XLOG_CONTROL_FILE);
			}

			/*
			 * If we're including WAL, and this is the main data directory we
			 * don't terminate the tar stream here. Instead, we will append
			 * the xlog files below and terminate it then. This is safe since
			 * the main data directory is always sent *last*.
			 */
			if (opt->includewal && fi->xlogdir)
			{
				Assert(lnext(lc) == NULL);
			}
			else
				pq_putemptymessage('c');	/* CopyDone */
		}
	}
/*
 * lazy_vacuum_aorel -- perform LAZY VACUUM for one Append-only relation.
 *
 * Append-only vacuum runs in phases (prepare, compaction, cleanup); this
 * routine dispatches on the phase recorded in the VacuumStmt and only
 * updates pg_class stats in the phases where they are final.
 */
static void
lazy_vacuum_aorel(Relation onerel, VacuumStmt *vacstmt, List *updated_stats)
{
	LVRelStats *vacrelstats;
	bool		update_relstats = true;

	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

	if (vacuumStatement_IsInAppendOnlyPreparePhase(vacstmt))
	{
		elogif(Debug_appendonly_print_compaction, LOG,
			   "Vacuum prepare phase %s", RelationGetRelationName(onerel));

		/* Vacuum the indexes, then reclaim space past each segfile's EOF. */
		vacuum_appendonly_indexes(onerel, vacstmt, updated_stats);
		if (RelationIsAoRows(onerel))
			AppendOnlyTruncateToEOF(onerel);
		else
			AOCSTruncateToEOF(onerel);

		/*
		 * MPP-23647. For empty tables, we skip compaction phase
		 * and cleanup phase. Therefore, we update the stats
		 * (specifically, relfrozenxid) in prepare phase if the
		 * table is empty. Otherwise, the stats will be updated in
		 * the cleanup phase, when we would have computed the
		 * correct values for stats.
		 */
		if (vacstmt->appendonly_relation_empty)
		{
			update_relstats = true;
			/*
			 * For an empty relation, the only stats we care about
			 * is relfrozenxid and relhasindex. We need to be
			 * mindful of correctly setting relhasindex here.
			 * relfrozenxid is already taken care of above by
			 * calling vacuum_set_xid_limits().
			 */
			vacrelstats->hasindex = onerel->rd_rel->relhasindex;
		}
		else
		{
			/*
			 * For a non-empty relation, follow the usual
			 * compaction phases and do not update stats in
			 * prepare phase.
			 */
			update_relstats = false;
		}
	}
	else if (!vacummStatement_IsInAppendOnlyCleanupPhase(vacstmt))
	{
		/* Compaction phase: move live tuples; stats are not final yet. */
		vacuum_appendonly_rel(onerel, vacstmt);
		update_relstats = false;
	}
	else
	{
		elogif(Debug_appendonly_print_compaction, LOG,
			   "Vacuum cleanup phase %s", RelationGetRelationName(onerel));

		/* Cleanup phase: compute the final page/tuple counts for pg_class. */
		vacuum_appendonly_fill_stats(onerel, ActiveSnapshot,
									 &vacrelstats->rel_pages,
									 &vacrelstats->rel_tuples,
									 &vacrelstats->hasindex);
		/* reset the remaining LVRelStats values */
		vacrelstats->nonempty_pages = 0;
		vacrelstats->num_dead_tuples = 0;
		vacrelstats->max_dead_tuples = 0;
		vacrelstats->tuples_deleted = 0;
		vacrelstats->tot_free_pages = 0;
		vacrelstats->fs_is_heap = false;
		vacrelstats->num_free_pages = 0;
		vacrelstats->max_free_pages = 0;
		vacrelstats->pages_removed = 0;
	}

	if (update_relstats)
	{
		/* Update statistics in pg_class */
		vac_update_relstats_from_list(onerel,
									  vacrelstats->rel_pages,
									  vacrelstats->rel_tuples,
									  vacrelstats->hasindex,
									  FreezeLimit,
									  updated_stats);

		/* report results to the stats collector, too */
		pgstat_report_vacuum(RelationGetRelid(onerel),
							 onerel->rd_rel->relisshared,
							 true /*vacrelstats->scanned_all*/,
							 vacstmt->analyze,
							 vacrelstats->rel_tuples);
	}
}
/*
 * vacuum_appendonly_rel() -- vacuum an append-only relation
 *
 * This procedure will be what gets executed both for VACUUM
 * and VACUUM FULL (and also ANALYZE or any other thing that
 * needs the pg_class stats updated).
 *
 * The function can compact append-only segment files or just
 * truncating the segment file to its existing eof.
 *
 * Afterwards, the reltuples and relpages information in pg_class
 * are updated. reltuples is the same as "pg_aoseg_<oid>:tupcount"
 * column and we simulate relpages by subdividing the eof value
 * ("pg_aoseg_<oid>:eof") over the defined page size.
 *
 *
 * There are txn ids, hint bits, free space, dead tuples,
 * etc. these are all irrelevant in the append only relation context.
 *
 */
void
vacuum_appendonly_rel(Relation aorel, VacuumStmt *vacstmt)
{
	char	   *relname;
	PGRUsage	ru0;

	Assert(RelationIsAoRows(aorel) || RelationIsAoCols(aorel));
	Assert(!vacummStatement_IsInAppendOnlyCleanupPhase(vacstmt));

	pg_rusage_init(&ru0);
	relname = RelationGetRelationName(aorel);
	ereport(elevel,
			(errmsg("vacuuming \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(aorel)),
					relname)));

	/*
	 * The dispatcher returns after the log line above; presumably the
	 * physical work happens on the segments — TODO confirm.
	 */
	if (Gp_role == GP_ROLE_DISPATCH)
	{
		return;
	}
	Assert(list_length(vacstmt->appendonly_compaction_insert_segno) <= 1);

	/*
	 * No insert segno means this is the drop phase; otherwise it is either a
	 * pseudo-compaction (invalid insert segno) or a real compaction.
	 */
	if (vacstmt->appendonly_compaction_insert_segno == NULL)
	{
		elogif(Debug_appendonly_print_compaction, LOG,
			   "Vacuum drop phase %s", RelationGetRelationName(aorel));

		if (RelationIsAoRows(aorel))
		{
			AppendOnlyDrop(aorel, vacstmt->appendonly_compaction_segno);
		}
		else
		{
			Assert(RelationIsAoCols(aorel));
			AOCSDrop(aorel, vacstmt->appendonly_compaction_segno);
		}
	}
	else
	{
		int			insert_segno = linitial_int(vacstmt->appendonly_compaction_insert_segno);

		if (insert_segno == APPENDONLY_COMPACTION_SEGNO_INVALID)
		{
			elogif(Debug_appendonly_print_compaction, LOG,
				   "Vacuum pseudo-compaction phase %s", RelationGetRelationName(aorel));
		}
		else
		{
			elogif(Debug_appendonly_print_compaction, LOG,
				   "Vacuum compaction phase %s", RelationGetRelationName(aorel));
			if (RelationIsAoRows(aorel))
			{
				AppendOnlyCompact(aorel,
								  vacstmt->appendonly_compaction_segno,
								  insert_segno, vacstmt->full);
			}
			else
			{
				Assert(RelationIsAoCols(aorel));
				AOCSCompact(aorel,
							vacstmt->appendonly_compaction_segno,
							insert_segno, vacstmt->full);
			}
		}
	}
}
/*
 * Truncates each segment file of the AOCS relation to its EOF.
 * If we cannot get a lock on the segment file (because e.g. a concurrent insert)
 * the segment file is skipped.
 */
void
AOCSTruncateToEOF(Relation aorel)
{
	const char *relname;
	int			total_segfiles;
	AOCSFileSegInfo **segfile_array;
	int			i,
				segno;
	LockAcquireResult acquireResult;
	AOCSFileSegInfo *fsinfo;
	Snapshot	appendOnlyMetaDataSnapshot = RegisterSnapshot(GetCatalogSnapshot(InvalidOid));

	Assert(RelationIsAoCols(aorel));

	relname = RelationGetRelationName(aorel);

	/*
	 * Fixed misleading log message: this function truncates segment files to
	 * their EOF, it does not compact them (old text said "Compact AO
	 * relation").
	 */
	elogif(Debug_appendonly_print_compaction, LOG,
		   "Truncate AOCS relation %s to EOF", relname);

	/* Get information about all the file segments we need to scan */
	segfile_array = GetAllAOCSFileSegInfo(aorel, appendOnlyMetaDataSnapshot, &total_segfiles);

	for (i = 0; i < total_segfiles; i++)
	{
		segno = segfile_array[i]->segno;

		/*
		 * Try to get the transaction write-lock for the Append-Only segment
		 * file.
		 *
		 * NOTE: This is a transaction scope lock that must be held until
		 * commit / abort.
		 */
		acquireResult = LockRelationAppendOnlySegmentFile(
														  &aorel->rd_node,
														  segno,
														  AccessExclusiveLock,
														  /* dontWait */ true);
		if (acquireResult == LOCKACQUIRE_NOT_AVAIL)
		{
			elog(DEBUG5, "truncate skips AO segfile %d, "
				 "relation %s", segno, relname);
			continue;
		}

		/* Re-fetch under the write lock to get latest committed eof. */
		fsinfo = GetAOCSFileSegInfo(aorel, appendOnlyMetaDataSnapshot, segno);

		/*
		 * This should not occur since this segfile info was found by the
		 * "all" method, but better to catch for trouble shooting (possibly
		 * index corruption?)
		 */
		if (fsinfo == NULL)
			elog(ERROR, "file seginfo for AOCS relation %s %u/%u/%u (segno=%u) is missing",
				 relname,
				 aorel->rd_node.spcNode,
				 aorel->rd_node.dbNode,
				 aorel->rd_node.relNode,
				 segno);

		AOCSSegmentFileTruncateToEOF(aorel, fsinfo);
		pfree(fsinfo);
	}

	if (segfile_array)
	{
		FreeAllAOCSSegFileInfo(segfile_array, total_segfiles);
		pfree(segfile_array);
	}

	UnregisterSnapshot(appendOnlyMetaDataSnapshot);
}
/*
 * Get information on the next Append-Only Storage Block.
 *
 * Return true if another block was found. Otherwise, we have reached the
 * end of the current segment file.
 *
 * On success, the parsed header fields are left in storageRead->current.
 */
bool
AppendOnlyStorageRead_ReadNextBlock(AppendOnlyStorageRead *storageRead)
{
	uint8	   *header;
	AOHeaderCheckError checkError;
	int32		blockLimitLen = 0;	/* Shutup compiler. */
	pg_crc32	storedChecksum;
	pg_crc32	computedChecksum;

	/*
	 * Reset current* variables.
	 */

	/* For efficiency, zero out. Comment out lines that set fields to 0. */
	memset(&storageRead->current, 0, sizeof(AppendOnlyStorageReadCurrent));
	/* storageRead->current.headerOffsetInFile = 0; */
	storageRead->current.headerKind = AoHeaderKind_None;
	/* storageRead->current.actualHeaderLen = 0; */
	/* storageRead->current.contentLen = 0; */
	/* storageRead->current.overallBlockLen = 0; */
	/* storageRead->current.contentOffset = 0; */
	/* storageRead->current.executorBlockKind = 0; */
	/* storageRead->current.hasFirstRowNum = false; */
	storageRead->current.firstRowNum = INT64CONST(-1);
	/* storageRead->current.rowCount = 0; */
	/* storageRead->current.isLarge = false; */
	/* storageRead->current.isCompressed = false; */
	/* storageRead->current.compressedLen = 0; */

	elogif(Debug_appendonly_print_datumstream, LOG,
		   "before AppendOnlyStorageRead_PositionToNextBlock, storageRead->current.headerOffsetInFile is" INT64_FORMAT
		   "storageRead->current.overallBlockLen is %d",
		   storageRead->current.headerOffsetInFile,
		   storageRead->current.overallBlockLen);

	/* Advance to the next block header; false means end of segment file. */
	if (!AppendOnlyStorageRead_PositionToNextBlock(storageRead,
												   &storageRead->current.headerOffsetInFile,
												   &header,
												   &blockLimitLen))
	{
		/* Done reading the file */
		return false;
	}

	elogif(Debug_appendonly_print_datumstream, LOG,
		   "after AppendOnlyStorageRead_PositionToNextBlock, storageRead->current.headerOffsetInFile is" INT64_FORMAT
		   "storageRead->current.overallBlockLen is %d",
		   storageRead->current.headerOffsetInFile,
		   storageRead->current.overallBlockLen);

	/*----------
	 * Proceed very carefully:
	 * [ 1. Verify header checksum ]
	 *   2. Examine (basic) header.
	 *   3. Examine specific header.
	 * [ 4. Verify the block checksum ]
	 *----------
	 */
	if (storageRead->storageAttributes.checksum &&
		gp_appendonly_verify_block_checksums)
	{
		if (!AppendOnlyStorageFormat_VerifyHeaderChecksum(header,
														  &storedChecksum,
														  &computedChecksum))
			ereport(ERROR,
					(errmsg("Header checksum does not match. Expected 0x%X and found 0x%X ",
							storedChecksum,
							computedChecksum),
					 errdetail_appendonly_read_storage_content_header(storageRead),
					 errcontext_appendonly_read_storage_block(storageRead)));
	}

	/*
	 * Check the (basic) header information.
	 */
	checkError = AppendOnlyStorageFormat_GetHeaderInfo(header,
													   storageRead->storageAttributes.checksum,
													   &storageRead->current.headerKind,
													   &storageRead->current.actualHeaderLen);
	if (checkError != AOHeaderCheckOk)
		ereport(ERROR,
				(errmsg("Bad append-only storage header. Header check error %d, detail '%s'",
						(int) checkError,
						AppendOnlyStorageFormat_GetHeaderCheckErrorStr()),
				 errdetail_appendonly_read_storage_content_header(storageRead),
				 errcontext_appendonly_read_storage_block(storageRead)));

	/*
	 * Get more header since AppendOnlyStorageRead_PositionToNextBlock only
	 * gets minimum.
	 */
	if (storageRead->minimumHeaderLen < storageRead->current.actualHeaderLen)
	{
		int32		availableLen;

		header = BufferedReadGrowBuffer(&storageRead->bufferedRead,
										storageRead->current.actualHeaderLen,
										&availableLen);

		if (header == NULL ||
			availableLen != storageRead->current.actualHeaderLen)
			ereport(ERROR,
					(errcode(ERRCODE_GP_INTERNAL_ERROR),
					 errmsg("Expected %d bytes and found %d bytes in table %s "
							"(segment file '%s', header offset in file = " INT64_FORMAT ", bufferCount " INT64_FORMAT ")",
							storageRead->current.actualHeaderLen,
							availableLen,
							storageRead->relationName,
							storageRead->segmentFileName,
							storageRead->current.headerOffsetInFile,
							storageRead->bufferCount)));
	}

	/*
	 * Based on the kind of header, we either have small or large content.
	 */
	switch (storageRead->current.headerKind)
	{
		case AoHeaderKind_SmallContent:

			/*
			 * Check the SmallContent header information.
			 */
			checkError = AppendOnlyStorageFormat_GetSmallContentHeaderInfo
				(header,
				 storageRead->current.actualHeaderLen,
				 storageRead->storageAttributes.checksum,
				 blockLimitLen,
				 &storageRead->current.overallBlockLen,
				 &storageRead->current.contentOffset,
				 &storageRead->current.uncompressedLen,
				 &storageRead->current.executorBlockKind,
				 &storageRead->current.hasFirstRowNum,
				 storageRead->formatVersion,
				 &storageRead->current.firstRowNum,
				 &storageRead->current.rowCount,
				 &storageRead->current.isCompressed,
				 &storageRead->current.compressedLen);
			if (checkError != AOHeaderCheckOk)
				ereport(ERROR,
						(errmsg("Bad append-only storage header of type small content. Header check error %d, detail '%s'",
								(int) checkError,
								AppendOnlyStorageFormat_GetHeaderCheckErrorStr()),
						 errdetail_appendonly_read_storage_content_header(storageRead),
						 errcontext_appendonly_read_storage_block(storageRead)));
			break;

		case AoHeaderKind_LargeContent:

			/*
			 * Check the LargeContent metadata header information.
			 */
			checkError = AppendOnlyStorageFormat_GetLargeContentHeaderInfo
				(header,
				 storageRead->current.actualHeaderLen,
				 storageRead->storageAttributes.checksum,
				 &storageRead->current.uncompressedLen,
				 &storageRead->current.executorBlockKind,
				 &storageRead->current.hasFirstRowNum,
				 &storageRead->current.firstRowNum,
				 &storageRead->current.rowCount);
			if (checkError != AOHeaderCheckOk)
				ereport(ERROR,
						(errmsg("Bad append-only storage header of type large content. Header check error %d, detail '%s'",
								(int) checkError,
								AppendOnlyStorageFormat_GetHeaderCheckErrorStr()),
						 errdetail_appendonly_read_storage_content_header(storageRead),
						 errcontext_appendonly_read_storage_block(storageRead)));
			storageRead->current.isLarge = true;
			break;

		case AoHeaderKind_NonBulkDenseContent:

			/*
			 * Check the NonBulkDense header information.
			 */
			checkError = AppendOnlyStorageFormat_GetNonBulkDenseContentHeaderInfo
				(header,
				 storageRead->current.actualHeaderLen,
				 storageRead->storageAttributes.checksum,
				 blockLimitLen,
				 &storageRead->current.overallBlockLen,
				 &storageRead->current.contentOffset,
				 &storageRead->current.uncompressedLen,
				 &storageRead->current.executorBlockKind,
				 &storageRead->current.hasFirstRowNum,
				 storageRead->formatVersion,
				 &storageRead->current.firstRowNum,
				 &storageRead->current.rowCount);
			if (checkError != AOHeaderCheckOk)
				ereport(ERROR,
						(errmsg("Bad append-only storage header of type non-bulk dense content. Header check error %d, detail '%s'",
								(int) checkError,
								AppendOnlyStorageFormat_GetHeaderCheckErrorStr()),
						 errdetail_appendonly_read_storage_content_header(storageRead),
						 errcontext_appendonly_read_storage_block(storageRead)));
			break;

		case AoHeaderKind_BulkDenseContent:

			/*
			 * Check the BulkDenseContent header information.
			 */
			checkError = AppendOnlyStorageFormat_GetBulkDenseContentHeaderInfo
				(header,
				 storageRead->current.actualHeaderLen,
				 storageRead->storageAttributes.checksum,
				 blockLimitLen,
				 &storageRead->current.overallBlockLen,
				 &storageRead->current.contentOffset,
				 &storageRead->current.uncompressedLen,
				 &storageRead->current.executorBlockKind,
				 &storageRead->current.hasFirstRowNum,
				 storageRead->formatVersion,
				 &storageRead->current.firstRowNum,
				 &storageRead->current.rowCount,
				 &storageRead->current.isCompressed,
				 &storageRead->current.compressedLen);
			if (checkError != AOHeaderCheckOk)
				ereport(ERROR,
						(errmsg("Bad append-only storage header of type bulk dense content. Header check error %d, detail '%s'",
								(int) checkError,
								AppendOnlyStorageFormat_GetHeaderCheckErrorStr()),
						 errdetail_appendonly_read_storage_content_header(storageRead),
						 errcontext_appendonly_read_storage_block(storageRead)));
			break;

		default:
			elog(ERROR, "Unexpected Append-Only header kind %d",
				 storageRead->current.headerKind);
			break;
	}

	if (Debug_appendonly_print_storage_headers)
	{
		AppendOnlyStorageRead_LogBlockHeader(storageRead, header);
	}

	if (storageRead->current.hasFirstRowNum)
	{
		/* UNDONE: Grow buffer and read the value into firstRowNum. */
	}

	if (storageRead->current.headerKind == AoHeaderKind_LargeContent)
	{
		/* UNDONE: Finish the read for the information only header. */
	}

	return true;
}
/* * Initialize AppendOnlyStorageRead. * * The AppendOnlyStorageRead data structure is initialized once for a read * "session" and can be used to read Append-Only Storage Blocks from 1 or * more segment files. * * The current file to read to is opened with the * AppendOnlyStorageRead_OpenFile routine. * * storageRead - data structure to initialize * memoryContext - memory context to use for buffers and other memory * needs. When NULL, the current memory context is used. * maxBufferLen - maximum Append-Only Storage Block length including all * storage headers. * relationName - name of the relation to use in system logging and * error messages. * title - A phrase that better describes the purpose of this open. * The caller manages the storage for this. * storageAttributes - Append-Only Storage Attributes from relation creation. */ void AppendOnlyStorageRead_Init(AppendOnlyStorageRead *storageRead, MemoryContext memoryContext, int32 maxBufferLen, char *relationName, char *title, AppendOnlyStorageAttributes *storageAttributes) { int relationNameLen; uint8 *memory; int32 memoryLen; MemoryContext oldMemoryContext; Assert(storageRead != NULL); /* UNDONE: Range check maxBufferLen */ Assert(relationName != NULL); Assert(storageAttributes != NULL); /* UNDONE: Range check fields in storageAttributes */ MemSet(storageRead, 0, sizeof(AppendOnlyStorageRead)); storageRead->maxBufferLen = maxBufferLen; if (memoryContext == NULL) storageRead->memoryContext = CurrentMemoryContext; else storageRead->memoryContext = memoryContext; oldMemoryContext = MemoryContextSwitchTo(storageRead->memoryContext); memcpy(&storageRead->storageAttributes, storageAttributes, sizeof(AppendOnlyStorageAttributes)); relationNameLen = strlen(relationName); storageRead->relationName = (char *) palloc(relationNameLen + 1); memcpy(storageRead->relationName, relationName, relationNameLen + 1); storageRead->title = title; storageRead->minimumHeaderLen = AppendOnlyStorageFormat_RegularHeaderLenNeeded( 
storageRead->storageAttributes.checksum); /* * Initialize BufferedRead. */ storageRead->largeReadLen = 2 * storageRead->maxBufferLen; memoryLen = BufferedReadMemoryLen(storageRead->maxBufferLen, storageRead->largeReadLen); Assert(CurrentMemoryContext == storageRead->memoryContext); memory = (uint8 *) palloc(memoryLen); BufferedReadInit(&storageRead->bufferedRead, memory, memoryLen, storageRead->maxBufferLen, storageRead->largeReadLen, relationName); elogif(Debug_appendonly_print_scan || Debug_appendonly_print_read_block, LOG, "Append-Only Storage Read initialize for table '%s' " "(compression = %s, compression level %d, maximum buffer length %d, large read length %d)", storageRead->relationName, (storageRead->storageAttributes.compress ? "true" : "false"), storageRead->storageAttributes.compressLevel, storageRead->maxBufferLen, storageRead->largeReadLen); storageRead->file = -1; storageRead->formatVersion = -1; MemoryContextSwitchTo(oldMemoryContext); storageRead->isActive = true; }
/*
 * AOCSSegmentFileTruncateToEOF()
 *
 * Truncate every column file of the given AOCS logical segment back to its
 * committed logical EOF, discarding any bytes written past it.
 *
 * Assumes that the segment file lock is already held.
 */
static void
AOCSSegmentFileTruncateToEOF(Relation aorel, AOCSFileSegInfo *fsinfo)
{
	const char *relname = RelationGetRelationName(aorel);
	int			segno;
	int			j;

	Assert(fsinfo);
	Assert(RelationIsAoCols(aorel));

	segno = fsinfo->segno;

	/* One vpinfo entry per column; each column has its own physical file. */
	for (j = 0; j < fsinfo->vpinfo.nEntry; ++j)
	{
		int64		segeof;
		char		filenamepath[MAXPGPATH];
		AOCSVPInfoEntry *entry;
		File		fd;
		int32		fileSegNo;

		entry = getAOCSVPEntry(fsinfo, j);
		segeof = entry->eof;

		/* Open and truncate the relation segfile to its eof */
		MakeAOSegmentFileName(aorel, segno, j, &fileSegNo, filenamepath);

		elogif(Debug_appendonly_print_compaction, LOG,
			   "Opening AO COL relation \"%s.%s\", relation id %u, relfilenode %u column #%d, logical segment #%d (physical segment file #%d, logical EOF " INT64_FORMAT ")",
			   get_namespace_name(RelationGetNamespace(aorel)),
			   relname,
			   aorel->rd_id,
			   aorel->rd_node.relNode,
			   j,
			   segno,
			   fileSegNo,
			   segeof);

		fd = OpenAOSegmentFile(aorel, filenamepath, fileSegNo, segeof);
		if (fd >= 0)
		{
			TruncateAOSegmentFile(fd, aorel, fileSegNo, segeof);
			CloseAOSegmentFile(fd);

			elogif(Debug_appendonly_print_compaction, LOG,
				   "Successfully truncated AO COL relation \"%s.%s\", relation id %u, relfilenode %u column #%d, logical segment #%d (physical segment file #%d, logical EOF " INT64_FORMAT ")",
				   get_namespace_name(RelationGetNamespace(aorel)),
				   relname,
				   aorel->rd_id,
				   aorel->rd_node.relNode,
				   j,
				   segno,
				   fileSegNo,
				   segeof);
		}
		else
		{
			/* Missing physical file: report (when tracing) and move on. */
			elogif(Debug_appendonly_print_compaction, LOG,
				   "No gp_relation_node entry for AO COL relation \"%s.%s\", relation id %u, relfilenode %u column #%d, logical segment #%d (physical segment file #%d, logical EOF " INT64_FORMAT ")",
				   get_namespace_name(RelationGetNamespace(aorel)),
				   relname,
				   aorel->rd_id,
				   aorel->rd_node.relNode,
				   j,
				   segno,
				   fileSegNo,
				   segeof);
		}
	}
}
/*
 * Compact one AOCS segment file: copy every still-visible tuple into the
 * insert target segment, drop invisible tuples, then mark the source
 * segment AWAITING_DROP and clear its visimap / block-directory entries.
 *
 * Assumes that the segment file lock is already held.
 * Assumes that the segment file should be compacted.
 */
static bool
AOCSSegmentFileFullCompaction(Relation aorel, AOCSInsertDesc insertDesc,
							  AOCSFileSegInfo *fsinfo, Snapshot snapshot)
{
	const char *relname;
	AppendOnlyVisimap visiMap;
	AOCSScanDesc scanDesc;
	TupleDesc	tupDesc;
	TupleTableSlot *slot;
	int			compact_segno;
	int64		movedTupleCount = 0;
	ResultRelInfo *resultRelInfo;
	MemTupleBinding *mt_bind;
	EState	   *estate;
	bool	   *proj;
	int			i;
	AOTupleId  *aoTupleId;
	int64		tupleCount = 0;
	int64		tuplePerPage = INT_MAX;

	Assert(Gp_role == GP_ROLE_EXECUTE || Gp_role == GP_ROLE_UTILITY);
	Assert(RelationIsAoCols(aorel));
	Assert(insertDesc);

	compact_segno = fsinfo->segno;
	/* Estimate tuples per varblock so vacuum delay checks happen per block. */
	if (fsinfo->varblockcount > 0)
	{
		tuplePerPage = fsinfo->total_tupcount / fsinfo->varblockcount;
	}
	relname = RelationGetRelationName(aorel);

	AppendOnlyVisimap_Init(&visiMap,
						   aorel->rd_appendonly->visimaprelid,
						   aorel->rd_appendonly->visimapidxid,
						   ShareLock,
						   snapshot);

	/* Fixed: format string previously read "relation %sd" (stray 'd'). */
	elogif(Debug_appendonly_print_compaction, LOG,
		   "Compact AO segfile %d, relation %s",
		   compact_segno, relname);

	/* Scan all columns of the segment being compacted. */
	proj = palloc0(sizeof(bool) * RelationGetNumberOfAttributes(aorel));
	for (i = 0; i < RelationGetNumberOfAttributes(aorel); ++i)
	{
		proj[i] = true;
	}
	scanDesc = aocs_beginrangescan(aorel,
								   snapshot, snapshot,
								   &compact_segno, 1, NULL, proj);

	tupDesc = RelationGetDescr(aorel);
	slot = MakeSingleTupleTableSlot(tupDesc);
	mt_bind = create_memtuple_binding(tupDesc);

	/*
	 * We need a ResultRelInfo and an EState so we can use the regular
	 * executor's index-entry-making machinery.
	 */
	estate = CreateExecutorState();
	resultRelInfo = makeNode(ResultRelInfo);
	resultRelInfo->ri_RangeTableIndex = 1;	/* dummy */
	resultRelInfo->ri_RelationDesc = aorel;
	resultRelInfo->ri_TrigDesc = NULL;	/* we don't fire triggers */
	ExecOpenIndices(resultRelInfo);
	estate->es_result_relations = resultRelInfo;
	estate->es_num_result_relations = 1;
	estate->es_result_relation_info = resultRelInfo;

	while (aocs_getnext(scanDesc, ForwardScanDirection, slot))
	{
		CHECK_FOR_INTERRUPTS();

		aoTupleId = (AOTupleId *) slot_get_ctid(slot);
		if (AppendOnlyVisimap_IsVisible(&scanDesc->visibilityMap, aoTupleId))
		{
			/* Visible: re-insert into the target segment (updates indexes). */
			AOCSMoveTuple(slot, insertDesc, resultRelInfo, estate);
			movedTupleCount++;
		}
		else
		{
			/* Tuple is invisible and needs to be dropped */
			AppendOnlyThrowAwayTuple(aorel, slot, mt_bind);
		}

		/*
		 * Check for vacuum delay point after approximately a var block
		 */
		tupleCount++;
		if (VacuumCostActive && tupleCount % tuplePerPage == 0)
		{
			vacuum_delay_point();
		}
	}

	SetAOCSFileSegInfoState(aorel, compact_segno, AOSEG_STATE_AWAITING_DROP);

	AppendOnlyVisimap_DeleteSegmentFile(&visiMap, compact_segno);

	/* Delete all mini pages of the segment files if block directory exists */
	if (OidIsValid(aorel->rd_appendonly->blkdirrelid))
	{
		AppendOnlyBlockDirectory_DeleteSegmentFile(aorel,
												   snapshot,
												   compact_segno,
												   0);
	}

	elogif(Debug_appendonly_print_compaction, LOG,
		   "Finished compaction: "
		   "AO segfile %d, relation %s, moved tuple count " INT64_FORMAT,
		   compact_segno, relname, movedTupleCount);

	AppendOnlyVisimap_Finish(&visiMap, NoLock);

	ExecCloseIndices(resultRelInfo);
	FreeExecutorState(estate);

	ExecDropSingleTupleTableSlot(slot);
	destroy_memtuple_binding(mt_bind);

	aocs_endscan(scanDesc);
	pfree(proj);

	return true;
}
/*
 * Build a LargeContent metadata header in place at headerPtr.
 *
 * A LargeContent header carries no content of its own; it announces that
 * the following blocks together form one large logical content of
 * largeContentLength bytes and largeRowCount rows.
 *
 * headerPtr          - buffer to write the header into (caller-sized)
 * usingChecksums     - whether header/content checksums are appended
 * hasFirstRowNum     - whether an optional 8-byte firstRowNum follows
 * version            - storage format version, forwarded to checksum code
 * firstRowNum        - written only when hasFirstRowNum
 * executorKind       - executor block kind stored in the header
 * largeRowCount      - total row count of the large content
 * largeContentLength - total byte length of the large content
 */
void
AppendOnlyStorageFormat_MakeLargeContentHeader(
	uint8 *headerPtr,
	bool usingChecksums,
	bool hasFirstRowNum,
	int version,
	int64 firstRowNum,
	int executorKind,
	int largeRowCount,
	int32 largeContentLength)
{
	AOLargeContentHeader *largeContentHeader;

	Assert(headerPtr != NULL);

	largeContentHeader = (AOLargeContentHeader*)headerPtr;

	elogif(Debug_appendonly_print_storage_headers, LOG,
		   "Append-Only Storage make LargeContent header parameters: usingChecksums = %s, executorKind = %d, "
		   "largeRowCount = %d, largeContentLength %d",
		   (usingChecksums ? "true" : "false"),
		   executorKind,
		   largeRowCount,
		   largeContentLength);

	/* Zero out whole header */
	AOLargeContentHeaderInit_Init(largeContentHeader);

	AOLargeContentHeaderInit_headerKind(largeContentHeader,AoHeaderKind_LargeContent);
	AOLargeContentHeaderInit_executorBlockKind(largeContentHeader,executorKind);
	AOLargeContentHeaderInit_largeRowCount(largeContentHeader,largeRowCount);
	AOLargeContentHeaderInit_largeContentLength(largeContentHeader,largeContentLength);
	AOLargeContentHeaderInit_hasFirstRowNum(largeContentHeader,hasFirstRowNum);

	/*
	 * Add the optional firstRowNum.
	 *
	 * NOTE: This is not part of the 8-byte (64-bit) header because it is so big.
	 * NOTE: And, it is not covered by the header checksum because in order to
	 * NOTE: determine if we should checksum more data we would need to examine
	 * NOTE: the header data not verified by checksum yet...
	 *
	 * So, the firstRowNum is extra data between the header (and checksums) and
	 * the content.  We must add it before computing the checksum.
	 */
	if (hasFirstRowNum)
	{
		AppendOnlyStorageFormat_AddFirstRowNum(
			headerPtr,
			usingChecksums,
			firstRowNum);
	}

	if (usingChecksums)
	{
		// UNDONE: Set 2nd checksum to 0 when there is no content???
		/*
		 * NOTE(review): isCompressed/hasFirstRowNum are passed as false and
		 * the lengths as 0 even when hasFirstRowNum is true — presumably
		 * because a LargeContent header has no content of its own; the
		 * sibling Small/BulkDense makers pass the real flags.  Confirm this
		 * is intentional for the checksum coverage.
		 */
		AppendOnlyStorageFormat_AddBlockHeaderChecksums(
			headerPtr,
			/* isCompressed */ false,
			/* hasFirstRowNum */ false,
			version,
			/* dataLength */ 0,
			/* compressedLength */ 0);
	}
	else
	{
		elogif(Debug_appendonly_print_storage_headers, LOG,
			   "Append-Only storage make block header result: block_bytes_0_3 0x%X, block_bytes_4_7 0x%X",
			   largeContentHeader->largecontent_bytes_0_3,
			   largeContentHeader->largecontent_bytes_4_7);
	}
}
/*
 * Drop AOCS segment files that compaction has marked AOSEG_STATE_AWAITING_DROP.
 *
 * In non-utility mode, all compaction segment files should be
 * marked as in-use/in-compaction in the appendonlywriter.c code.
 *
 * Only segments listed in compaction_segno are considered; each must be
 * write-locked (non-blocking) before its state is inspected.
 */
void
AOCSDrop(Relation aorel, List *compaction_segno)
{
	const char *relname;
	int			total_segfiles;
	AOCSFileSegInfo **segfile_array;
	int			i,
				segno;
	LockAcquireResult acquireResult;
	AOCSFileSegInfo *fsinfo;
	Snapshot	appendOnlyMetaDataSnapshot = RegisterSnapshot(GetCatalogSnapshot(InvalidOid));

	Assert(Gp_role == GP_ROLE_EXECUTE || Gp_role == GP_ROLE_UTILITY);
	Assert(RelationIsAoCols(aorel));

	relname = RelationGetRelationName(aorel);

	elogif(Debug_appendonly_print_compaction, LOG,
		   "Drop AOCS relation %s", relname);

	/* Get information about all the file segments we need to scan */
	segfile_array = GetAllAOCSFileSegInfo(aorel, appendOnlyMetaDataSnapshot, &total_segfiles);

	for (i = 0; i < total_segfiles; i++)
	{
		segno = segfile_array[i]->segno;
		if (!list_member_int(compaction_segno, segno))
		{
			continue;
		}

		/*
		 * Try to get the transaction write-lock for the Append-Only segment
		 * file.
		 *
		 * NOTE: This is a transaction scope lock that must be held until
		 * commit / abort.
		 */
		acquireResult = LockRelationAppendOnlySegmentFile(
			&aorel->rd_node,
			segfile_array[i]->segno,
			AccessExclusiveLock,
			/* dontWait */ true);
		if (acquireResult == LOCKACQUIRE_NOT_AVAIL)
		{
			elog(DEBUG5, "drop skips AOCS segfile %d, "
				 "relation %s", segfile_array[i]->segno, relname);
			continue;
		}

		/* Re-fetch under the write lock to get latest committed eof. */
		fsinfo = GetAOCSFileSegInfo(aorel, appendOnlyMetaDataSnapshot, segno);

		/*
		 * Guard against a missing seginfo (matches AOCSCompact): previously
		 * a NULL here would have been dereferenced and pfree'd.
		 */
		if (fsinfo == NULL)
			elog(ERROR, "file seginfo for AOCS relation %s %u/%u/%u (segno=%u) is missing",
				 relname,
				 aorel->rd_node.spcNode,
				 aorel->rd_node.dbNode,
				 aorel->rd_node.relNode,
				 segno);

		if (fsinfo->state == AOSEG_STATE_AWAITING_DROP)
		{
			Assert(HasLockForSegmentFileDrop(aorel));
			AOCSCompaction_DropSegmentFile(aorel, segno);
			ClearAOCSFileSegInfo(aorel, segno, AOSEG_STATE_DEFAULT);
		}
		pfree(fsinfo);
	}

	if (segfile_array)
	{
		FreeAllAOCSSegFileInfo(segfile_array, total_segfiles);
		pfree(segfile_array);
	}

	UnregisterSnapshot(appendOnlyMetaDataSnapshot);
}
/*
 * Create the pg_aovisimap auxiliary table (and its unique index) for an
 * append-only relation.  A no-op for relations that are not AO row- or
 * column-oriented.
 */
void
AlterTableCreateAoVisimapTable(Oid relOid, bool is_part_child)
{
	Relation	rel;
	TupleDesc	tupdesc;
	IndexInfo  *indexInfo;
	Oid			classObjectId[2];
	int16		coloptions[2];
	int			attno;

	elogif(Debug_appendonly_print_visimap, LOG,
		   "Create visimap for relation %d",
		   relOid);

	/*
	 * Grab an exclusive lock on the target table, which we will NOT release
	 * until end of transaction.  (This is probably redundant in all present
	 * uses...)  Partition children are assumed to be locked via the parent.
	 */
	rel = heap_open(relOid, is_part_child ? NoLock : AccessExclusiveLock);

	/* Only AO row/column tables carry a visimap. */
	if (!RelationIsAoRows(rel) && !RelationIsAoCols(rel))
	{
		heap_close(rel, NoLock);
		return;
	}

	/* Tuple descriptor: (segno int4, first_row_no int8, visimap bytea). */
	tupdesc = CreateTemplateTupleDesc(Natts_pg_aovisimap, false);
	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "segno",
					   INT4OID, -1, 0);
	TupleDescInitEntry(tupdesc, (AttrNumber) 2, "first_row_no",
					   INT8OID, -1, 0);
	TupleDescInitEntry(tupdesc, (AttrNumber) 3, "visimap",
					   BYTEAOID, -1, 0);

	/* We don't want any toast columns here: force plain storage. */
	for (attno = 0; attno < Natts_pg_aovisimap; attno++)
		tupdesc->attrs[attno]->attstorage = 'p';

	/* Unique btree index on (segno, first_row_no). */
	indexInfo = makeNode(IndexInfo);
	indexInfo->ii_NumIndexAttrs = 2;
	indexInfo->ii_KeyAttrNumbers[0] = 1;
	indexInfo->ii_KeyAttrNumbers[1] = 2;
	indexInfo->ii_Expressions = NIL;
	indexInfo->ii_ExpressionsState = NIL;
	indexInfo->ii_Predicate = NIL;
	indexInfo->ii_PredicateState = NIL;
	indexInfo->ii_Unique = true;
	indexInfo->ii_Concurrent = false;

	classObjectId[0] = INT4_BTREE_OPS_OID;
	classObjectId[1] = INT8_BTREE_OPS_OID;
	coloptions[0] = 0;
	coloptions[1] = 0;

	(void) CreateAOAuxiliaryTable(rel,
								  "pg_aovisimap",
								  RELKIND_AOVISIMAP,
								  tupdesc,
								  indexInfo,
								  classObjectId,
								  coloptions);

	heap_close(rel, NoLock);
}
/*
 * Initialize AppendOnlyStorageRead.
 *
 * The AppendOnlyStorageRead data structure is initialized
 * once for a read "session" and can be used to read
 * Append-Only Storage Blocks from 1 or more segment files.
 *
 * The current file to read to is opened with the
 * AppendOnlyStorageRead_OpenFile routine.
 */
void
AppendOnlyStorageRead_Init(
	AppendOnlyStorageRead *storageRead,
	/* The data structure to initialize. */

	MemoryContext memoryContext,
	/*
	 * The memory context to use for buffers and
	 * other memory needs.  When NULL, the
	 * current memory context is used.
	 */

	int32 maxBufferLen,
	/*
	 * The maximum Append-Only Storage Block
	 * length including all storage headers.
	 */

	char *relationName,
	/*
	 * Name of the relation to use in system
	 * logging and error messages.  Copied below,
	 * so the caller's string is not retained.
	 */

	char *title,
	/*
	 * A phrase that better describes the purpose of the this open.
	 *
	 * The caller manages the storage for this.
	 */

	AppendOnlyStorageAttributes *storageAttributes)
	/*
	 * The Append-Only Storage Attributes
	 * from relation creation.  Copied into
	 * storageRead below.
	 */
{
	int relationNameLen;
	uint8 *memory;
	int32 memoryLen;
	MemoryContext oldMemoryContext;

	Assert(storageRead != NULL);

	// UNDONE: Range check maxBufferLen

	Assert(relationName != NULL);
	Assert(storageAttributes != NULL);

	// UNDONE: Range check fields in storageAttributes

	/* Start from an all-zero structure. */
	MemSet(storageRead, 0, sizeof(AppendOnlyStorageRead));

	storageRead->maxBufferLen = maxBufferLen;

	if (memoryContext == NULL)
		storageRead->memoryContext = CurrentMemoryContext;
	else
		storageRead->memoryContext = memoryContext;

	/* All allocations below go into the read session's context. */
	oldMemoryContext = MemoryContextSwitchTo(storageRead->memoryContext);

	memcpy(
		&storageRead->storageAttributes,
		storageAttributes,
		sizeof(AppendOnlyStorageAttributes));

	/* Private copy of the relation name, including the terminating NUL. */
	relationNameLen = strlen(relationName);
	storageRead->relationName = (char *) palloc(relationNameLen + 1);
	memcpy(storageRead->relationName, relationName, relationNameLen + 1);

	storageRead->title = title;

	storageRead->minimumHeaderLen =
		AppendOnlyStorageFormat_RegularHeaderLenNeeded(
			storageRead->storageAttributes.checksum);

	/*
	 * Initialize BufferedRead.  A large read spans two maximum-size buffers.
	 */
	storageRead->largeReadLen = 2 * storageRead->maxBufferLen;

	memoryLen =
		BufferedReadMemoryLen(
			storageRead->maxBufferLen,
			storageRead->largeReadLen);

	Assert(CurrentMemoryContext == storageRead->memoryContext);
	memory = (uint8*)palloc(memoryLen);

	BufferedReadInit(&storageRead->bufferedRead,
					 memory,
					 memoryLen,
					 storageRead->maxBufferLen,
					 storageRead->largeReadLen,
					 relationName);

	elogif(Debug_appendonly_print_scan || Debug_appendonly_print_read_block, LOG,
		   "Append-Only Storage Read initialize for table '%s' "
		   "(compression = %s, compression level %d, maximum buffer length %d, large read length %d)",
		   storageRead->relationName,
		   (storageRead->storageAttributes.compress ? "true" : "false"),
		   storageRead->storageAttributes.compressLevel,
		   storageRead->maxBufferLen,
		   storageRead->largeReadLen);

	/* No file is open yet. */
	storageRead->file = -1;

	/*
	 * NOTE(review): formatVersion is left at 0 from the MemSet here rather
	 * than being set to -1 like 'file' — confirm whether that is intended.
	 */

	MemoryContextSwitchTo(oldMemoryContext);

	storageRead->isActive = true;
}
/*
 * Skip zero padding to next page boundary, if necessary.
 *
 * This function is called when the file system block we are scanning has
 * no more valid data but instead is padded with zero's from the position
 * we are currently in until the end of the block.  The function will skip
 * to the end of block if skipLen is -1 or skip skipLen bytes otherwise.
 */
static void
AppendOnlyStorageRead_DoSkipPadding(AppendOnlyStorageRead *storageRead, int32 skipLen)
{
	int64 nextReadPosition;
	int64 nextBoundaryPosition;
	int32 safeWriteRemainder;
	bool doSkip;
	uint8 *buffer;
	int32 availableLen;
	int32 safewrite = storageRead->storageAttributes.safeFSWriteSize;

	/* early exit if no pad used */
	if (safewrite == 0)
		return;

	nextReadPosition =
		BufferedReadNextBufferPosition(&storageRead->bufferedRead);
	/* Round the next read position up to the next safe-write boundary. */
	nextBoundaryPosition =
		((nextReadPosition + safewrite - 1) / safewrite) * safewrite;
	/* Bytes between the current position and that boundary (0 if aligned). */
	safeWriteRemainder = (int32) (nextBoundaryPosition - nextReadPosition);

	if (safeWriteRemainder <= 0)
		doSkip = false;			/* already on a boundary; nothing to skip */
	else if (skipLen == -1)
	{
		/*
		 * Skip to end of page.
		 */
		doSkip = true;
		skipLen = safeWriteRemainder;
	}
	else
		/* Only skip when the remainder can't hold the requested length. */
		doSkip = (safeWriteRemainder < skipLen);

	if (doSkip)
	{
		/*
		 * Read through the remainder.
		 */
		buffer = BufferedReadGetNextBuffer(&storageRead->bufferedRead,
										   safeWriteRemainder,
										   &availableLen);

		/*
		 * Since our file EOF should always be a multiple of the file-system
		 * page, we do not expect a short read here.
		 */
		if (buffer == NULL)
			availableLen = 0;

		if (buffer == NULL || safeWriteRemainder != availableLen)
		{
			ereport(ERROR,
					(errcode(ERRCODE_GP_INTERNAL_ERROR),
					 errmsg("Unexpected end of file.  Expected to read %d bytes after position " INT64_FORMAT " but found %d bytes (bufferCount " INT64_FORMAT ")\n",
							safeWriteRemainder,
							nextReadPosition,
							availableLen,
							storageRead->bufferCount)));
		}

		/*
		 * UNDONE: For verification purposes, we should verify the
		 * remainder is all zeroes.
		 */

		elogif(Debug_appendonly_print_scan, LOG,
			   "Append-only scan skipping zero padded remainder for table '%s' (nextReadPosition = " INT64_FORMAT ", safeWriteRemainder = %d)",
			   storageRead->relationName,
			   nextReadPosition,
			   safeWriteRemainder);
	}
}
void AppendOnlyStorageFormat_MakeBulkDenseContentHeader( uint8 *headerPtr, bool usingChecksums, bool hasFirstRowNum, int version, int64 firstRowNum, int executorKind, int rowCount, int32 dataLength, int32 compressedLength) { AOBulkDenseContentHeader *blockHeader; int32 firstHeaderAndChecksumsLen; AOBulkDenseContentHeaderExt *extHeader; bool isCompressed; Assert(headerPtr != NULL); blockHeader = (AOBulkDenseContentHeader*)headerPtr; firstHeaderAndChecksumsLen = AoHeader_RegularSize + (usingChecksums ? 2 * sizeof(pg_crc32) : 0); /* * The extension header is in the data portion with first row number. */ extHeader = (AOBulkDenseContentHeaderExt*)(headerPtr + firstHeaderAndChecksumsLen); elogif(Debug_appendonly_print_storage_headers, LOG, "Append-Only storage make Bulk Dense Content header parameters: wantChecksum = %s, hasFirstRowNum %s, executorKind = %d, " "rowCount = %d, dataLength %d, compressedLength %d", (usingChecksums ? "true" : "false"), (hasFirstRowNum ? "true" : "false"), executorKind, rowCount, dataLength, compressedLength); /* Zero out whole header */ AOBulkDenseContentHeaderInit_Init(blockHeader); AOBulkDenseContentHeaderInit_headerKind(blockHeader,AoHeaderKind_BulkDenseContent); AOBulkDenseContentHeaderInit_executorBlockKind(blockHeader,executorKind); AOBulkDenseContentHeaderInit_dataLength(blockHeader,dataLength); AOBulkDenseContentHeaderInit_hasFirstRowNum(blockHeader,hasFirstRowNum); isCompressed = (compressedLength > 0); if (isCompressed) AOBulkDenseContentHeaderInit_compressedLength(blockHeader,compressedLength); /* Zero out whole extension */ AOBulkDenseContentHeaderExtInit_Init(extHeader); AOBulkDenseContentHeaderExtInit_largeRowCount(extHeader,rowCount); /* * Add the optional firstRowNum. * * NOTE: This is not part of the 8-byte (64-bit) header because it is so big. 
* NOTE: And, it is not covered by the header checksum because in order to * NOTE: determine if we should checksum more data we would need to examine * NOTE: the header data not verified by checksum yet... * * So, the firstRowNum is extra data between the header (and checksums) and * the content. We must add it before computing the checksum. */ if (hasFirstRowNum) { AppendOnlyStorageFormat_AddFirstRowNum( headerPtr, usingChecksums, firstRowNum); } if (usingChecksums) { AppendOnlyStorageFormat_AddBlockHeaderChecksums( headerPtr, isCompressed, hasFirstRowNum, version, dataLength, compressedLength); } else { elogif(Debug_appendonly_print_storage_headers, LOG, "Append-Only storage make Bulk Dense Content header result: " "bulkdensecontent_bytes_0_3 0x%X, bulkdensecontent_bytes_4_7 0x%X " "bulkdensecontent_ext_bytes_0_3 0x%X, bulkdensecontent_ext_bytes_4_7 0x%X ", blockHeader->bulkdensecontent_bytes_0_3, blockHeader->bulkdensecontent_bytes_4_7, extHeader->bulkdensecontent_ext_bytes_0_3, extHeader->bulkdensecontent_ext_bytes_4_7); } #ifdef USE_ASSERT_CHECKING { int checkHeaderLen; int32 checkLength; int32 checkBlockLimitLen; int32 checkOverallBlockLen; int32 checkOffset; int32 checkUncompressedLen; int checkExecutorBlockKind; bool checkHasFirstRowNum; int64 checkFirstRowNum; int checkRowCount; bool checkIsCompressed; int32 checkCompressedLen; AOHeaderCheckError checkError; checkHeaderLen = firstHeaderAndChecksumsLen + AoHeader_RegularSize; if (hasFirstRowNum) checkHeaderLen += sizeof(int64); if (compressedLength == 0) { checkLength = dataLength; } else { checkLength = compressedLength; } checkBlockLimitLen = checkHeaderLen + AOStorage_RoundUp(checkLength, version); checkError = AppendOnlyStorageFormat_GetBulkDenseContentHeaderInfo( headerPtr, checkHeaderLen, usingChecksums, checkBlockLimitLen, &checkOverallBlockLen, &checkOffset, &checkUncompressedLen, &checkExecutorBlockKind, &checkHasFirstRowNum, version, &checkFirstRowNum, &checkRowCount, &checkIsCompressed, 
&checkCompressedLen); if (checkError != AOHeaderCheckOk) ereport(ERROR, (errmsg("Problem making append-only storage header of type bulk dense content. Header check error %d, detail '%s'", (int)checkError, AppendOnlyStorageFormat_GetHeaderCheckErrorStr()))); if (checkOverallBlockLen != checkBlockLimitLen) ereport(ERROR, (errmsg("Problem making append-only storage header of type bulk dense content. Found block length %d, expected %d", checkOverallBlockLen, checkBlockLimitLen))); if (checkOffset != checkHeaderLen) ereport(ERROR, (errmsg("Problem making append-only storage header of type bulk dense content. Found data offset %d, expected %d", checkOffset, checkHeaderLen))); if (checkUncompressedLen != dataLength) ereport(ERROR, (errmsg("Problem making append-only storage header of type bulk dense content. Found uncompressed length %d, expected %d", checkUncompressedLen, dataLength))); if (checkExecutorBlockKind != executorKind) ereport(ERROR, (errmsg("Problem making append-only storage header of type bulk dense content. Found executor kind %d, expected %d", checkExecutorBlockKind, executorKind))); if (checkHasFirstRowNum != hasFirstRowNum) ereport(ERROR, (errmsg("Problem making append-only storage header of type bulk dense content. Found has first row number flag %s, expected %s", (checkHasFirstRowNum ? "true" : "false"), (hasFirstRowNum ? "true" : "false")))); if (hasFirstRowNum) { if (checkFirstRowNum != firstRowNum) ereport(ERROR, (errmsg("Problem making append-only storage header of type bulk dense content. " "Found first row number " INT64_FORMAT ", expected " INT64_FORMAT, checkFirstRowNum, firstRowNum))); } if (checkRowCount != rowCount) ereport(ERROR, (errmsg("Problem making append-only storage header of type bulk dense content. Found row count %d, expected %d", checkRowCount, rowCount))); if (checkIsCompressed != isCompressed) ereport(ERROR, (errmsg("Problem making append-only storage header of type bulk dense content. 
Found is compressed flag %s, expected %s", (checkIsCompressed ? "true" : "false"), (isCompressed ? "true" : "false")))); if (checkCompressedLen != compressedLength) ereport(ERROR, (errmsg("Problem making append-only storage header of type bulk dense content. Found data length %d, expected %d", checkCompressedLen, dataLength))); } #endif }
/*
 * Performs a compaction of an append-only relation in column-orientation.
 *
 * In non-utility mode, all compaction segment files should be
 * marked as in-use/in-compaction in the appendonlywriter.c code.  If
 * set, the insert_segno should also be marked as in-use.
 * When the insert segno is negative, only truncate to eof operations
 * can be executed.
 *
 * NOTE(review): the comment above mentions a negative insert_segno, but the
 * Assert below requires insert_segno >= 0, which also makes the later
 * "if (insert_segno >= 0)" always true — confirm which contract is current.
 *
 * The caller is required to hold either an AccessExclusiveLock (vacuum full)
 * or a ShareLock on the relation.
 */
void
AOCSCompact(Relation aorel, List *compaction_segno, int insert_segno, bool isFull)
{
	const char *relname;
	int total_segfiles;
	AOCSFileSegInfo **segfile_array;
	AOCSInsertDesc insertDesc = NULL;
	int i, segno;
	LockAcquireResult acquireResult;
	AOCSFileSegInfo *fsinfo;
	Snapshot appendOnlyMetaDataSnapshot = RegisterSnapshot(GetCatalogSnapshot(InvalidOid));

	Assert(RelationIsAoCols(aorel));
	Assert(Gp_role == GP_ROLE_EXECUTE || Gp_role == GP_ROLE_UTILITY);
	Assert(insert_segno >= 0);

	relname = RelationGetRelationName(aorel);

	elogif(Debug_appendonly_print_compaction, LOG,
		   "Compact AO relation %s", relname);

	/* Get information about all the file segments we need to scan */
	segfile_array = GetAllAOCSFileSegInfo(aorel, appendOnlyMetaDataSnapshot, &total_segfiles);

	if (insert_segno >= 0)
	{
		insertDesc = aocs_insert_init(aorel, insert_segno, false);
	}

	for (i = 0; i < total_segfiles; i++)
	{
		segno = segfile_array[i]->segno;
		/* Only segments explicitly listed for compaction are processed. */
		if (!list_member_int(compaction_segno, segno))
		{
			continue;
		}
		if (segno == insert_segno)
		{
			/* We cannot compact the segment file we are inserting to. */
			continue;
		}

		/*
		 * Try to get the transaction write-lock for the Append-Only segment
		 * file.
		 *
		 * NOTE: This is a transaction scope lock that must be held until
		 * commit / abort.
		 */
		acquireResult = LockRelationAppendOnlySegmentFile(
			&aorel->rd_node,
			segfile_array[i]->segno,
			AccessExclusiveLock,
			/* dontWait */ true);
		if (acquireResult == LOCKACQUIRE_NOT_AVAIL)
		{
			/* Someone else holds the segfile; skip it rather than block. */
			elog(DEBUG5, "compaction skips AOCS segfile %d, "
				 "relation %s", segfile_array[i]->segno, relname);
			continue;
		}

		/* Re-fetch under the write lock to get latest committed eof. */
		fsinfo = GetAOCSFileSegInfo(aorel, appendOnlyMetaDataSnapshot, segno);

		/*
		 * This should not occur since this segfile info was found by the
		 * "all" method, but better to catch for trouble shooting (possibly
		 * index corruption?)
		 */
		if (fsinfo == NULL)
			elog(ERROR, "file seginfo for AOCS relation %s %u/%u/%u (segno=%u) is missing",
				 relname,
				 aorel->rd_node.spcNode,
				 aorel->rd_node.dbNode,
				 aorel->rd_node.relNode,
				 segno);

		if (AppendOnlyCompaction_ShouldCompact(aorel,
											   fsinfo->segno, fsinfo->total_tupcount, isFull,
											   appendOnlyMetaDataSnapshot))
		{
			AOCSSegmentFileFullCompaction(aorel, insertDesc, fsinfo, appendOnlyMetaDataSnapshot);
		}

		pfree(fsinfo);
	}

	if (insertDesc != NULL)
		aocs_insert_finish(insertDesc);

	if (segfile_array)
	{
		FreeAllAOCSSegFileInfo(segfile_array, total_segfiles);
		pfree(segfile_array);
	}

	UnregisterSnapshot(appendOnlyMetaDataSnapshot);
}
/*
 * Perform a large write i/o.
 *
 * Flushes largeWriteLen bytes of largeWriteMemory through the mirrored
 * append-only open, advances largeWritePosition, and resets largeWriteLen
 * to 0.  Under assert checking, first verifies that the file's physical
 * position matches the expected largeWritePosition.
 */
static void
BufferedAppendWrite(
	BufferedAppend *bufferedAppend)
{
	int32 writeLen;
	uint8 *largeWriteMemory;
	int actualLen;

	writeLen = bufferedAppend->largeWriteLen;
	Assert(bufferedAppend->largeWriteLen > 0);
	largeWriteMemory = bufferedAppend->largeWriteMemory;

#ifdef USE_ASSERT_CHECKING
	{
		int64 currentWritePosition;

		/* Cross-check the OS file position against our bookkeeping. */
		currentWritePosition = FileNonVirtualCurSeek(bufferedAppend->file);
		if (currentWritePosition < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("unable to get current position in table \"%s\" for file \"%s\" (errcode %d)",
							bufferedAppend->relationName,
							bufferedAppend->filePathName,
							errno)));
		if (currentWritePosition != bufferedAppend->largeWritePosition)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("Current position mismatch actual " INT64_FORMAT ", expected " INT64_FORMAT " in table \"%s\" for file \"%s\"",
							currentWritePosition, bufferedAppend->largeWritePosition,
							bufferedAppend->relationName,
							bufferedAppend->filePathName)));
	}
#endif

	/*
	 * NOTE(review): 'actualLen = writeLen' below means this loop always
	 * completes in a single iteration — presumably a remnant of partial
	 * write handling; MirroredAppendOnly_Append is trusted to write fully.
	 */
	while (writeLen > 0)
	{
		int primaryError;
		bool mirrorDataLossOccurred;

		MirroredAppendOnly_Append(
			&bufferedAppend->mirroredOpen,
			(char*)largeWriteMemory,
			writeLen,
			&primaryError,
			&mirrorDataLossOccurred);
		if (primaryError != 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("Could not write in table \"%s\" to segment file '%s': %m",
							bufferedAppend->relationName,
							bufferedAppend->filePathName)));

		elogif(Debug_appendonly_print_append_block, LOG,
			   "Append-Only storage write: table '%s', segment file '%s', write position " INT64_FORMAT ", "
			   "writeLen %d (equals large write length %d is %s)",
			   bufferedAppend->relationName,
			   bufferedAppend->filePathName,
			   bufferedAppend->largeWritePosition,
			   writeLen,
			   bufferedAppend->largeWriteLen,
			   (writeLen == bufferedAppend->largeWriteLen ? "true" : "false"));

		actualLen = writeLen;
		writeLen -= actualLen;
		largeWriteMemory += actualLen;
	}

	/* Account for the whole large write and mark the buffer drained. */
	bufferedAppend->largeWritePosition += bufferedAppend->largeWriteLen;
	bufferedAppend->largeWriteLen = 0;
}
void AppendOnlyStorageFormat_MakeSmallContentHeader( uint8 *headerPtr, bool usingChecksums, bool hasFirstRowNum, int version, int64 firstRowNum, int executorKind, int rowCount, int32 dataLength, int32 compressedLength) { AOSmallContentHeader *blockHeader; bool isCompressed; Assert(headerPtr != NULL); blockHeader = (AOSmallContentHeader*)headerPtr; elogif(Debug_appendonly_print_storage_headers, LOG, "Append-Only storage make Small Content header parameters: wantChecksum = %s, hasFirstRowNum %s, executorKind = %d, " "rowCount = %d, dataLength %d, compressedLength %d", (usingChecksums ? "true" : "false"), (hasFirstRowNum ? "true" : "false"), executorKind, rowCount, dataLength, compressedLength); /* Zero out whole header */ AOSmallContentHeaderInit_Init(blockHeader); AOSmallContentHeaderInit_headerKind(blockHeader,AoHeaderKind_SmallContent); AOSmallContentHeaderInit_executorBlockKind(blockHeader,executorKind); AOSmallContentHeaderInit_rowCount(blockHeader,rowCount); AOSmallContentHeaderInit_dataLength(blockHeader,dataLength); AOSmallContentHeaderInit_hasFirstRowNum(blockHeader,hasFirstRowNum); isCompressed = (compressedLength > 0); if (isCompressed) AOSmallContentHeaderInit_compressedLength(blockHeader,compressedLength); /* * Add the optional firstRowNum. * * NOTE: This is not part of the 8-byte (64-bit) header because it is so big. * NOTE: And, it is not covered by the header checksum because in order to * NOTE: determine if we should checksum more data we would need to examine * NOTE: the header data not verified by checksum yet... * * So, the firstRowNum is extra data between the header (and checksums) and * the content. We must add it before computing the checksum. 
*/ if (hasFirstRowNum) { AppendOnlyStorageFormat_AddFirstRowNum( headerPtr, usingChecksums, firstRowNum); } if (usingChecksums) { AppendOnlyStorageFormat_AddBlockHeaderChecksums( headerPtr, isCompressed, hasFirstRowNum, version, dataLength, compressedLength); } else { elogif(Debug_appendonly_print_storage_headers, LOG, "Append-Only storage make Small Content header result: smallcontent_bytes_0_3 0x%X, smallcontent_bytes_4_7 0x%X", blockHeader->smallcontent_bytes_0_3, blockHeader->smallcontent_bytes_4_7); } }