/*
 * Forget any invalid pages in a whole database.
 *
 * Scans invalid_page_tab and removes every entry whose relation lives in
 * database "dbid".  When "tblspc" is a valid OID only entries in that
 * tablespace are removed; an invalid "tblspc" matches any tablespace.
 *
 * Does nothing if the invalid-page table has not been created yet.
 * Raises ERROR if an entry returned by the scan cannot be removed.
 */
static void
forget_invalid_pages_db(Oid tblspc, Oid dbid)
{
	HASH_SEQ_STATUS status;
	xl_invalid_page *hentry;

	if (invalid_page_tab == NULL)
		return;					/* nothing to do */

	hash_seq_init(&status, invalid_page_tab);

	while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
	{
		/* Skip entries in other tablespaces (when one was requested). */
		if (OidIsValid(tblspc) && hentry->key.node.spcNode != tblspc)
			continue;

		/* Skip entries that belong to other databases. */
		if (hentry->key.node.dbNode != dbid)
			continue;

		elog(DEBUG2, "page %u of relation %u/%u/%u has been dropped",
			 hentry->key.blkno,
			 hentry->key.node.spcNode,
			 hentry->key.node.dbNode,
			 hentry->key.node.relNode);

		/* Same wording as the DEBUG2 message, prefixed with our name. */
		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "forget_invalid_pages_db: page %u of relation %u/%u/%u has been dropped",
				 hentry->key.blkno,
				 hentry->key.node.spcNode,
				 hentry->key.node.dbNode,
				 hentry->key.node.relNode);

		/* Only the entry just handed back by the scan is removed. */
		if (hash_search(invalid_page_tab,
						(void *) &hentry->key,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "hash table corrupted");
	}
}
/*
 * Forget any invalid pages >= minblkno, because they've been dropped.
 *
 * Every entry of invalid_page_tab that refers to relation "node" and to a
 * block number of at least "minblkno" is removed.  A missing table means
 * there is nothing recorded, so the function simply returns.
 */
static void
forget_invalid_pages(RelFileNode node, BlockNumber minblkno)
{
	HASH_SEQ_STATUS seq;
	xl_invalid_page *entry;

	if (invalid_page_tab == NULL)
		return;

	hash_seq_init(&seq, invalid_page_tab);

	for (;;)
	{
		entry = (xl_invalid_page *) hash_seq_search(&seq);
		if (entry == NULL)
			break;				/* scan exhausted */

		/* Different relation: leave it alone. */
		if (!(RelFileNodeEquals(entry->key.node, node)))
			continue;

		/* Block lies below the cut-off: leave it alone. */
		if (entry->key.blkno < minblkno)
			continue;

		elog(DEBUG2, "page %u of relation %u/%u/%u has been dropped",
			 entry->key.blkno,
			 entry->key.node.spcNode,
			 entry->key.node.dbNode,
			 entry->key.node.relNode);

		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "forget_invalid_pages: page %u of relation %u/%u/%u has been dropped",
				 entry->key.blkno,
				 entry->key.node.spcNode,
				 entry->key.node.dbNode,
				 entry->key.node.relNode);

		/* Drop the entry the scan just returned. */
		if (hash_search(invalid_page_tab,
						(void *) &entry->key,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "hash table corrupted");
	}
}
/*
 * Log a reference to an invalid page.
 *
 * "present" distinguishes a page that exists but is uninitialized (true)
 * from one that does not exist at all (false).  The reference is recorded
 * in invalid_page_tab, which is created on first use.  If the same page
 * was already recorded, its stored "present" flag is left unchanged.
 */
static void
log_invalid_page(RelFileNode node, BlockNumber blkno, bool present)
{
	xl_invalid_page_key lookupKey;
	xl_invalid_page *entry;
	bool		alreadyRecorded;

	/*
	 * References to invalid pages are reported at DEBUG1, which allows some
	 * tracing of the cause (the elog context mechanism will say something
	 * about the XLOG record that generated the reference).
	 */
	if (!present)
	{
		elog(DEBUG1, "page %u of relation %u/%u/%u does not exist",
			 blkno, node.spcNode, node.dbNode, node.relNode);

		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "log_invalid_page: page %u of relation %u/%u/%u does not exist",
				 blkno, node.spcNode, node.dbNode, node.relNode);
	}
	else
	{
		elog(DEBUG1, "page %u of relation %u/%u/%u is uninitialized",
			 blkno, node.spcNode, node.dbNode, node.relNode);

		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "log_invalid_page: page %u of relation %u/%u/%u is uninitialized",
				 blkno, node.spcNode, node.dbNode, node.relNode);
	}

	/* Build the table lazily, the first time something must be stored. */
	if (invalid_page_tab == NULL)
	{
		HASHCTL		tabInfo;

		memset(&tabInfo, 0, sizeof(tabInfo));
		tabInfo.keysize = sizeof(xl_invalid_page_key);
		tabInfo.entrysize = sizeof(xl_invalid_page);
		tabInfo.hash = tag_hash;

		invalid_page_tab = hash_create("XLOG invalid-page table",
									   100,
									   &tabInfo,
									   HASH_ELEM | HASH_FUNCTION);
	}

	/* xl_invalid_page_key is currently assumed to contain no padding */
	lookupKey.node = node;
	lookupKey.blkno = blkno;

	entry = (xl_invalid_page *) hash_search(invalid_page_tab,
											(void *) &lookupKey,
											HASH_ENTER,
											&alreadyRecorded);

	/*
	 * For a new entry hash_search has already filled in the key, so only
	 * the flag needs setting.  A repeat reference keeps its earlier flag.
	 */
	if (!alreadyRecorded)
		entry->present = present;
}
/*
 * Open a relation during XLOG replay
 *
 * Looks "rnode" up in _xlrelcache.  On a hit the cached descriptor is
 * reused; on a miss a new descriptor is obtained from _xl_new_reldesc(),
 * entered into the cache, its smgr relation is opened and smgrcreate() is
 * called with ignoreAlreadyExists = true.  In both cases the descriptor is
 * (re)linked next to _xlrelarr[0] in the moreRecently/lessRecently list
 * before the address of its reldata member is returned.
 *
 * When running in standby mode and the relation is not in the global
 * tablespace, a cache miss also makes sure the database directory exists
 * (mkdir with mode 0700; an already existing directory is accepted).
 *
 * Note: this once had an API that allowed NULL return on failure, but it
 * no longer does; any failure results in elog().
 */
Relation
XLogOpenRelation(RelFileNode rnode)
{
	XLogRelDesc *res;
	XLogRelCacheEntry *hentry;
	bool		found;

	hentry = (XLogRelCacheEntry *)
		hash_search(_xlrelcache, (void *) &rnode, HASH_FIND, NULL);

	if (hentry)
	{
		res = hentry->rdesc;

		/*
		 * Cache hit: unlink the descriptor from its current position in the
		 * recency list.  It is linked back in next to _xlrelarr[0] at the
		 * bottom of this function.
		 */
		res->lessRecently->moreRecently = res->moreRecently;
		res->moreRecently->lessRecently = res->lessRecently;
	}
	else
	{
		/*
		 * We need to fault in the database directory on the standby.
		 */
		if (rnode.spcNode != GLOBALTABLESPACE_OID &&
			IsStandbyMode())
		{
			/* Stays NULL for built-in tablespaces (see branch below). */
			char *primaryFilespaceLocation = NULL;

			char *dbPath;

			if (IsBuiltinTablespace(rnode.spcNode))
			{
				/*
				 * No filespace to fetch.
				 */
			}
			else
			{
				char *mirrorFilespaceLocation = NULL;

				/*
				 * Investigate whether the containing directories exist to give more detail.
				 */
				PersistentTablespace_GetPrimaryAndMirrorFilespaces(
													rnode.spcNode,
													&primaryFilespaceLocation,
													&mirrorFilespaceLocation);

				/* A missing or empty primary location is treated as an error. */
				if (primaryFilespaceLocation == NULL ||
					strlen(primaryFilespaceLocation) == 0)
				{
					elog(ERROR, "Empty primary filespace directory location");
				}

				/* The mirror location is not used here; release it. */
				if (mirrorFilespaceLocation != NULL)
				{
					pfree(mirrorFilespaceLocation);
					mirrorFilespaceLocation = NULL;
				}
			}

			/* Build the database directory path in a scratch buffer. */
			dbPath = (char*)palloc(MAXPGPATH + 1);

			FormDatabasePath(
						dbPath,
						primaryFilespaceLocation,
						rnode.spcNode,
						rnode.dbNode);

			/* The path has been formed; the location string is no longer needed. */
			if (primaryFilespaceLocation != NULL)
			{
				pfree(primaryFilespaceLocation);
				primaryFilespaceLocation = NULL;
			}

			if (mkdir(dbPath, 0700) == 0)
			{
				/* Directory was newly created. */
				if (Debug_persistent_recovery_print)
				{
					elog(PersistentRecovery_DebugPrintLevel(),
						 "XLogOpenRelation: Re-created database directory \"%s\"",
						 dbPath);
				}
			}
			else
			{
				/*
				 * Allowed to already exist.
				 */
				if (errno != EEXIST)
				{
					elog(ERROR, "could not create database directory \"%s\": %m",
						 dbPath);
				}
				else
				{
					if (Debug_persistent_recovery_print)
					{
						elog(PersistentRecovery_DebugPrintLevel(),
							 "XLogOpenRelation: Database directory \"%s\" already exists",
							 dbPath);
					}
				}
			}

			pfree(dbPath);
		}

		res = _xl_new_reldesc();

		/* The dummy relation's name is the decimal relNode. */
		sprintf(RelationGetRelationName(&(res->reldata)), "%u", rnode.relNode);

		res->reldata.rd_node = rnode;

		/*
		 * We set up the lockRelId in case anything tries to lock the dummy
		 * relation.  Note that this is fairly bogus since relNode may be
		 * different from the relation's OID.  It shouldn't really matter
		 * though, since we are presumably running by ourselves and can't have
		 * any lock conflicts ...
		 */
		res->reldata.rd_lockInfo.lockRelId.dbId = rnode.dbNode;
		res->reldata.rd_lockInfo.lockRelId.relId = rnode.relNode;

		/*
		 * Enter the new descriptor into the cache.  The HASH_FIND above
		 * missed, so finding an entry now is reported as PANIC.
		 */
		hentry = (XLogRelCacheEntry *)
			hash_search(_xlrelcache, (void *) &rnode, HASH_ENTER, &found);

		if (found)
			elog(PANIC, "xlog relation already present on insert into cache");

		hentry->rdesc = res;

		/* Reset the target block and smgr pointer, then open the smgr relation. */
		res->reldata.rd_targblock = InvalidBlockNumber;
		res->reldata.rd_smgr = NULL;
		RelationOpenSmgr(&(res->reldata));

		/*
		 * Create the target file if it doesn't already exist.  This lets us
		 * cope if the replay sequence contains writes to a relation that is
		 * later deleted.  (The original coding of this routine would instead
		 * return NULL, causing the writes to be suppressed.  But that seems
		 * like it risks losing valuable data if the filesystem loses an inode
		 * during a crash.  Better to write the data until we are actually
		 * told to delete the file.)
		 */
		// NOTE: We no longer re-create files automatically because
		// new FileRep persistent objects will ensure files exist.

		// UNDONE: Can't remove this block of code yet until boot time calls to this routine are analyzed...
		{
			MirrorDataLossTrackingState mirrorDataLossTrackingState;
			int64 mirrorDataLossTrackingSessionNum;

			/* Written by smgrcreate() below; not examined in this function. */
			bool mirrorDataLossOccurred;

			// UNDONE: What about the persistent rel files table???
			// UNDONE: This condition should not occur anymore.
			// UNDONE: segmentFileNum and AO?
			mirrorDataLossTrackingState =
						FileRepPrimary_GetMirrorDataLossTrackingSessionNum(
														&mirrorDataLossTrackingSessionNum);
			smgrcreate(
				res->reldata.rd_smgr,
				res->reldata.rd_isLocalBuf,
				/* relationName */ NULL,		// Ok to be NULL -- we don't know the name here.
				mirrorDataLossTrackingState,
				mirrorDataLossTrackingSessionNum,
				/* ignoreAlreadyExists */ true,
				&mirrorDataLossOccurred);
		}
	}

	/*
	 * Link res into the recency list on the lessRecently side of
	 * _xlrelarr[0].
	 * NOTE(review): _xlrelarr[0] appears to act as the list's head/sentinel;
	 * confirm against the code that initializes _xlrelarr.
	 */
	res->moreRecently = &(_xlrelarr[0]);
	res->lessRecently = _xlrelarr[0].lessRecently;
	_xlrelarr[0].lessRecently = res;
	res->lessRecently->moreRecently = res;

	Assert(&(res->reldata) != NULL);	// Assert what it says in the interface -- we don't return NULL anymore.

	return &(res->reldata);
}
/*
 * Perform the initial scan of a persistent store.
 *
 * Every tuple returned by the scan is classified as either a free entry
 * (non-zero previous-free TID) or an in-use entry, and the shared counters
 * are updated accordingly: maxTid, maxFreeOrderNum/freeTid, inUseCount and
 * maxInUseSerialNum.  Free entries are also handed to
 * PersistentStore_InitScanAddFreeEntry unless gp_persistent_skip_free_list
 * is set.  The optional scanTupleCallback is invoked for every tuple.
 *
 * After the scan the current global sequence value is compared against the
 * highest in-use serial number seen, and finally the free entries are
 * verified (again unless gp_persistent_skip_free_list is set).
 */
static void
PersistentStore_DoInitScan(
	PersistentStoreData *storeData,
	PersistentStoreSharedData *storeSharedData)
{
	PersistentStoreScan scan;
	ItemPointerData tid;
	ItemPointerData prevFreeTid;
	int64		serialNum;
	int64		globalSeq;
	Datum	   *values;

	values = (Datum *) palloc(storeData->numAttributes * sizeof(Datum));

	/* Start out with a zero "highest TID seen so far". */
	MemSet(&storeSharedData->maxTid, 0, sizeof(ItemPointerData));

	PersistentStore_BeginScan(storeData, storeSharedData, &scan);

	while (PersistentStore_GetNext(&scan, values, &tid, &serialNum))
	{
		/*
		 * The scan runs from low to high TID, so each TID must be greater
		 * than the previous maximum (or the maximum must still be zero).
		 */
		Assert(PersistentStore_IsZeroTid(&storeSharedData->maxTid) ||
			   ItemPointerCompare(&storeSharedData->maxTid, &tid) == -1);

		storeSharedData->maxTid = tid;

		PersistentStore_ExtractOurTupleData(storeData,
											values,
											&serialNum,
											&prevFreeTid);

		if (Debug_persistent_recovery_print)
			storeData->printTupleCallback(PersistentRecovery_DebugPrintLevel(),
										  "SCAN",
										  &tid,
										  values);

		if (PersistentStore_IsZeroTid(&prevFreeTid))
		{
			/* Zero previous-free TID: this entry is in use. */
			storeSharedData->inUseCount++;

			if (storeSharedData->maxInUseSerialNum < serialNum)
			{
				storeSharedData->maxInUseSerialNum = serialNum;
				storeData->myHighestSerialNum = storeSharedData->maxInUseSerialNum;
			}
		}
		else
		{
			/*
			 * Non-zero previous-free TID implies a free entry.  For those
			 * the serial number column carries the free order number.
			 */
			if (storeSharedData->maxFreeOrderNum < serialNum)
			{
				storeSharedData->maxFreeOrderNum = serialNum;
				storeSharedData->freeTid = tid;
			}

			if (!gp_persistent_skip_free_list)
				PersistentStore_InitScanAddFreeEntry(&tid,
													 &prevFreeTid,
													 /* freeOrderNum */ serialNum);
		}

		if (storeData->scanTupleCallback != NULL)
			storeData->scanTupleCallback(&tid, serialNum, values);
	}

	PersistentStore_EndScan(&scan);

	pfree(values);

	globalSeq = GlobalSequence_Current(storeData->gpGlobalSequence);

	/*
	 * Note: Originally the check below was guarded with an InRecovery flag
	 * test.  However, this routine should not be called during recovery
	 * since the entries are not consistent...
	 */
	Assert(!InRecovery);

	if (globalSeq >= storeSharedData->maxInUseSerialNum)
	{
		/* Normal case: remember the global sequence value. */
		storeSharedData->maxCachedSerialNum = globalSeq;
	}
	else if (gp_persistent_repair_global_sequence)
	{
		/*
		 * The sequence is behind the scanned maximum, which looks like
		 * corruption.  The gp_persistent_repair_global_sequence GUC lets
		 * the system come up anyway; in this branch only a LOG message is
		 * emitted.
		 */
		elog(LOG, "Need to Repair global sequence number " INT64_FORMAT " so use scanned maximum value " INT64_FORMAT " ('%s')",
			 globalSeq,
			 storeSharedData->maxInUseSerialNum,
			 storeData->tableName);
	}
	else
	{
		elog(ERROR, "Global sequence number " INT64_FORMAT " less than maximum value " INT64_FORMAT " found in scan ('%s')",
			 globalSeq,
			 storeSharedData->maxInUseSerialNum,
			 storeData->tableName);
	}

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "PersistentStore_DoInitScan ('%s'): maximum in-use serial number " INT64_FORMAT ", maximum free order number " INT64_FORMAT ", free TID %s, maximum known TID %s",
			 storeData->tableName,
			 storeSharedData->maxInUseSerialNum,
			 storeSharedData->maxFreeOrderNum,
			 ItemPointerToString(&storeSharedData->freeTid),
			 ItemPointerToString2(&storeSharedData->maxTid));

	if (gp_persistent_skip_free_list)
	{
		if (Debug_persistent_recovery_print)
			elog(PersistentRecovery_DebugPrintLevel(),
				 "PersistentStore_DoInitScan ('%s'): Skipping verification because gp_persistent_skip_free_list GUC is ON",
				 storeData->tableName);
	}
	else
	{
		PersistentStore_InitScanVerifyFreeEntries(storeData, storeSharedData);
	}
}
/*
 * Verify the free-entry chain collected during the initial scan.
 *
 * Starting at storeSharedData->freeTid with order number
 * storeSharedData->maxFreeOrderNum, each entry is looked up in
 * freeEntryHashTable, checked, removed from the table, and the walk
 * continues at its previous-free TID with the order number decremented,
 * until the order number reaches 0.  Any inconsistency raises ERROR.
 *
 * Afterwards the hash table (if one exists) must be empty; any entry still
 * present raises ERROR.  The table is then destroyed and the global pointer
 * reset to NULL.
 */
static void
PersistentStore_InitScanVerifyFreeEntries(
	PersistentStoreData *storeData,
	PersistentStoreSharedData *storeSharedData)
{
	int64		expectedOrderNum = storeSharedData->maxFreeOrderNum;
	ItemPointerData walkTid = storeSharedData->freeTid;
	PersistentFreeEntry *entry;
	HASH_SEQ_STATUS leftoverScan;

	if (expectedOrderNum != 0)
	{
		PersistentFreeEntryKey lookupKey;
		PersistentFreeEntry *removed;

		if (freeEntryHashTable == NULL)
			elog(ERROR, "Looking for free order number " INT64_FORMAT " and the free entry hash table is empty for '%s'",
				 expectedOrderNum,
				 storeData->tableName);

		do
		{
			/* Zero the whole key first, then fill in the TID. */
			MemSet(&lookupKey, 0, sizeof(lookupKey));
			lookupKey.persistentTid = walkTid;

			entry = (PersistentFreeEntry *)
				hash_search(freeEntryHashTable,
							(void *) &lookupKey,
							HASH_FIND,
							NULL);

			if (entry == NULL)
				elog(ERROR, "Did not find free entry for free TID %s (free order number " INT64_FORMAT ") for '%s'",
					 ItemPointerToString(&walkTid),
					 expectedOrderNum,
					 storeData->tableName);

			if (PersistentStore_IsZeroTid(&entry->previousFreeTid))
				elog(ERROR, "Previous free TID not expected to be (0,0) -- persistent Free Entry hashtable corrupted for '%s' "
					 "(expected free order number " INT64_FORMAT ", entry free order number " INT64_FORMAT ")",
					 storeData->tableName,
					 expectedOrderNum,
					 entry->freeOrderNum);

			if (expectedOrderNum != entry->freeOrderNum)
				elog(ERROR, "Free entry for free TID %s has wrong free order number (expected free order number " INT64_FORMAT ", found free order number " INT64_FORMAT ") for '%s'",
					 ItemPointerToString(&walkTid),
					 expectedOrderNum,
					 entry->freeOrderNum,
					 storeData->tableName);

			if (Debug_persistent_recovery_print)
				elog(PersistentRecovery_DebugPrintLevel(),
					 "PersistentStore_InitScanVerifyFreeEntries ('%s'): Free order number " INT64_FORMAT ", free TID %s, previous free TID %s",
					 storeData->tableName,
					 expectedOrderNum,
					 ItemPointerToString(&walkTid),
					 ItemPointerToString2(&entry->previousFreeTid));

			/* Step to the previous link of the chain. */
			walkTid = entry->previousFreeTid;
			Insist(!PersistentStore_IsZeroTid(&walkTid));	/* see the zero-TID check above */

			/*
			 * The last free entry (order number 1) uses its own TID as its
			 * previous-free TID.
			 */
			if (expectedOrderNum == 1 &&
				ItemPointerCompare(&entry->key.persistentTid, &walkTid) != 0)
			{
				elog(ERROR, "Expected previous_free_tid %s to match the persistent TID %s for the last free entry (free order number 1) for '%s'",
					 ItemPointerToString(&walkTid),
					 ItemPointerToString2(&entry->key.persistentTid),
					 storeData->tableName);
			}

			/* This entry is accounted for; take it out of the table. */
			removed = hash_search(freeEntryHashTable,
								  (void *) &entry->key,
								  HASH_REMOVE,
								  NULL);
			if (removed == NULL)
				elog(ERROR, "Persistent Free Entry hashtable corrupted for '%s'",
					 storeData->tableName);

			expectedOrderNum--;
		} while (expectedOrderNum != 0);
	}
	else if (!PersistentStore_IsZeroTid(&walkTid))
	{
		/* No free entries were seen, so the free TID must be zero too. */
		elog(ERROR, "Expected free TID to be (0,0) when free order number is 0 in '%s'",
			 storeData->tableName);
	}

	if (freeEntryHashTable != NULL)
	{
		hash_seq_init(&leftoverScan, freeEntryHashTable);

		/*
		 * Everything reachable from the chain has been removed, so the
		 * table is expected to be empty now.
		 */
		while ((entry = hash_seq_search(&leftoverScan)) != NULL)
		{
			elog(ERROR, "Found at least one unaccounted for free entry for '%s'.  Example: free order number " INT64_FORMAT ", free TID %s, previous free TID %s",
				 storeData->tableName,
				 entry->freeOrderNum,
				 ItemPointerToString(&entry->key.persistentTid),
				 ItemPointerToString2(&entry->previousFreeTid));
		}

		hash_destroy(freeEntryHashTable);
		freeEntryHashTable = NULL;
	}

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "PersistentStore_InitScanVerifyFreeEntries ('%s'): Successfully verified " INT64_FORMAT " free entries",
			 storeData->tableName,
			 storeSharedData->maxFreeOrderNum);
}
/*
 * Perform the initial scan of a persistent store.
 *
 * Each tuple returned by the scan is counted as in use, and the highest
 * serial number seen is tracked in storeSharedData->maxInUseSerialNum
 * (mirrored into storeData->myHighestSerialNum).  The optional
 * scanTupleCallback is invoked for every tuple.
 *
 * Afterwards the current global sequence value is compared with the
 * scanned maximum: if the sequence is behind, that is reported (LOG when
 * gp_persistent_repair_global_sequence is set, ERROR otherwise); if not,
 * maxInUseSerialNum is set to the global sequence value.
 */
static void
PersistentStore_DoInitScan(
	PersistentStoreData *storeData,
	PersistentStoreSharedData *storeSharedData)
{
	PersistentStoreScan scan;
	ItemPointerData tid;
	int64		serialNum;
	int64		globalSeq;
	Datum	   *values;

	values = (Datum *) palloc(storeData->numAttributes * sizeof(Datum));

	PersistentStore_BeginScan(storeData, storeSharedData, &scan);

	/* The scan proceeds from low to high TID. */
	while (PersistentStore_GetNext(&scan, values, &tid, &serialNum))
	{
		PersistentStore_ExtractOurTupleData(storeData, values, &serialNum);

		if (Debug_persistent_recovery_print)
			storeData->printTupleCallback(PersistentRecovery_DebugPrintLevel(),
										  "SCAN",
										  &tid,
										  values);

		storeSharedData->inUseCount++;

		/* Track the highest serial number encountered. */
		if (storeSharedData->maxInUseSerialNum < serialNum)
		{
			storeSharedData->maxInUseSerialNum = serialNum;
			storeData->myHighestSerialNum = storeSharedData->maxInUseSerialNum;
		}

		if (storeData->scanTupleCallback != NULL)
			storeData->scanTupleCallback(&tid, serialNum, values);
	}

	PersistentStore_EndScan(&scan);

	pfree(values);

	globalSeq = GlobalSequence_Current(storeData->gpGlobalSequence);

	/*
	 * Note: Originally the check below was guarded with an InRecovery flag
	 * test.  However, this routine should not be called during recovery
	 * since the entries are not consistent...
	 */
	Assert(!InRecovery);

	if (globalSeq >= storeSharedData->maxInUseSerialNum)
	{
		storeSharedData->maxInUseSerialNum = globalSeq;
	}
	else if (gp_persistent_repair_global_sequence)
	{
		/*
		 * The sequence is behind the scanned maximum, which looks like
		 * corruption.  The gp_persistent_repair_global_sequence GUC lets
		 * the system come up anyway.
		 */
		elog(LOG, "need to repair global sequence number " INT64_FORMAT
			 " so use scanned maximum value " INT64_FORMAT " ('%s')",
			 globalSeq,
			 storeSharedData->maxInUseSerialNum,
			 storeData->tableName);
	}
	else
	{
		elog(ERROR, "global sequence number " INT64_FORMAT " less than "
			 "maximum value " INT64_FORMAT " found in scan ('%s')",
			 globalSeq,
			 storeSharedData->maxInUseSerialNum,
			 storeData->tableName);
	}

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "PersistentStore_DoInitScan ('%s'): maximum in-use serial number "
			 INT64_FORMAT,
			 storeData->tableName,
			 storeSharedData->maxInUseSerialNum);
}
/*
 * Walk the free TID chain using the free TIDs recorded in
 * freeEntryHashTable, starting at storeSharedData->freeTid with order
 * number storeSharedData->maxFreeOrderNum and counting down.
 *
 * Returns true if the chain is valid.  On the first inconsistency the
 * table is dumped via PersistentStore_DiagnoseDumpTable, a WARNING
 * describing the problem is reported, and false is returned.  The hash
 * table itself is only read here; no entry is removed.
 *
 * Note: the number of free entries is expected to have been validated
 * against the max free order number before this is called.
 */
static bool
PersistentStore_InitScanVerifyFreeEntries(
	PersistentStoreData *storeData,
	PersistentStoreSharedData *storeSharedData)
{
	int64		orderNum = storeSharedData->maxFreeOrderNum;
	ItemPointerData walkTid = storeSharedData->freeTid;
	PersistentFreeEntryKey lookupKey;
	PersistentFreeEntry *freeEntry;

	elog(LOG, "beginning free list validation for %s, maxFreeOrderNum = "
		 INT64_FORMAT, storeData->tableName, orderNum);

	if (orderNum != 0)
	{
		if (freeEntryHashTable == NULL)
		{
			PersistentStore_DiagnoseDumpTable(storeData, storeSharedData);
			ereport(WARNING,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("free list validation failed"),
					 errdetail("looking for free order number " INT64_FORMAT
							   " and the free entry hash table is empty for '%s'",
							   orderNum, storeData->tableName)));
			return false;
		}

		/* Follow the chain, one link per expected order number. */
		for (; orderNum > 0; orderNum--)
		{
			/* Zero the whole key first, then fill in the TID. */
			MemSet(&lookupKey, 0, sizeof(lookupKey));
			lookupKey.persistentTid = walkTid;

			freeEntry = (PersistentFreeEntry *)
				hash_search(freeEntryHashTable,
							(void *) &lookupKey,
							HASH_FIND,
							NULL);

			if (freeEntry == NULL)
			{
				PersistentStore_DiagnoseDumpTable(storeData, storeSharedData);
				ereport(WARNING,
						(errcode(ERRCODE_INTERNAL_ERROR),
						 errmsg("free list validation failed"),
						 errdetail(
							 "did not find free entry for free TID %s ("
							 "free order number " INT64_FORMAT ") for '%s'",
							 ItemPointerToString(&walkTid),
							 orderNum,
							 storeData->tableName)));
				return false;
			}

			if (PersistentStore_IsZeroTid(&freeEntry->previousFreeTid))
			{
				PersistentStore_DiagnoseDumpTable(storeData, storeSharedData);
				ereport(WARNING,
						(errcode(ERRCODE_INTERNAL_ERROR),
						 errmsg("free list validation failed"),
						 errdetail(
							 "previous free TID not expected to be (0,0) -- "
							 "persistent Free Entry hashtable corrupted for "
							 "'%s' (expected free order number " INT64_FORMAT
							 ", entry free order number " INT64_FORMAT ")",
							 storeData->tableName,
							 orderNum,
							 freeEntry->freeOrderNum)));
				return false;
			}

			if (orderNum != freeEntry->freeOrderNum)
			{
				PersistentStore_DiagnoseDumpTable(storeData, storeSharedData);
				ereport(WARNING,
						(errcode(ERRCODE_INTERNAL_ERROR),
						 errmsg("free list validation failed"),
						 errdetail(
							 "free entry for free TID %s has wrong free "
							 "order number (expected free order number "
							 INT64_FORMAT ", found free order number "
							 INT64_FORMAT ") for '%s'",
							 ItemPointerToString(&walkTid),
							 orderNum,
							 freeEntry->freeOrderNum,
							 storeData->tableName)));
				return false;
			}

			if (Debug_persistent_recovery_print)
				elog(PersistentRecovery_DebugPrintLevel(),
					 "PersistentStore_InitScanVerifyFreeEntries ('%s'): Free order"
					 " number " INT64_FORMAT ", free TID %s, previous free TID %s",
					 storeData->tableName,
					 orderNum,
					 ItemPointerToString(&walkTid),
					 ItemPointerToString2(&freeEntry->previousFreeTid));

			/* Step to the previous link of the chain. */
			walkTid = freeEntry->previousFreeTid;
			Insist(!PersistentStore_IsZeroTid(&walkTid));	/* see the zero-TID check above */

			/*
			 * The last free entry (order number 1) uses its own TID as its
			 * previous-free TID.
			 */
			if (orderNum == 1 &&
				ItemPointerCompare(&freeEntry->key.persistentTid, &walkTid) != 0)
			{
				PersistentStore_DiagnoseDumpTable(storeData, storeSharedData);
				ereport(WARNING,
						(errcode(ERRCODE_INTERNAL_ERROR),
						 errmsg("free list validation failed"),
						 errdetail(
							 "expected previous_free_tid %s to match the"
							 " persistent TID %s for the last free entry"
							 " (free order number 1) for '%s'",
							 ItemPointerToString(&walkTid),
							 ItemPointerToString2(&freeEntry->key.persistentTid),
							 storeData->tableName)));
				return false;
			}
		}
	}
	else if (!PersistentStore_IsZeroTid(&walkTid))
	{
		/* No free entries were recorded, so the free TID must be zero too. */
		PersistentStore_DiagnoseDumpTable(storeData, storeSharedData);
		ereport(WARNING,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("free list validation failed"),
				 errdetail("expected free TID to be (0,0) when "
						   "free order number is 0 in '%s'",
						   storeData->tableName)));
		return false;
	}

	if (Debug_persistent_recovery_print)
		elog(PersistentRecovery_DebugPrintLevel(),
			 "PersistentStore_InitScanVerifyFreeEntries ('%s'): successfully "
			 "verified " INT64_FORMAT " free entries",
			 storeData->tableName,
			 storeSharedData->maxFreeOrderNum);

	return true;
}