/*
 * calculate size of database in all tablespaces
 *
 * Scans every row of the tablespace catalog, skips the global tablespace
 * (shared relations are not counted), builds the path of this database's
 * directory inside each remaining tablespace and sums db_dir_size() over
 * those paths.
 *
 * dbOid: OID of the database to measure.
 *
 * Returns the accumulated size.  Errors out via aclcheck_error() when the
 * current user lacks CONNECT privilege on the database, and raises
 * ERRCODE_UNDEFINED_DATABASE when the accumulated size is zero.
 */
static int64
calculate_database_size(Oid dbOid)
{
	int64		totalsize;
	char		pathname[MAXPGPATH];
	Relation	rel;
	HeapScanDesc scandesc;
	HeapTuple	tuple;
	AclResult	aclresult;

	/* User must have connect privilege for target database */
	aclresult = pg_database_aclcheck(dbOid, GetUserId(), ACL_CONNECT);
	if (aclresult != ACLCHECK_OK)
		aclcheck_error(aclresult, ACL_KIND_DATABASE,
					   get_database_name(dbOid));

	/* Scan through all tablespaces */
	rel = heap_open(TableSpaceRelationId, AccessShareLock);
	scandesc = heap_beginscan(rel, SnapshotNow, 0, NULL);
	tuple = heap_getnext(scandesc, ForwardScanDirection);
	totalsize = 0;

	while (HeapTupleIsValid(tuple))
	{
		Oid			tsOid = HeapTupleGetOid(tuple);

		/* Don't include shared relations */
		if (tsOid != GLOBALTABLESPACE_OID)
		{
			char	   *priFilespace = NULL;
			char	   *mirFilespace = NULL;

			/* Find the filespace path for this tablespace */
			PersistentTablespace_GetPrimaryAndMirrorFilespaces(
															   tsOid,
															   &priFilespace,
															   &mirFilespace);

			/* Build the path for this database in this tablespace */
			FormDatabasePath(pathname, priFilespace, tsOid, dbOid);

			/*
			 * The path has been copied into pathname, so the filespace
			 * strings are no longer needed.  Release them here instead of
			 * accumulating one pair per tablespace.  Either pointer may be
			 * NULL, hence the guards.
			 */
			if (priFilespace != NULL)
				pfree(priFilespace);
			if (mirFilespace != NULL)
				pfree(mirFilespace);

			totalsize += db_dir_size(pathname);
		}

		tuple = heap_getnext(scandesc, ForwardScanDirection);
	}

	heap_endscan(scandesc);
	heap_close(rel, AccessShareLock);

	/* Complain if we found no trace of the DB at all */
	if (totalsize == 0)
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_DATABASE),
				 errmsg("database with OID %u does not exist", dbOid)));

	return totalsize;
}
/*
 * Open a relation during XLOG replay
 *
 * Looks the relfilenode up in the replay-time relation cache.  A cached
 * descriptor is unlinked from its current position in the recency list; a
 * missing one is built from scratch, entered into the cache and its storage
 * is opened (and created if absent).  In both cases the descriptor is then
 * relinked next to _xlrelarr[0] and its embedded relation data is returned.
 *
 * Note: this once had an API that allowed NULL return on failure, but it
 * no longer does; any failure results in elog().
 */
Relation
XLogOpenRelation(RelFileNode rnode)
{
	XLogRelDesc *desc;
	XLogRelCacheEntry *entry;
	bool		alreadyPresent;

	entry = (XLogRelCacheEntry *)
		hash_search(_xlrelcache, (void *) &rnode, HASH_FIND, NULL);

	if (entry != NULL)
	{
		/* Cache hit: detach the descriptor from its neighbours. */
		desc = entry->rdesc;

		desc->lessRecently->moreRecently = desc->moreRecently;
		desc->moreRecently->lessRecently = desc->lessRecently;
	}
	else
	{
		MirrorDataLossTrackingState mirrorDataLossTrackingState;
		int64		mirrorDataLossTrackingSessionNum;
		bool		mirrorDataLossOccurred;

		/*
		 * We need to fault in the database directory on the standby.
		 */
		if (rnode.spcNode != GLOBALTABLESPACE_OID && IsStandbyMode())
		{
			char	   *primaryLocation = NULL;
			char	   *databasePath;
			int			mkdirResult;

			/* Built-in tablespaces have no filespace to fetch. */
			if (!IsBuiltinTablespace(rnode.spcNode))
			{
				char	   *mirrorLocation = NULL;

				PersistentTablespace_GetPrimaryAndMirrorFilespaces(
																   rnode.spcNode,
																   &primaryLocation,
																   &mirrorLocation);

				if (primaryLocation == NULL ||
					strlen(primaryLocation) == 0)
				{
					elog(ERROR, "Empty primary filespace directory location");
				}

				/* Only the primary location is used below. */
				if (mirrorLocation != NULL)
				{
					pfree(mirrorLocation);
					mirrorLocation = NULL;
				}
			}

			databasePath = (char *) palloc(MAXPGPATH + 1);

			FormDatabasePath(databasePath,
							 primaryLocation,
							 rnode.spcNode,
							 rnode.dbNode);

			if (primaryLocation != NULL)
			{
				pfree(primaryLocation);
				primaryLocation = NULL;
			}

			/*
			 * Create the directory.  It is allowed to already exist; any
			 * other failure is fatal for this operation.  errno is examined
			 * directly after mkdir() so that %m reports the mkdir failure.
			 */
			mkdirResult = mkdir(databasePath, 0700);
			if (mkdirResult != 0 && errno != EEXIST)
			{
				elog(ERROR, "could not create database directory \"%s\": %m",
					 databasePath);
			}

			if (Debug_persistent_recovery_print)
			{
				if (mkdirResult == 0)
					elog(PersistentRecovery_DebugPrintLevel(),
						 "XLogOpenRelation: Re-created database directory \"%s\"",
						 databasePath);
				else
					elog(PersistentRecovery_DebugPrintLevel(),
						 "XLogOpenRelation: Database directory \"%s\" already exists",
						 databasePath);
			}

			pfree(databasePath);
		}

		/* Build a fresh dummy relation descriptor named after relNode. */
		desc = _xl_new_reldesc();

		sprintf(RelationGetRelationName(&(desc->reldata)), "%u", rnode.relNode);

		desc->reldata.rd_node = rnode;

		/*
		 * Fill in lockRelId in case anything tries to lock the dummy
		 * relation.  This is fairly bogus because relNode may differ from
		 * the relation's OID, but it should not matter: we are presumably
		 * running alone and cannot have lock conflicts.
		 */
		desc->reldata.rd_lockInfo.lockRelId.dbId = rnode.dbNode;
		desc->reldata.rd_lockInfo.lockRelId.relId = rnode.relNode;

		entry = (XLogRelCacheEntry *)
			hash_search(_xlrelcache, (void *) &rnode, HASH_ENTER, &alreadyPresent);

		if (alreadyPresent)
			elog(PANIC, "xlog relation already present on insert into cache");

		entry->rdesc = desc;

		desc->reldata.rd_targblock = InvalidBlockNumber;
		desc->reldata.rd_smgr = NULL;
		RelationOpenSmgr(&(desc->reldata));

		/*
		 * Create the target file if it doesn't already exist.  This lets us
		 * cope if the replay sequence contains writes to a relation that is
		 * later deleted.  (The original coding of this routine would
		 * instead return NULL, causing the writes to be suppressed.  But
		 * that risks losing valuable data if the filesystem loses an inode
		 * during a crash.  Better to write the data until we are actually
		 * told to delete the file.)
		 *
		 * NOTE: We no longer re-create files automatically because new
		 * FileRep persistent objects will ensure files exist.
		 *
		 * UNDONE: Can't remove this code yet until boot time calls to this
		 * routine are analyzed.
		 * UNDONE: What about the persistent rel files table?
		 * UNDONE: This condition should not occur anymore.
		 * UNDONE: segmentFileNum and AO?
		 */
		mirrorDataLossTrackingState =
			FileRepPrimary_GetMirrorDataLossTrackingSessionNum(
															   &mirrorDataLossTrackingSessionNum);

		smgrcreate(desc->reldata.rd_smgr,
				   desc->reldata.rd_isLocalBuf,
				   /* relationName */ NULL,	/* Ok to be NULL -- we don't know the name here. */
				   mirrorDataLossTrackingState,
				   mirrorDataLossTrackingSessionNum,
				   /* ignoreAlreadyExists */ true,
				   &mirrorDataLossOccurred);
	}

	/* (Re)link the descriptor next to _xlrelarr[0]. */
	desc->moreRecently = &(_xlrelarr[0]);
	desc->lessRecently = _xlrelarr[0].lessRecently;
	_xlrelarr[0].lessRecently = desc;
	desc->lessRecently->moreRecently = desc;

	/* Assert what it says in the interface -- we don't return NULL anymore. */
	Assert(&(desc->reldata) != NULL);

	return &(desc->reldata);
}
/* * gp_persistent_relation_node_check() * * Reads the physical filesystem for every defined filespace and returns the * list of relfilenodes that actually exist. This list should match the set of * relfilenodes tracked in gp_persistent_relation_node. */ Datum gp_persistent_relation_node_check(PG_FUNCTION_ARGS) { FuncCallContext *fcontext; node_check_data *fdata; ReturnSetInfo *rsinfo; MemoryContext oldcontext; Oid relfilenode = InvalidOid; int32 segnum = 0; HeapTuple tuple; char *primaryPath = NULL; char *mirrorPath = NULL; if (SRF_IS_FIRSTCALL()) { Relation rel; TupleDesc tupdesc; fcontext = SRF_FIRSTCALL_INIT(); rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; /* * The fdata cannot be allocated in the multi_call_ctx because the * multi_call_context gets cleaned up by the MultiFuncCall callback * function which gets called before the callback this function * registers to cleanup the fdata structure. So instead we allocate * in the parent context fn_mcxt. */ oldcontext = MemoryContextSwitchTo(fcinfo->flinfo->fn_mcxt); fdata = (node_check_data*) palloc0(sizeof(node_check_data)); fcontext->user_fctx = fdata; /* * Register a call to cleanup when the function ends. 
*/ RegisterExprContextCallback(rsinfo->econtext, nodeCheckCleanup, PointerGetDatum(fdata)); /* * Setup the main loop over the list of tablespaces */ fdata->tablespaceRelation = heap_open(TableSpaceRelationId, AccessShareLock); fdata->scandesc = heap_beginscan(fdata->tablespaceRelation, SnapshotNow, 0, NULL); /* * Bless a tuple descriptor for the return type */ MemoryContextSwitchTo(fcontext->multi_call_memory_ctx); rel = RelationIdGetRelation(GpPersistentRelationNodeRelationId); tupdesc = RelationGetDescr(rel); fcontext->tuple_desc = BlessTupleDesc(tupdesc); relation_close(rel, NoLock); MemoryContextSwitchTo(oldcontext); } fcontext = SRF_PERCALL_SETUP(); fdata = fcontext->user_fctx; /* * The code here is basically a nested loop that has been unwound so that * it can be wrapped up into a set-returning function. * * Basic structure is: * - Loop over tablespace relation * - Loop over database directories in the tablespace * - Loop over relfilenodes in the directory * - Return each tablespace, database, relfilenode, segment_number. * * The complicating factor is that we return from this function and * reenter the loop at the innermost level, so the entire loop is turned * inside-out. 
*/ while (true) { /* Innermost loop */ if (fdata->databaseDir) { struct dirent *dent; Datum values[Natts_gp_persistent_relation_node]; bool nulls[Natts_gp_persistent_relation_node]; dent = ReadDir(fdata->databaseDir, fdata->databaseDirName); if (!dent) { /* step out of innermost loop */ FreeDir(fdata->databaseDir); fdata->databaseDir = NULL; continue; } /* skip the boring stuff */ if (strcmp(dent->d_name, ".") == 0 || strcmp(dent->d_name, "..") == 0) continue; /* Skip things that don't look like relfilenodes */ if (!strToRelfilenode(dent->d_name, &relfilenode, &segnum)) continue; /* Return relfilenodes as we find them */ MemSet(nulls, true, sizeof(nulls)); nulls[Anum_gp_persistent_relation_node_tablespace_oid-1] = false; nulls[Anum_gp_persistent_relation_node_database_oid-1] = false; nulls[Anum_gp_persistent_relation_node_relfilenode_oid-1] = false; nulls[Anum_gp_persistent_relation_node_segment_file_num-1] = false; values[Anum_gp_persistent_relation_node_tablespace_oid-1] = ObjectIdGetDatum(fdata->tablespaceOid); values[Anum_gp_persistent_relation_node_database_oid-1] = ObjectIdGetDatum(fdata->databaseOid); values[Anum_gp_persistent_relation_node_relfilenode_oid-1] = ObjectIdGetDatum(relfilenode); values[Anum_gp_persistent_relation_node_segment_file_num-1] = Int32GetDatum(segnum); tuple = heap_form_tuple(fcontext->tuple_desc, values, nulls); SRF_RETURN_NEXT(fcontext, HeapTupleGetDatum(tuple)); } /* Loop over database directories in the tablespace */ if (fdata->tablespaceDir) { struct dirent *dent; dent = ReadDir(fdata->tablespaceDir, fdata->tablespaceDirName); if (!dent) { /* step out of database loop */ FreeDir(fdata->tablespaceDir); fdata->tablespaceDir = NULL; continue; } /* skip the borring stuff */ if (strcmp(dent->d_name, ".") == 0 || strcmp(dent->d_name, "..") == 0) continue; /* Skip things that don't look like database oids */ if (strlen(dent->d_name) != strspn(dent->d_name, "0123456789")) continue; /* convert the string to an oid */ fdata->databaseOid = 
pg_atoi(dent->d_name, 4, 0); /* form a database path using this oid */ snprintf(fdata->databaseDirName, MAXPGPATH, "%s/%s", fdata->tablespaceDirName, dent->d_name); oldcontext = MemoryContextSwitchTo(fcontext->multi_call_memory_ctx); fdata->databaseDir = AllocateDir(fdata->databaseDirName); MemoryContextSwitchTo(oldcontext); if (fdata->databaseDir == NULL) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", fdata->databaseDirName))); continue; } /* Outermost loop over tablespaces */ tuple = heap_getnext(fdata->scandesc, ForwardScanDirection); if (!HeapTupleIsValid(tuple)) SRF_RETURN_DONE(fcontext); /* FINAL return */ fdata->tablespaceOid = HeapTupleGetOid(tuple); PersistentTablespace_GetPrimaryAndMirrorFilespaces( fdata->tablespaceOid, &primaryPath, &mirrorPath); /* Find the location of this tablespace on disk */ FormTablespacePath(fdata->tablespaceDirName, primaryPath, fdata->tablespaceOid); /* * Primary path is null for the pg_system filespace, additionally * Mirror path is null if there are no mirrors */ if (primaryPath) { pfree(primaryPath); primaryPath = NULL; } if (mirrorPath) { pfree(mirrorPath); mirrorPath = NULL; } oldcontext = MemoryContextSwitchTo(fcontext->multi_call_memory_ctx); fdata->tablespaceDir = AllocateDir(fdata->tablespaceDirName); MemoryContextSwitchTo(oldcontext); if (fdata->tablespaceDir == NULL) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", fdata->tablespaceDirName))); /* The global tablespace doesn't have database directories */ if (fdata->tablespaceOid == GLOBALTABLESPACE_OID) { fdata->databaseOid = 0; /* Skip to the innermost loop */ fdata->databaseDir = fdata->tablespaceDir; fdata->tablespaceDir = NULL; strncpy(fdata->databaseDirName, fdata->tablespaceDirName, MAXPGPATH); } } /* Unreachable */ SRF_RETURN_DONE(fcontext); }
/* * Open a relation for mirrored write. */ static void MirroredBufferPool_DoOpen( MirroredBufferPoolOpen *open, /* The resulting open struct. */ RelFileNode *relFileNode, /* The tablespace, database, and relation OIDs for the open. */ uint32 segmentFileNum, /* Which segment file. */ char *relationName, /* For tracing only. Can be NULL in some execution paths. */ MirrorDataLossTrackingState mirrorDataLossTrackingState, int64 mirrorDataLossTrackingSessionNum, bool create, bool mirrorOnly, bool copyToMirror, int *primaryError, bool *mirrorDataLossOccurred) { int fileFlags = O_RDWR | PG_BINARY; int fileMode = 0600; /* * File mode is S_IRUSR 00400 user has read permission * + S_IWUSR 00200 user has write permission */ char *primaryFilespaceLocation = NULL; char *mirrorFilespaceLocation = NULL; Assert(open != NULL); *primaryError = 0; *mirrorDataLossOccurred = false; if (create) fileFlags = O_CREAT | O_RDWR | PG_BINARY; PersistentTablespace_GetPrimaryAndMirrorFilespaces( relFileNode->spcNode, &primaryFilespaceLocation, &mirrorFilespaceLocation); if (Debug_persistent_print && (create || mirrorOnly || copyToMirror)) { SUPPRESS_ERRCONTEXT_DECLARE; SUPPRESS_ERRCONTEXT_PUSH(); elog(Persistent_DebugPrintLevel(), "MirroredBufferPool_DoOpen: Special open %u/%u/%u --> create %s, mirrorOnly %s, copyToMirror %s, " "primary filespace location %s " "mirror filespace location %s ", relFileNode->spcNode, relFileNode->dbNode, relFileNode->relNode, (create ? "true" : "false"), (mirrorOnly ? "true" : "false"), (copyToMirror ? "true" : "false"), (primaryFilespaceLocation == NULL) ? "<null>" : primaryFilespaceLocation, (mirrorFilespaceLocation == NULL) ? 
"<null>" : mirrorFilespaceLocation); SUPPRESS_ERRCONTEXT_POP(); } MemSet(open, 0, sizeof(MirroredBufferPoolOpen)); open->primaryFile = -1; if (mirrorFilespaceLocation == NULL) sprintf(open->mirrorFilespaceLocation, "%s", ""); else sprintf(open->mirrorFilespaceLocation, "%s", mirrorFilespaceLocation); open->relFileNode = *relFileNode; open->segmentFileNum = segmentFileNum; open->create = create; open->mirrorOnly = mirrorOnly; open->copyToMirror = copyToMirror; MirroredBufferPool_SetUpMirrorAccess( relFileNode, segmentFileNum, relationName, mirrorDataLossTrackingState, mirrorDataLossTrackingSessionNum, /* primaryOnly */ false, mirrorOnly, &open->mirrorMode, &open->mirrorDataLossOccurred); if (StorageManagerMirrorMode_DoPrimaryWork(open->mirrorMode)) { char *dbPath; char *path; dbPath = (char*)palloc(MAXPGPATH + 1); path = (char*)palloc(MAXPGPATH + 1); /* * Do the primary work first so we don't leave files on the mirror or have an * open to clean up. */ FormDatabasePath( dbPath, primaryFilespaceLocation, relFileNode->spcNode, relFileNode->dbNode); if (segmentFileNum == 0) sprintf(path, "%s/%u", dbPath, relFileNode->relNode); else sprintf(path, "%s/%u.%u", dbPath, relFileNode->relNode, segmentFileNum); errno = 0; open->primaryFile = PathNameOpenFile(path, fileFlags, fileMode); if (open->primaryFile < 0) { *primaryError = errno; } pfree(dbPath); pfree(path); } if (StorageManagerMirrorMode_SendToMirror(open->mirrorMode) && *primaryError == 0 && !open->mirrorDataLossOccurred) { if (FileRepPrimary_MirrorOpen( FileRep_GetRelationIdentifier( open->mirrorFilespaceLocation, open->relFileNode, open->segmentFileNum), FileRepRelationTypeBufferPool, FILEREP_OFFSET_UNDEFINED, fileFlags, fileMode, TRUE /* supressError */) != 0) { if (Debug_filerep_print) ereport(LOG, (errmsg("could not sent file open request to mirror "), FileRep_ReportRelationPath( open->mirrorFilespaceLocation, open->relFileNode, open->segmentFileNum))); } open->mirrorDataLossOccurred = 
FileRepPrimary_IsMirrorDataLossOccurred(); } if (*primaryError != 0) { open->isActive = false; } else if (StorageManagerMirrorMode_DoPrimaryWork(open->mirrorMode)) { open->isActive = true; } else if (StorageManagerMirrorMode_SendToMirror(open->mirrorMode) && !open->mirrorDataLossOccurred) { open->isActive = true; } *mirrorDataLossOccurred = open->mirrorDataLossOccurred; if (primaryFilespaceLocation != NULL) pfree(primaryFilespaceLocation); if (mirrorFilespaceLocation != NULL) pfree(mirrorFilespaceLocation); }
/*
 * The Background writer (for example) might have an open in the primary when
 * the mirror was down.  Later when the mirror comes up we need to recognize
 * that and send new writes there....
 *
 * Re-evaluates open->mirrorMode and open->mirrorDataLossOccurred from the
 * current mirror data loss tracking state.  If the open has no mirror
 * filespace location recorded yet, the tablespace is neither the global nor
 * the default one, and the mirror is currently up (in sync or in resync),
 * the mirror filespace location is looked up, stored in the open struct and
 * a config log entry is written.
 */
static void
MirroredBufferPool_RecheckMirrorAccess(
	MirroredBufferPoolOpen *open)
				/* The resulting open struct. */
{
	MirrorDataLossTrackingState mirrorDataLossTrackingState;
	int64		mirrorDataLossTrackingSessionNum;

	/*
	 * Make this call while under the MirroredLock (unless we are a resync
	 * worker).
	 */
	mirrorDataLossTrackingState =
		FileRepPrimary_GetMirrorDataLossTrackingSessionNum(
														   &mirrorDataLossTrackingSessionNum);

	MirroredBufferPool_SetUpMirrorAccess(
										 &open->relFileNode,
										 open->segmentFileNum,
										 /* relationName */ NULL,	/* Ok to be NULL -- we don't know the name here. */
										 mirrorDataLossTrackingState,
										 mirrorDataLossTrackingSessionNum,
										 /* primaryOnly */ false,
										 open->mirrorOnly,
										 &open->mirrorMode,
										 &open->mirrorDataLossOccurred);

	/*
	 * mirror filespace location has to be populated for
	 *		a) adding mirror with filespaces
	 *		b) resynchronization with filespaces and full copy to new location
	 */
	if (open->relFileNode.spcNode != GLOBALTABLESPACE_OID &&
		open->relFileNode.spcNode != DEFAULTTABLESPACE_OID &&
		strcmp(open->mirrorFilespaceLocation, "") == 0)
	{
		if (mirrorDataLossTrackingState == MirrorDataLossTrackingState_MirrorCurrentlyUpInSync ||
			mirrorDataLossTrackingState == MirrorDataLossTrackingState_MirrorCurrentlyUpInResync)
		{
			char	   *primaryFilespaceLocation = NULL;
			char	   *mirrorFilespaceLocation = NULL;

			PersistentTablespace_GetPrimaryAndMirrorFilespaces(
															   open->relFileNode.spcNode,
															   &primaryFilespaceLocation,
															   &mirrorFilespaceLocation);

			if (mirrorFilespaceLocation != NULL)
			{
				char		tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

				/*
				 * Bounded copy: never write past the end of the struct
				 * member, however long the location string is.
				 */
				snprintf(open->mirrorFilespaceLocation,
						 sizeof(open->mirrorFilespaceLocation),
						 "%s", mirrorFilespaceLocation);

				snprintf(tmpBuf, sizeof(tmpBuf),
						 "recheck mirror access, identifier '%s/%u/%u/%u' ",
						 open->mirrorFilespaceLocation,
						 open->relFileNode.spcNode,
						 open->relFileNode.dbNode,
						 open->relFileNode.relNode);

				FileRep_InsertConfigLogEntry(tmpBuf);
			}

			if (primaryFilespaceLocation != NULL)
				pfree(primaryFilespaceLocation);

			if (mirrorFilespaceLocation != NULL)
				pfree(mirrorFilespaceLocation);
		}
	}
}
/* * Mirrored drop. */ static void MirroredBufferPool_DoDrop( RelFileNode *relFileNode, /* The tablespace, database, and relation OIDs for the open. */ uint32 segmentFileNum, /* Which segment file. */ char *relationName, /* For tracing only. Can be NULL in some execution paths. */ MirrorDataLossTrackingState mirrorDataLossTrackingState, int64 mirrorDataLossTrackingSessionNum, bool primaryOnly, bool mirrorOnly, int *primaryError, bool *mirrorDataLossOccurred) { StorageManagerMirrorMode mirrorMode; char *primaryFilespaceLocation; char *mirrorFilespaceLocation; *primaryError = 0; *mirrorDataLossOccurred = false; MirroredBufferPool_SetUpMirrorAccess( relFileNode, segmentFileNum, relationName, mirrorDataLossTrackingState, mirrorDataLossTrackingSessionNum, primaryOnly, mirrorOnly, &mirrorMode, mirrorDataLossOccurred); PersistentTablespace_GetPrimaryAndMirrorFilespaces( relFileNode->spcNode, &primaryFilespaceLocation, &mirrorFilespaceLocation); if (StorageManagerMirrorMode_SendToMirror(mirrorMode) && !*mirrorDataLossOccurred) { if (FileRepPrimary_MirrorDrop( FileRep_GetRelationIdentifier( mirrorFilespaceLocation, *relFileNode, segmentFileNum), FileRepRelationTypeBufferPool) != 0) { if (Debug_filerep_print) ereport(LOG, (errmsg("could not sent file drop request to mirror "), FileRep_ReportRelationPath( mirrorFilespaceLocation, *relFileNode, segmentFileNum))); } *mirrorDataLossOccurred = FileRepPrimary_IsMirrorDataLossOccurred(); } if (StorageManagerMirrorMode_DoPrimaryWork(mirrorMode)) { char *dbPath; char *path; dbPath = (char*)palloc(MAXPGPATH + 1); path = (char*)palloc(MAXPGPATH + 1); FormDatabasePath( dbPath, primaryFilespaceLocation, relFileNode->spcNode, relFileNode->dbNode); if (segmentFileNum == 0) sprintf(path, "%s/%u", dbPath, relFileNode->relNode); else sprintf(path, "%s/%u.%u", dbPath, relFileNode->relNode, segmentFileNum); errno = 0; if (unlink(path) < 0) { *primaryError = errno; } pfree(dbPath); pfree(path); } if 
(StorageManagerMirrorMode_SendToMirror(mirrorMode) && !*mirrorDataLossOccurred) { if (FileRepPrimary_IsOperationCompleted( FileRep_GetRelationIdentifier( mirrorFilespaceLocation, *relFileNode, segmentFileNum), FileRepRelationTypeBufferPool) == FALSE) { if (Debug_filerep_print) ereport(LOG, (errmsg("could not drop file on mirror "), FileRep_ReportRelationPath( mirrorFilespaceLocation, *relFileNode, segmentFileNum))); } *mirrorDataLossOccurred = FileRepPrimary_IsMirrorDataLossOccurred(); } if (primaryFilespaceLocation != NULL) pfree(primaryFilespaceLocation); if (mirrorFilespaceLocation != NULL) pfree(mirrorFilespaceLocation); }