/*
 * Read and validate the two-phase state file for the given transaction.
 *
 * On success, return a palloc'd buffer holding the complete file contents;
 * the caller is responsible for freeing it.  If the file cannot be opened,
 * has an implausible size, or fails the magic-number/length/CRC checks,
 * return NULL instead (a WARNING is emitted only for I/O-level failures;
 * validation failures are silent).
 */
static char *
ReadTwoPhaseFile(TransactionId xid)
{
	char		path[MAXPGPATH];
	char	   *contents;
	TwoPhaseFileHeader *hdr;
	int			fd;
	struct stat statbuf;
	uint32		crc_offset;
	pg_crc32	calc_crc;
	pg_crc32	file_crc;

	TwoPhaseFilePath(path, xid);

	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
	if (fd < 0)
	{
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not open two-phase state file \"%s\": %m",
						path)));
		return NULL;
	}

	/*
	 * Sanity-check the file length.  It must hold at least a header, one
	 * on-disk record, and the trailing CRC; and we reject absurdly large
	 * files so that a corrupt length can't drive palloc() into failure.
	 */
	if (fstat(fd, &statbuf))
	{
		close(fd);
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not stat two-phase state file \"%s\": %m",
						path)));
		return NULL;
	}

	if (statbuf.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) +
						   MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) +
						   sizeof(pg_crc32)) ||
		statbuf.st_size > 10000000)
	{
		close(fd);
		return NULL;
	}

	/* The CRC trails the MAXALIGN'd payload; reject misaligned sizes. */
	crc_offset = statbuf.st_size - sizeof(pg_crc32);
	if (crc_offset != MAXALIGN(crc_offset))
	{
		close(fd);
		return NULL;
	}

	/* Slurp the whole file into memory. */
	contents = (char *) palloc(statbuf.st_size);

	if (read(fd, contents, statbuf.st_size) != statbuf.st_size)
	{
		close(fd);
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not read two-phase state file \"%s\": %m",
						path)));
		pfree(contents);
		return NULL;
	}

	close(fd);

	/* Header must carry the right magic and a length that matches stat. */
	hdr = (TwoPhaseFileHeader *) contents;
	if (hdr->magic != TWOPHASE_MAGIC || hdr->total_len != statbuf.st_size)
	{
		pfree(contents);
		return NULL;
	}

	/* Verify the CRC over everything preceding the stored checksum. */
	INIT_CRC32(calc_crc);
	COMP_CRC32(calc_crc, contents, crc_offset);
	FIN_CRC32(calc_crc);

	file_crc = *((pg_crc32 *) (contents + crc_offset));

	if (!EQ_CRC32(calc_crc, file_crc))
	{
		pfree(contents);
		return NULL;
	}

	return contents;
}
/*
 * pgreadlink - readlink() emulation for Windows, implemented with Win32
 * junction points (mount-point reparse points).
 *
 * On success, fills 'buf' with the junction's target path (converted to the
 * ANSI code page, with any leading "\??\" NT-namespace prefix stripped) and
 * returns the result length reported by WideCharToMultiByte.  On failure,
 * returns -1 with errno set (EINVAL for "not a junction" and for conversion
 * problems; otherwise mapped from the Win32 error via _dosmaperr).
 */
int
pgreadlink(const char *path, char *buf, size_t size)
{
	DWORD		attr;
	HANDLE		h;
	/* big enough for the reparse header plus a MAX_PATH wide-char target */
	char		buffer[MAX_PATH * sizeof(WCHAR) + sizeof(REPARSE_JUNCTION_DATA_BUFFER)];
	REPARSE_JUNCTION_DATA_BUFFER *reparseBuf = (REPARSE_JUNCTION_DATA_BUFFER *) buffer;
	DWORD		len;
	int			r;

	/* First verify the path exists and is a reparse point at all. */
	attr = GetFileAttributes(path);
	if (attr == INVALID_FILE_ATTRIBUTES)
	{
		_dosmaperr(GetLastError());
		return -1;
	}
	if ((attr & FILE_ATTRIBUTE_REPARSE_POINT) == 0)
	{
		errno = EINVAL;
		return -1;
	}

	/*
	 * Open the reparse point itself (FILE_FLAG_OPEN_REPARSE_POINT keeps
	 * CreateFile from following it; FILE_FLAG_BACKUP_SEMANTICS is required
	 * to open a directory handle).
	 */
	h = CreateFile(path,
				   GENERIC_READ,
				   FILE_SHARE_READ | FILE_SHARE_WRITE,
				   NULL,
				   OPEN_EXISTING,
				   FILE_FLAG_OPEN_REPARSE_POINT | FILE_FLAG_BACKUP_SEMANTICS,
				   0);
	if (h == INVALID_HANDLE_VALUE)
	{
		_dosmaperr(GetLastError());
		return -1;
	}

	/* Fetch the raw reparse data describing the junction target. */
	if (!DeviceIoControl(h,
						 FSCTL_GET_REPARSE_POINT,
						 NULL,
						 0,
						 (LPVOID) reparseBuf,
						 sizeof(buffer),
						 &len,
						 NULL))
	{
		LPSTR		msg;

		errno = 0;
		FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
					  FORMAT_MESSAGE_FROM_SYSTEM,
					  NULL, GetLastError(),
					  MAKELANGID(LANG_ENGLISH, SUBLANG_DEFAULT),
					  (LPSTR) &msg, 0, NULL);
#ifndef FRONTEND
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not get junction for \"%s\": %s",
						path, msg)));
#else
		fprintf(stderr, _("could not get junction for \"%s\": %s\n"),
				path, msg);
#endif
		LocalFree(msg);
		CloseHandle(h);
		errno = EINVAL;
		return -1;
	}
	CloseHandle(h);

	/* Got it, let's get some results from this */
	if (reparseBuf->ReparseTag != IO_REPARSE_TAG_MOUNT_POINT)
	{
		errno = EINVAL;
		return -1;
	}

	/*
	 * Convert the wide-char target into the caller's buffer.  NOTE(review):
	 * passing -1 as the source length makes WideCharToMultiByte include the
	 * terminating NUL in the returned count, so 'r' counts one byte more
	 * than readlink() semantics would — confirm whether callers rely on
	 * this.
	 */
	r = WideCharToMultiByte(CP_ACP, 0,
							reparseBuf->PathBuffer, -1,
							buf,
							size,
							NULL, NULL);

	if (r <= 0)
	{
		errno = EINVAL;
		return -1;
	}

	/*
	 * If the path starts with "\??\", which it will do in most (all?) cases,
	 * strip those out.
	 */
	if (r > 4 && strncmp(buf, "\\??\\", 4) == 0)
	{
		memmove(buf, buf + 4, strlen(buf + 4) + 1);
		r -= 4;
	}
	return r;
}
/*
 * Read 'nbytes' bytes from WAL into 'buf', starting at location 'recptr'
 *
 * Reads may span segment boundaries; this loop switches segment files as
 * needed.  The open-file state (sendFile, sendId, sendSeg, sendOff) lives
 * at file scope so the file stays open across calls — presumably these are
 * this module's per-connection cache; confirm against the declarations.
 *
 * Raises ERROR if a needed segment cannot be opened, read, or turns out to
 * have been removed/recycled while we were reading it.
 *
 * XXX probably this should be improved to suck data directly from the
 * WAL buffers when possible.
 */
static void
XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
{
	XLogRecPtr	startRecPtr = recptr;	/* remember where we started, for the post-check */
	char		path[MAXPGPATH];
	uint32		lastRemovedLog;
	uint32		lastRemovedSeg;
	uint32		log;
	uint32		seg;

	while (nbytes > 0)
	{
		uint32		startoff;
		int			segbytes;
		int			readbytes;

		/* byte offset of recptr within its segment */
		startoff = recptr.xrecoff % XLogSegSize;

		if (sendFile < 0 || !XLByteInSeg(recptr, sendId, sendSeg))
		{
			/* Switch to another logfile segment */
			if (sendFile >= 0)
				close(sendFile);

			XLByteToSeg(recptr, sendId, sendSeg);
			XLogFilePath(path, ThisTimeLineID, sendId, sendSeg);

			sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
			if (sendFile < 0)
			{
				/*
				 * If the file is not found, assume it's because the standby
				 * asked for a too old WAL segment that has already been
				 * removed or recycled.
				 */
				if (errno == ENOENT)
				{
					char		filename[MAXFNAMELEN];

					XLogFileName(filename, ThisTimeLineID, sendId, sendSeg);
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("requested WAL segment %s has already been removed",
									filename)));
				}
				else
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
									path, sendId, sendSeg)));
			}
			sendOff = 0;
		}

		/* Need to seek in the file? */
		if (sendOff != startoff)
		{
			if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
								sendId, sendSeg, startoff)));
			sendOff = startoff;
		}

		/* How many bytes are within this segment? */
		if (nbytes > (XLogSegSize - startoff))
			segbytes = XLogSegSize - startoff;
		else
			segbytes = nbytes;

		/* NOTE(review): readbytes == 0 (EOF) reports %m with stale errno */
		readbytes = read(sendFile, buf, segbytes);
		if (readbytes <= 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read from log file %u, segment %u, offset %u, "
							"length %lu: %m",
							sendId, sendSeg, sendOff, (unsigned long) segbytes)));

		/* Update state for read */
		XLByteAdvance(recptr, readbytes);

		sendOff += readbytes;
		nbytes -= readbytes;
		buf += readbytes;
	}

	/*
	 * After reading into the buffer, check that what we read was valid. We do
	 * this after reading, because even though the segment was present when we
	 * opened it, it might get recycled or removed while we read it. The
	 * read() succeeds in that case, but the data we tried to read might
	 * already have been overwritten with new WAL records.
	 */
	XLogGetLastRemoved(&lastRemovedLog, &lastRemovedSeg);
	XLByteToSeg(startRecPtr, log, seg);
	if (log < lastRemovedLog ||
		(log == lastRemovedLog && seg <= lastRemovedSeg))
	{
		char		filename[MAXFNAMELEN];

		XLogFileName(filename, ThisTimeLineID, log, seg);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("requested WAL segment %s has already been removed",
						filename)));
	}
}
/*
 * copydir: recursively copy the contents of one directory to another.
 *
 * Creates "todir" (erroring out if that fails), then copies every regular
 * file found in "fromdir" into it.  Subdirectories are descended into only
 * when "recurse" is true; anything that is neither a directory nor a
 * regular file is silently skipped.  Unless fsync is disabled, every copied
 * file and the destination directory itself are fsync'd before returning.
 */
void
copydir(char *fromdir, char *todir, bool recurse)
{
	DIR		   *dir;
	struct dirent *de;
	char		srcpath[MAXPGPATH];
	char		dstpath[MAXPGPATH];

	if (mkdir(todir, S_IRWXU) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create directory \"%s\": %m", todir)));

	dir = AllocateDir(fromdir);
	if (dir == NULL)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open directory \"%s\": %m", fromdir)));

	while ((de = ReadDir(dir, fromdir)) != NULL)
	{
		struct stat st;

		/* Abort promptly if a cancel request arrived mid-copy. */
		CHECK_FOR_INTERRUPTS();

		if (strcmp(de->d_name, ".") == 0 ||
			strcmp(de->d_name, "..") == 0)
			continue;

		snprintf(srcpath, MAXPGPATH, "%s/%s", fromdir, de->d_name);
		snprintf(dstpath, MAXPGPATH, "%s/%s", todir, de->d_name);

		if (lstat(srcpath, &st) < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not stat file \"%s\": %m", srcpath)));

		if (S_ISDIR(st.st_mode))
		{
			/* Descend into subdirectories only when asked to. */
			if (recurse)
				copydir(srcpath, dstpath, true);
		}
		else if (S_ISREG(st.st_mode))
			copy_file(srcpath, dstpath);
	}
	FreeDir(dir);

	/*
	 * Paranoia: fsync everything we copied so the copy is durably on disk.
	 * If fsync is disabled, we are done.
	 */
	if (!enableFsync)
		return;

	dir = AllocateDir(todir);
	if (dir == NULL)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open directory \"%s\": %m", todir)));

	while ((de = ReadDir(dir, todir)) != NULL)
	{
		struct stat st;

		if (strcmp(de->d_name, ".") == 0 ||
			strcmp(de->d_name, "..") == 0)
			continue;

		snprintf(dstpath, MAXPGPATH, "%s/%s", todir, de->d_name);

		/*
		 * Subdirectories were already synced by the recursive copydir
		 * before it returned, so only regular files need syncing here.
		 */
		if (lstat(dstpath, &st) < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not stat file \"%s\": %m", dstpath)));

		if (S_ISREG(st.st_mode))
			fsync_fname(dstpath, false);
	}
	FreeDir(dir);

	/*
	 * Individual file fsyncs don't guarantee that the directory entries for
	 * the files reach disk, so fsync the destination directory itself as
	 * well.  Recent ext4 widened the window, but ext3 and other filesystems
	 * have needed this historically.
	 */
	fsync_fname(todir, true);
}
/* * copy one file */ static void copy_file(char *fromfile, char *tofile) { char *buffer; int srcfd; int dstfd; int nbytes; /* Use palloc to ensure we get a maxaligned buffer */ #define COPY_BUF_SIZE (8 * BLCKSZ) buffer = palloc(COPY_BUF_SIZE); /* * Open the files */ srcfd = BasicOpenFile(fromfile, O_RDONLY | PG_BINARY, 0); if (srcfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", fromfile))); dstfd = BasicOpenFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR); if (dstfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tofile))); /* * Do the data copying. */ for (;;) { nbytes = read(srcfd, buffer, COPY_BUF_SIZE); if (nbytes < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", fromfile))); if (nbytes == 0) break; errno = 0; if ((int) write(dstfd, buffer, nbytes) != nbytes) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tofile))); } } /* * Be paranoid here to ensure we catch problems. */ if (pg_fsync(dstfd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tofile))); if (close(dstfd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tofile))); close(srcfd); pfree(buffer); }
/*
 * Attempt to retrieve the specified file from off-line archival storage.
 * If successful, fill "path" with its complete path (note that this will be
 * a temp file name that doesn't follow the normal naming convention), and
 * return TRUE.
 *
 * If not successful, fill "path" with the name of the normal on-line file
 * (which may or may not actually exist, but we'll try to use it), and return
 * FALSE.
 *
 * For fixed-size files, the caller may pass the expected size as an
 * additional crosscheck on successful recovery.  If the file size is not
 * known, set expectedSize = 0.
 *
 * When 'cleanupEnabled' is false, refrain from deleting any old WAL segments
 * in the archive.  This is used when fetching the initial checkpoint record,
 * when we are not yet sure how far back we need the WAL.
 */
bool
RestoreArchivedFile(char *path, const char *xlogfname,
					const char *recovername, off_t expectedSize,
					bool cleanupEnabled)
{
	char		xlogpath[MAXPGPATH];		/* temp target: XLOGDIR/recovername */
	char		xlogRestoreCmd[MAXPGPATH];	/* restore_command after % expansion */
	char		lastRestartPointFname[MAXPGPATH];	/* %r substitution value */
	char	   *dp;			/* write cursor into xlogRestoreCmd */
	char	   *endp;		/* one before end of xlogRestoreCmd */
	const char *sp;			/* read cursor over recoveryRestoreCommand */
	int			rc;
	bool		signaled;
	struct stat stat_buf;
	XLogSegNo	restartSegNo;
	XLogRecPtr	restartRedoPtr;
	TimeLineID	restartTli;

	/* In standby mode, restore_command might not be supplied */
	if (recoveryRestoreCommand == NULL)
		goto not_available;

	/*
	 * When doing archive recovery, we always prefer an archived log file even
	 * if a file of the same name exists in XLOGDIR.  The reason is that the
	 * file in XLOGDIR could be an old, un-filled or partly-filled version
	 * that was copied and restored as part of backing up $PGDATA.
	 *
	 * We could try to optimize this slightly by checking the local copy
	 * lastchange timestamp against the archived copy, but we have no API to
	 * do this, nor can we guarantee that the lastchange timestamp was
	 * preserved correctly when we copied to archive. Our aim is robustness,
	 * so we elect not to do this.
	 *
	 * If we cannot obtain the log file from the archive, however, we will try
	 * to use the XLOGDIR file if it exists.  This is so that we can make use
	 * of log segments that weren't yet transferred to the archive.
	 *
	 * Notice that we don't actually overwrite any files when we copy back
	 * from archive because the restore_command may inadvertently
	 * restore inappropriate xlogs, or they may be corrupt, so we may wish to
	 * fallback to the segments remaining in current XLOGDIR later. The
	 * copy-from-archive filename is always the same, ensuring that we don't
	 * run out of disk space on long recoveries.
	 */
	snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);

	/*
	 * Make sure there is no existing file named recovername.
	 */
	if (stat(xlogpath, &stat_buf) != 0)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not stat file \"%s\": %m",
							xlogpath)));
	}
	else
	{
		if (unlink(xlogpath) != 0)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not remove file \"%s\": %m",
							xlogpath)));
	}

	/*
	 * Calculate the archive file cutoff point for use during log shipping
	 * replication. All files earlier than this point can be deleted from the
	 * archive, though there is no requirement to do so.
	 *
	 * If cleanup is not enabled, initialise this with the filename of
	 * InvalidXLogRecPtr, which will prevent the deletion of any WAL files
	 * from the archive because of the alphabetic sorting property of WAL
	 * filenames.
	 *
	 * Once we have successfully located the redo pointer of the checkpoint
	 * from which we start recovery we never request a file prior to the redo
	 * pointer of the last restartpoint. When redo begins we know that we have
	 * successfully located it, so there is no need for additional status
	 * flags to signify the point when we can begin deleting WAL files from
	 * the archive.
	 */
	if (cleanupEnabled)
	{
		GetOldestRestartPoint(&restartRedoPtr, &restartTli);
		XLByteToSeg(restartRedoPtr, restartSegNo);
		XLogFileName(lastRestartPointFname, restartTli, restartSegNo);
		/* we shouldn't need anything earlier than last restart point */
		Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
	}
	else
		XLogFileName(lastRestartPointFname, 0, 0L);

	/*
	 * Construct the command to be executed: copy restore_command verbatim,
	 * expanding %p, %f, %r and %% escapes, truncating silently at MAXPGPATH.
	 */
	dp = xlogRestoreCmd;
	endp = xlogRestoreCmd + MAXPGPATH - 1;
	*endp = '\0';

	for (sp = recoveryRestoreCommand; *sp; sp++)
	{
		if (*sp == '%')
		{
			switch (sp[1])
			{
				case 'p':
					/* %p: relative path of target file */
					sp++;
					StrNCpy(dp, xlogpath, endp - dp);
					make_native_path(dp);
					dp += strlen(dp);
					break;
				case 'f':
					/* %f: filename of desired file */
					sp++;
					StrNCpy(dp, xlogfname, endp - dp);
					dp += strlen(dp);
					break;
				case 'r':
					/* %r: filename of last restartpoint */
					sp++;
					StrNCpy(dp, lastRestartPointFname, endp - dp);
					dp += strlen(dp);
					break;
				case '%':
					/* convert %% to a single % */
					sp++;
					if (dp < endp)
						*dp++ = *sp;
					break;
				default:
					/* otherwise treat the % as not special */
					if (dp < endp)
						*dp++ = *sp;
					break;
			}
		}
		else
		{
			if (dp < endp)
				*dp++ = *sp;
		}
	}
	*dp = '\0';

	ereport(DEBUG3,
			(errmsg_internal("executing restore command \"%s\"",
							 xlogRestoreCmd)));

	/*
	 * Check signals before restore command and reset afterwards.
	 */
	PreRestoreCommand();

	/*
	 * Copy xlog from archival storage to XLOGDIR
	 */
	rc = system(xlogRestoreCmd);

	PostRestoreCommand();

	if (rc == 0)
	{
		/*
		 * command apparently succeeded, but let's make sure the file is
		 * really there now and has the correct size.
		 */
		if (stat(xlogpath, &stat_buf) == 0)
		{
			if (expectedSize > 0 && stat_buf.st_size != expectedSize)
			{
				int			elevel;

				/*
				 * If we find a partial file in standby mode, we assume it's
				 * because it's just being copied to the archive, and keep
				 * trying.
				 *
				 * Otherwise treat a wrong-sized file as FATAL to ensure the
				 * DBA would notice it, but is that too strong?  We could try
				 * to plow ahead with a local copy of the file ... but the
				 * problem is that there probably isn't one, and we'd
				 * incorrectly conclude we've reached the end of WAL and we're
				 * done recovering ...
				 */
				if (StandbyMode && stat_buf.st_size < expectedSize)
					elevel = DEBUG1;
				else
					elevel = FATAL;
				ereport(elevel,
						(errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
								xlogfname,
								(unsigned long) stat_buf.st_size,
								(unsigned long) expectedSize)));
				return false;
			}
			else
			{
				ereport(LOG,
						(errmsg("restored log file \"%s\" from archive",
								xlogfname)));
				strcpy(path, xlogpath);
				return true;
			}
		}
		else
		{
			/* stat failed */
			if (errno != ENOENT)
				ereport(FATAL,
						(errcode_for_file_access(),
						 errmsg("could not stat file \"%s\": %m",
								xlogpath)));
		}
	}

	/*
	 * Remember, we rollforward UNTIL the restore fails so failure here is
	 * just part of the process... that makes it difficult to determine
	 * whether the restore failed because there isn't an archive to restore,
	 * or because the administrator has specified the restore program
	 * incorrectly.  We have to assume the former.
	 *
	 * However, if the failure was due to any sort of signal, it's best to
	 * punt and abort recovery.  (If we "return false" here, upper levels will
	 * assume that recovery is complete and start up the database!) It's
	 * essential to abort on child SIGINT and SIGQUIT, because per spec
	 * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
	 * those it's a good bet we should have gotten it too.
	 *
	 * On SIGTERM, assume we have received a fast shutdown request, and exit
	 * cleanly. It's pure chance whether we receive the SIGTERM first, or the
	 * child process. If we receive it first, the signal handler will call
	 * proc_exit, otherwise we do it here. If we or the child process received
	 * SIGTERM for any other reason than a fast shutdown request, postmaster
	 * will perform an immediate shutdown when it sees us exiting
	 * unexpectedly.
	 *
	 * Per the Single Unix Spec, shells report exit status > 128 when a called
	 * command died on a signal.  Also, 126 and 127 are used to report
	 * problems such as an unfindable command; treat those as fatal errors
	 * too.
	 */
	if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
		proc_exit(1);

	signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;

	ereport(signaled ? FATAL : DEBUG2,
			(errmsg("could not restore file \"%s\" from archive: return code %d",
					xlogfname, rc)));

not_available:

	/*
	 * if an archived file is not available, there might still be a version of
	 * this file in XLOGDIR, so return that as the filename to open.
	 *
	 * In many recovery scenarios we expect this to fail also, but if so that
	 * just means we've reached the end of WAL.
	 */
	snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
	return false;
}
/* * ParseWorkerNodeFile opens and parses the node name and node port from the * specified configuration file. The function relies on the file being at the * top level in the data directory. */ static List * ParseWorkerNodeFile(char *workerNodeFilename) { FILE *workerFileStream = NULL; List *workerNodeList = NIL; char workerNodeLine[MAXPGPATH]; char *workerFilePath = make_absolute_path(workerNodeFilename); char workerLinePattern[1024]; memset(workerLinePattern, '\0', sizeof(workerLinePattern)); workerFileStream = AllocateFile(workerFilePath, PG_BINARY_R); if (workerFileStream == NULL) { ereport(ERROR, (errcode_for_file_access(), errmsg("could not open worker list file \"%s\": %m", workerFilePath))); } /* build pattern to contain node name length limit */ snprintf(workerLinePattern, sizeof(workerLinePattern), "%%%us%%*[ \t]%%10u", MAX_NODE_LENGTH); while (fgets(workerNodeLine, sizeof(workerNodeLine), workerFileStream) != NULL) { WorkerNode *workerNode = NULL; char *linePointer = NULL; uint32 nodePort = 0; int parsedValues = 0; char nodeName[MAX_NODE_LENGTH + 1]; memset(nodeName, '\0', sizeof(nodeName)); if (strnlen(workerNodeLine, MAXPGPATH) == MAXPGPATH - 1) { ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("worker node list file line exceeds the maximum " "length of %d", MAXPGPATH))); } /* skip leading whitespace and check for # comment */ for (linePointer = workerNodeLine; *linePointer; linePointer++) { if (!isspace((unsigned char) *linePointer)) { break; } } if (*linePointer == '\0' || *linePointer == '#') { continue; } /* parse out the node name and node port */ parsedValues = sscanf(workerNodeLine, workerLinePattern, nodeName, &nodePort); if (parsedValues != 2) { ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not parse worker node line: %s", workerNodeLine), errhint("Lines in the worker node file consist of a node " "name and port separated by whitespace. 
Lines that " "start with a '#' character are skipped."))); } /* allocate worker node structure and set fields */ workerNode = (WorkerNode *) palloc0(sizeof(WorkerNode)); workerNode->nodeName = palloc(sizeof(char) * MAX_NODE_LENGTH + 1); strlcpy(workerNode->nodeName, nodeName, MAX_NODE_LENGTH + 1); workerNode->nodePort = nodePort; workerNodeList = lappend(workerNodeList, workerNode); } FreeFile(workerFileStream); free(workerFilePath); return workerNodeList; }
/* --------------------------------
 * InitPostgres
 *		Initialize POSTGRES.
 *
 * The database can be specified by name, using the in_dbname parameter, or by
 * OID, using the dboid parameter.  In the latter case, the actual database
 * name can be returned to the caller in out_dbname.  If out_dbname isn't
 * NULL, it must point to a buffer of size NAMEDATALEN.
 *
 * In bootstrap mode no parameters are used.  The autovacuum launcher process
 * doesn't use any parameters either, because it only goes far enough to be
 * able to read pg_database; it doesn't connect to any particular database.
 * In walsender mode only username is used.
 *
 * As of PostgreSQL 8.2, we expect InitProcess() was already called, so we
 * already have a PGPROC struct ... but it's not completely filled in yet.
 *
 * Note:
 *		Be very careful with the order of calls in the InitPostgres function.
 * --------------------------------
 */
void
InitPostgres(const char *in_dbname, Oid dboid, const char *username,
			 char *out_dbname)
{
	bool		bootstrap = IsBootstrapProcessingMode();
	bool		am_superuser;
	char	   *fullpath;
	char		dbname[NAMEDATALEN];

	elog(DEBUG3, "InitPostgres");

	/*
	 * Add my PGPROC struct to the ProcArray.
	 *
	 * Once I have done this, I am visible to other backends!
	 */
	InitProcessPhase2();

	/*
	 * Initialize my entry in the shared-invalidation manager's array of
	 * per-backend data.
	 *
	 * Sets up MyBackendId, a unique backend identifier.
	 */
	MyBackendId = InvalidBackendId;

	SharedInvalBackendInit(false);

	if (MyBackendId > MaxBackends || MyBackendId <= 0)
		elog(FATAL, "bad backend ID: %d", MyBackendId);

	/* Now that we have a BackendId, we can participate in ProcSignal */
	ProcSignalInit(MyBackendId);

	/*
	 * Also set up timeout handlers needed for backend operation.  We need
	 * these in every case except bootstrap.
	 */
	if (!bootstrap)
	{
		RegisterTimeout(DEADLOCK_TIMEOUT, CheckDeadLock);
		RegisterTimeout(STATEMENT_TIMEOUT, StatementTimeoutHandler);
		RegisterTimeout(LOCK_TIMEOUT, LockTimeoutHandler);
	}

	/*
	 * bufmgr needs another initialization call too
	 */
	InitBufferPoolBackend();

	/*
	 * Initialize local process's access to XLOG.
	 */
	if (IsUnderPostmaster)
	{
		/*
		 * The postmaster already started the XLOG machinery, but we need to
		 * call InitXLOGAccess(), if the system isn't in hot-standby mode.
		 * This is handled by calling RecoveryInProgress and ignoring the
		 * result.
		 */
		(void) RecoveryInProgress();
	}
	else
	{
		/*
		 * We are either a bootstrap process or a standalone backend. Either
		 * way, start up the XLOG machinery, and register to have it closed
		 * down at exit.
		 */
		StartupXLOG();
		on_shmem_exit(ShutdownXLOG, 0);
	}

	/*
	 * Initialize the relation cache and the system catalog caches.  Note that
	 * no catalog access happens here; we only set up the hashtable structure.
	 * We must do this before starting a transaction because transaction abort
	 * would try to touch these hashtables.
	 */
	RelationCacheInitialize();
	InitCatalogCache();
	InitPlanCache();

	/* Initialize portal manager */
	EnablePortalManager();

	/* Initialize stats collection --- must happen before first xact */
	if (!bootstrap)
		pgstat_initialize();

	/*
	 * Load relcache entries for the shared system catalogs.  This must create
	 * at least entries for pg_database and catalogs used for authentication.
	 */
	RelationCacheInitializePhase2();

	/*
	 * Set up process-exit callback to do pre-shutdown cleanup.  This is the
	 * first before_shmem_exit callback we register; thus, this will be the
	 * last thing we do before low-level modules like the buffer manager begin
	 * to close down.  We need to have this in place before we begin our first
	 * transaction --- if we fail during the initialization transaction, as is
	 * entirely possible, we need the AbortTransaction call to clean up.
	 */
	before_shmem_exit(ShutdownPostgres, 0);

	/* The autovacuum launcher is done here */
	if (IsAutoVacuumLauncherProcess())
		return;

	/* The continuous query scheduler is done here */
	if (IsContQuerySchedulerProcess())
		return;

	/*
	 * Start a new transaction here before first access to db, and get a
	 * snapshot.  We don't have a use for the snapshot itself, but we're
	 * interested in the secondary effect that it sets RecentGlobalXmin. (This
	 * is critical for anything that reads heap pages, because HOT may decide
	 * to prune them even if the process doesn't attempt to modify any
	 * tuples.)
	 */
	if (!bootstrap)
	{
		/* statement_timestamp must be set for timeouts to work correctly */
		SetCurrentStatementStartTimestamp();
		StartTransactionCommand();

		/*
		 * transaction_isolation will have been set to the default by the
		 * above.  If the default is "serializable", and we are in hot
		 * standby, we will fail if we don't change it to something lower.
		 * Fortunately, "read committed" is plenty good enough.
		 */
		XactIsoLevel = XACT_READ_COMMITTED;

		(void) GetTransactionSnapshot();
	}

	/*
	 * Perform client authentication if necessary, then figure out our
	 * postgres user ID, and see if we are a superuser.
	 *
	 * In standalone mode and in autovacuum worker processes, we use a fixed
	 * ID, otherwise we figure it out from the authenticated user name.
	 */
	if (bootstrap || IsAutoVacuumWorkerProcess())
	{
		InitializeSessionUserIdStandalone();
		am_superuser = true;
	}
	else if (!IsUnderPostmaster)
	{
		InitializeSessionUserIdStandalone();
		am_superuser = true;
		if (!ThereIsAtLeastOneRole())
			ereport(WARNING,
					(errcode(ERRCODE_UNDEFINED_OBJECT),
					 errmsg("no roles are defined in this database system"),
					 errhint("You should immediately run CREATE USER \"%s\" SUPERUSER;.",
							 username)));
	}
	else if (IsBackgroundWorker)
	{
		/* background workers may connect with or without a role name */
		if (username == NULL)
		{
			InitializeSessionUserIdStandalone();
			am_superuser = true;
		}
		else
		{
			InitializeSessionUserId(username);
			am_superuser = superuser();
		}
	}
	else
	{
		/* normal multiuser case */
		Assert(MyProcPort != NULL);
		PerformAuthentication(MyProcPort);
		InitializeSessionUserId(username);
		am_superuser = superuser();
	}

	/*
	 * If we're trying to shut down, only superusers can connect, and new
	 * replication connections are not allowed.
	 */
	if ((!am_superuser || am_walsender) &&
		MyProcPort != NULL &&
		MyProcPort->canAcceptConnections == CAC_WAITBACKUP)
	{
		if (am_walsender)
			ereport(FATAL,
					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
					 errmsg("new replication connections are not allowed during database shutdown")));
		else
			ereport(FATAL,
					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
					 errmsg("must be superuser to connect during database shutdown")));
	}

	/*
	 * Binary upgrades only allowed super-user connections
	 */
	if (IsBinaryUpgrade && !am_superuser)
	{
		ereport(FATAL,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 errmsg("must be superuser to connect in binary upgrade mode")));
	}

	/*
	 * The last few connections slots are reserved for superusers.  Although
	 * replication connections currently require superuser privileges, we
	 * don't allow them to consume the reserved slots, which are intended for
	 * interactive use.
	 */
	if ((!am_superuser || am_walsender) &&
		ReservedBackends > 0 &&
		!HaveNFreeProcs(ReservedBackends))
		ereport(FATAL,
				(errcode(ERRCODE_TOO_MANY_CONNECTIONS),
				 errmsg("remaining connection slots are reserved for non-replication superuser connections")));

	/* Check replication permissions needed for walsender processes. */
	if (am_walsender)
	{
		Assert(!bootstrap);

		if (!superuser() && !has_rolreplication(GetUserId()))
			ereport(FATAL,
					(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
					 errmsg("must be superuser or replication role to start walsender")));
	}

	/*
	 * If this is a plain walsender only supporting physical replication, we
	 * don't want to connect to any particular database.  Just finish the
	 * backend startup by processing any options from the startup packet, and
	 * we're done.
	 */
	if (am_walsender && !am_db_walsender)
	{
		/* process any options passed in the startup packet */
		if (MyProcPort != NULL)
			process_startup_options(MyProcPort, am_superuser);

		/* Apply PostAuthDelay as soon as we've read all options */
		if (PostAuthDelay > 0)
			pg_usleep(PostAuthDelay * 1000000L);

		/* initialize client encoding */
		InitializeClientEncoding();

		/* report this backend in the PgBackendStatus array */
		pgstat_bestart();

		/* close the transaction we started above */
		CommitTransactionCommand();

		return;
	}

	/*
	 * Set up the global variables holding database id and default tablespace.
	 * But note we won't actually try to touch the database just yet.
	 *
	 * We take a shortcut in the bootstrap case, otherwise we have to look up
	 * the db's entry in pg_database.
	 */
	if (bootstrap)
	{
		MyDatabaseId = TemplateDbOid;
		MyDatabaseTableSpace = DEFAULTTABLESPACE_OID;
	}
	else if (in_dbname != NULL)
	{
		HeapTuple	tuple;
		Form_pg_database dbform;

		tuple = GetDatabaseTuple(in_dbname);
		if (!HeapTupleIsValid(tuple))
			ereport(FATAL,
					(errcode(ERRCODE_UNDEFINED_DATABASE),
					 errmsg("database \"%s\" does not exist", in_dbname)));
		dbform = (Form_pg_database) GETSTRUCT(tuple);
		MyDatabaseId = HeapTupleGetOid(tuple);
		MyDatabaseTableSpace = dbform->dattablespace;
		/* take database name from the caller, just for paranoia */
		strlcpy(dbname, in_dbname, sizeof(dbname));
	}
	else
	{
		/* caller specified database by OID */
		HeapTuple	tuple;
		Form_pg_database dbform;

		tuple = GetDatabaseTupleByOid(dboid);
		if (!HeapTupleIsValid(tuple))
			ereport(FATAL,
					(errcode(ERRCODE_UNDEFINED_DATABASE),
					 errmsg("database %u does not exist", dboid)));
		dbform = (Form_pg_database) GETSTRUCT(tuple);
		MyDatabaseId = HeapTupleGetOid(tuple);
		MyDatabaseTableSpace = dbform->dattablespace;
		Assert(MyDatabaseId == dboid);
		strlcpy(dbname, NameStr(dbform->datname), sizeof(dbname));
		/* pass the database name back to the caller */
		if (out_dbname)
			strcpy(out_dbname, dbname);
	}

	/*
	 * Now, take a writer's lock on the database we are trying to connect to.
	 * If there is a concurrently running DROP DATABASE on that database, this
	 * will block us until it finishes (and has committed its update of
	 * pg_database).
	 *
	 * Note that the lock is not held long, only until the end of this startup
	 * transaction.  This is OK since we will advertise our use of the
	 * database in the ProcArray before dropping the lock (in fact, that's the
	 * next thing to do).  Anyone trying a DROP DATABASE after this point will
	 * see us in the array once they have the lock.  Ordering is important for
	 * this because we don't want to advertise ourselves as being in this
	 * database until we have the lock; otherwise we create what amounts to a
	 * deadlock with CountOtherDBBackends().
	 *
	 * Note: use of RowExclusiveLock here is reasonable because we envision
	 * our session as being a concurrent writer of the database.  If we had a
	 * way of declaring a session as being guaranteed-read-only, we could use
	 * AccessShareLock for such sessions and thereby not conflict against
	 * CREATE DATABASE.
	 */
	if (!bootstrap)
		LockSharedObject(DatabaseRelationId, MyDatabaseId, 0,
						 RowExclusiveLock);

	/*
	 * Now we can mark our PGPROC entry with the database ID.
	 *
	 * We assume this is an atomic store so no lock is needed; though actually
	 * things would work fine even if it weren't atomic.  Anyone searching the
	 * ProcArray for this database's ID should hold the database lock, so they
	 * would not be executing concurrently with this store.  A process looking
	 * for another database's ID could in theory see a chance match if it read
	 * a partially-updated databaseId value; but as long as all such searches
	 * wait and retry, as in CountOtherDBBackends(), they will certainly see
	 * the correct value on their next try.
	 */
	MyProc->databaseId = MyDatabaseId;

	/*
	 * We established a catalog snapshot while reading pg_authid and/or
	 * pg_database; but until we have set up MyDatabaseId, we won't react to
	 * incoming sinval messages for unshared catalogs, so we won't realize it
	 * if the snapshot has been invalidated.  Assume it's no good anymore.
	 */
	InvalidateCatalogSnapshot();

	/*
	 * Recheck pg_database to make sure the target database hasn't gone away.
	 * If there was a concurrent DROP DATABASE, this ensures we will die
	 * cleanly without creating a mess.
	 */
	if (!bootstrap)
	{
		HeapTuple	tuple;

		tuple = GetDatabaseTuple(dbname);
		if (!HeapTupleIsValid(tuple) ||
			MyDatabaseId != HeapTupleGetOid(tuple) ||
			MyDatabaseTableSpace != ((Form_pg_database) GETSTRUCT(tuple))->dattablespace)
			ereport(FATAL,
					(errcode(ERRCODE_UNDEFINED_DATABASE),
					 errmsg("database \"%s\" does not exist", dbname),
					 errdetail("It seems to have just been dropped or renamed.")));
	}

	/*
	 * Now we should be able to access the database directory safely. Verify
	 * it's there and looks reasonable.
	 */
	fullpath = GetDatabasePath(MyDatabaseId, MyDatabaseTableSpace);

	if (!bootstrap)
	{
		if (access(fullpath, F_OK) == -1)
		{
			if (errno == ENOENT)
				ereport(FATAL,
						(errcode(ERRCODE_UNDEFINED_DATABASE),
						 errmsg("database \"%s\" does not exist",
								dbname),
						 errdetail("The database subdirectory \"%s\" is missing.",
								   fullpath)));
			else
				ereport(FATAL,
						(errcode_for_file_access(),
						 errmsg("could not access directory \"%s\": %m",
								fullpath)));
		}

		ValidatePgVersion(fullpath);
	}

	SetDatabasePath(fullpath);

	/*
	 * It's now possible to do real access to the system catalogs.
	 *
	 * Load relcache entries for the system catalogs.  This must create at
	 * least the minimum set of "nailed-in" cache entries.
	 */
	RelationCacheInitializePhase3();

	/* set up ACL framework (so CheckMyDatabase can check permissions) */
	initialize_acl();

	/*
	 * Re-read the pg_database row for our database, check permissions and set
	 * up database-specific GUC settings.  We can't do this until all the
	 * database-access infrastructure is up.  (Also, it wants to know if the
	 * user is a superuser, so the above stuff has to happen first.)
	 */
	if (!bootstrap)
		CheckMyDatabase(dbname, am_superuser);

	/*
	 * Now process any command-line switches and any additional GUC variable
	 * settings passed in the startup packet.  We couldn't do this before
	 * because we didn't know if client is a superuser.
	 */
	if (MyProcPort != NULL)
		process_startup_options(MyProcPort, am_superuser);

	/* Process pg_db_role_setting options */
	process_settings(MyDatabaseId, GetSessionUserId());

	/* Apply PostAuthDelay as soon as we've read all options */
	if (PostAuthDelay > 0)
		pg_usleep(PostAuthDelay * 1000000L);

	/*
	 * Initialize various default states that can't be set up until we've
	 * selected the active user and gotten the right GUC settings.
	 */

	/* set default namespace search path */
	InitializeSearchPath();

	/* initialize client encoding */
	InitializeClientEncoding();

	/* report this backend in the PgBackendStatus array */
	if (!bootstrap)
		pgstat_bestart();

	/* initialize all PipelineDB stuff */
	if (!bootstrap)
		PipelineShmemInit();

	/* close the transaction we started above */
	if (!bootstrap)
		CommitTransactionCommand();
}
/* * Setup_AF_UNIX -- configure unix socket permissions */ static int Setup_AF_UNIX(void) { /* Arrange to unlink the socket file at exit */ on_proc_exit(StreamDoUnlink, 0); /* * Fix socket ownership/permission if requested. Note we must do this * before we listen() to avoid a window where unwanted connections could * get accepted. */ Assert(Unix_socket_group); if (Unix_socket_group[0] != '\0') { #ifdef WIN32 elog(WARNING, "configuration item unix_socket_group is not supported on this platform"); #else char *endptr; unsigned long val; gid_t gid; val = strtoul(Unix_socket_group, &endptr, 10); if (*endptr == '\0') { /* numeric group id */ gid = val; } else { /* convert group name to id */ struct group *gr; gr = getgrnam(Unix_socket_group); if (!gr) { ereport(LOG, (errmsg("group \"%s\" does not exist", Unix_socket_group))); return STATUS_ERROR; } gid = gr->gr_gid; } if (chown(sock_path, -1, gid) == -1) { ereport(LOG, (errcode_for_file_access(), errmsg("could not set group of file \"%s\": %m", sock_path))); return STATUS_ERROR; } #endif } if (chmod(sock_path, Unix_socket_permissions) == -1) { ereport(LOG, (errcode_for_file_access(), errmsg("could not set permissions of file \"%s\": %m", sock_path))); return STATUS_ERROR; } return STATUS_OK; }
/*
 * Read 'count' bytes of WAL for timeline 'tli' starting at 'startptr' into
 * 'buf', crossing segment boundaries as needed.  Raises ERROR on any
 * failure; a partial read is handled by looping for the remainder.
 *
 * TODO: This is duplicate code with pg_xlogdump, similar to walsender.c, but
 * we currently don't have the infrastructure (elog!) to share it.
 */
static void
XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
{
	char	   *p;
	XLogRecPtr	recptr;
	Size		nbytes;

	/*
	 * State kept across calls: the currently open segment file, its segment
	 * number, and our file offset within it.  Caching these avoids
	 * re-opening and re-seeking on every call.
	 */
	static int	sendFile = -1;
	static XLogSegNo sendSegNo = 0;
	static uint32 sendOff = 0;

	p = buf;
	recptr = startptr;
	nbytes = count;

	while (nbytes > 0)
	{
		uint32		startoff;
		int			segbytes;
		int			readbytes;

		/* Byte offset of the read position within its WAL segment */
		startoff = recptr % XLogSegSize;

		if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo))
		{
			char		path[MAXPGPATH];

			/* Switch to another logfile segment */
			if (sendFile >= 0)
				close(sendFile);

			XLByteToSeg(recptr, sendSegNo);
			XLogFilePath(path, tli, sendSegNo);

			sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);

			if (sendFile < 0)
			{
				/*
				 * ENOENT most likely means the segment has been recycled or
				 * removed; report that case with a distinct message.
				 */
				if (errno == ENOENT)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("requested WAL segment %s has already been removed",
									path)));
				else
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not open file \"%s\": %m",
									path)));
			}
			sendOff = 0;
		}

		/* Need to seek in the file? */
		if (sendOff != startoff)
		{
			if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
			{
				char		path[MAXPGPATH];

				XLogFilePath(path, tli, sendSegNo);

				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not seek in log segment %s to offset %u: %m",
								path, startoff)));
			}
			sendOff = startoff;
		}

		/* How many bytes are within this segment? */
		if (nbytes > (XLogSegSize - startoff))
			segbytes = XLogSegSize - startoff;
		else
			segbytes = nbytes;

		readbytes = read(sendFile, p, segbytes);
		if (readbytes <= 0)
		{
			char		path[MAXPGPATH];

			XLogFilePath(path, tli, sendSegNo);

			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read from log segment %s, offset %u, length %lu: %m",
							path, sendOff, (unsigned long) segbytes)));
		}

		/*
		 * Update state for read.  A short read (readbytes < segbytes) is
		 * fine: the outer loop simply continues with the remainder.
		 */
		recptr += readbytes;

		sendOff += readbytes;
		nbytes -= readbytes;
		p += readbytes;
	}
}
/*
 * Calculate total size of tablespace. Returns -1 if the tablespace directory
 * cannot be found.
 */
static int64
calculate_tablespace_size(Oid tblspcOid)
{
	char		dirpath[MAXPGPATH];
	char		entrypath[MAXPGPATH * 2];
	int64		size_sum = 0;
	DIR		   *dir;
	struct dirent *entry;
	AclResult	aclresult;

	/*
	 * User must be a member of pg_read_all_stats or have CREATE privilege for
	 * target tablespace, either explicitly granted or implicitly because it
	 * is default for current database.
	 */
	if (tblspcOid != MyDatabaseTableSpace &&
		!is_member_of_role(GetUserId(), DEFAULT_ROLE_READ_ALL_STATS))
	{
		aclresult = pg_tablespace_aclcheck(tblspcOid, GetUserId(), ACL_CREATE);
		if (aclresult != ACLCHECK_OK)
			aclcheck_error(aclresult, ACL_KIND_TABLESPACE,
						   get_tablespace_name(tblspcOid));
	}

	/* Map the tablespace OID onto its directory relative to $PGDATA. */
	if (tblspcOid == DEFAULTTABLESPACE_OID)
		snprintf(dirpath, MAXPGPATH, "base");
	else if (tblspcOid == GLOBALTABLESPACE_OID)
		snprintf(dirpath, MAXPGPATH, "global");
	else
		snprintf(dirpath, MAXPGPATH, "pg_tblspc/%u/%s", tblspcOid,
				 TABLESPACE_VERSION_DIRECTORY);

	dir = AllocateDir(dirpath);
	if (dir == NULL)
		return -1;

	for (;;)
	{
		struct stat fst;

		entry = ReadDir(dir, dirpath);
		if (entry == NULL)
			break;

		CHECK_FOR_INTERRUPTS();

		if (strcmp(entry->d_name, ".") == 0 ||
			strcmp(entry->d_name, "..") == 0)
			continue;

		snprintf(entrypath, sizeof(entrypath), "%s/%s",
				 dirpath, entry->d_name);

		if (stat(entrypath, &fst) < 0)
		{
			/* Entries can disappear under us; just skip any that did. */
			if (errno == ENOENT)
				continue;
			else
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not stat file \"%s\": %m", entrypath)));
		}

		/* Recurse into per-database subdirectories, then count the entry itself. */
		if (S_ISDIR(fst.st_mode))
			size_sum += db_dir_size(entrypath);

		size_sum += fst.st_size;
	}

	FreeDir(dir);

	return size_sum;
}
/* * create_tablespace_directories * * Attempt to create filesystem infrastructure linking $PGDATA/pg_tblspc/ * to the specified directory */ static void create_tablespace_directories(const char *location, const Oid tablespaceoid) { char *linkloc; char *location_with_version_dir; linkloc = psprintf("pg_tblspc/%u", tablespaceoid); location_with_version_dir = psprintf("%s/%s", location, TABLESPACE_VERSION_DIRECTORY); /* * Attempt to coerce target directory to safe permissions. If this fails, * it doesn't exist or has the wrong owner. */ if (chmod(location, S_IRWXU) != 0) { if (errno == ENOENT) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_FILE), errmsg("directory \"%s\" does not exist", location), InRecovery ? errhint("Create this directory for the tablespace before " "restarting the server.") : 0)); else ereport(ERROR, (errcode_for_file_access(), errmsg("could not set permissions on directory \"%s\": %m", location))); } if (InRecovery) { struct stat st; /* * Our theory for replaying a CREATE is to forcibly drop the target * subdirectory if present, and then recreate it. This may be more * work than needed, but it is simple to implement. */ if (stat(location_with_version_dir, &st) == 0 && S_ISDIR(st.st_mode)) { if (!rmtree(location_with_version_dir, true)) /* If this failed, mkdir() below is going to error. */ ereport(WARNING, (errmsg("some useless files may be left behind in old database directory \"%s\"", location_with_version_dir))); } } /* * The creation of the version directory prevents more than one tablespace * in a single location. 
*/ if (mkdir(location_with_version_dir, S_IRWXU) < 0) { if (errno == EEXIST) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), errmsg("directory \"%s\" already in use as a tablespace", location_with_version_dir))); else ereport(ERROR, (errcode_for_file_access(), errmsg("could not create directory \"%s\": %m", location_with_version_dir))); } /* Remove old symlink in recovery, in case it points to the wrong place */ if (InRecovery) { if (unlink(linkloc) < 0 && errno != ENOENT) ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove symbolic link \"%s\": %m", linkloc))); } /* * Create the symlink under PGDATA */ if (symlink(location, linkloc) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create symbolic link \"%s\": %m", linkloc))); pfree(linkloc); pfree(location_with_version_dir); }
/*
 * Finish preparing state file.
 *
 * Calculates CRC and writes state file to WAL and in pg_twophase directory.
 */
void
EndPrepare(GlobalTransaction gxact)
{
	TransactionId xid = gxact->proc.xid;
	TwoPhaseFileHeader *hdr;
	char		path[MAXPGPATH];
	XLogRecData *record;
	pg_crc32	statefile_crc;
	pg_crc32	bogus_crc;
	int			fd;

	/* Add the end sentinel to the list of 2PC records */
	RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0,
						   NULL, 0);

	/* Go back and fill in total_len in the file header record */
	hdr = (TwoPhaseFileHeader *) records.head->data;
	Assert(hdr->magic == TWOPHASE_MAGIC);
	/* total_len covers all records plus the trailing CRC word */
	hdr->total_len = records.total_len + sizeof(pg_crc32);

	/*
	 * Create the 2PC state file.
	 *
	 * Note: because we use BasicOpenFile(), we are responsible for ensuring
	 * the FD gets closed in any error exit path. Once we get into the
	 * critical section, though, it doesn't matter since any failure causes
	 * PANIC anyway.
	 */
	TwoPhaseFilePath(path, xid);

	/* O_EXCL: it is a bug if a state file for this XID already exists */
	fd = BasicOpenFile(path,
					   O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create two-phase state file \"%s\": %m",
						path)));

	/* Write data to file, and calculate CRC as we pass over it */
	INIT_CRC32(statefile_crc);

	for (record = records.head; record != NULL; record = record->next)
	{
		COMP_CRC32(statefile_crc, record->data, record->len);
		if ((write(fd, record->data, record->len)) != record->len)
		{
			close(fd);
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write two-phase state file: %m")));
		}
	}

	FIN_CRC32(statefile_crc);

	/*
	 * Write a deliberately bogus CRC to the state file; this is just paranoia
	 * to catch the case where four more bytes will run us out of disk space.
	 */
	/* Bitwise complement guarantees the bogus value differs from the real CRC */
	bogus_crc = ~statefile_crc;

	if ((write(fd, &bogus_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	/* Back up to prepare for rewriting the CRC */
	if (lseek(fd, -((off_t) sizeof(pg_crc32)), SEEK_CUR) < 0)
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek in two-phase state file: %m")));
	}

	/*
	 * The state file isn't valid yet, because we haven't written the correct
	 * CRC yet. Before we do that, insert entry in WAL and flush it to disk.
	 *
	 * Between the time we have written the WAL entry and the time we write
	 * out the correct state file CRC, we have an inconsistency: the xact is
	 * prepared according to WAL but not according to our on-disk state. We
	 * use a critical section to force a PANIC if we are unable to complete
	 * the write --- then, WAL replay should repair the inconsistency. The
	 * odds of a PANIC actually occurring should be very tiny given that we
	 * were able to write the bogus CRC above.
	 *
	 * We have to lock out checkpoint start here, too; otherwise a checkpoint
	 * starting immediately after the WAL record is inserted could complete
	 * without fsync'ing our state file. (This is essentially the same kind
	 * of race condition as the COMMIT-to-clog-write case that
	 * RecordTransactionCommit uses CheckpointStartLock for; see notes there.)
	 *
	 * We save the PREPARE record's location in the gxact for later use by
	 * CheckPointTwoPhase.
	 */
	START_CRIT_SECTION();

	LWLockAcquire(CheckpointStartLock, LW_SHARED);

	gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE,
									records.head);
	XLogFlush(gxact->prepare_lsn);

	/* If we crash now, we have prepared: WAL replay will fix things */

	/* write correct CRC and close file */
	if ((write(fd, &statefile_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	if (close(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close two-phase state file: %m")));

	/*
	 * Mark the prepared transaction as valid. As soon as xact.c marks MyProc
	 * as not running our XID (which it will do immediately after this
	 * function returns), others can commit/rollback the xact.
	 *
	 * NB: a side effect of this is to make a dummy ProcArray entry for the
	 * prepared XID. This must happen before we clear the XID from MyProc,
	 * else there is a window where the XID is not running according to
	 * TransactionIdInProgress, and onlookers would be entitled to assume the
	 * xact crashed. Instead we have a window where the same XID appears
	 * twice in ProcArray, which is OK.
	 */
	MarkAsPrepared(gxact);

	/*
	 * Now we can release the checkpoint start lock: a checkpoint starting
	 * after this will certainly see the gxact as a candidate for fsyncing.
	 */
	LWLockRelease(CheckpointStartLock);

	END_CRIT_SECTION();

	/* Reset the record-accumulation list for the next PREPARE in this backend */
	records.tail = records.head = NULL;
}
/*
 * CheckPointTwoPhase -- handle 2PC component of checkpointing.
 *
 * We must fsync the state file of any GXACT that is valid and has a PREPARE
 * LSN <= the checkpoint's redo horizon.  (A gxact that isn't valid yet, or
 * whose LSN is later, is some other checkpoint's responsibility.)
 *
 * This is deliberately run as late as possible in the checkpoint sequence:
 * GXACTs ordinarily have short lifespans, so many that were valid at
 * checkpoint start will be gone by the time we get here.
 *
 * A GXACT that stays valid across several checkpoints gets fsynced each
 * time; that's rare enough that we don't try to avoid the redundant fsyncs
 * (which are cheap anyway, causing no actual I/O).
 */
void
CheckPointTwoPhase(XLogRecPtr redo_horizon)
{
	TransactionId *pending_xids;
	int			npending = 0;
	char		path[MAXPGPATH];
	int			idx;

	if (max_prepared_xacts <= 0)
		return;					/* nothing to do */

	/*
	 * Hold TwoPhaseStateLock only long enough to collect the XIDs needing an
	 * fsync; do the I/O afterwards without the lock.
	 *
	 * That opens a race: someone may delete a GXACT between our dropping the
	 * lock and our opening its state file.  We handle it by special-casing
	 * ENOENT: on that error we re-check whether the GXACT is still prepared,
	 * and if not, silently skip it.
	 */
	pending_xids = (TransactionId *)
		palloc(max_prepared_xacts * sizeof(TransactionId));

	LWLockAcquire(TwoPhaseStateLock, LW_SHARED);
	for (idx = 0; idx < TwoPhaseState->numPrepXacts; idx++)
	{
		GlobalTransaction gxact = TwoPhaseState->prepXacts[idx];

		if (gxact->valid &&
			XLByteLE(gxact->prepare_lsn, redo_horizon))
			pending_xids[npending++] = gxact->proc.xid;
	}
	LWLockRelease(TwoPhaseStateLock);

	for (idx = 0; idx < npending; idx++)
	{
		TransactionId xid = pending_xids[idx];
		int			fd;

		TwoPhaseFilePath(path, xid);

		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
		if (fd < 0)
		{
			if (errno == ENOENT)
			{
				/* OK if gxact is no longer valid */
				if (!TransactionIdIsPrepared(xid))
					continue;
				/* Restore errno in case it was changed */
				errno = ENOENT;
			}
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open two-phase state file \"%s\": %m",
							path)));
		}

		if (pg_fsync(fd) != 0)
		{
			close(fd);
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not fsync two-phase state file \"%s\": %m",
							path)));
		}

		if (close(fd) != 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not close two-phase state file \"%s\": %m",
							path)));
	}

	pfree(pending_xids);
}
/*
 * A file was restored from the archive under a temporary filename (path),
 * and now we want to keep it. Rename it under the permanent filename in
 * pg_xlog (xlogfname), replacing any existing file with the same name.
 */
void
KeepFileRestoredFromArchive(char *path, char *xlogfname)
{
	char		xlogfpath[MAXPGPATH];
	bool		reload = false;		/* did we replace an existing file? */
	struct stat statbuf;

	snprintf(xlogfpath, MAXPGPATH, XLOGDIR "/%s", xlogfname);

	if (stat(xlogfpath, &statbuf) == 0)
	{
		char		oldpath[MAXPGPATH];

#ifdef WIN32
		static unsigned int deletedcounter = 1;

		/*
		 * On Windows, if another process (e.g a walsender process) holds the
		 * file open in FILE_SHARE_DELETE mode, unlink will succeed, but the
		 * file will still show up in directory listing until the last handle
		 * is closed, and we cannot rename the new file in its place until
		 * that. To avoid that problem, rename the old file to a temporary
		 * name first. Use a counter to create a unique filename, because the
		 * same file might be restored from the archive multiple times, and a
		 * walsender could still be holding onto an old deleted version of it.
		 */
		snprintf(oldpath, MAXPGPATH, "%s.deleted%u",
				 xlogfpath, deletedcounter++);
		if (rename(xlogfpath, oldpath) != 0)
		{
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not rename file \"%s\" to \"%s\": %m",
							xlogfpath, oldpath)));
		}
#else
		/* On Unix we can simply unlink the file in place */
		strncpy(oldpath, xlogfpath, MAXPGPATH);
#endif
		if (unlink(oldpath) != 0)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not remove file \"%s\": %m",
							xlogfpath)));
		reload = true;
	}

	if (rename(path, xlogfpath) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						path, xlogfpath)));

	/*
	 * Create .done file forcibly to prevent the restored segment from being
	 * archived again later.
	 */
	XLogArchiveForceDone(xlogfname);

	/*
	 * If the existing file was replaced, since walsenders might have it open,
	 * request them to reload a currently-open segment. This is only required
	 * for WAL segments, walsenders don't hold other files open, but there's
	 * no harm in doing this too often, and we don't know what kind of a file
	 * we're dealing with here.
	 */
	if (reload)
		WalSndRqstFileReload();

	/*
	 * Signal walsender that new WAL has arrived. Again, this isn't necessary
	 * if we restored something other than a WAL segment, but it does no harm
	 * either.
	 */
	WalSndWakeup();
}
/**
 * @brief Read one record from input file and transfer literal string to
 * PostgreSQL internal format.
 *
 * Process flow
 *  - If record buffer is empty
 *    + Read records up to READ_LINE_NUM by read(2)
 *      * Return 0 if we reach EOF.
 *      * If an error occurs, notify it to caller by ereport().
 *    + Count the number of records in the record buffer.
 *    + Initialize the number of used records to 0.
 *    + Store the head byte of the next record.
 *  - If the records remaining in the record buffer are not enough,
 *    notify it to the caller by ereport().
 *  - Get back the stored head byte, and store the head byte of the next record.
 *  - Update the number of records used.
 * @param rd [in/out] Control information
 * @return Return true if there is a next record, or false if EOF.
 */
static HeapTuple
BinaryParserRead(BinaryParser *self, Checker *checker)
{
	HeapTuple	tuple;
	char	   *record;
	int			i;

	/* Skip first offset lines in the input file */
	if (unlikely(self->need_offset > 0))
	{
		int			i;

		for (i = 0; i < self->need_offset; i++)
		{
			int			len;

			/* Read one fixed-length record just to discard it */
			len = SourceRead(self->source, self->buffer, self->rec_len);

			if (len != self->rec_len)
			{
				/* Short read with no errno means malformed input, not I/O error */
				if (errno == 0)
					errno = EINVAL;
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not skip " int64_FMT " lines ("
								int64_FMT " bytes) in the input file: %m",
								self->need_offset,
								self->rec_len * self->need_offset)));
			}
		}
		self->need_offset = 0;
	}

	/*
	 * If the record buffer is exhausted, read next records from file
	 * up to READ_LINE_NUM rows at once.
	 */
	if (self->used_rec_cnt >= self->total_rec_cnt)
	{
		int			len;
		div_t		v;

		BULKLOAD_PROFILE(&prof_reader_parser);
		/* Retry the read on EAGAIN/EINTR; any other error is fatal */
		while ((len = SourceRead(self->source, self->buffer,
								 self->rec_len * READ_LINE_NUM)) < 0)
		{
			if (errno != EAGAIN && errno != EINTR)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read input file: %m")));
		}
		BULKLOAD_PROFILE(&prof_reader_source);

		/*
		 * Calculate the actual number of rows. Trailing remainder bytes
		 * at the end of the input file are ignored with WARNING.
		 */
		v = div(len, self->rec_len);
		if (v.rem != 0)
			elog(WARNING, "Ignore %d bytes at the end of file", v.rem);

		self->total_rec_cnt = v.quot;
		self->used_rec_cnt = 0;

		if (self->total_rec_cnt <= 0)
			return NULL;		/* eof */

		record = self->buffer;
	}
	else
	{
		/* Position within the buffer of the next unconsumed record */
		record = self->buffer + (self->rec_len * self->used_rec_cnt);
	}

	/*
	 * Increment the position *before* parsing the record so that we can
	 * skip it when there are some errors on parsing it.
	 */
	self->used_rec_cnt++;
	self->base.count++;

	for (i = 0; i < self->nfield; i++)
	{
		/* Convert it to server encoding. */
		if (self->fields[i].character)
		{
			char	   *str = record + self->fields[i].offset;
			int			next_head = self->fields[i].offset + self->fields[i].len;

			/*
			 * Temporarily NUL-terminate the field in place so it can be
			 * treated as a C string; save the overwritten byte (the head of
			 * the following field) so it can be restored afterwards.
			 */
			self->next_head = record[next_head];
			record[next_head] = '\0';
			self->base.parsing_field = i + 1;

			self->fields[i].in = CheckerConversion(checker, str);

			/* Restore the byte we clobbered above */
			record[next_head] = self->next_head;
		}
		else
		{
			self->fields[i].in = record + self->fields[i].offset;
		}
	}

	ExtractValuesFromFixed(self, record);
	self->next_head = '\0';
	self->base.parsing_field = -1;

	if (self->filter.funcstr)
		tuple = FilterTuple(&self->filter, &self->former,
							&self->base.parsing_field);
	else
		tuple = TupleFormerTuple(&self->former);

	return tuple;
}
/*
 * destroy_tablespace_directories
 *
 * Attempt to remove filesystem infrastructure
 *
 * 'redo' indicates we are redoing a drop from XLOG; okay if nothing there
 *
 * Returns TRUE if successful, FALSE if some subdirectory is not empty
 */
static bool
destroy_tablespace_directories(Oid tablespaceoid, bool redo)
{
	char	   *linkloc;
	char	   *linkloc_with_version_dir;
	DIR		   *dirdesc;
	struct dirent *de;
	char	   *subfile;
	struct stat st;

	/* "pg_tblspc/" + OID + "/" + version dir + terminator */
	linkloc_with_version_dir = palloc(9 + 1 + OIDCHARS + 1 +
									  strlen(TABLESPACE_VERSION_DIRECTORY));
	sprintf(linkloc_with_version_dir, "pg_tblspc/%u/%s", tablespaceoid,
			TABLESPACE_VERSION_DIRECTORY);

	/*
	 * Check if the tablespace still contains any files. We try to rmdir each
	 * per-database directory we find in it. rmdir failure implies there are
	 * still files in that subdirectory, so give up. (We do not have to worry
	 * about undoing any already completed rmdirs, since the next attempt to
	 * use the tablespace from that database will simply recreate the
	 * subdirectory via TablespaceCreateDbspace.)
	 *
	 * Since we hold TablespaceCreateLock, no one else should be creating any
	 * fresh subdirectories in parallel. It is possible that new files are
	 * being created within subdirectories, though, so the rmdir call could
	 * fail. Worst consequence is a less friendly error message.
	 *
	 * If redo is true then ENOENT is a likely outcome here, and we allow it
	 * to pass without comment. In normal operation we still allow it, but
	 * with a warning. This is because even though ProcessUtility disallows
	 * DROP TABLESPACE in a transaction block, it's possible that a previous
	 * DROP failed and rolled back after removing the tablespace directories
	 * and symlink. We want to allow a new DROP attempt to succeed at
	 * removing the catalog entries, so we should not give a hard error here.
	 */
	dirdesc = AllocateDir(linkloc_with_version_dir);
	if (dirdesc == NULL)
	{
		if (errno == ENOENT)
		{
			/* Directory already gone: warn (unless replaying) and report success */
			if (!redo)
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not open directory \"%s\": %m",
								linkloc_with_version_dir)));
			pfree(linkloc_with_version_dir);
			return true;
		}
		/*
		 * else let ReadDir report the error -- note we deliberately fall
		 * through with dirdesc == NULL; ReadDir raises the error for us.
		 */
	}

	while ((de = ReadDir(dirdesc, linkloc_with_version_dir)) != NULL)
	{
		if (strcmp(de->d_name, ".") == 0 ||
			strcmp(de->d_name, "..") == 0)
			continue;

		subfile = palloc(strlen(linkloc_with_version_dir) + 1 +
						 strlen(de->d_name) + 1);
		sprintf(subfile, "%s/%s", linkloc_with_version_dir, de->d_name);

		/* This check is just to deliver a friendlier error message */
		if (!directory_is_empty(subfile))
		{
			FreeDir(dirdesc);
			pfree(subfile);
			pfree(linkloc_with_version_dir);
			return false;
		}

		/* remove empty directory */
		if (rmdir(subfile) < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m",
							subfile)));

		pfree(subfile);
	}

	FreeDir(dirdesc);

	/* remove version directory */
	if (rmdir(linkloc_with_version_dir) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not remove directory \"%s\": %m",
						linkloc_with_version_dir)));

	/*
	 * Try to remove the symlink. We must however deal with the possibility
	 * that it's a directory instead of a symlink --- this could happen during
	 * WAL replay (see TablespaceCreateDbspace), and it is also the case on
	 * Windows where junction points lstat() as directories.
	 */
	linkloc = pstrdup(linkloc_with_version_dir);
	get_parent_directory(linkloc);
	if (lstat(linkloc, &st) == 0 && S_ISDIR(st.st_mode))
	{
		if (rmdir(linkloc) < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m",
							linkloc)));
	}
	else
	{
		if (unlink(linkloc) < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not remove symbolic link \"%s\": %m",
							linkloc)));
	}

	pfree(linkloc_with_version_dir);
	pfree(linkloc);

	return true;
}
Datum pg_tablespace_databases(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; struct dirent *de; ts_db_fctx *fctx; if (SRF_IS_FIRSTCALL()) { MemoryContext oldcontext; Oid tablespaceOid = PG_GETARG_OID(0); funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); fctx = palloc(sizeof(ts_db_fctx)); if (tablespaceOid == GLOBALTABLESPACE_OID) { fctx->dirdesc = NULL; ereport(WARNING, (errmsg("global tablespace never has databases"))); } else { if (tablespaceOid == DEFAULTTABLESPACE_OID) fctx->location = psprintf("base"); else fctx->location = psprintf("pg_tblspc/%u/%s", tablespaceOid, TABLESPACE_VERSION_DIRECTORY); fctx->dirdesc = AllocateDir(fctx->location); if (!fctx->dirdesc) { /* the only expected error is ENOENT */ if (errno != ENOENT) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", fctx->location))); ereport(WARNING, (errmsg("%u is not a tablespace OID", tablespaceOid))); } } funcctx->user_fctx = fctx; MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); fctx = (ts_db_fctx *) funcctx->user_fctx; if (!fctx->dirdesc) /* not a tablespace */ SRF_RETURN_DONE(funcctx); while ((de = ReadDir(fctx->dirdesc, fctx->location)) != NULL) { char *subdir; DIR *dirdesc; Oid datOid = atooid(de->d_name); /* this test skips . and .., but is awfully weak */ if (!datOid) continue; /* if database subdir is empty, don't report tablespace as used */ subdir = psprintf("%s/%s", fctx->location, de->d_name); dirdesc = AllocateDir(subdir); while ((de = ReadDir(dirdesc, subdir)) != NULL) { if (strcmp(de->d_name, ".") != 0 && strcmp(de->d_name, "..") != 0) break; } FreeDir(dirdesc); pfree(subdir); if (!de) continue; /* indeed, nothing in it */ SRF_RETURN_NEXT(funcctx, ObjectIdGetDatum(datOid)); } FreeDir(fctx->dirdesc); SRF_RETURN_DONE(funcctx); }
/*
 * FileRepPrimary_ResyncWrite
 *		Resynchronize one relation (buffer-pool or append-only storage)
 *		from the primary to the mirror, as described by 'entry'.
 *
 * Returns STATUS_OK, or STATUS_ERROR if mirror data loss was detected at
 * any point during the resync (the caller is expected to retry/abort).
 *
 * BUG FIX: the AppendOnly case previously declared a local
 * 'bool mirrorDataLossOccurred;' that shadowed the function-level flag, so
 * data loss detected during AO resync never propagated to the final status
 * check and the function returned STATUS_OK anyway.  The shadowing
 * declaration has been removed; all paths now share the outer flag.
 */
static int
FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry)
{
	int			status = STATUS_OK;
	Page		page;
	Buffer		buf;
	BlockNumber numBlocks;
	BlockNumber blkno;
	SMgrRelation smgr_relation;
	char		relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
	XLogRecPtr	loc;
	int			count = 0;
	int			thresholdCount = 0;
	bool		mirrorDataLossOccurred = FALSE;

	switch (entry->relStorageMgr)
	{
		case PersistentFileSysRelStorageMgr_BufferPool:

			switch (entry->mirrorDataSynchronizationState)
			{
				case MirroredRelDataSynchronizationState_BufferPoolScanIncremental:
				case MirroredRelDataSynchronizationState_FullCopy:

					smgr_relation = smgropen(entry->relFileNode);

					numBlocks = smgrnblocks(smgr_relation);

					snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode);

					if (Debug_filerep_print)
						elog(LOG, "resync buffer pool relation '%s' number of blocks '%d' ",
							 relidstr, numBlocks);

					/* check for interrupts/state changes every ~1024 blocks */
					thresholdCount = Min(numBlocks, 1024);

					/*
					 * required in order to report how many blocks were
					 * synchronized if gp_persistent_relation_node does not
					 * return that information
					 */
					if (entry->mirrorBufpoolResyncChangedPageCount == 0)
					{
						entry->mirrorBufpoolResyncChangedPageCount =
							numBlocks - entry->mirrorBufpoolResyncCkptBlockNum;
					}

					for (blkno = entry->mirrorBufpoolResyncCkptBlockNum;
						 blkno < numBlocks; blkno++)
					{
						XLogRecPtr	endResyncLSN = (isFullResync() ?
											FileRepResync_GetEndFullResyncLSN() :
											FileRepResync_GetEndIncrResyncLSN());

						SIMPLE_FAULT_INJECTOR(FileRepResyncWorkerRead);

						FileRepResync_SetReadBufferRequest();
						buf = ReadBuffer_Resync(smgr_relation, blkno);
						FileRepResync_ResetReadBufferRequest();

						LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

						page = BufferGetPage(buf);

						loc = PageGetLSN(page);

						if (Debug_filerep_print)
						{
							elog(LOG,
								 "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' "
								 "lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
								 relidstr,
								 numBlocks,
								 blkno,
								 XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
								 entry->mirrorBufpoolResyncCkptLoc.xlogid,
								 entry->mirrorBufpoolResyncCkptLoc.xrecoff,
								 XLogLocationToString(&loc),
								 loc.xlogid,
								 loc.xrecoff,
								 XLogLocationToString(&endResyncLSN),
								 endResyncLSN.xlogid,
								 endResyncLSN.xrecoff);
						}
						else
						{
							char		tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

							snprintf(tmpBuf, sizeof(tmpBuf),
									 "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' ",
									 relidstr,
									 numBlocks,
									 blkno,
									 XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
									 entry->mirrorBufpoolResyncCkptLoc.xlogid,
									 entry->mirrorBufpoolResyncCkptLoc.xrecoff);

							FileRep_InsertConfigLogEntry(tmpBuf);

							snprintf(tmpBuf, sizeof(tmpBuf),
									 "full resync buffer pool identifier '%s' lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
									 relidstr,
									 XLogLocationToString(&loc),
									 loc.xlogid,
									 loc.xrecoff,
									 XLogLocationToString(&endResyncLSN),
									 endResyncLSN.xlogid,
									 endResyncLSN.xrecoff);

							FileRep_InsertConfigLogEntry(tmpBuf);
						}

						/*
						 * Only pages whose LSN falls inside the resync window
						 * [begin change tracking, end resync LSN] need to be
						 * shipped to the mirror.
						 */
						if (XLByteLE(PageGetLSN(page), endResyncLSN) &&
							XLByteLE(entry->mirrorBufpoolResyncCkptLoc, PageGetLSN(page)))
						{
							smgrwrite(smgr_relation,
									  blkno,
									  (char *) BufferGetBlock(buf),
									  FALSE);
						}

						SIMPLE_FAULT_INJECTOR(FileRepResyncWorker);

						UnlockReleaseBuffer(buf);

						/* periodically re-check that resync is still valid */
						if (count > thresholdCount)
						{
							count = 0;
							FileRepSubProcess_ProcessSignals();

							if (!(FileRepSubProcess_GetState() == FileRepStateReady &&
								  dataState == DataStateInResync))
							{
								mirrorDataLossOccurred = TRUE;
								break;
							}
						}
						else
							count++;
					}

					if (mirrorDataLossOccurred)
						break;

					if (entry->mirrorDataSynchronizationState !=
						MirroredRelDataSynchronizationState_FullCopy)
					{
						LockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);

						numBlocks = smgrnblocks(smgr_relation);

						smgrtruncate(smgr_relation,
									 numBlocks,
									 TRUE /* isTemp, TRUE means to not record in XLOG */ ,
									 FALSE /* isLocalBuf */ ,
									 &entry->persistentTid,
									 entry->persistentSerialNum);

						UnlockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);
					}

					smgrimmedsync(smgr_relation);

					smgrclose(smgr_relation);

					smgr_relation = NULL;
					break;

				case MirroredRelDataSynchronizationState_None:
				case MirroredRelDataSynchronizationState_DataSynchronized:
					break;

				default:
					ereport(LOG,
							(errmsg("could not resynchronize relation '%u/%u/%u' "
									"mirror synchronization state:'%s(%d)' ",
									entry->relFileNode.relNode,
									entry->relFileNode.spcNode,
									entry->relFileNode.dbNode,
									MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
									entry->mirrorDataSynchronizationState)));
					break;
			}
			break;

		case PersistentFileSysRelStorageMgr_AppendOnly:
			{
				MirroredAppendOnlyOpen mirroredOpen;
				int			primaryError;

				/*
				 * NB: no local 'mirrorDataLossOccurred' here -- the
				 * function-level flag must be used so the final status
				 * check sees AO data loss too.
				 */
				char	   *buffer = NULL;
				int64		endOffset = entry->mirrorAppendOnlyNewEof;
				int64		startOffset = entry->mirrorAppendOnlyLossEof;
				int32		bufferLen = 0;
				int			retval = 0;

				switch (entry->mirrorDataSynchronizationState)
				{
					case MirroredRelDataSynchronizationState_AppendOnlyCatchup:
					case MirroredRelDataSynchronizationState_FullCopy:

						/*
						 * required in order to report how many blocks were
						 * synchronized if gp_persistent_relation_node does
						 * not return that information
						 */
						if (entry->mirrorBufpoolResyncChangedPageCount == 0)
						{
							entry->mirrorBufpoolResyncChangedPageCount =
								(endOffset - startOffset) / BLCKSZ;
						}

						/*
						 * The MirroredAppendOnly_OpenResynchonize routine
						 * knows we are a resynch worker and will open BOTH,
						 * but write only the MIRROR!!!
						 */
						MirroredAppendOnly_OpenResynchonize(
											&mirroredOpen,
											&entry->relFileNode,
											entry->segmentFileNum,
											startOffset,
											&primaryError,
											&mirrorDataLossOccurred);
						if (primaryError != 0)
						{
							ereport(ERROR,
									(errcode_for_file_access(),
									 errmsg("could not open file %u/%u/%u.%u : %s",
											entry->relFileNode.dbNode,
											entry->relFileNode.spcNode,
											entry->relFileNode.relNode,
											entry->segmentFileNum,
											strerror(primaryError))));
							break;
						}

						if (mirrorDataLossOccurred)
							break;

						/* AO and CO Data Store writes 64k size by default */
						bufferLen = (Size) Min(2 * BLCKSZ, endOffset - startOffset);
						buffer = (char *) palloc(bufferLen);
						MemSet(buffer, 0, bufferLen);

						/* copy [startOffset, endOffset) to the mirror */
						while (startOffset < endOffset)
						{
							retval = MirroredAppendOnly_Read(
												&mirroredOpen,
												buffer,
												bufferLen);

							if (retval != bufferLen)
							{
								ereport(ERROR,
										(errcode_for_file_access(),
										 errmsg("could not read from position:" INT64_FORMAT " in file %u/%u/%u.%u : %m",
												startOffset,
												entry->relFileNode.dbNode,
												entry->relFileNode.spcNode,
												entry->relFileNode.relNode,
												entry->segmentFileNum)));
								break;
							}

							MirroredAppendOnly_Append(
												&mirroredOpen,
												buffer,
												bufferLen,
												&primaryError,
												&mirrorDataLossOccurred);

							if (mirrorDataLossOccurred)
								break;

							Assert(primaryError == 0);	/* No primary writes as resync worker. */

							startOffset += bufferLen;
							/* AO and CO Data Store writes 64k size by default */
							bufferLen = (Size) Min(2 * BLCKSZ, endOffset - startOffset);
						}

						pfree(buffer);
						buffer = NULL;

						if (mirrorDataLossOccurred)
							break;

						/* Flush written data on Mirror */
						MirroredAppendOnly_Flush(
											&mirroredOpen,
											&primaryError,
											&mirrorDataLossOccurred);
						if (mirrorDataLossOccurred)
							break;

						Assert(primaryError == 0);	/* Not flushed on primary as resync worker. */

						/* Close Primary and Mirror */
						MirroredAppendOnly_Close(
											&mirroredOpen,
											&mirrorDataLossOccurred);
						break;

					case MirroredRelDataSynchronizationState_None:
					case MirroredRelDataSynchronizationState_DataSynchronized:
						break;

					default:
						ereport(LOG,
								(errmsg("could not resynchronize relation '%u/%u/%u' "
										"mirror synchronization state:'%s(%d)' ",
										entry->relFileNode.relNode,
										entry->relFileNode.spcNode,
										entry->relFileNode.dbNode,
										MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
										entry->mirrorDataSynchronizationState)));
						break;
				}
				break;
			}					/* case */

		default:
			Assert(0);
			break;
	}							/* switch */

	if (mirrorDataLossOccurred)
		status = STATUS_ERROR;

	return status;
}
/*
 * Load a single slot from disk into memory.
 *
 * Called during startup, before any concurrent slot access is possible,
 * so no locking is required.  Any validation failure is a PANIC: a
 * corrupt slot file means we cannot trust the on-disk replication state.
 */
static void
RestoreSlotFromDisk(const char *name)
{
	ReplicationSlotOnDisk cp;
	int			i;
	char		path[MAXPGPATH];
	int			fd;
	bool		restored = false;
	int			readBytes;
	pg_crc32	checksum;

	/* no need to lock here, no concurrent access allowed yet */

	/* delete temp file if it exists (leftover of an interrupted save) */
	sprintf(path, "pg_replslot/%s/state.tmp", name);
	if (unlink(path) < 0 && errno != ENOENT)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m", path)));

	sprintf(path, "pg_replslot/%s/state", name);

	elog(DEBUG1, "restoring replication slot from \"%s\"", path);

	fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);

	/*
	 * We do not need to handle this as we are rename()ing the directory into
	 * place only after we fsync()ed the state file.
	 */
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));

	/*
	 * Sync state file before we're reading from it. We might have crashed
	 * while it wasn't synced yet and we shouldn't continue on that basis.
	 */
	if (pg_fsync(fd) != 0)
	{
		CloseTransientFile(fd);
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m",
						path)));
	}

	/*
	 * Also sync the parent directory.
	 *
	 * NOTE(review): 'true' tells fsync_fname the path is a directory, but
	 * 'path' still points at the state *file* here -- confirm this is the
	 * intended target (upstream later syncs the slot directory instead).
	 */
	START_CRIT_SECTION();
	fsync_fname(path, true);
	END_CRIT_SECTION();

	/* read part of statefile that's guaranteed to be version independent */
	readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize);
	if (readBytes != ReplicationSlotOnDiskConstantSize)
	{
		/* preserve read()'s errno across CloseTransientFile for %m */
		int			saved_errno = errno;

		CloseTransientFile(fd);
		errno = saved_errno;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\", read %d of %u: %m",
						path, readBytes,
						(uint32) ReplicationSlotOnDiskConstantSize)));
	}

	/* verify magic */
	if (cp.magic != SLOT_MAGIC)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("replication slot file \"%s\" has wrong magic %u instead of %u",
						path, cp.magic, SLOT_MAGIC)));

	/* verify version */
	if (cp.version != SLOT_VERSION)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("replication slot file \"%s\" has unsupported version %u",
						path, cp.version)));

	/* boundary check on length */
	if (cp.length != ReplicationSlotOnDiskDynamicSize)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("replication slot file \"%s\" has corrupted length %u",
						path, cp.length)));

	/* Now that we know the size, read the entire file */
	readBytes = read(fd,
					 (char *) &cp + ReplicationSlotOnDiskConstantSize,
					 cp.length);
	if (readBytes != cp.length)
	{
		int			saved_errno = errno;

		CloseTransientFile(fd);
		errno = saved_errno;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\", read %d of %u: %m",
						path, readBytes, cp.length)));
	}

	CloseTransientFile(fd);

	/* now verify the CRC32 (covers only the dynamic portion of the file) */
	INIT_CRC32(checksum);
	COMP_CRC32(checksum,
			   (char *) &cp + ReplicationSlotOnDiskConstantSize,
			   ReplicationSlotOnDiskDynamicSize);

	if (!EQ_CRC32(checksum, cp.checksum))
		ereport(PANIC,
				(errmsg("replication slot file %s: checksum mismatch, is %u, should be %u",
						path, checksum, cp.checksum)));

	/*
	 * If we crashed with an ephemeral slot active, don't restore but delete
	 * it.
	 */
	if (cp.slotdata.persistency != RS_PERSISTENT)
	{
		sprintf(path, "pg_replslot/%s", name);

		if (!rmtree(path, true))
		{
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\"", path)));
		}
		/* make the removal durable */
		fsync_fname("pg_replslot", true);
		return;
	}

	/* nothing can be active yet, don't lock anything */
	for (i = 0; i < max_replication_slots; i++)
	{
		ReplicationSlot *slot;

		slot = &ReplicationSlotCtl->replication_slots[i];

		if (slot->in_use)
			continue;

		/* restore the entire set of persistent data */
		memcpy(&slot->data, &cp.slotdata,
			   sizeof(ReplicationSlotPersistentData));

		/* initialize in memory state */
		slot->effective_xmin = cp.slotdata.xmin;
		slot->effective_catalog_xmin = cp.slotdata.catalog_xmin;

		slot->candidate_catalog_xmin = InvalidTransactionId;
		slot->candidate_xmin_lsn = InvalidXLogRecPtr;
		slot->candidate_restart_lsn = InvalidXLogRecPtr;
		slot->candidate_restart_valid = InvalidXLogRecPtr;

		slot->in_use = true;
		slot->active = false;

		restored = true;
		break;
	}

	if (!restored)
		ereport(PANIC,
				(errmsg("too many replication slots active before shutdown"),
				 errhint("Increase max_replication_slots and try again.")));
}
/* * copy one file */ void copy_file(char *fromfile, char *tofile) { char *buffer; int srcfd; int dstfd; int nbytes; off_t offset; /* Use palloc to ensure we get a maxaligned buffer */ #define COPY_BUF_SIZE (8 * BLCKSZ) buffer = palloc(COPY_BUF_SIZE); /* * Open the files */ srcfd = BasicOpenFile(fromfile, O_RDONLY | PG_BINARY, 0); if (srcfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", fromfile))); dstfd = BasicOpenFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR); if (dstfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tofile))); /* * Do the data copying. */ for (offset = 0;; offset += nbytes) { /* If we got a cancel signal during the copy of the file, quit */ CHECK_FOR_INTERRUPTS(); nbytes = read(srcfd, buffer, COPY_BUF_SIZE); if (nbytes < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", fromfile))); if (nbytes == 0) break; errno = 0; if ((int) write(dstfd, buffer, nbytes) != nbytes) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tofile))); } /* * We fsync the files later but first flush them to avoid spamming the * cache and hopefully get the kernel to start writing them out before * the fsync comes. Ignore any error, since it's only a hint. */ (void) pg_flush_data(dstfd, offset, nbytes); } if (close(dstfd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tofile))); close(srcfd); pfree(buffer); }
/*
 * Permanently drop the currently acquired replication slot which will be
 * released by the point this function returns.
 */
static void
ReplicationSlotDropAcquired(void)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	ReplicationSlot *slot = MyReplicationSlot;

	Assert(MyReplicationSlot != NULL);

	/* slot isn't acquired anymore */
	MyReplicationSlot = NULL;

	/*
	 * If some other backend ran this code concurrently with us, we might try
	 * to delete a slot with a certain name while someone else was trying to
	 * create a slot with the same name.
	 */
	LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);

	/* Generate pathnames. */
	sprintf(path, "pg_replslot/%s", NameStr(slot->data.name));
	sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name));

	/*
	 * Rename the slot directory on disk, so that we'll no longer recognize
	 * this as a valid slot.  Note that if this fails, we've got to mark the
	 * slot inactive before bailing out.  If we're dropping a ephemeral slot,
	 * we better never fail hard as the caller won't expect the slot to
	 * survive and this might get called during error handling.
	 */
	if (rename(path, tmppath) == 0)
	{
		/*
		 * We need to fsync() the directory we just renamed and its parent to
		 * make sure that our changes are on disk in a crash-safe fashion.  If
		 * fsync() fails, we can't be sure whether the changes are on disk or
		 * not.  For now, we handle that by panicking;
		 * StartupReplicationSlots() will try to straighten it out after
		 * restart.
		 */
		START_CRIT_SECTION();
		fsync_fname(tmppath, true);
		fsync_fname("pg_replslot", true);
		END_CRIT_SECTION();
	}
	else
	{
		/* volatile qualifier: stores must not be reordered past the spinlock */
		volatile ReplicationSlot *vslot = slot;

		/* an ephemeral slot may fail softly; a persistent one must error */
		bool		fail_softly = slot->data.persistency == RS_EPHEMERAL;

		SpinLockAcquire(&slot->mutex);
		vslot->active = false;
		SpinLockRelease(&slot->mutex);

		ereport(fail_softly ? WARNING : ERROR,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						path, tmppath)));
	}

	/*
	 * The slot is definitely gone.  Lock out concurrent scans of the array
	 * long enough to kill it.  It's OK to clear the active flag here without
	 * grabbing the mutex because nobody else can be scanning the array here,
	 * and nobody can be attached to this slot and thus access it without
	 * scanning the array.
	 */
	LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
	slot->active = false;
	slot->in_use = false;
	LWLockRelease(ReplicationSlotControlLock);

	/*
	 * Slot is dead and doesn't prevent resource removal anymore, recompute
	 * limits.
	 */
	ReplicationSlotsComputeRequiredXmin(false);
	ReplicationSlotsComputeRequiredLSN();

	/*
	 * If removing the directory fails, the worst thing that will happen is
	 * that the user won't be able to create a new slot with the same name
	 * until the next server restart.  We warn about it, but that's all.
	 */
	if (!rmtree(tmppath, true))
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not remove directory \"%s\"", tmppath)));

	/*
	 * We release this at the very end, so that nobody starts trying to create
	 * a slot while we're still cleaning up the detritus of the old one.
	 */
	LWLockRelease(ReplicationSlotAllocationLock);
}
/* * Actually do a base backup for the specified tablespaces. * * This is split out mainly to avoid complaints about "variable might be * clobbered by longjmp" from stupider versions of gcc. */ static void perform_base_backup(basebackup_options *opt, DIR *tblspcdir) { XLogRecPtr startptr; TimeLineID starttli; XLogRecPtr endptr; TimeLineID endtli; char *labelfile; int datadirpathlen; datadirpathlen = strlen(DataDir); backup_started_in_recovery = RecoveryInProgress(); startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli, &labelfile); /* * Once do_pg_start_backup has been called, ensure that any failure causes * us to abort the backup so we don't "leak" a backup counter. For this reason, * *all* functionality between do_pg_start_backup() and do_pg_stop_backup() * should be inside the error cleanup block! */ PG_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0); { List *tablespaces = NIL; ListCell *lc; struct dirent *de; tablespaceinfo *ti; SendXlogRecPtrResult(startptr, starttli); /* * Calculate the relative path of temporary statistics directory in order * to skip the files which are located in that directory later. 
*/ if (is_absolute_path(pgstat_stat_directory) && strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0) statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1); else if (strncmp(pgstat_stat_directory, "./", 2) != 0) statrelpath = psprintf("./%s", pgstat_stat_directory); else statrelpath = pgstat_stat_directory; /* Collect information about all tablespaces */ while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL) { char fullpath[MAXPGPATH]; char linkpath[MAXPGPATH]; char *relpath = NULL; int rllen; /* Skip special stuff */ if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) continue; snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name); #if defined(HAVE_READLINK) || defined(WIN32) rllen = readlink(fullpath, linkpath, sizeof(linkpath)); if (rllen < 0) { ereport(WARNING, (errmsg("could not read symbolic link \"%s\": %m", fullpath))); continue; } else if (rllen >= sizeof(linkpath)) { ereport(WARNING, (errmsg("symbolic link \"%s\" target is too long", fullpath))); continue; } linkpath[rllen] = '\0'; /* * Relpath holds the relative path of the tablespace directory * when it's located within PGDATA, or NULL if it's located * elsewhere. */ if (rllen > datadirpathlen && strncmp(linkpath, DataDir, datadirpathlen) == 0 && IS_DIR_SEP(linkpath[datadirpathlen])) relpath = linkpath + datadirpathlen + 1; ti = palloc(sizeof(tablespaceinfo)); ti->oid = pstrdup(de->d_name); ti->path = pstrdup(linkpath); ti->rpath = relpath ? pstrdup(relpath) : NULL; ti->size = opt->progress ? sendTablespace(fullpath, true) : -1; tablespaces = lappend(tablespaces, ti); #else /* * If the platform does not have symbolic links, it should not be * possible to have tablespaces - clearly somebody else created * them. Warn about it and ignore. 
*/ ereport(WARNING, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("tablespaces are not supported on this platform"))); #endif } /* Add a node for the base directory at the end */ ti = palloc0(sizeof(tablespaceinfo)); ti->size = opt->progress ? sendDir(".", 1, true, tablespaces) : -1; tablespaces = lappend(tablespaces, ti); /* Send tablespace header */ SendBackupHeader(tablespaces); /* Setup and activate network throttling, if client requested it */ if (opt->maxrate > 0) { throttling_sample = (int64) opt->maxrate * (int64) 1024 / THROTTLING_FREQUENCY; /* * The minimum amount of time for throttling_sample bytes to be * transfered. */ elapsed_min_unit = USECS_PER_SEC / THROTTLING_FREQUENCY; /* Enable throttling. */ throttling_counter = 0; /* The 'real data' starts now (header was ignored). */ throttled_last = GetCurrentIntegerTimestamp(); } else { /* Disable throttling. */ throttling_counter = -1; } /* Send off our tablespaces one by one */ foreach(lc, tablespaces) { tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc); StringInfoData buf; /* Send CopyOutResponse message */ pq_beginmessage(&buf, 'H'); pq_sendbyte(&buf, 0); /* overall format */ pq_sendint(&buf, 0, 2); /* natts */ pq_endmessage(&buf); if (ti->path == NULL) { struct stat statbuf; /* In the main tar, include the backup_label first... */ sendFileWithContent(BACKUP_LABEL_FILE, labelfile); /* ... then the bulk of the files ... */ sendDir(".", 1, false, tablespaces); /* ... and pg_control after everything else. */ if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat control file \"%s\": %m", XLOG_CONTROL_FILE))); sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false); } else sendTablespace(ti->path, false); /* * If we're including WAL, and this is the main data directory we * don't terminate the tar stream here. Instead, we will append * the xlog files below and terminate it then. 
This is safe since * the main data directory is always sent *last*. */ if (opt->includewal && ti->path == NULL) { Assert(lnext(lc) == NULL); } else pq_putemptymessage('c'); /* CopyDone */ } }
/*
 * Shared functionality between saving and creating a replication slot.
 *
 * Writes the slot's persistent state to <dir>/state via a temp file +
 * rename, then fsyncs file and directories.  'elevel' controls how write
 * failures are reported; when elevel < ERROR the function returns after
 * reporting instead of aborting.
 *
 * BUG FIX: every error path previously returned with
 * slot->io_in_progress_lock still held, which for elevel < ERROR (e.g.
 * routine checkpoints) left the lock stuck forever.  Each early return now
 * releases the lock first, preserving errno across LWLockRelease() so the
 * %m in the message stays correct.
 */
static void
SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
{
	char		tmppath[MAXPGPATH];
	char		path[MAXPGPATH];
	int			fd;
	ReplicationSlotOnDisk cp;
	bool		was_dirty;

	/* first check whether there's something to write out */
	{
		volatile ReplicationSlot *vslot = slot;

		SpinLockAcquire(&vslot->mutex);
		was_dirty = vslot->dirty;
		vslot->just_dirtied = false;
		SpinLockRelease(&vslot->mutex);
	}

	/* and don't do anything if there's nothing to write */
	if (!was_dirty)
		return;

	LWLockAcquire(slot->io_in_progress_lock, LW_EXCLUSIVE);

	/* silence valgrind :( */
	memset(&cp, 0, sizeof(ReplicationSlotOnDisk));

	sprintf(tmppath, "%s/state.tmp", dir);
	sprintf(path, "%s/state", dir);

	fd = OpenTransientFile(tmppath,
						   O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
						   S_IRUSR | S_IWUSR);
	if (fd < 0)
	{
		/*
		 * If not an ERROR, release the lock before returning; if it is an
		 * ERROR, error recovery releases it anyway, and an explicit release
		 * is harmless.  LWLockRelease() may clobber errno, so save it.
		 */
		int			save_errno = errno;

		LWLockRelease(slot->io_in_progress_lock);
		errno = save_errno;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
						tmppath)));
		return;
	}

	cp.magic = SLOT_MAGIC;
	INIT_CRC32(cp.checksum);
	cp.version = 1;
	cp.length = ReplicationSlotOnDiskDynamicSize;

	/* copy the persistent state under the spinlock */
	SpinLockAcquire(&slot->mutex);

	memcpy(&cp.slotdata, &slot->data, sizeof(ReplicationSlotPersistentData));

	SpinLockRelease(&slot->mutex);

	/* checksum covers only the dynamic (post-header) portion */
	COMP_CRC32(cp.checksum,
			   (char *) (&cp) + ReplicationSlotOnDiskConstantSize,
			   ReplicationSlotOnDiskDynamicSize);

	if ((write(fd, &cp, sizeof(cp))) != sizeof(cp))
	{
		int			save_errno = errno;

		CloseTransientFile(fd);
		LWLockRelease(slot->io_in_progress_lock);
		errno = save_errno;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m",
						tmppath)));
		return;
	}

	/* fsync the temporary file */
	if (pg_fsync(fd) != 0)
	{
		int			save_errno = errno;

		CloseTransientFile(fd);
		LWLockRelease(slot->io_in_progress_lock);
		errno = save_errno;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m",
						tmppath)));
		return;
	}

	CloseTransientFile(fd);

	/* rename to permanent file, fsync file and directory */
	if (rename(tmppath, path) != 0)
	{
		int			save_errno = errno;

		LWLockRelease(slot->io_in_progress_lock);
		errno = save_errno;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						tmppath, path)));
		return;
	}

	/* Check CreateSlot() for the reasoning of using a crit. section. */
	START_CRIT_SECTION();

	fsync_fname(path, false);
	fsync_fname((char *) dir, true);
	fsync_fname("pg_replslot", true);

	END_CRIT_SECTION();

	/*
	 * Successfully wrote, unset dirty bit, unless somebody dirtied again
	 * already.
	 */
	{
		volatile ReplicationSlot *vslot = slot;

		SpinLockAcquire(&vslot->mutex);
		if (!vslot->just_dirtied)
			vslot->dirty = false;
		SpinLockRelease(&vslot->mutex);
	}

	LWLockRelease(slot->io_in_progress_lock);
}
/* * pgsymlink - uses Win32 junction points * * For reference: http://www.codeproject.com/KB/winsdk/junctionpoints.aspx */ int pgsymlink(const char *oldpath, const char *newpath) { HANDLE dirhandle; DWORD len; char buffer[MAX_PATH * sizeof(WCHAR) + sizeof(REPARSE_JUNCTION_DATA_BUFFER)]; char nativeTarget[MAX_PATH]; char *p = nativeTarget; REPARSE_JUNCTION_DATA_BUFFER *reparseBuf = (REPARSE_JUNCTION_DATA_BUFFER *) buffer; CreateDirectory(newpath, 0); dirhandle = CreateFile(newpath, GENERIC_READ | GENERIC_WRITE, 0, 0, OPEN_EXISTING, FILE_FLAG_OPEN_REPARSE_POINT | FILE_FLAG_BACKUP_SEMANTICS, 0); if (dirhandle == INVALID_HANDLE_VALUE) return -1; /* make sure we have an unparsed native win32 path */ if (memcmp("\\??\\", oldpath, 4)) sprintf(nativeTarget, "\\??\\%s", oldpath); else strcpy(nativeTarget, oldpath); while ((p = strchr(p, '/')) != NULL) *p++ = '\\'; len = strlen(nativeTarget) * sizeof(WCHAR); reparseBuf->ReparseTag = IO_REPARSE_TAG_MOUNT_POINT; reparseBuf->ReparseDataLength = len + 12; reparseBuf->Reserved = 0; reparseBuf->SubstituteNameOffset = 0; reparseBuf->SubstituteNameLength = len; reparseBuf->PrintNameOffset = len + sizeof(WCHAR); reparseBuf->PrintNameLength = 0; MultiByteToWideChar(CP_ACP, 0, nativeTarget, -1, reparseBuf->PathBuffer, MAX_PATH); /* * FSCTL_SET_REPARSE_POINT is coded differently depending on SDK version; * we use our own definition */ if (!DeviceIoControl(dirhandle, CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 41, METHOD_BUFFERED, FILE_ANY_ACCESS), reparseBuf, reparseBuf->ReparseDataLength + REPARSE_JUNCTION_DATA_BUFFER_HEADER_SIZE, 0, 0, &len, 0)) { LPSTR msg; errno = 0; FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, NULL, GetLastError(), MAKELANGID(LANG_ENGLISH, SUBLANG_DEFAULT), (LPSTR) &msg, 0, NULL); #ifndef FRONTEND ereport(ERROR, (errcode_for_file_access(), errmsg("could not set junction for \"%s\": %s", nativeTarget, msg))); #else fprintf(stderr, _("could not set junction for \"%s\": %s\n"), nativeTarget, msg); 
#endif LocalFree(msg); CloseHandle(dirhandle); RemoveDirectory(newpath); return -1; } CloseHandle(dirhandle); return 0; }
/*
 * ExportSnapshot
 *		Export the snapshot to a file so that other backends can import it.
 *		Returns the token (the file name) that can be used to import this
 *		snapshot.
 */
char *
ExportSnapshot(Snapshot snapshot)
{
	TransactionId topXid;
	TransactionId *children;
	int			nchildren;
	int			addTopXid;
	StringInfoData buf;
	FILE	   *f;
	int			i;
	MemoryContext oldcxt;
	char		path[MAXPGPATH];
	char		pathtmp[MAXPGPATH];

	/*
	 * It's tempting to call RequireTransactionChain here, since it's not very
	 * useful to export a snapshot that will disappear immediately afterwards.
	 * However, we haven't got enough information to do that, since we don't
	 * know if we're at top level or not.  For example, we could be inside a
	 * plpgsql function that is going to fire off other transactions via
	 * dblink.  Rather than disallow perfectly legitimate usages, don't make a
	 * check.
	 *
	 * Also note that we don't make any restriction on the transaction's
	 * isolation level; however, importers must check the level if they are
	 * serializable.
	 */

	/*
	 * This will assign a transaction ID if we do not yet have one.
	 */
	topXid = GetTopTransactionId();

	/*
	 * We cannot export a snapshot from a subtransaction because there's no
	 * easy way for importers to verify that the same subtransaction is still
	 * running.
	 */
	if (IsSubTransaction())
		ereport(ERROR,
				(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
				 errmsg("cannot export a snapshot from a subtransaction")));

	/*
	 * We do however allow previous committed subtransactions to exist.
	 * Importers of the snapshot must see them as still running, so get their
	 * XIDs to add them to the snapshot.
	 */
	nchildren = xactGetCommittedChildren(&children);

	/*
	 * Copy the snapshot into TopTransactionContext, add it to the
	 * exportedSnapshots list, and mark it pseudo-registered.  We do this to
	 * ensure that the snapshot's xmin is honored for the rest of the
	 * transaction.  (Right now, because SnapshotResetXmin is so stupid, this
	 * is overkill; but later we might make that routine smarter.)
	 */
	snapshot = CopySnapshot(snapshot);

	oldcxt = MemoryContextSwitchTo(TopTransactionContext);
	exportedSnapshots = lappend(exportedSnapshots, snapshot);
	MemoryContextSwitchTo(oldcxt);

	/* pseudo-registration: keeps the snapshot pinned for the transaction */
	snapshot->regd_count++;
	RegisteredSnapshots++;

	/*
	 * Fill buf with a text serialization of the snapshot, plus identification
	 * data about this transaction.  The format expected by ImportSnapshot is
	 * pretty rigid: each line must be fieldname:value.
	 */
	initStringInfo(&buf);

	appendStringInfo(&buf, "xid:%u\n", topXid);
	appendStringInfo(&buf, "dbid:%u\n", MyDatabaseId);
	appendStringInfo(&buf, "iso:%d\n", XactIsoLevel);
	appendStringInfo(&buf, "ro:%d\n", XactReadOnly);

	appendStringInfo(&buf, "xmin:%u\n", snapshot->xmin);
	appendStringInfo(&buf, "xmax:%u\n", snapshot->xmax);

	/*
	 * We must include our own top transaction ID in the top-xid data, since
	 * by definition we will still be running when the importing transaction
	 * adopts the snapshot, but GetSnapshotData never includes our own XID in
	 * the snapshot.  (There must, therefore, be enough room to add it.)
	 *
	 * However, it could be that our topXid is after the xmax, in which case
	 * we shouldn't include it because xip[] members are expected to be before
	 * xmax.  (We need not make the same check for subxip[] members, see
	 * snapshot.h.)
	 */
	addTopXid = TransactionIdPrecedes(topXid, snapshot->xmax) ? 1 : 0;
	appendStringInfo(&buf, "xcnt:%d\n", snapshot->xcnt + addTopXid);
	for (i = 0; i < snapshot->xcnt; i++)
		appendStringInfo(&buf, "xip:%u\n", snapshot->xip[i]);
	if (addTopXid)
		appendStringInfo(&buf, "xip:%u\n", topXid);

	/*
	 * Similarly, we add our subcommitted child XIDs to the subxid data. Here,
	 * we have to cope with possible overflow.
	 */
	if (snapshot->suboverflowed ||
		snapshot->subxcnt + nchildren > GetMaxSnapshotSubxidCount())
		appendStringInfoString(&buf, "sof:1\n");
	else
	{
		appendStringInfoString(&buf, "sof:0\n");
		appendStringInfo(&buf, "sxcnt:%d\n", snapshot->subxcnt + nchildren);
		for (i = 0; i < snapshot->subxcnt; i++)
			appendStringInfo(&buf, "sxp:%u\n", snapshot->subxip[i]);
		for (i = 0; i < nchildren; i++)
			appendStringInfo(&buf, "sxp:%u\n", children[i]);
	}
	appendStringInfo(&buf, "rec:%u\n", snapshot->takenDuringRecovery);

	/*
	 * Now write the text representation into a file.  We first write to a
	 * ".tmp" filename, and rename to final filename if no error.  This
	 * ensures that no other backend can read an incomplete file
	 * (ImportSnapshot won't allow it because of its valid-characters check).
	 */
	XactExportFilePath(pathtmp, topXid, list_length(exportedSnapshots), ".tmp");
	if (!(f = AllocateFile(pathtmp, PG_BINARY_W)))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", pathtmp)));

	if (fwrite(buf.data, buf.len, 1, f) != 1)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m", pathtmp)));

	/* no fsync() since file need not survive a system crash */

	if (FreeFile(f))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m", pathtmp)));

	/*
	 * Now that we have written everything into a .tmp file, rename the file
	 * to remove the .tmp suffix.
	 */
	XactExportFilePath(path, topXid, list_length(exportedSnapshots), "");

	if (rename(pathtmp, path) < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						pathtmp, path)));

	/*
	 * The basename of the file is what we return from pg_export_snapshot().
	 * It's already in path in a textual format and we know that the path
	 * starts with SNAPSHOT_EXPORT_DIR.  Skip over the prefix and the slash
	 * and pstrdup it so as not to return the address of a local variable.
	 */
	return pstrdup(path + strlen(SNAPSHOT_EXPORT_DIR) + 1);
}
/* * Import data into GPDB. */ Datum demoprot_import(PG_FUNCTION_ARGS) { extprotocol_t *myData; char *data; int datlen; size_t nread = 0; /* Must be called via the external table format manager */ if (!CALLED_AS_EXTPROTOCOL(fcinfo)) elog(ERROR, "extprotocol_import: not called by external protocol manager"); /* Get our internal description of the protocol */ myData = (extprotocol_t *) EXTPROTOCOL_GET_USER_CTX(fcinfo); if(EXTPROTOCOL_IS_LAST_CALL(fcinfo)) { /* we're done receiving data. close our connection */ if(myData && myData->file) if(fclose(myData->file)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", myData->filename))); PG_RETURN_INT32(0); } if (myData == NULL) { /* first call. do any desired init */ const char *p_name = "demoprot"; DemoUri *parsed_url; char *url = EXTPROTOCOL_GET_URL(fcinfo); myData = palloc(sizeof(extprotocol_t)); myData->url = pstrdup(url); parsed_url = ParseDemoUri(myData->url); myData->filename = pstrdup(parsed_url->path); if(strcasecmp(parsed_url->protocol, p_name) != 0) elog(ERROR, "internal error: demoprot called with a different protocol (%s)", parsed_url->protocol); FreeDemoUri(parsed_url); /* open the destination file (or connect to remote server in other cases) */ myData->file = fopen(myData->filename, "r"); if (myData->file == NULL) ereport(ERROR, (errcode_for_file_access(), errmsg("demoprot_import: could not open file \"%s\" for reading: %m", myData->filename), errOmitLocation(true))); EXTPROTOCOL_SET_USER_CTX(fcinfo, myData); } /* ======================================================================= * DO THE IMPORT * ======================================================================= */ data = EXTPROTOCOL_GET_DATABUF(fcinfo); datlen = EXTPROTOCOL_GET_DATALEN(fcinfo); if(datlen > 0) { nread = fread(data, 1, datlen, myData->file); if (ferror(myData->file)) ereport(ERROR, (errcode_for_file_access(), errmsg("demoprot_import: could not write to file \"%s\": %m", myData->filename))); } 
PG_RETURN_INT32((int)nread); }
/*
 * Opens the next segment file to write.  The file must already exist.
 * This routine is responsible for seeking to the proper write location
 * given the logical EOF.
 *
 * The file is opened twice: first (read flavor) to load the existing
 * parquet footer/metadata, then (write flavor) positioned at logicalEof
 * for appending new row groups.
 *
 * @filePathName: The name of the segment file to open.
 * @logicalEof: The last committed write transaction's EOF
 * value to use as the end of the segment file.
 * @parquet_file The file handler of segment file
 */
static void OpenSegmentFile(
		MirroredAppendOnlyOpen *mirroredOpen,
		char *filePathName,
		int64 logicalEof,
		RelFileNode *relFileNode,
		int32 segmentFileNum,
		char *relname,
		File *parquet_file,
		File *parquet_file_previous,
		CompactProtocol **protocol_read,
		TupleDesc tableAttrs,
		ParquetMetadata *parquetMetadata,
		int64 *fileLen,
		int64 *fileLen_uncompressed,
		int *previous_rowgroupcnt)
{
	int			primaryError;
	File		file;
	int64		seekResult;

	Assert(filePathName != NULL);

	bool		metadataExist = false;

	/*
	 * Open the file for metadata reading.
	 */
	MirroredAppendOnly_OpenReadWrite(mirroredOpen, relFileNode, segmentFileNum,
									 relname, logicalEof, true, &primaryError);
	if (primaryError != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("file open error when opening file "
						"'%s' for relation '%s': %s", filePathName, relname,
						strerror(primaryError))));

	*parquet_file_previous = mirroredOpen->primaryFile;

	/* Determine the physical file size so we can sanity-check logicalEof. */
	int64		fileSize = FileSeek(*parquet_file_previous, 0, SEEK_END);

	if (fileSize < 0){
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("file seek error in file '%s' for relation "
						"'%s'", filePathName, relname)));
	}

	/* The committed logical EOF can never exceed the physical size. */
	if (logicalEof > fileSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERNAL_ERROR),
				 errmsg("logical eof exceeds file size in file '%s' for relation '%s'",
						filePathName, relname)));
	}

	/*read parquet footer, get metadata information before rowgroup metadata*/
	metadataExist = readParquetFooter(*parquet_file_previous, parquetMetadata,
									  protocol_read, logicalEof, filePathName);

	*previous_rowgroupcnt = (*parquetMetadata)->blockCount;

	/*
	 * Open the file for writing.
	 */
	MirroredAppendOnly_OpenReadWrite(mirroredOpen, relFileNode, segmentFileNum,
									 relname, logicalEof, false, &primaryError);
	if (primaryError != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("file open error when opening file '%s' "
						"for relation '%s': %s", filePathName, relname,
						strerror(primaryError))));

	file = mirroredOpen->primaryFile;

	/*
	 * If the physical position differs from the committed logical EOF, a
	 * previous transaction aborted after writing; discard the garbage tail.
	 */
	seekResult = FileNonVirtualTell(file);
	if (seekResult != logicalEof)
	{
		/* previous transaction is aborted truncate file*/
		if (FileTruncate(file, logicalEof))
		{
			MirroredAppendOnly_Close(mirroredOpen);
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("file truncate error in file '%s' for relation "
							"'%s' to position " INT64_FORMAT ": %s",
							filePathName, relname, logicalEof,
							strerror(errno))));
		}
	}

	*parquet_file = file;

	/*if metadata not exist, should initialize the metadata, and write out file header*/
	if (metadataExist == false)
	{
		/* init parquet metadata information, init schema information using table attributes,
		 * and may get existing information from data file*/
		initparquetMetadata(*parquetMetadata, tableAttrs, *parquet_file);

		/*should judge whether file already exists, if a new file, should write header out*/
		writeParquetHeader(*parquet_file, filePathName, fileLen, fileLen_uncompressed);
	}
	else
	{
		/* Existing metadata must be compatible with the table's schema. */
		if (!checkAndSyncMetadata(*parquetMetadata, tableAttrs))
		{
			ereport(ERROR,
					(errcode(ERRCODE_GP_INTERNAL_ERROR),
					 errmsg("parquet storage write file's metadata incompatible "
							"with table's schema for relation '%s'.",
							relname)));
		}
	}
}
/*
 * Each database using a table space is isolated into its own name space
 * by a subdirectory named for the database OID.  On first creation of an
 * object in the tablespace, create the subdirectory.  If the subdirectory
 * already exists, fall through quietly.
 *
 * isRedo indicates that we are creating an object during WAL replay.
 * In this case we will cope with the possibility of the tablespace
 * directory not being there either --- this could happen if we are
 * replaying an operation on a table in a subsequently-dropped tablespace.
 * We handle this by making a directory in the place where the tablespace
 * symlink would normally be.  This isn't an exact replay of course, but
 * it's the best we can do given the available information.
 *
 * If tablespaces are not supported, we still need it in case we have to
 * re-create a database subdirectory (of $PGDATA/base) during WAL replay.
 */
void
TablespaceCreateDbspace(Oid spcNode, Oid dbNode, bool isRedo)
{
	struct stat st;
	char	   *dir;

	/*
	 * The global tablespace doesn't have per-database subdirectories, so
	 * nothing to do for it.
	 */
	if (spcNode == GLOBALTABLESPACE_OID)
		return;

	Assert(OidIsValid(spcNode));
	Assert(OidIsValid(dbNode));

	dir = GetDatabasePath(dbNode, spcNode);

	if (stat(dir, &st) < 0)
	{
		/* Directory does not exist? */
		if (errno == ENOENT)
		{
			/*
			 * Acquire TablespaceCreateLock to ensure that no DROP TABLESPACE
			 * or TablespaceCreateDbspace is running concurrently.
			 */
			LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);

			/*
			 * Recheck to see if someone created the directory while we were
			 * waiting for lock.
			 */
			if (stat(dir, &st) == 0 && S_ISDIR(st.st_mode))
			{
				/* Directory was created */
			}
			else
			{
				/* Directory creation failed? */
				if (mkdir(dir, S_IRWXU) < 0)
				{
					char	   *parentdir;

					/*
					 * Failure other than not exists or not in WAL replay?
					 * (errno still holds the mkdir failure code here.)
					 */
					if (errno != ENOENT || !isRedo)
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not create directory \"%s\": %m",
										dir)));

					/*
					 * Parent directories are missing during WAL replay, so
					 * continue by creating simple parent directories rather
					 * than a symlink.
					 */

					/* create two parents up if not exist */
					parentdir = pstrdup(dir);
					get_parent_directory(parentdir);
					get_parent_directory(parentdir);
					/* Can't create parent and it doesn't already exist? */
					if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST)
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not create directory \"%s\": %m",
										parentdir)));
					pfree(parentdir);

					/* create one parent up if not exist */
					parentdir = pstrdup(dir);
					get_parent_directory(parentdir);
					/* Can't create parent and it doesn't already exist? */
					if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST)
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not create directory \"%s\": %m",
										parentdir)));
					pfree(parentdir);

					/* Create database directory */
					if (mkdir(dir, S_IRWXU) < 0)
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not create directory \"%s\": %m",
										dir)));
				}
			}

			LWLockRelease(TablespaceCreateLock);
		}
		else
		{
			/* stat failed for some reason other than nonexistence */
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not stat directory \"%s\": %m", dir)));
		}
	}
	else
	{
		/* Is it not a directory? */
		if (!S_ISDIR(st.st_mode))
			ereport(ERROR,
					(errcode(ERRCODE_WRONG_OBJECT_TYPE),
					 errmsg("\"%s\" exists but is not a directory",
							dir)));
	}

	pfree(dir);
}
/*
 * Perform a large write i/o.
 *
 * Flushes the accumulated large-write buffer (largeWriteMemory,
 * largeWriteLen bytes) to the segment file through the mirrored
 * append-only layer, then advances largeWritePosition and resets
 * largeWriteLen.  Errors out (ereport ERROR) on any write failure.
 */
static void
BufferedAppendWrite(
	BufferedAppend *bufferedAppend)
{
	int32		writeLen;
	uint8	   *largeWriteMemory;
	int			actualLen;

	writeLen = bufferedAppend->largeWriteLen;
	Assert(bufferedAppend->largeWriteLen > 0);
	largeWriteMemory = bufferedAppend->largeWriteMemory;

#ifdef USE_ASSERT_CHECKING
	{
		int64		currentWritePosition;

		/* Sanity check: the physical file position must match our notion. */
		currentWritePosition = FileNonVirtualCurSeek(bufferedAppend->file);
		if (currentWritePosition < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("unable to get current position in table \"%s\" for file \"%s\": %m",
							bufferedAppend->relationName,
							bufferedAppend->filePathName)));

		if (currentWritePosition != bufferedAppend->largeWritePosition)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("Current position mismatch actual "
							INT64_FORMAT ", expected " INT64_FORMAT " in table \"%s\" for file \"%s\"",
							currentWritePosition, bufferedAppend->largeWritePosition,
							bufferedAppend->relationName,
							bufferedAppend->filePathName)));
	}
#endif

	while (writeLen > 0)
	{
		int			primaryError;
		bool		mirrorDataLossOccurred;

		MirroredAppendOnly_Append(
								  &bufferedAppend->mirroredOpen,
								  (char *) largeWriteMemory,
								  writeLen,
								  &primaryError,
								  &mirrorDataLossOccurred);

		/*
		 * The failure cause comes back in primaryError; errno is not
		 * guaranteed to be meaningful at this point, so report
		 * strerror(primaryError) rather than %m (consistent with the
		 * other MirroredAppendOnly callers in this file).
		 */
		if (primaryError != 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("Could not write in table \"%s\" to segment file \"%s\": %s",
							bufferedAppend->relationName,
							bufferedAppend->filePathName,
							strerror(primaryError))));

		elogif(Debug_appendonly_print_append_block, LOG,
			   "Append-Only storage write: table \"%s\", segment file \"%s\", write position " INT64_FORMAT ", "
			   "writeLen %d (equals large write length %d is %s)",
			   bufferedAppend->relationName,
			   bufferedAppend->filePathName,
			   bufferedAppend->largeWritePosition,
			   writeLen,
			   bufferedAppend->largeWriteLen,
			   (writeLen == bufferedAppend->largeWriteLen ? "true" : "false"));

		/*
		 * Currently the whole buffer is handed off in one append, so the
		 * loop executes exactly once; the chunking structure is kept in
		 * case partial appends are ever needed.
		 */
		actualLen = writeLen;
		writeLen -= actualLen;
		largeWriteMemory += actualLen;
	}

	bufferedAppend->largeWritePosition += bufferedAppend->largeWriteLen;
	bufferedAppend->largeWriteLen = 0;
}