/*
 * checkHadoopGUCs
 *
 * Using the Hadoop connector requires proper setup of GUCs.  This procedure
 * does a sanity check on gp_hadoop_connector_jardir, gp_hadoop_target_version
 * and gp_hadoop_home.  It also updates the GUC gp_hadoop_connector_version
 * for the current gp_hadoop_target_version.
 *
 * It checks the following:
 * 1. $GPHOME/<gp_hadoop_jardir>/$GP_HADOOP_CONN_VERSION.jar must exist.
 * 2. if gp_hadoop_home is set, then gp_hadoop_home must exist.
 *
 * Errors are reported with ERRCODE_EXTERNAL_ROUTINE_EXCEPTION, aborting the
 * current command.
 */
static void
checkHadoopGUCs()
{
	char			gphome[MAXPGPATH];
	StringInfoData	path;
	int				jarFD;

	/*
	 * Check the existence of $GPHOME/<gp_hadoop_jardir>/$GP_HADOOP_CONN_VERSION.jar
	 *
	 * To get $GPHOME, we go from my_exec_path, which is $GPHOME/bin/postgres,
	 * and go up 2 levels.
	 *
	 * Currently, gp_hadoop_connector_jardir is fixed.  We look up
	 * $GP_HADOOP_CONN_VERSION using gp_hadoop_target_version.
	 */
	snprintf(gphome, sizeof(gphome), "%s", my_exec_path);
	get_parent_directory(gphome);	/* strip "/postgres" */
	get_parent_directory(gphome);	/* strip "/bin" */

	initStringInfoOfSize(&path, MAXPGPATH);
	gp_hadoop_connector_version = (char *) getConnectorVersion();
	appendStringInfo(&path, "%s/%s/%s.jar",
					 gphome, gp_hadoop_connector_jardir,
					 gp_hadoop_connector_version);

	/* Probe the jar by opening it read-only; we only care that it exists. */
	jarFD = BasicOpenFile(path.data, O_RDONLY | PG_BINARY, 0);
	if (jarFD == -1)
	{
		ereport(ERROR,
				(errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
				 errmsg("cannot open Hadoop Cross Connect in %s: %m", path.data)));
	}
	close(jarFD);

	/*
	 * Check the existence of gp_hadoop_home, if specified.
	 *
	 * If the user has already specified $HADOOP_HOME in the environment,
	 * then there's no need to set up this GUC.
	 */
	if (strlen(gp_hadoop_home) > 0)
	{
		/*
		 * NOTE(review): gp_hadoop_home is a directory; opening it O_RDONLY
		 * works on most Unixes but can fail on some platforms (e.g. Windows
		 * returns EACCES) -- confirm supported platforms for this check.
		 */
		int		hdHomeFD = BasicOpenFile(gp_hadoop_home, O_RDONLY, 0);

		if (hdHomeFD == -1)
		{
			ereport(ERROR,
					(errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
					 errmsg("cannot open gp_hadoop_home in %s: %m", gp_hadoop_home)));
		}
		close(hdHomeFD);
	}
}
/*
 * fsync_fname -- fsync a file or directory, specified by name
 *
 * Directories are attempted too, but errors that merely indicate the OS
 * doesn't allow/require fsyncing directories are silently ignored.
 */
static void
fsync_fname(char *fname, bool isdir)
{
	int			fd;
	int			rc;

	/*
	 * Some OSs require directories to be opened read-only whereas other
	 * systems don't allow us to fsync files opened read-only; so pick the
	 * open mode based on what we're syncing.
	 */
	fd = BasicOpenFile(fname,
					   (isdir ? O_RDONLY : O_RDWR) | PG_BINARY,
					   S_IRUSR | S_IWUSR);

	if (fd < 0)
	{
		/*
		 * Some OSs don't allow us to open directories at all (Windows
		 * returns EACCES) -- that's fine, just skip the fsync.
		 */
		if (isdir && (errno == EISDIR || errno == EACCES))
			return;

		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
	}

	rc = pg_fsync(fd);

	/* Some OSs don't allow us to fsync directories at all */
	if (rc != 0 && isdir && errno == EBADF)
	{
		close(fd);
		return;
	}

	if (rc != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", fname)));

	close(fd);
}
/*
 * Can the given OID be used as pg_class.relfilenode?
 *
 * As a side-effect, advances OID counter to the given OID and remembers
 * that the OID has been used as a relfilenode, so that the same value
 * doesn't get chosen again.
 *
 * Returns true if no existing relation file collides with the OID.
 */
bool
CheckNewRelFileNodeIsOk(Oid newOid, Oid reltablespace, bool relisshared)
{
	RelFileNode rnode;
	char	   *rpath;
	int			fd;
	bool		collides;

	/*
	 * Advance our current OID counter with the given value, to keep
	 * the counter roughly in sync across all nodes. This ensures
	 * that a GetNewRelFileNode() call after this will not choose the
	 * same OID, and won't have to loop excessively to retry. That
	 * still leaves a race condition, if GetNewRelFileNode() is called
	 * just before CheckNewRelFileNodeIsOk() - UseOidForRelFileNode()
	 * is called to plug that.
	 *
	 * FIXME: handle OID wraparound gracefully.
	 */
	while (GetNewObjectId() < newOid)
		;						/* just advance the counter */

	if (!UseOidForRelFileNode(newOid))
		return false;

	/*
	 * Fix: removed an unused SnapshotData/InitDirtySnapshot here -- this
	 * variant does no index scan, so the snapshot was dead code.
	 */

	/* This should match RelationInitPhysicalAddr */
	rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
	rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId;
	rnode.relNode = newOid;

	/* Check for existing file of same name */
	rpath = relpath(rnode);
	fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

	if (fd >= 0)
	{
		/* definite collision */
		gp_retry_close(fd);
		collides = true;
	}
	else
		collides = false;

	pfree(rpath);

	elog(DEBUG1, "Called CheckNewRelFileNodeIsOk in %s mode for %u / %u / %u. "
		 "collides = %s",
		 (Gp_role == GP_ROLE_EXECUTE ? "execute" :
		  Gp_role == GP_ROLE_UTILITY ? "utility" : "dispatch"),
		 newOid, reltablespace, relisshared,
		 collides ? "true" : "false");

	return !collides;
}
/*
 * Recreates a state file. This is used in WAL replay.
 *
 * Note: content and len don't include CRC.
 *
 * The file is created (or truncated), the payload and its freshly computed
 * CRC are written, and the file is fsync'd -- the end-of-replay checkpoint
 * won't do it for us, there being no GXACT in shared memory yet.
 */
void
RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
{
	char		path[MAXPGPATH];
	pg_crc32	statefile_crc;
	int			fd;

	/* Recompute CRC */
	INIT_CRC32(statefile_crc);
	COMP_CRC32(statefile_crc, content, len);
	FIN_CRC32(statefile_crc);

	TwoPhaseFilePath(path, xid);

	fd = BasicOpenFile(path,
					   O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not recreate two-phase state file \"%s\": %m",
						path)));

	/* Write content and CRC */
	errno = 0;
	if (write(fd, content, len) != len)
	{
		int			save_errno;

		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		/* preserve errno across close(), which may clobber it */
		save_errno = errno;
		close(fd);
		errno = save_errno;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}
	errno = 0;
	if (write(fd, &statefile_crc, sizeof(pg_crc32)) != sizeof(pg_crc32))
	{
		int			save_errno;

		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		save_errno = errno;
		close(fd);
		errno = save_errno;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	/*
	 * We must fsync the file because the end-of-replay checkpoint will not do
	 * so, there being no GXACT in shared memory yet to tell it to.
	 */
	if (pg_fsync(fd) != 0)
	{
		int			save_errno = errno;

		close(fd);
		errno = save_errno;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync two-phase state file: %m")));
	}

	if (close(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close two-phase state file: %m")));
}
/*
 * LruInsert -- re-open a virtual file descriptor and put it at the head of
 * the LRU ring.
 *
 * Returns 0 on success, -1 on re-open failure (with errno set by
 * BasicOpenFile).
 */
static int
LruInsert(File file)
{
	Vfd		   *vfdP;

	Assert(file != 0);

	DO_DB(elog(LOG, "LruInsert %d (%s)",
			   file, VfdCache[file].fileName));

	vfdP = &VfdCache[file];

	if (FileIsNotOpen(file))
	{
		/* Release kernel FDs until we're under the safe limit. */
		while (nfile + numAllocatedDescs >= max_safe_fds)
		{
			if (!ReleaseLruFile())
				break;
		}

		/*
		 * The open could still fail for lack of file descriptors, eg due to
		 * overall system file table being full.  So, be prepared to release
		 * another FD if necessary...
		 */
		vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
								 vfdP->fileMode);

		if (vfdP->fd < 0)
		{
			DO_DB(elog(LOG, "RE_OPEN FAILED: %d", errno));
			return vfdP->fd;
		}
		else
		{
			DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
			++nfile;
		}

		/*
		 * Seek to the saved position so callers see the file exactly as it
		 * was before it got closed out of the LRU.
		 *
		 * NOTE(review): the seek result is only checked via Assert; in
		 * production builds a failed seek would be silently ignored --
		 * confirm this is acceptable here.
		 */
		if (vfdP->seekPos != INT64CONST(0))
		{
			int64		returnValue;

			returnValue = pg_lseek64(vfdP->fd, vfdP->seekPos, SEEK_SET);
			Assert(returnValue != INT64CONST(-1));
		}
	}

	/*
	 * put it at the head of the Lru ring
	 */
	Insert(file);

	return 0;
}
/*
 * GetNewRelFileNode
 *		Generate a new relfilenode number that is unique within the given
 *		tablespace.
 *
 * Note: we don't support using this in bootstrap mode.  All relations
 * created by bootstrap have preassigned OIDs, so there's no need.
 *
 * Loops until a candidate is found whose relation file does not already
 * exist on disk; returns that relfilenode.
 */
Oid
GetNewRelFileNode(Oid reltablespace, bool relisshared)
{
	RelFileNode rnode;
	char	   *rpath;
	int			fd;
	bool		collides = true;

	/* This should match RelationInitPhysicalAddr */
	rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
	rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId;

	do
	{
		CHECK_FOR_INTERRUPTS();

		/* Generate the Relfilenode */
		rnode.relNode = GetNewSegRelfilenode();

		if (!IsOidAcceptable(rnode.relNode))
			continue;

		/* Check for existing file of same name */
		rpath = relpath(rnode);
		fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

		if (fd >= 0)
		{
			/* definite collision */
			gp_retry_close(fd);
			collides = true;
		}
		else
		{
			/*
			 * Here we have a little bit of a dilemma: if errno is something
			 * other than ENOENT, should we declare a collision and loop? In
			 * particular one might think this advisable for, say, EPERM.
			 * However there really shouldn't be any unreadable files in a
			 * tablespace directory, and if the EPERM is actually complaining
			 * that we can't read the directory itself, we'd be in an infinite
			 * loop. In practice it seems best to go ahead regardless of the
			 * errno. If there is a colliding file we will get an smgr
			 * failure when we attempt to create the new relation file.
			 */
			collides = false;
		}

		pfree(rpath);
	} while (collides);

	/* Fix: Oid is unsigned, so print it with %u, not %d */
	elog(DEBUG1, "Calling GetNewRelFileNode returns new relfilenode = %u",
		 rnode.relNode);

	return rnode.relNode;
}
/*
 * PathNameOpenFile -- open a file in an arbitrary directory
 *
 * NB: if the passed pathname is relative (which it usually is),
 * it will be interpreted relative to the process' working directory
 * (which should always be $PGDATA when this code is running).
 *
 * Returns a virtual file descriptor, or -1 on open failure (errno set).
 */
File
PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
	File		file;
	Vfd		   *vfdP;
	char	   *namedup;

	DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
			   fileName, fileFlags, fileMode));

	/*
	 * We need a malloc'd copy of the file name; fail cleanly if no room.
	 */
	namedup = strdup(fileName);
	if (namedup == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));

	file = AllocateVfd();
	vfdP = &VfdCache[file];

	/* Close excess kernel FDs, if any, before opening another. */
	while (nfile + numAllocatedDescs >= max_safe_fds &&
		   ReleaseLruFile())
		;

	vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);

	if (vfdP->fd < 0)
	{
		/* Undo our bookkeeping and report failure to the caller. */
		FreeVfd(file);
		free(namedup);
		return -1;
	}
	++nfile;

	DO_DB(elog(LOG, "PathNameOpenFile: success %d",
			   vfdP->fd));

	Insert(file);

	vfdP->fileName = namedup;
	/* Saved flags are adjusted to be OK for re-opening file */
	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
	vfdP->fileMode = fileMode;
	vfdP->seekPos = 0;
	vfdP->fdstate = 0x0;
	vfdP->resowner = NULL;

	return file;
}
/**
 * @brief Open the next data file and returns its descriptor.
 * @param rnode  [in] RelFileNode of target relation.
 * @param istemp [in] Whether the relation is backend-local (temp).
 * @param blknum [in] Block number to seek.
 * @return File descriptor of the last data file, positioned at blknum's
 *         offset within its segment.  The file is created if absent
 *         (O_CREAT, no O_EXCL) and opened write-only.
 */
static int
open_data_file(RelFileNode rnode, bool istemp, BlockNumber blknum)
{
	int			fd = -1;
	int			ret;
	BlockNumber segno;
	char	   *fname = NULL;

#if PG_VERSION_NUM >= 90100
	/* 9.1+ relpath() takes a RelFileNodeBackend to identify temp relations */
	RelFileNodeBackend bknode;

	bknode.node = rnode;
	bknode.backend = istemp ? MyBackendId : InvalidBackendId;
	fname = relpath(bknode, MAIN_FORKNUM);
#else
	fname = relpath(rnode, MAIN_FORKNUM);
#endif

	/* Relations are split into RELSEG_SIZE-block segment files: name.1, ... */
	segno = blknum / RELSEG_SIZE;
	if (segno > 0)
	{
		/*
		 * The length `+ 12' (dot + up to 10 digits + NUL) follows md.c's
		 * segment-name construction -- presumably _mdfd_openseg() in
		 * backend/storage/smgr/md.c (the original comment said
		 * "_mdfd_openmesg()", likely a typo; confirm against md.c).
		 */
		char	   *tmp = palloc(strlen(fname) + 12);

		sprintf(tmp, "%s.%u", fname, segno);
		pfree(fname);
		fname = tmp;
	}
	fd = BasicOpenFile(fname, O_CREAT | O_WRONLY | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd == -1)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open data file: %m")));
	/* Position at the block's byte offset within this segment. */
	ret = lseek(fd, BLCKSZ * (blknum % RELSEG_SIZE), SEEK_SET);
	if (ret == -1)
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg
				 ("could not seek the end of the data file: %m")));
	}

	pfree(fname);

	return fd;
}
/*
 * Read 'count' bytes of WAL starting at 'startptr' on timeline 'tli' into
 * 'buf', opening and switching segment files as needed.
 *
 * TODO: This is duplicate code with pg_xlogdump, similar to walsender.c, but
 * we currently don't have the infrastructure (elog!) to share it.
 *
 * The open segment fd and position are cached in function-local statics, so
 * one descriptor stays open across calls until the process ends.
 */
static void
XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
{
	char	   *p;
	XLogRecPtr	recptr;
	Size		nbytes;

	/* state persisting across calls: currently open segment and offset */
	static int	sendFile = -1;
	static XLogSegNo sendSegNo = 0;
	static uint32 sendOff = 0;

	p = buf;
	recptr = startptr;
	nbytes = count;

	while (nbytes > 0)
	{
		uint32		startoff;
		int			segbytes;
		int			readbytes;

		startoff = recptr % XLogSegSize;

		/* Is the target location outside the currently open segment? */
		if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo))
		{
			char		path[MAXPGPATH];

			/* Switch to another logfile segment */
			if (sendFile >= 0)
				close(sendFile);

			XLByteToSeg(recptr, sendSegNo);
			XLogFilePath(path, tli, sendSegNo);

			sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);

			if (sendFile < 0)
			{
				/* ENOENT means the segment was recycled/removed already */
				if (errno == ENOENT)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("requested WAL segment %s has already been removed",
									path)));
				else
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not open file \"%s\": %m",
									path)));
			}
			sendOff = 0;
		}

		/* Need to seek in the file? */
		if (sendOff != startoff)
		{
			if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
			{
				char		path[MAXPGPATH];

				XLogFilePath(path, tli, sendSegNo);

				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not seek in log segment %s to offset %u: %m",
								path, startoff)));
			}
			sendOff = startoff;
		}

		/* How many bytes are within this segment? */
		if (nbytes > (XLogSegSize - startoff))
			segbytes = XLogSegSize - startoff;
		else
			segbytes = nbytes;

		readbytes = read(sendFile, p, segbytes);
		if (readbytes <= 0)
		{
			char		path[MAXPGPATH];

			XLogFilePath(path, tli, sendSegNo);

			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read from log segment %s, offset %u, length %lu: %m",
							path, sendOff, (unsigned long) segbytes)));
		}

		/* Update state for read */
		recptr += readbytes;

		sendOff += readbytes;
		nbytes -= readbytes;
		p += readbytes;
	}
}
/*
 * Read and validate the state file for xid.
 *
 * If it looks OK (has a valid magic number and CRC), return the palloc'd
 * contents of the file. Otherwise return NULL.
 *
 * Validation order: open -> stat (size bounds) -> CRC-offset alignment ->
 * full read -> magic/total_len -> CRC.  Any failure returns NULL; open,
 * stat and read failures additionally emit a WARNING.
 */
static char *
ReadTwoPhaseFile(TransactionId xid)
{
	char		path[MAXPGPATH];
	char	   *buf;
	TwoPhaseFileHeader *hdr;
	int			fd;
	struct stat stat;			/* NB: shadows stat(2); only fstat is used */
	uint32		crc_offset;
	pg_crc32	calc_crc,
				file_crc;

	TwoPhaseFilePath(path, xid);

	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
	if (fd < 0)
	{
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not open two-phase state file \"%s\": %m",
						path)));
		return NULL;
	}

	/*
	 * Check file length.  We can determine a lower bound pretty easily. We
	 * set an upper bound to avoid palloc() failure on a corrupt file, though
	 * we can't guarantee that we won't get an out of memory error anyway,
	 * even on a valid file.
	 */
	if (fstat(fd, &stat))
	{
		close(fd);
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not stat two-phase state file \"%s\": %m",
						path)));
		return NULL;
	}

	if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) +
						MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) +
						sizeof(pg_crc32)) ||
		stat.st_size > MaxAllocSize)
	{
		close(fd);
		return NULL;
	}

	/* The CRC trails the payload; it must land on a MAXALIGN boundary. */
	crc_offset = stat.st_size - sizeof(pg_crc32);
	if (crc_offset != MAXALIGN(crc_offset))
	{
		close(fd);
		return NULL;
	}

	/*
	 * OK, slurp in the file.
	 */
	buf = (char *) palloc(stat.st_size);

	if (read(fd, buf, stat.st_size) != stat.st_size)
	{
		close(fd);
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not read two-phase state file \"%s\": %m",
						path)));
		pfree(buf);
		return NULL;
	}

	close(fd);

	/* Header sanity: magic number and self-declared length must match. */
	hdr = (TwoPhaseFileHeader *) buf;
	if (hdr->magic != TWOPHASE_MAGIC || hdr->total_len != stat.st_size)
	{
		pfree(buf);
		return NULL;
	}

	/* Verify the CRC over everything preceding it. */
	INIT_CRC32(calc_crc);
	COMP_CRC32(calc_crc, buf, crc_offset);
	FIN_CRC32(calc_crc);

	file_crc = *((pg_crc32 *) (buf + crc_offset));

	if (!EQ_CRC32(calc_crc, file_crc))
	{
		pfree(buf);
		return NULL;
	}

	return buf;
}
/*
 * Can the given OID be used as pg_class.relfilenode?
 *
 * If pg_class is supplied, first verify via its OID index that no pg_class
 * row already uses the OID (error out if one does).  Then check for an
 * existing relation file of the same name on disk.  As a side effect,
 * advances the OID counter past newOid so it won't be handed out again.
 */
bool
CheckNewRelFileNodeIsOk(Oid newOid, Oid reltablespace, bool relisshared,
						Relation pg_class)
{
	RelFileNode rnode;
	char	   *rpath;
	int			fd;
	bool		collides;

	if (pg_class)
	{
		Oid			oidIndex;
		Relation	indexrel;
		IndexScanDesc scan;
		ScanKeyData key;

		Assert(!IsBootstrapProcessingMode());
		Assert(pg_class->rd_rel->relhasoids);

		/* The relcache will cache the identity of the OID index for us */
		oidIndex = RelationGetOidIndex(pg_class);

		Assert(OidIsValid(oidIndex));

		indexrel = index_open(oidIndex, AccessShareLock);

		ScanKeyInit(&key,
					(AttrNumber) 1,
					BTEqualStrategyNumber, F_OIDEQ,
					ObjectIdGetDatum(newOid));

		scan = index_beginscan(pg_class, indexrel, SnapshotDirty, 1, &key);

		collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection));

		index_endscan(scan);

		index_close(indexrel, AccessShareLock);

		if (collides)
			/* Fix: Oid is unsigned; print with %u, not %d */
			elog(ERROR, "relfilenode %u already in use in \"pg_class\"",
				 newOid);
	}

	/* This should match RelationInitPhysicalAddr */
	rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
	rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId;
	rnode.relNode = newOid;

	/* Check for existing file of same name */
	rpath = relpath(rnode);
	fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

	if (fd >= 0)
	{
		/* definite collision */
		gp_retry_close(fd);
		collides = true;
	}
	else
		collides = false;

	pfree(rpath);

	if (collides && !relisshared)
		/* Fix: Oid is unsigned; print with %u, not %d */
		elog(ERROR, "oid %u already in use", newOid);

	/* Advance the OID counter past newOid so it can't be chosen again. */
	while (GetNewObjectId() < newOid)
		;

	return !collides;
}
/* * copy one file */ static void copy_file(char *fromfile, char *tofile) { char *buffer; int srcfd; int dstfd; int nbytes; /* Use palloc to ensure we get a maxaligned buffer */ #define COPY_BUF_SIZE (8 * BLCKSZ) buffer = palloc(COPY_BUF_SIZE); /* * Open the files */ srcfd = BasicOpenFile(fromfile, O_RDONLY | PG_BINARY, 0); if (srcfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", fromfile))); dstfd = BasicOpenFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR); if (dstfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tofile))); /* * Do the data copying. */ for (;;) { /* If we got a cancel signal during the copy of the file, quit */ CHECK_FOR_INTERRUPTS(); nbytes = read(srcfd, buffer, COPY_BUF_SIZE); if (nbytes < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", fromfile))); if (nbytes == 0) break; errno = 0; if ((int) write(dstfd, buffer, nbytes) != nbytes) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tofile))); } } /* * Be paranoid here to ensure we catch problems. */ if (pg_fsync(dstfd) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tofile))); if (close(dstfd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tofile))); close(srcfd); pfree(buffer); }
/*
 * Physical write of a page from a buffer slot
 *
 * On failure, we cannot just ereport(ERROR) since caller has put state in
 * shared memory that must be undone.  So, we return FALSE and save enough
 * info in static variables to let SlruReportIOError make the report.
 *
 * For now, assume it's not worth keeping a file pointer open across
 * independent read/write operations.  We do batch operations during
 * SimpleLruFlush, though.
 *
 * fdata is NULL for a standalone write, pointer to open-file info during
 * SimpleLruFlush.
 */
static bool
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
	int			offset = rpageno * BLCKSZ;
	char		path[MAXPGPATH];
	int			fd = -1;
#ifdef XP_TRACE_LRU_WRITE
	/* Fix: declare only when used, avoiding an unused-variable warning */
	struct timeval tv;
#endif

	/*
	 * Honor the write-WAL-before-data rule, if appropriate, so that we do not
	 * write out data before associated WAL records.  This is the same action
	 * performed during FlushBuffer() in the main buffer manager.
	 */
	if (shared->group_lsn != NULL)
	{
		/*
		 * We must determine the largest async-commit LSN for the page. This
		 * is a bit tedious, but since this entire function is a slow path
		 * anyway, it seems better to do this here than to maintain a per-page
		 * LSN variable (which'd need an extra comparison in the
		 * transaction-commit path).
		 */
		XLogRecPtr	max_lsn;
		int			lsnindex,
					lsnoff;

		lsnindex = slotno * shared->lsn_groups_per_page;
		max_lsn = shared->group_lsn[lsnindex++];
		for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
		{
			XLogRecPtr	this_lsn = shared->group_lsn[lsnindex++];

			if (XLByteLT(max_lsn, this_lsn))
				max_lsn = this_lsn;
		}

		if (!XLogRecPtrIsInvalid(max_lsn))
		{
			/*
			 * As noted above, elog(ERROR) is not acceptable here, so if
			 * XLogFlush were to fail, we must PANIC.  This isn't much of a
			 * restriction because XLogFlush is just about all critical
			 * section anyway, but let's make sure.
			 */
			START_CRIT_SECTION();
			XLogFlush(max_lsn);
			END_CRIT_SECTION();
		}
	}

	/*
	 * During a Flush, we may already have the desired file open.
	 */
	if (fdata)
	{
		int			i;

		for (i = 0; i < fdata->num_files; i++)
		{
			if (fdata->segno[i] == segno)
			{
				fd = fdata->fd[i];
				break;
			}
		}
	}

	if (fd < 0)
	{
		/*
		 * If the file doesn't already exist, we should create it.  It is
		 * possible for this to need to happen when writing a page that's not
		 * first in its segment; we assume the OS can cope with that. (Note:
		 * it might seem that it'd be okay to create files only when
		 * SimpleLruZeroPage is called for the first page of a segment.
		 * However, if after a crash and restart the REDO logic elects to
		 * replay the log from a checkpoint before the latest one, then it's
		 * possible that we will get commands to set transaction status of
		 * transactions that have already been truncated from the commit log.
		 * Easiest way to deal with that is to accept references to
		 * nonexistent files here and in SlruPhysicalReadPage.)
		 *
		 * Note: it is possible for more than one backend to be executing this
		 * code simultaneously for different pages of the same file. Hence,
		 * don't use O_EXCL or O_TRUNC or anything like that.
		 */
		SlruFileName(ctl, path, segno);
		fd = BasicOpenFile(path, O_RDWR | O_CREAT | PG_BINARY,
						   S_IRUSR | S_IWUSR);
		if (fd < 0)
		{
			slru_errcause = SLRU_OPEN_FAILED;
			slru_errno = errno;
			return false;
		}

		if (fdata)
		{
			if (fdata->num_files < MAX_FLUSH_BUFFERS)
			{
				fdata->fd[fdata->num_files] = fd;
				fdata->segno[fdata->num_files] = segno;
				fdata->num_files++;
			}
			else
			{
				/*
				 * In the unlikely event that we exceed MAX_FLUSH_BUFFERS,
				 * fall back to treating it as a standalone write.
				 */
				fdata = NULL;
			}
		}
	}

	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
	{
		slru_errcause = SLRU_SEEK_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}

	errno = 0;
	if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		slru_errcause = SLRU_WRITE_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}

#ifdef XP_TRACE_LRU_WRITE
	/*
	 * Fix: recompute the file name here; previously 'path' was read
	 * uninitialized whenever the fd came from the fdata cache (in which
	 * case SlruFileName above was never reached).
	 */
	SlruFileName(ctl, path, segno);
	gettimeofday(&tv, NULL);
	ereport(TRACE_LEVEL,
			(errmsg("%ld.%ld:\tWRITE:\tSlruPhysicalWritePage:\tfile:%s",
					tv.tv_sec, tv.tv_usec, path)));
#endif

	/*
	 * If not part of Flush, need to fsync now.  We assume this happens
	 * infrequently enough that it's not a performance issue.
	 */
	if (!fdata)
	{
		if (ctl->do_fsync && pg_fsync(fd))
		{
			slru_errcause = SLRU_FSYNC_FAILED;
			slru_errno = errno;
			close(fd);
			return false;
		}

		if (close(fd))
		{
			slru_errcause = SLRU_CLOSE_FAILED;
			slru_errno = errno;
			return false;
		}
	}

	return true;
}
/*
 * GetNewSequenceRelationOid
 *		Get a sequence relation Oid and verify it is valid against
 *		the pg_class relation by doing an index lookup. The caller
 *		should have a suitable lock on pg_class.
 *
 * Loops until a candidate passes three checks: no pg_class row with that
 * OID (dirty-snapshot index scan), no existing relation file of that name,
 * and not pre-assigned elsewhere (IsOidAcceptable).
 */
Oid
GetNewSequenceRelationOid(Relation relation)
{
	Oid			newOid;
	Oid			oidIndex;
	Relation	indexrel;
	SnapshotData SnapshotDirty;
	IndexScanDesc scan;
	ScanKeyData key;
	bool		collides;
	RelFileNode rnode;
	char	   *rpath;
	int			fd;

	/* This should match RelationInitPhysicalAddr */
	rnode.spcNode = relation->rd_rel->reltablespace ?
		relation->rd_rel->reltablespace : MyDatabaseTableSpace;
	rnode.dbNode = relation->rd_rel->relisshared ? InvalidOid : MyDatabaseId;

	/* We should only be using pg_class */
	Assert(RelationGetRelid(relation) == RelationRelationId);

	/* The relcache will cache the identity of the OID index for us */
	oidIndex = RelationGetOidIndex(relation);

	/* Otherwise, use the index to find a nonconflicting OID */
	indexrel = index_open(oidIndex, AccessShareLock);

	InitDirtySnapshot(SnapshotDirty);

	/* Generate new sequence relation OIDs until we find one not in the table */
	do
	{
		CHECK_FOR_INTERRUPTS();

		newOid = GetNewSequenceRelationObjectId();

		ScanKeyInit(&key,
					(AttrNumber) 1,
					BTEqualStrategyNumber, F_OIDEQ,
					ObjectIdGetDatum(newOid));

		/* see notes above about using SnapshotDirty */
		scan = index_beginscan(relation, indexrel,
							   &SnapshotDirty, 1, &key);

		collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection));

		index_endscan(scan);

		if (!collides)
		{
			/* Check for existing file of same name */
			rpath = relpath(rnode);
			fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

			if (fd >= 0)
			{
				/* definite collision */
				gp_retry_close(fd);
				collides = true;
			}
			else
			{
				/*
				 * Here we have a little bit of a dilemma: if errno is something
				 * other than ENOENT, should we declare a collision and loop? In
				 * particular one might think this advisable for, say, EPERM.
				 * However there really shouldn't be any unreadable files in a
				 * tablespace directory, and if the EPERM is actually complaining
				 * that we can't read the directory itself, we'd be in an infinite
				 * loop. In practice it seems best to go ahead regardless of the
				 * errno. If there is a colliding file we will get an smgr
				 * failure when we attempt to create the new relation file.
				 */
				collides = false;
			}

			/*
			 * Fix: free the path string; it was previously leaked on every
			 * loop iteration (the sibling GetNewRelFileNode implementations
			 * do pfree(rpath) here).
			 */
			pfree(rpath);
		}

		/*
		 * Also check that the OID hasn't been pre-assigned for a different
		 * relation.
		 *
		 * We're a bit sloppy between OIDs and relfilenodes here; it would be
		 * OK to use a value that's been reserved for use as a type or
		 * relation OID here, as long as the relfilenode is free.  But there's
		 * no harm in skipping over those too, so we don't bother to
		 * distinguish them.
		 */
		if (!collides && !IsOidAcceptable(newOid))
			collides = true;
	} while (collides);

	index_close(indexrel, AccessShareLock);

	return newOid;
}
/* * GetNewRelFileNode * Generate a new relfilenode number that is unique within the given * tablespace. * * If the relfilenode will also be used as the relation's OID, pass the * opened pg_class catalog, and this routine will guarantee that the result * is also an unused OID within pg_class. If the result is to be used only * as a relfilenode for an existing relation, pass NULL for pg_class. * * As with GetNewOid, there is some theoretical risk of a race condition, * but it doesn't seem worth worrying about. * * Note: we don't support using this in bootstrap mode. All relations * created by bootstrap have preassigned OIDs, so there's no need. */ Oid GetNewRelFileNode(Oid reltablespace, bool relisshared, Relation pg_class) { RelFileNode rnode; char *rpath; int fd; bool collides = true; /* This should match RelationInitPhysicalAddr */ rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace; rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId; do { CHECK_FOR_INTERRUPTS(); /* Generate the OID */ if (pg_class) rnode.relNode = GetNewOid(pg_class); else rnode.relNode = GetNewObjectId(); if (!UseOidForRelFileNode(rnode.relNode)) continue; /* Check for existing file of same name */ rpath = relpath(rnode); fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0); if (fd >= 0) { /* definite collision */ gp_retry_close(fd); collides = true; } else { /* * Here we have a little bit of a dilemma: if errno is something * other than ENOENT, should we declare a collision and loop? In * particular one might think this advisable for, say, EPERM. * However there really shouldn't be any unreadable files in a * tablespace directory, and if the EPERM is actually complaining * that we can't read the directory itself, we'd be in an infinite * loop. In practice it seems best to go ahead regardless of the * errno. If there is a colliding file we will get an smgr * failure when we attempt to create the new relation file. 
*/ collides = false; } pfree(rpath); } while (collides); if (Gp_role == GP_ROLE_EXECUTE) Insist(!PointerIsValid(pg_class)); elog(DEBUG1, "Calling GetNewRelFileNode in %s mode %s pg_class. New relOid = %d", (Gp_role == GP_ROLE_EXECUTE ? "execute" : Gp_role == GP_ROLE_UTILITY ? "utility" : "dispatch"), pg_class ? "with" : "without", rnode.relNode); return rnode.relNode; }
/*
 * ReceiveRegularFile creates a local file at the given file path, and connects
 * to remote database that has the given node name and port number. The function
 * then issues the given transmit command using client-side logic (libpq), reads
 * the remote file's contents, and appends these contents to the local file. On
 * success, the function returns success; on failure, it cleans up all resources
 * and returns false.
 *
 * On every failure path before completion, ReceiveResourceCleanup releases
 * the connection, closes the descriptor and removes the partial file.
 */
static bool
ReceiveRegularFile(const char *nodeName, uint32 nodePort,
				   StringInfo transmitCommand, StringInfo filePath)
{
	int32		fileDescriptor = -1;
	char		filename[MAXPGPATH];
	int			closed = -1;
	/* O_TRUNC ensures a fresh file even if the path already exists */
	const int	fileFlags = (O_APPEND | O_CREAT | O_RDWR | O_TRUNC | PG_BINARY);
	const int	fileMode = (S_IRUSR | S_IWUSR);

	QueryStatus queryStatus = CLIENT_INVALID_QUERY;
	int32		connectionId = INVALID_CONNECTION_ID;
	char	   *nodeDatabase = NULL;
	bool		querySent = false;
	bool		queryReady = false;
	bool		copyDone = false;

	/* create local file to append remote data to */
	snprintf(filename, MAXPGPATH, "%s", filePath->data);

	fileDescriptor = BasicOpenFile(filename, fileFlags, fileMode);
	if (fileDescriptor < 0)
	{
		ereport(WARNING, (errcode_for_file_access(),
						  errmsg("could not open file \"%s\": %m",
								 filePath->data)));

		return false;
	}

	/* we use the same database name on the master and worker nodes */
	nodeDatabase = get_database_name(MyDatabaseId);

	/* connect to remote node */
	connectionId = MultiClientConnect(nodeName, nodePort, nodeDatabase, NULL);
	if (connectionId == INVALID_CONNECTION_ID)
	{
		ReceiveResourceCleanup(connectionId, filename, fileDescriptor);

		return false;
	}

	/* send request to remote node to start transmitting data */
	querySent = MultiClientSendQuery(connectionId, transmitCommand->data);
	if (!querySent)
	{
		ReceiveResourceCleanup(connectionId, filename, fileDescriptor);

		return false;
	}

	/* loop until the remote node acknowledges our transmit request */
	while (!queryReady)
	{
		ResultStatus resultStatus = MultiClientResultStatus(connectionId);
		if (resultStatus == CLIENT_RESULT_READY)
		{
			queryReady = true;
		}
		else if (resultStatus == CLIENT_RESULT_BUSY)
		{
			/* remote node did not respond; wait for longer */
			long		sleepIntervalPerCycle = RemoteTaskCheckInterval * 1000L;
			pg_usleep(sleepIntervalPerCycle);
		}
		else
		{
			ReceiveResourceCleanup(connectionId, filename, fileDescriptor);

			return false;
		}
	}

	/* check query response is as expected */
	queryStatus = MultiClientQueryStatus(connectionId);
	if (queryStatus != CLIENT_QUERY_COPY)
	{
		ReceiveResourceCleanup(connectionId, filename, fileDescriptor);

		return false;
	}

	/* loop until we receive and append all the data from remote node */
	while (!copyDone)
	{
		CopyStatus copyStatus = MultiClientCopyData(connectionId, fileDescriptor);
		if (copyStatus == CLIENT_COPY_DONE)
		{
			copyDone = true;
		}
		else if (copyStatus == CLIENT_COPY_MORE)
		{
			/* remote node will continue to send more data */
		}
		else
		{
			ReceiveResourceCleanup(connectionId, filename, fileDescriptor);

			return false;
		}
	}

	/* we are done executing; release the connection and the file handle */
	MultiClientDisconnect(connectionId);

	closed = close(fileDescriptor);
	if (closed < 0)
	{
		ereport(WARNING, (errcode_for_file_access(),
						  errmsg("could not close file \"%s\": %m", filename)));

		/* if we failed to close file, try to delete it before erroring out */
		DeleteFile(filename);

		return false;
	}

	/* we successfully received the remote file */
	ereport(DEBUG2, (errmsg("received remote file \"%s\"", filename)));

	return true;
}
/*
 * Read 'count' bytes from WAL into 'buf', starting at location 'startptr'
 *
 * XXX probably this should be improved to suck data directly from the
 * WAL buffers when possible.
 *
 * Will open, and keep open, one WAL segment stored in the global file
 * descriptor sendFile. This means if XLogRead is used once, there will
 * always be one descriptor left open until the process ends, but never
 * more than one.
 *
 * After the read, verifies the data wasn't invalidated underneath us
 * (segment removal/recycling; archive restore under a cascading
 * walsender triggers a full re-read via the retry label).
 */
void
XLogRead(char *buf, XLogRecPtr startptr, Size count)
{
	char	   *p;
	XLogRecPtr	recptr;
	Size		nbytes;
	uint32		lastRemovedLog;
	uint32		lastRemovedSeg;
	uint32		log;
	uint32		seg;

retry:
	p = buf;
	recptr = startptr;
	nbytes = count;

	while (nbytes > 0)
	{
		uint32		startoff;
		int			segbytes;
		int			readbytes;

		startoff = recptr.xrecoff % XLogSegSize;

		/* Need a different segment than the one currently open? */
		if (sendFile < 0 || !XLByteInSeg(recptr, sendId, sendSeg))
		{
			char		path[MAXPGPATH];

			/* Switch to another logfile segment */
			if (sendFile >= 0)
				close(sendFile);

			XLByteToSeg(recptr, sendId, sendSeg);
			XLogFilePath(path, ThisTimeLineID, sendId, sendSeg);

			sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
			if (sendFile < 0)
			{
				/*
				 * If the file is not found, assume it's because the standby
				 * asked for a too old WAL segment that has already been
				 * removed or recycled.
				 */
				if (errno == ENOENT)
				{
					char		filename[MAXFNAMELEN];

					XLogFileName(filename, ThisTimeLineID, sendId, sendSeg);
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("requested WAL segment %s has already been removed",
									filename)));
				}
				else
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
									path, sendId, sendSeg)));
			}
			sendOff = 0;
		}

		/* Need to seek in the file? */
		if (sendOff != startoff)
		{
			if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
								sendId, sendSeg, startoff)));
			sendOff = startoff;
		}

		/* How many bytes are within this segment? */
		if (nbytes > (XLogSegSize - startoff))
			segbytes = XLogSegSize - startoff;
		else
			segbytes = nbytes;

		readbytes = read(sendFile, p, segbytes);
		if (readbytes <= 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read from log file %u, segment %u, offset %u, "
							"length %lu: %m",
							sendId, sendSeg, sendOff, (unsigned long) segbytes)));

		/* Update state for read */
		XLByteAdvance(recptr, readbytes);

		sendOff += readbytes;
		nbytes -= readbytes;
		p += readbytes;
	}

	/*
	 * After reading into the buffer, check that what we read was valid. We do
	 * this after reading, because even though the segment was present when we
	 * opened it, it might get recycled or removed while we read it. The
	 * read() succeeds in that case, but the data we tried to read might
	 * already have been overwritten with new WAL records.
	 */
	XLogGetLastRemoved(&lastRemovedLog, &lastRemovedSeg);
	XLByteToSeg(startptr, log, seg);
	if (log < lastRemovedLog ||
		(log == lastRemovedLog && seg <= lastRemovedSeg))
	{
		char		filename[MAXFNAMELEN];

		XLogFileName(filename, ThisTimeLineID, log, seg);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("requested WAL segment %s has already been removed",
						filename)));
	}

	/*
	 * During recovery, the currently-open WAL file might be replaced with
	 * the file of the same name retrieved from archive. So we always need
	 * to check what we read was valid after reading into the buffer. If it's
	 * invalid, we try to open and read the file again.
	 */
	if (am_cascading_walsender)
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile WalSnd *walsnd = MyWalSnd;
		bool		reload;

		SpinLockAcquire(&walsnd->mutex);
		reload = walsnd->needreload;
		walsnd->needreload = false;
		SpinLockRelease(&walsnd->mutex);

		if (reload && sendFile >= 0)
		{
			close(sendFile);
			sendFile = -1;

			goto retry;
		}
	}
}
/*
 * mdunlink() -- Unlink a relation.
 *
 * Note that we're passed a RelFileNode --- by the time this is called,
 * there won't be an SMgrRelation hashtable entry anymore.
 *
 * Actually, we don't unlink the first segment file of the relation, but
 * just truncate it to zero length, and record a request to unlink it after
 * the next checkpoint.  Additional segments can be unlinked immediately,
 * however.  Leaving the empty file in place prevents that relfilenode
 * number from being reused.  The scenario this protects us from is:
 * 1. We delete a relation (and commit, and actually remove its file).
 * 2. We create a new relation, which by chance gets the same relfilenode as
 *	  the just-deleted one (OIDs must've wrapped around for that to happen).
 * 3. We crash before another checkpoint occurs.
 * During replay, we would delete the file and then recreate it, which is fine
 * if the contents of the file were repopulated by subsequent WAL entries.
 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 * the contents of the file would be lost forever.  By leaving the empty file
 * until after the next checkpoint, we prevent reassignment of the relfilenode
 * number until it's safe, because relfilenode assignment skips over any
 * existing file.
 *
 * If isRedo is true, it's okay for the relation to be already gone.
 * Also, we should remove the file immediately instead of queuing a request
 * for later, since during redo there's no possibility of creating a
 * conflicting relation.
 *
 * Note: any failure should be reported as WARNING not ERROR, because
 * we are usually not in a transaction anymore when this is called.
 */
void
mdunlink(RelFileNode rnode, ForkNumber forkNum, bool isRedo)
{
	char	   *path;
	int			ret;

	/*
	 * We have to clean out any pending fsync requests for the doomed
	 * relation, else the next mdsync() will fail.
	 */
	ForgetRelationFsyncRequests(rnode, forkNum);

	path = relpath(rnode, forkNum);

	/*
	 * Delete or truncate the first segment.  During redo (or for non-main
	 * forks) we can unlink outright; otherwise we truncate and defer the
	 * unlink, per the rationale in the header comment.
	 */
	if (isRedo || forkNum != MAIN_FORKNUM)
	{
		ret = unlink(path);
		if (ret < 0)
		{
			/* ENOENT is expected during redo; anything else is worth noting */
			if (!isRedo || errno != ENOENT)
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", path)));
		}
	}
	else
	{
		/* truncate(2) would be easier here, but Windows hasn't got it */
		int			fd;

		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
		if (fd >= 0)
		{
			int			save_errno;

			ret = ftruncate(fd, 0);
			/* preserve ftruncate's errno across the close() call */
			save_errno = errno;
			close(fd);
			errno = save_errno;
		}
		else
			ret = -1;
		if (ret < 0 && errno != ENOENT)
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not truncate file \"%s\": %m", path)));
	}

	/*
	 * Delete any additional segments.  Only attempted if the first segment
	 * was successfully handled above.
	 */
	if (ret >= 0)
	{
		/* room for path plus ".", up to 10 digits of segno, and NUL */
		char	   *segpath = (char *) palloc(strlen(path) + 12);
		BlockNumber segno;

		/*
		 * Note that because we loop until getting ENOENT, we will correctly
		 * remove all inactive segments as well as active ones.
		 */
		for (segno = 1;; segno++)
		{
			sprintf(segpath, "%s.%u", path, segno);
			if (unlink(segpath) < 0)
			{
				/* ENOENT is expected after the last segment... */
				if (errno != ENOENT)
					ereport(WARNING,
							(errcode_for_file_access(),
							 errmsg("could not remove file \"%s\": %m", segpath)));
				break;
			}
		}
		pfree(segpath);
	}

	pfree(path);

	/* Register request to unlink first segment later */
	if (!isRedo && forkNum == MAIN_FORKNUM)
		register_unlink(rnode);
}
/*
 * Write out a new shared or local map file with the given contents.
 *
 * The magic number and CRC are automatically updated in *newmap.  On
 * success, we copy the data to the appropriate permanent static variable.
 *
 * If write_wal is TRUE then an appropriate WAL message is emitted.
 * (It will be false for bootstrap and WAL replay cases.)
 *
 * If send_sinval is TRUE then a SI invalidation message is sent.
 * (This should be true except in bootstrap case.)
 *
 * If preserve_files is TRUE then the storage manager is warned not to
 * delete the files listed in the map.
 *
 * Because this may be called during WAL replay when MyDatabaseId,
 * DatabasePath, etc aren't valid, we require the caller to pass in suitable
 * values.  The caller is also responsible for being sure no concurrent
 * map update could be happening.
 */
static void
write_relmap_file(bool shared, RelMapFile *newmap,
				  bool write_wal, bool send_sinval, bool preserve_files,
				  Oid dbid, Oid tsid, const char *dbpath)
{
	int			fd;
	RelMapFile *realmap;
	char		mapfilename[MAXPGPATH];

	/*
	 * Fill in the overhead fields and update CRC.  The CRC covers everything
	 * up to (but not including) the crc field itself.
	 */
	newmap->magic = RELMAPPER_FILEMAGIC;
	if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
		elog(ERROR, "attempt to write bogus relation mapping");

	INIT_CRC32(newmap->crc);
	COMP_CRC32(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
	FIN_CRC32(newmap->crc);

	/*
	 * Open the target file.  We prefer to do this before entering the
	 * critical section, so that an open() failure need not force PANIC.
	 *
	 * Note: since we use BasicOpenFile, we are nominally responsible for
	 * ensuring the fd is closed on error.  In practice, this isn't important
	 * because either an error happens inside the critical section, or we are
	 * in bootstrap or WAL replay; so an error past this point is always fatal
	 * anyway.
	 */
	if (shared)
	{
		snprintf(mapfilename, sizeof(mapfilename), "global/%s",
				 RELMAPPER_FILENAME);
		realmap = &shared_map;
	}
	else
	{
		snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
				 dbpath, RELMAPPER_FILENAME);
		realmap = &local_map;
	}

	fd = BasicOpenFile(mapfilename,
					   O_WRONLY | O_CREAT | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open relation mapping file \"%s\": %m",
						mapfilename)));

	if (write_wal)
	{
		xl_relmap_update xlrec;
		XLogRecData rdata[2];
		XLogRecPtr	lsn;

		/* now errors are fatal ... */
		START_CRIT_SECTION();

		xlrec.dbid = dbid;
		xlrec.tsid = tsid;
		xlrec.nbytes = sizeof(RelMapFile);

		/* rdata[0]: fixed-size header; rdata[1]: the map payload */
		rdata[0].data = (char *) (&xlrec);
		rdata[0].len = MinSizeOfRelmapUpdate;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = &(rdata[1]);
		rdata[1].data = (char *) newmap;
		rdata[1].len = sizeof(RelMapFile);
		rdata[1].buffer = InvalidBuffer;
		rdata[1].next = NULL;

		lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE, rdata);

		/* As always, WAL must hit the disk before the data update does */
		XLogFlush(lsn);
	}

	errno = 0;
	if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to relation mapping file \"%s\": %m",
						mapfilename)));
	}

	/*
	 * We choose to fsync the data to disk before considering the task done.
	 * It would be possible to relax this if it turns out to be a performance
	 * issue, but it would complicate checkpointing --- see notes for
	 * CheckPointRelationMap.
	 */
	if (pg_fsync(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync relation mapping file \"%s\": %m",
						mapfilename)));

	if (close(fd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close relation mapping file \"%s\": %m",
						mapfilename)));

	/*
	 * Now that the file is safely on disk, send sinval message to let other
	 * backends know to re-read it.  We must do this inside the critical
	 * section: if for some reason we fail to send the message, we have to
	 * force a database-wide PANIC.  Otherwise other backends might continue
	 * execution with stale mapping information, which would be catastrophic
	 * as soon as others began to use the now-committed data.
	 */
	if (send_sinval)
		CacheInvalidateRelmap(dbid);

	/*
	 * Make sure that the files listed in the map are not deleted if the outer
	 * transaction aborts.  This had better be within the critical section
	 * too: it's not likely to fail, but if it did, we'd arrive at transaction
	 * abort with the files still vulnerable.  PANICing will leave things in a
	 * good state on-disk.
	 *
	 * Note: we're cheating a little bit here by assuming that mapped files
	 * are either in pg_global or the database's default tablespace.
	 */
	if (preserve_files)
	{
		int32		i;

		for (i = 0; i < newmap->num_mappings; i++)
		{
			RelFileNode rnode;

			rnode.spcNode = tsid;
			rnode.dbNode = dbid;
			rnode.relNode = newmap->mappings[i].mapfilenode;
			RelationPreserveStorage(rnode);
		}
	}

	/* Success, update permanent copy */
	memcpy(realmap, newmap, sizeof(RelMapFile));

	/* Critical section done */
	if (write_wal)
		END_CRIT_SECTION();
}
/* * load_relmap_file -- load data from the shared or local map file * * Because the map file is essential for access to core system catalogs, * failure to read it is a fatal error. * * Note that the local case requires DatabasePath to be set up. */ static void load_relmap_file(bool shared) { RelMapFile *map; char mapfilename[MAXPGPATH]; pg_crc32 crc; int fd; if (shared) { snprintf(mapfilename, sizeof(mapfilename), "global/%s", RELMAPPER_FILENAME); map = &shared_map; } else { snprintf(mapfilename, sizeof(mapfilename), "%s/%s", DatabasePath, RELMAPPER_FILENAME); map = &local_map; } /* Read data ... */ fd = BasicOpenFile(mapfilename, O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not open relation mapping file \"%s\": %m", mapfilename))); /* * Note: we could take RelationMappingLock in shared mode here, but it * seems unnecessary since our read() should be atomic against any * concurrent updater's write(). If the file is updated shortly after we * look, the sinval signaling mechanism will make us re-read it before we * are able to access any relation that's affected by the change. */ if (read(fd, map, sizeof(RelMapFile)) != sizeof(RelMapFile)) ereport(FATAL, (errcode_for_file_access(), errmsg("could not read relation mapping file \"%s\": %m", mapfilename))); close(fd); /* check for correct magic number, etc */ if (map->magic != RELMAPPER_FILEMAGIC || map->num_mappings < 0 || map->num_mappings > MAX_MAPPINGS) ereport(FATAL, (errmsg("relation mapping file \"%s\" contains invalid data", mapfilename))); /* verify the CRC */ INIT_CRC32(crc); COMP_CRC32(crc, (char *) map, offsetof(RelMapFile, crc)); FIN_CRC32(crc); if (!EQ_CRC32(crc, map->crc)) ereport(FATAL, (errmsg("relation mapping file \"%s\" contains incorrect checksum", mapfilename))); }
/* * copy one file */ void copy_file(char *fromfile, char *tofile) { char *buffer; int srcfd; int dstfd; int nbytes; off_t offset; /* Use palloc to ensure we get a maxaligned buffer */ #define COPY_BUF_SIZE (8 * BLCKSZ) buffer = palloc(COPY_BUF_SIZE); /* * Open the files */ srcfd = BasicOpenFile(fromfile, O_RDONLY | PG_BINARY, 0); if (srcfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", fromfile))); dstfd = BasicOpenFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR); if (dstfd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tofile))); /* * Do the data copying. */ for (offset = 0;; offset += nbytes) { /* If we got a cancel signal during the copy of the file, quit */ CHECK_FOR_INTERRUPTS(); nbytes = read(srcfd, buffer, COPY_BUF_SIZE); if (nbytes < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", fromfile))); if (nbytes == 0) break; errno = 0; if ((int) write(dstfd, buffer, nbytes) != nbytes) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) errno = ENOSPC; ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tofile))); } /* * We fsync the files later but first flush them to avoid spamming the * cache and hopefully get the kernel to start writing them out before * the fsync comes. Ignore any error, since it's only a hint. */ (void) pg_flush_data(dstfd, offset, nbytes); } if (close(dstfd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tofile))); close(srcfd); pfree(buffer); }
/*
 * CheckPointTwoPhase -- handle 2PC component of checkpointing.
 *
 * We must fsync the state file of any GXACT that is valid and has a PREPARE
 * LSN <= the checkpoint's redo horizon.  (If the gxact isn't valid yet or
 * has a later LSN, this checkpoint is not responsible for fsyncing it.)
 *
 * This is deliberately run as late as possible in the checkpoint sequence,
 * because GXACTs ordinarily have short lifespans, and so it is quite
 * possible that GXACTs that were valid at checkpoint start will no longer
 * exist if we wait a little bit.
 *
 * If a GXACT remains valid across multiple checkpoints, it'll be fsynced
 * each time.  This is considered unusual enough that we don't bother to
 * expend any extra code to avoid the redundant fsyncs.  (They should be
 * reasonably cheap anyway, since they won't cause I/O.)
 */
void
CheckPointTwoPhase(XLogRecPtr redo_horizon)
{
	TransactionId *xids;
	int			nxids;
	char		path[MAXPGPATH];
	int			i;

	/*
	 * We don't want to hold the TwoPhaseStateLock while doing I/O, so we grab
	 * it just long enough to make a list of the XIDs that require fsyncing,
	 * and then do the I/O afterwards.
	 *
	 * This approach creates a race condition: someone else could delete a
	 * GXACT between the time we release TwoPhaseStateLock and the time we try
	 * to open its state file.  We handle this by special-casing ENOENT
	 * failures: if we see that, we verify that the GXACT is no longer valid,
	 * and if so ignore the failure.
	 */
	if (max_prepared_xacts <= 0)
		return;					/* nothing to do */

	TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START();

	/* Collect candidate XIDs under shared lock; I/O happens after release. */
	xids = (TransactionId *) palloc(max_prepared_xacts * sizeof(TransactionId));
	nxids = 0;

	LWLockAcquire(TwoPhaseStateLock, LW_SHARED);

	for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
	{
		GlobalTransaction gxact = TwoPhaseState->prepXacts[i];

		/* only valid gxacts at or before the redo horizon are our problem */
		if (gxact->valid &&
			XLByteLE(gxact->prepare_lsn, redo_horizon))
			xids[nxids++] = gxact->proc.xid;
	}

	LWLockRelease(TwoPhaseStateLock);

	for (i = 0; i < nxids; i++)
	{
		TransactionId xid = xids[i];
		int			fd;

		TwoPhaseFilePath(path, xid);

		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
		if (fd < 0)
		{
			if (errno == ENOENT)
			{
				/* OK if gxact is no longer valid */
				if (!TransactionIdIsPrepared(xid))
					continue;

				/* Restore errno in case it was changed */
				errno = ENOENT;
			}
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open two-phase state file \"%s\": %m",
							path)));
		}

		if (pg_fsync(fd) != 0)
		{
			/* don't leak the fd on the error path */
			close(fd);
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not fsync two-phase state file \"%s\": %m",
							path)));
		}

		if (close(fd) != 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not close two-phase state file \"%s\": %m",
							path)));
	}

	pfree(xids);

	TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE();
}
/*
 * Finish preparing state file.
 *
 * Calculates CRC and writes state file to WAL and in pg_twophase directory.
 *
 * The write sequence is deliberate: first the records plus a deliberately
 * bogus CRC (to discover out-of-disk-space early), then the WAL record,
 * then the correct CRC — so a state file is never valid on disk before its
 * PREPARE is durably in WAL.
 */
void
EndPrepare(GlobalTransaction gxact)
{
	TransactionId xid = gxact->proc.xid;
	TwoPhaseFileHeader *hdr;
	char		path[MAXPGPATH];
	XLogRecData *record;
	pg_crc32	statefile_crc;
	pg_crc32	bogus_crc;
	int			fd;

	/* Add the end sentinel to the list of 2PC records */
	RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0,
						   NULL, 0);

	/* Go back and fill in total_len in the file header record */
	hdr = (TwoPhaseFileHeader *) records.head->data;
	Assert(hdr->magic == TWOPHASE_MAGIC);
	hdr->total_len = records.total_len + sizeof(pg_crc32);

	/*
	 * If the file size exceeds MaxAllocSize, we won't be able to read it in
	 * ReadTwoPhaseFile. Check for that now, rather than fail at commit time.
	 */
	if (hdr->total_len > MaxAllocSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("two-phase state file maximum length exceeded")));

	/*
	 * Create the 2PC state file.
	 *
	 * Note: because we use BasicOpenFile(), we are responsible for ensuring
	 * the FD gets closed in any error exit path.  Once we get into the
	 * critical section, though, it doesn't matter since any failure causes
	 * PANIC anyway.
	 */
	TwoPhaseFilePath(path, xid);

	fd = BasicOpenFile(path,
					   O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create two-phase state file \"%s\": %m",
						path)));

	/* Write data to file, and calculate CRC as we pass over it */
	INIT_CRC32(statefile_crc);

	for (record = records.head; record != NULL; record = record->next)
	{
		COMP_CRC32(statefile_crc, record->data, record->len);
		if ((write(fd, record->data, record->len)) != record->len)
		{
			/* close before ereport so we don't leak the fd */
			close(fd);
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write two-phase state file: %m")));
		}
	}

	FIN_CRC32(statefile_crc);

	/*
	 * Write a deliberately bogus CRC to the state file; this is just paranoia
	 * to catch the case where four more bytes will run us out of disk space.
	 */
	bogus_crc = ~statefile_crc;

	if ((write(fd, &bogus_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	/* Back up to prepare for rewriting the CRC */
	if (lseek(fd, -((off_t) sizeof(pg_crc32)), SEEK_CUR) < 0)
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek in two-phase state file: %m")));
	}

	/*
	 * The state file isn't valid yet, because we haven't written the correct
	 * CRC yet. Before we do that, insert entry in WAL and flush it to disk.
	 *
	 * Between the time we have written the WAL entry and the time we write
	 * out the correct state file CRC, we have an inconsistency: the xact is
	 * prepared according to WAL but not according to our on-disk state. We
	 * use a critical section to force a PANIC if we are unable to complete
	 * the write --- then, WAL replay should repair the inconsistency. The
	 * odds of a PANIC actually occurring should be very tiny given that we
	 * were able to write the bogus CRC above.
	 *
	 * We have to set inCommit here, too; otherwise a checkpoint starting
	 * immediately after the WAL record is inserted could complete without
	 * fsync'ing our state file.  (This is essentially the same kind of race
	 * condition as the COMMIT-to-clog-write case that RecordTransactionCommit
	 * uses inCommit for; see notes there.)
	 *
	 * We save the PREPARE record's location in the gxact for later use by
	 * CheckPointTwoPhase.
	 */
	START_CRIT_SECTION();

	MyProc->inCommit = true;

	gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE,
									records.head);
	XLogFlush(gxact->prepare_lsn);

	/* If we crash now, we have prepared: WAL replay will fix things */

	/* write correct CRC and close file */
	if ((write(fd, &statefile_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	if (close(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close two-phase state file: %m")));

	/*
	 * Mark the prepared transaction as valid.	As soon as xact.c marks MyProc
	 * as not running our XID (which it will do immediately after this
	 * function returns), others can commit/rollback the xact.
	 *
	 * NB: a side effect of this is to make a dummy ProcArray entry for the
	 * prepared XID.  This must happen before we clear the XID from MyProc,
	 * else there is a window where the XID is not running according to
	 * TransactionIdIsInProgress, and onlookers would be entitled to assume
	 * the xact crashed.  Instead we have a window where the same XID appears
	 * twice in ProcArray, which is OK.
	 */
	MarkAsPrepared(gxact);

	/*
	 * Now we can mark ourselves as out of the commit critical section: a
	 * checkpoint starting after this will certainly see the gxact as a
	 * candidate for fsyncing.
	 */
	MyProc->inCommit = false;

	END_CRIT_SECTION();

	records.tail = records.head = NULL;
}
/*
 * Read 'nbytes' bytes from WAL into 'buf', starting at location 'recptr'
 *
 * XXX probably this should be improved to suck data directly from the
 * WAL buffers when possible.
 *
 * Will open, and keep open, one WAL segment stored in the global file
 * descriptor sendFile.  This means if XLogRead is used once, there will
 * always be one descriptor left open until the process ends, but never
 * more than one.
 *
 * Errors are reported via ereport(ERROR); a missing segment is reported
 * specially, assuming the requested WAL was already removed or recycled.
 */
void
XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
{
	XLogRecPtr	startRecPtr = recptr;	/* keep original start for validation */
	char		path[MAXPGPATH];
	uint32		lastRemovedLog;
	uint32		lastRemovedSeg;
	uint32		log;
	uint32		seg;

	while (nbytes > 0)
	{
		uint32		startoff;
		int			segbytes;
		int			readbytes;

		/* Byte offset of the read position within its WAL segment. */
		startoff = recptr.xrecoff % XLogSegSize;

		/* Open the right segment if the cached fd doesn't cover recptr. */
		if (sendFile < 0 || !XLByteInSeg(recptr, sendId, sendSeg))
		{
			/* Switch to another logfile segment */
			if (sendFile >= 0)
				close(sendFile);

			XLByteToSeg(recptr, sendId, sendSeg);
			XLogFilePath(path, ThisTimeLineID, sendId, sendSeg);

			sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
			if (sendFile < 0)
			{
				/*
				 * If the file is not found, assume it's because the standby
				 * asked for a too old WAL segment that has already been
				 * removed or recycled.
				 */
				if (errno == ENOENT)
				{
					char		filename[MAXFNAMELEN];

					XLogFileName(filename, ThisTimeLineID, sendId, sendSeg);
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("requested WAL segment %s has already been removed",
									filename)));
				}
				else
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
									path, sendId, sendSeg)));
			}
			sendOff = 0;
		}

		/* Need to seek in the file? */
		if (sendOff != startoff)
		{
			if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
								sendId, sendSeg, startoff)));
			sendOff = startoff;
		}

		/* How many bytes are within this segment?  Clamp to segment end. */
		if (nbytes > (XLogSegSize - startoff))
			segbytes = XLogSegSize - startoff;
		else
			segbytes = nbytes;

		readbytes = read(sendFile, buf, segbytes);
		if (readbytes <= 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read from log file %u, segment %u, offset %u, "
							"length %lu: %m",
							sendId, sendSeg, sendOff, (unsigned long) segbytes)));

		/* Update state for read */
		XLByteAdvance(recptr, readbytes);
		sendOff += readbytes;
		nbytes -= readbytes;
		buf += readbytes;
	}

	/*
	 * After reading into the buffer, check that what we read was valid. We do
	 * this after reading, because even though the segment was present when we
	 * opened it, it might get recycled or removed while we read it. The
	 * read() succeeds in that case, but the data we tried to read might
	 * already have been overwritten with new WAL records.
	 */
	XLogGetLastRemoved(&lastRemovedLog, &lastRemovedSeg);
	XLByteToSeg(startRecPtr, log, seg);
	if (log < lastRemovedLog ||
		(log == lastRemovedLog && seg <= lastRemovedSeg))
	{
		char		filename[MAXFNAMELEN];

		XLogFileName(filename, ThisTimeLineID, log, seg);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("requested WAL segment %s has already been removed",
						filename)));
	}
}
/*
 * Physical read of a (previously existing) page into a buffer slot
 *
 * On failure, we cannot just ereport(ERROR) since caller has put state in
 * shared memory that must be undone.  So, we return FALSE and save enough
 * info in static variables to let SlruReportIOError make the report.
 *
 * For now, assume it's not worth keeping a file pointer open across
 * read/write operations.  We could cache one virtual file pointer ...
 */
static bool
SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
{
	SlruShared	shared = ctl->shared;
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
	int			offset = rpageno * BLCKSZ;	/* byte offset within segment */
	char		path[MAXPGPATH];
	int			fd;
#ifdef XP_TRACE_LRU_READ
	/*
	 * Declared only when read tracing is compiled in; an unconditional
	 * declaration draws an unused-variable warning in normal builds.
	 */
	struct timeval tv;
#endif

	SlruFileName(ctl, path, segno);

	/*
	 * In a crash-and-restart situation, it's possible for us to receive
	 * commands to set the commit status of transactions whose bits are in
	 * already-truncated segments of the commit log (see notes in
	 * SlruPhysicalWritePage).	Hence, if we are InRecovery, allow the case
	 * where the file doesn't exist, and return zeroes instead.
	 */
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
	if (fd < 0)
	{
		if (errno != ENOENT || !InRecovery)
		{
			slru_errcause = SLRU_OPEN_FAILED;
			slru_errno = errno;
			return false;
		}

		/* Missing segment during recovery: treat the page as all-zeroes. */
		ereport(LOG,
				(errmsg("file \"%s\" doesn't exist, reading as zeroes",
						path)));
		MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
		return true;
	}

	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
	{
		slru_errcause = SLRU_SEEK_FAILED;
		slru_errno = errno;
		close(fd);
		return false;
	}

	errno = 0;
	if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
	{
		/* a short read is reported with whatever errno read() left (or 0) */
		slru_errcause = SLRU_READ_FAILED;
		slru_errno = errno;
		close(fd);
		return false;
	}

#ifdef XP_TRACE_LRU_READ
	gettimeofday(&tv, NULL);
#ifdef STACK_TRACE
	xp_stack_trace(TRACE_SIZE);
#endif
	ereport(TRACE_LEVEL,
			(errmsg("%ld.%ld:\tREAD:\tSlruPhysicalReadPage:\tfile:%s",
					tv.tv_sec, tv.tv_usec, path)));
#endif

	if (close(fd))
	{
		slru_errcause = SLRU_CLOSE_FAILED;
		slru_errno = errno;
		return false;
	}

	return true;
}
/*
 * Physical write of a page from a buffer slot
 *
 * On failure, we cannot just ereport(ERROR) since caller has put state in
 * shared memory that must be undone.  So, we return FALSE and save enough
 * info in static variables to let SlruReportIOError make the report.
 *
 * For now, assume it's not worth keeping a file pointer open across
 * independent read/write operations.  We do batch operations during
 * SimpleLruFlush, though.
 *
 * fdata is NULL for a standalone write, pointer to open-file info during
 * SimpleLruFlush.  When fdata is supplied, any fd we open is handed over
 * to it and must NOT be closed here; fsync is likewise deferred to the
 * flush machinery.
 */
static bool
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
	int			offset = rpageno * BLCKSZ;	/* byte offset within segment */
	char		path[MAXPGPATH];
	int			fd = -1;

	/*
	 * During a Flush, we may already have the desired file open.
	 */
	if (fdata)
	{
		int			i;

		for (i = 0; i < fdata->num_files; i++)
		{
			if (fdata->segno[i] == segno)
			{
				fd = fdata->fd[i];
				break;
			}
		}
	}

	if (fd < 0)
	{
		/*
		 * If the file doesn't already exist, we should create it.  It is
		 * possible for this to need to happen when writing a page that's not
		 * first in its segment; we assume the OS can cope with that. (Note:
		 * it might seem that it'd be okay to create files only when
		 * SimpleLruZeroPage is called for the first page of a segment.
		 * However, if after a crash and restart the REDO logic elects to
		 * replay the log from a checkpoint before the latest one, then it's
		 * possible that we will get commands to set transaction status of
		 * transactions that have already been truncated from the commit log.
		 * Easiest way to deal with that is to accept references to
		 * nonexistent files here and in SlruPhysicalReadPage.)
		 *
		 * Note: it is possible for more than one backend to be executing
		 * this code simultaneously for different pages of the same file.
		 * Hence, don't use O_EXCL or O_TRUNC or anything like that.
		 */
		SlruFileName(ctl, path, segno);
		fd = BasicOpenFile(path, O_RDWR | O_CREAT | PG_BINARY,
						   S_IRUSR | S_IWUSR);
		if (fd < 0)
		{
			slru_errcause = SLRU_OPEN_FAILED;
			slru_errno = errno;
			return false;
		}

		/* If flushing, register the fd so the flush code closes it later. */
		if (fdata)
		{
			fdata->fd[fdata->num_files] = fd;
			fdata->segno[fdata->num_files] = segno;
			fdata->num_files++;
		}
	}

	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
	{
		slru_errcause = SLRU_SEEK_FAILED;
		slru_errno = errno;
		/* only close fds we own; fdata-registered fds are closed elsewhere */
		if (!fdata)
			close(fd);
		return false;
	}

	errno = 0;
	if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		slru_errcause = SLRU_WRITE_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}

	/*
	 * If not part of Flush, need to fsync now.  We assume this happens
	 * infrequently enough that it's not a performance issue.
	 */
	if (!fdata)
	{
		if (ctl->do_fsync && pg_fsync(fd))
		{
			slru_errcause = SLRU_FSYNC_FAILED;
			slru_errno = errno;
			close(fd);
			return false;
		}

		if (close(fd))
		{
			slru_errcause = SLRU_CLOSE_FAILED;
			slru_errno = errno;
			return false;
		}
	}

	return true;
}
/*
 * GetNewRelFileNode
 *		Generate a new relfilenode number that is unique within the given
 *		tablespace.
 *
 * If the relfilenode will also be used as the relation's OID, pass the
 * opened pg_class catalog, and this routine will guarantee that the result
 * is also an unused OID within pg_class.  If the result is to be used only
 * as a relfilenode for an existing relation, pass NULL for pg_class.
 *
 * As with GetNewOid, there is some theoretical risk of a race condition,
 * but it doesn't seem worth worrying about.
 *
 * Note: we don't support using this in bootstrap mode.  All relations
 * created by bootstrap have preassigned OIDs, so there's no need.
 */
Oid
GetNewRelFileNode(Oid reltablespace, Relation pg_class, char relpersistence)
{
	RelFileNodeBackend candidate;
	BackendId	backend_id;
	bool		name_taken;

	/* Temp relations are private to the creating backend. */
	switch (relpersistence)
	{
		case RELPERSISTENCE_TEMP:
			backend_id = MyBackendId;
			break;
		case RELPERSISTENCE_UNLOGGED:
		case RELPERSISTENCE_PERMANENT:
			backend_id = InvalidBackendId;
			break;
		default:
			elog(ERROR, "invalid relpersistence: %c", relpersistence);
			return InvalidOid;	/* placate compiler */
	}

	/* This logic should match RelationInitPhysicalAddr */
	candidate.node.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
	candidate.node.dbNode =
		(candidate.node.spcNode == GLOBALTABLESPACE_OID) ?
		InvalidOid : MyDatabaseId;

	/*
	 * The relpath will vary based on the backend ID, so we must initialize
	 * that properly here to make sure that any collisions based on filename
	 * are properly detected.
	 */
	candidate.backend = backend_id;

	/* Generate OIDs until one maps to a filename nobody is using. */
	for (;;)
	{
		char	   *probe_path;
		int			probe_fd;

		CHECK_FOR_INTERRUPTS();

		/* Generate the OID */
		candidate.node.relNode = pg_class ? GetNewOid(pg_class)
			: GetNewObjectId();

		/* Check for existing file of same name */
		probe_path = relpath(candidate, MAIN_FORKNUM);
		probe_fd = BasicOpenFile(probe_path, O_RDONLY | PG_BINARY, 0);

		name_taken = (probe_fd >= 0);
		if (name_taken)
		{
			/* definite collision */
			close(probe_fd);
		}

		/*
		 * When the open failed, we have a little bit of a dilemma: if errno
		 * is something other than ENOENT, should we declare a collision and
		 * loop?  In particular one might think this advisable for, say,
		 * EPERM.  However there really shouldn't be any unreadable files in
		 * a tablespace directory, and if the EPERM is actually complaining
		 * that we can't read the directory itself, we'd be in an infinite
		 * loop.  In practice it seems best to go ahead regardless of the
		 * errno.  If there is a colliding file we will get an smgr failure
		 * when we attempt to create the new relation file.
		 */
		pfree(probe_path);

		if (!name_taken)
			break;
	}

	return candidate.node.relNode;
}
/**
 * @brief Initialize a DirectWriter
 *
 * Opens the target relation with AccessExclusiveLock, sets up the spooler
 * and per-tuple memory context, initializes the first data block, and
 * creates the on-disk load status file (failing if one already exists,
 * which indicates an unrecovered prior load).
 */
static void
DirectWriterInit(DirectWriter *self)
{
	LoadStatus *ls;

	/*
	 * Set defaults to unspecified parameters.
	 */
	if (self->base.max_dup_errors < -1)
		self->base.max_dup_errors = DEFAULT_MAX_DUP_ERRORS;

	/* Exclusive lock: direct loading bypasses normal concurrency control. */
	self->base.rel = heap_open(self->base.relid, AccessExclusiveLock);
	VerifyTarget(self->base.rel, self->base.max_dup_errors);

	self->base.desc = RelationGetDescr(self->base.rel);

	SpoolerOpen(&self->spooler, self->base.rel, false, self->base.on_duplicate,
				self->base.max_dup_errors, self->base.dup_badfile);
	self->base.context = GetPerTupleMemoryContext(self->spooler.estate);

	/* Verify DataDir/pg_bulkload directory */
	ValidateLSFDirectory(BULKLOAD_LSF_DIR);

	/* Initialize first block */
	PageInit(GetCurrentPage(self), BLCKSZ, 0);
	PageSetTLI(GetCurrentPage(self), ThisTimeLineID);

	/* Obtain transaction ID and command ID. */
	self->xid = GetCurrentTransactionId();
	self->cid = GetCurrentCommandId(true);

	/*
	 * Initialize load status information
	 */
	ls = &self->ls;
	ls->ls.relid = self->base.relid;
	ls->ls.rnode = self->base.rel->rd_node;
	ls->ls.exist_cnt = RelationGetNumberOfBlocks(self->base.rel);
	ls->ls.create_cnt = 0;

	/*
	 * Create a load status file and write the initial status for it.  At
	 * the time, if we find any existing load status files, exit with error
	 * because recovery process haven't been executed after failing load to
	 * the same table.  (O_EXCL makes the existence check atomic.)
	 */
	BULKLOAD_LSF_PATH(self->lsf_path, ls);
	self->lsf_fd = BasicOpenFile(self->lsf_path,
						 O_CREAT | O_EXCL | O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
	if (self->lsf_fd == -1)
		ereport(ERROR, (errcode_for_file_access(),
			errmsg("could not create loadstatus file \"%s\": %m", self->lsf_path)));

	/* Write and durably sync the initial status; clean up the LSF on failure. */
	if (write(self->lsf_fd, ls, sizeof(LoadStatus)) != sizeof(LoadStatus) ||
		pg_fsync(self->lsf_fd) != 0)
	{
		UnlinkLSF(self);
		ereport(ERROR, (errcode_for_file_access(),
			errmsg("could not write loadstatus file \"%s\": %m", self->lsf_path)));
	}

	self->base.tchecker = CreateTupleChecker(self->base.desc);
	self->base.tchecker->checker = (CheckerTupleProc) CoercionCheckerTuple;
}