Example #1
/*
 * Using the Hadoop connector requires proper setup of GUCs.
 * This procedure does a sanity check on gp_hadoop_connector_jardir,
 * gp_hadoop_target_version and gp_hadoop_home.
 * It also updates the GUC gp_hadoop_connector_version for the current gp_hadoop_target_version.
 *
 * It checks the following:
 * 1. $GPHOME/<gp_hadoop_jardir>/$GP_HADOOP_CONN_VERSION.jar must exist.
 * 2. If gp_hadoop_home is set, then gp_hadoop_home must exist.
 */
static void
checkHadoopGUCs()
{
	char gphome[MAXPGPATH];
	StringInfoData path;
	int  jarFD;

	/* Check the existence of $GPHOME/<gp_hadoop_jardir>/$GP_HADOOP_CONN_VERSION.jar
	 *
	 * To get $GPHOME, we go from my_exec_path, which is $GPHOME/bin/postgres, and
	 * go up 2 levels.
	 *
	 * Currently, gp_hadoop_connector_jardir is fixed. We look up $GP_HADOOP_CONN_VERSION
	 * using gp_hadoop_target_version.
	 */
	snprintf(gphome, sizeof(gphome), "%s", my_exec_path);
	get_parent_directory(gphome);
	get_parent_directory(gphome);

	initStringInfoOfSize(&path, MAXPGPATH);
	gp_hadoop_connector_version = (char*)getConnectorVersion();
	appendStringInfo(&path, "%s/%s/%s.jar",
			gphome, gp_hadoop_connector_jardir, gp_hadoop_connector_version);

	jarFD = BasicOpenFile(path.data, O_RDONLY | PG_BINARY, 0);
	if (jarFD == -1)
	{
		ereport(ERROR,
				(errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
				 errmsg("cannot open Hadoop Cross Connect in %s: %m", path.data)));
	}
	close(jarFD);


	/* Check the existence of gp_hadoop_home, if specified.
	 *
	 * If the user has already specified $HADOOP_HOME in the environment,
	 * there's no need to set up this GUC.
	 */
	if (strlen(gp_hadoop_home) > 0)
	{
		int hdHomeFD = BasicOpenFile(gp_hadoop_home, O_RDONLY, 0);
		if (hdHomeFD == -1)
		{
			ereport(ERROR,
					(errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
					 errmsg("cannot open gp_hadoop_home in %s: %m", gp_hadoop_home)));
		}
		close(hdHomeFD);
	}
}
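
The check above reduces to a simple pattern: open the path read-only, treat any failure as "not usable", and close the descriptor again. Below is a minimal, self-contained sketch of that probe-and-close pattern using plain POSIX open() instead of the backend's BasicOpenFile(); the jar path is a made-up placeholder.

#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Return true if 'path' can be opened for reading, false otherwise. */
static bool
path_is_readable(const char *path)
{
	int		fd = open(path, O_RDONLY);

	if (fd < 0)
	{
		fprintf(stderr, "cannot open %s: %s\n", path, strerror(errno));
		return false;
	}
	close(fd);
	return true;
}

int
main(void)
{
	/* placeholder path standing in for $GPHOME/<jardir>/<version>.jar */
	return path_is_readable("/tmp/example.jar") ? 0 : 1;
}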
Example #2
File: copydir.c Project: botp/postgres
/*
 * fsync a file
 *
 * Try to fsync directories but ignore errors that indicate the OS
 * just doesn't allow/require fsyncing directories.
 */
static void
fsync_fname(char *fname, bool isdir)
{
	int			fd;
	int			returncode;

	/*
	 * Some OSs require directories to be opened read-only whereas other
	 * systems don't allow us to fsync files opened read-only; so we need both
	 * cases here
	 */
	if (!isdir)
		fd = BasicOpenFile(fname,
						   O_RDWR | PG_BINARY,
						   S_IRUSR | S_IWUSR);
	else
		fd = BasicOpenFile(fname,
						   O_RDONLY | PG_BINARY,
						   S_IRUSR | S_IWUSR);

	/*
	 * Some OSs don't allow us to open directories at all (Windows returns
	 * EACCES)
	 */
	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
		return;

	else if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));

	returncode = pg_fsync(fd);

	/* Some OSs don't allow us to fsync directories at all */
	if (returncode != 0 && isdir && errno == EBADF)
	{
		close(fd);
		return;
	}

	if (returncode != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", fname)));

	close(fd);
}
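
The same tolerance can be shown outside the backend. The sketch below is a stand-alone POSIX version of the idea, assuming plain open()/fsync() rather than BasicOpenFile()/pg_fsync(), and ignoring only the errno values the example above ignores; the directory name is a placeholder.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* fsync a directory, ignoring errors that only mean "not supported here" */
static int
fsync_dir_forgiving(const char *dirname)
{
	int		fd = open(dirname, O_RDONLY);

	/* some platforms refuse to open a directory at all */
	if (fd < 0)
		return (errno == EISDIR || errno == EACCES) ? 0 : -1;

	/* some platforms refuse to fsync a directory: EBADF is ignorable */
	if (fsync(fd) != 0 && errno != EBADF)
	{
		int		save_errno = errno;

		close(fd);
		errno = save_errno;
		return -1;
	}
	close(fd);
	return 0;
}

int
main(void)
{
	if (fsync_dir_forgiving("/tmp") != 0)
		fprintf(stderr, "fsync /tmp: %s\n", strerror(errno));
	return 0;
}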
Example #3
/*
 * Can the given OID be used as pg_class.relfilenode?
 *
 * As a side-effect, advances OID counter to the given OID and remembers
 * that the OID has been used as a relfilenode, so that the same value
 * doesn't get chosen again.
 */
bool
CheckNewRelFileNodeIsOk(Oid newOid, Oid reltablespace, bool relisshared)
{
	RelFileNode rnode;
	char	   *rpath;
	int			fd;
	bool		collides;
	SnapshotData SnapshotDirty;

	/*
	 * Advance our current OID counter with the given value, to keep
	 * the counter roughly in sync across all nodes. This ensures
	 * that a GetNewRelFileNode() call after this will not choose the
	 * same OID, and won't have to loop excessively to retry. That
	 * still leaves a race condition, if GetNewRelFileNode() is called
	 * just before CheckNewRelFileNodeIsOk() - UseOidForRelFileNode()
	 * is called to plug that.
	 *
	 * FIXME: handle OID wraparound gracefully.
	 */
	while(GetNewObjectId() < newOid);

	if (!UseOidForRelFileNode(newOid))
		return false;

	InitDirtySnapshot(SnapshotDirty);

	/* This should match RelationInitPhysicalAddr */
	rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
	rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId;

	rnode.relNode = newOid;

	/* Check for existing file of same name */
	rpath = relpath(rnode);
	fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

	if (fd >= 0)
	{
		/* definite collision */
		gp_retry_close(fd);
		collides = true;
	}
	else
		collides = false;

	pfree(rpath);

	elog(DEBUG1, "Called CheckNewRelFileNodeIsOk in %s mode for %u / %u / %u. "
		 "collides = %s",
		 (Gp_role == GP_ROLE_EXECUTE ? "execute" :
		  Gp_role == GP_ROLE_UTILITY ? "utility" :
		  "dispatch"), newOid, reltablespace, relisshared,
		 collides ? "true" : "false");

	return !collides;
}
Example #4
/*
 * Recreates a state file. This is used in WAL replay.
 *
 * Note: content and len don't include CRC.
 */
void
RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
{
	char		path[MAXPGPATH];
	pg_crc32	statefile_crc;
	int			fd;

	/* Recompute CRC */
	INIT_CRC32(statefile_crc);
	COMP_CRC32(statefile_crc, content, len);
	FIN_CRC32(statefile_crc);

	TwoPhaseFilePath(path, xid);

	fd = BasicOpenFile(path,
					   O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not recreate two-phase state file \"%s\": %m",
						path)));

	/* Write content and CRC */
	if (write(fd, content, len) != len)
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}
	if (write(fd, &statefile_crc, sizeof(pg_crc32)) != sizeof(pg_crc32))
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	/*
	 * We must fsync the file because the end-of-replay checkpoint will not do
	 * so, there being no GXACT in shared memory yet to tell it to.
	 */
	if (pg_fsync(fd) != 0)
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync two-phase state file: %m")));
	}

	if (close(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close two-phase state file: %m")));
}
Example #5
File: fd.c Project: hellower/gpdb
/* returns 0 on success, -1 on re-open failure (with errno set) */
static int
LruInsert(File file)
{
	Vfd		   *vfdP;

	Assert(file != 0);

	DO_DB(elog(LOG, "LruInsert %d (%s)",
			   file, VfdCache[file].fileName));

	vfdP = &VfdCache[file];

	if (FileIsNotOpen(file))
	{
		while (nfile + numAllocatedDescs >= max_safe_fds)
		{
			if (!ReleaseLruFile())
				break;
		}

		/*
		 * The open could still fail for lack of file descriptors, eg due to
		 * overall system file table being full.  So, be prepared to release
		 * another FD if necessary...
		 */
		vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
								 vfdP->fileMode);
		if (vfdP->fd < 0)
		{
			DO_DB(elog(LOG, "RE_OPEN FAILED: %d", errno));
			return vfdP->fd;
		}
		else
		{
			DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
			++nfile;
		}

		/* seek to the right position */
		if (vfdP->seekPos != INT64CONST(0))
		{
			int64		returnValue;

			returnValue = pg_lseek64(vfdP->fd, vfdP->seekPos, SEEK_SET);
			Assert(returnValue != INT64CONST(-1));
		}
	}

	/*
	 * put it at the head of the Lru ring
	 */

	Insert(file);

	return 0;
}
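
LruInsert() relies on the surrounding fd.c machinery to free cached descriptors before reopening. The toy sketch below illustrates just that retry-on-descriptor-exhaustion idea with plain open(); the tiny fd array and release_one_cached_fd() are hypothetical stand-ins, not the backend's VFD cache.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int	cached_fds[8];
static int	ncached = 0;

/* Close one cached descriptor; return 0 when there is nothing left to evict. */
static int
release_one_cached_fd(void)
{
	if (ncached == 0)
		return 0;
	close(cached_fds[--ncached]);
	return 1;
}

/* open() that reacts to descriptor exhaustion by evicting a cached fd and retrying */
static int
open_with_eviction(const char *path, int flags, mode_t mode)
{
	for (;;)
	{
		int		fd = open(path, flags, mode);

		if (fd >= 0 || (errno != EMFILE && errno != ENFILE))
			return fd;
		if (!release_one_cached_fd())
			return -1;			/* nothing left to evict; give up */
	}
}

int
main(void)
{
	int		fd = open_with_eviction("/etc/hostname", O_RDONLY, 0);

	if (fd >= 0)
		close(fd);
	else
		perror("open_with_eviction");
	return 0;
}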
Example #6
/*
 * GetNewRelFileNode
 *		Generate a new relfilenode number that is unique within the given
 *		tablespace.
 *
 * Note: we don't support using this in bootstrap mode.  All relations
 * created by bootstrap have preassigned OIDs, so there's no need.
 */
Oid
GetNewRelFileNode(Oid reltablespace, bool relisshared)
{
	RelFileNode rnode;
	char	   *rpath;
	int			fd;
	bool		collides = true;

	/* This should match RelationInitPhysicalAddr */
	rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
	rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId;

	do
	{
		CHECK_FOR_INTERRUPTS();

		/* Generate the Relfilenode */
		rnode.relNode = GetNewSegRelfilenode();

		if (!IsOidAcceptable(rnode.relNode))
			continue;

		/* Check for existing file of same name */
		rpath = relpath(rnode);
		fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

		if (fd >= 0)
		{
			/* definite collision */
			gp_retry_close(fd);
			collides = true;
		}
		else
		{
			/*
			 * Here we have a little bit of a dilemma: if errno is something
			 * other than ENOENT, should we declare a collision and loop? In
			 * particular one might think this advisable for, say, EPERM.
			 * However there really shouldn't be any unreadable files in a
			 * tablespace directory, and if the EPERM is actually complaining
			 * that we can't read the directory itself, we'd be in an infinite
			 * loop.  In practice it seems best to go ahead regardless of the
			 * errno.  If there is a colliding file we will get an smgr
			 * failure when we attempt to create the new relation file.
			 */
			collides = false;
		}

		pfree(rpath);
	} while (collides);

	elog(DEBUG1, "Calling GetNewRelFileNode returns new relfilenode = %d", rnode.relNode);

	return rnode.relNode;
}
Example #7
/*
 * open a file in an arbitrary directory
 *
 * NB: if the passed pathname is relative (which it usually is),
 * it will be interpreted relative to the process' working directory
 * (which should always be $PGDATA when this code is running).
 */
File
PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
    char	   *fnamecopy;
    File		file;
    Vfd		   *vfdP;

    DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
               fileName, fileFlags, fileMode));

    /*
     * We need a malloc'd copy of the file name; fail cleanly if no room.
     */
    fnamecopy = strdup(fileName);
    if (fnamecopy == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of memory")));

    file = AllocateVfd();
    vfdP = &VfdCache[file];

    while (nfile + numAllocatedDescs >= max_safe_fds)
    {
        if (!ReleaseLruFile())
            break;
    }

    vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);

    if (vfdP->fd < 0)
    {
        FreeVfd(file);
        free(fnamecopy);
        return -1;
    }
    ++nfile;
    DO_DB(elog(LOG, "PathNameOpenFile: success %d",
               vfdP->fd));

    Insert(file);

    vfdP->fileName = fnamecopy;
    /* Saved flags are adjusted to be OK for re-opening file */
    vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
    vfdP->fileMode = fileMode;
    vfdP->seekPos = 0;
    vfdP->fdstate = 0x0;
    vfdP->resowner = NULL;

    return file;
}
Example #8
/**
 * @brief Open the next data file and returns its descriptor.
 * @param rnode  [in] RelFileNode of target relation.
 * @param blknum [in] Block number to seek.
 * @return File descriptor of the last data file.
 */
static int
open_data_file(RelFileNode rnode, bool istemp, BlockNumber blknum)
{
	int			fd = -1;
	int			ret;
	BlockNumber segno;
	char	   *fname = NULL;

#if PG_VERSION_NUM >= 90100
	RelFileNodeBackend	bknode;
	bknode.node = rnode;
	bknode.backend = istemp ? MyBackendId : InvalidBackendId;
	fname = relpath(bknode, MAIN_FORKNUM);
#else
	fname = relpath(rnode, MAIN_FORKNUM);
#endif
	segno = blknum / RELSEG_SIZE;
	if (segno > 0)
	{
		/*
		 * The length `+ 12' is taken from _mdfd_openseg() in backend/storage/smgr/md.c.
		 */
		char	   *tmp = palloc(strlen(fname) + 12);

		sprintf(tmp, "%s.%u", fname, segno);
		pfree(fname);
		fname = tmp;
	}
	fd = BasicOpenFile(fname, O_CREAT | O_WRONLY | PG_BINARY, S_IRUSR | S_IWUSR);
	if (fd == -1)
		ereport(ERROR, (errcode_for_file_access(),
						errmsg("could not open data file: %m")));
	ret = lseek(fd, BLCKSZ * (blknum % RELSEG_SIZE), SEEK_SET);
	if (ret == -1)
	{
		close(fd);
		ereport(ERROR, (errcode_for_file_access(),
						errmsg
						("could not seek the end of the data file: %m")));
	}

	pfree(fname);

	return fd;
}
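
The "%s.%u" suffix and the lseek() target both follow from the arithmetic md.c uses to split a relation into RELSEG_SIZE-block segment files. A small stand-alone sketch of that arithmetic, assuming the stock build values BLCKSZ = 8192 and RELSEG_SIZE = 131072 (real builds take them from pg_config.h):

#include <stdio.h>

#define BLCKSZ			8192	/* bytes per block (stock build value) */
#define RELSEG_SIZE		131072	/* blocks per segment file (stock build value) */

int
main(void)
{
	unsigned int blknum = 200000;				/* example block number */
	unsigned int segno = blknum / RELSEG_SIZE;	/* which segment file */
	long long	offset = (long long) BLCKSZ * (blknum % RELSEG_SIZE);

	/* segment 0 is just "relfilenode"; later segments are "relfilenode.N" */
	printf("block %u lives in segment %u at byte offset %lld\n",
		   blknum, segno, offset);
	return 0;
}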
Example #9
/*
 * TODO: This is duplicate code with pg_xlogdump, similar to walsender.c, but
 * we currently don't have the infrastructure (elog!) to share it.
 */
static void
XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
{
	char	   *p;
	XLogRecPtr	recptr;
	Size		nbytes;

	static int	sendFile = -1;
	static XLogSegNo sendSegNo = 0;
	static uint32 sendOff = 0;

	p = buf;
	recptr = startptr;
	nbytes = count;

	while (nbytes > 0)
	{
		uint32		startoff;
		int			segbytes;
		int			readbytes;

		startoff = recptr % XLogSegSize;

		if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo))
		{
			char		path[MAXPGPATH];

			/* Switch to another logfile segment */
			if (sendFile >= 0)
				close(sendFile);

			XLByteToSeg(recptr, sendSegNo);

			XLogFilePath(path, tli, sendSegNo);

			sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);

			if (sendFile < 0)
			{
				if (errno == ENOENT)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("requested WAL segment %s has already been removed",
									path)));
				else
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not open file \"%s\": %m",
									path)));
			}
			sendOff = 0;
		}

		/* Need to seek in the file? */
		if (sendOff != startoff)
		{
			if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
			{
				char		path[MAXPGPATH];

				XLogFilePath(path, tli, sendSegNo);

				ereport(ERROR,
						(errcode_for_file_access(),
				  errmsg("could not seek in log segment %s to offset %u: %m",
						 path, startoff)));
			}
			sendOff = startoff;
		}

		/* How many bytes are within this segment? */
		if (nbytes > (XLogSegSize - startoff))
			segbytes = XLogSegSize - startoff;
		else
			segbytes = nbytes;

		readbytes = read(sendFile, p, segbytes);
		if (readbytes <= 0)
		{
			char		path[MAXPGPATH];

			XLogFilePath(path, tli, sendSegNo);

			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read from log segment %s, offset %u, length %lu: %m",
							path, sendOff, (unsigned long) segbytes)));
		}

		/* Update state for read */
		recptr += readbytes;

		sendOff += readbytes;
		nbytes -= readbytes;
		p += readbytes;
	}
}
Example #10
/*
 * Read and validate the state file for xid.
 *
 * If it looks OK (has a valid magic number and CRC), return the palloc'd
 * contents of the file.  Otherwise return NULL.
 */
static char *
ReadTwoPhaseFile(TransactionId xid)
{
	char		path[MAXPGPATH];
	char	   *buf;
	TwoPhaseFileHeader *hdr;
	int			fd;
	struct stat stat;
	uint32		crc_offset;
	pg_crc32	calc_crc,
				file_crc;

	TwoPhaseFilePath(path, xid);

	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
	if (fd < 0)
	{
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not open two-phase state file \"%s\": %m",
						path)));
		return NULL;
	}

	/*
	 * Check file length.  We can determine a lower bound pretty easily. We
	 * set an upper bound to avoid palloc() failure on a corrupt file, though
	 * we can't guarantee that we won't get an out of memory error anyway,
	 * even on a valid file.
	 */
	if (fstat(fd, &stat))
	{
		close(fd);
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not stat two-phase state file \"%s\": %m",
						path)));
		return NULL;
	}

	if (stat.st_size < (MAXALIGN(sizeof(TwoPhaseFileHeader)) +
						MAXALIGN(sizeof(TwoPhaseRecordOnDisk)) +
						sizeof(pg_crc32)) ||
		stat.st_size > MaxAllocSize)
	{
		close(fd);
		return NULL;
	}

	crc_offset = stat.st_size - sizeof(pg_crc32);
	if (crc_offset != MAXALIGN(crc_offset))
	{
		close(fd);
		return NULL;
	}

	/*
	 * OK, slurp in the file.
	 */
	buf = (char *) palloc(stat.st_size);

	if (read(fd, buf, stat.st_size) != stat.st_size)
	{
		close(fd);
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("could not read two-phase state file \"%s\": %m",
						path)));
		pfree(buf);
		return NULL;
	}

	close(fd);

	hdr = (TwoPhaseFileHeader *) buf;
	if (hdr->magic != TWOPHASE_MAGIC || hdr->total_len != stat.st_size)
	{
		pfree(buf);
		return NULL;
	}

	INIT_CRC32(calc_crc);
	COMP_CRC32(calc_crc, buf, crc_offset);
	FIN_CRC32(calc_crc);

	file_crc = *((pg_crc32 *) (buf + crc_offset));

	if (!EQ_CRC32(calc_crc, file_crc))
	{
		pfree(buf);
		return NULL;
	}

	return buf;
}
Example #11
bool
CheckNewRelFileNodeIsOk(Oid newOid, Oid reltablespace, bool relisshared, 
						Relation pg_class)
{
	RelFileNode rnode;
	char	   *rpath;
	int			fd;
	bool		collides;
	
	
	if (pg_class)
	{
		Oid			oidIndex;
		Relation	indexrel;
		IndexScanDesc scan;
		ScanKeyData key;
	
		Assert(!IsBootstrapProcessingMode());
		Assert(pg_class->rd_rel->relhasoids);
	
		/* The relcache will cache the identity of the OID index for us */
		oidIndex = RelationGetOidIndex(pg_class);
	
		Assert(OidIsValid(oidIndex));
		
		indexrel = index_open(oidIndex, AccessShareLock);
		
		ScanKeyInit(&key,
					(AttrNumber) 1,
					BTEqualStrategyNumber, F_OIDEQ,
					ObjectIdGetDatum(newOid));

		scan = index_beginscan(pg_class, indexrel, SnapshotDirty, 1, &key);

		collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection));

		index_endscan(scan);
		
		index_close(indexrel, AccessShareLock);
		
		if (collides)
			elog(ERROR, "relfilenode %d already in use in \"pg_class\"",
				 newOid);	
		
	}

	/* This should match RelationInitPhysicalAddr */
	rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
	rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId;
	
	rnode.relNode = newOid;
	
	/* Check for existing file of same name */
	rpath = relpath(rnode);
	fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

	if (fd >= 0)
	{
		/* definite collision */
		gp_retry_close(fd);
		collides = true;
	}
	else
		collides = false;

	pfree(rpath);
	
	if (collides && !relisshared)
		elog(ERROR, "oid %d already in use", newOid);	

	while(GetNewObjectId() < newOid);

	return !collides;
}
Example #12
/*
 * copy one file
 */
static void
copy_file(char *fromfile, char *tofile)
{
	char	   *buffer;
	int			srcfd;
	int			dstfd;
	int			nbytes;

	/* Use palloc to ensure we get a maxaligned buffer */
#define COPY_BUF_SIZE (8 * BLCKSZ)

	buffer = palloc(COPY_BUF_SIZE);

	/*
	 * Open the files
	 */
	srcfd = BasicOpenFile(fromfile, O_RDONLY | PG_BINARY, 0);
	if (srcfd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fromfile)));

	dstfd = BasicOpenFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
						  S_IRUSR | S_IWUSR);
	if (dstfd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tofile)));

	/*
	 * Do the data copying.
	 */
	for (;;)
	{
        /* If we got a cancel signal during the copy of the file, quit */
        CHECK_FOR_INTERRUPTS();

		nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m", fromfile)));
		if (nbytes == 0)
			break;
		errno = 0;
		if ((int) write(dstfd, buffer, nbytes) != nbytes)
		{
			/* if write didn't set errno, assume problem is no disk space */
			if (errno == 0)
				errno = ENOSPC;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tofile)));
		}
	}

	/*
	 * Be paranoid here to ensure we catch problems.
	 */
	if (pg_fsync(dstfd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tofile)));

	if (close(dstfd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tofile)));

	close(srcfd);

	pfree(buffer);
}
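
Setting errno to 0 before write() and mapping a silent short write to ENOSPC is a convention that several of these examples share; it makes the %m in the error message say something sensible when the disk fills up. A stand-alone sketch of just that convention, using plain write():

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write 'len' bytes, mapping a silent short write to ENOSPC as the backend does. */
static int
write_all_or_enospc(int fd, const void *buf, size_t len)
{
	errno = 0;
	if (write(fd, buf, len) != (ssize_t) len)
	{
		/* if write didn't set errno, assume the problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		return -1;
	}
	return 0;
}

int
main(void)
{
	const char msg[] = "hello\n";

	if (write_all_or_enospc(STDOUT_FILENO, msg, sizeof(msg) - 1) != 0)
		fprintf(stderr, "write failed: %s\n", strerror(errno));
	return 0;
}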
Example #13
File: slru.c Project: panyx0718/shadow
/*
 * Physical write of a page from a buffer slot
 *
 * On failure, we cannot just ereport(ERROR) since caller has put state in
 * shared memory that must be undone.  So, we return FALSE and save enough
 * info in static variables to let SlruReportIOError make the report.
 *
 * For now, assume it's not worth keeping a file pointer open across
 * independent read/write operations.  We do batch operations during
 * SimpleLruFlush, though.
 *
 * fdata is NULL for a standalone write, pointer to open-file info during
 * SimpleLruFlush.
 */
static bool
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
	int			offset = rpageno * BLCKSZ;
	char		path[MAXPGPATH];
	int			fd = -1;
	struct timeval tv;

	/*
	 * Honor the write-WAL-before-data rule, if appropriate, so that we do not
	 * write out data before associated WAL records.  This is the same action
	 * performed during FlushBuffer() in the main buffer manager.
	 */
	if (shared->group_lsn != NULL)
	{
		/*
		 * We must determine the largest async-commit LSN for the page. This
		 * is a bit tedious, but since this entire function is a slow path
		 * anyway, it seems better to do this here than to maintain a per-page
		 * LSN variable (which'd need an extra comparison in the
		 * transaction-commit path).
		 */
		XLogRecPtr	max_lsn;
		int			lsnindex,
					lsnoff;

		lsnindex = slotno * shared->lsn_groups_per_page;
		max_lsn = shared->group_lsn[lsnindex++];
		for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
		{
			XLogRecPtr	this_lsn = shared->group_lsn[lsnindex++];

			if (XLByteLT(max_lsn, this_lsn))
				max_lsn = this_lsn;
		}

		if (!XLogRecPtrIsInvalid(max_lsn))
		{
			/*
			 * As noted above, elog(ERROR) is not acceptable here, so if
			 * XLogFlush were to fail, we must PANIC.  This isn't much of a
			 * restriction because XLogFlush is just about all critical
			 * section anyway, but let's make sure.
			 */
			START_CRIT_SECTION();
			XLogFlush(max_lsn);
			END_CRIT_SECTION();
		}
	}

	/*
	 * During a Flush, we may already have the desired file open.
	 */
	if (fdata)
	{
		int			i;

		for (i = 0; i < fdata->num_files; i++)
		{
			if (fdata->segno[i] == segno)
			{
				fd = fdata->fd[i];
				break;
			}
		}
	}

	if (fd < 0)
	{
		/*
		 * If the file doesn't already exist, we should create it.  It is
		 * possible for this to need to happen when writing a page that's not
		 * first in its segment; we assume the OS can cope with that. (Note:
		 * it might seem that it'd be okay to create files only when
		 * SimpleLruZeroPage is called for the first page of a segment.
		 * However, if after a crash and restart the REDO logic elects to
		 * replay the log from a checkpoint before the latest one, then it's
		 * possible that we will get commands to set transaction status of
		 * transactions that have already been truncated from the commit log.
		 * Easiest way to deal with that is to accept references to
		 * nonexistent files here and in SlruPhysicalReadPage.)
		 *
		 * Note: it is possible for more than one backend to be executing this
		 * code simultaneously for different pages of the same file. Hence,
		 * don't use O_EXCL or O_TRUNC or anything like that.
		 */
		SlruFileName(ctl, path, segno);
		fd = BasicOpenFile(path, O_RDWR | O_CREAT | PG_BINARY,
						   S_IRUSR | S_IWUSR);
		if (fd < 0)
		{
			slru_errcause = SLRU_OPEN_FAILED;
			slru_errno = errno;
			return false;
		}

		if (fdata)
		{
			if (fdata->num_files < MAX_FLUSH_BUFFERS)
			{
				fdata->fd[fdata->num_files] = fd;
				fdata->segno[fdata->num_files] = segno;
				fdata->num_files++;
			}
			else
			{
				/*
				 * In the unlikely event that we exceed MAX_FLUSH_BUFFERS,
				 * fall back to treating it as a standalone write.
				 */
				fdata = NULL;
			}
		}
	}

	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
	{
		slru_errcause = SLRU_SEEK_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}

	errno = 0;
	if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		slru_errcause = SLRU_WRITE_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}
#ifdef XP_TRACE_LRU_WRITE
	gettimeofday(&tv, NULL);
	ereport(TRACE_LEVEL,
		(errmsg("%ld.%ld:\tWRITE:\tSlruPhysicalWritePage:\tfile:%s",
				tv.tv_sec, tv.tv_usec, path)));
#endif

	/*
	 * If not part of Flush, need to fsync now.  We assume this happens
	 * infrequently enough that it's not a performance issue.
	 */
	if (!fdata)
	{
		if (ctl->do_fsync && pg_fsync(fd))
		{
			slru_errcause = SLRU_FSYNC_FAILED;
			slru_errno = errno;
			close(fd);
			return false;
		}

		if (close(fd))
		{
			slru_errcause = SLRU_CLOSE_FAILED;
			slru_errno = errno;
			return false;
		}
	}

	return true;
}
Example #14
/*
 * GetNewSequenceRelationOid
 *		Get a sequence relation Oid and verify it is valid against
 *		the pg_class relation by doing an index lookup. The caller
 *		should have a suitable lock on pg_class.
 */
Oid
GetNewSequenceRelationOid(Relation relation)
{
	Oid			newOid;
	Oid			oidIndex;
	Relation	indexrel;
	SnapshotData SnapshotDirty;
	IndexScanDesc scan;
	ScanKeyData key;
	bool		collides;
	RelFileNode rnode;
	char	   *rpath;
	int			fd;

	/* This should match RelationInitPhysicalAddr */
	rnode.spcNode = relation->rd_rel->reltablespace ? relation->rd_rel->reltablespace : MyDatabaseTableSpace;
	rnode.dbNode = relation->rd_rel->relisshared ? InvalidOid : MyDatabaseId;

	/* We should only be using pg_class */
	Assert(RelationGetRelid(relation) == RelationRelationId);

	/* The relcache will cache the identity of the OID index for us */
	oidIndex = RelationGetOidIndex(relation);

	/* Otherwise, use the index to find a nonconflicting OID */
	indexrel = index_open(oidIndex, AccessShareLock);

	InitDirtySnapshot(SnapshotDirty);

	/* Generate new sequence relation OIDs until we find one not in the table */
	do
	{
		CHECK_FOR_INTERRUPTS();

		newOid = GetNewSequenceRelationObjectId();

		ScanKeyInit(&key,
					(AttrNumber) 1,
					BTEqualStrategyNumber, F_OIDEQ,
					ObjectIdGetDatum(newOid));

		/* see notes above about using SnapshotDirty */
		scan = index_beginscan(relation, indexrel,
							   &SnapshotDirty, 1, &key);

		collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection));

		index_endscan(scan);

		if (!collides)
		{
			/* Check for existing file of same name */
			rpath = relpath(rnode);
			fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

			if (fd >= 0)
			{
				/* definite collision */
				gp_retry_close(fd);
				collides = true;
			}
			else
			{
				/*
				 * Here we have a little bit of a dilemma: if errno is something
				 * other than ENOENT, should we declare a collision and loop? In
				 * particular one might think this advisable for, say, EPERM.
				 * However there really shouldn't be any unreadable files in a
				 * tablespace directory, and if the EPERM is actually complaining
				 * that we can't read the directory itself, we'd be in an infinite
				 * loop.  In practice it seems best to go ahead regardless of the
				 * errno.  If there is a colliding file we will get an smgr
				 * failure when we attempt to create the new relation file.
				 */
				collides = false;
			}
		}

		/*
		 * Also check that the OID hasn't been pre-assigned for a different
		 * relation.
		 *
		 * We're a bit sloppy between OIDs and relfilenodes here; it would be
		 * OK to use a value that's been reserved for use as a type or
		 * relation OID here, as long as the relfilenode is free. But there's
		 * no harm in skipping over those too, so we don't bother to
		 * distinguish them.
		 */
		if (!collides && !IsOidAcceptable(newOid))
			collides = true;

	} while (collides);

	index_close(indexrel, AccessShareLock);

	return newOid;
}
Example #15
/*
 * GetNewRelFileNode
 *		Generate a new relfilenode number that is unique within the given
 *		tablespace.
 *
 * If the relfilenode will also be used as the relation's OID, pass the
 * opened pg_class catalog, and this routine will guarantee that the result
 * is also an unused OID within pg_class.  If the result is to be used only
 * as a relfilenode for an existing relation, pass NULL for pg_class.
 *
 * As with GetNewOid, there is some theoretical risk of a race condition,
 * but it doesn't seem worth worrying about.
 *
 * Note: we don't support using this in bootstrap mode.  All relations
 * created by bootstrap have preassigned OIDs, so there's no need.
 */
Oid
GetNewRelFileNode(Oid reltablespace, bool relisshared, Relation pg_class)
{
	RelFileNode rnode;
	char	   *rpath;
	int			fd;
	bool		collides = true;

	/* This should match RelationInitPhysicalAddr */
	rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
	rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId;

	do
	{
		CHECK_FOR_INTERRUPTS();

		/* Generate the OID */
		if (pg_class)
			rnode.relNode = GetNewOid(pg_class);
		else
			rnode.relNode = GetNewObjectId();

		if (!UseOidForRelFileNode(rnode.relNode))
			continue;

		/* Check for existing file of same name */
		rpath = relpath(rnode);
		fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

		if (fd >= 0)
		{
			/* definite collision */
			gp_retry_close(fd);
			collides = true;
		}
		else
		{
			/*
			 * Here we have a little bit of a dilemma: if errno is something
			 * other than ENOENT, should we declare a collision and loop? In
			 * particular one might think this advisable for, say, EPERM.
			 * However there really shouldn't be any unreadable files in a
			 * tablespace directory, and if the EPERM is actually complaining
			 * that we can't read the directory itself, we'd be in an infinite
			 * loop.  In practice it seems best to go ahead regardless of the
			 * errno.  If there is a colliding file we will get an smgr
			 * failure when we attempt to create the new relation file.
			 */
			collides = false;
		}

		pfree(rpath);
	} while (collides);
	
	if (Gp_role == GP_ROLE_EXECUTE)
		Insist(!PointerIsValid(pg_class));

	elog(DEBUG1, "Calling GetNewRelFileNode in %s mode %s pg_class. New relOid = %d",
		 (Gp_role == GP_ROLE_EXECUTE ? "execute" :
		  Gp_role == GP_ROLE_UTILITY ? "utility" :
		  "dispatch"), pg_class ? "with" : "without",
		 rnode.relNode);

	return rnode.relNode;
}
Example #16
/*
 * ReceiveRegularFile creates a local file at the given file path, and connects
 * to the remote database that has the given node name and port number. The
 * function then issues the given transmit command using client-side logic
 * (libpq), reads the remote file's contents, and appends these contents to the
 * local file. On success, the function returns true; on failure, it cleans up
 * all resources and returns false.
 */
static bool
ReceiveRegularFile(const char *nodeName, uint32 nodePort,
				   StringInfo transmitCommand, StringInfo filePath)
{
	int32 fileDescriptor = -1;
	char filename[MAXPGPATH];
	int closed = -1;
	const int fileFlags = (O_APPEND | O_CREAT | O_RDWR | O_TRUNC | PG_BINARY);
	const int fileMode = (S_IRUSR | S_IWUSR);

	QueryStatus queryStatus = CLIENT_INVALID_QUERY;
	int32 connectionId = INVALID_CONNECTION_ID;
	char *nodeDatabase = NULL;
	bool querySent = false;
	bool queryReady = false;
	bool copyDone = false;

	/* create local file to append remote data to */
	snprintf(filename, MAXPGPATH, "%s", filePath->data);

	fileDescriptor = BasicOpenFile(filename, fileFlags, fileMode);
	if (fileDescriptor < 0)
	{
		ereport(WARNING, (errcode_for_file_access(),
						  errmsg("could not open file \"%s\": %m", filePath->data)));

		return false;
	}

	/* we use the same database name on the master and worker nodes */
	nodeDatabase = get_database_name(MyDatabaseId);

	/* connect to remote node */
	connectionId = MultiClientConnect(nodeName, nodePort, nodeDatabase, NULL);
	if (connectionId == INVALID_CONNECTION_ID)
	{
		ReceiveResourceCleanup(connectionId, filename, fileDescriptor);

		return false;
	}

	/* send request to remote node to start transmitting data */
	querySent = MultiClientSendQuery(connectionId, transmitCommand->data);
	if (!querySent)
	{
		ReceiveResourceCleanup(connectionId, filename, fileDescriptor);

		return false;
	}

	/* loop until the remote node acknowledges our transmit request */
	while (!queryReady)
	{
		ResultStatus resultStatus = MultiClientResultStatus(connectionId);
		if (resultStatus == CLIENT_RESULT_READY)
		{
			queryReady = true;
		}
		else if (resultStatus == CLIENT_RESULT_BUSY)
		{
			/* remote node did not respond; wait for longer */
			long sleepIntervalPerCycle = RemoteTaskCheckInterval * 1000L;
			pg_usleep(sleepIntervalPerCycle);
		}
		else
		{
			ReceiveResourceCleanup(connectionId, filename, fileDescriptor);

			return false;
		}
	}

	/* check query response is as expected */
	queryStatus = MultiClientQueryStatus(connectionId);
	if (queryStatus != CLIENT_QUERY_COPY)
	{
		ReceiveResourceCleanup(connectionId, filename, fileDescriptor);

		return false;
	}

	/* loop until we receive and append all the data from remote node */
	while (!copyDone)
	{
		CopyStatus copyStatus = MultiClientCopyData(connectionId, fileDescriptor);
		if (copyStatus == CLIENT_COPY_DONE)
		{
			copyDone = true;
		}
		else if (copyStatus == CLIENT_COPY_MORE)
		{
			/* remote node will continue to send more data */
		}
		else
		{
			ReceiveResourceCleanup(connectionId, filename, fileDescriptor);

			return false;
		}
	}

	/* we are done executing; release the connection and the file handle */
	MultiClientDisconnect(connectionId);

	closed = close(fileDescriptor);
	if (closed < 0)
	{
		ereport(WARNING, (errcode_for_file_access(),
						  errmsg("could not close file \"%s\": %m", filename)));

		/* if we failed to close file, try to delete it before erroring out */
		DeleteFile(filename);

		return false;
	}

	/* we successfully received the remote file */
	ereport(DEBUG2, (errmsg("received remote file \"%s\"", filename)));

	return true;
}
Example #17
/*
 * Read 'count' bytes from WAL into 'buf', starting at location 'startptr'
 *
 * XXX probably this should be improved to suck data directly from the
 * WAL buffers when possible.
 *
 * Will open, and keep open, one WAL segment stored in the global file
 * descriptor sendFile. This means if XLogRead is used once, there will
 * always be one descriptor left open until the process ends, but never
 * more than one.
 */
void
XLogRead(char *buf, XLogRecPtr startptr, Size count)
{
	char		   *p;
	XLogRecPtr	recptr;
	Size			nbytes;
	uint32		lastRemovedLog;
	uint32		lastRemovedSeg;
	uint32		log;
	uint32		seg;

retry:
	p = buf;
	recptr = startptr;
	nbytes = count;

	while (nbytes > 0)
	{
		uint32		startoff;
		int			segbytes;
		int			readbytes;

		startoff = recptr.xrecoff % XLogSegSize;

		if (sendFile < 0 || !XLByteInSeg(recptr, sendId, sendSeg))
		{
			char		path[MAXPGPATH];

			/* Switch to another logfile segment */
			if (sendFile >= 0)
				close(sendFile);

			XLByteToSeg(recptr, sendId, sendSeg);
			XLogFilePath(path, ThisTimeLineID, sendId, sendSeg);

			sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
			if (sendFile < 0)
			{
				/*
				 * If the file is not found, assume it's because the standby
				 * asked for a too old WAL segment that has already been
				 * removed or recycled.
				 */
				if (errno == ENOENT)
				{
					char		filename[MAXFNAMELEN];

					XLogFileName(filename, ThisTimeLineID, sendId, sendSeg);
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("requested WAL segment %s has already been removed",
									filename)));
				}
				else
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
									path, sendId, sendSeg)));
			}
			sendOff = 0;
		}

		/* Need to seek in the file? */
		if (sendOff != startoff)
		{
			if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not seek in log file %u, segment %u to offset %u: %m",
								sendId, sendSeg, startoff)));
			sendOff = startoff;
		}

		/* How many bytes are within this segment? */
		if (nbytes > (XLogSegSize - startoff))
			segbytes = XLogSegSize - startoff;
		else
			segbytes = nbytes;

		readbytes = read(sendFile, p, segbytes);
		if (readbytes <= 0)
			ereport(ERROR,
					(errcode_for_file_access(),
			errmsg("could not read from log file %u, segment %u, offset %u, "
				   "length %lu: %m",
				   sendId, sendSeg, sendOff, (unsigned long) segbytes)));

		/* Update state for read */
		XLByteAdvance(recptr, readbytes);

		sendOff += readbytes;
		nbytes -= readbytes;
		p += readbytes;
	}

	/*
	 * After reading into the buffer, check that what we read was valid. We do
	 * this after reading, because even though the segment was present when we
	 * opened it, it might get recycled or removed while we read it. The
	 * read() succeeds in that case, but the data we tried to read might
	 * already have been overwritten with new WAL records.
	 */
	XLogGetLastRemoved(&lastRemovedLog, &lastRemovedSeg);
	XLByteToSeg(startptr, log, seg);
	if (log < lastRemovedLog ||
		(log == lastRemovedLog && seg <= lastRemovedSeg))
	{
		char		filename[MAXFNAMELEN];

		XLogFileName(filename, ThisTimeLineID, log, seg);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("requested WAL segment %s has already been removed",
						filename)));
	}

	/*
	 * During recovery, the currently-open WAL file might be replaced with
	 * the file of the same name retrieved from archive. So we always need
	 * to check what we read was valid after reading into the buffer. If it's
	 * invalid, we try to open and read the file again.
	 */
	if (am_cascading_walsender)
	{
		/* use volatile pointer to prevent code rearrangement */
		volatile WalSnd *walsnd = MyWalSnd;
		bool		reload;

		SpinLockAcquire(&walsnd->mutex);
		reload = walsnd->needreload;
		walsnd->needreload = false;
		SpinLockRelease(&walsnd->mutex);

		if (reload && sendFile >= 0)
		{
			close(sendFile);
			sendFile = -1;

			goto retry;
		}
	}
}
Example #18
/*
 *	mdunlink() -- Unlink a relation.
 *
 * Note that we're passed a RelFileNode --- by the time this is called,
 * there won't be an SMgrRelation hashtable entry anymore.
 *
 * Actually, we don't unlink the first segment file of the relation, but
 * just truncate it to zero length, and record a request to unlink it after
 * the next checkpoint.  Additional segments can be unlinked immediately,
 * however.  Leaving the empty file in place prevents that relfilenode
 * number from being reused.  The scenario this protects us from is:
 * 1. We delete a relation (and commit, and actually remove its file).
 * 2. We create a new relation, which by chance gets the same relfilenode as
 *	  the just-deleted one (OIDs must've wrapped around for that to happen).
 * 3. We crash before another checkpoint occurs.
 * During replay, we would delete the file and then recreate it, which is fine
 * if the contents of the file were repopulated by subsequent WAL entries.
 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 * the contents of the file would be lost forever.	By leaving the empty file
 * until after the next checkpoint, we prevent reassignment of the relfilenode
 * number until it's safe, because relfilenode assignment skips over any
 * existing file.
 *
 * If isRedo is true, it's okay for the relation to be already gone.
 * Also, we should remove the file immediately instead of queuing a request
 * for later, since during redo there's no possibility of creating a
 * conflicting relation.
 *
 * Note: any failure should be reported as WARNING not ERROR, because
 * we are usually not in a transaction anymore when this is called.
 */
void
mdunlink(RelFileNode rnode, ForkNumber forkNum, bool isRedo)
{
	char	   *path;
	int			ret;

	/*
	 * We have to clean out any pending fsync requests for the doomed
	 * relation, else the next mdsync() will fail.
	 */
	ForgetRelationFsyncRequests(rnode, forkNum);

	path = relpath(rnode, forkNum);

	/*
	 * Delete or truncate the first segment.
	 */
	if (isRedo || forkNum != MAIN_FORKNUM)
	{
		ret = unlink(path);
		if (ret < 0)
		{
			if (!isRedo || errno != ENOENT)
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", path)));
		}
	}
	else
	{
		/* truncate(2) would be easier here, but Windows hasn't got it */
		int			fd;

		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
		if (fd >= 0)
		{
			int			save_errno;

			ret = ftruncate(fd, 0);
			save_errno = errno;
			close(fd);
			errno = save_errno;
		}
		else
			ret = -1;
		if (ret < 0 && errno != ENOENT)
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not truncate file \"%s\": %m", path)));
	}

	/*
	 * Delete any additional segments.
	 */
	if (ret >= 0)
	{
		char	   *segpath = (char *) palloc(strlen(path) + 12);
		BlockNumber segno;

		/*
		 * Note that because we loop until getting ENOENT, we will correctly
		 * remove all inactive segments as well as active ones.
		 */
		for (segno = 1;; segno++)
		{
			sprintf(segpath, "%s.%u", path, segno);
			if (unlink(segpath) < 0)
			{
				/* ENOENT is expected after the last segment... */
				if (errno != ENOENT)
					ereport(WARNING,
							(errcode_for_file_access(),
					   errmsg("could not remove file \"%s\": %m", segpath)));
				break;
			}
		}
		pfree(segpath);
	}

	pfree(path);

	/* Register request to unlink first segment later */
	if (!isRedo && forkNum == MAIN_FORKNUM)
		register_unlink(rnode);
}
Example #19
File: relmapper.c Project: GisKook/Gis
/*
 * Write out a new shared or local map file with the given contents.
 *
 * The magic number and CRC are automatically updated in *newmap.  On
 * success, we copy the data to the appropriate permanent static variable.
 *
 * If write_wal is TRUE then an appropriate WAL message is emitted.
 * (It will be false for bootstrap and WAL replay cases.)
 *
 * If send_sinval is TRUE then a SI invalidation message is sent.
 * (This should be true except in bootstrap case.)
 *
 * If preserve_files is TRUE then the storage manager is warned not to
 * delete the files listed in the map.
 *
 * Because this may be called during WAL replay when MyDatabaseId,
 * DatabasePath, etc aren't valid, we require the caller to pass in suitable
 * values.	The caller is also responsible for being sure no concurrent
 * map update could be happening.
 */
static void
write_relmap_file(bool shared, RelMapFile *newmap,
				  bool write_wal, bool send_sinval, bool preserve_files,
				  Oid dbid, Oid tsid, const char *dbpath)
{
	int			fd;
	RelMapFile *realmap;
	char		mapfilename[MAXPGPATH];

	/*
	 * Fill in the overhead fields and update CRC.
	 */
	newmap->magic = RELMAPPER_FILEMAGIC;
	if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
		elog(ERROR, "attempt to write bogus relation mapping");

	INIT_CRC32(newmap->crc);
	COMP_CRC32(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
	FIN_CRC32(newmap->crc);

	/*
	 * Open the target file.  We prefer to do this before entering the
	 * critical section, so that an open() failure need not force PANIC.
	 *
	 * Note: since we use BasicOpenFile, we are nominally responsible for
	 * ensuring the fd is closed on error.	In practice, this isn't important
	 * because either an error happens inside the critical section, or we are
	 * in bootstrap or WAL replay; so an error past this point is always fatal
	 * anyway.
	 */
	if (shared)
	{
		snprintf(mapfilename, sizeof(mapfilename), "global/%s",
				 RELMAPPER_FILENAME);
		realmap = &shared_map;
	}
	else
	{
		snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
				 dbpath, RELMAPPER_FILENAME);
		realmap = &local_map;
	}

	fd = BasicOpenFile(mapfilename,
					   O_WRONLY | O_CREAT | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open relation mapping file \"%s\": %m",
						mapfilename)));

	if (write_wal)
	{
		xl_relmap_update xlrec;
		XLogRecData rdata[2];
		XLogRecPtr	lsn;

		/* now errors are fatal ... */
		START_CRIT_SECTION();

		xlrec.dbid = dbid;
		xlrec.tsid = tsid;
		xlrec.nbytes = sizeof(RelMapFile);

		rdata[0].data = (char *) (&xlrec);
		rdata[0].len = MinSizeOfRelmapUpdate;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = &(rdata[1]);
		rdata[1].data = (char *) newmap;
		rdata[1].len = sizeof(RelMapFile);
		rdata[1].buffer = InvalidBuffer;
		rdata[1].next = NULL;

		lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE, rdata);

		/* As always, WAL must hit the disk before the data update does */
		XLogFlush(lsn);
	}

	errno = 0;
	if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to relation mapping file \"%s\": %m",
						mapfilename)));
	}

	/*
	 * We choose to fsync the data to disk before considering the task done.
	 * It would be possible to relax this if it turns out to be a performance
	 * issue, but it would complicate checkpointing --- see notes for
	 * CheckPointRelationMap.
	 */
	if (pg_fsync(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync relation mapping file \"%s\": %m",
						mapfilename)));

	if (close(fd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close relation mapping file \"%s\": %m",
						mapfilename)));

	/*
	 * Now that the file is safely on disk, send sinval message to let other
	 * backends know to re-read it.  We must do this inside the critical
	 * section: if for some reason we fail to send the message, we have to
	 * force a database-wide PANIC.  Otherwise other backends might continue
	 * execution with stale mapping information, which would be catastrophic
	 * as soon as others began to use the now-committed data.
	 */
	if (send_sinval)
		CacheInvalidateRelmap(dbid);

	/*
	 * Make sure that the files listed in the map are not deleted if the outer
	 * transaction aborts.	This had better be within the critical section
	 * too: it's not likely to fail, but if it did, we'd arrive at transaction
	 * abort with the files still vulnerable.  PANICing will leave things in a
	 * good state on-disk.
	 *
	 * Note: we're cheating a little bit here by assuming that mapped files
	 * are either in pg_global or the database's default tablespace.
	 */
	if (preserve_files)
	{
		int32		i;

		for (i = 0; i < newmap->num_mappings; i++)
		{
			RelFileNode rnode;

			rnode.spcNode = tsid;
			rnode.dbNode = dbid;
			rnode.relNode = newmap->mappings[i].mapfilenode;
			RelationPreserveStorage(rnode);
		}
	}

	/* Success, update permanent copy */
	memcpy(realmap, newmap, sizeof(RelMapFile));

	/* Critical section done */
	if (write_wal)
		END_CRIT_SECTION();
}
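
write_relmap_file() can checksum the record in a single pass only because the crc field is placed last in RelMapFile, so the covered bytes are simply everything before offsetof(RelMapFile, crc). The sketch below shows the same layout trick with a toy struct and a toy checksum; it deliberately does not use the backend's CRC-32 macros, and the magic number is arbitrary.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy record: the checksum field comes last, like RelMapFile.crc */
typedef struct
{
	uint32_t	magic;
	uint32_t	payload[4];
	uint32_t	crc;			/* covers every byte before this field */
} ToyMapFile;

/* Toy additive checksum standing in for the backend's CRC-32 macros */
static uint32_t
toy_checksum(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint32_t	sum = 0;

	while (len-- > 0)
		sum = sum * 31 + *p++;
	return sum;
}

int
main(void)
{
	ToyMapFile map;

	memset(&map, 0, sizeof(map));
	map.magic = 0x544F594D;		/* arbitrary magic number for the toy format */
	map.payload[0] = 42;

	/* checksum everything up to, but not including, the crc field itself */
	map.crc = toy_checksum(&map, offsetof(ToyMapFile, crc));
	printf("crc covers %zu of %zu bytes: 0x%08x\n",
		   offsetof(ToyMapFile, crc), sizeof(map), map.crc);
	return 0;
}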
Example #20
File: relmapper.c Project: GisKook/Gis
/*
 * load_relmap_file -- load data from the shared or local map file
 *
 * Because the map file is essential for access to core system catalogs,
 * failure to read it is a fatal error.
 *
 * Note that the local case requires DatabasePath to be set up.
 */
static void
load_relmap_file(bool shared)
{
	RelMapFile *map;
	char		mapfilename[MAXPGPATH];
	pg_crc32	crc;
	int			fd;

	if (shared)
	{
		snprintf(mapfilename, sizeof(mapfilename), "global/%s",
				 RELMAPPER_FILENAME);
		map = &shared_map;
	}
	else
	{
		snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
				 DatabasePath, RELMAPPER_FILENAME);
		map = &local_map;
	}

	/* Read data ... */
	fd = BasicOpenFile(mapfilename, O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not open relation mapping file \"%s\": %m",
						mapfilename)));

	/*
	 * Note: we could take RelationMappingLock in shared mode here, but it
	 * seems unnecessary since our read() should be atomic against any
	 * concurrent updater's write().  If the file is updated shortly after we
	 * look, the sinval signaling mechanism will make us re-read it before we
	 * are able to access any relation that's affected by the change.
	 */
	if (read(fd, map, sizeof(RelMapFile)) != sizeof(RelMapFile))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read relation mapping file \"%s\": %m",
						mapfilename)));

	close(fd);

	/* check for correct magic number, etc */
	if (map->magic != RELMAPPER_FILEMAGIC ||
		map->num_mappings < 0 ||
		map->num_mappings > MAX_MAPPINGS)
		ereport(FATAL,
				(errmsg("relation mapping file \"%s\" contains invalid data",
						mapfilename)));

	/* verify the CRC */
	INIT_CRC32(crc);
	COMP_CRC32(crc, (char *) map, offsetof(RelMapFile, crc));
	FIN_CRC32(crc);

	if (!EQ_CRC32(crc, map->crc))
		ereport(FATAL,
		  (errmsg("relation mapping file \"%s\" contains incorrect checksum",
				  mapfilename)));
}
Example #21
File: copydir.c Project: botp/postgres
/*
 * copy one file
 */
void
copy_file(char *fromfile, char *tofile)
{
	char	   *buffer;
	int			srcfd;
	int			dstfd;
	int			nbytes;
	off_t		offset;

	/* Use palloc to ensure we get a maxaligned buffer */
#define COPY_BUF_SIZE (8 * BLCKSZ)

	buffer = palloc(COPY_BUF_SIZE);

	/*
	 * Open the files
	 */
	srcfd = BasicOpenFile(fromfile, O_RDONLY | PG_BINARY, 0);
	if (srcfd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fromfile)));

	dstfd = BasicOpenFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
						  S_IRUSR | S_IWUSR);
	if (dstfd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tofile)));

	/*
	 * Do the data copying.
	 */
	for (offset = 0;; offset += nbytes)
	{
		/* If we got a cancel signal during the copy of the file, quit */
		CHECK_FOR_INTERRUPTS();

		nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m", fromfile)));
		if (nbytes == 0)
			break;
		errno = 0;
		if ((int) write(dstfd, buffer, nbytes) != nbytes)
		{
			/* if write didn't set errno, assume problem is no disk space */
			if (errno == 0)
				errno = ENOSPC;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tofile)));
		}

		/*
		 * We fsync the files later but first flush them to avoid spamming the
		 * cache and hopefully get the kernel to start writing them out before
		 * the fsync comes.  Ignore any error, since it's only a hint.
		 */
		(void) pg_flush_data(dstfd, offset, nbytes);
	}

	if (close(dstfd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tofile)));

	close(srcfd);

	pfree(buffer);
}
Example #22
/*
 * CheckPointTwoPhase -- handle 2PC component of checkpointing.
 *
 * We must fsync the state file of any GXACT that is valid and has a PREPARE
 * LSN <= the checkpoint's redo horizon.  (If the gxact isn't valid yet or
 * has a later LSN, this checkpoint is not responsible for fsyncing it.)
 *
 * This is deliberately run as late as possible in the checkpoint sequence,
 * because GXACTs ordinarily have short lifespans, and so it is quite
 * possible that GXACTs that were valid at checkpoint start will no longer
 * exist if we wait a little bit.
 *
 * If a GXACT remains valid across multiple checkpoints, it'll be fsynced
 * each time.  This is considered unusual enough that we don't bother to
 * expend any extra code to avoid the redundant fsyncs.  (They should be
 * reasonably cheap anyway, since they won't cause I/O.)
 */
void
CheckPointTwoPhase(XLogRecPtr redo_horizon)
{
	TransactionId *xids;
	int			nxids;
	char		path[MAXPGPATH];
	int			i;

	/*
	 * We don't want to hold the TwoPhaseStateLock while doing I/O, so we grab
	 * it just long enough to make a list of the XIDs that require fsyncing,
	 * and then do the I/O afterwards.
	 *
	 * This approach creates a race condition: someone else could delete a
	 * GXACT between the time we release TwoPhaseStateLock and the time we try
	 * to open its state file.	We handle this by special-casing ENOENT
	 * failures: if we see that, we verify that the GXACT is no longer valid,
	 * and if so ignore the failure.
	 */
	if (max_prepared_xacts <= 0)
		return;					/* nothing to do */

	TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_START();

	xids = (TransactionId *) palloc(max_prepared_xacts * sizeof(TransactionId));
	nxids = 0;

	LWLockAcquire(TwoPhaseStateLock, LW_SHARED);

	for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
	{
		GlobalTransaction gxact = TwoPhaseState->prepXacts[i];

		if (gxact->valid &&
			XLByteLE(gxact->prepare_lsn, redo_horizon))
			xids[nxids++] = gxact->proc.xid;
	}

	LWLockRelease(TwoPhaseStateLock);

	for (i = 0; i < nxids; i++)
	{
		TransactionId xid = xids[i];
		int			fd;

		TwoPhaseFilePath(path, xid);

		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
		if (fd < 0)
		{
			if (errno == ENOENT)
			{
				/* OK if gxact is no longer valid */
				if (!TransactionIdIsPrepared(xid))
					continue;
				/* Restore errno in case it was changed */
				errno = ENOENT;
			}
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open two-phase state file \"%s\": %m",
							path)));
		}

		if (pg_fsync(fd) != 0)
		{
			close(fd);
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not fsync two-phase state file \"%s\": %m",
							path)));
		}

		if (close(fd) != 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not close two-phase state file \"%s\": %m",
							path)));
	}

	pfree(xids);

	TRACE_POSTGRESQL_TWOPHASE_CHECKPOINT_DONE();
}
Example #23
/*
 * Finish preparing state file.
 *
 * Calculates CRC and writes state file to WAL and in pg_twophase directory.
 */
void
EndPrepare(GlobalTransaction gxact)
{
	TransactionId xid = gxact->proc.xid;
	TwoPhaseFileHeader *hdr;
	char		path[MAXPGPATH];
	XLogRecData *record;
	pg_crc32	statefile_crc;
	pg_crc32	bogus_crc;
	int			fd;

	/* Add the end sentinel to the list of 2PC records */
	RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0,
						   NULL, 0);

	/* Go back and fill in total_len in the file header record */
	hdr = (TwoPhaseFileHeader *) records.head->data;
	Assert(hdr->magic == TWOPHASE_MAGIC);
	hdr->total_len = records.total_len + sizeof(pg_crc32);

	/*
	 * If the file size exceeds MaxAllocSize, we won't be able to read it in
	 * ReadTwoPhaseFile. Check for that now, rather than fail at commit time.
	 */
	if (hdr->total_len > MaxAllocSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("two-phase state file maximum length exceeded")));

	/*
	 * Create the 2PC state file.
	 *
	 * Note: because we use BasicOpenFile(), we are responsible for ensuring
	 * the FD gets closed in any error exit path.  Once we get into the
	 * critical section, though, it doesn't matter since any failure causes
	 * PANIC anyway.
	 */
	TwoPhaseFilePath(path, xid);

	fd = BasicOpenFile(path,
					   O_CREAT | O_EXCL | O_WRONLY | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create two-phase state file \"%s\": %m",
						path)));

	/* Write data to file, and calculate CRC as we pass over it */
	INIT_CRC32(statefile_crc);

	for (record = records.head; record != NULL; record = record->next)
	{
		COMP_CRC32(statefile_crc, record->data, record->len);
		if ((write(fd, record->data, record->len)) != record->len)
		{
			close(fd);
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write two-phase state file: %m")));
		}
	}

	FIN_CRC32(statefile_crc);

	/*
	 * Write a deliberately bogus CRC to the state file; this is just paranoia
	 * to catch the case where four more bytes will run us out of disk space.
	 */
	bogus_crc = ~statefile_crc;

	if ((write(fd, &bogus_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	/* Back up to prepare for rewriting the CRC */
	if (lseek(fd, -((off_t) sizeof(pg_crc32)), SEEK_CUR) < 0)
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek in two-phase state file: %m")));
	}

	/*
	 * The state file isn't valid yet, because we haven't written the correct
	 * CRC yet.  Before we do that, insert entry in WAL and flush it to disk.
	 *
	 * Between the time we have written the WAL entry and the time we write
	 * out the correct state file CRC, we have an inconsistency: the xact is
	 * prepared according to WAL but not according to our on-disk state. We
	 * use a critical section to force a PANIC if we are unable to complete
	 * the write --- then, WAL replay should repair the inconsistency.	The
	 * odds of a PANIC actually occurring should be very tiny given that we
	 * were able to write the bogus CRC above.
	 *
	 * We have to set inCommit here, too; otherwise a checkpoint starting
	 * immediately after the WAL record is inserted could complete without
	 * fsync'ing our state file.  (This is essentially the same kind of race
	 * condition as the COMMIT-to-clog-write case that RecordTransactionCommit
	 * uses inCommit for; see notes there.)
	 *
	 * We save the PREPARE record's location in the gxact for later use by
	 * CheckPointTwoPhase.
	 */
	START_CRIT_SECTION();

	MyProc->inCommit = true;

	gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE,
									records.head);
	XLogFlush(gxact->prepare_lsn);

	/* If we crash now, we have prepared: WAL replay will fix things */

	/* write correct CRC and close file */
	if ((write(fd, &statefile_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
	{
		close(fd);
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write two-phase state file: %m")));
	}

	if (close(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close two-phase state file: %m")));

	/*
	 * Mark the prepared transaction as valid.	As soon as xact.c marks MyProc
	 * as not running our XID (which it will do immediately after this
	 * function returns), others can commit/rollback the xact.
	 *
	 * NB: a side effect of this is to make a dummy ProcArray entry for the
	 * prepared XID.  This must happen before we clear the XID from MyProc,
	 * else there is a window where the XID is not running according to
	 * TransactionIdIsInProgress, and onlookers would be entitled to assume
	 * the xact crashed.  Instead we have a window where the same XID appears
	 * twice in ProcArray, which is OK.
	 */
	MarkAsPrepared(gxact);

	/*
	 * Now we can mark ourselves as out of the commit critical section: a
	 * checkpoint starting after this will certainly see the gxact as a
	 * candidate for fsyncing.
	 */
	MyProc->inCommit = false;

	END_CRIT_SECTION();

	records.tail = records.head = NULL;
}
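The trickiest part of EndPrepare is the checksum dance: the state file is first finished with a deliberately wrong CRC, the WAL record is flushed, and only then is the placeholder overwritten with the real value, so that running out of disk space is caught before the point of no return. Below is a minimal standalone sketch of that reserve-then-rewrite pattern using plain POSIX calls; the file name, the payload, and the toy_checksum helper are illustrative assumptions, not PostgreSQL code.

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

/* stand-in for INIT_CRC32/COMP_CRC32/FIN_CRC32; not a real CRC */
static uint32_t
toy_checksum(const char *buf, size_t len)
{
	uint32_t	sum = 0;

	while (len-- > 0)
		sum = (sum << 1) ^ (uint32_t) (unsigned char) *buf++;
	return sum;
}

int
main(void)
{
	const char	payload[] = "two-phase state payload";
	uint32_t	crc = toy_checksum(payload, sizeof(payload));
	uint32_t	bogus_crc = ~crc;
	int			fd;

	fd = open("demo.state", O_CREAT | O_EXCL | O_WRONLY, 0600);
	if (fd < 0)
		return 1;

	/* write the payload, then a deliberately bogus CRC to reserve space */
	if (write(fd, payload, sizeof(payload)) != sizeof(payload) ||
		write(fd, &bogus_crc, sizeof(bogus_crc)) != sizeof(bogus_crc))
		return 1;

	/* ... point of no return (the WAL insert/flush in the original) ... */

	/* back up over the placeholder and write the real CRC */
	if (lseek(fd, -(off_t) sizeof(crc), SEEK_CUR) < 0 ||
		write(fd, &crc, sizeof(crc)) != sizeof(crc))
		return 1;

	return close(fd) == 0 ? 0 : 1;
}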
Example #24
0
/*
 * Read 'nbytes' bytes from WAL into 'buf', starting at location 'recptr'
 *
 * XXX probably this should be improved to suck data directly from the
 * WAL buffers when possible.
 *
 * Will open, and keep open, one WAL segment stored in the global file
 * descriptor sendFile. This means if XLogRead is used once, there will
 * always be one descriptor left open until the process ends, but never
 * more than one.
 */
void
XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
{
    XLogRecPtr	startRecPtr = recptr;
    char		path[MAXPGPATH];
    uint32		lastRemovedLog;
    uint32		lastRemovedSeg;
    uint32		log;
    uint32		seg;

    while (nbytes > 0)
    {
        uint32		startoff;
        int			segbytes;
        int			readbytes;

        startoff = recptr.xrecoff % XLogSegSize;

        if (sendFile < 0 || !XLByteInSeg(recptr, sendId, sendSeg))
        {
            /* Switch to another logfile segment */
            if (sendFile >= 0)
                close(sendFile);

            XLByteToSeg(recptr, sendId, sendSeg);
            XLogFilePath(path, ThisTimeLineID, sendId, sendSeg);

            sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
            if (sendFile < 0)
            {
                /*
                 * If the file is not found, assume it's because the standby
                 * asked for a too old WAL segment that has already been
                 * removed or recycled.
                 */
                if (errno == ENOENT)
                {
                    char		filename[MAXFNAMELEN];

                    XLogFileName(filename, ThisTimeLineID, sendId, sendSeg);
                    ereport(ERROR,
                            (errcode_for_file_access(),
                             errmsg("requested WAL segment %s has already been removed",
                                    filename)));
                }
                else
                    ereport(ERROR,
                            (errcode_for_file_access(),
                             errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
                                    path, sendId, sendSeg)));
            }
            sendOff = 0;
        }

        /* Need to seek in the file? */
        if (sendOff != startoff)
        {
            if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not seek in log file %u, segment %u to offset %u: %m",
                                sendId, sendSeg, startoff)));
            sendOff = startoff;
        }

        /* How many bytes are within this segment? */
        if (nbytes > (XLogSegSize - startoff))
            segbytes = XLogSegSize - startoff;
        else
            segbytes = nbytes;

        readbytes = read(sendFile, buf, segbytes);
        if (readbytes <= 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read from log file %u, segment %u, offset %u, "
                            "length %lu: %m",
                            sendId, sendSeg, sendOff, (unsigned long) segbytes)));

        /* Update state for read */
        XLByteAdvance(recptr, readbytes);

        sendOff += readbytes;
        nbytes -= readbytes;
        buf += readbytes;
    }

    /*
     * After reading into the buffer, check that what we read was valid. We do
     * this after reading, because even though the segment was present when we
     * opened it, it might get recycled or removed while we read it. The
     * read() succeeds in that case, but the data we tried to read might
     * already have been overwritten with new WAL records.
     */
    XLogGetLastRemoved(&lastRemovedLog, &lastRemovedSeg);
    XLByteToSeg(startRecPtr, log, seg);
    if (log < lastRemovedLog ||
            (log == lastRemovedLog && seg <= lastRemovedSeg))
    {
        char		filename[MAXFNAMELEN];

        XLogFileName(filename, ThisTimeLineID, log, seg);
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("requested WAL segment %s has already been removed",
                        filename)));
    }
}
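The heart of the loop above is the arithmetic that splits a read across fixed-size WAL segments: the offset within the current segment is recptr.xrecoff % XLogSegSize, and at most XLogSegSize - startoff bytes can be taken before switching files. Here is a small standalone sketch of that splitting logic, with a made-up 16 MB segment size and no actual file I/O.

#include <stdint.h>
#include <stdio.h>

#define SEG_SIZE ((uint64_t) 16 * 1024 * 1024)	/* illustrative segment size */

/*
 * Print one (segment, offset, length) triple per segment touched by a read
 * of 'nbytes' bytes starting at absolute position 'start'.
 */
static void
split_read(uint64_t start, uint64_t nbytes)
{
	while (nbytes > 0)
	{
		uint64_t	segno = start / SEG_SIZE;
		uint64_t	startoff = start % SEG_SIZE;
		uint64_t	chunk = SEG_SIZE - startoff;

		if (chunk > nbytes)
			chunk = nbytes;

		printf("segment %llu: offset %llu, length %llu\n",
			   (unsigned long long) segno,
			   (unsigned long long) startoff,
			   (unsigned long long) chunk);

		start += chunk;
		nbytes -= chunk;
	}
}

int
main(void)
{
	/* a read that straddles a segment boundary */
	split_read(SEG_SIZE - 8192, 32768);
	return 0;
}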
Example #25
0
File: slru.c Project: panyx0718/shadow
/*
 * Physical read of a (previously existing) page into a buffer slot
 *
 * On failure, we cannot just ereport(ERROR) since caller has put state in
 * shared memory that must be undone.  So, we return FALSE and save enough
 * info in static variables to let SlruReportIOError make the report.
 *
 * For now, assume it's not worth keeping a file pointer open across
 * read/write operations.  We could cache one virtual file pointer ...
 */
static bool
SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
{
	SlruShared	shared = ctl->shared;
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
	int			offset = rpageno * BLCKSZ;
	char		path[MAXPGPATH];
	int			fd;
#ifdef XP_TRACE_LRU_READ
	struct timeval tv;
#endif

	SlruFileName(ctl, path, segno);

	/*
	 * In a crash-and-restart situation, it's possible for us to receive
	 * commands to set the commit status of transactions whose bits are in
	 * already-truncated segments of the commit log (see notes in
	 * SlruPhysicalWritePage).	Hence, if we are InRecovery, allow the case
	 * where the file doesn't exist, and return zeroes instead.
	 */
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
	if (fd < 0)
	{
		if (errno != ENOENT || !InRecovery)
		{
			slru_errcause = SLRU_OPEN_FAILED;
			slru_errno = errno;
			return false;
		}

		ereport(LOG,
				(errmsg("file \"%s\" doesn't exist, reading as zeroes",
						path)));
		MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
		return true;
	}

	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
	{
		slru_errcause = SLRU_SEEK_FAILED;
		slru_errno = errno;
		close(fd);
		return false;
	}

	errno = 0;
	if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
	{
		slru_errcause = SLRU_READ_FAILED;
		slru_errno = errno;
		close(fd);
		return false;
	}
#ifdef XP_TRACE_LRU_READ
	gettimeofday(&tv, NULL);
#ifdef STACK_TRACE
	xp_stack_trace(TRACE_SIZE);
#endif
	ereport(TRACE_LEVEL,
		(errmsg("%ld.%ld:\tREAD:\tSlruPhysicalReadPage:\tfile:%s",
				tv.tv_sec, tv.tv_usec, path)));
#endif

	if (close(fd))
	{
		slru_errcause = SLRU_CLOSE_FAILED;
		slru_errno = errno;
		return false;
	}

	return true;
}
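Because the caller still has to undo shared-memory state before reporting, the function stashes the failure kind in slru_errcause and the errno in slru_errno and simply returns false; the actual ereport happens later. The following is a hypothetical sketch of that save-now-report-later pattern in plain C; the enum, the globals, and the function names are stand-ins, not the real SlruReportIOError machinery.

#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

typedef enum { IO_OK, IO_OPEN_FAILED, IO_READ_FAILED } IoErrCause;

static IoErrCause saved_cause = IO_OK;
static int	saved_errno = 0;

/* On failure, record what went wrong and let the caller clean up first. */
static bool
read_block(const char *path, char *buf, size_t len, off_t offset)
{
	int			fd = open(path, O_RDONLY);

	if (fd < 0)
	{
		saved_cause = IO_OPEN_FAILED;
		saved_errno = errno;
		return false;
	}

	/* clear errno so a short read is not blamed on a stale error code */
	errno = 0;
	if (pread(fd, buf, len, offset) != (ssize_t) len)
	{
		saved_cause = IO_READ_FAILED;
		saved_errno = errno;
		close(fd);
		return false;
	}
	close(fd);
	return true;
}

/* Called only after the caller has undone its shared state. */
static void
report_io_error(const char *path)
{
	fprintf(stderr, "%s: %s failed: %s\n", path,
			saved_cause == IO_OPEN_FAILED ? "open" : "read",
			strerror(saved_errno));
}

int
main(void)
{
	char		buf[8192];

	if (!read_block("no-such-file", buf, sizeof(buf), 0))
		report_io_error("no-such-file");
	return 0;
}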
Example #26
0
/*
 * Physical write of a page from a buffer slot
 *
 * On failure, we cannot just ereport(ERROR) since caller has put state in
 * shared memory that must be undone.  So, we return FALSE and save enough
 * info in static variables to let SlruReportIOError make the report.
 *
 * For now, assume it's not worth keeping a file pointer open across
 * independent read/write operations.  We do batch operations during
 * SimpleLruFlush, though.
 *
 * fdata is NULL for a standalone write, pointer to open-file info during
 * SimpleLruFlush.
 */
static bool
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
{
	SlruShared	shared = ctl->shared;
	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
	int			offset = rpageno * BLCKSZ;
	char		path[MAXPGPATH];
	int			fd = -1;

	/*
	 * During a Flush, we may already have the desired file open.
	 */
	if (fdata)
	{
		int			i;

		for (i = 0; i < fdata->num_files; i++)
		{
			if (fdata->segno[i] == segno)
			{
				fd = fdata->fd[i];
				break;
			}
		}
	}

	if (fd < 0)
	{
		/*
		 * If the file doesn't already exist, we should create it.  It is
		 * possible for this to need to happen when writing a page that's not
		 * first in its segment; we assume the OS can cope with that. (Note:
		 * it might seem that it'd be okay to create files only when
		 * SimpleLruZeroPage is called for the first page of a segment.
		 * However, if after a crash and restart the REDO logic elects to
		 * replay the log from a checkpoint before the latest one, then it's
		 * possible that we will get commands to set transaction status of
		 * transactions that have already been truncated from the commit log.
		 * Easiest way to deal with that is to accept references to
		 * nonexistent files here and in SlruPhysicalReadPage.)
		 *
		 * Note: it is possible for more than one backend to be executing
		 * this code simultaneously for different pages of the same file.
		 * Hence, don't use O_EXCL or O_TRUNC or anything like that.
		 */
		SlruFileName(ctl, path, segno);
		fd = BasicOpenFile(path, O_RDWR | O_CREAT | PG_BINARY,
						   S_IRUSR | S_IWUSR);
		if (fd < 0)
		{
			slru_errcause = SLRU_OPEN_FAILED;
			slru_errno = errno;
			return false;
		}

		if (fdata)
		{
			fdata->fd[fdata->num_files] = fd;
			fdata->segno[fdata->num_files] = segno;
			fdata->num_files++;
		}
	}

	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
	{
		slru_errcause = SLRU_SEEK_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}

	errno = 0;
	if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		slru_errcause = SLRU_WRITE_FAILED;
		slru_errno = errno;
		if (!fdata)
			close(fd);
		return false;
	}

	/*
	 * If not part of Flush, need to fsync now.  We assume this happens
	 * infrequently enough that it's not a performance issue.
	 */
	if (!fdata)
	{
		if (ctl->do_fsync && pg_fsync(fd))
		{
			slru_errcause = SLRU_FSYNC_FAILED;
			slru_errno = errno;
			close(fd);
			return false;
		}

		if (close(fd))
		{
			slru_errcause = SLRU_CLOSE_FAILED;
			slru_errno = errno;
			return false;
		}
	}

	return true;
}
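Note the two write paths: a standalone write opens, writes, fsyncs, and closes the segment itself, while a write that is part of SimpleLruFlush reuses a descriptor cached in fdata and leaves the fsync and close to the end of the flush. Below is a rough sketch of that batching idea; FlushState, MAX_FLUSH_FILES, and the function names are assumptions for illustration only.

#include <fcntl.h>
#include <stdbool.h>
#include <unistd.h>

#define MAX_FLUSH_FILES 16

typedef struct FlushState
{
	int			num_files;
	int			fd[MAX_FLUSH_FILES];
	int			segno[MAX_FLUSH_FILES];
} FlushState;

/* Return the cached descriptor for segno, or -1 if it is not open yet. */
static int
lookup_cached_fd(FlushState *fs, int segno)
{
	int			i;

	for (i = 0; i < fs->num_files; i++)
	{
		if (fs->segno[i] == segno)
			return fs->fd[i];
	}
	return -1;
}

/* After every dirty page is written: one fsync and one close per file. */
static bool
finish_flush(FlushState *fs)
{
	bool		ok = true;
	int			i;

	for (i = 0; i < fs->num_files; i++)
	{
		if (fsync(fs->fd[i]) != 0 || close(fs->fd[i]) != 0)
			ok = false;
	}
	fs->num_files = 0;
	return ok;
}

int
main(void)
{
	FlushState	fs = {0};

	/* a real flush would add descriptors here as segments are first opened */
	(void) lookup_cached_fd(&fs, 3);	/* -1: segment 3 not open yet */
	return finish_flush(&fs) ? 0 : 1;
}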
Example #27
0
/*
 * GetNewRelFileNode
 *		Generate a new relfilenode number that is unique within the given
 *		tablespace.
 *
 * If the relfilenode will also be used as the relation's OID, pass the
 * opened pg_class catalog, and this routine will guarantee that the result
 * is also an unused OID within pg_class.  If the result is to be used only
 * as a relfilenode for an existing relation, pass NULL for pg_class.
 *
 * As with GetNewOid, there is some theoretical risk of a race condition,
 * but it doesn't seem worth worrying about.
 *
 * Note: we don't support using this in bootstrap mode.  All relations
 * created by bootstrap have preassigned OIDs, so there's no need.
 */
Oid
GetNewRelFileNode(Oid reltablespace, Relation pg_class, char relpersistence)
{
	RelFileNodeBackend rnode;
	char	   *rpath;
	int			fd;
	bool		collides;
	BackendId	backend;

	switch (relpersistence)
	{
		case RELPERSISTENCE_TEMP:
			backend = MyBackendId;
			break;
		case RELPERSISTENCE_UNLOGGED:
		case RELPERSISTENCE_PERMANENT:
			backend = InvalidBackendId;
			break;
		default:
			elog(ERROR, "invalid relpersistence: %c", relpersistence);
			return InvalidOid;	/* placate compiler */
	}

	/* This logic should match RelationInitPhysicalAddr */
	rnode.node.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
	rnode.node.dbNode = (rnode.node.spcNode == GLOBALTABLESPACE_OID) ? InvalidOid : MyDatabaseId;

	/*
	 * The relpath will vary based on the backend ID, so we must initialize
	 * that properly here to make sure that any collisions based on filename
	 * are properly detected.
	 */
	rnode.backend = backend;

	do
	{
		CHECK_FOR_INTERRUPTS();

		/* Generate the OID */
		if (pg_class)
			rnode.node.relNode = GetNewOid(pg_class);
		else
			rnode.node.relNode = GetNewObjectId();

		/* Check for existing file of same name */
		rpath = relpath(rnode, MAIN_FORKNUM);
		fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

		if (fd >= 0)
		{
			/* definite collision */
			close(fd);
			collides = true;
		}
		else
		{
			/*
			 * Here we have a little bit of a dilemma: if errno is something
			 * other than ENOENT, should we declare a collision and loop? In
			 * particular one might think this advisable for, say, EPERM.
			 * However there really shouldn't be any unreadable files in a
			 * tablespace directory, and if the EPERM is actually complaining
			 * that we can't read the directory itself, we'd be in an infinite
			 * loop.  In practice it seems best to go ahead regardless of the
			 * errno.  If there is a colliding file we will get an smgr
			 * failure when we attempt to create the new relation file.
			 */
			collides = false;
		}

		pfree(rpath);
	} while (collides);

	return rnode.node.relNode;
}
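The loop above is a simple probe-and-retry: generate a candidate number, try to open a file with the corresponding name, and keep going until the probe finds nothing. A minimal sketch of the same idea outside PostgreSQL follows; next_candidate(), the path layout, and the directory are illustrative assumptions.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static unsigned int counter = 12345;

static unsigned int
next_candidate(void)
{
	return counter++;			/* stand-in for GetNewObjectId() */
}

static unsigned int
pick_unused_filenode(const char *dir)
{
	char		path[1024];
	unsigned int candidate;
	int			fd;

	do
	{
		candidate = next_candidate();
		snprintf(path, sizeof(path), "%s/%u", dir, candidate);

		/* Probe for an existing file; any open failure counts as "free". */
		fd = open(path, O_RDONLY);
		if (fd >= 0)
			close(fd);
	} while (fd >= 0);

	return candidate;
}

int
main(void)
{
	printf("picked %u\n", pick_unused_filenode("/tmp"));
	return 0;
}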
Example #28
0
/**
 * @brief Initialize a DirectWriter
 */
static void
DirectWriterInit(DirectWriter *self)
{
	LoadStatus		   *ls;

	/*
	 * Set defaults for unspecified parameters.
	 */
	if (self->base.max_dup_errors < -1)
		self->base.max_dup_errors = DEFAULT_MAX_DUP_ERRORS;

	self->base.rel = heap_open(self->base.relid, AccessExclusiveLock);
	VerifyTarget(self->base.rel, self->base.max_dup_errors);

	self->base.desc = RelationGetDescr(self->base.rel);

	SpoolerOpen(&self->spooler, self->base.rel, false, self->base.on_duplicate,
				self->base.max_dup_errors, self->base.dup_badfile);
	self->base.context = GetPerTupleMemoryContext(self->spooler.estate);

	/* Verify DataDir/pg_bulkload directory */
	ValidateLSFDirectory(BULKLOAD_LSF_DIR);

	/* Initialize first block */
	PageInit(GetCurrentPage(self), BLCKSZ, 0);
	PageSetTLI(GetCurrentPage(self), ThisTimeLineID);

	/* Obtain transaction ID and command ID. */
	self->xid = GetCurrentTransactionId();
	self->cid = GetCurrentCommandId(true);

	/*
	 * Initialize load status information
	 */
	ls = &self->ls;
	ls->ls.relid = self->base.relid;
	ls->ls.rnode = self->base.rel->rd_node;
	ls->ls.exist_cnt = RelationGetNumberOfBlocks(self->base.rel);
	ls->ls.create_cnt = 0;

	/*
	 * Create a load status file and write the initial status to it.
	 * If an existing load status file is found at this point, exit with an
	 * error, because the recovery process has not yet been run after a
	 * failed load into the same table.
	 */
	BULKLOAD_LSF_PATH(self->lsf_path, ls);
	self->lsf_fd = BasicOpenFile(self->lsf_path,
		O_CREAT | O_EXCL | O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
	if (self->lsf_fd == -1)
		ereport(ERROR, (errcode_for_file_access(),
			errmsg("could not create loadstatus file \"%s\": %m", self->lsf_path)));

	if (write(self->lsf_fd, ls, sizeof(LoadStatus)) != sizeof(LoadStatus) ||
		pg_fsync(self->lsf_fd) != 0)
	{
		UnlinkLSF(self);
		ereport(ERROR, (errcode_for_file_access(),
			errmsg("could not write loadstatus file \"%s\": %m", self->lsf_path)));
	}

	self->base.tchecker = CreateTupleChecker(self->base.desc);
	self->base.tchecker->checker = (CheckerTupleProc) CoercionCheckerTuple;
}
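Creating the load status file with O_CREAT | O_EXCL is what enforces the rule described in the comment above: if a status file from an unrecovered load is still lying around, the open fails with EEXIST and the new load is refused. A standalone sketch of that guard (the file name here is made up):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "demo.loadstatus";
	int			fd1;
	int			fd2;

	fd1 = open(path, O_CREAT | O_EXCL | O_RDWR, 0600);
	if (fd1 < 0)
	{
		perror("first create");
		return 1;
	}

	/* a second exclusive create of the same file must fail with EEXIST */
	fd2 = open(path, O_CREAT | O_EXCL | O_RDWR, 0600);
	if (fd2 < 0 && errno == EEXIST)
		printf("leftover status file detected, refusing to start a new load\n");

	close(fd1);
	unlink(path);
	return 0;
}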