Example #1
0
/*
 * Record a data block number in a logical tape's lowest indirect block,
 * or record an indirect block's number in the next higher indirect level.
 */
static void
ltsRecordBlockNum(LogicalTapeSet *lts, IndirectBlock *indirect,
				  long blocknum)
{
	if (indirect->nextSlot >= BLOCKS_PER_INDIR_BLOCK)
	{
		/*
		 * This indirect block is full, so dump it out and recursively save
		 * its address in the next indirection level.  Create a new
		 * indirection level if there wasn't one before.
		 */
		long		indirblock = ltsGetFreeBlock(lts);

		ltsWriteBlock(lts, indirblock, (void *) indirect->ptrs);
		if (indirect->nextup == NULL)
		{
			indirect->nextup = (IndirectBlock *) palloc(sizeof(IndirectBlock));
			indirect->nextup->nextSlot = 0;
			indirect->nextup->nextup = NULL;
		}
		ltsRecordBlockNum(lts, indirect->nextup, indirblock);

		/*
		 * Reset to fill another indirect block at this level.
		 */
		indirect->nextSlot = 0;
	}
	indirect->ptrs[indirect->nextSlot++] = blocknum;
}
Example #2
0
/*
 * Dump the dirty buffer of a logical tape.
 */
static void
ltsDumpBuffer(LogicalTapeSet *lts, LogicalTape *lt)
{
	long		datablock = ltsGetFreeBlock(lts);

	Assert(lt->dirty);
	ltsWriteBlock(lts, datablock, (void *) lt->buffer);
	ltsRecordBlockNum(lts, lt->indirect, datablock);
	lt->dirty = false;
	/* Caller must do other state update as needed */
}
Example #3
0
/*
 * Rewind logical tape and switch from writing to reading or vice versa.
 *
 * Unless the tape has been "frozen" in read state, forWrite must be the
 * opposite of the previous tape state.
 */
void 
LogicalTapeRewind(LogicalTapeSet *lts, LogicalTape *lt, bool forWrite)
{
	AssertEquivalent(lt->firstBlkNum==-1, lt->currPos.blkNum == -1);

	if (!forWrite)
	{
		if (lt->writing)
		{
			if(lt->firstBlkNum != -1)
			{
				Assert(lt->currBlk.next_blk == -1L);
				ltsWriteBlock(lts, lt->currPos.blkNum, &lt->currBlk);

				if(lt->currPos.blkNum != lt->firstBlkNum)
					ltsReadBlock(lts, lt->firstBlkNum, &lt->currBlk);
			}
			
			lt->currPos.blkNum = lt->firstBlkNum;
			lt->currPos.offset = 0;
			lt->writing = false;
		}
		else
		{
			/*
			 * This is only OK if tape is frozen; we rewind for (another) read
			 * pass.
			 */
			Assert(lt->frozen);
			if(lt->currPos.blkNum != lt->firstBlkNum)
				ltsReadBlock(lts, lt->firstBlkNum, &lt->currBlk);

			lt->currPos.blkNum = lt->firstBlkNum;
			lt->currPos.offset = 0;
		}
	}
	else
	{
		lt->firstBlkNum = -1L;
		lt->currBlk.prev_blk = -1L;
		lt->currBlk.next_blk = -1L;
		lt->currBlk.payload_tail = 0;
		lt->currPos.blkNum = -1L;
		lt->currPos.offset = 0;
		lt->writing = true;
	}
}
Example #4
0
/*
 * Write to a logical tape.
 *
 * There are no error returns; we ereport() on failure.
 */
void
LogicalTapeWrite(LogicalTapeSet *lts, LogicalTape *lt, void *ptr, size_t size)
{
	long        tmpBlkNum;
	size_t		nthistime;

	Assert(lt->writing);

	if(lt->firstBlkNum == -1)
	{
		lt->firstBlkNum = ltsGetFreeBlock(lts);
		lt->currBlk.prev_blk = -1L;
		lt->currBlk.next_blk = -1L;
		lt->currBlk.payload_tail = 0;

		lt->currPos.blkNum = lt->firstBlkNum;
		lt->currPos.offset = 0;
	}

	while(size > 0)
	{
		Assert(lt->currPos.offset == lt->currBlk.payload_tail);
		Assert(lt->currPos.offset <= LOGTAPE_BLK_PAYLOAD_SIZE);

		if (lt->currPos.offset == LOGTAPE_BLK_PAYLOAD_SIZE)
		{
			Assert(lt->currBlk.payload_tail == LOGTAPE_BLK_PAYLOAD_SIZE);
			tmpBlkNum = ltsGetFreeBlock(lts);
			lt->currBlk.next_blk = tmpBlkNum;
			ltsWriteBlock(lts, lt->currPos.blkNum, &(lt->currBlk));
			lt->currBlk.prev_blk = lt->currPos.blkNum;
			lt->currBlk.next_blk = -1L;
			lt->currBlk.payload_tail = 0;
			lt->currPos.blkNum = tmpBlkNum;
			lt->currPos.offset = 0;
		}

		nthistime = size > (LOGTAPE_BLK_PAYLOAD_SIZE - lt->currPos.offset) ? 
			(LOGTAPE_BLK_PAYLOAD_SIZE - lt->currPos.offset) : size;

		memcpy(lt->currBlk.payload + lt->currBlk.payload_tail, ptr, nthistime);
		ptr = (void *) ((char *) ptr + nthistime);
		lt->currBlk.payload_tail += nthistime;
		lt->currPos.offset += nthistime;
		size -= nthistime;
	}
}
Example #5
0
/*
 * Write a block-sized buffer to the specified block of the underlying file.
 *
 * No need for an error return convention; we ereport() on any error.
 */
static void
ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
{
	/*
	 * BufFile does not support "holes", so if we're about to write a block
	 * that's past the current end of file, fill the space between the current
	 * end of file and the target block with zeros.
	 *
	 * This should happen rarely, otherwise you are not writing very
	 * sequentially.  In current use, this only happens when the sort ends
	 * writing a run, and switches to another tape.  The last block of the
	 * previous tape isn't flushed to disk until the end of the sort, so you
	 * get one-block hole, where the last block of the previous tape will
	 * later go.
	 *
	 * Note that BufFile concatenation can leave "holes" in BufFile between
	 * worker-owned block ranges.  These are tracked for reporting purposes
	 * only.  We never read from nor write to these hole blocks, and so they
	 * are not considered here.
	 */
	while (blocknum > lts->nBlocksWritten)
	{
		PGAlignedBlock zerobuf;

		MemSet(zerobuf.data, 0, sizeof(zerobuf));

		ltsWriteBlock(lts, lts->nBlocksWritten, zerobuf.data);
	}

	/* Write the requested block */
	if (BufFileSeekBlock(lts->pfile, blocknum) != 0 ||
		BufFileWrite(lts->pfile, buffer, BLCKSZ) != BLCKSZ)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write block %ld of temporary file: %m",
						blocknum)));

	/* Update nBlocksWritten, if we extended the file */
	if (blocknum == lts->nBlocksWritten)
		lts->nBlocksWritten++;
}
Example #6
0
/*
 * Reset a logical tape's indirect-block hierarchy after a write pass
 * to prepare for reading.	We dump out partly-filled blocks except
 * at the top of the hierarchy, and we rewind each level to the start.
 * This call returns the first data block number, or -1L if the tape
 * is empty.
 *
 * Unless 'freezing' is true, release indirect blocks to the free pool after
 * reading them.
 */
static long
ltsRewindIndirectBlock(LogicalTapeSet *lts,
					   IndirectBlock *indirect,
					   bool freezing)
{
	/* Handle case of never-written-to tape */
	if (indirect == NULL)
		return -1L;

	/* Insert sentinel if block is not full */
	if (indirect->nextSlot < BLOCKS_PER_INDIR_BLOCK)
		indirect->ptrs[indirect->nextSlot] = -1L;

	/*
	 * If block is not topmost, write it out, and recurse to obtain address of
	 * first block in this hierarchy level.  Read that one in.
	 */
	if (indirect->nextup != NULL)
	{
		long		indirblock = ltsGetFreeBlock(lts);

		ltsWriteBlock(lts, indirblock, (void *) indirect->ptrs);
		ltsRecordBlockNum(lts, indirect->nextup, indirblock);
		indirblock = ltsRewindIndirectBlock(lts, indirect->nextup, freezing);
		Assert(indirblock != -1L);
		ltsReadBlock(lts, indirblock, (void *) indirect->ptrs);
		if (!freezing)
			ltsReleaseBlock(lts, indirblock);
	}

	/*
	 * Reset my next-block pointer, and then fetch a block number if any.
	 */
	indirect->nextSlot = 0;
	if (indirect->ptrs[0] == -1L)
		return -1L;
	return indirect->ptrs[indirect->nextSlot++];
}
Example #7
0
/*
 * "Freeze" the contents of a tape so that it can be read multiple times
 * and/or read backwards.  Once a tape is frozen, its contents will not
 * be released until the LogicalTapeSet is destroyed.  This is expected
 * to be used only for the final output pass of a merge.
 *
 * This *must* be called just at the end of a write pass, before the
 * tape is rewound (after rewind is too late!).  It performs a rewind
 * and switch to read mode "for free".  An immediately following rewind-
 * for-read call is OK but not necessary.
 *
 * share output argument is set with details of storage used for tape after
 * freezing, which may be passed to LogicalTapeSetCreate within leader
 * process later.  This metadata is only of interest to worker callers
 * freezing their final output for leader (single materialized tape).
 * Serial sorts should set share to NULL.
 */
void
LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum, TapeShare *share)
{
	LogicalTape *lt;

	Assert(tapenum >= 0 && tapenum < lts->nTapes);
	lt = &lts->tapes[tapenum];
	Assert(lt->writing);
	Assert(lt->offsetBlockNumber == 0L);

	/*
	 * Completion of a write phase.  Flush last partial data block, and rewind
	 * for nondestructive read.
	 */
	if (lt->dirty)
	{
		/*
		 * As long as we've filled the buffer at least once, its contents are
		 * entirely defined from valgrind's point of view, even though
		 * contents beyond the current end point may be stale.  But it's
		 * possible - at least in the case of a parallel sort - to sort such
		 * small amount of data that we do not fill the buffer even once. Tell
		 * valgrind that its contents are defined, so it doesn't bleat.
		 */
		VALGRIND_MAKE_MEM_DEFINED(lt->buffer + lt->nbytes,
								  lt->buffer_size - lt->nbytes);

		TapeBlockSetNBytes(lt->buffer, lt->nbytes);
		ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
		lt->writing = false;
	}
	lt->writing = false;
	lt->frozen = true;

	/*
	 * The seek and backspace functions assume a single block read buffer.
	 * That's OK with current usage.  A larger buffer is helpful to make the
	 * read pattern of the backing file look more sequential to the OS, when
	 * we're reading from multiple tapes.  But at the end of a sort, when a
	 * tape is frozen, we only read from a single tape anyway.
	 */
	if (!lt->buffer || lt->buffer_size != BLCKSZ)
	{
		if (lt->buffer)
			pfree(lt->buffer);
		lt->buffer = palloc(BLCKSZ);
		lt->buffer_size = BLCKSZ;
	}

	/* Read the first block, or reset if tape is empty */
	lt->curBlockNumber = lt->firstBlockNumber;
	lt->pos = 0;
	lt->nbytes = 0;

	if (lt->firstBlockNumber == -1L)
		lt->nextBlockNumber = -1L;
	ltsReadBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
	if (TapeBlockIsLast(lt->buffer))
		lt->nextBlockNumber = -1L;
	else
		lt->nextBlockNumber = TapeBlockGetTrailer(lt->buffer)->next;
	lt->nbytes = TapeBlockGetNBytes(lt->buffer);

	/* Handle extra steps when caller is to share its tapeset */
	if (share)
	{
		BufFileExportShared(lts->pfile);
		share->firstblocknumber = lt->firstBlockNumber;
	}
}
Example #8
0
/*
 * Rewind logical tape and switch from writing to reading.
 *
 * The tape must currently be in writing state, or "frozen" in read state.
 *
 * 'buffer_size' specifies how much memory to use for the read buffer.
 * Regardless of the argument, the actual amount of memory used is between
 * BLCKSZ and MaxAllocSize, and is a multiple of BLCKSZ.  The given value is
 * rounded down and truncated to fit those constraints, if necessary.  If the
 * tape is frozen, the 'buffer_size' argument is ignored, and a small BLCKSZ
 * byte buffer is used.
 */
void
LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
{
	LogicalTape *lt;

	Assert(tapenum >= 0 && tapenum < lts->nTapes);
	lt = &lts->tapes[tapenum];

	/*
	 * Round and cap buffer_size if needed.
	 */
	if (lt->frozen)
		buffer_size = BLCKSZ;
	else
	{
		/* need at least one block */
		if (buffer_size < BLCKSZ)
			buffer_size = BLCKSZ;

		/* palloc() larger than max_size is unlikely to be helpful */
		if (buffer_size > lt->max_size)
			buffer_size = lt->max_size;

		/* round down to BLCKSZ boundary */
		buffer_size -= buffer_size % BLCKSZ;
	}

	if (lt->writing)
	{
		/*
		 * Completion of a write phase.  Flush last partial data block, and
		 * rewind for normal (destructive) read.
		 */
		if (lt->dirty)
		{
			/*
			 * As long as we've filled the buffer at least once, its contents
			 * are entirely defined from valgrind's point of view, even though
			 * contents beyond the current end point may be stale.  But it's
			 * possible - at least in the case of a parallel sort - to sort
			 * such small amount of data that we do not fill the buffer even
			 * once.  Tell valgrind that its contents are defined, so it
			 * doesn't bleat.
			 */
			VALGRIND_MAKE_MEM_DEFINED(lt->buffer + lt->nbytes,
									  lt->buffer_size - lt->nbytes);

			TapeBlockSetNBytes(lt->buffer, lt->nbytes);
			ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
		}
		lt->writing = false;
	}
	else
	{
		/*
		 * This is only OK if tape is frozen; we rewind for (another) read
		 * pass.
		 */
		Assert(lt->frozen);
	}

	/* Allocate a read buffer (unless the tape is empty) */
	if (lt->buffer)
		pfree(lt->buffer);
	lt->buffer = NULL;
	lt->buffer_size = 0;
	if (lt->firstBlockNumber != -1L)
	{
		lt->buffer = palloc(buffer_size);
		lt->buffer_size = buffer_size;
	}

	/* Read the first block, or reset if tape is empty */
	lt->nextBlockNumber = lt->firstBlockNumber;
	lt->pos = 0;
	lt->nbytes = 0;
	ltsReadFillBuffer(lts, lt);
}
Example #9
0
/*
 * Write to a logical tape.
 *
 * There are no error returns; we ereport() on failure.
 */
void
LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
				 void *ptr, size_t size)
{
	LogicalTape *lt;
	size_t		nthistime;

	Assert(tapenum >= 0 && tapenum < lts->nTapes);
	lt = &lts->tapes[tapenum];
	Assert(lt->writing);
	Assert(lt->offsetBlockNumber == 0L);

	/* Allocate data buffer and first block on first write */
	if (lt->buffer == NULL)
	{
		lt->buffer = (char *) palloc(BLCKSZ);
		lt->buffer_size = BLCKSZ;
	}
	if (lt->curBlockNumber == -1)
	{
		Assert(lt->firstBlockNumber == -1);
		Assert(lt->pos == 0);

		lt->curBlockNumber = ltsGetFreeBlock(lts);
		lt->firstBlockNumber = lt->curBlockNumber;

		TapeBlockGetTrailer(lt->buffer)->prev = -1L;
	}

	Assert(lt->buffer_size == BLCKSZ);
	while (size > 0)
	{
		if (lt->pos >= TapeBlockPayloadSize)
		{
			/* Buffer full, dump it out */
			long		nextBlockNumber;

			if (!lt->dirty)
			{
				/* Hmm, went directly from reading to writing? */
				elog(ERROR, "invalid logtape state: should be dirty");
			}

			/*
			 * First allocate the next block, so that we can store it in the
			 * 'next' pointer of this block.
			 */
			nextBlockNumber = ltsGetFreeBlock(lts);

			/* set the next-pointer and dump the current block. */
			TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber;
			ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);

			/* initialize the prev-pointer of the next block */
			TapeBlockGetTrailer(lt->buffer)->prev = lt->curBlockNumber;
			lt->curBlockNumber = nextBlockNumber;
			lt->pos = 0;
			lt->nbytes = 0;
		}

		nthistime = TapeBlockPayloadSize - lt->pos;
		if (nthistime > size)
			nthistime = size;
		Assert(nthistime > 0);

		memcpy(lt->buffer + lt->pos, ptr, nthistime);

		lt->dirty = true;
		lt->pos += nthistime;
		if (lt->nbytes < lt->pos)
			lt->nbytes = lt->pos;
		ptr = (void *) ((char *) ptr + nthistime);
		size -= nthistime;
	}
}