/*
 * Append one block number to an indirect block.
 *
 * At the lowest level of a tape's hierarchy the number identifies a data
 * block; at higher levels it identifies an indirect block that was spilled
 * from the level below.
 */
static void
ltsRecordBlockNum(LogicalTapeSet *lts, IndirectBlock *indirect,
				  long blocknum)
{
	IndirectBlock *parent;
	long		spilled;

	/* Common case: there is still a free slot in this indirect block. */
	if (indirect->nextSlot < BLOCKS_PER_INDIR_BLOCK)
	{
		indirect->ptrs[indirect->nextSlot++] = blocknum;
		return;
	}

	/*
	 * No room left.  Write the full pointer array to a fresh block and hand
	 * that block's number to the next level up, creating that level first
	 * if this is the first overflow.
	 */
	spilled = ltsGetFreeBlock(lts);
	ltsWriteBlock(lts, spilled, (void *) indirect->ptrs);

	parent = indirect->nextup;
	if (parent == NULL)
	{
		parent = (IndirectBlock *) palloc(sizeof(IndirectBlock));
		parent->nextSlot = 0;
		parent->nextup = NULL;
		indirect->nextup = parent;
	}
	ltsRecordBlockNum(lts, parent, spilled);

	/* Start refilling this level from slot zero with the new entry. */
	indirect->ptrs[0] = blocknum;
	indirect->nextSlot = 1;
}
/*
 * Flush a logical tape's dirty data buffer.
 *
 * The buffer is written to a newly obtained block, and that block's number
 * is recorded in the tape's indirect-block chain.  Only the dirty flag is
 * cleared here; any further state update is left to the caller.
 */
static void
ltsDumpBuffer(LogicalTapeSet *lts, LogicalTape *lt)
{
	long		blk;

	blk = ltsGetFreeBlock(lts);
	Assert(lt->dirty);

	ltsWriteBlock(lts, blk, (void *) lt->buffer);
	ltsRecordBlockNum(lts, lt->indirect, blk);

	lt->dirty = false;
}
/*
 * Rewind logical tape and switch from writing to reading or vice versa.
 *
 * Unless the tape has been "frozen" in read state, forWrite must be the
 * opposite of the previous tape state.
 *
 * lts:      tape set the tape belongs to (used for block I/O)
 * lt:       tape to rewind
 * forWrite: true to reset the tape for a new write pass, false to position
 *           it at its first block for reading
 */
void
LogicalTapeRewind(LogicalTapeSet *lts, LogicalTape *lt, bool forWrite)
{
	/* An empty tape has neither a first block nor a current block. */
	AssertEquivalent(lt->firstBlkNum == -1, lt->currPos.blkNum == -1);

	if (!forWrite)
	{
		if (lt->writing)
		{
			/*
			 * End of a write pass.  If anything was written, flush the
			 * in-memory current block, then load the first block unless the
			 * current block already is the first one.
			 */
			if (lt->firstBlkNum != -1)
			{
				Assert(lt->currBlk.next_blk == -1L);
				ltsWriteBlock(lts, lt->currPos.blkNum, &lt->currBlk);
				if (lt->currPos.blkNum != lt->firstBlkNum)
					ltsReadBlock(lts, lt->firstBlkNum, &lt->currBlk);
			}

			lt->currPos.blkNum = lt->firstBlkNum;
			lt->currPos.offset = 0;
			lt->writing = false;
		}
		else
		{
			/*
			 * This is only OK if tape is frozen; we rewind for (another)
			 * read pass.
			 */
			Assert(lt->frozen);

			/* Re-read the first block only if we have moved away from it. */
			if (lt->currPos.blkNum != lt->firstBlkNum)
				ltsReadBlock(lts, lt->firstBlkNum, &lt->currBlk);

			lt->currPos.blkNum = lt->firstBlkNum;
			lt->currPos.offset = 0;
		}
	}
	else
	{
		/* Reset to the empty state, ready for a new write pass. */
		lt->firstBlkNum = -1L;
		lt->currBlk.prev_blk = -1L;
		lt->currBlk.next_blk = -1L;
		lt->currBlk.payload_tail = 0;
		lt->currPos.blkNum = -1L;
		lt->currPos.offset = 0;
		lt->writing = true;
	}
}
/*
 * Append 'size' bytes starting at 'ptr' to a logical tape.
 *
 * Nothing is returned: failures are reported through ereport().
 */
void
LogicalTapeWrite(LogicalTapeSet *lts, LogicalTape *lt, void *ptr, size_t size)
{
	char	   *src = (char *) ptr;
	size_t		remaining = size;

	Assert(lt->writing);

	/* Very first write: obtain the tape's first block and start it empty. */
	if (lt->firstBlkNum == -1)
	{
		lt->firstBlkNum = ltsGetFreeBlock(lts);
		lt->currBlk.prev_blk = -1L;
		lt->currBlk.next_blk = -1L;
		lt->currBlk.payload_tail = 0;
		lt->currPos.blkNum = lt->firstBlkNum;
		lt->currPos.offset = 0;
	}

	while (remaining > 0)
	{
		size_t		room;
		size_t		chunk;

		Assert(lt->currPos.offset == lt->currBlk.payload_tail);
		Assert(lt->currPos.offset <= LOGTAPE_BLK_PAYLOAD_SIZE);

		if (lt->currPos.offset == LOGTAPE_BLK_PAYLOAD_SIZE)
		{
			long		fresh;

			Assert(lt->currBlk.payload_tail == LOGTAPE_BLK_PAYLOAD_SIZE);

			/*
			 * Current block is full.  Link it to a new block, write it out,
			 * and reuse the in-memory block as the new (empty) tail.
			 */
			fresh = ltsGetFreeBlock(lts);
			lt->currBlk.next_blk = fresh;
			ltsWriteBlock(lts, lt->currPos.blkNum, &(lt->currBlk));

			lt->currBlk.prev_blk = lt->currPos.blkNum;
			lt->currBlk.next_blk = -1L;
			lt->currBlk.payload_tail = 0;
			lt->currPos.blkNum = fresh;
			lt->currPos.offset = 0;
		}

		/* Copy as much as fits into what is left of the current block. */
		room = LOGTAPE_BLK_PAYLOAD_SIZE - lt->currPos.offset;
		chunk = remaining;
		if (chunk > room)
			chunk = room;

		memcpy(lt->currBlk.payload + lt->currBlk.payload_tail, src, chunk);

		src += chunk;
		lt->currBlk.payload_tail += chunk;
		lt->currPos.offset += chunk;
		remaining -= chunk;
	}
}
/*
 * Write one BLCKSZ-sized buffer to block 'blocknum' of the underlying file.
 *
 * There is no error return; any failure is raised through ereport().
 */
static void
ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
{
	bool		written;

	/*
	 * BufFile cannot represent "holes".  When the target lies beyond the
	 * blocks written so far, pad the gap with zero-filled blocks, one at a
	 * time, until the target is the next block to be written.
	 *
	 * This is expected to be rare; frequent padding would mean the write
	 * pattern is not very sequential.  In current use it occurs when a sort
	 * finishes a run and switches tapes: the previous tape's last block is
	 * not flushed until the end of the sort, leaving a one-block gap where
	 * it will later be stored.
	 *
	 * BufFile concatenation can also leave "holes" between worker-owned
	 * block ranges.  Those are tracked for reporting only; they are never
	 * read or written, so they are not considered here.
	 */
	while (lts->nBlocksWritten < blocknum)
	{
		PGAlignedBlock zerobuf;

		MemSet(zerobuf.data, 0, sizeof(zerobuf));
		ltsWriteBlock(lts, lts->nBlocksWritten, zerobuf.data);
	}

	/* Position at the requested block, then write it. */
	written = (BufFileSeekBlock(lts->pfile, blocknum) == 0);
	if (written)
		written = (BufFileWrite(lts->pfile, buffer, BLCKSZ) == BLCKSZ);

	if (!written)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write block %ld of temporary file: %m",
						blocknum)));

	/* The file grew by one block if we wrote exactly at its end. */
	if (lts->nBlocksWritten == blocknum)
		lts->nBlocksWritten++;
}
/*
 * Prepare a tape's indirect-block hierarchy for reading after a write pass.
 *
 * Every partly-filled block except the topmost one is dumped to disk, and
 * each level is rewound to its start.  The return value is the number of
 * the first data block, or -1L when the tape is empty.
 *
 * When 'freezing' is false, indirect blocks are handed back to the free
 * pool once they have been read.
 */
static long
ltsRewindIndirectBlock(LogicalTapeSet *lts, IndirectBlock *indirect,
					   bool freezing)
{
	long		first;

	/* A tape that was never written has no hierarchy at all. */
	if (indirect == NULL)
		return -1L;

	/* Terminate a partly-filled block with a sentinel entry. */
	if (indirect->nextSlot < BLOCKS_PER_INDIR_BLOCK)
		indirect->ptrs[indirect->nextSlot] = -1L;

	/*
	 * For a non-topmost level: dump this block and register it with the
	 * level above, then rewind that level to learn which block starts this
	 * level, and load that block.
	 */
	if (indirect->nextup != NULL)
	{
		long		dumped;
		long		head;

		dumped = ltsGetFreeBlock(lts);
		ltsWriteBlock(lts, dumped, (void *) indirect->ptrs);
		ltsRecordBlockNum(lts, indirect->nextup, dumped);

		head = ltsRewindIndirectBlock(lts, indirect->nextup, freezing);
		Assert(head != -1L);
		ltsReadBlock(lts, head, (void *) indirect->ptrs);
		if (!freezing)
			ltsReleaseBlock(lts, head);
	}

	/* Rewind this level and hand out its first entry, if it has one. */
	indirect->nextSlot = 0;
	first = indirect->ptrs[0];
	if (first == -1L)
		return -1L;

	indirect->nextSlot = 1;
	return first;
}
/*
 * "Freeze" the contents of a tape so that it can be read multiple times
 * and/or read backwards.  Once a tape is frozen, its contents will not
 * be released until the LogicalTapeSet is destroyed.  This is expected
 * to be used only for the final output pass of a merge.
 *
 * This *must* be called just at the end of a write pass, before the
 * tape is rewound (after rewind is too late!).  It performs a rewind
 * and switch to read mode "for free".  An immediately following rewind-
 * for-read call is OK but not necessary.
 *
 * share output argument is set with details of storage used for tape after
 * freezing, which may be passed to LogicalTapeSetCreate within leader
 * process later.  This metadata is only of interest to worker callers
 * freezing their final output for leader (single materialized tape).
 * Serial sorts should set share to NULL.
 */
void
LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum, TapeShare *share)
{
	LogicalTape *lt;

	Assert(tapenum >= 0 && tapenum < lts->nTapes);
	lt = &lts->tapes[tapenum];
	Assert(lt->writing);
	Assert(lt->offsetBlockNumber == 0L);

	/*
	 * Completion of a write phase.  Flush last partial data block, and rewind
	 * for nondestructive read.
	 */
	if (lt->dirty)
	{
		/*
		 * As long as we've filled the buffer at least once, its contents are
		 * entirely defined from valgrind's point of view, even though
		 * contents beyond the current end point may be stale.  But it's
		 * possible - at least in the case of a parallel sort - to sort such
		 * small amount of data that we do not fill the buffer even once.
		 * Tell valgrind that its contents are defined, so it doesn't bleat.
		 */
		VALGRIND_MAKE_MEM_DEFINED(lt->buffer + lt->nbytes,
								  lt->buffer_size - lt->nbytes);

		TapeBlockSetNBytes(lt->buffer, lt->nbytes);
		ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
	}
	lt->writing = false;
	lt->frozen = true;

	/*
	 * The seek and backspace functions assume a single block read buffer.
	 * That's OK with current usage.  A larger buffer is helpful to make the
	 * read pattern of the backing file look more sequential to the OS, when
	 * we're reading from multiple tapes.  But at the end of a sort, when a
	 * tape is frozen, we only read from a single tape anyway.
	 */
	if (!lt->buffer || lt->buffer_size != BLCKSZ)
	{
		if (lt->buffer)
			pfree(lt->buffer);
		lt->buffer = palloc(BLCKSZ);
		lt->buffer_size = BLCKSZ;
	}

	/* Read the first block, or reset if tape is empty */
	lt->curBlockNumber = lt->firstBlockNumber;
	lt->pos = 0;
	lt->nbytes = 0;

	if (lt->firstBlockNumber == -1L)
	{
		/*
		 * Empty tape: there is no first block, so do not attempt to read
		 * block -1.  Leave the buffer logically empty (nbytes == 0).
		 */
		lt->nextBlockNumber = -1L;
	}
	else
	{
		ltsReadBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
		if (TapeBlockIsLast(lt->buffer))
			lt->nextBlockNumber = -1L;
		else
			lt->nextBlockNumber = TapeBlockGetTrailer(lt->buffer)->next;
		lt->nbytes = TapeBlockGetNBytes(lt->buffer);
	}

	/* Handle extra steps when caller is to share its tapeset */
	if (share)
	{
		BufFileExportShared(lts->pfile);
		share->firstblocknumber = lt->firstBlockNumber;
	}
}
/*
 * Rewind logical tape and switch from writing to reading.
 *
 * The tape must currently be in writing state, or "frozen" in read state.
 *
 * 'buffer_size' specifies how much memory to use for the read buffer.
 * Regardless of the argument, the value is raised to at least BLCKSZ, capped
 * at the tape's max_size, and then rounded down to a multiple of BLCKSZ.  If
 * the tape is frozen, the 'buffer_size' argument is ignored, and a small
 * BLCKSZ byte buffer is used.  No buffer at all is allocated for an empty
 * tape.
 */
void
LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
{
	LogicalTape *lt;

	Assert(tapenum >= 0 && tapenum < lts->nTapes);
	lt = &lts->tapes[tapenum];

	/*
	 * Round and cap buffer_size if needed.
	 */
	if (lt->frozen)
		buffer_size = BLCKSZ;
	else
	{
		/* need at least one block */
		if (buffer_size < BLCKSZ)
			buffer_size = BLCKSZ;

		/* palloc() larger than max_size is unlikely to be helpful */
		if (buffer_size > lt->max_size)
			buffer_size = lt->max_size;

		/* round down to BLCKSZ boundary */
		buffer_size -= buffer_size % BLCKSZ;
	}

	if (lt->writing)
	{
		/*
		 * Completion of a write phase.  Flush last partial data block, and
		 * rewind for normal (destructive) read.
		 */
		if (lt->dirty)
		{
			/*
			 * As long as we've filled the buffer at least once, its contents
			 * are entirely defined from valgrind's point of view, even though
			 * contents beyond the current end point may be stale.  But it's
			 * possible - at least in the case of a parallel sort - to sort
			 * such small amount of data that we do not fill the buffer even
			 * once.  Tell valgrind that its contents are defined, so it
			 * doesn't bleat.
			 */
			VALGRIND_MAKE_MEM_DEFINED(lt->buffer + lt->nbytes,
									  lt->buffer_size - lt->nbytes);

			TapeBlockSetNBytes(lt->buffer, lt->nbytes);
			ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);
		}
		lt->writing = false;
	}
	else
	{
		/*
		 * This is only OK if tape is frozen; we rewind for (another) read
		 * pass.
		 */
		Assert(lt->frozen);
	}

	/* Allocate a read buffer (unless the tape is empty) */
	if (lt->buffer)
		pfree(lt->buffer);
	lt->buffer = NULL;
	lt->buffer_size = 0;
	if (lt->firstBlockNumber != -1L)
	{
		lt->buffer = palloc(buffer_size);
		lt->buffer_size = buffer_size;
	}

	/* Read the first block, or reset if tape is empty */
	lt->nextBlockNumber = lt->firstBlockNumber;
	lt->pos = 0;
	lt->nbytes = 0;
	ltsReadFillBuffer(lts, lt);
}
/*
 * Write to a logical tape.
 *
 * 'size' bytes starting at 'ptr' are appended to tape number 'tapenum' of
 * 'lts'.  Data is staged in a one-block buffer; each time the buffer's
 * payload area fills up, it is written out and chained to a newly
 * allocated block.
 *
 * There are no error returns; we ereport() on failure.
 */
void
LogicalTapeWrite(LogicalTapeSet *lts, int tapenum, void *ptr, size_t size)
{
	LogicalTape *lt;
	size_t		nthistime;

	Assert(tapenum >= 0 && tapenum < lts->nTapes);
	lt = &lts->tapes[tapenum];
	Assert(lt->writing);
	Assert(lt->offsetBlockNumber == 0L);

	/* Allocate data buffer and first block on first write */
	if (lt->buffer == NULL)
	{
		lt->buffer = (char *) palloc(BLCKSZ);
		lt->buffer_size = BLCKSZ;
	}
	if (lt->curBlockNumber == -1)
	{
		Assert(lt->firstBlockNumber == -1);
		Assert(lt->pos == 0);

		lt->curBlockNumber = ltsGetFreeBlock(lts);
		lt->firstBlockNumber = lt->curBlockNumber;

		/* The first block of a tape has no predecessor. */
		TapeBlockGetTrailer(lt->buffer)->prev = -1L;
	}

	Assert(lt->buffer_size == BLCKSZ);
	while (size > 0)
	{
		if (lt->pos >= TapeBlockPayloadSize)
		{
			/* Buffer full, dump it out */
			long		nextBlockNumber;

			if (!lt->dirty)
			{
				/* Hmm, went directly from reading to writing? */
				elog(ERROR, "invalid logtape state: should be dirty");
			}

			/*
			 * First allocate the next block, so that we can store it in the
			 * 'next' pointer of this block.
			 */
			nextBlockNumber = ltsGetFreeBlock(lts);

			/* set the next-pointer and dump the current block. */
			TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber;
			ltsWriteBlock(lts, lt->curBlockNumber, (void *) lt->buffer);

			/* initialize the prev-pointer of the next block */
			TapeBlockGetTrailer(lt->buffer)->prev = lt->curBlockNumber;
			lt->curBlockNumber = nextBlockNumber;
			lt->pos = 0;
			lt->nbytes = 0;
		}

		/* Copy as much as fits into the remaining payload space. */
		nthistime = TapeBlockPayloadSize - lt->pos;
		if (nthistime > size)
			nthistime = size;
		Assert(nthistime > 0);

		memcpy(lt->buffer + lt->pos, ptr, nthistime);

		lt->dirty = true;
		lt->pos += nthistime;
		if (lt->nbytes < lt->pos)
			lt->nbytes = lt->pos;
		ptr = (void *) ((char *) ptr + nthistime);
		size -= nthistime;
	}
}