/*
 * Build an empty SPGiST index in the initialization fork
 */
Datum
spgbuildempty(PG_FUNCTION_ARGS)
{
    Relation    index = (Relation) PG_GETARG_POINTER(0);
    Page        page;

    /* Construct metapage. */
    page = (Page) palloc(BLCKSZ);
    SpGistInitMetapage(page);

    /* Write the page.  If archiving/streaming, XLOG it. */
    smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
              (char *) page, true);
    if (XLogIsNeeded())
        log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                    SPGIST_METAPAGE_BLKNO, page);

    /* Likewise for the root page. */
    SpGistInitPage(page, SPGIST_LEAF);

    smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_HEAD_BLKNO,
              (char *) page, true);
    if (XLogIsNeeded())
        log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                    SPGIST_HEAD_BLKNO, page);

    /*
     * An immediate sync is required even if we xlog'd the pages, because the
     * writes did not go through shared buffers and therefore a concurrent
     * checkpoint may have moved the redo pointer past our xlog record.
     */
    smgrimmedsync(index->rd_smgr, INIT_FORKNUM);

    PG_RETURN_VOID();
}
/*
 *  btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
    Page        metapage;

    /* Construct metapage. */
    metapage = (Page) palloc(BLCKSZ);
    _bt_initmetapage(metapage, P_NONE, 0);

    /*
     * Write the page and log it.  It might seem that an immediate sync would
     * be sufficient to guarantee that the file exists on disk, but recovery
     * itself might remove it while replaying, for example, an
     * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
     * this even when wal_level=minimal.
     */
    PageSetChecksumInplace(metapage, BTREE_METAPAGE);
    smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
              (char *) metapage, true);
    log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                BTREE_METAPAGE, metapage, true);

    /*
     * An immediate sync is required even if we xlog'd the page, because the
     * write did not go through shared_buffers and therefore a concurrent
     * checkpoint may have moved the redo pointer past our xlog record.
     */
    smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}
/*
 *  btbuildempty() -- build an empty btree index in the initialization fork
 */
Datum
btbuildempty(PG_FUNCTION_ARGS)
{
    Relation    index = (Relation) PG_GETARG_POINTER(0);
    Page        metapage;

    /* Construct metapage. */
    metapage = (Page) palloc(BLCKSZ);
    _bt_initmetapage(metapage, P_NONE, 0);

    /* Write the page.  If archiving/streaming, XLOG it. */
    smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
              (char *) metapage, true);
    if (XLogIsNeeded())
        log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                    BTREE_METAPAGE, metapage);

    /*
     * An immediate sync is required even if we xlog'd the page, because the
     * write did not go through shared_buffers and therefore a concurrent
     * checkpoint may have moved the redo pointer past our xlog record.
     */
    smgrimmedsync(index->rd_smgr, INIT_FORKNUM);

    PG_RETURN_VOID();
}
/*
 * SetMatViewToPopulated
 *      Indicate that the materialized view has been populated by its query.
 *
 * NOTE: The heap starts out in a state that doesn't look scannable, and can
 * only transition from there to scannable at the time a new heap is created.
 *
 * NOTE: caller must be holding an appropriate lock on the relation.
 */
void
SetMatViewToPopulated(Relation relation)
{
    Page        page;

    Assert(relation->rd_rel->relkind == RELKIND_MATVIEW);
    Assert(relation->rd_ispopulated == false);

    page = (Page) palloc(BLCKSZ);
    PageInit(page, BLCKSZ, 0);

    if (RelationNeedsWAL(relation))
        log_newpage(&(relation->rd_node), MAIN_FORKNUM, 0, page);

    RelationOpenSmgr(relation);

    PageSetChecksumInplace(page, 0);
    smgrextend(relation->rd_smgr, MAIN_FORKNUM, 0, (char *) page, true);

    pfree(page);

    smgrimmedsync(relation->rd_smgr, MAIN_FORKNUM);

    RelationCacheInvalidateEntry(relation->rd_id);
}
/*
 * Build an empty SPGiST index in the initialization fork
 */
void
spgbuildempty(Relation index)
{
    Page        page;

    /* Construct metapage. */
    page = (Page) palloc(BLCKSZ);
    SpGistInitMetapage(page);

    /* Write the page.  If archiving/streaming, XLOG it. */
    PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO);
    smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
              (char *) page, true);
    if (XLogIsNeeded())
        log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                    SPGIST_METAPAGE_BLKNO, page, false);

    /* Likewise for the root page. */
    SpGistInitPage(page, SPGIST_LEAF);

    PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO);
    smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_ROOT_BLKNO,
              (char *) page, true);
    if (XLogIsNeeded())
        log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                    SPGIST_ROOT_BLKNO, page, true);

    /* Likewise for the null-tuples root page. */
    SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS);

    PageSetChecksumInplace(page, SPGIST_NULL_BLKNO);
    smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_NULL_BLKNO,
              (char *) page, true);
    if (XLogIsNeeded())
        log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                    SPGIST_NULL_BLKNO, page, true);

    /*
     * An immediate sync is required even if we xlog'd the pages, because the
     * writes did not go through shared buffers and therefore a concurrent
     * checkpoint may have moved the redo pointer past our xlog record.
     */
    smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}
/*
 * emit a completed btree page, and release the working storage.
 */
static void
_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
{
    /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
    RelationOpenSmgr(wstate->index);

    /* XLOG stuff */
    if (wstate->btws_use_wal)
    {
        /* We use the heap NEWPAGE record type for this */
        log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page);
    }
    else
    {
        /* Leave the page LSN zero if not WAL-logged, but set TLI anyway */
        PageSetTLI(page, ThisTimeLineID);
    }

    /*
     * If we have to write pages nonsequentially, fill in the space with
     * zeroes until we come back and overwrite.  This is not logically
     * necessary on standard Unix filesystems (unwritten space will read as
     * zeroes anyway), but it should help to avoid fragmentation.  The dummy
     * pages aren't WAL-logged though.
     */
    while (blkno > wstate->btws_pages_written)
    {
        if (!wstate->btws_zeropage)
            wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
        smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
                   wstate->btws_pages_written++,
                   (char *) wstate->btws_zeropage,
                   true);
    }

    /*
     * Now write the page.  We say isTemp = true even if it's not a temp
     * index, because there's no need for smgr to schedule an fsync for this
     * write; we'll do it ourselves before ending the build.
     */
    if (blkno == wstate->btws_pages_written)
    {
        /* extending the file... */
        smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
                   (char *) page, true);
        wstate->btws_pages_written++;
    }
    else
    {
        /* overwriting a block we zero-filled before */
        smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
                  (char *) page, true);
    }

    pfree(page);
}
/*
 * End a rewrite.
 *
 * state and any other resources are freed.
 */
void
end_heap_rewrite(RewriteState state)
{
    HASH_SEQ_STATUS seq_status;
    UnresolvedTup unresolved;

    /*
     * Write any remaining tuples in the UnresolvedTups table.  If we have any
     * left, they should in fact be dead, but let's err on the safe side.
     */
    hash_seq_init(&seq_status, state->rs_unresolved_tups);

    while ((unresolved = hash_seq_search(&seq_status)) != NULL)
    {
        ItemPointerSetInvalid(&unresolved->tuple->t_data->t_ctid);
        raw_heap_insert(state, unresolved->tuple);
    }

    /* Write the last page, if any */
    if (state->rs_buffer_valid)
    {
        if (state->rs_use_wal)
            log_newpage(&state->rs_new_rel->rd_node,
                        MAIN_FORKNUM,
                        state->rs_blockno,
                        state->rs_buffer,
                        true);
        RelationOpenSmgr(state->rs_new_rel);

        PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);

        smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno,
                   (char *) state->rs_buffer, true);
    }

    /*
     * If the rel is WAL-logged, must fsync before commit.  We use heap_sync
     * to ensure that the toast table gets fsync'd too.
     *
     * It's obvious that we must do this when not WAL-logging.  It's less
     * obvious that we have to do it even if we did WAL-log the pages.  The
     * reason is the same as in tablecmds.c's copy_relation_data(): we're
     * writing data that's not in shared buffers, and so a CHECKPOINT
     * occurring during the rewriteheap operation won't have fsync'd data we
     * wrote before the checkpoint.
     */
    if (RelationNeedsWAL(state->rs_new_rel))
        heap_sync(state->rs_new_rel);

    /* Deleting the context frees everything */
    MemoryContextDelete(state->rs_cxt);
}
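/*
 * For orientation, end_heap_rewrite() above is the final step of the heap
 * rewrite API used by CLUSTER and VACUUM FULL.  The sketch below is a
 * hypothetical, abbreviated caller; the begin_heap_rewrite() and
 * rewrite_heap_tuple() signatures follow roughly the 9.4-era rewriteheap.h
 * and vary between releases, and real callers also handle tuple freezing and
 * recently-dead tuples.
 */
#include "access/heapam.h"
#include "access/rewriteheap.h"
#include "utils/tqual.h"

static void
rewrite_all_tuples(Relation old_heap, Relation new_heap,
                   TransactionId oldest_xmin, TransactionId freeze_xid,
                   MultiXactId cutoff_multi, bool use_wal)
{
    RewriteState rwstate;
    HeapScanDesc scan;
    HeapTuple   tuple;

    rwstate = begin_heap_rewrite(old_heap, new_heap, oldest_xmin,
                                 freeze_xid, cutoff_multi, use_wal);

    scan = heap_beginscan(old_heap, SnapshotAny, 0, NULL);
    while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        /* Real callers copy, and where needed freeze, the tuple first. */
        HeapTuple   copied = heap_copytuple(tuple);

        rewrite_heap_tuple(rwstate, tuple, copied);
        heap_freetuple(copied);
    }
    heap_endscan(scan);

    /* Writes the last buffered page and fsyncs if WAL-logged (see above). */
    end_heap_rewrite(rwstate);
}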
/*
 * ginbuildempty() -- build an empty gin index in the initialization fork
 */
Datum
ginbuildempty(PG_FUNCTION_ARGS)
{
    Relation    index = (Relation) PG_GETARG_POINTER(0);
    Buffer      RootBuffer,
                MetaBuffer;

    /* An empty GIN index has two pages. */
    MetaBuffer =
        ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
    LockBuffer(MetaBuffer, BUFFER_LOCK_EXCLUSIVE);
    RootBuffer =
        ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
    LockBuffer(RootBuffer, BUFFER_LOCK_EXCLUSIVE);

    /* Initialize both pages, mark them dirty, unlock and release buffer. */
    START_CRIT_SECTION();
    GinInitMetabuffer(MetaBuffer);
    MarkBufferDirty(MetaBuffer);
    GinInitBuffer(RootBuffer, GIN_LEAF);
    MarkBufferDirty(RootBuffer);

    /* XLOG the new pages */
    log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                BufferGetBlockNumber(MetaBuffer),
                BufferGetPage(MetaBuffer));
    log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                BufferGetBlockNumber(RootBuffer),
                BufferGetPage(RootBuffer));
    END_CRIT_SECTION();

    /* Unlock and release the buffers. */
    UnlockReleaseBuffer(MetaBuffer);
    UnlockReleaseBuffer(RootBuffer);

    PG_RETURN_VOID();
}
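/*
 * Later PostgreSQL releases replace the two explicit log_newpage() calls in
 * ginbuildempty() with log_newpage_buffer() (defined in the next snippet),
 * which emits the same full-page record and also stamps the page LSN.  A
 * minimal sketch of the middle of the function rewritten that way, assuming
 * the surrounding declarations stay unchanged; page_std is false so the
 * whole page is logged rather than omitting the pd_lower/pd_upper hole.
 */
START_CRIT_SECTION();

GinInitMetabuffer(MetaBuffer);
MarkBufferDirty(MetaBuffer);
log_newpage_buffer(MetaBuffer, false);

GinInitBuffer(RootBuffer, GIN_LEAF);
MarkBufferDirty(RootBuffer);
log_newpage_buffer(RootBuffer, false);

END_CRIT_SECTION();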
/*
 * Write a WAL record containing a full image of a page.
 *
 * Caller should initialize the buffer and mark it dirty before calling this
 * function.  This function will set the page LSN.
 *
 * If the page follows the standard page layout, with a PageHeader and unused
 * space between pd_lower and pd_upper, set 'page_std' to TRUE.  That allows
 * the unused space to be left out from the WAL record, making it smaller.
 */
XLogRecPtr
log_newpage_buffer(Buffer buffer, bool page_std)
{
    Page        page = BufferGetPage(buffer);
    RelFileNode rnode;
    ForkNumber  forkNum;
    BlockNumber blkno;

    /* Shared buffers should be modified in a critical section. */
    Assert(CritSectionCount > 0);

    BufferGetTag(buffer, &rnode, &forkNum, &blkno);

    return log_newpage(&rnode, forkNum, blkno, page, page_std);
}
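/*
 * To illustrate the calling convention described above, here is a
 * hypothetical helper (the function name and context are invented for
 * illustration; the buffer-manager calls are the standard ones also used in
 * the ginbuildempty() snippet earlier).  It adds one initialized, WAL-logged
 * page at the end of a fork.
 */
static void
extend_with_logged_page(Relation rel, ForkNumber forknum)
{
    Buffer      buf;

    /* Allocate a new block at the end of the fork and lock it. */
    buf = ReadBufferExtended(rel, forknum, P_NEW, RBM_NORMAL, NULL);
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    /*
     * Modify the buffer and log it inside one critical section, per the
     * Assert(CritSectionCount > 0) in log_newpage_buffer().
     */
    START_CRIT_SECTION();

    PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);
    MarkBufferDirty(buf);

    /*
     * page_std = true: a freshly PageInit'd page follows the standard layout,
     * so the unused space between pd_lower and pd_upper can be left out of
     * the WAL record.
     */
    log_newpage_buffer(buf, true);

    END_CRIT_SECTION();

    UnlockReleaseBuffer(buf);
}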
/*
 * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
 *
 * This does not need to initialize the new bucket pages; we'll do that as
 * each one is used by _hash_expandtable().  But we have to extend the logical
 * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
 * sync with ours, so that we don't get complaints from smgr.
 *
 * We do this by writing a page of zeroes at the end of the splitpoint range.
 * We expect that the filesystem will ensure that the intervening pages read
 * as zeroes too.  On many filesystems this "hole" will not be allocated
 * immediately, which means that the index file may end up more fragmented
 * than if we forced it all to be allocated now; but since we don't scan
 * hash indexes sequentially anyway, that probably doesn't matter.
 *
 * XXX It's annoying that this code is executed with the metapage lock held.
 * We need to interlock against _hash_addovflpage() adding a new overflow page
 * concurrently, but it'd likely be better to use LockRelationForExtension
 * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
 * so it may not be worth worrying about.
 *
 * Returns TRUE if successful, or FALSE if allocation failed due to
 * BlockNumber overflow.
 */
static bool
_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
{
    BlockNumber lastblock;
    char        zerobuf[BLCKSZ];
    Page        page;
    HashPageOpaque ovflopaque;

    lastblock = firstblock + nblocks - 1;

    /*
     * Check for overflow in block number calculation; if so, we cannot extend
     * the index anymore.
     */
    if (lastblock < firstblock || lastblock == InvalidBlockNumber)
        return false;

    page = (Page) zerobuf;

    /*
     * Initialize the page.  Just zeroing the page won't work; see
     * _hash_freeovflpage for similar usage.  We take care to make the special
     * space valid for the benefit of tools such as pageinspect.
     */
    _hash_pageinit(page, BLCKSZ);

    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(page);

    ovflopaque->hasho_prevblkno = InvalidBlockNumber;
    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    ovflopaque->hasho_bucket = -1;
    ovflopaque->hasho_flag = LH_UNUSED_PAGE;
    ovflopaque->hasho_page_id = HASHO_PAGE_ID;

    if (RelationNeedsWAL(rel))
        log_newpage(&rel->rd_node,
                    MAIN_FORKNUM,
                    lastblock,
                    zerobuf,
                    true);

    RelationOpenSmgr(rel);
    smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false);

    return true;
}
/*
 *  btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
    Page        metapage;

    /* Construct metapage. */
    metapage = (Page) palloc(BLCKSZ);
    _bt_initmetapage(metapage, P_NONE, 0);

    /* Write the page.  If archiving/streaming, XLOG it. */
    PageSetChecksumInplace(metapage, BTREE_METAPAGE);
    smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
              (char *) metapage, true);
    if (XLogIsNeeded())
        log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                    BTREE_METAPAGE, metapage, false);

    /*
     * An immediate sync is required even if we xlog'd the page, because the
     * write did not go through shared_buffers and therefore a concurrent
     * checkpoint may have moved the redo pointer past our xlog record.
     */
    smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}
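/*
 * The signature difference between the Datum-returning btbuildempty()
 * earlier and the plain-C version here reflects the index AM interface
 * change in PostgreSQL 9.6, where AM entry points are returned from a
 * handler function instead of being individually fmgr-callable.  An
 * abbreviated sketch of the relevant part of such a handler (field names
 * follow IndexAmRoutine; most fields and callbacks are omitted here):
 */
#include "access/amapi.h"

Datum
bthandler(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    /* ... capability flags and other support callbacks omitted ... */
    amroutine->ambuild = btbuild;
    amroutine->ambuildempty = btbuildempty;     /* the function shown above */
    amroutine->aminsert = btinsert;

    PG_RETURN_POINTER(amroutine);
}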
/*
 * Insert a tuple to the new relation.  This has to track heap_insert
 * and its subsidiary functions!
 *
 * t_self of the tuple is set to the new TID of the tuple.  If t_ctid of the
 * tuple is invalid on entry, it's replaced with the new TID as well (in
 * the inserted data only, not in the caller's copy).
 */
static void
raw_heap_insert(RewriteState state, HeapTuple tup)
{
    Page        page = state->rs_buffer;
    Size        pageFreeSpace,
                saveFreeSpace;
    Size        len;
    OffsetNumber newoff;
    HeapTuple   heaptup;

    /*
     * If the new tuple is too big for storage or contains already toasted
     * out-of-line attributes from some other relation, invoke the toaster.
     *
     * Note: below this point, heaptup is the data we actually intend to store
     * into the relation; tup is the caller's original untoasted data.
     */
    if (state->rs_new_rel->rd_rel->relkind == RELKIND_TOASTVALUE)
    {
        /* toast table entries should never be recursively toasted */
        Assert(!HeapTupleHasExternal(tup));
        heaptup = tup;
    }
    else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
        heaptup = toast_insert_or_update(state->rs_new_rel, tup, NULL,
                                         HEAP_INSERT_SKIP_FSM |
                                         (state->rs_use_wal ?
                                          0 : HEAP_INSERT_SKIP_WAL));
    else
        heaptup = tup;

    len = MAXALIGN(heaptup->t_len);     /* be conservative */

    /*
     * If we're gonna fail for oversize tuple, do it right away
     */
    if (len > MaxHeapTupleSize)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("row is too big: size %zu, maximum size %zu",
                        len, MaxHeapTupleSize)));

    /* Compute desired extra freespace due to fillfactor option */
    saveFreeSpace = RelationGetTargetPageFreeSpace(state->rs_new_rel,
                                                   HEAP_DEFAULT_FILLFACTOR);

    /* Now we can check to see if there's enough free space already. */
    if (state->rs_buffer_valid)
    {
        pageFreeSpace = PageGetHeapFreeSpace(page);

        if (len + saveFreeSpace > pageFreeSpace)
        {
            /* Doesn't fit, so write out the existing page */

            /* XLOG stuff */
            if (state->rs_use_wal)
                log_newpage(&state->rs_new_rel->rd_node,
                            MAIN_FORKNUM,
                            state->rs_blockno,
                            page,
                            true);

            /*
             * Now write the page.  We say isTemp = true even if it's not a
             * temp table, because there's no need for smgr to schedule an
             * fsync for this write; we'll do it ourselves in
             * end_heap_rewrite.
             */
            RelationOpenSmgr(state->rs_new_rel);

            PageSetChecksumInplace(page, state->rs_blockno);

            smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM,
                       state->rs_blockno, (char *) page, true);

            state->rs_blockno++;
            state->rs_buffer_valid = false;
        }
    }

    if (!state->rs_buffer_valid)
    {
        /* Initialize a new empty page */
        PageInit(page, BLCKSZ, 0);
        state->rs_buffer_valid = true;
    }

    /* And now we can insert the tuple into the page */
    newoff = PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len,
                         InvalidOffsetNumber, false, true);
    if (newoff == InvalidOffsetNumber)
        elog(ERROR, "failed to add tuple");

    /* Update caller's t_self to the actual position where it was stored */
    ItemPointerSet(&(tup->t_self), state->rs_blockno, newoff);

    /*
     * Insert the correct position into CTID of the stored tuple, too, if the
     * caller didn't supply a valid CTID.
     */
    if (!ItemPointerIsValid(&tup->t_data->t_ctid))
    {
        ItemId      newitemid;
        HeapTupleHeader onpage_tup;

        newitemid = PageGetItemId(page, newoff);
        onpage_tup = (HeapTupleHeader) PageGetItem(page, newitemid);

        onpage_tup->t_ctid = tup->t_self;
    }

    /* If heaptup is a private copy, release it. */
    if (heaptup != tup)
        heap_freetuple(heaptup);
}
/*
 * _hash_init() -- Initialize the metadata page of a hash index,
 *              the initial buckets, and the initial bitmap page.
 *
 * The initial number of buckets is dependent on num_tuples, an estimate
 * of the number of tuples to be loaded into the index initially.  The
 * chosen number of buckets is returned.
 *
 * We are fairly cavalier about locking here, since we know that no one else
 * could be accessing this index.  In particular the rule about not holding
 * multiple buffer locks is ignored.
 */
uint32
_hash_init(Relation rel, double num_tuples, ForkNumber forkNum)
{
    Buffer      metabuf;
    Buffer      buf;
    Buffer      bitmapbuf;
    Page        pg;
    HashMetaPage metap;
    RegProcedure procid;
    int32       data_width;
    int32       item_width;
    int32       ffactor;
    uint32      num_buckets;
    uint32      i;
    bool        use_wal;

    /* safety check */
    if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
        elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
             RelationGetRelationName(rel));

    /*
     * WAL log creation of pages if the relation is persistent, or this is the
     * init fork.  Init forks for unlogged relations always need to be WAL
     * logged.
     */
    use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM;

    /*
     * Determine the target fill factor (in tuples per bucket) for this index.
     * The idea is to make the fill factor correspond to pages about as full
     * as the user-settable fillfactor parameter says.  We can compute it
     * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
     */
    data_width = sizeof(uint32);
    item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
        sizeof(ItemIdData);     /* include the line pointer */
    ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
    /* keep to a sane range */
    if (ffactor < 10)
        ffactor = 10;

    procid = index_getprocid(rel, 1, HASHSTANDARD_PROC);

    /*
     * We initialize the metapage, the first N bucket pages, and the first
     * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
     * calls to occur.  This ensures that the smgr level has the right idea of
     * the physical index length.
     *
     * Critical section not required, because on error the creation of the
     * whole relation will be rolled back.
     */
    metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
    _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false);
    MarkBufferDirty(metabuf);

    pg = BufferGetPage(metabuf);
    metap = HashPageGetMeta(pg);

    /* XLOG stuff */
    if (use_wal)
    {
        xl_hash_init_meta_page xlrec;
        XLogRecPtr  recptr;

        xlrec.num_tuples = num_tuples;
        xlrec.procid = metap->hashm_procid;
        xlrec.ffactor = metap->hashm_ffactor;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage);
        XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT);

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE);

        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    num_buckets = metap->hashm_maxbucket + 1;

    /*
     * Release buffer lock on the metapage while we initialize buckets.
     * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
     * won't accomplish anything.  It's a bad idea to hold buffer locks for
     * long intervals in any case, since that can block the bgwriter.
     */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    /*
     * Initialize and WAL Log the first N buckets
     */
    for (i = 0; i < num_buckets; i++)
    {
        BlockNumber blkno;

        /* Allow interrupts, in case N is huge */
        CHECK_FOR_INTERRUPTS();

        blkno = BUCKET_TO_BLKNO(metap, i);
        buf = _hash_getnewbuf(rel, blkno, forkNum);
        _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false);
        MarkBufferDirty(buf);

        if (use_wal)
            log_newpage(&rel->rd_node,
                        forkNum,
                        blkno,
                        BufferGetPage(buf),
                        true);
        _hash_relbuf(rel, buf);
    }

    /* Now reacquire buffer lock on metapage */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    /*
     * Initialize bitmap page
     */
    bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum);
    _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false);
    MarkBufferDirty(bitmapbuf);

    /* add the new bitmap page to the metapage's list of bitmaps */
    /* metapage already has a write lock */
    if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("out of overflow pages in hash index \"%s\"",
                        RelationGetRelationName(rel))));

    metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1;

    metap->hashm_nmaps++;
    MarkBufferDirty(metabuf);

    /* XLOG stuff */
    if (use_wal)
    {
        xl_hash_init_bitmap_page xlrec;
        XLogRecPtr  recptr;

        xlrec.bmsize = metap->hashm_bmsize;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage);
        XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT);

        /*
         * This is safe only because nobody else can be modifying the index at
         * this stage; it's only visible to the transaction that is creating
         * it.
         */
        XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD);

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE);

        PageSetLSN(BufferGetPage(bitmapbuf), recptr);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    /* all done */
    _hash_relbuf(rel, bitmapbuf);
    _hash_relbuf(rel, metabuf);

    return num_buckets;
}
/**
 * @brief Write block buffer contents.  Number of block buffer to be
 * written is specified by num argument.
 *
 * Flow:
 * <ol>
 *   <li>If no more space is available in the data file, switch to a new one.</li>
 *   <li>Compute block number which can be written to the current file.</li>
 *   <li>Save the last block number in the load status file.</li>
 *   <li>Write to the current file.</li>
 *   <li>If there are other data, write them too.</li>
 * </ol>
 *
 * @param loader [in] Direct Writer.
 * @return void
 */
static void
flush_pages(DirectWriter *loader)
{
    int         i;
    int         num;
    LoadStatus *ls = &loader->ls;

    num = loader->curblk;
    if (!PageIsEmpty(GetCurrentPage(loader)))
        num += 1;

    if (num <= 0)
        return;     /* no work */

    /*
     * Add a WAL entry (only for the first page) to ensure the current xid
     * will be recorded in the xlog.  We must flush some xlog records with
     * XLogFlush() before writing any data blocks, to follow the WAL protocol.
     *
     * If a postgres process, such as the loader or COPY, is killed with
     * "kill -9", the database will be rewound to the last checkpoint and
     * recovery will be performed using WAL.
     *
     * After the recovery, if there are xids which have not been recorded to
     * WAL, such xids will be reused.
     *
     * However, in the loader and COPY, the data file is actually updated, so
     * the xid must not be reused.
     *
     * A WAL entry with such an xid can be added using XLogInsert().  However,
     * such entries are not really written to disk immediately.  WAL entries
     * are flushed to disk by XLogFlush(), typically when a transaction is
     * committed.  COPY prevents xid reuse by this method.
     */
#if PG_VERSION_NUM >= 90100
    if (ls->ls.create_cnt == 0 && !RELATION_IS_LOCAL(loader->base.rel)
        && !(loader->base.rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED))
    {
        XLogRecPtr  recptr;

        recptr = log_newpage(&ls->ls.rnode, MAIN_FORKNUM,
                             ls->ls.exist_cnt, loader->blocks);
        XLogFlush(recptr);
    }
#else
    if (ls->ls.create_cnt == 0 && !RELATION_IS_LOCAL(loader->base.rel))
    {
        XLogRecPtr  recptr;

        recptr = log_newpage(&ls->ls.rnode, MAIN_FORKNUM,
                             ls->ls.exist_cnt, loader->blocks);
        XLogFlush(recptr);
    }
#endif

    /*
     * Write blocks.  We might need to write multiple files on boundary of
     * relation segments.
     */
    for (i = 0; i < num;)
    {
        char       *buffer;
        int         total;
        int         written;
        int         flush_num;
        BlockNumber relblks = LS_TOTAL_CNT(ls);

        /* Switch to the next file if the current file has been filled up. */
        if (relblks % RELSEG_SIZE == 0)
            close_data_file(loader);
        if (loader->datafd == -1)
            loader->datafd = open_data_file(ls->ls.rnode,
                                            RELATION_IS_LOCAL(loader->base.rel),
                                            relblks);

        /* Number of blocks to be added to the current file. */
        flush_num = Min(num - i, RELSEG_SIZE - relblks % RELSEG_SIZE);
        Assert(flush_num > 0);

        /* Write the last block number to the load status file. */
        UpdateLSF(loader, flush_num);

#if PG_VERSION_NUM >= 90300
        /* If we need a checksum, add it */
        if (DataChecksumsEnabled())
        {
            int     j = 0;
            Page    contained_page;

            for (j = 0; j < flush_num; j++)
            {
                contained_page = GetTargetPage(loader, j);
                ((PageHeader) contained_page)->pd_checksum =
                    pg_checksum_page((char *) contained_page,
                                     LS_TOTAL_CNT(ls) - 1 - j);
            }
        }
#endif

        /*
         * Flush flush_num data blocks to the current file.
         * Then the current file size becomes RELSEG_SIZE self->blocks.
         */
        buffer = loader->blocks + BLCKSZ * i;
        total = BLCKSZ * flush_num;
        written = 0;
        while (total > 0)
        {
            int     len = write(loader->datafd, buffer + written, total);

            if (len == -1)
            {
                /* fatal error, do not want to write blocks anymore */
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not write to data file: %m")));
            }
            written += len;
            total -= len;
        }

        i += flush_num;
    }

    /*
     * NOTICE: Be sure to reset curblk to 0 and reinitialize the recycled page
     * if you will continue to use blocks.
     */
}