/* * Read tuples in correct sort order from tuplesort, and load them into * btree leaves. */ static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) { BTPageState *state = NULL; bool merge = (btspool2 != NULL); IndexTuple itup, itup2 = NULL; bool load1; TupleDesc tupdes = RelationGetDescr(wstate->index); int i, keysz = RelationGetNumberOfAttributes(wstate->index); ScanKey indexScanKey = NULL; SortSupport sortKeys; if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge * btspool and btspool2. */ /* the preparation of merge */ itup = tuplesort_getindextuple(btspool->sortstate, true); itup2 = tuplesort_getindextuple(btspool2->sortstate, true); indexScanKey = _bt_mkscankey_nodata(wstate->index); /* Prepare SortSupport data for each column */ sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData)); for (i = 0; i < keysz; i++) { SortSupport sortKey = sortKeys + i; ScanKey scanKey = indexScanKey + i; int16 strategy; sortKey->ssup_cxt = CurrentMemoryContext; sortKey->ssup_collation = scanKey->sk_collation; sortKey->ssup_nulls_first = (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; sortKey->ssup_attno = scanKey->sk_attno; /* Abbreviation is not supported here */ sortKey->abbreviate = false; AssertState(sortKey->ssup_attno != 0); strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? BTGreaterStrategyNumber : BTLessStrategyNumber; PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey); } _bt_freeskey(indexScanKey); for (;;) { load1 = true; /* load BTSpool next ? */ if (itup2 == NULL) { if (itup == NULL) break; } else if (itup != NULL) { for (i = 1; i <= keysz; i++) { SortSupport entry; Datum attrDatum1, attrDatum2; bool isNull1, isNull2; int32 compare; entry = sortKeys + i - 1; attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); compare = ApplySortComparator(attrDatum1, isNull1, attrDatum2, isNull2, entry); if (compare > 0) { load1 = false; break; } else if (compare < 0) break; } } else load1 = false; /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); if (load1) { _bt_buildadd(wstate, state, itup); itup = tuplesort_getindextuple(btspool->sortstate, true); } else { _bt_buildadd(wstate, state, itup2); itup2 = tuplesort_getindextuple(btspool2->sortstate, true); } } pfree(sortKeys); } else { /* merge is unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) { /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); _bt_buildadd(wstate, state, itup); } } /* Close down final pages and write the metapage */ _bt_uppershutdown(wstate, state); /* * If the index is WAL-logged, we must fsync it down to disk before it's * safe to commit the transaction. (For a non-WAL-logged index we don't * care since the index will be uninteresting after a crash anyway.) * * It's obvious that we must do this when not WAL-logging the build. It's * less obvious that we have to do it even if we did WAL-log the index * pages. The reason is that since we're building outside shared buffers, * a CHECKPOINT occurring during the build has no way to flush the * previously written data to disk (indeed it won't know the index even * exists). A crash later on would replay WAL from the checkpoint, * therefore it wouldn't replay our earlier WAL entries. If we do not * fsync those pages here, they might still not be on disk when the crash * occurs. */ if (RelationNeedsWAL(wstate->index)) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); } }
/* * _bt_mergeload - Merge two streams of index tuples into new index files. */ static void _bt_mergeload(Spooler *self, BTWriteState *wstate, BTSpool *btspool, BTReader *btspool2, Relation heapRel) { BTPageState *state = NULL; IndexTuple itup, itup2; bool should_free = false; TupleDesc tupdes = RelationGetDescr(wstate->index); int keysz = RelationGetNumberOfAttributes(wstate->index); ScanKey indexScanKey; ON_DUPLICATE on_duplicate = self->on_duplicate; Assert(btspool != NULL); /* the preparation of merge */ itup = BTSpoolGetNextItem(btspool, NULL, &should_free); itup2 = BTReaderGetNextItem(btspool2); indexScanKey = _bt_mkscankey_nodata(wstate->index); for (;;) { bool load1 = true; /* load BTSpool next ? */ bool hasnull; int32 compare; if (self->dup_old + self->dup_new > self->max_dup_errors) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Maximum duplicate error count exceeded"))); if (itup2 == NULL) { if (itup == NULL) break; } else if (itup != NULL) { compare = compare_indextuple(itup, itup2, indexScanKey, keysz, tupdes, &hasnull); if (compare == 0 && !hasnull && btspool->isunique) { ItemPointerData t_tid2; /* * t_tid is update by heap_is_visible(), because use it for an * index, t_tid backup */ ItemPointerCopy(&itup2->t_tid, &t_tid2); /* The tuple pointed by the old index should not be visible. */ if (!heap_is_visible(heapRel, &itup->t_tid)) { itup = BTSpoolGetNextItem(btspool, itup, &should_free); } else if (!heap_is_visible(heapRel, &itup2->t_tid)) { itup2 = BTReaderGetNextItem(btspool2); } else { if (on_duplicate == ON_DUPLICATE_KEEP_NEW) { self->dup_old++; remove_duplicate(self, heapRel, itup2, RelationGetRelationName(wstate->index)); itup2 = BTReaderGetNextItem(btspool2); } else { ItemPointerCopy(&t_tid2, &itup2->t_tid); self->dup_new++; remove_duplicate(self, heapRel, itup, RelationGetRelationName(wstate->index)); itup = BTSpoolGetNextItem(btspool, itup, &should_free); } } continue; } else if (compare > 0) load1 = false; } else load1 = false; BULKLOAD_PROFILE(&prof_merge_unique); /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); if (load1) { IndexTuple next_itup = NULL; bool next_should_free = false; for (;;) { /* get next item */ next_itup = BTSpoolGetNextItem(btspool, next_itup, &next_should_free); if (!btspool->isunique || next_itup == NULL) break; compare = compare_indextuple(itup, next_itup, indexScanKey, keysz, tupdes, &hasnull); if (compare < 0 || hasnull) break; if (compare > 0) { /* shouldn't happen */ elog(ERROR, "faild in tuplesort_performsort"); } /* * If tupple is deleted by other unique indexes, not visible */ if (!heap_is_visible(heapRel, &next_itup->t_tid)) { continue; } if (!heap_is_visible(heapRel, &itup->t_tid)) { if (should_free) pfree(itup); itup = next_itup; should_free = next_should_free; next_should_free = false; continue; } /* not unique between input files */ self->dup_new++; remove_duplicate(self, heapRel, next_itup, RelationGetRelationName(wstate->index)); if (self->dup_old + self->dup_new > self->max_dup_errors) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Maximum duplicate error count exceeded"))); } _bt_buildadd(wstate, state, itup); if (should_free) pfree(itup); itup = next_itup; should_free = next_should_free; } else { _bt_buildadd(wstate, state, itup2); itup2 = BTReaderGetNextItem(btspool2); } BULKLOAD_PROFILE(&prof_merge_insert); } _bt_freeskey(indexScanKey); /* Close down final pages and write the metapage */ _bt_uppershutdown(wstate, state); /* * If the index isn't temp, we must fsync it down to disk before it's safe * to commit the transaction. (For a temp index we don't care since the * index will be uninteresting after a crash anyway.) * * It's obvious that we must do this when not WAL-logging the build. It's * less obvious that we have to do it even if we did WAL-log the index * pages. The reason is that since we're building outside shared buffers, * a CHECKPOINT occurring during the build has no way to flush the * previously written data to disk (indeed it won't know the index even * exists). A crash later on would replay WAL from the checkpoint, * therefore it wouldn't replay our earlier WAL entries. If we do not * fsync those pages here, they might still not be on disk when the crash * occurs. */ if (!RELATION_IS_LOCAL(wstate->index)) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); } BULKLOAD_PROFILE(&prof_merge_term); }
/*---------- * Add an item to a disk page from the sort output. * * We must be careful to observe the page layout conventions of nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. * - on non-leaf pages, the key portion of the first item need not be * stored, we should store only the link. * * A leaf page being built looks like: * * +----------------+---------------------------------+ * | PageHeaderData | linp0 linp1 linp2 ... | * +-----------+----+---------------------------------+ * | ... linpN | | * +-----------+--------------------------------------+ * | ^ last | * | | * +-------------+------------------------------------+ * | | itemN ... | * +-------------+------------------+-----------------+ * | ... item3 item2 item1 | "special space" | * +--------------------------------+-----------------+ * * Contrast this with the diagram in bufpage.h; note the mismatch * between linps and items. This is because we reserve linp0 as a * placeholder for the pointer to the "high key" item; when we have * filled up the page, we will set linp0 to point to itemN and clear * linpN. On the other hand, if we find this is the last (rightmost) * page, we leave the items alone and slide the linp array over. * * 'last' pointer indicates the last offset added to the page. *---------- */ static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) { Page npage; BlockNumber nblkno; OffsetNumber last_off; Size pgspc; Size itupsz; /* * This is a handy place to check for cancel interrupts during the btree * load phase of index creation. */ CHECK_FOR_INTERRUPTS(); npage = state->btps_page; nblkno = state->btps_blkno; last_off = state->btps_lastoff; pgspc = PageGetFreeSpace(npage); itupsz = IndexTupleDSize(*itup); itupsz = MAXALIGN(itupsz); /* * Check whether the item can fit on a btree page at all. (Eventually, we * ought to try to apply TOAST methods if not.) We actually need to be * able to fit three items on every page, so restrict any one item to 1/3 * the per-page available space. Note that at this point, itupsz doesn't * include the ItemId. * * NOTE: similar code appears in _bt_insertonpg() to defend against * oversize items being inserted into an already-existing index. But * during creation of an index, we don't go through there. */ if (itupsz > BTMaxItemSize(npage)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", itupsz, BTMaxItemSize(npage), RelationGetRelationName(wstate->index)), errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" "Consider a function index of an MD5 hash of the value, " "or use full text indexing."), errtableconstraint(wstate->heap, RelationGetRelationName(wstate->index)))); /* * Check to see if page is "full". It's definitely full if the item won't * fit. Otherwise, compare to the target freespace derived from the * fillfactor. However, we must put at least two items on each page, so * disregard fillfactor if we don't have that many. */ if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY)) { /* * Finish off the page and write it out. */ Page opage = npage; BlockNumber oblkno = nblkno; ItemId ii; ItemId hii; IndexTuple oitup; /* Create new page of same level */ npage = _bt_blnewpage(state->btps_level); /* and assign it a page position */ nblkno = wstate->btws_pages_alloced++; /* * We copy the last item on the page into the new page, and then * rearrange the old page so that the 'last item' becomes its high key * rather than a true data item. There had better be at least two * items on the page already, else the page would be empty of useful * data. */ Assert(last_off > P_FIRSTKEY); ii = PageGetItemId(opage, last_off); oitup = (IndexTuple) PageGetItem(opage, ii); _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY); /* * Move 'last' into the high key position on opage */ hii = PageGetItemId(opage, P_HIKEY); *hii = *ii; ItemIdSetUnused(ii); /* redundant */ ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); /* * Link the old page into its parent, using its minimum key. If we * don't have a parent, we have to create one; this adds a new btree * level. */ if (state->btps_next == NULL) state->btps_next = _bt_pagestate(wstate, state->btps_level + 1); Assert(state->btps_minkey != NULL); ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY); _bt_buildadd(wstate, state->btps_next, state->btps_minkey); pfree(state->btps_minkey); /* * Save a copy of the minimum key for the new page. We have to copy * it off the old page, not the new one, in case we are not at leaf * level. */ state->btps_minkey = CopyIndexTuple(oitup); /* * Set the sibling links for both pages. */ { BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); oopaque->btpo_next = nblkno; nopaque->btpo_prev = oblkno; nopaque->btpo_next = P_NONE; /* redundant */ } /* * Write out the old page. We never need to touch it again, so we can * free the opage workspace too. */ _bt_blwritepage(wstate, opage, oblkno); /* * Reset last_off to point to new page */ last_off = P_FIRSTKEY; } /* * If the new item is the first for its page, stash a copy for later. Note * this will only happen for the first item on a level; on later pages, * the first item for a page is copied from the prior page in the code * above. */ if (last_off == P_HIKEY) { Assert(state->btps_minkey == NULL); state->btps_minkey = CopyIndexTuple(itup); } /* * Add the new item into the current page. */ last_off = OffsetNumberNext(last_off); _bt_sortaddtup(npage, itupsz, itup, last_off); state->btps_page = npage; state->btps_blkno = nblkno; state->btps_lastoff = last_off; }
/* * Read tuples in correct sort order from tuplesort, and load them into * btree leaves. */ static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) { BTPageState *state = NULL; bool merge = (btspool2 != NULL); IndexTuple itup, itup2 = NULL; bool should_free, should_free2, load1; TupleDesc tupdes = RelationGetDescr(wstate->index); int i, keysz = RelationGetNumberOfAttributes(wstate->index); ScanKey indexScanKey = NULL; if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge * btspool and btspool2. */ /* the preparation of merge */ itup = tuplesort_getindextuple(btspool->sortstate, true, &should_free); itup2 = tuplesort_getindextuple(btspool2->sortstate, true, &should_free2); indexScanKey = _bt_mkscankey_nodata(wstate->index); for (;;) { load1 = true; /* load BTSpool next ? */ if (itup2 == NULL) { if (itup == NULL) break; } else if (itup != NULL) { for (i = 1; i <= keysz; i++) { ScanKey entry; Datum attrDatum1, attrDatum2; bool isNull1, isNull2; int32 compare; entry = indexScanKey + i - 1; attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); if (isNull1) { if (isNull2) compare = 0; /* NULL "=" NULL */ else if (entry->sk_flags & SK_BT_NULLS_FIRST) compare = -1; /* NULL "<" NOT_NULL */ else compare = 1; /* NULL ">" NOT_NULL */ } else if (isNull2) { if (entry->sk_flags & SK_BT_NULLS_FIRST) compare = 1; /* NOT_NULL ">" NULL */ else compare = -1; /* NOT_NULL "<" NULL */ } else { compare = DatumGetInt32(FunctionCall2Coll(&entry->sk_func, entry->sk_collation, attrDatum1, attrDatum2)); if (entry->sk_flags & SK_BT_DESC) compare = -compare; } if (compare > 0) { load1 = false; break; } else if (compare < 0) break; } } else load1 = false; /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); if (load1) { _bt_buildadd(wstate, state, itup); if (should_free) pfree(itup); itup = tuplesort_getindextuple(btspool->sortstate, true, &should_free); } else { _bt_buildadd(wstate, state, itup2); if (should_free2) pfree(itup2); itup2 = tuplesort_getindextuple(btspool2->sortstate, true, &should_free2); } } _bt_freeskey(indexScanKey); } else { /* merge is unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, true, &should_free)) != NULL) { /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); _bt_buildadd(wstate, state, itup); if (should_free) pfree(itup); } } /* Close down final pages and write the metapage */ _bt_uppershutdown(wstate, state); /* * If the index is WAL-logged, we must fsync it down to disk before it's * safe to commit the transaction. (For a non-WAL-logged index we don't * care since the index will be uninteresting after a crash anyway.) * * It's obvious that we must do this when not WAL-logging the build. It's * less obvious that we have to do it even if we did WAL-log the index * pages. The reason is that since we're building outside shared buffers, * a CHECKPOINT occurring during the build has no way to flush the * previously written data to disk (indeed it won't know the index even * exists). A crash later on would replay WAL from the checkpoint, * therefore it wouldn't replay our earlier WAL entries. If we do not * fsync those pages here, they might still not be on disk when the crash * occurs. */ if (RelationNeedsWAL(wstate->index)) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); } }