/*----------
 * Add an item to a disk page from the sort output.
 *
 * We must be careful to observe the page layout conventions of nbtsearch.c:
 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
 * - on non-leaf pages, the key portion of the first item need not be
 *   stored; we should store only the link.
 *
 * A leaf page being built looks like:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp0 linp1 linp2 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |     ^ last                                       |
 * |                                                  |
 * +-------------+------------------------------------+
 * |             | itemN ...                          |
 * +-------------+------------------+-----------------+
 * |       ... item3 item2 item1    | "special space" |
 * +--------------------------------+-----------------+
 *
 * Contrast this with the diagram in bufpage.h; note the mismatch
 * between linps and items.  This is because we reserve linp0 as a
 * placeholder for the pointer to the "high key" item; when we have
 * filled up the page, we will set linp0 to point to itemN and clear
 * linpN.  On the other hand, if we find this is the last (rightmost)
 * page, we leave the items alone and slide the linp array over.
 *
 * 'last' pointer indicates the last offset added to the page.
 *----------
 */
static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
{
    Page        npage;
    BlockNumber nblkno;
    OffsetNumber last_off;
    Size        pgspc;
    Size        itupsz;

    /*
     * This is a handy place to check for cancel interrupts during the btree
     * load phase of index creation.
     */
    CHECK_FOR_INTERRUPTS();

    npage = state->btps_page;
    nblkno = state->btps_blkno;
    last_off = state->btps_lastoff;

    pgspc = PageGetFreeSpace(npage);
    itupsz = IndexTupleDSize(*itup);
    itupsz = MAXALIGN(itupsz);

    /*
     * Check whether the item can fit on a btree page at all. (Eventually, we
     * ought to try to apply TOAST methods if not.)  We actually need to be
     * able to fit three items on every page, so restrict any one item to 1/3
     * the per-page available space.  Note that at this point, itupsz doesn't
     * include the ItemId.
     *
     * NOTE: similar code appears in _bt_insertonpg() to defend against
     * oversize items being inserted into an already-existing index.  But
     * during creation of an index, we don't go through there.
     */
    if (itupsz > BTMaxItemSize(npage))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
                        itupsz, BTMaxItemSize(npage),
                        RelationGetRelationName(wstate->index)),
                 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
                         "Consider a function index of an MD5 hash of the value, "
                         "or use full text indexing."),
                 errtableconstraint(wstate->heap,
                                    RelationGetRelationName(wstate->index))));

    /*
     * Check to see if page is "full".  It's definitely full if the item won't
     * fit.  Otherwise, compare to the target freespace derived from the
     * fillfactor.  However, we must put at least two items on each page, so
     * disregard fillfactor if we don't have that many.
     */
    if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
    {
        /*
         * Finish off the page and write it out.
         */
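        /*
         * In brief, the page split during the sorted build works as follows:
         * copy the last data item onto a fresh page of the same level, turn
         * that item into the old page's high key, recursively add the old
         * page's minimum key to the parent level (creating the parent if
         * needed), fix the sibling links, and write the old page out.  The
         * code below carries out these steps in order.
         */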
        Page        opage = npage;
        BlockNumber oblkno = nblkno;
        ItemId      ii;
        ItemId      hii;
        IndexTuple  oitup;

        /* Create new page of same level */
        npage = _bt_blnewpage(state->btps_level);

        /* and assign it a page position */
        nblkno = wstate->btws_pages_alloced++;

        /*
         * We copy the last item on the page into the new page, and then
         * rearrange the old page so that the 'last item' becomes its high key
         * rather than a true data item.  There had better be at least two
         * items on the page already, else the page would be empty of useful
         * data.
         */
        Assert(last_off > P_FIRSTKEY);
        ii = PageGetItemId(opage, last_off);
        oitup = (IndexTuple) PageGetItem(opage, ii);
        _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);

        /*
         * Move 'last' into the high key position on opage
         */
        hii = PageGetItemId(opage, P_HIKEY);
        *hii = *ii;
        ItemIdSetUnused(ii);    /* redundant */
        ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);

        /*
         * Link the old page into its parent, using its minimum key.  If we
         * don't have a parent, we have to create one; this adds a new btree
         * level.
         */
        if (state->btps_next == NULL)
            state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);

        Assert(state->btps_minkey != NULL);
        ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY);
        _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
        pfree(state->btps_minkey);

        /*
         * Save a copy of the minimum key for the new page.  We have to copy
         * it off the old page, not the new one, in case we are not at leaf
         * level.
         */
        state->btps_minkey = CopyIndexTuple(oitup);

        /*
         * Set the sibling links for both pages.
         */
        {
            BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
            BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);

            oopaque->btpo_next = nblkno;
            nopaque->btpo_prev = oblkno;
            nopaque->btpo_next = P_NONE;    /* redundant */
        }

        /*
         * Write out the old page.  We never need to touch it again, so we can
         * free the opage workspace too.
         */
        _bt_blwritepage(wstate, opage, oblkno);

        /*
         * Reset last_off to point to new page
         */
        last_off = P_FIRSTKEY;
    }

    /*
     * If the new item is the first for its page, stash a copy for later.
     * Note this will only happen for the first item on a level; on later
     * pages, the first item for a page is copied from the prior page in the
     * code above.
     */
    if (last_off == P_HIKEY)
    {
        Assert(state->btps_minkey == NULL);
        state->btps_minkey = CopyIndexTuple(itup);
    }

    /*
     * Add the new item into the current page.
     */
    last_off = OffsetNumberNext(last_off);
    _bt_sortaddtup(npage, itupsz, itup, last_off);

    state->btps_page = npage;
    state->btps_blkno = nblkno;
    state->btps_lastoff = last_off;
}
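/*
 * A minimal sketch of how the sort output is typically fed into
 * _bt_buildadd(), modeled on the upstream PostgreSQL _bt_load() with the
 * pre-9.6 tuplesort_getindextuple() signature.  The surrounding variables
 * (wstate, btspool) and the absence of a second spool are assumptions about
 * the caller, not a definitive description of this file's load loop:
 *
 *     BTPageState *state = NULL;
 *     IndexTuple   itup;
 *     bool         should_free;
 *
 *     while ((itup = tuplesort_getindextuple(btspool->sortstate,
 *                                            true, &should_free)) != NULL)
 *     {
 *         // create the leaf-level page state on first use
 *         if (state == NULL)
 *             state = _bt_pagestate(wstate, 0);
 *
 *         _bt_buildadd(wstate, state, itup);
 *         if (should_free)
 *             pfree(itup);
 *     }
 *
 *     // close out all levels and write the metapage
 *     _bt_uppershutdown(wstate, state);
 */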
/*
 * Insert an entry and perhaps return the top element of the heap in *e.
 *
 * Comparison happens from the specified level to the end of levels, as needed:
 *   Return < 0 if smaller than the heap top; *e is unchanged.
 *   Return = 0 if equal to the heap top; *e is unchanged (but its value equals the heap top).
 *   Return > 0 if successfully inserted; *e is populated with the removed heap top.
 *
 * If 0 would be returned but the heap is marked as needing uniqueness
 * enforcement, an error is raised instead.
 */
static int
mkheap_putAndGet_impl(MKHeap *mkheap, MKEntry *e)
{
    int     c = 0;
    int     toplv;
    MKEntry tmp;

    /* can't put+get from an empty heap */
    Assert(mkheap->count > 0);

    if (mkheap->mkctxt->enforceUnique && mke_has_duplicates_with_root(mkheap))
    {
        /*
         * See NOTE ON UNIQUENESS CHECKING in the comment at the top of the
         * file for information about why we check for duplicates here.
         */
        Datum   values[INDEX_MAX_KEYS];
        bool    isnull[INDEX_MAX_KEYS];

        index_deform_tuple((IndexTuple) mkheap->p->ptr,
                           mkheap->mkctxt->tupdesc,
                           values, isnull);
        ereport(ERROR,
                (errcode(ERRCODE_UNIQUE_VIOLATION),
                 errmsg("could not create unique index \"%s\"",
                        RelationGetRelationName(mkheap->mkctxt->indexRel)),
                 errdetail("Key %s is duplicated.",
                           BuildIndexValueDescription(mkheap->mkctxt->indexRel,
                                                      values, isnull)),
                 errtableconstraint(mkheap->mkctxt->heapRel,
                                    RelationGetRelationName(mkheap->mkctxt->indexRel))));
    }

    if (mke_is_empty(e))
    {
        /*
         * adding an empty (sentinel): just remove from count and fall through
         * to where top is removed
         */
        --mkheap->count;
    }
    else if (mke_get_run(e) != mke_get_run(mkheap->p))
    {
        /*
         * this code assumes that the new one is LARGER than the top -- since
         * its run differs, it must belong to a later (higher-numbered) run
         */
        Assert(mke_get_run(e) > mke_get_run(mkheap->p));

        /*
         * when the runs differ it is because we attempted once with the runs
         * equal.  So if level is zero then: the level was zero AND validly
         * prepared for the previous run -- and there is no need to prep again
         */
        if (mke_get_lv(e) != 0)
        {
            /* Not same run, at least prepare lv 0 */
            if (mkheap->mkctxt->fetchForPrep)
                tupsort_prepare(e, mkheap->mkctxt, 0);
            mke_set_lv(e, 0);
        }

        /*
         * now fall through and let the top be returned; the new one is also
         * inserted, so no change to count
         */
    }
    else
    {
        /* same run, so figure out where it fits in relation to the heap top */
        int lv = 0;

        toplv = mke_get_lv(mkheap->p);
        mke_set_lv(e, lv);

        /* populate level until we differ from the top element of the heap */
        while (lv < toplv)
        {
            if (mkheap->mkctxt->fetchForPrep)
                tupsort_prepare(e, mkheap->mkctxt, lv);

            c = mkheap_compare(mkheap, e, mkheap->lvtops + lv);
            if (c != 0)
                break;
            mke_set_lv(e, ++lv);
        }

        /* smaller than top */
        if (c < 0)
            return -1;

        /*
         * we have not done e->lv == toplv yet since we increment at the end
         * of the previous loop.  Do it now.
         */
        Assert(mke_get_lv(e) == lv);
        if (lv == toplv)
        {
            if (mkheap->mkctxt->fetchForPrep)
                tupsort_prepare(e, mkheap->mkctxt, lv);

            c = mkheap_compare(mkheap, e, mkheap->p);
            if (c < 0)
                return -1;
        }

        if (c == 0)
        {
            /*
             * Equal and at top level.
             *
             * This means that e is less-than/equal to all entries except the
             * heap top.
             */
            Assert(mke_get_lv(e) == lv);
            Assert(lv == mke_get_lv(mkheap->p));

            /*
             * Expand more levels of lvtop in the current top and the new one
             * until we detect a difference.
             */
            while (lv < mkheap->mkctxt->total_lv - 1)
            {
                mkheap_save_lvtop(mkheap);
                ++lv;

                /* expand top */
                if (mkheap->mkctxt->fetchForPrep)
                    tupsort_prepare(mkheap->p, mkheap->mkctxt, lv);

                /* expand new element */
                if (mkheap->mkctxt->fetchForPrep)
                    tupsort_prepare(e, mkheap->mkctxt, lv);

                mke_set_lv(mkheap->p, lv);
                mke_set_lv(e, lv);

                c = mkheap_compare(mkheap, e, mkheap->p);
                if (c != 0)
                    break;
            }

            if (c <= 0)
            {
                /*
                 * if the new one is less than the current top then we just
                 * return that negative comparison
                 */

                /*
                 * if the new one equals the current top then we could do an
                 * insert and immediate removal -- but it won't matter, so we
                 * simply return right away, leaving *e untouched
                 */

                /* enforce uniqueness first */
                if (c == 0 && mkheap->mkctxt->enforceUnique)
                {
                    Datum   values[INDEX_MAX_KEYS];
                    bool    isnull[INDEX_MAX_KEYS];

                    index_deform_tuple((IndexTuple) mkheap->p->ptr,
                                       mkheap->mkctxt->tupdesc,
                                       values, isnull);
                    ereport(ERROR,
                            (errcode(ERRCODE_UNIQUE_VIOLATION),
                             errmsg("could not create unique index \"%s\"",
                                    RelationGetRelationName(mkheap->mkctxt->indexRel)),
                             errdetail("Key %s is duplicated.",
                                       BuildIndexValueDescription(mkheap->mkctxt->indexRel,
                                                                  values, isnull))));
                }

                return c;
            }
        }
    }

    /*
     * Now, the new one is bigger than the top, but not necessarily smaller
     * than or equal to all the other entries.
     *
     * So we will:
     *   - return the top as *e
     *   - do heap shuffling to restore heap ordering
     */
    tmp = *e;
    *e = mkheap->p[0];

    /*
     * Sift down a hole to the bottom of the current or next run, depending on
     * tmp.run
     */
    mkheap_siftdown(mkheap, 0, &tmp);

    if (mkheap_need_heapify(mkheap))
        mkheap_heapify(mkheap, false);

    if (mkheap->count > 0)
    {
        mkheap_update_lvtops(mkheap);
    }

#ifdef USE_ASSERT_CHECKING
    if (gp_mk_sort_check)
        mkheap_verify_heap(mkheap, 0);
#endif

    return 1;
}
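/*
 * A minimal sketch of the put-and-get contract from the caller's side.  The
 * wrapper name below is hypothetical (it is not a function in this file); it
 * only illustrates how the three-way return value documented above is
 * typically consumed when feeding new entries into the heap:
 *
 *     static bool
 *     heap_put_and_get(MKHeap *mkheap, MKEntry *e)
 *     {
 *         int cmp = mkheap_putAndGet_impl(mkheap, e);
 *
 *         if (cmp <= 0)
 *         {
 *             // new entry compares smaller than (or equal to) the heap top:
 *             // nothing was inserted or removed, *e still holds the
 *             // caller's entry
 *             return false;
 *         }
 *
 *         // cmp > 0: the entry was inserted and *e now holds the former
 *         // heap top, ready to be emitted as the next element of the
 *         // current run's output
 *         return true;
 *     }
 */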