/* * Adds array of item pointers to tuple's posting list or * creates posting tree and tuple pointed to tree in a case * of not enough space. Max size of tuple is defined in * GinFormTuple(). */ static IndexTuple addItemPointersToTuple(Relation index, GinState *ginstate, GinBtreeStack *stack, IndexTuple old, ItemPointerData *items, uint32 nitem, GinStatsData *buildStats) { Datum key = gin_index_getattr(ginstate, old); OffsetNumber attnum = gintuple_get_attrnum(ginstate, old); IndexTuple res = GinFormTuple(index, ginstate, attnum, key, NULL, nitem + GinGetNPosting(old), false); if (res) { /* good, small enough */ uint32 newnitem; newnitem = ginMergeItemPointers(GinGetPosting(res), GinGetPosting(old), GinGetNPosting(old), items, nitem); /* merge might have eliminated some duplicate items */ GinShortenTuple(res, newnitem); } else { BlockNumber postingRoot; GinPostingTreeScan *gdi; /* posting list becomes big, so we need to make posting's tree */ res = GinFormTuple(index, ginstate, attnum, key, NULL, 0, true); postingRoot = createPostingTree(index, GinGetPosting(old), GinGetNPosting(old)); GinSetPostingTree(res, postingRoot); gdi = ginPrepareScanPostingTree(index, postingRoot, FALSE); gdi->btree.isBuild = (buildStats != NULL); ginInsertItemPointer(gdi, items, nitem, buildStats); pfree(gdi); /* During index build, count the newly-added data page */ if (buildStats) buildStats->nDataPages++; } return res; }
/* * Read item pointers from leaf entry tuple. * * Returns a palloc'd array of ItemPointers. The number of items is returned * in *nitems. */ ItemPointer ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup, int *nitems) { Pointer ptr = GinGetPosting(itup); int nipd = GinGetNPosting(itup); ItemPointer ipd; int ndecoded; if (GinItupIsCompressed(itup)) { if (nipd > 0) { ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded); if (nipd != ndecoded) elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded", nipd, ndecoded); } else { ipd = palloc(0); } } else { ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd); memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd); } *nitems = nipd; return ipd; }
/* * Adds array of item pointers to tuple's posting list or * creates posting tree and tuple pointed to tree in a case * of not enough space. Max size of tuple is defined in * GinFormTuple(). */ static IndexTuple addItemPointersToTuple(Relation index, GinState *ginstate, GinBtreeStack *stack, IndexTuple old, ItemPointerData *items, uint32 nitem, bool isBuild) { bool isnull; Datum key = index_getattr(old, FirstOffsetNumber, ginstate->tupdesc, &isnull); IndexTuple res = GinFormTuple(ginstate, key, NULL, nitem + GinGetNPosting(old)); if (res) { /* good, small enough */ MergeItemPointers(GinGetPosting(res), GinGetPosting(old), GinGetNPosting(old), items, nitem ); GinSetNPosting(res, nitem + GinGetNPosting(old)); } else { BlockNumber postingRoot; GinPostingTreeScan *gdi; /* posting list becomes big, so we need to make posting's tree */ res = GinFormTuple(ginstate, key, NULL, 0); postingRoot = createPostingTree(index, GinGetPosting(old), GinGetNPosting(old)); GinSetPostingTree(res, postingRoot); gdi = prepareScanPostingTree(index, postingRoot, FALSE); gdi->btree.isBuild = isBuild; insertItemPointer(gdi, items, nitem); pfree(gdi); } return res; }
/* * Form a tuple for entry tree. * * On leaf pages, Index tuple has non-traditional layout. Tuple may contain * posting list or root blocknumber of posting tree. * Macros: GinIsPostingTree(itup) / GinSetPostingTree(itup, blkno) * 1) Posting list * - itup->t_info & INDEX_SIZE_MASK contains total size of tuple as usual * - ItemPointerGetBlockNumber(&itup->t_tid) contains original * size of tuple (without posting list). * Macros: GinGetOrigSizePosting(itup) / GinSetOrigSizePosting(itup,n) * - ItemPointerGetOffsetNumber(&itup->t_tid) contains number * of elements in posting list (number of heap itempointers) * Macros: GinGetNPosting(itup) / GinSetNPosting(itup,n) * - After standard part of tuple there is a posting list, ie, array * of heap itempointers * Macros: GinGetPosting(itup) * 2) Posting tree * - itup->t_info & INDEX_SIZE_MASK contains size of tuple as usual * - ItemPointerGetBlockNumber(&itup->t_tid) contains block number of * root of posting tree * - ItemPointerGetOffsetNumber(&itup->t_tid) contains magic number * GIN_TREE_POSTING, which distinguishes this from posting-list case * * Attributes of an index tuple are different for single and multicolumn index. * For single-column case, index tuple stores only value to be indexed. * For multicolumn case, it stores two attributes: column number of value * and value. */ IndexTuple GinFormTuple(GinState *ginstate, OffsetNumber attnum, Datum key, ItemPointerData *ipd, uint32 nipd) { bool isnull[2] = {FALSE, FALSE}; IndexTuple itup; if (ginstate->oneCol) itup = index_form_tuple(ginstate->origTupdesc, &key, isnull); else { Datum datums[2]; datums[0] = UInt16GetDatum(attnum); datums[1] = key; itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull); } GinSetOrigSizePosting(itup, IndexTupleSize(itup)); if (nipd > 0) { uint32 newsize = MAXALIGN(SHORTALIGN(IndexTupleSize(itup)) + sizeof(ItemPointerData) * nipd); if (newsize >= INDEX_SIZE_MASK) return NULL; if (newsize > TOAST_INDEX_TARGET && nipd > 1) return NULL; itup = repalloc(itup, newsize); /* set new size */ itup->t_info &= ~INDEX_SIZE_MASK; itup->t_info |= newsize; if (ipd) memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd); GinSetNPosting(itup, nipd); } else { GinSetNPosting(itup, 0); } return itup; }
/* * returns modified page or NULL if page isn't modified. * Function works with original page until first change is occurred, * then page is copied into temporary one. */ static Page ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint32 *nroot) { Page origpage = BufferGetPage(buffer), tmppage; OffsetNumber i, maxoff = PageGetMaxOffsetNumber(origpage); tmppage = origpage; *nroot = 0; for (i = FirstOffsetNumber; i <= maxoff; i++) { IndexTuple itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i)); if (GinIsPostingTree(itup)) { /* * store posting tree's roots for further processing, we can't * vacuum it just now due to risk of deadlocks with scans/inserts */ roots[*nroot] = GinGetDownlink(itup); (*nroot)++; } else if (GinGetNPosting(itup) > 0) { /* * if we already create temporary page, we will make changes in * place */ ItemPointerData *cleaned = (tmppage == origpage) ? NULL : GinGetPosting(itup); uint32 newN = ginVacuumPostingList(gvs, GinGetPosting(itup), GinGetNPosting(itup), &cleaned); if (GinGetNPosting(itup) != newN) { OffsetNumber attnum; Datum key; GinNullCategory category; /* * Some ItemPointers was deleted, so we should remake our * tuple */ if (tmppage == origpage) { /* * On first difference we create temporary page in memory * and copies content in to it. */ tmppage = PageGetTempPageCopy(origpage); if (newN > 0) { Size pos = ((char *) GinGetPosting(itup)) - ((char *) origpage); memcpy(tmppage + pos, cleaned, sizeof(ItemPointerData) * newN); } pfree(cleaned); /* set itup pointer to new page */ itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i)); } attnum = gintuple_get_attrnum(&gvs->ginstate, itup); key = gintuple_get_key(&gvs->ginstate, itup, &category); itup = GinFormTuple(&gvs->ginstate, attnum, key, category, GinGetPosting(itup), newN, true); PageIndexTupleDelete(tmppage, i); if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(gvs->index)); pfree(itup); } } } return (tmppage == origpage) ? NULL : tmppage; }
/* * Adds array of item pointers to tuple's posting list, or * creates posting tree and tuple pointing to tree in case * of not enough space. Max size of tuple is defined in * GinFormTuple(). Returns a new, modified index tuple. * items[] must be in sorted order with no duplicates. */ static IndexTuple addItemPointersToLeafTuple(GinState *ginstate, IndexTuple old, ItemPointerData *items, uint32 nitem, GinStatsData *buildStats) { OffsetNumber attnum; Datum key; GinNullCategory category; IndexTuple res; Assert(!GinIsPostingTree(old)); attnum = gintuple_get_attrnum(ginstate, old); key = gintuple_get_key(ginstate, old, &category); /* try to build tuple with room for all the items */ res = GinFormTuple(ginstate, attnum, key, category, NULL, nitem + GinGetNPosting(old), false); if (res) { /* good, small enough */ uint32 newnitem; /* fill in the posting list with union of old and new TIDs */ newnitem = ginMergeItemPointers(GinGetPosting(res), GinGetPosting(old), GinGetNPosting(old), items, nitem); /* merge might have eliminated some duplicate items */ GinShortenTuple(res, newnitem); } else { /* posting list would be too big, convert to posting tree */ BlockNumber postingRoot; /* * Initialize posting tree with the old tuple's posting list. It's * surely small enough to fit on one posting-tree page, and should * already be in order with no duplicates. */ postingRoot = createPostingTree(ginstate->index, GinGetPosting(old), GinGetNPosting(old), buildStats); /* Now insert the TIDs-to-be-added into the posting tree */ ginInsertItemPointers(ginstate->index, postingRoot, items, nitem, buildStats); /* And build a new posting-tree-only result tuple */ res = GinFormTuple(ginstate, attnum, key, category, NULL, 0, true); GinSetPostingTree(res, postingRoot); } return res; }
/* * returns modified page or NULL if page isn't modified. * Function works with original page until first change is occurred, * then page is copied into temporary one. */ static Page ginVacuumEntryPage(GinVacuumState *gvs, Buffer buffer, BlockNumber *roots, uint32 *nroot) { Page origpage = BufferGetPage(buffer), tmppage; OffsetNumber i, maxoff = PageGetMaxOffsetNumber(origpage); tmppage = origpage; *nroot = 0; for (i = FirstOffsetNumber; i <= maxoff; i++) { IndexTuple itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i)); if (GinIsPostingTree(itup)) { /* * store posting tree's roots for further processing, we can't * vacuum it just now due to risk of deadlocks with scans/inserts */ roots[*nroot] = GinGetDownlink(itup); (*nroot)++; } else if (GinGetNPosting(itup) > 0) { int nitems; ItemPointer items_orig; bool free_items_orig; ItemPointer items; /* Get list of item pointers from the tuple. */ if (GinItupIsCompressed(itup)) { items_orig = ginPostingListDecode((GinPostingList *) GinGetPosting(itup), &nitems); free_items_orig = true; } else { items_orig = (ItemPointer) GinGetPosting(itup); nitems = GinGetNPosting(itup); free_items_orig = false; } /* Remove any items from the list that need to be vacuumed. */ items = ginVacuumItemPointers(gvs, items_orig, nitems, &nitems); if (free_items_orig) pfree(items_orig); /* If any item pointers were removed, recreate the tuple. */ if (items) { OffsetNumber attnum; Datum key; GinNullCategory category; GinPostingList *plist; int plistsize; if (nitems > 0) { plist = ginCompressPostingList(items, nitems, GinMaxItemSize, NULL); plistsize = SizeOfGinPostingList(plist); } else { plist = NULL; plistsize = 0; } /* * if we already created a temporary page, make changes in * place */ if (tmppage == origpage) { /* * On first difference, create a temporary copy of the * page and copy the tuple's posting list to it. */ tmppage = PageGetTempPageCopy(origpage); /* set itup pointer to new page */ itup = (IndexTuple) PageGetItem(tmppage, PageGetItemId(tmppage, i)); } attnum = gintuple_get_attrnum(&gvs->ginstate, itup); key = gintuple_get_key(&gvs->ginstate, itup, &category); itup = GinFormTuple(&gvs->ginstate, attnum, key, category, (char *) plist, plistsize, nitems, true); if (plist) pfree(plist); PageIndexTupleDelete(tmppage, i); if (PageAddItem(tmppage, (Item) itup, IndexTupleSize(itup), i, false, false) != i) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(gvs->index)); pfree(itup); pfree(items); } } } return (tmppage == origpage) ? NULL : tmppage; }
/* * Form a tuple for entry tree. * * If the tuple would be too big to be stored, function throws a suitable * error if errorTooBig is TRUE, or returns NULL if errorTooBig is FALSE. * * See src/backend/access/gin/README for a description of the index tuple * format that is being built here. We build on the assumption that we * are making a leaf-level key entry containing a posting list of nipd items. * If the caller is actually trying to make a posting-tree entry, non-leaf * entry, or pending-list entry, it should pass dataSize = 0 and then overwrite * the t_tid fields as necessary. In any case, 'data' can be NULL to skip * filling in the posting list; the caller is responsible for filling it * afterwards if data = NULL and nipd > 0. */ IndexTuple GinFormTuple(GinState *ginstate, OffsetNumber attnum, Datum key, GinNullCategory category, Pointer data, Size dataSize, int nipd, bool errorTooBig) { Datum datums[2]; bool isnull[2]; IndexTuple itup; uint32 newsize; /* Build the basic tuple: optional column number, plus key datum */ if (ginstate->oneCol) { datums[0] = key; isnull[0] = (category != GIN_CAT_NORM_KEY); } else { datums[0] = UInt16GetDatum(attnum); isnull[0] = false; datums[1] = key; isnull[1] = (category != GIN_CAT_NORM_KEY); } itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull); /* * Determine and store offset to the posting list, making sure there is * room for the category byte if needed. * * Note: because index_form_tuple MAXALIGNs the tuple size, there may well * be some wasted pad space. Is it worth recomputing the data length to * prevent that? That would also allow us to Assert that the real data * doesn't overlap the GinNullCategory byte, which this code currently * takes on faith. */ newsize = IndexTupleSize(itup); if (IndexTupleHasNulls(itup)) { uint32 minsize; Assert(category != GIN_CAT_NORM_KEY); minsize = GinCategoryOffset(itup, ginstate) + sizeof(GinNullCategory); newsize = Max(newsize, minsize); } newsize = SHORTALIGN(newsize); GinSetPostingOffset(itup, newsize); GinSetNPosting(itup, nipd); /* * Add space needed for posting list, if any. Then check that the tuple * won't be too big to store. */ newsize += dataSize; newsize = MAXALIGN(newsize); if (newsize > GinMaxItemSize) { if (errorTooBig) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", (Size) newsize, (Size) GinMaxItemSize, RelationGetRelationName(ginstate->index)))); pfree(itup); return NULL; } /* * Resize tuple if needed */ if (newsize != IndexTupleSize(itup)) { itup = repalloc(itup, newsize); /* * PostgreSQL 9.3 and earlier did not clear this new space, so we * might find uninitialized padding when reading tuples from disk. */ memset((char *) itup + IndexTupleSize(itup), 0, newsize - IndexTupleSize(itup)); /* set new size in tuple header */ itup->t_info &= ~INDEX_SIZE_MASK; itup->t_info |= newsize; } /* * Copy in the posting list, if provided */ if (data) { char *ptr = GinGetPosting(itup); memcpy(ptr, data, dataSize); } /* * Insert category byte, if needed */ if (category != GIN_CAT_NORM_KEY) { Assert(IndexTupleHasNulls(itup)); GinSetNullCategory(itup, ginstate, category); } return itup; }
/* * Start* functions setup state of searches: find correct buffer and locks it, * Stop* functions unlock buffer (but don't release!) */ static void startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry) { GinBtreeData btreeEntry; GinBtreeStack *stackEntry; Page page; bool needUnlock = TRUE; if (entry->master != NULL) { entry->isFinished = entry->master->isFinished; return; } /* * We should find entry, and begin scan of posting tree * or just store posting list in memory */ prepareEntryScan(&btreeEntry, index, entry->entry, ginstate); btreeEntry.searchMode = TRUE; stackEntry = ginFindLeafPage(&btreeEntry, NULL); page = BufferGetPage(stackEntry->buffer); entry->isFinished = TRUE; entry->buffer = InvalidBuffer; entry->offset = InvalidOffsetNumber; entry->list = NULL; entry->nlist = 0; entry->reduceResult = FALSE; entry->predictNumberResult = 0; if (btreeEntry.findItem(&btreeEntry, stackEntry)) { IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off)); if (GinIsPostingTree(itup)) { BlockNumber rootPostingTree = GinGetPostingTree(itup); GinPostingTreeScan *gdi; Page page; LockBuffer(stackEntry->buffer, GIN_UNLOCK); needUnlock = FALSE; gdi = prepareScanPostingTree(index, rootPostingTree, TRUE); entry->buffer = scanBeginPostingTree(gdi); /* * We keep buffer pinned because we need to prevent deletition * page during scan. See GIN's vacuum implementation. RefCount * is increased to keep buffer pinned after freeGinBtreeStack() call. */ IncrBufferRefCount(entry->buffer); page = BufferGetPage(entry->buffer); entry->predictNumberResult = gdi->stack->predictNumber * GinPageGetOpaque(page)->maxoff; /* * Keep page content in memory to prevent durable page locking */ entry->list = (ItemPointerData *) palloc( BLCKSZ ); entry->nlist = GinPageGetOpaque(page)->maxoff; memcpy( entry->list, GinDataPageGetItem(page, FirstOffsetNumber), GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData) ); LockBuffer(entry->buffer, GIN_UNLOCK); freeGinBtreeStack(gdi->stack); pfree(gdi); entry->isFinished = FALSE; } else if (GinGetNPosting(itup) > 0) { entry->nlist = GinGetNPosting(itup); entry->list = (ItemPointerData *) palloc(sizeof(ItemPointerData) * entry->nlist); memcpy(entry->list, GinGetPosting(itup), sizeof(ItemPointerData) * entry->nlist); entry->isFinished = FALSE; } } if (needUnlock) LockBuffer(stackEntry->buffer, GIN_UNLOCK); freeGinBtreeStack(stackEntry); }
/* * Form a tuple for entry tree. * * If the tuple would be too big to be stored, function throws a suitable * error if errorTooBig is TRUE, or returns NULL if errorTooBig is FALSE. * * On leaf pages, Index tuple has non-traditional layout. Tuple may contain * posting list or root blocknumber of posting tree. * Macros: GinIsPostingTree(itup) / GinSetPostingTree(itup, blkno) * 1) Posting list * - itup->t_info & INDEX_SIZE_MASK contains total size of tuple as usual * - ItemPointerGetBlockNumber(&itup->t_tid) contains original * size of tuple (without posting list). * Macros: GinGetOrigSizePosting(itup) / GinSetOrigSizePosting(itup,n) * - ItemPointerGetOffsetNumber(&itup->t_tid) contains number * of elements in posting list (number of heap itempointers) * Macros: GinGetNPosting(itup) / GinSetNPosting(itup,n) * - After standard part of tuple there is a posting list, ie, array * of heap itempointers * Macros: GinGetPosting(itup) * 2) Posting tree * - itup->t_info & INDEX_SIZE_MASK contains size of tuple as usual * - ItemPointerGetBlockNumber(&itup->t_tid) contains block number of * root of posting tree * - ItemPointerGetOffsetNumber(&itup->t_tid) contains magic number * GIN_TREE_POSTING, which distinguishes this from posting-list case * * Attributes of an index tuple are different for single and multicolumn index. * For single-column case, index tuple stores only value to be indexed. * For multicolumn case, it stores two attributes: column number of value * and value. */ IndexTuple GinFormTuple(Relation index, GinState *ginstate, OffsetNumber attnum, Datum key, ItemPointerData *ipd, uint32 nipd, bool errorTooBig) { bool isnull[2] = {FALSE, FALSE}; IndexTuple itup; uint32 newsize; if (ginstate->oneCol) itup = index_form_tuple(ginstate->origTupdesc, &key, isnull); else { Datum datums[2]; datums[0] = UInt16GetDatum(attnum); datums[1] = key; itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull); } GinSetOrigSizePosting(itup, IndexTupleSize(itup)); if (nipd > 0) { newsize = MAXALIGN(SHORTALIGN(IndexTupleSize(itup)) + sizeof(ItemPointerData) * nipd); if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize)) { if (errorTooBig) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) newsize, (unsigned long) Min(INDEX_SIZE_MASK, GinMaxItemSize), RelationGetRelationName(index)))); return NULL; } itup = repalloc(itup, newsize); /* set new size */ itup->t_info &= ~INDEX_SIZE_MASK; itup->t_info |= newsize; if (ipd) memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd); GinSetNPosting(itup, nipd); } else { /* * Gin tuple without any ItemPointers should be large enough to keep * one ItemPointer, to prevent inconsistency between * ginHeapTupleFastCollect and ginEntryInsert called by * ginHeapTupleInsert. ginHeapTupleFastCollect forms tuple without * extra pointer to heap, but ginEntryInsert (called for pending list * cleanup during vacuum) will form the same tuple with one * ItemPointer. */ newsize = MAXALIGN(SHORTALIGN(IndexTupleSize(itup)) + sizeof(ItemPointerData)); if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize)) { if (errorTooBig) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) newsize, (unsigned long) Min(INDEX_SIZE_MASK, GinMaxItemSize), RelationGetRelationName(index)))); return NULL; } GinSetNPosting(itup, 0); } return itup; }