/* * Given two deformed tuples, adjust the first one so that it's consistent * with the summary values in both. */ static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b) { int keyno; BrinMemTuple *db; MemoryContext cxt; MemoryContext oldcxt; /* Use our own memory context to avoid retail pfree */ cxt = AllocSetContextCreate(CurrentMemoryContext, "brin union", ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(cxt); db = brin_deform_tuple(bdesc, b); MemoryContextSwitchTo(oldcxt); for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { FmgrInfo *unionFn; BrinValues *col_a = &a->bt_columns[keyno]; BrinValues *col_b = &db->bt_columns[keyno]; unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1, BRIN_PROCNUM_UNION); FunctionCall3Coll(unionFn, bdesc->bd_index->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(col_a), PointerGetDatum(col_b)); } MemoryContextDelete(cxt); }
/* * Extract all item values from a BRIN index page * * Usage: SELECT * FROM brin_page_items(get_raw_page('idx', 1), 'idx'::regclass); */ Datum brin_page_items(PG_FUNCTION_ARGS) { bytea *raw_page = PG_GETARG_BYTEA_P(0); Oid indexRelid = PG_GETARG_OID(1); ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; TupleDesc tupdesc; MemoryContext oldcontext; Tuplestorestate *tupstore; Relation indexRel; brin_column_state **columns; BrinDesc *bdesc; BrinMemTuple *dtup; Page page; OffsetNumber offset; AttrNumber attno; bool unusedItem; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use raw page functions")))); /* check to see if caller supports us returning a tuplestore */ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); if (!(rsinfo->allowedModes & SFRM_Materialize) || rsinfo->expectedDesc == NULL) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("materialize mode required, but it is not allowed in this context"))); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); /* Build tuplestore to hold the result rows */ oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory); tupstore = tuplestore_begin_heap(true, false, work_mem); rsinfo->returnMode = SFRM_Materialize; rsinfo->setResult = tupstore; rsinfo->setDesc = tupdesc; MemoryContextSwitchTo(oldcontext); indexRel = index_open(indexRelid, AccessShareLock); bdesc = brin_build_desc(indexRel); /* minimally verify the page we got */ page = verify_brin_page(raw_page, BRIN_PAGETYPE_REGULAR, "regular"); /* * Initialize output functions for all indexed datatypes; simplifies * calling them later. */ columns = palloc(sizeof(brin_column_state *) * RelationGetDescr(indexRel)->natts); for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++) { Oid output; bool isVarlena; BrinOpcInfo *opcinfo; int i; brin_column_state *column; opcinfo = bdesc->bd_info[attno - 1]; column = palloc(offsetof(brin_column_state, outputFn) + sizeof(FmgrInfo) * opcinfo->oi_nstored); column->nstored = opcinfo->oi_nstored; for (i = 0; i < opcinfo->oi_nstored; i++) { getTypeOutputInfo(opcinfo->oi_typcache[i]->type_id, &output, &isVarlena); fmgr_info(output, &column->outputFn[i]); } columns[attno - 1] = column; } offset = FirstOffsetNumber; unusedItem = false; dtup = NULL; for (;;) { Datum values[7]; bool nulls[7]; /* * This loop is called once for every attribute of every tuple in the * page. At the start of a tuple, we get a NULL dtup; that's our * signal for obtaining and decoding the next one. If that's not the * case, we output the next attribute. */ if (dtup == NULL) { ItemId itemId; /* verify item status: if there's no data, we can't decode */ itemId = PageGetItemId(page, offset); if (ItemIdIsUsed(itemId)) { dtup = brin_deform_tuple(bdesc, (BrinTuple *) PageGetItem(page, itemId)); attno = 1; unusedItem = false; } else unusedItem = true; } else attno++; MemSet(nulls, 0, sizeof(nulls)); if (unusedItem) { values[0] = UInt16GetDatum(offset); nulls[1] = true; nulls[2] = true; nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; } else { int att = attno - 1; values[0] = UInt16GetDatum(offset); values[1] = UInt32GetDatum(dtup->bt_blkno); values[2] = UInt16GetDatum(attno); values[3] = BoolGetDatum(dtup->bt_columns[att].bv_allnulls); values[4] = BoolGetDatum(dtup->bt_columns[att].bv_hasnulls); values[5] = BoolGetDatum(dtup->bt_placeholder); if (!dtup->bt_columns[att].bv_allnulls) { BrinValues *bvalues = &dtup->bt_columns[att]; StringInfoData s; bool first; int i; initStringInfo(&s); appendStringInfoChar(&s, '{'); first = true; for (i = 0; i < columns[att]->nstored; i++) { char *val; if (!first) appendStringInfoString(&s, " .. "); first = false; val = OutputFunctionCall(&columns[att]->outputFn[i], bvalues->bv_values[i]); appendStringInfoString(&s, val); pfree(val); } appendStringInfoChar(&s, '}'); values[6] = CStringGetTextDatum(s.data); pfree(s.data); } else { nulls[6] = true; } } tuplestore_putvalues(tupstore, tupdesc, values, nulls); /* * If the item was unused, jump straight to the next one; otherwise, * the only cleanup needed here is to set our signal to go to the next * tuple in the following iteration, by freeing the current one. */ if (unusedItem) offset = OffsetNumberNext(offset); else if (attno >= bdesc->bd_tupdesc->natts) { pfree(dtup); dtup = NULL; offset = OffsetNumberNext(offset); } /* * If we're beyond the end of the page, we're done. */ if (offset > PageGetMaxOffsetNumber(page)) break; } /* clean up and return the tuplestore */ brin_free_desc(bdesc); tuplestore_donestoring(tupstore); index_close(indexRel, AccessShareLock); return (Datum) 0; }
/* * Execute the index scan. * * This works by reading index TIDs from the revmap, and obtaining the index * tuples pointed to by them; the summary values in the index tuples are * compared to the scan keys. We return into the TID bitmap all the pages in * ranges corresponding to index tuples that match the scan keys. * * If a TID from the revmap is read as InvalidTID, we know that range is * unsummarized. Pages in those ranges need to be returned regardless of scan * keys. */ int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { Relation idxRel = scan->indexRelation; Buffer buf = InvalidBuffer; BrinDesc *bdesc; Oid heapOid; Relation heapRel; BrinOpaque *opaque; BlockNumber nblocks; BlockNumber heapBlk; int totalpages = 0; FmgrInfo *consistentFn; MemoryContext oldcxt; MemoryContext perRangeCxt; opaque = (BrinOpaque *) scan->opaque; bdesc = opaque->bo_bdesc; pgstat_count_index_scan(idxRel); /* * We need to know the size of the table so that we know how long to * iterate on the revmap. */ heapOid = IndexGetRelation(RelationGetRelid(idxRel), false); heapRel = heap_open(heapOid, AccessShareLock); nblocks = RelationGetNumberOfBlocks(heapRel); heap_close(heapRel, AccessShareLock); /* * Make room for the consistent support procedures of indexed columns. We * don't look them up here; we do that lazily the first time we see a scan * key reference each of them. We rely on zeroing fn_oid to InvalidOid. */ consistentFn = palloc0(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts); /* * Setup and use a per-range memory context, which is reset every time we * loop below. This avoids having to free the tuples within the loop. */ perRangeCxt = AllocSetContextCreate(CurrentMemoryContext, "bringetbitmap cxt", ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(perRangeCxt); /* * Now scan the revmap. We start by querying for heap page 0, * incrementing by the number of pages per range; this gives us a full * view of the table. */ for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange) { bool addrange; BrinTuple *tup; OffsetNumber off; Size size; CHECK_FOR_INTERRUPTS(); MemoryContextResetAndDeleteChildren(perRangeCxt); tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf, &off, &size, BUFFER_LOCK_SHARE, scan->xs_snapshot); if (tup) { tup = brin_copy_tuple(tup, size); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } /* * For page ranges with no indexed tuple, we must return the whole * range; otherwise, compare it to the scan keys. */ if (tup == NULL) { addrange = true; } else { BrinMemTuple *dtup; dtup = brin_deform_tuple(bdesc, tup); if (dtup->bt_placeholder) { /* * Placeholder tuples are always returned, regardless of the * values stored in them. */ addrange = true; } else { int keyno; /* * Compare scan keys with summary values stored for the range. * If scan keys are matched, the page range must be added to * the bitmap. We initially assume the range needs to be * added; in particular this serves the case where there are * no keys. */ addrange = true; for (keyno = 0; keyno < scan->numberOfKeys; keyno++) { ScanKey key = &scan->keyData[keyno]; AttrNumber keyattno = key->sk_attno; BrinValues *bval = &dtup->bt_columns[keyattno - 1]; Datum add; /* * The collation of the scan key must match the collation * used in the index column (but only if the search is not * IS NULL/ IS NOT NULL). Otherwise we shouldn't be using * this index ... */ Assert((key->sk_flags & SK_ISNULL) || (key->sk_collation == bdesc->bd_tupdesc->attrs[keyattno - 1]->attcollation)); /* First time this column? look up consistent function */ if (consistentFn[keyattno - 1].fn_oid == InvalidOid) { FmgrInfo *tmp; tmp = index_getprocinfo(idxRel, keyattno, BRIN_PROCNUM_CONSISTENT); fmgr_info_copy(&consistentFn[keyattno - 1], tmp, CurrentMemoryContext); } /* * Check whether the scan key is consistent with the page * range values; if so, have the pages in the range added * to the output bitmap. * * When there are multiple scan keys, failure to meet the * criteria for a single one of them is enough to discard * the range as a whole, so break out of the loop as soon * as a false return value is obtained. */ add = FunctionCall3Coll(&consistentFn[keyattno - 1], key->sk_collation, PointerGetDatum(bdesc), PointerGetDatum(bval), PointerGetDatum(key)); addrange = DatumGetBool(add); if (!addrange) break; } } } /* add the pages in the range to the output bitmap, if needed */ if (addrange) { BlockNumber pageno; for (pageno = heapBlk; pageno <= heapBlk + opaque->bo_pagesPerRange - 1; pageno++) { MemoryContextSwitchTo(oldcxt); tbm_add_page(tbm, pageno); totalpages++; MemoryContextSwitchTo(perRangeCxt); } } } MemoryContextSwitchTo(oldcxt); MemoryContextDelete(perRangeCxt); if (buf != InvalidBuffer) ReleaseBuffer(buf); /* * XXX We have an approximation of the number of *pages* that our scan * returns, but we don't have a precise idea of the number of heap tuples * involved. */ return totalpages * 10; }
/* * A tuple in the heap is being inserted. To keep a brin index up to date, * we need to obtain the relevant index tuple and compare its stored values * with those of the new tuple. If the tuple values are not consistent with * the summary tuple, we need to update the index tuple. * * If the range is not currently summarized (i.e. the revmap returns NULL for * it), there's nothing to do. */ bool brininsert(Relation idxRel, Datum *values, bool *nulls, ItemPointer heaptid, Relation heapRel, IndexUniqueCheck checkUnique) { BlockNumber pagesPerRange; BrinDesc *bdesc = NULL; BrinRevmap *revmap; Buffer buf = InvalidBuffer; MemoryContext tupcxt = NULL; MemoryContext oldcxt = NULL; revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL); for (;;) { bool need_insert = false; OffsetNumber off; BrinTuple *brtup; BrinMemTuple *dtup; BlockNumber heapBlk; int keyno; CHECK_FOR_INTERRUPTS(); heapBlk = ItemPointerGetBlockNumber(heaptid); /* normalize the block number to be the first block in the range */ heapBlk = (heapBlk / pagesPerRange) * pagesPerRange; brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, BUFFER_LOCK_SHARE, NULL); /* if range is unsummarized, there's nothing to do */ if (!brtup) break; /* First time through? */ if (bdesc == NULL) { bdesc = brin_build_desc(idxRel); tupcxt = AllocSetContextCreate(CurrentMemoryContext, "brininsert cxt", ALLOCSET_DEFAULT_SIZES); oldcxt = MemoryContextSwitchTo(tupcxt); } dtup = brin_deform_tuple(bdesc, brtup); /* * Compare the key values of the new tuple to the stored index values; * our deformed tuple will get updated if the new tuple doesn't fit * the original range (note this means we can't break out of the loop * early). Make a note of whether this happens, so that we know to * insert the modified tuple later. */ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { Datum result; BrinValues *bval; FmgrInfo *addValue; bval = &dtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); result = FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); /* if that returned true, we need to insert the updated tuple */ need_insert |= DatumGetBool(result); } if (!need_insert) { /* * The tuple is consistent with the new values, so there's nothing * to do. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else { Page page = BufferGetPage(buf); ItemId lp = PageGetItemId(page, off); Size origsz; BrinTuple *origtup; Size newsz; BrinTuple *newtup; bool samepage; /* * Make a copy of the old tuple, so that we can compare it after * re-acquiring the lock. */ origsz = ItemIdGetLength(lp); origtup = brin_copy_tuple(brtup, origsz); /* * Before releasing the lock, check if we can attempt a same-page * update. Another process could insert a tuple concurrently in * the same page though, so downstream we must be prepared to cope * if this turns out to not be possible after all. */ newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); samepage = brin_can_do_samepage_update(buf, origsz, newsz); LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* * Try to update the tuple. If this doesn't work for whatever * reason, we need to restart from the top; the revmap might be * pointing at a different tuple for this block now, so we need to * recompute to ensure both our new heap tuple and the other * inserter's are covered by the combined tuple. It might be that * we don't need to update at all. */ if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, buf, off, origtup, origsz, newtup, newsz, samepage)) { /* no luck; start over */ MemoryContextResetAndDeleteChildren(tupcxt); continue; } } /* success! */ break; } brinRevmapTerminate(revmap); if (BufferIsValid(buf)) ReleaseBuffer(buf); if (bdesc != NULL) { brin_free_desc(bdesc); MemoryContextSwitchTo(oldcxt); MemoryContextDelete(tupcxt); } return false; }
/* * A tuple in the heap is being inserted. To keep a brin index up to date, * we need to obtain the relevant index tuple and compare its stored values * with those of the new tuple. If the tuple values are not consistent with * the summary tuple, we need to update the index tuple. * * If the range is not currently summarized (i.e. the revmap returns NULL for * it), there's nothing to do. */ Datum brininsert(PG_FUNCTION_ARGS) { Relation idxRel = (Relation) PG_GETARG_POINTER(0); Datum *values = (Datum *) PG_GETARG_POINTER(1); bool *nulls = (bool *) PG_GETARG_POINTER(2); ItemPointer heaptid = (ItemPointer) PG_GETARG_POINTER(3); /* we ignore the rest of our arguments */ BlockNumber pagesPerRange; BrinDesc *bdesc = NULL; BrinRevmap *revmap; Buffer buf = InvalidBuffer; MemoryContext tupcxt = NULL; MemoryContext oldcxt = NULL; revmap = brinRevmapInitialize(idxRel, &pagesPerRange); for (;;) { bool need_insert = false; OffsetNumber off; BrinTuple *brtup; BrinMemTuple *dtup; BlockNumber heapBlk; int keyno; #ifdef USE_ASSERT_CHECKING BrinTuple *tmptup; BrinMemTuple *tmpdtup; Size tmpsiz; #endif CHECK_FOR_INTERRUPTS(); heapBlk = ItemPointerGetBlockNumber(heaptid); /* normalize the block number to be the first block in the range */ heapBlk = (heapBlk / pagesPerRange) * pagesPerRange; brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, BUFFER_LOCK_SHARE); /* if range is unsummarized, there's nothing to do */ if (!brtup) break; /* First time through? */ if (bdesc == NULL) { bdesc = brin_build_desc(idxRel); tupcxt = AllocSetContextCreate(CurrentMemoryContext, "brininsert cxt", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); oldcxt = MemoryContextSwitchTo(tupcxt); } dtup = brin_deform_tuple(bdesc, brtup); #ifdef USE_ASSERT_CHECKING { /* * When assertions are enabled, we use this as an opportunity to * test the "union" method, which would otherwise be used very * rarely: first create a placeholder tuple, and addValue the * value we just got into it. Then union the existing index tuple * with the updated placeholder tuple. The tuple resulting from * that union should be identical to the one resulting from the * regular operation (straight addValue) below. * * Here we create the tuple to compare with; the actual comparison * is below. */ tmptup = brin_form_placeholder_tuple(bdesc, heapBlk, &tmpsiz); tmpdtup = brin_deform_tuple(bdesc, tmptup); for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { BrinValues *bval; FmgrInfo *addValue; bval = &tmpdtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); } union_tuples(bdesc, tmpdtup, brtup); tmpdtup->bt_placeholder = dtup->bt_placeholder; tmptup = brin_form_tuple(bdesc, heapBlk, tmpdtup, &tmpsiz); } #endif /* * Compare the key values of the new tuple to the stored index values; * our deformed tuple will get updated if the new tuple doesn't fit * the original range (note this means we can't break out of the loop * early). Make a note of whether this happens, so that we know to * insert the modified tuple later. */ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { Datum result; BrinValues *bval; FmgrInfo *addValue; bval = &dtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); result = FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); /* if that returned true, we need to insert the updated tuple */ need_insert |= DatumGetBool(result); } #ifdef USE_ASSERT_CHECKING { /* * Now we can compare the tuple produced by the union function * with the one from plain addValue. */ BrinTuple *cmptup; Size cmpsz; cmptup = brin_form_tuple(bdesc, heapBlk, dtup, &cmpsz); Assert(brin_tuples_equal(tmptup, tmpsiz, cmptup, cmpsz)); } #endif if (!need_insert) { /* * The tuple is consistent with the new values, so there's nothing * to do. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else { Page page = BufferGetPage(buf); ItemId lp = PageGetItemId(page, off); Size origsz; BrinTuple *origtup; Size newsz; BrinTuple *newtup; bool samepage; /* * Make a copy of the old tuple, so that we can compare it after * re-acquiring the lock. */ origsz = ItemIdGetLength(lp); origtup = brin_copy_tuple(brtup, origsz); /* * Before releasing the lock, check if we can attempt a same-page * update. Another process could insert a tuple concurrently in * the same page though, so downstream we must be prepared to cope * if this turns out to not be possible after all. */ newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); samepage = brin_can_do_samepage_update(buf, origsz, newsz); LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* * Try to update the tuple. If this doesn't work for whatever * reason, we need to restart from the top; the revmap might be * pointing at a different tuple for this block now, so we need to * recompute to ensure both our new heap tuple and the other * inserter's are covered by the combined tuple. It might be that * we don't need to update at all. */ if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, buf, off, origtup, origsz, newtup, newsz, samepage)) { /* no luck; start over */ MemoryContextResetAndDeleteChildren(tupcxt); continue; } } /* success! */ break; } brinRevmapTerminate(revmap); if (BufferIsValid(buf)) ReleaseBuffer(buf); if (bdesc != NULL) { brin_free_desc(bdesc); MemoryContextSwitchTo(oldcxt); MemoryContextDelete(tupcxt); } return BoolGetDatum(false); }
/* * Extract all item values from a BRIN index page * * Usage: SELECT * FROM brin_page_items(get_raw_page('idx', 1), 'idx'::regclass); */ Datum brin_page_items(PG_FUNCTION_ARGS) { brin_page_state *state; FuncCallContext *fctx; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use raw page functions")))); if (SRF_IS_FIRSTCALL()) { bytea *raw_page = PG_GETARG_BYTEA_P(0); Oid indexRelid = PG_GETARG_OID(1); Page page; TupleDesc tupdesc; MemoryContext mctx; Relation indexRel; AttrNumber attno; /* minimally verify the page we got */ page = verify_brin_page(raw_page, BRIN_PAGETYPE_REGULAR, "regular"); /* create a function context for cross-call persistence */ fctx = SRF_FIRSTCALL_INIT(); /* switch to memory context appropriate for multiple function calls */ mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); indexRel = index_open(indexRelid, AccessShareLock); state = palloc(offsetof(brin_page_state, columns) + sizeof(brin_column_state) * RelationGetDescr(indexRel)->natts); state->bdesc = brin_build_desc(indexRel); state->page = page; state->offset = FirstOffsetNumber; state->unusedItem = false; state->done = false; state->dtup = NULL; /* * Initialize output functions for all indexed datatypes; simplifies * calling them later. */ for (attno = 1; attno <= state->bdesc->bd_tupdesc->natts; attno++) { Oid output; bool isVarlena; BrinOpcInfo *opcinfo; int i; brin_column_state *column; opcinfo = state->bdesc->bd_info[attno - 1]; column = palloc(offsetof(brin_column_state, outputFn) + sizeof(FmgrInfo) * opcinfo->oi_nstored); column->nstored = opcinfo->oi_nstored; for (i = 0; i < opcinfo->oi_nstored; i++) { getTypeOutputInfo(opcinfo->oi_typcache[i]->type_id, &output, &isVarlena); fmgr_info(output, &column->outputFn[i]); } state->columns[attno - 1] = column; } index_close(indexRel, AccessShareLock); fctx->user_fctx = state; fctx->tuple_desc = BlessTupleDesc(tupdesc); MemoryContextSwitchTo(mctx); } fctx = SRF_PERCALL_SETUP(); state = fctx->user_fctx; if (!state->done) { HeapTuple result; Datum values[7]; bool nulls[7]; /* * This loop is called once for every attribute of every tuple in the * page. At the start of a tuple, we get a NULL dtup; that's our * signal for obtaining and decoding the next one. If that's not the * case, we output the next attribute. */ if (state->dtup == NULL) { BrinTuple *tup; MemoryContext mctx; ItemId itemId; /* deformed tuple must live across calls */ mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); /* verify item status: if there's no data, we can't decode */ itemId = PageGetItemId(state->page, state->offset); if (ItemIdIsUsed(itemId)) { tup = (BrinTuple *) PageGetItem(state->page, PageGetItemId(state->page, state->offset)); state->dtup = brin_deform_tuple(state->bdesc, tup); state->attno = 1; state->unusedItem = false; } else state->unusedItem = true; MemoryContextSwitchTo(mctx); } else state->attno++; MemSet(nulls, 0, sizeof(nulls)); if (state->unusedItem) { values[0] = UInt16GetDatum(state->offset); nulls[1] = true; nulls[2] = true; nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; } else { int att = state->attno - 1; values[0] = UInt16GetDatum(state->offset); values[1] = UInt32GetDatum(state->dtup->bt_blkno); values[2] = UInt16GetDatum(state->attno); values[3] = BoolGetDatum(state->dtup->bt_columns[att].bv_allnulls); values[4] = BoolGetDatum(state->dtup->bt_columns[att].bv_hasnulls); values[5] = BoolGetDatum(state->dtup->bt_placeholder); if (!state->dtup->bt_columns[att].bv_allnulls) { BrinValues *bvalues = &state->dtup->bt_columns[att]; StringInfoData s; bool first; int i; initStringInfo(&s); appendStringInfoChar(&s, '{'); first = true; for (i = 0; i < state->columns[att]->nstored; i++) { char *val; if (!first) appendStringInfoString(&s, " .. "); first = false; val = OutputFunctionCall(&state->columns[att]->outputFn[i], bvalues->bv_values[i]); appendStringInfoString(&s, val); pfree(val); } appendStringInfoChar(&s, '}'); values[6] = CStringGetTextDatum(s.data); pfree(s.data); } else { nulls[6] = true; } } result = heap_form_tuple(fctx->tuple_desc, values, nulls); /* * If the item was unused, jump straight to the next one; otherwise, * the only cleanup needed here is to set our signal to go to the next * tuple in the following iteration, by freeing the current one. */ if (state->unusedItem) state->offset = OffsetNumberNext(state->offset); else if (state->attno >= state->bdesc->bd_tupdesc->natts) { pfree(state->dtup); state->dtup = NULL; state->offset = OffsetNumberNext(state->offset); } /* * If we're beyond the end of the page, set flag to end the function * in the following iteration. */ if (state->offset > PageGetMaxOffsetNumber(state->page)) state->done = true; SRF_RETURN_NEXT(fctx, HeapTupleGetDatum(result)); } brin_free_desc(state->bdesc); SRF_RETURN_DONE(fctx); }
/* * A tuple in the heap is being inserted. To keep a brin index up to date, * we need to obtain the relevant index tuple and compare its stored values * with those of the new tuple. If the tuple values are not consistent with * the summary tuple, we need to update the index tuple. * * If autosummarization is enabled, check if we need to summarize the previous * page range. * * If the range is not currently summarized (i.e. the revmap returns NULL for * it), there's nothing to do for this tuple. */ bool brininsert(Relation idxRel, Datum *values, bool *nulls, ItemPointer heaptid, Relation heapRel, IndexUniqueCheck checkUnique, IndexInfo *indexInfo) { BlockNumber pagesPerRange; BlockNumber origHeapBlk; BlockNumber heapBlk; BrinDesc *bdesc = (BrinDesc *) indexInfo->ii_AmCache; BrinRevmap *revmap; Buffer buf = InvalidBuffer; MemoryContext tupcxt = NULL; MemoryContext oldcxt = CurrentMemoryContext; bool autosummarize = BrinGetAutoSummarize(idxRel); revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL); /* * origHeapBlk is the block number where the insertion occurred. heapBlk * is the first block in the corresponding page range. */ origHeapBlk = ItemPointerGetBlockNumber(heaptid); heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange; for (;;) { bool need_insert = false; OffsetNumber off; BrinTuple *brtup; BrinMemTuple *dtup; int keyno; CHECK_FOR_INTERRUPTS(); /* * If auto-summarization is enabled and we just inserted the first * tuple into the first block of a new non-first page range, request a * summarization run of the previous range. */ if (autosummarize && heapBlk > 0 && heapBlk == origHeapBlk && ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber) { BlockNumber lastPageRange = heapBlk - 1; BrinTuple *lastPageTuple; lastPageTuple = brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off, NULL, BUFFER_LOCK_SHARE, NULL); if (!lastPageTuple) { bool recorded; recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange, RelationGetRelid(idxRel), lastPageRange); if (!recorded) ereport(LOG, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded", RelationGetRelationName(idxRel), lastPageRange))); } else LockBuffer(buf, BUFFER_LOCK_UNLOCK); } brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, BUFFER_LOCK_SHARE, NULL); /* if range is unsummarized, there's nothing to do */ if (!brtup) break; /* First time through in this statement? */ if (bdesc == NULL) { MemoryContextSwitchTo(indexInfo->ii_Context); bdesc = brin_build_desc(idxRel); indexInfo->ii_AmCache = (void *) bdesc; MemoryContextSwitchTo(oldcxt); } /* First time through in this brininsert call? */ if (tupcxt == NULL) { tupcxt = AllocSetContextCreate(CurrentMemoryContext, "brininsert cxt", ALLOCSET_DEFAULT_SIZES); MemoryContextSwitchTo(tupcxt); } dtup = brin_deform_tuple(bdesc, brtup, NULL); /* * Compare the key values of the new tuple to the stored index values; * our deformed tuple will get updated if the new tuple doesn't fit * the original range (note this means we can't break out of the loop * early). Make a note of whether this happens, so that we know to * insert the modified tuple later. */ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { Datum result; BrinValues *bval; FmgrInfo *addValue; bval = &dtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); result = FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); /* if that returned true, we need to insert the updated tuple */ need_insert |= DatumGetBool(result); } if (!need_insert) { /* * The tuple is consistent with the new values, so there's nothing * to do. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else { Page page = BufferGetPage(buf); ItemId lp = PageGetItemId(page, off); Size origsz; BrinTuple *origtup; Size newsz; BrinTuple *newtup; bool samepage; /* * Make a copy of the old tuple, so that we can compare it after * re-acquiring the lock. */ origsz = ItemIdGetLength(lp); origtup = brin_copy_tuple(brtup, origsz, NULL, NULL); /* * Before releasing the lock, check if we can attempt a same-page * update. Another process could insert a tuple concurrently in * the same page though, so downstream we must be prepared to cope * if this turns out to not be possible after all. */ newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); samepage = brin_can_do_samepage_update(buf, origsz, newsz); LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* * Try to update the tuple. If this doesn't work for whatever * reason, we need to restart from the top; the revmap might be * pointing at a different tuple for this block now, so we need to * recompute to ensure both our new heap tuple and the other * inserter's are covered by the combined tuple. It might be that * we don't need to update at all. */ if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, buf, off, origtup, origsz, newtup, newsz, samepage)) { /* no luck; start over */ MemoryContextResetAndDeleteChildren(tupcxt); continue; } } /* success! */ break; } brinRevmapTerminate(revmap); if (BufferIsValid(buf)) ReleaseBuffer(buf); MemoryContextSwitchTo(oldcxt); if (tupcxt != NULL) MemoryContextDelete(tupcxt); return false; }