Datum
ltree_in(PG_FUNCTION_ARGS)
{
	char	   *buf = (char *) PG_GETARG_POINTER(0);
	char	   *ptr;
	nodeitem   *list,
			   *lptr;
	int			num = 0,
				totallen = 0;
	int			state = LTPRS_WAITNAME;
	ltree	   *result;
	ltree_level *curlevel;
	int			charlen;
	int			pos = 0;

	ptr = buf;
	while (*ptr)
	{
		charlen = pg_mblen(ptr);
		if (charlen == 1 && t_iseq(ptr, '.'))
			num++;
		ptr += charlen;
	}

	list = lptr = (nodeitem *) palloc(sizeof(nodeitem) * (num + 1));
	ptr = buf;
	while (*ptr)
	{
		charlen = pg_mblen(ptr);

		if (state == LTPRS_WAITNAME)
		{
			if (ISALNUM(ptr))
			{
				lptr->start = ptr;
				lptr->wlen = 0;
				state = LTPRS_WAITDELIM;
			}
			else
				UNCHAR;
		}
		else if (state == LTPRS_WAITDELIM)
		{
			if (charlen == 1 && t_iseq(ptr, '.'))
			{
				lptr->len = ptr - lptr->start;
				if (lptr->wlen > 255)
					ereport(ERROR,
							(errcode(ERRCODE_NAME_TOO_LONG),
							 errmsg("name of level is too long"),
							 errdetail("Name length is %d, must "
									   "be < 256, in position %d.",
									   lptr->wlen, pos)));

				totallen += MAXALIGN(lptr->len + LEVEL_HDRSIZE);
				lptr++;
				state = LTPRS_WAITNAME;
			}
			else if (!ISALNUM(ptr))
				UNCHAR;
		}
		else
			/* internal error */
			elog(ERROR, "internal error in parser");

		ptr += charlen;
		lptr->wlen++;
		pos++;
	}

	if (state == LTPRS_WAITDELIM)
	{
		lptr->len = ptr - lptr->start;
		if (lptr->wlen > 255)
			ereport(ERROR,
					(errcode(ERRCODE_NAME_TOO_LONG),
					 errmsg("name of level is too long"),
					 errdetail("Name length is %d, must "
							   "be < 256, in position %d.",
							   lptr->wlen, pos)));

		totallen += MAXALIGN(lptr->len + LEVEL_HDRSIZE);
		lptr++;
	}
	else if (!(state == LTPRS_WAITNAME && lptr == list))
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 errmsg("syntax error"),
				 errdetail("Unexpected end of line.")));

	result = (ltree *) palloc0(LTREE_HDRSIZE + totallen);
	SET_VARSIZE(result, LTREE_HDRSIZE + totallen);
	result->numlevel = lptr - list;
	curlevel = LTREE_FIRST(result);
	lptr = list;
	while (lptr - list < result->numlevel)
	{
		curlevel->len = (uint16) lptr->len;
		memcpy(curlevel->name, lptr->start, lptr->len);
		curlevel = LEVEL_NEXT(curlevel);
		lptr++;
	}

	pfree(list);

	PG_RETURN_POINTER(result);
}
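The size arithmetic in ltree_in is easy to check in isolation: each label contributes MAXALIGN(label_length + LEVEL_HDRSIZE) bytes on top of a fixed ltree header. Below is a minimal standalone sketch of that calculation; ASSUMED_MAXALIGN, ASSUMED_LEVEL_HDRSIZE and ASSUMED_LTREE_HDRSIZE are invented stand-ins for the real macros, not the values from ltree.h.

#include <stdio.h>

/* Assumed stand-ins for the real macros; values are illustrative only. */
#define ASSUMED_MAXIMUM_ALIGNOF 8
#define ASSUMED_MAXALIGN(len) \
	(((len) + ASSUMED_MAXIMUM_ALIGNOF - 1) & ~((size_t) (ASSUMED_MAXIMUM_ALIGNOF - 1)))
#define ASSUMED_LEVEL_HDRSIZE	2	/* per-level length prefix */
#define ASSUMED_LTREE_HDRSIZE	8	/* varlena header + numlevel */

/* Compute the allocation size the parser above would request for a path. */
static size_t
ltree_size_for(const char *path)
{
	size_t		total = ASSUMED_LTREE_HDRSIZE;
	const char *p = path;

	while (*p)
	{
		const char *start = p;

		while (*p && *p != '.')
			p++;
		total += ASSUMED_MAXALIGN((size_t) (p - start) + ASSUMED_LEVEL_HDRSIZE);
		if (*p == '.')
			p++;
	}
	return total;
}

int
main(void)
{
	/* "Top.Science.Astronomy": labels of 3, 7 and 9 bytes -> 48 bytes total */
	printf("%zu bytes\n", ltree_size_for("Top.Science.Astronomy"));
	return 0;
}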
/* * Attempt to read an XLOG record into readRecordBuf. */ static bool ReadRecord(void) { char *buffer; XLogRecord *record; XLogContRecord *contrecord; uint32 len, total_len; int retries = 0; restart: while (logRecOff <= 0 || logRecOff > XLOG_BLCKSZ - SizeOfXLogRecord) { /* Need to advance to new page */ if (! readXLogPage()) return false; logRecOff = XLogPageHeaderSize((XLogPageHeader) pageBuffer); if ((((XLogPageHeader) pageBuffer)->xlp_info & ~XLP_LONG_HEADER) != 0) { printf("Unexpected page info flags %04X at offset %X\n", ((XLogPageHeader) pageBuffer)->xlp_info, logPageOff); /* Check for a continuation record */ if (((XLogPageHeader) pageBuffer)->xlp_info & XLP_FIRST_IS_CONTRECORD) { printf("Skipping unexpected continuation record at offset %X\n", logPageOff); contrecord = (XLogContRecord *) (pageBuffer + logRecOff); logRecOff += MAXALIGN(contrecord->xl_rem_len + SizeOfXLogContRecord); } } } curRecPtr.xlogid = logId; curRecPtr.xrecoff = logSeg * XLogSegSize + logPageOff + logRecOff; record = (XLogRecord *) (pageBuffer + logRecOff); if (record->xl_len == 0) { /* Stop if XLOG_SWITCH was found. */ if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH) { dumpXLogRecord(record, false); return false; } printf("ReadRecord: record with zero len at %u/%08X\n", curRecPtr.xlogid, curRecPtr.xrecoff); /* Attempt to recover on new page, but give up after a few... */ logRecOff = 0; if (++retries > 4) return false; goto restart; } if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len || record->xl_tot_len > SizeOfXLogRecord + record->xl_len + XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ)) { printf("invalid record length(expected %u ~ %u, actual %d) at %X/%X\n", (unsigned int)SizeOfXLogRecord + record->xl_len, (unsigned int)SizeOfXLogRecord + record->xl_len + XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ), record->xl_tot_len, curRecPtr.xlogid, curRecPtr.xrecoff); return false; } total_len = record->xl_tot_len; /* * Allocate or enlarge readRecordBuf as needed. To avoid useless * small increases, round its size to a multiple of XLOG_BLCKSZ, and make * sure it's at least 4*BLCKSZ to start with. (That is enough for all * "normal" records, but very large commit or abort records might need * more space.) */ if (total_len > readRecordBufSize) { uint32 newSize = total_len; newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ); newSize = Max(newSize, 4 * XLOG_BLCKSZ); if (readRecordBuf) free(readRecordBuf); readRecordBuf = (char *) malloc(newSize); if (!readRecordBuf) { readRecordBufSize = 0; /* We treat this as a "bogus data" condition */ fprintf(stderr, "record length %u at %X/%X too long\n", total_len, curRecPtr.xlogid, curRecPtr.xrecoff); return false; } readRecordBufSize = newSize; } buffer = readRecordBuf; len = XLOG_BLCKSZ - curRecPtr.xrecoff % XLOG_BLCKSZ; /* available in block */ if (total_len > len) { /* Need to reassemble record */ uint32 gotlen = len; memcpy(buffer, record, len); record = (XLogRecord *) buffer; buffer += len; for (;;) { uint32 pageHeaderSize; if (! readXLogPage()) { /* XXX ought to be able to advance to new input file! 
*/ fprintf(stderr, "Unable to read continuation page?\n"); dumpXLogRecord(record, true); return false; } if (!(((XLogPageHeader) pageBuffer)->xlp_info & XLP_FIRST_IS_CONTRECORD)) { printf("ReadRecord: there is no ContRecord flag in logfile %u seg %u off %u\n", logId, logSeg, logPageOff); return false; } pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) pageBuffer); contrecord = (XLogContRecord *) (pageBuffer + pageHeaderSize); if (contrecord->xl_rem_len == 0 || total_len != (contrecord->xl_rem_len + gotlen)) { printf("ReadRecord: invalid cont-record len %u in logfile %u seg %u off %u\n", contrecord->xl_rem_len, logId, logSeg, logPageOff); return false; } len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord; if (contrecord->xl_rem_len > len) { memcpy(buffer, (char *)contrecord + SizeOfXLogContRecord, len); gotlen += len; buffer += len; continue; } memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, contrecord->xl_rem_len); logRecOff = MAXALIGN(pageHeaderSize + SizeOfXLogContRecord + contrecord->xl_rem_len); break; } if (!RecordIsValid(record, curRecPtr)) return false; return true; } /* Record is contained in this page */ memcpy(buffer, record, total_len); record = (XLogRecord *) buffer; logRecOff += MAXALIGN(total_len); if (!RecordIsValid(record, curRecPtr)) return false; return true; }
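The readRecordBuf sizing policy above (round the record length up to a whole number of XLOG pages, but never allocate less than four pages) can be exercised on its own. A small sketch under the assumption of an 8 kB XLOG_BLCKSZ:

#include <stdio.h>

#define ASSUMED_XLOG_BLCKSZ 8192	/* stand-in for the real XLOG_BLCKSZ */

/* Mirror the buffer-growth policy shown above. */
static unsigned
record_buf_size(unsigned total_len)
{
	unsigned	newSize = total_len;

	newSize += ASSUMED_XLOG_BLCKSZ - (newSize % ASSUMED_XLOG_BLCKSZ);
	if (newSize < 4 * ASSUMED_XLOG_BLCKSZ)
		newSize = 4 * ASSUMED_XLOG_BLCKSZ;
	return newSize;
}

int
main(void)
{
	printf("%u\n", record_buf_size(100));	/* small record  -> 32768 */
	printf("%u\n", record_buf_size(40000)); /* large record  -> 40960 */
	return 0;
}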
Datum lquery_in(PG_FUNCTION_ARGS) { char *buf = (char *) PG_GETARG_POINTER(0); char *ptr; int num = 0, totallen = 0, numOR = 0; int state = LQPRS_WAITLEVEL; lquery *result; nodeitem *lptr = NULL; lquery_level *cur, *curqlevel, *tmpql; lquery_variant *lrptr = NULL; bool hasnot = false; bool wasbad = false; int charlen; int pos = 0; ptr = buf; while (*ptr) { charlen = pg_mblen(ptr); if (charlen == 1) { if (t_iseq(ptr, '.')) num++; else if (t_iseq(ptr, '|')) numOR++; } ptr += charlen; } num++; curqlevel = tmpql = (lquery_level *) palloc0(ITEMSIZE * num); ptr = buf; while (*ptr) { charlen = pg_mblen(ptr); if (state == LQPRS_WAITLEVEL) { if (ISALNUM(ptr)) { GETVAR(curqlevel) = lptr = (nodeitem *) palloc0(sizeof(nodeitem) * (numOR + 1)); lptr->start = ptr; state = LQPRS_WAITDELIM; curqlevel->numvar = 1; } else if (charlen == 1 && t_iseq(ptr, '!')) { GETVAR(curqlevel) = lptr = (nodeitem *) palloc0(sizeof(nodeitem) * (numOR + 1)); lptr->start = ptr + 1; state = LQPRS_WAITDELIM; curqlevel->numvar = 1; curqlevel->flag |= LQL_NOT; hasnot = true; } else if (charlen == 1 && t_iseq(ptr, '*')) state = LQPRS_WAITOPEN; else UNCHAR; } else if (state == LQPRS_WAITVAR) { if (ISALNUM(ptr)) { lptr++; lptr->start = ptr; state = LQPRS_WAITDELIM; curqlevel->numvar++; } else UNCHAR; } else if (state == LQPRS_WAITDELIM) { if (charlen == 1 && t_iseq(ptr, '@')) { if (lptr->start == ptr) UNCHAR; lptr->flag |= LVAR_INCASE; curqlevel->flag |= LVAR_INCASE; } else if (charlen == 1 && t_iseq(ptr, '*')) { if (lptr->start == ptr) UNCHAR; lptr->flag |= LVAR_ANYEND; curqlevel->flag |= LVAR_ANYEND; } else if (charlen == 1 && t_iseq(ptr, '%')) { if (lptr->start == ptr) UNCHAR; lptr->flag |= LVAR_SUBLEXEME; curqlevel->flag |= LVAR_SUBLEXEME; } else if (charlen == 1 && t_iseq(ptr, '|')) { lptr->len = ptr - lptr->start - ((lptr->flag & LVAR_SUBLEXEME) ? 1 : 0) - ((lptr->flag & LVAR_INCASE) ? 1 : 0) - ((lptr->flag & LVAR_ANYEND) ? 1 : 0); if (lptr->wlen > 255) ereport(ERROR, (errcode(ERRCODE_NAME_TOO_LONG), errmsg("name of level is too long"), errdetail("Name length is %d, must " "be < 256, in position %d.", lptr->wlen, pos))); state = LQPRS_WAITVAR; } else if (charlen == 1 && t_iseq(ptr, '.')) { lptr->len = ptr - lptr->start - ((lptr->flag & LVAR_SUBLEXEME) ? 1 : 0) - ((lptr->flag & LVAR_INCASE) ? 1 : 0) - ((lptr->flag & LVAR_ANYEND) ? 
1 : 0); if (lptr->wlen > 255) ereport(ERROR, (errcode(ERRCODE_NAME_TOO_LONG), errmsg("name of level is too long"), errdetail("Name length is %d, must " "be < 256, in position %d.", lptr->wlen, pos))); state = LQPRS_WAITLEVEL; curqlevel = NEXTLEV(curqlevel); } else if (ISALNUM(ptr)) { if (lptr->flag) UNCHAR; } else UNCHAR; } else if (state == LQPRS_WAITOPEN) { if (charlen == 1 && t_iseq(ptr, '{')) state = LQPRS_WAITFNUM; else if (charlen == 1 && t_iseq(ptr, '.')) { curqlevel->low = 0; curqlevel->high = 0xffff; curqlevel = NEXTLEV(curqlevel); state = LQPRS_WAITLEVEL; } else UNCHAR; } else if (state == LQPRS_WAITFNUM) { if (charlen == 1 && t_iseq(ptr, ',')) state = LQPRS_WAITSNUM; else if (t_isdigit(ptr)) { curqlevel->low = atoi(ptr); state = LQPRS_WAITND; } else UNCHAR; } else if (state == LQPRS_WAITSNUM) { if (t_isdigit(ptr)) { curqlevel->high = atoi(ptr); state = LQPRS_WAITCLOSE; } else if (charlen == 1 && t_iseq(ptr, '}')) { curqlevel->high = 0xffff; state = LQPRS_WAITEND; } else UNCHAR; } else if (state == LQPRS_WAITCLOSE) { if (charlen == 1 && t_iseq(ptr, '}')) state = LQPRS_WAITEND; else if (!t_isdigit(ptr)) UNCHAR; } else if (state == LQPRS_WAITND) { if (charlen == 1 && t_iseq(ptr, '}')) { curqlevel->high = curqlevel->low; state = LQPRS_WAITEND; } else if (charlen == 1 && t_iseq(ptr, ',')) state = LQPRS_WAITSNUM; else if (!t_isdigit(ptr)) UNCHAR; } else if (state == LQPRS_WAITEND) { if (charlen == 1 && t_iseq(ptr, '.')) { state = LQPRS_WAITLEVEL; curqlevel = NEXTLEV(curqlevel); } else UNCHAR; } else /* internal error */ elog(ERROR, "internal error in parser"); ptr += charlen; if (state == LQPRS_WAITDELIM) lptr->wlen++; pos++; } if (state == LQPRS_WAITDELIM) { if (lptr->start == ptr) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"), errdetail("Unexpected end of line."))); lptr->len = ptr - lptr->start - ((lptr->flag & LVAR_SUBLEXEME) ? 1 : 0) - ((lptr->flag & LVAR_INCASE) ? 1 : 0) - ((lptr->flag & LVAR_ANYEND) ? 
1 : 0); if (lptr->len == 0) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"), errdetail("Unexpected end of line."))); if (lptr->wlen > 255) ereport(ERROR, (errcode(ERRCODE_NAME_TOO_LONG), errmsg("name of level is too long"), errdetail("Name length is %d, must " "be < 256, in position %d.", lptr->wlen, pos))); } else if (state == LQPRS_WAITOPEN) curqlevel->high = 0xffff; else if (state != LQPRS_WAITEND) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"), errdetail("Unexpected end of line."))); curqlevel = tmpql; totallen = LQUERY_HDRSIZE; while ((char *) curqlevel - (char *) tmpql < num * ITEMSIZE) { totallen += LQL_HDRSIZE; if (curqlevel->numvar) { lptr = GETVAR(curqlevel); while (lptr - GETVAR(curqlevel) < curqlevel->numvar) { totallen += MAXALIGN(LVAR_HDRSIZE + lptr->len); lptr++; } } else if (curqlevel->low > curqlevel->high) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("syntax error"), errdetail("Low limit(%d) is greater than upper(%d).", curqlevel->low, curqlevel->high))); curqlevel = NEXTLEV(curqlevel); } result = (lquery *) palloc0(totallen); SET_VARSIZE(result, totallen); result->numlevel = num; result->firstgood = 0; result->flag = 0; if (hasnot) result->flag |= LQUERY_HASNOT; cur = LQUERY_FIRST(result); curqlevel = tmpql; while ((char *) curqlevel - (char *) tmpql < num * ITEMSIZE) { memcpy(cur, curqlevel, LQL_HDRSIZE); cur->totallen = LQL_HDRSIZE; if (curqlevel->numvar) { lrptr = LQL_FIRST(cur); lptr = GETVAR(curqlevel); while (lptr - GETVAR(curqlevel) < curqlevel->numvar) { cur->totallen += MAXALIGN(LVAR_HDRSIZE + lptr->len); lrptr->len = lptr->len; lrptr->flag = lptr->flag; lrptr->val = ltree_crc32_sz(lptr->start, lptr->len); memcpy(lrptr->name, lptr->start, lptr->len); lptr++; lrptr = LVAR_NEXT(lrptr); } pfree(GETVAR(curqlevel)); if (cur->numvar > 1 || cur->flag != 0) wasbad = true; else if (wasbad == false) (result->firstgood)++; } else wasbad = true; curqlevel = NEXTLEV(curqlevel); cur = LQL_NEXT(cur); } pfree(tmpql); PG_RETURN_POINTER(result); }
/* * Insert a tuple to the new relation. This has to track heap_insert * and its subsidiary functions! * * t_self of the tuple is set to the new TID of the tuple. If t_ctid of the * tuple is invalid on entry, it's replaced with the new TID as well (in * the inserted data only, not in the caller's copy). */ static void raw_heap_insert(RewriteState state, HeapTuple tup) { Page page = state->rs_buffer; Size pageFreeSpace, saveFreeSpace; Size len; OffsetNumber newoff; HeapTuple heaptup; /* * If the new tuple is too big for storage or contains already toasted * out-of-line attributes from some other relation, invoke the toaster. * * Note: below this point, heaptup is the data we actually intend to store * into the relation; tup is the caller's original untoasted data. */ if (state->rs_new_rel->rd_rel->relkind == RELKIND_TOASTVALUE) { /* toast table entries should never be recursively toasted */ Assert(!HeapTupleHasExternal(tup)); heaptup = tup; } else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD) heaptup = toast_insert_or_update(state->rs_new_rel, tup, NULL, HEAP_INSERT_SKIP_FSM | (state->rs_use_wal ? 0 : HEAP_INSERT_SKIP_WAL)); else heaptup = tup; len = MAXALIGN(heaptup->t_len); /* be conservative */ /* * If we're gonna fail for oversize tuple, do it right away */ if (len > MaxHeapTupleSize) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("row is too big: size %lu, maximum size %lu", (unsigned long) len, (unsigned long) MaxHeapTupleSize))); /* Compute desired extra freespace due to fillfactor option */ saveFreeSpace = RelationGetTargetPageFreeSpace(state->rs_new_rel, HEAP_DEFAULT_FILLFACTOR); /* Now we can check to see if there's enough free space already. */ if (state->rs_buffer_valid) { pageFreeSpace = PageGetHeapFreeSpace(page); if (len + saveFreeSpace > pageFreeSpace) { /* Doesn't fit, so write out the existing page */ /* XLOG stuff */ if (state->rs_use_wal) log_newpage(&state->rs_new_rel->rd_node, MAIN_FORKNUM, state->rs_blockno, page, true); /* * Now write the page. We say isTemp = true even if it's not a * temp table, because there's no need for smgr to schedule an * fsync for this write; we'll do it ourselves in * end_heap_rewrite. */ RelationOpenSmgr(state->rs_new_rel); PageSetChecksumInplace(page, state->rs_blockno); smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno, (char *) page, true); state->rs_blockno++; state->rs_buffer_valid = false; } } if (!state->rs_buffer_valid) { /* Initialize a new empty page */ PageInit(page, BLCKSZ, 0); state->rs_buffer_valid = true; } /* And now we can insert the tuple into the page */ newoff = PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len, InvalidOffsetNumber, false, true); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add tuple"); /* Update caller's t_self to the actual position where it was stored */ ItemPointerSet(&(tup->t_self), state->rs_blockno, newoff); /* * Insert the correct position into CTID of the stored tuple, too, if the * caller didn't supply a valid CTID. */ if (!ItemPointerIsValid(&tup->t_data->t_ctid)) { ItemId newitemid; HeapTupleHeader onpage_tup; newitemid = PageGetItemId(page, newoff); onpage_tup = (HeapTupleHeader) PageGetItem(page, newitemid); onpage_tup->t_ctid = tup->t_self; } /* If heaptup is a private copy, release it. */ if (heaptup != tup) heap_freetuple(heaptup); }
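The page-full test in raw_heap_insert compares the tuple size plus the fillfactor reserve against the page's remaining free space. A rough standalone sketch of that decision; the 8192-byte page size and the way the reserve is derived from a percentage fillfactor are assumptions for illustration, not the real RelationGetTargetPageFreeSpace machinery:

#include <stdbool.h>
#include <stdio.h>

#define ASSUMED_BLCKSZ 8192

/* Decide whether the current page must be written out before inserting. */
static bool
needs_new_page(size_t tuple_len, size_t page_free, int fillfactor)
{
	/* reserve the fraction of the page the fillfactor keeps free */
	size_t		reserve = (size_t) ASSUMED_BLCKSZ * (100 - fillfactor) / 100;

	return tuple_len + reserve > page_free;
}

int
main(void)
{
	printf("%d\n", needs_new_page(400, 1000, 90)); /* 400 + 819 > 1000 -> 1 */
	printf("%d\n", needs_new_page(400, 2000, 90)); /* fits             -> 0 */
	return 0;
}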
/* * Update tuple origtup (size origsz), located in offset oldoff of buffer * oldbuf, to newtup (size newsz) as summary tuple for the page range starting * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit. * * If samepage is true, attempt to put the new tuple in the same page, but if * there's no room, use some other one. * * If the update is successful, return true; the revmap is updated to point to * the new tuple. If the update is not done for whatever reason, return false. * Caller may retry the update if this happens. */ bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, BlockNumber heapBlk, Buffer oldbuf, OffsetNumber oldoff, const BrinTuple *origtup, Size origsz, const BrinTuple *newtup, Size newsz, bool samepage) { Page oldpage; ItemId oldlp; BrinTuple *oldtup; Size oldsz; Buffer newbuf; bool extended; Assert(newsz == MAXALIGN(newsz)); /* If the item is oversized, don't bother. */ if (newsz > BrinMaxItemSize) { ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) newsz, (unsigned long) BrinMaxItemSize, RelationGetRelationName(idxrel)))); return false; /* keep compiler quiet */ } /* make sure the revmap is long enough to contain the entry we need */ brinRevmapExtend(revmap, heapBlk); if (!samepage) { /* need a page on which to put the item */ newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended); if (!BufferIsValid(newbuf)) { Assert(!extended); return false; } /* * Note: it's possible (though unlikely) that the returned newbuf is * the same as oldbuf, if brin_getinsertbuffer determined that the old * buffer does in fact have enough space. */ if (newbuf == oldbuf) { Assert(!extended); newbuf = InvalidBuffer; } } else { LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); newbuf = InvalidBuffer; extended = false; } oldpage = BufferGetPage(oldbuf); oldlp = PageGetItemId(oldpage, oldoff); /* * Check that the old tuple wasn't updated concurrently: it might have * moved someplace else entirely ... */ if (!ItemIdIsNormal(oldlp)) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); /* * If this happens, and the new buffer was obtained by extending the * relation, then we need to ensure we don't leave it uninitialized or * forget about it. */ if (BufferIsValid(newbuf)) { if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); if (extended) FreeSpaceMapVacuum(idxrel); } return false; } oldsz = ItemIdGetLength(oldlp); oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp); /* * ... or it might have been updated in place to different contents. */ if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz)) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (BufferIsValid(newbuf)) { if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); if (extended) FreeSpaceMapVacuum(idxrel); } return false; } /* * Great, the old tuple is intact. We can proceed with the update. * * If there's enough room in the old page for the new tuple, replace it. * * Note that there might now be enough space on the page even though the * caller told us there isn't, if a concurrent update moved another tuple * elsewhere or replaced a tuple with a smaller one. 
*/ if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) && brin_can_do_samepage_update(oldbuf, origsz, newsz)) { if (BufferIsValid(newbuf)) { /* as above */ if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); } START_CRIT_SECTION(); if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) newtup, newsz)) elog(ERROR, "failed to replace BRIN tuple"); MarkBufferDirty(oldbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_samepage_update xlrec; XLogRecPtr recptr; uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE; xlrec.offnum = oldoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate); XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD); XLogRegisterBufData(0, (char *) newtup, newsz); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); } END_CRIT_SECTION(); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (extended) FreeSpaceMapVacuum(idxrel); return true; } else if (newbuf == InvalidBuffer) { /* * Not enough space, but caller said that there was. Tell them to * start over. */ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); return false; } else { /* * Not enough free space on the oldpage. Put the new tuple on the new * page, and update the revmap. */ Page newpage = BufferGetPage(newbuf); Buffer revmapbuf; ItemPointerData newtid; OffsetNumber newoff; BlockNumber newblk = InvalidBlockNumber; Size freespace = 0; revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); START_CRIT_SECTION(); /* * We need to initialize the page if it's newly obtained. Note we * will WAL-log the initialization as part of the update, so we don't * need to do that here. */ if (extended) brin_page_init(BufferGetPage(newbuf), BRIN_PAGETYPE_REGULAR); PageIndexTupleDeleteNoCompact(oldpage, oldoff); newoff = PageAddItem(newpage, (Item) newtup, newsz, InvalidOffsetNumber, false, false); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add BRIN tuple to new page"); MarkBufferDirty(oldbuf); MarkBufferDirty(newbuf); /* needed to update FSM below */ if (extended) { newblk = BufferGetBlockNumber(newbuf); freespace = br_page_get_freespace(newpage); } ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff); brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid); MarkBufferDirty(revmapbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_update xlrec; XLogRecPtr recptr; uint8 info; info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0); xlrec.insert.offnum = newoff; xlrec.insert.heapBlk = heapBlk; xlrec.insert.pagesPerRange = pagesPerRange; xlrec.oldOffnum = oldoff; XLogBeginInsert(); /* new page */ XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate); XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0)); XLogRegisterBufData(0, (char *) newtup, newsz); /* revmap page */ XLogRegisterBuffer(1, revmapbuf, REGBUF_STANDARD); /* old page */ XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); PageSetLSN(newpage, recptr); PageSetLSN(BufferGetPage(revmapbuf), recptr); } END_CRIT_SECTION(); LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); UnlockReleaseBuffer(newbuf); if (extended) { Assert(BlockNumberIsValid(newblk)); RecordPageWithFreeSpace(idxrel, newblk, freespace); FreeSpaceMapVacuum(idxrel); } return true; } }
/* * PageIndexTupleDelete * * This routine does the work of removing a tuple from an index page. * * Unlike heap pages, we compact out the line pointer for the removed tuple. */ void PageIndexTupleDelete(Page page, OffsetNumber offnum) { PageHeader phdr = (PageHeader) page; char *addr; ItemId tup; Size size; unsigned offset; int nbytes; int offidx; int nline; /* * As with PageRepairFragmentation, paranoia seems justified. */ if (phdr->pd_lower < SizeOfPageHeaderData || phdr->pd_lower > phdr->pd_upper || phdr->pd_upper > phdr->pd_special || phdr->pd_special > BLCKSZ) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", phdr->pd_lower, phdr->pd_upper, phdr->pd_special))); nline = PageGetMaxOffsetNumber(page); if ((int) offnum <= 0 || (int) offnum > nline) elog(ERROR, "invalid index offnum: %u", offnum); /* change offset number to offset index */ offidx = offnum - 1; tup = PageGetItemId(page, offnum); Assert(ItemIdHasStorage(tup)); size = ItemIdGetLength(tup); offset = ItemIdGetOffset(tup); if (offset < phdr->pd_upper || (offset + size) > phdr->pd_special || offset != MAXALIGN(offset) || size != MAXALIGN(size)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted item pointer: offset = %u, size = %u", offset, (unsigned int) size))); /* * First, we want to get rid of the pd_linp entry for the index tuple. We * copy all subsequent linp's back one slot in the array. We don't use * PageGetItemId, because we are manipulating the _array_, not individual * linp's. */ nbytes = phdr->pd_lower - ((char *) &phdr->pd_linp[offidx + 1] - (char *) phdr); if (nbytes > 0) memmove((char *) &(phdr->pd_linp[offidx]), (char *) &(phdr->pd_linp[offidx + 1]), nbytes); /* * Now move everything between the old upper bound (beginning of tuple * space) and the beginning of the deleted tuple forward, so that space in * the middle of the page is left free. If we've just deleted the tuple * at the beginning of tuple space, then there's no need to do the copy * (and bcopy on some architectures SEGV's if asked to move zero bytes). */ /* beginning of tuple space */ addr = (char *) page + phdr->pd_upper; if (offset > phdr->pd_upper) memmove(addr + size, addr, (int) (offset - phdr->pd_upper)); /* adjust free space boundary pointers */ phdr->pd_upper += size; phdr->pd_lower -= sizeof(ItemIdData); /* * Finally, we need to adjust the linp entries that remain. * * Anything that used to be before the deleted tuple's data was moved * forward by the size of the deleted tuple. */ if (!PageIsEmpty(page)) { int i; nline--; /* there's one less than when we started */ for (i = 1; i <= nline; i++) { ItemId ii = PageGetItemId(phdr, i); Assert(ItemIdHasStorage(ii)); if (ItemIdGetOffset(ii) <= offset) ii->lp_off += size; } } }
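The first step of the deletion, compacting the line pointer array with a single memmove, looks like this on a toy array. The toy_linp struct is invented for the sketch (the real ItemIdData is a bit-packed layout), and the separate steps of sliding the tuple data and adjusting the stored offsets are omitted:

#include <stdio.h>
#include <string.h>

typedef struct
{
	unsigned	lp_off;
	unsigned	lp_len;
} toy_linp;

/* Drop slot idx from a dense array by sliding the tail down one position. */
static int
delete_slot(toy_linp *linp, int nline, int idx)
{
	memmove(&linp[idx], &linp[idx + 1],
			(size_t) (nline - idx - 1) * sizeof(toy_linp));
	return nline - 1;
}

int
main(void)
{
	toy_linp	linp[4] = {{8000, 64}, {7936, 64}, {7872, 64}, {7808, 64}};
	int			nline = delete_slot(linp, 4, 1);
	int			i;

	for (i = 0; i < nline; i++)
		printf("slot %d: off=%u len=%u\n", i, linp[i].lp_off, linp[i].lp_len);
	return 0;
}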
/* * PageIndexMultiDelete * * This routine handles the case of deleting multiple tuples from an * index page at once. It is considerably faster than a loop around * PageIndexTupleDelete ... however, the caller *must* supply the array * of item numbers to be deleted in item number order! */ void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) { PageHeader phdr = (PageHeader) page; Offset pd_lower = phdr->pd_lower; Offset pd_upper = phdr->pd_upper; Offset pd_special = phdr->pd_special; itemIdSortData itemidbase[MaxIndexTuplesPerPage]; ItemIdData newitemids[MaxIndexTuplesPerPage]; itemIdSort itemidptr; ItemId lp; int nline, nused; Size totallen; Size size; unsigned offset; int nextitm; OffsetNumber offnum; Assert(nitems <= MaxIndexTuplesPerPage); /* * If there aren't very many items to delete, then retail * PageIndexTupleDelete is the best way. Delete the items in reverse * order so we don't have to think about adjusting item numbers for * previous deletions. * * TODO: tune the magic number here */ if (nitems <= 2) { while (--nitems >= 0) PageIndexTupleDelete(page, itemnos[nitems]); return; } /* * As with PageRepairFragmentation, paranoia seems justified. */ if (pd_lower < SizeOfPageHeaderData || pd_lower > pd_upper || pd_upper > pd_special || pd_special > BLCKSZ || pd_special != MAXALIGN(pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", pd_lower, pd_upper, pd_special))); /* * Scan the item pointer array and build a list of just the ones we are * going to keep. Notice we do not modify the page yet, since we are * still validity-checking. */ nline = PageGetMaxOffsetNumber(page); itemidptr = itemidbase; totallen = 0; nused = 0; nextitm = 0; for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum)) { lp = PageGetItemId(page, offnum); Assert(ItemIdHasStorage(lp)); size = ItemIdGetLength(lp); offset = ItemIdGetOffset(lp); if (offset < pd_upper || (offset + size) > pd_special || offset != MAXALIGN(offset)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted item pointer: offset = %u, length = %u", offset, (unsigned int) size))); if (nextitm < nitems && offnum == itemnos[nextitm]) { /* skip item to be deleted */ nextitm++; } else { itemidptr->offsetindex = nused; /* where it will go */ itemidptr->itemoff = offset; itemidptr->alignedlen = MAXALIGN(size); totallen += itemidptr->alignedlen; newitemids[nused] = *lp; itemidptr++; nused++; } } /* this will catch invalid or out-of-order itemnos[] */ if (nextitm != nitems) elog(ERROR, "incorrect index offsets supplied"); if (totallen > (Size) (pd_special - pd_lower)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted item lengths: total %u, available space %u", (unsigned int) totallen, pd_special - pd_lower))); /* * Looks good. Overwrite the line pointers with the copy, from which we've * removed all the unused items. */ memcpy(phdr->pd_linp, newitemids, nused * sizeof(ItemIdData)); phdr->pd_lower = SizeOfPageHeaderData + nused * sizeof(ItemIdData); /* and compactify the tuple data */ compactify_tuples(itemidbase, nused, page); }
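The keep-list construction is a single merge-like pass over all offsets against the sorted list of offsets to delete, which is why the caller must supply itemnos[] in order. A self-contained sketch of just that filtering step:

#include <stdio.h>

/* Keep every offset that is not named in the sorted delete list.
 * Returns the number kept, or -1 if the delete list was bad. */
static int
keep_survivors(const int *all, int nall,
			   const int *to_delete, int ndelete, int *kept)
{
	int			nextitm = 0;
	int			nused = 0;
	int			i;

	for (i = 0; i < nall; i++)
	{
		if (nextitm < ndelete && all[i] == to_delete[nextitm])
			nextitm++;			/* skip item to be deleted */
		else
			kept[nused++] = all[i];
	}
	return (nextitm == ndelete) ? nused : -1;
}

int
main(void)
{
	int			all[] = {1, 2, 3, 4, 5, 6};
	int			del[] = {2, 5};
	int			kept[6];
	int			n = keep_survivors(all, 6, del, 2, kept);
	int			i;

	for (i = 0; i < n; i++)
		printf("%d ", kept[i]);	/* prints: 1 3 4 6 */
	printf("\n");
	return 0;
}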
/* * _hash_init() -- Initialize the metadata page of a hash index, * the initial buckets, and the initial bitmap page. * * The initial number of buckets is dependent on num_tuples, an estimate * of the number of tuples to be loaded into the index initially. The * chosen number of buckets is returned. * * We are fairly cavalier about locking here, since we know that no one else * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum) { Buffer metabuf; Buffer buf; Buffer bitmapbuf; Page pg; HashMetaPage metap; RegProcedure procid; int32 data_width; int32 item_width; int32 ffactor; uint32 num_buckets; uint32 i; bool use_wal; /* safety check */ if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); /* * WAL log creation of pages if the relation is persistent, or this is the * init fork. Init forks for unlogged relations always need to be WAL * logged. */ use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM; /* * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. */ data_width = sizeof(uint32); item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; /* keep to a sane range */ if (ffactor < 10) ffactor = 10; procid = index_getprocid(rel, 1, HASHSTANDARD_PROC); /* * We initialize the metapage, the first N bucket pages, and the first * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. * * Critical section not required, because on error the creation of the * whole relation will be rolled back. */ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false); MarkBufferDirty(metabuf); pg = BufferGetPage(metabuf); metap = HashPageGetMeta(pg); /* XLOG stuff */ if (use_wal) { xl_hash_init_meta_page xlrec; XLogRecPtr recptr; xlrec.num_tuples = num_tuples; xlrec.procid = metap->hashm_procid; xlrec.ffactor = metap->hashm_ffactor; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage); XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE); PageSetLSN(BufferGetPage(metabuf), recptr); } num_buckets = metap->hashm_maxbucket + 1; /* * Release buffer lock on the metapage while we initialize buckets. * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS * won't accomplish anything. It's a bad idea to hold buffer locks for * long intervals in any case, since that can block the bgwriter. 
*/ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); /* * Initialize and WAL Log the first N buckets */ for (i = 0; i < num_buckets; i++) { BlockNumber blkno; /* Allow interrupts, in case N is huge */ CHECK_FOR_INTERRUPTS(); blkno = BUCKET_TO_BLKNO(metap, i); buf = _hash_getnewbuf(rel, blkno, forkNum); _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false); MarkBufferDirty(buf); if (use_wal) log_newpage(&rel->rd_node, forkNum, blkno, BufferGetPage(buf), true); _hash_relbuf(rel, buf); } /* Now reacquire buffer lock on metapage */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); /* * Initialize bitmap page */ bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum); _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false); MarkBufferDirty(bitmapbuf); /* add the new bitmap page to the metapage's list of bitmaps */ /* metapage already has a write lock */ if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("out of overflow pages in hash index \"%s\"", RelationGetRelationName(rel)))); metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; metap->hashm_nmaps++; MarkBufferDirty(metabuf); /* XLOG stuff */ if (use_wal) { xl_hash_init_bitmap_page xlrec; XLogRecPtr recptr; xlrec.bmsize = metap->hashm_bmsize; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage); XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT); /* * This is safe only because nobody else can be modifying the index at * this stage; it's only visible to the transaction that is creating * it. */ XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE); PageSetLSN(BufferGetPage(bitmapbuf), recptr); PageSetLSN(BufferGetPage(metabuf), recptr); } /* all done */ _hash_relbuf(rel, bitmapbuf); _hash_relbuf(rel, metabuf); return num_buckets; }
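The fill factor computed near the top of _hash_init is simply the page space allowed by the fillfactor divided by the width of one index entry, clamped to at least 10. A sketch with assumed sizes for IndexTupleData and ItemIdData (the real values come from the system headers):

#include <stdio.h>

#define ASSUMED_MAXALIGN(x)		(((x) + 7) & ~7)
#define ASSUMED_INDEXTUPLE_SIZE	8	/* assumed sizeof(IndexTupleData) */
#define ASSUMED_ITEMID_SIZE		4	/* assumed sizeof(ItemIdData)     */

/* Target tuples-per-bucket, following the arithmetic shown above. */
static int
hash_ffactor(int target_page_usage)
{
	int			data_width = (int) sizeof(unsigned int);	/* uint32 hash key */
	int			item_width = ASSUMED_MAXALIGN(ASSUMED_INDEXTUPLE_SIZE) +
							 ASSUMED_MAXALIGN(data_width) +
							 ASSUMED_ITEMID_SIZE;
	int			ffactor = target_page_usage / item_width;

	return (ffactor < 10) ? 10 : ffactor;
}

int
main(void)
{
	/* e.g. an 8192-byte page at 75% fillfactor */
	printf("%d tuples per bucket\n", hash_ffactor(8192 * 75 / 100));
	return 0;
}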
static void
makeSublist(Relation index, IndexTuple *tuples, int32 ntuples,
			GinMetaPageData *res)
{
	Buffer		curBuffer = InvalidBuffer;
	Buffer		prevBuffer = InvalidBuffer;
	int			i,
				size = 0,
				tupsize;
	int			startTuple = 0;

	Assert(ntuples > 0);

	/*
	 * Split tuples into pages
	 */
	for (i = 0; i < ntuples; i++)
	{
		if (curBuffer == InvalidBuffer)
		{
			curBuffer = GinNewBuffer(index);

			if (prevBuffer != InvalidBuffer)
			{
				res->nPendingPages++;
				writeListPage(index, prevBuffer,
							  tuples + startTuple,
							  i - startTuple,
							  BufferGetBlockNumber(curBuffer));
			}
			else
			{
				res->head = BufferGetBlockNumber(curBuffer);
			}

			prevBuffer = curBuffer;
			startTuple = i;
			size = 0;
		}

		tupsize = MAXALIGN(IndexTupleSize(tuples[i])) + sizeof(ItemIdData);

		if (size + tupsize > GinListPageSize)
		{
			/* won't fit, force a new page and reprocess */
			i--;
			curBuffer = InvalidBuffer;
		}
		else
		{
			size += tupsize;
		}
	}

	/*
	 * Write last page
	 */
	res->tail = BufferGetBlockNumber(curBuffer);
	res->tailFreeSize = writeListPage(index, curBuffer,
									  tuples + startTuple,
									  ntuples - startTuple,
									  InvalidBlockNumber);
	res->nPendingPages++;
	/* that was only one heap tuple */
	res->nPendingHeapTuples = 1;
}
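The splitting loop in makeSublist uses a small trick: when the next tuple would overflow the current list page, it decrements i so the same tuple is reprocessed as the first tuple of a fresh page. The same greedy pattern on plain integers, assuming every item fits on an empty page:

#include <stdio.h>

#define ASSUMED_PAGE_CAPACITY 100

/* Count how many pages a greedy first-fit split produces. Items larger
 * than an empty page are assumed not to occur (as in the real code). */
static int
count_pages(const int *sizes, int n)
{
	int			pages = 0;
	int			used = -1;		/* -1: no page open yet */
	int			i;

	for (i = 0; i < n; i++)
	{
		if (used < 0)
		{
			pages++;
			used = 0;
		}
		if (used + sizes[i] > ASSUMED_PAGE_CAPACITY)
		{
			i--;				/* force a new page and reprocess */
			used = -1;
		}
		else
			used += sizes[i];
	}
	return pages;
}

int
main(void)
{
	int			sizes[] = {60, 50, 30, 80, 20};

	/* pages: {60}, {50, 30}, {80, 20} -> 3 */
	printf("%d pages\n", count_pages(sizes, 5));
	return 0;
}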
/* * Write bytes into a shared message queue. */ static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data, bool nowait, Size *bytes_written) { shm_mq *mq = mqh->mqh_queue; Size sent = 0; uint64 used; Size ringsize = mq->mq_ring_size; Size available; while (sent < nbytes) { bool detached; uint64 rb; /* Compute number of ring buffer bytes used and available. */ rb = shm_mq_get_bytes_read(mq, &detached); Assert(mq->mq_bytes_written >= rb); used = mq->mq_bytes_written - rb; Assert(used <= ringsize); available = Min(ringsize - used, nbytes - sent); /* Bail out if the queue has been detached. */ if (detached) { *bytes_written = sent; return SHM_MQ_DETACHED; } if (available == 0 && !mqh->mqh_counterparty_attached) { /* * The queue is full, so if the receiver isn't yet known to be * attached, we must wait for that to happen. */ if (nowait) { if (shm_mq_get_receiver(mq) == NULL) { *bytes_written = sent; return SHM_MQ_WOULD_BLOCK; } } else if (!shm_mq_wait_internal(mq, &mq->mq_receiver, mqh->mqh_handle)) { mq->mq_detached = true; *bytes_written = sent; return SHM_MQ_DETACHED; } mqh->mqh_counterparty_attached = true; /* * The receiver may have read some data after attaching, so we * must not wait without rechecking the queue state. */ } else if (available == 0) { shm_mq_result res; /* Let the receiver know that we need them to read some data. */ res = shm_mq_notify_receiver(mq); if (res != SHM_MQ_SUCCESS) { *bytes_written = sent; return res; } /* Skip manipulation of our latch if nowait = true. */ if (nowait) { *bytes_written = sent; return SHM_MQ_WOULD_BLOCK; } /* * Wait for our latch to be set. It might already be set for some * unrelated reason, but that'll just result in one extra trip * through the loop. It's worth it to avoid resetting the latch * at top of loop, because setting an already-set latch is much * cheaper than setting one that has been reset. */ WaitLatch(MyLatch, WL_LATCH_SET, 0); /* An interrupt may have occurred while we were waiting. */ CHECK_FOR_INTERRUPTS(); /* Reset the latch so we don't spin. */ ResetLatch(MyLatch); } else { Size offset = mq->mq_bytes_written % (uint64) ringsize; Size sendnow = Min(available, ringsize - offset); /* Write as much data as we can via a single memcpy(). */ memcpy(&mq->mq_ring[mq->mq_ring_offset + offset], (char *) data + sent, sendnow); sent += sendnow; /* * Update count of bytes written, with alignment padding. Note * that this will never actually insert any padding except at the * end of a run of bytes, because the buffer size is a multiple of * MAXIMUM_ALIGNOF, and each read is as well. */ Assert(sent == nbytes || sendnow == MAXALIGN(sendnow)); shm_mq_inc_bytes_written(mq, MAXALIGN(sendnow)); /* * For efficiency, we don't set the reader's latch here. We'll do * that only when the buffer fills up or after writing an entire * message. */ } } *bytes_written = sent; return SHM_MQ_SUCCESS; }
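The interesting arithmetic in the send path is the wrap-around case: a single chunk is limited both by the space the receiver has freed and by the distance to the physical end of the ring. A standalone sketch of just the wrap handling, ignoring the reader-side flow control and latch signalling (the ring is assumed to have room for the whole message):

#include <stdio.h>
#include <string.h>

#define RING_SIZE 16			/* assumed small ring, for illustration */

/* Copy a message into a ring buffer, splitting at the wrap point. */
static void
ring_write(char *ring, unsigned long long *bytes_written,
		   const char *data, size_t nbytes)
{
	size_t		sent = 0;

	while (sent < nbytes)
	{
		size_t		offset = (size_t) (*bytes_written % RING_SIZE);
		size_t		sendnow = nbytes - sent;

		if (sendnow > RING_SIZE - offset)
			sendnow = RING_SIZE - offset;	/* stop at the physical end */
		memcpy(ring + offset, data + sent, sendnow);
		sent += sendnow;
		*bytes_written += sendnow;
	}
}

int
main(void)
{
	char		ring[RING_SIZE];
	unsigned long long written = 10;	/* pretend 10 bytes already sent */

	ring_write(ring, &written, "hello world", 11); /* wraps after 6 bytes */
	printf("bytes_written is now %llu\n", written);
	return 0;
}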
/* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * * This routine is used to partition the tuples between old and new bucket and * is used to finish the incomplete split operations. To finish the previously * interrupted split operation, the caller needs to fill htab. If htab is set, * then we skip the movement of tuples that exists in htab, otherwise NULL * value of htab indicates movement of all the tuples that belong to the new * bucket. * * We are splitting a bucket that consists of a base bucket page and zero * or more overflow (bucket chain) pages. We must relocate tuples that * belong in the new bucket. * * The caller must hold cleanup locks on both buckets to ensure that * no one else is trying to access them (see README). * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only * touched if it becomes necessary to add or remove overflow pages.) * * Split needs to retain pin on primary bucket pages of both old and new * buckets till end of operation. This is to prevent vacuum from starting * while a split is in progress. * * In addition, the caller must have created the new bucket's base page, * which is passed in buffer nbuf, pinned and write-locked. The lock will be * released here and pin must be released by the caller. (The API is set up * this way because we must do _hash_getnewbuf() before releasing the metapage * write lock. So instead of passing the new bucket's start block number, we * pass an actual buffer.) */ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, Buffer obuf, Buffer nbuf, HTAB *htab, uint32 maxbucket, uint32 highmask, uint32 lowmask) { Buffer bucket_obuf; Buffer bucket_nbuf; Page opage; Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; IndexTuple itups[MaxIndexTuplesPerPage]; Size all_tups_size = 0; int i; uint16 nitups = 0; bucket_obuf = obuf; opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); bucket_nbuf = nbuf; npage = BufferGetPage(nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); /* Copy the predicate locks from old bucket to new bucket. */ PredicateLockPageSplit(rel, BufferGetBlockNumber(bucket_obuf), BufferGetBlockNumber(bucket_nbuf)); /* * Partition the tuples in the old bucket between the old bucket and the * new bucket, advancing along the old bucket's overflow bucket chain and * adding overflow pages to the new bucket as needed. Outer loop iterates * once per page in old bucket. */ for (;;) { BlockNumber oblkno; OffsetNumber ooffnum; OffsetNumber omaxoffnum; /* Scan each tuple in old page */ omaxoffnum = PageGetMaxOffsetNumber(opage); for (ooffnum = FirstOffsetNumber; ooffnum <= omaxoffnum; ooffnum = OffsetNumberNext(ooffnum)) { IndexTuple itup; Size itemsz; Bucket bucket; bool found = false; /* skip dead tuples */ if (ItemIdIsDead(PageGetItemId(opage, ooffnum))) continue; /* * Before inserting a tuple, probe the hash table containing TIDs * of tuples belonging to new bucket, if we find a match, then * skip that tuple, else fetch the item's hash key (conveniently * stored in the item) and determine which bucket it now belongs * in. 
*/ itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum)); if (htab) (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found); if (found) continue; bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); if (bucket == nbucket) { IndexTuple new_itup; /* * make a copy of index tuple as we have to scribble on it. */ new_itup = CopyIndexTuple(itup); /* * mark the index tuple as moved by split, such tuples are * skipped by scan if there is split in progress for a bucket. */ new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK; /* * insert the tuple into the new bucket. if it doesn't fit on * the current page in the new bucket, we must allocate a new * overflow page and place the tuple on that page instead. */ itemsz = IndexTupleSize(new_itup); itemsz = MAXALIGN(itemsz); if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz)) { /* * Change the shared buffer state in critical section, * otherwise any error could make it unrecoverable. */ START_CRIT_SECTION(); _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); MarkBufferDirty(nbuf); /* log the split operation before releasing the lock */ log_split_page(rel, nbuf); END_CRIT_SECTION(); /* drop lock, but keep pin */ LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); nitups = 0; all_tups_size = 0; /* chain to a new overflow page */ nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false); npage = BufferGetPage(nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); } itups[nitups++] = new_itup; all_tups_size += itemsz; } else { /* * the tuple stays on this page, so nothing to do. */ Assert(bucket == obucket); } } oblkno = oopaque->hasho_nextblkno; /* retain the pin on the old primary bucket */ if (obuf == bucket_obuf) LockBuffer(obuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, obuf); /* Exit loop if no more overflow pages in old bucket */ if (!BlockNumberIsValid(oblkno)) { /* * Change the shared buffer state in critical section, otherwise * any error could make it unrecoverable. */ START_CRIT_SECTION(); _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); MarkBufferDirty(nbuf); /* log the split operation before releasing the lock */ log_split_page(rel, nbuf); END_CRIT_SECTION(); if (nbuf == bucket_nbuf) LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, nbuf); /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); break; } /* Else, advance to next old page */ obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE); opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); } /* * We're at the end of the old bucket chain, so we're done partitioning * the tuples. Mark the old and new buckets to indicate split is * finished. * * To avoid deadlocks due to locking order of buckets, first lock the old * bucket and then the new bucket. */ LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE); opage = BufferGetPage(bucket_obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE); npage = BufferGetPage(bucket_nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); START_CRIT_SECTION(); oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT; nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED; /* * After the split is finished, mark the old bucket to indicate that it * contains deletable tuples. 
We will clear split-cleanup flag after * deleting such tuples either at the end of split or at the next split * from old bucket or at the time of vacuum. */ oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP; /* * now write the buffers, here we don't release the locks as caller is * responsible to release locks. */ MarkBufferDirty(bucket_obuf); MarkBufferDirty(bucket_nbuf); if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; xl_hash_split_complete xlrec; xlrec.old_bucket_flag = oopaque->hasho_flag; xlrec.new_bucket_flag = nopaque->hasho_flag; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete); XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD); XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE); PageSetLSN(BufferGetPage(bucket_obuf), recptr); PageSetLSN(BufferGetPage(bucket_nbuf), recptr); } END_CRIT_SECTION(); /* * If possible, clean up the old bucket. We might not be able to do this * if someone else has a pin on it, but if not then we can go ahead. This * isn't absolutely necessary, but it reduces bloat; if we don't do it * now, VACUUM will do it eventually, but maybe not until new overflow * pages have been allocated. Note that there's no need to clean up the * new bucket. */ if (IsBufferCleanupOK(bucket_obuf)) { LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); hashbucketcleanup(rel, obucket, bucket_obuf, BufferGetBlockNumber(bucket_obuf), NULL, maxbucket, highmask, lowmask, NULL, NULL, true, NULL, NULL); } else { LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK); } }
/* * Receive a message from a shared message queue. * * We set *nbytes to the message length and *data to point to the message * payload. If the entire message exists in the queue as a single, * contiguous chunk, *data will point directly into shared memory; otherwise, * it will point to a temporary buffer. This mostly avoids data copying in * the hoped-for case where messages are short compared to the buffer size, * while still allowing longer messages. In either case, the return value * remains valid until the next receive operation is perfomed on the queue. * * When nowait = false, we'll wait on our process latch when the ring buffer * is empty and we have not yet received a full message. The sender will * set our process latch after more data has been written, and we'll resume * processing. Each call will therefore return a complete message * (unless the sender detaches the queue). * * When nowait = true, we do not manipulate the state of the process latch; * instead, whenever the buffer is empty and we need to read from it, we * return SHM_MQ_WOULD_BLOCK. In this case, the caller should call this * function again after the process latch has been set. */ shm_mq_result shm_mq_receive(shm_mq_handle *mqh, Size *nbytesp, void **datap, bool nowait) { shm_mq *mq = mqh->mqh_queue; shm_mq_result res; Size rb = 0; Size nbytes; void *rawdata; Assert(mq->mq_receiver == MyProc); /* We can't receive data until the sender has attached. */ if (!mqh->mqh_counterparty_attached) { if (nowait) { if (shm_mq_get_sender(mq) == NULL) return SHM_MQ_WOULD_BLOCK; } else if (!shm_mq_wait_internal(mq, &mq->mq_sender, mqh->mqh_handle) && shm_mq_get_sender(mq) == NULL) { mq->mq_detached = true; return SHM_MQ_DETACHED; } mqh->mqh_counterparty_attached = true; } /* Consume any zero-copy data from previous receive operation. */ if (mqh->mqh_consume_pending > 0) { shm_mq_inc_bytes_read(mq, mqh->mqh_consume_pending); mqh->mqh_consume_pending = 0; } /* Try to read, or finish reading, the length word from the buffer. */ while (!mqh->mqh_length_word_complete) { /* Try to receive the message length word. */ Assert(mqh->mqh_partial_bytes < sizeof(Size)); res = shm_mq_receive_bytes(mq, sizeof(Size) - mqh->mqh_partial_bytes, nowait, &rb, &rawdata); if (res != SHM_MQ_SUCCESS) return res; /* * Hopefully, we'll receive the entire message length word at once. * But if sizeof(Size) > MAXIMUM_ALIGNOF, then it might be split over * multiple reads. */ if (mqh->mqh_partial_bytes == 0 && rb >= sizeof(Size)) { Size needed; nbytes = *(Size *) rawdata; /* If we've already got the whole message, we're done. */ needed = MAXALIGN(sizeof(Size)) + MAXALIGN(nbytes); if (rb >= needed) { /* * Technically, we could consume the message length * information at this point, but the extra write to shared * memory wouldn't be free and in most cases we would reap no * benefit. */ mqh->mqh_consume_pending = needed; *nbytesp = nbytes; *datap = ((char *) rawdata) + MAXALIGN(sizeof(Size)); return SHM_MQ_SUCCESS; } /* * We don't have the whole message, but we at least have the whole * length word. */ mqh->mqh_expected_bytes = nbytes; mqh->mqh_length_word_complete = true; shm_mq_inc_bytes_read(mq, MAXALIGN(sizeof(Size))); rb -= MAXALIGN(sizeof(Size)); } else { Size lengthbytes; /* Can't be split unless bigger than required alignment. */ Assert(sizeof(Size) > MAXIMUM_ALIGNOF); /* Message word is split; need buffer to reassemble. 
*/ if (mqh->mqh_buffer == NULL) { mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context, MQH_INITIAL_BUFSIZE); mqh->mqh_buflen = MQH_INITIAL_BUFSIZE; } Assert(mqh->mqh_buflen >= sizeof(Size)); /* Copy and consume partial length word. */ if (mqh->mqh_partial_bytes + rb > sizeof(Size)) lengthbytes = sizeof(Size) - mqh->mqh_partial_bytes; else lengthbytes = rb; memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata, lengthbytes); mqh->mqh_partial_bytes += lengthbytes; shm_mq_inc_bytes_read(mq, MAXALIGN(lengthbytes)); rb -= lengthbytes; /* If we now have the whole word, we're ready to read payload. */ if (mqh->mqh_partial_bytes >= sizeof(Size)) { Assert(mqh->mqh_partial_bytes == sizeof(Size)); mqh->mqh_expected_bytes = *(Size *) mqh->mqh_buffer; mqh->mqh_length_word_complete = true; mqh->mqh_partial_bytes = 0; } } } nbytes = mqh->mqh_expected_bytes; if (mqh->mqh_partial_bytes == 0) { /* * Try to obtain the whole message in a single chunk. If this works, * we need not copy the data and can return a pointer directly into * shared memory. */ res = shm_mq_receive_bytes(mq, nbytes, nowait, &rb, &rawdata); if (res != SHM_MQ_SUCCESS) return res; if (rb >= nbytes) { mqh->mqh_length_word_complete = false; mqh->mqh_consume_pending = MAXALIGN(nbytes); *nbytesp = nbytes; *datap = rawdata; return SHM_MQ_SUCCESS; } /* * The message has wrapped the buffer. We'll need to copy it in order * to return it to the client in one chunk. First, make sure we have * a large enough buffer available. */ if (mqh->mqh_buflen < nbytes) { Size newbuflen = Max(mqh->mqh_buflen, MQH_INITIAL_BUFSIZE); while (newbuflen < nbytes) newbuflen *= 2; if (mqh->mqh_buffer != NULL) { pfree(mqh->mqh_buffer); mqh->mqh_buffer = NULL; mqh->mqh_buflen = 0; } mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context, newbuflen); mqh->mqh_buflen = newbuflen; } } /* Loop until we've copied the entire message. */ for (;;) { Size still_needed; /* Copy as much as we can. */ Assert(mqh->mqh_partial_bytes + rb <= nbytes); memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata, rb); mqh->mqh_partial_bytes += rb; /* * Update count of bytes read, with alignment padding. Note that this * will never actually insert any padding except at the end of a * message, because the buffer size is a multiple of MAXIMUM_ALIGNOF, * and each read and write is as well. */ Assert(mqh->mqh_partial_bytes == nbytes || rb == MAXALIGN(rb)); shm_mq_inc_bytes_read(mq, MAXALIGN(rb)); /* If we got all the data, exit the loop. */ if (mqh->mqh_partial_bytes >= nbytes) break; /* Wait for some more data. */ still_needed = nbytes - mqh->mqh_partial_bytes; res = shm_mq_receive_bytes(mq, still_needed, nowait, &rb, &rawdata); if (res != SHM_MQ_SUCCESS) return res; if (rb > still_needed) rb = still_needed; } /* Return the complete message, and reset for next message. */ *nbytesp = nbytes; *datap = mqh->mqh_buffer; mqh->mqh_length_word_complete = false; mqh->mqh_partial_bytes = 0; return SHM_MQ_SUCCESS; }
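When a message has wrapped the ring and must be reassembled, the receive path grows its private buffer by doubling from the initial buffer size until the message fits, as in this small sketch (the 8192-byte starting size mirrors the MQH_INITIAL_BUFSIZE definition elsewhere in this file):

#include <stdio.h>

#define ASSUMED_MQH_INITIAL_BUFSIZE 8192

/* Compute the buffer length needed to reassemble a wrapped message. */
static size_t
reassembly_buflen(size_t current_buflen, size_t message_len)
{
	size_t		newbuflen = current_buflen;

	if (newbuflen < ASSUMED_MQH_INITIAL_BUFSIZE)
		newbuflen = ASSUMED_MQH_INITIAL_BUFSIZE;
	while (newbuflen < message_len)
		newbuflen *= 2;
	return newbuflen;
}

int
main(void)
{
	printf("%zu\n", reassembly_buflen(0, 100));		/* ->  8192 */
	printf("%zu\n", reassembly_buflen(8192, 20000)); /* -> 32768 */
	return 0;
}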
static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mq, Size nbytes,
				  const void *data, bool nowait, Size *bytes_written);
static shm_mq_result shm_mq_receive_bytes(shm_mq *mq, Size bytes_needed,
					 bool nowait, Size *nbytesp, void **datap);
static bool shm_mq_wait_internal(volatile shm_mq *mq, PGPROC *volatile * ptr,
					 BackgroundWorkerHandle *handle);
static uint64 shm_mq_get_bytes_read(volatile shm_mq *mq, bool *detached);
static void shm_mq_inc_bytes_read(volatile shm_mq *mq, Size n);
static uint64 shm_mq_get_bytes_written(volatile shm_mq *mq, bool *detached);
static void shm_mq_inc_bytes_written(volatile shm_mq *mq, Size n);
static shm_mq_result shm_mq_notify_receiver(volatile shm_mq *mq);
static void shm_mq_detach_callback(dsm_segment *seg, Datum arg);

/* Minimum queue size is enough for header and at least one chunk of data. */
const Size	shm_mq_minimum_size =
MAXALIGN(offsetof(shm_mq, mq_ring)) + MAXIMUM_ALIGNOF;

#define MQH_INITIAL_BUFSIZE		8192

/*
 * Initialize a new shared message queue.
 */
shm_mq *
shm_mq_create(void *address, Size size)
{
	shm_mq	   *mq = address;
	Size		data_offset = MAXALIGN(offsetof(shm_mq, mq_ring));

	/* If the size isn't MAXALIGN'd, just discard the odd bytes. */
	size = MAXALIGN_DOWN(size);
/*---------- * Add an item to a disk page from the sort output. * * We must be careful to observe the page layout conventions of nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. * - on non-leaf pages, the key portion of the first item need not be * stored, we should store only the link. * * A leaf page being built looks like: * * +----------------+---------------------------------+ * | PageHeaderData | linp0 linp1 linp2 ... | * +-----------+----+---------------------------------+ * | ... linpN | | * +-----------+--------------------------------------+ * | ^ last | * | | * +-------------+------------------------------------+ * | | itemN ... | * +-------------+------------------+-----------------+ * | ... item3 item2 item1 | "special space" | * +--------------------------------+-----------------+ * * Contrast this with the diagram in bufpage.h; note the mismatch * between linps and items. This is because we reserve linp0 as a * placeholder for the pointer to the "high key" item; when we have * filled up the page, we will set linp0 to point to itemN and clear * linpN. On the other hand, if we find this is the last (rightmost) * page, we leave the items alone and slide the linp array over. * * 'last' pointer indicates the last offset added to the page. *---------- */ static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) { Page npage; BlockNumber nblkno; OffsetNumber last_off; Size pgspc; Size itupsz; /* * This is a handy place to check for cancel interrupts during the btree * load phase of index creation. */ CHECK_FOR_INTERRUPTS(); npage = state->btps_page; nblkno = state->btps_blkno; last_off = state->btps_lastoff; pgspc = PageGetFreeSpace(npage); itupsz = IndexTupleDSize(*itup); itupsz = MAXALIGN(itupsz); /* * Check whether the item can fit on a btree page at all. (Eventually, we * ought to try to apply TOAST methods if not.) We actually need to be * able to fit three items on every page, so restrict any one item to 1/3 * the per-page available space. Note that at this point, itupsz doesn't * include the ItemId. * * NOTE: similar code appears in _bt_insertonpg() to defend against * oversize items being inserted into an already-existing index. But * during creation of an index, we don't go through there. */ if (itupsz > BTMaxItemSize(npage)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", itupsz, BTMaxItemSize(npage), RelationGetRelationName(wstate->index)), errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" "Consider a function index of an MD5 hash of the value, " "or use full text indexing."), errtableconstraint(wstate->heap, RelationGetRelationName(wstate->index)))); /* * Check to see if page is "full". It's definitely full if the item won't * fit. Otherwise, compare to the target freespace derived from the * fillfactor. However, we must put at least two items on each page, so * disregard fillfactor if we don't have that many. */ if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY)) { /* * Finish off the page and write it out. 
*/ Page opage = npage; BlockNumber oblkno = nblkno; ItemId ii; ItemId hii; IndexTuple oitup; /* Create new page of same level */ npage = _bt_blnewpage(state->btps_level); /* and assign it a page position */ nblkno = wstate->btws_pages_alloced++; /* * We copy the last item on the page into the new page, and then * rearrange the old page so that the 'last item' becomes its high key * rather than a true data item. There had better be at least two * items on the page already, else the page would be empty of useful * data. */ Assert(last_off > P_FIRSTKEY); ii = PageGetItemId(opage, last_off); oitup = (IndexTuple) PageGetItem(opage, ii); _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY); /* * Move 'last' into the high key position on opage */ hii = PageGetItemId(opage, P_HIKEY); *hii = *ii; ItemIdSetUnused(ii); /* redundant */ ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); /* * Link the old page into its parent, using its minimum key. If we * don't have a parent, we have to create one; this adds a new btree * level. */ if (state->btps_next == NULL) state->btps_next = _bt_pagestate(wstate, state->btps_level + 1); Assert(state->btps_minkey != NULL); ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY); _bt_buildadd(wstate, state->btps_next, state->btps_minkey); pfree(state->btps_minkey); /* * Save a copy of the minimum key for the new page. We have to copy * it off the old page, not the new one, in case we are not at leaf * level. */ state->btps_minkey = CopyIndexTuple(oitup); /* * Set the sibling links for both pages. */ { BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); oopaque->btpo_next = nblkno; nopaque->btpo_prev = oblkno; nopaque->btpo_next = P_NONE; /* redundant */ } /* * Write out the old page. We never need to touch it again, so we can * free the opage workspace too. */ _bt_blwritepage(wstate, opage, oblkno); /* * Reset last_off to point to new page */ last_off = P_FIRSTKEY; } /* * If the new item is the first for its page, stash a copy for later. Note * this will only happen for the first item on a level; on later pages, * the first item for a page is copied from the prior page in the code * above. */ if (last_off == P_HIKEY) { Assert(state->btps_minkey == NULL); state->btps_minkey = CopyIndexTuple(itup); } /* * Add the new item into the current page. */ last_off = OffsetNumberNext(last_off); _bt_sortaddtup(npage, itupsz, itup, last_off); state->btps_page = npage; state->btps_blkno = nblkno; state->btps_lastoff = last_off; }
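The decision to close out a page in _bt_buildadd is two-fold: either the item plainly does not fit, or the page has already reached its fillfactor target and holds at least the required minimum number of items. A simplified standalone version of that test, with illustrative numbers rather than real page geometry:

#include <stdbool.h>
#include <stdio.h>

/* Should the current page be finished before adding an item of this size? */
static bool
btree_page_full(size_t free_space, size_t item_size,
				size_t fillfactor_target, bool have_min_items)
{
	if (free_space < item_size)
		return true;			/* definitely full */
	if (free_space < fillfactor_target && have_min_items)
		return true;			/* past the fillfactor target */
	return false;
}

int
main(void)
{
	printf("%d\n", btree_page_full(500, 600, 800, true));	/* 1: no room        */
	printf("%d\n", btree_page_full(700, 600, 800, true));	/* 1: past target    */
	printf("%d\n", btree_page_full(700, 600, 800, false)); /* 0: must keep going */
	return 0;
}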
/* * PageAddItemExtended * * Add an item to a page. Return value is the offset at which it was * inserted, or InvalidOffsetNumber if the item is not inserted for any * reason. A WARNING is issued indicating the reason for the refusal. * * If flag PAI_OVERWRITE is set, we just store the item at the specified * offsetNumber (which must be either a currently-unused item pointer, * or one past the last existing item). Otherwise, * if offsetNumber is valid and <= current max offset in the page, * insert item into the array at that position by shuffling ItemId's * down to make room. * If offsetNumber is not valid, then assign one by finding the first * one that is both unused and deallocated. * * If flag PAI_IS_HEAP is set, we enforce that there can't be more than * MaxHeapTuplesPerPage line pointers on the page. * * If flag PAI_ALLOW_FAR_OFFSET is not set, we disallow placing items * beyond one past the last existing item. * * !!! EREPORT(ERROR) IS DISALLOWED HERE !!! */ OffsetNumber PageAddItemExtended(Page page, Item item, Size size, OffsetNumber offsetNumber, int flags) { PageHeader phdr = (PageHeader) page; Size alignedSize; int lower; int upper; ItemId itemId; OffsetNumber limit; bool needshuffle = false; /* * Be wary about corrupted page pointers */ if (phdr->pd_lower < SizeOfPageHeaderData || phdr->pd_lower > phdr->pd_upper || phdr->pd_upper > phdr->pd_special || phdr->pd_special > BLCKSZ) ereport(PANIC, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", phdr->pd_lower, phdr->pd_upper, phdr->pd_special))); /* * Select offsetNumber to place the new item at */ limit = OffsetNumberNext(PageGetMaxOffsetNumber(page)); /* was offsetNumber passed in? */ if (OffsetNumberIsValid(offsetNumber)) { /* yes, check it */ if ((flags & PAI_OVERWRITE) != 0) { if (offsetNumber < limit) { itemId = PageGetItemId(phdr, offsetNumber); if (ItemIdIsUsed(itemId) || ItemIdHasStorage(itemId)) { elog(WARNING, "will not overwrite a used ItemId"); return InvalidOffsetNumber; } } } else { if (offsetNumber < limit) needshuffle = true; /* need to move existing linp's */ } } else { /* offsetNumber was not passed in, so find a free slot */ /* if no free slot, we'll put it at limit (1st open slot) */ if (PageHasFreeLinePointers(phdr)) { /* * Look for "recyclable" (unused) ItemId. We check for no storage * as well, just to be paranoid --- unused items should never have * storage. */ for (offsetNumber = 1; offsetNumber < limit; offsetNumber++) { itemId = PageGetItemId(phdr, offsetNumber); if (!ItemIdIsUsed(itemId) && !ItemIdHasStorage(itemId)) break; } if (offsetNumber >= limit) { /* the hint is wrong, so reset it */ PageClearHasFreeLinePointers(phdr); } } else { /* don't bother searching if hint says there's no free slot */ offsetNumber = limit; } } /* * Reject placing items beyond the first unused line pointer, unless * caller asked for that behavior specifically. */ if ((flags & PAI_ALLOW_FAR_OFFSET) == 0 && offsetNumber > limit) { elog(WARNING, "specified item offset is too large"); return InvalidOffsetNumber; } /* Reject placing items beyond heap boundary, if heap */ if ((flags & PAI_IS_HEAP) != 0 && offsetNumber > MaxHeapTuplesPerPage) { elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page"); return InvalidOffsetNumber; } /* * Compute new lower and upper pointers for page, see if it'll fit. * * Note: do arithmetic as signed ints, to avoid mistakes if, say, * alignedSize > pd_upper. 
*/ if ((flags & PAI_ALLOW_FAR_OFFSET) != 0) lower = Max(phdr->pd_lower, SizeOfPageHeaderData + sizeof(ItemIdData) * offsetNumber); else if (offsetNumber == limit || needshuffle) lower = phdr->pd_lower + sizeof(ItemIdData); else lower = phdr->pd_lower; alignedSize = MAXALIGN(size); upper = (int) phdr->pd_upper - (int) alignedSize; if (lower > upper) return InvalidOffsetNumber; /* * OK to insert the item. First, shuffle the existing pointers if needed. */ itemId = PageGetItemId(phdr, offsetNumber); if (needshuffle) memmove(itemId + 1, itemId, (limit - offsetNumber) * sizeof(ItemIdData)); /* set the item pointer */ ItemIdSetNormal(itemId, upper, size); /* * Items normally contain no uninitialized bytes. Core bufpage consumers * conform, but this is not a necessary coding rule; a new index AM could * opt to depart from it. However, data type input functions and other * C-language functions that synthesize datums should initialize all * bytes; datumIsEqual() relies on this. Testing here, along with the * similar check in printtup(), helps to catch such mistakes. * * Values of the "name" type retrieved via index-only scans may contain * uninitialized bytes; see comment in btrescan(). Valgrind will report * this as an error, but it is safe to ignore. */ VALGRIND_CHECK_MEM_IS_DEFINED(item, size); /* copy the item's data onto the page */ memcpy((char *) page + upper, item, size); /* adjust page header */ phdr->pd_lower = (LocationIndex) lower; phdr->pd_upper = (LocationIndex) upper; return offsetNumber; }
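/*
 * A minimal standalone sketch (not the real bufpage.c API) of the
 * pd_lower/pd_upper arithmetic PageAddItemExtended() performs above: line
 * pointers grow up from the page header, tuple data grows down from the
 * special space, and an item fits only while lower <= upper.  The struct
 * mini_page, the helper mini_add_item(), and all of the sizes are invented
 * for the example.
 */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE		128
#define SPECIAL_SIZE	16
#define HEADER_SIZE		8
#define LINP_SIZE		4
#define ALIGN8(s)		(((s) + 7) & ~((size_t) 7))

typedef struct mini_page
{
	int		lower;				/* first byte after the line-pointer array */
	int		upper;				/* first byte of tuple data */
	char	data[PAGE_SIZE];
} mini_page;

static int
mini_add_item(mini_page *pg, const char *item, size_t size)
{
	size_t	aligned = ALIGN8(size);
	int		newlower = pg->lower + LINP_SIZE;
	int		newupper = pg->upper - (int) aligned;

	if (newlower > newupper)
		return -1;				/* does not fit */
	memcpy(pg->data + newupper, item, size);
	pg->lower = newlower;
	pg->upper = newupper;
	return newupper;			/* offset where the item was stored */
}

int
main(void)
{
	mini_page	pg = {HEADER_SIZE, PAGE_SIZE - SPECIAL_SIZE, {0}};
	const char	tup[] = "hello, page";

	while (mini_add_item(&pg, tup, sizeof(tup)) >= 0)
		;						/* fill the page until an insert is refused */
	printf("final lower=%d upper=%d free=%d\n",
		   pg.lower, pg.upper, pg.upper - pg.lower);
	return 0;
}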
/* * PGSharedMemoryCreate * * Create a shared memory segment of the given size and initialize its * standard header. Also, register an on_shmem_exit callback to release * the storage. * * Dead Postgres segments are recycled if found, but we do not fail upon * collision with non-Postgres shmem segments. The idea here is to detect and * re-use keys that may have been assigned by a crashed postmaster or backend. * * makePrivate means to always create a new segment, rather than attach to * or recycle any existing segment. * * The port number is passed for possible use as a key (for SysV, we use * it to generate the starting shmem key). In a standalone backend, * zero will be passed. */ PGShmemHeader * PGSharedMemoryCreate(Size size, bool makePrivate, int port, PGShmemHeader **shim) { IpcMemoryKey NextShmemSegID; void *memAddress; PGShmemHeader *hdr; IpcMemoryId shmid; struct stat statbuf; Size sysvsize; /* Complain if hugepages demanded but we can't possibly support them */ #if !defined(USE_ANONYMOUS_SHMEM) || !defined(MAP_HUGETLB) if (huge_pages == HUGE_PAGES_ON) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("huge pages not supported on this platform"))); #endif /* Room for a header? */ Assert(size > MAXALIGN(sizeof(PGShmemHeader))); #ifdef USE_ANONYMOUS_SHMEM AnonymousShmem = CreateAnonymousSegment(&size); AnonymousShmemSize = size; /* Register on-exit routine to unmap the anonymous segment */ on_shmem_exit(AnonymousShmemDetach, (Datum) 0); /* Now we need only allocate a minimal-sized SysV shmem block. */ sysvsize = sizeof(PGShmemHeader); #else sysvsize = size; #endif /* Make sure PGSharedMemoryAttach doesn't fail without need */ UsedShmemSegAddr = NULL; /* Loop till we find a free IPC key */ NextShmemSegID = port * 1000; for (NextShmemSegID++;; NextShmemSegID++) { /* Try to create new segment */ memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize); if (memAddress) break; /* successful create and attach */ /* Check shared memory and possibly remove and recreate */ if (makePrivate) /* a standalone backend shouldn't do this */ continue; if ((memAddress = PGSharedMemoryAttach(NextShmemSegID, &shmid)) == NULL) continue; /* can't attach, not one of mine */ /* * If I am not the creator and it belongs to an extant process, * continue. */ hdr = (PGShmemHeader *) memAddress; if (hdr->creatorPID != getpid()) { if (kill(hdr->creatorPID, 0) == 0 || errno != ESRCH) { shmdt(memAddress); continue; /* segment belongs to a live process */ } } /* * The segment appears to be from a dead Postgres process, or from a * previous cycle of life in this same process. Zap it, if possible, * and any associated dynamic shared memory segments, as well. This * probably shouldn't fail, but if it does, assume the segment belongs * to someone else after all, and continue quietly. */ if (hdr->dsm_control != 0) dsm_cleanup_using_control_segment(hdr->dsm_control); shmdt(memAddress); if (shmctl(shmid, IPC_RMID, NULL) < 0) continue; /* * Now try again to create the segment. */ memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize); if (memAddress) break; /* successful create and attach */ /* * Can only get here if some other process managed to create the same * shmem key before we did. Let him have that one, loop around to try * next key. */ } /* * OK, we created a new segment. Mark it as created by this process. The * order of assignments here is critical so that another Postgres process * can't see the header as valid but belonging to an invalid PID! 
*/ hdr = (PGShmemHeader *) memAddress; hdr->creatorPID = getpid(); hdr->magic = PGShmemMagic; hdr->dsm_control = 0; /* Fill in the data directory ID info, too */ if (stat(DataDir, &statbuf) < 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not stat data directory \"%s\": %m", DataDir))); hdr->device = statbuf.st_dev; hdr->inode = statbuf.st_ino; /* * Initialize space allocation status for segment. */ hdr->totalsize = size; hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); *shim = hdr; /* Save info for possible future use */ UsedShmemSegAddr = memAddress; UsedShmemSegID = (unsigned long) NextShmemSegID; /* * If AnonymousShmem is NULL here, then we're not using anonymous shared * memory, and should return a pointer to the System V shared memory * block. Otherwise, the System V shared memory block is only a shim, and * we must return a pointer to the real block. */ #ifdef USE_ANONYMOUS_SHMEM if (AnonymousShmem == NULL) return hdr; memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader)); return (PGShmemHeader *) AnonymousShmem; #else return hdr; #endif }
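/*
 * A simplified sketch (not the actual sysv_shmem.c logic) of the key-probing
 * loop above, reduced to plain System V calls: starting from a port-derived
 * key, keep incrementing until shmget() with IPC_CREAT | IPC_EXCL succeeds;
 * a key already owned by someone else fails with EEXIST and we move on.  The
 * function probe_for_segment() is invented for the example, and the recycling
 * of dead Postgres segments (header magic, creator-PID liveness check, dsm
 * cleanup) is deliberately omitted.
 */
#include <errno.h>
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

static int
probe_for_segment(key_t start_key, size_t size, key_t *used_key)
{
	key_t		key;

	for (key = start_key + 1;; key++)
	{
		int			shmid = shmget(key, size, IPC_CREAT | IPC_EXCL | 0600);

		if (shmid >= 0)
		{
			*used_key = key;
			return shmid;		/* fresh segment created and owned by us */
		}
		if (errno != EEXIST && errno != EACCES)
			return -1;			/* unexpected failure: give up */
		/* key is taken by an existing segment: try the next one */
	}
}

int
main(void)
{
	key_t		key;
	int			shmid = probe_for_segment((key_t) (5432 * 1000), 4096, &key);

	if (shmid >= 0)
	{
		printf("created segment %d with key %ld\n", shmid, (long) key);
		shmctl(shmid, IPC_RMID, NULL);	/* remove the demo segment again */
	}
	return 0;
}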
/* * PageRepairFragmentation * * Frees fragmented space on a page. * It doesn't remove unused line pointers! Please don't change this. * * This routine is usable for heap pages only, but see PageIndexMultiDelete. * * As a side effect, the page's PD_HAS_FREE_LINES hint bit is updated. */ void PageRepairFragmentation(Page page) { Offset pd_lower = ((PageHeader) page)->pd_lower; Offset pd_upper = ((PageHeader) page)->pd_upper; Offset pd_special = ((PageHeader) page)->pd_special; ItemId lp; int nline, nstorage, nunused; int i; Size totallen; /* * It's worth the trouble to be more paranoid here than in most places, * because we are about to reshuffle data in (what is usually) a shared * disk buffer. If we aren't careful then corrupted pointers, lengths, * etc could cause us to clobber adjacent disk buffers, spreading the data * loss further. So, check everything. */ if (pd_lower < SizeOfPageHeaderData || pd_lower > pd_upper || pd_upper > pd_special || pd_special > BLCKSZ || pd_special != MAXALIGN(pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", pd_lower, pd_upper, pd_special))); nline = PageGetMaxOffsetNumber(page); nunused = nstorage = 0; for (i = FirstOffsetNumber; i <= nline; i++) { lp = PageGetItemId(page, i); if (ItemIdIsUsed(lp)) { if (ItemIdHasStorage(lp)) nstorage++; } else { /* Unused entries should have lp_len = 0, but make sure */ ItemIdSetUnused(lp); nunused++; } } if (nstorage == 0) { /* Page is completely empty, so just reset it quickly */ ((PageHeader) page)->pd_upper = pd_special; } else { /* Need to compact the page the hard way */ itemIdSortData itemidbase[MaxHeapTuplesPerPage]; itemIdSort itemidptr = itemidbase; totallen = 0; for (i = 0; i < nline; i++) { lp = PageGetItemId(page, i + 1); if (ItemIdHasStorage(lp)) { itemidptr->offsetindex = i; itemidptr->itemoff = ItemIdGetOffset(lp); if (itemidptr->itemoff < (int) pd_upper || itemidptr->itemoff >= (int) pd_special) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted item pointer: %u", itemidptr->itemoff))); itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp)); totallen += itemidptr->alignedlen; itemidptr++; } } if (totallen > (Size) (pd_special - pd_lower)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted item lengths: total %u, available space %u", (unsigned int) totallen, pd_special - pd_lower))); compactify_tuples(itemidbase, nstorage, page); } /* Set hint bit for PageAddItem */ if (nunused > 0) PageSetHasFreeLinePointers(page); else PageClearHasFreeLinePointers(page); }
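/*
 * A minimal sketch (not the real compactify_tuples()) of the "compact the
 * page the hard way" branch above: live items are sorted by their current
 * offset in descending order and packed back-to-back against the end of the
 * page, and each slot records the item's new offset.  The struct mini_item,
 * compact_items(), and the sample offsets are invented for the example; the
 * real code moves the tuple bytes with memmove() inside a shared buffer.
 */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_END	8192		/* pack items up against this boundary */

typedef struct mini_item
{
	int		slot;				/* line-pointer position, must not change */
	int		offset;				/* current data offset on the page */
	int		len;				/* aligned length of the data */
} mini_item;

static int
cmp_offset_desc(const void *a, const void *b)
{
	return ((const mini_item *) b)->offset - ((const mini_item *) a)->offset;
}

static void
compact_items(mini_item *items, int n)
{
	int		upper = PAGE_END;
	int		i;

	/* highest old offset first, so a move never clobbers data not yet copied */
	qsort(items, n, sizeof(mini_item), cmp_offset_desc);
	for (i = 0; i < n; i++)
	{
		upper -= items[i].len;
		/* a real page would memmove() the tuple bytes to 'upper' here */
		items[i].offset = upper;
	}
}

int
main(void)
{
	mini_item	items[] = {
		{1, 7000, 100},			/* fragmented: holes between the items */
		{2, 6200, 200},
		{3, 4800, 150},
	};
	int			i;

	compact_items(items, 3);
	for (i = 0; i < 3; i++)
		printf("slot %d -> new offset %d\n", items[i].slot, items[i].offset);
	return 0;
}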
void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, LWLock *ctllock, const char *subdir) { SlruShared shared; bool found; shared = (SlruShared) ShmemInitStruct(name, SimpleLruShmemSize(nslots, nlsns), &found); if (!IsUnderPostmaster) { /* Initialize locks and shared memory area */ char *ptr; Size offset; int slotno; Assert(!found); memset(shared, 0, sizeof(SlruSharedData)); shared->ControlLock = ctllock; shared->num_slots = nslots; shared->lsn_groups_per_page = nlsns; shared->cur_lru_count = 0; /* shared->latest_page_number will be set later */ ptr = (char *) shared; offset = MAXALIGN(sizeof(SlruSharedData)); shared->page_buffer = (char **) (ptr + offset); offset += MAXALIGN(nslots * sizeof(char *)); shared->page_status = (SlruPageStatus *) (ptr + offset); offset += MAXALIGN(nslots * sizeof(SlruPageStatus)); shared->page_dirty = (bool *) (ptr + offset); offset += MAXALIGN(nslots * sizeof(bool)); shared->page_number = (int *) (ptr + offset); offset += MAXALIGN(nslots * sizeof(int)); shared->page_lru_count = (int *) (ptr + offset); offset += MAXALIGN(nslots * sizeof(int)); shared->buffer_locks = (LWLock **) (ptr + offset); offset += MAXALIGN(nslots * sizeof(LWLock *)); if (nlsns > 0) { shared->group_lsn = (XLogRecPtr *) (ptr + offset); offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); } ptr += BUFFERALIGN(offset); for (slotno = 0; slotno < nslots; slotno++) { shared->page_buffer[slotno] = ptr; shared->page_status[slotno] = SLRU_PAGE_EMPTY; shared->page_dirty[slotno] = false; shared->page_lru_count[slotno] = 0; shared->buffer_locks[slotno] = LWLockAssign(); ptr += BLCKSZ; } } else Assert(found); /* * Initialize the unshared control struct, including directory path. We * assume caller set PagePrecedes. */ ctl->shared = shared; ctl->do_fsync = true; /* default behavior */ StrNCpy(ctl->Dir, subdir, sizeof(ctl->Dir)); }
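/*
 * A self-contained sketch (not the real slru.c code) of the offset
 * arithmetic SimpleLruInit() uses above to carve one allocation into several
 * MAXALIGNed arrays: a single block is obtained and each sub-array gets an
 * aligned slice by advancing a running offset.  The struct carve_area, the
 * helper carve(), and the 8-byte alignment are assumptions for the example;
 * the real arrays of course live in shared memory, not in a malloc'd block.
 */
#include <stdio.h>
#include <stdlib.h>

#define EX_MAXALIGN(s) (((s) + 7) & ~((size_t) 7))	/* assume 8-byte alignment */

typedef struct carve_area
{
	char	   *base;
	int		   *page_number;	/* nslots ints */
	_Bool	   *page_dirty;		/* nslots flags */
	char	  **page_buffer;	/* nslots pointers */
} carve_area;

static size_t
carve(carve_area *a, int nslots)
{
	size_t		offset = 0;

	a->page_number = (int *) (a->base + offset);
	offset += EX_MAXALIGN(nslots * sizeof(int));
	a->page_dirty = (_Bool *) (a->base + offset);
	offset += EX_MAXALIGN(nslots * sizeof(_Bool));
	a->page_buffer = (char **) (a->base + offset);
	offset += EX_MAXALIGN(nslots * sizeof(char *));
	return offset;				/* total bytes consumed; every slice is aligned */
}

int
main(void)
{
	carve_area	a;
	int			nslots = 7;
	/* size the block exactly the way the slices will be carved */
	size_t		need = EX_MAXALIGN(nslots * sizeof(int)) +
					   EX_MAXALIGN(nslots * sizeof(_Bool)) +
					   EX_MAXALIGN(nslots * sizeof(char *));

	a.base = malloc(need);
	if (a.base == NULL)
		return 1;
	printf("used %zu of %zu bytes\n", carve(&a, nslots), need);
	free(a.base);
	return 0;
}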
/* * PageIsVerified * Check that the page header and checksum (if any) appear valid. * * This is called when a page has just been read in from disk. The idea is * to cheaply detect trashed pages before we go nuts following bogus item * pointers, testing invalid transaction identifiers, etc. * * It turns out to be necessary to allow zeroed pages here too. Even though * this routine is *not* called when deliberately adding a page to a relation, * there are scenarios in which a zeroed page might be found in a table. * (Example: a backend extends a relation, then crashes before it can write * any WAL entry about the new page. The kernel will already have the * zeroed page in the file, and it will stay that way after restart.) So we * allow zeroed pages here, and are careful that the page access macros * treat such a page as empty and without free space. Eventually, VACUUM * will clean up such a page and make it usable. */ bool PageIsVerified(Page page, BlockNumber blkno) { PageHeader p = (PageHeader) page; char *pagebytes; int i; bool checksum_failure = false; bool header_sane = false; bool all_zeroes = false; uint16 checksum = 0; /* * Don't verify page data unless the page passes basic non-zero test */ if (!PageIsNew(page)) { if (DataChecksumsEnabled()) { checksum = pg_checksum_page((char *) page, blkno); if (checksum != p->pd_checksum) checksum_failure = true; } /* * The following checks don't prove the header is correct, only that * it looks sane enough to allow into the buffer pool. Later usage of * the block can still reveal problems, which is why we offer the * checksum option. */ if ((p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 && p->pd_lower <= p->pd_upper && p->pd_upper <= p->pd_special && p->pd_special <= BLCKSZ && p->pd_special == MAXALIGN(p->pd_special)) header_sane = true; if (header_sane && !checksum_failure) return true; } /* Check all-zeroes case */ all_zeroes = true; pagebytes = (char *) page; for (i = 0; i < BLCKSZ; i++) { if (pagebytes[i] != 0) { all_zeroes = false; break; } } if (all_zeroes) return true; /* * Throw a WARNING if the checksum fails, but only after we've checked for * the all-zeroes case. */ if (checksum_failure) { ereport(WARNING, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("page verification failed, calculated checksum %u but expected %u", checksum, p->pd_checksum))); if (header_sane && ignore_checksum_failure) return true; } return false; }
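/*
 * A boiled-down sketch (not the real control flow in bufpage.c) of the
 * acceptance order PageIsVerified() applies above: a page is accepted when
 * its header looks sane and the checksum (if enabled) matches; failing that,
 * an entirely zeroed page is still accepted; and the checksum warning is
 * raised only after the zero-page case has been excluded.  The function
 * page_verdict() and its boolean inputs are invented for the example.
 */
#include <stdbool.h>
#include <stdio.h>

static bool
page_verdict(bool header_sane, bool checksum_ok, bool all_zeroes)
{
	if (header_sane && checksum_ok)
		return true;			/* normal, valid page */
	if (all_zeroes)
		return true;			/* never-written page: also acceptable */
	if (!checksum_ok)
		fprintf(stderr, "page verification failed: checksum mismatch\n");
	return false;
}

int
main(void)
{
	printf("%d %d %d\n",
		   page_verdict(true, true, false),		/* 1 */
		   page_verdict(false, true, true),		/* 1: zeroed page */
		   page_verdict(true, false, false));	/* 0, after a warning */
	return 0;
}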
/* * Insert the new tuple and split the page. The original buffer (lbuf) is * left untouched; a shadow copy of lbuf's page, filled with the new data, * is returned. Tuples are distributed between the two pages by equal * total size, not by equal tuple count! */ static Page entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata) { static XLogRecData rdata[2]; OffsetNumber i, maxoff, separator = InvalidOffsetNumber; Size totalsize = 0; Size lsize = 0, size; static char tupstore[2 * BLCKSZ]; char *ptr; IndexTuple itup, leftrightmost = NULL; static ginxlogSplit data; Page page; Page lpage = PageGetTempPageCopy(BufferGetPage(lbuf)); Page rpage = BufferGetPage(rbuf); Size pageSize = PageGetPageSize(lpage); *prdata = rdata; data.leftChildBlkno = (GinPageIsLeaf(lpage)) ? InvalidOffsetNumber : GinItemPointerGetBlockNumber(&(btree->entry->t_tid)); data.updateBlkno = entryPreparePage(btree, lpage, off); maxoff = PageGetMaxOffsetNumber(lpage); ptr = tupstore; for (i = FirstOffsetNumber; i <= maxoff; i++) { if (i == off) { size = MAXALIGN(IndexTupleSize(btree->entry)); memcpy(ptr, btree->entry, size); ptr += size; totalsize += size + sizeof(ItemIdData); } itup = (IndexTuple) PageGetItem(lpage, PageGetItemId(lpage, i)); size = MAXALIGN(IndexTupleSize(itup)); memcpy(ptr, itup, size); ptr += size; totalsize += size + sizeof(ItemIdData); } if (off == maxoff + 1) { size = MAXALIGN(IndexTupleSize(btree->entry)); memcpy(ptr, btree->entry, size); ptr += size; totalsize += size + sizeof(ItemIdData); } GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize); GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize); ptr = tupstore; maxoff++; lsize = 0; page = lpage; for (i = FirstOffsetNumber; i <= maxoff; i++) { itup = (IndexTuple) ptr; if (lsize > totalsize / 2) { if (separator == InvalidOffsetNumber) separator = i - 1; page = rpage; } else { leftrightmost = itup; lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData); } if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(btree->index)); ptr += MAXALIGN(IndexTupleSize(itup)); } btree->entry = copyIndexTuple(leftrightmost, lpage); ItemPointerSet(&(btree->entry)->t_tid, BufferGetBlockNumber(lbuf), InvalidOffsetNumber); btree->rightblkno = BufferGetBlockNumber(rbuf); data.node = btree->index->rd_node; data.rootBlkno = InvalidBlockNumber; data.lblkno = BufferGetBlockNumber(lbuf); data.rblkno = BufferGetBlockNumber(rbuf); data.separator = separator; data.nitem = maxoff; data.isData = FALSE; data.isLeaf = GinPageIsLeaf(lpage) ? TRUE : FALSE; data.isRootSplit = FALSE; rdata[0].buffer = InvalidBuffer; rdata[0].data = (char *) &data; rdata[0].len = sizeof(ginxlogSplit); rdata[0].next = &rdata[1]; rdata[1].buffer = InvalidBuffer; rdata[1].data = tupstore; rdata[1].len = MAXALIGN(totalsize); rdata[1].next = NULL; return lpage; }
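/*
 * A small standalone sketch (not the GIN code itself) of the split rule used
 * above: tuples are streamed in page order onto the left page until the
 * running size first exceeds half of the total, and everything from that
 * point on (the separator) goes to the right page, so the halves are
 * balanced by bytes rather than by tuple count.  choose_separator() and the
 * sample sizes are invented for the example.
 */
#include <stddef.h>
#include <stdio.h>

/* returns the index of the last tuple kept on the left page */
static int
choose_separator(const size_t *sizes, int ntuples)
{
	size_t		total = 0;
	size_t		lsize = 0;
	int			i;

	for (i = 0; i < ntuples; i++)
		total += sizes[i];
	for (i = 0; i < ntuples; i++)
	{
		if (lsize > total / 2)
			return i - 1;		/* tuple i starts the right page */
		lsize += sizes[i];
	}
	return ntuples - 1;			/* only reached for degenerate inputs */
}

int
main(void)
{
	size_t		sizes[] = {100, 200, 300, 150, 250};
	int			sep = choose_separator(sizes, 5);

	printf("left page keeps tuples 0..%d\n", sep);	/* 0..2 for this input */
	return 0;
}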
/* * PageIndexDeleteNoCompact * Delete the given items for an index page, and defragment the resulting * free space, but do not compact the item pointers array. * * itemnos is the array of tuples to delete; nitems is its size. maxIdxTuples * is the maximum number of tuples that can exist in a page. * * Unused items at the end of the array are removed. * * This is used for index AMs that require that existing TIDs of live tuples * remain unchanged. */ void PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos, int nitems) { PageHeader phdr = (PageHeader) page; LocationIndex pd_lower = phdr->pd_lower; LocationIndex pd_upper = phdr->pd_upper; LocationIndex pd_special = phdr->pd_special; int nline; bool empty; OffsetNumber offnum; int nextitm; /* * As with PageRepairFragmentation, paranoia seems justified. */ if (pd_lower < SizeOfPageHeaderData || pd_lower > pd_upper || pd_upper > pd_special || pd_special > BLCKSZ || pd_special != MAXALIGN(pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", pd_lower, pd_upper, pd_special))); /* * Scan the existing item pointer array and mark as unused those that are * in our kill-list; make sure any non-interesting ones are marked unused * as well. */ nline = PageGetMaxOffsetNumber(page); empty = true; nextitm = 0; for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum)) { ItemId lp; ItemLength itemlen; ItemOffset offset; lp = PageGetItemId(page, offnum); itemlen = ItemIdGetLength(lp); offset = ItemIdGetOffset(lp); if (ItemIdIsUsed(lp)) { if (offset < pd_upper || (offset + itemlen) > pd_special || offset != MAXALIGN(offset)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted item pointer: offset = %u, length = %u", offset, (unsigned int) itemlen))); if (nextitm < nitems && offnum == itemnos[nextitm]) { /* this one is on our list to delete, so mark it unused */ ItemIdSetUnused(lp); nextitm++; } else if (ItemIdHasStorage(lp)) { /* This one's live -- must do the compaction dance */ empty = false; } else { /* get rid of this one too */ ItemIdSetUnused(lp); } } } /* this will catch invalid or out-of-order itemnos[] */ if (nextitm != nitems) elog(ERROR, "incorrect index offsets supplied"); if (empty) { /* Page is completely empty, so just reset it quickly */ phdr->pd_lower = SizeOfPageHeaderData; phdr->pd_upper = pd_special; } else { /* There are live items: need to compact the page the hard way */ itemIdSortData itemidbase[MaxOffsetNumber]; itemIdSort itemidptr; int i; Size totallen; /* * Scan the page taking note of each item that we need to preserve. * This includes both live items (those that contain data) and * interspersed unused ones. It's critical to preserve these unused * items, because otherwise the offset numbers for later live items * would change, which is not acceptable. Unused items might get used * again later; that is fine. 
*/ itemidptr = itemidbase; totallen = 0; PageClearHasFreeLinePointers(page); for (i = 0; i < nline; i++) { ItemId lp; itemidptr->offsetindex = i; lp = PageGetItemId(page, i + 1); if (ItemIdHasStorage(lp)) { itemidptr->itemoff = ItemIdGetOffset(lp); itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp)); totallen += itemidptr->alignedlen; itemidptr++; } else { PageSetHasFreeLinePointers(page); ItemIdSetUnused(lp); } } nline = itemidptr - itemidbase; /* By here, there are exactly nline elements in itemidbase array */ if (totallen > (Size) (pd_special - pd_lower)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted item lengths: total %u, available space %u", (unsigned int) totallen, pd_special - pd_lower))); /* * Defragment the data areas of each tuple, being careful to preserve * each item's position in the linp array. */ compactify_tuples(itemidbase, nline, page); } }
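/*
 * A tiny sketch (not part of the original source) of why the line-pointer
 * array is left uncompacted above: if deleted slots were squeezed out, the
 * offset numbers of the surviving items would shift and every TID recorded
 * elsewhere (for BRIN, the revmap entries) would point at the wrong item.
 * Marking a slot unused in place keeps each survivor at its old position.
 * The slot array and its values are invented for the example.
 */
#include <stdio.h>

enum {SLOT_UNUSED = 0, SLOT_LIVE = 1};

int
main(void)
{
	int			slots[6] = {SLOT_LIVE, SLOT_LIVE, SLOT_LIVE, SLOT_LIVE, SLOT_LIVE, SLOT_LIVE};
	int			kill[] = {1, 3};	/* zero-based slots to delete */
	int			i;

	for (i = 0; i < 2; i++)
		slots[kill[i]] = SLOT_UNUSED;	/* mark unused; do NOT shift the array */

	for (i = 0; i < 6; i++)
		printf("slot %d: %s\n", i + 1, slots[i] == SLOT_LIVE ? "live" : "unused");
	return 0;
}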
/* * Form a tuple for entry tree. * * If the tuple would be too big to be stored, function throws a suitable * error if errorTooBig is TRUE, or returns NULL if errorTooBig is FALSE. * * On leaf pages, Index tuple has non-traditional layout. Tuple may contain * posting list or root blocknumber of posting tree. * Macros: GinIsPostingTree(itup) / GinSetPostingTree(itup, blkno) * 1) Posting list * - itup->t_info & INDEX_SIZE_MASK contains total size of tuple as usual * - ItemPointerGetBlockNumber(&itup->t_tid) contains original * size of tuple (without posting list). * Macros: GinGetOrigSizePosting(itup) / GinSetOrigSizePosting(itup,n) * - ItemPointerGetOffsetNumber(&itup->t_tid) contains number * of elements in posting list (number of heap itempointers) * Macros: GinGetNPosting(itup) / GinSetNPosting(itup,n) * - After standard part of tuple there is a posting list, ie, array * of heap itempointers * Macros: GinGetPosting(itup) * 2) Posting tree * - itup->t_info & INDEX_SIZE_MASK contains size of tuple as usual * - ItemPointerGetBlockNumber(&itup->t_tid) contains block number of * root of posting tree * - ItemPointerGetOffsetNumber(&itup->t_tid) contains magic number * GIN_TREE_POSTING, which distinguishes this from posting-list case * * Attributes of an index tuple are different for single and multicolumn index. * For single-column case, index tuple stores only value to be indexed. * For multicolumn case, it stores two attributes: column number of value * and value. */ IndexTuple GinFormTuple(Relation index, GinState *ginstate, OffsetNumber attnum, Datum key, ItemPointerData *ipd, uint32 nipd, bool errorTooBig) { bool isnull[2] = {FALSE, FALSE}; IndexTuple itup; uint32 newsize; if (ginstate->oneCol) itup = index_form_tuple(ginstate->origTupdesc, &key, isnull); else { Datum datums[2]; datums[0] = UInt16GetDatum(attnum); datums[1] = key; itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull); } GinSetOrigSizePosting(itup, IndexTupleSize(itup)); if (nipd > 0) { newsize = MAXALIGN(SHORTALIGN(IndexTupleSize(itup)) + sizeof(ItemPointerData) * nipd); if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize)) { if (errorTooBig) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) newsize, (unsigned long) Min(INDEX_SIZE_MASK, GinMaxItemSize), RelationGetRelationName(index)))); return NULL; } itup = repalloc(itup, newsize); /* set new size */ itup->t_info &= ~INDEX_SIZE_MASK; itup->t_info |= newsize; if (ipd) memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd); GinSetNPosting(itup, nipd); } else { /* * Gin tuple without any ItemPointers should be large enough to keep * one ItemPointer, to prevent inconsistency between * ginHeapTupleFastCollect and ginEntryInsert called by * ginHeapTupleInsert. ginHeapTupleFastCollect forms tuple without * extra pointer to heap, but ginEntryInsert (called for pending list * cleanup during vacuum) will form the same tuple with one * ItemPointer. */ newsize = MAXALIGN(SHORTALIGN(IndexTupleSize(itup)) + sizeof(ItemPointerData)); if (newsize > Min(INDEX_SIZE_MASK, GinMaxItemSize)) { if (errorTooBig) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) newsize, (unsigned long) Min(INDEX_SIZE_MASK, GinMaxItemSize), RelationGetRelationName(index)))); return NULL; } GinSetNPosting(itup, 0); } return itup; }
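/*
 * A standalone sketch (not the GIN macros themselves) of the size computed
 * above for a leaf entry that carries a posting list: the item pointers are
 * appended at a SHORTALIGN boundary after the base tuple and the result is
 * then MAXALIGNed.  The 2-byte short alignment, 8-byte max alignment and
 * 6-byte item pointers are typical values assumed for the example, and
 * posting_tuple_size() is an invented helper.
 */
#include <stddef.h>
#include <stdio.h>

#define EX_SHORTALIGN(s)	(((s) + 1) & ~((size_t) 1))
#define EX_MAXALIGN(s)		(((s) + 7) & ~((size_t) 7))
#define EX_ITEMPTR_SZ		6	/* assumed sizeof(ItemPointerData) */

static size_t
posting_tuple_size(size_t base_tuple_size, unsigned nipd)
{
	/* reserve room for one item pointer even when the list is empty */
	if (nipd == 0)
		nipd = 1;
	return EX_MAXALIGN(EX_SHORTALIGN(base_tuple_size) +
					   EX_ITEMPTR_SZ * (size_t) nipd);
}

int
main(void)
{
	printf("%zu\n", posting_tuple_size(21, 0));		/* 32: one reserved pointer */
	printf("%zu\n", posting_tuple_size(21, 10));	/* 88 */
	return 0;
}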
static Size PersistentFilespace_SharedDataSize(void) { return MAXALIGN(sizeof(PersistentFilespaceSharedData)); }
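/*
 * A minimal illustration (not the real c.h definition) of what MAXALIGN is
 * doing throughout the code above: it rounds a size up to the next multiple
 * of the platform's strictest alignment requirement, assumed to be 8 bytes
 * here, so that whatever is laid out after the rounded-up region is itself
 * safely aligned.  EX_MAXALIGN mimics the usual TYPEALIGN-style macro for
 * the example only.
 */
#include <stddef.h>
#include <stdio.h>

#define EX_ALIGNOF		8
#define EX_MAXALIGN(s)	(((size_t) (s) + (EX_ALIGNOF - 1)) & ~((size_t) (EX_ALIGNOF - 1)))

int
main(void)
{
	size_t		sizes[] = {1, 7, 8, 9, 21, 64};
	size_t		i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("MAXALIGN(%zu) = %zu\n", sizes[i], EX_MAXALIGN(sizes[i]));
	return 0;
}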
/* * Insert an index tuple into the index relation. The revmap is updated to * mark the range containing the given page as pointing to the inserted entry. * A WAL record is written. * * The buffer, if valid, is first checked for free space to insert the new * entry; if there isn't enough, a new buffer is obtained and pinned. No * buffer lock must be held on entry, no buffer lock is held on exit. * * Return value is the offset number where the tuple was inserted. */ OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, BrinTuple *tup, Size itemsz) { Page page; BlockNumber blk; OffsetNumber off; Buffer revmapbuf; ItemPointerData tid; bool extended; Assert(itemsz == MAXALIGN(itemsz)); /* If the item is oversized, don't even bother. */ if (itemsz > BrinMaxItemSize) { ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) itemsz, (unsigned long) BrinMaxItemSize, RelationGetRelationName(idxrel)))); return InvalidOffsetNumber; /* keep compiler quiet */ } /* Make sure the revmap is long enough to contain the entry we need */ brinRevmapExtend(revmap, heapBlk); /* * Acquire lock on buffer supplied by caller, if any. If it doesn't have * enough space, unpin it to obtain a new one below. */ if (BufferIsValid(*buffer)) { /* * It's possible that another backend (or ourselves!) extended the * revmap over the page we held a pin on, so we cannot assume that * it's still a regular page. */ LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz) { UnlockReleaseBuffer(*buffer); *buffer = InvalidBuffer; } } /* * If we still don't have a usable buffer, have brin_getinsertbuffer * obtain one for us. */ if (!BufferIsValid(*buffer)) { do *buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended); while (!BufferIsValid(*buffer)); } else extended = false; /* Now obtain lock on revmap buffer */ revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); page = BufferGetPage(*buffer); blk = BufferGetBlockNumber(*buffer); /* Execute the actual insertion */ START_CRIT_SECTION(); if (extended) brin_page_init(BufferGetPage(*buffer), BRIN_PAGETYPE_REGULAR); off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber, false, false); if (off == InvalidOffsetNumber) elog(ERROR, "could not insert new index tuple to page"); MarkBufferDirty(*buffer); BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u", blk, off, heapBlk)); ItemPointerSet(&tid, blk, off); brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid); MarkBufferDirty(revmapbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_insert xlrec; XLogRecPtr recptr; uint8 info; info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0); xlrec.heapBlk = heapBlk; xlrec.pagesPerRange = pagesPerRange; xlrec.offnum = off; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBrinInsert); XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0)); XLogRegisterBufData(0, (char *) tup, itemsz); XLogRegisterBuffer(1, revmapbuf, 0); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(page, recptr); PageSetLSN(BufferGetPage(revmapbuf), recptr); } END_CRIT_SECTION(); /* Tuple is firmly on buffer; we can release our locks */ LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); if (extended) FreeSpaceMapVacuum(idxrel); return off; }
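/*
 * A simplified sketch (not the real BRIN revmap machinery) of the range-map
 * update brin_doinsert() performs at the end: heap blocks are grouped into
 * ranges of pagesPerRange blocks, and each range remembers the (block,
 * offset) location of its summary tuple in the index.  The revmap array,
 * tid_stub, and revmap_set() are invented stand-ins for the example.
 */
#include <stdio.h>

#define MAX_RANGES 16

typedef struct tid_stub
{
	unsigned	blk;			/* index block holding the summary tuple */
	unsigned	off;			/* offset of that tuple within the block */
} tid_stub;

static tid_stub revmap[MAX_RANGES];

static void
revmap_set(unsigned heapBlk, unsigned pagesPerRange, unsigned idxBlk, unsigned idxOff)
{
	revmap[heapBlk / pagesPerRange] = (tid_stub) {idxBlk, idxOff};
}

int
main(void)
{
	unsigned	pagesPerRange = 128;

	/* a new summary tuple for heap blocks 256..383 was placed at (5,3) */
	revmap_set(300, pagesPerRange, 5, 3);
	printf("range %u -> (%u,%u)\n",
		   300 / pagesPerRange,
		   revmap[300 / pagesPerRange].blk,
		   revmap[300 / pagesPerRange].off);
	return 0;
}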