/* * PersistHoldablePortal * * Prepare the specified Portal for access outside of the current * transaction. When this function returns, all future accesses to the * portal must be done via the Tuplestore (not by invoking the * executor). */ void PersistHoldablePortal(Portal portal) { QueryDesc *queryDesc = PortalGetQueryDesc(portal); Portal saveActivePortal; ResourceOwner saveResourceOwner; MemoryContext savePortalContext; MemoryContext oldcxt; /* * If we're preserving a holdable portal, we had better be inside the * transaction that originally created it. */ Assert(portal->createSubid != InvalidSubTransactionId); Assert(queryDesc != NULL); /* * Caller must have created the tuplestore already ... but not a snapshot. */ Assert(portal->holdContext != NULL); Assert(portal->holdStore != NULL); Assert(portal->holdSnapshot == NULL); /* * Before closing down the executor, we must copy the tupdesc into * long-term memory, since it was created in executor memory. */ oldcxt = MemoryContextSwitchTo(portal->holdContext); portal->tupDesc = CreateTupleDescCopy(portal->tupDesc); MemoryContextSwitchTo(oldcxt); /* * Check for improper portal use, and mark portal active. */ MarkPortalActive(portal); /* * Set up global portal context pointers. */ saveActivePortal = ActivePortal; saveResourceOwner = CurrentResourceOwner; savePortalContext = PortalContext; PG_TRY(); { ActivePortal = portal; if (portal->resowner) CurrentResourceOwner = portal->resowner; PortalContext = PortalGetHeapMemory(portal); MemoryContextSwitchTo(PortalContext); PushActiveSnapshot(queryDesc->snapshot); /* * Rewind the executor: we need to store the entire result set in the * tuplestore, so that subsequent backward FETCHs can be processed. */ ExecutorRewind(queryDesc); /* * Change the destination to output to the tuplestore. Note we tell * the tuplestore receiver to detoast all data passed through it; this * makes it safe to not keep a snapshot associated with the data. */ queryDesc->dest = CreateDestReceiver(DestTuplestore); SetTuplestoreDestReceiverParams(queryDesc->dest, portal->holdStore, portal->holdContext, true); /* Fetch the result set into the tuplestore */ ExecutorRun(queryDesc, ForwardScanDirection, 0L, false); (*queryDesc->dest->rDestroy) (queryDesc->dest); queryDesc->dest = NULL; /* * Now shut down the inner executor. */ portal->queryDesc = NULL; /* prevent double shutdown */ ExecutorFinish(queryDesc); ExecutorEnd(queryDesc); FreeQueryDesc(queryDesc); /* * Set the position in the result set. */ MemoryContextSwitchTo(portal->holdContext); if (portal->atEnd) { /* * Just force the tuplestore forward to its end. The size of the * skip request here is arbitrary. */ while (tuplestore_skiptuples(portal->holdStore, 1000000, true)) /* continue */ ; } else { tuplestore_rescan(portal->holdStore); if (!tuplestore_skiptuples(portal->holdStore, portal->portalPos, true)) elog(ERROR, "unexpected end of tuple stream"); } } PG_CATCH(); { /* Uncaught error while executing portal: mark it dead */ MarkPortalFailed(portal); /* Restore global vars and propagate error */ ActivePortal = saveActivePortal; CurrentResourceOwner = saveResourceOwner; PortalContext = savePortalContext; PG_RE_THROW(); } PG_END_TRY(); MemoryContextSwitchTo(oldcxt); /* Mark portal not active */ portal->status = PORTAL_READY; ActivePortal = saveActivePortal; CurrentResourceOwner = saveResourceOwner; PortalContext = savePortalContext; PopActiveSnapshot(); /* * We can now release any subsidiary memory of the portal's heap context; * we'll never use it again. The executor already dropped its context, * but this will clean up anything that glommed onto the portal's heap via * PortalContext. */ MemoryContextDeleteChildren(PortalGetHeapMemory(portal)); }
/* * Main entry point for bgwriter process * * This is invoked from AuxiliaryProcessMain, which has already created the * basic execution environment, but not enabled signals yet. */ void BackgroundWriterMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; bool prev_hibernate; /* * Properly accept or ignore signals the postmaster might send us. * * bgwriter doesn't participate in ProcSignal signalling, but a SIGUSR1 * handler is still needed for latch wakeups. */ pqsignal(SIGHUP, BgSigHupHandler); /* set flag to read config file */ pqsignal(SIGINT, SIG_IGN); pqsignal(SIGTERM, ReqShutdownHandler); /* shutdown */ pqsignal(SIGQUIT, bg_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, bgwriter_sigusr1_handler); pqsignal(SIGUSR2, SIG_IGN); /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ sigdelset(&BlockSig, SIGQUIT); /* * Create a resource owner to keep track of our resources (currently only * buffer pins). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer"); /* * We just started, assume there has been either a shutdown or * end-of-recovery snapshot. */ last_snapshot_ts = GetCurrentTimestamp(); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ bgwriter_context = AllocSetContextCreate(TopMemoryContext, "Background Writer", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(bgwriter_context); /* * If an exception is encountered, processing resumes here. * * See notes in postgres.c about the design of this coding. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in bgwriter, but we do have LWLocks, buffers, and temp files. */ LWLockReleaseAll(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(bgwriter_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(bgwriter_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. */ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. */ smgrcloseall(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Reset hibernation state after any error. */ prev_hibernate = false; /* * Loop forever */ for (;;) { bool can_hibernate; int rc; /* Clear any already-pending wakeups */ ResetLatch(MyLatch); if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } if (shutdown_requested) { /* * From here on, elog(ERROR) should end with exit(1), not send * control back to the sigsetjmp block above */ ExitOnAnyError = true; /* Normal exit from the bgwriter is here */ proc_exit(0); /* done */ } /* * Do one cycle of dirty-buffer writing. */ can_hibernate = BgBufferSync(); /* * Send off activity statistics to the stats collector */ pgstat_send_bgwriter(); if (FirstCallSinceLastCheckpoint()) { /* * After any checkpoint, close all smgr files. This is so we * won't hang onto smgr references to deleted files indefinitely. */ smgrcloseall(); } /* * Log a new xl_running_xacts every now and then so replication can * get into a consistent state faster (think of suboverflowed * snapshots) and clean up resources (locks, KnownXids*) more * frequently. The costs of this are relatively low, so doing it 4 * times (LOG_SNAPSHOT_INTERVAL_MS) a minute seems fine. * * We assume the interval for writing xl_running_xacts is * significantly bigger than BgWriterDelay, so we don't complicate the * overall timeout handling but just assume we're going to get called * often enough even if hibernation mode is active. It's not that * important that log_snap_interval_ms is met strictly. To make sure * we're not waking the disk up unneccesarily on an idle system we * check whether there has been any WAL inserted since the last time * we've logged a running xacts. * * We do this logging in the bgwriter as its the only process thats * run regularly and returns to its mainloop all the time. E.g. * Checkpointer, when active, is barely ever in its mainloop and thus * makes it hard to log regularly. */ if (XLogStandbyInfoActive() && !RecoveryInProgress()) { TimestampTz timeout = 0; TimestampTz now = GetCurrentTimestamp(); timeout = TimestampTzPlusMilliseconds(last_snapshot_ts, LOG_SNAPSHOT_INTERVAL_MS); /* * only log if enough time has passed and some xlog record has * been inserted. */ if (now >= timeout && last_snapshot_lsn != GetXLogInsertRecPtr()) { last_snapshot_lsn = LogStandbySnapshot(); last_snapshot_ts = now; } } /* * Sleep until we are signaled or BgWriterDelay has elapsed. * * Note: the feedback control loop in BgBufferSync() expects that we * will call it every BgWriterDelay msec. While it's not critical for * correctness that that be exact, the feedback loop might misbehave * if we stray too far from that. Hence, avoid loading this process * down with latch events that are likely to happen frequently during * normal operation. */ rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, BgWriterDelay /* ms */ ); /* * If no latch event and BgBufferSync says nothing's happening, extend * the sleep in "hibernation" mode, where we sleep for much longer * than bgwriter_delay says. Fewer wakeups save electricity. When a * backend starts using buffers again, it will wake us up by setting * our latch. Because the extra sleep will persist only as long as no * buffer allocations happen, this should not distort the behavior of * BgBufferSync's control loop too badly; essentially, it will think * that the system-wide idle interval didn't exist. * * There is a race condition here, in that a backend might allocate a * buffer between the time BgBufferSync saw the alloc count as zero * and the time we call StrategyNotifyBgWriter. While it's not * critical that we not hibernate anyway, we try to reduce the odds of * that by only hibernating when BgBufferSync says nothing's happening * for two consecutive cycles. Also, we mitigate any possible * consequences of a missed wakeup by not hibernating forever. */ if (rc == WL_TIMEOUT && can_hibernate && prev_hibernate) { /* Ask for notification at next buffer allocation */ StrategyNotifyBgWriter(MyProc->pgprocno); /* Sleep ... */ rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, BgWriterDelay * HIBERNATE_FACTOR); /* Reset the notification request in case we timed out */ StrategyNotifyBgWriter(-1); } /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (rc & WL_POSTMASTER_DEATH) exit(1); prev_hibernate = can_hibernate; } }
/* * load_last_minipage * * Search through the block directory btree to find the last row that * contains the last minipage. */ static void load_last_minipage(AppendOnlyBlockDirectory *blockDirectory, int64 lastSequence, int columnGroupNo) { Relation blkdirRel = blockDirectory->blkdirRel; Relation blkdirIdx = blockDirectory->blkdirIdx; TupleDesc idxTupleDesc; TupleDesc heapTupleDesc; IndexScanDesc idxScanDesc; HeapTuple tuple = NULL; MemoryContext oldcxt; int numScanKeys = blockDirectory->numScanKeys; ScanKey scanKeys = blockDirectory->scanKeys; #ifdef USE_ASSERT_CHECKING StrategyNumber *strategyNumbers = blockDirectory->strategyNumbers; #endif /* USE_ASSERT_CHECKING */ Assert(blockDirectory->aoRel != NULL); Assert(blockDirectory->blkdirRel != NULL); Assert(blockDirectory->blkdirIdx != NULL); oldcxt = MemoryContextSwitchTo(blockDirectory->memoryContext); heapTupleDesc = RelationGetDescr(blkdirRel); idxTupleDesc = RelationGetDescr(blkdirIdx); Assert(numScanKeys == 3); Assert(blockDirectory->currentSegmentFileInfo != NULL); /* Setup the scan keys for the scan. */ Assert(scanKeys != NULL); Assert(strategyNumbers != NULL); if (lastSequence == 0) lastSequence = 1; scanKeys[0].sk_argument = Int32GetDatum(blockDirectory->currentSegmentFileNum); scanKeys[1].sk_argument = Int32GetDatum(columnGroupNo); scanKeys[2].sk_argument = Int64GetDatum(lastSequence); /* * Search the btree to find the entry in the block directory * that contains the last minipage. */ idxScanDesc = index_beginscan(blkdirRel, blkdirIdx, blockDirectory->appendOnlyMetaDataSnapshot, numScanKeys, scanKeys); tuple = index_getnext(idxScanDesc, BackwardScanDirection); if (tuple != NULL) { extract_minipage(blockDirectory, tuple, heapTupleDesc, columnGroupNo); } index_endscan(idxScanDesc); MemoryContextSwitchTo(oldcxt); if (Debug_appendonly_print_blockdirectory) ereport(LOG, (errmsg("Append-only block directory load last minipage: " "(columnGroupNo, lastSequence, nEntries) = (%d, " INT64_FORMAT ", %u)", columnGroupNo, lastSequence, blockDirectory->minipages[columnGroupNo].numMinipageEntries))); }
/* Function to return the list of grammar keywords */ Datum pg_get_keywords(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; if (SRF_IS_FIRSTCALL()) { MemoryContext oldcontext; TupleDesc tupdesc; funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); tupdesc = CreateTemplateTupleDesc(3, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "word", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "catcode", CHAROID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "catdesc", TEXTOID, -1, 0); funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc); MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); if (funcctx->call_cntr < NumScanKeywords) { char *values[3]; HeapTuple tuple; /* cast-away-const is ugly but alternatives aren't much better */ values[0] = (char *) ScanKeywords[funcctx->call_cntr].name; switch (ScanKeywords[funcctx->call_cntr].category) { case UNRESERVED_KEYWORD: values[1] = "U"; values[2] = _("unreserved"); break; case COL_NAME_KEYWORD: values[1] = "C"; values[2] = _("unreserved (cannot be function or type name)"); break; case TYPE_FUNC_NAME_KEYWORD: values[1] = "T"; values[2] = _("reserved (can be function or type name)"); break; case RESERVED_KEYWORD: values[1] = "R"; values[2] = _("reserved"); break; default: /* shouldn't be possible */ values[1] = NULL; values[2] = NULL; break; } tuple = BuildTupleFromCStrings(funcctx->attinmeta, values); SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); } SRF_RETURN_DONE(funcctx); }
static bool ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, Buffer *rootBuffer) { Buffer buffer; Page page; bool hasVoidPage = FALSE; MemoryContext oldCxt; buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, RBM_NORMAL, gvs->strategy); page = BufferGetPage(buffer); /* * We should be sure that we don't concurrent with inserts, insert process * never release root page until end (but it can unlock it and lock * again). New scan can't start but previously started ones work * concurrently. */ if (isRoot) LockBufferForCleanup(buffer); else LockBuffer(buffer, GIN_EXCLUSIVE); Assert(GinPageIsData(page)); if (GinPageIsLeaf(page)) { oldCxt = MemoryContextSwitchTo(gvs->tmpCxt); ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs); MemoryContextSwitchTo(oldCxt); MemoryContextReset(gvs->tmpCxt); /* if root is a leaf page, we don't desire further processing */ if (!isRoot && !hasVoidPage && GinDataLeafPageIsEmpty(page)) hasVoidPage = TRUE; } else { OffsetNumber i; bool isChildHasVoid = FALSE; for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++) { PostingItem *pitem = GinDataPageGetPostingItem(page, i); if (ginVacuumPostingTreeLeaves(gvs, PostingItemGetBlockNumber(pitem), FALSE, NULL)) isChildHasVoid = TRUE; } if (isChildHasVoid) hasVoidPage = TRUE; } /* * if we have root and there are empty pages in tree, then we don't * release lock to go further processing and guarantee that tree is unused */ if (!(isRoot && hasVoidPage)) { UnlockReleaseBuffer(buffer); } else { Assert(rootBuffer); *rootBuffer = buffer; } return hasVoidPage; }
/* * decode(lhs, [rhs, ret], ..., [default]) */ Datum ora_decode(PG_FUNCTION_ARGS) { int nargs; int i; int retarg; /* default value is last arg or NULL. */ nargs = PG_NARGS(); if (nargs % 2 == 0) { retarg = nargs - 1; nargs -= 1; /* ignore the last argument */ } else retarg = -1; /* NULL */ if (PG_ARGISNULL(0)) { for (i = 1; i < nargs; i += 2) { if (PG_ARGISNULL(i)) { retarg = i + 1; break; } } } else { FmgrInfo *eq; Oid collation = PG_GET_COLLATION(); /* * On first call, get the input type's operator '=' and save at * fn_extra. */ if (fcinfo->flinfo->fn_extra == NULL) { MemoryContext oldctx; Oid typid = get_fn_expr_argtype(fcinfo->flinfo, 0); Oid eqoid = equality_oper_funcid(typid); oldctx = MemoryContextSwitchTo(fcinfo->flinfo->fn_mcxt); eq = palloc(sizeof(FmgrInfo)); fmgr_info(eqoid, eq); MemoryContextSwitchTo(oldctx); fcinfo->flinfo->fn_extra = eq; } else eq = fcinfo->flinfo->fn_extra; for (i = 1; i < nargs; i += 2) { FunctionCallInfoData func; Datum result; if (PG_ARGISNULL(i)) continue; InitFunctionCallInfoData(func, eq, 2, collation, NULL, NULL); func.arg[0] = PG_GETARG_DATUM(0); func.arg[1] = PG_GETARG_DATUM(i); func.argnull[0] = false; func.argnull[1] = false; result = FunctionCallInvoke(&func); if (!func.isnull && DatumGetBool(result)) { retarg = i + 1; break; } } } if (retarg < 0 || PG_ARGISNULL(retarg)) PG_RETURN_NULL(); else PG_RETURN_DATUM(PG_GETARG_DATUM(retarg)); }
/* * This function receives a command names and a query and it produces a protocol valid command string * which can be send to the server based on the format of the result set. It handles memory contexts * appropriately, returning a pointer to the command string in the current context when called. * */ char* exec_to_command(const char* command, char* q) { StringInfoData resultbuf; char* result; int i, j, processed, retval; SPITupleTable *coltuptable; MemoryContext pre_context; MemoryContext spi_conn_context; // elog(LOG, "%s", command); //prints the current command running pre_context = CurrentMemoryContext; SetCurrentStatementStartTimestamp(); StartTransactionCommand(); SPI_connect(); PushActiveSnapshot(GetTransactionSnapshot()); retval = SPI_execute(q, false, 0); if (retval != SPI_OK_SELECT) { elog(LOG, "Database SELECT execution failed: %d", retval); SPI_finish(); PopActiveSnapshot(); CommitTransactionCommand(); result = pstrdup(""); return result; } processed = SPI_processed; coltuptable = SPI_tuptable; initStringInfo(&resultbuf); appendStringInfoString(&resultbuf, command); appendStringInfoString(&resultbuf, ";"); //artisinal semicolon if (coltuptable != NULL) { for(i = 0; i < processed; i++) { for(j = 1; j <= coltuptable->tupdesc->natts; j++) { if (SPI_getvalue(coltuptable->vals[i], coltuptable->tupdesc, j) != NULL) { appendStringInfoString(&resultbuf, SPI_getvalue(coltuptable->vals[i], coltuptable->tupdesc, j)); } appendStringInfo(&resultbuf, FIELD_DELIMIT); } appendStringInfo(&resultbuf,REC_DELIMIT); } } appendStringInfo(&resultbuf, CDELIMIT); spi_conn_context = MemoryContextSwitchTo(pre_context); result = pstrdup(resultbuf.data); MemoryContextSwitchTo(spi_conn_context); SPI_finish(); PopActiveSnapshot(); CommitTransactionCommand(); return result; }
/* * Find or create a hashtable entry for the tuple group containing the * given tuple. * * If isnew is NULL, we do not create new entries; we return NULL if no * match is found. * * If isnew isn't NULL, then a new entry is created if no existing entry * matches. On return, *isnew is true if the entry is newly created, * false if it existed already. Any extra space in a new entry has been * zeroed. */ TupleHashEntry LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot, bool *isnew) { TupleHashEntry entry; MemoryContext oldContext; TupleHashTable saveCurHT; TupleHashEntryData dummy; bool found; /* If first time through, clone the input slot to make table slot */ if (hashtable->tableslot == NULL) { TupleDesc tupdesc; oldContext = MemoryContextSwitchTo(hashtable->tablecxt); /* * We copy the input tuple descriptor just for safety --- we assume * all input tuples will have equivalent descriptors. */ tupdesc = CreateTupleDescCopy(slot->tts_tupleDescriptor); hashtable->tableslot = MakeSingleTupleTableSlot(tupdesc); MemoryContextSwitchTo(oldContext); } /* Need to run the hash functions in short-lived context */ oldContext = MemoryContextSwitchTo(hashtable->tempcxt); /* * Set up data needed by hash and match functions * * We save and restore CurTupleHashTable just in case someone manages to * invoke this code re-entrantly. */ hashtable->inputslot = slot; saveCurHT = CurTupleHashTable; CurTupleHashTable = hashtable; /* Search the hash table */ dummy.firstTuple = NULL; /* flag to reference inputslot */ entry = (TupleHashEntry) hash_search(hashtable->hashtab, &dummy, isnew ? HASH_ENTER : HASH_FIND, &found); if (isnew) { if (found) { /* found pre-existing entry */ *isnew = false; } else { /* * created new entry * * Zero any caller-requested space in the entry. (This zaps the * "key data" dynahash.c copied into the new entry, but we don't * care since we're about to overwrite it anyway.) */ MemSet(entry, 0, hashtable->entrysize); /* Copy the first tuple into the table context */ MemoryContextSwitchTo(hashtable->tablecxt); entry->firstTuple = ExecCopySlotMinimalTuple(slot); *isnew = true; } } CurTupleHashTable = saveCurHT; MemoryContextSwitchTo(oldContext); return entry; }
void init_cfg(Oid id, TSCfgInfo * cfg) { Oid arg[2]; bool isnull; Datum pars[2]; int stat, i, j; text *ptr; text *prsname = NULL; char *nsp = get_namespace(TSNSP_FunctionOid); char buf[1024]; MemoryContext oldcontext; void *plan; arg[0] = OIDOID; arg[1] = OIDOID; pars[0] = ObjectIdGetDatum(id); pars[1] = ObjectIdGetDatum(id); memset(cfg, 0, sizeof(TSCfgInfo)); SPI_connect(); sprintf(buf, "select prs_name from %s.pg_ts_cfg where oid = $1", nsp); plan = SPI_prepare(buf, 1, arg); if (!plan) ts_error(ERROR, "SPI_prepare() failed"); stat = SPI_execp(plan, pars, " ", 1); if (stat < 0) ts_error(ERROR, "SPI_execp return %d", stat); if (SPI_processed > 0) { prsname = (text *) DatumGetPointer( SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull) ); oldcontext = MemoryContextSwitchTo(TopMemoryContext); prsname = ptextdup(prsname); MemoryContextSwitchTo(oldcontext); cfg->id = id; } else ts_error(ERROR, "No tsearch cfg with id %d", id); SPI_freeplan(plan); arg[0] = TEXTOID; sprintf(buf, "select lt.tokid, map.dict_name from %s.pg_ts_cfgmap as map, %s.pg_ts_cfg as cfg, %s.token_type( $1 ) as lt where lt.alias = map.tok_alias and map.ts_name = cfg.ts_name and cfg.oid= $2 order by lt.tokid desc;", nsp, nsp, nsp); plan = SPI_prepare(buf, 2, arg); if (!plan) ts_error(ERROR, "SPI_prepare() failed"); pars[0] = PointerGetDatum(prsname); stat = SPI_execp(plan, pars, " ", 0); if (stat < 0) ts_error(ERROR, "SPI_execp return %d", stat); if (SPI_processed <= 0) ts_error(ERROR, "No parser with id %d", id); for (i = 0; i < SPI_processed; i++) { int lexid = DatumGetInt32(SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull)); ArrayType *toasted_a = (ArrayType *) PointerGetDatum(SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 2, &isnull)); ArrayType *a; if (!cfg->map) { cfg->len = lexid + 1; cfg->map = (ListDictionary *) malloc(sizeof(ListDictionary) * cfg->len); if (!cfg->map) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); memset(cfg->map, 0, sizeof(ListDictionary) * cfg->len); } if (isnull) continue; a = (ArrayType *) PointerGetDatum(PG_DETOAST_DATUM(DatumGetPointer(toasted_a))); if (ARR_NDIM(a) != 1) ts_error(ERROR, "Wrong dimension"); if (ARRNELEMS(a) < 1) continue; if (ARR_HASNULL(a)) ts_error(ERROR, "Array must not contain nulls"); cfg->map[lexid].len = ARRNELEMS(a); cfg->map[lexid].dict_id = (Datum *) malloc(sizeof(Datum) * cfg->map[lexid].len); if (!cfg->map[lexid].dict_id) ts_error(ERROR, "No memory"); memset(cfg->map[lexid].dict_id, 0, sizeof(Datum) * cfg->map[lexid].len); ptr = (text *) ARR_DATA_PTR(a); oldcontext = MemoryContextSwitchTo(TopMemoryContext); for (j = 0; j < cfg->map[lexid].len; j++) { cfg->map[lexid].dict_id[j] = PointerGetDatum(ptextdup(ptr)); ptr = NEXTVAL(ptr); } MemoryContextSwitchTo(oldcontext); if (a != toasted_a) pfree(a); } SPI_freeplan(plan); SPI_finish(); cfg->prs_id = name2id_prs(prsname); pfree(prsname); pfree(nsp); for (i = 0; i < cfg->len; i++) { for (j = 0; j < cfg->map[i].len; j++) { ptr = (text *) DatumGetPointer(cfg->map[i].dict_id[j]); cfg->map[i].dict_id[j] = ObjectIdGetDatum(name2id_dict(ptr)); pfree(ptr); } } }
/* * Add a tuple to the new heap. * * Visibility information is copied from the original tuple, except that * we "freeze" very-old tuples. Note that since we scribble on new_tuple, * it had better be temp storage not a pointer to the original tuple. * * state opaque state as returned by begin_heap_rewrite * old_tuple original tuple in the old heap * new_tuple new, rewritten tuple to be inserted to new heap */ void rewrite_heap_tuple(RewriteState state, HeapTuple old_tuple, HeapTuple new_tuple) { MemoryContext old_cxt; ItemPointerData old_tid; TidHashKey hashkey; bool found; bool free_new; old_cxt = MemoryContextSwitchTo(state->rs_cxt); /* * Copy the original tuple's visibility information into new_tuple. * * XXX we might later need to copy some t_infomask2 bits, too? Right now, * we intentionally clear the HOT status bits. */ memcpy(&new_tuple->t_data->t_choice.t_heap, &old_tuple->t_data->t_choice.t_heap, sizeof(HeapTupleFields)); new_tuple->t_data->t_infomask &= ~HEAP_XACT_MASK; new_tuple->t_data->t_infomask2 &= ~HEAP2_XACT_MASK; new_tuple->t_data->t_infomask |= old_tuple->t_data->t_infomask & HEAP_XACT_MASK; /* * While we have our hands on the tuple, we may as well freeze any * very-old xmin or xmax, so that future VACUUM effort can be saved. */ heap_freeze_tuple(new_tuple->t_data, state->rs_freeze_xid, state->rs_cutoff_multi); /* * Invalid ctid means that ctid should point to the tuple itself. We'll * override it later if the tuple is part of an update chain. */ ItemPointerSetInvalid(&new_tuple->t_data->t_ctid); /* * If the tuple has been updated, check the old-to-new mapping hash table. */ if (!((old_tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || HeapTupleHeaderIsOnlyLocked(old_tuple->t_data)) && !(ItemPointerEquals(&(old_tuple->t_self), &(old_tuple->t_data->t_ctid)))) { OldToNewMapping mapping; memset(&hashkey, 0, sizeof(hashkey)); hashkey.xmin = HeapTupleHeaderGetUpdateXid(old_tuple->t_data); hashkey.tid = old_tuple->t_data->t_ctid; mapping = (OldToNewMapping) hash_search(state->rs_old_new_tid_map, &hashkey, HASH_FIND, NULL); if (mapping != NULL) { /* * We've already copied the tuple that t_ctid points to, so we can * set the ctid of this tuple to point to the new location, and * insert it right away. */ new_tuple->t_data->t_ctid = mapping->new_tid; /* We don't need the mapping entry anymore */ hash_search(state->rs_old_new_tid_map, &hashkey, HASH_REMOVE, &found); Assert(found); } else { /* * We haven't seen the tuple t_ctid points to yet. Stash this * tuple into unresolved_tups to be written later. */ UnresolvedTup unresolved; unresolved = hash_search(state->rs_unresolved_tups, &hashkey, HASH_ENTER, &found); Assert(!found); unresolved->old_tid = old_tuple->t_self; unresolved->tuple = heap_copytuple(new_tuple); /* * We can't do anything more now, since we don't know where the * tuple will be written. */ MemoryContextSwitchTo(old_cxt); return; } } /* * Now we will write the tuple, and then check to see if it is the B tuple * in any new or known pair. When we resolve a known pair, we will be * able to write that pair's A tuple, and then we have to check if it * resolves some other pair. Hence, we need a loop here. */ old_tid = old_tuple->t_self; free_new = false; for (;;) { ItemPointerData new_tid; /* Insert the tuple and find out where it's put in new_heap */ raw_heap_insert(state, new_tuple); new_tid = new_tuple->t_self; /* * If the tuple is the updated version of a row, and the prior version * wouldn't be DEAD yet, then we need to either resolve the prior * version (if it's waiting in rs_unresolved_tups), or make an entry * in rs_old_new_tid_map (so we can resolve it when we do see it). The * previous tuple's xmax would equal this one's xmin, so it's * RECENTLY_DEAD if and only if the xmin is not before OldestXmin. */ if ((new_tuple->t_data->t_infomask & HEAP_UPDATED) && !TransactionIdPrecedes(HeapTupleHeaderGetXmin(new_tuple->t_data), state->rs_oldest_xmin)) { /* * Okay, this is B in an update pair. See if we've seen A. */ UnresolvedTup unresolved; memset(&hashkey, 0, sizeof(hashkey)); hashkey.xmin = HeapTupleHeaderGetXmin(new_tuple->t_data); hashkey.tid = old_tid; unresolved = hash_search(state->rs_unresolved_tups, &hashkey, HASH_FIND, NULL); if (unresolved != NULL) { /* * We have seen and memorized the previous tuple already. Now * that we know where we inserted the tuple its t_ctid points * to, fix its t_ctid and insert it to the new heap. */ if (free_new) heap_freetuple(new_tuple); new_tuple = unresolved->tuple; free_new = true; old_tid = unresolved->old_tid; new_tuple->t_data->t_ctid = new_tid; /* * We don't need the hash entry anymore, but don't free its * tuple just yet. */ hash_search(state->rs_unresolved_tups, &hashkey, HASH_REMOVE, &found); Assert(found); /* loop back to insert the previous tuple in the chain */ continue; } else { /* * Remember the new tid of this tuple. We'll use it to set the * ctid when we find the previous tuple in the chain. */ OldToNewMapping mapping; mapping = hash_search(state->rs_old_new_tid_map, &hashkey, HASH_ENTER, &found); Assert(!found); mapping->new_tid = new_tid; } } /* Done with this (chain of) tuples, for now */ if (free_new) heap_freetuple(new_tuple); break; } MemoryContextSwitchTo(old_cxt); }
/* * Main entry point for walwriter process * * This is invoked from AuxiliaryProcessMain, which has already created the * basic execution environment, but not enabled signals yet. */ void WalWriterMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext walwriter_context; int left_till_hibernate; bool hibernating; /* * Properly accept or ignore signals the postmaster might send us * * We have no particular use for SIGINT at the moment, but seems * reasonable to treat like SIGTERM. */ pqsignal(SIGHUP, WalSigHupHandler); /* set flag to read config file */ pqsignal(SIGINT, WalShutdownHandler); /* request shutdown */ pqsignal(SIGTERM, WalShutdownHandler); /* request shutdown */ pqsignal(SIGQUIT, wal_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, walwriter_sigusr1_handler); pqsignal(SIGUSR2, SIG_IGN); /* not used */ /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ sigdelset(&BlockSig, SIGQUIT); /* * Create a resource owner to keep track of our resources (not clear that * we need this, but may as well have one). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Writer"); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ walwriter_context = AllocSetContextCreate(TopMemoryContext, "Wal Writer", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(walwriter_context); /* * If an exception is encountered, processing resumes here. * * This code is heavily based on bgwriter.c, q.v. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in walwriter, but we do have LWLocks, and perhaps buffers? */ LWLockReleaseAll(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(walwriter_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(walwriter_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. */ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. */ smgrcloseall(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Reset hibernation state after any error. */ left_till_hibernate = LOOPS_UNTIL_HIBERNATE; hibernating = false; SetWalWriterSleeping(false); /* * Advertise our latch that backends can use to wake us up while we're * sleeping. */ ProcGlobal->walwriterLatch = &MyProc->procLatch; /* * Loop forever */ for (;;) { long cur_timeout; int rc; /* * Advertise whether we might hibernate in this cycle. We do this * before resetting the latch to ensure that any async commits will * see the flag set if they might possibly need to wake us up, and * that we won't miss any signal they send us. (If we discover work * to do in the last cycle before we would hibernate, the global flag * will be set unnecessarily, but little harm is done.) But avoid * touching the global flag if it doesn't need to change. */ if (hibernating != (left_till_hibernate <= 1)) { hibernating = (left_till_hibernate <= 1); SetWalWriterSleeping(hibernating); } /* Clear any already-pending wakeups */ ResetLatch(MyLatch); /* * Process any requests or signals received recently. */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } if (shutdown_requested) { /* Normal exit from the walwriter is here */ proc_exit(0); /* done */ } /* * Do what we're here for; then, if XLogBackgroundFlush() found useful * work to do, reset hibernation counter. */ if (XLogBackgroundFlush()) left_till_hibernate = LOOPS_UNTIL_HIBERNATE; else if (left_till_hibernate > 0) left_till_hibernate--; /* * Sleep until we are signaled or WalWriterDelay has elapsed. If we * haven't done anything useful for quite some time, lengthen the * sleep time so as to reduce the server's idle power consumption. */ if (left_till_hibernate > 0) cur_timeout = WalWriterDelay; /* in ms */ else cur_timeout = WalWriterDelay * HIBERNATE_FACTOR; rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, cur_timeout); /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (rc & WL_POSTMASTER_DEATH) exit(1); } }
/* * Opens an existing work file with the specified name, the file type, * and the compression type. * * If this fails, NULL is returned. */ ExecWorkFile * ExecWorkFile_Open(const char *fileName, ExecWorkFileType fileType, bool delOnClose, int compressType) { ExecWorkFile *workfile = NULL; void *file = NULL; int64 file_size = 0; /* * Create ExecWorkFile in the TopMemoryContext since this memory context * is still available when calling the transaction callback at the * time when the transaction aborts. */ MemoryContext oldContext = MemoryContextSwitchTo(TopMemoryContext); switch(fileType) { case BUFFILE: file = (void *)BufFileOpenFile(fileName, false, /* Create */ delOnClose, true /* interXact */ ); if (!file) { elog(ERROR, "could not open temporary file \"%s\": %m", fileName); } BufFileSetWorkfile(file); file_size = BufFileGetSize(file); break; case BFZ: file = (void *)bfz_open(fileName, delOnClose, compressType); if (!file) { elog(ERROR, "could not open temporary file \"%s\": %m", fileName); } file_size = bfz_totalbytes((bfz_t *)file); break; default: ereport(LOG, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("invalid work file type: %d", fileType))); Assert(false); } /* Failed opening existing workfile. Inform the caller */ if (NULL == file) { MemoryContextSwitchTo(oldContext); return NULL; } workfile = palloc0(sizeof(ExecWorkFile)); workfile->fileType = fileType; workfile->compressType = compressType; workfile->file = file; workfile->fileName = pstrdup(fileName); workfile->size = file_size; ExecWorkFile_SetFlags(workfile, delOnClose, false /* created */); MemoryContextSwitchTo(oldContext); return workfile; }
/* * Main entry point for checkpointer process * * This is invoked from BootstrapMain, which has already created the basic * execution environment, but not enabled signals yet. */ void CheckpointerMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext checkpointer_context; BgWriterShmem->checkpointer_pid = MyProcPid; am_checkpointer = true; /* * If possible, make this process a group leader, so that the postmaster * can signal any child processes too. (checkpointer probably never has any * child processes, but for consistency we make all postmaster child * processes do this.) */ #ifdef HAVE_SETSID if (setsid() < 0) elog(FATAL, "setsid() failed: %m"); #endif /* * Properly accept or ignore signals the postmaster might send us * * Note: we deliberately ignore SIGTERM, because during a standard Unix * system shutdown cycle, init will SIGTERM all processes at once. We * want to wait for the backends to exit, whereupon the postmaster will * tell us it's okay to shut down (via SIGUSR2). * * SIGUSR1 is presently unused; keep it spare in case someday we want this * process to participate in ProcSignal signalling. */ pqsignal(SIGHUP, ChkptSigHupHandler); /* set flag to read config file */ pqsignal(SIGINT, ReqCheckpointHandler); /* request checkpoint */ pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */ pqsignal(SIGQUIT, chkpt_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, SIG_IGN); /* reserve for ProcSignal */ pqsignal(SIGUSR2, ReqShutdownHandler); /* request shutdown */ /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ sigdelset(&BlockSig, SIGQUIT); /* * Initialize so that first time-driven event happens at the correct time. */ last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); /* * Create a resource owner to keep track of our resources (currently only * buffer pins). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Checkpointer"); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ checkpointer_context = AllocSetContextCreate(TopMemoryContext, "Checkpointer", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(checkpointer_context); /* * If an exception is encountered, processing resumes here. * * See notes in postgres.c about the design of this coding. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in checkpointer, but we do have LWLocks, buffers, and temp files. */ LWLockReleaseAll(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_Files(); AtEOXact_HashTables(false); /* Warn any waiting backends that the checkpoint failed. */ if (ckpt_active) { /* use volatile pointer to prevent code rearrangement */ volatile BgWriterShmemStruct *bgs = BgWriterShmem; SpinLockAcquire(&bgs->ckpt_lck); bgs->ckpt_failed++; bgs->ckpt_done = bgs->ckpt_started; SpinLockRelease(&bgs->ckpt_lck); ckpt_active = false; } /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(checkpointer_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(checkpointer_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. */ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. */ smgrcloseall(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Use the recovery target timeline ID during recovery */ if (RecoveryInProgress()) ThisTimeLineID = GetRecoveryTargetTLI(); /* Do this once before starting the loop, then just at SIGHUP time. */ SyncRepUpdateSyncStandbysDefined(); /* * Loop forever */ for (;;) { bool do_checkpoint = false; int flags = 0; pg_time_t now; int elapsed_secs; /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive()) exit(1); /* * Process any requests or signals received recently. */ AbsorbFsyncRequests(); if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); /* update global shmem state for sync rep */ SyncRepUpdateSyncStandbysDefined(); } if (checkpoint_requested) { checkpoint_requested = false; do_checkpoint = true; BgWriterStats.m_requested_checkpoints++; } if (shutdown_requested) { /* * From here on, elog(ERROR) should end with exit(1), not send * control back to the sigsetjmp block above */ ExitOnAnyError = true; /* Close down the database */ ShutdownXLOG(0, 0); /* Normal exit from the checkpointer is here */ proc_exit(0); /* done */ } /* * Force a checkpoint if too much time has elapsed since the last one. * Note that we count a timed checkpoint in stats only when this * occurs without an external request, but we set the CAUSE_TIME flag * bit even if there is also an external request. */ now = (pg_time_t) time(NULL); elapsed_secs = now - last_checkpoint_time; if (elapsed_secs >= CheckPointTimeout) { if (!do_checkpoint) BgWriterStats.m_timed_checkpoints++; do_checkpoint = true; flags |= CHECKPOINT_CAUSE_TIME; } /* * Do a checkpoint if requested. */ if (do_checkpoint) { bool ckpt_performed = false; bool do_restartpoint; /* use volatile pointer to prevent code rearrangement */ volatile BgWriterShmemStruct *bgs = BgWriterShmem; /* * Check if we should perform a checkpoint or a restartpoint. As a * side-effect, RecoveryInProgress() initializes TimeLineID if * it's not set yet. */ do_restartpoint = RecoveryInProgress(); /* * Atomically fetch the request flags to figure out what kind of a * checkpoint we should perform, and increase the started-counter * to acknowledge that we've started a new checkpoint. */ SpinLockAcquire(&bgs->ckpt_lck); flags |= bgs->ckpt_flags; bgs->ckpt_flags = 0; bgs->ckpt_started++; SpinLockRelease(&bgs->ckpt_lck); /* * The end-of-recovery checkpoint is a real checkpoint that's * performed while we're still in recovery. */ if (flags & CHECKPOINT_END_OF_RECOVERY) do_restartpoint = false; /* * We will warn if (a) too soon since last checkpoint (whatever * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag * since the last checkpoint start. Note in particular that this * implementation will not generate warnings caused by * CheckPointTimeout < CheckPointWarning. */ if (!do_restartpoint && (flags & CHECKPOINT_CAUSE_XLOG) && elapsed_secs < CheckPointWarning) ereport(LOG, (errmsg_plural("checkpoints are occurring too frequently (%d second apart)", "checkpoints are occurring too frequently (%d seconds apart)", elapsed_secs, elapsed_secs), errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); /* * Initialize checkpointer-private variables used during checkpoint. */ ckpt_active = true; if (!do_restartpoint) ckpt_start_recptr = GetInsertRecPtr(); ckpt_start_time = now; ckpt_cached_elapsed = 0; /* * Do the checkpoint. */ if (!do_restartpoint) { CreateCheckPoint(flags); ckpt_performed = true; } else ckpt_performed = CreateRestartPoint(flags); /* * After any checkpoint, close all smgr files. This is so we * won't hang onto smgr references to deleted files indefinitely. */ smgrcloseall(); /* * Indicate checkpoint completion to any waiting backends. */ SpinLockAcquire(&bgs->ckpt_lck); bgs->ckpt_done = bgs->ckpt_started; SpinLockRelease(&bgs->ckpt_lck); if (ckpt_performed) { /* * Note we record the checkpoint start time not end time as * last_checkpoint_time. This is so that time-driven * checkpoints happen at a predictable spacing. */ last_checkpoint_time = now; } else { /* * We were not able to perform the restartpoint (checkpoints * throw an ERROR in case of error). Most likely because we * have not received any new checkpoint WAL records since the * last restartpoint. Try again in 15 s. */ last_checkpoint_time = now - CheckPointTimeout + 15; } ckpt_active = false; } /* * Send off activity statistics to the stats collector */ pgstat_send_bgwriter(); /* * Nap for a while and then loop again. Later patches will replace * this with a latch loop. Keep it simple now for clarity. * Relatively long sleep because the bgwriter does cleanup now. */ pg_usleep(500000L); /* Check for archive_timeout and switch xlog files if necessary. */ CheckArchiveTimeout(); } }
/* * PerformCursorOpen * Execute SQL DECLARE CURSOR command. */ void PerformCursorOpen(DeclareCursorStmt *cstmt, ParamListInfo params, const char *queryString, bool isTopLevel) { Query *query = castNode(Query, cstmt->query); List *rewritten; PlannedStmt *plan; Portal portal; MemoryContext oldContext; /* * Disallow empty-string cursor name (conflicts with protocol-level * unnamed portal). */ if (!cstmt->portalname || cstmt->portalname[0] == '\0') ereport(ERROR, (errcode(ERRCODE_INVALID_CURSOR_NAME), errmsg("invalid cursor name: must not be empty"))); /* * If this is a non-holdable cursor, we require that this statement has * been executed inside a transaction block (or else, it would have no * user-visible effect). */ if (!(cstmt->options & CURSOR_OPT_HOLD)) RequireTransactionChain(isTopLevel, "DECLARE CURSOR"); /* * Parse analysis was done already, but we still have to run the rule * rewriter. We do not do AcquireRewriteLocks: we assume the query either * came straight from the parser, or suitable locks were acquired by * plancache.c. * * Because the rewriter and planner tend to scribble on the input, we make * a preliminary copy of the source querytree. This prevents problems in * the case that the DECLARE CURSOR is in a portal or plpgsql function and * is executed repeatedly. (See also the same hack in EXPLAIN and * PREPARE.) XXX FIXME someday. */ rewritten = QueryRewrite((Query *) copyObject(query)); /* SELECT should never rewrite to more or less than one query */ if (list_length(rewritten) != 1) elog(ERROR, "non-SELECT statement in DECLARE CURSOR"); query = castNode(Query, linitial(rewritten)); if (query->commandType != CMD_SELECT) elog(ERROR, "non-SELECT statement in DECLARE CURSOR"); /* Plan the query, applying the specified options */ plan = pg_plan_query(query, cstmt->options, params); /* * Create a portal and copy the plan and queryString into its memory. */ portal = CreatePortal(cstmt->portalname, false, false); oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(portal)); plan = copyObject(plan); queryString = pstrdup(queryString); PortalDefineQuery(portal, NULL, queryString, "SELECT", /* cursor's query is always a SELECT */ list_make1(plan), NULL); /*---------- * Also copy the outer portal's parameter list into the inner portal's * memory context. We want to pass down the parameter values in case we * had a command like * DECLARE c CURSOR FOR SELECT ... WHERE foo = $1 * This will have been parsed using the outer parameter set and the * parameter value needs to be preserved for use when the cursor is * executed. *---------- */ params = copyParamList(params); MemoryContextSwitchTo(oldContext); /* * Set up options for portal. * * If the user didn't specify a SCROLL type, allow or disallow scrolling * based on whether it would require any additional runtime overhead to do * so. Also, we disallow scrolling for FOR UPDATE cursors. */ portal->cursorOptions = cstmt->options; if (!(portal->cursorOptions & (CURSOR_OPT_SCROLL | CURSOR_OPT_NO_SCROLL))) { if (plan->rowMarks == NIL && ExecSupportsBackwardScan(plan->planTree)) portal->cursorOptions |= CURSOR_OPT_SCROLL; else portal->cursorOptions |= CURSOR_OPT_NO_SCROLL; } /* * Start execution, inserting parameters if any. */ PortalStart(portal, params, 0, GetActiveSnapshot()); Assert(portal->strategy == PORTAL_ONE_SELECT); /* * We're done; the query won't actually be run until PerformPortalFetch is * called. */ }
/* * Execute the index scan. * * This works by reading index TIDs from the revmap, and obtaining the index * tuples pointed to by them; the summary values in the index tuples are * compared to the scan keys. We return into the TID bitmap all the pages in * ranges corresponding to index tuples that match the scan keys. * * If a TID from the revmap is read as InvalidTID, we know that range is * unsummarized. Pages in those ranges need to be returned regardless of scan * keys. * * XXX see _bt_first on what to do about sk_subtype. */ Datum bringetbitmap(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1); Relation idxRel = scan->indexRelation; Buffer buf = InvalidBuffer; BrinDesc *bdesc; Oid heapOid; Relation heapRel; BrinOpaque *opaque; BlockNumber nblocks; BlockNumber heapBlk; int totalpages = 0; int keyno; FmgrInfo *consistentFn; MemoryContext oldcxt; MemoryContext perRangeCxt; opaque = (BrinOpaque *) scan->opaque; bdesc = opaque->bo_bdesc; pgstat_count_index_scan(idxRel); /* * We need to know the size of the table so that we know how long to * iterate on the revmap. */ heapOid = IndexGetRelation(RelationGetRelid(idxRel), false); heapRel = heap_open(heapOid, AccessShareLock); nblocks = RelationGetNumberOfBlocks(heapRel); heap_close(heapRel, AccessShareLock); /* * Obtain consistent functions for all indexed column. Maybe it'd be * possible to do this lazily only the first time we see a scan key that * involves each particular attribute. */ consistentFn = palloc(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts); for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { FmgrInfo *tmp; tmp = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_CONSISTENT); fmgr_info_copy(&consistentFn[keyno], tmp, CurrentMemoryContext); } /* * Setup and use a per-range memory context, which is reset every time we * loop below. This avoids having to free the tuples within the loop. */ perRangeCxt = AllocSetContextCreate(CurrentMemoryContext, "bringetbitmap cxt", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); oldcxt = MemoryContextSwitchTo(perRangeCxt); /* * Now scan the revmap. We start by querying for heap page 0, * incrementing by the number of pages per range; this gives us a full * view of the table. */ for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange) { bool addrange; BrinTuple *tup; OffsetNumber off; Size size; CHECK_FOR_INTERRUPTS(); MemoryContextResetAndDeleteChildren(perRangeCxt); tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf, &off, &size, BUFFER_LOCK_SHARE); if (tup) { tup = brin_copy_tuple(tup, size); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } /* * For page ranges with no indexed tuple, we must return the whole * range; otherwise, compare it to the scan keys. */ if (tup == NULL) { addrange = true; } else { BrinMemTuple *dtup; int keyno; dtup = brin_deform_tuple(bdesc, tup); if (dtup->bt_placeholder) { /* * Placeholder tuples are always returned, regardless of the * values stored in them. */ addrange = true; } else { /* * Compare scan keys with summary values stored for the range. * If scan keys are matched, the page range must be added to * the bitmap. We initially assume the range needs to be * added; in particular this serves the case where there are * no keys. */ addrange = true; for (keyno = 0; keyno < scan->numberOfKeys; keyno++) { ScanKey key = &scan->keyData[keyno]; AttrNumber keyattno = key->sk_attno; BrinValues *bval = &dtup->bt_columns[keyattno - 1]; Datum add; /* * The collation of the scan key must match the collation * used in the index column (but only if the search is not * IS NULL/ IS NOT NULL). Otherwise we shouldn't be using * this index ... */ Assert((key->sk_flags & SK_ISNULL) || (key->sk_collation == bdesc->bd_tupdesc->attrs[keyattno - 1]->attcollation)); /* * Check whether the scan key is consistent with the page * range values; if so, have the pages in the range added * to the output bitmap. * * When there are multiple scan keys, failure to meet the * criteria for a single one of them is enough to discard * the range as a whole, so break out of the loop as soon * as a false return value is obtained. */ add = FunctionCall3Coll(&consistentFn[keyattno - 1], key->sk_collation, PointerGetDatum(bdesc), PointerGetDatum(bval), PointerGetDatum(key)); addrange = DatumGetBool(add); if (!addrange) break; } } } /* add the pages in the range to the output bitmap, if needed */ if (addrange) { BlockNumber pageno; for (pageno = heapBlk; pageno <= heapBlk + opaque->bo_pagesPerRange - 1; pageno++) { MemoryContextSwitchTo(oldcxt); tbm_add_page(tbm, pageno); totalpages++; MemoryContextSwitchTo(perRangeCxt); } } } MemoryContextSwitchTo(oldcxt); MemoryContextDelete(perRangeCxt); if (buf != InvalidBuffer) ReleaseBuffer(buf); /* * XXX We have an approximation of the number of *pages* that our scan * returns, but we don't have a precise idea of the number of heap tuples * involved. */ PG_RETURN_INT64(totalpages * 10); }
void dumpSharedComboCommandId(TransactionId xmin, CommandId cmin, CommandId cmax, CommandId combocid) { /* * In any given segment, there are many readers, but only one writer. The * combo cid file information is stored in the MyProc of the writer process, * and is referenced by reader process via lockHolderProcPtr. The writer * will setup and/or dump combocids to a combo cid file when appropriate. * The writer keeps track of the number of entries in the combo cid file in * MyProc->combocid_map_count. Readers reference the count via * lockHolderProcPtr->combocid_map_count. * * Since combo cid file entries are always appended to the end of a combo * cid file and because there is only one writer, it is not necessary to * lock the combo cid file during reading or writing. A new combo cid will * not become visable to the reader until the combocid_map_count variable * has been incremented. */ ComboCidEntryData entry; Assert(Gp_role != GP_ROLE_EXECUTE || Gp_is_writer); if (combocid_map == NULL) { /* This is the first time a combo cid is to be written by this writer. */ MemoryContext oldCtx; char path[MAXPGPATH]; MyProc->combocid_map_count = 0; ComboCidMapName(path, gp_session_id, MyProc->pid); /* open our file, as appropriate: this will throw an error if the create-fails. */ oldCtx = MemoryContextSwitchTo(TopMemoryContext); /* * XXX: We could probably close and delete the file at the end of * transaction. We would then need to keep combocid_map_count * synchronized with open files at (sub-) xact boundaries. */ combocid_map = BufFileCreateTemp_ReaderWriter(path, true, true); MemoryContextSwitchTo(oldCtx); } Assert(combocid_map != NULL); /* Seek to the end: BufFileSeek() doesn't support SEEK_END! */ /* build our entry */ memset(&entry, 0, sizeof(entry)); entry.key.cmin = cmin; entry.key.cmax = cmax; entry.key.xmin = xmin; entry.combocid = combocid; /* write our entry */ if (BufFileWrite(combocid_map, &entry, sizeof(entry)) != sizeof(entry)) { elog(ERROR, "Combocid map I/O error!"); } /* flush our output */ BufFileFlush(combocid_map); /* Increment combocid count to make new combocid visible to Readers */ MyProc->combocid_map_count += 1; }
/* * A tuple in the heap is being inserted. To keep a brin index up to date, * we need to obtain the relevant index tuple and compare its stored values * with those of the new tuple. If the tuple values are not consistent with * the summary tuple, we need to update the index tuple. * * If the range is not currently summarized (i.e. the revmap returns NULL for * it), there's nothing to do. */ Datum brininsert(PG_FUNCTION_ARGS) { Relation idxRel = (Relation) PG_GETARG_POINTER(0); Datum *values = (Datum *) PG_GETARG_POINTER(1); bool *nulls = (bool *) PG_GETARG_POINTER(2); ItemPointer heaptid = (ItemPointer) PG_GETARG_POINTER(3); /* we ignore the rest of our arguments */ BlockNumber pagesPerRange; BrinDesc *bdesc = NULL; BrinRevmap *revmap; Buffer buf = InvalidBuffer; MemoryContext tupcxt = NULL; MemoryContext oldcxt = NULL; revmap = brinRevmapInitialize(idxRel, &pagesPerRange); for (;;) { bool need_insert = false; OffsetNumber off; BrinTuple *brtup; BrinMemTuple *dtup; BlockNumber heapBlk; int keyno; #ifdef USE_ASSERT_CHECKING BrinTuple *tmptup; BrinMemTuple *tmpdtup; Size tmpsiz; #endif CHECK_FOR_INTERRUPTS(); heapBlk = ItemPointerGetBlockNumber(heaptid); /* normalize the block number to be the first block in the range */ heapBlk = (heapBlk / pagesPerRange) * pagesPerRange; brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, BUFFER_LOCK_SHARE); /* if range is unsummarized, there's nothing to do */ if (!brtup) break; /* First time through? */ if (bdesc == NULL) { bdesc = brin_build_desc(idxRel); tupcxt = AllocSetContextCreate(CurrentMemoryContext, "brininsert cxt", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); oldcxt = MemoryContextSwitchTo(tupcxt); } dtup = brin_deform_tuple(bdesc, brtup); #ifdef USE_ASSERT_CHECKING { /* * When assertions are enabled, we use this as an opportunity to * test the "union" method, which would otherwise be used very * rarely: first create a placeholder tuple, and addValue the * value we just got into it. Then union the existing index tuple * with the updated placeholder tuple. The tuple resulting from * that union should be identical to the one resulting from the * regular operation (straight addValue) below. * * Here we create the tuple to compare with; the actual comparison * is below. */ tmptup = brin_form_placeholder_tuple(bdesc, heapBlk, &tmpsiz); tmpdtup = brin_deform_tuple(bdesc, tmptup); for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { BrinValues *bval; FmgrInfo *addValue; bval = &tmpdtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); } union_tuples(bdesc, tmpdtup, brtup); tmpdtup->bt_placeholder = dtup->bt_placeholder; tmptup = brin_form_tuple(bdesc, heapBlk, tmpdtup, &tmpsiz); } #endif /* * Compare the key values of the new tuple to the stored index values; * our deformed tuple will get updated if the new tuple doesn't fit * the original range (note this means we can't break out of the loop * early). Make a note of whether this happens, so that we know to * insert the modified tuple later. */ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { Datum result; BrinValues *bval; FmgrInfo *addValue; bval = &dtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); result = FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); /* if that returned true, we need to insert the updated tuple */ need_insert |= DatumGetBool(result); } #ifdef USE_ASSERT_CHECKING { /* * Now we can compare the tuple produced by the union function * with the one from plain addValue. */ BrinTuple *cmptup; Size cmpsz; cmptup = brin_form_tuple(bdesc, heapBlk, dtup, &cmpsz); Assert(brin_tuples_equal(tmptup, tmpsiz, cmptup, cmpsz)); } #endif if (!need_insert) { /* * The tuple is consistent with the new values, so there's nothing * to do. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else { Page page = BufferGetPage(buf); ItemId lp = PageGetItemId(page, off); Size origsz; BrinTuple *origtup; Size newsz; BrinTuple *newtup; bool samepage; /* * Make a copy of the old tuple, so that we can compare it after * re-acquiring the lock. */ origsz = ItemIdGetLength(lp); origtup = brin_copy_tuple(brtup, origsz); /* * Before releasing the lock, check if we can attempt a same-page * update. Another process could insert a tuple concurrently in * the same page though, so downstream we must be prepared to cope * if this turns out to not be possible after all. */ newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); samepage = brin_can_do_samepage_update(buf, origsz, newsz); LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* * Try to update the tuple. If this doesn't work for whatever * reason, we need to restart from the top; the revmap might be * pointing at a different tuple for this block now, so we need to * recompute to ensure both our new heap tuple and the other * inserter's are covered by the combined tuple. It might be that * we don't need to update at all. */ if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, buf, off, origtup, origsz, newtup, newsz, samepage)) { /* no luck; start over */ MemoryContextResetAndDeleteChildren(tupcxt); continue; } } /* success! */ break; } brinRevmapTerminate(revmap); if (BufferIsValid(buf)) ReleaseBuffer(buf); if (bdesc != NULL) { brin_free_desc(bdesc); MemoryContextSwitchTo(oldcxt); MemoryContextDelete(tupcxt); } return BoolGetDatum(false); }
void loadSharedComboCommandId(TransactionId xmin, CommandId combocid, CommandId *cmin, CommandId *cmax) { bool found = false; ComboCidEntryData entry; int i; Assert(Gp_role == GP_ROLE_EXECUTE); Assert(!Gp_is_writer); Assert(cmin != NULL); Assert(cmax != NULL); if (lockHolderProcPtr == NULL) { /* get lockholder! */ elog(ERROR, "loadSharedComboCommandId: NO LOCK HOLDER POINTER."); } if (combocid_map == NULL) { MemoryContext oldCtx; char path[MAXPGPATH]; ComboCidMapName(path, gp_session_id, lockHolderProcPtr->pid); /* open our file, as appropriate: this will throw an error if the create-fails. */ oldCtx = MemoryContextSwitchTo(TopMemoryContext); combocid_map = BufFileCreateTemp_ReaderWriter(path, false, true); MemoryContextSwitchTo(oldCtx); } Assert(combocid_map != NULL); /* Seek to the beginning to start our search ? */ if (BufFileSeek(combocid_map, 0 /* fileno */, 0 /* offset */, SEEK_SET) != 0) { elog(ERROR, "loadSharedComboCommandId: seek to beginning failed."); } /* * Read this entry in ... * * We're going to read in the entire table, caching all occurrences of * our xmin. */ for (i = 0; i < lockHolderProcPtr->combocid_map_count; i++) { if (BufFileRead(combocid_map, &entry, sizeof(ComboCidEntryData)) != sizeof(ComboCidEntryData)) { elog(ERROR, "loadSharedComboCommandId: read failed I/O error."); } if (entry.key.xmin == xmin) { bool cached = false; readerComboCidKeyData reader_key; readerComboCidEntryData *reader_entry; memset(&reader_key, 0, sizeof(reader_key)); reader_key.writer_pid = lockHolderProcPtr->pid; reader_key.xmin = entry.key.xmin; reader_key.session = gp_session_id; reader_key.combocid = entry.combocid; reader_entry = (readerComboCidEntryData *) hash_search(readerComboHash, &reader_key, HASH_ENTER, &cached); if (!cached) { reader_entry->cmin = entry.key.cmin; reader_entry->cmax = entry.key.cmax; } /* * This was our entry -- we're going to continue our scan, * to pull in any additional entries for our xmin */ if (entry.combocid == combocid) { *cmin = entry.key.cmin; *cmax = entry.key.cmax; found = true; } } } if (!found) { elog(ERROR, "loadSharedComboCommandId: no combocid entry found for %u/%u", xmin, combocid); } }
/* * Implements the 'EXECUTE' utility statement. */ void ExecuteQuery(ExecuteStmt *stmt, DestReceiver *dest) { PreparedStatement *entry; char *query_string; List *query_list, *plan_list; MemoryContext qcontext; ParamListInfo paramLI = NULL; EState *estate = NULL; Portal portal; /* Look it up in the hash table */ entry = FetchPreparedStatement(stmt->name, true); query_string = entry->query_string; query_list = entry->query_list; plan_list = entry->plan_list; qcontext = entry->context; Assert(length(query_list) == length(plan_list)); /* Evaluate parameters, if any */ if (entry->argtype_list != NIL) { /* * Need an EState to evaluate parameters; must not delete it till * end of query, in case parameters are pass-by-reference. */ estate = CreateExecutorState(); paramLI = EvaluateParams(estate, stmt->params, entry->argtype_list); } /* * Create a new portal to run the query in */ portal = CreateNewPortal(); /* * For CREATE TABLE / AS EXECUTE, make a copy of the stored query so * that we can modify its destination (yech, but this has always been * ugly). For regular EXECUTE we can just use the stored query where * it sits, since the executor is read-only. */ if (stmt->into) { MemoryContext oldContext; Query *query; oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(portal)); if (query_string) query_string = pstrdup(query_string); query_list = copyObject(query_list); plan_list = copyObject(plan_list); qcontext = PortalGetHeapMemory(portal); if (length(query_list) != 1) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("prepared statement is not a SELECT"))); query = (Query *) lfirst(query_list); if (query->commandType != CMD_SELECT) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("prepared statement is not a SELECT"))); query->into = copyObject(stmt->into); MemoryContextSwitchTo(oldContext); } PortalDefineQuery(portal, query_string, entry->commandTag, query_list, plan_list, qcontext); /* * Run the portal to completion. */ PortalStart(portal, paramLI); (void) PortalRun(portal, FETCH_ALL, dest, dest, NULL); PortalDrop(portal, false); if (estate) FreeExecutorState(estate); /* No need to pfree other memory, MemoryContext will be reset */ }
/* ---------------------------------------------------------------- * ValuesNext * * This is a workhorse for ExecValuesScan * ---------------------------------------------------------------- */ static TupleTableSlot * ValuesNext(ValuesScanState *node) { TupleTableSlot *slot; EState *estate; ExprContext *econtext; ScanDirection direction; List *exprlist; /* * get information from the estate and scan state */ estate = node->ss.ps.state; direction = estate->es_direction; slot = node->ss.ss_ScanTupleSlot; econtext = node->rowcontext; /* * Get the next tuple. Return NULL if no more tuples. */ if (ScanDirectionIsForward(direction)) { if (node->curr_idx < node->array_len) node->curr_idx++; if (node->curr_idx < node->array_len) exprlist = node->exprlists[node->curr_idx]; else exprlist = NIL; } else { if (node->curr_idx >= 0) node->curr_idx--; if (node->curr_idx >= 0) exprlist = node->exprlists[node->curr_idx]; else exprlist = NIL; } /* * Always clear the result slot; this is appropriate if we are at the end * of the data, and if we're not, we still need it as the first step of * the store-virtual-tuple protocol. It seems wise to clear the slot * before we reset the context it might have pointers into. */ ExecClearTuple(slot); if (exprlist) { MemoryContext oldContext; List *exprstatelist; Datum *values; bool *isnull; ListCell *lc; int resind; /* * Get rid of any prior cycle's leftovers. We use ReScanExprContext * not just ResetExprContext because we want any registered shutdown * callbacks to be called. */ ReScanExprContext(econtext); /* * Build the expression eval state in the econtext's per-tuple memory. * This is a tad unusual, but we want to delete the eval state again * when we move to the next row, to avoid growth of memory * requirements over a long values list. */ oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); /* * Pass NULL, not my plan node, because we don't want anything in this * transient state linking into permanent state. The only possibility * is a SubPlan, and there shouldn't be any (any subselects in the * VALUES list should be InitPlans). */ exprstatelist = (List *) ExecInitExpr((Expr *) exprlist, NULL); /* parser should have checked all sublists are the same length */ Assert(list_length(exprstatelist) == slot->tts_tupleDescriptor->natts); /* * Compute the expressions and build a virtual result tuple. We * already did ExecClearTuple(slot). */ values = slot->tts_values; isnull = slot->tts_isnull; resind = 0; foreach(lc, exprstatelist) { ExprState *estate = (ExprState *) lfirst(lc); values[resind] = ExecEvalExpr(estate, econtext, &isnull[resind], NULL); resind++; } MemoryContextSwitchTo(oldContext); /* * And return the virtual tuple. */ ExecStoreVirtualTuple(slot); }
Datum pg_tablespace_databases(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; struct dirent *de; ts_db_fctx *fctx; if (SRF_IS_FIRSTCALL()) { MemoryContext oldcontext; Oid tablespaceOid = PG_GETARG_OID(0); funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); fctx = palloc(sizeof(ts_db_fctx)); if (tablespaceOid == GLOBALTABLESPACE_OID) { fctx->dirdesc = NULL; ereport(WARNING, (errmsg("global tablespace never has databases"))); } else { if (tablespaceOid == DEFAULTTABLESPACE_OID) fctx->location = psprintf("base"); else fctx->location = psprintf("pg_tblspc/%u/%s", tablespaceOid, TABLESPACE_VERSION_DIRECTORY); fctx->dirdesc = AllocateDir(fctx->location); if (!fctx->dirdesc) { /* the only expected error is ENOENT */ if (errno != ENOENT) ereport(ERROR, (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", fctx->location))); ereport(WARNING, (errmsg("%u is not a tablespace OID", tablespaceOid))); } } funcctx->user_fctx = fctx; MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); fctx = (ts_db_fctx *) funcctx->user_fctx; if (!fctx->dirdesc) /* not a tablespace */ SRF_RETURN_DONE(funcctx); while ((de = ReadDir(fctx->dirdesc, fctx->location)) != NULL) { char *subdir; DIR *dirdesc; Oid datOid = atooid(de->d_name); /* this test skips . and .., but is awfully weak */ if (!datOid) continue; /* if database subdir is empty, don't report tablespace as used */ subdir = psprintf("%s/%s", fctx->location, de->d_name); dirdesc = AllocateDir(subdir); while ((de = ReadDir(dirdesc, subdir)) != NULL) { if (strcmp(de->d_name, ".") != 0 && strcmp(de->d_name, "..") != 0) break; } FreeDir(dirdesc); pfree(subdir); if (!de) continue; /* indeed, nothing in it */ SRF_RETURN_NEXT(funcctx, ObjectIdGetDatum(datOid)); } FreeDir(fctx->dirdesc); SRF_RETURN_DONE(funcctx); }
static void ServiceListenLoop(ServiceCtrl *serviceCtrl) { ServiceConfig *serviceConfig = (ServiceConfig*)serviceCtrl->serviceConfig; uint8 *inputBuff; int n, highsock = 0, newsockfd; mpp_fd_set rset, rrset; struct sockaddr_in addr; socklen_t addrlen; List *connectedSockets = NIL; ListCell *cell; Assert(TopMemoryContext != NULL); MemoryContextSwitchTo(TopMemoryContext); Assert(CurrentMemoryContext == TopMemoryContext); /* * Setup scratch buffer. */ inputBuff = palloc(serviceConfig->requestLen); MPP_FD_ZERO(&rset); MPP_FD_SET(serviceCtrl->listenerFd, &rset); highsock = serviceCtrl->listenerFd + 1; /* we'll handle many incoming sockets but keep the sockets in blocking * mode since we are dealing with very small messages. */ while(true) { struct timeval shutdownTimeout = {1,0}; // 1 second. // Use local variable since select modifies // the timeout parameter with remaining time. CHECK_FOR_INTERRUPTS(); if (serviceConfig->ServiceShutdownRequested()) { if (serviceConfig->ServiceShutdown != NULL) { serviceConfig->ServiceShutdown(); } break; } /* no need to live on if postmaster has died */ if (!PostmasterIsAlive(true)) { if (serviceConfig->ServicePostmasterDied != NULL) { serviceConfig->ServicePostmasterDied(); } else { ereport(LOG, (errmsg("exiting because postmaster has died"))); proc_exit(1); } } memcpy(&rrset, &rset, sizeof(mpp_fd_set)); n = select(highsock + 1, (fd_set *)&rrset, NULL, NULL, &shutdownTimeout); if (n == 0 || (n < 0 && errno == EINTR)) { /* intr or timeout: Have we been here too long ? */ continue; } if (n < 0) { /* this may be a little severe, but if we error on select() * we'll just go ahead and blow up. This will result in the * postmaster re-spawning a new process. */ ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), errmsg("'%s': error during select() call (error:%d).", serviceConfig->title, errno))); break; } /* is it someone tickling our listener port? */ if (MPP_FD_ISSET(serviceCtrl->listenerFd, &rrset)) { addrlen = sizeof(addr); if ((newsockfd = accept(serviceCtrl->listenerFd, (struct sockaddr *) & addr, &addrlen)) < 0) { /* * TODO: would be nice to read the errno and try and provide * more useful info as to why this happened. */ ereport(NOTICE, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), errmsg("'%s': error from client connection: %s)", serviceConfig->title, strerror(errno)))); } /* make socket non-blocking BEFORE we connect. */ if (!pg_set_noblock(newsockfd)) { /* * TODO: would be nice to read the errno and try and provide * more useful info as to why this happened. */ ereport(NOTICE, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR), errmsg("'%s': could not set outbound socket to non-blocking mode: %s", serviceConfig->title, strerror(errno)))); } if (newsockfd > highsock) highsock = newsockfd + 1; MPP_FD_SET(newsockfd, &rset); /* * Read connection message. */ // UNDONE: temporarily turn off new connection flag... if( !ServiceProcessRequest(serviceCtrl, newsockfd, inputBuff, false)) { /* close it down */ MPP_FD_CLR( newsockfd, &rset); shutdown(newsockfd, SHUT_WR); close(newsockfd); } else { connectedSockets = lappend_int(connectedSockets, newsockfd); } } /* loop through all of our established sockets */ cell = list_head(connectedSockets); while (cell != NULL) { int fd = lfirst_int(cell); /* get the next cell ready before we delete */ cell = lnext(cell); if (MPP_FD_ISSET(fd, &rrset)) { if( !ServiceProcessRequest(serviceCtrl, fd, inputBuff, false)) { /* close it down */ MPP_FD_CLR( fd, &rset); connectedSockets = list_delete_int(connectedSockets, fd); shutdown(fd, SHUT_WR); close(fd); } } } } ereport(LOG, (errmsg("normal shutdown"))); proc_exit(0); }
/* * compute_tsvector_stats() -- compute statistics for a tsvector column * * This functions computes statistics that are useful for determining @@ * operations' selectivity, along with the fraction of non-null rows and * average width. * * Instead of finding the most common values, as we do for most datatypes, * we're looking for the most common lexemes. This is more useful, because * there most probably won't be any two rows with the same tsvector and thus * the notion of a MCV is a bit bogus with this datatype. With a list of the * most common lexemes we can do a better job at figuring out @@ selectivity. * * For the same reasons we assume that tsvector columns are unique when * determining the number of distinct values. * * The algorithm used is Lossy Counting, as proposed in the paper "Approximate * frequency counts over data streams" by G. S. Manku and R. Motwani, in * Proceedings of the 28th International Conference on Very Large Data Bases, * Hong Kong, China, August 2002, section 4.2. The paper is available at * http://www.vldb.org/conf/2002/S10P03.pdf * * The Lossy Counting (aka LC) algorithm goes like this: * Let s be the threshold frequency for an item (the minimum frequency we * are interested in) and epsilon the error margin for the frequency. Let D * be a set of triples (e, f, delta), where e is an element value, f is that * element's frequency (actually, its current occurrence count) and delta is * the maximum error in f. We start with D empty and process the elements in * batches of size w. (The batch size is also known as "bucket size" and is * equal to 1/epsilon.) Let the current batch number be b_current, starting * with 1. For each element e we either increment its f count, if it's * already in D, or insert a new triple into D with values (e, 1, b_current * - 1). After processing each batch we prune D, by removing from it all * elements with f + delta <= b_current. After the algorithm finishes we * suppress all elements from D that do not satisfy f >= (s - epsilon) * N, * where N is the total number of elements in the input. We emit the * remaining elements with estimated frequency f/N. The LC paper proves * that this algorithm finds all elements with true frequency at least s, * and that no frequency is overestimated or is underestimated by more than * epsilon. Furthermore, given reasonable assumptions about the input * distribution, the required table size is no more than about 7 times w. * * We set s to be the estimated frequency of the K'th word in a natural * language's frequency table, where K is the target number of entries in * the MCELEM array plus an arbitrary constant, meant to reflect the fact * that the most common words in any language would usually be stopwords * so we will not actually see them in the input. We assume that the * distribution of word frequencies (including the stopwords) follows Zipf's * law with an exponent of 1. * * Assuming Zipfian distribution, the frequency of the K'th word is equal * to 1/(K * H(W)) where H(n) is 1/2 + 1/3 + ... + 1/n and W is the number of * words in the language. Putting W as one million, we get roughly 0.07/K. * Assuming top 10 words are stopwords gives s = 0.07/(K + 10). We set * epsilon = s/10, which gives bucket width w = (K + 10)/0.007 and * maximum expected hashtable size of about 1000 * (K + 10). * * Note: in the above discussion, s, epsilon, and f/N are in terms of a * lexeme's frequency as a fraction of all lexemes seen in the input. * However, what we actually want to store in the finished pg_statistic * entry is each lexeme's frequency as a fraction of all rows that it occurs * in. Assuming that the input tsvectors are correctly constructed, no * lexeme occurs more than once per tsvector, so the final count f is a * correct estimate of the number of input tsvectors it occurs in, and we * need only change the divisor from N to nonnull_cnt to get the number we * want. */ static void compute_tsvector_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows) { int num_mcelem; int null_cnt = 0; double total_width = 0; /* This is D from the LC algorithm. */ HTAB *lexemes_tab; HASHCTL hash_ctl; HASH_SEQ_STATUS scan_status; /* This is the current bucket number from the LC algorithm */ int b_current; /* This is 'w' from the LC algorithm */ int bucket_width; int vector_no, lexeme_no; LexemeHashKey hash_key; TrackItem *item; /* * We want statistics_target * 10 lexemes in the MCELEM array. This * multiplier is pretty arbitrary, but is meant to reflect the fact that * the number of individual lexeme values tracked in pg_statistic ought to * be more than the number of values for a simple scalar column. */ num_mcelem = stats->attr->attstattarget * 10; /* * We set bucket width equal to (num_mcelem + 10) / 0.007 as per the * comment above. */ bucket_width = (num_mcelem + 10) * 1000 / 7; /* * Create the hashtable. It will be in local memory, so we don't need to * worry about overflowing the initial size. Also we don't need to pay any * attention to locking and memory management. */ MemSet(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(LexemeHashKey); hash_ctl.entrysize = sizeof(TrackItem); hash_ctl.hash = lexeme_hash; hash_ctl.match = lexeme_match; hash_ctl.hcxt = CurrentMemoryContext; lexemes_tab = hash_create("Analyzed lexemes table", num_mcelem, &hash_ctl, HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT); /* Initialize counters. */ b_current = 1; lexeme_no = 0; /* Loop over the tsvectors. */ for (vector_no = 0; vector_no < samplerows; vector_no++) { Datum value; bool isnull; TSVector vector; WordEntry *curentryptr; char *lexemesptr; int j; vacuum_delay_point(); value = fetchfunc(stats, vector_no, &isnull); /* * Check for null/nonnull. */ if (isnull) { null_cnt++; continue; } /* * Add up widths for average-width calculation. Since it's a * tsvector, we know it's varlena. As in the regular * compute_minimal_stats function, we use the toasted width for this * calculation. */ total_width += VARSIZE_ANY(DatumGetPointer(value)); /* * Now detoast the tsvector if needed. */ vector = DatumGetTSVector(value); /* * We loop through the lexemes in the tsvector and add them to our * tracking hashtable. Note: the hashtable entries will point into * the (detoasted) tsvector value, therefore we cannot free that * storage until we're done. */ lexemesptr = STRPTR(vector); curentryptr = ARRPTR(vector); for (j = 0; j < vector->size; j++) { bool found; /* Construct a hash key */ hash_key.lexeme = lexemesptr + curentryptr->pos; hash_key.length = curentryptr->len; /* Lookup current lexeme in hashtable, adding it if new */ item = (TrackItem *) hash_search(lexemes_tab, (const void *) &hash_key, HASH_ENTER, &found); if (found) { /* The lexeme is already on the tracking list */ item->frequency++; } else { /* Initialize new tracking list element */ item->frequency = 1; item->delta = b_current - 1; } /* lexeme_no is the number of elements processed (ie N) */ lexeme_no++; /* We prune the D structure after processing each bucket */ if (lexeme_no % bucket_width == 0) { prune_lexemes_hashtable(lexemes_tab, b_current); b_current++; } /* Advance to the next WordEntry in the tsvector */ curentryptr++; } } /* We can only compute real stats if we found some non-null values. */ if (null_cnt < samplerows) { int nonnull_cnt = samplerows - null_cnt; int i; TrackItem **sort_table; int track_len; int cutoff_freq; int minfreq, maxfreq; stats->stats_valid = true; /* Do the simple null-frac and average width stats */ stats->stanullfrac = (double) null_cnt / (double) samplerows; stats->stawidth = total_width / (double) nonnull_cnt; /* Assume it's a unique column (see notes above) */ stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac); /* * Construct an array of the interesting hashtable items, that is, * those meeting the cutoff frequency (s - epsilon)*N. Also identify * the minimum and maximum frequencies among these items. * * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff * frequency is 9*N / bucket_width. */ cutoff_freq = 9 * lexeme_no / bucket_width; i = hash_get_num_entries(lexemes_tab); /* surely enough space */ sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i); hash_seq_init(&scan_status, lexemes_tab); track_len = 0; minfreq = lexeme_no; maxfreq = 0; while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL) { if (item->frequency > cutoff_freq) { sort_table[track_len++] = item; minfreq = Min(minfreq, item->frequency); maxfreq = Max(maxfreq, item->frequency); } } Assert(track_len <= i); /* emit some statistics for debug purposes */ elog(DEBUG3, "tsvector_stats: target # mces = %d, bucket width = %d, " "# lexemes = %d, hashtable size = %d, usable entries = %d", num_mcelem, bucket_width, lexeme_no, i, track_len); /* * If we obtained more lexemes than we really want, get rid of those * with least frequencies. The easiest way is to qsort the array into * descending frequency order and truncate the array. */ if (num_mcelem < track_len) { qsort(sort_table, track_len, sizeof(TrackItem *), trackitem_compare_frequencies_desc); /* reset minfreq to the smallest frequency we're keeping */ minfreq = sort_table[num_mcelem - 1]->frequency; } else num_mcelem = track_len; /* Generate MCELEM slot entry */ if (num_mcelem > 0) { MemoryContext old_context; Datum *mcelem_values; float4 *mcelem_freqs; /* * We want to store statistics sorted on the lexeme value using * first length, then byte-for-byte comparison. The reason for * doing length comparison first is that we don't care about the * ordering so long as it's consistent, and comparing lengths * first gives us a chance to avoid a strncmp() call. * * This is different from what we do with scalar statistics -- * they get sorted on frequencies. The rationale is that we * usually search through most common elements looking for a * specific value, so we can grab its frequency. When values are * presorted we can employ binary search for that. See * ts_selfuncs.c for a real usage scenario. */ qsort(sort_table, num_mcelem, sizeof(TrackItem *), trackitem_compare_lexemes); /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); /* * We sorted statistics on the lexeme value, but we want to be * able to find out the minimal and maximal frequency without * going through all the values. We keep those two extra * frequencies in two extra cells in mcelem_freqs. * * (Note: the MCELEM statistics slot definition allows for a third * extra number containing the frequency of nulls, but we don't * create that for a tsvector column, since null elements aren't * possible.) */ mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum)); mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4)); /* * See comments above about use of nonnull_cnt as the divisor for * the final frequency estimates. */ for (i = 0; i < num_mcelem; i++) { TrackItem *item = sort_table[i]; mcelem_values[i] = PointerGetDatum(cstring_to_text_with_len(item->key.lexeme, item->key.length)); mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt; } mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt; mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt; MemoryContextSwitchTo(old_context); stats->stakind[0] = STATISTIC_KIND_MCELEM; stats->staop[0] = TextEqualOperator; stats->stanumbers[0] = mcelem_freqs; /* See above comment about two extra frequency fields */ stats->numnumbers[0] = num_mcelem + 2; stats->stavalues[0] = mcelem_values; stats->numvalues[0] = num_mcelem; /* We are storing text values */ stats->statypid[0] = TEXTOID; stats->statyplen[0] = -1; /* typlen, -1 for varlena */ stats->statypbyval[0] = false; stats->statypalign[0] = 'i'; } } else { /* We found only nulls; assume the column is entirely null */ stats->stats_valid = true; stats->stanullfrac = 1.0; stats->stawidth = 0; /* "unknown" */ stats->stadistinct = 0.0; /* "unknown" */ } /* * We don't need to bother cleaning up any of our temporary palloc's. The * hashtable should also go away, as it used a child memory context. */ }
Datum heap_page_items(PG_FUNCTION_ARGS) { bytea *raw_page = PG_GETARG_BYTEA_P(0); heap_page_items_state *inter_call_data = NULL; FuncCallContext *fctx; int raw_page_size; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use raw page functions")))); raw_page_size = VARSIZE(raw_page) - VARHDRSZ; if (SRF_IS_FIRSTCALL()) { TupleDesc tupdesc; MemoryContext mctx; if (raw_page_size < SizeOfPageHeaderData) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("input page too small (%d bytes)", raw_page_size))); fctx = SRF_FIRSTCALL_INIT(); mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); inter_call_data = palloc(sizeof(heap_page_items_state)); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); inter_call_data->tupd = tupdesc; inter_call_data->offset = FirstOffsetNumber; inter_call_data->page = VARDATA(raw_page); fctx->max_calls = PageGetMaxOffsetNumber(inter_call_data->page); fctx->user_fctx = inter_call_data; MemoryContextSwitchTo(mctx); } fctx = SRF_PERCALL_SETUP(); inter_call_data = fctx->user_fctx; if (fctx->call_cntr < fctx->max_calls) { Page page = inter_call_data->page; HeapTuple resultTuple; Datum result; ItemId id; Datum values[14]; bool nulls[14]; uint16 lp_offset; uint16 lp_flags; uint16 lp_len; memset(nulls, 0, sizeof(nulls)); /* Extract information from the line pointer */ id = PageGetItemId(page, inter_call_data->offset); lp_offset = ItemIdGetOffset(id); lp_flags = ItemIdGetFlags(id); lp_len = ItemIdGetLength(id); values[0] = UInt16GetDatum(inter_call_data->offset); values[1] = UInt16GetDatum(lp_offset); values[2] = UInt16GetDatum(lp_flags); values[3] = UInt16GetDatum(lp_len); /* * We do just enough validity checking to make sure we don't reference * data outside the page passed to us. The page could be corrupt in * many other ways, but at least we won't crash. */ if (ItemIdHasStorage(id) && lp_len >= MinHeapTupleSize && lp_offset == MAXALIGN(lp_offset) && lp_offset + lp_len <= raw_page_size) { HeapTupleHeader tuphdr; bytea *tuple_data_bytea; int tuple_data_len; /* Extract information from the tuple header */ tuphdr = (HeapTupleHeader) PageGetItem(page, id); values[4] = UInt32GetDatum(HeapTupleHeaderGetRawXmin(tuphdr)); values[5] = UInt32GetDatum(HeapTupleHeaderGetRawXmax(tuphdr)); /* shared with xvac */ values[6] = UInt32GetDatum(HeapTupleHeaderGetRawCommandId(tuphdr)); values[7] = PointerGetDatum(&tuphdr->t_ctid); values[8] = UInt32GetDatum(tuphdr->t_infomask2); values[9] = UInt32GetDatum(tuphdr->t_infomask); values[10] = UInt8GetDatum(tuphdr->t_hoff); /* Copy raw tuple data into bytea attribute */ tuple_data_len = lp_len - tuphdr->t_hoff; tuple_data_bytea = (bytea *) palloc(tuple_data_len + VARHDRSZ); SET_VARSIZE(tuple_data_bytea, tuple_data_len + VARHDRSZ); memcpy(VARDATA(tuple_data_bytea), (char *) tuphdr + tuphdr->t_hoff, tuple_data_len); values[13] = PointerGetDatum(tuple_data_bytea); /* * We already checked that the item is completely within the raw * page passed to us, with the length given in the line pointer. * Let's check that t_hoff doesn't point over lp_len, before using * it to access t_bits and oid. */ if (tuphdr->t_hoff >= SizeofHeapTupleHeader && tuphdr->t_hoff <= lp_len && tuphdr->t_hoff == MAXALIGN(tuphdr->t_hoff)) { if (tuphdr->t_infomask & HEAP_HASNULL) { int bits_len; bits_len = BITMAPLEN(HeapTupleHeaderGetNatts(tuphdr)) * BITS_PER_BYTE; values[11] = CStringGetTextDatum( bits_to_text(tuphdr->t_bits, bits_len)); } else nulls[11] = true; if (tuphdr->t_infomask & HEAP_HASOID_OLD) values[12] = HeapTupleHeaderGetOidOld(tuphdr); else nulls[12] = true; } else { nulls[11] = true; nulls[12] = true; } } else { /* * The line pointer is not used, or it's invalid. Set the rest of * the fields to NULL */ int i; for (i = 4; i <= 13; i++) nulls[i] = true; } /* Build and return the result tuple. */ resultTuple = heap_form_tuple(inter_call_data->tupd, values, nulls); result = HeapTupleGetDatum(resultTuple); inter_call_data->offset++; SRF_RETURN_NEXT(fctx, result); } else SRF_RETURN_DONE(fctx); }
/* * Copied from Postgres' src/backend/optimizer/util/clauses.c * * evaluate_expr: pre-evaluate a constant expression * * We use the executor's routine ExecEvalExpr() to avoid duplication of * code and ensure we get the same result as the executor would get. */ Expr * evaluate_expr(Expr *expr, Oid result_type, int32 result_typmod, Oid result_collation) { EState *estate; ExprState *exprstate; MemoryContext oldcontext; Datum const_val; bool const_is_null; int16 resultTypLen; bool resultTypByVal; /* * To use the executor, we need an EState. */ estate = CreateExecutorState(); /* We can use the estate's working context to avoid memory leaks. */ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); /* Make sure any opfuncids are filled in. */ fix_opfuncids((Node *) expr); /* * Prepare expr for execution. (Note: we can't use ExecPrepareExpr * because it'd result in recursively invoking eval_const_expressions.) */ exprstate = ExecInitExpr(expr, NULL); /* * And evaluate it. * * It is OK to use a default econtext because none of the ExecEvalExpr() * code used in this situation will use econtext. That might seem * fortuitous, but it's not so unreasonable --- a constant expression does * not depend on context, by definition, n'est ce pas? */ const_val = ExecEvalExprSwitchContext(exprstate, GetPerTupleExprContext(estate), &const_is_null, NULL); /* Get info needed about result datatype */ get_typlenbyval(result_type, &resultTypLen, &resultTypByVal); /* Get back to outer memory context */ MemoryContextSwitchTo(oldcontext); /* * Must copy result out of sub-context used by expression eval. * * Also, if it's varlena, forcibly detoast it. This protects us against * storing TOAST pointers into plans that might outlive the referenced * data. (makeConst would handle detoasting anyway, but it's worth a few * extra lines here so that we can do the copy and detoast in one step.) */ if (!const_is_null) { if (resultTypLen == -1) const_val = PointerGetDatum(PG_DETOAST_DATUM_COPY(const_val)); else const_val = datumCopy(const_val, resultTypByVal, resultTypLen); } /* Release all the junk we just created */ FreeExecutorState(estate); /* * Make the constant result node. */ return (Expr *) makeConst(result_type, result_typmod, result_collation, resultTypLen, const_val, const_is_null, resultTypByVal); }
*/ hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext, "HashTableContext", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt, "HashBatchContext", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); /* Allocate data that will live for the life of the hashjoin */ oldcxt = MemoryContextSwitchTo(hashtable->hashCxt); if (nbatch > 1) { /* * allocate and initialize the file arrays in hashCxt */ hashtable->innerBatchFile = (BufFile **) palloc0(nbatch * sizeof(BufFile *)); hashtable->outerBatchFile = (BufFile **) palloc0(nbatch * sizeof(BufFile *)); /* The files will not be opened until needed... */ /* ... but make sure we have temp tablespaces established for them */ PrepareTempTablespaces(); }
Datum xpath_table(PG_FUNCTION_ARGS) { /* Function parameters */ char *pkeyfield = text_to_cstring(PG_GETARG_TEXT_PP(0)); char *xmlfield = text_to_cstring(PG_GETARG_TEXT_PP(1)); char *relname = text_to_cstring(PG_GETARG_TEXT_PP(2)); char *xpathset = text_to_cstring(PG_GETARG_TEXT_PP(3)); char *condition = text_to_cstring(PG_GETARG_TEXT_PP(4)); /* SPI (input tuple) support */ SPITupleTable *tuptable; HeapTuple spi_tuple; TupleDesc spi_tupdesc; /* Output tuple (tuplestore) support */ Tuplestorestate *tupstore = NULL; TupleDesc ret_tupdesc; HeapTuple ret_tuple; ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; AttInMetadata *attinmeta; MemoryContext per_query_ctx; MemoryContext oldcontext; char **values; xmlChar **xpaths; char *pos; const char *pathsep = "|"; int numpaths; int ret; int proc; int i; int j; int rownr; /* For issuing multiple rows from one original * document */ bool had_values; /* To determine end of nodeset results */ StringInfoData query_buf; PgXmlErrorContext *xmlerrcxt; volatile xmlDocPtr doctree = NULL; /* We only have a valid tuple description in table function mode */ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); if (rsinfo->expectedDesc == NULL) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("xpath_table must be called as a table function"))); /* * We want to materialise because it means that we don't have to carry * libxml2 parser state between invocations of this function */ if (!(rsinfo->allowedModes & SFRM_Materialize)) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("xpath_table requires Materialize mode, but it is not " "allowed in this context"))); /* * The tuplestore must exist in a higher context than this function call * (per_query_ctx is used) */ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; oldcontext = MemoryContextSwitchTo(per_query_ctx); /* * Create the tuplestore - work_mem is the max in-memory size before a * file is created on disk to hold it. */ tupstore = tuplestore_begin_heap(rsinfo->allowedModes & SFRM_Materialize_Random, false, work_mem); MemoryContextSwitchTo(oldcontext); /* get the requested return tuple description */ ret_tupdesc = CreateTupleDescCopy(rsinfo->expectedDesc); /* must have at least one output column (for the pkey) */ if (ret_tupdesc->natts < 1) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("xpath_table must have at least one output column"))); /* * At the moment we assume that the returned attributes make sense for the * XPath specififed (i.e. we trust the caller). It's not fatal if they get * it wrong - the input function for the column type will raise an error * if the path result can't be converted into the correct binary * representation. */ attinmeta = TupleDescGetAttInMetadata(ret_tupdesc); /* Set return mode and allocate value space. */ rsinfo->returnMode = SFRM_Materialize; rsinfo->setDesc = ret_tupdesc; values = (char **) palloc(ret_tupdesc->natts * sizeof(char *)); xpaths = (xmlChar **) palloc(ret_tupdesc->natts * sizeof(xmlChar *)); /* * Split XPaths. xpathset is a writable CString. * * Note that we stop splitting once we've done all needed for tupdesc */ numpaths = 0; pos = xpathset; while (numpaths < (ret_tupdesc->natts - 1)) { xpaths[numpaths++] = (xmlChar *) pos; pos = strstr(pos, pathsep); if (pos != NULL) { *pos = '\0'; pos++; } else break; } /* Now build query */ initStringInfo(&query_buf); /* Build initial sql statement */ appendStringInfo(&query_buf, "SELECT %s, %s FROM %s WHERE %s", pkeyfield, xmlfield, relname, condition); if ((ret = SPI_connect()) < 0) elog(ERROR, "xpath_table: SPI_connect returned %d", ret); if ((ret = SPI_exec(query_buf.data, 0)) != SPI_OK_SELECT) elog(ERROR, "xpath_table: SPI execution failed for query %s", query_buf.data); proc = SPI_processed; /* elog(DEBUG1,"xpath_table: SPI returned %d rows",proc); */ tuptable = SPI_tuptable; spi_tupdesc = tuptable->tupdesc; /* Switch out of SPI context */ MemoryContextSwitchTo(oldcontext); /* * Check that SPI returned correct result. If you put a comma into one of * the function parameters, this will catch it when the SPI query returns * e.g. 3 columns. */ if (spi_tupdesc->natts != 2) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("expression returning multiple columns is not valid in parameter list"), errdetail("Expected two columns in SPI result, got %d.", spi_tupdesc->natts))); } /* * Setup the parser. This should happen after we are done evaluating the * query, in case it calls functions that set up libxml differently. */ xmlerrcxt = pgxml_parser_init(PG_XML_STRICTNESS_LEGACY); PG_TRY(); { /* For each row i.e. document returned from SPI */ for (i = 0; i < proc; i++) { char *pkey; char *xmldoc; xmlXPathContextPtr ctxt; xmlXPathObjectPtr res; xmlChar *resstr; xmlXPathCompExprPtr comppath; /* Extract the row data as C Strings */ spi_tuple = tuptable->vals[i]; pkey = SPI_getvalue(spi_tuple, spi_tupdesc, 1); xmldoc = SPI_getvalue(spi_tuple, spi_tupdesc, 2); /* * Clear the values array, so that not-well-formed documents * return NULL in all columns. Note that this also means that * spare columns will be NULL. */ for (j = 0; j < ret_tupdesc->natts; j++) values[j] = NULL; /* Insert primary key */ values[0] = pkey; /* Parse the document */ if (xmldoc) doctree = xmlParseMemory(xmldoc, strlen(xmldoc)); else /* treat NULL as not well-formed */ doctree = NULL; if (doctree == NULL) { /* not well-formed, so output all-NULL tuple */ ret_tuple = BuildTupleFromCStrings(attinmeta, values); tuplestore_puttuple(tupstore, ret_tuple); heap_freetuple(ret_tuple); } else { /* New loop here - we have to deal with nodeset results */ rownr = 0; do { /* Now evaluate the set of xpaths. */ had_values = false; for (j = 0; j < numpaths; j++) { ctxt = xmlXPathNewContext(doctree); ctxt->node = xmlDocGetRootElement(doctree); /* compile the path */ comppath = xmlXPathCompile(xpaths[j]); if (comppath == NULL) xml_ereport(xmlerrcxt, ERROR, ERRCODE_EXTERNAL_ROUTINE_EXCEPTION, "XPath Syntax Error"); /* Now evaluate the path expression. */ res = xmlXPathCompiledEval(comppath, ctxt); xmlXPathFreeCompExpr(comppath); if (res != NULL) { switch (res->type) { case XPATH_NODESET: /* We see if this nodeset has enough nodes */ if (res->nodesetval != NULL && rownr < res->nodesetval->nodeNr) { resstr = xmlXPathCastNodeToString(res->nodesetval->nodeTab[rownr]); had_values = true; } else resstr = NULL; break; case XPATH_STRING: resstr = xmlStrdup(res->stringval); break; default: elog(NOTICE, "unsupported XQuery result: %d", res->type); resstr = xmlStrdup((const xmlChar *) "<unsupported/>"); } /* * Insert this into the appropriate column in the * result tuple. */ values[j + 1] = (char *) resstr; } xmlXPathFreeContext(ctxt); } /* Now add the tuple to the output, if there is one. */ if (had_values) { ret_tuple = BuildTupleFromCStrings(attinmeta, values); tuplestore_puttuple(tupstore, ret_tuple); heap_freetuple(ret_tuple); } rownr++; } while (had_values); } if (doctree != NULL) xmlFreeDoc(doctree); doctree = NULL; if (pkey) pfree(pkey); if (xmldoc) pfree(xmldoc); } } PG_CATCH(); { if (doctree != NULL) xmlFreeDoc(doctree); pg_xml_done(xmlerrcxt, true); PG_RE_THROW(); } PG_END_TRY(); if (doctree != NULL) xmlFreeDoc(doctree); pg_xml_done(xmlerrcxt, false); tuplestore_donestoring(tupstore); SPI_finish(); rsinfo->setResult = tupstore; /* * SFRM_Materialize mode expects us to return a NULL Datum. The actual * tuples are in our tuplestore and passed back through rsinfo->setResult. * rsinfo->setDesc is set to the tuple description that we actually used * to build our tuples with, so the caller can verify we did what it was * expecting. */ return (Datum) 0; }
/* * Primary entry point for VACUUM and ANALYZE commands. * * relid is normally InvalidOid; if it is not, then it provides the relation * OID to be processed, and vacstmt->relation is ignored. (The non-invalid * case is currently only used by autovacuum.) * * do_toast is passed as FALSE by autovacuum, because it processes TOAST * tables separately. * * for_wraparound is used by autovacuum to let us know when it's forcing * a vacuum for wraparound, which should not be auto-canceled. * * bstrategy is normally given as NULL, but in autovacuum it can be passed * in to use the same buffer strategy object across multiple vacuum() calls. * * isTopLevel should be passed down from ProcessUtility. * * It is the caller's responsibility that vacstmt and bstrategy * (if given) be allocated in a memory context that won't disappear * at transaction commit. */ void vacuum(VacuumStmt *vacstmt, Oid relid, bool do_toast, BufferAccessStrategy bstrategy, bool for_wraparound, bool isTopLevel) { const char *stmttype; volatile bool in_outer_xact, use_own_xacts; List *relations; /* sanity checks on options */ Assert(vacstmt->options & (VACOPT_VACUUM | VACOPT_ANALYZE)); Assert((vacstmt->options & VACOPT_VACUUM) || !(vacstmt->options & (VACOPT_FULL | VACOPT_FREEZE))); Assert((vacstmt->options & VACOPT_ANALYZE) || vacstmt->va_cols == NIL); stmttype = (vacstmt->options & VACOPT_VACUUM) ? "VACUUM" : "ANALYZE"; /* * We cannot run VACUUM inside a user transaction block; if we were inside * a transaction, then our commit- and start-transaction-command calls * would not have the intended effect! There are numerous other subtle * dependencies on this, too. * * ANALYZE (without VACUUM) can run either way. */ if (vacstmt->options & VACOPT_VACUUM) { PreventTransactionChain(isTopLevel, stmttype); in_outer_xact = false; } else in_outer_xact = IsInTransactionChain(isTopLevel); /* * Send info about dead objects to the statistics collector, unless we are * in autovacuum --- autovacuum.c does this for itself. */ if ((vacstmt->options & VACOPT_VACUUM) && !IsAutoVacuumWorkerProcess()) pgstat_vacuum_stat(); /* * Create special memory context for cross-transaction storage. * * Since it is a child of PortalContext, it will go away eventually even * if we suffer an error; there's no need for special abort cleanup logic. */ vac_context = AllocSetContextCreate(PortalContext, "Vacuum", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); /* * If caller didn't give us a buffer strategy object, make one in the * cross-transaction memory context. */ if (bstrategy == NULL) { MemoryContext old_context = MemoryContextSwitchTo(vac_context); bstrategy = GetAccessStrategy(BAS_VACUUM); MemoryContextSwitchTo(old_context); } vac_strategy = bstrategy; /* * Build list of relations to process, unless caller gave us one. (If we * build one, we put it in vac_context for safekeeping.) */ relations = get_rel_oids(relid, vacstmt->relation); /* * Decide whether we need to start/commit our own transactions. * * For VACUUM (with or without ANALYZE): always do so, so that we can * release locks as soon as possible. (We could possibly use the outer * transaction for a one-table VACUUM, but handling TOAST tables would be * problematic.) * * For ANALYZE (no VACUUM): if inside a transaction block, we cannot * start/commit our own transactions. Also, there's no need to do so if * only processing one relation. For multiple relations when not within a * transaction block, and also in an autovacuum worker, use own * transactions so we can release locks sooner. */ if (vacstmt->options & VACOPT_VACUUM) use_own_xacts = true; else { Assert(vacstmt->options & VACOPT_ANALYZE); if (IsAutoVacuumWorkerProcess()) use_own_xacts = true; else if (in_outer_xact) use_own_xacts = false; else if (list_length(relations) > 1) use_own_xacts = true; else use_own_xacts = false; } /* * vacuum_rel expects to be entered with no transaction active; it will * start and commit its own transaction. But we are called by an SQL * command, and so we are executing inside a transaction already. We * commit the transaction started in PostgresMain() here, and start * another one before exiting to match the commit waiting for us back in * PostgresMain(). */ if (use_own_xacts) { /* ActiveSnapshot is not set by autovacuum */ if (ActiveSnapshotSet()) PopActiveSnapshot(); /* matches the StartTransaction in PostgresMain() */ CommitTransactionCommand(); } /* Turn vacuum cost accounting on or off */ PG_TRY(); { ListCell *cur; VacuumCostActive = (VacuumCostDelay > 0); VacuumCostBalance = 0; VacuumPageHit = 0; VacuumPageMiss = 0; VacuumPageDirty = 0; /* * Loop to process each selected relation. */ foreach(cur, relations) { Oid relid = lfirst_oid(cur); if (vacstmt->options & VACOPT_VACUUM) { if (!vacuum_rel(relid, vacstmt, do_toast, for_wraparound)) continue; } if (vacstmt->options & VACOPT_ANALYZE) { /* * If using separate xacts, start one for analyze. Otherwise, * we can use the outer transaction. */ if (use_own_xacts) { StartTransactionCommand(); /* functions in indexes may want a snapshot set */ PushActiveSnapshot(GetTransactionSnapshot()); } analyze_rel(relid, vacstmt, vac_strategy); if (use_own_xacts) { PopActiveSnapshot(); CommitTransactionCommand(); } } } }
/* * write_minipage * * Write the in-memory minipage to the block directory relation. */ static void write_minipage(AppendOnlyBlockDirectory *blockDirectory, int columnGroupNo) { HeapTuple tuple; MemoryContext oldcxt; Datum *values = blockDirectory->values; bool *nulls = blockDirectory->nulls; Relation blkdirRel = blockDirectory->blkdirRel; TupleDesc heapTupleDesc = RelationGetDescr(blkdirRel); MinipagePerColumnGroup *minipageInfo = &blockDirectory->minipages[columnGroupNo]; Assert(minipageInfo->numMinipageEntries > 0); oldcxt = MemoryContextSwitchTo(blockDirectory->memoryContext); Assert(blkdirRel != NULL); values[Anum_pg_aoblkdir_segno - 1] = Int32GetDatum(blockDirectory->currentSegmentFileNum); nulls[Anum_pg_aoblkdir_segno - 1] = false; values[Anum_pg_aoblkdir_columngroupno - 1] = Int32GetDatum(columnGroupNo); nulls[Anum_pg_aoblkdir_columngroupno - 1] = false; values[Anum_pg_aoblkdir_firstrownum - 1] = Int64GetDatum(minipageInfo->minipage->entry[0].firstRowNum); nulls[Anum_pg_aoblkdir_firstrownum - 1] = false; SET_VARSIZE(minipageInfo->minipage, minipage_size(minipageInfo->numMinipageEntries)); minipageInfo->minipage->nEntry = minipageInfo->numMinipageEntries; values[Anum_pg_aoblkdir_minipage - 1] = PointerGetDatum(minipageInfo->minipage); nulls[Anum_pg_aoblkdir_minipage - 1] = false; tuple = heaptuple_form_to(heapTupleDesc, values, nulls, NULL, NULL); /* * Write out the minipage to the block directory relation. * If this minipage is already in the relation, we update * the row. Otherwise, a new row is inserted. */ if (ItemPointerIsValid(&minipageInfo->tupleTid)) { if (Debug_appendonly_print_blockdirectory) ereport(LOG, (errmsg("Append-only block directory update a minipage: " "(segno, columnGroupNo, nEntries, firstRowNum) = " "(%d, %d, %u, " INT64_FORMAT ")", blockDirectory->currentSegmentFileNum, columnGroupNo, minipageInfo->numMinipageEntries, minipageInfo->minipage->entry[0].firstRowNum))); simple_heap_update(blkdirRel, &minipageInfo->tupleTid, tuple); } else { if (Debug_appendonly_print_blockdirectory) ereport(LOG, (errmsg("Append-only block directory insert a minipage: " "(segno, columnGroupNo, nEntries, firstRowNum) = " "(%d, %d, %u, " INT64_FORMAT ")", blockDirectory->currentSegmentFileNum, columnGroupNo, minipageInfo->numMinipageEntries, minipageInfo->minipage->entry[0].firstRowNum))); simple_heap_insert(blkdirRel, tuple); } CatalogUpdateIndexes(blkdirRel, tuple); heap_freetuple(tuple); MemoryContextSwitchTo(oldcxt); }
/* * initSuffixTree - create suffix tree from file. Function converts * UTF8-encoded file into current encoding. */ static SuffixChar * initSuffixTree(char *filename) { SuffixChar *volatile rootSuffixTree = NULL; MemoryContext ccxt = CurrentMemoryContext; tsearch_readline_state trst; volatile bool skip; filename = get_tsearch_config_filename(filename, "rules"); if (!tsearch_readline_begin(&trst, filename)) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("could not open unaccent file \"%s\": %m", filename))); do { /* * pg_do_encoding_conversion() (called by tsearch_readline()) will * emit exception if it finds untranslatable characters in current * locale. We just skip such lines, continuing with the next. */ skip = true; PG_TRY(); { char *line; while ((line = tsearch_readline(&trst)) != NULL) { /* * The format of each line must be "src trg" where src and trg * are sequences of one or more non-whitespace characters, * separated by whitespace. Whitespace at start or end of * line is ignored. */ int state; char *ptr; char *src = NULL; char *trg = NULL; int ptrlen; int srclen = 0; int trglen = 0; state = 0; for (ptr = line; *ptr; ptr += ptrlen) { ptrlen = pg_mblen(ptr); /* ignore whitespace, but end src or trg */ if (t_isspace(ptr)) { if (state == 1) state = 2; else if (state == 3) state = 4; continue; } switch (state) { case 0: /* start of src */ src = ptr; srclen = ptrlen; state = 1; break; case 1: /* continue src */ srclen += ptrlen; break; case 2: /* start of trg */ trg = ptr; trglen = ptrlen; state = 3; break; case 3: /* continue trg */ trglen += ptrlen; break; default: /* bogus line format */ state = -1; break; } } if (state >= 3) rootSuffixTree = placeChar(rootSuffixTree, (unsigned char *) src, srclen, trg, trglen); pfree(line); } skip = false; } PG_CATCH(); { ErrorData *errdata; MemoryContext ecxt; ecxt = MemoryContextSwitchTo(ccxt); errdata = CopyErrorData(); if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER) { FlushErrorState(); } else { MemoryContextSwitchTo(ecxt); PG_RE_THROW(); } } PG_END_TRY(); } while (skip); tsearch_readline_end(&trst); return rootSuffixTree; }