/* * Report a detected deadlock, with available details. */ void DeadLockReport(void) { StringInfoData clientbuf; /* errdetail for client */ StringInfoData logbuf; /* errdetail for server log */ StringInfoData locktagbuf; int i; initStringInfo(&clientbuf); initStringInfo(&logbuf); initStringInfo(&locktagbuf); /* Generate the "waits for" lines sent to the client */ for (i = 0; i < nDeadlockDetails; i++) { DEADLOCK_INFO *info = &deadlockDetails[i]; int nextpid; /* The last proc waits for the first one... */ if (i < nDeadlockDetails - 1) nextpid = info[1].pid; else nextpid = deadlockDetails[0].pid; /* reset locktagbuf to hold next object description */ resetStringInfo(&locktagbuf); DescribeLockTag(&locktagbuf, &info->locktag); if (i > 0) appendStringInfoChar(&clientbuf, '\n'); appendStringInfo(&clientbuf, _("Process %d waits for %s on %s; blocked by process %d."), info->pid, GetLockmodeName(info->locktag.locktag_lockmethodid, info->lockmode), locktagbuf.data, nextpid); } /* Duplicate all the above for the server ... */ appendStringInfoString(&logbuf, clientbuf.data); /* ... and add info about query strings */ for (i = 0; i < nDeadlockDetails; i++) { DEADLOCK_INFO *info = &deadlockDetails[i]; appendStringInfoChar(&logbuf, '\n'); appendStringInfo(&logbuf, _("Process %d: %s"), info->pid, pgstat_get_backend_current_activity(info->pid, false)); } ereport(ERROR, (errcode(ERRCODE_T_R_DEADLOCK_DETECTED), errmsg("deadlock detected"), errdetail_internal("%s", clientbuf.data), errdetail_log("%s", logbuf.data), errhint("See server log for query details."))); }
/* * ExecRefreshMatView -- execute a REFRESH MATERIALIZED VIEW command * * This refreshes the materialized view by creating a new table and swapping * the relfilenodes of the new table and the old materialized view, so the OID * of the original materialized view is preserved. Thus we do not lose GRANT * nor references to this materialized view. * * If WITH NO DATA was specified, this is effectively like a TRUNCATE; * otherwise it is like a TRUNCATE followed by an INSERT using the SELECT * statement associated with the materialized view. The statement node's * skipData field shows whether the clause was used. * * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading * the new heap, it's better to create the indexes afterwards than to fill them * incrementally while we load. * * The matview's "populated" state is changed based on whether the contents * reflect the result set of the materialized view's query. */ ObjectAddress ExecRefreshMatView(RefreshMatViewStmt *stmt, const char *queryString, ParamListInfo params, char *completionTag) { Oid matviewOid; Relation matviewRel; RewriteRule *rule; List *actions; Query *dataQuery; Oid tableSpace; Oid relowner; Oid OIDNewHeap; DestReceiver *dest; uint64 processed = 0; bool concurrent; LOCKMODE lockmode; char relpersistence; Oid save_userid; int save_sec_context; int save_nestlevel; ObjectAddress address; /* Determine strength of lock needed. */ concurrent = stmt->concurrent; lockmode = concurrent ? ExclusiveLock : AccessExclusiveLock; /* * Get a lock until end of transaction. */ matviewOid = RangeVarGetRelidExtended(stmt->relation, lockmode, 0, RangeVarCallbackOwnsTable, NULL); matviewRel = table_open(matviewOid, NoLock); /* Make sure it is a materialized view. */ if (matviewRel->rd_rel->relkind != RELKIND_MATVIEW) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("\"%s\" is not a materialized view", RelationGetRelationName(matviewRel)))); /* Check that CONCURRENTLY is not specified if not populated. */ if (concurrent && !RelationIsPopulated(matviewRel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("CONCURRENTLY cannot be used when the materialized view is not populated"))); /* Check that conflicting options have not been specified. */ if (concurrent && stmt->skipData) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("CONCURRENTLY and WITH NO DATA options cannot be used together"))); /* * Check that everything is correct for a refresh. Problems at this point * are internal errors, so elog is sufficient. */ if (matviewRel->rd_rel->relhasrules == false || matviewRel->rd_rules->numLocks < 1) elog(ERROR, "materialized view \"%s\" is missing rewrite information", RelationGetRelationName(matviewRel)); if (matviewRel->rd_rules->numLocks > 1) elog(ERROR, "materialized view \"%s\" has too many rules", RelationGetRelationName(matviewRel)); rule = matviewRel->rd_rules->rules[0]; if (rule->event != CMD_SELECT || !(rule->isInstead)) elog(ERROR, "the rule for materialized view \"%s\" is not a SELECT INSTEAD OF rule", RelationGetRelationName(matviewRel)); actions = rule->actions; if (list_length(actions) != 1) elog(ERROR, "the rule for materialized view \"%s\" is not a single action", RelationGetRelationName(matviewRel)); /* * Check that there is a unique index with no WHERE clause on one or more * columns of the materialized view if CONCURRENTLY is specified. */ if (concurrent) { List *indexoidlist = RelationGetIndexList(matviewRel); ListCell *indexoidscan; bool hasUniqueIndex = false; foreach(indexoidscan, indexoidlist) { Oid indexoid = lfirst_oid(indexoidscan); Relation indexRel; indexRel = index_open(indexoid, AccessShareLock); hasUniqueIndex = is_usable_unique_index(indexRel); index_close(indexRel, AccessShareLock); if (hasUniqueIndex) break; } list_free(indexoidlist); if (!hasUniqueIndex) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("cannot refresh materialized view \"%s\" concurrently", quote_qualified_identifier(get_namespace_name(RelationGetNamespace(matviewRel)), RelationGetRelationName(matviewRel))), errhint("Create a unique index with no WHERE clause on one or more columns of the materialized view."))); }
/* * Validate the generic options given to a FOREIGN DATA WRAPPER, SERVER, * USER MAPPING or FOREIGN TABLE that uses file_fdw. * * Raise an ERROR if the option or its value is considered invalid. */ Datum file_fdw_validator(PG_FUNCTION_ARGS) { List *options_list = untransformRelOptions(PG_GETARG_DATUM(0)); Oid catalog = PG_GETARG_OID(1); char *filename = NULL; DefElem *force_not_null = NULL; DefElem *force_null = NULL; List *other_options = NIL; ListCell *cell; /* * Only superusers are allowed to set options of a file_fdw foreign table. * This is because the filename is one of those options, and we don't want * non-superusers to be able to determine which file gets read. * * Putting this sort of permissions check in a validator is a bit of a * crock, but there doesn't seem to be any other place that can enforce * the check more cleanly. * * Note that the valid_options[] array disallows setting filename at any * options level other than foreign table --- otherwise there'd still be a * security hole. */ if (catalog == ForeignTableRelationId && !superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("only superuser can change options of a file_fdw foreign table"))); /* * Check that only options supported by file_fdw, and allowed for the * current object type, are given. */ foreach(cell, options_list) { DefElem *def = (DefElem *) lfirst(cell); if (!is_valid_option(def->defname, catalog)) { const struct FileFdwOption *opt; StringInfoData buf; /* * Unknown option specified, complain about it. Provide a hint * with list of valid options for the object. */ initStringInfo(&buf); for (opt = valid_options; opt->optname; opt++) { if (catalog == opt->optcontext) appendStringInfo(&buf, "%s%s", (buf.len > 0) ? ", " : "", opt->optname); } ereport(ERROR, (errcode(ERRCODE_FDW_INVALID_OPTION_NAME), errmsg("invalid option \"%s\"", def->defname), buf.len > 0 ? errhint("Valid options in this context are: %s", buf.data) : errhint("There are no valid options in this context."))); } /* * Separate out filename and column-specific options, since * ProcessCopyOptions won't accept them. */ if (strcmp(def->defname, "filename") == 0) { if (filename) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options"))); filename = defGetString(def); } /* * force_not_null is a boolean option; after validation we can discard * it - it will be retrieved later in get_file_fdw_attribute_options() */ else if (strcmp(def->defname, "force_not_null") == 0) { if (force_not_null) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options"), errhint("option \"force_not_null\" supplied more than once for a column"))); force_not_null = def; /* Don't care what the value is, as long as it's a legal boolean */ (void) defGetBoolean(def); } /* See comments for force_not_null above */ else if (strcmp(def->defname, "force_null") == 0) { if (force_null) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options"), errhint("option \"force_null\" supplied more than once for a column"))); force_null = def; (void) defGetBoolean(def); } else other_options = lappend(other_options, def); }
static const char * identify_system_timezone(void) { int i; char tzname[128]; char localtzname[256]; time_t t = time(NULL); struct tm *tm = localtime(&t); HKEY rootKey; int idx; if (!tm) { ereport(LOG, (errmsg("could not identify system time zone: localtime() failed"), errdetail("The PostgreSQL time zone will be set to \"%s\".", "GMT"), errhint("You can specify the correct timezone in postgresql.conf."))); return NULL; /* go to GMT */ } memset(tzname, 0, sizeof(tzname)); strftime(tzname, sizeof(tzname) - 1, "%Z", tm); for (i = 0; win32_tzmap[i].stdname != NULL; i++) { if (strcmp(tzname, win32_tzmap[i].stdname) == 0 || strcmp(tzname, win32_tzmap[i].dstname) == 0) { elog(DEBUG4, "TZ \"%s\" matches system time zone \"%s\"", win32_tzmap[i].pgtzname, tzname); return win32_tzmap[i].pgtzname; } } /* * Localized Windows versions return localized names for the timezone. * Scan the registry to find the English name, and then try matching * against our table again. */ memset(localtzname, 0, sizeof(localtzname)); if (RegOpenKeyEx(HKEY_LOCAL_MACHINE, "SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion\\Time Zones", 0, KEY_READ, &rootKey) != ERROR_SUCCESS) { ereport(LOG, (errmsg("could not open registry key to identify system time zone: %i", (int) GetLastError()), errdetail("The PostgreSQL time zone will be set to \"%s\".", "GMT"), errhint("You can specify the correct timezone in postgresql.conf."))); return NULL; /* go to GMT */ } for (idx = 0;; idx++) { char keyname[256]; char zonename[256]; DWORD namesize; FILETIME lastwrite; HKEY key; LONG r; memset(keyname, 0, sizeof(keyname)); namesize = sizeof(keyname); if ((r = RegEnumKeyEx(rootKey, idx, keyname, &namesize, NULL, NULL, NULL, &lastwrite)) != ERROR_SUCCESS) { if (r == ERROR_NO_MORE_ITEMS) break; ereport(LOG, (errmsg_internal("could not enumerate registry subkeys to identify system time zone: %i", (int) r))); break; } if ((r = RegOpenKeyEx(rootKey, keyname, 0, KEY_READ, &key)) != ERROR_SUCCESS) { ereport(LOG, (errmsg_internal("could not open registry subkey to identify system time zone: %i", (int) r))); break; } memset(zonename, 0, sizeof(zonename)); namesize = sizeof(zonename); if ((r = RegQueryValueEx(key, "Std", NULL, NULL, zonename, &namesize)) != ERROR_SUCCESS) { ereport(LOG, (errmsg_internal("could not query value for key \"std\" to identify system time zone \"%s\": %i", keyname, (int) r))); RegCloseKey(key); continue; /* Proceed to look at the next timezone */ } if (strcmp(tzname, zonename) == 0) { /* Matched zone */ strcpy(localtzname, keyname); RegCloseKey(key); break; } memset(zonename, 0, sizeof(zonename)); namesize = sizeof(zonename); if ((r = RegQueryValueEx(key, "Dlt", NULL, NULL, zonename, &namesize)) != ERROR_SUCCESS) { ereport(LOG, (errmsg_internal("could not query value for key \"dlt\" to identify system time zone \"%s\": %i", keyname, (int) r))); RegCloseKey(key); continue; /* Proceed to look at the next timezone */ } if (strcmp(tzname, zonename) == 0) { /* Matched DST zone */ strcpy(localtzname, keyname); RegCloseKey(key); break; } RegCloseKey(key); } RegCloseKey(rootKey); if (localtzname[0]) { /* Found a localized name, so scan for that one too */ for (i = 0; win32_tzmap[i].stdname != NULL; i++) { if (strcmp(localtzname, win32_tzmap[i].stdname) == 0 || strcmp(localtzname, win32_tzmap[i].dstname) == 0) { elog(DEBUG4, "TZ \"%s\" matches localized system time zone \"%s\" (\"%s\")", win32_tzmap[i].pgtzname, tzname, localtzname); return win32_tzmap[i].pgtzname; } } } ereport(LOG, (errmsg("could not find a match for system time zone \"%s\"", tzname), errdetail("The PostgreSQL time zone will be set to \"%s\".", "GMT"), errhint("You can specify the correct timezone in postgresql.conf."))); return NULL; /* go to GMT */ }
/* * Emit a PG error or notice, together with any available info about * the current Python error, previously set by PLy_exception_set(). * This should be used to propagate Python errors into PG. If fmt is * NULL, the Python error becomes the primary error message, otherwise * it becomes the detail. If there is a Python traceback, it is put * in the context. */ void PLy_elog(int elevel, const char *fmt,...) { char *xmsg; char *tbmsg; int tb_depth; StringInfoData emsg; PyObject *exc, *val, *tb; const char *primary = NULL; int sqlerrcode = 0; char *detail = NULL; char *hint = NULL; char *query = NULL; int position = 0; PyErr_Fetch(&exc, &val, &tb); if (exc != NULL) { PyErr_NormalizeException(&exc, &val, &tb); if (PyErr_GivenExceptionMatches(val, PLy_exc_spi_error)) PLy_get_spi_error_data(val, &sqlerrcode, &detail, &hint, &query, &position); else if (PyErr_GivenExceptionMatches(val, PLy_exc_fatal)) elevel = FATAL; } /* this releases our refcount on tb! */ PLy_traceback(exc, val, tb, &xmsg, &tbmsg, &tb_depth); if (fmt) { initStringInfo(&emsg); for (;;) { va_list ap; int needed; va_start(ap, fmt); needed = appendStringInfoVA(&emsg, dgettext(TEXTDOMAIN, fmt), ap); va_end(ap); if (needed == 0) break; enlargeStringInfo(&emsg, needed); } primary = emsg.data; /* Since we have a format string, we cannot have a SPI detail. */ Assert(detail == NULL); /* If there's an exception message, it goes in the detail. */ if (xmsg) detail = xmsg; } else { if (xmsg) primary = xmsg; } PG_TRY(); { ereport(elevel, (errcode(sqlerrcode ? sqlerrcode : ERRCODE_EXTERNAL_ROUTINE_EXCEPTION), errmsg_internal("%s", primary ? primary : "no exception data"), (detail) ? errdetail_internal("%s", detail) : 0, (tb_depth > 0 && tbmsg) ? errcontext("%s", tbmsg) : 0, (hint) ? errhint("%s", hint) : 0, (query) ? internalerrquery(query) : 0, (position) ? internalerrposition(position) : 0)); } PG_CATCH(); { if (fmt) pfree(emsg.data); if (xmsg) pfree(xmsg); if (tbmsg) pfree(tbmsg); Py_XDECREF(exc); Py_XDECREF(val); PG_RE_THROW(); } PG_END_TRY(); if (fmt) pfree(emsg.data); if (xmsg) pfree(xmsg); if (tbmsg) pfree(tbmsg); Py_XDECREF(exc); Py_XDECREF(val); }
/* -------------------------------- * InitPostgres * Initialize POSTGRES. * * The database can be specified by name, using the in_dbname parameter, or by * OID, using the dboid parameter. In the latter case, the actual database * name can be returned to the caller in out_dbname. If out_dbname isn't * NULL, it must point to a buffer of size NAMEDATALEN. * * Similarly, the username can be passed by name, using the username parameter, * or by OID using the useroid parameter. * * In bootstrap mode no parameters are used. The autovacuum launcher process * doesn't use any parameters either, because it only goes far enough to be * able to read pg_database; it doesn't connect to any particular database. * In walsender mode only username is used. * * As of PostgreSQL 8.2, we expect InitProcess() was already called, so we * already have a PGPROC struct ... but it's not completely filled in yet. * * Note: * Be very careful with the order of calls in the InitPostgres function. * -------------------------------- */ void InitPostgres(const char *in_dbname, Oid dboid, const char *username, Oid useroid, char *out_dbname) { bool bootstrap = IsBootstrapProcessingMode(); bool am_superuser; char *fullpath; char dbname[NAMEDATALEN]; elog(DEBUG3, "InitPostgres"); /* * Add my PGPROC struct to the ProcArray. * * Once I have done this, I am visible to other backends! */ InitProcessPhase2(); /* * Initialize my entry in the shared-invalidation manager's array of * per-backend data. * * Sets up MyBackendId, a unique backend identifier. */ MyBackendId = InvalidBackendId; SharedInvalBackendInit(false); if (MyBackendId > MaxBackends || MyBackendId <= 0) elog(FATAL, "bad backend ID: %d", MyBackendId); /* Now that we have a BackendId, we can participate in ProcSignal */ ProcSignalInit(MyBackendId); /* * Also set up timeout handlers needed for backend operation. We need * these in every case except bootstrap. */ if (!bootstrap) { RegisterTimeout(DEADLOCK_TIMEOUT, CheckDeadLockAlert); RegisterTimeout(STATEMENT_TIMEOUT, StatementTimeoutHandler); RegisterTimeout(LOCK_TIMEOUT, LockTimeoutHandler); } /* * bufmgr needs another initialization call too */ InitBufferPoolBackend(); /* * Initialize local process's access to XLOG. */ if (IsUnderPostmaster) { /* * The postmaster already started the XLOG machinery, but we need to * call InitXLOGAccess(), if the system isn't in hot-standby mode. * This is handled by calling RecoveryInProgress and ignoring the * result. */ (void) RecoveryInProgress(); } else { /* * We are either a bootstrap process or a standalone backend. Either * way, start up the XLOG machinery, and register to have it closed * down at exit. */ StartupXLOG(); on_shmem_exit(ShutdownXLOG, 0); } /* * Initialize the relation cache and the system catalog caches. Note that * no catalog access happens here; we only set up the hashtable structure. * We must do this before starting a transaction because transaction abort * would try to touch these hashtables. */ RelationCacheInitialize(); InitCatalogCache(); InitPlanCache(); /* Initialize portal manager */ EnablePortalManager(); /* Initialize stats collection --- must happen before first xact */ if (!bootstrap) pgstat_initialize(); /* * Load relcache entries for the shared system catalogs. This must create * at least entries for pg_database and catalogs used for authentication. */ RelationCacheInitializePhase2(); /* * Set up process-exit callback to do pre-shutdown cleanup. This is the * first before_shmem_exit callback we register; thus, this will be the * last thing we do before low-level modules like the buffer manager begin * to close down. We need to have this in place before we begin our first * transaction --- if we fail during the initialization transaction, as is * entirely possible, we need the AbortTransaction call to clean up. */ before_shmem_exit(ShutdownPostgres, 0); /* The autovacuum launcher is done here */ if (IsAutoVacuumLauncherProcess() || IsClusterMonitorProcess()) return; /* * Start a new transaction here before first access to db, and get a * snapshot. We don't have a use for the snapshot itself, but we're * interested in the secondary effect that it sets RecentGlobalXmin. (This * is critical for anything that reads heap pages, because HOT may decide * to prune them even if the process doesn't attempt to modify any * tuples.) */ if (!bootstrap) { /* statement_timestamp must be set for timeouts to work correctly */ SetCurrentStatementStartTimestamp(); StartTransactionCommand(); /* * transaction_isolation will have been set to the default by the * above. If the default is "serializable", and we are in hot * standby, we will fail if we don't change it to something lower. * Fortunately, "read committed" is plenty good enough. */ XactIsoLevel = XACT_READ_COMMITTED; (void) GetTransactionSnapshot(); } /* * Perform client authentication if necessary, then figure out our * postgres user ID, and see if we are a superuser. * * In standalone mode and in autovacuum worker processes, we use a fixed * ID, otherwise we figure it out from the authenticated user name. */ if (bootstrap || IsAutoVacuumWorkerProcess()) { InitializeSessionUserIdStandalone(); am_superuser = true; } else if (!IsUnderPostmaster) { InitializeSessionUserIdStandalone(); am_superuser = true; if (!ThereIsAtLeastOneRole()) ereport(WARNING, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("no roles are defined in this database system"), errhint("You should immediately run CREATE USER \"%s\" SUPERUSER;.", username != NULL ? username : "******"))); } else if (IsBackgroundWorker) { if (username == NULL && !OidIsValid(useroid)) { InitializeSessionUserIdStandalone(); am_superuser = true; } else { InitializeSessionUserId(username, useroid); am_superuser = superuser(); } } else { /* normal multiuser case */ Assert(MyProcPort != NULL); PerformAuthentication(MyProcPort); InitializeSessionUserId(username, useroid); am_superuser = superuser(); } /* * If we're trying to shut down, only superusers can connect, and new * replication connections are not allowed. */ if ((!am_superuser || am_walsender) && MyProcPort != NULL && MyProcPort->canAcceptConnections == CAC_WAITBACKUP) { if (am_walsender) ereport(FATAL, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("new replication connections are not allowed during database shutdown"))); else ereport(FATAL, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to connect during database shutdown"))); } /* * Binary upgrades only allowed super-user connections */ if (IsBinaryUpgrade && !am_superuser) { ereport(FATAL, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to connect in binary upgrade mode"))); } /* * The last few connections slots are reserved for superusers. Although * replication connections currently require superuser privileges, we * don't allow them to consume the reserved slots, which are intended for * interactive use. */ if ((!am_superuser || am_walsender) && ReservedBackends > 0 && !HaveNFreeProcs(ReservedBackends)) ereport(FATAL, (errcode(ERRCODE_TOO_MANY_CONNECTIONS), errmsg("remaining connection slots are reserved for non-replication superuser connections"))); /* Check replication permissions needed for walsender processes. */ if (am_walsender) { Assert(!bootstrap); if (!superuser() && !has_rolreplication(GetUserId())) ereport(FATAL, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser or replication role to start walsender"))); } /* * If this is a plain walsender only supporting physical replication, we * don't want to connect to any particular database. Just finish the * backend startup by processing any options from the startup packet, and * we're done. */ if (am_walsender && !am_db_walsender) { /* process any options passed in the startup packet */ if (MyProcPort != NULL) process_startup_options(MyProcPort, am_superuser); /* Apply PostAuthDelay as soon as we've read all options */ if (PostAuthDelay > 0) pg_usleep(PostAuthDelay * 1000000L); /* initialize client encoding */ InitializeClientEncoding(); /* report this backend in the PgBackendStatus array */ pgstat_bestart(); /* close the transaction we started above */ CommitTransactionCommand(); return; } /* * Set up the global variables holding database id and default tablespace. * But note we won't actually try to touch the database just yet. * * We take a shortcut in the bootstrap case, otherwise we have to look up * the db's entry in pg_database. */ if (bootstrap) { MyDatabaseId = TemplateDbOid; MyDatabaseTableSpace = DEFAULTTABLESPACE_OID; } else if (in_dbname != NULL) { HeapTuple tuple; Form_pg_database dbform; tuple = GetDatabaseTuple(in_dbname); if (!HeapTupleIsValid(tuple)) ereport(FATAL, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", in_dbname))); dbform = (Form_pg_database) GETSTRUCT(tuple); MyDatabaseId = HeapTupleGetOid(tuple); MyDatabaseTableSpace = dbform->dattablespace; /* take database name from the caller, just for paranoia */ strlcpy(dbname, in_dbname, sizeof(dbname)); } else if (OidIsValid(dboid)) { /* caller specified database by OID */ HeapTuple tuple; Form_pg_database dbform; tuple = GetDatabaseTupleByOid(dboid); if (!HeapTupleIsValid(tuple)) ereport(FATAL, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database %u does not exist", dboid))); dbform = (Form_pg_database) GETSTRUCT(tuple); MyDatabaseId = HeapTupleGetOid(tuple); MyDatabaseTableSpace = dbform->dattablespace; Assert(MyDatabaseId == dboid); strlcpy(dbname, NameStr(dbform->datname), sizeof(dbname)); /* pass the database name back to the caller */ if (out_dbname) strcpy(out_dbname, dbname); } else { /* * If this is a background worker not bound to any particular * database, we're done now. Everything that follows only makes * sense if we are bound to a specific database. We do need to * close the transaction we started before returning. */ if (!bootstrap) CommitTransactionCommand(); return; } /* * Now, take a writer's lock on the database we are trying to connect to. * If there is a concurrently running DROP DATABASE on that database, this * will block us until it finishes (and has committed its update of * pg_database). * * Note that the lock is not held long, only until the end of this startup * transaction. This is OK since we will advertise our use of the * database in the ProcArray before dropping the lock (in fact, that's the * next thing to do). Anyone trying a DROP DATABASE after this point will * see us in the array once they have the lock. Ordering is important for * this because we don't want to advertise ourselves as being in this * database until we have the lock; otherwise we create what amounts to a * deadlock with CountOtherDBBackends(). * * Note: use of RowExclusiveLock here is reasonable because we envision * our session as being a concurrent writer of the database. If we had a * way of declaring a session as being guaranteed-read-only, we could use * AccessShareLock for such sessions and thereby not conflict against * CREATE DATABASE. */ if (!bootstrap) LockSharedObject(DatabaseRelationId, MyDatabaseId, 0, RowExclusiveLock); /* * Now we can mark our PGPROC entry with the database ID. * * We assume this is an atomic store so no lock is needed; though actually * things would work fine even if it weren't atomic. Anyone searching the * ProcArray for this database's ID should hold the database lock, so they * would not be executing concurrently with this store. A process looking * for another database's ID could in theory see a chance match if it read * a partially-updated databaseId value; but as long as all such searches * wait and retry, as in CountOtherDBBackends(), they will certainly see * the correct value on their next try. */ MyProc->databaseId = MyDatabaseId; /* * We established a catalog snapshot while reading pg_authid and/or * pg_database; but until we have set up MyDatabaseId, we won't react to * incoming sinval messages for unshared catalogs, so we won't realize it * if the snapshot has been invalidated. Assume it's no good anymore. */ InvalidateCatalogSnapshot(); /* * Recheck pg_database to make sure the target database hasn't gone away. * If there was a concurrent DROP DATABASE, this ensures we will die * cleanly without creating a mess. */ if (!bootstrap) { HeapTuple tuple; tuple = GetDatabaseTuple(dbname); if (!HeapTupleIsValid(tuple) || MyDatabaseId != HeapTupleGetOid(tuple) || MyDatabaseTableSpace != ((Form_pg_database) GETSTRUCT(tuple))->dattablespace) ereport(FATAL, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", dbname), errdetail("It seems to have just been dropped or renamed."))); } /* * Now we should be able to access the database directory safely. Verify * it's there and looks reasonable. */ fullpath = GetDatabasePath(MyDatabaseId, MyDatabaseTableSpace); if (!bootstrap) { if (access(fullpath, F_OK) == -1) { if (errno == ENOENT) ereport(FATAL, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", dbname), errdetail("The database subdirectory \"%s\" is missing.", fullpath))); else ereport(FATAL, (errcode_for_file_access(), errmsg("could not access directory \"%s\": %m", fullpath))); } ValidatePgVersion(fullpath); } SetDatabasePath(fullpath); /* * It's now possible to do real access to the system catalogs. * * Load relcache entries for the system catalogs. This must create at * least the minimum set of "nailed-in" cache entries. */ RelationCacheInitializePhase3(); /* set up ACL framework (so CheckMyDatabase can check permissions) */ initialize_acl(); /* * Re-read the pg_database row for our database, check permissions and set * up database-specific GUC settings. We can't do this until all the * database-access infrastructure is up. (Also, it wants to know if the * user is a superuser, so the above stuff has to happen first.) */ if (!bootstrap) CheckMyDatabase(dbname, am_superuser); /* * Now process any command-line switches and any additional GUC variable * settings passed in the startup packet. We couldn't do this before * because we didn't know if client is a superuser. */ if (MyProcPort != NULL) process_startup_options(MyProcPort, am_superuser); /* Process pg_db_role_setting options */ process_settings(MyDatabaseId, GetSessionUserId()); /* Apply PostAuthDelay as soon as we've read all options */ if (PostAuthDelay > 0) pg_usleep(PostAuthDelay * 1000000L); /* * Initialize various default states that can't be set up until we've * selected the active user and gotten the right GUC settings. */ /* set default namespace search path */ InitializeSearchPath(); /* initialize client encoding */ InitializeClientEncoding(); /* report this backend in the PgBackendStatus array */ if (!bootstrap) pgstat_bestart(); /* close the transaction we started above */ if (!bootstrap) CommitTransactionCommand(); }
/* * pg_prewarm(regclass, mode text, fork text, * first_block int8, last_block int8) * * The first argument is the relation to be prewarmed; the second controls * how prewarming is done; legal options are 'prefetch', 'read', and 'buffer'. * The third is the name of the relation fork to be prewarmed. The fourth * and fifth arguments specify the first and last block to be prewarmed. * If the fourth argument is NULL, it will be taken as 0; if the fifth argument * is NULL, it will be taken as the number of blocks in the relation. The * return value is the number of blocks successfully prewarmed. */ Datum pg_prewarm(PG_FUNCTION_ARGS) { Oid relOid; text *forkName; text *type; int64 first_block; int64 last_block; int64 nblocks; int64 blocks_done = 0; int64 block; Relation rel; ForkNumber forkNumber; char *forkString; char *ttype; PrewarmType ptype; AclResult aclresult; /* Basic sanity checking. */ if (PG_ARGISNULL(0)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("relation cannot be null"))); relOid = PG_GETARG_OID(0); if (PG_ARGISNULL(1)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errmsg("prewarm type cannot be null")))); type = PG_GETARG_TEXT_P(1); ttype = text_to_cstring(type); if (strcmp(ttype, "prefetch") == 0) ptype = PREWARM_PREFETCH; else if (strcmp(ttype, "read") == 0) ptype = PREWARM_READ; else if (strcmp(ttype, "buffer") == 0) ptype = PREWARM_BUFFER; else { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid prewarm type"), errhint("Valid prewarm types are \"prefetch\", \"read\", and \"buffer\"."))); PG_RETURN_INT64(0); /* Placate compiler. */ } if (PG_ARGISNULL(2)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), (errmsg("relation fork cannot be null")))); forkName = PG_GETARG_TEXT_P(2); forkString = text_to_cstring(forkName); forkNumber = forkname_to_number(forkString); /* Open relation and check privileges. */ rel = relation_open(relOid, AccessShareLock); aclresult = pg_class_aclcheck(relOid, GetUserId(), ACL_SELECT); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_CLASS, get_rel_name(relOid)); /* Check that the fork exists. */ RelationOpenSmgr(rel); if (!smgrexists(rel->rd_smgr, forkNumber)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("fork \"%s\" does not exist for this relation", forkString))); /* Validate block numbers, or handle nulls. */ nblocks = RelationGetNumberOfBlocksInFork(rel, forkNumber); if (PG_ARGISNULL(3)) first_block = 0; else { first_block = PG_GETARG_INT64(3); if (first_block < 0 || first_block >= nblocks) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("starting block number must be between 0 and " INT64_FORMAT, nblocks - 1))); } if (PG_ARGISNULL(4)) last_block = nblocks - 1; else { last_block = PG_GETARG_INT64(4); if (last_block < 0 || last_block >= nblocks) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ending block number must be between 0 and " INT64_FORMAT, nblocks - 1))); } /* Now we're ready to do the real work. */ if (ptype == PREWARM_PREFETCH) { #ifdef USE_PREFETCH /* * In prefetch mode, we just hint the OS to read the blocks, but we * don't know whether it really does it, and we don't wait for it to * finish. * * It would probably be better to pass our prefetch requests in chunks * of a megabyte or maybe even a whole segment at a time, but there's * no practical way to do that at present without a gross modularity * violation, so we just do this. */ for (block = first_block; block <= last_block; ++block) { CHECK_FOR_INTERRUPTS(); PrefetchBuffer(rel, forkNumber, block); ++blocks_done; } #else ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("prefetch is not supported by this build"))); #endif } else if (ptype == PREWARM_READ) { /* * In read mode, we actually read the blocks, but not into shared * buffers. This is more portable than prefetch mode (it works * everywhere) and is synchronous. */ for (block = first_block; block <= last_block; ++block) { CHECK_FOR_INTERRUPTS(); smgrread(rel->rd_smgr, forkNumber, block, blockbuffer); ++blocks_done; } } else if (ptype == PREWARM_BUFFER) { /* * In buffer mode, we actually pull the data into shared_buffers. */ for (block = first_block; block <= last_block; ++block) { Buffer buf; CHECK_FOR_INTERRUPTS(); buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL); ReleaseBuffer(buf); ++blocks_done; } } /* Close relation, release lock. */ relation_close(rel, AccessShareLock); PG_RETURN_INT64(blocks_done); }
/* * Validate the generic options given to a FOREIGN DATA WRAPPER, SERVER, * USER MAPPING or FOREIGN TABLE that uses file_fdw. * * Raise an ERROR if the option or its value is considered invalid. */ Datum mysql_fdw_validator(PG_FUNCTION_ARGS) { List *options_list = untransformRelOptions(PG_GETARG_DATUM(0)); Oid catalog = PG_GETARG_OID(1); char *svr_address = NULL; int svr_port = 0; char *svr_username = NULL; char *svr_password = NULL; char *svr_database = NULL; char *svr_query = NULL; char *svr_table = NULL; ListCell *cell; /* * Check that only options supported by mysql_fdw, * and allowed for the current object type, are given. */ foreach(cell, options_list) { DefElem *def = (DefElem *) lfirst(cell); if (!mysqlIsValidOption(def->defname, catalog)) { struct MySQLFdwOption *opt; StringInfoData buf; /* * Unknown option specified, complain about it. Provide a hint * with list of valid options for the object. */ initStringInfo(&buf); for (opt = valid_options; opt->optname; opt++) { if (catalog == opt->optcontext) appendStringInfo(&buf, "%s%s", (buf.len > 0) ? ", " : "", opt->optname); } ereport(ERROR, (errcode(ERRCODE_FDW_INVALID_OPTION_NAME), errmsg("invalid option \"%s\"", def->defname), errhint("Valid options in this context are: %s", buf.len ? buf.data : "<none>") )); } if (strcmp(def->defname, "address") == 0) { if (svr_address) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options: address (%s)", defGetString(def)) )); svr_address = defGetString(def); } else if (strcmp(def->defname, "port") == 0) { if (svr_port) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options: port (%s)", defGetString(def)) )); svr_port = atoi(defGetString(def)); } if (strcmp(def->defname, "username") == 0) { if (svr_username) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options: username (%s)", defGetString(def)) )); svr_username = defGetString(def); } if (strcmp(def->defname, "password") == 0) { if (svr_password) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options: password") )); svr_password = defGetString(def); } else if (strcmp(def->defname, "database") == 0) { if (svr_database) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options: database (%s)", defGetString(def)) )); svr_database = defGetString(def); } else if (strcmp(def->defname, "query") == 0) { if (svr_table) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting options: query cannot be used with table") )); if (svr_query) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options: query (%s)", defGetString(def)) )); svr_query = defGetString(def); } else if (strcmp(def->defname, "table") == 0) { if (svr_query) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting options: table cannot be used with query") )); if (svr_table) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("conflicting or redundant options: table (%s)", defGetString(def)) )); svr_table = defGetString(def); } }
/* * PGSharedMemoryCreate * * Create a shared memory segment of the given size and initialize its * standard header. Also, register an on_shmem_exit callback to release * the storage. * * Dead Postgres segments are recycled if found, but we do not fail upon * collision with non-Postgres shmem segments. The idea here is to detect and * re-use keys that may have been assigned by a crashed postmaster or backend. * * makePrivate means to always create a new segment, rather than attach to * or recycle any existing segment. * * The port number is passed for possible use as a key (for SysV, we use * it to generate the starting shmem key). In a standalone backend, * zero will be passed. */ PGShmemHeader * PGSharedMemoryCreate(Size size, bool makePrivate, int port) { IpcMemoryKey NextShmemSegID; void *memAddress; PGShmemHeader *hdr; IpcMemoryId shmid; struct stat statbuf; Size sysvsize = size; /* Room for a header? */ Assert(size > MAXALIGN(sizeof(PGShmemHeader))); /* * As of PostgreSQL 9.3, we normally allocate only a very small amount of * System V shared memory, and only for the purposes of providing an * interlock to protect the data directory. The real shared memory block * is allocated using mmap(). This works around the problem that many * systems have very low limits on the amount of System V shared memory * that can be allocated. Even a limit of a few megabytes will be enough * to run many copies of PostgreSQL without needing to adjust system * settings. * * However, we disable this logic in the EXEC_BACKEND case, and fall back * to the old method of allocating the entire segment using System V shared * memory, because there's no way to attach an mmap'd segment to a process * after exec(). Since EXEC_BACKEND is intended only for developer use, * this shouldn't be a big problem. */ #ifndef EXEC_BACKEND { long pagesize = sysconf(_SC_PAGE_SIZE); /* * Ensure request size is a multiple of pagesize. * * pagesize will, for practical purposes, always be a power of two. * But just in case it isn't, we do it this way instead of using * TYPEALIGN(). */ if (pagesize > 0 && size % pagesize != 0) size += pagesize - (size % pagesize); /* * We assume that no one will attempt to run PostgreSQL 9.3 or later * on systems that are ancient enough that anonymous shared memory is * not supported, such as pre-2.4 versions of Linux. If that turns out * to be false, we might need to add a run-time test here and do this * only if the running kernel supports it. */ AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, PG_MMAP_FLAGS, -1, 0); if (AnonymousShmem == MAP_FAILED) ereport(FATAL, (errmsg("could not map anonymous shared memory: %m"), (errno == ENOMEM) ? errhint("This error usually means that PostgreSQL's request " "for a shared memory segment exceeded available memory " "or swap space. To reduce the request size (currently " "%lu bytes), reduce PostgreSQL's shared memory usage, " "perhaps by reducing shared_buffers or " "max_connections.", (unsigned long) size) : 0)); AnonymousShmemSize = size; /* Now we need only allocate a minimal-sized SysV shmem block. */ sysvsize = sizeof(PGShmemHeader); } #endif /* Make sure PGSharedMemoryAttach doesn't fail without need */ UsedShmemSegAddr = NULL; /* Loop till we find a free IPC key */ NextShmemSegID = port * 1000; for (NextShmemSegID++;; NextShmemSegID++) { /* Try to create new segment */ memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize); if (memAddress) break; /* successful create and attach */ /* Check shared memory and possibly remove and recreate */ if (makePrivate) /* a standalone backend shouldn't do this */ continue; if ((memAddress = PGSharedMemoryAttach(NextShmemSegID, &shmid)) == NULL) continue; /* can't attach, not one of mine */ /* * If I am not the creator and it belongs to an extant process, * continue. */ hdr = (PGShmemHeader *) memAddress; if (hdr->creatorPID != getpid()) { if (kill(hdr->creatorPID, 0) == 0 || errno != ESRCH) { shmdt(memAddress); continue; /* segment belongs to a live process */ } } /* * The segment appears to be from a dead Postgres process, or from a * previous cycle of life in this same process. Zap it, if possible. * This probably shouldn't fail, but if it does, assume the segment * belongs to someone else after all, and continue quietly. */ shmdt(memAddress); if (shmctl(shmid, IPC_RMID, NULL) < 0) continue; /* * Now try again to create the segment. */ memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize); if (memAddress) break; /* successful create and attach */ /* * Can only get here if some other process managed to create the same * shmem key before we did. Let him have that one, loop around to try * next key. */ } /* * OK, we created a new segment. Mark it as created by this process. The * order of assignments here is critical so that another Postgres process * can't see the header as valid but belonging to an invalid PID! */ hdr = (PGShmemHeader *) memAddress; hdr->creatorPID = getpid(); hdr->magic = PGShmemMagic; /* Fill in the data directory ID info, too */ if (stat(DataDir, &statbuf) < 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not stat data directory \"%s\": %m", DataDir))); hdr->device = statbuf.st_dev; hdr->inode = statbuf.st_ino; /* * Initialize space allocation status for segment. */ hdr->totalsize = size; hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); /* Save info for possible future use */ UsedShmemSegAddr = memAddress; UsedShmemSegID = (unsigned long) NextShmemSegID; /* * If AnonymousShmem is NULL here, then we're not using anonymous shared * memory, and should return a pointer to the System V shared memory block. * Otherwise, the System V shared memory block is only a shim, and we must * return a pointer to the real block. */ if (AnonymousShmem == NULL) return hdr; memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader)); return (PGShmemHeader *) AnonymousShmem; }
/* * Load a single slot from disk into memory. */ static void RestoreSlotFromDisk(const char *name) { ReplicationSlotOnDisk cp; int i; char path[MAXPGPATH]; int fd; bool restored = false; int readBytes; pg_crc32c checksum; /* no need to lock here, no concurrent access allowed yet */ /* delete temp file if it exists */ sprintf(path, "pg_replslot/%s/state.tmp", name); if (unlink(path) < 0 && errno != ENOENT) ereport(PANIC, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); sprintf(path, "pg_replslot/%s/state", name); elog(DEBUG1, "restoring replication slot from \"%s\"", path); fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0); /* * We do not need to handle this as we are rename()ing the directory into * place only after we fsync()ed the state file. */ if (fd < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* * Sync state file before we're reading from it. We might have crashed * while it wasn't synced yet and we shouldn't continue on that basis. */ if (pg_fsync(fd) != 0) { CloseTransientFile(fd); ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); } /* Also sync the parent directory */ START_CRIT_SECTION(); fsync_fname(path, true); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize); if (readBytes != ReplicationSlotOnDiskConstantSize) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, (uint32) ReplicationSlotOnDiskConstantSize))); } /* verify magic */ if (cp.magic != SLOT_MAGIC) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has wrong magic %u instead of %u", path, cp.magic, SLOT_MAGIC))); /* verify version */ if (cp.version != SLOT_VERSION) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has unsupported version %u", path, cp.version))); /* boundary check on length */ if (cp.length != ReplicationSlotOnDiskV2Size) ereport(PANIC, (errcode_for_file_access(), errmsg("replication slot file \"%s\" has corrupted length %u", path, cp.length))); /* Now that we know the size, read the entire file */ readBytes = read(fd, (char *) &cp + ReplicationSlotOnDiskConstantSize, cp.length); if (readBytes != cp.length) { int saved_errno = errno; CloseTransientFile(fd); errno = saved_errno; ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\", read %d of %u: %m", path, readBytes, cp.length))); } CloseTransientFile(fd); /* now verify the CRC */ INIT_CRC32C(checksum); COMP_CRC32C(checksum, (char *) &cp + SnapBuildOnDiskNotChecksummedSize, SnapBuildOnDiskChecksummedSize); FIN_CRC32C(checksum); if (!EQ_CRC32C(checksum, cp.checksum)) ereport(PANIC, (errmsg("replication slot file %s: checksum mismatch, is %u, should be %u", path, checksum, cp.checksum))); /* * If we crashed with an ephemeral slot active, don't restore but delete * it. */ if (cp.slotdata.persistency != RS_PERSISTENT) { sprintf(path, "pg_replslot/%s", name); if (!rmtree(path, true)) { ereport(WARNING, (errcode_for_file_access(), errmsg("could not remove directory \"%s\"", path))); } fsync_fname("pg_replslot", true); return; } /* nothing can be active yet, don't lock anything */ for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *slot; slot = &ReplicationSlotCtl->replication_slots[i]; if (slot->in_use) continue; /* restore the entire set of persistent data */ memcpy(&slot->data, &cp.slotdata, sizeof(ReplicationSlotPersistentData)); /* initialize in memory state */ slot->effective_xmin = cp.slotdata.xmin; slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; slot->candidate_catalog_xmin = InvalidTransactionId; slot->candidate_xmin_lsn = InvalidXLogRecPtr; slot->candidate_restart_lsn = InvalidXLogRecPtr; slot->candidate_restart_valid = InvalidXLogRecPtr; slot->in_use = true; slot->active = false; restored = true; break; } if (!restored) ereport(PANIC, (errmsg("too many replication slots active before shutdown"), errhint("Increase max_replication_slots and try again."))); }
/* -------------------------------- * InitPostgres * Initialize POSTGRES. * * Note: * Be very careful with the order of calls in the InitPostgres function. * -------------------------------- */ void InitPostgres(const char *dbname, const char *username) { bool bootstrap = IsBootstrapProcessingMode(); /* * Set up the global variables holding database id and path. * * We take a shortcut in the bootstrap case, otherwise we have to look up * the db name in pg_database. */ if (bootstrap) { MyDatabaseId = TemplateDbOid; SetDatabasePath(GetDatabasePath(MyDatabaseId)); } else { char *fullpath, datpath[MAXPGPATH]; /* * Formerly we validated DataDir here, but now that's done * earlier. */ /* * Find oid and path of the database we're about to open. Since * we're not yet up and running we have to use the hackish * GetRawDatabaseInfo. */ GetRawDatabaseInfo(dbname, &MyDatabaseId, datpath); if (!OidIsValid(MyDatabaseId)) ereport(FATAL, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", dbname))); fullpath = GetDatabasePath(MyDatabaseId); /* Verify the database path */ if (access(fullpath, F_OK) == -1) { if (errno == ENOENT) ereport(FATAL, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" does not exist", dbname), errdetail("The database subdirectory \"%s\" is missing.", fullpath))); else ereport(FATAL, (errcode_for_file_access(), errmsg("could not access directory \"%s\": %m", fullpath))); } ValidatePgVersion(fullpath); if (chdir(fullpath) == -1) ereport(FATAL, (errcode_for_file_access(), errmsg("could not change directory to \"%s\": %m", fullpath))); SetDatabasePath(fullpath); } /* * Code after this point assumes we are in the proper directory! */ /* * Set up my per-backend PGPROC struct in shared memory. (We need * to know MyDatabaseId before we can do this, since it's entered into * the PGPROC struct.) */ InitProcess(); /* * Initialize my entry in the shared-invalidation manager's array of * per-backend data. (Formerly this came before InitProcess, but now * it must happen after, because it uses MyProc.) Once I have done * this, I am visible to other backends! * * Sets up MyBackendId, a unique backend identifier. */ MyBackendId = InvalidBackendId; InitBackendSharedInvalidationState(); if (MyBackendId > MaxBackends || MyBackendId <= 0) elog(FATAL, "bad backend id: %d", MyBackendId); /* * Initialize the transaction system override state. */ AmiTransactionOverride(bootstrap); /* * Initialize the relation descriptor cache. This must create at * least the minimum set of "nailed-in" cache entries. No catalog * access happens here. */ RelationCacheInitialize(); /* * Initialize all the system catalog caches. Note that no catalog * access happens here; we only set up the cache structure. */ InitCatalogCache(); /* Initialize portal manager */ EnablePortalManager(); /* * Initialize the deferred trigger manager --- must happen before * first transaction start. */ DeferredTriggerInit(); /* start a new transaction here before access to db */ if (!bootstrap) StartTransactionCommand(); /* * It's now possible to do real access to the system catalogs. * * Replace faked-up relcache entries with correct info. */ RelationCacheInitializePhase2(); /* * Figure out our postgres user id. In standalone mode we use a fixed * id, otherwise we figure it out from the authenticated user name. */ if (bootstrap) InitializeSessionUserIdStandalone(); else if (!IsUnderPostmaster) { InitializeSessionUserIdStandalone(); if (!ThereIsAtLeastOneUser()) ereport(WARNING, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("no users are defined in this database system"), errhint("You should immediately run CREATE USER \"%s\" WITH SYSID %d CREATEUSER;.", username, BOOTSTRAP_USESYSID))); } else { /* normal multiuser case */ InitializeSessionUserId(username); } /* * Unless we are bootstrapping, double-check that InitMyDatabaseInfo() * got a correct result. We can't do this until all the * database-access infrastructure is up. */ if (!bootstrap) ReverifyMyDatabase(dbname); /* * Final phase of relation cache startup: write a new cache file if * necessary. This is done after ReverifyMyDatabase to avoid writing * a cache file into a dead database. */ RelationCacheInitializePhase3(); /* * Check a normal user hasn't connected to a superuser reserved slot. * We can't do this till after we've read the user information, and we * must do it inside a transaction since checking superuserness may * require database access. The superuser check is probably the most * expensive part; don't do it until necessary. */ if (ReservedBackends > 0 && CountEmptyBackendSlots() < ReservedBackends && !superuser()) ereport(FATAL, (errcode(ERRCODE_TOO_MANY_CONNECTIONS), errmsg("connection limit exceeded for non-superusers"))); /* * Initialize various default states that can't be set up until we've * selected the active user and done ReverifyMyDatabase. */ /* set default namespace search path */ InitializeSearchPath(); /* initialize client encoding */ InitializeClientEncoding(); /* * Now all default states are fully set up. Report them to client if * appropriate. */ BeginReportingGUCOptions(); /* * Set up process-exit callback to do pre-shutdown cleanup. This * should be last because we want shmem_exit to call this routine * before the exit callbacks that are registered by buffer manager, * lock manager, etc. We need to run this code before we close down * database access! */ on_shmem_exit(ShutdownPostgres, 0); /* close the transaction we started above */ if (!bootstrap) CommitTransactionCommand(); }
/* ---------------------------------------------------------------- * ProcedureCreate * * Note: allParameterTypes, parameterModes, parameterNames, and proconfig * are either arrays of the proper types or NULL. We declare them Datum, * not "ArrayType *", to avoid importing array.h into pg_proc_fn.h. * ---------------------------------------------------------------- */ Oid ProcedureCreate(const char *procedureName, Oid procNamespace, bool replace, bool returnsSet, Oid returnType, Oid languageObjectId, Oid languageValidator, const char *prosrc, const char *probin, bool isAgg, bool isWindowFunc, bool security_definer, bool isStrict, char volatility, oidvector *parameterTypes, Datum allParameterTypes, Datum parameterModes, Datum parameterNames, List *parameterDefaults, Datum proconfig, float4 procost, float4 prorows) { Oid retval; int parameterCount; int allParamCount; Oid *allParams; bool genericInParam = false; bool genericOutParam = false; bool internalInParam = false; bool internalOutParam = false; Oid variadicType = InvalidOid; Oid proowner = GetUserId(); Acl *proacl = NULL; Relation rel; HeapTuple tup; HeapTuple oldtup; bool nulls[Natts_pg_proc]; Datum values[Natts_pg_proc]; bool replaces[Natts_pg_proc]; Oid relid; NameData procname; TupleDesc tupDesc; bool is_update; ObjectAddress myself, referenced; int i; /* * sanity checks */ Assert(PointerIsValid(prosrc)); parameterCount = parameterTypes->dim1; if (parameterCount < 0 || parameterCount > FUNC_MAX_ARGS) ereport(ERROR, (errcode(ERRCODE_TOO_MANY_ARGUMENTS), errmsg_plural("functions cannot have more than %d argument", "functions cannot have more than %d arguments", FUNC_MAX_ARGS, FUNC_MAX_ARGS))); /* note: the above is correct, we do NOT count output arguments */ if (allParameterTypes != PointerGetDatum(NULL)) { /* * We expect the array to be a 1-D OID array; verify that. We don't * need to use deconstruct_array() since the array data is just going * to look like a C array of OID values. */ ArrayType *allParamArray = (ArrayType *) DatumGetPointer(allParameterTypes); allParamCount = ARR_DIMS(allParamArray)[0]; if (ARR_NDIM(allParamArray) != 1 || allParamCount <= 0 || ARR_HASNULL(allParamArray) || ARR_ELEMTYPE(allParamArray) != OIDOID) elog(ERROR, "allParameterTypes is not a 1-D Oid array"); allParams = (Oid *) ARR_DATA_PTR(allParamArray); Assert(allParamCount >= parameterCount); /* we assume caller got the contents right */ } else { allParamCount = parameterCount; allParams = parameterTypes->values; } /* * Do not allow polymorphic return type unless at least one input argument * is polymorphic. Also, do not allow return type INTERNAL unless at * least one input argument is INTERNAL. */ for (i = 0; i < parameterCount; i++) { switch (parameterTypes->values[i]) { case ANYARRAYOID: case ANYELEMENTOID: case ANYNONARRAYOID: case ANYENUMOID: genericInParam = true; break; case INTERNALOID: internalInParam = true; break; } } if (allParameterTypes != PointerGetDatum(NULL)) { for (i = 0; i < allParamCount; i++) { /* * We don't bother to distinguish input and output params here, so * if there is, say, just an input INTERNAL param then we will * still set internalOutParam. This is OK since we don't really * care. */ switch (allParams[i]) { case ANYARRAYOID: case ANYELEMENTOID: case ANYNONARRAYOID: case ANYENUMOID: genericOutParam = true; break; case INTERNALOID: internalOutParam = true; break; } } } if ((IsPolymorphicType(returnType) || genericOutParam) && !genericInParam) ereport(ERROR, (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), errmsg("cannot determine result data type"), errdetail("A function returning a polymorphic type must have at least one polymorphic argument."))); if ((returnType == INTERNALOID || internalOutParam) && !internalInParam) ereport(ERROR, (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), errmsg("unsafe use of pseudo-type \"internal\""), errdetail("A function returning \"internal\" must have at least one \"internal\" argument."))); /* * don't allow functions of complex types that have the same name as * existing attributes of the type */ if (parameterCount == 1 && OidIsValid(parameterTypes->values[0]) && (relid = typeidTypeRelid(parameterTypes->values[0])) != InvalidOid && get_attnum(relid, procedureName) != InvalidAttrNumber) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_COLUMN), errmsg("\"%s\" is already an attribute of type %s", procedureName, format_type_be(parameterTypes->values[0])))); if (parameterModes != PointerGetDatum(NULL)) { /* * We expect the array to be a 1-D CHAR array; verify that. We don't * need to use deconstruct_array() since the array data is just going * to look like a C array of char values. */ ArrayType *modesArray = (ArrayType *) DatumGetPointer(parameterModes); char *modes; if (ARR_NDIM(modesArray) != 1 || ARR_DIMS(modesArray)[0] != allParamCount || ARR_HASNULL(modesArray) || ARR_ELEMTYPE(modesArray) != CHAROID) elog(ERROR, "parameterModes is not a 1-D char array"); modes = (char *) ARR_DATA_PTR(modesArray); /* * Only the last input parameter can be variadic; if it is, save its * element type. Errors here are just elog since caller should have * checked this already. */ for (i = 0; i < allParamCount; i++) { switch (modes[i]) { case PROARGMODE_IN: case PROARGMODE_INOUT: if (OidIsValid(variadicType)) elog(ERROR, "variadic parameter must be last"); break; case PROARGMODE_OUT: case PROARGMODE_TABLE: /* okay */ break; case PROARGMODE_VARIADIC: if (OidIsValid(variadicType)) elog(ERROR, "variadic parameter must be last"); switch (allParams[i]) { case ANYOID: variadicType = ANYOID; break; case ANYARRAYOID: variadicType = ANYELEMENTOID; break; default: variadicType = get_element_type(allParams[i]); if (!OidIsValid(variadicType)) elog(ERROR, "variadic parameter is not an array"); break; } break; default: elog(ERROR, "invalid parameter mode '%c'", modes[i]); break; } } } /* * All seems OK; prepare the data to be inserted into pg_proc. */ for (i = 0; i < Natts_pg_proc; ++i) { nulls[i] = false; values[i] = (Datum) 0; replaces[i] = true; } namestrcpy(&procname, procedureName); values[Anum_pg_proc_proname - 1] = NameGetDatum(&procname); values[Anum_pg_proc_pronamespace - 1] = ObjectIdGetDatum(procNamespace); values[Anum_pg_proc_proowner - 1] = ObjectIdGetDatum(proowner); values[Anum_pg_proc_prolang - 1] = ObjectIdGetDatum(languageObjectId); values[Anum_pg_proc_procost - 1] = Float4GetDatum(procost); values[Anum_pg_proc_prorows - 1] = Float4GetDatum(prorows); values[Anum_pg_proc_provariadic - 1] = ObjectIdGetDatum(variadicType); values[Anum_pg_proc_proisagg - 1] = BoolGetDatum(isAgg); values[Anum_pg_proc_proiswindow - 1] = BoolGetDatum(isWindowFunc); values[Anum_pg_proc_prosecdef - 1] = BoolGetDatum(security_definer); values[Anum_pg_proc_proisstrict - 1] = BoolGetDatum(isStrict); values[Anum_pg_proc_proretset - 1] = BoolGetDatum(returnsSet); values[Anum_pg_proc_provolatile - 1] = CharGetDatum(volatility); values[Anum_pg_proc_pronargs - 1] = UInt16GetDatum(parameterCount); values[Anum_pg_proc_pronargdefaults - 1] = UInt16GetDatum(list_length(parameterDefaults)); values[Anum_pg_proc_prorettype - 1] = ObjectIdGetDatum(returnType); values[Anum_pg_proc_proargtypes - 1] = PointerGetDatum(parameterTypes); if (allParameterTypes != PointerGetDatum(NULL)) values[Anum_pg_proc_proallargtypes - 1] = allParameterTypes; else nulls[Anum_pg_proc_proallargtypes - 1] = true; if (parameterModes != PointerGetDatum(NULL)) values[Anum_pg_proc_proargmodes - 1] = parameterModes; else nulls[Anum_pg_proc_proargmodes - 1] = true; if (parameterNames != PointerGetDatum(NULL)) values[Anum_pg_proc_proargnames - 1] = parameterNames; else nulls[Anum_pg_proc_proargnames - 1] = true; if (parameterDefaults != NIL) values[Anum_pg_proc_proargdefaults - 1] = CStringGetTextDatum(nodeToString(parameterDefaults)); else nulls[Anum_pg_proc_proargdefaults - 1] = true; values[Anum_pg_proc_prosrc - 1] = CStringGetTextDatum(prosrc); if (probin) values[Anum_pg_proc_probin - 1] = CStringGetTextDatum(probin); else nulls[Anum_pg_proc_probin - 1] = true; if (proconfig != PointerGetDatum(NULL)) values[Anum_pg_proc_proconfig - 1] = proconfig; else nulls[Anum_pg_proc_proconfig - 1] = true; /* proacl will be determined later */ rel = heap_open(ProcedureRelationId, RowExclusiveLock); tupDesc = RelationGetDescr(rel); /* Check for pre-existing definition */ oldtup = SearchSysCache3(PROCNAMEARGSNSP, PointerGetDatum(procedureName), PointerGetDatum(parameterTypes), ObjectIdGetDatum(procNamespace)); if (HeapTupleIsValid(oldtup)) { /* There is one; okay to replace it? */ Form_pg_proc oldproc = (Form_pg_proc) GETSTRUCT(oldtup); Datum proargnames; bool isnull; if (!replace) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_FUNCTION), errmsg("function \"%s\" already exists with same argument types", procedureName))); if (!pg_proc_ownercheck(HeapTupleGetOid(oldtup), proowner)) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_PROC, procedureName); /* * Not okay to change the return type of the existing proc, since * existing rules, views, etc may depend on the return type. */ if (returnType != oldproc->prorettype || returnsSet != oldproc->proretset) ereport(ERROR, (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), errmsg("cannot change return type of existing function"), errhint("Use DROP FUNCTION first."))); /* * If it returns RECORD, check for possible change of record type * implied by OUT parameters */ if (returnType == RECORDOID) { TupleDesc olddesc; TupleDesc newdesc; olddesc = build_function_result_tupdesc_t(oldtup); newdesc = build_function_result_tupdesc_d(allParameterTypes, parameterModes, parameterNames); if (olddesc == NULL && newdesc == NULL) /* ok, both are runtime-defined RECORDs */ ; else if (olddesc == NULL || newdesc == NULL || !equalTupleDescs(olddesc, newdesc)) ereport(ERROR, (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), errmsg("cannot change return type of existing function"), errdetail("Row type defined by OUT parameters is different."), errhint("Use DROP FUNCTION first."))); } /* * If there were any named input parameters, check to make sure the * names have not been changed, as this could break existing calls. We * allow adding names to formerly unnamed parameters, though. */ proargnames = SysCacheGetAttr(PROCNAMEARGSNSP, oldtup, Anum_pg_proc_proargnames, &isnull); if (!isnull) { Datum proargmodes; char **old_arg_names; char **new_arg_names; int n_old_arg_names; int n_new_arg_names; int j; proargmodes = SysCacheGetAttr(PROCNAMEARGSNSP, oldtup, Anum_pg_proc_proargmodes, &isnull); if (isnull) proargmodes = PointerGetDatum(NULL); /* just to be sure */ n_old_arg_names = get_func_input_arg_names(proargnames, proargmodes, &old_arg_names); n_new_arg_names = get_func_input_arg_names(parameterNames, parameterModes, &new_arg_names); for (j = 0; j < n_old_arg_names; j++) { if (old_arg_names[j] == NULL) continue; if (j >= n_new_arg_names || new_arg_names[j] == NULL || strcmp(old_arg_names[j], new_arg_names[j]) != 0) ereport(ERROR, (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), errmsg("cannot change name of input parameter \"%s\"", old_arg_names[j]), errhint("Use DROP FUNCTION first."))); } } /* * If there are existing defaults, check compatibility: redefinition * must not remove any defaults nor change their types. (Removing a * default might cause a function to fail to satisfy an existing call. * Changing type would only be possible if the associated parameter is * polymorphic, and in such cases a change of default type might alter * the resolved output type of existing calls.) */ if (oldproc->pronargdefaults != 0) { Datum proargdefaults; List *oldDefaults; ListCell *oldlc; ListCell *newlc; if (list_length(parameterDefaults) < oldproc->pronargdefaults) ereport(ERROR, (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), errmsg("cannot remove parameter defaults from existing function"), errhint("Use DROP FUNCTION first."))); proargdefaults = SysCacheGetAttr(PROCNAMEARGSNSP, oldtup, Anum_pg_proc_proargdefaults, &isnull); Assert(!isnull); oldDefaults = (List *) stringToNode(TextDatumGetCString(proargdefaults)); Assert(IsA(oldDefaults, List)); Assert(list_length(oldDefaults) == oldproc->pronargdefaults); /* new list can have more defaults than old, advance over 'em */ newlc = list_head(parameterDefaults); for (i = list_length(parameterDefaults) - oldproc->pronargdefaults; i > 0; i--) newlc = lnext(newlc); foreach(oldlc, oldDefaults) { Node *oldDef = (Node *) lfirst(oldlc); Node *newDef = (Node *) lfirst(newlc); if (exprType(oldDef) != exprType(newDef)) ereport(ERROR, (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), errmsg("cannot change data type of existing parameter default value"), errhint("Use DROP FUNCTION first."))); newlc = lnext(newlc); } }
/* * Construct an inner tuple containing the given prefix and node array */ SpGistInnerTuple spgFormInnerTuple(SpGistState *state, bool hasPrefix, Datum prefix, int nNodes, SpGistNodeTuple *nodes) { SpGistInnerTuple tup; unsigned int size; unsigned int prefixSize; int i; char *ptr; /* Compute size needed */ if (hasPrefix) prefixSize = SpGistGetTypeSize(&state->attPrefixType, prefix); else prefixSize = 0; size = SGITHDRSZ + prefixSize; /* Note: we rely on node tuple sizes to be maxaligned already */ for (i = 0; i < nNodes; i++) size += IndexTupleSize(nodes[i]); /* * Ensure that we can replace the tuple with a dead tuple later. This * test is unnecessary given current tuple layouts, but let's be safe. */ if (size < SGDTSIZE) size = SGDTSIZE; /* * Inner tuple should be small enough to fit on a page */ if (size > SPGIST_PAGE_CAPACITY - sizeof(ItemIdData)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("SP-GiST inner tuple size %zu exceeds maximum %zu", (Size) size, SPGIST_PAGE_CAPACITY - sizeof(ItemIdData)), errhint("Values larger than a buffer page cannot be indexed."))); /* * Check for overflow of header fields --- probably can't fail if the * above succeeded, but let's be paranoid */ if (size > SGITMAXSIZE || prefixSize > SGITMAXPREFIXSIZE || nNodes > SGITMAXNNODES) elog(ERROR, "SPGiST inner tuple header field is too small"); /* OK, form the tuple */ tup = (SpGistInnerTuple) palloc0(size); tup->nNodes = nNodes; tup->prefixSize = prefixSize; tup->size = size; if (hasPrefix) memcpyDatum(SGITDATAPTR(tup), &state->attPrefixType, prefix); ptr = (char *) SGITNODEPTR(tup); for (i = 0; i < nNodes; i++) { SpGistNodeTuple node = nodes[i]; memcpy(ptr, node, IndexTupleSize(node)); ptr += IndexTupleSize(node); } return tup; }
/* * CreateAcessMethod * Registers a new access method. */ ObjectAddress CreateAccessMethod(CreateAmStmt *stmt) { Relation rel; ObjectAddress myself; ObjectAddress referenced; Oid amoid; Oid amhandler; bool nulls[Natts_pg_am]; Datum values[Natts_pg_am]; HeapTuple tup; rel = heap_open(AccessMethodRelationId, RowExclusiveLock); /* Must be super user */ if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied to create access method \"%s\"", stmt->amname), errhint("Must be superuser to create an access method."))); /* Check if name is used */ amoid = GetSysCacheOid1(AMNAME, CStringGetDatum(stmt->amname)); if (OidIsValid(amoid)) { ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("access method \"%s\" already exists", stmt->amname))); } /* * Get the handler function oid, verifying the AM type while at it. */ amhandler = lookup_index_am_handler_func(stmt->handler_name, stmt->amtype); /* * Insert tuple into pg_am. */ memset(values, 0, sizeof(values)); memset(nulls, false, sizeof(nulls)); values[Anum_pg_am_amname - 1] = DirectFunctionCall1(namein, CStringGetDatum(stmt->amname)); values[Anum_pg_am_amhandler - 1] = ObjectIdGetDatum(amhandler); values[Anum_pg_am_amtype - 1] = CharGetDatum(stmt->amtype); tup = heap_form_tuple(RelationGetDescr(rel), values, nulls); amoid = simple_heap_insert(rel, tup); CatalogUpdateIndexes(rel, tup); heap_freetuple(tup); myself.classId = AccessMethodRelationId; myself.objectId = amoid; myself.objectSubId = 0; /* Record dependency on handler function */ referenced.classId = ProcedureRelationId; referenced.objectId = amhandler; referenced.objectSubId = 0; recordDependencyOn(&myself, &referenced, DEPENDENCY_NORMAL); recordDependencyOnCurrentExtension(&myself, false); heap_close(rel, RowExclusiveLock); return myself; }
/* * similar_escape() * Convert a SQL:2008 regexp pattern to POSIX style, so it can be used by * our regexp engine. */ Datum similar_escape(PG_FUNCTION_ARGS) { text *pat_text; text *esc_text; text *result; char *p, *e, *r; int plen, elen; bool afterescape = false; bool incharclass = false; int nquotes = 0; /* This function is not strict, so must test explicitly */ if (PG_ARGISNULL(0)) PG_RETURN_NULL(); pat_text = PG_GETARG_TEXT_PP(0); p = VARDATA_ANY(pat_text); plen = VARSIZE_ANY_EXHDR(pat_text); if (PG_ARGISNULL(1)) { /* No ESCAPE clause provided; default to backslash as escape */ e = "\\"; elen = 1; } else { esc_text = PG_GETARG_TEXT_PP(1); e = VARDATA_ANY(esc_text); elen = VARSIZE_ANY_EXHDR(esc_text); if (elen == 0) e = NULL; /* no escape character */ else if (elen != 1) ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), errmsg("invalid escape string"), errhint("Escape string must be empty or one character."))); } /*---------- * We surround the transformed input string with * ^(?: ... )$ * which requires some explanation. We need "^" and "$" to force * the pattern to match the entire input string as per SQL99 spec. * The "(?:" and ")" are a non-capturing set of parens; we have to have * parens in case the string contains "|", else the "^" and "$" will * be bound into the first and last alternatives which is not what we * want, and the parens must be non capturing because we don't want them * to count when selecting output for SUBSTRING. *---------- */ /* * We need room for the prefix/postfix plus as many as 3 output bytes per * input byte; since the input is at most 1GB this can't overflow */ result = (text *) palloc(VARHDRSZ + 6 + 3 * plen); r = VARDATA(result); *r++ = '^'; *r++ = '('; *r++ = '?'; *r++ = ':'; while (plen > 0) { char pchar = *p; if (afterescape) { if (pchar == '"' && !incharclass) /* for SUBSTRING patterns */ *r++ = ((nquotes++ % 2) == 0) ? '(' : ')'; else { *r++ = '\\'; *r++ = pchar; } afterescape = false; } else if (e && pchar == *e) { /* SQL99 escape character; do not send to output */ afterescape = true; } else if (incharclass) { if (pchar == '\\') *r++ = '\\'; *r++ = pchar; if (pchar == ']') incharclass = false; } else if (pchar == '[') { *r++ = pchar; incharclass = true; } else if (pchar == '%') { *r++ = '.'; *r++ = '*'; } else if (pchar == '_') *r++ = '.'; else if (pchar == '(') { /* convert to non-capturing parenthesis */ *r++ = '('; *r++ = '?'; *r++ = ':'; } else if (pchar == '\\' || pchar == '.' || pchar == '^' || pchar == '$') { *r++ = '\\'; *r++ = pchar; } else *r++ = pchar; p++, plen--; } *r++ = ')'; *r++ = '$'; SET_VARSIZE(result, r - ((char *) result)); PG_RETURN_TEXT_P(result); }
/* * InternalIpcMemoryCreate(memKey, size) * * Attempt to create a new shared memory segment with the specified key. * Will fail (return NULL) if such a segment already exists. If successful, * attach the segment to the current process and return its attached address. * On success, callbacks are registered with on_shmem_exit to detach and * delete the segment when on_shmem_exit is called. * * If we fail with a failure code other than collision-with-existing-segment, * print out an error and abort. Other types of errors are not recoverable. */ static void * InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size) { IpcMemoryId shmid; void *memAddress; shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection); if (shmid < 0) { /* * Fail quietly if error indicates a collision with existing segment. * One would expect EEXIST, given that we said IPC_EXCL, but perhaps * we could get a permission violation instead? Also, EIDRM might * occur if an old seg is slated for destruction but not gone yet. */ if (errno == EEXIST || errno == EACCES #ifdef EIDRM || errno == EIDRM #endif ) return NULL; /* * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if * there is an existing segment but it's smaller than "size" (this is * a result of poorly-thought-out ordering of error tests). To * distinguish between collision and invalid size in such cases, we * make a second try with size = 0. These kernels do not test size * against SHMMIN in the preexisting-segment case, so we will not get * EINVAL a second time if there is such a segment. */ if (errno == EINVAL) { int save_errno = errno; shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection); if (shmid < 0) { /* As above, fail quietly if we verify a collision */ if (errno == EEXIST || errno == EACCES #ifdef EIDRM || errno == EIDRM #endif ) return NULL; /* Otherwise, fall through to report the original error */ } else { /* * On most platforms we cannot get here because SHMMIN is * greater than zero. However, if we do succeed in creating a * zero-size segment, free it and then fall through to report * the original error. */ if (shmctl(shmid, IPC_RMID, NULL) < 0) elog(LOG, "shmctl(%d, %d, 0) failed: %m", (int) shmid, IPC_RMID); } errno = save_errno; } /* * Else complain and abort. * * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX * is violated. SHMALL violation might be reported as either ENOMEM * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which * it should be. SHMMNI violation is ENOSPC, per spec. Just plain * not-enough-RAM is ENOMEM. */ ereport(FATAL, (errmsg("could not create shared memory segment: %m"), errdetail("Failed system call was shmget(key=%lu, size=%lu, 0%o).", (unsigned long) memKey, (unsigned long) size, IPC_CREAT | IPC_EXCL | IPCProtection), (errno == EINVAL) ? errhint("This error usually means that PostgreSQL's request for a shared memory " "segment exceeded your kernel's SHMMAX parameter, or possibly that " "it is less than " "your kernel's SHMMIN parameter.\n" "The PostgreSQL documentation contains more information about shared " "memory configuration.") : 0, (errno == ENOMEM) ? errhint("This error usually means that PostgreSQL's request for a shared " "memory segment exceeded your kernel's SHMALL parameter. You may need " "to reconfigure the kernel with larger SHMALL.\n" "The PostgreSQL documentation contains more information about shared " "memory configuration.") : 0, (errno == ENOSPC) ? errhint("This error does *not* mean that you have run out of disk space. " "It occurs either if all available shared memory IDs have been taken, " "in which case you need to raise the SHMMNI parameter in your kernel, " "or because the system's overall limit for shared memory has been " "reached.\n" "The PostgreSQL documentation contains more information about shared " "memory configuration.") : 0)); } /* Register on-exit routine to delete the new segment */ on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid)); /* OK, should be able to attach to the segment */ memAddress = shmat(shmid, NULL, PG_SHMAT_FLAGS); if (memAddress == (void *) -1) elog(FATAL, "shmat(id=%d) failed: %m", shmid); /* Register on-exit routine to detach new segment before deleting */ on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress)); /* * Store shmem key and ID in data directory lockfile. Format to try to * keep it the same length always (trailing junk in the lockfile won't * hurt, but might confuse humans). */ { char line[64]; sprintf(line, "%9lu %9lu", (unsigned long) memKey, (unsigned long) shmid); AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line); } return memAddress; }
/* * CheckMyDatabase -- fetch information from the pg_database entry for our DB */ static void CheckMyDatabase(const char *name, bool am_superuser) { HeapTuple tup; Form_pg_database dbform; char *collate; char *ctype; /* Fetch our pg_database row normally, via syscache */ tup = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId)); if (!HeapTupleIsValid(tup)) elog(ERROR, "cache lookup failed for database %u", MyDatabaseId); dbform = (Form_pg_database) GETSTRUCT(tup); /* This recheck is strictly paranoia */ if (strcmp(name, NameStr(dbform->datname)) != 0) ereport(FATAL, (errcode(ERRCODE_UNDEFINED_DATABASE), errmsg("database \"%s\" has disappeared from pg_database", name), errdetail("Database OID %u now seems to belong to \"%s\".", MyDatabaseId, NameStr(dbform->datname)))); /* * Check permissions to connect to the database. * * These checks are not enforced when in standalone mode, so that there is * a way to recover from disabling all access to all databases, for * example "UPDATE pg_database SET datallowconn = false;". * * We do not enforce them for autovacuum worker processes either. */ if (IsUnderPostmaster && !IsAutoVacuumWorkerProcess()) { /* * Check that the database is currently allowing connections. */ if (!dbform->datallowconn) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("database \"%s\" is not currently accepting connections", name))); /* * Check privilege to connect to the database. (The am_superuser test * is redundant, but since we have the flag, might as well check it * and save a few cycles.) */ if (!am_superuser && pg_database_aclcheck(MyDatabaseId, GetUserId(), ACL_CONNECT) != ACLCHECK_OK) ereport(FATAL, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied for database \"%s\"", name), errdetail("User does not have CONNECT privilege."))); /* * Check connection limit for this database. * * There is a race condition here --- we create our PGPROC before * checking for other PGPROCs. If two backends did this at about the * same time, they might both think they were over the limit, while * ideally one should succeed and one fail. Getting that to work * exactly seems more trouble than it is worth, however; instead we * just document that the connection limit is approximate. */ if (dbform->datconnlimit >= 0 && #ifdef XCP IS_PGXC_COORDINATOR && #endif !am_superuser && CountDBBackends(MyDatabaseId) > dbform->datconnlimit) ereport(FATAL, (errcode(ERRCODE_TOO_MANY_CONNECTIONS), errmsg("too many connections for database \"%s\"", name))); } /* * OK, we're golden. Next to-do item is to save the encoding info out of * the pg_database tuple. */ SetDatabaseEncoding(dbform->encoding); /* Record it as a GUC internal option, too */ SetConfigOption("server_encoding", GetDatabaseEncodingName(), PGC_INTERNAL, PGC_S_OVERRIDE); /* If we have no other source of client_encoding, use server encoding */ SetConfigOption("client_encoding", GetDatabaseEncodingName(), PGC_BACKEND, PGC_S_DYNAMIC_DEFAULT); /* assign locale variables */ collate = NameStr(dbform->datcollate); ctype = NameStr(dbform->datctype); if (pg_perm_setlocale(LC_COLLATE, collate) == NULL) ereport(FATAL, (errmsg("database locale is incompatible with operating system"), errdetail("The database was initialized with LC_COLLATE \"%s\", " " which is not recognized by setlocale().", collate), errhint("Recreate the database with another locale or install the missing locale."))); if (pg_perm_setlocale(LC_CTYPE, ctype) == NULL) ereport(FATAL, (errmsg("database locale is incompatible with operating system"), errdetail("The database was initialized with LC_CTYPE \"%s\", " " which is not recognized by setlocale().", ctype), errhint("Recreate the database with another locale or install the missing locale."))); /* Make the locale settings visible as GUC variables, too */ SetConfigOption("lc_collate", collate, PGC_INTERNAL, PGC_S_OVERRIDE); SetConfigOption("lc_ctype", ctype, PGC_INTERNAL, PGC_S_OVERRIDE); ReleaseSysCache(tup); }
/* * Starts background worker that will create new partitions, * waits till it finishes the job and returns the result (new partition oid) */ Oid create_partitions_bg_worker(Oid relid, Datum value, Oid value_type, bool *crashed) { BackgroundWorker worker; BackgroundWorkerHandle *worker_handle; BgwHandleStatus status; dsm_segment *segment; dsm_handle segment_handle; pid_t pid; PartitionArgs *args; Oid child_oid; TypeCacheEntry *tce; /* Create a dsm segment for the worker to pass arguments */ segment = dsm_create(sizeof(PartitionArgs), 0); segment_handle = dsm_segment_handle(segment); tce = lookup_type_cache(value_type, 0); /* Fill arguments structure */ args = (PartitionArgs *) dsm_segment_address(segment); args->dbid = MyDatabaseId; args->relid = relid; if (tce->typbyval) args->value = value; else memcpy(&args->value, DatumGetPointer(value), sizeof(args->value)); args->by_val = tce->typbyval; args->value_type = value_type; args->result = 0; /* Initialize worker struct */ worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; worker.bgw_start_time = BgWorkerStart_RecoveryFinished; worker.bgw_restart_time = BGW_NEVER_RESTART; worker.bgw_main = bg_worker_main; worker.bgw_main_arg = Int32GetDatum(segment_handle); worker.bgw_notify_pid = MyProcPid; /* Start dynamic worker */ if (!RegisterDynamicBackgroundWorker(&worker, &worker_handle)) { elog(WARNING, "Unable to create background worker for pg_pathman"); } status = WaitForBackgroundWorkerStartup(worker_handle, &pid); if (status == BGWH_POSTMASTER_DIED) { ereport(WARNING, (errmsg("Postmaster died during the pg_pathman background worker process"), errhint("More details may be available in the server log."))); } /* Wait till the worker finishes its job */ status = WaitForBackgroundWorkerShutdown(worker_handle); if (status == BGWH_POSTMASTER_DIED) { ereport(WARNING, (errmsg("Postmaster died during the pg_pathman background worker process"), errhint("More details may be available in the server log."))); } *crashed = args->crashed; child_oid = args->result; /* Free dsm segment */ dsm_detach(segment); return child_oid; }
/* * Compose and dispatch the MPPEXEC commands corresponding to a plan tree * within a complete parallel plan. (A plan tree will correspond either * to an initPlan or to the main plan.) * * If cancelOnError is true, then any dispatching error, a cancellation * request from the client, or an error from any of the associated QEs, * may cause the unfinished portion of the plan to be abandoned or canceled; * and in the event this occurs before all gangs have been dispatched, this * function does not return, but waits for all QEs to stop and exits to * the caller's error catcher via ereport(ERROR,...). Otherwise this * function returns normally and errors are not reported until later. * * If cancelOnError is false, the plan is to be dispatched as fully as * possible and the QEs allowed to proceed regardless of cancellation * requests, errors or connection failures from other QEs, etc. * * The CdbDispatchResults objects allocated for the plan are returned * in *pPrimaryResults. The caller, after calling * CdbCheckDispatchResult(), can examine the CdbDispatchResults * objects, can keep them as long as needed, and ultimately must free * them with cdbdisp_destroyDispatcherState() prior to deallocation of * the caller's memory context. Callers should use PG_TRY/PG_CATCH to * ensure proper cleanup. * * To wait for completion, check for errors, and clean up, it is * suggested that the caller use cdbdisp_finishCommand(). * * Note that the slice tree dispatched is the one specified in the EState * of the argument QueryDesc as es_cur__slice. * * Note that the QueryDesc params must include PARAM_EXEC_REMOTE parameters * containing the values of any initplans required by the slice to be run. * (This is handled by calls to addRemoteExecParamsToParamList() from the * functions preprocess_initplans() and ExecutorRun().) * * Each QE receives its assignment as a message of type 'M' in PostgresMain(). * The message is deserialized and processed by exec_mpp_query() in postgres.c. */ void cdbdisp_dispatchPlan(struct QueryDesc *queryDesc, bool planRequiresTxn, bool cancelOnError, struct CdbDispatcherState *ds) { char *splan, *ssliceinfo, *sparams; int splan_len, splan_len_uncompressed, ssliceinfo_len, sparams_len; SliceTable *sliceTbl; int rootIdx; int oldLocalSlice; PlannedStmt *stmt; bool is_SRI; DispatchCommandQueryParms queryParms; CdbComponentDatabaseInfo *qdinfo; ds->primaryResults = NULL; ds->dispatchThreads = NULL; Assert(Gp_role == GP_ROLE_DISPATCH); Assert(queryDesc != NULL && queryDesc->estate != NULL); /* * Later we'll need to operate with the slice table provided via the * EState structure in the argument QueryDesc. Cache this information * locally and assert our expectations about it. */ sliceTbl = queryDesc->estate->es_sliceTable; rootIdx = RootSliceIndex(queryDesc->estate); Assert(sliceTbl != NULL); Assert(rootIdx == 0 || (rootIdx > sliceTbl->nMotions && rootIdx <= sliceTbl->nMotions + sliceTbl->nInitPlans)); /* * Keep old value so we can restore it. We use this field as a parameter. */ oldLocalSlice = sliceTbl->localSlice; /* * This function is called only for planned statements. */ stmt = queryDesc->plannedstmt; Assert(stmt); /* * Let's evaluate STABLE functions now, so we get consistent values on the QEs * * Also, if this is a single-row INSERT statement, let's evaluate * nextval() and currval() now, so that we get the QD's values, and a * consistent value for everyone * */ is_SRI = false; if (queryDesc->operation == CMD_INSERT) { Assert(stmt->commandType == CMD_INSERT); /* * We might look for constant input relation (instead of SRI), but I'm afraid * * that wouldn't scale. */ is_SRI = IsA(stmt->planTree, Result) && stmt->planTree->lefttree == NULL; } if (!is_SRI) clear_relsize_cache(); if (queryDesc->operation == CMD_INSERT || queryDesc->operation == CMD_SELECT || queryDesc->operation == CMD_UPDATE || queryDesc->operation == CMD_DELETE) { MemoryContext oldContext; oldContext = CurrentMemoryContext; if (stmt->qdContext) { oldContext = MemoryContextSwitchTo(stmt->qdContext); } else /* * memory context of plan tree should not change */ { MemoryContext mc = GetMemoryChunkContext(stmt->planTree); oldContext = MemoryContextSwitchTo(mc); } stmt->planTree = (Plan *) exec_make_plan_constant(stmt, is_SRI); MemoryContextSwitchTo(oldContext); } /* * Cursor queries and bind/execute path queries don't run on the * writer-gang QEs; but they require snapshot-synchronization to * get started. * * initPlans, and other work (see the function pre-evaluation * above) may advance the snapshot "segmateSync" value, so we're * best off setting the shared-snapshot-ready value here. This * will dispatch to the writer gang and force it to set its * snapshot; we'll then be able to serialize the same snapshot * version (see qdSerializeDtxContextInfo() below). */ if (queryDesc->extended_query) { verify_shared_snapshot_ready(); } stripPlanBeforeDispatch(queryDesc->plannedstmt); /* * serialized plan tree. Note that we're called for a single * slice tree (corresponding to an initPlan or the main plan), so the * parameters are fixed and we can include them in the prefix. */ splan = serializeNode((Node *) queryDesc->plannedstmt, &splan_len, &splan_len_uncompressed); /* * compute the total uncompressed size of the query plan for all slices */ int num_slices = queryDesc->plannedstmt->planTree->nMotionNodes + 1; uint64 plan_size_in_kb = ((uint64) splan_len_uncompressed * (uint64) num_slices) / (uint64) 1024; elog(((gp_log_gang >= GPVARS_VERBOSITY_VERBOSE) ? LOG : DEBUG1), "Query plan size to dispatch: " UINT64_FORMAT "KB", plan_size_in_kb); if (0 < gp_max_plan_size && plan_size_in_kb > gp_max_plan_size) { ereport(ERROR, (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), (errmsg("Query plan size limit exceeded, current size: " UINT64_FORMAT "KB, max allowed size: %dKB", plan_size_in_kb, gp_max_plan_size), errhint("Size controlled by gp_max_plan_size")))); } Assert(splan != NULL && splan_len > 0 && splan_len_uncompressed > 0); if (queryDesc->params != NULL && queryDesc->params->numParams > 0) { ParamListInfoData *pli; ParamExternData *pxd; StringInfoData parambuf; Size length; int plioff; int32 iparam; /* * Allocate buffer for params */ initStringInfo(¶mbuf); /* * Copy ParamListInfoData header and ParamExternData array */ pli = queryDesc->params; length = (char *) &pli->params[pli->numParams] - (char *) pli; plioff = parambuf.len; Assert(plioff == MAXALIGN(plioff)); appendBinaryStringInfo(¶mbuf, pli, length); /* * Copy pass-by-reference param values. */ for (iparam = 0; iparam < queryDesc->params->numParams; iparam++) { int16 typlen; bool typbyval; /* * Recompute pli each time in case parambuf.data is repalloc'ed */ pli = (ParamListInfoData *) (parambuf.data + plioff); pxd = &pli->params[iparam]; if (pxd->ptype == InvalidOid) continue; /* * Does pxd->value contain the value itself, or a pointer? */ get_typlenbyval(pxd->ptype, &typlen, &typbyval); if (!typbyval) { char *s = DatumGetPointer(pxd->value); if (pxd->isnull || !PointerIsValid(s)) { pxd->isnull = true; pxd->value = 0; } else { length = datumGetSize(pxd->value, typbyval, typlen); /* * We *must* set this before we * append. Appending may realloc, which will * invalidate our pxd ptr. (obviously we could * append first if we recalculate pxd from the new * base address) */ pxd->value = Int32GetDatum(length); appendBinaryStringInfo(¶mbuf, &iparam, sizeof(iparam)); appendBinaryStringInfo(¶mbuf, s, length); } } } sparams = parambuf.data; sparams_len = parambuf.len; } else { sparams = NULL; sparams_len = 0; } ssliceinfo = serializeNode((Node *) sliceTbl, &ssliceinfo_len, NULL /*uncompressed_size */ ); MemSet(&queryParms, 0, sizeof(queryParms)); queryParms.strCommand = queryDesc->sourceText; queryParms.serializedQuerytree = NULL; queryParms.serializedQuerytreelen = 0; queryParms.serializedPlantree = splan; queryParms.serializedPlantreelen = splan_len; queryParms.serializedParams = sparams; queryParms.serializedParamslen = sparams_len; queryParms.serializedSliceInfo = ssliceinfo; queryParms.serializedSliceInfolen = ssliceinfo_len; queryParms.rootIdx = rootIdx; /* * sequence server info */ qdinfo = &(getComponentDatabases()->entry_db_info[0]); Assert(qdinfo != NULL && qdinfo->hostip != NULL); queryParms.seqServerHost = pstrdup(qdinfo->hostip); queryParms.seqServerHostlen = strlen(qdinfo->hostip) + 1; queryParms.seqServerPort = seqServerCtl->seqServerPort; queryParms.primary_gang_id = 0; /* We are relying on the slice table to provide gang ids */ /* * serialized a version of our snapshot */ /* * Generate our transction isolations. We generally want Plan * based dispatch to be in a global transaction. The executor gets * to decide if the special circumstances exist which allow us to * dispatch without starting a global xact. */ queryParms.serializedDtxContextInfo = qdSerializeDtxContextInfo(&queryParms.serializedDtxContextInfolen, true /* wantSnapshot */ , queryDesc->extended_query, mppTxnOptions(planRequiresTxn), "cdbdisp_dispatchPlan"); Assert(sliceTbl); Assert(sliceTbl->slices != NIL); cdbdisp_dispatchX(&queryParms, cancelOnError, sliceTbl, ds); sliceTbl->localSlice = oldLocalSlice; }
/* * lo_export - * exports an (inversion) large object. */ Datum lo_export(PG_FUNCTION_ARGS) { Oid lobjId = PG_GETARG_OID(0); text *filename = PG_GETARG_TEXT_PP(1); int fd; int nbytes, tmp; char buf[BUFSIZE]; char fnamebuf[MAXPGPATH]; LargeObjectDesc *lobj; mode_t oumask; #ifndef ALLOW_DANGEROUS_LO_FUNCTIONS if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to use server-side lo_export()"), errhint("Anyone can use the client-side lo_export() provided by libpq."))); #endif CreateFSContext(); /* * open the inversion object (no need to test for failure) */ lobj = inv_open(lobjId, INV_READ, fscxt); /* * open the file to be written to * * Note: we reduce backend's normal 077 umask to the slightly friendlier * 022. This code used to drop it all the way to 0, but creating * world-writable export files doesn't seem wise. */ text_to_cstring_buffer(filename, fnamebuf, sizeof(fnamebuf)); oumask = umask(S_IWGRP | S_IWOTH); fd = OpenTransientFile(fnamebuf, O_CREAT | O_WRONLY | O_TRUNC | PG_BINARY, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); umask(oumask); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create server file \"%s\": %m", fnamebuf))); /* * read in from the inversion file and write to the filesystem */ while ((nbytes = inv_read(lobj, buf, BUFSIZE)) > 0) { tmp = write(fd, buf, nbytes); if (tmp != nbytes) ereport(ERROR, (errcode_for_file_access(), errmsg("could not write server file \"%s\": %m", fnamebuf))); } CloseTransientFile(fd); inv_close(lobj); PG_RETURN_INT32(1); }
/* * Connect to remote server using specified server and user mapping properties. */ static Jconn * connect_jdbc_server(ForeignServer *server, UserMapping *user) { Jconn *volatile conn = NULL; /* * Use PG_TRY block to ensure closing connection on error. */ PG_TRY(); { const char **keywords; const char **values; int n; /* * Construct connection params from generic options of ForeignServer * and UserMapping. (Some of them might not be libpq options, in * which case we'll just waste a few array slots.) Add 3 extra slots * for fallback_application_name, client_encoding, end marker. */ n = list_length(server->options) + list_length(user->options) + 3; keywords = (const char **) palloc(n * sizeof(char *)); values = (const char **) palloc(n * sizeof(char *)); n = 0; n += ExtractConnectionOptions(server->options, keywords + n, values + n); n += ExtractConnectionOptions(user->options, keywords + n, values + n); /* Use "jdbc2_fdw" as fallback_application_name. */ keywords[n] = "fallback_application_name"; values[n] = "jdbc2_fdw"; n++; /* Set client_encoding so that libpq can convert encoding properly. */ keywords[n] = "client_encoding"; values[n] = GetDatabaseEncodingName(); n++; keywords[n] = values[n] = NULL; /* verify connection parameters and make connection */ check_conn_params(keywords, values); conn = JQconnectdbParams(server, user, keywords, values); if (!conn || JQstatus(conn) != CONNECTION_OK) { char *connmessage; int msglen; /* libpq typically appends a newline, strip that */ connmessage = pstrdup(JQerrorMessage(conn)); msglen = strlen(connmessage); if (msglen > 0 && connmessage[msglen - 1] == '\n') connmessage[msglen - 1] = '\0'; ereport(ERROR, (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), errmsg("could not connect to server \"%s\"", server->servername), errdetail_internal("%s", connmessage))); } /* * Check that non-superuser has used password to establish connection; * otherwise, he's piggybacking on the jdbc server's user * identity. See also dblink_security_check() in contrib/dblink. */ if (!superuser() && !JQconnectionUsedPassword(conn)) ereport(ERROR, (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), errmsg("password is required"), errdetail("Non-superuser cannot connect if the server does not request a password."), errhint("Target server's authentication method must be changed."))); pfree(keywords); pfree(values); } PG_CATCH(); { /* Release Jconn data structure if we managed to create one */ if (conn) JQfinish(conn); PG_RE_THROW(); } PG_END_TRY(); return conn; }
/* * Workhouse routine for doing insertion into a GiST index. Note that * this routine assumes it is invoked in a short-lived memory context, * so it does not bother releasing palloc'd allocations. */ void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate, Relation heapRel) { ItemId iid; IndexTuple idxtuple; GISTInsertStack firststack; GISTInsertStack *stack; GISTInsertState state; bool xlocked = false; memset(&state, 0, sizeof(GISTInsertState)); state.freespace = freespace; state.r = r; state.heapRel = heapRel; /* Start from the root */ firststack.blkno = GIST_ROOT_BLKNO; firststack.lsn = 0; firststack.parent = NULL; firststack.downlinkoffnum = InvalidOffsetNumber; state.stack = stack = &firststack; /* * Walk down along the path of smallest penalty, updating the parent * pointers with the key we're inserting as we go. If we crash in the * middle, the tree is consistent, although the possible parent updates * were a waste. */ for (;;) { if (XLogRecPtrIsInvalid(stack->lsn)) stack->buffer = ReadBuffer(state.r, stack->blkno); /* * Be optimistic and grab shared lock first. Swap it for an exclusive * lock later if we need to update the page. */ if (!xlocked) { LockBuffer(stack->buffer, GIST_SHARE); gistcheckpage(state.r, stack->buffer); } stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = xlocked ? PageGetLSN(stack->page) : BufferGetLSNAtomic(stack->buffer); Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn)); /* * If this page was split but the downlink was never inserted to the * parent because the inserting backend crashed before doing that, fix * that now. */ if (GistFollowRight(stack->page)) { if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; /* someone might've completed the split when we unlocked */ if (!GistFollowRight(stack->page)) continue; } gistfixsplit(&state, giststate); UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (stack->blkno != GIST_ROOT_BLKNO && stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * Concurrent split detected. There's no guarantee that the * downlink for this page is consistent with the tuple we're * inserting anymore, so go back to parent and rechoose the best * child. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (!GistPageIsLeaf(stack->page)) { /* * This is an internal page so continue to walk down the tree. * Find the child node that has the minimum insertion penalty. */ BlockNumber childblkno; IndexTuple newtup; GISTInsertStack *item; OffsetNumber downlinkoffnum; /* currently, internal pages are never deleted */ Assert(!GistPageIsDeleted(stack->page)); downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); iid = PageGetItemId(stack->page, downlinkoffnum); idxtuple = (IndexTuple) PageGetItem(stack->page, iid); childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); /* * Check that it's not a leftover invalid tuple from pre-9.1 */ if (GistTupleIsInvalid(idxtuple)) ereport(ERROR, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(r)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); /* * Check that the key representing the target child node is * consistent with the key we're inserting. Update it if it's not. */ newtup = gistgetadjusted(state.r, idxtuple, itup, giststate); if (newtup) { /* * Swap shared lock for an exclusive one. Beware, the page may * change while we unlock/lock the page... */ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); if (PageGetLSN(stack->page) != stack->lsn) { /* the page was changed while we unlocked it, retry */ continue; } } /* * Update the tuple. * * We still hold the lock after gistinserttuple(), but it * might have to split the page to make the updated tuple fit. * In that case the updated tuple might migrate to the other * half of the split, so we have to go back to the parent and * descend back to the half that's a better fit for the new * tuple. */ if (gistinserttuple(&state, stack, giststate, newtup, downlinkoffnum)) { /* * If this was a root split, the root page continues to be * the parent and the updated tuple went to one of the * child pages, so we just need to retry from the root * page. */ if (stack->blkno != GIST_ROOT_BLKNO) { UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; } continue; } } LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; /* descend to the chosen child */ item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); item->blkno = childblkno; item->parent = stack; item->downlinkoffnum = downlinkoffnum; state.stack = stack = item; } else { /* * Leaf page. Insert the new key. We've already updated all the * parents on the way down, but we might have to split the page if * it doesn't fit. gistinserthere() will take care of that. */ /* * Swap shared lock for an exclusive one. Be careful, the page may * change while we unlock/lock the page... */ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = PageGetLSN(stack->page); if (stack->blkno == GIST_ROOT_BLKNO) { /* * the only page that can become inner instead of leaf is * the root page, so for root we should recheck it */ if (!GistPageIsLeaf(stack->page)) { /* * very rare situation: during unlock/lock index with * number of pages = 1 was increased */ LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; continue; } /* * we don't need to check root split, because checking * leaf/inner is enough to recognize split for root */ } else if (GistFollowRight(stack->page) || stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * The page was split while we momentarily unlocked the * page. Go back to parent. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } } /* * The page might have been deleted after we scanned the parent * and saw the downlink. */ if (GistPageIsDeleted(stack->page)) { UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } /* now state.stack->(page, buffer and blkno) points to leaf page */ gistinserttuple(&state, stack, giststate, itup, InvalidOffsetNumber); LockBuffer(stack->buffer, GIST_UNLOCK); /* Release any pins we might still hold before exiting */ for (; stack; stack = stack->parent) ReleaseBuffer(stack->buffer); break; } } }
/* * Try to identify a timezone name (in our terminology) that best matches the * observed behavior of the system timezone library. We cannot assume that * the system TZ environment setting (if indeed there is one) matches our * terminology, so we ignore it and just look at what localtime() returns. */ static const char * identify_system_timezone(void) { static char resultbuf[TZ_STRLEN_MAX + 1]; time_t tnow; time_t t; struct tztry tt; struct tm *tm; int thisyear; int bestscore; char tmptzdir[MAXPGPATH]; int std_ofs; char std_zone_name[TZ_STRLEN_MAX + 1], dst_zone_name[TZ_STRLEN_MAX + 1]; char cbuf[TZ_STRLEN_MAX + 1]; /* Initialize OS timezone library */ tzset(); /* * Set up the list of dates to be probed to see how well our timezone * matches the system zone. We first probe January and July of the * current year; this serves to quickly eliminate the vast majority of the * TZ database entries. If those dates match, we probe every week for 100 * years backwards from the current July. (Weekly resolution is good * enough to identify DST transition rules, since everybody switches on * Sundays.) This is sufficient to cover most of the Unix time_t range, * and we don't want to look further than that since many systems won't * have sane TZ behavior further back anyway. The further back the zone * matches, the better we score it. This may seem like a rather random * way of doing things, but experience has shown that system-supplied * timezone definitions are likely to have DST behavior that is right for * the recent past and not so accurate further back. Scoring in this way * allows us to recognize zones that have some commonality with the zic * database, without insisting on exact match. (Note: we probe Thursdays, * not Sundays, to avoid triggering DST-transition bugs in localtime * itself.) */ tnow = time(NULL); tm = localtime(&tnow); if (!tm) return NULL; /* give up if localtime is broken... */ thisyear = tm->tm_year + 1900; t = build_time_t(thisyear, 1, 15); /* * Round back to GMT midnight Thursday. This depends on the knowledge * that the time_t origin is Thu Jan 01 1970. (With a different origin * we'd be probing some other day of the week, but it wouldn't matter * anyway unless localtime() had DST-transition bugs.) */ t -= (t % T_WEEK); tt.n_test_times = 0; tt.test_times[tt.n_test_times++] = t; t = build_time_t(thisyear, 7, 15); t -= (t % T_WEEK); tt.test_times[tt.n_test_times++] = t; while (tt.n_test_times < MAX_TEST_TIMES) { t -= T_WEEK; tt.test_times[tt.n_test_times++] = t; } /* Search for the best-matching timezone file */ strcpy(tmptzdir, pg_TZDIR()); bestscore = -1; resultbuf[0] = '\0'; scan_available_timezones(tmptzdir, tmptzdir + strlen(tmptzdir) + 1, &tt, &bestscore, resultbuf); if (bestscore > 0) { /* Ignore zic's rather silly "Factory" time zone; use GMT instead */ if (strcmp(resultbuf, "Factory") == 0) return NULL; return resultbuf; } /* * Couldn't find a match in the database, so next we try constructed zone * names (like "PST8PDT"). * * First we need to determine the names of the local standard and daylight * zones. The idea here is to scan forward from today until we have seen * both zones, if both are in use. */ memset(std_zone_name, 0, sizeof(std_zone_name)); memset(dst_zone_name, 0, sizeof(dst_zone_name)); std_ofs = 0; tnow = time(NULL); /* * Round back to a GMT midnight so results don't depend on local time of * day */ tnow -= (tnow % T_DAY); /* * We have to look a little further ahead than one year, in case today is * just past a DST boundary that falls earlier in the year than the next * similar boundary. Arbitrarily scan up to 14 months. */ for (t = tnow; t <= tnow + T_MONTH * 14; t += T_MONTH) { tm = localtime(&t); if (!tm) continue; if (tm->tm_isdst < 0) continue; if (tm->tm_isdst == 0 && std_zone_name[0] == '\0') { /* found STD zone */ memset(cbuf, 0, sizeof(cbuf)); strftime(cbuf, sizeof(cbuf) - 1, "%Z", tm); /* zone abbr */ strcpy(std_zone_name, cbuf); std_ofs = get_timezone_offset(tm); } if (tm->tm_isdst > 0 && dst_zone_name[0] == '\0') { /* found DST zone */ memset(cbuf, 0, sizeof(cbuf)); strftime(cbuf, sizeof(cbuf) - 1, "%Z", tm); /* zone abbr */ strcpy(dst_zone_name, cbuf); } /* Done if found both */ if (std_zone_name[0] && dst_zone_name[0]) break; } /* We should have found a STD zone name by now... */ if (std_zone_name[0] == '\0') { ereport(LOG, (errmsg("could not determine system time zone"), errdetail("The PostgreSQL time zone will be set to \"%s\".", "GMT"), errhint("You can specify the correct timezone in postgresql.conf."))); return NULL; /* go to GMT */ } /* If we found DST then try STD<ofs>DST */ if (dst_zone_name[0] != '\0') { snprintf(resultbuf, sizeof(resultbuf), "%s%d%s", std_zone_name, -std_ofs / 3600, dst_zone_name); if (score_timezone(resultbuf, &tt) > 0) return resultbuf; } /* Try just the STD timezone (works for GMT at least) */ strcpy(resultbuf, std_zone_name); if (score_timezone(resultbuf, &tt) > 0) return resultbuf; /* Try STD<ofs> */ snprintf(resultbuf, sizeof(resultbuf), "%s%d", std_zone_name, -std_ofs / 3600); if (score_timezone(resultbuf, &tt) > 0) return resultbuf; /* * Did not find the timezone. Fallback to use a GMT zone. Note that the * zic timezone database names the GMT-offset zones in POSIX style: plus * is west of Greenwich. It's unfortunate that this is opposite of SQL * conventions. Should we therefore change the names? Probably not... */ snprintf(resultbuf, sizeof(resultbuf), "Etc/GMT%s%d", (-std_ofs > 0) ? "+" : "", -std_ofs / 3600); ereport(LOG, (errmsg("could not recognize system time zone"), errdetail("The PostgreSQL time zone will be set to \"%s\".", resultbuf), errhint("You can specify the correct timezone in postgresql.conf."))); return resultbuf; }
/* * Load a single slot from disk into memory. */ static void RestoreSlotFromDisk(const char *name) { ReplicationSlotOnDisk cp; int i; char slotdir[MAXPGPATH + 12]; char path[MAXPGPATH + 22]; int fd; bool restored = false; int readBytes; pg_crc32c checksum; /* no need to lock here, no concurrent access allowed yet */ /* delete temp file if it exists */ sprintf(slotdir, "pg_replslot/%s", name); sprintf(path, "%s/state.tmp", slotdir); if (unlink(path) < 0 && errno != ENOENT) ereport(PANIC, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); sprintf(path, "%s/state", slotdir); elog(DEBUG1, "restoring replication slot from \"%s\"", path); fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); /* * We do not need to handle this as we are rename()ing the directory into * place only after we fsync()ed the state file. */ if (fd < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); /* * Sync state file before we're reading from it. We might have crashed * while it wasn't synced yet and we shouldn't continue on that basis. */ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC); if (pg_fsync(fd) != 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); /* Also sync the parent directory */ START_CRIT_SECTION(); fsync_fname(slotdir, true); END_CRIT_SECTION(); /* read part of statefile that's guaranteed to be version independent */ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_READ); readBytes = read(fd, &cp, ReplicationSlotOnDiskConstantSize); pgstat_report_wait_end(); if (readBytes != ReplicationSlotOnDiskConstantSize) { if (readBytes < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", path))); else ereport(PANIC, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("could not read file \"%s\": read %d of %zu", path, readBytes, (Size) ReplicationSlotOnDiskConstantSize))); } /* verify magic */ if (cp.magic != SLOT_MAGIC) ereport(PANIC, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("replication slot file \"%s\" has wrong magic number: %u instead of %u", path, cp.magic, SLOT_MAGIC))); /* verify version */ if (cp.version != SLOT_VERSION) ereport(PANIC, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("replication slot file \"%s\" has unsupported version %u", path, cp.version))); /* boundary check on length */ if (cp.length != ReplicationSlotOnDiskV2Size) ereport(PANIC, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("replication slot file \"%s\" has corrupted length %u", path, cp.length))); /* Now that we know the size, read the entire file */ pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_READ); readBytes = read(fd, (char *) &cp + ReplicationSlotOnDiskConstantSize, cp.length); pgstat_report_wait_end(); if (readBytes != cp.length) { if (readBytes < 0) ereport(PANIC, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", path))); else ereport(PANIC, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("could not read file \"%s\": read %d of %zu", path, readBytes, (Size) cp.length))); } if (CloseTransientFile(fd)) ereport(PANIC, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", path))); /* now verify the CRC */ INIT_CRC32C(checksum); COMP_CRC32C(checksum, (char *) &cp + SnapBuildOnDiskNotChecksummedSize, SnapBuildOnDiskChecksummedSize); FIN_CRC32C(checksum); if (!EQ_CRC32C(checksum, cp.checksum)) ereport(PANIC, (errmsg("checksum mismatch for replication slot file \"%s\": is %u, should be %u", path, checksum, cp.checksum))); /* * If we crashed with an ephemeral slot active, don't restore but delete * it. */ if (cp.slotdata.persistency != RS_PERSISTENT) { if (!rmtree(slotdir, true)) { ereport(WARNING, (errmsg("could not remove directory \"%s\"", slotdir))); } fsync_fname("pg_replslot", true); return; } /* * Verify that requirements for the specific slot type are met. That's * important because if these aren't met we're not guaranteed to retain * all the necessary resources for the slot. * * NB: We have to do so *after* the above checks for ephemeral slots, * because otherwise a slot that shouldn't exist anymore could prevent * restarts. * * NB: Changing the requirements here also requires adapting * CheckSlotRequirements() and CheckLogicalDecodingRequirements(). */ if (cp.slotdata.database != InvalidOid && wal_level < WAL_LEVEL_LOGICAL) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("logical replication slot \"%s\" exists, but wal_level < logical", NameStr(cp.slotdata.name)), errhint("Change wal_level to be logical or higher."))); else if (wal_level < WAL_LEVEL_REPLICA) ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("physical replication slot \"%s\" exists, but wal_level < replica", NameStr(cp.slotdata.name)), errhint("Change wal_level to be replica or higher."))); /* nothing can be active yet, don't lock anything */ for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *slot; slot = &ReplicationSlotCtl->replication_slots[i]; if (slot->in_use) continue; /* restore the entire set of persistent data */ memcpy(&slot->data, &cp.slotdata, sizeof(ReplicationSlotPersistentData)); /* initialize in memory state */ slot->effective_xmin = cp.slotdata.xmin; slot->effective_catalog_xmin = cp.slotdata.catalog_xmin; slot->candidate_catalog_xmin = InvalidTransactionId; slot->candidate_xmin_lsn = InvalidXLogRecPtr; slot->candidate_restart_lsn = InvalidXLogRecPtr; slot->candidate_restart_valid = InvalidXLogRecPtr; slot->in_use = true; slot->active_pid = 0; restored = true; break; } if (!restored) ereport(FATAL, (errmsg("too many replication slots active before shutdown"), errhint("Increase max_replication_slots and try again."))); }
/* * FUNCTION UTL_FILE.FOPEN(location text, * filename text, * open_mode text, * max_linesize integer) * RETURNS UTL_FILE.FILE_TYPE; * * The FOPEN function opens specified file and returns file handle. * open_mode: ['R', 'W', 'A'] * max_linesize: [1 .. 32767] * * Exceptions: * INVALID_MODE, INVALID_OPERATION, INVALID_PATH, INVALID_MAXLINESIZE */ Datum utl_file_fopen(PG_FUNCTION_ARGS) { text *open_mode; int max_linesize; int encoding; const char *mode = NULL; FILE *file; char *fullname; int d; NOT_NULL_ARG(0); NOT_NULL_ARG(1); NOT_NULL_ARG(2); NOT_NULL_ARG(3); open_mode = PG_GETARG_TEXT_P(2); NON_EMPTY_TEXT(open_mode); max_linesize = PG_GETARG_INT32(3); CHECK_LINESIZE(max_linesize); if (PG_NARGS() > 4 && !PG_ARGISNULL(4)) { const char *encname = NameStr(*PG_GETARG_NAME(4)); encoding = pg_char_to_encoding(encname); if (encoding < 0) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid encoding name \"%s\"", encname))); } else encoding = GetDatabaseEncoding(); if (VARSIZE(open_mode) - VARHDRSZ != 1) CUSTOM_EXCEPTION(INVALID_MODE, "open mode is different than [R,W,A]"); switch (*((char*)VARDATA(open_mode))) { case 'a': case 'A': mode = "a"; break; case 'r': case 'R': mode = "r"; break; case 'w': case 'W': mode = "w"; break; default: CUSTOM_EXCEPTION(INVALID_MODE, "open mode is different than [R,W,A]"); } /* open file */ fullname = get_safe_path(PG_GETARG_TEXT_P(0), PG_GETARG_TEXT_P(1)); /* * We cannot use AllocateFile here because those files are automatically * closed at the end of (sub)transactions, but we want to keep them open * for oracle compatibility. */ #if NOT_USED fullname = convert_encoding_server_to_platform(fullname); #endif file = fopen(fullname, mode); if (!file) IO_EXCEPTION(); d = get_descriptor(file, max_linesize, encoding); if (d == INVALID_SLOTID) { fclose(file); ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("program limit exceeded"), errdetail("Too much concurent opened files"), errhint("You can only open a maximum of ten files for each session"))); } PG_RETURN_INT32(d); }
/* * Create a new replication slot and mark it as used by this backend. * * name: Name of the slot * db_specific: logical decoding is db specific; if the slot is going to * be used for that pass true, otherwise false. */ void ReplicationSlotCreate(const char *name, bool db_specific, ReplicationSlotPersistency persistency) { ReplicationSlot *slot = NULL; int i; Assert(MyReplicationSlot == NULL); ReplicationSlotValidateName(name, ERROR); /* * If some other backend ran this code concurrently with us, we'd likely * both allocate the same slot, and that would be bad. We'd also be at * risk of missing a name collision. Also, we don't want to try to create * a new slot while somebody's busy cleaning up an old one, because we * might both be monkeying with the same directory. */ LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE); /* * Check for name collision, and identify an allocatable slot. We need to * hold ReplicationSlotControlLock in shared mode for this, so that nobody * else can change the in_use flags while we're looking at them. */ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("replication slot \"%s\" already exists", name))); if (!s->in_use && slot == NULL) slot = s; } LWLockRelease(ReplicationSlotControlLock); /* If all slots are in use, we're out of luck. */ if (slot == NULL) ereport(ERROR, (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), errmsg("all replication slots are in use"), errhint("Free one or increase max_replication_slots."))); /* * Since this slot is not in use, nobody should be looking at any part of * it other than the in_use field unless they're trying to allocate it. * And since we hold ReplicationSlotAllocationLock, nobody except us can * be doing that. So it's safe to initialize the slot. */ Assert(!slot->in_use); Assert(slot->active_pid == 0); /* first initialize persistent data */ memset(&slot->data, 0, sizeof(ReplicationSlotPersistentData)); StrNCpy(NameStr(slot->data.name), name, NAMEDATALEN); slot->data.database = db_specific ? MyDatabaseId : InvalidOid; slot->data.persistency = persistency; /* and then data only present in shared memory */ slot->just_dirtied = false; slot->dirty = false; slot->effective_xmin = InvalidTransactionId; slot->effective_catalog_xmin = InvalidTransactionId; slot->candidate_catalog_xmin = InvalidTransactionId; slot->candidate_xmin_lsn = InvalidXLogRecPtr; slot->candidate_restart_valid = InvalidXLogRecPtr; slot->candidate_restart_lsn = InvalidXLogRecPtr; /* * Create the slot on disk. We haven't actually marked the slot allocated * yet, so no special cleanup is required if this errors out. */ CreateSlotOnDisk(slot); /* * We need to briefly prevent any other backend from iterating over the * slots while we flip the in_use flag. We also need to set the active * flag while holding the ControlLock as otherwise a concurrent * SlotAcquire() could acquire the slot as well. */ LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE); slot->in_use = true; /* We can now mark the slot active, and that makes it our slot. */ SpinLockAcquire(&slot->mutex); Assert(slot->active_pid == 0); slot->active_pid = MyProcPid; SpinLockRelease(&slot->mutex); MyReplicationSlot = slot; LWLockRelease(ReplicationSlotControlLock); /* * Now that the slot has been marked as in_use and active, it's safe to * let somebody else try to allocate a slot. */ LWLockRelease(ReplicationSlotAllocationLock); /* Let everybody know we've modified this slot */ ConditionVariableBroadcast(&slot->active_cv); }
/* ---------------------------------------------------------------- * * ---------------------------------------------------------------- */ Datum int4notin(PG_FUNCTION_ARGS) { int32 not_in_arg = PG_GETARG_INT32(0); text *relation_and_attr = PG_GETARG_TEXT_P(1); List *names; int nnames; RangeVar *relrv; char *attribute; Relation relation_to_scan; int32 integer_value; HeapTuple current_tuple; HeapScanDesc scan_descriptor; bool isNull, retval; int attrid; Datum value; /* Parse the argument */ names = textToQualifiedNameList(relation_and_attr); nnames = list_length(names); if (nnames < 2) ereport(ERROR, (errcode(ERRCODE_INVALID_NAME), errmsg("invalid name syntax"), errhint("Must provide \"relationname.columnname\"."))); attribute = strVal(llast(names)); names = list_truncate(names, nnames - 1); relrv = makeRangeVarFromNameList(names); /* Open the relation and get a relation descriptor */ relation_to_scan = heap_openrv(relrv, AccessShareLock); /* Find the column to search */ attrid = attnameAttNum(relation_to_scan, attribute, true); if (attrid == InvalidAttrNumber) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("column \"%s\" of relation \"%s\" does not exist", attribute, RelationGetRelationName(relation_to_scan)))); scan_descriptor = heap_beginscan(relation_to_scan, SnapshotNow, 0, (ScanKey) NULL); retval = true; /* do a scan of the relation, and do the check */ while ((current_tuple = heap_getnext(scan_descriptor, ForwardScanDirection)) != NULL) { value = heap_getattr(current_tuple, (AttrNumber) attrid, RelationGetDescr(relation_to_scan), &isNull); if (isNull) continue; integer_value = DatumGetInt32(value); if (not_in_arg == integer_value) { retval = false; break; /* can stop scanning now */ } } /* close the relation */ heap_endscan(scan_descriptor); heap_close(relation_to_scan, AccessShareLock); PG_RETURN_BOOL(retval); }
/* * master_create_worker_shards creates empty shards for the given table based * on the specified number of initial shards. The function first gets a list of * candidate nodes and issues DDL commands on the nodes to create empty shard * placements on those nodes. The function then updates metadata on the master * node to make this shard (and its placements) visible. Note that the function * assumes the table is hash partitioned and calculates the min/max hash token * ranges for each shard, giving them an equal split of the hash space. */ Datum master_create_worker_shards(PG_FUNCTION_ARGS) { text *tableNameText = PG_GETARG_TEXT_P(0); int32 shardCount = PG_GETARG_INT32(1); int32 replicationFactor = PG_GETARG_INT32(2); Oid distributedTableId = ResolveRelationId(tableNameText); char relationKind = get_rel_relkind(distributedTableId); char *tableName = text_to_cstring(tableNameText); char *relationOwner = NULL; char shardStorageType = '\0'; List *workerNodeList = NIL; List *ddlCommandList = NIL; int32 workerNodeCount = 0; uint32 placementAttemptCount = 0; uint64 hashTokenIncrement = 0; List *existingShardList = NIL; int64 shardIndex = 0; /* make sure table is hash partitioned */ CheckHashPartitionedTable(distributedTableId); /* * In contrast to append/range partitioned tables it makes more sense to * require ownership privileges - shards for hash-partitioned tables are * only created once, not continually during ingest as for the other * partitioning types. */ EnsureTableOwner(distributedTableId); /* we plan to add shards: get an exclusive metadata lock */ LockRelationDistributionMetadata(distributedTableId, ExclusiveLock); relationOwner = TableOwner(distributedTableId); /* validate that shards haven't already been created for this table */ existingShardList = LoadShardList(distributedTableId); if (existingShardList != NIL) { ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("table \"%s\" has already had shards created for it", tableName))); } /* make sure that at least one shard is specified */ if (shardCount <= 0) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("shard_count must be positive"))); } /* make sure that at least one replica is specified */ if (replicationFactor <= 0) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("replication_factor must be positive"))); } /* calculate the split of the hash space */ hashTokenIncrement = HASH_TOKEN_COUNT / shardCount; /* load and sort the worker node list for deterministic placement */ workerNodeList = WorkerNodeList(); workerNodeList = SortList(workerNodeList, CompareWorkerNodes); /* make sure we don't process cancel signals until all shards are created */ HOLD_INTERRUPTS(); /* retrieve the DDL commands for the table */ ddlCommandList = GetTableDDLEvents(distributedTableId); workerNodeCount = list_length(workerNodeList); if (replicationFactor > workerNodeCount) { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("replication_factor (%d) exceeds number of worker nodes " "(%d)", replicationFactor, workerNodeCount), errhint("Add more worker nodes or try again with a lower " "replication factor."))); } /* if we have enough nodes, add an extra placement attempt for backup */ placementAttemptCount = (uint32) replicationFactor; if (workerNodeCount > replicationFactor) { placementAttemptCount++; } /* set shard storage type according to relation type */ if (relationKind == RELKIND_FOREIGN_TABLE) { bool cstoreTable = CStoreTable(distributedTableId); if (cstoreTable) { shardStorageType = SHARD_STORAGE_COLUMNAR; } else { shardStorageType = SHARD_STORAGE_FOREIGN; } } else { shardStorageType = SHARD_STORAGE_TABLE; } for (shardIndex = 0; shardIndex < shardCount; shardIndex++) { uint32 roundRobinNodeIndex = shardIndex % workerNodeCount; /* initialize the hash token space for this shard */ text *minHashTokenText = NULL; text *maxHashTokenText = NULL; int32 shardMinHashToken = INT32_MIN + (shardIndex * hashTokenIncrement); int32 shardMaxHashToken = shardMinHashToken + (hashTokenIncrement - 1); Datum shardIdDatum = master_get_new_shardid(NULL); int64 shardId = DatumGetInt64(shardIdDatum); /* if we are at the last shard, make sure the max token value is INT_MAX */ if (shardIndex == (shardCount - 1)) { shardMaxHashToken = INT32_MAX; } /* insert the shard metadata row along with its min/max values */ minHashTokenText = IntegerToText(shardMinHashToken); maxHashTokenText = IntegerToText(shardMaxHashToken); /* * Grabbing the shard metadata lock isn't technically necessary since * we already hold an exclusive lock on the partition table, but we'll * acquire it for the sake of completeness. As we're adding new active * placements, the mode must be exclusive. */ LockShardDistributionMetadata(shardId, ExclusiveLock); CreateShardPlacements(shardId, ddlCommandList, relationOwner, workerNodeList, roundRobinNodeIndex, replicationFactor); InsertShardRow(distributedTableId, shardId, shardStorageType, minHashTokenText, maxHashTokenText); } if (QueryCancelPending) { ereport(WARNING, (errmsg("cancel requests are ignored during shard creation"))); QueryCancelPending = false; } RESUME_INTERRUPTS(); PG_RETURN_VOID(); }
/* * Create a table space * * Only superusers can create a tablespace. This seems a reasonable restriction * since we're determining the system layout and, anyway, we probably have * root if we're doing this kind of activity */ void CreateTableSpace(CreateTableSpaceStmt *stmt) { #ifdef HAVE_SYMLINK Relation rel; Datum values[Natts_pg_tablespace]; char nulls[Natts_pg_tablespace]; HeapTuple tuple; Oid tablespaceoid; char *location; char *linkloc; Oid ownerId; /* validate */ /* don't call this in a transaction block */ PreventTransactionChain((void *) stmt, "CREATE TABLESPACE"); /* Must be super user */ if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied to create tablespace \"%s\"", stmt->tablespacename), errhint("Must be superuser to create a tablespace."))); /* However, the eventual owner of the tablespace need not be */ if (stmt->owner) ownerId = get_roleid_checked(stmt->owner); else ownerId = GetUserId(); /* Unix-ify the offered path, and strip any trailing slashes */ location = pstrdup(stmt->location); canonicalize_path(location); /* disallow quotes, else CREATE DATABASE would be at risk */ if (strchr(location, '\'')) ereport(ERROR, (errcode(ERRCODE_INVALID_NAME), errmsg("tablespace location may not contain single quotes"))); /* * Allowing relative paths seems risky * * this also helps us ensure that location is not empty or whitespace */ if (!is_absolute_path(location)) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("tablespace location must be an absolute path"))); /* * Check that location isn't too long. Remember that we're going to append * '/<dboid>/<relid>.<nnn>' (XXX but do we ever form the whole path * explicitly? This may be overly conservative.) */ if (strlen(location) >= (MAXPGPATH - 1 - 10 - 1 - 10 - 1 - 10)) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("tablespace location \"%s\" is too long", location))); /* * Disallow creation of tablespaces named "pg_xxx"; we reserve this * namespace for system purposes. */ if (!allowSystemTableMods && IsReservedName(stmt->tablespacename)) ereport(ERROR, (errcode(ERRCODE_RESERVED_NAME), errmsg("unacceptable tablespace name \"%s\"", stmt->tablespacename), errdetail("The prefix \"pg_\" is reserved for system tablespaces."))); /* * Check that there is no other tablespace by this name. (The unique * index would catch this anyway, but might as well give a friendlier * message.) */ if (OidIsValid(get_tablespace_oid(stmt->tablespacename))) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("tablespace \"%s\" already exists", stmt->tablespacename))); /* * Insert tuple into pg_tablespace. The purpose of doing this first is to * lock the proposed tablename against other would-be creators. The * insertion will roll back if we find problems below. */ rel = heap_open(TableSpaceRelationId, RowExclusiveLock); MemSet(nulls, ' ', Natts_pg_tablespace); values[Anum_pg_tablespace_spcname - 1] = DirectFunctionCall1(namein, CStringGetDatum(stmt->tablespacename)); values[Anum_pg_tablespace_spcowner - 1] = ObjectIdGetDatum(ownerId); values[Anum_pg_tablespace_spclocation - 1] = DirectFunctionCall1(textin, CStringGetDatum(location)); nulls[Anum_pg_tablespace_spcacl - 1] = 'n'; tuple = heap_formtuple(rel->rd_att, values, nulls); tablespaceoid = simple_heap_insert(rel, tuple); CatalogUpdateIndexes(rel, tuple); heap_freetuple(tuple); /* Record dependency on owner */ recordDependencyOnOwner(TableSpaceRelationId, tablespaceoid, ownerId); /* * Attempt to coerce target directory to safe permissions. If this fails, * it doesn't exist or has the wrong owner. */ if (chmod(location, 0700) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not set permissions on directory \"%s\": %m", location))); /* * Check the target directory is empty. */ if (!directory_is_empty(location)) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("directory \"%s\" is not empty", location))); /* * Create the PG_VERSION file in the target directory. This has several * purposes: to make sure we can write in the directory, to prevent * someone from creating another tablespace pointing at the same directory * (the emptiness check above will fail), and to label tablespace * directories by PG version. */ set_short_version(location); /* * All seems well, create the symlink */ linkloc = (char *) palloc(10 + 10 + 1); sprintf(linkloc, "pg_tblspc/%u", tablespaceoid); if (symlink(location, linkloc) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create symbolic link \"%s\": %m", linkloc))); /* Record the filesystem change in XLOG */ { xl_tblspc_create_rec xlrec; XLogRecData rdata[2]; xlrec.ts_id = tablespaceoid; rdata[0].data = (char *) &xlrec; rdata[0].len = offsetof(xl_tblspc_create_rec, ts_path); rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); rdata[1].data = (char *) location; rdata[1].len = strlen(location) + 1; rdata[1].buffer = InvalidBuffer; rdata[1].next = NULL; (void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_CREATE, rdata); } pfree(linkloc); pfree(location); /* We keep the lock on pg_tablespace until commit */ heap_close(rel, NoLock); #else /* !HAVE_SYMLINK */ ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("tablespaces are not supported on this platform"))); #endif /* HAVE_SYMLINK */ }
/* * RequestCheckpoint * Called in backend processes to request a checkpoint * * flags is a bitwise OR of the following: * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP, * ignoring checkpoint_completion_target parameter. * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or * CHECKPOINT_END_OF_RECOVERY). * CHECKPOINT_WAIT: wait for completion before returning (otherwise, * just signal checkpointer to do it, and return). * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling. * (This affects logging, and in particular enables CheckPointWarning.) */ void RequestCheckpoint(int flags) { int ntries; int old_failed, old_started; /* * If in a standalone backend, just do it ourselves. */ if (!IsPostmasterEnvironment) { /* * There's no point in doing slow checkpoints in a standalone backend, * because there's no other backends the checkpoint could disrupt. */ CreateCheckPoint(flags | CHECKPOINT_IMMEDIATE); /* * After any checkpoint, close all smgr files. This is so we won't * hang onto smgr references to deleted files indefinitely. */ smgrcloseall(); return; } /* * Atomically set the request flags, and take a snapshot of the counters. * When we see ckpt_started > old_started, we know the flags we set here * have been seen by checkpointer. * * Note that we OR the flags with any existing flags, to avoid overriding * a "stronger" request by another backend. The flag senses must be * chosen to make this work! */ SpinLockAcquire(&CheckpointerShmem->ckpt_lck); old_failed = CheckpointerShmem->ckpt_failed; old_started = CheckpointerShmem->ckpt_started; CheckpointerShmem->ckpt_flags |= flags; SpinLockRelease(&CheckpointerShmem->ckpt_lck); /* * Send signal to request checkpoint. It's possible that the checkpointer * hasn't started yet, or is in process of restarting, so we will retry a * few times if needed. Also, if not told to wait for the checkpoint to * occur, we consider failure to send the signal to be nonfatal and merely * LOG it. */ for (ntries = 0;; ntries++) { if (CheckpointerShmem->checkpointer_pid == 0) { if (ntries >= 20) /* max wait 2.0 sec */ { elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, "could not request checkpoint because checkpointer not running"); break; } } else if (kill(CheckpointerShmem->checkpointer_pid, SIGINT) != 0) { if (ntries >= 20) /* max wait 2.0 sec */ { elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, "could not signal for checkpoint: %m"); break; } } else break; /* signal sent successfully */ CHECK_FOR_INTERRUPTS(); pg_usleep(100000L); /* wait 0.1 sec, then retry */ } /* * If requested, wait for completion. We detect completion according to * the algorithm given above. */ if (flags & CHECKPOINT_WAIT) { int new_started, new_failed; /* Wait for a new checkpoint to start. */ for (;;) { SpinLockAcquire(&CheckpointerShmem->ckpt_lck); new_started = CheckpointerShmem->ckpt_started; SpinLockRelease(&CheckpointerShmem->ckpt_lck); if (new_started != old_started) break; CHECK_FOR_INTERRUPTS(); pg_usleep(100000L); } /* * We are waiting for ckpt_done >= new_started, in a modulo sense. */ for (;;) { int new_done; SpinLockAcquire(&CheckpointerShmem->ckpt_lck); new_done = CheckpointerShmem->ckpt_done; new_failed = CheckpointerShmem->ckpt_failed; SpinLockRelease(&CheckpointerShmem->ckpt_lck); if (new_done - new_started >= 0) break; CHECK_FOR_INTERRUPTS(); pg_usleep(100000L); } if (new_failed != old_failed) ereport(ERROR, (errmsg("checkpoint request failed"), errhint("Consult recent messages in the server log for details."))); } }