/*
 * cdbRelMaxSegSize
 *		Get the max size of the relation across segments.
 *
 * Dispatches "select pg_relation_size('<schema>.<relation>')" to the
 * segments and returns the largest value any of them reports.  Returns 0
 * when the dispatch yields no results.
 *
 * Raises ERROR if the relation's namespace cannot be looked up or if a
 * segment result is not PGRES_TUPLES_OK.
 */
int64
cdbRelMaxSegSize(Relation rel)
{
	int64		size = 0;
	int			i;
	CdbPgResults cdb_pgresults = {NULL, 0};
	StringInfoData buffer;
	char	   *schemaName;
	char	   *relName;

	/*
	 * Let's ask the QEs for the size of the relation
	 */
	initStringInfo(&buffer);

	schemaName = get_namespace_name(RelationGetNamespace(rel));
	if (schemaName == NULL)
		elog(ERROR, "cache lookup failed for namespace %u",
			 RelationGetNamespace(rel));
	relName = RelationGetRelationName(rel);

	/*
	 * Safer to pass names than oids, just in case they get out of sync
	 * between QD and QE, which might happen with a toast table or index, I
	 * think (but maybe not).
	 *
	 * NOTE(review): the names are identifier-quoted but then embedded in a
	 * single-quoted literal; a name containing a single quote would
	 * presumably break the statement -- confirm and consider literal
	 * quoting.
	 */
	appendStringInfo(&buffer, "select pg_relation_size('%s.%s')",
					 quote_identifier(schemaName),
					 quote_identifier(relName));

	CdbDispatchCommand(buffer.data, DF_WITH_SNAPSHOT, &cdb_pgresults);

	for (i = 0; i < cdb_pgresults.numResults; i++)
	{
		struct pg_result *pgresult = cdb_pgresults.pg_results[i];
		int64		tempsize = 0;

		if (PQresultStatus(pgresult) != PGRES_TUPLES_OK)
		{
			/*
			 * Grab everything we want to report BEFORE releasing the
			 * results: the error message is owned by the PGresult and must
			 * not be read once the result has been cleared.  PQresStatus()
			 * returns a string constant, so only the message needs a copy.
			 */
			const char *status = PQresStatus(PQresultStatus(pgresult));
			char	   *message = pstrdup(PQresultErrorMessage(pgresult));

			cdbdisp_clearCdbPgResults(&cdb_pgresults);
			elog(ERROR, "cdbRelMaxSegSize: resultStatus not tuples_Ok: %s %s",
				 status, message);
		}

		Assert(PQntuples(pgresult) == 1);

		/* errorOK is false, so scanint8 reports malformed input itself */
		(void) scanint8(PQgetvalue(pgresult, 0, 0), false, &tempsize);
		if (tempsize > size)
			size = tempsize;
	}

	pfree(buffer.data);
	cdbdisp_clearCdbPgResults(&cdb_pgresults);

	return size;
}
/*
 * dumpResGroupInfo
 *		Append a dump of the resource group state to 'str'.
 *
 * Outside dispatch mode only the local state is dumped (under
 * ResGroupLock).  In dispatch mode the segments are asked for their dump
 * via pg_resgroup_get_status_kv('dump') and the output has the shape
 *
 *		{"info":[<local dump>,<segment dump>,...]}
 *
 * Raises ERROR if no segment answers or a segment result is not
 * PGRES_TUPLES_OK.
 */
static void
dumpResGroupInfo(StringInfo str)
{
	StringInfoData localDump;
	StringInfoData query;
	CdbPgResults segResults = {NULL, 0};
	int			lastIdx;
	int			idx;

	if (Gp_role != GP_ROLE_DISPATCH)
	{
		/* Not the dispatcher: just dump our own state. */
		LWLockAcquire(ResGroupLock, LW_EXCLUSIVE);
		ResGroupDumpInfo(str);
		LWLockRelease(ResGroupLock);
		return;
	}

	initStringInfo(&localDump);
	initStringInfo(&query);
	appendStringInfo(&query,
					 "select * from pg_resgroup_get_status_kv('dump');");

	CdbDispatchCommand(query.data, 0, &segResults);

	if (segResults.numResults == 0)
		elog(ERROR, "dumpResGroupInfo didn't get back any results from the segDBs");

	LWLockAcquire(ResGroupLock, LW_EXCLUSIVE);
	ResGroupDumpInfo(&localDump);
	LWLockRelease(ResGroupLock);

	/* The local dump goes first, followed by one entry per segment result. */
	appendStringInfo(str, "{\"info\":[%s,", localDump.data);

	lastIdx = segResults.numResults - 1;
	idx = 0;
	while (idx < segResults.numResults)
	{
		struct pg_result *segRes = segResults.pg_results[idx];

		if (PQresultStatus(segRes) != PGRES_TUPLES_OK)
		{
			cdbdisp_clearCdbPgResults(&segResults);
			elog(ERROR, "pg_resgroup_get_status_kv(): resultStatus not tuples_Ok");
		}

		Assert(PQntuples(segRes) == 1);

		/* The dump text is read from the third column of the single row. */
		appendStringInfo(str, "%s", PQgetvalue(segRes, 0, 2));

		/* Separator after every element except the last one. */
		if (idx < lastIdx)
			appendStringInfo(str, ",");

		idx++;
	}

	appendStringInfo(str, "]}");

	cdbdisp_clearCdbPgResults(&segResults);
}
/* * Helper function to dispatch a size-returning command. * * Dispatches the given SQL query to segments, and sums up the results. * The query is expected to return one int8 value. */ int64 get_size_from_segDBs(const char *cmd) { int64 result; CdbPgResults cdb_pgresults = {NULL, 0}; int i; Assert(Gp_role == GP_ROLE_DISPATCH); CdbDispatchCommand(cmd, DF_WITH_SNAPSHOT, &cdb_pgresults); result = 0; for (i = 0; i < cdb_pgresults.numResults; i++) { Datum value; struct pg_result *pgresult = cdb_pgresults.pg_results[i]; if (PQresultStatus(pgresult) != PGRES_TUPLES_OK) { cdbdisp_clearCdbPgResults(&cdb_pgresults); ereport(ERROR, (errmsg("unexpected result from segment: %d", PQresultStatus(pgresult)))); } if (PQntuples(pgresult) != 1 || PQnfields(pgresult) != 1) { cdbdisp_clearCdbPgResults(&cdb_pgresults); ereport(ERROR, (errmsg("unexpected shape of result from segment (%d rows, %d cols)", PQntuples(pgresult), PQnfields(pgresult)))); } if (PQgetisnull(pgresult, 0, 0)) value = 0; else value = DirectFunctionCall1(int8in, CStringGetDatum(PQgetvalue(pgresult, 0, 0))); result += value; } cdbdisp_clearCdbPgResults(&cdb_pgresults); return result; }
/* * Delete error log of the specified relation. This returns true from master * iif all segments and master find the relation. */ Datum gp_truncate_error_log(PG_FUNCTION_ARGS) { text *relname; char *relname_str; RangeVar *relrv; Oid relid; bool allResults = true; relname = PG_GETARG_TEXT_P(0); relname_str = text_to_cstring(relname); if (strcmp(relname_str, "*.*") == 0) { /* * Only superuser is allowed to delete log files across database. */ if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to delete all error log files")))); ErrorLogDelete(InvalidOid, InvalidOid); } else if (strcmp(relname_str, "*") == 0) { /* * Database owner can delete error log files. */ if (!pg_database_ownercheck(MyDatabaseId, GetUserId())) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_DATABASE, get_database_name(MyDatabaseId)); ErrorLogDelete(MyDatabaseId, InvalidOid); } else { AclResult aclresult; relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); relid = RangeVarGetRelid(relrv, true); /* Return false if the relation does not exist. */ if (!OidIsValid(relid)) PG_RETURN_BOOL(false); /* * Allow only the table owner to truncate error log. */ aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_TRUNCATE); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_CLASS, relrv->relname); /* We don't care if this fails or not. */ ErrorLogDelete(MyDatabaseId, relid); } /* * Dispatch the work to segments. 
*/ if (Gp_role == GP_ROLE_DISPATCH) { int i = 0; StringInfoData sql; CdbPgResults cdb_pgresults = {NULL, 0}; initStringInfo(&sql); appendStringInfo(&sql, "SELECT pg_catalog.gp_truncate_error_log(%s)", quote_literal_internal(text_to_cstring(relname))); CdbDispatchCommand(sql.data, DF_WITH_SNAPSHOT, &cdb_pgresults); for (i = 0; i < cdb_pgresults.numResults; i++) { Datum value; bool isnull; struct pg_result *pgresult = cdb_pgresults.pg_results[i]; if (PQresultStatus(pgresult) != PGRES_TUPLES_OK) { cdbdisp_clearCdbPgResults(&cdb_pgresults); ereport(ERROR, (errmsg("unexpected result from segment: %d", PQresultStatus(pgresult)))); } value = ResultToDatum(pgresult, 0, 0, boolin, &isnull); allResults &= (!isnull && DatumGetBool(value)); } cdbdisp_clearCdbPgResults(&cdb_pgresults); pfree(sql.data); } /* Return true iif all segments return true. */ PG_RETURN_BOOL(allResults); }
/* * gp_read_error_log * * Returns set of error log tuples. */ Datum gp_read_error_log(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; ReadErrorLogContext *context; HeapTuple tuple; Datum result; /* * First call setup */ if (SRF_IS_FIRSTCALL()) { MemoryContext oldcontext; FILE *fp; text *relname; funcctx = SRF_FIRSTCALL_INIT(); relname = PG_GETARG_TEXT_P(0); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); context = palloc0(sizeof(ReadErrorLogContext)); funcctx->user_fctx = (void *) context; funcctx->tuple_desc = BlessTupleDesc(GetErrorTupleDesc()); /* * Though this function is usually executed on segment, we dispatch * the execution if it happens to be on QD, and combine the results * into one set. */ if (Gp_role == GP_ROLE_DISPATCH) { struct CdbPgResults cdb_pgresults = {NULL, 0}; StringInfoData sql; int i; initStringInfo(&sql); /* * construct SQL */ appendStringInfo(&sql, "SELECT * FROM pg_catalog.gp_read_error_log(%s) ", quote_literal_internal(text_to_cstring(relname))); CdbDispatchCommand(sql.data, DF_WITH_SNAPSHOT, &cdb_pgresults); for (i = 0; i < cdb_pgresults.numResults; i++) { if (PQresultStatus(cdb_pgresults.pg_results[i]) != PGRES_TUPLES_OK) { cdbdisp_clearCdbPgResults(&cdb_pgresults); elog(ERROR, "unexpected result from segment: %d", PQresultStatus(cdb_pgresults.pg_results[i])); } context->numTuples += PQntuples(cdb_pgresults.pg_results[i]); } pfree(sql.data); context->segResults = cdb_pgresults.pg_results; context->numSegResults = cdb_pgresults.numResults; } else { /* * In QE, read the error log. */ RangeVar *relrv; Oid relid; relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); relid = RangeVarGetRelid(relrv, true); /* * If the relation has gone, silently return no tuples. */ if (OidIsValid(relid)) { AclResult aclresult; /* * Requires SELECT priv to read error log. 
*/ aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_CLASS, relrv->relname); ErrorLogFileName(context->filename, MyDatabaseId, relid); fp = AllocateFile(context->filename, "r"); context->fp = fp; } } MemoryContextSwitchTo(oldcontext); if (Gp_role != GP_ROLE_DISPATCH && !context->fp) { pfree(context); SRF_RETURN_DONE(funcctx); } } funcctx = SRF_PERCALL_SETUP(); context = (ReadErrorLogContext *) funcctx->user_fctx; /* * Read error log, probably on segments. We don't check Gp_role, however, * in case master also wants to read the file. */ if (context->fp) { pg_crc32 crc, written_crc; tuple = ErrorLogRead(context->fp, &written_crc); /* * CRC check. */ if (HeapTupleIsValid(tuple)) { INIT_CRC32C(crc); COMP_CRC32C(crc, tuple->t_data, tuple->t_len); FIN_CRC32C(crc); if (!EQ_CRC32C(crc, written_crc)) { elog(LOG, "incorrect checksum in error log %s", context->filename); tuple = NULL; } } /* * If we found a valid tuple, return it. Otherwise, fall through * in the DONE routine. */ if (HeapTupleIsValid(tuple)) { /* * We need to set typmod for the executor to understand * its type we just blessed. */ HeapTupleHeaderSetTypMod(tuple->t_data, funcctx->tuple_desc->tdtypmod); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } } /* * If we got results from dispatch, return all the tuples. 
*/ while (context->currentResult < context->numSegResults) { Datum values[NUM_ERRORTABLE_ATTR]; bool isnull[NUM_ERRORTABLE_ATTR]; PGresult *segres = context->segResults[context->currentResult]; int row = context->currentRow; if (row >= PQntuples(segres)) { context->currentRow = 0; context->currentResult++; continue; } context->currentRow++; MemSet(isnull, false, sizeof(isnull)); values[0] = ResultToDatum(segres, row, 0, timestamptz_in, &isnull[0]); values[1] = ResultToDatum(segres, row, 1, textin, &isnull[1]); values[2] = ResultToDatum(segres, row, 2, textin, &isnull[2]); values[3] = ResultToDatum(segres, row, 3, int4in, &isnull[3]); values[4] = ResultToDatum(segres, row, 4, int4in, &isnull[4]); values[5] = ResultToDatum(segres, row, 5, textin, &isnull[5]); values[6] = ResultToDatum(segres, row, 6, textin, &isnull[6]); values[7] = ResultToDatum(segres, row, 7, byteain, &isnull[7]); tuple = heap_form_tuple(funcctx->tuple_desc, values, isnull); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } if (context->segResults != NULL) { int i; for (i = 0; i < context->numSegResults; i++) PQclear(context->segResults[i]); /* XXX: better to copy to palloc'ed area */ free(context->segResults); } /* * Close the file, if we have opened it. */ if (context->fp != NULL) { FreeFile(context->fp); context->fp = NULL; } SRF_RETURN_DONE(funcctx); }
/*
 * Get resource usage.
 *
 * On QD this function dispatch the request to all QEs, collecting both
 * QEs' and QD's resource usage.
 *
 * On QE this function only collect the resource usage on itself.
 *
 * Memory & cpu usage are returned in JSON format: the text is appended to
 * the memUsage / cpuUsage buffers of each entry of ctx->groups.
 *
 * ctx		 - groups to report on; their usage buffers are appended to
 * inGroupId - group id passed through to pg_resgroup_get_status() on the
 *			   segments
 *
 * Raises ERROR in dispatch mode if no segment answers or a segment result
 * is not PGRES_TUPLES_OK.
 */
static void
getResUsage(ResGroupStatCtx *ctx, Oid inGroupId)
{
	int64	   *usages;
	TimestampTz *timestamps;
	int			i,
				j;

	usages = palloc(sizeof(*usages) * ctx->nGroups);
	timestamps = palloc(sizeof(*timestamps) * ctx->nGroups);

	/*
	 * Take a first cpu usage sample per group; calcCpuUsage() later gets
	 * this sample together with a fresh one.
	 */
	for (j = 0; j < ctx->nGroups; j++)
	{
		ResGroupStat *row = &ctx->groups[j];
		Oid			groupId = DatumGetObjectId(row->groupId);

		usages[j] = ResGroupOps_GetCpuUsage(groupId);
		timestamps[j] = GetCurrentTimestamp();
	}

	if (Gp_role == GP_ROLE_DISPATCH)
	{
		CdbPgResults cdb_pgresults = {NULL, 0};
		StringInfoData buffer;

		initStringInfo(&buffer);

		/* Oid is unsigned, so it has to be formatted with %u */
		appendStringInfo(&buffer,
						 "SELECT groupid, cpu_usage, memory_usage "
						 "FROM pg_resgroup_get_status(%u)",
						 inGroupId);

		CdbDispatchCommand(buffer.data, DF_WITH_SNAPSHOT, &cdb_pgresults);

		if (cdb_pgresults.numResults == 0)
			elog(ERROR, "pg_resgroup_get_status() didn't get back any resource statistic from the segDBs");

		for (i = 0; i < cdb_pgresults.numResults; i++)
		{
			struct pg_result *pg_result = cdb_pgresults.pg_results[i];

			/*
			 * Any error here should have propagated into errbuf, so we
			 * shouldn't ever see anything other than tuples_ok here.  But,
			 * check to be sure.
			 */
			if (PQresultStatus(pg_result) != PGRES_TUPLES_OK)
			{
				cdbdisp_clearCdbPgResults(&cdb_pgresults);
				elog(ERROR, "pg_resgroup_get_status(): resultStatus not tuples_Ok");
			}

			/* Row j of every result is expected to describe group j. */
			Assert(PQntuples(pg_result) == ctx->nGroups);
			for (j = 0; j < ctx->nGroups; j++)
			{
				const char *result;
				ResGroupStat *row = &ctx->groups[j];

				/*
				 * NOTE(review): pg_atoi parses a signed integer, so a group
				 * oid above INT_MAX would presumably be rejected here --
				 * confirm.
				 */
				Oid			groupId = pg_atoi(PQgetvalue(pg_result, j, 0),
											  sizeof(Oid), 0);

				Assert(groupId == row->groupId);

				/*
				 * Nothing written for this group yet: open the JSON objects
				 * and put our own (QD) numbers first.
				 */
				if (row->memUsage->len == 0)
				{
					Datum		d = ResGroupGetStat(groupId,
													RES_GROUP_STAT_MEM_USAGE);

					row->groupId = groupId;
					appendStringInfo(row->memUsage, "{\"%d\":%s",
									 GpIdentity.segindex, DatumGetCString(d));

					appendStringInfo(row->cpuUsage, "{");
					calcCpuUsage(row->cpuUsage, usages[j], timestamps[j],
								 ResGroupOps_GetCpuUsage(groupId),
								 GetCurrentTimestamp());
				}

				/* Then append what this segment reported. */
				result = PQgetvalue(pg_result, j, 1);
				appendStringInfo(row->cpuUsage, ", %s", result);

				result = PQgetvalue(pg_result, j, 2);
				appendStringInfo(row->memUsage, ", %s", result);

				/* Close the JSON objects with the last segment result. */
				if (i == cdb_pgresults.numResults - 1)
				{
					appendStringInfoChar(row->cpuUsage, '}');
					appendStringInfoChar(row->memUsage, '}');
				}
			}
		}

		cdbdisp_clearCdbPgResults(&cdb_pgresults);
		pfree(buffer.data);
	}
	else
	{
		/*
		 * Presumably this gives the second cpu sample below a measurable
		 * distance from the first one taken above.
		 */
		pg_usleep(300000);

		for (j = 0; j < ctx->nGroups; j++)
		{
			ResGroupStat *row = &ctx->groups[j];
			Oid			groupId = DatumGetObjectId(row->groupId);
			Datum		d = ResGroupGetStat(groupId, RES_GROUP_STAT_MEM_USAGE);

			appendStringInfo(row->memUsage, "\"%d\":%s",
							 GpIdentity.segindex, DatumGetCString(d));

			calcCpuUsage(row->cpuUsage, usages[j], timestamps[j],
						 ResGroupOps_GetCpuUsage(groupId),
						 GetCurrentTimestamp());
		}
	}

	/* The first-sample arrays are not referenced past this point. */
	pfree(usages);
	pfree(timestamps);
}
/** * An interface to re-weigh an existing session on the master and all backends. * Input: * session id - what session is statement on? * command count - what is the command count of statement. * weight - int, what should be the new priority of this statement. * Output: * number of backends whose weights were changed by this call. */ Datum gp_adjust_priority_int(PG_FUNCTION_ARGS) { int32 session_id = PG_GETARG_INT32(0); int32 command_count = PG_GETARG_INT32(1); int32 wt = PG_GETARG_INT32(2); int numfound = 0; StatementId sid; if (!gp_enable_resqueue_priority) elog(ERROR, "Query prioritization is disabled."); if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("only superuser can re-prioritize a query after it has begun execution")))); if (Gp_role == GP_ROLE_UTILITY) elog(ERROR, "Query prioritization does not work in utility mode."); if (wt <= 0) elog(ERROR, "Weight of statement must be greater than 0."); init(&sid, session_id, command_count); if (Gp_role == GP_ROLE_DISPATCH) { int i = 0; CdbPgResults cdb_pgresults = {NULL, 0}; char cmd[255]; /* * Make sure the session exists before dispatching */ for (i = 0; i < backoffSingleton->numEntries; i++) { BackoffBackendSharedEntry *se = getBackoffEntryRW(i); if (equalStatementId(&se->statementId, &sid)) { if (gp_debug_resqueue_priority) { elog(LOG, "changing weight of (%d:%d) from %d to %d", se->statementId.sessionId, se->statementId.commandCount, se->weight, wt); } se->weight = wt; numfound++; } } if (numfound == 0) elog(ERROR, "Did not find any backend entries for session %d, command count %d.", session_id, command_count); /* * Ok, it exists, dispatch the command to the segDBs. 
*/ sprintf(cmd, "select gp_adjust_priority(%d,%d,%d)", session_id, command_count, wt); CdbDispatchCommand(cmd, DF_WITH_SNAPSHOT, &cdb_pgresults); for (i = 0; i < cdb_pgresults.numResults; i++) { struct pg_result *pgresult = cdb_pgresults.pg_results[i]; if (PQresultStatus(pgresult) != PGRES_TUPLES_OK) { cdbdisp_clearCdbPgResults(&cdb_pgresults); elog(ERROR, "gp_adjust_priority: resultStatus not tuples_Ok"); } else { int j; for (j = 0; j < PQntuples(pgresult); j++) { int retvalue = 0; retvalue = atoi(PQgetvalue(pgresult, j, 0)); numfound += retvalue; } } } cdbdisp_clearCdbPgResults(&cdb_pgresults); } else /* Gp_role == EXECUTE */ { /* * Find number of backends working on behalf of this session and * distribute the weight evenly. */ int i = 0; Assert(Gp_role == GP_ROLE_EXECUTE); for (i = 0; i < backoffSingleton->numEntries; i++) { BackoffBackendSharedEntry *se = getBackoffEntryRW(i); if (equalStatementId(&se->statementId, &sid)) { if (gp_debug_resqueue_priority) { elog(LOG, "changing weight of (%d:%d) from %d to %d", se->statementId.sessionId, se->statementId.commandCount, se->weight, wt); } se->weight = wt; numfound++; } } if (gp_debug_resqueue_priority && numfound == 0) { elog(LOG, "did not find any matching backends on segments"); } } PG_RETURN_INT32(numfound); }
/* * pg_lock_status - produce a view with one row per held or awaited lock mode */ Datum pg_lock_status(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; PG_Lock_Status *mystatus; LockData *lockData; PredicateLockData *predLockData; if (SRF_IS_FIRSTCALL()) { TupleDesc tupdesc; MemoryContext oldcontext; /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); /* * switch to memory context appropriate for multiple function calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* build tupdesc for result tuples */ /* this had better match pg_locks view in system_views.sql */ tupdesc = CreateTemplateTupleDesc(NUM_LOCK_STATUS_COLUMNS, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "locktype", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "database", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "relation", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 4, "page", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 5, "tuple", INT2OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 6, "virtualxid", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 7, "transactionid", XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 8, "classid", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 9, "objid", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 10, "objsubid", INT2OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 11, "virtualtransaction", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 12, "pid", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 13, "mode", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 14, "granted", BOOLOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 15, "fastpath", BOOLOID, -1, 0); /* * These next columns are specific to GPDB */ TupleDescInitEntry(tupdesc, (AttrNumber) 16, "mppSessionId", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 17, "mppIsWriter", BOOLOID, -1, 0); 
TupleDescInitEntry(tupdesc, (AttrNumber) 18, "gp_segment_id", INT4OID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); /* * Collect all the locking information that we will format and send * out as a result set. */ mystatus = (PG_Lock_Status *) palloc(sizeof(PG_Lock_Status)); funcctx->user_fctx = (void *) mystatus; mystatus->lockData = GetLockStatusData(); mystatus->currIdx = 0; mystatus->predLockData = GetPredicateLockStatusData(); mystatus->predLockIdx = 0; mystatus->numSegLocks = 0; mystatus->numsegresults = 0; mystatus->segresults = NULL; /* * Seeing the locks just from the masterDB isn't enough to know what is locked, * or if there is a deadlock. That's because the segDBs also take locks. * Some locks show up only on the master, some only on the segDBs, and some on both. * * So, let's collect the lock information from all the segDBs. Sure, this means * there are a lot more rows coming back from pg_locks than before, since most locks * on the segDBs happen across all the segDBs at the same time. But not always, * so let's play it safe and get them all. */ if (Gp_role == GP_ROLE_DISPATCH) { CdbPgResults cdb_pgresults = {NULL, 0}; StringInfoData buffer; int i; initStringInfo(&buffer); /* * Why dispatch something here, rather than do a UNION ALL in pg_locks view, and * a join to gp_dist_random('gp_id')? There are several important reasons. * * The union all method is much slower, and requires taking locks on gp_id. * More importantly, applications such as pgAdmin do queries of this view that * involve a correlated subqueries joining to other catalog tables, * which works if we do it this way, but fails * if the view includes the union all. That completely breaks the server status * display in pgAdmin. * * Why dispatch this way, rather than via SPI? There are several advantages. * First, it's easy to get "writer gang is busy" errors if we use SPI. 
* * Second, this should be much faster, as it doesn't require setting up * the interconnect, and doesn't need to touch any actual data tables to be * able to get the gp_segment_id. * * The downside is we get n result sets, where n == number of segDBs. * * It would be better yet if we sent a plan tree rather than a text string, * so the segDBs don't need to parse it. That would also avoid taking any relation locks * on the segDB to get this info (normally need to get an accessShareLock on pg_locks on the segDB * to make sure it doesn't go away during parsing). But the only safe way I know to do this * is to hand-build the plan tree, and I'm to lazy to do it right now. It's just a matter of * building a function scan node, and filling it in with our result set info (from the tupledesc). * * One thing to note: it's OK to join pg_locks with any catalog table or master-only table, * but joining to a distributed table will result in "writer gang busy: possible attempt to * execute volatile function in unsupported context" errors, because * the scan of the distributed table might already be running on the writer gang * when we want to dispatch this. * * This could be fixed by allocating a reader gang and dispatching to that, but the cost * of setting up a new gang is high, and I've never seen anyone need to join this to a * distributed table. * * GPDB_84_MERGE_FIXME: Should we rewrite this in a different way now that we have * ON SEGMENT/ ON MASTER attributes on functions? */ CdbDispatchCommand("SELECT * FROM pg_catalog.pg_lock_status()", DF_WITH_SNAPSHOT, &cdb_pgresults); if (cdb_pgresults.numResults == 0) elog(ERROR, "pg_locks didn't get back any data from the segDBs"); for (i = 0; i < cdb_pgresults.numResults; i++) { /* * Any error here should have propagated into errbuf, so we shouldn't * ever see anything other that tuples_ok here. But, check to be * sure. 
*/ if (PQresultStatus(cdb_pgresults.pg_results[i]) != PGRES_TUPLES_OK) { cdbdisp_clearCdbPgResults(&cdb_pgresults); elog(ERROR,"pg_locks: resultStatus not tuples_Ok"); } /* * numSegLocks needs to be the total size we are returning to * the application. At the start of this loop, it has the count * for the masterDB locks. Add each of the segDB lock counts. */ mystatus->numSegLocks += PQntuples(cdb_pgresults.pg_results[i]); /* * This query better match the tupledesc we just made above. */ if (PQnfields(cdb_pgresults.pg_results[i]) != tupdesc->natts) elog(ERROR, "unexpected number of columns returned from pg_lock_status() on segment (%d, expected %d)", PQnfields(cdb_pgresults.pg_results[i]), tupdesc->natts); } mystatus->numsegresults = cdb_pgresults.numResults; /* * cdbdisp_dispatchRMCommand copies the result sets into our memory, which * will still exist on the subsequent calls. */ mystatus->segresults = cdb_pgresults.pg_results; } MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); mystatus = (PG_Lock_Status *) funcctx->user_fctx; lockData = mystatus->lockData; /* * This loop returns all the local lock data from the segment we are running on. */ while (mystatus->currIdx < lockData->nelements) { bool granted; LOCKMODE mode = 0; const char *locktypename; char tnbuf[32]; Datum values[NUM_LOCK_STATUS_COLUMNS]; bool nulls[NUM_LOCK_STATUS_COLUMNS]; HeapTuple tuple; Datum result; LockInstanceData *instance; instance = &(lockData->locks[mystatus->currIdx]); /* * Look to see if there are any held lock modes in this PROCLOCK. If * so, report, and destructively modify lockData so we don't report * again. */ granted = false; if (instance->holdMask) { for (mode = 0; mode < MAX_LOCKMODES; mode++) { if (instance->holdMask & LOCKBIT_ON(mode)) { granted = true; instance->holdMask &= LOCKBIT_OFF(mode); break; } } } /* * If no (more) held modes to report, see if PROC is waiting for a * lock on this lock. 
*/ if (!granted) { if (instance->waitLockMode != NoLock) { /* Yes, so report it with proper mode */ mode = instance->waitLockMode; /* * We are now done with this PROCLOCK, so advance pointer to * continue with next one on next call. */ mystatus->currIdx++; } else { /* * Okay, we've displayed all the locks associated with this * PROCLOCK, proceed to the next one. */ mystatus->currIdx++; continue; } } /* * Form tuple with appropriate data. */ MemSet(values, 0, sizeof(values)); MemSet(nulls, false, sizeof(nulls)); if (instance->locktag.locktag_type <= LOCKTAG_LAST_TYPE) locktypename = LockTagTypeNames[instance->locktag.locktag_type]; else { snprintf(tnbuf, sizeof(tnbuf), "unknown %d", (int) instance->locktag.locktag_type); locktypename = tnbuf; } values[0] = CStringGetTextDatum(locktypename); switch ((LockTagType) instance->locktag.locktag_type) { case LOCKTAG_RELATION: case LOCKTAG_RELATION_EXTEND: values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2); nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; nulls[7] = true; nulls[8] = true; nulls[9] = true; break; case LOCKTAG_PAGE: values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2); values[3] = UInt32GetDatum(instance->locktag.locktag_field3); nulls[4] = true; nulls[5] = true; nulls[6] = true; nulls[7] = true; nulls[8] = true; nulls[9] = true; break; case LOCKTAG_TUPLE: values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2); values[3] = UInt32GetDatum(instance->locktag.locktag_field3); values[4] = UInt16GetDatum(instance->locktag.locktag_field4); nulls[5] = true; nulls[6] = true; nulls[7] = true; nulls[8] = true; nulls[9] = true; break; case LOCKTAG_TRANSACTION: values[6] = TransactionIdGetDatum(instance->locktag.locktag_field1); nulls[1] = true; nulls[2] = true; nulls[3] = true; 
nulls[4] = true; nulls[5] = true; nulls[7] = true; nulls[8] = true; nulls[9] = true; break; case LOCKTAG_VIRTUALTRANSACTION: values[5] = VXIDGetDatum(instance->locktag.locktag_field1, instance->locktag.locktag_field2); nulls[1] = true; nulls[2] = true; nulls[3] = true; nulls[4] = true; nulls[6] = true; nulls[7] = true; nulls[8] = true; nulls[9] = true; break; case LOCKTAG_RELATION_APPENDONLY_SEGMENT_FILE: values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2); values[7] = ObjectIdGetDatum(instance->locktag.locktag_field3); nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; nulls[8] = true; nulls[9] = true; break; case LOCKTAG_RESOURCE_QUEUE: #if 0 values[1] = ObjectIdGetDatum(proc->databaseId); #endif nulls[1] = true; values[8] = ObjectIdGetDatum(instance->locktag.locktag_field1); nulls[2] = true; nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; nulls[7] = true; nulls[9] = true; break; case LOCKTAG_OBJECT: case LOCKTAG_USERLOCK: case LOCKTAG_ADVISORY: default: /* treat unknown locktags like OBJECT */ values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); values[7] = ObjectIdGetDatum(instance->locktag.locktag_field2); values[8] = ObjectIdGetDatum(instance->locktag.locktag_field3); values[9] = Int16GetDatum(instance->locktag.locktag_field4); nulls[2] = true; nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; break; } values[10] = VXIDGetDatum(instance->backend, instance->lxid); if (instance->pid != 0) values[11] = Int32GetDatum(instance->pid); else nulls[11] = true; values[12] = CStringGetTextDatum(GetLockmodeName(instance->locktag.locktag_lockmethodid, mode)); values[13] = BoolGetDatum(granted); values[14] = BoolGetDatum(instance->fastpath); values[15] = Int32GetDatum(instance->mppSessionId); values[16] = BoolGetDatum(instance->mppIsWriter); values[17] = Int32GetDatum(GpIdentity.segindex); tuple = heap_form_tuple(funcctx->tuple_desc, 
values, nulls); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } /* * This loop only executes on the masterDB and only in dispatch mode, because that * is the only time we dispatched to the segDBs. */ while (mystatus->currIdx >= lockData->nelements && mystatus->currIdx < lockData->nelements + mystatus->numSegLocks) { HeapTuple tuple; Datum result; Datum values[NUM_LOCK_STATUS_COLUMNS]; bool nulls[NUM_LOCK_STATUS_COLUMNS]; int i; int whichresultset = 0; int whichelement = mystatus->currIdx - lockData->nelements; int whichrow = whichelement; Assert(Gp_role == GP_ROLE_DISPATCH); /* * Because we have one result set per segDB (rather than one big result set with everything), * we need to figure out which result set we are on, and which row within that result set * we are returning. * * So, we walk through all the result sets and all the rows in each one, in order. */ while(whichrow >= PQntuples(mystatus->segresults[whichresultset])) { whichrow -= PQntuples(mystatus->segresults[whichresultset]); whichresultset++; if (whichresultset >= mystatus->numsegresults) break; } /* * If this condition is true, we have already sent everything back, * and we just want to do the SRF_RETURN_DONE */ if (whichresultset >= mystatus->numsegresults) break; mystatus->currIdx++; /* * Form tuple with appropriate data we got from the segDBs */ MemSet(values, 0, sizeof(values)); MemSet(nulls, false, sizeof(nulls)); /* * For each column, extract out the value (which comes out in text). * Convert it to the appropriate datatype to match our tupledesc, * and put that in values. 
* The columns look like this (from select statement earlier): * * " (locktype text, database oid, relation oid, page int4, tuple int2," * " transactionid xid, classid oid, objid oid, objsubid int2," * " transaction xid, pid int4, mode text, granted boolean, " * " mppSessionId int4, mppIsWriter boolean, gp_segment_id int4) ," */ values[0] = CStringGetTextDatum(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 0)); values[1] = ObjectIdGetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 1))); values[2] = ObjectIdGetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 2))); values[3] = UInt32GetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 3))); values[4] = UInt16GetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 4))); values[5] = CStringGetTextDatum(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 5)); values[6] = TransactionIdGetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 6))); values[7] = ObjectIdGetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 7))); values[8] = ObjectIdGetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 8))); values[9] = UInt16GetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 9))); values[10] = CStringGetTextDatum(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 10)); values[11] = UInt32GetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 11))); values[12] = CStringGetTextDatum(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 12)); values[13] = BoolGetDatum(strncmp(PQgetvalue(mystatus->segresults[whichresultset], whichrow,13),"t",1)==0); values[14] = BoolGetDatum(strncmp(PQgetvalue(mystatus->segresults[whichresultset], whichrow,14),"t",1)==0); values[15] = Int32GetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow,15))); values[16] = 
BoolGetDatum(strncmp(PQgetvalue(mystatus->segresults[whichresultset], whichrow,16),"t",1)==0); values[17] = Int32GetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow,17))); /* * Copy the null info over. It should all match properly. */ for (i = 0; i < NUM_LOCK_STATUS_COLUMNS; i++) { nulls[i] = PQgetisnull(mystatus->segresults[whichresultset], whichrow, i); } tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } /* * Have returned all regular locks. Now start on the SIREAD predicate * locks. */ predLockData = mystatus->predLockData; if (mystatus->predLockIdx < predLockData->nelements) { PredicateLockTargetType lockType; PREDICATELOCKTARGETTAG *predTag = &(predLockData->locktags[mystatus->predLockIdx]); SERIALIZABLEXACT *xact = &(predLockData->xacts[mystatus->predLockIdx]); Datum values[NUM_LOCK_STATUS_COLUMNS]; bool nulls[NUM_LOCK_STATUS_COLUMNS]; HeapTuple tuple; Datum result; mystatus->predLockIdx++; /* * Form tuple with appropriate data. 
*/ MemSet(values, 0, sizeof(values)); MemSet(nulls, false, sizeof(nulls)); /* lock type */ lockType = GET_PREDICATELOCKTARGETTAG_TYPE(*predTag); values[0] = CStringGetTextDatum(PredicateLockTagTypeNames[lockType]); /* lock target */ values[1] = GET_PREDICATELOCKTARGETTAG_DB(*predTag); values[2] = GET_PREDICATELOCKTARGETTAG_RELATION(*predTag); if (lockType == PREDLOCKTAG_TUPLE) values[4] = GET_PREDICATELOCKTARGETTAG_OFFSET(*predTag); else nulls[4] = true; if ((lockType == PREDLOCKTAG_TUPLE) || (lockType == PREDLOCKTAG_PAGE)) values[3] = GET_PREDICATELOCKTARGETTAG_PAGE(*predTag); else nulls[3] = true; /* these fields are targets for other types of locks */ nulls[5] = true; /* virtualxid */ nulls[6] = true; /* transactionid */ nulls[7] = true; /* classid */ nulls[8] = true; /* objid */ nulls[9] = true; /* objsubid */ /* lock holder */ values[10] = VXIDGetDatum(xact->vxid.backendId, xact->vxid.localTransactionId); if (xact->pid != 0) values[11] = Int32GetDatum(xact->pid); else nulls[11] = true; /* * Lock mode. Currently all predicate locks are SIReadLocks, which are * always held (never waiting) and have no fast path */ values[12] = CStringGetTextDatum("SIReadLock"); values[13] = BoolGetDatum(true); values[14] = BoolGetDatum(false); /* * GPDB_91_MERGE_FIXME: what to set these GPDB-specific fields to? * These commented-out values are copy-pasted from the code above * for normal locks. */ //values[14] = Int32GetDatum(proc->mppSessionId); //values[15] = BoolGetDatum(proc->mppIsWriter); //values[16] = Int32GetDatum(Gp_segment); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } /* * if we dispatched to the segDBs, free up the memory holding the result sets. * Otherwise we might leak this memory each time we got called (does it automatically * get freed by the pool being deleted? Probably, but this is safer). 
*/ if (mystatus->segresults != NULL) { int i; for (i = 0; i < mystatus->numsegresults; i++) PQclear(mystatus->segresults[i]); free(mystatus->segresults); } SRF_RETURN_DONE(funcctx); }