/*
 * InitPlanNodeGpmonPkt
 *		Initialize the gpmon packet for a plan node and, when perfmon is
 *		enabled, send the initial packet off.
 *
 * 'rowsout_est' is a cluster-wide estimate; it is scaled down by the
 * segment count to get a per-segment figure.  'relname', when given, is
 * copied (truncated if necessary) into the packet.
 */
void
InitPlanNodeGpmonPkt(Plan *plan, gpmon_packet_t *gpmon_pkt, EState *estate,
					 PerfmonNodeType type, int64 rowsout_est, char *relname)
{
	int			seg_count;

	if (plan == NULL)
		return;

	/*
	 * The estimates are global, so divide by the number of segments in the
	 * array.  Clamp to 1 so the division below can never be by zero.
	 */
	seg_count = getgpsegmentCount();
	if (seg_count < 1)
		seg_count = 1;

	Assert(seg_count >= 1);

	memset(gpmon_pkt, 0, sizeof(gpmon_packet_t));

	gpmon_pkt->magic = GPMON_MAGIC;
	gpmon_pkt->version = GPMON_PACKET_VERSION;
	gpmon_pkt->pkttype = GPMON_PKTTYPE_QEXEC;

	/* Key identifying this node instance: (tmid, session, command, seg, pid, node). */
	gpmon_gettmid(&gpmon_pkt->u.qexec.key.tmid);
	gpmon_pkt->u.qexec.key.ssid = gp_session_id;
	gpmon_pkt->u.qexec.key.ccnt = gp_command_count;
	gpmon_pkt->u.qexec.key.hash_key.segid = Gp_segment;
	gpmon_pkt->u.qexec.key.hash_key.pid = MyProcPid;
	gpmon_pkt->u.qexec.key.hash_key.nid = plan->plan_node_id;

	gpmon_pkt->u.qexec.pnid = plan->plan_parent_node_id;
	gpmon_pkt->u.qexec.nodeType = (apr_uint16_t) type;
	gpmon_pkt->u.qexec.rowsout = 0;
	gpmon_pkt->u.qexec.rowsout_est = rowsout_est / seg_count;

	if (relname != NULL)
	{
		/* snprintf guarantees NUL termination even on truncation */
		snprintf(gpmon_pkt->u.qexec.relation_name,
				 sizeof(gpmon_pkt->u.qexec.relation_name), "%s", relname);
	}

	gpmon_pkt->u.qexec.status = (uint8) PMNS_Initialize;

	if (gp_enable_gpperfmon && estate)
		gpmon_send(gpmon_pkt);

	gpmon_pkt->u.qexec.status = (uint8) PMNS_Executing;
}
/* * Allocates memory for a CdbDispatchCmdAsync structure and do the initialization. * * Memory will be freed in function cdbdisp_destroyDispatcherState by deleting the * memory context. */ static void * cdbdisp_makeDispatchParams_async(int maxSlices, char *queryText, int len) { int maxResults = maxSlices * getgpsegmentCount(); int size = 0; CdbDispatchCmdAsync *pParms = palloc0(sizeof(CdbDispatchCmdAsync)); size = maxResults * sizeof(CdbDispatchResult *); pParms->dispatchResultPtrArray = (CdbDispatchResult **) palloc0(size); pParms->dispatchCount = 0; pParms->waitMode = DISPATCH_WAIT_NONE; pParms->query_text = queryText; pParms->query_text_len = len; return (void*)pParms; }
/*
 * isPrimaryWriterGangAlive
 *		Test if the connections of the primary writer gang are alive.
 *
 * Returns false if there is no writer gang, or if any of its sockets is
 * found dead; true only when every connection passes the socket check.
 */
bool
isPrimaryWriterGangAlive(void)
{
	int			size;
	int			i;

	if (primaryWriterGang == NULL)
		return false;

	size = primaryWriterGang->size;

	/*
	 * Bug fix: the original used "Assert(size = getgpsegmentCount())" — an
	 * assignment, not a comparison — which silently overwrote 'size' in
	 * assert-enabled builds instead of validating it.
	 */
	Assert(size == getgpsegmentCount());

	for (i = 0; i < size; i++)
	{
		SegmentDatabaseDescriptor *segdb = &primaryWriterGang->db_descriptors[i];

		if (!isSockAlive(segdb->conn->sock))
			return false;
	}

	return true;
}
/*
 * createGang_async
 *		Creates a new gang by logging on a session to each segDB involved,
 *		using libpq's asynchronous connection API.
 *
 * Call this function in GangContext memory context.
 * elog ERROR or return a non-NULL gang.
 *
 * Connections are started with cdbconn_doConnectStart() and then driven to
 * completion with PQconnectPoll() under a poll() loop.  Segments found to be
 * in recovery mode may trigger a bounded retry (writer gangs only).
 */
static Gang *
createGang_async(GangType type, int gang_id, int size, int content)
{
	Gang	   *newGangDefinition;
	SegmentDatabaseDescriptor *segdbDesc = NULL;
	int			i = 0;
	int			create_gang_retry_counter = 0;
	int			in_recovery_mode_count = 0;
	int			successful_connections = 0;
	bool		retry = false;
	int			poll_timeout = 0;
	struct timeval startTS;
	PostgresPollingStatusType *pollingStatus = NULL;

	/* true means connection status is confirmed, either established or in recovery mode */
	bool	   *connStatusDone = NULL;

	ELOG_DISPATCHER_DEBUG("createGang type = %d, gang_id = %d, size = %d, content = %d",
						  type, gang_id, size, content);

	/* check arguments */
	Assert(size == 1 || size == getgpsegmentCount());
	Assert(CurrentResourceOwner != NULL);
	Assert(CurrentMemoryContext == GangContext);

	/* Writer gang is created before reader gangs. */
	if (type == GANGTYPE_PRIMARY_WRITER)
		Insist(!GangsExist());

	/* Check writer gang firstly */
	if (type != GANGTYPE_PRIMARY_WRITER && !isPrimaryWriterGangAlive())
		ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
						errmsg("failed to acquire resources on one or more segments"),
						errdetail("writer gang got broken before creating reader gangs")));

create_gang_retry:
	/* If we're in a retry, we may need to reset our initial state, a bit */
	newGangDefinition = NULL;
	successful_connections = 0;
	in_recovery_mode_count = 0;
	retry = false;

	/* allocate and initialize a gang structure */
	newGangDefinition = buildGangDefinition(type, gang_id, size, content);

	Assert(newGangDefinition != NULL);
	Assert(newGangDefinition->size == size);
	Assert(newGangDefinition->perGangContext != NULL);
	MemoryContextSwitchTo(newGangDefinition->perGangContext);

	/*
	 * allocate memory within perGangContext and will be freed automatically
	 * when gang is destroyed
	 */
	pollingStatus = palloc(sizeof(PostgresPollingStatusType) * size);
	connStatusDone = palloc(sizeof(bool) * size);

	struct pollfd *fds;

	PG_TRY();
	{
		for (i = 0; i < size; i++)
		{
			char		gpqeid[100];
			char	   *options;

			/*
			 * Create the connection requests. If we find a segment without a
			 * valid segdb we error out. Also, if this segdb is invalid, we
			 * must fail the connection.
			 */
			segdbDesc = &newGangDefinition->db_descriptors[i];

			/*
			 * Build the connection string. Writer-ness needs to be processed
			 * early enough now some locks are taken before command line
			 * options are recognized.
			 */
			build_gpqeid_param(gpqeid, sizeof(gpqeid), segdbDesc->segindex,
							   type == GANGTYPE_PRIMARY_WRITER, gang_id);
			options = makeOptions();

			/* start connection in asynchronous way */
			cdbconn_doConnectStart(segdbDesc, gpqeid, options);

			if (cdbconn_isBadConnection(segdbDesc))
				ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
								errmsg("failed to acquire resources on one or more segments"),
								errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));

			connStatusDone[i] = false;

			/*
			 * If connection status is not CONNECTION_BAD after
			 * PQconnectStart(), we must act as if the PQconnectPoll() had
			 * returned PGRES_POLLING_WRITING
			 */
			pollingStatus[i] = PGRES_POLLING_WRITING;
		}

		/*
		 * Ok, we've now launched all the connection attempts. Start the
		 * timeout clock (= get the start timestamp), and poll until they're
		 * all completed or we reach timeout.
		 */
		gettimeofday(&startTS, NULL);
		fds = (struct pollfd *) palloc0(sizeof(struct pollfd) * size);

		for (;;)
		{
			int			nready;
			int			nfds = 0;

			poll_timeout = getPollTimeout(&startTS);

			for (i = 0; i < size; i++)
			{
				segdbDesc = &newGangDefinition->db_descriptors[i];

				/* Skip established connections and in-recovery-mode connections */
				if (connStatusDone[i])
					continue;

				switch (pollingStatus[i])
				{
					case PGRES_POLLING_OK:
						cdbconn_doConnectComplete(segdbDesc);
						if (segdbDesc->motionListener == -1 || segdbDesc->motionListener == 0)
							ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
											errmsg("failed to acquire resources on one or more segments"),
											errdetail("Internal error: No motion listener port (%s)", segdbDesc->whoami)));
						successful_connections++;
						connStatusDone[i] = true;
						continue;

					case PGRES_POLLING_READING:
						fds[nfds].fd = PQsocket(segdbDesc->conn);
						fds[nfds].events = POLLIN;
						nfds++;
						break;

					case PGRES_POLLING_WRITING:
						fds[nfds].fd = PQsocket(segdbDesc->conn);
						fds[nfds].events = POLLOUT;
						nfds++;
						break;

					case PGRES_POLLING_FAILED:
						/* Recovery-mode failures are tolerated and counted; others error out. */
						if (segment_failure_due_to_recovery(&segdbDesc->conn->errorMessage))
						{
							in_recovery_mode_count++;
							connStatusDone[i] = true;
							elog(LOG, "segment is in recovery mode (%s)", segdbDesc->whoami);
						}
						else
						{
							ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
											errmsg("failed to acquire resources on one or more segments"),
											errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));
						}
						break;

					default:
						/* (error-message typo fixed: "unknow" -> "unknown") */
						ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
										errmsg("failed to acquire resources on one or more segments"),
										errdetail("unknown pollstatus (%s)", segdbDesc->whoami)));
						break;
				}

				if (poll_timeout == 0)
					ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
									errmsg("failed to acquire resources on one or more segments"),
									errdetail("timeout expired\n (%s)", segdbDesc->whoami)));
			}

			/* All connections are confirmed (established or in recovery) */
			if (nfds == 0)
				break;

			CHECK_FOR_INTERRUPTS();

			/* Wait until something happens */
			nready = poll(fds, nfds, poll_timeout);

			if (nready < 0)
			{
				int			sock_errno = SOCK_ERRNO;

				if (sock_errno == EINTR)
					continue;

				ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
								errmsg("failed to acquire resources on one or more segments"),
								errdetail("poll() failed: errno = %d", sock_errno)));
			}
			else if (nready > 0)
			{
				int			currentFdNumber = 0;

				/*
				 * Walk the descriptors in the same order as the fds array
				 * was filled; currentFdNumber tracks the matching pollfd.
				 */
				for (i = 0; i < size; i++)
				{
					segdbDesc = &newGangDefinition->db_descriptors[i];
					if (connStatusDone[i])
						continue;

					Assert(PQsocket(segdbDesc->conn) > 0);
					Assert(PQsocket(segdbDesc->conn) == fds[currentFdNumber].fd);

					if (fds[currentFdNumber].revents & fds[currentFdNumber].events)
						pollingStatus[i] = PQconnectPoll(segdbDesc->conn);

					currentFdNumber++;
				}
			}
		}

		ELOG_DISPATCHER_DEBUG("createGang: %d processes requested; %d successful connections %d in recovery",
							  size, successful_connections, in_recovery_mode_count);

		MemoryContextSwitchTo(GangContext);

		/* some segments are in recovery mode */
		if (successful_connections != size)
		{
			Assert(successful_connections + in_recovery_mode_count == size);

			/* FTS shows some segment DBs are down */
			if (isFTSEnabled() &&
				FtsTestSegmentDBIsDown(newGangDefinition->db_descriptors, size))
				ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
								errmsg("failed to acquire resources on one or more segments"),
								errdetail("FTS detected one or more segments are down")));

			/* Only writer gangs retry, and only up to the configured limit. */
			if (gp_gang_creation_retry_count <= 0 ||
				create_gang_retry_counter++ >= gp_gang_creation_retry_count ||
				type != GANGTYPE_PRIMARY_WRITER)
				ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
								errmsg("failed to acquire resources on one or more segments"),
								errdetail("segments is in recovery mode")));

			ELOG_DISPATCHER_DEBUG("createGang: gang creation failed, but retryable.");

			DisconnectAndDestroyGang(newGangDefinition);
			newGangDefinition = NULL;
			retry = true;
		}
	}
	PG_CATCH();
	{
		MemoryContextSwitchTo(GangContext);
		DisconnectAndDestroyGang(newGangDefinition);
		newGangDefinition = NULL;

		/* A broken writer gang invalidates everything; reset the session. */
		if (type == GANGTYPE_PRIMARY_WRITER)
		{
			DisconnectAndDestroyAllGangs(true);
			CheckForResetSession();
		}

		PG_RE_THROW();
	}
	PG_END_TRY();

	if (retry)
	{
		CHECK_FOR_INTERRUPTS();
		pg_usleep(gp_gang_creation_retry_timer * 1000);
		CHECK_FOR_INTERRUPTS();

		goto create_gang_retry;
	}

	setLargestGangsize(size);
	return newGangDefinition;
}
/*
 * Reads the GP catalog tables and build a CdbComponentDatabases structure.
 * It then converts this to a Gang structure and initializes all the non-connection related fields.
 *
 * Call this function in GangContext.
 * Returns a not-null pointer (elogs on any failure instead of returning NULL).
 *
 * NOTE(review): this assigns the file/global variable 'cdb_component_dbs' as a
 * side effect — callers appear to rely on it; confirm against the rest of the file.
 */
Gang *
buildGangDefinition(GangType type, int gang_id, int size, int content)
{
	Gang *newGangDefinition = NULL;
	CdbComponentDatabaseInfo *cdbinfo = NULL;
	CdbComponentDatabaseInfo *cdbInfoCopy = NULL;
	SegmentDatabaseDescriptor *segdbDesc = NULL;
	MemoryContext perGangContext = NULL;
	int segCount = 0;
	int i = 0;

	ELOG_DISPATCHER_DEBUG("buildGangDefinition:Starting %d qExec processes for %s gang",
						  size, gangTypeToString(type));

	Assert(CurrentMemoryContext == GangContext);
	Assert(size == 1 || size == getgpsegmentCount());

	/* read gp_segment_configuration and build CdbComponentDatabases */
	cdb_component_dbs = getComponentDatabases();

	if (cdb_component_dbs == NULL ||
		cdb_component_dbs->total_segments <= 0 ||
		cdb_component_dbs->total_segment_dbs <= 0)
		insist_log(false, "schema not populated while building segworker group");

	/*
	 * If every segment has exactly one segment DB there are no mirrors
	 * configured, so fault-tolerance scanning is pointless — disable FTS.
	 */
	if (cdb_component_dbs->total_segment_dbs == cdb_component_dbs->total_segments)
	{
		ELOG_DISPATCHER_DEBUG("building Gang: mirroring not configured");
		disableFTS();
	}

	/*
	 * All gang state lives in its own child context of GangContext so that
	 * destroying the gang can free everything at once.
	 */
	perGangContext = AllocSetContextCreate(GangContext, "Per Gang Context",
										   ALLOCSET_DEFAULT_MINSIZE,
										   ALLOCSET_DEFAULT_INITSIZE,
										   ALLOCSET_DEFAULT_MAXSIZE);
	Assert(perGangContext != NULL);
	MemoryContextSwitchTo(perGangContext);

	/* allocate a gang */
	newGangDefinition = (Gang *) palloc0(sizeof(Gang));
	newGangDefinition->type = type;
	newGangDefinition->size = size;
	newGangDefinition->gang_id = gang_id;
	newGangDefinition->allocated = false;
	newGangDefinition->noReuse = false;
	newGangDefinition->dispatcherActive = false;
	newGangDefinition->portal_name = NULL;
	newGangDefinition->perGangContext = perGangContext;
	newGangDefinition->db_descriptors =
		(SegmentDatabaseDescriptor *) palloc0(size * sizeof(SegmentDatabaseDescriptor));

	/* initialize db_descriptors */
	switch (type)
	{
		case GANGTYPE_ENTRYDB_READER:
			/* Single QE on the entry (master) database. */
			cdbinfo = &cdb_component_dbs->entry_db_info[0];
			cdbInfoCopy = copyCdbComponentDatabaseInfo(cdbinfo);
			segdbDesc = &newGangDefinition->db_descriptors[0];
			cdbconn_initSegmentDescriptor(segdbDesc, cdbInfoCopy);
			setQEIdentifier(segdbDesc, -1, perGangContext);
			break;

		case GANGTYPE_SINGLETON_READER:
			/* Single QE on one specific segment, chosen by 'content'. */
			cdbinfo = findDatabaseInfoBySegIndex(cdb_component_dbs, content);
			cdbInfoCopy = copyCdbComponentDatabaseInfo(cdbinfo);
			segdbDesc = &newGangDefinition->db_descriptors[0];
			cdbconn_initSegmentDescriptor(segdbDesc, cdbInfoCopy);
			setQEIdentifier(segdbDesc, -1, perGangContext);
			break;

		case GANGTYPE_PRIMARY_READER:
		case GANGTYPE_PRIMARY_WRITER:
			/*
			 * We loop through the segment_db_info. Each item has a segindex.
			 * They are sorted by segindex, and there can be > 1 segment_db_info for
			 * a given segindex (currently, there can be 1 or 2)
			 */
			for (i = 0; i < cdb_component_dbs->total_segment_dbs; i++)
			{
				cdbinfo = &cdb_component_dbs->segment_db_info[i];
				/* Only the active primary of each segment pair joins the gang. */
				if (SEGMENT_IS_ACTIVE_PRIMARY(cdbinfo))
				{
					segdbDesc = &newGangDefinition->db_descriptors[segCount];
					cdbInfoCopy = copyCdbComponentDatabaseInfo(cdbinfo);
					cdbconn_initSegmentDescriptor(segdbDesc, cdbInfoCopy);
					setQEIdentifier(segdbDesc, -1, perGangContext);
					segCount++;
				}
			}

			/* If fewer primaries were found than requested, trigger FTS reconfiguration. */
			if (size != segCount)
			{
				FtsReConfigureMPP(false);
				elog(ERROR, "Not all primary segment instances are active and connected");
			}
			break;

		default:
			Assert(false);
	}

	ELOG_DISPATCHER_DEBUG("buildGangDefinition done");
	MemoryContextSwitchTo(GangContext);
	return newGangDefinition;
}
/* * Create a writer gang. */ Gang * AllocateWriterGang() { Gang *writerGang = NULL; MemoryContext oldContext = NULL; int i = 0; ELOG_DISPATCHER_DEBUG("AllocateWriterGang begin."); if (Gp_role != GP_ROLE_DISPATCH) { elog(FATAL, "dispatch process called with role %d", Gp_role); } /* * First, we look for an unallocated but created gang of the right type * if it exists, we return it. * Else, we create a new gang */ if (primaryWriterGang == NULL) { int nsegdb = getgpsegmentCount(); insist_log(IsTransactionOrTransactionBlock(), "cannot allocate segworker group outside of transaction"); if (GangContext == NULL) { GangContext = AllocSetContextCreate(TopMemoryContext, "Gang Context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); } Assert(GangContext != NULL); oldContext = MemoryContextSwitchTo(GangContext); writerGang = createGang(GANGTYPE_PRIMARY_WRITER, PRIMARY_WRITER_GANG_ID, nsegdb, -1); writerGang->allocated = true; /* * set "whoami" for utility statement. * non-utility statement will overwrite it in function getCdbProcessList. */ for(i = 0; i < writerGang->size; i++) setQEIdentifier(&writerGang->db_descriptors[i], -1, writerGang->perGangContext); MemoryContextSwitchTo(oldContext); } else { ELOG_DISPATCHER_DEBUG("Reusing an existing primary writer gang"); writerGang = primaryWriterGang; } /* sanity check the gang */ if (!GangOK(writerGang)) elog(ERROR, "could not connect to segment: initialization of segworker group failed"); ELOG_DISPATCHER_DEBUG("AllocateWriterGang end."); primaryWriterGang = writerGang; return writerGang; }
/*
 * AllocateReaderGang
 *		Return a reader gang of the requested type, reusing an available
 *		one when possible and creating a new one otherwise.
 *
 * @type can be GANGTYPE_ENTRYDB_READER, GANGTYPE_SINGLETON_READER or
 * GANGTYPE_PRIMARY_READER.  The gang is tagged with 'portal_name' so it
 * can be returned to the right pool later.
 */
Gang *
AllocateReaderGang(GangType type, char *portal_name)
{
	MemoryContext oldContext = NULL;
	Gang	   *readerGang = NULL;
	int			gangSize = 0;
	int			gangContent = 0;

	ELOG_DISPATCHER_DEBUG("AllocateReaderGang for portal %s: allocatedReaderGangsN %d, availableReaderGangsN %d, "
						  "allocatedReaderGangs1 %d, availableReaderGangs1 %d",
						  (portal_name ? portal_name : "<unnamed>"),
						  list_length(allocatedReaderGangsN),
						  list_length(availableReaderGangsN),
						  list_length(allocatedReaderGangs1),
						  list_length(availableReaderGangs1));

	if (Gp_role != GP_ROLE_DISPATCH)
	{
		elog(FATAL, "dispatch process called with role %d", Gp_role);
	}

	insist_log(IsTransactionOrTransactionBlock(),
			   "cannot allocate segworker group outside of transaction");

	/* Lazily create the long-lived context that owns all gangs. */
	if (GangContext == NULL)
	{
		GangContext = AllocSetContextCreate(TopMemoryContext,
											"Gang Context",
											ALLOCSET_DEFAULT_MINSIZE,
											ALLOCSET_DEFAULT_INITSIZE,
											ALLOCSET_DEFAULT_MAXSIZE);
	}
	Assert(GangContext != NULL);
	oldContext = MemoryContextSwitchTo(GangContext);

	/* Map the gang type to its size and target content id. */
	if (type == GANGTYPE_ENTRYDB_READER)
	{
		gangContent = -1;
		gangSize = 1;
	}
	else if (type == GANGTYPE_SINGLETON_READER)
	{
		gangContent = gp_singleton_segindex;
		gangSize = 1;
	}
	else if (type == GANGTYPE_PRIMARY_READER)
	{
		gangContent = 0;
		gangSize = getgpsegmentCount();
	}
	else
	{
		Assert(false);
	}

	/*
	 * First, we look for an unallocated but created gang of the right type
	 * if it exists, we return it.
	 * Else, we create a new gang
	 */
	readerGang = getAvailableGang(type, gangSize, gangContent);
	if (readerGang == NULL)
	{
		ELOG_DISPATCHER_DEBUG("Creating a new reader size %d gang for %s",
							  gangSize,
							  (portal_name ? portal_name : "unnamed portal"));

		readerGang = createGang(type, gang_id_counter++, gangSize, gangContent);
		readerGang->allocated = true;
	}

	/*
	 * make sure no memory is still allocated for previous
	 * portal name that this gang belonged to
	 */
	if (readerGang->portal_name)
		pfree(readerGang->portal_name);

	/* let the gang know which portal it is being assigned to */
	if (portal_name)
		readerGang->portal_name = pstrdup(portal_name);
	else
		readerGang->portal_name = (char *) NULL;

	/* sanity check the gang */
	insist_log(GangOK(readerGang),
			   "could not connect to segment: initialization of segworker group failed");

	addGangToAllocated(readerGang);

	MemoryContextSwitchTo(oldContext);

	ELOG_DISPATCHER_DEBUG("on return: allocatedReaderGangs %d, availableReaderGangsN %d, "
						  "allocatedReaderGangs1 %d, availableReaderGangs1 %d",
						  list_length(allocatedReaderGangsN),
						  list_length(availableReaderGangsN),
						  list_length(allocatedReaderGangs1),
						  list_length(availableReaderGangs1));

	return readerGang;
}
/*
 * getCdbComponentInfo
 *
 * Reads gp_segment_configuration and builds a CdbComponentDatabases
 * structure describing every entry (master) and segment database, then
 * validates the configuration for consistency.
 *
 * Storage for the SegmentInstances block and all subsidiary
 * structures are allocated from the caller's context.
 *
 * DNSLookupAsError: when true, a failed host lookup raises ERROR;
 * otherwise it is only LOGged.
 */
CdbComponentDatabases *
getCdbComponentInfo(bool DNSLookupAsError)
{
	CdbComponentDatabaseInfo *pOld = NULL;
	CdbComponentDatabases *component_databases = NULL;
	Relation gp_seg_config_rel;
	HeapTuple gp_seg_config_tuple = NULL;
	HeapScanDesc gp_seg_config_scan;

	/*
	 * Initial size for info arrays.
	 */
	int segment_array_size = 500;
	int entry_array_size = 4; /* we currently support a max of 2 */

	/*
	 * isNull and attr are used when getting the data for a specific column from a HeapTuple
	 */
	bool isNull;
	Datum attr;

	/*
	 * Local variables for fields from the rows of the tables that we are reading.
	 */
	int dbid;
	int content;
	char role;
	char preferred_role;
	char mode = 0;
	char status = 0;

	int i;
	int x = 0;

	/*
	 * Allocate the component_databases return structure, plus its
	 * segment_db_info and entry_db_info arrays at their initial sizes
	 * (segment_array_size and entry_array_size above).  If necessary
	 * during row fetching, we grow these by doubling each time we run out.
	 */
	component_databases = palloc0(sizeof(CdbComponentDatabases));

	component_databases->segment_db_info =
		(CdbComponentDatabaseInfo *) palloc0(sizeof(CdbComponentDatabaseInfo) * segment_array_size);

	component_databases->entry_db_info =
		(CdbComponentDatabaseInfo *) palloc0(sizeof(CdbComponentDatabaseInfo) * entry_array_size);

	gp_seg_config_rel = heap_open(GpSegmentConfigRelationId, AccessShareLock);

	gp_seg_config_scan = heap_beginscan(gp_seg_config_rel, SnapshotNow, 0, NULL);

	while (HeapTupleIsValid(gp_seg_config_tuple = heap_getnext(gp_seg_config_scan, ForwardScanDirection)))
	{
		/*
		 * Grab the fields that we need from gp_configuration. We do
		 * this first, because until we read them, we don't know
		 * whether this is an entry database row or a segment database
		 * row.
		 */
		CdbComponentDatabaseInfo *pRow;

		/*
		 * dbid
		 */
		attr = heap_getattr(gp_seg_config_tuple, Anum_gp_segment_configuration_dbid, RelationGetDescr(gp_seg_config_rel), &isNull);
		Assert(!isNull);
		dbid = DatumGetInt16(attr);

		/*
		 * content
		 */
		attr = heap_getattr(gp_seg_config_tuple, Anum_gp_segment_configuration_content, RelationGetDescr(gp_seg_config_rel), &isNull);
		Assert(!isNull);
		content = DatumGetInt16(attr);

		/*
		 * role
		 */
		attr = heap_getattr(gp_seg_config_tuple, Anum_gp_segment_configuration_role, RelationGetDescr(gp_seg_config_rel), &isNull);
		Assert(!isNull);
		role = DatumGetChar(attr);

		/*
		 * preferred-role
		 */
		attr = heap_getattr(gp_seg_config_tuple, Anum_gp_segment_configuration_preferred_role, RelationGetDescr(gp_seg_config_rel), &isNull);
		Assert(!isNull);
		preferred_role = DatumGetChar(attr);

		/*
		 * mode
		 */
		attr = heap_getattr(gp_seg_config_tuple, Anum_gp_segment_configuration_mode, RelationGetDescr(gp_seg_config_rel), &isNull);
		Assert(!isNull);
		mode = DatumGetChar(attr);

		/*
		 * status
		 */
		attr = heap_getattr(gp_seg_config_tuple, Anum_gp_segment_configuration_status, RelationGetDescr(gp_seg_config_rel), &isNull);
		Assert(!isNull);
		status = DatumGetChar(attr);

		/*
		 * Determine which array to place this rows data in: entry or
		 * segment, based on the content field.  Content >= 0 is a segment;
		 * content < 0 (i.e. -1) is the entry (master) database.
		 */
		if (content >= 0)
		{
			/* if we have a dbid bigger than our array we'll have to grow the array. (MPP-2104) */
			if (dbid >= segment_array_size || component_databases->total_segment_dbs >= segment_array_size)
			{
				/*
				 * Expand CdbComponentDatabaseInfo array if we've used up currently allocated space
				 */
				segment_array_size = Max((segment_array_size * 2), dbid * 2);
				pOld = component_databases->segment_db_info;
				component_databases->segment_db_info = (CdbComponentDatabaseInfo *)
					repalloc(pOld, sizeof(CdbComponentDatabaseInfo) * segment_array_size);
			}

			pRow = &component_databases->segment_db_info[component_databases->total_segment_dbs];
			component_databases->total_segment_dbs++;
		}
		else
		{
			if (component_databases->total_entry_dbs >= entry_array_size)
			{
				/*
				 * Expand CdbComponentDatabaseInfo array if we've used up currently allocated space
				 */
				entry_array_size *= 2;
				pOld = component_databases->entry_db_info;
				component_databases->entry_db_info = (CdbComponentDatabaseInfo *)
					repalloc(pOld, sizeof(CdbComponentDatabaseInfo) * entry_array_size);
			}

			pRow = &component_databases->entry_db_info[component_databases->total_entry_dbs];
			component_databases->total_entry_dbs++;
		}

		pRow->dbid = dbid;
		pRow->segindex = content;
		pRow->role = role;
		pRow->preferred_role = preferred_role;
		pRow->mode = mode;
		pRow->status = status;

		/*
		 * hostname
		 */
		attr = heap_getattr(gp_seg_config_tuple, Anum_gp_segment_configuration_hostname, RelationGetDescr(gp_seg_config_rel), &isNull);
		Assert(!isNull);
		pRow->hostname = TextDatumGetCString(attr);

		/*
		 * address
		 */
		attr = heap_getattr(gp_seg_config_tuple, Anum_gp_segment_configuration_address, RelationGetDescr(gp_seg_config_rel), &isNull);
		Assert(!isNull);
		pRow->address = TextDatumGetCString(attr);

		/*
		 * port
		 */
		attr = heap_getattr(gp_seg_config_tuple, Anum_gp_segment_configuration_port, RelationGetDescr(gp_seg_config_rel), &isNull);
		Assert(!isNull);
		pRow->port = DatumGetInt32(attr);

		/*
		 * Filerep_port (nullable; -1 means no file-replication port configured)
		 */
		attr = heap_getattr(gp_seg_config_tuple, Anum_gp_segment_configuration_replication_port, RelationGetDescr(gp_seg_config_rel), &isNull);
		if (!isNull)
			pRow->filerep_port = DatumGetInt32(attr);
		else
			pRow->filerep_port = -1;

		/* Resolve host addresses; severity of a lookup failure depends on the flag. */
		getAddressesForDBid(pRow, DNSLookupAsError ? ERROR : LOG);
		pRow->hostip = pRow->hostaddrs[0];
	}

	/*
	 * We're done with the catalog entries, cleanup them up, closing
	 * all the relations we opened.
	 */
	heap_endscan(gp_seg_config_scan);
	heap_close(gp_seg_config_rel, AccessShareLock);

	/*
	 * Validate that there exists at least one entry and one segment
	 * database in the configuration
	 */
	if (component_databases->total_segment_dbs == 0)
	{
		ereport(ERROR,
				(errcode(ERRCODE_CARDINALITY_VIOLATION),
				 errmsg("Greenplum Database number of segment databases cannot be 0")));
	}
	if (component_databases->total_entry_dbs == 0)
	{
		ereport(ERROR,
				(errcode(ERRCODE_CARDINALITY_VIOLATION),
				 errmsg("Greenplum Database number of entry databases cannot be 0")));
	}

	/*
	 * Now sort the data by segindex, isprimary desc
	 */
	qsort(component_databases->segment_db_info,
		  component_databases->total_segment_dbs, sizeof(CdbComponentDatabaseInfo),
		  CdbComponentDatabaseInfoCompare);

	qsort(component_databases->entry_db_info,
		  component_databases->total_entry_dbs, sizeof(CdbComponentDatabaseInfo),
		  CdbComponentDatabaseInfoCompare);

	/*
	 * Now count the number of distinct segindexes.
	 * Since it's sorted, this is easy.
	 */
	for (i = 0; i < component_databases->total_segment_dbs; i++)
	{
		if (i == 0 ||
			(component_databases->segment_db_info[i].segindex != component_databases->segment_db_info[i - 1].segindex))
		{
			component_databases->total_segments++;
		}
	}

	/*
	 * Validate that gp_numsegments == segment_databases->total_segment_dbs
	 */
	if (getgpsegmentCount() != component_databases->total_segments)
	{
		ereport(ERROR,
				(errcode(ERRCODE_DATA_EXCEPTION),
				 errmsg("Greenplum Database number of segments inconsistency: count is %d from pg_catalog.%s table, but %d from getCdbComponentDatabases()",
						getgpsegmentCount(), GpIdRelationName, component_databases->total_segments)));
	}

	/*
	 * Now validate that our identity is present in the entry databases
	 */
	for (i = 0; i < component_databases->total_entry_dbs; i++)
	{
		CdbComponentDatabaseInfo *pInfo = &component_databases->entry_db_info[i];

		if (pInfo->dbid == GpIdentity.dbid && pInfo->segindex == Gp_segment)
		{
			break;
		}
	}
	if (i == component_databases->total_entry_dbs)
	{
		ereport(ERROR,
				(errcode(ERRCODE_DATA_EXCEPTION),
				 errmsg("Cannot locate entry database represented by this db in gp_segment_configuration: dbid %d content %d",
						GpIdentity.dbid, Gp_segment)));
	}

	/*
	 * Now validate that the segindexes for the segment databases are
	 * between 0 and (GpIdentity.numsegments - 1) inclusive, and that we
	 * hit them all. Since it's sorted, this is relatively easy.
	 */
	x = 0;
	for (i = 0; i < getgpsegmentCount(); i++)
	{
		int this_segindex = -1;

		/* Advance through the sorted rows until we reach segindex i. */
		while (x < component_databases->total_segment_dbs)
		{
			this_segindex = component_databases->segment_db_info[x].segindex;
			if (this_segindex < i)
				x++;
			else if (this_segindex == i)
				break;
			else if (this_segindex > i)
			{
				ereport(ERROR,
						(errcode(ERRCODE_DATA_EXCEPTION),
						 errmsg("Content values not valid in %s table. They must be in the range 0 to %d inclusive",
								GpSegmentConfigRelationName, getgpsegmentCount() - 1)));
			}
		}
		if (this_segindex != i)
		{
			ereport(ERROR,
					(errcode(ERRCODE_DATA_EXCEPTION),
					 errmsg("Content values not valid in %s table. They must be in the range 0 to %d inclusive",
							GpSegmentConfigRelationName, getgpsegmentCount() - 1)));
		}
	}

	return component_databases;
}
/*
 * createGang_thread
 *		Creates a new gang by logging on a session to each segDB involved,
 *		using a pool of connection threads.
 *
 * call this function in GangContext memory context.
 * elog ERROR or return a non-NULL gang.
 *
 * Connections are spread over ceil(size / gp_connections_per_thread)
 * threads, each running thread_DoConnect.  Segments found in recovery
 * mode may trigger a bounded retry (writer gangs only).
 */
static Gang *
createGang_thread(GangType type, int gang_id, int size, int content)
{
	Gang *newGangDefinition = NULL;
	SegmentDatabaseDescriptor *segdbDesc = NULL;
	DoConnectParms *doConnectParmsAr = NULL;
	DoConnectParms *pParms = NULL;
	int parmIndex = 0;
	int threadCount = 0;
	int i = 0;
	int create_gang_retry_counter = 0;
	int in_recovery_mode_count = 0;
	int successful_connections = 0;
	PQExpBufferData create_gang_error;

	ELOG_DISPATCHER_DEBUG("createGang type = %d, gang_id = %d, size = %d, content = %d",
						  type, gang_id, size, content);

	/* check arguments */
	Assert(size == 1 || size == getgpsegmentCount());
	Assert(CurrentResourceOwner != NULL);
	Assert(CurrentMemoryContext == GangContext);
	Assert(gp_connections_per_thread > 0);

	/* Writer gang is created before reader gangs. */
	if (type == GANGTYPE_PRIMARY_WRITER)
		Insist(!GangsExist());

	initPQExpBuffer(&create_gang_error);

	Assert(CurrentGangCreating == NULL);

create_gang_retry:
	/*
	 * If we're in a retry, we may need to reset our initial state a bit. We
	 * also want to ensure that all resources have been released.
	 */
	Assert(newGangDefinition == NULL);
	Assert(doConnectParmsAr == NULL);
	successful_connections = 0;
	in_recovery_mode_count = 0;
	threadCount = 0;

	/* allocate and initialize a gang structure */
	newGangDefinition = buildGangDefinition(type, gang_id, size, content);
	/* CurrentGangCreating lets error cleanup elsewhere find this half-built gang */
	CurrentGangCreating = newGangDefinition;

	Assert(newGangDefinition != NULL);
	Assert(newGangDefinition->size == size);
	Assert(newGangDefinition->perGangContext != NULL);
	MemoryContextSwitchTo(newGangDefinition->perGangContext);

	resetPQExpBuffer(&create_gang_error);

	/*
	 * The most threads we could have is segdb_count /
	 * gp_connections_per_thread, rounded up. This is equivalent to 1 +
	 * (segdb_count-1) / gp_connections_per_thread. We allocate enough memory
	 * for this many DoConnectParms structures, even though we may not use
	 * them all.
	 */
	threadCount = 1 + (size - 1) / gp_connections_per_thread;
	Assert(threadCount > 0);

	/* initialize connect parameters: assign each descriptor to a thread's batch */
	doConnectParmsAr = makeConnectParms(threadCount, type, gang_id);
	for (i = 0; i < size; i++)
	{
		parmIndex = i / gp_connections_per_thread;
		pParms = &doConnectParmsAr[parmIndex];
		segdbDesc = &newGangDefinition->db_descriptors[i];
		pParms->segdbDescPtrArray[pParms->db_count++] = segdbDesc;
	}

	/* start threads and doing the connect */
	for (i = 0; i < threadCount; i++)
	{
		int pthread_err;

		pParms = &doConnectParmsAr[i];

		ELOG_DISPATCHER_DEBUG("createGang creating thread %d of %d for libpq connections",
							  i + 1, threadCount);

		pthread_err = gp_pthread_create(&pParms->thread, thread_DoConnect, pParms, "createGang");
		if (pthread_err != 0)
		{
			int j;

			/*
			 * Error during thread create (this should be caused by resource
			 * constraints). If we leave the threads running, they'll
			 * immediately have some problems -- so we need to join them, and
			 * *then* we can issue our FATAL error
			 */
			for (j = 0; j < i; j++)
			{
				pthread_join(doConnectParmsAr[j].thread, NULL);
			}

			ereport(FATAL, (errcode(ERRCODE_INTERNAL_ERROR),
							errmsg("failed to create thread %d of %d", i + 1, threadCount),
							errdetail("pthread_create() failed with err %d", pthread_err)));
		}
	}

	/*
	 * wait for all of the DoConnect threads to complete.
	 */
	for (i = 0; i < threadCount; i++)
	{
		ELOG_DISPATCHER_DEBUG("joining to thread %d of %d for libpq connections",
							  i + 1, threadCount);

		if (0 != pthread_join(doConnectParmsAr[i].thread, NULL))
		{
			elog(FATAL, "could not create segworker group");
		}
	}

	/*
	 * Free the memory allocated for the threadParms array
	 */
	destroyConnectParms(doConnectParmsAr, threadCount);
	doConnectParmsAr = NULL;

	SIMPLE_FAULT_INJECTOR(GangCreated);

	/* find out the successful connections and the failed ones */
	checkConnectionStatus(newGangDefinition, &in_recovery_mode_count,
						  &successful_connections, &create_gang_error);

	ELOG_DISPATCHER_DEBUG("createGang: %d processes requested; %d successful connections %d in recovery",
						  size, successful_connections, in_recovery_mode_count);

	MemoryContextSwitchTo(GangContext);

	/* Fully-connected gang: success path. */
	if (size == successful_connections)
	{
		setLargestGangsize(size);
		termPQExpBuffer(&create_gang_error);
		CurrentGangCreating = NULL;

		return newGangDefinition;
	}

	/* there are failed connections */

	/* FTS shows some segment DBs are down, destroy all gangs. */
	if (isFTSEnabled() &&
		FtsTestSegmentDBIsDown(newGangDefinition->db_descriptors, size))
	{
		appendPQExpBuffer(&create_gang_error, "FTS detected one or more segments are down\n");
		goto exit;
	}

	/* failure due to recovery */
	if (successful_connections + in_recovery_mode_count == size)
	{
		if (gp_gang_creation_retry_count &&
			create_gang_retry_counter++ < gp_gang_creation_retry_count &&
			type == GANGTYPE_PRIMARY_WRITER)
		{
			/*
			 * Retry for non-writer gangs is meaningless because writer gang
			 * must be gone when QE is in recovery mode
			 */
			DisconnectAndDestroyGang(newGangDefinition);
			newGangDefinition = NULL;
			CurrentGangCreating = NULL;

			ELOG_DISPATCHER_DEBUG("createGang: gang creation failed, but retryable.");

			CHECK_FOR_INTERRUPTS();
			pg_usleep(gp_gang_creation_retry_timer * 1000);
			CHECK_FOR_INTERRUPTS();

			goto create_gang_retry;
		}

		appendPQExpBuffer(&create_gang_error, "segment(s) are in recovery mode\n");
	}

exit:
	/* Common failure cleanup: tear down the partial gang before erroring out. */
	if (newGangDefinition != NULL)
		DisconnectAndDestroyGang(newGangDefinition);

	/* A broken writer gang invalidates all gangs; reset the whole session. */
	if (type == GANGTYPE_PRIMARY_WRITER)
	{
		DisconnectAndDestroyAllGangs(true);
		CheckForResetSession();
	}

	CurrentGangCreating = NULL;

	ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
					errmsg("failed to acquire resources on one or more segments"),
					errdetail("%s", create_gang_error.data)));
	return NULL;
}
/*
 * CdbCheckDispatchResult_internal
 *
 * Wait for all dispatch worker threads of the given dispatcher state to
 * finish, then sweep their per-QE result objects: merge any connection-level
 * errors into the results, and collect descriptors of broken connections.
 *
 * ds           - dispatcher state whose threads/results are examined; required.
 * failedSegDB  - if non-NULL, receives a palloc'd NULL-terminated array of
 *                descriptors whose libpq connection is CONNECTION_BAD.
 *                Caller should pfree() it.  Left NULL if no failures.
 * numOfFailed  - if non-NULL, receives the number of failed descriptors.
 * waitMode     - DISPATCH_WAIT_CANCEL/FINISH are propagated to each thread to
 *                make it stop short; DISPATCH_WAIT_NONE means normal waiting.
 */
void
CdbCheckDispatchResult_internal(struct CdbDispatcherState *ds,
								struct SegmentDatabaseDescriptor ***failedSegDB,
								int *numOfFailed, DispatchWaitMode waitMode)
{
	int			i;
	int			j;
	int			nFailed = 0;
	DispatchCommandParms *pParms;
	CdbDispatchResult *dispatchResult;
	SegmentDatabaseDescriptor *segdbDesc;

	Assert(ds != NULL);

	/* Initialize output parameters so callers see sane values on early return. */
	if (failedSegDB)
		*failedSegDB = NULL;
	if (numOfFailed)
		*numOfFailed = 0;

	/*
	 * No-op if no work was dispatched since the last time we were called.
	 */
	if (!ds->dispatchThreads || ds->dispatchThreads->threadCount == 0)
	{
		elog(DEBUG5, "CheckDispatchResult: no threads active");
		return;
	}

	/*
	 * Wait for threads to finish.
	 */
	for (i = 0; i < ds->dispatchThreads->threadCount; i++)
	{
		pParms = &ds->dispatchThreads->dispatchCommandParmsAr[i];
		Assert(pParms != NULL);

		/*
		 * Does caller want to stop short?  Propagate cancel/finish to the
		 * thread via its parms before (or while) we wait on it.
		 */
		switch (waitMode)
		{
			case DISPATCH_WAIT_CANCEL:
			case DISPATCH_WAIT_FINISH:
				pParms->waitMode = waitMode;
				break;
			default:
				break;
		}

		if (gp_connections_per_thread == 0)
		{
			/* Threadless mode: do the wait work inline in this backend. */
			thread_DispatchWait(pParms);
		}
		else
		{
			elog(DEBUG4, "CheckDispatchResult: Joining to thread %d of %d",
				 i + 1, ds->dispatchThreads->threadCount);

			if (pParms->thread_valid)
			{
				int			pthread_err = 0;

				pthread_err = pthread_join(pParms->thread, NULL);
				if (pthread_err != 0)
					elog(FATAL,
						 "CheckDispatchResult: pthread_join failed on thread %d (%lu) of %d (returned %d attempting to join to %lu)",
						 i + 1,
#ifndef _WIN32
						 (unsigned long) pParms->thread,
#else
						 /* On Windows, pthread_t is a struct; print its handle field. */
						 (unsigned long) pParms->thread.p,
#endif
						 ds->dispatchThreads->threadCount, pthread_err,
						 (unsigned long) mythread());
			}
		}

		/*
		 * Clear the thread handle atomically w.r.t. cancel interrupts so we
		 * never leave a stale-but-valid-looking pthread_t behind.
		 */
		HOLD_INTERRUPTS();
		pParms->thread_valid = false;
		MemSet(&pParms->thread, 0, sizeof(pParms->thread));
		RESUME_INTERRUPTS();

		/*
		 * Examine the CdbDispatchResult objects containing the results
		 * from this thread's QEs.
		 */
		for (j = 0; j < pParms->db_count; j++)
		{
			dispatchResult = pParms->dispatchResultPtrArray[j];

			/* Defensive: tolerate holes in the result array rather than crash. */
			if (dispatchResult == NULL)
			{
				elog(LOG, "CheckDispatchResult: result object is NULL ? skipping.");
				continue;
			}

			if (dispatchResult->segdbDesc == NULL)
			{
				elog(LOG, "CheckDispatchResult: result object segment descriptor is NULL ? skipping.");
				continue;
			}

			segdbDesc = dispatchResult->segdbDesc;

			/*
			 * segdbDesc error message is unlikely here, but check anyway.
			 */
			if (segdbDesc->errcode || segdbDesc->error_message.len)
				cdbdisp_mergeConnectionErrors(dispatchResult, segdbDesc);

			/*
			 * Log the result
			 */
			if (DEBUG2 >= log_min_messages)
				cdbdisp_debugDispatchResult(dispatchResult, DEBUG2, DEBUG3);

			/*
			 * Notify FTS to reconnect if connection lost or never connected.
			 */
			if (failedSegDB && PQstatus(segdbDesc->conn) == CONNECTION_BAD)
			{
				/*
				 * Allocate storage.  Caller should pfree() it.
				 * Sized for worst case (every primary and mirror) plus the
				 * NULL terminator — presumably 2*segments covers that; the
				 * array is filled lazily on first failure.
				 */
				if (!*failedSegDB)
					*failedSegDB = palloc(sizeof(**failedSegDB)
										  * (2 * getgpsegmentCount() + 1));

				/*
				 * Append to broken connection list; keep it NULL-terminated.
				 */
				(*failedSegDB)[nFailed++] = segdbDesc;
				(*failedSegDB)[nFailed] = NULL;

				if (numOfFailed)
					*numOfFailed = nFailed;
			}

			/*
			 * Zap our SegmentDatabaseDescriptor ptr because it may be
			 * invalidated by the call to FtsHandleNetFailure() below.
			 * Anything we need from there, we should get before this.
			 */
			dispatchResult->segdbDesc = NULL;
		}
	}

	/*
	 * reset thread state (will be destroyed later on in finishCommand)
	 */
	ds->dispatchThreads->threadCount = 0;

	/*
	 * It looks like everything went fine, make sure we don't miss a
	 * user cancellation?
	 *
	 * The waitMode argument is NONE when we are doing "normal work".
	 */
	if (waitMode == DISPATCH_WAIT_NONE || waitMode == DISPATCH_WAIT_FINISH)
		CHECK_FOR_INTERRUPTS();
}
/* * cdbdisp_dispatchCommand: * Send the strCommand SQL statement to all segdbs in the cluster * cancelOnError indicates whether an error * occurring on one of the qExec segdbs should cause all still-executing commands to cancel * on other qExecs. Normally this would be true. The commands are sent over the libpq * connections that were established during gang creation. They are run inside of threads. * The number of segdbs handled by any one thread is determined by the * guc variable gp_connections_per_thread. * * The CdbDispatchResults objects allocated for the command * are returned in *pPrimaryResults * The caller, after calling CdbCheckDispatchResult(), can * examine the CdbDispatchResults objects, can keep them as * long as needed, and ultimately must free them with * cdbdisp_destroyDispatcherState() prior to deallocation * of the memory context from which they were allocated. * * NB: Callers should use PG_TRY()/PG_CATCH() if needed to make * certain that the CdbDispatchResults objects are destroyed by * cdbdisp_destroyDispatcherState() in case of error. * To wait for completion, check for errors, and clean up, it is * suggested that the caller use cdbdisp_finishCommand(). */ void cdbdisp_dispatchCommand(const char *strCommand, char *serializedQuerytree, int serializedQuerytreelen, bool cancelOnError, bool needTwoPhase, bool withSnapshot, CdbDispatcherState * ds) { DispatchCommandQueryParms queryParms; Gang *primaryGang; int nsegdb = getgpsegmentCount(); CdbComponentDatabaseInfo *qdinfo; if (log_dispatch_stats) ResetUsage(); if (DEBUG5 >= log_min_messages) elog(DEBUG3, "cdbdisp_dispatchCommand: %s (needTwoPhase = %s)", strCommand, (needTwoPhase ? "true" : "false")); else elog((Debug_print_full_dtm ? LOG : DEBUG3), "cdbdisp_dispatchCommand: %.50s (needTwoPhase = %s)", strCommand, (needTwoPhase ? 
"true" : "false")); MemSet(&queryParms, 0, sizeof(queryParms)); queryParms.strCommand = strCommand; queryParms.serializedQuerytree = serializedQuerytree; queryParms.serializedQuerytreelen = serializedQuerytreelen; /* * Allocate a primary QE for every available segDB in the system. */ primaryGang = allocateWriterGang(); Assert(primaryGang); /* * Serialize a version of our DTX Context Info */ queryParms.serializedDtxContextInfo = qdSerializeDtxContextInfo(&queryParms.serializedDtxContextInfolen, withSnapshot, false, mppTxnOptions(needTwoPhase), "cdbdisp_dispatchCommand"); /* * sequence server info */ qdinfo = &(getComponentDatabases()->entry_db_info[0]); Assert(qdinfo != NULL && qdinfo->hostip != NULL); queryParms.seqServerHost = pstrdup(qdinfo->hostip); queryParms.seqServerHostlen = strlen(qdinfo->hostip) + 1; queryParms.seqServerPort = seqServerCtl->seqServerPort; /* * Dispatch the command. */ ds->primaryResults = NULL; ds->dispatchThreads = NULL; cdbdisp_makeDispatcherState(ds, nsegdb, 0, cancelOnError); cdbdisp_queryParmsInit(ds, &queryParms); ds->primaryResults->writer_gang = primaryGang; cdbdisp_dispatchToGang(ds, primaryGang, -1, DEFAULT_DISP_DIRECT); /* * don't pfree serializedShapshot here, it will be pfree'd when * the first thread is destroyed. */ }