/*
 * Allocate a gang of the requested type over the given segment list and
 * register it with the dispatcher state.
 *
 * ds		- dispatcher state; the new gang is prepended to ds->allocatedGangs
 *			  and ds->largestGangSize is raised if the new gang is larger.
 * type		- gang type recorded on the new gang.
 * segments - list handed to cdbgang_createGang(); NIL yields a NULL return.
 *
 * Must run on the dispatcher (any other role is reported as FATAL).  The gang
 * is created while DispatcherContext is the current memory context; the
 * caller's context is restored before returning.
 *
 * Returns the new gang, or NULL when 'segments' is NIL.
 */
Gang *
AllocateGang(CdbDispatcherState *ds, GangType type, List *segments)
{
	Gang	   *gang = NULL;
	MemoryContext callerContext;
	SegmentType wanted;
	int			idx;

	ELOG_DISPATCHER_DEBUG("AllocateGang begin.");

	if (Gp_role != GP_ROLE_DISPATCH)
		elog(FATAL, "dispatch process called with role %d", Gp_role);

	/* Nothing to allocate for an empty segment list. */
	if (segments == NIL)
		return NULL;

	Assert(DispatcherContext);
	callerContext = MemoryContextSwitchTo(DispatcherContext);

	/*
	 * A writer gang asks for an explicit writer; an extended query (e.g. a
	 * cursor) must ask for an explicit reader; everything else takes any QE.
	 */
	wanted = SEGMENTTYPE_ANY;
	if (type == GANGTYPE_PRIMARY_WRITER)
		wanted = SEGMENTTYPE_EXPLICT_WRITER;
	else if (ds->isExtendedQuery)
		wanted = SEGMENTTYPE_EXPLICT_READER;

	gang = cdbgang_createGang(segments, wanted);
	gang->allocated = true;
	gang->type = type;

	/* Track the gang in the dispatcher state. */
	ds->allocatedGangs = lcons(gang, ds->allocatedGangs);
	if (gang->size > ds->largestGangSize)
		ds->largestGangSize = gang->size;

	ELOG_DISPATCHER_DEBUG("AllocateGang end.");

	if (type == GANGTYPE_PRIMARY_WRITER)
	{
		/*
		 * Set "whoami" for utility statements.  Non-utility statements
		 * overwrite it later in getCdbProcessList.
		 */
		idx = 0;
		while (idx < gang->size)
		{
			cdbconn_setQEIdentifier(gang->db_descriptors[idx], -1);
			idx++;
		}
	}

	MemoryContextSwitchTo(callerContext);

	return gang;
}
/* * Read gp_segment_configuration catalog table and build a CdbComponentDatabases. * * Read the catalog if FTS is reconfigured. * * We don't want to destroy cdb_component_dbs when one gang get destroyed, so allocate * it in GangContext instead of perGangContext. */ CdbComponentDatabases * getComponentDatabases(void) { Assert(Gp_role == GP_ROLE_DISPATCH || Gp_role == GP_ROLE_UTILITY); Assert(GangContext != NULL); uint64 ftsVersion = getFtsVersion(); MemoryContext oldContext = MemoryContextSwitchTo(GangContext); if (cdb_component_dbs == NULL) { cdb_component_dbs = getCdbComponentDatabases(); cdb_component_dbs->fts_version = ftsVersion; } else if (cdb_component_dbs->fts_version != ftsVersion) { ELOG_DISPATCHER_DEBUG("FTS rescanned, get new component databases info."); freeCdbComponentDatabases(cdb_component_dbs); cdb_component_dbs = getCdbComponentDatabases(); cdb_component_dbs->fts_version = ftsVersion; } MemoryContextSwitchTo(oldContext); return cdb_component_dbs; }
/*
 * Allocate a Gang structure for the given list of segments and fill in its
 * non-connection fields.
 *
 * segments	   - integer list; each element is used as a content id.
 * segmentType - passed through to cdbcomponent_allocateIdleQE() for every
 *				 content id.
 *
 * The gang starts out as GANGTYPE_UNALLOCATED with allocated = false, and
 * its size is the length of 'segments'.
 *
 * Must be called with DispatcherContext as the current memory context (see
 * the Assert below); the gang and its descriptor array are palloc'd there.
 *
 * NOTE(review): only the part of this function up to the end of the PG_TRY
 * block is visible here; the error handling and the return statement that
 * follow are not documented because they cannot be seen.
 */
Gang *
buildGangDefinition(List *segments, SegmentType segmentType)
{
	Gang *newGangDefinition = NULL;
	ListCell *lc;
	int i = 0;
	int size;
	int contentId;

	size = list_length(segments);

	ELOG_DISPATCHER_DEBUG("buildGangDefinition:Starting %d qExec processes for gang", size);

	Assert(CurrentMemoryContext == DispatcherContext);

	/* allocate a gang */
	newGangDefinition = (Gang *) palloc0(sizeof(Gang));
	newGangDefinition->type = GANGTYPE_UNALLOCATED;
	newGangDefinition->size = size;
	newGangDefinition->allocated = false;
	/* one descriptor pointer per requested segment, zero-initialized */
	newGangDefinition->db_descriptors = (SegmentDatabaseDescriptor **) palloc0(size * sizeof(SegmentDatabaseDescriptor*));

	PG_TRY();
	{
		/* initialize db_descriptors: one QE per content id, in list order */
		foreach_with_count (lc, segments , i)
		{
			contentId = lfirst_int(lc);
			newGangDefinition->db_descriptors[i] = cdbcomponent_allocateIdleQE(contentId, segmentType);
		}
	}
/*
 * Send one command to a single QE over its libpq connection.
 *
 * dispatchResult - per-QE dispatch state; its segdbDesc names the target.
 * query_text	  - serialized command to send.
 * query_text_len - length of query_text.
 *
 * Raises ERROR (after clearing stillRunning) if the connection is already
 * bad or if libpq refuses to queue the command.  On success the result is
 * marked as both dispatched and still running so that the caller keeps
 * monitoring the connection.
 */
static void
dispatchCommand(CdbDispatchResult * dispatchResult,
				const char *query_text,
				int query_text_len)
{
	SegmentDatabaseDescriptor *qe = dispatchResult->segdbDesc;
	TimestampTz sendStart = 0;

	/* Only pay for the timestamp when the timing message could be logged. */
	if (DEBUG1 >= log_min_messages)
		sendStart = GetCurrentTimestamp();

	/* Sending on a busy connection is unexpected; log it but carry on. */
	if (PQisBusy(qe->conn))
		elog(LOG, "Trying to send to busy connection %s: asyncStatus %d",
			 qe->whoami, qe->conn->asyncStatus);

	if (cdbconn_isBadConnection(qe))
	{
		char	   *reason = PQerrorMessage(qe->conn);

		dispatchResult->stillRunning = false;
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
				 errmsg("Connection lost before dispatch to segment %s: %s",
						qe->whoami, reason ? reason : "unknown error")));
	}

	/* Queue the command without waiting for the QE to answer. */
	if (!PQsendGpQuery_shared(qe->conn, (char *) query_text, query_text_len))
	{
		char	   *reason = PQerrorMessage(qe->conn);

		dispatchResult->stillRunning = false;
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
				 errmsg("Command could not be dispatch to segment %s: %s",
						qe->whoami, reason ? reason : "unknown error")));
	}

	if (DEBUG1 >= log_min_messages)
	{
		long		elapsedSecs;
		int			elapsedUsecs;

		TimestampDifference(sendStart, GetCurrentTimestamp(),
							&elapsedSecs, &elapsedUsecs);

		/* Report only sends that took longer than one millisecond. */
		if (elapsedSecs != 0 || elapsedUsecs > 1000)
			elog(LOG, "time for PQsendGpQuery_shared %ld.%06d",
				 elapsedSecs, elapsedUsecs);
	}

	/*
	 * From here on the QE is monitored for results, lost connections and
	 * any other error libpq may report.
	 */
	dispatchResult->stillRunning = true;
	dispatchResult->hasDispatched = true;

	ELOG_DISPATCHER_DEBUG("Command dispatched to QE (%s)", qe->whoami);
}
/*
 * Ask FTS about every segment connection that is still running and mark
 * the ones FTS reports as lost.
 *
 * Entries that already finished (or never dispatched) and entry-db
 * connections (segindex < 0) are skipped.  Only the first connection that
 * is actually tested passes forceScan = true to FtsTestConnection(); all
 * later tests in the same call pass false.
 *
 * For a lost connection, stillRunning is cleared and a LOG-level message
 * carrying the libpq error text is appended to the dispatch result.
 */
static void
finishupFailedConnections(CdbDispatchCmdAsync * pParms)
{
	bool		forceScan = true;
	int			n;

	for (n = 0; n < pParms->dispatchCount; n++)
	{
		CdbDispatchResult *result = pParms->dispatchResultPtrArray[n];
		SegmentDatabaseDescriptor *qe = result->segdbDesc;
		bool		alive;

		/* Nothing to check for finished entries or for the entry db. */
		if (!result->stillRunning || qe->segindex < 0)
			continue;

		ELOG_DISPATCHER_DEBUG("FTS testing connection %d of %d (%s)",
							  n + 1, pParms->dispatchCount, qe->whoami);

		alive = FtsTestConnection(qe->segment_database_info, forceScan);

		/* A forced scan is requested at most once per call. */
		forceScan = false;

		if (alive)
			continue;

		{
			char	   *reason = PQerrorMessage(qe->conn);

			result->stillRunning = false;
			cdbdisp_appendMessageNonThread(result, LOG,
										   "FTS detected connection lost during dispatch to %s: %s",
										   qe->whoami,
										   reason ? reason : "unknown error");
		}
	}
}
/*
 * Creates a new gang by logging on a session to each segDB involved, using
 * non-blocking libpq connections driven by poll().
 *
 * type	   - gang type; only a GANGTYPE_PRIMARY_WRITER gang is retried when
 *			 segments are in recovery mode.
 * gang_id - id recorded in the gang and encoded in the gpqeid parameter.
 * size	   - number of segment connections (1 or getgpsegmentCount()).
 * content - passed through to buildGangDefinition().
 *
 * Call this function with GangContext as the current memory context.
 * Either raises ERROR or returns a non-NULL gang.  On error the half-built
 * gang is destroyed; for a writer gang all gangs are destroyed as well.
 *
 * Scratch arrays (pollingStatus, connStatusDone, fds) are allocated in the
 * gang's perGangContext and therefore go away with the gang.
 */
static Gang*
createGang_async(GangType type, int gang_id, int size, int content)
{
	Gang *newGangDefinition;
	SegmentDatabaseDescriptor *segdbDesc = NULL;
	int i = 0;
	int create_gang_retry_counter = 0;
	int in_recovery_mode_count = 0;
	int successful_connections = 0;
	bool retry = false;
	int poll_timeout = 0;
	struct timeval startTS;
	PostgresPollingStatusType *pollingStatus = NULL;
	/* true means connection status is confirmed, either established or in recovery mode */
	bool *connStatusDone = NULL;

	ELOG_DISPATCHER_DEBUG("createGang type = %d, gang_id = %d, size = %d, content = %d",
			type, gang_id, size, content);

	/* check arguments */
	Assert(size == 1 || size == getgpsegmentCount());
	Assert(CurrentResourceOwner != NULL);
	Assert(CurrentMemoryContext == GangContext);
	/* Writer gang is created before reader gangs. */
	if (type == GANGTYPE_PRIMARY_WRITER)
		Insist(!GangsExist());

	/* A reader gang is pointless if the writer gang is already broken. */
	if (type != GANGTYPE_PRIMARY_WRITER && !isPrimaryWriterGangAlive())
		ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
						errmsg("failed to acquire resources on one or more segments"),
						errdetail("writer gang got broken before creating reader gangs")));

create_gang_retry:
	/* If we're in a retry, we may need to reset our initial state, a bit */
	newGangDefinition = NULL;
	successful_connections = 0;
	in_recovery_mode_count = 0;
	retry = false;

	/* allocate and initialize a gang structure */
	newGangDefinition = buildGangDefinition(type, gang_id, size, content);

	Assert(newGangDefinition != NULL);
	Assert(newGangDefinition->size == size);
	Assert(newGangDefinition->perGangContext != NULL);
	MemoryContextSwitchTo(newGangDefinition->perGangContext);

	/* allocate memory within perGangContext and will be freed automatically when gang is destroyed */
	pollingStatus = palloc(sizeof(PostgresPollingStatusType) * size);
	connStatusDone = palloc(sizeof(bool) * size);

	struct pollfd *fds;

	PG_TRY();
	{
		for (i = 0; i < size; i++)
		{
			char gpqeid[100];
			char *options;

			/*
			 * Create the connection requests.	If we find a segment without a
			 * valid segdb we error out.  Also, if this segdb is invalid, we must
			 * fail the connection.
			 */
			segdbDesc = &newGangDefinition->db_descriptors[i];

			/*
			 * Build the connection string.	 Writer-ness needs to be processed
			 * early enough now some locks are taken before command line options
			 * are recognized.
			 */
			build_gpqeid_param(gpqeid, sizeof(gpqeid),
							   segdbDesc->segindex,
							   type == GANGTYPE_PRIMARY_WRITER,
							   gang_id);

			options = makeOptions();

			/* start connection in asynchronous way */
			cdbconn_doConnectStart(segdbDesc, gpqeid, options);

			if(cdbconn_isBadConnection(segdbDesc))
				ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
										errmsg("failed to acquire resources on one or more segments"),
										errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));

			connStatusDone[i] = false;
			/*
			 * If connection status is not CONNECTION_BAD after PQconnectStart(), we must
			 * act as if the PQconnectPoll() had returned PGRES_POLLING_WRITING
			 */
			pollingStatus[i] = PGRES_POLLING_WRITING;
		}

		/*
		 * Ok, we've now launched all the connection attempts. Start the
		 * timeout clock (= get the start timestamp), and poll until they're
		 * all completed or we reach timeout.
		 */
		gettimeofday(&startTS, NULL);
		fds = (struct pollfd *) palloc0(sizeof(struct pollfd) * size);

		for(;;)
		{
			int nready;
			int nfds = 0;

			poll_timeout = getPollTimeout(&startTS);

			/*
			 * Pass 1: act on the latest polling status of every pending
			 * connection and build the fd set for the ones still in flight.
			 */
			for (i = 0; i < size; i++)
			{
				segdbDesc = &newGangDefinition->db_descriptors[i];

				/* Skip established connections and in-recovery-mode connections*/
				if (connStatusDone[i])
					continue;

				switch (pollingStatus[i])
				{
					case PGRES_POLLING_OK:
						cdbconn_doConnectComplete(segdbDesc);
						if (segdbDesc->motionListener == -1 || segdbDesc->motionListener == 0)
							ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
									errmsg("failed to acquire resources on one or more segments"),
									errdetail("Internal error: No motion listener port (%s)", segdbDesc->whoami)));
						successful_connections++;
						connStatusDone[i] = true;
						/* done with this one; also skips the timeout check */
						continue;

					case PGRES_POLLING_READING:
						fds[nfds].fd = PQsocket(segdbDesc->conn);
						fds[nfds].events = POLLIN;
						nfds++;
						break;

					case PGRES_POLLING_WRITING:
						fds[nfds].fd = PQsocket(segdbDesc->conn);
						fds[nfds].events = POLLOUT;
						nfds++;
						break;

					case PGRES_POLLING_FAILED:
						if (segment_failure_due_to_recovery(&segdbDesc->conn->errorMessage))
						{
							in_recovery_mode_count++;
							connStatusDone[i] = true;
							elog(LOG, "segment is in recovery mode (%s)", segdbDesc->whoami);
						}
						else
						{
							ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
											errmsg("failed to acquire resources on one or more segments"),
											errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));
						}
						break;

					default:
							ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
											errmsg("failed to acquire resources on one or more segments"),
											errdetail("unknow pollstatus (%s)", segdbDesc->whoami)));
						break;
				}

				if (poll_timeout == 0)
					ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
									errmsg("failed to acquire resources on one or more segments"),
									errdetail("timeout expired\n (%s)", segdbDesc->whoami)));
			}

			/* Every connection has reached a final state. */
			if (nfds == 0)
				break;

			CHECK_FOR_INTERRUPTS();

			/* Wait until something happens */
			nready = poll(fds, nfds, poll_timeout);

			if (nready < 0)
			{
				int	sock_errno = SOCK_ERRNO;
				if (sock_errno == EINTR)
					continue;

				ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
								errmsg("failed to acquire resources on one or more segments"),
								errdetail("poll() failed: errno = %d", sock_errno)));
			}
			else if (nready > 0)
			{
				int currentFdNumber = 0;

				/*
				 * Pass 2: walk the connections in the same order as pass 1
				 * so that fds[currentFdNumber] lines up with segment i.
				 * Connections marked done in pass 1 got no fd entry and are
				 * skipped here too.
				 */
				for (i = 0; i < size; i++)
				{
					segdbDesc = &newGangDefinition->db_descriptors[i];
					if (connStatusDone[i])
						continue;

					Assert(PQsocket(segdbDesc->conn) > 0);
					Assert(PQsocket(segdbDesc->conn) == fds[currentFdNumber].fd);

					/*
					 * poll() may report POLLERR/POLLHUP/POLLNVAL even though
					 * they were not requested.  Treat them as "ready" as
					 * well, so PQconnectPoll() gets a chance to notice the
					 * failure; otherwise such a socket would make poll()
					 * return immediately over and over until the timeout.
					 */
					if ((fds[currentFdNumber].revents & fds[currentFdNumber].events) ||
						(fds[currentFdNumber].revents & (POLLERR | POLLHUP | POLLNVAL)))
						pollingStatus[i] = PQconnectPoll(segdbDesc->conn);

					currentFdNumber++;
				}
			}
		}

		ELOG_DISPATCHER_DEBUG("createGang: %d processes requested; %d successful connections %d in recovery",
				size, successful_connections, in_recovery_mode_count);

		MemoryContextSwitchTo(GangContext);

		/* some segments are in recovery mode*/
		if (successful_connections != size)
		{
			Assert(successful_connections + in_recovery_mode_count == size);

			/* FTS shows some segment DBs are down */
			if (isFTSEnabled() &&
				FtsTestSegmentDBIsDown(newGangDefinition->db_descriptors, size))
				ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
								errmsg("failed to acquire resources on one or more segments"),
								errdetail("FTS detected one or more segments are down")));

			/* Give up unless retries are enabled, not exhausted, and this is the writer gang. */
			if ( gp_gang_creation_retry_count <= 0 ||
				create_gang_retry_counter++ >= gp_gang_creation_retry_count ||
				type != GANGTYPE_PRIMARY_WRITER)
				ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
								errmsg("failed to acquire resources on one or more segments"),
								errdetail("segments is in recovery mode")));

			ELOG_DISPATCHER_DEBUG("createGang: gang creation failed, but retryable.");

			DisconnectAndDestroyGang(newGangDefinition);
			newGangDefinition = NULL;
			retry = true;
		}
	}
	PG_CATCH();
	{
		/* Clean up the half-built gang before propagating the error. */
		MemoryContextSwitchTo(GangContext);
		DisconnectAndDestroyGang(newGangDefinition);
		newGangDefinition = NULL;

		if (type == GANGTYPE_PRIMARY_WRITER)
		{
			DisconnectAndDestroyAllGangs(true);
			CheckForResetSession();
		}

		PG_RE_THROW();
	}
	PG_END_TRY();

	if (retry)
	{
		/* Sleep between attempts, staying responsive to interrupts. */
		CHECK_FOR_INTERRUPTS();
		pg_usleep(gp_gang_creation_retry_timer * 1000);
		CHECK_FOR_INTERRUPTS();

		goto create_gang_retry;
	}

	setLargestGangsize(size);
	return newGangDefinition;
}
/*
 * Reads the GP catalog tables (via getComponentDatabases()) and builds a
 * Gang structure with all the non-connection related fields initialized.
 *
 * type	   - gang type; selects which component databases populate the gang.
 * gang_id - id stored in the gang.
 * size	   - number of descriptors to allocate (1 or getgpsegmentCount()).
 * content - segment index used for GANGTYPE_SINGLETON_READER.
 *
 * Call this function in GangContext.  The gang and its descriptors are
 * allocated in a new "Per Gang Context" child of GangContext, which is
 * recorded in the gang; GangContext is current again on return.
 *
 * Returns a not-null pointer.	Raises ERROR when the number of active
 * primaries does not equal 'size' for a primary reader/writer gang.
 */
Gang *
buildGangDefinition(GangType type, int gang_id, int size, int content)
{
	Gang *newGangDefinition = NULL;
	CdbComponentDatabaseInfo *cdbinfo = NULL;
	CdbComponentDatabaseInfo *cdbInfoCopy = NULL;
	SegmentDatabaseDescriptor *segdbDesc = NULL;
	MemoryContext perGangContext = NULL;

	int segCount = 0;
	int i = 0;

	ELOG_DISPATCHER_DEBUG("buildGangDefinition:Starting %d qExec processes for %s gang",
			size, gangTypeToString(type));

	Assert(CurrentMemoryContext == GangContext);
	Assert(size == 1 || size == getgpsegmentCount());

	/* read gp_segment_configuration and build CdbComponentDatabases */
	cdb_component_dbs = getComponentDatabases();

	if (cdb_component_dbs == NULL ||
		cdb_component_dbs->total_segments <= 0 ||
		cdb_component_dbs->total_segment_dbs <= 0)
		insist_log(false, "schema not populated while building segworker group");

	/* if mirroring is not configured */
	if (cdb_component_dbs->total_segment_dbs == cdb_component_dbs->total_segments)
	{
		ELOG_DISPATCHER_DEBUG("building Gang: mirroring not configured");
		disableFTS();
	}

	perGangContext = AllocSetContextCreate(GangContext, "Per Gang Context",
					ALLOCSET_DEFAULT_MINSIZE,
					ALLOCSET_DEFAULT_INITSIZE,
					ALLOCSET_DEFAULT_MAXSIZE);
	Assert(perGangContext != NULL);
	MemoryContextSwitchTo(perGangContext);

	/* allocate a gang */
	newGangDefinition = (Gang *) palloc0(sizeof(Gang));
	newGangDefinition->type = type;
	newGangDefinition->size = size;
	newGangDefinition->gang_id = gang_id;
	newGangDefinition->allocated = false;
	newGangDefinition->noReuse = false;
	newGangDefinition->dispatcherActive = false;
	newGangDefinition->portal_name = NULL;
	newGangDefinition->perGangContext = perGangContext;
	newGangDefinition->db_descriptors =
			(SegmentDatabaseDescriptor *) palloc0(size * sizeof(SegmentDatabaseDescriptor));

	/* initialize db_descriptors */
	switch (type)
	{
	case GANGTYPE_ENTRYDB_READER:
		cdbinfo = &cdb_component_dbs->entry_db_info[0];
		cdbInfoCopy = copyCdbComponentDatabaseInfo(cdbinfo);
		segdbDesc = &newGangDefinition->db_descriptors[0];
		cdbconn_initSegmentDescriptor(segdbDesc, cdbInfoCopy);
		setQEIdentifier(segdbDesc, -1, perGangContext);
		break;

	case GANGTYPE_SINGLETON_READER:
		cdbinfo = findDatabaseInfoBySegIndex(cdb_component_dbs, content);
		cdbInfoCopy = copyCdbComponentDatabaseInfo(cdbinfo);
		segdbDesc = &newGangDefinition->db_descriptors[0];
		cdbconn_initSegmentDescriptor(segdbDesc, cdbInfoCopy);
		setQEIdentifier(segdbDesc, -1, perGangContext);
		break;

	case GANGTYPE_PRIMARY_READER:
	case GANGTYPE_PRIMARY_WRITER:
		/*
		 * We loop through the segment_db_info.	 Each item has a segindex.
		 * They are sorted by segindex, and there can be > 1 segment_db_info for
		 * a given segindex (currently, there can be 1 or 2)
		 */
		for (i = 0; i < cdb_component_dbs->total_segment_dbs; i++)
		{
			cdbinfo = &cdb_component_dbs->segment_db_info[i];
			if (!SEGMENT_IS_ACTIVE_PRIMARY(cdbinfo))
				continue;

			/*
			 * db_descriptors holds exactly 'size' entries.	 Never write
			 * past the end if the catalog lists more active primaries than
			 * expected; keep counting so that the mismatch check below
			 * reports the problem.
			 */
			if (segCount < size)
			{
				segdbDesc = &newGangDefinition->db_descriptors[segCount];
				cdbInfoCopy = copyCdbComponentDatabaseInfo(cdbinfo);
				cdbconn_initSegmentDescriptor(segdbDesc, cdbInfoCopy);
				setQEIdentifier(segdbDesc, -1, perGangContext);
			}
			segCount++;
		}

		if (size != segCount)
		{
			FtsReConfigureMPP(false);
			elog(ERROR, "Not all primary segment instances are active and connected");
		}
		break;

	default:
		Assert(false);
	}

	ELOG_DISPATCHER_DEBUG("buildGangDefinition done");
	MemoryContextSwitchTo(GangContext);
	return newGangDefinition;
}
/* * Create a writer gang. */ Gang * AllocateWriterGang() { Gang *writerGang = NULL; MemoryContext oldContext = NULL; int i = 0; ELOG_DISPATCHER_DEBUG("AllocateWriterGang begin."); if (Gp_role != GP_ROLE_DISPATCH) { elog(FATAL, "dispatch process called with role %d", Gp_role); } /* * First, we look for an unallocated but created gang of the right type * if it exists, we return it. * Else, we create a new gang */ if (primaryWriterGang == NULL) { int nsegdb = getgpsegmentCount(); insist_log(IsTransactionOrTransactionBlock(), "cannot allocate segworker group outside of transaction"); if (GangContext == NULL) { GangContext = AllocSetContextCreate(TopMemoryContext, "Gang Context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); } Assert(GangContext != NULL); oldContext = MemoryContextSwitchTo(GangContext); writerGang = createGang(GANGTYPE_PRIMARY_WRITER, PRIMARY_WRITER_GANG_ID, nsegdb, -1); writerGang->allocated = true; /* * set "whoami" for utility statement. * non-utility statement will overwrite it in function getCdbProcessList. */ for(i = 0; i < writerGang->size; i++) setQEIdentifier(&writerGang->db_descriptors[i], -1, writerGang->perGangContext); MemoryContextSwitchTo(oldContext); } else { ELOG_DISPATCHER_DEBUG("Reusing an existing primary writer gang"); writerGang = primaryWriterGang; } /* sanity check the gang */ if (!GangOK(writerGang)) elog(ERROR, "could not connect to segment: initialization of segworker group failed"); ELOG_DISPATCHER_DEBUG("AllocateWriterGang end."); primaryWriterGang = writerGang; return writerGang; }
/*
 * Hand out a reader gang for a portal, reusing an available gang when
 * getAvailableGang() finds one and creating a new one otherwise.
 *
 * type		   - GANGTYPE_ENTRYDB_READER, GANGTYPE_SINGLETON_READER or
 *				 GANGTYPE_PRIMARY_READER.
 * portal_name - name of the portal the gang is assigned to, or NULL; the
 *				 gang keeps its own copy.
 *
 * Must run on the dispatcher (FATAL otherwise) and inside a transaction or
 * transaction block.  All work happens with GangContext as the current
 * memory context (created under TopMemoryContext if missing); the caller's
 * context is restored before returning.
 *
 * The returned gang has passed GangOK() and has been handed to
 * addGangToAllocated().
 */
Gang *
AllocateReaderGang(GangType type, char *portal_name)
{
	Gang	   *gang = NULL;
	MemoryContext callerContext = NULL;
	int			gangSize = 0;
	int			contentId = 0;

	ELOG_DISPATCHER_DEBUG("AllocateReaderGang for portal %s: allocatedReaderGangsN %d, availableReaderGangsN %d, "
			"allocatedReaderGangs1 %d, availableReaderGangs1 %d",
			(portal_name ? portal_name : "<unnamed>"),
			list_length(allocatedReaderGangsN),
			list_length(availableReaderGangsN),
			list_length(allocatedReaderGangs1),
			list_length(availableReaderGangs1));

	if (Gp_role != GP_ROLE_DISPATCH)
		elog(FATAL, "dispatch process called with role %d", Gp_role);

	insist_log(IsTransactionOrTransactionBlock(),
			   "cannot allocate segworker group outside of transaction");

	/* Lazily create the context that owns all gangs. */
	if (GangContext == NULL)
		GangContext = AllocSetContextCreate(TopMemoryContext,
											"Gang Context",
											ALLOCSET_DEFAULT_MINSIZE,
											ALLOCSET_DEFAULT_INITSIZE,
											ALLOCSET_DEFAULT_MAXSIZE);
	Assert(GangContext != NULL);

	callerContext = MemoryContextSwitchTo(GangContext);

	/* Derive the gang size and target content id from the gang type. */
	if (type == GANGTYPE_ENTRYDB_READER)
	{
		contentId = -1;
		gangSize = 1;
	}
	else if (type == GANGTYPE_SINGLETON_READER)
	{
		contentId = gp_singleton_segindex;
		gangSize = 1;
	}
	else if (type == GANGTYPE_PRIMARY_READER)
	{
		contentId = 0;
		gangSize = getgpsegmentCount();
	}
	else
	{
		Assert(false);
	}

	/* Prefer an already created gang; build a new one only when none fits. */
	gang = getAvailableGang(type, gangSize, contentId);
	if (gang == NULL)
	{
		ELOG_DISPATCHER_DEBUG("Creating a new reader size %d gang for %s",
				gangSize, (portal_name ? portal_name : "unnamed portal"));

		gang = createGang(type, gang_id_counter++, gangSize, contentId);
		gang->allocated = true;
	}

	/* Release the name of whatever portal owned this gang before. */
	if (gang->portal_name)
		pfree(gang->portal_name);

	/* Remember which portal the gang now belongs to. */
	if (portal_name)
		gang->portal_name = pstrdup(portal_name);
	else
		gang->portal_name = (char *) NULL;

	/* sanity check the gang */
	insist_log(GangOK(gang), "could not connect to segment: initialization of segworker group failed");

	addGangToAllocated(gang);

	MemoryContextSwitchTo(callerContext);

	ELOG_DISPATCHER_DEBUG("on return: allocatedReaderGangs %d, availableReaderGangsN %d, "
			"allocatedReaderGangs1 %d, availableReaderGangs1 %d",
			list_length(allocatedReaderGangsN),
			list_length(availableReaderGangsN),
			list_length(allocatedReaderGangs1),
			list_length(availableReaderGangs1));

	return gang;
}
/*
 * Creates a new gang by logging on a session to each segDB involved, using
 * worker threads to open the libpq connections.
 *
 * type	   - gang type; only a GANGTYPE_PRIMARY_WRITER gang is retried when
 *			 the failed segments are in recovery mode.
 * gang_id - id passed to buildGangDefinition() and makeConnectParms().
 * size	   - number of segment connections (1 or getgpsegmentCount()).
 * content - passed through to buildGangDefinition().
 *
 * call this function in GangContext memory context.
 * elog ERROR or return a non-NULL gang.
 *
 * While a gang is being built it is published in CurrentGangCreating; the
 * pointer is cleared again on every exit path of this function.
 */
static Gang *
createGang_thread(GangType type, int gang_id, int size, int content)
{
	Gang *newGangDefinition = NULL;
	SegmentDatabaseDescriptor *segdbDesc = NULL;
	DoConnectParms *doConnectParmsAr = NULL;
	DoConnectParms *pParms = NULL;
	int parmIndex = 0;
	int threadCount = 0;
	int i = 0;
	int create_gang_retry_counter = 0;
	int in_recovery_mode_count = 0;
	int successful_connections = 0;

	/* accumulates the detail text reported if gang creation fails */
	PQExpBufferData create_gang_error;

	ELOG_DISPATCHER_DEBUG("createGang type = %d, gang_id = %d, size = %d, content = %d",
			type, gang_id, size, content);

	/* check arguments */
	Assert(size == 1 || size == getgpsegmentCount());
	Assert(CurrentResourceOwner != NULL);
	Assert(CurrentMemoryContext == GangContext);
	Assert(gp_connections_per_thread > 0);

	/* Writer gang is created before reader gangs. */
	if (type == GANGTYPE_PRIMARY_WRITER)
		Insist(!GangsExist());

	initPQExpBuffer(&create_gang_error);

	Assert(CurrentGangCreating == NULL);

create_gang_retry:
	/*
	 * If we're in a retry, we may need to reset our initial state a bit. We
	 * also want to ensure that all resources have been released.
	 */
	Assert(newGangDefinition == NULL);
	Assert(doConnectParmsAr == NULL);
	successful_connections = 0;
	in_recovery_mode_count = 0;
	threadCount = 0;

	/* allocate and initialize a gang structure */
	newGangDefinition = buildGangDefinition(type, gang_id, size, content);
	CurrentGangCreating = newGangDefinition;

	Assert(newGangDefinition != NULL);
	Assert(newGangDefinition->size == size);
	Assert(newGangDefinition->perGangContext != NULL);
	MemoryContextSwitchTo(newGangDefinition->perGangContext);

	/* start every attempt with an empty error buffer */
	resetPQExpBuffer(&create_gang_error);

	/*
	 * The most threads we could have is segdb_count /
	 * gp_connections_per_thread, rounded up. This is equivalent to 1 +
	 * (segdb_count-1) / gp_connections_per_thread. We allocate enough memory
	 * for this many DoConnectParms structures, even though we may not use
	 * them all.
	 */
	threadCount = 1 + (size - 1) / gp_connections_per_thread;
	Assert(threadCount > 0);

	/* initialize connect parameters */
	doConnectParmsAr = makeConnectParms(threadCount, type, gang_id);

	/* hand each thread a consecutive run of gp_connections_per_thread descriptors */
	for (i = 0; i < size; i++)
	{
		parmIndex = i / gp_connections_per_thread;
		pParms = &doConnectParmsAr[parmIndex];
		segdbDesc = &newGangDefinition->db_descriptors[i];
		pParms->segdbDescPtrArray[pParms->db_count++] = segdbDesc;
	}

	/* start threads and doing the connect */
	for (i = 0; i < threadCount; i++)
	{
		int pthread_err;
		pParms = &doConnectParmsAr[i];

		ELOG_DISPATCHER_DEBUG("createGang creating thread %d of %d for libpq connections",
				i + 1, threadCount);

		pthread_err = gp_pthread_create(&pParms->thread, thread_DoConnect, pParms, "createGang");
		if (pthread_err != 0)
		{
			int j;

			/*
			 * Error during thread create (this should be caused by resource
			 * constraints). If we leave the threads running, they'll
			 * immediately have some problems -- so we need to join them, and
			 * *then* we can issue our FATAL error
			 */
			for (j = 0; j < i; j++)
			{
				pthread_join(doConnectParmsAr[j].thread, NULL);
			}

			ereport(FATAL, (errcode(ERRCODE_INTERNAL_ERROR),
							errmsg("failed to create thread %d of %d", i + 1, threadCount),
							errdetail("pthread_create() failed with err %d", pthread_err)));
		}
	}

	/*
	 * wait for all of the DoConnect threads to complete.
	 */
	for (i = 0; i < threadCount; i++)
	{
		ELOG_DISPATCHER_DEBUG("joining to thread %d of %d for libpq connections",
				i + 1, threadCount);

		if (0 != pthread_join(doConnectParmsAr[i].thread, NULL))
		{
			elog(FATAL, "could not create segworker group");
		}
	}

	/*
	 * Free the memory allocated for the threadParms array
	 */
	destroyConnectParms(doConnectParmsAr, threadCount);
	doConnectParmsAr = NULL;

	SIMPLE_FAULT_INJECTOR(GangCreated);

	/* find out the successful connections and the failed ones */
	checkConnectionStatus(newGangDefinition, &in_recovery_mode_count,
			&successful_connections, &create_gang_error);

	ELOG_DISPATCHER_DEBUG("createGang: %d processes requested; %d successful connections %d in recovery",
			size, successful_connections, in_recovery_mode_count);

	MemoryContextSwitchTo(GangContext);

	/* success: every requested connection was established */
	if (size == successful_connections)
	{
		setLargestGangsize(size);
		termPQExpBuffer(&create_gang_error);
		CurrentGangCreating = NULL;

		return newGangDefinition;
	}

	/* there are failed connections */

	/* FTS shows some segment DBs are down, destroy all gangs. */
	if (isFTSEnabled() &&
		FtsTestSegmentDBIsDown(newGangDefinition->db_descriptors, size))
	{
		appendPQExpBuffer(&create_gang_error, "FTS detected one or more segments are down\n");
		goto exit;
	}

	/* failure due to recovery: every failed connection was "in recovery mode" */
	if (successful_connections + in_recovery_mode_count == size)
	{
		/* retry only if enabled, not exhausted, and this is the writer gang */
		if (gp_gang_creation_retry_count &&
			create_gang_retry_counter++ < gp_gang_creation_retry_count &&
			type == GANGTYPE_PRIMARY_WRITER)
		{
			/*
			 * Retry for non-writer gangs is meaningless because writer gang
			 * must be gone when QE is in recovery mode
			 */
			DisconnectAndDestroyGang(newGangDefinition);
			newGangDefinition = NULL;
			CurrentGangCreating = NULL;

			ELOG_DISPATCHER_DEBUG("createGang: gang creation failed, but retryable.");

			/* sleep between attempts, staying responsive to interrupts */
			CHECK_FOR_INTERRUPTS();
			pg_usleep(gp_gang_creation_retry_timer * 1000);
			CHECK_FOR_INTERRUPTS();

			goto create_gang_retry;
		}

		appendPQExpBuffer(&create_gang_error, "segment(s) are in recovery mode\n");
	}

	/*
	 * Failure exit: tear down the half-built gang (and, for a writer gang,
	 * all gangs), then report the accumulated error text.
	 *
	 * NOTE(review): create_gang_error is released with termPQExpBuffer()
	 * only on the success path above; on this path the buffer is still in
	 * use by errdetail() and is never terminated -- confirm whether that
	 * leaks the buffer's storage.
	 */
exit:
	if (newGangDefinition != NULL)
		DisconnectAndDestroyGang(newGangDefinition);

	if (type == GANGTYPE_PRIMARY_WRITER)
	{
		DisconnectAndDestroyAllGangs(true);
		CheckForResetSession();
	}

	CurrentGangCreating = NULL;

	ereport(ERROR,
			(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
			errmsg("failed to acquire resources on one or more segments"),
			errdetail("%s", create_gang_error.data)));
	return NULL;
}
/*
 * Drain whatever input is currently available from one QE and record it
 * in the dispatch result.
 *
 * Returns true when nothing more is expected on this connection for now:
 * the command finished (PQgetResult() returned NULL), a COPY IN/OUT result
 * arrived, reading failed, or the connection was found to be bad.
 * Returns false when libpq is still busy, i.e. more data is expected and
 * the socket must keep being monitored.
 *
 * Every PGresult obtained is appended to the dispatch result.	For a
 * successful result its index is stored in okindex and any rejected-row
 * count is accumulated; for a failed result the error code derived from
 * its SQLSTATE (0 when there is no usable SQLSTATE) is passed to
 * cdbdisp_seterrcode() together with the result's index.
 */
static bool
processResults(CdbDispatchResult * dispatchResult)
{
	SegmentDatabaseDescriptor *qe = dispatchResult->segdbDesc;
	char	   *text;

	/* Pull pending bytes off the socket into libpq's buffer. */
	if (PQconsumeInput(qe->conn) == 0)
	{
		text = PQerrorMessage(qe->conn);
		cdbdisp_appendMessageNonThread(dispatchResult, LOG,
									   "Error on receive from %s: %s",
									   qe->whoami, text ? text : "unknown error");
		return true;
	}

	/* Consume complete messages for as long as PQgetResult() won't block. */
	for (;;)
	{
		PGresult   *res;
		ExecStatusType status;
		int			index;

		if (PQisBusy(qe->conn))
			break;

		/*
		 * PQisBusy() does some error handling of its own, which can kill
		 * the connection.	Check before trusting anything else, e.g. the
		 * value of cdbdisp_numPGresult().
		 */
		if (cdbconn_isBadConnection(qe))
		{
			text = PQerrorMessage(qe->conn);
			cdbdisp_appendMessageNonThread(dispatchResult, LOG,
										   "Connection lost when receiving from %s: %s",
										   qe->whoami, text ? text : "unknown error");
			return true;
		}

		ELOG_DISPATCHER_DEBUG("PQgetResult");
		res = PQgetResult(qe->conn);

		/*
		 * NULL marks the end of the command.  An asynchronous command must
		 * always be drained until this point; otherwise the next command
		 * sent on the connection fails because one is still pending.
		 */
		if (res == NULL)
		{
			ELOG_DISPATCHER_DEBUG("%s -> idle", qe->whoami);
			return true;
		}

		/* Keep the PGresult; remember the slot it lands in. */
		index = cdbdisp_numPGresult(dispatchResult);
		cdbdisp_appendResult(dispatchResult, res);

		status = PQresultStatus(res);
		switch (status)
		{
			case PGRES_COMMAND_OK:
			case PGRES_TUPLES_OK:
			case PGRES_COPY_IN:
			case PGRES_COPY_OUT:
			case PGRES_EMPTY_QUERY:
				ELOG_DISPATCHER_DEBUG("%s -> ok %s",
									  qe->whoami,
									  PQcmdStatus(res) ? PQcmdStatus(res) : "(no cmdStatus)");

				if (status == PGRES_EMPTY_QUERY)
					ELOG_DISPATCHER_DEBUG("QE received empty query.");

				/*
				 * Index of the latest successful PGresult; usable with
				 * cdbdisp_getPGresult() to fetch tuple counts and such.
				 */
				dispatchResult->okindex = index;

				/* SREH: accumulate rows the QE rejected, if any. */
				if (res->numRejected > 0)
					dispatchResult->numrowsrejected += res->numRejected;

				if (status == PGRES_COPY_IN || status == PGRES_COPY_OUT)
					return true;
				break;

			default:
				{
					/* The QE reported an error. */
					char	   *state = PQresultErrorField(res, PG_DIAG_SQLSTATE);
					int			qeErrcode = 0;

					text = PQresultErrorMessage(res);

					ELOG_DISPATCHER_DEBUG("%s -> %s %s %s",
										  qe->whoami,
										  PQresStatus(status),
										  state ? state : "(no SQLSTATE)",
										  text);

					/* Translate a well-formed SQLSTATE into ERRCODE_xxx. */
					if (state && strlen(state) == 5)
						qeErrcode = sqlstate_to_errcode(state);

					/* Record the error code together with its result slot. */
					cdbdisp_seterrcode(qeErrcode, index, dispatchResult);
				}
				break;
		}
	}

	/* libpq is busy again: keep monitoring this socket. */
	return false;
}
/*
 * After a successful poll(), read results from every QE whose socket has
 * input available.
 *
 * pParms - dispatch state holding the per-QE results.
 * fds	  - poll() array; its entries are consumed in order, one per result
 *			that is still running (the Asserts check that each entry's fd
 *			matches the connection's socket).
 *
 * A QE for which processResults() returns true is marked as no longer
 * running.
 */
static void
handlePollSuccess(CdbDispatchCmdAsync* pParms,
				  struct pollfd *fds)
{
	int			fdIdx = 0;
	int			n;

	for (n = 0; n < pParms->dispatchCount; n++)
	{
		CdbDispatchResult *result = pParms->dispatchResultPtrArray[n];
		SegmentDatabaseDescriptor *qe = result->segdbDesc;
		struct pollfd *entry;
		int			sock;

		/* Finished or never-dispatched entries have no fd slot. */
		if (!result->stillRunning)
			continue;

		ELOG_DISPATCHER_DEBUG("looking for results from %d of %d (%s)",
							  n + 1, pParms->dispatchCount, qe->whoami);

		sock = PQsocket(qe->conn);
		Assert(sock >= 0);
		Assert(sock == fds[fdIdx].fd);

		/* Take this QE's slot, then move on to the next slot. */
		entry = &fds[fdIdx];
		fdIdx++;

		/* No input waiting on this connection. */
		if ((entry->revents & POLLIN) == 0)
			continue;

		ELOG_DISPATCHER_DEBUG("PQsocket says there are results from %d of %d (%s)",
							  n + 1, pParms->dispatchCount, qe->whoami);

		/* Read and record whatever this QE has sent. */
		if (!processResults(result))
		{
			ELOG_DISPATCHER_DEBUG("processResults says we have more to do with %d of %d (%s)",
								  n + 1, pParms->dispatchCount, qe->whoami);
			continue;
		}

		/* This QE is done. */
		result->stillRunning = false;

		ELOG_DISPATCHER_DEBUG("processResults says we are finished with %d of %d (%s)",
							  n + 1, pParms->dispatchCount, qe->whoami);

		if (DEBUG1 >= log_min_messages)
		{
			char		msec_str[32];
			int			durationKind = check_log_duration(msec_str, false);

			if (durationKind == 1 || durationKind == 2)
				elog(LOG, "duration to dispatch result received from %d (seg %d): %s ms",
					 n + 1, result->segdbDesc->segindex, msec_str);
		}

		if (PQisBusy(result->segdbDesc->conn))
			elog(LOG, "We thought we were done, because finished==true, but libpq says we are still busy");
	}
}