/*
 * Send one command to a single QE over its libpq connection.
 *
 * dispatchResult identifies the target QE (via dispatchResult->segdbDesc)
 * and receives the bookkeeping flags; query_text / query_text_len describe
 * the buffer handed to PQsendGpQuery_shared().
 *
 * Raises ERROR (after clearing stillRunning) when the connection is already
 * bad or when libpq refuses the command.  On success the result is flagged
 * as both still running and dispatched.
 */
static void
dispatchCommand(CdbDispatchResult * dispatchResult,
				const char *query_text,
				int query_text_len)
{
	SegmentDatabaseDescriptor *qe = dispatchResult->segdbDesc;
	TimestampTz sendStart = 0;

	/* Only pay for the timestamp when the timing message could be logged. */
	if (DEBUG1 >= log_min_messages)
		sendStart = GetCurrentTimestamp();

	/* A busy connection is unexpected here; note it but carry on. */
	if (PQisBusy(qe->conn))
		elog(LOG, "Trying to send to busy connection %s: asyncStatus %d",
			 qe->whoami, qe->conn->asyncStatus);

	if (cdbconn_isBadConnection(qe))
	{
		char	   *libpqError = PQerrorMessage(qe->conn);

		dispatchResult->stillRunning = false;
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
				 errmsg("Connection lost before dispatch to segment %s: %s",
						qe->whoami,
						libpqError ? libpqError : "unknown error")));
	}

	/*
	 * Hand the command to libpq without waiting for a reply.
	 */
	if (PQsendGpQuery_shared(qe->conn, (char *) query_text, query_text_len) == 0)
	{
		char	   *libpqError = PQerrorMessage(qe->conn);

		dispatchResult->stillRunning = false;
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
				 errmsg("Command could not be dispatch to segment %s: %s",
						qe->whoami,
						libpqError ? libpqError : "unknown error")));
	}

	if (DEBUG1 >= log_min_messages)
	{
		long		elapsedSecs;
		int			elapsedUsecs;

		TimestampDifference(sendStart, GetCurrentTimestamp(),
							&elapsedSecs, &elapsedUsecs);

		/* Report only sends that took longer than one millisecond. */
		if (elapsedSecs != 0 || elapsedUsecs > 1000)
			elog(LOG, "time for PQsendGpQuery_shared %ld.%06d",
				 elapsedSecs, elapsedUsecs);
	}

	/*
	 * From here on the QE is monitored so that a lost connection or any
	 * other libpq-level error gets noticed.
	 */
	dispatchResult->stillRunning = true;
	dispatchResult->hasDispatched = true;

	ELOG_DISPATCHER_DEBUG("Command dispatched to QE (%s)", qe->whoami);
}
/*
 * Dispatch the command stored in ds->dispatchParams to the members of a gang.
 *
 * For every segment in gp (or only the one matching dispDirect->content[0]
 * when direct dispatch is requested) a CdbDispatchResult is created,
 * recorded in the async parameter block, and the command is sent.
 *
 * Any failure is raised as an error for the caller's try-catch block to
 * handle.
 */
static void
cdbdisp_dispatchToGang_async(struct CdbDispatcherState *ds,
							 struct Gang *gp,
							 int sliceIndex,
							 CdbDispatchDirectDesc * dispDirect)
{
	CdbDispatchCmdAsync *asyncParms = (CdbDispatchCmdAsync *) ds->dispatchParams;
	int			segno;

	for (segno = 0; segno < gp->size; segno++)
	{
		SegmentDatabaseDescriptor *target = &gp->db_descriptors[segno];
		CdbDispatchResult *result;
		int			slot;

		Assert(target != NULL);

		if (dispDirect->directed_dispatch)
		{
			/* Direct dispatch addresses exactly one segment DB. */
			Assert(dispDirect->count == 1);

			if (target->segindex != dispDirect->content[0])
				continue;
		}

		/*
		 * Set up the result object that will track this QE.
		 */
		result = cdbdisp_makeResult(ds->primaryResults, target, sliceIndex);
		if (result == NULL)
		{
			/*
			 * writer_gang could be NULL if this is an extended query.
			 */
			if (ds->primaryResults->writer_gang)
				ds->primaryResults->writer_gang->dispatcherActive = true;

			elog(FATAL, "could not allocate resources for segworker communication");
		}

		/* Register the result before anything below can raise an error. */
		slot = pParmsSlotPlaceholder(asyncParms);
		asyncParms->dispatchResultPtrArray[slot] = result;

		if (cdbconn_isBadConnection(target))
		{
			char	   *libpqError = PQerrorMessage(result->segdbDesc->conn);

			result->stillRunning = false;
			ereport(ERROR,
					(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
					 errmsg("Connection lost before dispatch to %s: %s",
							target->whoami,
							libpqError ? libpqError : "unknown error")));
		}

		dispatchCommand(result, asyncParms->query_text, asyncParms->query_text_len);
	}
}
/*
 * Creates a new gang by logging on a session to each segDB involved.
 *
 * All connections are started asynchronously and then driven to completion
 * with poll() / PQconnectPoll() until every segment is either connected or
 * known to be in recovery mode, or the connection timeout expires.
 *
 * type, gang_id, size and content are passed through to
 * buildGangDefinition(); type additionally decides whether a retry is
 * allowed (writer gang only) and how much is torn down on error.
 *
 * Call this function in GangContext memory context.
 * elog ERROR or return a non-NULL gang.
 *
 * If some segments are in recovery mode and this is the writer gang, the
 * whole gang is destroyed and creation is retried up to
 * gp_gang_creation_retry_count times, pausing between attempts.
 *
 * On error the partially built gang is destroyed; for the writer gang all
 * gangs are destroyed and CheckForResetSession() is called before the error
 * is re-thrown.
 */
static Gang*
createGang_async(GangType type, int gang_id, int size, int content)
{
	Gang	   *newGangDefinition;
	SegmentDatabaseDescriptor *segdbDesc = NULL;
	int			i = 0;
	int			create_gang_retry_counter = 0;
	int			in_recovery_mode_count = 0;
	int			successful_connections = 0;
	bool		retry = false;
	int			poll_timeout = 0;
	struct timeval startTS;
	PostgresPollingStatusType *pollingStatus = NULL;
	/* true means connection status is confirmed, either established or in recovery mode */
	bool	   *connStatusDone = NULL;
	struct pollfd *fds;

	ELOG_DISPATCHER_DEBUG("createGang type = %d, gang_id = %d, size = %d, content = %d",
						  type, gang_id, size, content);

	/* check arguments */
	Assert(size == 1 || size == getgpsegmentCount());
	Assert(CurrentResourceOwner != NULL);
	Assert(CurrentMemoryContext == GangContext);

	/* Writer gang is created before reader gangs. */
	if (type == GANGTYPE_PRIMARY_WRITER)
		Insist(!GangsExist());

	/* Reader gangs require a live writer gang; check it first. */
	if (type != GANGTYPE_PRIMARY_WRITER && !isPrimaryWriterGangAlive())
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
				 errmsg("failed to acquire resources on one or more segments"),
				 errdetail("writer gang got broken before creating reader gangs")));

create_gang_retry:
	/* If we're in a retry, we may need to reset our initial state, a bit */
	newGangDefinition = NULL;
	successful_connections = 0;
	in_recovery_mode_count = 0;
	retry = false;

	/* allocate and initialize a gang structure */
	newGangDefinition = buildGangDefinition(type, gang_id, size, content);

	Assert(newGangDefinition != NULL);
	Assert(newGangDefinition->size == size);
	Assert(newGangDefinition->perGangContext != NULL);
	MemoryContextSwitchTo(newGangDefinition->perGangContext);

	/*
	 * Allocate memory within perGangContext; it will be freed automatically
	 * when the gang is destroyed.
	 */
	pollingStatus = palloc(sizeof(PostgresPollingStatusType) * size);
	connStatusDone = palloc(sizeof(bool) * size);

	PG_TRY();
	{
		for (i = 0; i < size; i++)
		{
			char		gpqeid[100];
			char	   *options;

			/*
			 * Create the connection requests.  If we find a segment without a
			 * valid segdb we error out.  Also, if this segdb is invalid, we must
			 * fail the connection.
			 */
			segdbDesc = &newGangDefinition->db_descriptors[i];

			/*
			 * Build the connection string.  Writer-ness needs to be processed
			 * early enough now some locks are taken before command line options
			 * are recognized.
			 */
			build_gpqeid_param(gpqeid, sizeof(gpqeid),
							   segdbDesc->segindex,
							   type == GANGTYPE_PRIMARY_WRITER,
							   gang_id);

			options = makeOptions();

			/* start connection in asynchronous way */
			cdbconn_doConnectStart(segdbDesc, gpqeid, options);

			if(cdbconn_isBadConnection(segdbDesc))
				ereport(ERROR,
						(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
						 errmsg("failed to acquire resources on one or more segments"),
						 errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));

			connStatusDone[i] = false;

			/*
			 * If connection status is not CONNECTION_BAD after PQconnectStart(), we must
			 * act as if the PQconnectPoll() had returned PGRES_POLLING_WRITING
			 */
			pollingStatus[i] = PGRES_POLLING_WRITING;
		}

		/*
		 * Ok, we've now launched all the connection attempts.  Start the
		 * timeout clock (= get the start timestamp), and poll until they're
		 * all completed or we reach timeout.
		 */
		gettimeofday(&startTS, NULL);
		fds = (struct pollfd *) palloc0(sizeof(struct pollfd) * size);

		for(;;)
		{
			int			nready;
			int			nfds = 0;

			poll_timeout = getPollTimeout(&startTS);

			/*
			 * Build the pollfd array for every connection that is not yet
			 * settled.  Entries are appended in segment order; the loop that
			 * consumes poll()'s results below relies on that same order.
			 */
			for (i = 0; i < size; i++)
			{
				segdbDesc = &newGangDefinition->db_descriptors[i];

				/* Skip established connections and in-recovery-mode connections*/
				if (connStatusDone[i])
					continue;

				switch (pollingStatus[i])
				{
					case PGRES_POLLING_OK:
						cdbconn_doConnectComplete(segdbDesc);
						if (segdbDesc->motionListener == -1 || segdbDesc->motionListener == 0)
							ereport(ERROR,
									(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
									 errmsg("failed to acquire resources on one or more segments"),
									 errdetail("Internal error: No motion listener port (%s)", segdbDesc->whoami)));
						successful_connections++;
						connStatusDone[i] = true;
						continue;

					case PGRES_POLLING_READING:
						fds[nfds].fd = PQsocket(segdbDesc->conn);
						fds[nfds].events = POLLIN;
						nfds++;
						break;

					case PGRES_POLLING_WRITING:
						fds[nfds].fd = PQsocket(segdbDesc->conn);
						fds[nfds].events = POLLOUT;
						nfds++;
						break;

					case PGRES_POLLING_FAILED:
						if (segment_failure_due_to_recovery(&segdbDesc->conn->errorMessage))
						{
							in_recovery_mode_count++;
							connStatusDone[i] = true;
							elog(LOG, "segment is in recovery mode (%s)", segdbDesc->whoami);
						}
						else
						{
							ereport(ERROR,
									(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
									 errmsg("failed to acquire resources on one or more segments"),
									 errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));
						}
						break;

					default:
						ereport(ERROR,
								(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
								 errmsg("failed to acquire resources on one or more segments"),
								 errdetail("unknow pollstatus (%s)", segdbDesc->whoami)));
						break;
				}

				/*
				 * Reached for connections that did not just complete
				 * successfully (the PGRES_POLLING_OK case continues above).
				 */
				if (poll_timeout == 0)
					ereport(ERROR,
							(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
							 errmsg("failed to acquire resources on one or more segments"),
							 errdetail("timeout expired\n (%s)", segdbDesc->whoami)));
			}

			/* Nothing left to wait for: every connection is settled. */
			if (nfds == 0)
				break;

			CHECK_FOR_INTERRUPTS();

			/* Wait until something happens */
			nready = poll(fds, nfds, poll_timeout);

			if (nready < 0)
			{
				int			sock_errno = SOCK_ERRNO;

				if (sock_errno == EINTR)
					continue;

				ereport(ERROR,
						(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
						 errmsg("failed to acquire resources on one or more segments"),
						 errdetail("poll() failed: errno = %d", sock_errno)));
			}
			else if (nready > 0)
			{
				int			currentFdNumber = 0;

				for (i = 0; i < size; i++)
				{
					segdbDesc = &newGangDefinition->db_descriptors[i];
					if (connStatusDone[i])
						continue;

					Assert(PQsocket(segdbDesc->conn) > 0);
					Assert(PQsocket(segdbDesc->conn) == fds[currentFdNumber].fd);

					/*
					 * Advance the connection when the requested event fired,
					 * and also when poll() flagged an error condition.
					 * POLLERR, POLLHUP and POLLNVAL are reported in revents
					 * whether or not they were requested; ignoring them would
					 * leave the connection state unchanged while poll() keeps
					 * returning immediately, spinning until the timeout
					 * instead of surfacing the real failure through
					 * PQconnectPoll().
					 */
					if ((fds[currentFdNumber].revents & fds[currentFdNumber].events) ||
						(fds[currentFdNumber].revents & (POLLERR | POLLHUP | POLLNVAL)))
						pollingStatus[i] = PQconnectPoll(segdbDesc->conn);

					currentFdNumber++;
				}
			}
		}

		ELOG_DISPATCHER_DEBUG("createGang: %d processes requested; %d successful connections %d in recovery",
							  size, successful_connections, in_recovery_mode_count);

		MemoryContextSwitchTo(GangContext);

		/* some segments are in recovery mode*/
		if (successful_connections != size)
		{
			Assert(successful_connections + in_recovery_mode_count == size);

			/* FTS shows some segment DBs are down */
			if (isFTSEnabled() &&
				FtsTestSegmentDBIsDown(newGangDefinition->db_descriptors, size))
				ereport(ERROR,
						(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
						 errmsg("failed to acquire resources on one or more segments"),
						 errdetail("FTS detected one or more segments are down")));

			/* Retrying is only allowed for the writer gang, and only a bounded number of times. */
			if ( gp_gang_creation_retry_count <= 0 ||
				create_gang_retry_counter++ >= gp_gang_creation_retry_count ||
				type != GANGTYPE_PRIMARY_WRITER)
				ereport(ERROR,
						(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
						 errmsg("failed to acquire resources on one or more segments"),
						 errdetail("segments is in recovery mode")));

			ELOG_DISPATCHER_DEBUG("createGang: gang creation failed, but retryable.");

			DisconnectAndDestroyGang(newGangDefinition);
			newGangDefinition = NULL;
			retry = true;
		}
	}
	PG_CATCH();
	{
		MemoryContextSwitchTo(GangContext);
		DisconnectAndDestroyGang(newGangDefinition);
		newGangDefinition = NULL;

		if (type == GANGTYPE_PRIMARY_WRITER)
		{
			DisconnectAndDestroyAllGangs(true);
			CheckForResetSession();
		}

		PG_RE_THROW();
	}
	PG_END_TRY();

	if (retry)
	{
		CHECK_FOR_INTERRUPTS();
		pg_usleep(gp_gang_creation_retry_timer * 1000);
		CHECK_FOR_INTERRUPTS();

		goto create_gang_retry;
	}

	setLargestGangsize(size);
	return newGangDefinition;
}
/*
 * Pull whatever input is available from one QE and file it away.
 *
 * Returns true when this QE needs no further monitoring for now: the
 * command finished, a COPY IN/OUT result arrived, or the connection failed
 * (in which case a message is appended to dispatchResult).
 * Returns false when libpq is still waiting for more data.
 */
static bool
processResults(CdbDispatchResult * dispatchResult)
{
	SegmentDatabaseDescriptor *qe = dispatchResult->segdbDesc;
	char	   *text;

	/*
	 * Read what the socket has for us.
	 */
	if (PQconsumeInput(qe->conn) == 0)
	{
		text = PQerrorMessage(qe->conn);
		cdbdisp_appendMessageNonThread(dispatchResult, LOG,
									   "Error on receive from %s: %s",
									   qe->whoami,
									   text ? text : "unknown error");
		return true;
	}

	/*
	 * Drain every complete message; PQgetResult() will not block while
	 * PQisBusy() reports false.
	 */
	for (;;)
	{
		PGresult   *res;
		ExecStatusType status;
		int			slot;

		if (PQisBusy(qe->conn))
			break;

		/*
		 * PQisBusy() does some error handling of its own, which can kill the
		 * connection.  Verify the connection before trusting anything else;
		 * otherwise cdbdisp_numPGresult() could hand back a bogus value.
		 */
		if (cdbconn_isBadConnection(qe))
		{
			text = PQerrorMessage(qe->conn);
			cdbdisp_appendMessageNonThread(dispatchResult, LOG,
										   "Connection lost when receiving from %s: %s",
										   qe->whoami,
										   text ? text : "unknown error");
			return true;
		}

		ELOG_DISPATCHER_DEBUG("PQgetResult");
		res = PQgetResult(qe->conn);

		/*
		 * A NULL result marks the end of the command.  Every connection that
		 * had an asynchronous command sent through it must be read until
		 * this point, or the next command sent on it fails because one is
		 * still pending.
		 */
		if (res == NULL)
		{
			ELOG_DISPATCHER_DEBUG("%s -> idle", qe->whoami);
			return true;
		}

		/*
		 * Remember where this PGresult lands, then attach it.
		 */
		slot = cdbdisp_numPGresult(dispatchResult);
		cdbdisp_appendResult(dispatchResult, res);

		status = PQresultStatus(res);
		switch (status)
		{
			case PGRES_COMMAND_OK:
			case PGRES_TUPLES_OK:
			case PGRES_COPY_IN:
			case PGRES_COPY_OUT:
			case PGRES_EMPTY_QUERY:
				ELOG_DISPATCHER_DEBUG("%s -> ok %s",
									  qe->whoami,
									  PQcmdStatus(res) ? PQcmdStatus(res) : "(no cmdStatus)");

				if (status == PGRES_EMPTY_QUERY)
					ELOG_DISPATCHER_DEBUG("QE received empty query.");

				/*
				 * Index of the most recent successful PGresult; usable with
				 * cdbdisp_getPGresult() to fetch tuple counts and the like.
				 */
				dispatchResult->okindex = slot;

				/*
				 * SREH - accumulate the number of rows the QE rejected.
				 */
				if (res->numRejected > 0)
					dispatchResult->numrowsrejected += res->numRejected;

				if (status == PGRES_COPY_IN || status == PGRES_COPY_OUT)
					return true;
				break;

			default:
				{
					/* The QE reported an error. */
					char	   *sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
					int			qeErrcode = 0;

					text = PQresultErrorMessage(res);

					ELOG_DISPATCHER_DEBUG("%s -> %s %s %s",
										  qe->whoami,
										  PQresStatus(status),
										  sqlstate ? sqlstate : "(no SQLSTATE)",
										  text);

					/*
					 * Translate a well-formed five-character SQLSTATE into an
					 * ERRCODE_xxx value; otherwise pass 0 along.
					 */
					if (sqlstate && strlen(sqlstate) == 5)
						qeErrcode = sqlstate_to_errcode(sqlstate);

					/*
					 * Record the error code together with the index of its
					 * PGresult entry.
					 */
					cdbdisp_seterrcode(qeErrcode, slot, dispatchResult);
				}
				break;
		}
	}

	return false;				/* keep monitoring this socket */
}
/*
 * Walk all dispatched QEs and signal "finish" or "cancel" to those that
 * still need it.
 */
static void
checkConnectionsForCancel(CdbDispatchCmdAsync* pParms)
{
	int			idx;

	for (idx = 0; idx < pParms->dispatchCount; idx++)
	{
		CdbDispatchResult *result = pParms->dispatchResultPtrArray[idx];
		SegmentDatabaseDescriptor *qe;
		CdbDispatchResults *allResults;
		DispatchWaitMode wanted;
		char		errbuf[256];
		bool		delivered;

		Assert(result != NULL);

		/* Nothing to do for a QE we are already done with. */
		if (!result->stillRunning)
			continue;

		qe = result->segdbDesc;
		allResults = result->meleeResults;

		/*
		 * Ask for a finish when the QD itself is already done; otherwise
		 * start from "no signal".
		 */
		if (pParms->waitMode == DISPATCH_WAIT_FINISH)
			wanted = DISPATCH_WAIT_FINISH;
		else
			wanted = DISPATCH_WAIT_NONE;

		/*
		 * Escalate to cancel when a user interrupt is pending, a cancel was
		 * requested, or another QE reported an error -- provided the caller
		 * asked for cancelOnError and this QE has not been canceled yet.
		 */
		if (InterruptPending ||
			pParms->waitMode == DISPATCH_WAIT_CANCEL ||
			allResults->errcode)
		{
			if (allResults->cancelOnError && !result->wasCanceled)
				wanted = DISPATCH_WAIT_CANCEL;
		}

		/* No action needed. */
		if (wanted == DISPATCH_WAIT_NONE)
			continue;

		/* This very signal was already delivered. */
		if (wanted == result->sentSignal)
			continue;

		/* A dead connection cannot be signalled. */
		if (cdbconn_isBadConnection(qe))
			continue;

		memset(errbuf, 0, sizeof(errbuf));
		delivered = cdbconn_signalQE(qe, errbuf, wanted == DISPATCH_WAIT_CANCEL);

		if (delivered)
		{
			result->sentSignal = wanted;
		}
		else if (Debug_cancel_print || gp_log_gang >= GPVARS_VERBOSITY_DEBUG)
		{
			/* An untouched buffer means no cancel object could be allocated. */
			elog(LOG, "Unable to cancel: %s",
				 errbuf[0] == '\0' ? "cannot allocate PGCancel" : errbuf);
		}
	}
}
/*
 * Receive and process results from all running QEs.
 *
 * ds:   dispatcher state; ds->dispatchParams holds the async parameter block
 *       listing every dispatched QE.
 * wait: true, wait until all dispatch works are completed.
 *       false, return immediately when there's no more data.
 *
 * Don't throw out error, instead, append the error message to
 * CdbDispatchResult.error_message.
 */
static void
checkDispatchResult(CdbDispatcherState *ds, bool wait)
{
	CdbDispatchCmdAsync *pParms = (CdbDispatchCmdAsync*)ds->dispatchParams;
	SegmentDatabaseDescriptor *segdbDesc;
	CdbDispatchResult *dispatchResult;
	int			i;
	int			db_count = 0;
	int			timeoutCounter = 0;
	struct pollfd *fds;

	db_count = pParms->dispatchCount;
	fds = (struct pollfd *) palloc(db_count * sizeof(struct pollfd));

	/*
	 * OK, we are finished submitting the command to the segdbs.
	 * Now, we have to wait for them to finish.
	 */
	for (;;)
	{
		int			sock;
		int			n;
		int			nfds = 0;

		/*
		 * bail-out if we are dying.
		 * Once QD dies, QE will recognize it shortly anyway.
		 */
		if (proc_exit_inprogress)
			break;

		/*
		 * Which QEs are still running and could send results to us?
		 */
		for (i = 0; i < db_count; i++)
		{
			dispatchResult = pParms->dispatchResultPtrArray[i];
			segdbDesc = dispatchResult->segdbDesc;

			/*
			 * Already finished with this QE?  Check this first so that a QE
			 * that is done is never re-examined: otherwise a broken
			 * connection would get the same "connection lost" message
			 * appended on every pass of the outer loop, and a QE that had
			 * finished cleanly could be given an error after the fact.
			 */
			if (!dispatchResult->stillRunning)
				continue;

			/*
			 * A running QE whose connection died: record it once and stop
			 * monitoring it.
			 */
			if (cdbconn_isBadConnection(segdbDesc))
			{
				char	   *msg = PQerrorMessage(segdbDesc->conn);

				dispatchResult->stillRunning = false;
				cdbdisp_appendMessageNonThread(dispatchResult, LOG,
											   "Connection lost during dispatch to %s: %s",
											   dispatchResult->segdbDesc->whoami,
											   msg ? msg : "unknown error");
				continue;
			}

			/*
			 * Add the socket to the poll set.  Entries are appended in
			 * dispatch order for still-running QEs only.
			 * NOTE(review): handlePollSuccess() presumably relies on that
			 * ordering to match entries back to QEs -- confirm.
			 */
			sock = PQsocket(segdbDesc->conn);
			Assert(sock >= 0);
			fds[nfds].fd = sock;
			fds[nfds].events = POLLIN;
			nfds++;
		}

		/*
		 * Break out when no QEs still running.
		 */
		if (nfds <= 0)
			break;

		/*
		 * Wait for results from QEs.  Block here until input is available
		 * (or do not block at all when the caller did not ask to wait).
		 */
		n = poll(fds, nfds, wait ? DISPATCH_WAIT_TIMEOUT_SEC * 1000 : 0);

		/* poll returns with an error, including one due to an interrupted call */
		if (n < 0)
		{
			int			sock_errno = SOCK_ERRNO;

			if (sock_errno == EINTR)
				continue;

			elog(LOG, "handlePollError poll() failed; errno=%d", sock_errno);

			checkConnectionsForCancel(pParms);
			finishupFailedConnections(pParms);
		}
		/* If the time limit expires, poll() returns 0 */
		else if (n == 0)
		{
			/* Non-waiting callers are done once no data is pending. */
			if (!wait)
				break;

			checkConnectionsForCancel(pParms);

			/* Only sweep for failed connections after a run of consecutive-ish timeouts. */
			if (timeoutCounter++ > 30)
			{
				finishupFailedConnections(pParms);
				timeoutCounter = 0;
			}
		}
		/* We have data waiting on one or more of the connections. */
		else
			handlePollSuccess(pParms, fds);
	}

	pfree(fds);
}