예제 #1
0
/*
 * createGang_async
 *	  Creates a new gang by logging on a session to each segDB involved.
 *
 * All connection attempts are started first and are then driven to
 * completion together from this single thread, using poll() on the
 * connection sockets and PQconnectPoll() to advance each attempt.
 *
 * Parameters:
 *	  type     - kind of gang to create.  A primary writer gang may only be
 *				 created while no other gang exists; any other type requires
 *				 the primary writer gang to be alive.
 *	  gang_id  - passed to buildGangDefinition() and into each segment's
 *				 gpqeid connection parameter.
 *	  size     - number of segDB connections; must be 1 or
 *				 getgpsegmentCount().
 *	  content  - passed through to buildGangDefinition().
 *
 * Call this function in the GangContext memory context.
 *
 * Returns a non-NULL gang with every connection established, or raises
 * ERROR.  On ERROR the half-built gang is destroyed; for a primary writer
 * gang all gangs are destroyed and CheckForResetSession() is called before
 * the error is re-thrown.
 *
 * If the only failures are segments in recovery mode, creation of a primary
 * writer gang is retried up to gp_gang_creation_retry_count times, sleeping
 * gp_gang_creation_retry_timer * 1000 (as passed to pg_usleep) between
 * attempts.
 */
static Gang*
createGang_async(GangType type, int gang_id, int size, int content)
{
	Gang *newGangDefinition;
	SegmentDatabaseDescriptor *segdbDesc = NULL;
	int i = 0;
	int create_gang_retry_counter = 0;
	int in_recovery_mode_count = 0;
	int successful_connections = 0;
	bool retry = false;
	int poll_timeout = 0;
	struct timeval startTS;
	PostgresPollingStatusType *pollingStatus = NULL;
	/* true means connection status is confirmed, either established or in recovery mode */
	bool *connStatusDone = NULL;
	/* poll() descriptors, one slot per connection that is still in progress */
	struct pollfd *fds = NULL;

	ELOG_DISPATCHER_DEBUG("createGang type = %d, gang_id = %d, size = %d, content = %d",
			type, gang_id, size, content);

	/* check arguments */
	Assert(size == 1 || size == getgpsegmentCount());
	Assert(CurrentResourceOwner != NULL);
	Assert(CurrentMemoryContext == GangContext);
	/* Writer gang is created before reader gangs. */
	if (type == GANGTYPE_PRIMARY_WRITER)
		Insist(!GangsExist());

	/* Check the writer gang first: reader gangs are useless without it. */
	if (type != GANGTYPE_PRIMARY_WRITER && !isPrimaryWriterGangAlive())
		ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
						errmsg("failed to acquire resources on one or more segments"),
						errdetail("writer gang got broken before creating reader gangs")));

create_gang_retry:
	/* If we're in a retry, we need to reset our per-attempt state. */
	newGangDefinition = NULL;
	successful_connections = 0;
	in_recovery_mode_count = 0;
	retry = false;

	/* allocate and initialize a gang structure */
	newGangDefinition = buildGangDefinition(type, gang_id, size, content);

	Assert(newGangDefinition != NULL);
	Assert(newGangDefinition->size == size);
	Assert(newGangDefinition->perGangContext != NULL);
	MemoryContextSwitchTo(newGangDefinition->perGangContext);

	/* allocate memory within perGangContext and will be freed automatically when gang is destroyed */
	pollingStatus = palloc(sizeof(PostgresPollingStatusType) * size);
	connStatusDone = palloc(sizeof(bool) * size);

	PG_TRY();
	{
		for (i = 0; i < size; i++)
		{
			char gpqeid[100];
			char *options;

			/*
			 * Create the connection requests.	If we find a segment without a
			 * valid segdb we error out.  Also, if this segdb is invalid, we must
			 * fail the connection.
			 */
			segdbDesc = &newGangDefinition->db_descriptors[i];

			/*
			 * Build the connection string.  Writer-ness needs to be processed
			 * early enough now some locks are taken before command line options
			 * are recognized.
			 */
			build_gpqeid_param(gpqeid, sizeof(gpqeid),
							   segdbDesc->segindex,
							   type == GANGTYPE_PRIMARY_WRITER,
							   gang_id);

			options = makeOptions();

			/* start connection in asynchronous way */
			cdbconn_doConnectStart(segdbDesc, gpqeid, options);

			if(cdbconn_isBadConnection(segdbDesc))
				ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
										errmsg("failed to acquire resources on one or more segments"),
										errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));

			connStatusDone[i] = false;
			/*
			 * If connection status is not CONNECTION_BAD after PQconnectStart(), we must
			 * act as if the PQconnectPoll() had returned PGRES_POLLING_WRITING
			 */
			pollingStatus[i] = PGRES_POLLING_WRITING;
		}

		/*
		 * Ok, we've now launched all the connection attempts. Start the
		 * timeout clock (= get the start timestamp), and poll until they're
		 * all completed or we reach timeout.
		 */
		gettimeofday(&startTS, NULL);
		fds = (struct pollfd *) palloc0(sizeof(struct pollfd) * size);

		for(;;)
		{
			int nready;
			int nfds = 0;

			poll_timeout = getPollTimeout(&startTS);

			/*
			 * Pass 1: act on the latest polling status of every unfinished
			 * connection and rebuild the pollfd array.  After this pass every
			 * connection that is not marked done owns exactly one fds[] slot,
			 * in ascending segment order; pass 2 below relies on that.
			 */
			for (i = 0; i < size; i++)
			{
				segdbDesc = &newGangDefinition->db_descriptors[i];

				/* Skip established connections and in-recovery-mode connections*/
				if (connStatusDone[i])
					continue;

				switch (pollingStatus[i])
				{
					case PGRES_POLLING_OK:
						cdbconn_doConnectComplete(segdbDesc);
						if (segdbDesc->motionListener == -1 || segdbDesc->motionListener == 0)
							ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
									errmsg("failed to acquire resources on one or more segments"),
									errdetail("Internal error: No motion listener port (%s)", segdbDesc->whoami)));
						successful_connections++;
						connStatusDone[i] = true;
						continue;

					case PGRES_POLLING_READING:
						fds[nfds].fd = PQsocket(segdbDesc->conn);
						fds[nfds].events = POLLIN;
						nfds++;
						break;

					case PGRES_POLLING_WRITING:
						fds[nfds].fd = PQsocket(segdbDesc->conn);
						fds[nfds].events = POLLOUT;
						nfds++;
						break;

					case PGRES_POLLING_FAILED:
						if (segment_failure_due_to_recovery(&segdbDesc->conn->errorMessage))
						{
							/* Recoverable: remember it and decide about a retry later. */
							in_recovery_mode_count++;
							connStatusDone[i] = true;
							elog(LOG, "segment is in recovery mode (%s)", segdbDesc->whoami);
						}
						else
						{
							ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
											errmsg("failed to acquire resources on one or more segments"),
											errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));
						}
						break;

					default:
							ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
										errmsg("failed to acquire resources on one or more segments"),
										errdetail("unknow pollstatus (%s)", segdbDesc->whoami)));
						break;
				}

				/* A zero timeout from getPollTimeout() is treated as "deadline passed". */
				if (poll_timeout == 0)
						ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
										errmsg("failed to acquire resources on one or more segments"),
										errdetail("timeout expired\n (%s)", segdbDesc->whoami)));
			}

			/* Nothing left to wait for: every connection is confirmed. */
			if (nfds == 0)
				break;

			CHECK_FOR_INTERRUPTS();

			/* Wait until something happens */
			nready = poll(fds, nfds, poll_timeout);

			if (nready < 0)
			{
				/* Capture the error code before any other call can overwrite it. */
				int	sock_errno = SOCK_ERRNO;
				if (sock_errno == EINTR)
					continue;

				ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
								errmsg("failed to acquire resources on one or more segments"),
								errdetail("poll() failed: errno = %d", sock_errno)));
			}
			else if (nready > 0)
			{
				/*
				 * Pass 2: walk the unfinished connections in the same order as
				 * pass 1 so that currentFdNumber tracks each one's fds[] slot.
				 */
				int currentFdNumber = 0;
				for (i = 0; i < size; i++)
				{
					short wakeup_mask;

					segdbDesc = &newGangDefinition->db_descriptors[i];
					if (connStatusDone[i])
						continue;

					Assert(PQsocket(segdbDesc->conn) > 0);
					Assert(PQsocket(segdbDesc->conn) == fds[currentFdNumber].fd);

					/*
					 * poll() may report POLLERR, POLLHUP or POLLNVAL in
					 * revents even though they were not requested, and
					 * possibly without the requested POLLIN/POLLOUT bit.
					 * The connection must still be advanced in that case so
					 * that PQconnectPoll() can detect the failure; otherwise
					 * poll() would keep returning immediately and we would
					 * spin until the timeout, hiding the real error.
					 */
					wakeup_mask = fds[currentFdNumber].events | POLLERR | POLLHUP | POLLNVAL;

					if (fds[currentFdNumber].revents & wakeup_mask)
						pollingStatus[i] = PQconnectPoll(segdbDesc->conn);

					currentFdNumber++;

				}
			}
		}

		ELOG_DISPATCHER_DEBUG("createGang: %d processes requested; %d successful connections %d in recovery",
				size, successful_connections, in_recovery_mode_count);

		MemoryContextSwitchTo(GangContext);

		/* some segments are in recovery mode*/
		if (successful_connections != size)
		{
			Assert(successful_connections + in_recovery_mode_count == size);

			/* FTS shows some segment DBs are down */
			if (isFTSEnabled() &&
				FtsTestSegmentDBIsDown(newGangDefinition->db_descriptors, size))
				ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
								errmsg("failed to acquire resources on one or more segments"),
								errdetail("FTS detected one or more segments are down")));

			/*
			 * Give up when retries are disabled or exhausted, or when this is
			 * not the primary writer gang (only the writer gang is retried).
			 */
			if ( gp_gang_creation_retry_count <= 0 ||
				create_gang_retry_counter++ >= gp_gang_creation_retry_count ||
				type != GANGTYPE_PRIMARY_WRITER)
				ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
								errmsg("failed to acquire resources on one or more segments"),
								errdetail("segments is in recovery mode")));

			ELOG_DISPATCHER_DEBUG("createGang: gang creation failed, but retryable.");

			/* Throw away this attempt; a fresh gang is built after the sleep below. */
			DisconnectAndDestroyGang(newGangDefinition);
			newGangDefinition = NULL;
			retry = true;
		}
	}
	PG_CATCH();
	{
		/* Leave the per-gang context before destroying the gang that owns it. */
		MemoryContextSwitchTo(GangContext);
		DisconnectAndDestroyGang(newGangDefinition);
		newGangDefinition = NULL;

		if (type == GANGTYPE_PRIMARY_WRITER)
		{
			DisconnectAndDestroyAllGangs(true);
			CheckForResetSession();
		}

		PG_RE_THROW();
	}
	PG_END_TRY();

	if (retry)
	{
		/* Stay responsive to cancel/terminate requests around the sleep. */
		CHECK_FOR_INTERRUPTS();
		pg_usleep(gp_gang_creation_retry_timer * 1000);
		CHECK_FOR_INTERRUPTS();

		goto create_gang_retry;
	}

	setLargestGangsize(size);
	return newGangDefinition;
}
예제 #2
0
/*
 * createGang_thread
 *	  Build a new gang by opening a session on every segDB that belongs to
 *	  it, using worker threads to perform the libpq connects.
 *
 * Parameters:
 *	  type     - kind of gang to create.  A primary writer gang may only be
 *				 created while no other gang exists.
 *	  gang_id  - passed to buildGangDefinition() and makeConnectParms().
 *	  size     - number of segDB connections; must be 1 or
 *				 getgpsegmentCount().
 *	  content  - passed through to buildGangDefinition().
 *
 * Must be called with GangContext as the current memory context.
 *
 * Returns a non-NULL gang whose connections all succeeded, or raises ERROR
 * after tearing the gang down (and, for a primary writer gang, all gangs).
 * Failure to create or join a worker thread is reported as FATAL.
 *
 * When every failed connection is a segment in recovery mode, creation of a
 * primary writer gang is retried up to gp_gang_creation_retry_count times.
 */
static Gang *
createGang_thread(GangType type, int gang_id, int size, int content)
{
	Gang	   *gang = NULL;
	DoConnectParms *connectParms = NULL;
	int			nthreads = 0;
	int			idx = 0;
	int			retriesUsed = 0;
	int			nInRecovery = 0;
	int			nConnected = 0;

	/* Collects the failure details reported in the final ERROR. */
	PQExpBufferData errbuf;

	ELOG_DISPATCHER_DEBUG("createGang type = %d, gang_id = %d, size = %d, content = %d",
						  type, gang_id, size, content);

	/* Validate the arguments and the calling environment. */
	Assert(size == 1 || size == getgpsegmentCount());
	Assert(CurrentResourceOwner != NULL);
	Assert(CurrentMemoryContext == GangContext);
	Assert(gp_connections_per_thread > 0);

	/* The writer gang has to be the very first gang created. */
	if (type == GANGTYPE_PRIMARY_WRITER)
		Insist(!GangsExist());

	initPQExpBuffer(&errbuf);

	Assert(CurrentGangCreating == NULL);

retry_gang:

	/*
	 * Each attempt starts from a clean slate: the previous gang and connect
	 * parameters must already have been released, and the counters restart.
	 */
	Assert(gang == NULL);
	Assert(connectParms == NULL);
	nConnected = 0;
	nInRecovery = 0;
	nthreads = 0;

	/* Build the gang skeleton and publish it as the gang being created. */
	gang = buildGangDefinition(type, gang_id, size, content);
	CurrentGangCreating = gang;

	Assert(gang != NULL);
	Assert(gang->size == size);
	Assert(gang->perGangContext != NULL);
	MemoryContextSwitchTo(gang->perGangContext);

	resetPQExpBuffer(&errbuf);

	/*
	 * Each thread handles at most gp_connections_per_thread segdbs, so the
	 * number of threads is size / gp_connections_per_thread rounded up,
	 * written here as 1 + (size - 1) / gp_connections_per_thread.  That many
	 * DoConnectParms structures are requested from makeConnectParms().
	 */
	nthreads = 1 + (size - 1) / gp_connections_per_thread;
	Assert(nthreads > 0);

	/* Hand out the segdb descriptors to the per-thread parameter blocks. */
	connectParms = makeConnectParms(nthreads, type, gang_id);
	for (idx = 0; idx < size; idx++)
	{
		DoConnectParms *slot = &connectParms[idx / gp_connections_per_thread];

		slot->segdbDescPtrArray[slot->db_count] = &gang->db_descriptors[idx];
		slot->db_count++;
	}

	/* Launch one connect thread per parameter block. */
	for (idx = 0; idx < nthreads; idx++)
	{
		DoConnectParms *slot = &connectParms[idx];
		int			rc;

		ELOG_DISPATCHER_DEBUG("createGang creating thread %d of %d for libpq connections",
							  idx + 1, nthreads);

		rc = gp_pthread_create(&slot->thread, thread_DoConnect, slot, "createGang");
		if (rc == 0)
			continue;

		/*
		 * Thread creation failed (presumably from resource exhaustion).
		 * Threads that were already started must be joined before the
		 * FATAL error is raised, rather than being left running.
		 */
		{
			int			started;

			for (started = 0; started < idx; started++)
				pthread_join(connectParms[started].thread, NULL);
		}

		ereport(FATAL, (errcode(ERRCODE_INTERNAL_ERROR),
						errmsg("failed to create thread %d of %d", idx + 1, nthreads),
						errdetail("pthread_create() failed with err %d", rc)));
	}

	/* Block until every connect thread has finished. */
	for (idx = 0; idx < nthreads; idx++)
	{
		ELOG_DISPATCHER_DEBUG("joining to thread %d of %d for libpq connections",
							  idx + 1, nthreads);

		if (pthread_join(connectParms[idx].thread, NULL) != 0)
			elog(FATAL, "could not create segworker group");
	}

	/* The per-thread parameter blocks are no longer needed. */
	destroyConnectParms(connectParms, nthreads);
	connectParms = NULL;

	SIMPLE_FAULT_INJECTOR(GangCreated);

	/* Tally connected vs. in-recovery segments and gather error text. */
	checkConnectionStatus(gang, &nInRecovery, &nConnected, &errbuf);

	ELOG_DISPATCHER_DEBUG("createGang: %d processes requested; %d successful connections %d in recovery",
						  size, nConnected, nInRecovery);

	MemoryContextSwitchTo(GangContext);

	/* Happy path: every segment connected. */
	if (size == nConnected)
	{
		setLargestGangsize(size);
		termPQExpBuffer(&errbuf);
		CurrentGangCreating = NULL;

		return gang;
	}

	/* From here on at least one connection failed. */

	/* If FTS reports a segment DB as down, skip any retry and fail now. */
	if (isFTSEnabled() &&
		FtsTestSegmentDBIsDown(gang->db_descriptors, size))
	{
		appendPQExpBuffer(&errbuf, "FTS detected one or more segments are down\n");
		goto fail;
	}

	/* Every failure was a segment in recovery mode. */
	if (nConnected + nInRecovery == size)
	{
		if (gp_gang_creation_retry_count &&
			retriesUsed++ < gp_gang_creation_retry_count &&
			type == GANGTYPE_PRIMARY_WRITER)
		{
			/*
			 * Only the writer gang is retried: retrying a non-writer gang is
			 * meaningless because the writer gang must be gone when a QE is
			 * in recovery mode.
			 */
			DisconnectAndDestroyGang(gang);
			gang = NULL;
			CurrentGangCreating = NULL;

			ELOG_DISPATCHER_DEBUG("createGang: gang creation failed, but retryable.");

			/* Stay responsive to interrupts around the back-off sleep. */
			CHECK_FOR_INTERRUPTS();
			pg_usleep(gp_gang_creation_retry_timer * 1000);
			CHECK_FOR_INTERRUPTS();

			goto retry_gang;
		}

		appendPQExpBuffer(&errbuf, "segment(s) are in recovery mode\n");
	}

fail:
	/* Tear down whatever was built before reporting the failure. */
	if (gang != NULL)
		DisconnectAndDestroyGang(gang);

	if (type == GANGTYPE_PRIMARY_WRITER)
	{
		DisconnectAndDestroyAllGangs(true);
		CheckForResetSession();
	}

	CurrentGangCreating = NULL;

	/*
	 * NOTE(review): errbuf is released with termPQExpBuffer() only on the
	 * success path above; confirm whether leaving it unreleased when this
	 * ERROR exits the function is intended.
	 */
	ereport(ERROR,
			(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
			 errmsg("failed to acquire resources on one or more segments"),
			 errdetail("%s", errbuf.data)));
	return NULL;
}