/*
 * PersistentFilespace_LookupMirrorDbid()
 *
 * Check the gp_persistent_filespace table to identify which dbid it contains
 * that does not match the primary dbid.  If there are no filespaces currently
 * defined, this check returns 0 even if there is an active mirror, because
 * the segment doesn't know any better.
 */
int16
PersistentFilespace_LookupMirrorDbid(int16 primaryDbid)
{
	HASH_SEQ_STATUS		status;
	FilespaceDirEntry   dirEntry;
	int16				mirrorDbid = 0;

	PersistentFilespace_VerifyInitScan();

	/* Start scan */
	hash_seq_init(&status, persistentFilespaceSharedHashTable);
	dirEntry = (FilespaceDirEntry) hash_seq_search(&status);
	if (dirEntry != NULL)
	{
		if (dirEntry->dbId1 == primaryDbid)
		{
			mirrorDbid = dirEntry->dbId2;
		}
		else if (dirEntry->dbId2 == primaryDbid)
		{
			mirrorDbid = dirEntry->dbId1;
		}
		else
		{
			elog(FATAL,
				 "dbid %d not found in gp_persistent_filespace_node",
				 (int) primaryDbid);
		}

		/* Terminate the scan early */
		hash_seq_term(&status);
	}

	return mirrorDbid;
}
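
/*
 * A minimal usage sketch (not part of the original source): a caller could
 * look up the mirror dbid for its own dbid and treat 0 as "no mirror known".
 * GpIdentity.dbid and the log messages are assumptions made for illustration.
 */
static void
ReportMirrorDbid(void)
{
	int16		mirrorDbid = PersistentFilespace_LookupMirrorDbid(GpIdentity.dbid);

	if (mirrorDbid == 0)
		elog(LOG, "no mirror dbid known for dbid %d", (int) GpIdentity.dbid);
	else
		elog(LOG, "mirror of dbid %d is dbid %d",
			 (int) GpIdentity.dbid, (int) mirrorDbid);
}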
Example #2
/*
 * smgrIsAppendOnlyMirrorResyncEofs() -- Returns true if there is Append-Only
 * Mirror Resync EOF work that needs to be done post-commit or post-abort.
 *
 * Note that the list does not include anything scheduled for termination
 * by upper-level transactions.
 */
bool
smgrIsAppendOnlyMirrorResyncEofs(EndXactRecKind endXactRecKind)
{
	int			nestLevel = GetCurrentTransactionNestLevel();
	HASH_SEQ_STATUS iterateStatus;
	AppendOnlyMirrorResyncEofs *entry;

	if (AppendOnlyMirrorResyncEofsTable == NULL)
	{
		return false;
	}

	hash_seq_init(&iterateStatus, AppendOnlyMirrorResyncEofsTable);

	while ((entry = hash_seq_search(&iterateStatus)) != NULL)
	{
		if (entry->key.nestLevel >= nestLevel)
		{
			/* Deregister seq scan and exit early. */
			hash_seq_term(&iterateStatus);
			return true;
		}
	}

	return false;
}
Example #3
/*
 * WorkerGetNodeWithName finds and returns a node from the membership list that
 * has the given hostname. The function returns NULL if no such node exists.
 */
WorkerNode *
WorkerGetNodeWithName(const char *hostname)
{
	WorkerNode *workerNode = NULL;

	HASH_SEQ_STATUS status;
	hash_seq_init(&status, WorkerNodesHash);

	workerNode = (WorkerNode *) hash_seq_search(&status);
	while (workerNode != NULL)
	{
		if (workerNode->inWorkerFile)
		{
			int nameCompare = strncmp(workerNode->workerName, hostname, WORKER_LENGTH);
			if (nameCompare == 0)
			{
				hash_seq_term(&status);
				break;
			}
		}

		workerNode = (WorkerNode *) hash_seq_search(&status);
	}

	return workerNode;
}
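
/*
 * A minimal usage sketch (not from the original source): look up a worker by
 * hostname and raise an error if it is unknown.  The wrapper name and the
 * error message are illustrative assumptions.
 */
static WorkerNode *
GetWorkerOrError(const char *hostname)
{
	WorkerNode *workerNode = WorkerGetNodeWithName(hostname);

	if (workerNode == NULL)
	{
		ereport(ERROR, (errmsg("could not find worker node \"%s\"", hostname)));
	}

	return workerNode;
}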
Example #4
/*
 * Release connection created by calling GetConnection.
 */
void
mysql_rel_connection(MYSQL *conn)
{
	HASH_SEQ_STATUS	scan;
	ConnCacheEntry *entry;

	if (ConnectionHash == NULL)
		return;

	hash_seq_init(&scan, ConnectionHash);
	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
	{
		if (entry->conn == NULL)
			continue;

		if (entry->conn == conn)
		{
			elog(DEBUG3, "disconnecting mysql_fdw connection %p", entry->conn);
			_mysql_close(entry->conn);
			entry->conn = NULL;
			hash_seq_term(&scan);
			break;
		}
	}
}
Example #5
/*
 * FindRandomNodeNotInList finds a random node from the shared hash that is not
 * a member of the current node list. The caller is responsible for making the
 * necessary node count checks to ensure that such a node exists.
 *
 * Note that this function has a selection bias towards nodes whose positions in
 * the shared hash are sequentially adjacent to the positions of nodes that are
 * in the current node list. This bias follows from our decision to first pick a
 * random node in the hash, and if that node is a member of the current list, to
 * simply iterate to the next node in the hash. Overall, this approach trades in
 * some selection bias for simplicity in design and for bounded execution time.
 */
static WorkerNode *
FindRandomNodeNotInList(HTAB *WorkerNodesHash, List *currentNodeList)
{
	WorkerNode *workerNode = NULL;
	HASH_SEQ_STATUS status;
	uint32 workerNodeCount = 0;
	uint32 currentNodeCount PG_USED_FOR_ASSERTS_ONLY = 0;
	bool lookForWorkerNode = true;
	uint32 workerPosition = 0;
	uint32 workerIndex = 0;

	workerNodeCount = hash_get_num_entries(WorkerNodesHash);
	currentNodeCount = list_length(currentNodeList);
	Assert(workerNodeCount > currentNodeCount);

	/*
	 * We determine a random position within the worker hash between [1, N],
	 * assuming that the number of elements in the hash is N. We then get to
	 * this random position by iterating over the worker hash. Please note that
	 * the random seed has already been set by the postmaster when starting up.
	 */
	workerPosition = (random() % workerNodeCount) + 1;
	hash_seq_init(&status, WorkerNodesHash);

	for (workerIndex = 0; workerIndex < workerPosition; workerIndex++)
	{
		workerNode = (WorkerNode *) hash_seq_search(&status);
	}

	while (lookForWorkerNode)
	{
		bool listMember = ListMember(currentNodeList, workerNode);

		if (workerNode->inWorkerFile && !listMember)
		{
			lookForWorkerNode = false;
		}
		else
		{
			/* iterate to the next worker node in the hash */
			workerNode = (WorkerNode *) hash_seq_search(&status);

			/* reached end of hash; start from the beginning */
			if (workerNode == NULL)
			{
				hash_seq_init(&status, WorkerNodesHash);
				workerNode = (WorkerNode *) hash_seq_search(&status);
			}
		}
	}

	/* we stopped scanning before completion; therefore clean up scan */
	hash_seq_term(&status);

	return workerNode;
}
Example #6
/*
 * Terminate the seq search of the DispatchedFilespaceDirHashTable.
 */
void
DispatchedFilespace_SeqSearch_Term(void)
{
	if (!DispatchedFileSpace_SeqSearch_Initialized)
	{
		return;
	}
	hash_seq_term(&DispatchedFileSpace_SeqSearch);
	DispatchedFileSpace_SeqSearch_Initialized = false;
}
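
/*
 * A sketch (not from the original source) of the matching initialization for
 * the guarded scan state terminated above.  The variable names are taken from
 * DispatchedFilespace_SeqSearch_Term(); the hash table name comes from its
 * comment; everything else is an assumption.
 */
static void
DispatchedFilespace_SeqSearch_Init(void)
{
	if (DispatchedFileSpace_SeqSearch_Initialized)
	{
		return;
	}
	hash_seq_init(&DispatchedFileSpace_SeqSearch, DispatchedFilespaceDirHashTable);
	DispatchedFileSpace_SeqSearch_Initialized = true;
}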
Example #7
/*
 * DynamicTableScanEndCurrentScan
 *		Cleans up any ongoing scan.
 */
static void
DynamicTableScanEndCurrentScan(DynamicTableScanState *node)
{
	CleanupOnePartition((ScanState*)node);

	if (node->shouldCallHashSeqTerm)
	{
		hash_seq_term(&node->pidStatus);
		node->shouldCallHashSeqTerm = false;
	}
}
Example #8
/*
 * Ends current scan by closing relations, and ending hash
 * iteration
 */
static void
DynamicIndexScanEndCurrentScan(DynamicIndexScanState *node)
{
	IndexScanState *indexState = &(node->indexScanState);

	CleanupOnePartition(indexState);

	if (node->shouldCallHashSeqTerm)
	{
		hash_seq_term(&node->pidxStatus);
		node->shouldCallHashSeqTerm = false;
	}
}
Example #9
/*
 * DynamicScan_RewindIterator
 *		Rewinds the iterator for a new scan of all the parts
 */
static void
DynamicScan_RewindIterator(ScanState *scanState)
{
	if (!isDynamicScan((Scan *)scanState->ps.plan))
	{
		return;
	}

	/*
	 * For EXPLAIN of a plan, we may never finish the initialization.
	 * In that case there is no iterator to rewind yet, so just create
	 * one and return.
	 */
	if (SCAN_INIT == scanState->scan_state)
	{
		DynamicScan_CreateIterator(scanState, (Scan *)scanState->ps.plan);
		return;
	}

	Scan *scan = (Scan *)scanState->ps.plan;

	DynamicTableScanInfo *partitionInfo = scanState->ps.state->dynamicTableScanInfo;

	Assert(partitionInfo->numScans >= scan->partIndex);
	DynamicPartitionIterator *iterator = partitionInfo->iterators[scan->partIndex - 1];

	Assert(NULL != iterator);

	if (iterator->shouldCallHashSeqTerm)
	{
		hash_seq_term(iterator->partitionIterator);
	}

	pfree(iterator->partitionIterator);

	iterator->partitionOids = partitionInfo->pidIndexes[scan->partIndex - 1];
	Assert(iterator->partitionOids != NULL);
	iterator->shouldCallHashSeqTerm = true;

	HASH_SEQ_STATUS *partitionIterator = palloc(sizeof(HASH_SEQ_STATUS));
	hash_seq_init(partitionIterator, iterator->partitionOids);

	iterator->partitionIterator = partitionIterator;

	Assert(iterator == partitionInfo->iterators[scan->partIndex - 1]);
}
Example #10
/*
 * Relcache invalidation callback for our relation map cache.
 */
static void
logicalrep_relmap_invalidate_cb(Datum arg, Oid reloid)
{
	LogicalRepRelMapEntry  *entry;

	/* Just to be sure. */
	if (LogicalRepRelMap == NULL)
		return;

	if (reloid != InvalidOid)
	{
		HASH_SEQ_STATUS status;

		hash_seq_init(&status, LogicalRepRelMap);

		/* TODO, use inverse lookup hashtable? */
		while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL)
		{
			if (entry->localreloid == reloid)
			{
				entry->localreloid = InvalidOid;
				hash_seq_term(&status);
				break;
			}
		}
	}
	else
	{
		/* invalidate all cache entries */
		HASH_SEQ_STATUS status;

		hash_seq_init(&status, LogicalRepRelMap);

		while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL)
			entry->localreloid = InvalidOid;
	}
}
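
/*
 * A sketch of how such a callback is typically registered (the registration
 * call is not shown in the excerpt above, so this is an assumption): hook it
 * into the relcache invalidation machinery once, e.g. when the relation map
 * cache is first created.
 */
static void
logicalrep_relmap_register_invalidation_cb(void)
{
	CacheRegisterRelcacheCallback(logicalrep_relmap_invalidate_cb, (Datum) 0);
}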
Example #11
/*
 * DynamicScan_EndIterator
 * 		Frees the partition iterator for a scanState.
 */
static void
DynamicScan_EndIterator(ScanState *scanState)
{
	Assert(NULL != scanState);

	/*
	 * For EXPLAIN of a plan, we may never finish the initialization,
	 * and end up calling the End method directly.  In such cases, we
	 * don't have any iterator to end.
	 */
	if (SCAN_INIT == scanState->scan_state)
	{
		return;
	}

	Scan *scan = (Scan *)scanState->ps.plan;

	DynamicTableScanInfo *partitionInfo = scanState->ps.state->dynamicTableScanInfo;

	Assert(partitionInfo->numScans >= scan->partIndex);
	DynamicPartitionIterator *iterator = partitionInfo->iterators[scan->partIndex - 1];

	Assert(NULL != iterator);

	if (iterator->shouldCallHashSeqTerm)
	{
		hash_seq_term(iterator->partitionIterator);
	}

	pfree(iterator->partitionIterator);

	MemoryContextDelete(iterator->partitionMemoryContext);

	pfree(iterator);

	partitionInfo->iterators[scan->partIndex - 1] = NULL;
}
Example #12
/*
 * launch_consumer_group
 *
 * Launch a group of background worker processes that will consume from the
 * given topic into the given relation.
 */
static bool
launch_consumer_group(Relation consumers, KafkaConsumer *consumer, int64 offset)
{
	BackgroundWorker worker;
	BackgroundWorkerHandle *handle;
	KafkaConsumerGroup *group;
	bool found;
	int i;

	group = (KafkaConsumerGroup *) hash_search(consumer_groups, &consumer->id, HASH_ENTER, &found);
	if (found)
	{
		KafkaConsumerProc *proc;
		HASH_SEQ_STATUS iter;
		bool running = false;

		hash_seq_init(&iter, consumer_procs);
		while ((proc = (KafkaConsumerProc *) hash_seq_search(&iter)) != NULL)
		{
			if (proc->consumer_id == consumer->id)
			{
				running = true;

				/*
				 * Terminate the scan early.  Once hash_seq_search() has
				 * returned NULL the scan has already been deregistered, so
				 * it must not be terminated again after the loop.
				 */
				hash_seq_term(&iter);
				break;
			}
		}

		/* if there are already procs running, it's a noop */
		if (running)
			return true;

		/* no procs actually running, so it's ok to launch new ones */
	}

	group->parallelism = consumer->parallelism;

	for (i = 0; i < group->parallelism; i++)
	{
		/* we just need any unique OID here */
		Oid id = GetNewOid(consumers);
		KafkaConsumerProc *proc;

		proc = (KafkaConsumerProc *) hash_search(consumer_procs, &id, HASH_ENTER, &found);
		if (found)
			continue;

		worker.bgw_main_arg = ObjectIdGetDatum(id);
		worker.bgw_flags = BGWORKER_BACKEND_DATABASE_CONNECTION | BGWORKER_SHMEM_ACCESS;
		worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
		worker.bgw_restart_time = BGW_NEVER_RESTART;
		worker.bgw_main = NULL;
		worker.bgw_notify_pid = 0;

		/* this module is loaded dynamically, so we can't use bgw_main */
		sprintf(worker.bgw_library_name, PIPELINE_KAFKA_LIB);
		sprintf(worker.bgw_function_name, KAFKA_CONSUME_MAIN);
		snprintf(worker.bgw_name, BGW_MAXLEN, "[kafka consumer] %s <- %s", consumer->rel->relname, consumer->topic);

		proc->consumer_id = consumer->id;
		proc->partition_group = i;
		proc->offset = offset;
		namestrcpy(&proc->dbname, get_database_name(MyDatabaseId));

		if (!RegisterDynamicBackgroundWorker(&worker, &handle))
			return false;

		proc->worker = *handle;
	}

	return true;
}
Example #13
void *
hash_seq_search(HASH_SEQ_STATUS *status)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;
	uint32		max_bucket;
	long		ssize;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	uint32		curBucket;
	HASHELEMENT *curElem;

	if ((curElem = status->curEntry) != NULL)
	{
		/* Continuing scan of curBucket... */
		status->curEntry = curElem->link;
		if (status->curEntry == NULL)	/* end of this bucket */
			++status->curBucket;
		return (void *) ELEMENTKEY(curElem);
	}

	/*
	 * Search for next nonempty bucket starting at curBucket.
	 */
	curBucket = status->curBucket;
	hashp = status->hashp;
	hctl = hashp->hctl;
	ssize = hashp->ssize;
	max_bucket = hctl->max_bucket;

	if (curBucket > max_bucket)
	{
		hash_seq_term(status);
		return NULL;			/* search is done */
	}

	/*
	 * first find the right segment in the table directory.
	 */
	segment_num = curBucket >> hashp->sshift;
	segment_ndx = MOD(curBucket, ssize);

	segp = hashp->dir[segment_num];

	/*
	 * Pick up the first item in this bucket's chain.  If chain is not empty
	 * we can begin searching it.  Otherwise we have to advance to find the
	 * next nonempty bucket.  We try to optimize that case since searching a
	 * near-empty hashtable has to iterate this loop a lot.
	 */
	while ((curElem = segp[segment_ndx]) == NULL)
	{
		/* empty bucket, advance to next */
		if (++curBucket > max_bucket)
		{
			status->curBucket = curBucket;
			hash_seq_term(status);
			return NULL;		/* search is done */
		}
		if (++segment_ndx >= ssize)
		{
			segment_num++;
			segment_ndx = 0;
			segp = hashp->dir[segment_num];
		}
	}

	/* Begin scan of curBucket... */
	status->curEntry = curElem->link;
	if (status->curEntry == NULL)		/* end of this bucket */
		++curBucket;
	status->curBucket = curBucket;
	return (void *) ELEMENTKEY(curElem);
}
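
/*
 * A minimal sketch (hypothetical table and entry type) of the scan idiom the
 * examples above rely on: hash_seq_search() performs end-of-scan cleanup
 * itself when it returns NULL, so hash_seq_term() is only required when a
 * scan is abandoned before completion.
 */
typedef struct MyEntry
{
	Oid			key;
	bool		interesting;
} MyEntry;

static MyEntry *
find_first_interesting(HTAB *table)
{
	HASH_SEQ_STATUS status;
	MyEntry    *entry;

	hash_seq_init(&status, table);
	while ((entry = (MyEntry *) hash_seq_search(&status)) != NULL)
	{
		if (entry->interesting)
		{
			/* abandoning the scan early, so deregister it explicitly */
			hash_seq_term(&status);
			return entry;
		}
	}

	/* scan ran to completion; hash_seq_term() must not be called here */
	return NULL;
}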
Example #14
/*
 *	mdsync() -- Sync previous writes to stable storage.
 */
bool
mdsync(void)
{
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	int			absorb_counter;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		return false;

	/*
	 * If we are in the bgwriter, the sync had better include all fsync
	 * requests that were queued by backends before the checkpoint REDO
	 * point was determined.  We go one better than that by accepting all
	 * requests queued up to the point where we start fsync'ing.
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this
	 * point will have cycle_ctr exactly equal to the current (about to be old)
	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have.  To forestall wraparound, any time
	 * the previous mdsync() failed to complete, run through the table and
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this
	 * path will never be taken in a system that's functioning normally.
	 */
	if (mdsync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	mdsync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	mdsync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		/*
		 * If the entry is new then don't process it this time.  Note that
		 * "continue" bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == mdsync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

		/*
		 * If fsync is off then we don't have to bother opening the file
		 * at all.  (We delay checking until this point so that changing
		 * fsync on the fly behaves sensibly.)  Also, if the entry is
		 * marked canceled, fall through to delete it.
		 */
		if (enableFsync && !entry->canceled)
		{
			int			failures;

			/*
			 * If in bgwriter, we want to absorb pending requests every so
			 * often to prevent overflow of the fsync request queue.  It is
			 * unspecified whether newly-added entries will be visited by
			 * hash_seq_search, but we don't care since we don't need to
			 * process them anyway.
			 */
			if (--absorb_counter <= 0)
			{
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;
			}

			/*
			 * The fsync table could contain requests to fsync segments that
			 * have been deleted (unlinked) by the time we get to them.
			 * Rather than just hoping an ENOENT (or EACCES on Windows) error
			 * can be ignored, what we do on error is absorb pending requests
			 * and then retry.  Since mdunlink() queues a "revoke" message
			 * before actually unlinking, the fsync request is guaranteed to
			 * be marked canceled after the absorb if it really was this case.
			 * DROP DATABASE likewise has to tell us to forget fsync requests
			 * before it starts deletions.
			 */
			for (failures = 0; ; failures++)	/* loop exits at "break" */
			{
				SMgrRelation reln;
				MdfdVec    *seg;

				/*
				 * Find or create an smgr hash entry for this relation. This
				 * may seem a bit unclean -- md calling smgr?  But it's really
				 * the best solution.  It ensures that the open file reference
				 * isn't permanently leaked if we get an error here. (You may
				 * say "but an unreferenced SMgrRelation is still a leak!" Not
				 * really, because the only case in which a checkpoint is done
				 * by a process that isn't about to shut down is in the
				 * bgwriter, and it will periodically do smgrcloseall(). This
				 * fact justifies our not closing the reln in the success path
				 * either, which is a good thing since in non-bgwriter cases
				 * we couldn't safely do that.)  Furthermore, in many cases
				 * the relation will have been dirtied through this same smgr
				 * relation, and so we can save a file open/close cycle.
				 */
				reln = smgropen(entry->tag.rnode);

				/*
				 * It is possible that the relation has been dropped or
				 * truncated since the fsync request was entered.  Therefore,
				 * allow ENOENT, but only if we didn't fail already on
				 * this file.  This applies both during _mdfd_getseg() and
				 * during FileSync, since fd.c might have closed the file
				 * behind our back.
				 */
				seg = _mdfd_getseg(reln,
								   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
								   true);
				if (seg != NULL &&
					FileSync(seg->mdfd_vfd) >= 0)
					break;		/* success; break out of retry loop */

				/*
				 * XXX is there any point in allowing more than one retry?
				 * Don't see one at the moment, but easy to change the
				 * test here if so.
				 */
				if (!FILE_POSSIBLY_DELETED(errno) ||
					failures > 0)
				{
					ereport(LOG,
							(errcode_for_file_access(),
							 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
									entry->tag.segno,
									entry->tag.rnode.spcNode,
									entry->tag.rnode.dbNode,
									entry->tag.rnode.relNode)));
					hash_seq_term(&hstat);
					return false;
				}
				else
					ereport(DEBUG1,
							(errcode_for_file_access(),
							 errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
									entry->tag.segno,
									entry->tag.rnode.spcNode,
									entry->tag.rnode.dbNode,
									entry->tag.rnode.relNode)));

				/*
				 * Absorb incoming requests and check to see if canceled.
				 */
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;	/* might as well... */

				if (entry->canceled)
					break;
			}	/* end retry loop */
		}

		/*
		 * If we get here, either we fsync'd successfully, or we don't have
		 * to because enableFsync is off, or the entry is (now) marked
		 * canceled.  Okay to delete it.
		 */
		if (hash_search(pendingOpsTable, &entry->tag,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}	/* end loop over hashtable entries */

	/* Flag successful completion of mdsync */
	mdsync_in_progress = false;

	return true;
}