/*
 * PersistentFilespace_LookupMirrorDbid()
 *
 * Scan the gp_persistent_filespace_node shared hash table for the dbid that
 * is paired with the given primary dbid.  If no filespaces are currently
 * defined, this check returns 0 even if there is an active mirror, because
 * the segment doesn't know any better.
 */
int16
PersistentFilespace_LookupMirrorDbid(int16 primaryDbid)
{
	HASH_SEQ_STATUS status;
	FilespaceDirEntry dirEntry;
	int16		mirrorDbid = 0;

	PersistentFilespace_VerifyInitScan();

	/* Start scan */
	hash_seq_init(&status, persistentFilespaceSharedHashTable);
	dirEntry = (FilespaceDirEntry) hash_seq_search(&status);
	if (dirEntry != NULL)
	{
		if (dirEntry->dbId1 == primaryDbid)
		{
			mirrorDbid = dirEntry->dbId2;
		}
		else if (dirEntry->dbId2 == primaryDbid)
		{
			mirrorDbid = dirEntry->dbId1;
		}
		else
		{
			elog(FATAL,
				 "dbid %d not found in gp_persistent_filespace_node",
				 (int) primaryDbid);
		}

		/* Terminate the scan early */
		hash_seq_term(&status);
	}

	return mirrorDbid;
}
/*
 * smgrIsAppendOnlyMirrorResyncEofs() -- Returns true if there is Append-Only
 * Mirror Resync EOF work that needs to be done post-commit or post-abort.
 *
 * Note that the list does not include anything scheduled for termination
 * by upper-level transactions.
 */
bool
smgrIsAppendOnlyMirrorResyncEofs(EndXactRecKind endXactRecKind)
{
	int			nestLevel = GetCurrentTransactionNestLevel();
	HASH_SEQ_STATUS iterateStatus;
	AppendOnlyMirrorResyncEofs *entry;

	if (AppendOnlyMirrorResyncEofsTable == NULL)
	{
		return false;
	}

	hash_seq_init(&iterateStatus, AppendOnlyMirrorResyncEofsTable);

	while ((entry = hash_seq_search(&iterateStatus)) != NULL)
	{
		if (entry->key.nestLevel >= nestLevel)
		{
			/* Deregister seq scan and exit early. */
			hash_seq_term(&iterateStatus);
			return true;
		}
	}

	return false;
}
/*
 * WorkerGetNodeWithName finds and returns a node from the membership list that
 * has the given hostname.  The function returns NULL if no such node exists.
 */
WorkerNode *
WorkerGetNodeWithName(const char *hostname)
{
	WorkerNode *workerNode = NULL;
	HASH_SEQ_STATUS status;

	hash_seq_init(&status, WorkerNodesHash);

	workerNode = (WorkerNode *) hash_seq_search(&status);
	while (workerNode != NULL)
	{
		if (workerNode->inWorkerFile)
		{
			int			nameCompare = strncmp(workerNode->workerName, hostname,
											  WORKER_LENGTH);

			if (nameCompare == 0)
			{
				hash_seq_term(&status);
				break;
			}
		}

		workerNode = (WorkerNode *) hash_seq_search(&status);
	}

	return workerNode;
}
/*
 * Release connection created by calling GetConnection.
 */
void
mysql_rel_connection(MYSQL *conn)
{
	HASH_SEQ_STATUS scan;
	ConnCacheEntry *entry;

	if (ConnectionHash == NULL)
		return;

	hash_seq_init(&scan, ConnectionHash);
	while ((entry = (ConnCacheEntry *) hash_seq_search(&scan)))
	{
		if (entry->conn == NULL)
			continue;

		if (entry->conn == conn)
		{
			elog(DEBUG3, "disconnecting mysql_fdw connection %p", entry->conn);
			_mysql_close(entry->conn);
			entry->conn = NULL;
			hash_seq_term(&scan);
			break;
		}
	}
}
/*
 * FindRandomNodeNotInList finds a random node from the shared hash that is not
 * a member of the current node list. The caller is responsible for making the
 * necessary node count checks to ensure that such a node exists.
 *
 * Note that this function has a selection bias towards nodes whose positions in
 * the shared hash are sequentially adjacent to the positions of nodes that are
 * in the current node list. This bias follows from our decision to first pick a
 * random node in the hash, and if that node is a member of the current list, to
 * simply iterate to the next node in the hash. Overall, this approach trades in
 * some selection bias for simplicity in design and for bounded execution time.
 */
static WorkerNode *
FindRandomNodeNotInList(HTAB *WorkerNodesHash, List *currentNodeList)
{
	WorkerNode *workerNode = NULL;
	HASH_SEQ_STATUS status;
	uint32		workerNodeCount = 0;
	uint32		currentNodeCount PG_USED_FOR_ASSERTS_ONLY = 0;
	bool		lookForWorkerNode = true;
	uint32		workerPosition = 0;
	uint32		workerIndex = 0;

	workerNodeCount = hash_get_num_entries(WorkerNodesHash);
	currentNodeCount = list_length(currentNodeList);
	Assert(workerNodeCount > currentNodeCount);

	/*
	 * We determine a random position within the worker hash between [1, N],
	 * assuming that the number of elements in the hash is N. We then get to
	 * this random position by iterating over the worker hash. Please note that
	 * the random seed has already been set by the postmaster when starting up.
	 */
	workerPosition = (random() % workerNodeCount) + 1;
	hash_seq_init(&status, WorkerNodesHash);

	for (workerIndex = 0; workerIndex < workerPosition; workerIndex++)
	{
		workerNode = (WorkerNode *) hash_seq_search(&status);
	}

	while (lookForWorkerNode)
	{
		bool		listMember = ListMember(currentNodeList, workerNode);

		if (workerNode->inWorkerFile && !listMember)
		{
			lookForWorkerNode = false;
		}
		else
		{
			/* iterate to the next worker node in the hash */
			workerNode = (WorkerNode *) hash_seq_search(&status);

			/* reached end of hash; start from the beginning */
			if (workerNode == NULL)
			{
				hash_seq_init(&status, WorkerNodesHash);
				workerNode = (WorkerNode *) hash_seq_search(&status);
			}
		}
	}

	/* we stopped scanning before completion; therefore clean up scan */
	hash_seq_term(&status);

	return workerNode;
}
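/*
 * For contrast with the biased approach above, a single-pass reservoir
 * sample (Algorithm R) over a full scan selects uniformly among eligible
 * nodes: the i-th eligible node replaces the current choice with
 * probability 1/i. This is only an illustrative sketch, not Citus code;
 * it reuses the WorkerNode type and ListMember helper from the function
 * above and assumes the same caller guarantee that an eligible node
 * exists. Because the scan always runs until hash_seq_search returns
 * NULL, the scan terminates itself and no hash_seq_term call is needed.
 */
static WorkerNode *
FindRandomNodeNotInListUnbiased(HTAB *WorkerNodesHash, List *currentNodeList)
{
	HASH_SEQ_STATUS status;
	WorkerNode *workerNode = NULL;
	WorkerNode *chosenNode = NULL;
	uint32		eligibleCount = 0;

	hash_seq_init(&status, WorkerNodesHash);
	while ((workerNode = (WorkerNode *) hash_seq_search(&status)) != NULL)
	{
		if (!workerNode->inWorkerFile ||
			ListMember(currentNodeList, workerNode))
			continue;

		/* keep the i-th eligible node with probability 1/i */
		eligibleCount++;
		if (random() % eligibleCount == 0)
			chosenNode = workerNode;
	}

	return chosenNode;
}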
/*
 * Terminate the seq search of the DispatchedFilespaceDirHashTable.
 */
void
DispatchedFilespace_SeqSearch_Term(void)
{
	if (!DispatchedFileSpace_SeqSearch_Initialized)
	{
		return;
	}

	hash_seq_term(&DispatchedFileSpace_SeqSearch);
	DispatchedFileSpace_SeqSearch_Initialized = false;
}
/*
 * DynamicTableScanEndCurrentScan
 *		Cleans up any ongoing scan.
 */
static void
DynamicTableScanEndCurrentScan(DynamicTableScanState *node)
{
	CleanupOnePartition((ScanState *) node);

	if (node->shouldCallHashSeqTerm)
	{
		hash_seq_term(&node->pidStatus);
		node->shouldCallHashSeqTerm = false;
	}
}
/*
 * Ends the current scan by closing relations and ending the hash iteration.
 */
static void
DynamicIndexScanEndCurrentScan(DynamicIndexScanState *node)
{
	IndexScanState *indexState = &(node->indexScanState);

	CleanupOnePartition(indexState);

	if (node->shouldCallHashSeqTerm)
	{
		hash_seq_term(&node->pidxStatus);
		node->shouldCallHashSeqTerm = false;
	}
}
/*
 * DynamicScan_RewindIterator
 *		Rewinds the iterator for a new scan of all the parts.
 */
static void
DynamicScan_RewindIterator(ScanState *scanState)
{
	if (!isDynamicScan((Scan *) scanState->ps.plan))
	{
		return;
	}

	/*
	 * For EXPLAIN of a plan, we may never finish the initialization. In such
	 * cases, we don't yet have an iterator to rewind, so create one instead.
	 */
	if (SCAN_INIT == scanState->scan_state)
	{
		DynamicScan_CreateIterator(scanState, (Scan *) scanState->ps.plan);
		return;
	}

	Scan	   *scan = (Scan *) scanState->ps.plan;

	DynamicTableScanInfo *partitionInfo = scanState->ps.state->dynamicTableScanInfo;

	Assert(partitionInfo->numScans >= scan->partIndex);
	DynamicPartitionIterator *iterator = partitionInfo->iterators[scan->partIndex - 1];

	Assert(NULL != iterator);

	if (iterator->shouldCallHashSeqTerm)
	{
		hash_seq_term(iterator->partitionIterator);
	}

	pfree(iterator->partitionIterator);

	iterator->partitionOids = partitionInfo->pidIndexes[scan->partIndex - 1];
	Assert(iterator->partitionOids != NULL);
	iterator->shouldCallHashSeqTerm = true;

	HASH_SEQ_STATUS *partitionIterator = palloc(sizeof(HASH_SEQ_STATUS));

	hash_seq_init(partitionIterator, iterator->partitionOids);

	iterator->partitionIterator = partitionIterator;

	Assert(iterator == partitionInfo->iterators[scan->partIndex - 1]);
}
/*
 * Relcache invalidation callback for our relation map cache.
 */
static void
logicalrep_relmap_invalidate_cb(Datum arg, Oid reloid)
{
	LogicalRepRelMapEntry *entry;

	/* Just to be sure. */
	if (LogicalRepRelMap == NULL)
		return;

	if (reloid != InvalidOid)
	{
		HASH_SEQ_STATUS status;

		hash_seq_init(&status, LogicalRepRelMap);

		/* TODO, use inverse lookup hashtable? */
		while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL)
		{
			if (entry->localreloid == reloid)
			{
				entry->localreloid = InvalidOid;
				hash_seq_term(&status);
				break;
			}
		}
	}
	else
	{
		/* invalidate all cache entries */
		HASH_SEQ_STATUS status;

		hash_seq_init(&status, LogicalRepRelMap);

		while ((entry = (LogicalRepRelMapEntry *) hash_seq_search(&status)) != NULL)
			entry->localreloid = InvalidOid;
	}
}
/*
 * DynamicScan_EndIterator
 *		Frees the partition iterator for a scanState.
 */
static void
DynamicScan_EndIterator(ScanState *scanState)
{
	Assert(NULL != scanState);

	/*
	 * For EXPLAIN of a plan, we may never finish the initialization, and end
	 * up calling the End method directly. In such cases, we don't have any
	 * iterator to end.
	 */
	if (SCAN_INIT == scanState->scan_state)
	{
		return;
	}

	Scan	   *scan = (Scan *) scanState->ps.plan;

	DynamicTableScanInfo *partitionInfo = scanState->ps.state->dynamicTableScanInfo;

	Assert(partitionInfo->numScans >= scan->partIndex);
	DynamicPartitionIterator *iterator = partitionInfo->iterators[scan->partIndex - 1];

	Assert(NULL != iterator);

	if (iterator->shouldCallHashSeqTerm)
	{
		hash_seq_term(iterator->partitionIterator);
	}

	pfree(iterator->partitionIterator);
	MemoryContextDelete(iterator->partitionMemoryContext);
	pfree(iterator);

	partitionInfo->iterators[scan->partIndex - 1] = NULL;
}
/*
 * launch_consumer_group
 *
 * Launch a group of background worker processes that will consume from the
 * given topic into the given relation.
 */
static bool
launch_consumer_group(Relation consumers, KafkaConsumer *consumer, int64 offset)
{
	BackgroundWorker worker;
	BackgroundWorkerHandle *handle;
	KafkaConsumerGroup *group;
	bool		found;
	int			i;

	group = (KafkaConsumerGroup *) hash_search(consumer_groups,
											   &consumer->id, HASH_ENTER, &found);
	if (found)
	{
		KafkaConsumerProc *proc;
		HASH_SEQ_STATUS iter;
		bool		running = false;

		hash_seq_init(&iter, consumer_procs);
		while ((proc = (KafkaConsumerProc *) hash_seq_search(&iter)) != NULL)
		{
			if (proc->consumer_id == consumer->id)
			{
				running = true;

				/*
				 * We're abandoning the scan early, so deregister it here.
				 * Once hash_seq_search has returned NULL, the scan has
				 * already terminated itself and calling hash_seq_term again
				 * would raise an error, so it must not be done after the
				 * loop unconditionally.
				 */
				hash_seq_term(&iter);
				break;
			}
		}

		/* if there are already procs running, it's a noop */
		if (running)
			return true;

		/* no procs actually running, so it's ok to launch new ones */
	}

	group->parallelism = consumer->parallelism;

	for (i = 0; i < group->parallelism; i++)
	{
		/* we just need any unique OID here */
		Oid			id = GetNewOid(consumers);
		KafkaConsumerProc *proc;

		proc = (KafkaConsumerProc *) hash_search(consumer_procs, &id,
												 HASH_ENTER, &found);
		if (found)
			continue;

		worker.bgw_main_arg = ObjectIdGetDatum(id);
		worker.bgw_flags = BGWORKER_BACKEND_DATABASE_CONNECTION | BGWORKER_SHMEM_ACCESS;
		worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
		worker.bgw_restart_time = BGW_NEVER_RESTART;
		worker.bgw_main = NULL;
		worker.bgw_notify_pid = 0;

		/* this module is loaded dynamically, so we can't use bgw_main */
		sprintf(worker.bgw_library_name, PIPELINE_KAFKA_LIB);
		sprintf(worker.bgw_function_name, KAFKA_CONSUME_MAIN);
		snprintf(worker.bgw_name, BGW_MAXLEN, "[kafka consumer] %s <- %s",
				 consumer->rel->relname, consumer->topic);

		proc->consumer_id = consumer->id;
		proc->partition_group = i;
		proc->offset = offset;
		namestrcpy(&proc->dbname, get_database_name(MyDatabaseId));

		if (!RegisterDynamicBackgroundWorker(&worker, &handle))
			return false;

		proc->worker = *handle;
	}

	return true;
}
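/*
 * A hedged sketch of the receiving side of the registration above: a
 * dynamically loaded worker's entry point (resolved from bgw_library_name
 * and bgw_function_name) is invoked with bgw_main_arg as its Datum
 * argument, so the Oid packed with ObjectIdGetDatum() is unpacked with the
 * inverse macro, DatumGetObjectId(). The body below is a placeholder, not
 * pipeline_kafka's actual consumer loop.
 */
void
kafka_consume_main(Datum main_arg)
{
	Oid			id = DatumGetObjectId(main_arg);

	elog(LOG, "kafka consumer proc %u starting", id);

	/* look up the KafkaConsumerProc entry keyed by id and start consuming */
}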
void *
hash_seq_search(HASH_SEQ_STATUS *status)
{
	HTAB	   *hashp;
	HASHHDR    *hctl;
	uint32		max_bucket;
	long		ssize;
	long		segment_num;
	long		segment_ndx;
	HASHSEGMENT segp;
	uint32		curBucket;
	HASHELEMENT *curElem;

	if ((curElem = status->curEntry) != NULL)
	{
		/* Continuing scan of curBucket... */
		status->curEntry = curElem->link;
		if (status->curEntry == NULL)	/* end of this bucket */
			++status->curBucket;
		return (void *) ELEMENTKEY(curElem);
	}

	/*
	 * Search for next nonempty bucket starting at curBucket.
	 */
	curBucket = status->curBucket;
	hashp = status->hashp;
	hctl = hashp->hctl;
	ssize = hashp->ssize;
	max_bucket = hctl->max_bucket;

	if (curBucket > max_bucket)
	{
		hash_seq_term(status);
		return NULL;			/* search is done */
	}

	/*
	 * first find the right segment in the table directory.
	 */
	segment_num = curBucket >> hashp->sshift;
	segment_ndx = MOD(curBucket, ssize);

	segp = hashp->dir[segment_num];

	/*
	 * Pick up the first item in this bucket's chain.  If chain is not empty
	 * we can begin searching it.  Otherwise we have to advance to find the
	 * next nonempty bucket.  We try to optimize that case since searching a
	 * near-empty hashtable has to iterate this loop a lot.
	 */
	while ((curElem = segp[segment_ndx]) == NULL)
	{
		/* empty bucket, advance to next */
		if (++curBucket > max_bucket)
		{
			status->curBucket = curBucket;
			hash_seq_term(status);
			return NULL;		/* search is done */
		}
		if (++segment_ndx >= ssize)
		{
			segment_num++;
			segment_ndx = 0;
			segp = hashp->dir[segment_num];
		}
	}

	/* Begin scan of curBucket... */
	status->curEntry = curElem->link;
	if (status->curEntry == NULL)	/* end of this bucket */
		++curBucket;
	status->curBucket = curBucket;
	return (void *) ELEMENTKEY(curElem);
}
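/*
 * A minimal usage sketch of the dynahash seq-scan protocol implemented
 * above, assuming a hypothetical table MyHash whose entries are MyEntry
 * structs with an Oid "key" field. As the implementation shows, a scan
 * that runs until hash_seq_search returns NULL is terminated implicitly;
 * a scan abandoned early must call hash_seq_term explicitly, otherwise
 * the scan stays registered, which blocks hash table expansion and draws
 * a leaked-scan warning at transaction end.
 */
static MyEntry *
my_hash_find_by_scan(HTAB *MyHash, Oid key)
{
	HASH_SEQ_STATUS status;
	MyEntry    *entry;

	hash_seq_init(&status, MyHash);
	while ((entry = (MyEntry *) hash_seq_search(&status)) != NULL)
	{
		if (entry->key == key)
		{
			/* exiting early: deregister the scan ourselves */
			hash_seq_term(&status);
			return entry;
		}
	}

	/* hash_seq_search returned NULL, so the scan already terminated itself */
	return NULL;
}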
/*
 *	mdsync() -- Sync previous writes to stable storage.
 */
bool
mdsync(void)
{
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	int			absorb_counter;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		return false;

	/*
	 * If we are in the bgwriter, the sync had better include all fsync
	 * requests that were queued by backends before the checkpoint REDO point
	 * was determined.  We go that a little better by accepting all requests
	 * queued up to the point where we start fsync'ing.
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this
	 * point will have cycle_ctr exactly equal to the current (about to be
	 * old) value of mdsync_cycle_ctr.  However, if we fail partway through
	 * the fsync'ing loop, then older values of cycle_ctr might remain when
	 * we come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a
	 * checkpoint to succeed that should not have.  To forestall wraparound,
	 * any time the previous mdsync() failed to complete, run through the
	 * table and forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this
	 * path will never be taken in a system that's functioning normally.
	 */
	if (mdsync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	mdsync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	mdsync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		/*
		 * If the entry is new then don't process it this time.  Note that
		 * "continue" bypasses the hash-remove call at the bottom of the
		 * loop.
		 */
		if (entry->cycle_ctr == mdsync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

		/*
		 * If fsync is off then we don't have to bother opening the file at
		 * all.  (We delay checking until this point so that changing fsync
		 * on the fly behaves sensibly.)  Also, if the entry is marked
		 * canceled, fall through to delete it.
		 */
		if (enableFsync && !entry->canceled)
		{
			int			failures;

			/*
			 * If in bgwriter, we want to absorb pending requests every so
			 * often to prevent overflow of the fsync request queue.  It is
			 * unspecified whether newly-added entries will be visited by
			 * hash_seq_search, but we don't care since we don't need to
			 * process them anyway.
			 */
			if (--absorb_counter <= 0)
			{
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;
			}

			/*
			 * The fsync table could contain requests to fsync segments that
			 * have been deleted (unlinked) by the time we get to them.
			 * Rather than just hoping an ENOENT (or EACCES on Windows) error
			 * can be ignored, what we do on error is absorb pending requests
			 * and then retry.  Since mdunlink() queues a "revoke" message
			 * before actually unlinking, the fsync request is guaranteed to
			 * be marked canceled after the absorb if it really was this
			 * case.  DROP DATABASE likewise has to tell us to forget fsync
			 * requests before it starts deletions.
			 */
			for (failures = 0;; failures++)		/* loop exits at "break" */
			{
				SMgrRelation reln;
				MdfdVec    *seg;

				/*
				 * Find or create an smgr hash entry for this relation. This
				 * may seem a bit unclean -- md calling smgr?  But it's
				 * really the best solution.  It ensures that the open file
				 * reference isn't permanently leaked if we get an error
				 * here. (You may say "but an unreferenced SMgrRelation is
				 * still a leak!" Not really, because the only case in which
				 * a checkpoint is done by a process that isn't about to shut
				 * down is in the bgwriter, and it will periodically do
				 * smgrcloseall().  This fact justifies our not closing the
				 * reln in the success path either, which is a good thing
				 * since in non-bgwriter cases we couldn't safely do that.)
				 * Furthermore, in many cases the relation will have been
				 * dirtied through this same smgr relation, and so we can
				 * save a file open/close cycle.
				 */
				reln = smgropen(entry->tag.rnode);

				/*
				 * It is possible that the relation has been dropped or
				 * truncated since the fsync request was entered.  Therefore,
				 * allow ENOENT, but only if we didn't fail already on this
				 * file.  This applies both during _mdfd_getseg() and during
				 * FileSync, since fd.c might have closed the file behind our
				 * back.
				 */
				seg = _mdfd_getseg(reln,
								   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
								   true);
				if (seg != NULL &&
					FileSync(seg->mdfd_vfd) >= 0)
					break;		/* success; break out of retry loop */

				/*
				 * XXX is there any point in allowing more than one retry?
				 * Don't see one at the moment, but easy to change the test
				 * here if so.
				 */
				if (!FILE_POSSIBLY_DELETED(errno) ||
					failures > 0)
				{
					ereport(LOG,
							(errcode_for_file_access(),
							 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
									entry->tag.segno,
									entry->tag.rnode.spcNode,
									entry->tag.rnode.dbNode,
									entry->tag.rnode.relNode)));
					hash_seq_term(&hstat);
					return false;
				}
				else
					ereport(DEBUG1,
							(errcode_for_file_access(),
							 errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
									entry->tag.segno,
									entry->tag.rnode.spcNode,
									entry->tag.rnode.dbNode,
									entry->tag.rnode.relNode)));

				/*
				 * Absorb incoming requests and check to see if canceled.
				 */
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;		/* might as well... */

				if (entry->canceled)
					break;
			}					/* end retry loop */
		}

		/*
		 * If we get here, either we fsync'd successfully, or we don't have
		 * to because enableFsync is off, or the entry is (now) marked
		 * canceled.  Okay to delete it.
		 */
		if (hash_search(pendingOpsTable, &entry->tag,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}							/* end loop over hashtable entries */

	/* Flag successful completion of mdsync */
	mdsync_in_progress = false;

	return true;
}
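/*
 * A small worked example of the cycle_ctr arithmetic used above.  In md.c,
 * CycleCtr is a narrow unsigned integer (a uint16 typedef), and the cast in
 * the Assert makes the "exactly one cycle behind" check robust across
 * wraparound: an entry stamped in cycle 65535 is still recognized as one
 * cycle old once the counter has wrapped to 0.  This is an illustrative
 * sketch with a hypothetical stand-in typedef, not PostgreSQL code.
 */
typedef uint16 CycleCtrExample;

static bool
is_exactly_one_cycle_old(CycleCtrExample entry_ctr, CycleCtrExample current_ctr)
{
	/* e.g. entry_ctr = 65535, current_ctr = 0: (65535 + 1) wraps to 0 == 0 */
	return (CycleCtrExample) (entry_ctr + 1) == current_ctr;
}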