/*
 * Sweeps through the cache and marks all cached entries as deleted.
 *
 * The sweep begins right after a randomly chosen slot and visits every slot
 * of the entry array exactly once, wrapping around at the end; the randomly
 * chosen slot itself is therefore visited last. Entries that are not in the
 * CACHE_ENTRY_CACHED state are skipped.
 *
 * Returns the number of entries it found and marked deleted.
 */
int32
Cache_Clear(Cache *cache)
{
	Assert(NULL != cache);

	int32 nEntries = cache->cacheHdr->nEntries;

	if (nEntries <= 0)
	{
		/* Nothing to sweep. Also protects the modulo below from a zero divisor */
		return 0;
	}

	int32 startIdx = cdb_randint(nEntries - 1, 0);
	int32 entryIdx = startIdx;
	int32 numClearedEntries = 0;
	int32 numVisited;

	/*
	 * Count visited slots instead of stopping when the index gets back to
	 * startIdx: stopping on the index would skip the start slot itself (and
	 * would clear nothing at all for a single-entry cache).
	 */
	for (numVisited = 0; numVisited < nEntries; numVisited++)
	{
		entryIdx = (entryIdx + 1) % nEntries;

		CacheEntry *crtEntry = Cache_GetEntryByIndex(cache->cacheHdr, entryIdx);

		/* Lock entry so that nobody else changes its state until we're done with it */
		Cache_LockEntry(cache, crtEntry);

		if (crtEntry->state != CACHE_ENTRY_CACHED)
		{
			/* Not interested in free/acquired/deleted entries. Look at next entry */
			Cache_UnlockEntry(cache, crtEntry);
			continue;
		}

		/* Found cached entry. Pin it so it stays around while we remove it */
		Cache_EntryAddRef(cache, crtEntry);

		if (crtEntry->state == CACHE_ENTRY_FREE || crtEntry->state == CACHE_ENTRY_ACQUIRED)
		{
			/* Someone freed up the entry before we had a chance to Add-Ref it. Skip it. */
			Assert(false);
			Cache_EntryDecRef(cache, crtEntry);
			Cache_UnlockEntry(cache, crtEntry);
			continue;
		}

		Cache_RegisterCleanup(cache, crtEntry, true /* isCachedEntry */);

		Cache_Remove(cache, crtEntry);

		/* Done with changing the state. Unlock the entry */
		Cache_UnlockEntry(cache, crtEntry);

		/* Drop the reference taken above */
		Cache_Release(cache, crtEntry);

		numClearedEntries++;
	}

	return numClearedEntries;
}
/*
 * Return a previously acquired entry to the cache freelist.
 * Calls the client-specific cleanup (if one is registered) before returning
 * the entry to the freelist.
 *
 * cache: the cache owning the entry
 * entry: the entry to release; must be in CACHE_ENTRY_ACQUIRED state
 * unregisterCleanup: if true, the entry is first removed from the cleanup list
 *
 * If the client cleanup callback throws, the entry is still marked free and
 * linked back in the freelist, the acquired-entries counter is still
 * decremented, and the error is then re-thrown to the caller.
 */
static void
Cache_ReleaseAcquired(Cache *cache, CacheEntry *entry, bool unregisterCleanup)
{
	Assert(NULL != cache);
	Assert(NULL != entry);
	Assert(CACHE_ENTRY_ACQUIRED == entry->state);

	/* Unregister entry from the cleanup list if requested */
	if (unregisterCleanup)
	{
		Cache_UnregisterCleanup(cache, entry);
	}

	/* The cleanup callback is optional, same as in the other release/evict paths */
	if (NULL != cache->cleanupEntry)
	{
		PG_TRY();
		{
			/* Call client-specific cleanup function before removing entry from cache */
			cache->cleanupEntry(CACHE_ENTRY_PAYLOAD(entry));
		}
		PG_CATCH();
		{
			/* Grab entry lock to ensure exclusive access to it while we're touching it */
			Cache_LockEntry(cache, entry);

			/* No need for atomic operations as long as we hold the entry lock */
			entry->state = CACHE_ENTRY_FREE;

#ifdef USE_ASSERT_CHECKING
			Cache_MemsetPayload(cache, entry);
#endif

			Cache_UnlockEntry(cache, entry);

			/* Link entry back in the freelist */
			Cache_AddToFreelist(cache, entry);

			/*
			 * The entry left the ACQUIRED state on this path too, so keep
			 * the statistics in sync before propagating the error.
			 */
			Cache_DecPerfCounter(&cache->cacheHdr->cacheStats.noAcquiredEntries, 1 /* delta */ );

			PG_RE_THROW();
		}
		PG_END_TRY();
	}

	/* Grab entry lock to ensure exclusive access to it while we're touching it */
	Cache_LockEntry(cache, entry);

	/* No need for atomic operations as long as we hold the entry lock */
	entry->state = CACHE_ENTRY_FREE;

#ifdef USE_ASSERT_CHECKING
	Cache_MemsetPayload(cache, entry);
#endif

	Cache_UnlockEntry(cache, entry);

	/* Link entry back in the freelist */
	Cache_AddToFreelist(cache, entry);

	Cache_DecPerfCounter(&cache->cacheHdr->cacheStats.noAcquiredEntries, 1 /* delta */ );
}
/*
 * Look up an entry in the Global MDVSN component.
 *
 * To avoid any concurrency issues, this returns a copy of the entry,
 * palloc'ed in the current memory context. The caller is responsible
 * for freeing this copy.
 *
 * oid: key of the versioned object to look up
 *
 * Returns a copy of the entry if found, NULL otherwise.
 */
mdver_entry *
mdver_glob_mdvsn_find(Oid oid)
{
	Assert(NULL != mdver_glob_mdvsn);

	/*
	 * Only the key matters for the lookup, but the whole struct is handed to
	 * the cache. Zero it first so no indeterminate stack bytes are passed
	 * along with the key.
	 */
	mdver_entry mdver_info;
	memset(&mdver_info, 0, sizeof(mdver_info));
	mdver_info.key = oid;

	/* FIXME gcaragea 03/18/2014: Trigger evictions if cache is full (MPP-22923) */
	CacheEntry *localEntry = Cache_AcquireEntry(mdver_glob_mdvsn, &mdver_info);
	Assert(NULL != localEntry);

	CacheEntry *cachedEntry = Cache_Lookup(mdver_glob_mdvsn, localEntry);

	/* Release local entry. We don't need it anymore */
	Cache_Release(mdver_glob_mdvsn, localEntry);

	mdver_entry *mdver_copy = NULL;
	if (NULL != cachedEntry)
	{
		/* Found a match. Make a local copy */
		mdver_entry *shared_mdver = (mdver_entry *) CACHE_ENTRY_PAYLOAD(cachedEntry);

		/* Allocate before taking the entry lock, so nothing is allocated while it is held */
		mdver_copy = (mdver_entry *) palloc0(sizeof(mdver_entry));

		/* Lock entry to ensure atomicity of copy */
		Cache_LockEntry(mdver_glob_mdvsn, cachedEntry);

		memcpy(mdver_copy, shared_mdver, sizeof(mdver_entry));

		/* Got the copy, unlock entry */
		Cache_UnlockEntry(mdver_glob_mdvsn, cachedEntry);

		/*
		 * We're also done with the entry, release our pincount on it
		 *
		 * TODO gcaragea 05/02/2014: Are there cases where we need to hold the
		 * entry past this point? (MPP-22923)
		 */
		Cache_Release(mdver_glob_mdvsn, cachedEntry);
	}

	return mdver_copy;
}
/*
 * Run cache eviction algorithm
 *
 * Repeatedly advances the clock hand (Cache_NextClockHand) over the entry
 * array. For each cached, unpinned entry it decrements the entry's utility
 * by decAmount; an entry whose utility reaches zero is evicted: marked
 * deleted, unlinked from its anchor chain, cleaned up through the client
 * callback (if any) and returned to the freelist.
 *
 * cache: the cache to evict from
 * evictRequestSize: total entry size the caller would like evicted; must be > 0
 *
 * It will try to evict enough entries to add up to evictRequestSize. Returns
 * the actual accumulated size of the entries evicted, which can be smaller
 * than the request if the scan gives up after too many fruitless loops.
 */
int64
Cache_Evict(Cache *cache, int64 evictRequestSize)
{
	Assert(NULL != cache);
	Assert(evictRequestSize > 0);

	/* Start timing; every Start is paired with a Record below before we break or restart */
	Cache_TimedOperationStart();

	int64 evictedSize = 0;
	/* Number of wraparounds since the last time we decremented some entry's utility */
	uint32 unsuccessfulLoops = 0;
	/* Whether an entry was evicted since the previous wraparound */
	bool foundVictim = false;
	/* Amount subtracted from a candidate's utility; grows when a full loop evicts nothing */
	uint32 decAmount = cache->cacheHdr->policyContext.utilityDecrement;
	Cache_Stats *cacheStats = &cache->cacheHdr->cacheStats;

	while (true)
	{
		bool wraparound = false;
		int32 entryIdx = Cache_NextClockHand(cache, &wraparound);
		Assert(entryIdx < cache->cacheHdr->nEntries);

		Cache_UpdatePerfCounter(&cacheStats->noEntriesScanned,1 /* delta */);

		if (wraparound)
		{
			unsuccessfulLoops++;

			Cache_UpdatePerfCounter(&cacheStats->noWraparound, 1 /* delta */);

			if (!foundVictim)
			{
				/*
				 * We looped around and did not manage to evict any entries.
				 * Double the amount we decrement eviction candidate's utility by,
				 * capping it at CACHE_MAX_UTILITY.
				 * This makes the eviction algorithm look for a victim more aggressively
				 */
				if (decAmount <= CACHE_MAX_UTILITY / 2)
				{
					decAmount = 2 * decAmount;
				}
				else
				{
					decAmount = CACHE_MAX_UTILITY;
				}
			}
			/* Start tracking evictions afresh for the next loop around */
			foundVictim = false;

			if (unsuccessfulLoops > cache->cacheHdr->policyContext.maxClockLoops)
			{
				/*
				 * Can't find any cached and unused entries candidates for evictions, even after looping around
				 * maxClockLoops times. Give up looking for victims.
				 */
				Cache_TimedOperationRecord(&cacheStats->timeEvictions, &cacheStats->maxTimeEvict);
				break;
			}
		}

		CacheEntry *crtEntry = Cache_GetEntryByIndex(cache->cacheHdr, entryIdx);
		if (crtEntry->state != CACHE_ENTRY_CACHED)
		{
			/*
			 * Not interested in free/acquired/deleted entries.
			 * Go back and advance clock hand. This is an unlocked peek; the
			 * state is checked again below under the anchor spinlock.
			 */
			continue;
		}

		CacheAnchor *anchor = (CacheAnchor *) SyncHTLookup(cache->syncHashtable, &crtEntry->hashvalue);
		if (NULL == anchor)
		{
			/* There's no anchor for this entry, someone might have snatched it in the meantime */
			continue;
		}

		/* From here on, every path must pair this lookup with SyncHTRelease */
		SpinLockAcquire(&anchor->spinlock);

		if (crtEntry->state != CACHE_ENTRY_CACHED)
		{
			/* Someone freed this entry in the meantime, before we got a chance to acquire the anchor lock */
			SpinLockRelease(&anchor->spinlock);
			SyncHTRelease(cache->syncHashtable, (void *) anchor);
			continue;
		}

		/* Ok, did all the checks, this entry must be valid now */
		CACHE_ASSERT_VALID(crtEntry);

		if (crtEntry->pinCount > 0)
		{
			/* Entry is in use and can't be evicted. Go back and advance clock hand */
			SpinLockRelease(&anchor->spinlock);
			SyncHTRelease(cache->syncHashtable, (void *) anchor);
			continue;
		}

		/* Decrement utility */
		gp_atomic_dec_positive_32(&crtEntry->utility, decAmount);
		/* Just decremented someone's utility. Reset our unsuccessful loops counter */
		unsuccessfulLoops = 0;

		if (crtEntry->utility > 0)
		{
			/* Entry has non-zero utility, we shouldn't evict it. Go back and advance clock hand */
			SpinLockRelease(&anchor->spinlock);
			SyncHTRelease(cache->syncHashtable, (void *) anchor);
			continue;
		}

		/* Found our victim */
		Assert(0 == crtEntry->pinCount);
		CACHE_ASSERT_VALID(crtEntry);
		Assert(crtEntry->utility == 0);

		/*
		 * Flip the state CACHED -> DELETED while still holding the anchor
		 * spinlock. The result is only captured when USE_ASSERT_CHECKING is
		 * set, for the Assert that follows.
		 */
#if USE_ASSERT_CHECKING
		int32 casResult =
#endif
		compare_and_swap_32(&crtEntry->state, CACHE_ENTRY_CACHED, CACHE_ENTRY_DELETED);
		Assert(1 == casResult);

		SpinLockRelease(&anchor->spinlock);
		foundVictim = true;
		evictedSize += crtEntry->size;

		/*
		 * Don't update noFreeEntries yet.
		 * It will be done in Cache_AddToFreelist
		 */
		Cache_UpdatePerfCounter(&cacheStats->noCachedEntries, -1 /* delta */);

		/* Unlink entry from the anchor chain */
		SpinLockAcquire(&anchor->spinlock);
		Cache_UnlinkEntry(cache, anchor, crtEntry);
		SpinLockRelease(&anchor->spinlock);

		/* Done with the anchor; give back the reference obtained from SyncHTLookup */
		SyncHTRelease(cache->syncHashtable, (void *) anchor);

		if (NULL != cache->cleanupEntry)
		{
			/*
			 * Call client-side cleanup for entry.
			 * NOTE(review): unlike Cache_ReleaseCached, this call is not
			 * wrapped in PG_TRY; confirm the callback cannot throw here.
			 */
			cache->cleanupEntry(CACHE_ENTRY_PAYLOAD(crtEntry));
		}

		/* Mark the entry free under its own lock before handing it to the freelist */
		Cache_LockEntry(cache, crtEntry);

		Assert(crtEntry->state == CACHE_ENTRY_DELETED);
		crtEntry->state = CACHE_ENTRY_FREE;

#if USE_ASSERT_CHECKING
		Cache_MemsetPayload(cache, crtEntry);
#endif

		Cache_UnlockEntry(cache, crtEntry);

		Cache_AddToFreelist(cache, crtEntry);

		Cache_UpdatePerfCounter(&cacheStats->noEvicts, 1 /* delta */);
		/* Record the time spent on this eviction */
		Cache_TimedOperationRecord(&cacheStats->timeEvictions, &cacheStats->maxTimeEvict);

		if (evictedSize >= evictRequestSize)
		{
			/* We evicted as much as requested */
			break;
		}

		/* More to evict: start timing the next eviction */
		Cache_TimedOperationStart();

	}

	return evictedSize;
}
/*
 * Reconcile an incoming versioning event with the Global MDVSN entry that
 * already exists for the same versioned object.
 *
 * A versioning event carries the old and the new version as seen by the
 * backend that generated it:
 *     VE = (key, oldV, newV)
 * The cached entry carries the version that is currently globally visible:
 *     entry = (key, crtV)
 *
 * The ddl and dml components are compared independently:
 *  - If oldV == crtV, nobody else touched the object since the backend read
 *    it, and the entry simply takes the event's new version:
 *        entry = (key, crtV) --> entry = (key, newV)
 *
 *  - If oldV differs from crtV, some other backend must have modified the
 *    object in the meantime. A brand new global version new_newV is
 *    generated to stand for the "combined" object, and both the entry and
 *    the event in the queue are rewritten in place:
 *        entry = (key, crtV)   --> entry = (key, new_newV)
 *        VE = (key, oldV, newV) --> VE = (key, crtV, new_newV)
 *
 * event: the event containing the versioning information for an update
 * cached_entry: the existing entry for this object in the Global MDVSN
 *
 * This function is called while the MDVerWriteLock is held in exclusive
 * mode. Don't do anything that is not allowed while holding a LWLock
 * (e.g. allocate memory, or call unsafe functions).
 */
static void
mdver_globalhandler_reconcile(mdver_event *event, CacheEntry *cached_entry)
{
	mdver_entry *current = CACHE_ENTRY_PAYLOAD(cached_entry);

#ifdef MD_VERSIONING_INSTRUMENTATION
	elog(gp_mdversioning_loglevel,
		 "Updating GlobalMDVSN entry %d: Current (%d,%d). Event: [(%d,%d)->(%d,%d)]",
		 event->key,
		 (int) current->ddl_version, (int) current->dml_version,
		 (int) event->old_ddl_version, (int) event->old_dml_version,
		 (int) event->new_ddl_version, (int) event->new_dml_version);
#endif

	/*
	 * Reading the cached payload without the entry lock is safe here: we
	 * hold the write lock on the Global MDVSN cache.
	 *
	 * A component is stale when the version the backend started from is not
	 * the one currently visible. Each stale component gets a freshly
	 * generated global version (ddl first, then dml); the others keep the
	 * version proposed by the event.
	 */
	bool ddl_stale = (current->ddl_version != event->old_ddl_version);
	bool dml_stale = (current->dml_version != event->old_dml_version);

	uint64 final_ddl_version = event->new_ddl_version;
	if (ddl_stale)
	{
		final_ddl_version = mdver_next_global_version();
	}

	uint64 final_dml_version = event->new_dml_version;
	if (dml_stale)
	{
		final_dml_version = mdver_next_global_version();
	}

	if (ddl_stale || dml_stale)
	{
#ifdef MD_VERSIONING_INSTRUMENTATION
		elog(gp_mdversioning_loglevel,
			 "Updating event in the queue (pid=%d, oid=%d): Old event: [(%d,%d)->(%d,%d)]. Modified event: [(%d,%d)->(%d,%d)]",
			 event->backend_pid, event->key,
			 /* Event as received */
			 (int) event->old_ddl_version, (int) event->old_dml_version,
			 (int) event->new_ddl_version, (int) event->new_dml_version,
			 /* Event as rewritten */
			 (int) current->ddl_version, (int) current->dml_version,
			 (int) final_ddl_version, (int) final_dml_version);
#endif

		/* Rewrite the queued event in place: it now produces the reconciled version ... */
		event->new_ddl_version = final_ddl_version;
		event->new_dml_version = final_dml_version;

		/* ... and starts from the version that is globally visible right now */
		event->old_ddl_version = current->ddl_version;
		event->old_dml_version = current->dml_version;
	}

	/* Publish the result. Hold the entry lock so both fields change atomically */
	Cache *glob_cache = mdver_get_glob_mdvsn();

	Cache_LockEntry(glob_cache, cached_entry);
	current->ddl_version = final_ddl_version;
	current->dml_version = final_dml_version;
	Cache_UnlockEntry(glob_cache, cached_entry);
}
/*
 * Internal version of the CacheRelease function, for entries that are in
 * the CACHED or DELETED state.
 *
 * Drops one pin on the entry. When that was the last pin and the entry is
 * marked for deletion, the entry is unlinked from its anchor chain, the
 * client cleanup callback (if any) is run, and the entry goes back to the
 * freelist. If the callback throws, the entry is still returned to the
 * freelist before the error is re-thrown.
 *
 * Unregisters the entry from the cleanup list if requested.
 */
static void
Cache_ReleaseCached(Cache *cache, CacheEntry *entry, bool unregisterCleanup)
{
	Assert(NULL != cache);
	Assert(NULL != entry);
	Assert(CACHE_ENTRY_CACHED == entry->state || CACHE_ENTRY_DELETED == entry->state);

	Cache_ComputeEntryHashcode(cache, entry);

	volatile CacheAnchor *chainAnchor = SyncHTLookup(cache->syncHashtable, &entry->hashvalue);
	Assert(chainAnchor != NULL);

	/* The anchor lock is taken first, then the entry lock */
	SpinLockAcquire(&chainAnchor->spinlock);
	Cache_LockEntry(cache, entry);

	uint32 pinsLeft = Cache_EntryDecRef(cache, entry);

	/* Last pin dropped on an entry marked for deletion: it can be reclaimed */
	bool reclaimEntry = (0 == pinsLeft && CACHE_ENTRY_DELETED == entry->state);
	if (reclaimEntry)
	{
		Cache_UnlinkEntry(cache, (CacheAnchor *) chainAnchor, entry);
		Cache_UpdatePerfCounter(&cache->cacheHdr->cacheStats.noDeletedEntries, -1 /* delta */);
	}

	Cache_UnlockEntry(cache, entry);
	SpinLockRelease(&chainAnchor->spinlock);

	/*
	 * Hand the anchor back to the hashtable. The 'removed' return value is
	 * deliberately ignored: either outcome is fine.
	 */
	SyncHTRelease(cache->syncHashtable, (void *) chainAnchor);

	if (unregisterCleanup)
	{
		/* Caller asked for the entry to be taken off the cleanup list */
		Cache_UnregisterCleanup(cache, entry);
	}

	if (!reclaimEntry)
	{
		/* Entry is still pinned or still cached; nothing more to do */
		return;
	}

	if (NULL != cache->cleanupEntry)
	{
		PG_TRY();
		{
			/* Give the client a chance to clean up the payload before the entry is recycled */
			cache->cleanupEntry(CACHE_ENTRY_PAYLOAD(entry));
		}
		PG_CATCH();
		{
			/* Cleanup failed. Still recycle the entry, under its lock, then propagate */
			Cache_LockEntry(cache, entry);

			Assert(CACHE_ENTRY_DELETED == entry->state);
			entry->state = CACHE_ENTRY_FREE;

#ifdef USE_ASSERT_CHECKING
			Cache_MemsetPayload(cache, entry);
#endif

			Cache_UnlockEntry(cache, entry);

			/* Put the entry back on the freelist */
			Cache_AddToFreelist(cache, entry);

			PG_RE_THROW();
		}
		PG_END_TRY();
	}

	/* Normal path: mark the entry free while holding its lock */
	Cache_LockEntry(cache, entry);

	entry->state = CACHE_ENTRY_FREE;

#ifdef USE_ASSERT_CHECKING
	Cache_MemsetPayload(cache, entry);
#endif

	Cache_UnlockEntry(cache, entry);

	/* Put the entry back on the freelist */
	Cache_AddToFreelist(cache, entry);
}
/* * Function returning all workfile cache entries for one segment */ Datum gp_workfile_mgr_cache_entries(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; int32 *crtIndexPtr; if (SRF_IS_FIRSTCALL()) { /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); /* Switch to memory context appropriate for multiple function calls */ MemoryContext oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* * Build a tuple descriptor for our result type * The number and type of attributes have to match the definition of the * view gp_workfile_mgr_cache_entries */ TupleDesc tupdesc = CreateTemplateTupleDesc(NUM_CACHE_ENTRIES_ELEM, false); Assert(NUM_CACHE_ENTRIES_ELEM == 12); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "segid", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "path", TEXTOID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "hash", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 4, "size", INT8OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 5, "state", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 6, "workmem", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 7, "optype", TEXTOID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 8, "slice", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 9, "sessionid", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 10, "commandid", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 11, "query_start", TIMESTAMPTZOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 12, "numfiles", INT4OID, -1 /* typmod */, 0 /* attdim */); funcctx->tuple_desc = BlessTupleDesc(tupdesc); crtIndexPtr = (int32 *) palloc(sizeof(*crtIndexPtr)); *crtIndexPtr = 0; 
funcctx->user_fctx = crtIndexPtr; MemoryContextSwitchTo(oldcontext); } Cache *cache = workfile_mgr_get_cache(); funcctx = SRF_PERCALL_SETUP(); crtIndexPtr = (int32 *) funcctx->user_fctx; while (true) { CacheEntry *crtEntry = next_entry_to_list(cache, crtIndexPtr); if (!crtEntry) { /* Reached the end of the entry array, we're done */ SRF_RETURN_DONE(funcctx); } Datum values[NUM_CACHE_ENTRIES_ELEM]; bool nulls[NUM_CACHE_ENTRIES_ELEM]; MemSet(nulls, 0, sizeof(nulls)); workfile_set *work_set = CACHE_ENTRY_PAYLOAD(crtEntry); char work_set_path[MAXPGPATH] = ""; char *work_set_operator_name = NULL; /* * Lock entry in order to read its payload * Don't call any functions that can get interrupted or * that palloc memory while holding this lock. */ Cache_LockEntry(cache, crtEntry); if (!should_list_entry(crtEntry)) { Cache_UnlockEntry(cache, crtEntry); continue; } values[0] = Int32GetDatum(GpIdentity.segindex); strlcpy(work_set_path, work_set->path, MAXPGPATH); values[2] = UInt32GetDatum(crtEntry->hashvalue); int64 work_set_size = work_set->size; if (crtEntry->state == CACHE_ENTRY_ACQUIRED) { /* * work_set->size is not updated until the entry is cached. * For in-progress queries, the up-to-date size is stored in * work_set->in_progress_size. */ work_set_size = work_set->in_progress_size; } values[3] = Int64GetDatum(work_set_size); values[4] = UInt32GetDatum(crtEntry->state); values[5] = UInt32GetDatum(work_set->metadata.operator_work_mem); work_set_operator_name = gp_workfile_operator_name(work_set->node_type); values[7] = UInt32GetDatum(work_set->slice_id); values[8] = UInt32GetDatum(work_set->session_id); values[9] = UInt32GetDatum(work_set->command_count); values[10] = TimestampTzGetDatum(work_set->session_start_time); values[11] = UInt32GetDatum(work_set->no_files); /* Done reading from the payload of the entry, release lock */ Cache_UnlockEntry(cache, crtEntry); /* * Fill in the rest of the entries of the tuple with data copied * from the descriptor. 
* CStringGetTextDatum calls palloc so we cannot do this while * holding the lock above. */ values[1] = CStringGetTextDatum(work_set_path); values[6] = CStringGetTextDatum(work_set_operator_name); HeapTuple tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); Datum result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } }