/*
 * Create a new workfile set and return it to the caller.
 *
 * type          - kind of work files the set will hold (BUFFILE or BFZ)
 * can_be_reused - caller's hint about whether the set is worth caching; in
 *                 this function it is only cross-checked against ps by an
 *                 assertion
 * ps            - PlanState of the operator that spills; may be NULL, in
 *                 which case the node type passed on is T_Invalid
 *
 * A directory for the set is created first. If the cache cannot hand out an
 * entry, that directory is removed again and an ERROR is raised.
 */
workfile_set *
workfile_mgr_create_set(enum ExecWorkFileType type, bool can_be_reused, PlanState *ps)
{
	Assert(NULL != workfile_mgr_cache);

	Plan	   *plan = (ps != NULL) ? ps->plan : NULL;

	AssertImply(can_be_reused, plan != NULL);

	NodeTag		node_type = (ps != NULL) ? ps->type : T_Invalid;
	char	   *dir_path = create_workset_directory(node_type, currentSliceId);

	/* Register the resource-release callback once */
	if (!workfile_sets_resowner_callback_registered)
	{
		RegisterResourceReleaseCallback(workfile_set_free_callback, NULL);
		workfile_sets_resowner_callback_registered = true;
	}

	/* Parameters handed to the cache's populate function */
	workset_info set_info;

	set_info.file_type = type;
	set_info.nodeType = node_type;
	set_info.dir_path = dir_path;
	set_info.session_start_time = GetCurrentTimestamp();
	set_info.operator_work_mem = get_operator_work_mem(ps);

	CacheEntry *acquired = Cache_AcquireEntry(workfile_mgr_cache, &set_info);

	if (NULL == acquired)
	{
		/* No free entry left in the cache: undo the directory creation */
		workfile_mgr_delete_set_directory(dir_path);

		ereport(ERROR,
				(errmsg("could not create workfile manager entry: exceeded number of concurrent spilling queries")));
	}

	/* The path was copied into the workfile_set, the local copy can go */
	pfree(dir_path);

	Assert(NULL != acquired);
	workfile_set *work_set = CACHE_ENTRY_PAYLOAD(acquired);

	Assert(work_set != NULL);

	elog(gp_workfile_caching_loglevel, "new spill file set. key=0x%x prefix=%s opMemKB=" INT64_FORMAT,
		 work_set->key,
		 work_set->path,
		 work_set->metadata.operator_work_mem);

	return work_set;
}
/*
 * Apply one versioning event to the Global MDVSN cache.
 *
 * A nuke event wipes the Global MDVSN and nothing else is done. Otherwise
 * the event's key is looked up: an existing entry is reconciled with the
 * event, a missing one is inserted with the event's new versions.
 *
 * event: The event containing the versioning information for an update
 */
static void
mdver_globalhandler_add_version(mdver_event *event)
{
	Assert(NULL != event);

	Cache	   *glob_mdvsn = mdver_get_glob_mdvsn();

	if (mdver_is_nuke_event(event))
	{
		mdver_glob_mdvsn_nuke();
		return;
	}

	/* Probe entry: the event's key with both versions invalid */
	mdver_entry probe = {InvalidOid, INVALID_MD_VERSION, INVALID_MD_VERSION};

	probe.key = event->key;

	/* FIXME gcaragea 04/14/2014: Trigger evictions if cache is full (MPP-22923) */
	CacheEntry *acquired_entry = Cache_AcquireEntry(glob_mdvsn, &probe);

	Assert(NULL != acquired_entry);

	/*
	 * The look-up and the insert/update below must not interleave with the
	 * same sequence in another backend, so take the writer lock exclusively.
	 */
	LWLockAcquire(MDVerWriteLock, LW_EXCLUSIVE);

	CacheEntry *cached_entry = Cache_Lookup(glob_mdvsn, acquired_entry);

	if (NULL == cached_entry)
	{
		/* Nothing cached for this key yet: publish the acquired entry */
		mdver_entry *fresh = CACHE_ENTRY_PAYLOAD(acquired_entry);

#ifdef MD_VERSIONING_INSTRUMENTATION
		elog(gp_mdversioning_loglevel, "Inserting into GlobalMDVSN entry %d: (%d,%d)",
			 event->key,
			 (int) event->new_ddl_version, (int) event->new_dml_version);
#endif

		fresh->ddl_version = event->new_ddl_version;
		fresh->dml_version = event->new_dml_version;

		Cache_Insert(glob_mdvsn, acquired_entry);
	}
	else
	{
		mdver_globalhandler_reconcile(event, cached_entry);

		/* Finished with the entry returned by the look-up */
		Cache_Release(glob_mdvsn, cached_entry);
	}

	Cache_Release(glob_mdvsn, acquired_entry);
	LWLockRelease(MDVerWriteLock);
}
/*
 * Look up a file set in the cache for the given PlanState.
 *
 * A temporary local entry is acquired and filled with the serialized plan
 * and its hash key, then used as the probe for Cache_Lookup.
 *
 * ps: PlanState of the operator; must be non-NULL and carry a plan.
 *
 * Returns the payload of the matching cached entry, or NULL if not found.
 */
static workfile_set *
workfile_mgr_lookup_set(PlanState *ps)
{
	Assert(NULL != ps);
	Assert(NULL != workfile_mgr_cache);
	Assert(NULL != ps->plan);
	Assert(nodeTag(ps->plan) >= T_Plan && nodeTag(ps->plan) < T_PlanInvalItem);

	/*
	 * Create parameter info for the populate function.
	 *
	 * Only a few fields are meaningful for a look-up probe. The whole struct
	 * is handed to the acquire/populate path, so clear it first: fields not
	 * set below must not carry indeterminate stack contents.
	 */
	workset_info set_info;

	MemSet(&set_info, 0, sizeof(set_info));
	set_info.dir_path = NULL;
	set_info.operator_work_mem = get_operator_work_mem(ps);
	set_info.on_disk = false;

	CacheEntry *localEntry = acquire_entry_retry(workfile_mgr_cache, &set_info);

	Assert(localEntry != NULL);

	workfile_set *local_work_set = (workfile_set *) CACHE_ENTRY_PAYLOAD(localEntry);

	/*
	 * Populate the rest of the entries needed for look-up.
	 * Allocate the serialized plan in the TopMemoryContext since this memory
	 * context is still available when calling the transaction callback at the
	 * time when the transaction aborts.
	 */
	MemoryContext oldcxt = MemoryContextSwitchTo(TopMemoryContext);
	workfile_set_plan *s_plan = workfile_mgr_serialize_plan(ps);

	MemoryContextSwitchTo(oldcxt);
	Assert(s_plan != NULL);

	local_work_set->set_plan = s_plan;
	local_work_set->key = workfile_mgr_hash_key(s_plan);

	CacheEntry *cachedEntry = Cache_Lookup(workfile_mgr_cache, localEntry);

	/* Release the local probe entry. We don't need it anymore */
	Cache_Release(workfile_mgr_cache, localEntry);

	workfile_set *work_set = NULL;

	if (NULL != cachedEntry)
	{
		work_set = (workfile_set *) CACHE_ENTRY_PAYLOAD(cachedEntry);
	}

	return work_set;
}
/*
 * Return a previously acquired entry to the cache freelist.
 *
 * Calls the client-specific cleanup callback (when one is installed) before
 * returning the entry to the freelist. If the callback raises an error, the
 * entry is still marked free and put back on the freelist before the error
 * is re-thrown.
 *
 * cache:             cache that owns the entry
 * entry:             entry in CACHE_ENTRY_ACQUIRED state
 * unregisterCleanup: unregister the entry from the cleanup list first
 */
static void
Cache_ReleaseAcquired(Cache *cache, CacheEntry *entry, bool unregisterCleanup)
{
	Assert(NULL != cache);
	Assert(NULL != entry);
	Assert(CACHE_ENTRY_ACQUIRED == entry->state);

	/* Unregister entry from the cleanup list if requested */
	if (unregisterCleanup)
	{
		Cache_UnregisterCleanup(cache, entry);
	}

	/*
	 * The cleanup callback is optional; Cache_Evict and Cache_ReleaseCached
	 * guard the call the same way.
	 */
	if (NULL != cache->cleanupEntry)
	{
		PG_TRY();
		{
			/* Call client-specific cleanup function before removing entry from cache */
			cache->cleanupEntry(CACHE_ENTRY_PAYLOAD(entry));
		}
		PG_CATCH();
		{
			/* Grab entry lock to ensure exclusive access to it while we're touching it */
			Cache_LockEntry(cache, entry);

			/* No need for atomic operations as long as we hold the entry lock */
			entry->state = CACHE_ENTRY_FREE;

#ifdef USE_ASSERT_CHECKING
			Cache_MemsetPayload(cache, entry);
#endif

			Cache_UnlockEntry(cache, entry);

			/* Link entry back in the freelist */
			Cache_AddToFreelist(cache, entry);

			/*
			 * The entry is no longer acquired on this path either; keep the
			 * statistics in step with the normal path below.
			 */
			Cache_DecPerfCounter(&cache->cacheHdr->cacheStats.noAcquiredEntries, 1 /* delta */ );

			PG_RE_THROW();
		}
		PG_END_TRY();
	}

	/* Grab entry lock to ensure exclusive access to it while we're touching it */
	Cache_LockEntry(cache, entry);

	/* No need for atomic operations as long as we hold the entry lock */
	entry->state = CACHE_ENTRY_FREE;

#ifdef USE_ASSERT_CHECKING
	Cache_MemsetPayload(cache, entry);
#endif

	Cache_UnlockEntry(cache, entry);

	Cache_AddToFreelist(cache, entry);

	Cache_DecPerfCounter(&cache->cacheHdr->cacheStats.noAcquiredEntries, 1 /* delta */ );
}
/*
 * Compute and store the hash code of a cache entry.
 *
 * The key is located keyOffset bytes into the entry's payload and is
 * keySize bytes long (both taken from the cache header). The cache's own
 * hash function is applied to it and the result is saved in
 * entry->hashvalue.
 */
static void
Cache_ComputeEntryHashcode(Cache *cache, CacheEntry *entry)
{
	Assert(NULL != cache);
	Assert(NULL != entry);

	char	   *payloadBytes = (char *) CACHE_ENTRY_PAYLOAD(entry);
	void	   *keyStart = (void *) (payloadBytes + cache->cacheHdr->keyOffset);

	entry->hashvalue = cache->hash(keyStart, cache->cacheHdr->keySize);
}
/*
 * Look up an entry in the Global MDVSN component.
 *
 * To avoid any concurrency issues, this returns a copy of the entry,
 * palloc'ed in the current memory context. The caller is responsible
 * for freeing this copy.
 *
 * oid: key of the object to look up
 *
 * Returns a copy of the entry if found, NULL otherwise.
 */
mdver_entry *
mdver_glob_mdvsn_find(Oid oid)
{
	Assert(NULL != mdver_glob_mdvsn);

	/*
	 * Build the probe with every field set, as the other Global MDVSN
	 * look-ups in this file do. The whole struct is passed on as the
	 * populate parameter, so the version fields must not be left
	 * uninitialized.
	 */
	mdver_entry mdver_info = {InvalidOid, INVALID_MD_VERSION, INVALID_MD_VERSION};

	mdver_info.key = oid;

	/* FIXME gcaragea 03/18/2014: Trigger evictions if cache is full (MPP-22923) */
	CacheEntry *localEntry = Cache_AcquireEntry(mdver_glob_mdvsn, &mdver_info);

	Assert(NULL != localEntry);

	CacheEntry *cachedEntry = Cache_Lookup(mdver_glob_mdvsn, localEntry);

	/* Release local entry. We don't need it anymore */
	Cache_Release(mdver_glob_mdvsn, localEntry);

	mdver_entry *mdver_copy = NULL;

	if (NULL != cachedEntry)
	{
		/* Found a match. Make a local copy */
		mdver_entry *shared_mdver = (mdver_entry *) CACHE_ENTRY_PAYLOAD(cachedEntry);

		/* Allocate before taking the entry lock */
		mdver_copy = (mdver_entry *) palloc0(sizeof(mdver_entry));

		/* Lock entry to ensure atomicity of copy */
		Cache_LockEntry(mdver_glob_mdvsn, cachedEntry);

		memcpy(mdver_copy, shared_mdver, sizeof(mdver_entry));

		/* Got the copy, unlock entry */
		Cache_UnlockEntry(mdver_glob_mdvsn, cachedEntry);

		/*
		 * We're also done with the entry, release our pincount on it
		 *
		 * TODO gcaragea 05/02/2014: Are there cases where we need to hold the
		 * entry past this point? (MPP-22923)
		 */
		Cache_Release(mdver_glob_mdvsn, cachedEntry);
	}

	return mdver_copy;
}
/*
 * Hand out a fresh cache entry taken from the pre-allocated freelist.
 *
 * The entry moves FREE -> RESERVED, is filled in by the cache's
 * populateEntry callback (if one is set) with populate_param as its opaque
 * argument, then moves RESERVED -> ACQUIRED and is registered for cleanup.
 * The client must afterwards either insert the entry in the cache or
 * surrender it.
 *
 * Returns NULL if the freelist is empty.
 */
CacheEntry *
Cache_AcquireEntry(Cache *cache, void *populate_param)
{
	Assert(NULL != cache);

	CacheEntry *slot = Cache_GetFreeElement(cache);

	if (NULL == slot)
	{
		return NULL;
	}

	CACHE_ASSERT_WIPED(slot);

	uint32		expected = CACHE_ENTRY_FREE;
	bool		swapped;

	swapped = pg_atomic_compare_exchange_u32((pg_atomic_uint32 *) &slot->state,
											  &expected,
											  CACHE_ENTRY_RESERVED);
	Assert(swapped);
	(void) swapped;				/* only inspected by the assertion */

	/*
	 * While RESERVED, no one else reads this entry (not even the views), so
	 * it can be populated without taking the entry lock.
	 */
	if (cache->populateEntry)
	{
		cache->populateEntry(CACHE_ENTRY_PAYLOAD(slot), populate_param);
	}

	expected = CACHE_ENTRY_RESERVED;
	swapped = pg_atomic_compare_exchange_u32((pg_atomic_uint32 *) &slot->state,
											  &expected,
											  CACHE_ENTRY_ACQUIRED);
	Assert(swapped);
	(void) swapped;

	Cache_RegisterCleanup(cache, slot, false /* isCachedEntry */ );

	return slot;
}
/*
 * Run cache eviction algorithm
 *
 * Sweeps the entries with a clock hand, decrementing the utility of every
 * cached, unpinned entry it visits and evicting those whose utility reaches
 * zero. It will try to evict enough entries to add up to evictRequestSize.
 *
 * cache:            cache to evict from
 * evictRequestSize: total entry size the caller wants evicted (must be > 0)
 *
 * Returns the actual accumulated size of the entries evicted. This can be
 * less than requested: the sweep gives up once the clock hand has wrapped
 * around more than maxClockLoops times without decrementing any utility.
 */
int64
Cache_Evict(Cache *cache, int64 evictRequestSize)
{
	Assert(NULL != cache);
	Assert(evictRequestSize > 0);

	Cache_TimedOperationStart();

	int64 evictedSize = 0;
	/* Wraparounds since the last time some entry's utility was decremented */
	uint32 unsuccessfulLoops = 0;
	/* Whether an entry was evicted since the last wraparound */
	bool foundVictim = false;
	uint32 decAmount = cache->cacheHdr->policyContext.utilityDecrement;
	Cache_Stats *cacheStats = &cache->cacheHdr->cacheStats;

	while (true)
	{
		bool wraparound = false;
		int32 entryIdx = Cache_NextClockHand(cache, &wraparound);
		Assert(entryIdx < cache->cacheHdr->nEntries);

		Cache_UpdatePerfCounter(&cacheStats->noEntriesScanned,1 /* delta */);

		if (wraparound)
		{
			unsuccessfulLoops++;

			Cache_UpdatePerfCounter(&cacheStats->noWraparound, 1 /* delta */);

			if (!foundVictim)
			{
				/*
				 * We looped around and did not manage to evict any entries.
				 * Double the amount we decrement eviction candidate's utility by.
				 * This makes the eviction algorithm look for a victim more aggressively.
				 * The amount is capped at CACHE_MAX_UTILITY.
				 */
				if (decAmount <= CACHE_MAX_UTILITY / 2)
				{
					decAmount = 2 * decAmount;
				}
				else
				{
					decAmount = CACHE_MAX_UTILITY;
				}
			}
			foundVictim = false;

			if (unsuccessfulLoops > cache->cacheHdr->policyContext.maxClockLoops)
			{
				/*
				 * Can't find any cached and unused entries candidates for evictions, even after looping around
				 * maxClockLoops times. Give up looking for victims.
				 */
				Cache_TimedOperationRecord(&cacheStats->timeEvictions, &cacheStats->maxTimeEvict);
				break;
			}
		}

		CacheEntry *crtEntry = Cache_GetEntryByIndex(cache->cacheHdr, entryIdx);
		if (crtEntry->state != CACHE_ENTRY_CACHED)
		{
			/* Not interested in free/acquired/deleted entries. Go back and advance clock hand */
			continue;
		}

		CacheAnchor *anchor = (CacheAnchor *) SyncHTLookup(cache->syncHashtable, &crtEntry->hashvalue);
		if (NULL == anchor)
		{
			/* There's no anchor for this entry, someone might have snatched it in the meantime */
			continue;
		}

		/* From here on, every exit from this iteration must SyncHTRelease the anchor */
		SpinLockAcquire(&anchor->spinlock);

		if (crtEntry->state != CACHE_ENTRY_CACHED)
		{
			/* Someone freed this entry in the meantime, before we got a chance to acquire the anchor lock */
			SpinLockRelease(&anchor->spinlock);
			SyncHTRelease(cache->syncHashtable, (void *) anchor);
			continue;
		}

		/* Ok, did all the checks, this entry must be valid now */
		CACHE_ASSERT_VALID(crtEntry);

		if (crtEntry->pinCount > 0)
		{
			/* Entry is in use and can't be evicted. Go back and advance clock hand */
			SpinLockRelease(&anchor->spinlock);
			SyncHTRelease(cache->syncHashtable, (void *) anchor);
			continue;
		}

		/* Decrement utility */
		gp_atomic_dec_positive_32(&crtEntry->utility, decAmount);
		/* Just decremented someone's utility. Reset our unsuccessful loops counter */
		unsuccessfulLoops = 0;

		if (crtEntry->utility > 0)
		{
			/* Entry has non-zero utility, we shouldn't evict it. Go back and advance clock hand */
			SpinLockRelease(&anchor->spinlock);
			SyncHTRelease(cache->syncHashtable, (void *) anchor);
			continue;
		}

		/* Found our victim */
		Assert(0 == crtEntry->pinCount);
		CACHE_ASSERT_VALID(crtEntry);
		Assert(crtEntry->utility == 0);

		/* Mark the entry DELETED while still holding the anchor lock */
#if USE_ASSERT_CHECKING
		int32 casResult =
#endif
		compare_and_swap_32(&crtEntry->state, CACHE_ENTRY_CACHED, CACHE_ENTRY_DELETED);
		Assert(1 == casResult);

		SpinLockRelease(&anchor->spinlock);
		foundVictim = true;
		evictedSize += crtEntry->size;

		/* Don't update noFreeEntries yet. It will be done in Cache_AddToFreelist */
		Cache_UpdatePerfCounter(&cacheStats->noCachedEntries, -1 /* delta */);

		/* Unlink entry from the anchor chain */
		SpinLockAcquire(&anchor->spinlock);
		Cache_UnlinkEntry(cache, anchor, crtEntry);
		SpinLockRelease(&anchor->spinlock);

		SyncHTRelease(cache->syncHashtable, (void *) anchor);

		/*
		 * NOTE(review): unlike Cache_ReleaseCached, the cleanup callback is
		 * not wrapped in PG_TRY here -- confirm whether an error raised by
		 * the callback can leave this entry unlinked and in DELETED state.
		 */
		if (NULL != cache->cleanupEntry)
		{
			/* Call client-side cleanup for entry */
			cache->cleanupEntry(CACHE_ENTRY_PAYLOAD(crtEntry));
		}

		/* Flip the entry to FREE under the entry lock, then recycle it */
		Cache_LockEntry(cache, crtEntry);

		Assert(crtEntry->state == CACHE_ENTRY_DELETED);
		crtEntry->state = CACHE_ENTRY_FREE;

#if USE_ASSERT_CHECKING
		Cache_MemsetPayload(cache, crtEntry);
#endif

		Cache_UnlockEntry(cache, crtEntry);

		Cache_AddToFreelist(cache, crtEntry);

		Cache_UpdatePerfCounter(&cacheStats->noEvicts, 1 /* delta */);
		Cache_TimedOperationRecord(&cacheStats->timeEvictions, &cacheStats->maxTimeEvict);

		if (evictedSize >= evictRequestSize)
		{
			/* We evicted as much as requested */
			break;
		}

		/* Restart the timer for the next eviction */
		Cache_TimedOperationStart();
	}

	return evictedSize;
}
/*
 * Fetch the version of an object from the Global MDVSN shared cache.
 *
 * Used when the Local MDVSN cache has no version for the object and no NUKE
 * event has been seen in the current transaction.
 *
 * If the Global MDVSN holds the object, its versions are returned. If not,
 * new versions are generated, recorded in the Global MDVSN and returned.
 *
 * key: The key of the looked-up object
 * ddl_version: used to return the ddl version for the object
 * dml_version: used to return the dml version for the object
 */
static void
mdver_request_from_global(Oid key, uint64 *ddl_version, uint64 *dml_version)
{
	Assert(NULL != ddl_version);
	Assert(NULL != dml_version);

	Cache	   *mdver_glob_mdvsn = mdver_get_glob_mdvsn();

	Assert(NULL != mdver_glob_mdvsn);

	mdver_entry probe = {key, INVALID_MD_VERSION, INVALID_MD_VERSION};

	/* FIXME gcaragea 06/03/2014: Trigger evictions if cache is full (MPP-22923) */
	CacheEntry *localEntry = Cache_AcquireEntry(mdver_glob_mdvsn, &probe);

	Assert(NULL != localEntry);

	/*
	 * Look-up followed by insert must not race with another backend doing
	 * the same for this entry, so hold the writer lock exclusively.
	 */
	LWLockAcquire(MDVerWriteLock, LW_EXCLUSIVE);

	CacheEntry *cachedEntry = Cache_Lookup(mdver_glob_mdvsn, localEntry);

	if (NULL == cachedEntry)
	{
		/* Absent from the Global MDVSN as well: generate new versions */
		*ddl_version = mdver_next_global_version();
		*dml_version = mdver_next_global_version();

		/* Record them in the Global MDVSN through the local entry */
		mdver_entry *fresh = CACHE_ENTRY_PAYLOAD(localEntry);

		fresh->ddl_version = *ddl_version;
		fresh->dml_version = *dml_version;

#ifdef MD_VERSIONING_INSTRUMENTATION
		elog(gp_mdversioning_loglevel, "Inserting new version in Global MDVSN: (%d, " UINT64_FORMAT ", " UINT64_FORMAT "). Adding it to Local MDVSN",
			 key, fresh->ddl_version, fresh->dml_version);
#endif

		Cache_Insert(mdver_glob_mdvsn, localEntry);
	}
	else
	{
		/* The Global MDVSN already has this object: report its versions */
		mdver_entry *shared = CACHE_ENTRY_PAYLOAD(cachedEntry);

		*ddl_version = shared->ddl_version;
		*dml_version = shared->dml_version;

#ifdef MD_VERSIONING_INSTRUMENTATION
		elog(gp_mdversioning_loglevel, "Found version in Global MDVSN: (%d, " UINT64_FORMAT ", " UINT64_FORMAT "). Adding it to Local MDVSN",
			 key, shared->ddl_version, shared->dml_version);
#endif

		/*
		 * Done with the looked-up entry; drop our pin on it.
		 *
		 * TODO gcaragea 05/02/2014: Are there cases where we need to hold the
		 * entry past this point? (MPP-22923)
		 */
		Cache_Release(mdver_glob_mdvsn, cachedEntry);
	}

	LWLockRelease(MDVerWriteLock);

	/* The local entry is not needed any longer */
	Cache_Release(mdver_glob_mdvsn, localEntry);
}
/*
 * Reconcile an incoming versioning event with the Global MDVSN entry that
 * already exists for the same object.
 *
 * The event carries the old and new version as seen by the originating
 * backend, VE = (key, oldV, newV); the cached entry carries the globally
 * visible version, entry = (key, crtV). DDL and DML versions are handled
 * independently:
 *
 * - oldV equal to crtV: nobody else changed the object, so the entry simply
 *   takes the event's new version:
 *     entry = (key, crtV) --> entry = (key, newV)
 *
 * - oldV different from crtV: another backend changed the object in the
 *   meantime. A brand new version new_newV is generated, the entry is set
 *   to it, and the event is rewritten in place:
 *     entry = (key, crtV)   --> entry = (key, new_newV)
 *     VE = (key, oldV, newV) --> VE = (key, crtV, new_newV)
 *
 * event: The event containing the versioning information for an update
 * cached_entry: The existing entry for this object in the Global MDVSN
 *
 * Called with MDVerWriteLock held in exclusive mode. Don't do anything that
 * is not allowed while holding a LWLock (e.g. allocate memory, or call
 * unsafe functions).
 */
static void
mdver_globalhandler_reconcile(mdver_event *event, CacheEntry *cached_entry)
{
	mdver_entry *current = CACHE_ENTRY_PAYLOAD(cached_entry);

#ifdef MD_VERSIONING_INSTRUMENTATION
	elog(gp_mdversioning_loglevel, "Updating GlobalMDVSN entry %d: Current (%d,%d). Event: [(%d,%d)->(%d,%d)]",
		 event->key,
		 (int) current->ddl_version, (int) current->dml_version,
		 (int) event->old_ddl_version, (int) event->old_dml_version,
		 (int) event->new_ddl_version, (int) event->new_dml_version);
#endif

	/* Start from the versions proposed by the event */
	uint64		resolved_ddl = event->new_ddl_version;
	uint64		resolved_dml = event->new_dml_version;

	/*
	 * Reading the cached payload without the entry lock is fine here: the
	 * Global MDVSN write lock is held. A mismatch between the event's old
	 * version and the cached one is a conflict, resolved with a new version.
	 */
	const bool	ddl_stale = (current->ddl_version != event->old_ddl_version);

	if (ddl_stale)
	{
		resolved_ddl = mdver_next_global_version();
	}

	const bool	dml_stale = (current->dml_version != event->old_dml_version);

	if (dml_stale)
	{
		resolved_dml = mdver_next_global_version();
	}

	if (ddl_stale || dml_stale)
	{
#ifdef MD_VERSIONING_INSTRUMENTATION
		elog(gp_mdversioning_loglevel, "Updating event in the queue (pid=%d, oid=%d): Old event: [(%d,%d)->(%d,%d)]. Modified event: [(%d,%d)->(%d,%d)]",
			 event->backend_pid,
			 event->key,
			 /* Old event */
			 (int) event->old_ddl_version, (int) event->old_dml_version,
			 (int) event->new_ddl_version, (int) event->new_dml_version,
			 /* New event */
			 (int) current->ddl_version, (int) current->dml_version,
			 (int) resolved_ddl, (int) resolved_dml);
#endif

		/* Rewrite the queued event in place with the generated versions */
		event->new_ddl_version = resolved_ddl;
		event->new_dml_version = resolved_dml;

		/* Its old version becomes the currently visible global version */
		event->old_ddl_version = current->ddl_version;
		event->old_dml_version = current->dml_version;
	}

	/* Update the cached entry under its entry lock so the change is atomic */
	Cache	   *glob_mdvsn = mdver_get_glob_mdvsn();

	Cache_LockEntry(glob_mdvsn, cached_entry);
	current->ddl_version = resolved_ddl;
	current->dml_version = resolved_dml;
	Cache_UnlockEntry(glob_mdvsn, cached_entry);
}
/*
 * Internal version of the CacheRelease function, for entries that are in
 * CACHED or DELETED state.
 *
 * Drops one reference on the entry. If that was the last reference and the
 * entry is marked DELETED, the entry is unlinked from its anchor chain, the
 * client cleanup callback (if any) is run, and the entry goes back to the
 * freelist. The entry is returned to the freelist even if the cleanup
 * callback raises an error; the error is then re-thrown.
 *
 * cache:             cache that owns the entry
 * entry:             entry to release
 * unregisterCleanup: unregister the entry from the cleanup list if true
 */
static void
Cache_ReleaseCached(Cache *cache, CacheEntry *entry, bool unregisterCleanup)
{
	Assert(NULL != cache);
	Assert(NULL != entry);
	Assert(CACHE_ENTRY_CACHED == entry->state || CACHE_ENTRY_DELETED == entry->state);

	/* Recompute the hash so the anchor for this entry can be located */
	Cache_ComputeEntryHashcode(cache, entry);

	volatile CacheAnchor *anchor = SyncHTLookup(cache->syncHashtable, &entry->hashvalue);
	Assert(anchor != NULL);

	/* Acquire anchor lock to touch the entry */
	SpinLockAcquire(&anchor->spinlock);
	Cache_LockEntry(cache, entry);

	uint32 pinCount = Cache_EntryDecRef(cache, entry);
	bool deleteEntry = false;

	if (pinCount == 0 && entry->state == CACHE_ENTRY_DELETED)
	{
		/* Delete the cache entry if pin-count = 0 and it is marked for deletion */
		Cache_UnlinkEntry(cache, (CacheAnchor *) anchor, entry);
		deleteEntry = true;

		Cache_UpdatePerfCounter(&cache->cacheHdr->cacheStats.noDeletedEntries, -1 /* delta */);
	}

	Cache_UnlockEntry(cache, entry);
	SpinLockRelease(&anchor->spinlock);

	/*
	 * Releasing anchor to hashtable.
	 * Ignoring 'removed' return value, both values are valid
	 */
	SyncHTRelease(cache->syncHashtable, (void *) anchor);

	/* If requested, unregister entry from the cleanup list */
	if (unregisterCleanup)
	{
		Cache_UnregisterCleanup(cache, entry);
	}

	/* The rest runs with neither the anchor spinlock nor the entry lock held */
	if (deleteEntry)
	{
		if (NULL != cache->cleanupEntry)
		{
			PG_TRY();
			{
				/* Call client-specific cleanup function before removing entry from cache */
				cache->cleanupEntry(CACHE_ENTRY_PAYLOAD(entry));
			}
			PG_CATCH();
			{
				/* Grab entry lock to ensure exclusive access to it while we're touching it */
				Cache_LockEntry(cache, entry);

				Assert(CACHE_ENTRY_DELETED == entry->state);
				entry->state = CACHE_ENTRY_FREE;

#ifdef USE_ASSERT_CHECKING
				Cache_MemsetPayload(cache, entry);
#endif

				Cache_UnlockEntry(cache, entry);

				/* Link entry back in the freelist */
				Cache_AddToFreelist(cache, entry);

				/* Entry has been recycled; now propagate the cleanup error */
				PG_RE_THROW();
			}
			PG_END_TRY();
		}

		/* Grab entry lock to ensure exclusive access to it while we're touching it */
		Cache_LockEntry(cache, entry);

		entry->state = CACHE_ENTRY_FREE;

#ifdef USE_ASSERT_CHECKING
		Cache_MemsetPayload(cache, entry);
#endif

		Cache_UnlockEntry(cache, entry);

		/* Link entry back in the freelist */
		Cache_AddToFreelist(cache, entry);
	}
}
/*
 * Look up an exact match for a cache entry.
 *
 * The probe entry's hash code is (re)computed, the anchor for that hash is
 * located, and the anchor's chain is walked. Entries marked DELETED are
 * skipped; every other entry is compared to the probe with the cache's
 * equivalentEntries callback.
 *
 * cache: cache to search
 * entry: probe entry whose payload holds the key to look for
 *
 * Returns the matching cache entry if found, NULL otherwise. A returned
 * entry has had a reference added and is registered for cleanup; the caller
 * is expected to release it when done.
 */
CacheEntry *
Cache_Lookup(Cache *cache, CacheEntry *entry)
{
	Assert(NULL != cache);
	Assert(NULL != entry);

	Cache_TimedOperationStart();
	Cache_UpdatePerfCounter(&cache->cacheHdr->cacheStats.noLookups, 1 /* delta */);
	/* Advance the clock for the replacement policy */
	Cache_AdvanceClock(cache);

	Cache_ComputeEntryHashcode(cache, entry);

	volatile CacheAnchor *anchor = SyncHTLookup(cache->syncHashtable, &entry->hashvalue);
	if (NULL == anchor)
	{
		/* No matching anchor found, there can't be a matching element in the cache */
		Cache_TimedOperationRecord(&cache->cacheHdr->cacheStats.timeLookups,
				&cache->cacheHdr->cacheStats.maxTimeLookup);
		return NULL;
	}

	/* Acquire anchor lock to touch the chain */
	SpinLockAcquire(&anchor->spinlock);

	CacheEntry *crtEntry = anchor->firstEntry;

	while (true)
	{
		while (NULL != crtEntry && crtEntry->state == CACHE_ENTRY_DELETED)
		{
			/* Skip over deleted entries */
			crtEntry = crtEntry->nextEntry;
		}

		if (NULL == crtEntry)
		{
			/*
			 * No valid entries found in the chain. Leave the loop instead of
			 * returning directly, so that the anchor obtained from
			 * SyncHTLookup is handed back through SyncHTRelease below, as on
			 * the match path.
			 */
			SpinLockRelease(&anchor->spinlock);
			break;
		}

		/* Found a valid entry. AddRef it and test to see if it matches */
		Cache_EntryAddRef(cache, crtEntry);

		SpinLockRelease(&anchor->spinlock);

		/* Register it for cleanup in case we get an error while testing for equality */
		Cache_RegisterCleanup(cache, crtEntry, true /* isCachedEntry */);

		Cache_UpdatePerfCounter(&cache->cacheHdr->cacheStats.noCompares, 1 /* delta */);

		if (cache->equivalentEntries(CACHE_ENTRY_PAYLOAD(entry), CACHE_ENTRY_PAYLOAD(crtEntry)))
		{
			/* Found the match, we're done */
			Cache_TouchEntry(cache, crtEntry);
			Cache_UpdatePerfCounter(&cache->cacheHdr->cacheStats.noCacheHits, 1 /* delta */);
			break;
		}

		/* Unregister it from cleanup since it wasn't the one */
		Cache_UnregisterCleanup(cache, crtEntry);

		/* Re-take the anchor lock to drop our reference and move on */
		SpinLockAcquire(&anchor->spinlock);

		Cache_EntryDecRef(cache, crtEntry);

		crtEntry = crtEntry->nextEntry;
	}

	/* ignoring return value, both values are valid */
	SyncHTRelease(cache->syncHashtable, (void *) anchor);

	Cache_TimedOperationRecord(&cache->cacheHdr->cacheStats.timeLookups,
			&cache->cacheHdr->cacheStats.maxTimeLookup);

	/* crtEntry is the match, or NULL when the chain was exhausted */
	return crtEntry;
}
/*
 * Overwrite an entry's payload with CACHE_MEMSET_BYTE_PATTERN, so that a
 * client cannot accidentally keep using the payload of an entry it has
 * surrendered. The number of bytes written is the cache header's entrySize.
 */
void
Cache_MemsetPayload(Cache *cache, CacheEntry *entry)
{
	void	   *wipeStart = CACHE_ENTRY_PAYLOAD(entry);

	MemSet(wipeStart, CACHE_MEMSET_BYTE_PATTERN, cache->cacheHdr->entrySize);
}
/* * Function returning all workfile cache entries for one segment */ Datum gp_workfile_mgr_cache_entries(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; int32 *crtIndexPtr; if (SRF_IS_FIRSTCALL()) { /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); /* Switch to memory context appropriate for multiple function calls */ MemoryContext oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* * Build a tuple descriptor for our result type * The number and type of attributes have to match the definition of the * view gp_workfile_mgr_cache_entries */ TupleDesc tupdesc = CreateTemplateTupleDesc(NUM_CACHE_ENTRIES_ELEM, false); Assert(NUM_CACHE_ENTRIES_ELEM == 12); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "segid", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "path", TEXTOID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "hash", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 4, "size", INT8OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 5, "state", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 6, "workmem", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 7, "optype", TEXTOID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 8, "slice", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 9, "sessionid", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 10, "commandid", INT4OID, -1 /* typmod */, 0 /* attdim */); TupleDescInitEntry(tupdesc, (AttrNumber) 11, "query_start", TIMESTAMPTZOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 12, "numfiles", INT4OID, -1 /* typmod */, 0 /* attdim */); funcctx->tuple_desc = BlessTupleDesc(tupdesc); crtIndexPtr = (int32 *) palloc(sizeof(*crtIndexPtr)); *crtIndexPtr = 0; 
funcctx->user_fctx = crtIndexPtr; MemoryContextSwitchTo(oldcontext); } Cache *cache = workfile_mgr_get_cache(); funcctx = SRF_PERCALL_SETUP(); crtIndexPtr = (int32 *) funcctx->user_fctx; while (true) { CacheEntry *crtEntry = next_entry_to_list(cache, crtIndexPtr); if (!crtEntry) { /* Reached the end of the entry array, we're done */ SRF_RETURN_DONE(funcctx); } Datum values[NUM_CACHE_ENTRIES_ELEM]; bool nulls[NUM_CACHE_ENTRIES_ELEM]; MemSet(nulls, 0, sizeof(nulls)); workfile_set *work_set = CACHE_ENTRY_PAYLOAD(crtEntry); char work_set_path[MAXPGPATH] = ""; char *work_set_operator_name = NULL; /* * Lock entry in order to read its payload * Don't call any functions that can get interrupted or * that palloc memory while holding this lock. */ Cache_LockEntry(cache, crtEntry); if (!should_list_entry(crtEntry)) { Cache_UnlockEntry(cache, crtEntry); continue; } values[0] = Int32GetDatum(GpIdentity.segindex); strlcpy(work_set_path, work_set->path, MAXPGPATH); values[2] = UInt32GetDatum(crtEntry->hashvalue); int64 work_set_size = work_set->size; if (crtEntry->state == CACHE_ENTRY_ACQUIRED) { /* * work_set->size is not updated until the entry is cached. * For in-progress queries, the up-to-date size is stored in * work_set->in_progress_size. */ work_set_size = work_set->in_progress_size; } values[3] = Int64GetDatum(work_set_size); values[4] = UInt32GetDatum(crtEntry->state); values[5] = UInt32GetDatum(work_set->metadata.operator_work_mem); work_set_operator_name = gp_workfile_operator_name(work_set->node_type); values[7] = UInt32GetDatum(work_set->slice_id); values[8] = UInt32GetDatum(work_set->session_id); values[9] = UInt32GetDatum(work_set->command_count); values[10] = TimestampTzGetDatum(work_set->session_start_time); values[11] = UInt32GetDatum(work_set->no_files); /* Done reading from the payload of the entry, release lock */ Cache_UnlockEntry(cache, crtEntry); /* * Fill in the rest of the entries of the tuple with data copied * from the descriptor. 
* CStringGetTextDatum calls palloc so we cannot do this while * holding the lock above. */ values[1] = CStringGetTextDatum(work_set_path); values[6] = CStringGetTextDatum(work_set_operator_name); HeapTuple tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); Datum result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } }
/* * Create a new file set * type is the WorkFileType for the files: BUFFILE or BFZ * can_be_reused: if set to false, then we don't insert this set into the cache, * since the caller is telling us there is no point. This can happen for * example when spilling during index creation. * ps is the PlanState for the subtree rooted at the operator * snapshot contains snapshot information for the current transaction * */ workfile_set * workfile_mgr_create_set(enum ExecWorkFileType type, bool can_be_reused, PlanState *ps, workfile_set_snapshot snapshot) { Assert(NULL != workfile_mgr_cache); Plan *plan = NULL; if (ps != NULL) { plan = ps->plan; } AssertImply(can_be_reused, plan != NULL); NodeTag node_type = T_Invalid; if (ps != NULL) { node_type = ps->type; } char *dir_path = create_workset_directory(node_type, currentSliceId); /* Create parameter info for the populate function */ workset_info set_info; set_info.file_type = type; set_info.snapshot = snapshot; set_info.nodeType = node_type; set_info.can_be_reused = can_be_reused && workfile_mgr_is_reusable(ps); set_info.dir_path = dir_path; set_info.session_start_time = GetCurrentTimestamp(); set_info.operator_work_mem = get_operator_work_mem(ps); set_info.on_disk = true; CacheEntry *newEntry = NULL; PG_TRY(); { newEntry = acquire_entry_retry(workfile_mgr_cache, &set_info); } PG_CATCH(); { /* Failed to acquire new entry, cache full. Clean up the directory we created. */ workfile_mgr_delete_set_directory(dir_path); PG_RE_THROW(); } PG_END_TRY(); /* Path has now been copied to the workfile_set. 
We can free it */ pfree(dir_path); /* Complete initialization of the entry with post-acquire actions */ Assert(NULL != newEntry); workfile_set *work_set = CACHE_ENTRY_PAYLOAD(newEntry); Assert(work_set != NULL); if (work_set->can_be_reused) { Assert(plan != NULL); Assert(nodeTag(plan) >= T_Plan && nodeTag(plan) < T_PlanInvalItem); workfile_set_plan *s_plan = workfile_mgr_serialize_plan(ps); work_set->key = workfile_mgr_hash_key(s_plan); workfile_mgr_save_plan(work_set, s_plan); workfile_mgr_free_plan(s_plan); } elog(gp_workfile_caching_loglevel, "new spill file set. key=0x%x can_be_reused=%d prefix=%s opMemKB=" INT64_FORMAT, work_set->key, work_set->can_be_reused, work_set->path, work_set->metadata.operator_work_mem); return work_set; }