Datum currtid_byreloid(PG_FUNCTION_ARGS) { Oid reloid = PG_GETARG_OID(0); ItemPointer tid = PG_GETARG_ITEMPOINTER(1); ItemPointer result; Relation rel; result = (ItemPointer) palloc(sizeof(ItemPointerData)); if (!reloid) { *result = Current_last_tid; PG_RETURN_ITEMPOINTER(result); } rel = heap_open(reloid, AccessShareLock); if (rel->rd_rel->relkind == RELKIND_VIEW) return currtid_for_view(rel, tid); ItemPointerCopy(tid, result); heap_get_latest_tid(rel, SnapshotNow, result); heap_close(rel, AccessShareLock); PG_RETURN_ITEMPOINTER(result); }
Datum currtid_byreloid(PG_FUNCTION_ARGS) { Oid reloid = PG_GETARG_OID(0); ItemPointer tid = PG_GETARG_ITEMPOINTER(1); ItemPointer result; Relation rel; AclResult aclresult; result = (ItemPointer) palloc(sizeof(ItemPointerData)); if (!reloid) { *result = Current_last_tid; PG_RETURN_ITEMPOINTER(result); } rel = heap_open(reloid, AccessShareLock); aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(), ACL_SELECT); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_CLASS, RelationGetRelationName(rel)); if (rel->rd_rel->relkind == RELKIND_VIEW) return currtid_for_view(rel, tid); ItemPointerCopy(tid, result); heap_get_latest_tid(rel, SnapshotNow, result); heap_close(rel, AccessShareLock); PG_RETURN_ITEMPOINTER(result); }
/*
 * PersistentTablespace_ActivateStandby
 *
 * Walk every entry of the persistent tablespace shared hash table and
 * record the standby activation (oldmaster -> newmaster) for each
 * tablespace directory via PersistentFileSysObj_ActivateStandby().
 *
 * Must not run before persistence work is allowed.  The ordered
 * persistent-state write lock is held for the entire scan; the
 * TablespaceHashLock is dropped and re-taken around each per-entry update
 * (see the in-loop comment).
 */
void
PersistentTablespace_ActivateStandby(int16 oldmaster, int16 newmaster)
{
	TablespaceDirEntry tablespaceDirEntry;
	HASH_SEQ_STATUS hstat;

	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	hash_seq_init(&hstat, persistentTablespaceSharedHashTable);

	if (Persistent_BeforePersistenceWork())
		elog(ERROR, "persistent table changes forbidden");

	PersistentTablespace_VerifyInitScan();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	LWLockAcquire(TablespaceHashLock, LW_SHARED);
	while ((tablespaceDirEntry = hash_seq_search(&hstat)) != NULL)
	{
		PersistentFileSysObjName fsObjName;
		Oid			tblspc = tablespaceDirEntry->key.tablespaceOid;
		ItemPointerData persistentTid;
		uint64		persistentSerialNum;

		/* Re-look-up the entry so its TID/serial are read under the lock. */
		tablespaceDirEntry = PersistentTablespace_FindEntryUnderLock(tblspc);
		if (tablespaceDirEntry == NULL)
			elog(ERROR, "cannot find persistent tablespace entry %u",
				 tblspc);

		persistentSerialNum = tablespaceDirEntry->persistentSerialNum;
		ItemPointerCopy(&tablespaceDirEntry->persistentTid, &persistentTid);

		/*
		 * We release TablespaceHashLock in the middle of the loop and
		 * re-acquire it after doing persistent table change. This is needed
		 * to prevent holding the lock for any purpose other than to protect
		 * the tablespace shared hash table. Not releasing this lock could
		 * result in file I/O and potential deadlock due to other LW locks
		 * being acquired in the process. Releasing the lock this way is safe
		 * because we are still holding PersistentObjLock in exclusive mode.
		 * Any change to the filespace shared hash table is also protected by
		 * PersistentObjLock.
		 */
		LWLockRelease(TablespaceHashLock);
		PersistentFileSysObjName_SetTablespaceDir(&fsObjName, tblspc);
		PersistentFileSysObj_ActivateStandby(&fsObjName,
											 &persistentTid,
											 persistentSerialNum,
											 oldmaster,
											 newmaster,
											 /* flushToXlog */ false);
		LWLockAcquire(TablespaceHashLock, LW_SHARED);
	}
	LWLockRelease(TablespaceHashLock);

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;
}
Datum currtid_byrelname(PG_FUNCTION_ARGS) { text *relname = PG_GETARG_TEXT_P(0); ItemPointer tid = PG_GETARG_ITEMPOINTER(1); ItemPointer result; RangeVar *relrv; Relation rel; AclResult aclresult; Snapshot snapshot; relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); rel = heap_openrv(relrv, AccessShareLock); aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(), ACL_SELECT); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_CLASS, RelationGetRelationName(rel)); if (rel->rd_rel->relkind == RELKIND_VIEW || rel->rd_rel->relkind == RELKIND_CONTVIEW) return currtid_for_view(rel, tid); result = (ItemPointer) palloc(sizeof(ItemPointerData)); ItemPointerCopy(tid, result); snapshot = RegisterSnapshot(GetLatestSnapshot()); heap_get_latest_tid(rel, snapshot, result); UnregisterSnapshot(snapshot); heap_close(rel, AccessShareLock); PG_RETURN_ITEMPOINTER(result); }
/* * Fetches the next entry from a visimap store index scan. * * Parameter visiMapEntry may be NULL. If it is not NULL and * the scan returns an entry, the entry data is copied to the * visimapEntry. * Parameter tupleTid may be NULL. If it is not NULL and the scan * returns an entry, the (heap) tuple id is copied to the parameter. */ bool AppendOnlyVisimapStore_GetNext( AppendOnlyVisimapStore *visiMapStore, IndexScanDesc indexScan, ScanDirection scanDirection, AppendOnlyVisimapEntry* visiMapEntry, ItemPointerData *tupleTid) { HeapTuple tuple; TupleDesc heapTupleDesc; Assert(visiMapStore); Assert(RelationIsValid(visiMapStore->visimapRelation)); Assert(RelationIsValid(visiMapStore->visimapIndex)); Assert(indexScan); tuple = AppendOnlyVisimapStore_GetNextTuple(visiMapStore, indexScan, scanDirection); if (tuple == NULL) { return false; } heapTupleDesc = RelationGetDescr(visiMapStore->visimapRelation); if (visiMapEntry) { AppendOnlyVisimapEntry_Copyout(visiMapEntry, tuple, heapTupleDesc); } if (tupleTid) { ItemPointerCopy(&tuple->t_self, tupleTid); } return true; }
/*
 * SharedStorageOpAddTask
 *
 * Append a task (to be performed on all segments) to 'tasks', growing the
 * task array when it is full.  The relation name is copied into memory
 * owned by the task list.
 *
 * Fix: the array was grown by plain doubling; if sizeTasks was ever 0 the
 * doubled capacity stayed 0 and the subsequent store at index numTasks
 * wrote past the (empty) allocation.  Growth now yields at least one slot.
 */
void
SharedStorageOpAddTask(const char *relname, RelFileNode *node, int32 segno,
					   ItemPointer persistentTid, int64 persistentSerialNum,
					   SharedStorageOpTasks *tasks)
{
	RelFileNode *n;

	Assert(NULL != node && NULL != tasks);
	Assert(tasks->sizeTasks >= tasks->numTasks);

	/* Grow the array if it is full; never grow to a zero capacity. */
	if (tasks->sizeTasks == tasks->numTasks)
	{
		int			newSize = (tasks->sizeTasks > 0) ? tasks->sizeTasks * 2 : 1;

		tasks->tasks = repalloc(tasks->tasks,
								newSize * sizeof(SharedStorageOpTask));
		tasks->sizeTasks = newSize;
	}

	n = &tasks->tasks[tasks->numTasks].node;
	n->dbNode = node->dbNode;
	n->relNode = node->relNode;
	n->spcNode = node->spcNode;

	tasks->tasks[tasks->numTasks].segno = segno;

	/* Private copy of the relation name, owned by the task list. */
	tasks->tasks[tasks->numTasks].relname = palloc(strlen(relname) + 1);
	strcpy(tasks->tasks[tasks->numTasks].relname, relname);

	ItemPointerCopy(persistentTid,
					&tasks->tasks[tasks->numTasks].persistentTid);
	tasks->tasks[tasks->numTasks].persistentSerialNum = persistentSerialNum;

	tasks->numTasks++;
}
/*
 * PersistentStore_GetFreeTuple
 *
 * Pop the head of the store's free-tuple list into *freeTid.
 *
 * Returns false (with *freeTid zeroed) when there are no free tuples, when
 * the gp_persistent_skip_free_list GUC is on, or when free-TID validation
 * fails.  On success the shared free-list head advances to the previous
 * free TID recorded in the popped entry and maxFreeOrderNum is decremented.
 *
 * Caller is expected to hold the appropriate persistent-store lock; this
 * routine mutates storeSharedData directly.
 */
static bool
PersistentStore_GetFreeTuple(
	PersistentStoreData *storeData,
	PersistentStoreSharedData *storeSharedData,
	ItemPointer freeTid)
{
	ItemPointerData previousFreeTid;

	/* Zero the output so callers see an invalid TID on every false return. */
	MemSet(freeTid, 0, sizeof(ItemPointerData));

	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(),
			 "PersistentStore_GetFreeTuple: Enter: maximum free order number " INT64_FORMAT ", free TID %s ('%s')",
			 storeSharedData->maxFreeOrderNum,
			 ItemPointerToString(&storeSharedData->freeTid),
			 storeData->tableName);

	if (storeSharedData->maxFreeOrderNum == 0)
	{
		return false;			/* No free tuples. */
	}

	/* GUC escape hatch: behave as if the free list were empty. */
	if (gp_persistent_skip_free_list)
	{
		if (Debug_persistent_store_print)
			elog(PersistentStore_DebugPrintLevel(),
				 "PersistentStore_GetFreeTuple: Skipping because gp_persistent_skip_free_list GUC is ON ('%s')",
				 storeData->tableName);
		return false;			/* Pretend no free tuples. */
	}

	Assert(storeSharedData->freeTid.ip_posid != 0);

	/* Validate the current head; also fetches its previous-free-TID link. */
	if (!PersistentStore_ValidateFreeTID(
			storeData,
			storeSharedData,
			&previousFreeTid))
		return false;

	/* Pop the head and advance the list to the previous free entry. */
	*freeTid = storeSharedData->freeTid;
	storeSharedData->maxFreeOrderNum--;
	ItemPointerCopy(&previousFreeTid
					/* previousFreeTid set inside the ValidateFreeTID function */ ,
					&storeSharedData->freeTid);

	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(),
			 "PersistentStore_GetFreeTuple: Exit: maximum free order number " INT64_FORMAT ", free TID %s ('%s')",
			 storeSharedData->maxFreeOrderNum,
			 ItemPointerToString(&storeSharedData->freeTid),
			 storeData->tableName);

	/* Optionally re-validate the new head (debugging aid). */
	if (validate_previous_free_tid &&
		!PersistentStore_ValidateFreeTID(
			storeData,
			storeSharedData,
			&previousFreeTid))
		return false;

	return true;
}
/*
 * update a tuple in in-memory heap table.
 *
 * if the target tuple already in the memory,
 * update it in-place with flag INMEM_HEAP_TUPLE_UPDATED.
 * else report an error.
 *
 * update should not change the otid of the old tuple,
 * since updated tuple should write back to the master and update there.
 */
void
InMemHeap_Update(InMemHeapRelation relation, ItemPointer otid,
				 HeapTuple tup)
{
	int			pos;
	HeapTuple	target;
	MemoryContext oldmem = CurrentMemoryContext;

	Assert(ItemPointerIsValid(otid));

	pos = InMemHeap_Find(relation, otid);

	/* Allocate the replacement tuple in the relation's own context. */
	CurrentMemoryContext = relation->memcxt;

	/*
	 * not found, report error
	 */
	if (pos >= relation->tupsize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("update a tuple which does not exist,"
						" relname = %s, relid = %u",
						relation->rel->rd_rel->relname.data,
						relation->relid)));
	}

	Insist(relation->hashIndex == NULL &&
		   "cannot handle index in in-memory heap when update");

	/*
	 * already in table
	 */
	Assert(relation->tuples[pos].flags == INMEM_HEAP_TUPLE_DISPATCHED ||
		   relation->tuples[pos].flags == INMEM_HEAP_TUPLE_UPDATED);
	relation->tuples[pos].flags = INMEM_HEAP_TUPLE_UPDATED;

	/* Deep-copy the new tuple (allocated in relation->memcxt). */
	target = heaptuple_copy_to(tup, NULL, NULL);

	/*
	 * do not modify original tuple header
	 *
	 * NOTE(review): this copies target->t_self INTO the stored tuple's
	 * t_self (ItemPointerCopy is from -> to), then asserts target->t_self
	 * equals otid -- which only holds if tup's t_self already equals otid.
	 * Confirm the copy direction is intended; it looks reversed relative
	 * to the comment above.
	 */
	ItemPointerCopy(&target->t_self, &relation->tuples[pos].tuple->t_self);

	Assert(ItemPointerEquals(&target->t_self, otid));

	/* Preserve the old tuple's header in the replacement copy. */
	memcpy(target->t_data, relation->tuples[pos].tuple->t_data,
		   sizeof(HeapTupleHeaderData));

	CurrentMemoryContext = oldmem;

	/* Swap the stored tuple for the new copy. */
	pfree(relation->tuples[pos].tuple);
	relation->tuples[pos].tuple = target;
}
/*
 * PersistentTablespace_RemoveSegment
 *
 * Walk every entry of the persistent tablespace shared hash table and
 * record the removal of segment 'dbid' (mirror when 'ismirror') for each
 * tablespace directory via PersistentFileSysObj_RemoveSegment().
 *
 * Must not run before persistence work is allowed.  The ordered
 * persistent-state write lock is held for the whole scan; TablespaceHashLock
 * is dropped before each per-entry update and re-taken afterwards so the
 * hash lock never covers file I/O.
 */
void
PersistentTablespace_RemoveSegment(int16 dbid, bool ismirror)
{
	TablespaceDirEntry tablespaceDirEntry;
	HASH_SEQ_STATUS hstat;

	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	hash_seq_init(&hstat, persistentTablespaceSharedHashTable);

	if (Persistent_BeforePersistenceWork())
		elog(ERROR, "persistent table changes forbidden");

	PersistentTablespace_VerifyInitScan();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	LWLockAcquire(TablespaceHashLock, LW_SHARED);
	while ((tablespaceDirEntry = hash_seq_search(&hstat)) != NULL)
	{
		PersistentFileSysObjName fsObjName;
		Oid			tblspc = tablespaceDirEntry->key.tablespaceOid;
		ItemPointerData persistentTid;
		uint64		persistentSerialNum;

		/* Re-look-up the entry to read its TID/serial under the lock. */
		tablespaceDirEntry = PersistentTablespace_FindEntryUnderLock(tblspc);

		LWLockRelease(TablespaceHashLock);

		if (tablespaceDirEntry == NULL)
			elog(ERROR, "Did not find persistent tablespace entry %u",
				 tblspc);

		persistentSerialNum = tablespaceDirEntry->persistentSerialNum;
		ItemPointerCopy(&tablespaceDirEntry->persistentTid, &persistentTid);

		PersistentFileSysObjName_SetTablespaceDir(&fsObjName, tblspc);
		PersistentFileSysObj_RemoveSegment(&fsObjName,
										   &persistentTid,
										   persistentSerialNum,
										   dbid,
										   ismirror,
										   /* flushToXlog */ false);
		LWLockAcquire(TablespaceHashLock, LW_SHARED);
	}
	LWLockRelease(TablespaceHashLock);

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;
}
Datum currtid_byrelname(PG_FUNCTION_ARGS) { text *relname = PG_GETARG_TEXT_P(0); ItemPointer tid = PG_GETARG_ITEMPOINTER(1); ItemPointer result; RangeVar *relrv; Relation rel; AclResult aclresult; /* * Immediately inform client that the function is not supported */ elog(ERROR, "Function currtid2 is not supported by GPDB"); relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); rel = heap_openrv(relrv, AccessShareLock); aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(), ACL_SELECT); if (aclresult != ACLCHECK_OK) aclcheck_error(aclresult, ACL_KIND_CLASS, RelationGetRelationName(rel)); if (rel->rd_rel->relkind == RELKIND_VIEW) return currtid_for_view(rel, tid); result = (ItemPointer) palloc(sizeof(ItemPointerData)); ItemPointerCopy(tid, result); heap_get_latest_tid(rel, SnapshotNow, result); heap_close(rel, AccessShareLock); PG_RETURN_ITEMPOINTER(result); }
/*
 * currtid_byrelname
 *
 * Look up a relation by its (possibly qualified) name and return a
 * palloc'd copy of the latest TID in the update chain starting at 'tid',
 * chased under SnapshotNow.  Views are delegated to currtid_for_view().
 *
 * NOTE(review): unlike the other currtid variants in this tree, this
 * version performs no pg_class_aclcheck() before reading the relation --
 * confirm whether a SELECT-privilege check should be added for this
 * branch's ACL API.
 */
Datum
currtid_byrelname(PG_FUNCTION_ARGS)
{
	text	   *relname = PG_GETARG_TEXT_P(0);
	ItemPointer tid = PG_GETARG_ITEMPOINTER(1);
	ItemPointer result;
	RangeVar   *relrv;
	Relation	rel;

	/* Parse the name and open the relation with a shared lock. */
	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname,
															 "currtid_byrelname"));
	rel = heap_openrv(relrv, AccessShareLock);

	/* Views are resolved by a dedicated helper. */
	if (rel->rd_rel->relkind == RELKIND_VIEW)
		return currtid_for_view(rel, tid);

	result = (ItemPointer) palloc(sizeof(ItemPointerData));
	ItemPointerCopy(tid, result);

	/* Follow the tuple's update chain under SnapshotNow. */
	heap_get_latest_tid(rel, SnapshotNow, result);

	heap_close(rel, AccessShareLock);

	PG_RETURN_ITEMPOINTER(result);
}
/*
 * Search the relation 'rel' for tuple using the index.
 *
 * If a matching tuple is found, lock it with lockmode, fill the slot with its
 * contents, and return true. Return false otherwise.
 *
 * Uses a dirty snapshot so in-progress tuples are visible; if the found
 * tuple is being modified by another transaction, we wait for that
 * transaction and retry from the top of the scan.
 */
bool
RelationFindReplTupleByIndex(Relation rel, Oid idxoid,
							 LockTupleMode lockmode,
							 TupleTableSlot *searchslot,
							 TupleTableSlot *outslot)
{
	HeapTuple	scantuple;
	ScanKeyData skey[INDEX_MAX_KEYS];
	IndexScanDesc scan;
	SnapshotData snap;
	TransactionId xwait;
	Relation	idxrel;
	bool		found;

	/* Open the index. */
	idxrel = index_open(idxoid, RowExclusiveLock);

	/* Start an index scan. */
	InitDirtySnapshot(snap);
	scan = index_beginscan(rel, idxrel, &snap,
						   RelationGetNumberOfAttributes(idxrel),
						   0);

	/* Build scan key. */
	build_replindex_scan_key(skey, rel, idxrel, searchslot);

retry:
	found = false;

	index_rescan(scan, skey, RelationGetNumberOfAttributes(idxrel), NULL, 0);

	/* Try to find the tuple */
	if ((scantuple = index_getnext(scan, ForwardScanDirection)) != NULL)
	{
		found = true;
		ExecStoreTuple(scantuple, outslot, InvalidBuffer, false);
		ExecMaterializeSlot(outslot);

		/* Dirty snapshot reports any in-progress writer in xmin/xmax. */
		xwait = TransactionIdIsValid(snap.xmin) ?
			snap.xmin : snap.xmax;

		/*
		 * If the tuple is locked, wait for locking transaction to finish and
		 * retry.
		 */
		if (TransactionIdIsValid(xwait))
		{
			XactLockTableWait(xwait, NULL, NULL, XLTW_None);
			goto retry;
		}
	}

	/* Found tuple, try to lock it in the lockmode. */
	if (found)
	{
		Buffer		buf;
		HeapUpdateFailureData hufd;
		HTSU_Result res;
		HeapTupleData locktup;

		ItemPointerCopy(&outslot->tts_tuple->t_self, &locktup.t_self);

		PushActiveSnapshot(GetLatestSnapshot());

		res = heap_lock_tuple(rel, &locktup, GetCurrentCommandId(false),
							  lockmode,
							  LockWaitBlock,
							  false /* don't follow updates */ ,
							  &buf, &hufd);
		/* the tuple slot already has the buffer pinned */
		ReleaseBuffer(buf);

		PopActiveSnapshot();

		switch (res)
		{
			case HeapTupleMayBeUpdated:
				break;
			case HeapTupleUpdated:
				/* XXX: Improve handling here */
				ereport(LOG,
						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
						 errmsg("concurrent update, retrying")));
				goto retry;
			case HeapTupleInvisible:
				elog(ERROR, "attempted to lock invisible tuple");
			default:
				elog(ERROR, "unexpected heap_lock_tuple status: %u", res);
				break;
		}
	}

	index_endscan(scan);

	/* Don't release lock until commit. */
	index_close(idxrel, NoLock);

	return found;
}
/*
 * PersistentTablespace_AddCreated
 *
 * Record a tablespace directly in state 'Created': add the persistent
 * table tuple and mirror it into the tablespace shared hash table, all
 * under the ordered persistent-state write lock.
 *
 * No-op before persistence work (initdb will load the persistent table
 * after bootstrap).
 */
void
PersistentTablespace_AddCreated(
	Oid filespaceOid,	/* The filespace where the tablespace lives. */
	Oid tablespaceOid,	/* The tablespace OID to be added. */
	MirroredObjectExistenceState mirrorExistenceState,
	bool flushToXLog)	/* When true, the XLOG record for this change will be flushed to disk. */
{
	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	PersistentFileSysObjName fsObjName;

	ItemPointerData persistentTid;
	int64		persistentSerialNum;
	TablespaceDirEntry tablespaceDirEntry;

	if (Persistent_BeforePersistenceWork())
	{
		if (Debug_persistent_print)
			elog(Persistent_DebugPrintLevel(),
				 "Skipping persistent tablespace %u because we are before persistence work",
				 tablespaceOid);

		return;
		/*
		 * The initdb process will load the persistent table once we out of
		 * bootstrap mode.
		 */
	}

	PersistentTablespace_VerifyInitScan();

	PersistentFileSysObjName_SetTablespaceDir(&fsObjName, tablespaceOid);

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	/* Add the persistent-table tuple in state 'Created'. */
	PersistentTablespace_AddTuple(
		filespaceOid,
		tablespaceOid,
		PersistentFileSysState_Created,
		/* createMirrorDataLossTrackingSessionNum */ 0,
		mirrorExistenceState,
		/* reserved */ 0,
		InvalidTransactionId,
		flushToXLog,
		&persistentTid,
		&persistentSerialNum);

	/* Mirror the tuple's identity into the shared hash table. */
	WRITE_TABLESPACE_HASH_LOCK;
	tablespaceDirEntry =
		PersistentTablespace_CreateEntryUnderLock(filespaceOid, tablespaceOid);
	Assert(tablespaceDirEntry != NULL);
	tablespaceDirEntry->state = PersistentFileSysState_Created;
	ItemPointerCopy(&persistentTid, &tablespaceDirEntry->persistentTid);
	tablespaceDirEntry->persistentSerialNum = persistentSerialNum;
	WRITE_TABLESPACE_HASH_UNLOCK;

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;

	if (Debug_persistent_print)
		elog(Persistent_DebugPrintLevel(),
			 "Persistent tablespace directory: Add '%s' in state 'Created', mirror existence state '%s', serial number " INT64_FORMAT " at TID '%s' ",
			 PersistentFileSysObjName_ObjectName(&fsObjName),
			 MirroredObjectExistenceState_Name(mirrorExistenceState),
			 persistentSerialNum,
			 ItemPointerToString(&persistentTid));
}
/* * Indicate we intend to create a tablespace file as part of the current transaction. * * An XLOG IntentToCreate record is generated that will guard the subsequent file-system * create in case the transaction aborts. * * After 1 or more calls to this routine to mark intention about tablespace files that are going * to be created, call ~_DoPendingCreates to do the actual file-system creates. (See its * note on XLOG flushing). */ void PersistentTablespace_MarkCreatePending( Oid filespaceOid, /* The filespace where the tablespace lives. */ Oid tablespaceOid, /* The tablespace OID for the create. */ MirroredObjectExistenceState mirrorExistenceState, ItemPointer persistentTid, /* TID of the gp_persistent_rel_files tuple for the rel file */ int64 *persistentSerialNum, bool flushToXLog) /* When true, the XLOG record for this change will be flushed to disk. */ { WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE; PersistentFileSysObjName fsObjName; TablespaceDirEntry tablespaceDirEntry; TransactionId topXid; if (Persistent_BeforePersistenceWork()) { if (Debug_persistent_print) elog(Persistent_DebugPrintLevel(), "Skipping persistent tablespace %u because we are before persistence work", tablespaceOid); return; /* * The initdb process will load the persistent table once we out of * bootstrap mode. 
*/ } PersistentTablespace_VerifyInitScan(); PersistentFileSysObjName_SetTablespaceDir(&fsObjName, tablespaceOid); topXid = GetTopTransactionId(); WRITE_PERSISTENT_STATE_ORDERED_LOCK; PersistentTablespace_AddTuple( filespaceOid, tablespaceOid, PersistentFileSysState_CreatePending, /* createMirrorDataLossTrackingSessionNum */ 0, mirrorExistenceState, /* reserved */ 0, /* parentXid */ topXid, flushToXLog, persistentTid, persistentSerialNum); WRITE_TABLESPACE_HASH_LOCK; tablespaceDirEntry = PersistentTablespace_CreateEntryUnderLock(filespaceOid, tablespaceOid); Assert(tablespaceDirEntry != NULL); tablespaceDirEntry->state = PersistentFileSysState_CreatePending; ItemPointerCopy(persistentTid, &tablespaceDirEntry->persistentTid); tablespaceDirEntry->persistentSerialNum = *persistentSerialNum; WRITE_TABLESPACE_HASH_UNLOCK; /* * This XLOG must be generated under the persistent write-lock. */ #ifdef MASTER_MIRROR_SYNC mmxlog_log_create_tablespace( filespaceOid, tablespaceOid); #endif SIMPLE_FAULT_INJECTOR(FaultBeforePendingDeleteTablespaceEntry); /* * MPP-18228 To make adding 'Create Pending' entry to persistent table and * adding to the PendingDelete list atomic */ PendingDelete_AddCreatePendingEntryWrapper( &fsObjName, persistentTid, *persistentSerialNum); WRITE_PERSISTENT_STATE_ORDERED_UNLOCK; if (Debug_persistent_print) elog(Persistent_DebugPrintLevel(), "Persistent tablespace directory: Add '%s' in state 'Created', mirror existence state '%s', serial number " INT64_FORMAT " at TID %s", PersistentFileSysObjName_ObjectName(&fsObjName), MirroredObjectExistenceState_Name(mirrorExistenceState), *persistentSerialNum, ItemPointerToString(persistentTid)); }
/*
 * _bt_mergeload - Merge two streams of index tuples into new index files.
 *
 * Merges the sorted new-tuple spool (btspool) with the existing index read
 * back through btspool2, building a fresh btree via _bt_buildadd().  For
 * unique indexes, duplicate keys are resolved by heap visibility and the
 * configured ON_DUPLICATE policy (keep new vs. keep old), counting removed
 * tuples against self->max_dup_errors.
 *
 * NOTE(review): the elog text "faild in tuplesort_performsort" is a typo in
 * a runtime error string; left as-is here since this edit changes comments
 * only.
 */
static void
_bt_mergeload(Spooler *self, BTWriteState *wstate, BTSpool *btspool, BTReader *btspool2, Relation heapRel)
{
	BTPageState *state = NULL;
	IndexTuple	itup,
				itup2;
	bool		should_free = false;
	TupleDesc	tupdes = RelationGetDescr(wstate->index);
	int			keysz = RelationGetNumberOfAttributes(wstate->index);
	ScanKey		indexScanKey;
	ON_DUPLICATE on_duplicate = self->on_duplicate;

	Assert(btspool != NULL);

	/* the preparation of merge */
	itup = BTSpoolGetNextItem(btspool, NULL, &should_free);
	itup2 = BTReaderGetNextItem(btspool2);
	indexScanKey = _bt_mkscankey_nodata(wstate->index);

	for (;;)
	{
		bool		load1 = true;	/* load BTSpool next ? */
		bool		hasnull;
		int32		compare;

		/* Too many duplicates removed overall: abort the load. */
		if (self->dup_old + self->dup_new > self->max_dup_errors)
			ereport(ERROR,
					(errcode(ERRCODE_INTERNAL_ERROR),
					 errmsg("Maximum duplicate error count exceeded")));

		if (itup2 == NULL)
		{
			if (itup == NULL)
				break;			/* both streams exhausted: done */
		}
		else if (itup != NULL)
		{
			compare = compare_indextuple(itup, itup2, indexScanKey,
										 keysz, tupdes, &hasnull);

			/* Equal keys in a unique index: resolve the duplicate. */
			if (compare == 0 && !hasnull && btspool->isunique)
			{
				ItemPointerData t_tid2;

				/*
				 * t_tid is update by heap_is_visible(), because use it for an
				 * index, t_tid backup
				 */
				ItemPointerCopy(&itup2->t_tid, &t_tid2);

				/* The tuple pointed by the old index should not be visible. */
				if (!heap_is_visible(heapRel, &itup->t_tid))
				{
					itup = BTSpoolGetNextItem(btspool, itup, &should_free);
				}
				else if (!heap_is_visible(heapRel, &itup2->t_tid))
				{
					itup2 = BTReaderGetNextItem(btspool2);
				}
				else
				{
					/* Both visible: apply the ON_DUPLICATE policy. */
					if (on_duplicate == ON_DUPLICATE_KEEP_NEW)
					{
						self->dup_old++;
						remove_duplicate(self, heapRel, itup2,
										 RelationGetRelationName(wstate->index));
						itup2 = BTReaderGetNextItem(btspool2);
					}
					else
					{
						/* Restore the backed-up TID before removal. */
						ItemPointerCopy(&t_tid2, &itup2->t_tid);
						self->dup_new++;
						remove_duplicate(self, heapRel, itup,
										 RelationGetRelationName(wstate->index));
						itup = BTSpoolGetNextItem(btspool, itup, &should_free);
					}
				}

				continue;
			}
			else if (compare > 0)
				load1 = false;
		}
		else
			load1 = false;

		BULKLOAD_PROFILE(&prof_merge_unique);

		/* When we see first tuple, create first index page */
		if (state == NULL)
			state = _bt_pagestate(wstate, 0);

		if (load1)
		{
			IndexTuple	next_itup = NULL;
			bool		next_should_free = false;

			/* Look ahead in the new-tuple spool for intra-spool duplicates. */
			for (;;)
			{
				/* get next item */
				next_itup = BTSpoolGetNextItem(btspool, next_itup,
											   &next_should_free);

				if (!btspool->isunique || next_itup == NULL)
					break;

				compare = compare_indextuple(itup, next_itup, indexScanKey,
											 keysz, tupdes, &hasnull);
				if (compare < 0 || hasnull)
					break;

				if (compare > 0)
				{
					/* shouldn't happen */
					elog(ERROR, "faild in tuplesort_performsort");
				}

				/*
				 * If tuple is deleted by other unique indexes, not visible
				 */
				if (!heap_is_visible(heapRel, &next_itup->t_tid))
				{
					continue;
				}

				if (!heap_is_visible(heapRel, &itup->t_tid))
				{
					/* Current tuple is dead: take the look-ahead instead. */
					if (should_free)
						pfree(itup);

					itup = next_itup;
					should_free = next_should_free;
					next_should_free = false;
					continue;
				}

				/* not unique between input files */
				self->dup_new++;
				remove_duplicate(self, heapRel, next_itup,
								 RelationGetRelationName(wstate->index));

				if (self->dup_old + self->dup_new > self->max_dup_errors)
					ereport(ERROR,
							(errcode(ERRCODE_INTERNAL_ERROR),
							 errmsg("Maximum duplicate error count exceeded")));
			}

			_bt_buildadd(wstate, state, itup);

			if (should_free)
				pfree(itup);

			/* The look-ahead tuple becomes the current tuple. */
			itup = next_itup;
			should_free = next_should_free;
		}
		else
		{
			_bt_buildadd(wstate, state, itup2);
			itup2 = BTReaderGetNextItem(btspool2);
		}
		BULKLOAD_PROFILE(&prof_merge_insert);
	}
	_bt_freeskey(indexScanKey);

	/* Close down final pages and write the metapage */
	_bt_uppershutdown(wstate, state);

	/*
	 * If the index isn't temp, we must fsync it down to disk before it's safe
	 * to commit the transaction. (For a temp index we don't care since the
	 * index will be uninteresting after a crash anyway.)
	 *
	 * It's obvious that we must do this when not WAL-logging the build. It's
	 * less obvious that we have to do it even if we did WAL-log the index
	 * pages. The reason is that since we're building outside shared buffers,
	 * a CHECKPOINT occurring during the build has no way to flush the
	 * previously written data to disk (indeed it won't know the index even
	 * exists). A crash later on would replay WAL from the checkpoint,
	 * therefore it wouldn't replay our earlier WAL entries. If we do not
	 * fsync those pages here, they might still not be on disk when the crash
	 * occurs.
	 */
	if (!RELATION_IS_LOCAL(wstate->index))
	{
		RelationOpenSmgr(wstate->index);
		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
	}
	BULKLOAD_PROFILE(&prof_merge_term);
}
/*
 * PersistentStore_FreeTuple
 *
 * Push the tuple at persistentTid onto the head of the store's free list:
 * bump maxFreeOrderNum, link the previous head as this tuple's
 * previous-free TID (or the tuple itself when it is the first free entry,
 * so a non-zero PreviousFreeTid marks "free"), rewrite the tuple in place,
 * and decrement the in-use count.
 *
 * Caller must hold PersistentObjLock (asserted).  When flushToXLog is
 * true the UPDATE's XLOG record is flushed before return; otherwise its
 * end location is remembered in nowaitXLogEndLoc for a later flush.
 */
void
PersistentStore_FreeTuple(
	PersistentStoreData *storeData,
	PersistentStoreSharedData *storeSharedData,
	ItemPointer persistentTid,	/* TID of the stored tuple. */
	Datum *freeValues,
	bool flushToXLog)	/* When true, the XLOG record for this change will be flushed to disk. */
{
	Relation	persistentRel;
	HeapTuple	persistentTuple = NULL;
	ItemPointerData prevFreeTid;
	XLogRecPtr	xlogEndLoc;		/* The end location of the UPDATE XLOG record. */

	Assert(LWLockHeldByMe(PersistentObjLock));

#ifdef USE_ASSERT_CHECKING
	if (storeSharedData == NULL ||
		!PersistentStoreSharedData_EyecatcherIsValid(storeSharedData))
		elog(ERROR, "Persistent store shared-memory not valid");
#endif

	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(),
			 "PersistentStore_FreeTuple: Going to free tuple at TID %s ('%s', shared data %p)",
			 ItemPointerToString(persistentTid),
			 storeData->tableName,
			 storeSharedData);

	Assert(persistentTid->ip_posid != 0);

	persistentRel = (*storeData->openRel)();

	/* Current free-list head becomes this tuple's previous-free link. */
	prevFreeTid = storeSharedData->freeTid;
	if (validate_previous_free_tid)
	{
		/* Let us validate and have sanity check to make sure the prevFreeTid is really free. */
		ItemPointerData tmpPrevFreeTid;

		PersistentStore_ValidateFreeTID(
			storeData,
			storeSharedData,
			&tmpPrevFreeTid);
	}

	storeSharedData->maxFreeOrderNum++;
	if (storeSharedData->maxFreeOrderNum == 1)
		ItemPointerCopy(persistentTid, &prevFreeTid);	/* So non-zero PreviousFreeTid indicates free. */

	/* This tuple is the new free-list head. */
	storeSharedData->freeTid = *persistentTid;

	PersistentStore_FormTupleSetOurs(
		storeData,
		persistentRel->rd_att,
		freeValues,
		storeSharedData->maxFreeOrderNum,
		&prevFreeTid,
		&persistentTuple);

	persistentTuple->t_self = *persistentTid;

	/* Rewrite the tuple in place (no MVCC chain for persistent tables). */
	frozen_heap_inplace_update(persistentRel, persistentTuple);

	/*
	 * XLOG location of the UPDATE tuple's XLOG record.
	 */
	xlogEndLoc = XLogLastInsertEndLoc();

	heap_freetuple(persistentTuple);

	(*storeData->closeRel)(persistentRel);

	storeSharedData->inUseCount--;

	if (Debug_persistent_store_print)
		elog(PersistentStore_DebugPrintLevel(),
			 "PersistentStore_FreeTuple: Freed tuple at TID %s. Maximum free order number " INT64_FORMAT ", in use count " INT64_FORMAT " ('%s')",
			 ItemPointerToString(&storeSharedData->freeTid),
			 storeSharedData->maxFreeOrderNum,
			 storeSharedData->inUseCount,
			 storeData->tableName);

	if (flushToXLog)
	{
		XLogFlush(xlogEndLoc);
		XLogRecPtr_Zero(&nowaitXLogEndLoc);
	}
	else
		nowaitXLogEndLoc = xlogEndLoc;
}
/*
 * MultiRelease -- release a multi-level lock
 *
 * Returns: TRUE if successful, FALSE otherwise.
 *
 * Fix: in the TUPLE_LEVEL case the ItemPointerCopy() arguments were
 * reversed (ItemPointerCopy copies from -> to) -- it copied the scratch
 * tag's tuple id into the caller's tag, clobbering the caller's LOCKTAG
 * and releasing a lock on a stale/garbage tuple id.  The copy now goes
 * tag -> tmpTag, matching the PAGE_LEVEL case and the comment's intent.
 */
bool
MultiRelease(LockTableId tableId,
			 LOCKTAG *tag,
			 LOCKT lockt,
			 LOCK_LEVEL level)
{
	LOCKT		locks[N_LEVELS];
	int			i,
				status;
	LOCKTAG		xxTag,
			   *tmpTag = &xxTag;

	/*
	 * same level scheme as MultiAcquire().
	 */
	switch (level)
	{
		case RELN_LEVEL:
			locks[0] = lockt;
			locks[1] = NO_LOCK;
			locks[2] = NO_LOCK;
			break;
		case PAGE_LEVEL:
			locks[0] = lockt + INTENT;
			locks[1] = lockt;
			locks[2] = NO_LOCK;
			break;
		case TUPLE_LEVEL:
			locks[0] = lockt + INTENT;
			locks[1] = lockt + INTENT;
			locks[2] = lockt;
			break;
		default:
			elog(WARN, "MultiRelease: bad lockt");
	}

	/*
	 * again, construct the tag on the fly. This time, however, we release
	 * the locks in the REVERSE order -- from lowest level to highest level.
	 *
	 * Must zero out the tag to set padding byes to zero and ensure hashing
	 * consistency.
	 */
	memset(tmpTag, 0, sizeof(*tmpTag));
	tmpTag->relId = tag->relId;
	tmpTag->dbId = tag->dbId;

	for (i = (N_LEVELS - 1); i >= 0; i--)
	{
		if (locks[i] != NO_LOCK)
		{
			switch (i)
			{
				case RELN_LEVEL:
					/* -------------
					 * Set the block # and offset to invalid
					 * -------------
					 */
					BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber);
					tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
					break;
				case PAGE_LEVEL:
					/* -------------
					 * Copy the block #, set the offset to invalid
					 * -------------
					 */
					BlockIdCopy(&(tmpTag->tupleId.ip_blkid),
								&(tag->tupleId.ip_blkid));
					tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
					break;
				case TUPLE_LEVEL:
					/* -------------
					 * Copy the caller's tuple id into the scratch tag.
					 * -------------
					 */
					ItemPointerCopy(&tag->tupleId, &tmpTag->tupleId);
					break;
			}

			status = LockRelease(tableId, tmpTag, locks[i]);
			if (!status)
			{
				elog(WARN, "MultiRelease: couldn't release after error");
			}
		}
	}

	/* shouldn't reach here */
	return false;
}
/*
 * MultiAcquire -- acquire multi level lock at requested level
 *
 * Returns: TRUE if lock is set, FALSE if not
 * Side Effects:
 *
 * Fix: in the TUPLE_LEVEL case the ItemPointerCopy() arguments were
 * reversed (ItemPointerCopy copies from -> to) -- it copied the scratch
 * tag's (stale) tuple id into the caller's tag instead of copying the
 * caller's tuple id into the scratch tag handed to LockAcquire(), so the
 * tuple-level lock was taken on a garbage tuple id and the caller's
 * LOCKTAG was clobbered.  The copy now goes tag -> tmpTag, matching the
 * PAGE_LEVEL case.
 */
bool
MultiAcquire(LockTableId tableId,
			 LOCKTAG *tag,
			 LOCKT lockt,
			 LOCK_LEVEL level)
{
	LOCKT		locks[N_LEVELS];
	int			i,
				status;
	LOCKTAG		xxTag,
			   *tmpTag = &xxTag;
	int			retStatus = TRUE;

	/*
	 * Three levels implemented. If we set a low level (e.g. Tuple) lock, we
	 * must set INTENT locks on the higher levels. The intent lock detects
	 * conflicts between the low level lock and an existing high level lock.
	 * For example, setting a write lock on a tuple in a relation is
	 * disallowed if there is an existing read lock on the entire relation.
	 * The write lock would set a WRITE + INTENT lock on the relation and
	 * that lock would conflict with the read.
	 */
	switch (level)
	{
		case RELN_LEVEL:
			locks[0] = lockt;
			locks[1] = NO_LOCK;
			locks[2] = NO_LOCK;
			break;
		case PAGE_LEVEL:
			locks[0] = lockt + INTENT;
			locks[1] = lockt;
			locks[2] = NO_LOCK;
			break;
		case TUPLE_LEVEL:
			locks[0] = lockt + INTENT;
			locks[1] = lockt + INTENT;
			locks[2] = lockt;
			break;
		default:
			elog(WARN, "MultiAcquire: bad lock level");
			return (FALSE);
	}

	/*
	 * construct a new tag as we go. Always loop through all levels, but if
	 * we arent' seting a low level lock, locks[i] is set to NO_LOCK for the
	 * lower levels. Always start from the highest level and go to the
	 * lowest level.
	 */
	memset(tmpTag, 0, sizeof(*tmpTag));
	tmpTag->relId = tag->relId;
	tmpTag->dbId = tag->dbId;

	for (i = 0; i < N_LEVELS; i++)
	{
		if (locks[i] != NO_LOCK)
		{
			switch (i)
			{
				case RELN_LEVEL:
					/* -------------
					 * Set the block # and offset to invalid
					 * -------------
					 */
					BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber);
					tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
					break;
				case PAGE_LEVEL:
					/* -------------
					 * Copy the block #, set the offset to invalid
					 * -------------
					 */
					BlockIdCopy(&(tmpTag->tupleId.ip_blkid),
								&(tag->tupleId.ip_blkid));
					tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
					break;
				case TUPLE_LEVEL:
					/* --------------
					 * Copy the entire tuple id into the scratch tag.
					 * --------------
					 */
					ItemPointerCopy(&tag->tupleId, &tmpTag->tupleId);
					break;
			}

			status = LockAcquire(tableId, tmpTag, locks[i]);
			if (!status)
			{
				/*
				 * failed for some reason. Before returning we have to
				 * release all of the locks we just acquired.
				 * MultiRelease(xx,xx,xx, i) means release starting from the
				 * last level lock we successfully acquired
				 */
				retStatus = FALSE;
				(void) MultiRelease(tableId, tag, lockt, i);
				/* now leave the loop. Don't try for any more locks */
				break;
			}
		}
	}
	return (retStatus);
}
/*
 * Rebuild free TID list based on freeEntryHashTable. Returns number
 * of free tuples in the rebuilt free list.
 *
 * Scans the persistent table in TID order; every tuple whose
 * previous-free-TID is non-zero (i.e. a free entry) is renumbered and
 * linked to the previously seen free entry, and the shared free-list head
 * is set to the last free entry found.
 *
 * Fix: the final LOG message claimed to report the new freeTid but
 * actually printed 'persistentTid' -- the last tuple scanned, whether or
 * not it was free.  It now prints the real head of the rebuilt free list.
 */
uint64
PersistentStore_RebuildFreeList(
	PersistentStoreData *storeData,
	PersistentStoreSharedData *storeSharedData)
{
	Datum	   *values;
	PersistentStoreScan storeScan;
	ItemPointerData persistentTid;
	ItemPointerData previousFreeTid;
	ItemPointerData previousTid;
	uint64		persistentSerialNum;
	uint64		freeOrderNum;

	values = (Datum *) palloc(storeData->numAttributes * sizeof(Datum));

	/*
	 * PT shared data must be already initialized, even when we are
	 * called during recovery.
	 */
	Assert(!PersistentStore_IsZeroTid(&storeSharedData->maxTid));

	if (storeSharedData->maxFreeOrderNum < 1)
	{
		elog(LOG, "no free tuples in %s, not building any free list",
			 storeData->tableName);
		return 0;
	}
	elog(LOG, "rebuilding free list in %s with " INT64_FORMAT " free tuples",
		 storeData->tableName, storeSharedData->maxFreeOrderNum);

	/*
	 * Scan PT for free entries (in TID order) and establish links
	 * with previous free entry as we go on.
	 */
	previousTid.ip_posid = 0;	/* marks "no free entry seen yet" */
	freeOrderNum = 0;
	PersistentStore_BeginScan(storeData, storeSharedData, &storeScan);
	while (PersistentStore_GetNext(
			   &storeScan,
			   values,
			   &persistentTid,
			   (int64 *) &persistentSerialNum))
	{
		/*
		 * We are scanning from low to high TID. All TIDs we
		 * encounter should be smaller or equal to the known
		 * maxTid.
		 */
		Assert(ItemPointerCompare(
				   &storeSharedData->maxTid,
				   &persistentTid) >= 0);
		PersistentStore_ExtractOurTupleData(
			storeData,
			values,
			(int64 *) &persistentSerialNum,
			&previousFreeTid);

		/* Non-zero previous-free-TID marks this tuple as free. */
		if (!PersistentStore_IsZeroTid(&previousFreeTid))
		{
			values[storeData->attNumPersistentSerialNum - 1] =
				Int64GetDatum(++freeOrderNum);
			/* First free entry points at itself (non-zero means free). */
			values[storeData->attNumPreviousFreeTid - 1] =
				ItemPointerIsValid(&previousTid) ?
				PointerGetDatum(&previousTid) :
				PointerGetDatum(&persistentTid);
#ifdef FAULT_INJECTOR

			/*
			 * Inject fault after free list is partially built - a few
			 * tuples are updated but at least one is yet to be
			 * updated.
			 */
			if (freeOrderNum > 3)
			{
				FaultInjector_InjectFaultIfSet(
					RebuildPTDB,
					DDLNotSpecified,
					"",		// databaseName
					"");	// tableName
			}
#endif
			PersistentStore_UpdateTuple(
				storeData,
				storeSharedData,
				&persistentTid,
				values,
				true);
			ItemPointerCopy(&persistentTid, &previousTid);
		}
	}
	PersistentStore_EndScan(&storeScan);
	pfree(values);

	if (ItemPointerIsValid(&previousTid))
	{
		Assert(freeOrderNum > 0);
		/* The last free entry seen becomes the free-list head. */
		ItemPointerCopy(&previousTid, &storeSharedData->freeTid);
		storeSharedData->maxFreeOrderNum = freeOrderNum;
		elog(LOG, "rebuilt free list in %s: maxFreeOrderNum = " INT64_FORMAT " freeTid = %s",
			 storeData->tableName, freeOrderNum,
			 ItemPointerToString(&storeSharedData->freeTid));
	}
	return freeOrderNum;
}
/*
 * gistVacuumUpdate
 *
 * Recursively vacuum-update the GiST subtree rooted at block 'blkno'.
 * For internal pages, each child is visited first; children reported
 * empty are unlinked (and their subtrees later deleted), children that
 * were split have their replacement tuples inserted here. When
 * 'needunion' is set (or tuples were deleted on a leaf), the caller
 * receives a freshly-formed union key for this page in the result.
 *
 * Returns an ArrayTuple: itup/ituplen carry replacement tuple(s) for
 * the downlink to this page; emptypage tells the caller this page is
 * now empty and its downlink should be removed.
 */
static ArrayTuple
gistVacuumUpdate(GistVacuum *gv, BlockNumber blkno, bool needunion)
{
	ArrayTuple	res = {NULL, 0, false};
	Buffer		buffer;
	Page		page,
				tempPage = NULL;
	OffsetNumber i,
				maxoff;
	ItemId		iid;
	int			lenaddon = 4,
				curlenaddon = 0,
				nOffToDelete = 0,
				nBlkToDelete = 0;
	IndexTuple	idxtuple,
			   *addon = NULL;
	bool		needwrite = false;
	OffsetNumber offToDelete[MaxOffsetNumber];
	BlockNumber blkToDelete[MaxOffsetNumber];
	ItemPointerData *completed = NULL;
	int			ncompleted = 0,
				lencompleted = 16;

	/* Let vacuum sleep if cost-based delay is active. */
	vacuum_delay_point();

	buffer = ReadBufferWithStrategy(gv->index, blkno, gv->strategy);
	LockBuffer(buffer, GIST_EXCLUSIVE);
	gistcheckpage(gv->index, buffer);
	page = (Page) BufferGetPage(buffer);
	maxoff = PageGetMaxOffsetNumber(page);

	if (GistPageIsLeaf(page))
	{
		/* Leaf: if bulk-delete removed tuples here, re-form the union key. */
		if (GistTuplesDeleted(page))
			needunion = needwrite = true;
	}
	else
	{
		completed = (ItemPointerData *) palloc(sizeof(ItemPointerData) * lencompleted);
		addon = (IndexTuple *) palloc(sizeof(IndexTuple) * lenaddon);

		/* get copy of page to work */
		tempPage = GistPageGetCopyPage(page);

		for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
		{
			ArrayTuple	chldtuple;
			bool		needchildunion;

			iid = PageGetItemId(tempPage, i);
			idxtuple = (IndexTuple) PageGetItem(tempPage, iid);
			/* An invalid tuple means the child's union key must be rebuilt. */
			needchildunion = (GistTupleIsInvalid(idxtuple)) ? true : false;

			if (needchildunion)
				elog(DEBUG2, "gistVacuumUpdate: need union for block %u",
					 ItemPointerGetBlockNumber(&(idxtuple->t_tid)));

			/* Recurse into the child page this downlink points to. */
			chldtuple = gistVacuumUpdate(gv,
										 ItemPointerGetBlockNumber(&(idxtuple->t_tid)),
										 needchildunion);
			if (chldtuple.ituplen || chldtuple.emptypage)
			{
				/* update tuple or/and inserts new */
				if (chldtuple.emptypage)
					blkToDelete[nBlkToDelete++] = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
				offToDelete[nOffToDelete++] = i;
				PageIndexTupleDelete(tempPage, i);
				/* deletion shifts following items down; re-examine offset i */
				i--;
				maxoff--;
				needwrite = needunion = true;

				if (chldtuple.ituplen)
				{
					Assert(chldtuple.emptypage == false);
					/* Grow 'addon' geometrically to hold replacement tuples. */
					while (curlenaddon + chldtuple.ituplen >= lenaddon)
					{
						lenaddon *= 2;
						addon = (IndexTuple *) repalloc(addon, sizeof(IndexTuple) * lenaddon);
					}
					memcpy(addon + curlenaddon, chldtuple.itup,
						   chldtuple.ituplen * sizeof(IndexTuple));
					curlenaddon += chldtuple.ituplen;

					if (chldtuple.ituplen > 1)
					{
						/*
						 * child was split, so we need mark completion
						 * insert(split)
						 */
						int			j;

						while (ncompleted + chldtuple.ituplen > lencompleted)
						{
							lencompleted *= 2;
							completed = (ItemPointerData *) repalloc(completed, sizeof(ItemPointerData) * lencompleted);
						}
						for (j = 0; j < chldtuple.ituplen; j++)
						{
							ItemPointerCopy(&(chldtuple.itup[j]->t_tid), completed + ncompleted);
							ncompleted++;
						}
					}
					pfree(chldtuple.itup);
				}
			}
		}

		Assert(maxoff == PageGetMaxOffsetNumber(tempPage));

		if (curlenaddon)
		{
			/* insert updated tuples */
			if (gistnospace(tempPage, addon, curlenaddon, InvalidOffsetNumber, 0))
			{
				/* there is no space on page to insert tuples */
				res = vacuumSplitPage(gv, tempPage, buffer, addon, curlenaddon);
				tempPage = NULL;	/* vacuumSplitPage() free tempPage */
				needwrite = needunion = false;	/* gistSplit already forms
												 * unions and writes pages */
			}
			else
				/* enough free space */
				gistfillbuffer(gv->index, tempPage, addon, curlenaddon,
							   InvalidOffsetNumber);
		}
	}

	/*
	 * If page is empty, we should remove pointer to it before deleting page
	 * (except root)
	 */
	if (blkno != GIST_ROOT_BLKNO &&
		(PageIsEmpty(page) || (tempPage && PageIsEmpty(tempPage))))
	{
		/*
		 * New version of page is empty, so leave it unchanged, upper call
		 * will mark our page as deleted. In case of page split we never will
		 * be here...
		 *
		 * If page was empty it can't become non-empty during processing
		 */
		res.emptypage = true;
		UnlockReleaseBuffer(buffer);
	}
	else
	{
		/* write page and remove its childs if it need */
		START_CRIT_SECTION();

		if (tempPage && needwrite)
		{
			PageRestoreTempPage(tempPage, page);
			tempPage = NULL;
		}

		/* Empty index */
		if (PageIsEmpty(page) && blkno == GIST_ROOT_BLKNO)
		{
			needwrite = true;
			GistPageSetLeaf(page);
		}

		if (needwrite)
		{
			MarkBufferDirty(buffer);
			GistClearTuplesDeleted(page);

			/* WAL-log the page update unless this is a temp relation. */
			if (!gv->index->rd_istemp)
			{
				XLogRecData *rdata;
				XLogRecPtr	recptr;
				char	   *xlinfo;

				rdata = formUpdateRdata(gv->index->rd_node, buffer,
										offToDelete, nOffToDelete,
										addon, curlenaddon, NULL);
				xlinfo = rdata->next->data;

				recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata);
				PageSetLSN(page, recptr);
				PageSetTLI(page, ThisTimeLineID);

				pfree(xlinfo);
				pfree(rdata);
			}
			else
				PageSetLSN(page, XLogRecPtrForTemp);
		}

		END_CRIT_SECTION();

		/* Hand the caller a fresh union key for our downlink if needed. */
		if (needunion && !PageIsEmpty(page))
		{
			res.itup = (IndexTuple *) palloc(sizeof(IndexTuple));
			res.ituplen = 1;
			res.itup[0] = PageMakeUnionKey(gv, buffer);
		}

		UnlockReleaseBuffer(buffer);

		/* delete empty children, now we havn't any links to pointed subtrees */
		for (i = 0; i < nBlkToDelete; i++)
			gistDeleteSubtree(gv, blkToDelete[i]);

		/* Mark child splits completed in WAL (crash-recovery bookkeeping). */
		if (ncompleted && !gv->index->rd_istemp)
			gistxlogInsertCompletion(gv->index->rd_node, completed, ncompleted);
	}

	/* Free working storage; the tuples in res (if any) belong to the caller. */
	for (i = 0; i < curlenaddon; i++)
		pfree(addon[i]);
	if (addon)
		pfree(addon);
	if (completed)
		pfree(completed);
	if (tempPage)
		pfree(tempPage);

	return res;
}
/* * extract_minipage * * Extract the minipage info from the given tuple. The tupleTid * is also set here. */ static void extract_minipage(AppendOnlyBlockDirectory *blockDirectory, HeapTuple tuple, TupleDesc tupleDesc, int columnGroupNo) { Datum *values = blockDirectory->values; bool *nulls = blockDirectory->nulls; MinipagePerColumnGroup *minipageInfo = &blockDirectory->minipages[columnGroupNo]; FileSegInfo *fsInfo = blockDirectory->currentSegmentFileInfo; int64 eof; int start, end, mid=0; bool found = false; heap_deform_tuple(tuple, tupleDesc, values, nulls); Assert(blockDirectory->currentSegmentFileNum == DatumGetInt32(values[Anum_pg_aoblkdir_segno - 1])); /* * Copy out the minipage */ copy_out_minipage(minipageInfo, values[Anum_pg_aoblkdir_minipage - 1], nulls[Anum_pg_aoblkdir_minipage - 1]); ItemPointerCopy(&tuple->t_self, &minipageInfo->tupleTid); /* * When crashes during inserts, or cancellation during inserts, * there are out-of-date minipage entries in the block directory. * We reset those entries here. */ Assert(fsInfo != NULL); if (!blockDirectory->isAOCol) eof = fsInfo->eof; else eof = ((AOCSFileSegInfo *)fsInfo)->vpinfo.entry[columnGroupNo].eof; start = 0; end = minipageInfo->numMinipageEntries - 1; while (start <= end) { mid = (end - start + 1) / 2 + start; if (minipageInfo->minipage->entry[mid].fileOffset > eof) end = mid - 1; else if (minipageInfo->minipage->entry[mid].fileOffset < eof) start = mid + 1; else { found = true; break; } } minipageInfo->numMinipageEntries = 0; if (found) minipageInfo->numMinipageEntries = mid; else if (start > 0) { minipageInfo->numMinipageEntries = start; Assert(minipageInfo->minipage->entry[start - 1].fileOffset < eof); } }
/* * Add a mirror. */ void PersistentFilespace_AddMirror(Oid filespace, char *mirpath, int16 pridbid, int16 mirdbid, bool set_mirror_existence) { PersistentFileSysObjName fsObjName; char *newpath; WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE; FilespaceDirEntry fde; ItemPointerData persistentTid; int64 persistentSerialNum; if (Persistent_BeforePersistenceWork()) elog(ERROR, "persistent table changes forbidden"); PersistentFilespace_VerifyInitScan(); PersistentFileSysObjName_SetFilespaceDir(&fsObjName, filespace); WRITE_PERSISTENT_STATE_ORDERED_LOCK; LWLockAcquire(FilespaceHashLock, LW_SHARED); fde = PersistentFilespace_FindDirUnderLock(filespace); if (fde == NULL) elog(ERROR, "did not find persistent filespace entry %u", filespace); if (fde->dbId1 == pridbid) { fde->dbId2 = mirdbid; PersistentFilespace_BlankPadCopyLocation( fde->locationBlankPadded2, mirpath); newpath = fde->locationBlankPadded2; } else if (fde->dbId2 == pridbid) { fde->dbId1 = mirdbid; PersistentFilespace_BlankPadCopyLocation( fde->locationBlankPadded1, mirpath); newpath = fde->locationBlankPadded1; } else { Insist(false); } ItemPointerCopy(&fde->persistentTid, &persistentTid); persistentSerialNum = fde->persistentSerialNum; LWLockRelease(FilespaceHashLock); PersistentFileSysObj_AddMirror(&fsObjName, &persistentTid, persistentSerialNum, pridbid, mirdbid, (void *)newpath, set_mirror_existence, /* flushToXlog */ false); WRITE_PERSISTENT_STATE_ORDERED_UNLOCK; }
/*
 * Activate a standby master by removing reference to the dead master
 * and changing our dbid to the old master's dbid
 *
 * Walks every entry in the shared filespace hash table; for each entry
 * referencing 'oldmaster', swaps the standby's location into the master
 * slot, clears the dead master's slot, and records the change in the
 * persistent file-system object table.
 */
void
PersistentFilespace_ActivateStandby(int16 oldmaster, int16 newmaster)
{
	HASH_SEQ_STATUS hstat;
	FilespaceDirEntry fde;

	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	if (Persistent_BeforePersistenceWork())
		elog(ERROR, "persistent table changes forbidden");

	hash_seq_init(&hstat, persistentFilespaceSharedHashTable);

	PersistentFilespace_VerifyInitScan();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	/*
	 * We release FilespaceHashLock in the middle of the loop and re-acquire
	 * it after doing persistent table change. This is needed to prevent
	 * holding the lock for any purpose other than to protect the filespace
	 * shared hash table. Not releasing this lock could result in file I/O
	 * and potential deadlock due to other LW locks being acquired in the
	 * process. Releasing the lock this way is safe because we are still
	 * holding PersistentObjLock in exclusive mode. Any change to the
	 * filespace shared hash table is also protected by PersistentObjLock.
	 */
	WRITE_FILESPACE_HASH_LOCK;
	while ((fde = hash_seq_search(&hstat)) != NULL)
	{
		Oid			filespace = fde->key.filespaceOid;
		PersistentFileSysObjName fsObjName;
		ItemPointerData persistentTid;
		int64		persistentSerialNum = fde->persistentSerialNum;

		/* Copy out TID before dropping the hash lock below. */
		ItemPointerCopy(&fde->persistentTid, &persistentTid);

		PersistentFileSysObjName_SetFilespaceDir(&fsObjName, filespace);

		if (fde->dbId1 == oldmaster)
		{
			fde->dbId1 = InvalidDbid;
			fde->dbId2 = newmaster;

			/* Copy standby filespace location into new master location */
			PersistentFilespace_BlankPadCopyLocation(
													 fde->locationBlankPadded2,
													 fde->locationBlankPadded1);

			PersistentFilespace_BlankPadCopyLocation(
													 fde->locationBlankPadded1,
													 "");
		}
		else if (fde->dbId2 == oldmaster)
		{
			fde->dbId2 = InvalidDbid;
			fde->dbId1 = newmaster;

			/* Copy standby filespace location into new master location */
			PersistentFilespace_BlankPadCopyLocation(
													 fde->locationBlankPadded1,
													 fde->locationBlankPadded2);

			PersistentFilespace_BlankPadCopyLocation(
													 fde->locationBlankPadded2,
													 "");
		}

		/* See comment above: drop the hash lock around the PT change. */
		WRITE_FILESPACE_HASH_UNLOCK;
		PersistentFileSysObj_ActivateStandby(&fsObjName,
											 &persistentTid,
											 persistentSerialNum,
											 oldmaster,
											 newmaster,
											 /* flushToXlog */ false);
		WRITE_FILESPACE_HASH_LOCK;
	}
	WRITE_FILESPACE_HASH_UNLOCK;

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;
}
/*
 * Indicate we intend to create a filespace file as part of the current transaction.
 *
 * An XLOG IntentToCreate record is generated that will guard the subsequent file-system
 * create in case the transaction aborts.
 *
 * After 1 or more calls to this routine to mark intention about filespace files that are going
 * to be created, call ~_DoPendingCreates to do the actual file-system creates. (See its
 * note on XLOG flushing).
 */
void
PersistentFilespace_MarkCreatePending(
	Oid filespaceOid,			/* The filespace where the filespace lives. */

	int16 primaryDbId,

	char *primaryFilespaceLocation,
								/*
								 * The primary filespace directory path. NOT Blank padded.
								 * Just a NULL terminated string.
								 */

	int16 mirrorDbId,

	char *mirrorFilespaceLocation,

	MirroredObjectExistenceState mirrorExistenceState,

	ItemPointer persistentTid,
								/* TID of the gp_persistent_rel_files tuple for the rel file */

	int64 *persistentSerialNum,

	bool flushToXLog)
								/* When true, the XLOG record for this change will be flushed to disk. */
{
	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	PersistentFileSysObjName fsObjName;
	FilespaceDirEntry filespaceDirEntry;
	TransactionId topXid;
	Datum		values[Natts_gp_persistent_filespace_node];
	char		mirrorFilespaceLocationBlankPadded[FilespaceLocationBlankPaddedWithNullTermLen];
	char		primaryFilespaceLocationBlankPadded[FilespaceLocationBlankPaddedWithNullTermLen];

	if (Persistent_BeforePersistenceWork())
	{
		if (Debug_persistent_print)
			elog(Persistent_DebugPrintLevel(),
				 "Skipping persistent filespace %u because we are before persistence work",
				 filespaceOid);

		return;	// The initdb process will load the persistent table once we out of bootstrap mode.
	}

	PersistentFilespace_VerifyInitScan();

	PersistentFileSysObjName_SetFilespaceDir(&fsObjName, filespaceOid);

	topXid = GetTopTransactionId();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	/* Blank-pad the locations to the fixed on-disk column width. */
	PersistentFilespace_BlankPadCopyLocation(
											 primaryFilespaceLocationBlankPadded,
											 primaryFilespaceLocation);

	PersistentFilespace_BlankPadCopyLocation(
											 mirrorFilespaceLocationBlankPadded,
											 mirrorFilespaceLocation);

	GpPersistentFilespaceNode_SetDatumValues(
											 values,
											 filespaceOid,
											 primaryDbId,
											 primaryFilespaceLocationBlankPadded,
											 mirrorDbId,
											 mirrorFilespaceLocationBlankPadded,
											 PersistentFileSysState_CreatePending,
											 /* createMirrorDataLossTrackingSessionNum */ 0,
											 mirrorExistenceState,
											 /* reserved */ 0,
											 /* parentXid */ topXid,
											 /* persistentSerialNum */ 0);	// This will be set by PersistentFileSysObj_AddTuple.

	PersistentFileSysObj_AddTuple(
								  PersistentFsObjType_FilespaceDir,
								  values,
								  flushToXLog,
								  persistentTid,
								  persistentSerialNum);

	/* Mirror the new tuple into the shared filespace hash table. */
	WRITE_FILESPACE_HASH_LOCK;

	filespaceDirEntry =
		PersistentFilespace_CreateDirUnderLock(filespaceOid);
	Assert(filespaceDirEntry != NULL);

	filespaceDirEntry->dbId1 = primaryDbId;
	memcpy(filespaceDirEntry->locationBlankPadded1,
		   primaryFilespaceLocationBlankPadded,
		   FilespaceLocationBlankPaddedWithNullTermLen);

	filespaceDirEntry->dbId2 = mirrorDbId;
	memcpy(filespaceDirEntry->locationBlankPadded2,
		   mirrorFilespaceLocationBlankPadded,
		   FilespaceLocationBlankPaddedWithNullTermLen);

	filespaceDirEntry->state = PersistentFileSysState_CreatePending;
	ItemPointerCopy(persistentTid, &filespaceDirEntry->persistentTid);
	filespaceDirEntry->persistentSerialNum = *persistentSerialNum;

	WRITE_FILESPACE_HASH_UNLOCK;

	/*
	 * This XLOG must be generated under the persistent write-lock.
	 */
#ifdef MASTER_MIRROR_SYNC
	mmxlog_log_create_filespace(filespaceOid);
#endif

	SIMPLE_FAULT_INJECTOR(FaultBeforePendingDeleteFilespaceEntry);

	/*
	 * MPP-18228
	 * To make adding 'Create Pending' entry to persistent table and adding
	 * to the PendingDelete list atomic
	 */
	PendingDelete_AddCreatePendingEntryWrapper(
											   &fsObjName,
											   persistentTid,
											   *persistentSerialNum);

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;

	if (Debug_persistent_print)
		elog(Persistent_DebugPrintLevel(),
			 "Persistent filespace directory: Add '%s' in state 'Created', mirror existence state '%s', serial number " INT64_FORMAT " at TID %s",
			 PersistentFileSysObjName_ObjectName(&fsObjName),
			 MirroredObjectExistenceState_Name(mirrorExistenceState),
			 *persistentSerialNum,
			 ItemPointerToString(persistentTid));
}
/*
 * Post vacuum, iterate over all entries in index, check if the h_tid
 * of each entry exists and is not dead. For specific system tables,
 * also ensure that the key in index entry matches the corresponding
 * attribute in the heap tuple.
 *
 * Any inconsistency found is reported with elog(ERROR).
 */
void
_bt_validate_vacuum(Relation irel, Relation hrel, TransactionId oldest_xmin)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	BlockNumber blkno;
	BlockNumber num_pages;
	Buffer		ibuf = InvalidBuffer;
	Buffer		hbuf = InvalidBuffer;
	Page		ipage;
	BTPageOpaque opaque;
	IndexTuple	itup;
	HeapTupleData htup;
	OffsetNumber maxoff,
				minoff,
				offnum;
	Oid			ioid,
				hoid;
	bool		isnull;

	/* Skip the btree metapage; data pages start right after it. */
	blkno = BTREE_METAPAGE + 1;
	num_pages = RelationGetNumberOfBlocks(irel);

	elog(LOG, "btvalidatevacuum: index %s, heap %s",
		 RelationGetRelationName(irel), RelationGetRelationName(hrel));

	MIRROREDLOCK_BUFMGR_LOCK;
	for (; blkno < num_pages; blkno++)
	{
		ibuf = ReadBuffer(irel, blkno);
		ipage = BufferGetPage(ibuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(ipage);
		if (!PageIsNew(ipage))
			_bt_checkpage(irel, ibuf);
		/* Only leaf pages carry heap TIDs to validate. */
		if (P_ISLEAF(opaque))
		{
			minoff = P_FIRSTDATAKEY(opaque);
			maxoff = PageGetMaxOffsetNumber(ipage);
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itup = (IndexTuple) PageGetItem(ipage,
												PageGetItemId(ipage, offnum));
				ItemPointerCopy(&itup->t_tid, &htup.t_self);

				/*
				 * TODO: construct a tid bitmap based on index tids
				 * and fetch heap tids in order afterwards. That will
				 * also allow validating if a heap tid appears twice
				 * in a unique index.
				 */
				if (!heap_release_fetch(hrel, SnapshotAny, &htup, &hbuf,
										true, NULL))
				{
					elog(ERROR, "btvalidatevacuum: tid (%d,%d) from index %s "
						 "not found in heap %s",
						 ItemPointerGetBlockNumber(&itup->t_tid),
						 ItemPointerGetOffsetNumber(&itup->t_tid),
						 RelationGetRelationName(irel),
						 RelationGetRelationName(hrel));
				}
				switch (HeapTupleSatisfiesVacuum(hrel, htup.t_data,
												 oldest_xmin, hbuf))
				{
					case HEAPTUPLE_RECENTLY_DEAD:
					case HEAPTUPLE_LIVE:
					case HEAPTUPLE_INSERT_IN_PROGRESS:
					case HEAPTUPLE_DELETE_IN_PROGRESS:
						/* these tuples are considered alive by vacuum */
						break;
					case HEAPTUPLE_DEAD:
						elog(ERROR, "btvalidatevacuum: vacuum did not remove "
							 "dead tuple (%d,%d) from heap %s and index %s",
							 ItemPointerGetBlockNumber(&itup->t_tid),
							 ItemPointerGetOffsetNumber(&itup->t_tid),
							 RelationGetRelationName(hrel),
							 RelationGetRelationName(irel));
						break;
					default:
						elog(ERROR, "btvalidatevacuum: invalid visibility");
						break;
				}

				/*
				 * Key/heap cross-checks for specific system catalogs whose
				 * first index key is the row OID (or relfilenode info for
				 * gp_relation_node).
				 */
				switch (RelationGetRelid(irel))
				{
					case DatabaseOidIndexId:
					case TypeOidIndexId:
					case ClassOidIndexId:
					case ConstraintOidIndexId:
						hoid = HeapTupleGetOid(&htup);
						ioid = index_getattr(itup, 1, RelationGetDescr(irel),
											 &isnull);
						if (hoid != ioid)
						{
							elog(ERROR,
								 "btvalidatevacuum: index oid(%d) != heap oid(%d)"
								 " tuple (%d,%d) index %s", ioid, hoid,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						break;
					case GpRelationNodeOidIndexId:
						hoid = heap_getattr(&htup, 1, RelationGetDescr(hrel),
											&isnull);
						ioid = index_getattr(itup, 1, RelationGetDescr(irel),
											 &isnull);
						if (hoid != ioid)
						{
							elog(ERROR,
								 "btvalidatevacuum: index oid(%d) != heap oid(%d)"
								 " tuple (%d,%d) index %s", ioid, hoid,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						int4 hsegno = heap_getattr(&htup, 2,
												   RelationGetDescr(hrel),
												   &isnull);
						int4 isegno = index_getattr(itup, 2,
													RelationGetDescr(irel),
													&isnull);
						if (isegno != hsegno)
						{
							elog(ERROR,
								 "btvalidatevacuum: index segno(%d) != heap segno(%d)"
								 " tuple (%d,%d) index %s", isegno, hsegno,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						break;
					default:
						break;
				}
				/* AO segment catalogs: first attribute is the segment number. */
				if (RelationGetNamespace(irel) == PG_AOSEGMENT_NAMESPACE)
				{
					int4 isegno = index_getattr(itup, 1,
												RelationGetDescr(irel),
												&isnull);
					int4 hsegno = heap_getattr(&htup, 1,
											   RelationGetDescr(hrel),
											   &isnull);
					if (isegno != hsegno)
					{
						elog(ERROR,
							 "btvalidatevacuum: index segno(%d) != heap segno(%d)"
							 " tuple (%d,%d) index %s", isegno, hsegno,
							 ItemPointerGetBlockNumber(&itup->t_tid),
							 ItemPointerGetOffsetNumber(&itup->t_tid),
							 RelationGetRelationName(irel));
					}
				}
			}
		}
		if (BufferIsValid(ibuf))
			ReleaseBuffer(ibuf);
	}
	/* heap_release_fetch keeps hbuf pinned across calls; release it once. */
	if (BufferIsValid(hbuf))
		ReleaseBuffer(hbuf);

	MIRROREDLOCK_BUFMGR_UNLOCK;
}
/*
 * Search the relation 'rel' for tuple using the sequential scan.
 *
 * If a matching tuple is found, lock it with lockmode, fill the slot with its
 * contents, and return true. Return false otherwise.
 *
 * Note that this stops on the first matching tuple.
 *
 * This can obviously be quite slow on tables that have more than few rows.
 */
bool
RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode,
						 TupleTableSlot *searchslot, TupleTableSlot *outslot)
{
	HeapTuple	scantuple;
	HeapScanDesc scan;
	SnapshotData snap;
	TransactionId xwait;
	bool		found;
	TupleDesc	desc = RelationGetDescr(rel);

	Assert(equalTupleDescs(desc, outslot->tts_tupleDescriptor));

	/*
	 * Start a sequential heap scan under a dirty snapshot, so in-progress
	 * changes are visible and their xmin/xmax can be waited on.
	 * (Despite what an older comment here said, this is NOT an index scan.)
	 */
	InitDirtySnapshot(snap);
	scan = heap_beginscan(rel, &snap, 0, NULL);

retry:
	found = false;

	heap_rescan(scan, NULL);

	/* Try to find the tuple */
	while ((scantuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		if (!tuple_equals_slot(desc, scantuple, searchslot))
			continue;

		found = true;
		ExecStoreTuple(scantuple, outslot, InvalidBuffer, false);
		ExecMaterializeSlot(outslot);

		/* Dirty snapshot reports the in-progress writer's xid, if any. */
		xwait = TransactionIdIsValid(snap.xmin) ?
			snap.xmin : snap.xmax;

		/*
		 * If the tuple is locked, wait for locking transaction to finish and
		 * retry.
		 */
		if (TransactionIdIsValid(xwait))
		{
			XactLockTableWait(xwait, NULL, NULL, XLTW_None);
			goto retry;
		}
	}

	/* Found tuple, try to lock it in the lockmode.
	 */
	if (found)
	{
		Buffer		buf;
		HeapUpdateFailureData hufd;
		HTSU_Result res;
		HeapTupleData locktup;

		ItemPointerCopy(&outslot->tts_tuple->t_self, &locktup.t_self);

		PushActiveSnapshot(GetLatestSnapshot());

		res = heap_lock_tuple(rel, &locktup, GetCurrentCommandId(false),
							  lockmode,
							  LockWaitBlock,
							  false /* don't follow updates */ ,
							  &buf, &hufd);
		/* the tuple slot already has the buffer pinned */
		ReleaseBuffer(buf);

		PopActiveSnapshot();

		switch (res)
		{
			case HeapTupleMayBeUpdated:
				break;
			case HeapTupleUpdated:
				/* XXX: Improve handling here */
				ereport(LOG,
						(errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
						 errmsg("concurrent update, retrying")));
				goto retry;
			case HeapTupleInvisible:
				/* elog(ERROR) does not return, so falling into default is moot */
				elog(ERROR, "attempted to lock invisible tuple");
			default:
				elog(ERROR, "unexpected heap_lock_tuple status: %u", res);
				break;
		}
	}

	heap_endscan(scan);

	return found;
}