/* ------------------------------------------------------------------
 * 	ExecShareInputScan
 * 		Fetch the next tuple of a ShareInputScan node, moving in the
 * 		executor's current scan direction.
 *
 * The tuplestore state is set up lazily on the first call.  Returns the
 * node's result slot when a tuple was fetched, or NULL when the underlying
 * store reports that no tuple is available.
 * ------------------------------------------------------------------
 */
TupleTableSlot *
ExecShareInputScan(ShareInputScanState *node)
{
	ShareInputScan *plan = (ShareInputScan *) node->ss.ps.plan;
	ShareType	kind = plan->share_type;
	bool		forward = ScanDirectionIsForward(node->ss.ps.state->es_direction);
	TupleTableSlot *result;
	bool		fetched;

	/* First call: the tuplestore state has not been built yet. */
	if (node->ts_state == NULL)
	{
		elog(DEBUG1, "SISC (shareid=%d, slice=%d): No tuplestore yet, initializing tuplestore",
			 plan->share_id, currentSliceId);
		init_tuplestore_state(node);
	}

	result = node->ss.ps.ps_ResultTupleSlot;

	if (kind == SHARE_MATERIAL || kind == SHARE_MATERIAL_XSLICE)
	{
		/* Materialized share: step the accessor, then read where it landed. */
		NTupleStoreAccessor *accessor = (NTupleStoreAccessor *) node->ts_pos;

		ntuplestore_acc_advance(accessor, forward ? 1 : -1);
		fetched = ntuplestore_acc_current_tupleslot(accessor, result);
	}
	else
	{
		/* Any other share type is read through the sort store. */
		fetched = tuplesort_gettupleslot_pos(node->ts_state->sortstore,
											 (TuplesortPos *) node->ts_pos,
											 forward,
											 result,
											 CurrentMemoryContext);
	}

	if (!fetched)
		return NULL;

	SIMPLE_FAULT_INJECTOR(ExecShareInputNext);

	return result;
}
/*
 * Create a new numbered workfile inside the given workfile set.
 *
 * file_no is passed to retrieve_file_no() to build the file name.  The file
 * is created with delete-on-close set, using the type and bfz compression
 * type recorded in the set's metadata, and is then attached to the set.
 *
 * Returns the newly created ExecWorkFile.
 */
ExecWorkFile *
workfile_mgr_create_fileno(workfile_set *work_set, uint32 file_no)
{
	char		path[MAXPGPATH];
	ExecWorkFile *created;

	Assert(NULL != work_set);

	retrieve_file_no(work_set, file_no, path, sizeof(path));

	created = ExecWorkFile_Create(path,
								  work_set->metadata.type,
								  true,		/* del_on_close */
								  work_set->metadata.bfz_compress_type);

	SIMPLE_FAULT_INJECTOR(WorkfileCreationFail);

	ExecWorkfile_SetWorkset(created, work_set);

	return created;
}
/* * Indicate we intend to create a tablespace file as part of the current transaction. * * An XLOG IntentToCreate record is generated that will guard the subsequent file-system * create in case the transaction aborts. * * After 1 or more calls to this routine to mark intention about tablespace files that are going * to be created, call ~_DoPendingCreates to do the actual file-system creates. (See its * note on XLOG flushing). */ void PersistentTablespace_MarkCreatePending( Oid filespaceOid, /* The filespace where the tablespace lives. */ Oid tablespaceOid, /* The tablespace OID for the create. */ MirroredObjectExistenceState mirrorExistenceState, ItemPointer persistentTid, /* TID of the gp_persistent_rel_files tuple for the rel file */ int64 *persistentSerialNum, bool flushToXLog) /* When true, the XLOG record for this change will be flushed to disk. */ { WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE; PersistentFileSysObjName fsObjName; TablespaceDirEntry tablespaceDirEntry; TransactionId topXid; if (Persistent_BeforePersistenceWork()) { if (Debug_persistent_print) elog(Persistent_DebugPrintLevel(), "Skipping persistent tablespace %u because we are before persistence work", tablespaceOid); return; /* * The initdb process will load the persistent table once we out of * bootstrap mode. 
*/ } PersistentTablespace_VerifyInitScan(); PersistentFileSysObjName_SetTablespaceDir(&fsObjName, tablespaceOid); topXid = GetTopTransactionId(); WRITE_PERSISTENT_STATE_ORDERED_LOCK; PersistentTablespace_AddTuple( filespaceOid, tablespaceOid, PersistentFileSysState_CreatePending, /* createMirrorDataLossTrackingSessionNum */ 0, mirrorExistenceState, /* reserved */ 0, /* parentXid */ topXid, flushToXLog, persistentTid, persistentSerialNum); WRITE_TABLESPACE_HASH_LOCK; tablespaceDirEntry = PersistentTablespace_CreateEntryUnderLock(filespaceOid, tablespaceOid); Assert(tablespaceDirEntry != NULL); tablespaceDirEntry->state = PersistentFileSysState_CreatePending; ItemPointerCopy(persistentTid, &tablespaceDirEntry->persistentTid); tablespaceDirEntry->persistentSerialNum = *persistentSerialNum; WRITE_TABLESPACE_HASH_UNLOCK; /* * This XLOG must be generated under the persistent write-lock. */ #ifdef MASTER_MIRROR_SYNC mmxlog_log_create_tablespace( filespaceOid, tablespaceOid); #endif SIMPLE_FAULT_INJECTOR(FaultBeforePendingDeleteTablespaceEntry); /* * MPP-18228 To make adding 'Create Pending' entry to persistent table and * adding to the PendingDelete list atomic */ PendingDelete_AddCreatePendingEntryWrapper( &fsObjName, persistentTid, *persistentSerialNum); WRITE_PERSISTENT_STATE_ORDERED_UNLOCK; if (Debug_persistent_print) elog(Persistent_DebugPrintLevel(), "Persistent tablespace directory: Add '%s' in state 'Created', mirror existence state '%s', serial number " INT64_FORMAT " at TID %s", PersistentFileSysObjName_ObjectName(&fsObjName), MirroredObjectExistenceState_Name(mirrorExistenceState), *persistentSerialNum, ItemPointerToString(persistentTid)); }
/* * Actually do a base backup for the specified tablespaces. * * This is split out mainly to avoid complaints about "variable might be * clobbered by longjmp" from stupider versions of gcc. */ static void perform_base_backup(basebackup_options *opt, DIR *tblspcdir) { XLogRecPtr startptr; XLogRecPtr endptr; char *labelfile; startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &labelfile); Assert(!XLogRecPtrIsInvalid(startptr)); elogif(!debug_basebackup, LOG, "basebackup perform -- " "Basebackup start xlog location = %X/%X", startptr.xlogid, startptr.xrecoff); /* * Set xlogCleanUpTo so that checkpoint process knows * which old xlog files should not be cleaned */ WalSndSetXLogCleanUpTo(startptr); SIMPLE_FAULT_INJECTOR(BaseBackupPostCreateCheckpoint); SendXlogRecPtrResult(startptr); PG_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0); { List *filespaces = NIL; ListCell *lc; /* Collect information about all filespaces, including pg_system */ filespaces = get_filespaces_to_send(opt); /* Send filespace header */ SendBackupHeader(filespaces); /* Send off our filespaces one by one */ foreach(lc, filespaces) { filespaceinfo *fi = (filespaceinfo *) lfirst(lc); StringInfoData buf; /* Send CopyOutResponse message */ pq_beginmessage(&buf, 'H'); pq_sendbyte(&buf, 0); /* overall format */ pq_sendint(&buf, 0, 2); /* natts */ pq_endmessage(&buf); /* In the main tar, include the backup_label first. */ if (fi->primary_path == NULL) sendFileWithContent(BACKUP_LABEL_FILE, labelfile); sendDir(fi->primary_path == NULL ? "." : fi->primary_path, fi->primary_path == NULL ? 1 : strlen(fi->primary_path), opt->exclude, false); /* In the main tar, include pg_control last. 
*/ if (fi->primary_path == NULL) { struct stat statbuf; if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0) { ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat control file \"%s\": %m", XLOG_CONTROL_FILE))); } sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf); elogif(debug_basebackup, LOG, "basebackup perform -- Sent file %s." , XLOG_CONTROL_FILE); } /* * If we're including WAL, and this is the main data directory we * don't terminate the tar stream here. Instead, we will append * the xlog files below and terminate it then. This is safe since * the main data directory is always sent *last*. */ if (opt->includewal && fi->xlogdir) { Assert(lnext(lc) == NULL); } else pq_putemptymessage('c'); /* CopyDone */ } }
/*
 * Load the content of the current Append-Only block into the read accessor.
 *
 * The block described by acc->getBlockInfo is handled according to its
 * executor block kind:
 *
 *   AOCSBK_BLOCK  A regular block.  Compressed content is decompressed into
 *                 acc->large_object_buffer; uncompressed content is used in
 *                 place from the storage-read buffer.
 *   AOCSBK_BLOB   A single-row large object.  The content is always copied
 *                 into acc->large_object_buffer, and largeObjectState is set
 *                 to DatumStreamLargeObjectState_HaveAoContent.
 *
 * Any other kind raises an ERROR, as does a BLOB block for a fixed-length
 * type.  acc->large_object_buffer is grown (in acc->memctxt) whenever it is
 * smaller than the block's content length.
 *
 * While growing, the pointer is cleared and the recorded size reset to zero
 * before the new allocation is attempted, and the size is only recorded after
 * palloc() has returned.  If the allocation (or the MallocFailure fault
 * injector) raises an error, the accessor is therefore never left holding a
 * freed pointer or a size that describes no buffer.
 *
 * On return acc->buffer_beginp points at the block content and
 * datumstreamread_block_get_ready() has been called.
 */
void
datumstreamread_block_content(DatumStreamRead * acc)
{
	Assert(acc);

	/*
	 * Clear out state from previous block.
	 */
	DatumStreamBlockRead_Reset(&acc->blockRead);

	acc->largeObjectState = DatumStreamLargeObjectState_None;

	/*
	 * Read in data.
	 */
	if (acc->getBlockInfo.execBlockKind == AOCSBK_BLOCK)
	{
		Assert(!acc->getBlockInfo.isLarge);

		if (acc->getBlockInfo.isCompressed)
		{
			/* Compressed, need to decompress to our own buffer.  */
			if (acc->large_object_buffer_size < acc->getBlockInfo.contentLen)
			{
				MemoryContext oldCtxt;

				oldCtxt = MemoryContextSwitchTo(acc->memctxt);

				if (acc->large_object_buffer)
				{
					pfree(acc->large_object_buffer);
					acc->large_object_buffer = NULL;

					SIMPLE_FAULT_INJECTOR(MallocFailure);
				}

				/* Keep size and pointer consistent if palloc() errors out. */
				acc->large_object_buffer_size = 0;
				acc->large_object_buffer = palloc(acc->getBlockInfo.contentLen);
				acc->large_object_buffer_size = acc->getBlockInfo.contentLen;
				MemoryContextSwitchTo(oldCtxt);
			}

			AppendOnlyStorageRead_Content(
										  &acc->ao_read,
										  (uint8 *) acc->large_object_buffer,
										  acc->getBlockInfo.contentLen);

			acc->buffer_beginp = acc->large_object_buffer;
		}
		else
		{
			/* Uncompressed: use the storage layer's buffer directly. */
			acc->buffer_beginp = AppendOnlyStorageRead_GetBuffer(&acc->ao_read);
		}

		if (Debug_appendonly_print_datumstream)
			elog(LOG,
				 "datumstream_read_block_content filePathName %s firstRowNum " INT64_FORMAT " rowCnt %u "
				 "ndatum %u contentLen %d datump %p",
				 acc->ao_read.bufferedRead.filePathName,
				 acc->getBlockInfo.firstRow,
				 acc->getBlockInfo.rowCnt,
				 acc->blockRead.logical_row_count,
				 acc->getBlockInfo.contentLen, acc->blockRead.datump);
	}
	else if (acc->getBlockInfo.execBlockKind == AOCSBK_BLOB)
	{
		Assert(acc->getBlockInfo.rowCnt == 1);

		if (acc->typeInfo.datumlen >= 0)
		{
			elog(ERROR, "Large object must be variable length objects (varlena)");
		}

		/*
		 * NOTE: Do not assert the content is large.  What appears to be
		 * large content can compress into one AO storage block.
		 */
		if (acc->large_object_buffer_size < acc->getBlockInfo.contentLen)
		{
			MemoryContext oldCtxt;

			oldCtxt = MemoryContextSwitchTo(acc->memctxt);

			if (acc->large_object_buffer)
			{
				pfree(acc->large_object_buffer);
				/* Do not leave a dangling pointer behind if palloc() fails. */
				acc->large_object_buffer = NULL;
			}

			/* Keep size and pointer consistent if palloc() errors out. */
			acc->large_object_buffer_size = 0;
			acc->large_object_buffer = palloc(acc->getBlockInfo.contentLen);
			acc->large_object_buffer_size = acc->getBlockInfo.contentLen;
			MemoryContextSwitchTo(oldCtxt);
		}

		AppendOnlyStorageRead_Content(
									  &acc->ao_read,
									  acc->large_object_buffer,
									  acc->getBlockInfo.contentLen);

		acc->buffer_beginp = acc->large_object_buffer;
		acc->largeObjectState = DatumStreamLargeObjectState_HaveAoContent;

		if (Debug_datumstream_read_check_large_varlena_integrity)
		{
			datumstreamread_check_large_varlena_integrity(
														  acc,
														  acc->buffer_beginp,
														  acc->getBlockInfo.contentLen);
		}
	}
	else
	{
		elog(ERROR,
			 "Unexpected Append-Only Column Store executor kind %d",
			 acc->getBlockInfo.execBlockKind);
	}

	/*
	 * Unpack the information from the block headers and get ready to read
	 * the first datum.
	 */
	datumstreamread_block_get_ready(acc);
}
/* ----------------------------------------------------------------
 *		MultiExecHash
 *
 *		Drain the outer (child) plan and load every tuple into the hash
 *		table used by the parent hashjoin, partitioning into batches if
 *		more than one batch is required.
 *
 *		Always returns NULL.  The hash table is not a subtype of Node, so
 *		returning it would violate the MultiExecProcNode API; the parent
 *		Hashjoin node fishes it out of this node's state instead.
 *
 *		If a tuple with NULL hash keys is seen, hs_hashkeys_null is set;
 *		when hs_quit_if_hashkeys_null is also set, the child is squelched
 *		and the build stops immediately (in that case nbatch_outstart is
 *		not updated and InstrStopNode is not called).
 * ----------------------------------------------------------------
 */
Node *
MultiExecHash(HashState *node)
{
	PlanState  *child;
	HashJoinTable table;
	List	   *keys;
	ExprContext *exprCtx;
	uint32		hash = 0;

	/* must provide our own instrumentation support */
	if (node->ps.instrument)
		InstrStartNode(node->ps.instrument);

	/* Pull everything we need out of the node state up front. */
	child = outerPlanState(node);
	table = node->hashtable;
	keys = node->hashkeys;
	exprCtx = node->ps.ps_ExprContext;

	SIMPLE_FAULT_INJECTOR(MultiExecHashLargeVmem);

	/*
	 * Fetch each inner tuple and insert it into the hash table (or temp
	 * files).
	 */
	for (;;)
	{
		bool		sawNullKeys = false;
		TupleTableSlot *tuple = ExecProcNode(child);

		if (TupIsNull(tuple))
			break;

		Gpmon_M_Incr(GpmonPktFromHashState(node), GPMON_QEXEC_M_ROWSIN);
		CheckSendPlanStateGpmonPkt(&node->ps);

		/* The hash value is computed against the inner tuple. */
		exprCtx->ecxt_innertuple = tuple;

		if (ExecHashGetHashValue(node, table, exprCtx, keys, false,
								 node->hs_keepnull, &hash, &sawNullKeys))
		{
			ExecHashTableInsert(node, table, tuple, hash);
		}

		if (!sawNullKeys)
			continue;

		node->hs_hashkeys_null = true;

		if (node->hs_quit_if_hashkeys_null)
		{
			ExecSquelchNode(child);
			return NULL;
		}
	}

	/* Now we have set up all the initial batches & primary overflow batches. */
	table->nbatch_outstart = table->nbatch;

	/* must provide our own instrumentation support */
	if (node->ps.instrument)
		InstrStopNode(node->ps.instrument, table->totalTuples);

	/*
	 * Ugly, but not really worth cleaning up: Hashjoin already knows quite a
	 * bit more about Hash than just where the table lives.
	 */
	return NULL;
}
/*
 * Creates a new gang by logging on a session to each segDB involved.
 *
 * Connections are made by worker threads, each handling up to
 * gp_connections_per_thread segment descriptors.  After all threads have
 * been joined, checkConnectionStatus() counts successful connections and
 * connections rejected because the segment is in recovery mode.
 *
 * Parameters:
 *   type     - kind of gang to build; a primary writer gang must be created
 *              before any other gang exists (enforced with Insist).
 *   gang_id  - passed through to buildGangDefinition()/makeConnectParms().
 *   size     - number of segment connections; must be 1 or
 *              getgpsegmentCount().
 *   content  - passed through to buildGangDefinition().
 *
 * Must be called with GangContext as the current memory context (asserted).
 *
 * Outcome: returns a non-NULL gang when every connection succeeded.
 * Otherwise an ERROR is raised (after destroying the partial gang, and for a
 * writer gang all gangs); thread creation/join failures raise FATAL.  When
 * every failure is due to recovery mode, a primary writer gang is retried up
 * to gp_gang_creation_retry_count times, sleeping
 * gp_gang_creation_retry_timer * 1000 microseconds between attempts.
 */
static Gang *
createGang_thread(GangType type, int gang_id, int size, int content)
{
	Gang	   *newGangDefinition = NULL;
	SegmentDatabaseDescriptor *segdbDesc = NULL;
	DoConnectParms *doConnectParmsAr = NULL;
	DoConnectParms *pParms = NULL;
	int			parmIndex = 0;
	int			threadCount = 0;
	int			i = 0;
	int			create_gang_retry_counter = 0;
	int			in_recovery_mode_count = 0;
	int			successful_connections = 0;

	/* Accumulates the error detail reported if gang creation fails. */
	PQExpBufferData create_gang_error;

	ELOG_DISPATCHER_DEBUG("createGang type = %d, gang_id = %d, size = %d, content = %d",
			type, gang_id, size, content);

	/* check arguments */
	Assert(size == 1 || size == getgpsegmentCount());
	Assert(CurrentResourceOwner != NULL);
	Assert(CurrentMemoryContext == GangContext);
	Assert(gp_connections_per_thread > 0);

	/* Writer gang is created before reader gangs. */
	if (type == GANGTYPE_PRIMARY_WRITER)
		Insist(!GangsExist());

	initPQExpBuffer(&create_gang_error);

	Assert(CurrentGangCreating == NULL);

create_gang_retry:
	/*
	 * If we're in a retry, we may need to reset our initial state a bit. We
	 * also want to ensure that all resources have been released.
	 */
	Assert(newGangDefinition == NULL);
	Assert(doConnectParmsAr == NULL);
	successful_connections = 0;
	in_recovery_mode_count = 0;
	threadCount = 0;

	/* allocate and initialize a gang structure */
	newGangDefinition = buildGangDefinition(type, gang_id, size, content);
	/* Publish the gang being built; cleared again on every exit path below. */
	CurrentGangCreating = newGangDefinition;

	Assert(newGangDefinition != NULL);
	Assert(newGangDefinition->size == size);
	Assert(newGangDefinition->perGangContext != NULL);
	/* Allocations during connection setup go into the gang's own context. */
	MemoryContextSwitchTo(newGangDefinition->perGangContext);

	resetPQExpBuffer(&create_gang_error);

	/*
	 * The most threads we could have is segdb_count /
	 * gp_connections_per_thread, rounded up.  This is equivalent to 1 +
	 * (segdb_count-1) / gp_connections_per_thread.  We allocate enough memory
	 * for this many DoConnectParms structures, even though we may not use
	 * them all.
	 */
	threadCount = 1 + (size - 1) / gp_connections_per_thread;
	Assert(threadCount > 0);

	/* initialize connect parameters */
	doConnectParmsAr = makeConnectParms(threadCount, type, gang_id);
	/* Hand out segment descriptors to threads in consecutive chunks. */
	for (i = 0; i < size; i++)
	{
		parmIndex = i / gp_connections_per_thread;
		pParms = &doConnectParmsAr[parmIndex];
		segdbDesc = &newGangDefinition->db_descriptors[i];
		pParms->segdbDescPtrArray[pParms->db_count++] = segdbDesc;
	}

	/* start threads and doing the connect */
	for (i = 0; i < threadCount; i++)
	{
		int			pthread_err;

		pParms = &doConnectParmsAr[i];

		ELOG_DISPATCHER_DEBUG("createGang creating thread %d of %d for libpq connections",
				i + 1, threadCount);

		pthread_err = gp_pthread_create(&pParms->thread, thread_DoConnect, pParms, "createGang");

		if (pthread_err != 0)
		{
			int			j;

			/*
			 * Error during thread create (this should be caused by resource
			 * constraints). If we leave the threads running, they'll
			 * immediately have some problems -- so we need to join them, and
			 * *then* we can issue our FATAL error
			 */
			for (j = 0; j < i; j++)
			{
				pthread_join(doConnectParmsAr[j].thread, NULL);
			}

			ereport(FATAL, (errcode(ERRCODE_INTERNAL_ERROR),
							errmsg("failed to create thread %d of %d", i + 1, threadCount),
							errdetail("pthread_create() failed with err %d", pthread_err)));
		}
	}

	/*
	 * wait for all of the DoConnect threads to complete.
	 */
	for (i = 0; i < threadCount; i++)
	{
		ELOG_DISPATCHER_DEBUG("joining to thread %d of %d for libpq connections",
				i + 1, threadCount);

		if (0 != pthread_join(doConnectParmsAr[i].thread, NULL))
		{
			elog(FATAL, "could not create segworker group");
		}
	}

	/*
	 * Free the memory allocated for the threadParms array
	 */
	destroyConnectParms(doConnectParmsAr, threadCount);
	doConnectParmsAr = NULL;

	SIMPLE_FAULT_INJECTOR(GangCreated);

	/* find out the successful connections and the failed ones */
	checkConnectionStatus(newGangDefinition, &in_recovery_mode_count,
			&successful_connections, &create_gang_error);

	ELOG_DISPATCHER_DEBUG("createGang: %d processes requested; %d successful connections %d in recovery",
			size, successful_connections, in_recovery_mode_count);

	/* Back to the caller's context before returning or cleaning up. */
	MemoryContextSwitchTo(GangContext);

	/* Success: every requested connection was established. */
	if (size == successful_connections)
	{
		setLargestGangsize(size);
		termPQExpBuffer(&create_gang_error);
		CurrentGangCreating = NULL;

		return newGangDefinition;
	}

	/* there are failed connections */

	/* FTS shows some segment DBs are down, destroy all gangs. */
	if (isFTSEnabled() &&
		FtsTestSegmentDBIsDown(newGangDefinition->db_descriptors, size))
	{
		appendPQExpBuffer(&create_gang_error, "FTS detected one or more segments are down\n");
		goto exit;
	}

	/* failure due to recovery */
	if (successful_connections + in_recovery_mode_count == size)
	{
		/*
		 * Note that the retry counter is only incremented when
		 * gp_gang_creation_retry_count is non-zero (short-circuit &&).
		 */
		if (gp_gang_creation_retry_count &&
			create_gang_retry_counter++ < gp_gang_creation_retry_count &&
			type == GANGTYPE_PRIMARY_WRITER)
		{
			/*
			 * Retry for non-writer gangs is meaningless because writer gang
			 * must be gone when QE is in recovery mode
			 */
			DisconnectAndDestroyGang(newGangDefinition);
			newGangDefinition = NULL;
			CurrentGangCreating = NULL;

			ELOG_DISPATCHER_DEBUG("createGang: gang creation failed, but retryable.");

			/* Stay interruptible on both sides of the back-off sleep. */
			CHECK_FOR_INTERRUPTS();
			pg_usleep(gp_gang_creation_retry_timer * 1000);
			CHECK_FOR_INTERRUPTS();

			goto create_gang_retry;
		}

		appendPQExpBuffer(&create_gang_error, "segment(s) are in recovery mode\n");
	}

	/*
	 * Failure path: tear down the partially built gang and report the
	 * accumulated detail.  Failures that are neither FTS-detected nor
	 * purely recovery-related fall through to here as well.
	 */
exit:
	if (newGangDefinition != NULL)
		DisconnectAndDestroyGang(newGangDefinition);

	/* A failed writer gang takes all other gangs down with it. */
	if (type == GANGTYPE_PRIMARY_WRITER)
	{
		DisconnectAndDestroyAllGangs(true);
		CheckForResetSession();
	}

	CurrentGangCreating = NULL;

	/*
	 * NOTE(review): create_gang_error is not passed to termPQExpBuffer() on
	 * this path before the ERROR is raised -- confirm whether that is
	 * intentional.
	 */
	ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
					errmsg("failed to acquire resources on one or more segments"),
					errdetail("%s", create_gang_error.data)));
	return NULL;
}
/*
 * Incrementally resynchronize buffer pool relations to the mirror, driven by
 * change tracking records.
 *
 * ChangeTracking_GetChanges() is called repeatedly for 'request'.  For each
 * returned entry the referenced block is read with ReadBuffer_Resync(),
 * locked exclusively, and written with smgrwrite() when the change tracking
 * end LSN is less than or equal to the page LSN.  Entries whose block number
 * is beyond the relation's current number of blocks (e.g. after a truncate)
 * are only logged.
 *
 * Entries are expected to be grouped by relation: a relation is opened when
 * the first of its entries is seen, and when the last consecutive entry for
 * it has been processed (and the result does not ask for more) it is synced
 * and closed and its resync hash entry is updated.
 *
 * Periodically (every thresholdCount entries, thresholdCount being
 * Min(numBlocks, 1024)) pending signals are processed; if the process is no
 * longer in Ready state with dataState InResync, the loop stops and
 * STATUS_ERROR is returned.
 *
 * When a result sets ask_for_more, the request's lsn_start is advanced to
 * result->next_start_lsn and another batch is fetched.
 *
 * The request and the last result are freed before returning.
 *
 * Returns STATUS_OK on success, the failing status of
 * FileRepResync_UpdateEntry(), or STATUS_ERROR on mirror data loss.
 */
static int
FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request)
{
	int			status = STATUS_OK;
	Page		page;
	Buffer		buf;
	BlockNumber numBlocks = 0;
	SMgrRelation smgr_relation = NULL;
	char		relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
	int			ii;
	XLogRecPtr	loc;	/* LSN of the page read from the buffer pool */
	XLogRecPtr	loc1;	/* end LSN from the change tracking entry */
	int			count = 0;
	int			thresholdCount = 0;
	bool		mirrorDataLossOccurred = FALSE;
	int			NumberOfRelations = request->count;

	FileRepResyncHashEntry_s entry;
	ChangeTrackingResult *result = NULL;

	while (1)
	{
		/* allow flushing buffers from buffer pool during scan */
		FileRepResync_SetReadBufferRequest();
		if ((result = ChangeTracking_GetChanges(request)) != NULL)
		{
			FileRepResync_ResetReadBufferRequest();

			for (ii = 0; ii < result->count; ii++)
			{
				/* First entry of a new relation: open it and size it up. */
				if (smgr_relation == NULL)
				{
					NumberOfRelations--;

					smgr_relation = smgropen(result->entries[ii].relFileNode);

					snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode);

					numBlocks = smgrnblocks(smgr_relation);

					if (Debug_filerep_print)
						elog(LOG, "resynchronize buffer pool relation '%u/%u/%u' "
							 "number of blocks:'%u' ",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode,
							 numBlocks);

					thresholdCount = Min(numBlocks, 1024);
				}

				loc1 = result->entries[ii].lsn_end;

				/*
				 * if relation was truncated then block_num from change tracking can be beyond numBlocks
				 */
				if (result->entries[ii].block_num >= numBlocks)
				{
					ereport(LOG,
							(errmsg("could not resynchonize buffer pool relation '%s' block '%d' (maybe due to truncate), "
									"lsn change tracking '%s(%u/%u)' "
									"number of blocks '%d' ",
									relidstr,
									result->entries[ii].block_num,
									XLogLocationToString(&loc1),
									loc1.xlogid,
									loc1.xrecoff,
									numBlocks),
							 FileRep_errcontext()));

					goto flush_check;
				}

				/* allow flushing buffers from buffer pool during scan */
				FileRepResync_SetReadBufferRequest();
				buf = ReadBuffer_Resync(smgr_relation,
										result->entries[ii].block_num);
				FileRepResync_ResetReadBufferRequest();

				Assert(result->entries[ii].block_num < numBlocks);

				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				page = BufferGetPage(buf);

				loc = PageGetLSN(page);

				if (Debug_filerep_print)
				{
					elog(LOG,
						 "incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' "
						 "lsn end change tracking '%s(%u/%u)' ",
						 relidstr,
						 numBlocks,
						 result->entries[ii].block_num,
						 XLogLocationToString(&loc),
						 loc.xlogid,
						 loc.xrecoff,
						 XLogLocationToString(&loc1),
						 result->entries[ii].lsn_end.xlogid,
						 result->entries[ii].lsn_end.xrecoff);
				}
				else
				{
					char		tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

					snprintf(tmpBuf, sizeof(tmpBuf),
							 "incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' ",
							 relidstr,
							 numBlocks,
							 result->entries[ii].block_num,
							 XLogLocationToString(&loc),
							 loc.xlogid,
							 loc.xrecoff);

					FileRep_InsertConfigLogEntry(tmpBuf);

					snprintf(tmpBuf, sizeof(tmpBuf),
							 "incremental resync buffer pool identifier '%s' lsn end change tracking '%s(%u/%u)' ",
							 relidstr,
							 XLogLocationToString(&loc1),
							 result->entries[ii].lsn_end.xlogid,
							 result->entries[ii].lsn_end.xrecoff);

					FileRep_InsertConfigLogEntry(tmpBuf);
				}

				if (XLByteLE(result->entries[ii].lsn_end, PageGetLSN(page)))
				{
					if (!XLByteEQ(PageGetLSN(page), result->entries[ii].lsn_end))
					{
						/*
						 * Reaching here means CT end LSN < page LSN.  The
						 * arguments are passed in the order the format
						 * labels them: change tracking LSN first (loc1),
						 * page LSN second (loc).
						 */
						ereport(LOG,
								(errmsg("Resynchonize buffer pool relation '%s' block '%d' has page lsn greater than CT lsn, "
										"lsn end change tracking '%s(%u/%u)' lsn page '%s(%u/%u)' "
										"number of blocks '%d'",
										relidstr,
										result->entries[ii].block_num,
										XLogLocationToString(&loc1),
										loc1.xlogid,
										loc1.xrecoff,
										XLogLocationToString(&loc),
										loc.xlogid,
										loc.xrecoff,
										numBlocks),
								 FileRep_errcontext()));
					}

					/*
					 * It's safe and better to perform write of the page to mirror,
					 * for this case, as primary and mirror data pages should always
					 * be same. So, we might do some extra work but definitely won't
					 * lose out blocks, or error out and need to perform full recovery.
					 * Need to cover for this case as there are some known scenarios where
					 * CT file can have extra records which should have been discarded,
					 * but as we lose the xlog LSN information they cannot be discarded.
					 * One such case is when CT_TRANSIENT being compacted to CT_COMPACT
					 * with specific xlog LSN (to discard extra records) in CT mode gets
					 * interrupted by resync. Compaction during Resync collects all the
					 * CT records and doesn't have xlog LSN information to discard any
					 * extra records from CT_TRANSIENT.
					 */
					smgrwrite(smgr_relation,
							  result->entries[ii].block_num,
							  (char *) BufferGetBlock(buf),
							  FALSE);
				}

				SIMPLE_FAULT_INJECTOR(FileRepResyncWorker);

				UnlockReleaseBuffer(buf);

				SIMPLE_FAULT_INJECTOR(FileRepResyncWorker);

	flush_check:
				/*
				 * Last entry of the result, or the next entry belongs to a
				 * different relation: finish off the current relation.
				 */
				if (((ii + 1) == result->count) ||
					!(result->entries[ii].relFileNode.spcNode == result->entries[ii + 1].relFileNode.spcNode &&
					  result->entries[ii].relFileNode.dbNode == result->entries[ii + 1].relFileNode.dbNode &&
					  result->entries[ii].relFileNode.relNode == result->entries[ii + 1].relFileNode.relNode))
				{
					if (result->ask_for_more == false)
					{
						smgrimmedsync(smgr_relation);

						smgrclose(smgr_relation);

						smgr_relation = NULL;

						FileRep_GetRelationPath(
												entry.fileName,
												result->entries[ii].relFileNode,
												0 /* segment file number is always 0 for Buffer Pool */ );

						/*
						 * We only want to update the state with this call to
						 * FileRepResync_UpdateEntry(), so to ensure that we
						 * don't incur any side effects set the changed page
						 * count to zero as it will only be updated to if the
						 * hashtable entry changed page count is zero.
						 */
						entry.mirrorBufpoolResyncChangedPageCount = 0;

						status = FileRepResync_UpdateEntry(&entry);
						if (status != STATUS_OK)
						{
							break;
						}
					}
				}

				if (count > thresholdCount)
				{
					count = 0;
					FileRepSubProcess_ProcessSignals();

					if (!(FileRepSubProcess_GetState() == FileRepStateReady &&
						  dataState == DataStateInResync))
					{
						mirrorDataLossOccurred = TRUE;
						break;
					}
				}
				else
					count++;
			}					/* for (ii = 0; ii < result->count; ii++) */
		}						/* if ((result = ChangeTracking_GetChanges(request)) != NULL) */

		FileRepResync_ResetReadBufferRequest();

		if (result != NULL && result->ask_for_more == true)
		{
			Assert(request->count == 1);
			request->entries[0].lsn_start = result->next_start_lsn;
		}
		else
		{
			break;
		}
	}							/* while(1) */

	ChangeTracking_FreeRequest(request);
	ChangeTracking_FreeResult(result);

	Insist(NumberOfRelations == 0);

	if (mirrorDataLossOccurred)
		status = STATUS_ERROR;

	return status;
}
/*
 * Resynchronize one relation file, described by 'entry', to the mirror.
 *
 * Buffer pool relations (BufferPoolScanIncremental or FullCopy state):
 *   every block from entry->mirrorBufpoolResyncCkptBlockNum up to the
 *   relation's current size is read with ReadBuffer_Resync(), locked
 *   exclusively, and written with smgrwrite() when its page LSN lies between
 *   entry->mirrorBufpoolResyncCkptLoc and the end-of-resync LSN (both
 *   inclusive).  Unless the state is FullCopy, smgrtruncate() is then called
 *   with the relation's current number of blocks under the resync extension
 *   lock.  Finally the relation is synced and closed.
 *
 * Append-only relations (AppendOnlyCatchup or FullCopy state):
 *   the byte range [mirrorAppendOnlyLossEof, mirrorAppendOnlyNewEof) is read
 *   in chunks of at most 2*BLCKSZ and appended, flushed and closed through
 *   the MirroredAppendOnly_* routines.
 *
 * States None and DataSynchronized need no work; any other state is logged.
 *
 * In both cases entry->mirrorBufpoolResyncChangedPageCount is filled in when
 * it is still zero, so the amount of synchronized data can be reported.
 *
 * Returns STATUS_OK, or STATUS_ERROR when mirror data loss was detected.
 * The append-only case uses the function-level mirrorDataLossOccurred flag
 * (no shadowing local), so mirror data loss found while copying append-only
 * data is reflected in the return status just as it is for buffer pool
 * relations.  File-access failures on the primary raise ERROR.
 */
static int
FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s *entry)
{
	int			status = STATUS_OK;
	Page		page;
	Buffer		buf;
	BlockNumber numBlocks;
	BlockNumber blkno;
	SMgrRelation smgr_relation;
	char		relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
	XLogRecPtr	loc;
	int			count = 0;
	int			thresholdCount = 0;
	bool		mirrorDataLossOccurred = FALSE;

	switch (entry->relStorageMgr)
	{
		case PersistentFileSysRelStorageMgr_BufferPool:

			switch (entry->mirrorDataSynchronizationState)
			{
				case MirroredRelDataSynchronizationState_BufferPoolScanIncremental:
				case MirroredRelDataSynchronizationState_FullCopy:

					smgr_relation = smgropen(entry->relFileNode);

					numBlocks = smgrnblocks(smgr_relation);

					snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode);

					if (Debug_filerep_print)
						elog(LOG, "resync buffer pool relation '%s' number of blocks '%d' ",
							 relidstr, numBlocks);

					thresholdCount = Min(numBlocks, 1024);

					/*
					 * required in order to report how many blocks were synchronized
					 * if gp_persistent_relation_node does not return that information
					 */
					if (entry->mirrorBufpoolResyncChangedPageCount == 0)
					{
						entry->mirrorBufpoolResyncChangedPageCount = numBlocks - entry->mirrorBufpoolResyncCkptBlockNum;
					}

					for (blkno = entry->mirrorBufpoolResyncCkptBlockNum; blkno < numBlocks; blkno++)
					{
						XLogRecPtr	endResyncLSN = (isFullResync() ?
													FileRepResync_GetEndFullResyncLSN() :
													FileRepResync_GetEndIncrResyncLSN());

						SIMPLE_FAULT_INJECTOR(FileRepResyncWorkerRead);

						/* allow flushing buffers from buffer pool during the read */
						FileRepResync_SetReadBufferRequest();
						buf = ReadBuffer_Resync(smgr_relation, blkno);
						FileRepResync_ResetReadBufferRequest();

						LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
						page = BufferGetPage(buf);

						loc = PageGetLSN(page);

						if (Debug_filerep_print)
						{
							elog(LOG,
								 "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' "
								 "lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
								 relidstr,
								 numBlocks,
								 blkno,
								 XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
								 entry->mirrorBufpoolResyncCkptLoc.xlogid,
								 entry->mirrorBufpoolResyncCkptLoc.xrecoff,
								 XLogLocationToString(&loc),
								 loc.xlogid,
								 loc.xrecoff,
								 XLogLocationToString(&endResyncLSN),
								 endResyncLSN.xlogid,
								 endResyncLSN.xrecoff);
						}
						else
						{
							char		tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

							snprintf(tmpBuf, sizeof(tmpBuf),
									 "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' ",
									 relidstr,
									 numBlocks,
									 blkno,
									 XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
									 entry->mirrorBufpoolResyncCkptLoc.xlogid,
									 entry->mirrorBufpoolResyncCkptLoc.xrecoff);

							FileRep_InsertConfigLogEntry(tmpBuf);

							snprintf(tmpBuf, sizeof(tmpBuf),
									 "full resync buffer pool identifier '%s' lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
									 relidstr,
									 XLogLocationToString(&loc),
									 loc.xlogid,
									 loc.xrecoff,
									 XLogLocationToString(&endResyncLSN),
									 endResyncLSN.xlogid,
									 endResyncLSN.xrecoff);

							FileRep_InsertConfigLogEntry(tmpBuf);
						}

						/* Only pages changed inside the resync LSN window are shipped. */
						if (XLByteLE(PageGetLSN(page), endResyncLSN) &&
							XLByteLE(entry->mirrorBufpoolResyncCkptLoc, PageGetLSN(page)))
						{
							smgrwrite(smgr_relation,
									  blkno,
									  (char *) BufferGetBlock(buf),
									  FALSE);
						}

						SIMPLE_FAULT_INJECTOR(FileRepResyncWorker);

						UnlockReleaseBuffer(buf);

						/* Every so often, check that we are still in resync. */
						if (count > thresholdCount)
						{
							count = 0;
							FileRepSubProcess_ProcessSignals();

							if (!(FileRepSubProcess_GetState() == FileRepStateReady &&
								  dataState == DataStateInResync))
							{
								mirrorDataLossOccurred = TRUE;
								break;
							}
						}
						else
							count++;
					}

					if (mirrorDataLossOccurred)
						break;

					if (entry->mirrorDataSynchronizationState != MirroredRelDataSynchronizationState_FullCopy)
					{
						LockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);

						numBlocks = smgrnblocks(smgr_relation);

						smgrtruncate(smgr_relation,
									 numBlocks,
									 TRUE /* isTemp, TRUE means to not record in XLOG */ ,
									 FALSE /* isLocalBuf */ ,
									 &entry->persistentTid,
									 entry->persistentSerialNum);

						UnlockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);
					}

					smgrimmedsync(smgr_relation);
					smgrclose(smgr_relation);

					smgr_relation = NULL;
					break;

				case MirroredRelDataSynchronizationState_None:
				case MirroredRelDataSynchronizationState_DataSynchronized:
					break;

				default:
					/* OIDs are printed spc/db/rel, matching relidstr above. */
					ereport(LOG,
							(errmsg("could not resynchronize relation '%u/%u/%u' "
									"mirror synchronization state:'%s(%d)' ",
									entry->relFileNode.spcNode,
									entry->relFileNode.dbNode,
									entry->relFileNode.relNode,
									MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
									entry->mirrorDataSynchronizationState)));
					break;
			}
			break;

		case PersistentFileSysRelStorageMgr_AppendOnly:
			{
				MirroredAppendOnlyOpen mirroredOpen;
				int			primaryError;
				char	   *buffer = NULL;
				int64		endOffset = entry->mirrorAppendOnlyNewEof;
				int64		startOffset = entry->mirrorAppendOnlyLossEof;
				int32		bufferLen = 0;
				int			retval = 0;

				/*
				 * mirrorDataLossOccurred deliberately refers to the
				 * function-level flag here, so that data loss detected
				 * below is turned into STATUS_ERROR at the end.
				 */

				switch (entry->mirrorDataSynchronizationState)
				{
					case MirroredRelDataSynchronizationState_AppendOnlyCatchup:
					case MirroredRelDataSynchronizationState_FullCopy:

						/*
						 * required in order to report how many blocks were synchronized
						 * if gp_persistent_relation_node does not return that information
						 */
						if (entry->mirrorBufpoolResyncChangedPageCount == 0)
						{
							entry->mirrorBufpoolResyncChangedPageCount = (endOffset - startOffset) / BLCKSZ;
						}

						/*
						 * The MirroredAppendOnly_OpenResynchonize routine knows we are a resynch worker and
						 * will open BOTH, but write only the MIRROR!!!
						 */
						MirroredAppendOnly_OpenResynchonize(
															&mirroredOpen,
															&entry->relFileNode,
															entry->segmentFileNum,
															startOffset,
															&primaryError,
															&mirrorDataLossOccurred);
						if (primaryError != 0)
						{
							ereport(ERROR,
									(errcode_for_file_access(),
									 errmsg("could not open file %u/%u/%u.%u : %s",
											entry->relFileNode.spcNode,
											entry->relFileNode.dbNode,
											entry->relFileNode.relNode,
											entry->segmentFileNum,
											strerror(primaryError))));
							break;
						}

						if (mirrorDataLossOccurred)
							break;

						/* AO and CO Data Store writes 64k size by default */
						bufferLen = (Size) Min(2 * BLCKSZ, endOffset - startOffset);
						buffer = (char *) palloc(bufferLen);
						MemSet(buffer, 0, bufferLen);

						while (startOffset < endOffset)
						{
							retval = MirroredAppendOnly_Read(
															 &mirroredOpen,
															 buffer,
															 bufferLen);

							if (retval != bufferLen)
							{
								ereport(ERROR,
										(errcode_for_file_access(),
										 errmsg("could not read from position:" INT64_FORMAT " in file %u/%u/%u.%u : %m",
												startOffset,
												entry->relFileNode.spcNode,
												entry->relFileNode.dbNode,
												entry->relFileNode.relNode,
												entry->segmentFileNum)));
								break;
							}

							MirroredAppendOnly_Append(
													  &mirroredOpen,
													  buffer,
													  bufferLen,
													  &primaryError,
													  &mirrorDataLossOccurred);

							if (mirrorDataLossOccurred)
								break;

							Assert(primaryError == 0);	/* No primary writes as resync worker. */

							startOffset += bufferLen;
							/* AO and CO Data Store writes 64k size by default */
							bufferLen = (Size) Min(2 * BLCKSZ, endOffset - startOffset);
						}

						pfree(buffer);
						buffer = NULL;

						if (mirrorDataLossOccurred)
							break;

						/* Flush written data on Mirror */
						MirroredAppendOnly_Flush(
												 &mirroredOpen,
												 &primaryError,
												 &mirrorDataLossOccurred);
						if (mirrorDataLossOccurred)
							break;

						Assert(primaryError == 0);	/* Not flushed on primary as resync worker. */

						/* Close Primary and Mirror */
						MirroredAppendOnly_Close(
												 &mirroredOpen,
												 &mirrorDataLossOccurred);

						break;

					case MirroredRelDataSynchronizationState_None:
					case MirroredRelDataSynchronizationState_DataSynchronized:
						break;

					default:
						ereport(LOG,
								(errmsg("could not resynchronize relation '%u/%u/%u' "
										"mirror synchronization state:'%s(%d)' ",
										entry->relFileNode.spcNode,
										entry->relFileNode.dbNode,
										entry->relFileNode.relNode,
										MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
										entry->mirrorDataSynchronizationState)));
						break;
				}

				break;
			}					/* case */

		default:
			Assert(0);
			break;
	}							/* switch */

	if (mirrorDataLossOccurred)
		status = STATUS_ERROR;

	return status;
}
/* * Indicate we intend to create a filespace file as part of the current transaction. * * An XLOG IntentToCreate record is generated that will guard the subsequent file-system * create in case the transaction aborts. * * After 1 or more calls to this routine to mark intention about filespace files that are going * to be created, call ~_DoPendingCreates to do the actual file-system creates. (See its * note on XLOG flushing). */ void PersistentFilespace_MarkCreatePending( Oid filespaceOid, /* The filespace where the filespace lives. */ int16 primaryDbId, char *primaryFilespaceLocation, /* * The primary filespace directory path. NOT Blank padded. * Just a NULL terminated string. */ int16 mirrorDbId, char *mirrorFilespaceLocation, MirroredObjectExistenceState mirrorExistenceState, ItemPointer persistentTid, /* TID of the gp_persistent_rel_files tuple for the rel file */ int64 *persistentSerialNum, bool flushToXLog) /* When true, the XLOG record for this change will be flushed to disk. */ { WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE; PersistentFileSysObjName fsObjName; FilespaceDirEntry filespaceDirEntry; TransactionId topXid; Datum values[Natts_gp_persistent_filespace_node]; char mirrorFilespaceLocationBlankPadded[FilespaceLocationBlankPaddedWithNullTermLen]; char primaryFilespaceLocationBlankPadded[FilespaceLocationBlankPaddedWithNullTermLen]; if (Persistent_BeforePersistenceWork()) { if (Debug_persistent_print) elog(Persistent_DebugPrintLevel(), "Skipping persistent filespace %u because we are before persistence work", filespaceOid); return; // The initdb process will load the persistent table once we out of bootstrap mode. 
} PersistentFilespace_VerifyInitScan(); PersistentFileSysObjName_SetFilespaceDir(&fsObjName,filespaceOid); topXid = GetTopTransactionId(); WRITE_PERSISTENT_STATE_ORDERED_LOCK; PersistentFilespace_BlankPadCopyLocation( primaryFilespaceLocationBlankPadded, primaryFilespaceLocation); PersistentFilespace_BlankPadCopyLocation( mirrorFilespaceLocationBlankPadded, mirrorFilespaceLocation); GpPersistentFilespaceNode_SetDatumValues( values, filespaceOid, primaryDbId, primaryFilespaceLocationBlankPadded, mirrorDbId, mirrorFilespaceLocationBlankPadded, PersistentFileSysState_CreatePending, /* createMirrorDataLossTrackingSessionNum */ 0, mirrorExistenceState, /* reserved */ 0, /* parentXid */ topXid, /* persistentSerialNum */ 0); // This will be set by PersistentFileSysObj_AddTuple. PersistentFileSysObj_AddTuple( PersistentFsObjType_FilespaceDir, values, flushToXLog, persistentTid, persistentSerialNum); WRITE_FILESPACE_HASH_LOCK; filespaceDirEntry = PersistentFilespace_CreateDirUnderLock(filespaceOid); Assert(filespaceDirEntry != NULL); filespaceDirEntry->dbId1 = primaryDbId; memcpy(filespaceDirEntry->locationBlankPadded1, primaryFilespaceLocationBlankPadded, FilespaceLocationBlankPaddedWithNullTermLen); filespaceDirEntry->dbId2 = mirrorDbId; memcpy(filespaceDirEntry->locationBlankPadded2, mirrorFilespaceLocationBlankPadded, FilespaceLocationBlankPaddedWithNullTermLen); filespaceDirEntry->state = PersistentFileSysState_CreatePending; ItemPointerCopy(persistentTid, &filespaceDirEntry->persistentTid); filespaceDirEntry->persistentSerialNum = *persistentSerialNum; WRITE_FILESPACE_HASH_UNLOCK; /* * This XLOG must be generated under the persistent write-lock. 
*/ #ifdef MASTER_MIRROR_SYNC mmxlog_log_create_filespace(filespaceOid); #endif SIMPLE_FAULT_INJECTOR(FaultBeforePendingDeleteFilespaceEntry); /* * MPP-18228 * To make adding 'Create Pending' entry to persistent table and adding * to the PendingDelete list atomic */ PendingDelete_AddCreatePendingEntryWrapper( &fsObjName, persistentTid, *persistentSerialNum); WRITE_PERSISTENT_STATE_ORDERED_UNLOCK; if (Debug_persistent_print) elog(Persistent_DebugPrintLevel(), "Persistent filespace directory: Add '%s' in state 'Created', mirror existence state '%s', serial number " INT64_FORMAT " at TID %s", PersistentFileSysObjName_ObjectName(&fsObjName), MirroredObjectExistenceState_Name(mirrorExistenceState), *persistentSerialNum, ItemPointerToString(persistentTid)); }
/*
 * ExecWorkFile_Write
 *    write the given data from the end of the last write position.
 *
 * Returns false, without writing anything, when 'data' is NULL or 'size' is 0.
 * Every other path that returns at all returns true: a failed disk-space
 * reservation and a short BufFile write are both reported through
 * workfile_mgr_report_error() rather than through the return value.
 */
bool
ExecWorkFile_Write(ExecWorkFile *workfile,
				   void *data,
				   uint64 size)
{
	Assert(workfile != NULL);

	SIMPLE_FAULT_INJECTOR(WorkfileWriteFail);

	if (data == NULL || size == 0)
		return false;

	/* Test the per-query and per-segment limit */
	if ((workfile->flags & EXEC_WORKFILE_LIMIT_SIZE) &&
		!WorkfileDiskspace_Reserve(size))
	{
		/* Failed to reserve additional disk space, notify caller */
		workfile_mgr_report_error();
	}

	switch (workfile->fileType)
	{
		case BUFFILE:
		{
			BufFile    *buffile = (BufFile *) workfile->file;
			int64		current_size = BufFileGetSize(buffile);
			int64		new_size = 0;
			int64		growth;
			uint64		bytes;

			PG_TRY();
			{
				bytes = BufFileWrite(buffile, data, size);
			}
			PG_CATCH();
			{
				/*
				 * Account for whatever the file grew by before the error,
				 * then propagate the error.
				 */
				new_size = BufFileGetSize(buffile);
				workfile->size = new_size;
				WorkfileDiskspace_Commit((new_size - current_size), size,
										 true /* update_query_size */);

				PG_RE_THROW();
			}
			PG_END_TRY();

			/* Commit against the observed file growth, not the requested size */
			new_size = BufFileGetSize(buffile);
			growth = new_size - current_size;
			workfile->size = new_size;

			WorkfileDiskspace_Commit(growth, size, true /* update_query_size */);
			workfile_set_update_in_progress_size(workfile->work_set, growth);

			if (bytes != size)
				workfile_mgr_report_error();

			break;
		}

		case BFZ:
			PG_TRY();
			{
				bfz_append((bfz_t *) workfile->file, data, size);
			}
			PG_CATCH();
			{
				Assert(WorkfileDiskspace_IsFull());
				WorkfileDiskspace_Commit(0, size, true /* update_query_size */);

				PG_RE_THROW();
			}
			PG_END_TRY();

			/* bfz_append always adds to the file size */
			workfile->size += size;

			if ((workfile->flags & EXEC_WORKFILE_LIMIT_SIZE))
				WorkfileDiskspace_Commit(size, size, true /* update_query_size */);

			workfile_set_update_in_progress_size(workfile->work_set, size);
			break;

		default:
			insist_log(false, "invalid work file type: %d", workfile->fileType);
			break;
	}

	return true;
}
/* * Indicate we intend to create a relation file as part of the current transaction. * * This function adds an entry in 'gp_persistent_relation_node' for either a new table (segment file * # 0) or a new segment file under AO table (segment file # > 0 for row/column-oriented AO) with a state * 'Create Pending'. An XLOG IntentToCreate record is generated that will guard the subsequent file-system * create in case the transaction aborts. * * Paramaters * ----------- * relFileNode = The tablespace, database, and relation OIDs for the create * segmentFileNum = As the name implies ( 0 for heap * >= 0 for RO/CO AO as applicable) * relStorageMgr = Persistent Relation storage Manager * relBufpoolKind = Buffer pool type beneath corrosponding relation * TODO bufferPollBulkLoad = ??? * TODO mirrorExistenceState = ??? * TODO relDataSynchronizationState = ??? * flushToXlog = If true, the XLOG record for this change will be flushed to disk. * TODO isLocalBuf = ??? * * Return * ------ * relationName = Name of the relation used for either debugging or to store in PendingDelete LL. * persistentTid = Resulting TID of the gp_persistent_rel_files tuple for the relation * serialNum = Resulting serial number for the relation. 
Distinquishes the uses of the tuple */ void PersistentRelation_AddCreatePending( RelFileNode *relFileNode, int32 segmentFileNum, PersistentFileSysRelStorageMgr relStorageMgr, PersistentFileSysRelBufpoolKind relBufpoolKind, bool bufferPoolBulkLoad, MirroredObjectExistenceState mirrorExistenceState, MirroredRelDataSynchronizationState relDataSynchronizationState, char *relationName, ItemPointer persistentTid, int64 *serialNum, bool flushToXLog, bool isLocalBuf) { WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE; PersistentFileSysObjName fsObjName; XLogRecPtr mirrorBufpoolResyncCkptLoc; Datum values[Natts_gp_persistent_relation_node]; if(RelFileNode_IsEmpty(relFileNode)) elog(ERROR, "Invalid RelFileNode (0,0,0)"); MemSet(&mirrorBufpoolResyncCkptLoc, 0, sizeof(XLogRecPtr)); if (Persistent_BeforePersistenceWork()) { if (Debug_persistent_print) elog(Persistent_DebugPrintLevel(), "Skipping persistent relation '%s' because we are before persistence work", relpath(*relFileNode)); MemSet(persistentTid, 0, sizeof(ItemPointerData)); *serialNum = 0; return; // The initdb process will load the persistent table once we out of bootstrap mode. } /* Verify if the needed shared mem data structures for persistent tables are setup and inited */ PersistentRelation_VerifyInitScan(); /* Setup the file system object name */ PersistentFileSysObjName_SetRelationFile( &fsObjName, relFileNode, segmentFileNum); WRITE_PERSISTENT_STATE_ORDERED_LOCK; /* Create a values array which will be used to create a 'gp_persistent_relation_node' tuple */ GpPersistentRelationNode_SetDatumValues( values, relFileNode->spcNode, relFileNode->dbNode, relFileNode->relNode, segmentFileNum, relStorageMgr, (bufferPoolBulkLoad ? 
PersistentFileSysState_BulkLoadCreatePending : PersistentFileSysState_CreatePending), /* createMirrorDataLossTrackingSessionNum */ 0, mirrorExistenceState, relDataSynchronizationState, /* mirrorBufpoolMarkedForScanIncrementalResync */ false, /* mirrorBufpoolResyncChangedPageCount */ 0, &mirrorBufpoolResyncCkptLoc, /* mirrorBufpoolResyncCkptBlockNum */ 0, /* mirrorAppendOnlyLossEof */ 0, /* mirrorAppendOnlyNewEof */ 0, relBufpoolKind, GetTopTransactionId(), /* persistentSerialNum */ 0); // This will be set by PersistentFileSysObj_AddTuple. /* Add a new tuple to 'gp_persistent_relation_node' table for the new relation/segment file * we intend to create. This will also create and apply a new persistent serial number. */ PersistentFileSysObj_AddTuple( PersistentFsObjType_RelationFile, values, flushToXLog, persistentTid, serialNum); /* * This XLOG must be generated under the persistent write-lock. */ #ifdef MASTER_MIRROR_SYNC mmxlog_log_create_relfilenode( relFileNode->spcNode, relFileNode->dbNode, relFileNode->relNode, segmentFileNum); #endif SIMPLE_FAULT_INJECTOR(FaultBeforePendingDeleteRelationEntry); /* We'll add an entry to the PendingDelete LinkedList (LL) to remeber what we * created in this transaction (or sub-transaction). If the transaction * aborts then we can search for all such entries in this LL and get rid of (delete) * such relations or segment files on the disk. 
* * MPP-18228 * To make adding 'Create Pending' entry to persistent table and adding * to the PendingDelete list atomic */ PendingDelete_AddCreatePendingRelationEntry( &fsObjName, persistentTid, serialNum, relStorageMgr, relationName, isLocalBuf, bufferPoolBulkLoad); WRITE_PERSISTENT_STATE_ORDERED_UNLOCK; if (Debug_persistent_print) elog(Persistent_DebugPrintLevel(), "Persistent relation: Add '%s', relation name '%s' in state 'Create Pending', relation storage manager '%s', mirror existence state '%s', relation data resynchronization state '%s', serial number " INT64_FORMAT " at TID %s", PersistentFileSysObjName_ObjectName(&fsObjName), relationName, PersistentFileSysRelStorageMgr_Name(relStorageMgr), MirroredObjectExistenceState_Name(mirrorExistenceState), MirroredRelDataSynchronizationState_Name(relDataSynchronizationState), *serialNum, ItemPointerToString(persistentTid)); }
/* * FileRepAckPrimary_RunConsumer() */ static int FileRepAckPrimary_RunConsumer(void) { FileRepShmemMessageDescr_s *fileRepShmemMessageDescr = NULL; FileRepMessageHeader_s *fileRepMessageHeader = NULL; pg_crc32 *fileRepMessageHeaderCrc; pg_crc32 messageHeaderCrcLocal = 0; int status = STATUS_OK; bool movePositionConsume = FALSE; FileRepShmem_s *fileRepAckShmem = NULL; FileRep_InsertConfigLogEntry("run consumer"); fileRepAckShmem = fileRepAckShmemArray[FILEREP_ACKSHMEM_MESSAGE_SLOT_PRIMARY_ACK]; while (1) { LWLockAcquire(FileRepAckShmemLock, LW_EXCLUSIVE); if (movePositionConsume) { fileRepAckShmem->positionConsume = fileRepAckShmem->positionConsume + fileRepShmemMessageDescr->messageLength + sizeof(FileRepShmemMessageDescr_s); if (fileRepAckShmem->positionConsume == fileRepAckShmem->positionWraparound && fileRepAckShmem->positionInsert != fileRepAckShmem->positionWraparound) { fileRepAckShmem->positionConsume = fileRepAckShmem->positionBegin; fileRepAckShmem->positionWraparound = fileRepAckShmem->positionEnd; } FileRep_IpcSignal(fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->semP, &fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->refCountSemP); } fileRepShmemMessageDescr = (FileRepShmemMessageDescr_s*) fileRepAckShmem->positionConsume; while ((fileRepAckShmem->positionConsume == fileRepAckShmem->positionInsert) || ((fileRepAckShmem->positionConsume != fileRepAckShmem->positionInsert) && (fileRepShmemMessageDescr->messageState != FileRepShmemMessageStateReady))) { fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->refCountSemC++; LWLockRelease(FileRepAckShmemLock); FileRepSubProcess_ProcessSignals(); if (FileRepSubProcess_GetState() != FileRepStateReady && FileRepSubProcess_GetState() != FileRepStateInitialization) { LWLockAcquire(FileRepAckShmemLock, LW_EXCLUSIVE); break; } FileRep_IpcWait(fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->semC, &fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->refCountSemC, FileRepAckShmemLock); LWLockAcquire(FileRepAckShmemLock, 
LW_EXCLUSIVE); if (fileRepAckShmem->positionConsume == fileRepAckShmem->positionWraparound && fileRepAckShmem->positionInsert != fileRepAckShmem->positionWraparound) { fileRepAckShmem->positionConsume = fileRepAckShmem->positionBegin; fileRepAckShmem->positionWraparound = fileRepAckShmem->positionEnd; } /* Re-assign to find if messageState is changed */ fileRepShmemMessageDescr = (FileRepShmemMessageDescr_s*) fileRepAckShmem->positionConsume; } // internal while fileRepAckShmem->consumeCount++; LWLockRelease(FileRepAckShmemLock); FileRepSubProcess_ProcessSignals(); if (FileRepSubProcess_GetState() != FileRepStateReady && FileRepSubProcess_GetState() != FileRepStateInitialization) { break; } SIMPLE_FAULT_INJECTOR(FileRepConsumer); /* Calculate and compare FileRepMessageHeader_s Crc */ fileRepMessageHeader = (FileRepMessageHeader_s*) (fileRepAckShmem->positionConsume + sizeof(FileRepShmemMessageDescr_s)); FileRep_CalculateCrc((char *) fileRepMessageHeader, sizeof(FileRepMessageHeader_s), &messageHeaderCrcLocal); fileRepMessageHeaderCrc = (pg_crc32 *) (fileRepAckShmem->positionConsume + sizeof(FileRepMessageHeader_s) + sizeof(FileRepShmemMessageDescr_s)); if (*fileRepMessageHeaderCrc != messageHeaderCrcLocal) { status = STATUS_ERROR; ereport(WARNING, (errmsg("mirror failure, " "could not match ack message header checksum between primary '%u' and mirror '%u', " "failover requested", *fileRepMessageHeaderCrc, messageHeaderCrcLocal), errhint("run gprecoverseg to re-establish mirror connectivity"), FileRep_errdetail(fileRepMessageHeader->fileRepIdentifier, fileRepMessageHeader->fileRepRelationType, fileRepMessageHeader->fileRepOperation, fileRepMessageHeader->messageCount), FileRep_errdetail_ShmemAck(), FileRep_errcontext())); break; } /* Write operation is never acknowledged. * That means message should never have body. * CRC of body should be always 0. 
*/ Assert(fileRepMessageHeader->fileRepOperation != FileRepOperationWrite); Assert(fileRepMessageHeader->fileRepMessageBodyCrc == 0); switch (fileRepMessageHeader->fileRepOperation) { case FileRepOperationReconcileXLogEof: xLogEof = fileRepMessageHeader->fileRepOperationDescription.reconcile.xLogEof; if (Debug_filerep_print) ereport(LOG, (errmsg("ack reconcile xlogid '%d' xrecoff '%d' ", xLogEof.xlogid, xLogEof.xrecoff))); break; case FileRepOperationValidation: mirrorStatus = fileRepMessageHeader->fileRepOperationDescription.validation.mirrorStatus; if (Debug_filerep_print) ereport(LOG, (errmsg("ack validation status '%s' ", FileRepStatusToString[mirrorStatus]))); break; case FileRepOperationCreate: mirrorStatus = fileRepMessageHeader->fileRepOperationDescription.create.mirrorStatus; if (Debug_filerep_print) ereport(LOG, (errmsg("ack create status '%s' ", FileRepStatusToString[mirrorStatus]))); break; case FileRepOperationStartSlruChecksum: mirrorStatus = fileRepMessageHeader->fileRepOperationDescription.startChecksum.mirrorStatus; if (Debug_filerep_print) { ereport(LOG, (errmsg("ack start SLRU checksum: status = '%s', directory = '%s' ", FileRepStatusToString[mirrorStatus], fileRepMessageHeader->fileRepIdentifier.fileRepFlatFileIdentifier.directorySimpleName))); } break; case FileRepOperationVerifySlruDirectoryChecksum: mirrorStatus = fileRepMessageHeader->fileRepOperationDescription.verifyDirectoryChecksum.mirrorStatus; if (Debug_filerep_print) { ereport(LOG, (errmsg("ack verify SLRU directory checksum: status = '%s', directory = '%s' ", FileRepStatusToString[mirrorStatus], fileRepMessageHeader->fileRepIdentifier.fileRepFlatFileIdentifier.directorySimpleName))); } break; default: break; } if (fileRepMessageHeader->fileRepAckState != FileRepAckStateCompleted) { status = STATUS_ERROR; ereport(WARNING, (errmsg("mirror failure, " "could not complete operation on mirror ack state '%s', " "failover requested", 
FileRepAckStateToString[fileRepMessageHeader->fileRepAckState]), errhint("run gprecoverseg to re-establish mirror connectivity"), errSendAlert(true), FileRep_errdetail(fileRepMessageHeader->fileRepIdentifier, fileRepMessageHeader->fileRepRelationType, fileRepMessageHeader->fileRepOperation, fileRepMessageHeader->messageCount), FileRep_errdetail_Shmem(), FileRep_errdetail_ShmemAck(), FileRep_errcontext())); /* * FAULT has to be set before entry is updated in ack hash table * in order to suspend backend process. */ FileRep_SetSegmentState(SegmentStateFault, FaultTypeMirror); FileRepSubProcess_ProcessSignals(); } if (FileRepAckPrimary_UpdateHashEntry( fileRepMessageHeader->fileRepIdentifier, fileRepMessageHeader->fileRepRelationType, fileRepMessageHeader->fileRepAckState) != STATUS_OK) { status = STATUS_ERROR; ereport(WARNING, (errmsg("mirror failure, " "could not update ack state '%s' in ack hash table, " "failover requested", FileRepAckStateToString[fileRepMessageHeader->fileRepAckState]), errhint("run gprecoverseg to re-establish mirror connectivity"), errSendAlert(true), FileRep_errdetail(fileRepMessageHeader->fileRepIdentifier, fileRepMessageHeader->fileRepRelationType, fileRepMessageHeader->fileRepOperation, fileRepMessageHeader->messageCount), FileRep_errdetail_Shmem(), FileRep_errdetail_ShmemAck(), FileRep_errcontext())); } FileRep_InsertLogEntry( "P_RunConsumer", fileRepMessageHeader->fileRepIdentifier, fileRepMessageHeader->fileRepRelationType, fileRepMessageHeader->fileRepOperation, messageHeaderCrcLocal, fileRepMessageHeader->fileRepMessageBodyCrc, fileRepMessageHeader->fileRepAckState, FILEREP_UNDEFINED, fileRepMessageHeader->messageCount); if (status != STATUS_OK) { break; } movePositionConsume = TRUE; } // while(1) return status; }
static int FileRepAckPrimary_RunReceiver(void) { uint32_t msgLength = 0; FileRepConsumerProcIndex_e msgType; int status = STATUS_OK; char *msgPositionInsert; FileRepShmemMessageDescr_s *fileRepShmemMessageDescr; uint32 spareField; FileRep_InsertConfigLogEntry("run receiver"); while (1) { FileRepSubProcess_ProcessSignals(); if (FileRepSubProcess_GetState() != FileRepStateReady && FileRepSubProcess_GetState() != FileRepStateInitialization) { break; } if ( ! FileRepConnServer_AwaitMessageBegin()) { /* call was interrupted ... go back to beginning to process signals */ continue; } status = FileRepConnServer_ReceiveMessageType(&msgType); if (status != STATUS_OK) { break; } /* DATA MESSAGE TYPE */ status = FileRepConnServer_ReceiveMessageLength(&msgLength); if (status != STATUS_OK) { break; } msgPositionInsert = FileRep_ReserveShmem(fileRepAckShmemArray[msgType], msgLength, /* not used */ &spareField, FileRepOperationNotSpecified, FileRepAckShmemLock); if (msgPositionInsert == NULL) { status = STATUS_ERROR; ereport(WARNING, (errmsg("mirror failure, " "could not queue received ack message to be processed, " "failover requested"), errhint("run gprecoverseg to re-establish mirror connectivity"), FileRep_errdetail_Shmem(), FileRep_errdetail_ShmemAck(), FileRep_errcontext())); break; } status = FileRepConnServer_ReceiveMessageData( msgPositionInsert + sizeof(FileRepShmemMessageDescr_s), msgLength); if (status != STATUS_OK) { break; } SIMPLE_FAULT_INJECTOR(FileRepReceiver); fileRepShmemMessageDescr = (FileRepShmemMessageDescr_s*) msgPositionInsert; /* it is not in use */ fileRepShmemMessageDescr->messageSync = FALSE; fileRepShmemMessageDescr->messageState = FileRepShmemMessageStateReady; LWLockAcquire(FileRepAckShmemLock, LW_EXCLUSIVE); FileRep_IpcSignal(fileRepIpcArray[fileRepAckShmemArray[msgType]->ipcArrayIndex]->semC, &fileRepIpcArray[fileRepAckShmemArray[msgType]->ipcArrayIndex]->refCountSemC); LWLockRelease(FileRepAckShmemLock); FileRep_InsertLogEntry( "P_RunReceiver", 
FileRep_GetFlatFileIdentifier("", ""), FileRepRelationTypeNotSpecified, FileRepOperationNotSpecified, FILEREP_UNDEFINED, FILEREP_UNDEFINED, FileRepAckStateNotInitialized, spareField, FILEREP_UNDEFINED); } // while(1) FileRepConnServer_CloseConnection(); return status; }
/* * SenderLoop * */ static int FileRepAckMirror_RunSender(void) { FileRepShmemMessageDescr_s *fileRepShmemMessageDescr=NULL; char *fileRepMessage; int status = STATUS_OK; bool movePositionConsume = FALSE; FileRepConsumerProcIndex_e messageType; FileRepMessageHeader_s *fileRepMessageHeader; FileRepShmem_s *fileRepAckShmem = NULL; FileRep_InsertConfigLogEntry("run sender ack"); fileRepAckShmem = fileRepAckShmemArray[FILEREP_OUTGOING_MESSAGE_QUEUE]; while (1) { LWLockAcquire(FileRepAckShmemLock, LW_EXCLUSIVE); if (movePositionConsume) { fileRepAckShmem->positionConsume = fileRepAckShmem->positionConsume + fileRepShmemMessageDescr->messageLength + sizeof(FileRepShmemMessageDescr_s); if (fileRepAckShmem->positionConsume == fileRepAckShmem->positionWraparound && fileRepAckShmem->positionInsert != fileRepAckShmem->positionWraparound) { fileRepAckShmem->positionConsume = fileRepAckShmem->positionBegin; fileRepAckShmem->positionWraparound = fileRepAckShmem->positionEnd; } FileRep_IpcSignal(fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->semP, &fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->refCountSemP); } fileRepShmemMessageDescr = (FileRepShmemMessageDescr_s*) fileRepAckShmem->positionConsume; while ((fileRepAckShmem->positionConsume == fileRepAckShmem->positionInsert) || ((fileRepAckShmem->positionConsume != fileRepAckShmem->positionInsert) && (fileRepShmemMessageDescr->messageState != FileRepShmemMessageStateReady))) { fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->refCountSemC++; LWLockRelease(FileRepAckShmemLock); FileRepSubProcess_ProcessSignals(); if (FileRepSubProcess_GetState() != FileRepStateReady) { LWLockAcquire(FileRepAckShmemLock, LW_EXCLUSIVE); break; } FileRep_IpcWait(fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->semC, &fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->refCountSemC, FileRepAckShmemLock); LWLockAcquire(FileRepAckShmemLock, LW_EXCLUSIVE); if (fileRepAckShmem->positionConsume == fileRepAckShmem->positionWraparound && 
fileRepAckShmem->positionInsert != fileRepAckShmem->positionWraparound) { fileRepAckShmem->positionConsume = fileRepAckShmem->positionBegin; fileRepAckShmem->positionWraparound = fileRepAckShmem->positionEnd; } /* Re-assign to find if messageState is changed */ fileRepShmemMessageDescr = (FileRepShmemMessageDescr_s*) fileRepAckShmem->positionConsume; } // while internal fileRepAckShmem->consumeCount++; LWLockRelease(FileRepAckShmemLock); FileRepSubProcess_ProcessSignals(); if (FileRepSubProcess_GetState() != FileRepStateReady) { break; } FileRep_InsertLogEntry( "M_RunSenderAck", FileRep_GetFlatFileIdentifier("", ""), FileRepRelationTypeNotSpecified, FileRepOperationNotSpecified, FILEREP_UNDEFINED, FILEREP_UNDEFINED, FileRepAckStateNotInitialized, FILEREP_UNDEFINED, FILEREP_UNDEFINED); SIMPLE_FAULT_INJECTOR(FileRepSender); fileRepMessage = (char*) (fileRepAckShmem->positionConsume + sizeof(FileRepShmemMessageDescr_s)); fileRepMessageHeader = (FileRepMessageHeader_s*) (fileRepAckShmem->positionConsume + sizeof(FileRepShmemMessageDescr_s)); messageType = FileRepMessageTypeXLog; if (! FileRepConnClient_SendMessage( messageType, fileRepShmemMessageDescr->messageSync, fileRepMessage, fileRepShmemMessageDescr->messageLength)) { ereport(WARNING, (errcode_for_socket_access(), errmsg("mirror failure, " "could not sent ack message to primary : %m, " "failover requested"), errhint("run gprecoverseg to re-establish mirror connectivity"), FileRep_errdetail_ShmemAck(), FileRep_errcontext())); status = STATUS_ERROR; break; } movePositionConsume = TRUE; } // while(1) FileRepConnClient_CloseConnection(); return status; }
/*
 * update segment configuration in catalog and shared memory
 *
 * For each of the 'changeCount' entries in 'changes':
 *   - inserts a row describing the change into gp_configuration_history;
 *   - updates role and status (and mode, when change logging is on) of the
 *     matching gp_segment_configuration row, raising ERROR if the dbid is
 *     not found;
 *   - stores the new status in ftsProbeInfo->fts_status[dbid].
 *
 * All of this runs in a transaction started here.  Returns true after the
 * transaction has been committed.  Returns false when shutdown_requested is
 * set; on that path the transaction is not committed and
 * CurrentResourceOwner is not restored.
 */
static bool
probeUpdateConfig(FtsSegmentStatusChange *changes, int changeCount)
{
	Relation configrel;
	Relation histrel;
	SysScanDesc sscan;
	ScanKeyData scankey;
	HeapTuple configtuple;
	HeapTuple newtuple;
	HeapTuple histtuple;
	Datum configvals[Natts_gp_segment_configuration];
	bool confignulls[Natts_gp_segment_configuration] = { false };
	bool repls[Natts_gp_segment_configuration] = { false };
	Datum histvals[Natts_gp_configuration_history];
	bool histnulls[Natts_gp_configuration_history] = { false };
	bool valid;
	bool primary;
	bool changelogging;
	int i;
	char desc[SQL_CMD_BUF_SIZE];

	/*
	 * Commit/abort transaction below will destroy
	 * CurrentResourceOwner.  We need it for catalog reads.
	 */
	ResourceOwner save = CurrentResourceOwner;
	StartTransactionCommand();
	elog(LOG, "probeUpdateConfig called for %d changes", changeCount);

	histrel = heap_open(GpConfigHistoryRelationId,
						RowExclusiveLock);
	configrel = heap_open(GpSegmentConfigRelationId,
						  RowExclusiveLock);

	for (i = 0; i < changeCount; i++)
	{
		FtsSegmentStatusChange *change = &changes[i];

		/*
		 * Compare against zero explicitly so the result is a clean 0/1 and
		 * cannot be lost to truncation if bool is narrower than the mask.
		 */
		valid = (change->newStatus & FTS_STATUS_ALIVE) != 0;
		primary = (change->newStatus & FTS_STATUS_PRIMARY) != 0;
		changelogging = (change->newStatus & FTS_STATUS_CHANGELOGGING) != 0;

		if (changelogging)
		{
			Assert(failover_strategy == 'f');
			Assert(primary && valid);
		}

		Assert((valid || !primary) && "Primary cannot be down");

		/*
		 * Insert new tuple into gp_configuration_history catalog.
		 */
		histvals[Anum_gp_configuration_history_time-1] =
				TimestampTzGetDatum(GetCurrentTimestamp());
		histvals[Anum_gp_configuration_history_dbid-1] =
				Int16GetDatum(change->dbid);
		snprintf(desc, sizeof(desc),
				 "FTS: content %d fault marking status %s%s role %c",
				 change->segindex, valid ? "UP" : "DOWN",
				 (changelogging) ? " mode: change-tracking" : "",
				 primary ? 'p' : 'm');
		histvals[Anum_gp_configuration_history_desc-1] =
					CStringGetTextDatum(desc);
		histtuple = heap_form_tuple(RelationGetDescr(histrel), histvals, histnulls);
		simple_heap_insert(histrel, histtuple);
		CatalogUpdateIndexes(histrel, histtuple);

		/*
		 * Find and update gp_segment_configuration tuple.
		 */
		ScanKeyInit(&scankey,
					Anum_gp_segment_configuration_dbid,
					BTEqualStrategyNumber, F_INT2EQ,
					Int16GetDatum(change->dbid));
		sscan = systable_beginscan(configrel, GpSegmentConfigDbidIndexId,
								   true, SnapshotNow, 1, &scankey);
		configtuple = systable_getnext(sscan);
		if (!HeapTupleIsValid(configtuple))
		{
			elog(ERROR, "FTS cannot find dbid=%d in %s", change->dbid,
				 RelationGetRelationName(configrel));
		}

		/* Role and status are always replaced. */
		configvals[Anum_gp_segment_configuration_role-1] = CharGetDatum(primary ? 'p' : 'm');
		repls[Anum_gp_segment_configuration_role-1] = true;
		configvals[Anum_gp_segment_configuration_status-1] = CharGetDatum(valid ? 'u' : 'd');
		repls[Anum_gp_segment_configuration_status-1] = true;

		/* Mode is replaced (with 'c') only when change logging is on. */
		if (changelogging)
		{
			configvals[Anum_gp_segment_configuration_mode-1] = CharGetDatum('c');
		}
		repls[Anum_gp_segment_configuration_mode-1] = changelogging;

		newtuple = heap_modify_tuple(configtuple, RelationGetDescr(configrel),
									 configvals, confignulls, repls);
		simple_heap_update(configrel, &configtuple->t_self, newtuple);
		CatalogUpdateIndexes(configrel, newtuple);

		systable_endscan(sscan);
		pfree(newtuple);

		/*
		 * Update shared memory
		 */
		ftsProbeInfo->fts_status[change->dbid] = change->newStatus;
	}

	heap_close(histrel, RowExclusiveLock);
	heap_close(configrel, RowExclusiveLock);

	SIMPLE_FAULT_INJECTOR(FtsWaitForShutdown);

	/*
	 * Do not block shutdown.  We will always get a change to update
	 * gp_segment_configuration in subsequent probes upon database
	 * restart.
	 */
	if (shutdown_requested)
	{
		elog(LOG, "Shutdown in progress, ignoring FTS prober updates.");
		return false;
	}

	CommitTransactionCommand();
	CurrentResourceOwner = save;
	return true;
}