Beispiel #1
0
/* ------------------------------------------------------------------
 * 	ExecShareInputScan
 *
 * 	Fetch the next tuple of the shared input in the executor's current
 * 	scan direction.  The backing store state is initialized lazily on
 * 	the first call.
 *
 * 	Returns the node's result slot holding the tuple, or NULL when the
 * 	store has no further tuple in that direction.
 * ------------------------------------------------------------------
 */
TupleTableSlot *
ExecShareInputScan(ShareInputScanState *node)
{
	ShareInputScan *plan = (ShareInputScan *) node->ss.ps.plan;
	ShareType	kind = plan->share_type;
	bool		goForward = ScanDirectionIsForward(node->ss.ps.state->es_direction);
	TupleTableSlot *result;
	bool		found;

	/* Lazily build the tuplestore state on the very first call. */
	if (node->ts_state == NULL)
	{
		elog(DEBUG1, "SISC (shareid=%d, slice=%d): No tuplestore yet, initializing tuplestore",
			 plan->share_id, currentSliceId);
		init_tuplestore_state(node);
	}

	result = node->ss.ps.ps_ResultTupleSlot;

	if (kind == SHARE_MATERIAL || kind == SHARE_MATERIAL_XSLICE)
	{
		/* Materialized share: step the accessor, then read where it landed. */
		NTupleStoreAccessor *accessor = (NTupleStoreAccessor *) node->ts_pos;

		ntuplestore_acc_advance(accessor, goForward ? 1 : -1);
		found = ntuplestore_acc_current_tupleslot(accessor, result);
	}
	else
	{
		/* Sort share: read through our private position in the sort store. */
		found = tuplesort_gettupleslot_pos(node->ts_state->sortstore,
										   (TuplesortPos *) node->ts_pos,
										   goForward,
										   result,
										   CurrentMemoryContext);
	}

	if (!found)
		return NULL;

	SIMPLE_FAULT_INJECTOR(ExecShareInputNext);

	return result;
}
Beispiel #2
0
/*
 * workfile_mgr_create_fileno
 *
 * Create a new numbered workfile inside the given workfile set.
 *
 * The file name is generated from file_no by retrieve_file_no().  The file
 * is created with the set's file type and compression type, is marked to be
 * deleted on close, and is attached to the set before being returned.
 */
ExecWorkFile *
workfile_mgr_create_fileno(workfile_set *work_set, uint32 file_no)
{
	char		path[MAXPGPATH];
	ExecWorkFile *workfile;

	Assert(work_set != NULL);

	retrieve_file_no(work_set, file_no, path, sizeof(path));

	workfile = ExecWorkFile_Create(path,
								   work_set->metadata.type,
								   true,	/* del_on_close */
								   work_set->metadata.bfz_compress_type);

	SIMPLE_FAULT_INJECTOR(WorkfileCreationFail);

	/* Attach the new file to its owning set before handing it out. */
	ExecWorkfile_SetWorkset(workfile, work_set);

	return workfile;
}
/*
 * Indicate we intend to create a tablespace file as part of the current transaction.
 *
 * An XLOG IntentToCreate record is generated that will guard the subsequent file-system
 * create in case the transaction aborts.
 *
 * After 1 or more calls to this routine to mark intention about tablespace files that are going
 * to be created, call ~_DoPendingCreates to do the actual file-system creates.  (See its
 * note on XLOG flushing).
 *
 * Parameters:
 *	filespaceOid		  The filespace where the tablespace lives.
 *	tablespaceOid		  The tablespace OID for the create.
 *	mirrorExistenceState  Mirror existence state stored with the new entry.
 *	persistentTid		  TID of the persistent tuple for this tablespace.  It is
 *						  handed to PersistentTablespace_AddTuple() and afterwards
 *						  copied into the shared tablespace entry and the
 *						  pending-delete entry (presumably filled in by AddTuple
 *						  -- TODO confirm).
 *	persistentSerialNum	  Serial number of that persistent tuple; handled the same
 *						  way as persistentTid.
 *	flushToXLog			  When true, the XLOG record for this change will be
 *						  flushed to disk.
 *
 * When called before persistence work has started, nothing is recorded; only
 * an optional debug message is emitted.
 */
void
PersistentTablespace_MarkCreatePending(
									   Oid filespaceOid,
									   Oid tablespaceOid,
									   MirroredObjectExistenceState mirrorExistenceState,
									   ItemPointer persistentTid,
									   int64 *persistentSerialNum,
									   bool flushToXLog)
{
	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	PersistentFileSysObjName fsObjName;

	TablespaceDirEntry tablespaceDirEntry;
	TransactionId topXid;

	if (Persistent_BeforePersistenceWork())
	{
		if (Debug_persistent_print)
			elog(Persistent_DebugPrintLevel(),
				 "Skipping persistent tablespace %u because we are before persistence work",
				 tablespaceOid);

		/*
		 * The initdb process will load the persistent table once we are out
		 * of bootstrap mode.
		 */
		return;
	}

	PersistentTablespace_VerifyInitScan();

	PersistentFileSysObjName_SetTablespaceDir(&fsObjName, tablespaceOid);

	topXid = GetTopTransactionId();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	PersistentTablespace_AddTuple(
								  filespaceOid,
								  tablespaceOid,
								  PersistentFileSysState_CreatePending,
								   /* createMirrorDataLossTrackingSessionNum */ 0,
								  mirrorExistenceState,
								   /* reserved */ 0,
								   /* parentXid */ topXid,
								  flushToXLog,
								  persistentTid,
								  persistentSerialNum);

	/* Mirror the new persistent tuple into the shared tablespace hash entry. */
	WRITE_TABLESPACE_HASH_LOCK;
	tablespaceDirEntry =
		PersistentTablespace_CreateEntryUnderLock(filespaceOid, tablespaceOid);
	Assert(tablespaceDirEntry != NULL);
	tablespaceDirEntry->state = PersistentFileSysState_CreatePending;
	ItemPointerCopy(persistentTid, &tablespaceDirEntry->persistentTid);
	tablespaceDirEntry->persistentSerialNum = *persistentSerialNum;
	WRITE_TABLESPACE_HASH_UNLOCK;

	/*
	 * This XLOG must be generated under the persistent write-lock.
	 */
#ifdef MASTER_MIRROR_SYNC
	mmxlog_log_create_tablespace(
								 filespaceOid,
								 tablespaceOid);
#endif

	SIMPLE_FAULT_INJECTOR(FaultBeforePendingDeleteTablespaceEntry);

	/*
	 * MPP-18228 To make adding 'Create Pending' entry to persistent table and
	 * adding to the PendingDelete list atomic
	 */
	PendingDelete_AddCreatePendingEntryWrapper(
											   &fsObjName,
											   persistentTid,
											   *persistentSerialNum);

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;

	/* The entry was added as Create Pending, so report that state. */
	if (Debug_persistent_print)
		elog(Persistent_DebugPrintLevel(),
			 "Persistent tablespace directory: Add '%s' in state 'Create Pending', mirror existence state '%s', serial number " INT64_FORMAT " at TID %s",
			 PersistentFileSysObjName_ObjectName(&fsObjName),
			 MirroredObjectExistenceState_Name(mirrorExistenceState),
			 *persistentSerialNum,
			 ItemPointerToString(persistentTid));
}
Beispiel #4
0
/*
 * Actually do a base backup for the specified tablespaces.
 *
 * This is split out mainly to avoid complaints about "variable might be
 * clobbered by longjmp" from stupider versions of gcc.
 *
 * Visible steps: start the backup via do_pg_start_backup(), publish the
 * start location with WalSndSetXLogCleanUpTo(), send the start location to
 * the client, then send a header followed by each filespace in turn.
 *
 * opt: backup options (label, fastcheckpoint, exclude, includewal are read
 * here).  tblspcdir is not referenced in the visible part of this function.
 */
static void
perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
{
	XLogRecPtr	startptr;
	XLogRecPtr	endptr;
	char	   *labelfile;

	startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &labelfile);
	Assert(!XLogRecPtrIsInvalid(startptr));

	/*
	 * NOTE(review): this message is conditioned on !debug_basebackup while
	 * the "Sent file" message below is conditioned on debug_basebackup --
	 * confirm that the negation here is intended.
	 */
	elogif(!debug_basebackup, LOG,
		   "basebackup perform -- "
		   "Basebackup start xlog location = %X/%X",
		   startptr.xlogid, startptr.xrecoff);

	/*
	 * Set xlogCleanUpTo so that checkpoint process knows
	 * which old xlog files should not be cleaned
	 */
	WalSndSetXLogCleanUpTo(startptr);

	SIMPLE_FAULT_INJECTOR(BaseBackupPostCreateCheckpoint);

	/* Tell the client where the backup starts. */
	SendXlogRecPtrResult(startptr);

	/* From here on, base_backup_cleanup is registered to run on error. */
	PG_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
	{
		List	   *filespaces = NIL;
		ListCell   *lc;

		/* Collect information about all filespaces, including pg_system */
		filespaces = get_filespaces_to_send(opt);

		/* Send filespace header */
		SendBackupHeader(filespaces);

		/* Send off our filespaces one by one */
		foreach(lc, filespaces)
		{
			filespaceinfo *fi = (filespaceinfo *) lfirst(lc);
			StringInfoData buf;

			/* Send CopyOutResponse message */
			pq_beginmessage(&buf, 'H');
			pq_sendbyte(&buf, 0);		/* overall format */
			pq_sendint(&buf, 0, 2);		/* natts */
			pq_endmessage(&buf);

			/*
			 * A NULL primary_path is treated throughout this loop as the
			 * main data directory ("."); its tar gets the extra files.
			 */

			/* In the main tar, include the backup_label first. */
			if (fi->primary_path == NULL)
				sendFileWithContent(BACKUP_LABEL_FILE, labelfile);

			sendDir(fi->primary_path == NULL ? "." : fi->primary_path,
					fi->primary_path == NULL ? 1 : strlen(fi->primary_path),
					opt->exclude, false);

			/* In the main tar, include pg_control last. */
			if (fi->primary_path == NULL)
			{
				struct stat statbuf;

				if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
				{
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not stat control file \"%s\": %m",
									XLOG_CONTROL_FILE)));
				}

				sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf);

				elogif(debug_basebackup, LOG,
					   "basebackup perform -- Sent file %s." , XLOG_CONTROL_FILE);
			}

			/*
			 * If we're including WAL, and this is the main data directory we
			 * don't terminate the tar stream here. Instead, we will append
			 * the xlog files below and terminate it then. This is safe since
			 * the main data directory is always sent *last*.
			 */
			if (opt->includewal && fi->xlogdir)
			{
				/* The entry carrying the xlog directory must be the last one. */
				Assert(lnext(lc) == NULL);
			}
			else
				pq_putemptymessage('c');		/* CopyDone */
		}
	}
Beispiel #5
0
/*
 * datumstreamread_block_content
 *
 * Load the content of the block described by acc->getBlockInfo and get the
 * stream ready to read the block's first datum.
 *
 *	AOCSBK_BLOCK: compressed content is read into acc->large_object_buffer
 *				  (our own buffer, grown on demand); uncompressed content is
 *				  used directly from the storage-read buffer.
 *	AOCSBK_BLOB:  the single large varlena is read into
 *				  acc->large_object_buffer (grown on demand).
 *	otherwise:	  elog(ERROR).
 *
 * The buffer pointer and its recorded size are kept consistent across an
 * error: the pointer is cleared and the size zeroed as soon as the old
 * buffer is freed, and the new size is recorded only after the new
 * allocation has succeeded.  That way a later call or cleanup never sees a
 * freed pointer, nor a NULL buffer with a non-zero size.
 */
void
datumstreamread_block_content(DatumStreamRead * acc)
{
	Assert(acc);

	/*
	 * Clear out state from previous block.
	 */
	DatumStreamBlockRead_Reset(&acc->blockRead);

	acc->largeObjectState = DatumStreamLargeObjectState_None;

	/*
	 * Read in data.
	 */
	if (acc->getBlockInfo.execBlockKind == AOCSBK_BLOCK)
	{
		Assert(!acc->getBlockInfo.isLarge);

		if (acc->getBlockInfo.isCompressed)
		{
			/* Compressed, need to decompress to our own buffer.  */
			if (acc->large_object_buffer_size < acc->getBlockInfo.contentLen)
			{
				MemoryContext oldCtxt;

				oldCtxt = MemoryContextSwitchTo(acc->memctxt);

				if (acc->large_object_buffer)
				{
					pfree(acc->large_object_buffer);
					acc->large_object_buffer = NULL;
					acc->large_object_buffer_size = 0;

					SIMPLE_FAULT_INJECTOR(MallocFailure);
				}

				/* Record the new size only once the allocation succeeded. */
				acc->large_object_buffer = palloc(acc->getBlockInfo.contentLen);
				acc->large_object_buffer_size = acc->getBlockInfo.contentLen;
				MemoryContextSwitchTo(oldCtxt);
			}

			AppendOnlyStorageRead_Content(
										  &acc->ao_read,
										  (uint8 *) acc->large_object_buffer,
										  acc->getBlockInfo.contentLen);

			acc->buffer_beginp = acc->large_object_buffer;
		}
		else
		{
			/* Uncompressed: point straight at the storage-read buffer. */
			acc->buffer_beginp = AppendOnlyStorageRead_GetBuffer(&acc->ao_read);
		}


		if (Debug_appendonly_print_datumstream)
			elog(LOG,
				 "datumstream_read_block_content filePathName %s firstRowNum " INT64_FORMAT " rowCnt %u "
				 "ndatum %u contentLen %d datump %p",
				 acc->ao_read.bufferedRead.filePathName,
				 acc->getBlockInfo.firstRow,
				 acc->getBlockInfo.rowCnt,
				 acc->blockRead.logical_row_count,
				 acc->getBlockInfo.contentLen, acc->blockRead.datump);

	}
	else if (acc->getBlockInfo.execBlockKind == AOCSBK_BLOB)
	{
		Assert(acc->getBlockInfo.rowCnt == 1);

		if (acc->typeInfo.datumlen >= 0)
		{
			elog(ERROR, "Large object must be variable length objects (varlena)");
		}

		/*
		 * NOTE: Do not assert the content is large.  What appears to be large
		 * content can compress into one AO storage block.
		 */

		if (acc->large_object_buffer_size < acc->getBlockInfo.contentLen)
		{
			MemoryContext oldCtxt;

			oldCtxt = MemoryContextSwitchTo(acc->memctxt);

			if (acc->large_object_buffer)
			{
				/* Same discipline as the AOCSBK_BLOCK path above. */
				pfree(acc->large_object_buffer);
				acc->large_object_buffer = NULL;
				acc->large_object_buffer_size = 0;
			}

			acc->large_object_buffer = palloc(acc->getBlockInfo.contentLen);
			acc->large_object_buffer_size = acc->getBlockInfo.contentLen;
			MemoryContextSwitchTo(oldCtxt);
		}

		AppendOnlyStorageRead_Content(
									  &acc->ao_read,
									  acc->large_object_buffer,
									  acc->getBlockInfo.contentLen);

		acc->buffer_beginp = acc->large_object_buffer;
		acc->largeObjectState = DatumStreamLargeObjectState_HaveAoContent;

		if (Debug_datumstream_read_check_large_varlena_integrity)
		{
			datumstreamread_check_large_varlena_integrity(
														  acc,
														  acc->buffer_beginp,
											   acc->getBlockInfo.contentLen);
		}
	}
	else
	{
		elog(ERROR,
			 "Unexpected Append-Only Column Store executor kind %d",
			 acc->getBlockInfo.execBlockKind);
	}

	/*
	 * Unpack the information from the block headers and get ready to read the first datum.
	 */
	datumstreamread_block_get_ready(acc);
}
Beispiel #6
0
/* ----------------------------------------------------------------
 *		MultiExecHash
 *
 *		build hash table for hashjoin, doing partitioning if more
 *		than one batch is required.
 *
 *		Pulls every tuple from the outer plan, computes its hash value
 *		and inserts it into node->hashtable.  If a tuple with NULL hash
 *		keys is seen, node->hs_hashkeys_null is set; when
 *		node->hs_quit_if_hashkeys_null is also set, the outer plan is
 *		squelched and the build stops early.
 *
 *		Always returns NULL; the parent Hashjoin node reads the hash
 *		table from the node state.
 * ----------------------------------------------------------------
 */
Node *
MultiExecHash(HashState *node)
{
	PlanState  *outerNode;
	List	   *hashkeys;
	HashJoinTable hashtable;
	TupleTableSlot *slot;
	ExprContext *econtext;
	uint32		hashvalue = 0;

	/* must provide our own instrumentation support */
	if (node->ps.instrument)
		InstrStartNode(node->ps.instrument);

	/*
	 * get state info from node
	 */
	outerNode = outerPlanState(node);
	hashtable = node->hashtable;

	/*
	 * set expression context
	 */
	hashkeys = node->hashkeys;
	econtext = node->ps.ps_ExprContext;

	SIMPLE_FAULT_INJECTOR(MultiExecHashLargeVmem);

	/*
	 * get all inner tuples and insert into the hash table (or temp files)
	 */
	for (;;)
	{
		bool		hashkeys_null = false;

		slot = ExecProcNode(outerNode);
		if (TupIsNull(slot))
			break;

		Gpmon_M_Incr(GpmonPktFromHashState(node), GPMON_QEXEC_M_ROWSIN);
		CheckSendPlanStateGpmonPkt(&node->ps);

		/* We have to compute the hash value */
		econtext->ecxt_innertuple = slot;

		if (ExecHashGetHashValue(node, hashtable, econtext, hashkeys, false,
								 node->hs_keepnull, &hashvalue, &hashkeys_null))
		{
			ExecHashTableInsert(node, hashtable, slot, hashvalue);
		}

		if (hashkeys_null)
		{
			node->hs_hashkeys_null = true;
			if (node->hs_quit_if_hashkeys_null)
			{
				ExecSquelchNode(outerNode);

				/*
				 * Early exit: balance the InstrStartNode() call made at the
				 * top so this node's instrumentation is not left open.
				 */
				if (node->ps.instrument)
					InstrStopNode(node->ps.instrument, hashtable->totalTuples);

				return NULL;
			}
		}
	}

	/* Now we have set up all the initial batches & primary overflow batches. */
	hashtable->nbatch_outstart = hashtable->nbatch;

	/* must provide our own instrumentation support */
	if (node->ps.instrument)
		InstrStopNode(node->ps.instrument, hashtable->totalTuples);

	/*
	 * We do not return the hash table directly because it's not a subtype of
	 * Node, and so would violate the MultiExecProcNode API.  Instead, our
	 * parent Hashjoin node is expected to know how to fish it out of our node
	 * state.  Ugly but not really worth cleaning up, since Hashjoin knows
	 * quite a bit more about Hash besides that.
	 */
	return NULL;
}
Beispiel #7
0
/*
 * Creates a new gang by logging on a session to each segDB involved.
 *
 * call this function in GangContext memory context.
 * elog ERROR or return a non-NULL gang.
 *
 * type, gang_id, size and content are passed through to
 * buildGangDefinition(); size must be 1 or the segment count (asserted).
 *
 * Connections are made by worker threads, each handling up to
 * gp_connections_per_thread segment descriptors.  If every connection
 * succeeds the gang is returned.  If the only failures are segments in
 * recovery mode and this is the primary writer gang, the whole attempt is
 * retried up to gp_gang_creation_retry_count times.  Any other outcome
 * destroys the gang and raises an ERROR.
 */
static Gang *
createGang_thread(GangType type, int gang_id, int size, int content)
{
	Gang	   *newGangDefinition = NULL;
	SegmentDatabaseDescriptor *segdbDesc = NULL;
	DoConnectParms *doConnectParmsAr = NULL;
	DoConnectParms *pParms = NULL;
	int			parmIndex = 0;
	int			threadCount = 0;
	int			i = 0;
	int			create_gang_retry_counter = 0;
	int			in_recovery_mode_count = 0;
	int			successful_connections = 0;

	/* Accumulates the text reported as errdetail when creation fails. */
	PQExpBufferData create_gang_error;

	ELOG_DISPATCHER_DEBUG("createGang type = %d, gang_id = %d, size = %d, content = %d",
						  type, gang_id, size, content);

	/* check arguments */
	Assert(size == 1 || size == getgpsegmentCount());
	Assert(CurrentResourceOwner != NULL);
	Assert(CurrentMemoryContext == GangContext);
	Assert(gp_connections_per_thread > 0);

	/* Writer gang is created before reader gangs. */
	if (type == GANGTYPE_PRIMARY_WRITER)
		Insist(!GangsExist());

	initPQExpBuffer(&create_gang_error);

	Assert(CurrentGangCreating == NULL);

create_gang_retry:

	/*
	 * If we're in a retry, we may need to reset our initial state a bit. We
	 * also want to ensure that all resources have been released.
	 */
	Assert(newGangDefinition == NULL);
	Assert(doConnectParmsAr == NULL);
	successful_connections = 0;
	in_recovery_mode_count = 0;
	threadCount = 0;

	/* allocate and initialize a gang structure */
	newGangDefinition = buildGangDefinition(type, gang_id, size, content);
	CurrentGangCreating = newGangDefinition;

	Assert(newGangDefinition != NULL);
	Assert(newGangDefinition->size == size);
	Assert(newGangDefinition->perGangContext != NULL);
	/* Work in the gang's own context until the connection results are in. */
	MemoryContextSwitchTo(newGangDefinition->perGangContext);

	resetPQExpBuffer(&create_gang_error);

	/*
	 * The most threads we could have is segdb_count /
	 * gp_connections_per_thread, rounded up. This is equivalent to 1 +
	 * (segdb_count-1) / gp_connections_per_thread. We allocate enough memory
	 * for this many DoConnectParms structures, even though we may not use
	 * them all.
	 */
	threadCount = 1 + (size - 1) / gp_connections_per_thread;
	Assert(threadCount > 0);

	/* initialize connect parameters */
	doConnectParmsAr = makeConnectParms(threadCount, type, gang_id);
	for (i = 0; i < size; i++)
	{
		/* Descriptor i goes to thread i / gp_connections_per_thread. */
		parmIndex = i / gp_connections_per_thread;
		pParms = &doConnectParmsAr[parmIndex];
		segdbDesc = &newGangDefinition->db_descriptors[i];
		pParms->segdbDescPtrArray[pParms->db_count++] = segdbDesc;
	}

	/* start threads and doing the connect */
	for (i = 0; i < threadCount; i++)
	{
		int			pthread_err;

		pParms = &doConnectParmsAr[i];

		ELOG_DISPATCHER_DEBUG("createGang creating thread %d of %d for libpq connections",
							  i + 1, threadCount);

		pthread_err = gp_pthread_create(&pParms->thread, thread_DoConnect, pParms, "createGang");
		if (pthread_err != 0)
		{
			int			j;

			/*
			 * Error during thread create (this should be caused by resource
			 * constraints). If we leave the threads running, they'll
			 * immediately have some problems -- so we need to join them, and
			 * *then* we can issue our FATAL error
			 */
			for (j = 0; j < i; j++)
			{
				pthread_join(doConnectParmsAr[j].thread, NULL);
			}

			ereport(FATAL, (errcode(ERRCODE_INTERNAL_ERROR),
							errmsg("failed to create thread %d of %d", i + 1, threadCount),
							errdetail("pthread_create() failed with err %d", pthread_err)));
		}
	}

	/*
	 * wait for all of the DoConnect threads to complete.
	 */
	for (i = 0; i < threadCount; i++)
	{
		ELOG_DISPATCHER_DEBUG("joining to thread %d of %d for libpq connections",
							  i + 1, threadCount);

		if (0 != pthread_join(doConnectParmsAr[i].thread, NULL))
		{
			elog(FATAL, "could not create segworker group");
		}
	}

	/*
	 * Free the memory allocated for the threadParms array
	 */
	destroyConnectParms(doConnectParmsAr, threadCount);
	doConnectParmsAr = NULL;

	SIMPLE_FAULT_INJECTOR(GangCreated);

	/* find out the successful connections and the failed ones */
	checkConnectionStatus(newGangDefinition, &in_recovery_mode_count,
						  &successful_connections, &create_gang_error);

	ELOG_DISPATCHER_DEBUG("createGang: %d processes requested; %d successful connections %d in recovery",
						  size, successful_connections, in_recovery_mode_count);

	/* Back to the caller's context for both the success and failure paths. */
	MemoryContextSwitchTo(GangContext);

	if (size == successful_connections)
	{
		setLargestGangsize(size);
		termPQExpBuffer(&create_gang_error);
		CurrentGangCreating = NULL;

		return newGangDefinition;
	}

	/* there are failed connections */

	/* FTS shows some segment DBs are down, destroy all gangs. */
	if (isFTSEnabled() &&
		FtsTestSegmentDBIsDown(newGangDefinition->db_descriptors, size))
	{
		appendPQExpBuffer(&create_gang_error, "FTS detected one or more segments are down\n");
		goto exit;
	}

	/* failure due to recovery */
	if (successful_connections + in_recovery_mode_count == size)
	{
		if (gp_gang_creation_retry_count &&
			create_gang_retry_counter++ < gp_gang_creation_retry_count &&
			type == GANGTYPE_PRIMARY_WRITER)
		{
			/*
			 * Retry for non-writer gangs is meaningless because writer gang
			 * must be gone when QE is in recovery mode
			 */
			DisconnectAndDestroyGang(newGangDefinition);
			newGangDefinition = NULL;
			CurrentGangCreating = NULL;

			ELOG_DISPATCHER_DEBUG("createGang: gang creation failed, but retryable.");

			/* Stay cancellable both before and after the retry delay. */
			CHECK_FOR_INTERRUPTS();
			pg_usleep(gp_gang_creation_retry_timer * 1000);
			CHECK_FOR_INTERRUPTS();

			goto create_gang_retry;
		}

		appendPQExpBuffer(&create_gang_error, "segment(s) are in recovery mode\n");
	}

exit:
	/* Common failure exit: tear down what was built, then raise the error. */
	if (newGangDefinition != NULL)
		DisconnectAndDestroyGang(newGangDefinition);

	if (type == GANGTYPE_PRIMARY_WRITER)
	{
		DisconnectAndDestroyAllGangs(true);
		CheckForResetSession();
	}

	CurrentGangCreating = NULL;

	/*
	 * NOTE(review): create_gang_error is not passed to termPQExpBuffer() on
	 * this path before ereport(ERROR) -- confirm whether its storage is
	 * reclaimed elsewhere.
	 */
	ereport(ERROR,
			(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
			 errmsg("failed to acquire resources on one or more segments"),
			 errdetail("%s", create_gang_error.data)));
	/* Not reached when ereport(ERROR) does not return. */
	return NULL;
}
Beispiel #8
0
/*
 * FileRepPrimary_ResyncBufferPoolIncrementalWrite
 *
 * Incremental resync of buffer pool relations driven by change tracking
 * (CT) records.
 *
 * Repeatedly fetches CT entries for 'request'.  For each entry the block is
 * read through the buffer pool, and the page is written with smgrwrite()
 * when its LSN is less than or equal to the entry's end LSN.  Entries whose
 * block number lies beyond the current relation size are only logged
 * (relation was possibly truncated).  When the last entry of a relation has
 * been handled and no more results are pending, the relation is synced and
 * closed and its resync hash entry is updated.
 *
 * Signals are processed periodically; if the process leaves the
 * Ready/InResync state the scan stops and STATUS_ERROR is returned.
 *
 * Returns STATUS_OK on success, otherwise the failing status of
 * FileRepResync_UpdateEntry() or STATUS_ERROR.  'request' and the last
 * result are freed before returning.
 */
static int
FileRepPrimary_ResyncBufferPoolIncrementalWrite(ChangeTrackingRequest *request)
{
	int				status = STATUS_OK;
	Page			page;
	Buffer			buf;
	BlockNumber		numBlocks = 0;
	SMgrRelation	smgr_relation = NULL;
	char			relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
	int				ii;
	XLogRecPtr		loc;		/* LSN of the page in the buffer pool */
	XLogRecPtr		loc1;		/* end LSN of the change tracking entry */
	int				count = 0;
	int				thresholdCount = 0;
	bool			mirrorDataLossOccurred = FALSE;
	int				NumberOfRelations = request->count;

	FileRepResyncHashEntry_s	entry;
	ChangeTrackingResult		*result = NULL;

	while (1)
	{
		/* allow flushing buffers from buffer pool during scan */
		FileRepResync_SetReadBufferRequest();
		if ((result = ChangeTracking_GetChanges(request)) != NULL)
		{
			FileRepResync_ResetReadBufferRequest();

			for (ii = 0; ii < result->count; ii++)
			{

				/* First entry of a new relation: open it and cache its size. */
				if (smgr_relation == NULL)
				{
					NumberOfRelations--;

					smgr_relation = smgropen(result->entries[ii].relFileNode);

					snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode);

					numBlocks = smgrnblocks(smgr_relation);

					if (Debug_filerep_print)
						elog(LOG, "resynchronize buffer pool relation '%u/%u/%u' "
							 "number of blocks:'%u' ",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode,
							 numBlocks);

					thresholdCount = Min(numBlocks, 1024);
				}

				loc1 =  result->entries[ii].lsn_end;

				/*
				 * if relation was truncated then block_num from change tracking can be beyond numBlocks
				 */
				if (result->entries[ii].block_num >=  numBlocks)
				{
					ereport(LOG,
							(errmsg("could not resynchonize buffer pool relation '%s' block '%d' (maybe due to truncate), "
									"lsn change tracking '%s(%u/%u)' "
									"number of blocks '%d' ",
									relidstr,
									result->entries[ii].block_num,
									XLogLocationToString(&loc1),
									loc1.xlogid,
									loc1.xrecoff,
									numBlocks),
							 FileRep_errcontext()));

					goto flush_check;
				}

				/* allow flushing buffers from buffer pool during scan */
				FileRepResync_SetReadBufferRequest();
				buf = ReadBuffer_Resync(smgr_relation,
										result->entries[ii].block_num);
				FileRepResync_ResetReadBufferRequest();

				Assert(result->entries[ii].block_num < numBlocks);

				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				page = BufferGetPage(buf);

				loc = PageGetLSN(page);

				if(Debug_filerep_print)
				{
					elog(LOG,
							"incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' "
							"lsn end change tracking '%s(%u/%u)' ",
							relidstr,
							numBlocks,
							result->entries[ii].block_num,
							XLogLocationToString(&loc),
							loc.xlogid,
							loc.xrecoff,
							XLogLocationToString(&loc1),
							result->entries[ii].lsn_end.xlogid,
							result->entries[ii].lsn_end.xrecoff);
				}
				else
				{
					char	tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];

					snprintf(tmpBuf, sizeof(tmpBuf),
							 "incremental resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn page '%s(%u/%u)' ",
							 relidstr,
							 numBlocks,
							 result->entries[ii].block_num,
							 XLogLocationToString(&loc),
							 loc.xlogid,
							 loc.xrecoff);

					FileRep_InsertConfigLogEntry(tmpBuf);

					snprintf(tmpBuf, sizeof(tmpBuf),
							 "incremental resync buffer pool identifier '%s' lsn end change tracking '%s(%u/%u)' ",
							 relidstr,
							 XLogLocationToString(&loc1),
							 result->entries[ii].lsn_end.xlogid,
							 result->entries[ii].lsn_end.xrecoff);

					FileRep_InsertConfigLogEntry(tmpBuf);

				}

				/*
				 * Write the page when its LSN is not newer than the CT
				 * entry's end LSN (page lsn <= CT lsn).  Equality is the
				 * normal case; page lsn < CT lsn is the "extra CT records"
				 * case described below, which must still be written.
				 */
				if (XLByteLE(PageGetLSN(page), result->entries[ii].lsn_end))
				{
					if (! XLByteEQ(PageGetLSN(page), result->entries[ii].lsn_end))
					{
						/* loc1 is the CT end LSN, loc is the page LSN. */
						ereport(LOG,
							(errmsg("Resynchonize buffer pool relation '%s' block '%d' has page lsn less than CT lsn, "
								"lsn end change tracking '%s(%u/%u)' lsn page '%s(%u/%u)' "
								"number of blocks '%d'",
								relidstr,
								result->entries[ii].block_num,
								XLogLocationToString(&loc1),
								loc1.xlogid,
								loc1.xrecoff,
								XLogLocationToString(&loc),
								loc.xlogid,
								loc.xrecoff,
								numBlocks),
							 FileRep_errcontext()));

					}

					/*
					 * It's safe and better to perform write of the page to mirror,
					 * for this case, as primary and mirror data pages should always
					 * be same. So, we might do some extra work but definitely won't
					 * lose out blocks, or error out and need to perform full recovery.
					 * Need to cover for this case as there are some known scenarios where
					 * CT file can have extra records which should have been discarded,
					 * but as we lose the xlog LSN information they cannot be discarded.
					 * One such case is when CT_TRANSIENT being compacted to CT_COMPACT
					 * with specific xlog LSN (to discard extra records) in CT mode gets
					 * interrupted by resync. Compaction during Resync collects all the
					 * CT records and doesn't have xlog LSN information to discard any
					 * extra records from CT_TRANSIENT.
					 */

					smgrwrite(smgr_relation,
							  result->entries[ii].block_num,
							  (char *)BufferGetBlock(buf),
							  FALSE);
				}

				SIMPLE_FAULT_INJECTOR(FileRepResyncWorker);

				UnlockReleaseBuffer(buf);

				SIMPLE_FAULT_INJECTOR(FileRepResyncWorker);

	flush_check:
				/*
				 * Last entry of the result, or the next entry belongs to a
				 * different relation: finish up the current relation.
				 */
				if (((ii + 1) == result->count) ||
					! (result->entries[ii].relFileNode.spcNode == result->entries[ii+1].relFileNode.spcNode &&
					   result->entries[ii].relFileNode.dbNode == result->entries[ii+1].relFileNode.dbNode &&
					   result->entries[ii].relFileNode.relNode == result->entries[ii+1].relFileNode.relNode))
				{
					if (result->ask_for_more == false)
					{

						smgrimmedsync(smgr_relation);

						smgrclose(smgr_relation);

						smgr_relation = NULL;

						FileRep_GetRelationPath(
												 entry.fileName,
												 result->entries[ii].relFileNode,
												 0 /* segment file number is always 0 for Buffer Pool */);

						/*
						 * We only want to update the state with this call to
						 * FileRepResync_UpdateEntry(), so to ensure that we
						 * don't incur any sideeffects set the changed page
						 * count to zero as it will only be updated to if the
						 * hashtable entry changed page count is zero.
						 */
						entry.mirrorBufpoolResyncChangedPageCount = 0;
						status = FileRepResync_UpdateEntry(&entry);
						if (status != STATUS_OK)
						{
							 break;
						}
					}

				}

				/* Every thresholdCount entries, check for signals/state change. */
				if (count > thresholdCount)
				{
					count = 0;
					FileRepSubProcess_ProcessSignals();

					if (! (FileRepSubProcess_GetState() == FileRepStateReady &&
						   dataState == DataStateInResync))
					{
						mirrorDataLossOccurred = TRUE;
						break;
					}
				}
				else
					count++;
			}  /* for (ii = 0; ii < result->count; ii++) */

		} /* if ((result = ChangeTracking_GetChanges(request)) != NULL) */

		FileRepResync_ResetReadBufferRequest();

		/* Continue from where the previous batch stopped, if there is more. */
		if (result != NULL && result->ask_for_more == true)
		{
			Assert(request->count == 1);
			request->entries[0].lsn_start = result->next_start_lsn;
		}
		else
		{
			break;
		}

	} /* while(1) */

	ChangeTracking_FreeRequest(request);
	ChangeTracking_FreeResult(result);

	Insist(NumberOfRelations == 0);

	if (mirrorDataLossOccurred)
		status = STATUS_ERROR;

	return status;
}
Beispiel #9
0
/*
 * FileRepPrimary_ResyncWrite
 *    Resynchronize the relation file described by 'entry' to the mirror.
 *
 * Buffer pool relations (BufferPoolScanIncremental / FullCopy):
 *    Every block from entry->mirrorBufpoolResyncCkptBlockNum up to the
 *    current number of blocks is read through ReadBuffer_Resync().  A block
 *    is written with smgrwrite() when its page LSN lies between
 *    entry->mirrorBufpoolResyncCkptLoc and the end-of-resync LSN (both
 *    bounds inclusive).  For non-FullCopy states the relation is afterwards
 *    truncated to its current block count under the resync extension lock.
 *    The relation is then synced and closed.
 *
 * Append-only files (AppendOnlyCatchup / FullCopy):
 *    The byte range [mirrorAppendOnlyLossEof, mirrorAppendOnlyNewEof) is
 *    read and appended through the MirroredAppendOnly_* routines in chunks
 *    of at most 2*BLCKSZ, then flushed and closed.
 *
 * States None and DataSynchronized are no-ops; any other state is logged.
 *
 * If entry->mirrorBufpoolResyncChangedPageCount is zero on entry it is
 * filled in with the number of blocks this call is going to process.
 *
 * Returns STATUS_OK, or STATUS_ERROR when mirror data loss was detected on
 * either the buffer pool or the append-only path.  Primary-side open/read
 * failures on the append-only path are raised with ereport(ERROR).
 */
static int
FileRepPrimary_ResyncWrite(FileRepResyncHashEntry_s	*entry)
{

	int				status = STATUS_OK;
	Page			page;
	Buffer			buf; 
	BlockNumber		numBlocks;
	BlockNumber		blkno;
	SMgrRelation	smgr_relation;
	char			relidstr[OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS + 1];
	XLogRecPtr		loc;
	int				count = 0;
	int				thresholdCount = 0;
	/* Single flag shared by both storage-manager paths; checked at the end. */
	bool			mirrorDataLossOccurred = FALSE;
		
	switch (entry->relStorageMgr)
	{

		case PersistentFileSysRelStorageMgr_BufferPool:
			
			switch (entry->mirrorDataSynchronizationState)
			{
				case MirroredRelDataSynchronizationState_BufferPoolScanIncremental:
				case MirroredRelDataSynchronizationState_FullCopy:

					smgr_relation = smgropen(entry->relFileNode);
					
					numBlocks = smgrnblocks(smgr_relation);

					/* Relation identifier for logging: tablespace/database/relation. */
					snprintf(relidstr, sizeof(relidstr), "%u/%u/%u",
							 smgr_relation->smgr_rnode.spcNode,
							 smgr_relation->smgr_rnode.dbNode,
							 smgr_relation->smgr_rnode.relNode);

					if (Debug_filerep_print)
						elog(LOG, "resync buffer pool relation '%s' number of blocks '%d' ",
							 relidstr, numBlocks);

					/* How many blocks to process between signal checks. */
					thresholdCount = Min(numBlocks, 1024);
					
					/* 
					 * required in order to report how many blocks were synchronized 
					 * if gp_persistent_relation_node does not return that information 
					 */
					if (entry->mirrorBufpoolResyncChangedPageCount == 0)
					{
						entry->mirrorBufpoolResyncChangedPageCount = numBlocks - entry->mirrorBufpoolResyncCkptBlockNum;
					}
					
					for (blkno = entry->mirrorBufpoolResyncCkptBlockNum; blkno < numBlocks; blkno++) 
					{
						XLogRecPtr	endResyncLSN = (isFullResync() ? 
													FileRepResync_GetEndFullResyncLSN() :
													FileRepResync_GetEndIncrResyncLSN());

						SIMPLE_FAULT_INJECTOR(FileRepResyncWorkerRead);

						/* Bracket the read with the "read buffer request" marker. */
						FileRepResync_SetReadBufferRequest();
						buf = ReadBuffer_Resync(smgr_relation, blkno);
						FileRepResync_ResetReadBufferRequest();
						
						LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
						page = BufferGetPage(buf);
						
						loc = PageGetLSN(page);
						
						if (Debug_filerep_print)
						{
							elog(LOG, 
									 "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' "
									 "lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
									 relidstr,
									 numBlocks,
									 blkno,
									 XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
									 entry->mirrorBufpoolResyncCkptLoc.xlogid,
									 entry->mirrorBufpoolResyncCkptLoc.xrecoff,
									 XLogLocationToString(&loc),
									 loc.xlogid,
									 loc.xrecoff,
									 XLogLocationToString(&endResyncLSN),
									 endResyncLSN.xlogid,
									 endResyncLSN.xrecoff);
						}
						else
						{
							/* Same information, split across two config-log entries. */
							char	tmpBuf[FILEREP_MAX_LOG_DESCRIPTION_LEN];
							
							snprintf(tmpBuf, sizeof(tmpBuf), 
									 "full resync buffer pool identifier '%s' num blocks '%d' blkno '%d' lsn begin change tracking '%s(%u/%u)' ",
									 relidstr,
									 numBlocks,
									 blkno,
									 XLogLocationToString(&entry->mirrorBufpoolResyncCkptLoc),
									 entry->mirrorBufpoolResyncCkptLoc.xlogid,
									 entry->mirrorBufpoolResyncCkptLoc.xrecoff);
														
							FileRep_InsertConfigLogEntry(tmpBuf);
							
							snprintf(tmpBuf, sizeof(tmpBuf), 
									 "full resync buffer pool identifier '%s' lsn page '%s(%u/%u)' lsn end change tracking '%s(%u/%u)' ",
									 relidstr,
									 XLogLocationToString(&loc),
									 loc.xlogid,
									 loc.xrecoff,
									 XLogLocationToString(&endResyncLSN),
									 endResyncLSN.xlogid,
									 endResyncLSN.xrecoff);
							
							FileRep_InsertConfigLogEntry(tmpBuf);
							
						}
						
						/*
						 * Write the block only when its LSN falls inside
						 * [mirrorBufpoolResyncCkptLoc, endResyncLSN].
						 */
						if (XLByteLE(PageGetLSN(page), endResyncLSN) &&
							XLByteLE(entry->mirrorBufpoolResyncCkptLoc, PageGetLSN(page))) 
						{
							smgrwrite(smgr_relation, 
									  blkno,
									  (char *)BufferGetBlock(buf),
									  FALSE);
						}

						SIMPLE_FAULT_INJECTOR(FileRepResyncWorker);

						UnlockReleaseBuffer(buf);
						
						/*
						 * Periodically process signals and stop if we are no
						 * longer Ready and InResync.
						 */
						if (count > thresholdCount)
						{
							count = 0;
							FileRepSubProcess_ProcessSignals();
							
							if (! (FileRepSubProcess_GetState() == FileRepStateReady && 
								   dataState == DataStateInResync))
							{
								mirrorDataLossOccurred = TRUE;
								break;
							}
						}
						else
							count++;
					}
						
					/*
					 * NOTE(review): this early exit skips smgrimmedsync() and
					 * smgrclose() for smgr_relation -- confirm that is intended.
					 */
					if (mirrorDataLossOccurred)
						break;

					if (entry->mirrorDataSynchronizationState != MirroredRelDataSynchronizationState_FullCopy)
					{
						LockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);
					
						numBlocks = smgrnblocks(smgr_relation);
					
						smgrtruncate(smgr_relation,
								 numBlocks,
								 TRUE /* isTemp, TRUE means to not record in XLOG */,
								 FALSE /* isLocalBuf */,
								 &entry->persistentTid,
								 entry->persistentSerialNum);
								 
						UnlockRelationForResyncExtension(&smgr_relation->smgr_rnode, ExclusiveLock);
					}
					
					smgrimmedsync(smgr_relation);
					smgrclose(smgr_relation);
					
					smgr_relation = NULL;
					break;
					
				case MirroredRelDataSynchronizationState_None:										
				case MirroredRelDataSynchronizationState_DataSynchronized:
					break;
					
				default:
					/* Components are reported as spc/db/rel, matching relidstr above. */
					ereport(LOG, 
							(errmsg("could not resynchronize relation '%u/%u/%u' "
									"mirror synchronization state:'%s(%d)' ",
									entry->relFileNode.spcNode,
									entry->relFileNode.dbNode,
									entry->relFileNode.relNode,
									MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
									entry->mirrorDataSynchronizationState)));
					break;
			}
			break;
			
		case PersistentFileSysRelStorageMgr_AppendOnly:
		{
			/*
			 * Do NOT declare a local mirrorDataLossOccurred in this scope:
			 * the MirroredAppendOnly_* calls below must set the function-level
			 * flag so that the status check at the end of the function sees
			 * mirror data loss detected on this path.
			 */
			MirroredAppendOnlyOpen	mirroredOpen;
			int						primaryError;
			char					*buffer = NULL;
			int64					endOffset = entry->mirrorAppendOnlyNewEof;
			int64					startOffset = entry->mirrorAppendOnlyLossEof;
			int32					bufferLen = 0;
			int						retval = 0;
			
			switch (entry->mirrorDataSynchronizationState)
			{
				case MirroredRelDataSynchronizationState_AppendOnlyCatchup:
				case MirroredRelDataSynchronizationState_FullCopy:
					
					/* 
					 * required in order to report how many blocks were synchronized 
					 * if gp_persistent_relation_node does not return that information 
					 */
					if (entry->mirrorBufpoolResyncChangedPageCount == 0)
					{
						entry->mirrorBufpoolResyncChangedPageCount = (endOffset - startOffset) / BLCKSZ;
					}					
					
					/*
					 * The MirroredAppendOnly_OpenResynchonize routine knows we are a resynch worker and
					 * will open BOTH, but write only the MIRROR!!!
					 */
					MirroredAppendOnly_OpenResynchonize(
											&mirroredOpen, 
											&entry->relFileNode,
											entry->segmentFileNum,
											startOffset,
											&primaryError,
											&mirrorDataLossOccurred);
					if (primaryError != 0)
					{
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not open file %u/%u/%u.%u : %s",
										entry->relFileNode.spcNode,
										entry->relFileNode.dbNode,
										entry->relFileNode.relNode,
										entry->segmentFileNum,
										strerror(primaryError))));
						
						break;
					}

					if (mirrorDataLossOccurred)
						break;
					
					/* AO and CO Data Store writes 64k size by default */
					bufferLen = (Size) Min(2*BLCKSZ, endOffset - startOffset);
					buffer = (char*) palloc(bufferLen);
					MemSet(buffer, 0, bufferLen);
					
					/* Copy the missing range chunk by chunk; the last chunk may be short. */
					while (startOffset < endOffset)
					{
						retval = MirroredAppendOnly_Read(
												&mirroredOpen,
												buffer,
												bufferLen);
						
						if (retval != bufferLen) 
						{
							ereport(ERROR,
									(errcode_for_file_access(),
									 errmsg("could not read from position:" INT64_FORMAT " in file %u/%u/%u.%u : %m",
											startOffset, 
											entry->relFileNode.spcNode,
											entry->relFileNode.dbNode,
											entry->relFileNode.relNode,
											entry->segmentFileNum)));
							
							break;
						}						
						
						MirroredAppendOnly_Append(
											  &mirroredOpen,
											  buffer,
											  bufferLen,
											  &primaryError,
											  &mirrorDataLossOccurred);
						
						if (mirrorDataLossOccurred)
							break;

						Assert(primaryError == 0);	// No primary writes as resync worker.
						
						startOffset += bufferLen;
						/* AO and CO Data Store writes 64k size by default */
						bufferLen = (Size) Min(2*BLCKSZ, endOffset - startOffset);						
					}
					
					pfree(buffer);
					buffer = NULL;
					
					/*
					 * NOTE(review): the data-loss exits below skip
					 * MirroredAppendOnly_Close() -- confirm that mirroredOpen
					 * does not need closing in that case.
					 */
					if (mirrorDataLossOccurred)
						break;
					
					/* Flush written data on Mirror */
					MirroredAppendOnly_Flush(
										&mirroredOpen,
										&primaryError,
										&mirrorDataLossOccurred);
					if (mirrorDataLossOccurred)
						break;
					
					Assert(primaryError == 0);	// Not flushed on primary as resync worker.
					
					/* Close Primary and Mirror */
					MirroredAppendOnly_Close(
										&mirroredOpen,
										&mirrorDataLossOccurred);
								
					break;
					
				case MirroredRelDataSynchronizationState_None:										
				case MirroredRelDataSynchronizationState_DataSynchronized:
					break;					
					
				default:
					/* Components are reported as spc/db/rel, as on the buffer pool path. */
					ereport(LOG, 
							(errmsg("could not resynchronize relation '%u/%u/%u' "
									"mirror synchronization state:'%s(%d)' ",
									entry->relFileNode.spcNode,
									entry->relFileNode.dbNode,
									entry->relFileNode.relNode,
									MirroredRelDataSynchronizationState_Name(entry->mirrorDataSynchronizationState),
									entry->mirrorDataSynchronizationState)));
					break;
			}
			
			break;
		}	//case
		default:
			Assert(0);
			break;
	} //switch
	
	/* Covers data loss detected on either the buffer pool or the AO path. */
	if (mirrorDataLossOccurred)
		status = STATUS_ERROR;
	
	return status;
}
/*
 * Indicate we intend to create a filespace file as part of the current transaction.
 *
 * An XLOG IntentToCreate record is generated that will guard the subsequent file-system
 * create in case the transaction aborts.
 *
 * After 1 or more calls to this routine to mark intention about filespace files that are going
 * to be created, call ~_DoPendingCreates to do the actual file-system creates.  (See its
 * note on XLOG flushing).
 *
 * The routine adds a 'Create Pending' tuple for the filespace through
 * PersistentFileSysObj_AddTuple(), records the same information in the
 * filespace directory hash entry, and registers a create-pending entry via
 * PendingDelete_AddCreatePendingEntryWrapper(), all while holding the
 * persistent-state write lock.
 *
 * When called before persistence work has started (see
 * Persistent_BeforePersistenceWork()), nothing is recorded and the output
 * parameters are zeroed, mirroring PersistentRelation_AddCreatePending().
 */
void PersistentFilespace_MarkCreatePending(
	Oid 		filespaceOid,
				/* OID of the filespace being created. */

	int16		primaryDbId,
				/* Stored as dbId1 of the filespace directory entry. */

	char 		*primaryFilespaceLocation,
				/*
				 * The primary filespace directory path.  NOT Blank padded.
				 * Just a NULL terminated string.
				 */

	int16		mirrorDbId,
				/* Stored as dbId2 of the filespace directory entry. */

	char 		*mirrorFilespaceLocation,
				/* The mirror filespace directory path; blank padded here before being stored. */

	MirroredObjectExistenceState mirrorExistenceState,

	ItemPointer		persistentTid,
				/* Output: TID of the persistent tuple added for this filespace. */

	int64			*persistentSerialNum,
				/* Output: serial number assigned by PersistentFileSysObj_AddTuple. */

	bool			flushToXLog)
				/* When true, the XLOG record for this change will be flushed to disk. */

{
	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	PersistentFileSysObjName fsObjName;

	FilespaceDirEntry filespaceDirEntry;
	TransactionId topXid;
	Datum values[Natts_gp_persistent_filespace_node];
	char mirrorFilespaceLocationBlankPadded[FilespaceLocationBlankPaddedWithNullTermLen];
	char primaryFilespaceLocationBlankPadded[FilespaceLocationBlankPaddedWithNullTermLen];

	if (Persistent_BeforePersistenceWork())
	{
		if (Debug_persistent_print)
			elog(Persistent_DebugPrintLevel(),
				 "Skipping persistent filespace %u because we are before persistence work",
				 filespaceOid);

		/*
		 * Hand back well-defined (zero) outputs instead of leaving the
		 * caller's variables untouched.
		 */
		memset(persistentTid, 0, sizeof(*persistentTid));
		*persistentSerialNum = 0;

		return;	// The initdb process will load the persistent table once we out of bootstrap mode.
	}

	PersistentFilespace_VerifyInitScan();

	PersistentFileSysObjName_SetFilespaceDir(&fsObjName,filespaceOid);

	topXid = GetTopTransactionId();

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	/* The persistent tuple and the hash entry store blank-padded locations. */
	PersistentFilespace_BlankPadCopyLocation(
										primaryFilespaceLocationBlankPadded,
										primaryFilespaceLocation);
	
	PersistentFilespace_BlankPadCopyLocation(
										mirrorFilespaceLocationBlankPadded,
										mirrorFilespaceLocation);

	GpPersistentFilespaceNode_SetDatumValues(
										values,
										filespaceOid,
										primaryDbId,
										primaryFilespaceLocationBlankPadded,
										mirrorDbId,
										mirrorFilespaceLocationBlankPadded,
										PersistentFileSysState_CreatePending,
										/* createMirrorDataLossTrackingSessionNum */ 0,
										mirrorExistenceState,
										/* reserved */ 0,
										/* parentXid */ topXid,
										/* persistentSerialNum */ 0);	// This will be set by PersistentFileSysObj_AddTuple.

	PersistentFileSysObj_AddTuple(
							PersistentFsObjType_FilespaceDir,
							values,
							flushToXLog,
							persistentTid,
							persistentSerialNum);


	/* Mirror the new tuple into the filespace directory hash entry. */
	WRITE_FILESPACE_HASH_LOCK;

	filespaceDirEntry =	PersistentFilespace_CreateDirUnderLock(filespaceOid);

	Assert(filespaceDirEntry != NULL);

	filespaceDirEntry->dbId1 = primaryDbId;
	memcpy(filespaceDirEntry->locationBlankPadded1, primaryFilespaceLocationBlankPadded,
		   FilespaceLocationBlankPaddedWithNullTermLen);
	
	filespaceDirEntry->dbId2 = mirrorDbId;
	memcpy(filespaceDirEntry->locationBlankPadded2, mirrorFilespaceLocationBlankPadded,
		   FilespaceLocationBlankPaddedWithNullTermLen);

	filespaceDirEntry->state = PersistentFileSysState_CreatePending;
	ItemPointerCopy(persistentTid, &filespaceDirEntry->persistentTid);
	filespaceDirEntry->persistentSerialNum = *persistentSerialNum;

	WRITE_FILESPACE_HASH_UNLOCK;

	/*
	 * This XLOG must be generated under the persistent write-lock.
	 */
#ifdef MASTER_MIRROR_SYNC
	mmxlog_log_create_filespace(filespaceOid);
#endif

	SIMPLE_FAULT_INJECTOR(FaultBeforePendingDeleteFilespaceEntry);

	/*
	 * MPP-18228
	 * To make adding 'Create Pending' entry to persistent table and adding
	 * to the PendingDelete list atomic
	 */
	PendingDelete_AddCreatePendingEntryWrapper(
								&fsObjName,
								persistentTid,
								*persistentSerialNum);

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;

	/* The entry was stored as PersistentFileSysState_CreatePending; report it as such. */
	if (Debug_persistent_print)
		elog(Persistent_DebugPrintLevel(),
		     "Persistent filespace directory: Add '%s' in state 'Create Pending', mirror existence state '%s', serial number " INT64_FORMAT " at TID %s",
			 PersistentFileSysObjName_ObjectName(&fsObjName),
			 MirroredObjectExistenceState_Name(mirrorExistenceState),
			 *persistentSerialNum,
			 ItemPointerToString(persistentTid));
}
Beispiel #11
0
/*
 * ExecWorkFile_Write
 *    Append 'size' bytes starting at 'data' to the work file, continuing
 *    from the end of the previous write.
 *
 * Returns false, without writing anything, when 'data' is NULL or 'size' is
 * zero.  Returns true once the data has been handed to the underlying
 * BufFile or bfz file.  A failed disk-space reservation and a short BufFile
 * write are both reported through workfile_mgr_report_error().
 *
 * workfile->size and the disk-space / in-progress accounting are updated on
 * every write; if the low-level write throws, the accounting is settled
 * before the error is re-thrown.
 */
bool
ExecWorkFile_Write(ExecWorkFile *workfile,
				   void *data,
				   uint64 size)
{
	Assert(workfile != NULL);

	SIMPLE_FAULT_INJECTOR(WorkfileWriteFail);

	/* Nothing to write. */
	if (data == NULL || size == 0)
		return false;

	/* Enforce the per-query and per-segment limits before touching the file. */
	if (workfile->flags & EXEC_WORKFILE_LIMIT_SIZE)
	{
		if (!WorkfileDiskspace_Reserve(size))
		{
			/* Could not reserve the additional disk space. */
			workfile_mgr_report_error();
		}
	}

	switch (workfile->fileType)
	{
		case BUFFILE:
		{
			BufFile    *bf = (BufFile *) workfile->file;
			int64		size_before = BufFileGetSize(bf);
			int64		size_after = 0;
			uint64		written;

			PG_TRY();
			{
				written = BufFileWrite(bf, data, size);
			}
			PG_CATCH();
			{
				/* Account for whatever reached the file before re-throwing. */
				size_after = BufFileGetSize(bf);
				workfile->size = size_after;
				WorkfileDiskspace_Commit(size_after - size_before, size, true /* update_query_size */);

				PG_RE_THROW();
			}
			PG_END_TRY();

			/* Use the file's own size as the source of truth for the growth. */
			size_after = BufFileGetSize(bf);
			workfile->size = size_after;

			WorkfileDiskspace_Commit(size_after - size_before, size, true /* update_query_size */);
			workfile_set_update_in_progress_size(workfile->work_set, size_after - size_before);

			/* A short write is an error. */
			if (written != size)
				workfile_mgr_report_error();

			break;
		}

		case BFZ:
		{
			PG_TRY();
			{
				bfz_append((bfz_t *) workfile->file, data, size);
			}
			PG_CATCH();
			{
				Assert(WorkfileDiskspace_IsFull());
				WorkfileDiskspace_Commit(0, size, true /* update_query_size */);

				PG_RE_THROW();
			}
			PG_END_TRY();

			/* bfz_append always adds to the file size */
			workfile->size += size;

			if (workfile->flags & EXEC_WORKFILE_LIMIT_SIZE)
				WorkfileDiskspace_Commit(size, size, true /* update_query_size */);

			workfile_set_update_in_progress_size(workfile->work_set, size);

			break;
		}

		default:
			insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	return true;
}
Beispiel #12
0
/*
 * PersistentRelation_AddCreatePending
 *
 * Record the intention to create a relation file in the current transaction.
 *
 * A tuple in state 'Create Pending' (or 'Bulk Load Create Pending' when
 * bufferPoolBulkLoad is set) is added to 'gp_persistent_relation_node' for
 * either a new table (segment file # 0) or a new segment file of an AO table
 * (segment file # > 0 for row/column-oriented AO).  An XLOG IntentToCreate
 * record is generated that guards the subsequent file-system create in case
 * the transaction aborts.  The object is also added to the PendingDelete
 * list, under the same persistent write-lock, so an abort can clean it up.
 *
 * Parameters
 * ----------
 * relFileNode      tablespace, database and relation OIDs for the create;
 *                  must not be the empty (0,0,0) node
 * segmentFileNum   segment file number (0 for heap, >= 0 for RO/CO AO)
 * relStorageMgr    persistent relation storage manager
 * relBufpoolKind   buffer pool kind beneath the relation
 * bufferPoolBulkLoad            selects the bulk-load create-pending state
 * mirrorExistenceState          stored in the new tuple
 * relDataSynchronizationState   stored in the new tuple
 * relationName     relation name, used for the debug message and handed to
 *                  the PendingDelete entry
 * flushToXLog      if true, the XLOG record for this change is flushed to disk
 * isLocalBuf       handed to the PendingDelete entry
 *
 * Outputs
 * -------
 * persistentTid    TID of the tuple added to gp_persistent_relation_node
 * serialNum        serial number assigned to that tuple
 *
 * Before persistence work has started both outputs are zeroed and nothing
 * is recorded.
 */
void PersistentRelation_AddCreatePending(
	RelFileNode 		*relFileNode,
	int32				segmentFileNum,
	PersistentFileSysRelStorageMgr relStorageMgr,
	PersistentFileSysRelBufpoolKind relBufpoolKind,
	bool				bufferPoolBulkLoad,
	MirroredObjectExistenceState mirrorExistenceState,
	MirroredRelDataSynchronizationState relDataSynchronizationState,
	char				*relationName,
	ItemPointer			persistentTid,
	int64				*serialNum,
	bool 				flushToXLog,
	bool				isLocalBuf)
{
	WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE;

	Datum		tupleValues[Natts_gp_persistent_relation_node];
	PersistentFileSysObjName objName;
	XLogRecPtr	zeroCkptLoc;

	if (RelFileNode_IsEmpty(relFileNode))
	{
		elog(ERROR, "Invalid RelFileNode (0,0,0)");
	}

	/* A new relation starts with an all-zero resync checkpoint location. */
	MemSet(&zeroCkptLoc, 0, sizeof(zeroCkptLoc));

	if (Persistent_BeforePersistenceWork())
	{
		if (Debug_persistent_print)
		{
			elog(Persistent_DebugPrintLevel(),
				 "Skipping persistent relation '%s' because we are before persistence work",
				 relpath(*relFileNode));
		}

		/* The initdb process will load the persistent table once we are out of bootstrap mode. */
		MemSet(persistentTid, 0, sizeof(ItemPointerData));
		*serialNum = 0;
		return;
	}

	/* Make sure the shared-memory structures for persistent tables are set up. */
	PersistentRelation_VerifyInitScan();

	/* Name of the file-system object this call is about. */
	PersistentFileSysObjName_SetRelationFile(&objName, relFileNode, segmentFileNum);

	WRITE_PERSISTENT_STATE_ORDERED_LOCK;

	/* Build the attribute values for the new 'gp_persistent_relation_node' tuple. */
	GpPersistentRelationNode_SetDatumValues(
		tupleValues,
		relFileNode->spcNode,
		relFileNode->dbNode,
		relFileNode->relNode,
		segmentFileNum,
		relStorageMgr,
		(bufferPoolBulkLoad
			? PersistentFileSysState_BulkLoadCreatePending
			: PersistentFileSysState_CreatePending),
		0,		/* createMirrorDataLossTrackingSessionNum */
		mirrorExistenceState,
		relDataSynchronizationState,
		false,	/* mirrorBufpoolMarkedForScanIncrementalResync */
		0,		/* mirrorBufpoolResyncChangedPageCount */
		&zeroCkptLoc,
		0,		/* mirrorBufpoolResyncCkptBlockNum */
		0,		/* mirrorAppendOnlyLossEof */
		0,		/* mirrorAppendOnlyNewEof */
		relBufpoolKind,
		GetTopTransactionId(),
		0);		/* persistentSerialNum: set by PersistentFileSysObj_AddTuple */

	/*
	 * Insert the tuple; this also assigns the persistent serial number and
	 * reports the tuple's TID.
	 */
	PersistentFileSysObj_AddTuple(PersistentFsObjType_RelationFile,
								  tupleValues,
								  flushToXLog,
								  persistentTid,
								  serialNum);

	/* This XLOG must be generated under the persistent write-lock. */
#ifdef MASTER_MIRROR_SYNC
	mmxlog_log_create_relfilenode(relFileNode->spcNode,
								  relFileNode->dbNode,
								  relFileNode->relNode,
								  segmentFileNum);
#endif

	SIMPLE_FAULT_INJECTOR(FaultBeforePendingDeleteRelationEntry);

	/*
	 * Remember what this transaction (or sub-transaction) created in the
	 * PendingDelete list, so that on abort the relation or segment file can
	 * be found and removed from disk.
	 *
	 * MPP-18228: done while still holding the lock so that adding the
	 * 'Create Pending' tuple and adding the PendingDelete entry are atomic.
	 */
	PendingDelete_AddCreatePendingRelationEntry(&objName,
												persistentTid,
												serialNum,
												relStorageMgr,
												relationName,
												isLocalBuf,
												bufferPoolBulkLoad);

	WRITE_PERSISTENT_STATE_ORDERED_UNLOCK;

	if (Debug_persistent_print)
	{
		elog(Persistent_DebugPrintLevel(),
			 "Persistent relation: Add '%s', relation name '%s' in state 'Create Pending', relation storage manager '%s', mirror existence state '%s', relation data resynchronization state '%s', serial number " INT64_FORMAT " at TID %s",
			 PersistentFileSysObjName_ObjectName(&objName),
			 relationName,
			 PersistentFileSysRelStorageMgr_Name(relStorageMgr),
			 MirroredObjectExistenceState_Name(mirrorExistenceState),
			 MirroredRelDataSynchronizationState_Name(relDataSynchronizationState),
			 *serialNum,
			 ItemPointerToString(persistentTid));
	}
}
Beispiel #13
0
/*
 * FileRepAckPrimary_RunConsumer()
 *
 * Consumer loop for ack messages queued in the shared-memory slot
 * fileRepAckShmemArray[FILEREP_ACKSHMEM_MESSAGE_SLOT_PRIMARY_ACK].
 *
 * For every message that becomes ready the loop:
 *   - recomputes the CRC of the message header and compares it with the CRC
 *     stored right after the header;
 *   - for selected operations copies a value out of the header into
 *     xLogEof or mirrorStatus (not declared in this function; presumably
 *     file-scope variables -- confirm);
 *   - if the ack state is not FileRepAckStateCompleted, sets the segment
 *     state to fault;
 *   - updates the ack hash table entry and inserts a log entry.
 *
 * The loop ends when the sub-process state is neither FileRepStateReady nor
 * FileRepStateInitialization, or on the first error.
 *
 * Returns STATUS_OK, or STATUS_ERROR on a header CRC mismatch, a
 * non-completed ack state, or a failed hash table update.
 */
static int
FileRepAckPrimary_RunConsumer(void)
{
	FileRepShmemMessageDescr_s	*fileRepShmemMessageDescr = NULL;
	FileRepMessageHeader_s		*fileRepMessageHeader = NULL;
	pg_crc32					*fileRepMessageHeaderCrc;
	pg_crc32					messageHeaderCrcLocal = 0;
	int							status = STATUS_OK;
	/* TRUE once a message has been fully processed and must be skipped over. */
	bool						movePositionConsume = FALSE;
	FileRepShmem_s              *fileRepAckShmem = NULL;
		
	FileRep_InsertConfigLogEntry("run consumer");
	
	fileRepAckShmem = fileRepAckShmemArray[FILEREP_ACKSHMEM_MESSAGE_SLOT_PRIMARY_ACK];
	
	while (1) {	
		
		LWLockAcquire(FileRepAckShmemLock, LW_EXCLUSIVE);
		
		if (movePositionConsume) {
			
			/*
			 * Step over the message handled in the previous iteration
			 * (its descriptor plus its payload).
			 */
			fileRepAckShmem->positionConsume = 
				fileRepAckShmem->positionConsume +
				fileRepShmemMessageDescr->messageLength + 
				sizeof(FileRepShmemMessageDescr_s);
			
			/*
			 * Reached the wraparound point while the insert position is
			 * elsewhere: continue from the beginning of the buffer.
			 */
			if (fileRepAckShmem->positionConsume == fileRepAckShmem->positionWraparound &&
				fileRepAckShmem->positionInsert != fileRepAckShmem->positionWraparound) {
				
				fileRepAckShmem->positionConsume = fileRepAckShmem->positionBegin;
				fileRepAckShmem->positionWraparound = fileRepAckShmem->positionEnd;
			}
			/* Signal semP after moving the consume position (presumably wakes the producer -- confirm). */
			FileRep_IpcSignal(fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->semP, 
							  &fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->refCountSemP);
		}
				
		fileRepShmemMessageDescr = 
			(FileRepShmemMessageDescr_s*) fileRepAckShmem->positionConsume;	

		/*
		 * Wait while there is nothing to consume (consume == insert) or the
		 * message at the consume position is not yet marked Ready.
		 */
		while ((fileRepAckShmem->positionConsume == fileRepAckShmem->positionInsert) ||
			   ((fileRepAckShmem->positionConsume != fileRepAckShmem->positionInsert) &&
				(fileRepShmemMessageDescr->messageState != FileRepShmemMessageStateReady))) {
			
			fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->refCountSemC++;

			LWLockRelease(FileRepAckShmemLock);
						
			FileRepSubProcess_ProcessSignals();
			if (FileRepSubProcess_GetState() != FileRepStateReady &&
				FileRepSubProcess_GetState() != FileRepStateInitialization) {
				/* Re-take the lock so the release after the loop stays balanced. */
				LWLockAcquire(FileRepAckShmemLock, LW_EXCLUSIVE);
				break;
			}
			
			FileRep_IpcWait(fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->semC, &fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->refCountSemC, FileRepAckShmemLock);
			
			LWLockAcquire(FileRepAckShmemLock, LW_EXCLUSIVE);
			
			/* Same wraparound handling as above, re-checked after waking up. */
			if (fileRepAckShmem->positionConsume == fileRepAckShmem->positionWraparound &&
				fileRepAckShmem->positionInsert != fileRepAckShmem->positionWraparound) {
				
				fileRepAckShmem->positionConsume = fileRepAckShmem->positionBegin;
				fileRepAckShmem->positionWraparound = fileRepAckShmem->positionEnd;
			}
			
			/* Re-assign to find if messageState is changed */
			fileRepShmemMessageDescr = 
				(FileRepShmemMessageDescr_s*) fileRepAckShmem->positionConsume;	
				
		} // internal while
		/* Note: also incremented when the wait loop was left because of a state change. */
		fileRepAckShmem->consumeCount++;
		LWLockRelease(FileRepAckShmemLock);
		
		/* Leave the consumer loop if the sub-process is no longer Ready/Initialization. */
		FileRepSubProcess_ProcessSignals();
		if (FileRepSubProcess_GetState() != FileRepStateReady && 
			FileRepSubProcess_GetState() != FileRepStateInitialization) {
			break;
		}
		
		SIMPLE_FAULT_INJECTOR(FileRepConsumer);
		
		/* Calculate and compare FileRepMessageHeader_s Crc */
		/* The message header follows the shared-memory descriptor. */
		fileRepMessageHeader = (FileRepMessageHeader_s*) (fileRepAckShmem->positionConsume + 
														  sizeof(FileRepShmemMessageDescr_s));
		
		FileRep_CalculateCrc((char *) fileRepMessageHeader,
							 sizeof(FileRepMessageHeader_s),
							 &messageHeaderCrcLocal);	
		
		/* The CRC stored in the message follows the header. */
		fileRepMessageHeaderCrc =
			(pg_crc32 *) (fileRepAckShmem->positionConsume + 
						  sizeof(FileRepMessageHeader_s) + 
						  sizeof(FileRepShmemMessageDescr_s));
		
		if (*fileRepMessageHeaderCrc != messageHeaderCrcLocal) 
		{
			status = STATUS_ERROR;
			/*
			 * First value printed is the CRC read from the message, second is
			 * the CRC computed locally above.
			 */
			ereport(WARNING,
					(errmsg("mirror failure, "
							"could not match ack message header checksum between primary '%u' and mirror '%u', "
							"failover requested", 
							*fileRepMessageHeaderCrc, 
							messageHeaderCrcLocal),
					 errhint("run gprecoverseg to re-establish mirror connectivity"),
					 FileRep_errdetail(fileRepMessageHeader->fileRepIdentifier,
									   fileRepMessageHeader->fileRepRelationType,
									   fileRepMessageHeader->fileRepOperation,
									   fileRepMessageHeader->messageCount),
					 FileRep_errdetail_ShmemAck(),
					 FileRep_errcontext()));		
						
			break;
		}
				
	    /* Write operation is never acknowledged. 
		 * That means message should never have body. 
		 * CRC of body should be always 0.
		 */
		Assert(fileRepMessageHeader->fileRepOperation != FileRepOperationWrite);
		Assert(fileRepMessageHeader->fileRepMessageBodyCrc == 0);
		
		/* Pick up operation-specific results carried in the ack header. */
		switch (fileRepMessageHeader->fileRepOperation)
		{
			case FileRepOperationReconcileXLogEof:			
				xLogEof = fileRepMessageHeader->fileRepOperationDescription.reconcile.xLogEof;

				if (Debug_filerep_print)
					ereport(LOG,
						(errmsg("ack reconcile xlogid '%d' xrecoff '%d' ",
							xLogEof.xlogid, 
							xLogEof.xrecoff)));	

				break;
		
			case FileRepOperationValidation:
				mirrorStatus = fileRepMessageHeader->fileRepOperationDescription.validation.mirrorStatus;

				if (Debug_filerep_print)
					ereport(LOG,
						(errmsg("ack validation status '%s' ",
							FileRepStatusToString[mirrorStatus])));	

				break;
				
			case FileRepOperationCreate:
				mirrorStatus = fileRepMessageHeader->fileRepOperationDescription.create.mirrorStatus;

				if (Debug_filerep_print)
					ereport(LOG,
						(errmsg("ack create status '%s' ",
								FileRepStatusToString[mirrorStatus])));	

				break;

			case FileRepOperationStartSlruChecksum:
				mirrorStatus =
					fileRepMessageHeader->fileRepOperationDescription.startChecksum.mirrorStatus;

				if (Debug_filerep_print)
				{
					ereport(LOG,
						(errmsg("ack start SLRU checksum: status = '%s', directory = '%s' ",
								FileRepStatusToString[mirrorStatus],
								fileRepMessageHeader->fileRepIdentifier.fileRepFlatFileIdentifier.directorySimpleName)));
				}

				break;

			case FileRepOperationVerifySlruDirectoryChecksum:
				mirrorStatus =
					fileRepMessageHeader->fileRepOperationDescription.verifyDirectoryChecksum.mirrorStatus;

				if (Debug_filerep_print)
				{
					ereport(LOG,
						(errmsg("ack verify SLRU directory checksum: status = '%s', directory = '%s' ",
								FileRepStatusToString[mirrorStatus],
								fileRepMessageHeader->fileRepIdentifier.fileRepFlatFileIdentifier.directorySimpleName)));
				}

				break;
				
			default:
				/* Other operations carry nothing that is extracted here. */
				break;
		}
		
		/* The mirror reported that the operation did not complete. */
		if (fileRepMessageHeader->fileRepAckState != FileRepAckStateCompleted) {

			status = STATUS_ERROR;
			
			ereport(WARNING,
					(errmsg("mirror failure, "
							"could not complete operation on mirror ack state '%s', "
							"failover requested", 
							FileRepAckStateToString[fileRepMessageHeader->fileRepAckState]),
					 errhint("run gprecoverseg to re-establish mirror connectivity"),
					 errSendAlert(true),
					 FileRep_errdetail(fileRepMessageHeader->fileRepIdentifier,
									   fileRepMessageHeader->fileRepRelationType,
									   fileRepMessageHeader->fileRepOperation,
									   fileRepMessageHeader->messageCount),
					 FileRep_errdetail_Shmem(),
					 FileRep_errdetail_ShmemAck(),
					 FileRep_errcontext()));	
			
			/* 
			 * FAULT has to be set before entry is updated in ack hash table
			 * in order to suspend backend process.
			 */	
			FileRep_SetSegmentState(SegmentStateFault, FaultTypeMirror);
			FileRepSubProcess_ProcessSignals();
		}
				
		/*
		 * The hash entry is updated even when the ack state was not
		 * Completed; status then already holds STATUS_ERROR.
		 */
		if (FileRepAckPrimary_UpdateHashEntry(
				fileRepMessageHeader->fileRepIdentifier,
				fileRepMessageHeader->fileRepRelationType,
				fileRepMessageHeader->fileRepAckState) != STATUS_OK) {
			
			status = STATUS_ERROR;
			ereport(WARNING,
					(errmsg("mirror failure, "
							"could not update ack state '%s' in ack hash table, "
							"failover requested", 
							FileRepAckStateToString[fileRepMessageHeader->fileRepAckState]),
					 errhint("run gprecoverseg to re-establish mirror connectivity"),
					 errSendAlert(true),
					 FileRep_errdetail(fileRepMessageHeader->fileRepIdentifier,
									   fileRepMessageHeader->fileRepRelationType,
									   fileRepMessageHeader->fileRepOperation,
									   fileRepMessageHeader->messageCount),
					 FileRep_errdetail_Shmem(),
					 FileRep_errdetail_ShmemAck(),
					 FileRep_errcontext()));					
		}
	
		/* Logged for every processed message, successful or not. */
		FileRep_InsertLogEntry(
							   "P_RunConsumer",
							   fileRepMessageHeader->fileRepIdentifier,
							   fileRepMessageHeader->fileRepRelationType,
							   fileRepMessageHeader->fileRepOperation,
							   messageHeaderCrcLocal,
							   fileRepMessageHeader->fileRepMessageBodyCrc,
							   fileRepMessageHeader->fileRepAckState,
							   FILEREP_UNDEFINED,
							   fileRepMessageHeader->messageCount);				
		
		if (status != STATUS_OK) {
			break;
		}
				
		/* Message handled; the next iteration advances past it. */
		movePositionConsume = TRUE;		
	} // while(1)	
	
	return status;
}
Beispiel #14
0
static int
FileRepAckPrimary_RunReceiver(void)
{
	uint32_t				msgLength = 0;
	FileRepConsumerProcIndex_e	msgType;
	int						status = STATUS_OK;
	char					*msgPositionInsert;
	FileRepShmemMessageDescr_s  *fileRepShmemMessageDescr;
	uint32					spareField;
	
	FileRep_InsertConfigLogEntry("run receiver");
	
	while (1) {
		
		FileRepSubProcess_ProcessSignals();
		if (FileRepSubProcess_GetState() != FileRepStateReady &&
			FileRepSubProcess_GetState() != FileRepStateInitialization) {
			break;
		}
		
		if ( ! FileRepConnServer_AwaitMessageBegin()) {
			/* call was interrupted ... go back to beginning to process signals */
			continue;
		}

		status = FileRepConnServer_ReceiveMessageType(&msgType);
		
		if (status != STATUS_OK) {
			break;
		}
				
		/* DATA MESSAGE TYPE */
		status = FileRepConnServer_ReceiveMessageLength(&msgLength);
		
		if (status != STATUS_OK) {
			break;
		}

		msgPositionInsert = FileRep_ReserveShmem(fileRepAckShmemArray[msgType], 
												 msgLength, 
												 /* not used */ &spareField, 
												 FileRepOperationNotSpecified, 
												 FileRepAckShmemLock);
		
		if (msgPositionInsert == NULL) {
			
			status = STATUS_ERROR;
			ereport(WARNING,
					(errmsg("mirror failure, "
							"could not queue received ack message to be processed, "
							"failover requested"), 
					 errhint("run gprecoverseg to re-establish mirror connectivity"),
					 FileRep_errdetail_Shmem(),
					 FileRep_errdetail_ShmemAck(),
					 FileRep_errcontext()));													
			break;
		}
		
		status = FileRepConnServer_ReceiveMessageData(
						msgPositionInsert + sizeof(FileRepShmemMessageDescr_s),
						msgLength);
		
		if (status != STATUS_OK) {
			break;
		}		
		
		SIMPLE_FAULT_INJECTOR(FileRepReceiver);
		
		fileRepShmemMessageDescr = 
		(FileRepShmemMessageDescr_s*) msgPositionInsert;	
		
		/* it is not in use */
		fileRepShmemMessageDescr->messageSync = FALSE;
		
		fileRepShmemMessageDescr->messageState = FileRepShmemMessageStateReady; 
		
		LWLockAcquire(FileRepAckShmemLock, LW_EXCLUSIVE);
		
		FileRep_IpcSignal(fileRepIpcArray[fileRepAckShmemArray[msgType]->ipcArrayIndex]->semC, 
						  &fileRepIpcArray[fileRepAckShmemArray[msgType]->ipcArrayIndex]->refCountSemC);
		
		LWLockRelease(FileRepAckShmemLock);
		
		FileRep_InsertLogEntry(
							   "P_RunReceiver",
							   FileRep_GetFlatFileIdentifier("", ""),
							   FileRepRelationTypeNotSpecified,
							   FileRepOperationNotSpecified,
							   FILEREP_UNDEFINED,
							   FILEREP_UNDEFINED,
							   FileRepAckStateNotInitialized,
							   spareField,
							   FILEREP_UNDEFINED);			
		
	} // while(1)
	
	FileRepConnServer_CloseConnection();
	
	return status;
}
Beispiel #15
0
/*
 * FileRepAckMirror_RunSender
 *
 * Sender loop.  Repeatedly takes the message located at positionConsume of
 * the outgoing ack shared-memory queue
 * (fileRepAckShmemArray[FILEREP_OUTGOING_MESSAGE_QUEUE]) and passes it to
 * FileRepConnClient_SendMessage().
 *
 * The loop ends when FileRepSubProcess_GetState() is no longer
 * FileRepStateReady or when a send fails.  The client connection is closed
 * before returning.
 *
 * Returns STATUS_OK, or STATUS_ERROR if sending a message failed.
 */
static int
FileRepAckMirror_RunSender(void)
{
	FileRepShmemMessageDescr_s	*fileRepShmemMessageDescr=NULL;
	char						*fileRepMessage;
	int							status = STATUS_OK;
	/* TRUE once a message was sent; the next iteration then consumes it */
	bool						movePositionConsume = FALSE;
	FileRepConsumerProcIndex_e  messageType;
	FileRepMessageHeader_s		*fileRepMessageHeader;
	FileRepShmem_s              *fileRepAckShmem = NULL;
	
	FileRep_InsertConfigLogEntry("run sender ack");
	
	fileRepAckShmem = fileRepAckShmemArray[FILEREP_OUTGOING_MESSAGE_QUEUE];

	while (1) {

		/* Queue positions are read and modified only under this lock. */
		LWLockAcquire(FileRepAckShmemLock, LW_EXCLUSIVE);
		
		if (movePositionConsume) {
			
			/*
			 * The message sent in the previous iteration is done: step the
			 * consume position over its descriptor and its body.
			 */
			fileRepAckShmem->positionConsume = 
					fileRepAckShmem->positionConsume +
					fileRepShmemMessageDescr->messageLength + 
					sizeof(FileRepShmemMessageDescr_s);
			
			/*
			 * Consume position reached the wraparound mark while the insert
			 * position is elsewhere: restart at the beginning of the buffer
			 * and reset the wraparound mark to the end of the buffer.
			 */
			if (fileRepAckShmem->positionConsume == fileRepAckShmem->positionWraparound &&
				fileRepAckShmem->positionInsert != fileRepAckShmem->positionWraparound) {
				
				fileRepAckShmem->positionConsume = fileRepAckShmem->positionBegin;
				fileRepAckShmem->positionWraparound = fileRepAckShmem->positionEnd;
			}
			
			/*
			 * Signal semP of this queue's IPC entry.
			 * NOTE(review): presumably this wakes a producer waiting for
			 * free space in the queue -- confirm against FileRep_ReserveShmem.
			 */
			FileRep_IpcSignal(fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->semP, 
							  &fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->refCountSemP);
		}
		
		fileRepShmemMessageDescr = 
		(FileRepShmemMessageDescr_s*) fileRepAckShmem->positionConsume;	
		
		/*
		 * Wait while the queue is empty (consume position equals insert
		 * position) or the message at the consume position has not reached
		 * the Ready state yet.
		 */
		while ((fileRepAckShmem->positionConsume == fileRepAckShmem->positionInsert) ||
			   ((fileRepAckShmem->positionConsume != fileRepAckShmem->positionInsert) &&
				(fileRepShmemMessageDescr->messageState != FileRepShmemMessageStateReady))) {
			
			/*
			 * The semC reference count is bumped while the lock is still
			 * held, before the lock is released for the wait below.
			 * NOTE(review): on the early break below the count stays
			 * incremented in this function -- confirm that is intended.
			 */
			fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->refCountSemC++;
			
			LWLockRelease(FileRepAckShmemLock);
			
			FileRepSubProcess_ProcessSignals();
			if (FileRepSubProcess_GetState() != FileRepStateReady) {

				/*
				 * Leave the wait loop with the lock held again, because the
				 * code right after the loop releases it unconditionally.
				 */
				LWLockAcquire(FileRepAckShmemLock, LW_EXCLUSIVE);
				break;
			}
			
			FileRep_IpcWait(fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->semC, &fileRepIpcArray[fileRepAckShmem->ipcArrayIndex]->refCountSemC, FileRepAckShmemLock);
						
			LWLockAcquire(FileRepAckShmemLock, LW_EXCLUSIVE); 
			
			/* Same wraparound handling as after advancing the position. */
			if (fileRepAckShmem->positionConsume == fileRepAckShmem->positionWraparound &&
				fileRepAckShmem->positionInsert != fileRepAckShmem->positionWraparound) {
				
				fileRepAckShmem->positionConsume = fileRepAckShmem->positionBegin;
				fileRepAckShmem->positionWraparound = fileRepAckShmem->positionEnd;
			}			
			
			/* Re-assign to find if messageState is changed */
			fileRepShmemMessageDescr = 
			(FileRepShmemMessageDescr_s*) fileRepAckShmem->positionConsume;				
		} // while internal
		/*
		 * NOTE(review): consumeCount is incremented here even when the wait
		 * loop was left because the state is no longer Ready -- confirm.
		 */
		fileRepAckShmem->consumeCount++;
		
		LWLockRelease(FileRepAckShmemLock); 

		/* Re-check the state outside the lock before sending anything. */
		FileRepSubProcess_ProcessSignals();
		if (FileRepSubProcess_GetState() != FileRepStateReady) {
			break;
		}
	
		FileRep_InsertLogEntry(
							   "M_RunSenderAck",
							   FileRep_GetFlatFileIdentifier("", ""),
							   FileRepRelationTypeNotSpecified,
							   FileRepOperationNotSpecified,
							   FILEREP_UNDEFINED,
							   FILEREP_UNDEFINED,
							   FileRepAckStateNotInitialized,
							   FILEREP_UNDEFINED,
							   FILEREP_UNDEFINED);		
				
		SIMPLE_FAULT_INJECTOR(FileRepSender);
		
		/* The message body starts right behind the slot descriptor. */
		fileRepMessage = (char*) (fileRepAckShmem->positionConsume + 
								  sizeof(FileRepShmemMessageDescr_s));
		
		/* Same address viewed as a header; not read again in this function. */
		fileRepMessageHeader = (FileRepMessageHeader_s*) (fileRepAckShmem->positionConsume + 
														  sizeof(FileRepShmemMessageDescr_s));

		/* Every message sent from here uses the XLog message type. */
		messageType = FileRepMessageTypeXLog;
		
		if (! FileRepConnClient_SendMessage(
						messageType,
						fileRepShmemMessageDescr->messageSync,
						fileRepMessage,
						fileRepShmemMessageDescr->messageLength)) 
		{

			ereport(WARNING, 
					(errcode_for_socket_access(),
					 errmsg("mirror failure, "
							"could not sent ack message to primary : %m, "
							"failover requested"),
					 errhint("run gprecoverseg to re-establish mirror connectivity"),
					 FileRep_errdetail_ShmemAck(),
					 FileRep_errcontext()));		
			
			status = STATUS_ERROR;
			break;
		}

		/* Sent successfully: consume this message at the top of the loop. */
		movePositionConsume = TRUE;
	} // while(1)
	
	FileRepConnClient_CloseConnection();

	return status;
}
Beispiel #16
0
Datei: fts.c Projekt: LJoNe/gpdb
/*
 * probeUpdateConfig
 *
 * Update segment configuration in catalog and shared memory.
 *
 * For every entry of changes[0 .. changeCount-1] this function
 *   - inserts a row describing the change into gp_configuration_history,
 *   - updates role and status (and mode, when the new status has the
 *     change-logging bit set) of the gp_segment_configuration row that is
 *     looked up by the entry's dbid,
 *   - stores the entry's new status in ftsProbeInfo->fts_status[dbid].
 *
 * All catalog work runs inside a single transaction started here.
 *
 * Returns true after the transaction has been committed.  Returns false,
 * without committing, when shutdown_requested is set.  Raises ERROR when a
 * dbid has no row in gp_segment_configuration.
 */
static bool
probeUpdateConfig(FtsSegmentStatusChange *changes, int changeCount)
{
	Relation configrel;
	Relation histrel;
	SysScanDesc sscan;
	ScanKeyData scankey;
	HeapTuple configtuple;
	HeapTuple newtuple;
	HeapTuple histtuple;
	Datum configvals[Natts_gp_segment_configuration];
	bool confignulls[Natts_gp_segment_configuration] = { false };
	bool repls[Natts_gp_segment_configuration] = { false };
	Datum histvals[Natts_gp_configuration_history];
	bool histnulls[Natts_gp_configuration_history] = { false };
	bool valid;
	bool primary;
	bool changelogging;
	int i;
	char desc[SQL_CMD_BUF_SIZE];

	/*
	 * Commit/abort transaction below will destroy
	 * CurrentResourceOwner.  We need it for catalog reads.
	 */
	ResourceOwner save = CurrentResourceOwner;
	StartTransactionCommand();
	elog(LOG, "probeUpdateConfig called for %d changes", changeCount);

	histrel = heap_open(GpConfigHistoryRelationId,
						RowExclusiveLock);
	configrel = heap_open(GpSegmentConfigRelationId,
						  RowExclusiveLock);

	for (i = 0; i < changeCount; i++)
	{
		FtsSegmentStatusChange *change = &changes[i];

		/*
		 * Normalize the masked bits to 0/1 so the result does not depend on
		 * how wide bool is or on which bit the flag occupies.
		 */
		valid = (change->newStatus & FTS_STATUS_ALIVE) != 0;
		primary = (change->newStatus & FTS_STATUS_PRIMARY) != 0;
		changelogging = (change->newStatus & FTS_STATUS_CHANGELOGGING) != 0;

		if (changelogging)
		{
			Assert(failover_strategy == 'f');
			Assert(primary && valid);
		}

		Assert((valid || !primary) && "Primary cannot be down");

		/*
		 * Insert new tuple into gp_configuration_history catalog.
		 */
		histvals[Anum_gp_configuration_history_time-1] =
				TimestampTzGetDatum(GetCurrentTimestamp());
		histvals[Anum_gp_configuration_history_dbid-1] =
				Int16GetDatum(change->dbid);
		snprintf(desc, sizeof(desc),
				 "FTS: content %d fault marking status %s%s role %c",
				 change->segindex, valid ? "UP" : "DOWN",
				 (changelogging) ? " mode: change-tracking" : "",
				 primary ? 'p' : 'm');
		histvals[Anum_gp_configuration_history_desc-1] =
					CStringGetTextDatum(desc);

		histtuple = heap_form_tuple(RelationGetDescr(histrel), histvals, histnulls);
		simple_heap_insert(histrel, histtuple);
		CatalogUpdateIndexes(histrel, histtuple);
		/* The local copy is no longer needed once heap and indexes are updated. */
		pfree(histtuple);

		/*
		 * Find and update gp_segment_configuration tuple.
		 */
		ScanKeyInit(&scankey,
					Anum_gp_segment_configuration_dbid,
					BTEqualStrategyNumber, F_INT2EQ,
					Int16GetDatum(change->dbid));
		sscan = systable_beginscan(configrel, GpSegmentConfigDbidIndexId,
								   true, SnapshotNow, 1, &scankey);
		configtuple = systable_getnext(sscan);
		if (!HeapTupleIsValid(configtuple))
		{
			elog(ERROR, "FTS cannot find dbid=%d in %s", change->dbid,
				 RelationGetRelationName(configrel));
		}

		/*
		 * Only the columns flagged in repls[] are replaced.  Role and status
		 * are always rewritten; mode is rewritten only when change logging,
		 * so configvals[] for mode is never read otherwise.
		 */
		configvals[Anum_gp_segment_configuration_role-1] =
				CharGetDatum(primary ? 'p' : 'm');
		repls[Anum_gp_segment_configuration_role-1] = true;
		configvals[Anum_gp_segment_configuration_status-1] =
				CharGetDatum(valid ? 'u' : 'd');
		repls[Anum_gp_segment_configuration_status-1] = true;
		if (changelogging)
		{
			configvals[Anum_gp_segment_configuration_mode-1] =
					CharGetDatum('c');
		}
		repls[Anum_gp_segment_configuration_mode-1] = changelogging;

		newtuple = heap_modify_tuple(configtuple, RelationGetDescr(configrel),
									 configvals, confignulls, repls);
		simple_heap_update(configrel, &configtuple->t_self, newtuple);
		CatalogUpdateIndexes(configrel, newtuple);

		systable_endscan(sscan);
		pfree(newtuple);

		/*
		 * Update shared memory.  Note this happens before the transaction
		 * is committed below.
		 */
		ftsProbeInfo->fts_status[change->dbid] = change->newStatus;
	}
	heap_close(histrel, RowExclusiveLock);
	heap_close(configrel, RowExclusiveLock);

	SIMPLE_FAULT_INJECTOR(FtsWaitForShutdown);
	/*
	 * Do not block shutdown.  We will always get a chance to update
	 * gp_segment_configuration in subsequent probes upon database
	 * restart.
	 *
	 * NOTE(review): this path returns without committing or aborting the
	 * transaction and without restoring CurrentResourceOwner; presumably
	 * acceptable because the process is about to exit -- confirm.
	 */
	if (shutdown_requested)
	{
		elog(LOG, "Shutdown in progress, ignoring FTS prober updates.");
		return false;
	}
	CommitTransactionCommand();
	CurrentResourceOwner = save;
	return true;
}