/*
 * CheckInMemConstraintsPgType
 * 		Check uniqueness constraints for pg_type in-memory tuples upon insert
 *
 * Every tuple currently stored in the in-memory relation is compared with
 * the new tuple.  A violation is reported through insist_log() when an
 * existing tuple has
 *   - the same Oid as the new tuple, or
 *   - the same (typnamespace, typname) pair as the new tuple.
 *
 * relation: in-memory relation holding the existing pg_type tuples
 * newTuple: the tuple that is about to be inserted
 */
static void
CheckInMemConstraintsPgType(InMemHeapRelation relation, HeapTuple newTuple)
{
	Assert(NULL != newTuple);
	Assert(NULL != relation);
	Assert(NULL != relation->rel);

	TupleDesc tupleDesc = relation->rel->rd_att;
	Oid relnamespaceNew = DatumGetObjectId(tuple_getattr(newTuple, tupleDesc, Anum_pg_type_typnamespace));
	char *typnameNew    = DatumGetCString(tuple_getattr(newTuple, tupleDesc, Anum_pg_type_typname));

	for (int i = 0; i < relation->tupsize; i++)
	{
		HeapTuple tuple = relation->tuples[i].tuple;
		Assert(NULL != tuple);

		/* Oids are unsigned, hence %u rather than %d in the messages below */
		insist_log(HeapTupleGetOid(tuple) != HeapTupleGetOid(newTuple),
					"in-memory tuple with Oid = %u already exists in pg_type.", HeapTupleGetOid(tuple));

		Oid relnamespace = DatumGetObjectId(tuple_getattr(tuple, tupleDesc, Anum_pg_type_typnamespace));
		char *typname    = DatumGetCString(tuple_getattr(tuple, tupleDesc, Anum_pg_type_typname));

		/*
		 * strcmp() returns 0 exactly when both names have the same length
		 * and the same bytes, so no separate length comparison is needed.
		 */
		insist_log(relnamespace != relnamespaceNew ||
				   0 != strcmp(typname, typnameNew),
				"in-memory tuple with typname = %s and typnamespace = %u already exists in pg_type.", typname, relnamespace);
	}
}
/*
 * CheckInMemConstraintsPgNamespace
 * 		Check uniqueness constraints for pg_namespace in-memory tuples upon insert
 *
 * Every tuple currently stored in the in-memory relation is compared with
 * the new tuple.  A violation is reported through insist_log() when an
 * existing tuple has
 *   - the same Oid as the new tuple, or
 *   - the same (nspdboid, nspname) pair as the new tuple.
 *
 * relation: in-memory relation holding the existing pg_namespace tuples
 * newTuple: the tuple that is about to be inserted
 */
static void
CheckInMemConstraintsPgNamespace(InMemHeapRelation relation, HeapTuple newTuple)
{
	Assert(NULL != newTuple);
	Assert(NULL != relation);
	Assert(NULL != relation->rel);

	TupleDesc tupleDesc = relation->rel->rd_att;
	Oid nspdboidNew     = DatumGetObjectId(tuple_getattr(newTuple, tupleDesc, Anum_pg_namespace_nspdboid));
	char *nspnameNew    = DatumGetCString(tuple_getattr(newTuple, tupleDesc, Anum_pg_namespace_nspname));

	for (int i = 0; i < relation->tupsize; i++)
	{
		HeapTuple tuple = relation->tuples[i].tuple;
		Assert(NULL != tuple);

		/* Oids are unsigned, hence %u rather than %d in the messages below */
		insist_log(HeapTupleGetOid(tuple) != HeapTupleGetOid(newTuple),
			"in-memory tuple with Oid = %u already exists in pg_namespace.", HeapTupleGetOid(tuple));

		Oid nspdboid  = DatumGetObjectId(tuple_getattr(tuple, tupleDesc, Anum_pg_namespace_nspdboid));
		char *nspname = DatumGetCString(tuple_getattr(tuple, tupleDesc, Anum_pg_namespace_nspname));

		/*
		 * strcmp() returns 0 exactly when both names have the same length
		 * and the same bytes, so no separate length comparison is needed.
		 */
		insist_log(nspdboid != nspdboidNew ||
				   0 != strcmp(nspname, nspnameNew),
			"in-memory tuple with nspname = %s and nspdboid = %u already exists in pg_namespace.", nspname, nspdboid);
	}
}
Exemple #3
0
/*
 * readtup_heap
 *		Read one stored tuple back from state->myfile.
 *
 * 'len' is the length word that was already read from the file.  When
 * is_len_memtuplen(len) holds the tuple is handled as a MemTuple, otherwise
 * 'len' is HeapTuple.t_len and the stored record also carries the remaining
 * HeapTuple fields.  The tuple is allocated with palloc and charged to the
 * tuplestore through USEMEM.  Returns the newly allocated tuple.
 */
static void *
readtup_heap(Tuplestorestate *state, unsigned int len)
{
	bool		isMemTuple = is_len_memtuplen(len);
	uint32		recordSize;
	size_t		bodySize;
	size_t		got;
	void	   *result;

	/* a heap tuple record includes the rest of the HeapTuple fields */
	recordSize = isMemTuple ? memtuple_size_from_uint32(len)
							: len + HEAPTUPLESIZE;

	result = (void *) palloc(recordSize);
	USEMEM(state, GetMemoryChunkSpace(result));

	/* the leading length word is already known; fill it in by hand */
	if (isMemTuple)
	{
		memtuple_set_mtlen((MemTuple) result, len);
	}
	else
	{
		((HeapTuple) result)->t_len = recordSize - HEAPTUPLESIZE;
	}

	/* read in the tuple proper, i.e. everything after the length word */
	bodySize = recordSize - sizeof(uint32);
	got = BufFileRead(state->myfile,
					  (void *) ((char *) result + sizeof(uint32)),
					  bodySize);
	if (got != bodySize)
	{
		insist_log(false, "unexpected end of data");
	}

	if (!isMemTuple)
	{
		/* t_data must be set after the read, which fills the header fields */
		((HeapTuple) result)->t_data =
			(HeapTupleHeader ) ((char *) result + HEAPTUPLESIZE);
	}

	if (state->backward)	/* need trailing length word? */
	{
		uint32		trailer = 0;

		if (BufFileRead(state->myfile, (void *) &trailer,
						sizeof(trailer)) != sizeof(trailer))
		{
			insist_log(false, "unexpected end of data");
		}
	}

	return result;
}
Exemple #4
0
/*
 * getlen
 *		Read the next length word from state->myfile.
 *
 * Returns the length word when a full word was read.  A partial read is
 * reported as "unexpected end of tape"; a zero-byte read is reported as
 * "unexpected end of data" unless eofOK is set, in which case 0 is returned.
 * 'pos' is not used here.
 */
static uint32
getlen(Tuplestorestate *state, TuplestorePos *pos, bool eofOK)
{
	uint32		lenword;
	size_t		got = BufFileRead(state->myfile, (void *) &lenword, sizeof(lenword));

	if (got != sizeof(lenword))
	{
		insist_log(got == 0, "unexpected end of tape");
		insist_log(eofOK, "unexpected end of data");
		return 0;
	}

	return lenword;
}
/*
 * Check all the connections of a gang.
 *
 * Every segment descriptor of the gang is visited.  A descriptor with a
 * non-zero errcode and a non-empty error message is handled as a failed
 * connection: it is logged, then either counted in *countInRecovery (when
 * segment_failure_due_to_recovery() accepts its message) or its message is
 * appended to errorMessage, and finally cdbconn_resetQEErrorMessage() is
 * called on it.  Any other descriptor is counted in *countSuccessful.
 *
 * Both counters are only incremented here; the caller provides their
 * starting values.
 */
static void
checkConnectionStatus(Gang *gp,
					  int *countInRecovery,
					  int *countSuccessful,
					  struct PQExpBufferData *errorMessage)
{
	const int	ndescs = gp->size;

	for (int segno = 0; segno < ndescs; segno++)
	{
		SegmentDatabaseDescriptor *desc = &gp->db_descriptors[segno];

		if (!(desc->errcode && desc->error_message.len > 0))
		{
			Assert(desc->errcode == 0 && desc->error_message.len == 0);

			/* We have a live connection! */
			(*countSuccessful)++;
			continue;
		}

		/*
		 * The connection was not established, so the gang may have to be
		 * re-built.  Log the failure now; complete failures are taken care
		 * of later.
		 */
		Assert(desc->whoami != NULL);
		elog(LOG, "Failed connection to %s", desc->whoami);

		insist_log(desc->errcode != 0 && desc->error_message.len != 0,
				   "connection is null, but no error code or error message, for segDB %d", segno);

		ereport(LOG, (errcode(desc->errcode), errmsg("%s", desc->error_message.data)));

		/* this connect failed -- but why ? */
		if (!segment_failure_due_to_recovery(desc->error_message.data))
		{
			appendPQExpBuffer(errorMessage, "%s (%s)\n", desc->error_message.data, desc->whoami);
		}
		else
		{
			elog(LOG, "segment is in recovery mode (%s)", desc->whoami);
			(*countInRecovery)++;
		}

		cdbconn_resetQEErrorMessage(desc);
	}
}
Exemple #6
0
/*
 * count_usable_fds --- count how many FDs the system will let us open,
 *		and estimate how many are already open.
 *
 * FD 0 is dup'ed repeatedly until dup() fails or max_to_probe duplicates
 * have been made; all duplicates are closed again before returning.
 *
 * Note: a small value of max_to_probe might result in an underestimate of
 * already_open; any "gaps" in the set of used FDs must be filled before the
 * calculation of already_open gives the right answer.  In practice,
 * max_to_probe of a couple of dozen should be enough for good results.
 *
 * We assume stdin (FD 0) is available for dup'ing.
 *
 * Outputs:
 *	*usable_fds		number of successful dups
 *	*already_open	highest FD seen + 1, minus the number of successful dups
 */
static void
count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
{
	int			capacity = 1024;
	int		   *probed = (int *) palloc(capacity * sizeof(int));
	int			nprobed = 0;
	int			maxfd = 0;

	/* dup until failure or probe limit reached */
	do
	{
		int			newfd = dup(0);

		if (newfd < 0)
		{
			/* Expect EMFILE or ENFILE, else it's fishy */
			if (errno != EMFILE && errno != ENFILE)
			{
				insist_log(false, "dup(0) failed after %d successes: %m", nprobed);
			}
			break;
		}

		/* grow the bookkeeping array when it is full */
		if (nprobed >= capacity)
		{
			capacity *= 2;
			probed = (int *) repalloc(probed, capacity * sizeof(int));
		}
		probed[nprobed++] = newfd;

		if (newfd > maxfd)
			maxfd = newfd;
	} while (nprobed < max_to_probe);

	/* release the files we opened */
	for (int k = 0; k < nprobed; k++)
		close(probed[k]);

	pfree(probed);

	/*
	 * Return results.  The system limit is assumed to be maxfd + 1
	 * (remember 0 is a legal FD number).
	 */
	*usable_fds = nprobed;
	*already_open = maxfd + 1 - nprobed;
}
/* ----------------
 *      tuple_getattr
 *
 *      Fetches one attribute of a HeapTuple through heap_getattr and
 *      returns it as a Datum.  A NULL attribute is reported through
 *      insist_log ("attribute cannot be null").
 *
 *      <tuple> is the heap tuple to read from.  <tupleDesc> describes
 *      the row and its fields.  <attnum> is the attribute number of the
 *      column (field) the caller wants.
 *
 * ----------------
 */
Datum
tuple_getattr(HeapTuple tuple, TupleDesc tupleDesc, int attnum)
{
	bool		attIsNull;
	Datum		value;

	Assert(NULL != tupleDesc);
	Assert(NULL != tuple);

	value = heap_getattr(tuple, attnum, tupleDesc, &attIsNull);
	insist_log(!attIsNull, "attribute cannot be null");

	return value;
}
/*
 * shareinput_clean_lk_ctxt
 *		Release everything recorded in a share-input lock context.
 *
 * In order: closes readyfd and donefd when they are >= 0 (and marks them
 * as -1), unlinks the "ready" and "done" lock files when the matching
 * del_* flag is set and the name is non-empty (and clears the flag), and
 * finally releases the context itself with gp_free2.  Each failing close or
 * unlink is reported through insist_log.
 */
static void shareinput_clean_lk_ctxt(ShareInput_Lk_Context *lk_ctxt)
{
	int err;

	elog(DEBUG1, "shareinput_clean_lk_ctxt cleanup lk ctxt %p", lk_ctxt);

	if(lk_ctxt->readyfd >= 0)
	{
		err = gp_retry_close(lk_ctxt->readyfd);
		insist_log(!err, "shareinput_clean_lk_ctxt cannot close readyfd: %m");

		lk_ctxt->readyfd = -1;
	}

	if(lk_ctxt->donefd >= 0)
	{
		err = gp_retry_close(lk_ctxt->donefd);
		insist_log(!err, "shareinput_clean_lk_ctxt cannot close donefd: %m");

		lk_ctxt->donefd = -1;
	}

	if(lk_ctxt->del_ready && lk_ctxt->lkname_ready[0])
	{
		err = unlink(lk_ctxt->lkname_ready);
		insist_log(!err, "shareinput_clean_lk_ctxt cannot unlink \"%s\": %m", lk_ctxt->lkname_ready);

		lk_ctxt->del_ready = false;
	}

	if(lk_ctxt->del_done && lk_ctxt->lkname_done[0])
	{
		err = unlink(lk_ctxt->lkname_done);
		/* message previously read "cannot unline"; now matches the ready-file case */
		insist_log(!err, "shareinput_clean_lk_ctxt cannot unlink \"%s\": %m", lk_ctxt->lkname_done);

		lk_ctxt->del_done = false;
	}

	gp_free2 (lk_ctxt, sizeof(ShareInput_Lk_Context));
}
/*
 * ExecWorkFile_Flush
 *    flush a BUFFILE work file through BufFileFlush.
 *
 * Any other file type is reported as invalid.
 */
void
ExecWorkFile_Flush(ExecWorkFile *workfile)
{
	Assert(workfile != NULL);

	if (workfile->fileType == BUFFILE)
	{
		BufFileFlush((BufFile *) workfile->file);
		return;
	}

	insist_log(false, "invalid work file type: %d", workfile->fileType);
}
/*
 * Re-open a suspended file for reading. This allocates all the necessary
 * buffers and data structures to restart reading from the file.
 *
 * The work file must carry the EXEC_WORKFILE_SUSPENDABLE flag.  Only BFZ
 * files are handled (via bfz_scan_begin); any other type is reported as
 * invalid.
 */
void
ExecWorkFile_Restart(ExecWorkFile *workfile)
{
	Assert(workfile != NULL);
	Assert((workfile->flags & EXEC_WORKFILE_SUSPENDABLE) != 0);

	if (workfile->fileType == BFZ)
	{
		bfz_scan_begin((bfz_t *) workfile->file);
		return;
	}

	insist_log(false, "invalid work file type: %d", workfile->fileType);
}
/*
 * CheckInMemConstraintsPgAttribute
 * 		Check uniqueness constraints for pg_attribute in-memory tuples upon insert
 *
 * Only existing tuples with the same attrelid as the new tuple are
 * considered.  Among those, a violation is reported through insist_log()
 * when an existing tuple has
 *   - the same attnum as the new tuple, or
 *   - the same attname as the new tuple.
 *
 * relation: in-memory relation holding the existing pg_attribute tuples
 * newTuple: the tuple that is about to be inserted
 */
static void
CheckInMemConstraintsPgAttribute(InMemHeapRelation relation, HeapTuple newTuple)
{
	Assert(NULL != newTuple);
	Assert(NULL != relation);
	Assert(NULL != relation->rel);

	TupleDesc tupleDesc = relation->rel->rd_att;
	Oid attrelidNew     = DatumGetObjectId(tuple_getattr(newTuple, tupleDesc, Anum_pg_attribute_attrelid));
	char *attnameNew    = DatumGetCString(tuple_getattr(newTuple, tupleDesc, Anum_pg_attribute_attname));
	AttrNumber attnoNew = DatumGetInt16((tuple_getattr(newTuple, tupleDesc, Anum_pg_attribute_attnum)));

	for (int i = 0; i < relation->tupsize; i++)
	{
		HeapTuple tuple = relation->tuples[i].tuple;
		Assert(NULL != tuple);

		Oid attrelid     = DatumGetObjectId(tuple_getattr(tuple, tupleDesc, Anum_pg_attribute_attrelid));
		char *attname    = DatumGetCString(tuple_getattr(tuple, tupleDesc, Anum_pg_attribute_attname));
		AttrNumber attno = DatumGetInt16((tuple_getattr(tuple, tupleDesc, Anum_pg_attribute_attnum)));

		if (attrelid != attrelidNew)
		{
			/* attributes belong to different relations */
			continue;
		}

		/* attrelid is an Oid (unsigned), hence %u; attno stays %d */
		insist_log(attno != attnoNew,
			"in-memory tuple with attrelid = %u and attno = %d already exists in pg_attribute.", attrelid, attno);

		/*
		 * strcmp() returns 0 exactly when both names have the same length
		 * and the same bytes, so no separate length comparison is needed.
		 */
		insist_log(0 != strcmp(attname, attnameNew),
			"in-memory tuple with attrelid = %u and attname = %s already exists in pg_attribute.", attrelid, attname);
	}
}
Exemple #12
0
/*
 * GetRealCmax
 *		Resolve a combo command id to its cmax.
 *
 * Ids below usedComboCids are looked up in the local comboCids array.
 * Other ids are resolved through getSharedComboCidEntry; reaching that path
 * while Gp_is_writer is set is reported through insist_log.
 */
static CommandId
GetRealCmax(TransactionId xmin, CommandId combocid)
{
	if (combocid < usedComboCids)
	{
		return comboCids[combocid].cmax;
	}

	insist_log(!Gp_is_writer,
			"writer segworker group unable to resolve visibility %u/%u", combocid, usedComboCids);

	/* We're a reader */
	return getSharedComboCidEntry(xmin, combocid, CMAX);
}
Exemple #13
0
/*
 * Creates a LogicalTapeSet with a generated file name.
 *
 * The name starts with "<PG_TEMP_FILES_DIR>/slice<currentSliceId>_sort" and
 * is completed by ExecWorkFile_AddUniqueSuffix.  The tape set itself is
 * created by LogicalTapeSetCreate_Named with the given ntapes and
 * del_on_close; the temporary name buffer is freed before returning.
 */
LogicalTapeSet *LogicalTapeSetCreate(int ntapes, bool del_on_close)
{
	char tmpprefix[MAXPGPATH];
	int len = snprintf(tmpprefix, MAXPGPATH, "%s/slice%d_sort",
			PG_TEMP_FILES_DIR,
			currentSliceId);

	/*
	 * snprintf reports an output error with a negative value and truncation
	 * with a value >= the buffer size; neither yields a usable prefix.
	 */
	insist_log(len >= 0 && len <= MAXPGPATH - 1, "could not generate temporary file name");
	StringInfo uniquename = ExecWorkFile_AddUniqueSuffix(tmpprefix);

	LogicalTapeSet *lts = LogicalTapeSetCreate_Named(uniquename->data, ntapes, del_on_close);

	pfree(uniquename->data);
	pfree(uniquename);

	return lts;
}
/*
 * Suspend a file without closing it. For bfz, which allocates a buffer for
 * each open file, this frees up that buffer but keeps the fd so the file can
 * be re-opened later.
 *
 * The work file must carry the EXEC_WORKFILE_SUSPENDABLE flag.  Only BFZ
 * files are handled; any other type is reported as invalid.
 *
 * Returns the size reported by bfz_append_end, or -1 when the file type is
 * not BFZ.
 */
int64
ExecWorkFile_Suspend(ExecWorkFile *workfile)
{
	int64 actualSize = -1;

	Assert(workfile != NULL);
	Assert((workfile->flags & EXEC_WORKFILE_SUSPENDABLE) != 0);

	if (workfile->fileType == BFZ)
	{
		actualSize = bfz_append_end((bfz_t *) workfile->file);
		ExecWorkFile_AdjustBFZSize(workfile, actualSize);
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	return actualSize;
}
/*
 * ExecWorkFile_ReadFromBuffer
 *
 * Faster alternative to Read for data that already sits in the underlying
 * buffer: in that case a pointer into the buffer is returned.
 * When the data is not in the buffer, NULL is returned and the caller must
 * fall back to the regular ExecWorkFile_Read with a destination buffer.
 *
 * Currently only bfz supports this behavior; any other file type is
 * reported as invalid and yields NULL.
 *
 */
void *
ExecWorkFile_ReadFromBuffer(ExecWorkFile *workfile,
				  uint64 size)
{
	Assert(workfile != NULL);

	if (workfile->fileType != BFZ)
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
		return NULL;
	}

	return bfz_scan_peek((bfz_t *)workfile->file, size);
}
/*
 * For a new workfile, sets the capabilities flags according to
 * the known underlying file type capabilities and the method the file was
 * created.
 *
 * workfile:   work file whose flags are updated
 * delOnClose: adds EXEC_WORKFILE_DEL_ON_CLOSE; only allowed with 'created'
 * created:    adds EXEC_WORKFILE_CREATED and selects which message is logged
 */
static void
ExecWorkFile_SetFlags(ExecWorkFile *workfile, bool delOnClose, bool created)
{
	Assert(workfile != NULL);
	/* Assert that only the creator of a file can delete it on close */
	AssertImply(delOnClose, created);

	/* capability implied by the underlying file type */
	if (workfile->fileType == BUFFILE)
	{
		workfile->flags |= EXEC_WORKFILE_RANDOM_ACCESS;
	}
	else if (workfile->fileType == BFZ)
	{
		workfile->flags |= EXEC_WORKFILE_SUSPENDABLE;
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	if (delOnClose)
	{
		workfile->flags |= EXEC_WORKFILE_DEL_ON_CLOSE;
	}

	if (!created)
	{
		elog(gp_workfile_caching_loglevel, "Opened existing workfile %s, delOnClose = %d",
				ExecWorkFile_GetFileName(workfile), delOnClose);
	}
	else
	{
		workfile->flags |= EXEC_WORKFILE_CREATED;
		elog(gp_workfile_caching_loglevel, "Created workfile %s, delOnClose = %d",
				ExecWorkFile_GetFileName(workfile), delOnClose);
	}

	/* size limiting applies as soon as either limit is configured */
	if (gp_workfile_limit_per_query > 0 || gp_workfile_limit_per_segment > 0)
	{
		workfile->flags |= EXEC_WORKFILE_LIMIT_SIZE;
	}
}
Exemple #17
0
/*
 * Open a temporary file that will (optionally) disappear when we close it.
 *
 * If 'makenameunique' is true, this function generates a file name which
 * should be unique to this particular OpenTemporaryFile() request and
 * distinct from any others in concurrent use on the same host.  As a
 * convenience for monitoring and debugging, the given 'fileName' string
 * and 'extentseqnum' are embedded in the file name.
 *
 * If 'makenameunique' is false, then 'fileName' and 'extentseqnum' identify a
 * new or existing temporary file which other processes also could open and
 * share.
 *
 * If 'create' is true, a new file is created.  If successful, a valid vfd
 * index (>0) is returned; otherwise an error is thrown.
 *
 * If 'create' is false, an existing file is opened.  If successful, a valid
 * vfd index (>0) is returned.  If the file does not exist or cannot be
 * opened, an invalid vfd index (<= 0) is returned.
 *
 * If 'delOnClose' is true, then the file is removed when you call
 * FileClose(); or when the process exits; or (provided 'closeAtEOXact' is
 * true) when the transaction ends.
 *
 * If 'closeAtEOXact' is true, the vfd is closed automatically at end of
 * transaction unless you have called FileClose() to close it before then.
 * If 'closeAtEOXact' is false, the vfd state is not changed at end of
 * transaction.
 *
 * In most cases, you don't want temporary files to outlive the transaction
 * that created them, so you should specify 'true' for both 'delOnClose' and
 * 'closeAtEOXact'.
 */
File
OpenTemporaryFile(const char   *fileName,
                  int           extentseqnum,
                  bool          makenameunique,
                  bool          create,
                  bool          delOnClose,
                  bool          closeAtEOXact)
{

	char	tempfilepath[MAXPGPATH];
	int		pathlen;

	Assert(fileName);
    AssertImply(makenameunique, create && delOnClose);


    char tempfileprefix[MAXPGPATH];

    int len = GetTempFilePrefix(tempfileprefix, MAXPGPATH, fileName);
    insist_log(len <= MAXPGPATH - 1, "could not generate temporary file name");

    if (makenameunique)
	{
		/*
		 * Generate a tempfile name that should be unique within the current
		 * database instance.
		 */
		pathlen = snprintf(tempfilepath, sizeof(tempfilepath),
				 "%s_%d_%04d.%ld",
				 tempfileprefix,
				 MyProcPid,
                 extentseqnum,
                 tempFileCounter++);
	}
	else
	{
        pathlen = snprintf(tempfilepath, sizeof(tempfilepath),
				 "%s.%04d",
				 tempfileprefix,
				 extentseqnum);
	}

	/*
	 * The prefix alone may fit while prefix + suffix does not.  A truncated
	 * path would silently lose (part of) the suffix that distinguishes this
	 * file from others, so refuse it just like an over-long prefix.
	 */
	insist_log(pathlen >= 0 && pathlen < (int) sizeof(tempfilepath),
			   "could not generate temporary file name");

    return OpenNamedFile(tempfilepath, create, delOnClose, closeAtEOXact);
}    /* OpenTemporaryFile */
/*
 * Save the serialized plan to a file in the workfile set.
 * It will be used to do full plan matching before reusing.
 *
 * The plan file is created with workfile_mgr_create_fileno under
 * WORKFILE_NUM_ALL_PLAN; a failed write is handed to
 * workfile_mgr_report_error, and the file is closed afterwards.
 */
static void
workfile_mgr_save_plan(workfile_set *work_set, workfile_set_plan *sf_plan)
{
	ExecWorkFile *planFile;

	Assert(work_set);
	Assert(sf_plan);

	planFile = workfile_mgr_create_fileno(work_set, WORKFILE_NUM_ALL_PLAN);
	insist_log(planFile != NULL, "Could not create temporary work file: %m");

	elog(gp_workfile_caching_loglevel, "Saving query plan to file %s", ExecWorkFile_GetFileName(planFile));

	if (!ExecWorkFile_Write(planFile, sf_plan->serialized_plan,
			sf_plan->serialized_plan_len))
	{
		workfile_mgr_report_error();
	}

	workfile_mgr_close_file(work_set, planFile);
}
/*
 * CheckInMemConstraintsGpDistributionPolicy
 * 		Check uniqueness constraints for gp_distribution_policy in-memory tuples upon insert
 *
 * A violation is reported through insist_log() when a tuple already stored
 * in the in-memory relation has the same localoid as the new tuple.
 *
 * relation: in-memory relation holding the existing tuples
 * newTuple: the tuple that is about to be inserted
 */
static void
CheckInMemConstraintsGpDistributionPolicy(InMemHeapRelation relation, HeapTuple newTuple)
{
	Assert(NULL != newTuple);
	Assert(NULL != relation);
	Assert(NULL != relation->rel);

	TupleDesc tupleDesc = relation->rel->rd_att;
	Oid reloidNew = DatumGetObjectId(tuple_getattr(newTuple, tupleDesc, Anum_gp_policy_localoid));

	for (int i = 0; i < relation->tupsize; i++)
	{
		HeapTuple tuple = relation->tuples[i].tuple;
		Assert(NULL != tuple);

		Oid reloid = DatumGetObjectId(tuple_getattr(tuple, tupleDesc, Anum_gp_policy_localoid));

		/* Oids are unsigned, hence %u rather than %d */
		insist_log(reloidNew != reloid,
				   "in-memory tuple with localoid = %u already exists in gp_distribution_policy.", reloid);
	}
}
/*
 * ExecWorkFile_Tell64
 *    return the value of the current file position indicator.
 *
 * BUFFILE files are queried through BufFileTell and BFZ files through
 * bfz_totalbytes.  Any other type is reported as invalid and yields 0.
 */
uint64
ExecWorkFile_Tell64(ExecWorkFile *workfile)
{
	uint64 position = 0;

	Assert(workfile != NULL);

	if (workfile->fileType == BUFFILE)
	{
		BufFileTell((BufFile *)workfile->file, (int64 *) &position);
	}
	else if (workfile->fileType == BFZ)
	{
		position = bfz_totalbytes((bfz_t *)workfile->file);
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	return position;
}
/*
 * CheckInMemConstraintsPgExttable
 * 		Check uniqueness constraints for pg_exttable in-memory tuples upon insert
 *
 * A violation is reported through insist_log() when a tuple already stored
 * in the in-memory relation has the same reloid as the new tuple.
 *
 * relation: in-memory relation holding the existing pg_exttable tuples
 * newTuple: the tuple that is about to be inserted
 */
static void
CheckInMemConstraintsPgExttable(InMemHeapRelation relation, HeapTuple newTuple)
{
	Assert(NULL != newTuple);
	Assert(NULL != relation);
	Assert(NULL != relation->rel);

	TupleDesc tupleDesc = relation->rel->rd_att;
	Oid reloidNew = DatumGetObjectId(tuple_getattr(newTuple, tupleDesc, Anum_pg_exttable_reloid));

	for (int i = 0; i < relation->tupsize; i++)
	{
		HeapTuple tuple = relation->tuples[i].tuple;
		Assert(NULL != tuple);

		Oid reloid = DatumGetObjectId(tuple_getattr(tuple, tupleDesc, Anum_pg_exttable_reloid));

		/* Oids are unsigned, hence %u rather than %d */
		insist_log(reloidNew != reloid,
				   "in-memory tuple with reloid = %u already exists in pg_exttable.", reloid);
	}
}
/*
 * ExecWorkFile_Read
 *    read the data with specified size to the given buffer.
 *
 * The given buffer must provide at least 'size' bytes of space.
 *
 * Returns whatever the underlying reader reports (BufFileRead for BUFFILE,
 * bfz_scan_next for BFZ): the number of bytes read on success.  An invalid
 * file type is reported and yields 0.
 */
uint64
ExecWorkFile_Read(ExecWorkFile *workfile,
				  void *data,
				  uint64 size)
{
	uint64 nread = 0;

	Assert(workfile != NULL);

	if (workfile->fileType == BUFFILE)
	{
		nread = BufFileRead((BufFile *)workfile->file, data, size);
	}
	else if (workfile->fileType == BFZ)
	{
		nread = bfz_scan_next((bfz_t *)workfile->file, data, size);
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	return nread;
}
/*
 * ExecWorkFile_Close
 *    close the work file, and release the space.
 *
 *    A BFZ file that is still in append mode is finished with
 *    bfz_append_end first, and the recorded size is adjusted to the value
 *    it returns.  The ExecWorkFile structure and its file name are freed,
 *    so the caller must not use 'workfile' afterwards.
 *
 *    Returns the size reported by ExecWorkFile_GetSize after closing.
 */
int64
ExecWorkFile_Close(ExecWorkFile *workfile)
{
	int64 finalSize;

	Assert(workfile != NULL);

	if (workfile->fileType == BUFFILE)
	{
		BufFileClose((BufFile *)workfile->file);
	}
	else if (workfile->fileType == BFZ)
	{
		bfz_t *bfz = (bfz_t *)workfile->file;

		Assert(bfz != NULL);

		if (bfz->mode == BFZ_MODE_APPEND)
		{
			/* Flush data out to disk if we were writing */
			int64 writtenSize = bfz_append_end(bfz);

			/* Adjust the size with WorkfileDiskspace to our actual size */
			ExecWorkFile_AdjustBFZSize(workfile, writtenSize);
		}

		bfz_close(bfz, true, true);
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	/* fetch the size before the structure is released */
	finalSize = ExecWorkFile_GetSize(workfile);

	pfree(workfile->fileName);
	pfree(workfile);

	return finalSize;
}
Exemple #24
0
/*
 * SaveMemoryBufToDisk
 *    Saves the memory account information in a file named
 *    "pg_log/memory_<prefix>.mem".
 *
 * memoryBuf: The buffer where the memory tree is serialized in (typically) csv form.
 * prefix: A file name prefix that can be used to uniquely identify the file's content
 *
 * A failure to open the file is raised with elog(ERROR); a short write is
 * reported through insist_log after the file has been closed.
 */
static void
SaveMemoryBufToDisk(struct StringInfoData *memoryBuf, char *prefix)
{
	char fileName[MEMORY_REPORT_FILE_NAME_LENGTH];

	Assert((strlen("pg_log/") + strlen("memory_") + strlen(prefix) + strlen(".mem")) < MEMORY_REPORT_FILE_NAME_LENGTH);
	snprintf(fileName, MEMORY_REPORT_FILE_NAME_LENGTH, "%s/memory_%s.mem", "pg_log", prefix);

	FILE *file = fopen(fileName, "w");

	if (file == NULL)
	{
		elog(ERROR, "Could not write memory usage information. Failed to open file: %s", fileName);
	}

	uint64 bytes = fwrite(memoryBuf->data, 1, memoryBuf->len, file);

	/*
	 * Close the stream before reporting a short write, so that it is
	 * released even if the error report does not return control here.
	 */
	fclose(file);

	if (bytes != memoryBuf->len)
	{
		insist_log(false, "Could not write memory usage information. Attempted to write %d", memoryBuf->len);
	}
}
/*
 * ExecWorkFile_Rewind
 *    rewind the pointer position to the beginning of the file.
 *
 * For BUFFILE the result of BufFileSeek decides the outcome: true when it
 * returned 0, false otherwise.  For BFZ the append phase is ended, the
 * recorded size is adjusted and a scan is started; true is returned.  An
 * invalid file type is reported and true is returned.
 */
bool
ExecWorkFile_Rewind(ExecWorkFile *workfile)
{
	Assert(workfile != NULL);

	if (workfile->fileType == BUFFILE)
	{
		/* BufFileSeek returns 0 if everything went OK */
		long seekResult = BufFileSeek((BufFile *)workfile->file, 0L  /* offset */, SEEK_SET);

		return (0 == seekResult);
	}

	if (workfile->fileType == BFZ)
	{
		int64 writtenSize = bfz_append_end((bfz_t *)workfile->file);

		ExecWorkFile_AdjustBFZSize(workfile, writtenSize);
		bfz_scan_begin((bfz_t *)workfile->file);
		return true;
	}

	insist_log(false, "invalid work file type: %d", workfile->fileType);
	return true;
}
/* ----------------------------------------------------------------
 *		ExecInitMaterial
 *
 *		Build the MaterialState for a Material plan node and
 *		initialize its outer (child) plan.  'eflags' carries the
 *		EXEC_FLAG_* bits requested by the caller; a trimmed copy is
 *		handed down to the child.
 * ----------------------------------------------------------------
 */
MaterialState *
ExecInitMaterial(Material *node, EState *estate, int eflags)
{
	MaterialState *matstate;
	Plan	   *childPlan;
	bool		callerNeedsReplay;

	/*
	 * Capture, before eflags is trimmed further down, whether the caller
	 * asked for rewind, backward scan or mark/restore.
	 */
	callerNeedsReplay =
		((eflags & (EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)) != 0);

	/*
	 * create state structure
	 */
	matstate = makeNode(MaterialState);
	matstate->ss.ps.plan = (Plan *) node;
	matstate->ss.ps.state = estate;

	/*
	 * Random access to the subplan output is required for backward scan or
	 * mark/restore, and materializing is also preferred when the output may
	 * be rewound and replayed many times.  When none of that applies (and
	 * the plan is not marked cdb_strict) storing the data can be skipped.
	 */
	matstate->randomAccess = node->cdb_strict || callerNeedsReplay;

	matstate->eof_underlying = false;
	matstate->ts_state = palloc0(sizeof(GenericTupStore));
	matstate->ts_pos = NULL;
	matstate->ts_markpos = NULL;
	matstate->share_lk_ctxt = NULL;
	matstate->ts_destroyed = false;
	ExecMaterialResetWorkfileState(matstate);

	/*
	 * Miscellaneous initialization
	 *
	 * No ExprContext is needed here: Material nodes never call ExecQual or
	 * ExecProject.
	 */

#define MATERIAL_NSLOTS 2

	/*
	 * tuple table initialization
	 *
	 * Material nodes only return tuples from their materialized relation.
	 */
	ExecInitResultTupleSlot(estate, &matstate->ss.ps);
	matstate->ss.ss_ScanTupleSlot = ExecInitExtraTupleSlot(estate);

	/*
	 * A node asked to support REWIND, BACKWARD or MARK is not eager free
	 * safe.
	 */
	matstate->ss.ps.delayEagerFree = callerNeedsReplay;

	/*
	 * initialize child nodes
	 *
	 * The child never has to support BACKWARD or MARK/RESTORE; this node
	 * shields it from those.
	 */
	eflags &= ~(EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK);

	/*
	 * Without external parameters the Material can also shield the child
	 * from rescans, so REWIND is cleared too.  With parameters present the
	 * child will be rewound, so the flag must stay.
	 */
	if (node->plan.allParam == NULL || node->plan.extParam == NULL)
		eflags &= ~EXEC_FLAG_REWIND;

	childPlan = outerPlan(node);

	/*
	 * A very basic check to see if the optimizer requires the material to do
	 * a projection.  Ideally this would recursively compare all the target
	 * list expressions, but that is tricky because of the varno mismatch
	 * (the outer plan may use a varno that indexes into the range table,
	 * while the material may refer to the same relation as "outer" varno)
	 * [JIRA: MPP-25365]
	 */
	insist_log(list_length(node->plan.targetlist) == list_length(childPlan->targetlist),
			"Material operator does not support projection");
	outerPlanState(matstate) = ExecInitNode(childPlan, estate, eflags);

	/*
	 * A Material sitting directly on top of a Motion is not eager free safe.
	 */
	if (IsA(childPlan, Motion))
		matstate->ss.ps.delayEagerFree = true;

	/*
	 * initialize tuple type.  No projection info is set up because this node
	 * doesn't do projections.
	 */
	ExecAssignResultTypeFromTL(&matstate->ss.ps);
	ExecAssignScanTypeFromOuterPlan(&matstate->ss);
	matstate->ss.ps.ps_ProjInfo = NULL;

	/*
	 * If share input, need to register with range table entry
	 */
	if (node->share_type != SHARE_NOTSHARED)
	{
		ShareNodeEntry *shareEntry = ExecGetShareNodeEntry(estate, node->share_id, true);

		shareEntry->sharePlan = (Node *) node;
		shareEntry->shareState = (Node *) matstate;
	}

	initGpmonPktForMaterial((Plan *) node, &matstate->ss.ps.gpmon_pkt, estate);

	return matstate;
}
Exemple #27
0
/*
 * Reads the GP catalog tables and build a CdbComponentDatabases structure.
 * It then converts this to a Gang structure and initializes all the non-connection related fields.
 *
 * Parameters:
 *   type     - kind of gang to build; selects which segment databases are
 *              copied into the gang's descriptors.
 *   gang_id  - identifier stored in the new gang.
 *   size     - number of descriptors to allocate; asserted to be 1 or
 *              getgpsegmentCount().
 *   content  - segment index looked up for GANGTYPE_SINGLETON_READER; not
 *              used for the other gang types.
 *
 * The gang and its descriptors are allocated in a fresh "Per Gang Context"
 * created as a child of GangContext.
 *
 * Call this function in GangContext.
 * Returns a not-null pointer.  Raises an error if the component database
 * information is unavailable, or if the number of active primaries does not
 * match 'size' for a primary reader/writer gang.
 */
Gang *
buildGangDefinition(GangType type, int gang_id, int size, int content)
{
	Gang *newGangDefinition = NULL;
	CdbComponentDatabaseInfo *cdbinfo = NULL;
	CdbComponentDatabaseInfo *cdbInfoCopy = NULL;
	SegmentDatabaseDescriptor *segdbDesc = NULL;
	MemoryContext perGangContext = NULL;

	int segCount = 0;
	int i = 0;

	ELOG_DISPATCHER_DEBUG("buildGangDefinition:Starting %d qExec processes for %s gang",
			size, gangTypeToString(type));

	Assert(CurrentMemoryContext == GangContext);
	Assert(size == 1 || size == getgpsegmentCount());

	/* read gp_segment_configuration and build CdbComponentDatabases */
	cdb_component_dbs = getComponentDatabases();

	if (cdb_component_dbs == NULL ||
		cdb_component_dbs->total_segments <= 0 ||
		cdb_component_dbs->total_segment_dbs <= 0)
		insist_log(false, "schema not populated while building segworker group");

	/* if mirroring is not configured */
	if (cdb_component_dbs->total_segment_dbs == cdb_component_dbs->total_segments)
	{
		ELOG_DISPATCHER_DEBUG("building Gang: mirroring not configured");
		disableFTS();
	}

	perGangContext = AllocSetContextCreate(GangContext, "Per Gang Context",
					ALLOCSET_DEFAULT_MINSIZE,
					ALLOCSET_DEFAULT_INITSIZE,
					ALLOCSET_DEFAULT_MAXSIZE);
	Assert(perGangContext != NULL);
	MemoryContextSwitchTo(perGangContext);

	/* allocate a gang */
	newGangDefinition = (Gang *) palloc0(sizeof(Gang));
	newGangDefinition->type = type;
	newGangDefinition->size = size;
	newGangDefinition->gang_id = gang_id;
	newGangDefinition->allocated = false;
	newGangDefinition->noReuse = false;
	newGangDefinition->dispatcherActive = false;
	newGangDefinition->portal_name = NULL;
	newGangDefinition->perGangContext = perGangContext;
	newGangDefinition->db_descriptors =
			(SegmentDatabaseDescriptor *) palloc0(size * sizeof(SegmentDatabaseDescriptor));

	/* initialize db_descriptors */
	switch (type)
	{
	case GANGTYPE_ENTRYDB_READER:
		cdbinfo = &cdb_component_dbs->entry_db_info[0];
		cdbInfoCopy = copyCdbComponentDatabaseInfo(cdbinfo);
		segdbDesc = &newGangDefinition->db_descriptors[0];
		cdbconn_initSegmentDescriptor(segdbDesc, cdbInfoCopy);
		setQEIdentifier(segdbDesc, -1, perGangContext);
		break;

	case GANGTYPE_SINGLETON_READER:
		cdbinfo = findDatabaseInfoBySegIndex(cdb_component_dbs, content);
		cdbInfoCopy = copyCdbComponentDatabaseInfo(cdbinfo);
		segdbDesc = &newGangDefinition->db_descriptors[0];
		cdbconn_initSegmentDescriptor(segdbDesc, cdbInfoCopy);
		setQEIdentifier(segdbDesc, -1, perGangContext);
		break;

	case GANGTYPE_PRIMARY_READER:
	case GANGTYPE_PRIMARY_WRITER:
		/*
		 * We loop through the segment_db_info.  Each item has a segindex.
		 * They are sorted by segindex, and there can be > 1 segment_db_info for
		 * a given segindex (currently, there can be 1 or 2)
		 */
		for (i = 0; i < cdb_component_dbs->total_segment_dbs; i++)
		{
			cdbinfo = &cdb_component_dbs->segment_db_info[i];
			if (!SEGMENT_IS_ACTIVE_PRIMARY(cdbinfo))
				continue;

			/*
			 * db_descriptors holds exactly 'size' entries.  Surplus active
			 * primaries are still counted so the mismatch is reported
			 * below, but they must never be written past the end of the
			 * array.
			 */
			if (segCount < size)
			{
				segdbDesc = &newGangDefinition->db_descriptors[segCount];
				cdbInfoCopy = copyCdbComponentDatabaseInfo(cdbinfo);
				cdbconn_initSegmentDescriptor(segdbDesc, cdbInfoCopy);
				setQEIdentifier(segdbDesc, -1, perGangContext);
			}
			segCount++;
		}

		if (size != segCount)
		{
			FtsReConfigureMPP(false);
			elog(ERROR, "Not all primary segment instances are active and connected");
		}
		break;

	default:
		Assert(false);
	}

	ELOG_DISPATCHER_DEBUG("buildGangDefinition done");
	/* Restore the context the caller is required to be in on entry. */
	MemoryContextSwitchTo(GangContext);
	return newGangDefinition;
}
Exemple #28
0
/*
 * Create a writer gang.
 */
Gang *
AllocateWriterGang()
{
	Gang *writerGang = NULL;
	MemoryContext oldContext = NULL;
	int i = 0;

	ELOG_DISPATCHER_DEBUG("AllocateWriterGang begin.");

	if (Gp_role != GP_ROLE_DISPATCH)
	{
		elog(FATAL, "dispatch process called with role %d", Gp_role);
	}

	/*
	 * First, we look for an unallocated but created gang of the right type
	 * if it exists, we return it.
	 * Else, we create a new gang
	 */
	if (primaryWriterGang == NULL)
	{
		int nsegdb = getgpsegmentCount();

		insist_log(IsTransactionOrTransactionBlock(),
				"cannot allocate segworker group outside of transaction");

		if (GangContext == NULL)
		{
			GangContext = AllocSetContextCreate(TopMemoryContext,
					"Gang Context",
					ALLOCSET_DEFAULT_MINSIZE,
					ALLOCSET_DEFAULT_INITSIZE,
					ALLOCSET_DEFAULT_MAXSIZE);
		}
		Assert(GangContext != NULL);
		oldContext = MemoryContextSwitchTo(GangContext);

		writerGang = createGang(GANGTYPE_PRIMARY_WRITER,
				PRIMARY_WRITER_GANG_ID, nsegdb, -1);
		writerGang->allocated = true;

		/*
		 * set "whoami" for utility statement.
		 * non-utility statement will overwrite it in function getCdbProcessList.
		 */
		for(i = 0; i < writerGang->size; i++)
			setQEIdentifier(&writerGang->db_descriptors[i], -1, writerGang->perGangContext);

		MemoryContextSwitchTo(oldContext);
	}
	else
	{
		ELOG_DISPATCHER_DEBUG("Reusing an existing primary writer gang");
		writerGang = primaryWriterGang;
	}

	/* sanity check the gang */
	if (!GangOK(writerGang))
		elog(ERROR, "could not connect to segment: initialization of segworker group failed");

	ELOG_DISPATCHER_DEBUG("AllocateWriterGang end.");

	primaryWriterGang = writerGang;
	return writerGang;
}
Exemple #29
0
/*
 * Create a reader gang.
 *
 * @type can be GANGTYPE_ENTRYDB_READER, GANGTYPE_SINGLETON_READER or GANGTYPE_PRIMARY_READER.
 * @portal_name may be NULL; when given, a copy of it is stored on the gang.
 *
 * An available gang of the requested shape is reused when there is one;
 * otherwise a new gang is created.  The work is done inside GangContext and
 * the caller's memory context is restored before returning.
 */
Gang *
AllocateReaderGang(GangType type, char *portal_name)
{
	MemoryContext callerContext = NULL;
	Gang *gang = NULL;
	int gangSize = 0;
	int segContent = 0;

	ELOG_DISPATCHER_DEBUG("AllocateReaderGang for portal %s: allocatedReaderGangsN %d, availableReaderGangsN %d, "
			"allocatedReaderGangs1 %d, availableReaderGangs1 %d",
			(portal_name ? portal_name : "<unnamed>"),
			list_length(allocatedReaderGangsN),
			list_length(availableReaderGangsN),
			list_length(allocatedReaderGangs1),
			list_length(availableReaderGangs1));

	if (Gp_role != GP_ROLE_DISPATCH)
	{
		elog(FATAL, "dispatch process called with role %d", Gp_role);
	}

	insist_log(IsTransactionOrTransactionBlock(),
			"cannot allocate segworker group outside of transaction");

	if (GangContext == NULL)
	{
		GangContext = AllocSetContextCreate(TopMemoryContext, "Gang Context",
		ALLOCSET_DEFAULT_MINSIZE,
		ALLOCSET_DEFAULT_INITSIZE,
		ALLOCSET_DEFAULT_MAXSIZE);
	}
	Assert(GangContext != NULL);
	callerContext = MemoryContextSwitchTo(GangContext);

	/* Derive the gang size and target content from the requested type. */
	if (type == GANGTYPE_ENTRYDB_READER)
	{
		segContent = -1;
		gangSize = 1;
	}
	else if (type == GANGTYPE_SINGLETON_READER)
	{
		segContent = gp_singleton_segindex;
		gangSize = 1;
	}
	else if (type == GANGTYPE_PRIMARY_READER)
	{
		segContent = 0;
		gangSize = getgpsegmentCount();
	}
	else
	{
		Assert(false);
	}

	/*
	 * Prefer an already created, currently unallocated gang of the right
	 * shape; fall back to creating a new one.
	 */
	gang = getAvailableGang(type, gangSize, segContent);
	if (gang == NULL)
	{
		ELOG_DISPATCHER_DEBUG("Creating a new reader size %d gang for %s",
				gangSize, (portal_name ? portal_name : "unnamed portal"));

		gang = createGang(type, gang_id_counter++, gangSize, segContent);
		gang->allocated = true;
	}

	/*
	 * Release the portal name left over from whichever portal this gang
	 * served previously.
	 */
	if (gang->portal_name)
	{
		pfree(gang->portal_name);
	}

	/* let the gang know which portal it is being assigned to */
	if (portal_name)
	{
		gang->portal_name = pstrdup(portal_name);
	}
	else
	{
		gang->portal_name = (char *) NULL;
	}

	/* sanity check the gang */
	insist_log(GangOK(gang), "could not connect to segment: initialization of segworker group failed");

	addGangToAllocated(gang);

	MemoryContextSwitchTo(callerContext);

	ELOG_DISPATCHER_DEBUG("on return: allocatedReaderGangs %d, availableReaderGangsN %d, "
			"allocatedReaderGangs1 %d, availableReaderGangs1 %d",
			list_length(allocatedReaderGangsN),
			list_length(availableReaderGangsN),
			list_length(allocatedReaderGangs1),
			list_length(availableReaderGangs1));

	return gang;
}
/*
 * ExecWorkFile_Seek
 *   Move the position of a random-access work file.  Only SEEK_SET and
 *   SEEK_CUR are accepted for 'whence'.
 *
 *   Result is 0 if OK, EOF if not.  Logical position is not moved if an
 *   impossible seek is attempted.
 *
 *   A seek that lands beyond the current file size is allowed only for
 *   files flagged EXEC_WORKFILE_CREATED, and only if the extra disk space
 *   can be reserved; otherwise EOF is returned without seeking.
 */
int
ExecWorkFile_Seek(ExecWorkFile *workfile, uint64 offset, int whence)
{
	Assert(workfile != NULL);
	Assert((workfile->flags & EXEC_WORKFILE_RANDOM_ACCESS) != 0);

	int seekStatus = 0;
	int64 growBy = 0;

	/* Work out how far past EOF, if at all, the target position lies. */
	if (whence == SEEK_SET)
	{
		if (offset > workfile->size)
			growBy = offset - workfile->size;
	}
	else if (whence == SEEK_CUR)
	{
		if (ExecWorkFile_Tell64(workfile) + offset > workfile->size)
			growBy = ExecWorkFile_Tell64(workfile) + offset - workfile->size;
	}
	else
	{
		elog(LOG, "invalid whence: %d", whence);
		Assert(false);
		return EOF;
	}

	/* Reserve disk space if needed */
	if (growBy > 0)
	{
		/*
		 * Growing a file by seeking is only permitted for files opened for
		 * writing (i.e. files we created).
		 */
		if ((workfile->flags & EXEC_WORKFILE_CREATED) == 0)
			return EOF;

		/* Failed to reserve additional disk space, notify caller */
		if (!WorkfileDiskspace_Reserve(growBy))
			return EOF;
	}

	/* Do the actual seek */
	if (workfile->fileType == BUFFILE)
	{
		seekStatus = BufFileSeek((BufFile *) workfile->file, offset, whence);
		if (growBy > 0)
			workfile->size = BufFileGetSize((BufFile *) workfile->file);
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	/*
	 * NOTE(review): the reservation is committed regardless of seekStatus;
	 * confirm that this is intended when BufFileSeek reports a failure.
	 */
	if (growBy > 0)
	{
		WorkfileDiskspace_Commit(growBy, growBy, true /* update_query_size */);
		workfile_update_in_progress_size(workfile, growBy);
	}

	return seekStatus;
}