/*
 * CheckInMemConstraintsPgType
 *		Check uniqueness constraints for pg_type in-memory tuples upon insert.
 *
 * Compares 'newTuple' against every tuple already held by 'relation' and
 * reports a violation through insist_log when an existing tuple has the
 * same Oid, or the same (typname, typnamespace) pair.
 */
static void
CheckInMemConstraintsPgType(InMemHeapRelation relation, HeapTuple newTuple)
{
	Assert(NULL != newTuple);
	Assert(NULL != relation);
	Assert(NULL != relation->rel);

	TupleDesc	tupleDesc = relation->rel->rd_att;

	Oid			relnamespaceNew = DatumGetObjectId(tuple_getattr(newTuple, tupleDesc, Anum_pg_type_typnamespace));
	char	   *typnameNew = DatumGetCString(tuple_getattr(newTuple, tupleDesc, Anum_pg_type_typname));

	for (int i = 0; i < relation->tupsize; i++)
	{
		HeapTuple	tuple = relation->tuples[i].tuple;

		Assert(NULL != tuple);

		/* Oids are unsigned, so print them with %u rather than %d */
		insist_log(HeapTupleGetOid(tuple) != HeapTupleGetOid(newTuple),
				   "in-memory tuple with Oid = %u already exists in pg_type.",
				   HeapTupleGetOid(tuple));

		Oid			relnamespace = DatumGetObjectId(tuple_getattr(tuple, tupleDesc, Anum_pg_type_typnamespace));
		char	   *typname = DatumGetCString(tuple_getattr(tuple, tupleDesc, Anum_pg_type_typname));

		/* a duplicate needs both the namespace and the full name to match */
		insist_log(relnamespace != relnamespaceNew ||
				   0 != strcmp(typname, typnameNew),
				   "in-memory tuple with typname = %s and typnamespace = %u already exists in pg_type.",
				   typname, relnamespace);
	}
}
/*
 * CheckInMemConstraintsPgNamespace
 *		Check uniqueness constraints for pg_namespace in-memory tuples upon insert.
 *
 * Compares 'newTuple' against every tuple already held by 'relation' and
 * reports a violation through insist_log when an existing tuple has the
 * same Oid, or the same (nspname, nspdboid) pair.
 */
static void
CheckInMemConstraintsPgNamespace(InMemHeapRelation relation, HeapTuple newTuple)
{
	Assert(NULL != newTuple);
	Assert(NULL != relation);
	Assert(NULL != relation->rel);

	TupleDesc	tupleDesc = relation->rel->rd_att;

	Oid			nspdboidNew = DatumGetObjectId(tuple_getattr(newTuple, tupleDesc, Anum_pg_namespace_nspdboid));
	char	   *nspnameNew = DatumGetCString(tuple_getattr(newTuple, tupleDesc, Anum_pg_namespace_nspname));

	for (int i = 0; i < relation->tupsize; i++)
	{
		HeapTuple	tuple = relation->tuples[i].tuple;

		Assert(NULL != tuple);

		/* Oids are unsigned, so print them with %u rather than %d */
		insist_log(HeapTupleGetOid(tuple) != HeapTupleGetOid(newTuple),
				   "in-memory tuple with Oid = %u already exists in pg_namespace.",
				   HeapTupleGetOid(tuple));

		Oid			nspdboid = DatumGetObjectId(tuple_getattr(tuple, tupleDesc, Anum_pg_namespace_nspdboid));
		char	   *nspname = DatumGetCString(tuple_getattr(tuple, tupleDesc, Anum_pg_namespace_nspname));

		/* a duplicate needs both the database oid and the full name to match */
		insist_log(nspdboid != nspdboidNew ||
				   0 != strcmp(nspname, nspnameNew),
				   "in-memory tuple with nspname = %s and nspdboid = %u already exists in pg_namespace.",
				   nspname, nspdboid);
	}
}
/*
 * readtup_heap
 *		Read back one tuple whose leading length word 'len' has already been
 *		consumed from state->myfile.
 *
 * 'len' is either a MemTuple length word (as decided by is_len_memtuplen)
 * or a HeapTuple's t_len.  The tuple is palloc'd, accounted for with USEMEM,
 * and returned.  When state->backward is set, the trailing length word that
 * follows the tuple is consumed as well.
 */
static void *
readtup_heap(Tuplestorestate *state, unsigned int len)
{
	bool		isMemTuple = is_len_memtuplen(len);
	uint32		recordSize = 0;
	void	   *result = NULL;
	char	   *payload;
	size_t		payloadSize;

	if (isMemTuple)
	{
		recordSize = memtuple_size_from_uint32(len);
	}
	else
	{
		/* len is HeapTuple.t_len; the record also holds the rest of the HeapTuple fields */
		recordSize = len + HEAPTUPLESIZE;
	}

	result = (void *) palloc(recordSize);
	USEMEM(state, GetMemoryChunkSpace(result));

	/* The first uint32 is the length word we already have; store it now */
	if (isMemTuple)
	{
		memtuple_set_mtlen((MemTuple) result, len);
	}
	else
	{
		((HeapTuple) result)->t_len = recordSize - HEAPTUPLESIZE;
	}

	/* Everything after the first uint32 comes from the file */
	payload = (char *) result + sizeof(uint32);
	payloadSize = recordSize - sizeof(uint32);

	if (BufFileRead(state->myfile, (void *) payload, payloadSize) != payloadSize)
	{
		insist_log(false, "unexpected end of data");
	}

	if (!isMemTuple)
	{
		/* must be set after the read, which overwrites the header fields */
		HeapTuple	heapTuple = (HeapTuple) result;

		heapTuple->t_data = (HeapTupleHeader) ((char *) result + HEAPTUPLESIZE);
	}

	if (state->backward)		/* need trailing length word? */
	{
		uint32		trailer = 0;

		if (BufFileRead(state->myfile, (void *) &trailer, sizeof(trailer)) != sizeof(trailer))
		{
			insist_log(false, "unexpected end of data");
		}
	}

	return result;
}
/*
 * getlen
 *		Read the next length word from state->myfile.
 *
 * Returns the length word when a full one could be read.  Otherwise a
 * partial read is reported as "unexpected end of tape", a clean end of file
 * is reported as "unexpected end of data" unless 'eofOK' is true, and 0 is
 * returned.  'pos' is not used here.
 */
static uint32
getlen(Tuplestorestate *state, TuplestorePos *pos, bool eofOK)
{
	uint32		lengthWord;
	size_t		got = BufFileRead(state->myfile, (void *) &lengthWord, sizeof(lengthWord));

	if (got != sizeof(lengthWord))
	{
		/* anything other than zero bytes means a torn length word */
		insist_log(got == 0, "unexpected end of tape");
		insist_log(eofOK, "unexpected end of data");
		return 0;
	}

	return lengthWord;
}
/*
 * Check all the connections of a gang.
 *
 * For every segment descriptor of 'gp':
 *	- with no recorded error, *countSuccessful is incremented;
 *	- with an error that segment_failure_due_to_recovery() recognises,
 *	  *countInRecovery is incremented;
 *	- with any other error, the message and the segment identity are
 *	  appended to 'errorMessage'.
 * In both failure cases the error is logged and then cleared with
 * cdbconn_resetQEErrorMessage().
 */
static void
checkConnectionStatus(Gang *gp, int *countInRecovery, int *countSuccessful, struct PQExpBufferData *errorMessage)
{
	int			nsegdb = gp->size;
	int			segno;

	for (segno = 0; segno < nsegdb; segno++)
	{
		SegmentDatabaseDescriptor *desc = &gp->db_descriptors[segno];

		/* connection established? if not, we may have to re-build this gang */
		if (!(desc->errcode && desc->error_message.len > 0))
		{
			Assert(desc->errcode == 0 && desc->error_message.len == 0);

			/* We have a live connection! */
			(*countSuccessful)++;
			continue;
		}

		/* Log failed connections. Complete failures are taken care of later. */
		Assert(desc->whoami != NULL);
		elog(LOG, "Failed connection to %s", desc->whoami);

		insist_log(desc->errcode != 0 && desc->error_message.len != 0,
				   "connection is null, but no error code or error message, for segDB %d", segno);

		ereport(LOG, (errcode(desc->errcode), errmsg("%s", desc->error_message.data)));

		/* this connect failed -- but why ? */
		if (segment_failure_due_to_recovery(desc->error_message.data))
		{
			elog(LOG, "segment is in recovery mode (%s)", desc->whoami);
			(*countInRecovery)++;
		}
		else
		{
			appendPQExpBuffer(errorMessage, "%s (%s)\n", desc->error_message.data, desc->whoami);
		}

		cdbconn_resetQEErrorMessage(desc);
	}
}
/*
 * count_usable_fds --- count how many FDs the system will let us open,
 *		and estimate how many are already open.
 *
 * We stop counting if usable_fds reaches max_to_probe.  Note: a small
 * value of max_to_probe might result in an underestimate of already_open;
 * we must fill in any "gaps" in the set of used FDs before the calculation
 * of already_open will give the right answer.  In practice, max_to_probe
 * of a couple of dozen should be enough to ensure good results.
 *
 * We assume stdin (FD 0) is available for dup'ing.
 *
 * An unexpected dup() failure (anything other than EMFILE/ENFILE) is
 * reported through insist_log, but only after every probe descriptor has
 * been closed and the scratch array freed, so that nothing is leaked if
 * insist_log does not return.
 */
static void
count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
{
	int		   *fd;
	int			size;
	int			used = 0;
	int			highestfd = 0;
	int			j;
	int			dup_errno = 0;	/* unexpected errno from dup(), 0 if none */

	size = 1024;
	fd = (int *) palloc(size * sizeof(int));

	/* dup until failure or probe limit reached */
	for (;;)
	{
		int			thisfd;

		thisfd = dup(0);
		if (thisfd < 0)
		{
			/* Expect EMFILE or ENFILE, else it's fishy */
			if (errno != EMFILE && errno != ENFILE)
			{
				/* remember it; close() and pfree() below may clobber errno */
				dup_errno = errno;
			}
			break;
		}

		if (used >= size)
		{
			size *= 2;
			fd = (int *) repalloc(fd, size * sizeof(int));
		}
		fd[used++] = thisfd;

		if (highestfd < thisfd)
			highestfd = thisfd;

		if (used >= max_to_probe)
			break;
	}

	/* release the files we opened */
	for (j = 0; j < used; j++)
		close(fd[j]);

	pfree(fd);

	if (dup_errno != 0)
	{
		/* restore errno so that %m describes the dup() failure */
		errno = dup_errno;
		insist_log(false, "dup(0) failed after %d successes: %m", used);
	}

	/*
	 * Return results.  usable_fds is just the number of successful dups. We
	 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
	 * number) and so already_open is highestfd+1 - usable_fds.
	 */
	*usable_fds = used;
	*already_open = highestfd + 1 - used;
}
/* ----------------
 *		tuple_getattr
 *
 *		Fetches attribute number <attnum> of the heap tuple <tuple>, whose
 *		row layout is described by <tupleDesc>, and returns it as a Datum.
 *
 *		A NULL attribute is not acceptable to callers of this routine; it is
 *		reported through insist_log.
 * ----------------
 */
Datum
tuple_getattr(HeapTuple tuple, TupleDesc tupleDesc, int attnum)
{
	bool		attrIsNull;
	Datum		value;

	Assert(NULL != tupleDesc);
	Assert(NULL != tuple);

	value = heap_getattr(tuple, attnum, tupleDesc, &attrIsNull);
	insist_log(!attrIsNull, "attribute cannot be null");

	return value;
}
/*
 * shareinput_clean_lk_ctxt
 *		Release everything held by a share-input lock context.
 *
 * Closes the ready/done descriptors when they are open (>= 0), unlinks the
 * ready/done lock files when the context is flagged to delete them and the
 * file name is non-empty, and finally frees the context with gp_free2.
 * A failing close or unlink is reported through insist_log.
 */
static void
shareinput_clean_lk_ctxt(ShareInput_Lk_Context *lk_ctxt)
{
	int			err;

	elog(DEBUG1, "shareinput_clean_lk_ctxt cleanup lk ctxt %p", lk_ctxt);

	if (lk_ctxt->readyfd >= 0)
	{
		err = gp_retry_close(lk_ctxt->readyfd);
		insist_log(!err, "shareinput_clean_lk_ctxt cannot close readyfd: %m");

		lk_ctxt->readyfd = -1;
	}

	if (lk_ctxt->donefd >= 0)
	{
		err = gp_retry_close(lk_ctxt->donefd);
		insist_log(!err, "shareinput_clean_lk_ctxt cannot close donefd: %m");

		lk_ctxt->donefd = -1;
	}

	if (lk_ctxt->del_ready && lk_ctxt->lkname_ready[0])
	{
		err = unlink(lk_ctxt->lkname_ready);
		insist_log(!err, "shareinput_clean_lk_ctxt cannot unlink \"%s\": %m", lk_ctxt->lkname_ready);

		lk_ctxt->del_ready = false;
	}

	if (lk_ctxt->del_done && lk_ctxt->lkname_done[0])
	{
		err = unlink(lk_ctxt->lkname_done);
		/* message previously read "unline"; keep it in step with the ready-file message */
		insist_log(!err, "shareinput_clean_lk_ctxt cannot unlink \"%s\": %m", lk_ctxt->lkname_done);

		lk_ctxt->del_done = false;
	}

	gp_free2(lk_ctxt, sizeof(ShareInput_Lk_Context));
}
/*
 * ExecWorkFile_Flush
 *		Flush a BUFFILE-backed work file via BufFileFlush.
 *
 * Any other file type is reported through insist_log as invalid.
 */
void
ExecWorkFile_Flush(ExecWorkFile *workfile)
{
	Assert(workfile != NULL);

	if (workfile->fileType == BUFFILE)
	{
		BufFileFlush((BufFile *) workfile->file);
		return;
	}

	insist_log(false, "invalid work file type: %d", workfile->fileType);
}
/*
 * Re-open a suspended file for reading. This allocates all the necessary
 * buffers and data structures to restart reading from the file.
 *
 * The work file must carry EXEC_WORKFILE_SUSPENDABLE.  Only BFZ files are
 * handled (through bfz_scan_begin); any other type is reported through
 * insist_log as invalid.
 */
void
ExecWorkFile_Restart(ExecWorkFile *workfile)
{
	Assert(workfile != NULL);
	Assert((workfile->flags & EXEC_WORKFILE_SUSPENDABLE) != 0);

	if (workfile->fileType == BFZ)
	{
		bfz_scan_begin((bfz_t *) workfile->file);
		return;
	}

	insist_log(false, "invalid work file type: %d", workfile->fileType);
}
/*
 * CheckInMemConstraintsPgAttribute
 *		Check uniqueness constraints for pg_attribute in-memory tuples upon insert.
 *
 * Among the tuples of 'relation' that belong to the same attrelid as
 * 'newTuple', reports a violation through insist_log when one of them has
 * the same attribute number or the same attribute name.
 */
static void
CheckInMemConstraintsPgAttribute(InMemHeapRelation relation, HeapTuple newTuple)
{
	Assert(NULL != newTuple);
	Assert(NULL != relation);
	Assert(NULL != relation->rel);

	TupleDesc	tupleDesc = relation->rel->rd_att;

	Oid			attrelidNew = DatumGetObjectId(tuple_getattr(newTuple, tupleDesc, Anum_pg_attribute_attrelid));
	char	   *attnameNew = DatumGetCString(tuple_getattr(newTuple, tupleDesc, Anum_pg_attribute_attname));
	AttrNumber	attnoNew = DatumGetInt16((tuple_getattr(newTuple, tupleDesc, Anum_pg_attribute_attnum)));

	for (int i = 0; i < relation->tupsize; i++)
	{
		HeapTuple	tuple = relation->tuples[i].tuple;

		Assert(NULL != tuple);

		Oid			attrelid = DatumGetObjectId(tuple_getattr(tuple, tupleDesc, Anum_pg_attribute_attrelid));
		char	   *attname = DatumGetCString(tuple_getattr(tuple, tupleDesc, Anum_pg_attribute_attname));
		AttrNumber	attno = DatumGetInt16((tuple_getattr(tuple, tupleDesc, Anum_pg_attribute_attnum)));

		if (attrelid != attrelidNew)
		{
			/* attributes belong to different relations */
			continue;
		}

		/* attrelid is an Oid (unsigned), hence %u */
		insist_log(attno != attnoNew,
				   "in-memory tuple with attrelid = %u and attno = %d already exists in pg_attribute.",
				   attrelid, attno);

		insist_log(0 != strcmp(attname, attnameNew),
				   "in-memory tuple with attrelid = %u and attname = %s already exists in pg_attribute.",
				   attrelid, attname);
	}
}
/*
 * GetRealCmax
 *		Resolve a combo command id to its cmax.
 *
 * Combo cids below usedComboCids are looked up in the local comboCids
 * array.  Anything else is only acceptable when !Gp_is_writer (checked with
 * insist_log) and is resolved through getSharedComboCidEntry().
 */
static CommandId
GetRealCmax(TransactionId xmin, CommandId combocid)
{
	if (combocid < usedComboCids)
	{
		/* known locally */
		return comboCids[combocid].cmax;
	}

	insist_log(!Gp_is_writer,
			   "writer segworker group unable to resolve visibility %u/%u",
			   combocid, usedComboCids);

	/* We're a reader */
	return getSharedComboCidEntry(xmin, combocid, CMAX);
}
/*
 * Creates a LogicalTapeSet with a generated file name.
 *
 * The name prefix is "<PG_TEMP_FILES_DIR>/slice<currentSliceId>_sort"; a
 * unique suffix is added by ExecWorkFile_AddUniqueSuffix().  A prefix that
 * could not be formatted, or that does not fit in MAXPGPATH, is reported
 * through insist_log.
 *
 * ntapes and del_on_close are passed through to LogicalTapeSetCreate_Named().
 */
LogicalTapeSet *
LogicalTapeSetCreate(int ntapes, bool del_on_close)
{
	char		tmpprefix[MAXPGPATH];
	int			len = snprintf(tmpprefix, MAXPGPATH, "%s/slice%d_sort",
							   PG_TEMP_FILES_DIR,
							   currentSliceId);

	/* snprintf reports failure with a negative value; reject that as well as truncation */
	insist_log(len >= 0 && len <= MAXPGPATH - 1, "could not generate temporary file name");

	StringInfo	uniquename = ExecWorkFile_AddUniqueSuffix(tmpprefix);

	LogicalTapeSet *lts = LogicalTapeSetCreate_Named(uniquename->data, ntapes, del_on_close);

	/* the unique name is no longer referenced here once the tape set is created */
	pfree(uniquename->data);
	pfree(uniquename);

	return lts;
}
/*
 * Suspend a file without closing it. For bfz, which allocates a buffer for
 * each open a file, this frees up that buffer but keeps the fd so we can
 * re-open this file later.
 *
 * The work file must carry EXEC_WORKFILE_SUSPENDABLE.  Only BFZ files are
 * handled; any other type is reported through insist_log as invalid.
 *
 * Returns the size reported by bfz_append_end(), or -1 if the file type was
 * not handled.
 */
int64
ExecWorkFile_Suspend(ExecWorkFile *workfile)
{
	int64		diskSize = -1;

	Assert(workfile != NULL);
	Assert((workfile->flags & EXEC_WORKFILE_SUSPENDABLE) != 0);

	if (workfile->fileType == BFZ)
	{
		diskSize = bfz_append_end((bfz_t *) workfile->file);
		ExecWorkFile_AdjustBFZSize(workfile, diskSize);
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	return diskSize;
}
/*
 * ExecWorkFile_ReadFromBuffer
 *
 * This function provides a faster implementation of Read which applies
 * when the data is already in the underlying buffer.
 * In that case, it returns a pointer to the data in the buffer.
 * If the data is not in the buffer, returns NULL and the caller must
 * call the regular ExecWorkFile_Read with a destination buffer.
 *
 * Currently only bfz supports this behavior (via bfz_scan_peek); any other
 * file type is reported through insist_log as invalid.
 */
void *
ExecWorkFile_ReadFromBuffer(ExecWorkFile *workfile, uint64 size)
{
	void	   *buffered = NULL;

	Assert(workfile != NULL);

	if (workfile->fileType == BFZ)
	{
		buffered = bfz_scan_peek((bfz_t *) workfile->file, size);
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	return buffered;
}
/*
 * For a new workfile, sets the capabilities flags according to
 * the known underlying file type capabilities and the method the file was created.
 *
 *	- BUFFILE files get EXEC_WORKFILE_RANDOM_ACCESS, BFZ files get
 *	  EXEC_WORKFILE_SUSPENDABLE; other types are reported as invalid.
 *	- delOnClose adds EXEC_WORKFILE_DEL_ON_CLOSE.
 *	- created adds EXEC_WORKFILE_CREATED.
 *	- EXEC_WORKFILE_LIMIT_SIZE is added when either workfile limit GUC is > 0.
 */
static void
ExecWorkFile_SetFlags(ExecWorkFile *workfile, bool delOnClose, bool created)
{
	bool		sizeLimited;

	Assert(workfile != NULL);
	/* Assert that only the creator of a file can delete it on close */
	AssertImply(delOnClose, created);

	/* capabilities implied by the underlying file type */
	if (workfile->fileType == BUFFILE)
	{
		workfile->flags |= EXEC_WORKFILE_RANDOM_ACCESS;
	}
	else if (workfile->fileType == BFZ)
	{
		workfile->flags |= EXEC_WORKFILE_SUSPENDABLE;
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	if (delOnClose)
	{
		workfile->flags |= EXEC_WORKFILE_DEL_ON_CLOSE;
	}

	if (!created)
	{
		elog(gp_workfile_caching_loglevel, "Opened existing workfile %s, delOnClose = %d",
			 ExecWorkFile_GetFileName(workfile), delOnClose);
	}
	else
	{
		workfile->flags |= EXEC_WORKFILE_CREATED;
		elog(gp_workfile_caching_loglevel, "Created workfile %s, delOnClose = %d",
			 ExecWorkFile_GetFileName(workfile), delOnClose);
	}

	sizeLimited = (gp_workfile_limit_per_query > 0) || (gp_workfile_limit_per_segment > 0);
	if (sizeLimited)
	{
		workfile->flags |= EXEC_WORKFILE_LIMIT_SIZE;
	}
}
/*
 * Open a temporary file that will (optionally) disappear when we close it.
 *
 * If 'makenameunique' is true, this function generates a file name which
 * should be unique to this particular OpenTemporaryFile() request and
 * distinct from any others in concurrent use on the same host.  As a
 * convenience for monitoring and debugging, the given 'fileName' string
 * and 'extentseqnum' are embedded in the file name.
 *
 * If 'makenameunique' is false, then 'fileName' and 'extentseqnum' identify a
 * new or existing temporary file which other processes also could open and
 * share.
 *
 * If 'create' is true, a new file is created.  If successful, a valid vfd
 * index (>0) is returned; otherwise an error is thrown.
 *
 * If 'create' is false, an existing file is opened.  If successful, a valid
 * vfd index (>0) is returned.  If the file does not exist or cannot be
 * opened, an invalid vfd index (<= 0) is returned.
 *
 * If 'delOnClose' is true, then the file is removed when you call
 * FileClose(); or when the process exits; or (provided 'closeAtEOXact' is
 * true) when the transaction ends.
 *
 * If 'closeAtEOXact' is true, the vfd is closed automatically at end of
 * transaction unless you have called FileClose() to close it before then.
 * If 'closeAtEOXact' is false, the vfd state is not changed at end of
 * transaction.
 *
 * In most cases, you don't want temporary files to outlive the transaction
 * that created them, so you should specify 'true' for both 'delOnClose' and
 * 'closeAtEOXact'.
 *
 * A prefix or final path that cannot be formatted, or that does not fit in
 * MAXPGPATH, is reported through insist_log rather than being used in
 * truncated form.
 */
File
OpenTemporaryFile(const char *fileName,
				  int extentseqnum,
				  bool makenameunique,
				  bool create,
				  bool delOnClose,
				  bool closeAtEOXact)
{
	char		tempfilepath[MAXPGPATH];
	int			pathlen;

	Assert(fileName);
	AssertImply(makenameunique, create && delOnClose);

	char		tempfileprefix[MAXPGPATH];

	int			len = GetTempFilePrefix(tempfileprefix, MAXPGPATH, fileName);

	insist_log(len >= 0 && len <= MAXPGPATH - 1, "could not generate temporary file name");

	if (makenameunique)
	{
		/*
		 * Generate a tempfile name that should be unique within the current
		 * database instance.
		 */
		pathlen = snprintf(tempfilepath, sizeof(tempfilepath),
						   "%s_%d_%04d.%ld",
						   tempfileprefix,
						   MyProcPid,
						   extentseqnum,
						   tempFileCounter++);
	}
	else
	{
		pathlen = snprintf(tempfilepath, sizeof(tempfilepath),
						   "%s.%04d",
						   tempfileprefix,
						   extentseqnum);
	}

	/*
	 * The suffix is added to a prefix that may already be close to
	 * MAXPGPATH; a truncated path would silently name a different file.
	 */
	insist_log(pathlen >= 0 && pathlen < (int) sizeof(tempfilepath),
			   "could not generate temporary file name");

	return OpenNamedFile(tempfilepath, create, delOnClose, closeAtEOXact);
}								/* OpenTemporaryFile */
/*
 * Save the serialized plan to a file in the workfile set.
 * It will be used to do full plan matching before reusing.
 *
 * The plan file is created with workfile_mgr_create_fileno() using
 * WORKFILE_NUM_ALL_PLAN, filled with sf_plan->serialized_plan, and closed
 * again.  A failed write is handed to workfile_mgr_report_error().
 */
static void
workfile_mgr_save_plan(workfile_set *work_set, workfile_set_plan *sf_plan)
{
	ExecWorkFile *planFile;

	Assert(work_set);
	Assert(sf_plan);

	planFile = workfile_mgr_create_fileno(work_set, WORKFILE_NUM_ALL_PLAN);
	insist_log(planFile != NULL, "Could not create temporary work file: %m");

	elog(gp_workfile_caching_loglevel, "Saving query plan to file %s",
		 ExecWorkFile_GetFileName(planFile));

	if (!ExecWorkFile_Write(planFile, sf_plan->serialized_plan,
							sf_plan->serialized_plan_len))
	{
		workfile_mgr_report_error();
	}

	workfile_mgr_close_file(work_set, planFile);
}
/*
 * CheckInMemConstraintsGpDistributionPolicy
 *		Check uniqueness constraints for gp_distribution_policy in-memory tuples upon insert.
 *
 * Reports a violation through insist_log when a tuple already held by
 * 'relation' has the same localoid as 'newTuple'.
 */
static void
CheckInMemConstraintsGpDistributionPolicy(InMemHeapRelation relation, HeapTuple newTuple)
{
	Assert(NULL != newTuple);
	Assert(NULL != relation);
	Assert(NULL != relation->rel);

	TupleDesc	tupleDesc = relation->rel->rd_att;

	Oid			reloidNew = DatumGetObjectId(tuple_getattr(newTuple, tupleDesc, Anum_gp_policy_localoid));

	for (int i = 0; i < relation->tupsize; i++)
	{
		HeapTuple	tuple = relation->tuples[i].tuple;

		Assert(NULL != tuple);

		Oid			reloid = DatumGetObjectId(tuple_getattr(tuple, tupleDesc, Anum_gp_policy_localoid));

		/* Oids are unsigned, so print them with %u rather than %d */
		insist_log(reloidNew != reloid,
				   "in-memory tuple with localoid = %u already exists in gp_distribution_policy.",
				   reloid);
	}
}
/*
 * ExecWorkFile_Tell64
 *		Return the value of the current file position indicator.
 *
 * BUFFILE files are asked through BufFileTell(), BFZ files through
 * bfz_totalbytes().  Any other type is reported through insist_log as
 * invalid, in which case 0 is returned.
 */
uint64
ExecWorkFile_Tell64(ExecWorkFile *workfile)
{
	uint64		position = 0;

	Assert(workfile != NULL);

	if (workfile->fileType == BUFFILE)
	{
		BufFileTell((BufFile *) workfile->file, (int64 *) &position);
	}
	else if (workfile->fileType == BFZ)
	{
		position = bfz_totalbytes((bfz_t *) workfile->file);
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	return position;
}
/*
 * CheckInMemConstraintsPgExttable
 *		Check uniqueness constraints for pg_exttable in-memory tuples upon insert.
 *
 * Reports a violation through insist_log when a tuple already held by
 * 'relation' has the same reloid as 'newTuple'.
 */
static void
CheckInMemConstraintsPgExttable(InMemHeapRelation relation, HeapTuple newTuple)
{
	Assert(NULL != newTuple);
	Assert(NULL != relation);
	Assert(NULL != relation->rel);

	TupleDesc	tupleDesc = relation->rel->rd_att;

	Oid			reloidNew = DatumGetObjectId(tuple_getattr(newTuple, tupleDesc, Anum_pg_exttable_reloid));

	for (int i = 0; i < relation->tupsize; i++)
	{
		HeapTuple	tuple = relation->tuples[i].tuple;

		Assert(NULL != tuple);

		Oid			reloid = DatumGetObjectId(tuple_getattr(tuple, tupleDesc, Anum_pg_exttable_reloid));

		/* Oids are unsigned, so print them with %u rather than %d */
		insist_log(reloidNew != reloid,
				   "in-memory tuple with reloid = %u already exists in pg_exttable.",
				   reloid);
	}
}
/*
 * ExecWorkFile_Read
 *		Read 'size' bytes into the given buffer.
 *
 * The given buffer should contain at least the space specified by 'size'.
 *
 * Returns whatever BufFileRead() (BUFFILE) or bfz_scan_next() (BFZ) reports
 * as the number of bytes read.  Any other file type is reported through
 * insist_log as invalid, in which case 0 is returned.
 */
uint64
ExecWorkFile_Read(ExecWorkFile *workfile,
				  void *data,
				  uint64 size)
{
	uint64		nread = 0;

	Assert(workfile != NULL);

	if (workfile->fileType == BUFFILE)
	{
		nread = BufFileRead((BufFile *) workfile->file, data, size);
	}
	else if (workfile->fileType == BFZ)
	{
		nread = bfz_scan_next((bfz_t *) workfile->file, data, size);
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	return nread;
}
/*
 * ExecWorkFile_Close
 *		Close the work file, and release the space.
 *
 * A BFZ file that is still in append mode is first finished with
 * bfz_append_end() and its recorded size adjusted.  After the underlying
 * file is closed, the ExecWorkFile's name and the structure itself are
 * freed, so 'workfile' must not be used afterwards.
 *
 * Returns the value of ExecWorkFile_GetSize() taken just before freeing.
 */
int64
ExecWorkFile_Close(ExecWorkFile *workfile)
{
	int64		finalSize;

	Assert(workfile != NULL);

	if (workfile->fileType == BUFFILE)
	{
		BufFileClose((BufFile *) workfile->file);
	}
	else if (workfile->fileType == BFZ)
	{
		bfz_t	   *bfz = (bfz_t *) workfile->file;

		Assert(bfz != NULL);

		if (bfz->mode == BFZ_MODE_APPEND)
		{
			/* Flush data out to disk if we were writing */
			int64		appendedSize = bfz_append_end(bfz);

			/* Adjust the size with WorkfileDiskspace to our actual size */
			ExecWorkFile_AdjustBFZSize(workfile, appendedSize);
		}

		bfz_close(bfz, true, true);
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	/* read the size before the structure goes away */
	finalSize = ExecWorkFile_GetSize(workfile);

	pfree(workfile->fileName);
	pfree(workfile);

	return finalSize;
}
/* * SaveMemoryBufToDisk * Saves the memory account information in a file. The file name is auto * generated using gp_session_id, gp_command_count and the passed time stamp * * memoryBuf: The buffer where the momory tree is serialized in (typically) csv form. * prefix: A file name prefix that can be used to uniquely identify the file's content */ static void SaveMemoryBufToDisk(struct StringInfoData *memoryBuf, char *prefix) { char fileName[MEMORY_REPORT_FILE_NAME_LENGTH]; Assert((strlen("pg_log/") + strlen("memory_") + strlen(prefix) + strlen(".mem")) < MEMORY_REPORT_FILE_NAME_LENGTH); snprintf(fileName, MEMORY_REPORT_FILE_NAME_LENGTH, "%s/memory_%s.mem", "pg_log", prefix); FILE *file = fopen(fileName, "w"); if (file == NULL) { elog(ERROR, "Could not write memory usage information. Failed to open file: %s", fileName); } uint64 bytes = fwrite(memoryBuf->data, 1, memoryBuf->len, file); if (bytes != memoryBuf->len) { insist_log(false, "Could not write memory usage information. Attempted to write %d", memoryBuf->len); } fclose(file); }
/*
 * ExecWorkFile_Rewind
 *		Rewind the pointer position to the beginning of the file.
 *
 * For BUFFILE this seeks to offset 0 and returns whether BufFileSeek()
 * reported success.  For BFZ the append phase is ended, the recorded size
 * adjusted, and a scan started; true is returned.  Any other type is
 * reported through insist_log as invalid.
 */
bool
ExecWorkFile_Rewind(ExecWorkFile *workfile)
{
	Assert(workfile != NULL);

	if (workfile->fileType == BUFFILE)
	{
		long		seekStatus = BufFileSeek((BufFile *) workfile->file, 0L /* offset */ , SEEK_SET);

		/* BufFileSeek returns 0 if everything went OK */
		return (0 == seekStatus);
	}

	if (workfile->fileType == BFZ)
	{
		int64		appendedSize = bfz_append_end((bfz_t *) workfile->file);

		ExecWorkFile_AdjustBFZSize(workfile, appendedSize);
		bfz_scan_begin((bfz_t *) workfile->file);
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	return true;
}
/* ----------------------------------------------------------------
 *		ExecInitMaterial
 *
 *		Builds and returns the MaterialState for a Material plan node:
 *		sets up the tuple store bookkeeping and tuple slots, initializes
 *		the child plan with suitably reduced eflags, and registers the
 *		node as a share-input producer when node->share_type says so.
 * ----------------------------------------------------------------
 */
MaterialState *
ExecInitMaterial(Material *node, EState *estate, int eflags)
{
	MaterialState *matstate;
	Plan	   *childPlan;

	/*
	 * Remember, before eflags is trimmed below, whether the caller asked for
	 * rewind, backward scan or mark/restore.
	 */
	bool		replayRequested =
		(eflags & (EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)) != 0;

	/*
	 * create state structure
	 */
	matstate = makeNode(MaterialState);
	matstate->ss.ps.plan = (Plan *) node;
	matstate->ss.ps.state = estate;

	/*
	 * We must have random access to the subplan output to do backward scan or
	 * mark/restore.  We also prefer to materialize the subplan output if we
	 * might be called on to rewind and replay it many times.  However, if none
	 * of these cases apply, we can skip storing the data.
	 */
	matstate->randomAccess = node->cdb_strict || replayRequested;

	matstate->eof_underlying = false;
	matstate->ts_state = palloc0(sizeof(GenericTupStore));
	matstate->ts_pos = NULL;
	matstate->ts_markpos = NULL;
	matstate->share_lk_ctxt = NULL;
	matstate->ts_destroyed = false;
	ExecMaterialResetWorkfileState(matstate);

	/*
	 * Miscellaneous initialization
	 *
	 * Materialization nodes don't need ExprContexts because they never call
	 * ExecQual or ExecProject.
	 */

#define MATERIAL_NSLOTS 2

	/*
	 * tuple table initialization
	 *
	 * material nodes only return tuples from their materialized relation.
	 */
	ExecInitResultTupleSlot(estate, &matstate->ss.ps);
	matstate->ss.ss_ScanTupleSlot = ExecInitExtraTupleSlot(estate);

	/*
	 * If eflag contains EXEC_FLAG_REWIND or EXEC_FLAG_BACKWARD or
	 * EXEC_FLAG_MARK, then this node is not eager free safe.
	 */
	matstate->ss.ps.delayEagerFree = replayRequested;

	/*
	 * initialize child nodes
	 *
	 * We shield the child node from the need to support BACKWARD, or
	 * MARK/RESTORE.
	 */
	eflags &= ~(EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK);

	/*
	 * If Materialize does not have any external parameters, then it can
	 * shield the child node from being rescanned as well, hence we can clear
	 * the EXEC_FLAG_REWIND as well.  If there are parameters, don't clear the
	 * REWIND flag, as the child will be rewound.
	 */
	if (node->plan.allParam == NULL || node->plan.extParam == NULL)
	{
		eflags &= ~EXEC_FLAG_REWIND;
	}

	childPlan = outerPlan(node);

	/*
	 * A very basic check to see if the optimizer requires the material to do
	 * a projection.  Ideally, this check would recursively compare all the
	 * target list expressions.  However, such a check is tricky because of
	 * the varno mismatch (outer plan may have a varno that index into range
	 * table, while the material may refer to the same relation as "outer"
	 * varno) [JIRA: MPP-25365]
	 */
	insist_log(list_length(node->plan.targetlist) == list_length(childPlan->targetlist),
			   "Material operator does not support projection");

	outerPlanState(matstate) = ExecInitNode(childPlan, estate, eflags);

	/*
	 * If the child node of a Material is a Motion, then this Material node is
	 * not eager free safe.
	 */
	if (IsA(childPlan, Motion))
	{
		matstate->ss.ps.delayEagerFree = true;
	}

	/*
	 * initialize tuple type.  no need to initialize projection info because
	 * this node doesn't do projections.
	 */
	ExecAssignResultTypeFromTL(&matstate->ss.ps);
	ExecAssignScanTypeFromOuterPlan(&matstate->ss);
	matstate->ss.ps.ps_ProjInfo = NULL;

	/*
	 * If share input, need to register with range table entry
	 */
	if (node->share_type != SHARE_NOTSHARED)
	{
		ShareNodeEntry *shareEntry = ExecGetShareNodeEntry(estate, node->share_id, true);

		shareEntry->sharePlan = (Node *) node;
		shareEntry->shareState = (Node *) matstate;
	}

	initGpmonPktForMaterial((Plan *) node, &matstate->ss.ps.gpmon_pkt, estate);

	return matstate;
}
/*
 * Reads the GP catalog tables and build a CdbComponentDatabases structure.
 * It then converts this to a Gang structure and initializes all the non-connection related fields.
 *
 * The gang and its descriptors are allocated in a new "Per Gang Context"
 * created as a child of GangContext; on return the current memory context
 * is GangContext again.
 *
 * For primary reader/writer gangs, one descriptor is filled in per active
 * primary segment.  If the number of active primaries differs from 'size',
 * FtsReConfigureMPP() is called and an ERROR is raised; descriptors are
 * never written beyond the 'size' entries that were allocated.
 *
 * Call this function in GangContext.
 * Returns a not-null pointer.
 */
Gang *
buildGangDefinition(GangType type, int gang_id, int size, int content)
{
	Gang	   *newGangDefinition = NULL;
	CdbComponentDatabaseInfo *cdbinfo = NULL;
	CdbComponentDatabaseInfo *cdbInfoCopy = NULL;
	SegmentDatabaseDescriptor *segdbDesc = NULL;
	MemoryContext perGangContext = NULL;

	int			segCount = 0;
	int			i = 0;

	ELOG_DISPATCHER_DEBUG("buildGangDefinition:Starting %d qExec processes for %s gang",
						  size, gangTypeToString(type));

	Assert(CurrentMemoryContext == GangContext);
	Assert(size == 1 || size == getgpsegmentCount());

	/* read gp_segment_configuration and build CdbComponentDatabases */
	cdb_component_dbs = getComponentDatabases();

	if (cdb_component_dbs == NULL ||
		cdb_component_dbs->total_segments <= 0 ||
		cdb_component_dbs->total_segment_dbs <= 0)
		insist_log(false, "schema not populated while building segworker group");

	/* if mirroring is not configured */
	if (cdb_component_dbs->total_segment_dbs == cdb_component_dbs->total_segments)
	{
		ELOG_DISPATCHER_DEBUG("building Gang: mirroring not configured");
		disableFTS();
	}

	perGangContext = AllocSetContextCreate(GangContext, "Per Gang Context",
										   ALLOCSET_DEFAULT_MINSIZE,
										   ALLOCSET_DEFAULT_INITSIZE,
										   ALLOCSET_DEFAULT_MAXSIZE);
	Assert(perGangContext != NULL);
	MemoryContextSwitchTo(perGangContext);

	/* allocate a gang */
	newGangDefinition = (Gang *) palloc0(sizeof(Gang));
	newGangDefinition->type = type;
	newGangDefinition->size = size;
	newGangDefinition->gang_id = gang_id;
	newGangDefinition->allocated = false;
	newGangDefinition->noReuse = false;
	newGangDefinition->dispatcherActive = false;
	newGangDefinition->portal_name = NULL;
	newGangDefinition->perGangContext = perGangContext;
	newGangDefinition->db_descriptors =
		(SegmentDatabaseDescriptor *) palloc0(size * sizeof(SegmentDatabaseDescriptor));

	/* initialize db_descriptors */
	switch (type)
	{
		case GANGTYPE_ENTRYDB_READER:
			cdbinfo = &cdb_component_dbs->entry_db_info[0];
			cdbInfoCopy = copyCdbComponentDatabaseInfo(cdbinfo);
			segdbDesc = &newGangDefinition->db_descriptors[0];
			cdbconn_initSegmentDescriptor(segdbDesc, cdbInfoCopy);
			setQEIdentifier(segdbDesc, -1, perGangContext);
			break;

		case GANGTYPE_SINGLETON_READER:
			cdbinfo = findDatabaseInfoBySegIndex(cdb_component_dbs, content);
			cdbInfoCopy = copyCdbComponentDatabaseInfo(cdbinfo);
			segdbDesc = &newGangDefinition->db_descriptors[0];
			cdbconn_initSegmentDescriptor(segdbDesc, cdbInfoCopy);
			setQEIdentifier(segdbDesc, -1, perGangContext);
			break;

		case GANGTYPE_PRIMARY_READER:
		case GANGTYPE_PRIMARY_WRITER:

			/*
			 * We loop through the segment_db_info.  Each item has a segindex.
			 * They are sorted by segindex, and there can be > 1
			 * segment_db_info for a given segindex (currently, there can be 1
			 * or 2)
			 */
			for (i = 0; i < cdb_component_dbs->total_segment_dbs; i++)
			{
				cdbinfo = &cdb_component_dbs->segment_db_info[i];
				if (!SEGMENT_IS_ACTIVE_PRIMARY(cdbinfo))
					continue;

				/*
				 * db_descriptors only has room for 'size' entries.  Keep
				 * counting past that point so the mismatch is reported
				 * below, but never write beyond the end of the array.
				 */
				if (segCount < size)
				{
					segdbDesc = &newGangDefinition->db_descriptors[segCount];
					cdbInfoCopy = copyCdbComponentDatabaseInfo(cdbinfo);
					cdbconn_initSegmentDescriptor(segdbDesc, cdbInfoCopy);
					setQEIdentifier(segdbDesc, -1, perGangContext);
				}
				segCount++;
			}

			if (size != segCount)
			{
				FtsReConfigureMPP(false);
				elog(ERROR, "Not all primary segment instances are active and connected");
			}
			break;

		default:
			Assert(false);
	}

	ELOG_DISPATCHER_DEBUG("buildGangDefinition done");
	MemoryContextSwitchTo(GangContext);
	return newGangDefinition;
}
/* * Create a writer gang. */ Gang * AllocateWriterGang() { Gang *writerGang = NULL; MemoryContext oldContext = NULL; int i = 0; ELOG_DISPATCHER_DEBUG("AllocateWriterGang begin."); if (Gp_role != GP_ROLE_DISPATCH) { elog(FATAL, "dispatch process called with role %d", Gp_role); } /* * First, we look for an unallocated but created gang of the right type * if it exists, we return it. * Else, we create a new gang */ if (primaryWriterGang == NULL) { int nsegdb = getgpsegmentCount(); insist_log(IsTransactionOrTransactionBlock(), "cannot allocate segworker group outside of transaction"); if (GangContext == NULL) { GangContext = AllocSetContextCreate(TopMemoryContext, "Gang Context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); } Assert(GangContext != NULL); oldContext = MemoryContextSwitchTo(GangContext); writerGang = createGang(GANGTYPE_PRIMARY_WRITER, PRIMARY_WRITER_GANG_ID, nsegdb, -1); writerGang->allocated = true; /* * set "whoami" for utility statement. * non-utility statement will overwrite it in function getCdbProcessList. */ for(i = 0; i < writerGang->size; i++) setQEIdentifier(&writerGang->db_descriptors[i], -1, writerGang->perGangContext); MemoryContextSwitchTo(oldContext); } else { ELOG_DISPATCHER_DEBUG("Reusing an existing primary writer gang"); writerGang = primaryWriterGang; } /* sanity check the gang */ if (!GangOK(writerGang)) elog(ERROR, "could not connect to segment: initialization of segworker group failed"); ELOG_DISPATCHER_DEBUG("AllocateWriterGang end."); primaryWriterGang = writerGang; return writerGang; }
/*
 * Create a reader gang.
 *
 * @type can be GANGTYPE_ENTRYDB_READER, GANGTYPE_SINGLETON_READER or GANGTYPE_PRIMARY_READER.
 *
 * An available gang of the requested type/size/content is reused when
 * getAvailableGang() finds one; otherwise a new gang is created.  The gang
 * is tagged with a copy of 'portal_name' (or NULL), sanity-checked with
 * GangOK(), and added to the allocated list before being returned.
 *
 * Only valid in the dispatcher role and inside a transaction.
 */
Gang *
AllocateReaderGang(GangType type, char *portal_name)
{
	MemoryContext callerContext = NULL;
	Gang	   *gang = NULL;
	int			gangSize = 0;
	int			segContent = 0;

	ELOG_DISPATCHER_DEBUG("AllocateReaderGang for portal %s: allocatedReaderGangsN %d, availableReaderGangsN %d, "
						  "allocatedReaderGangs1 %d, availableReaderGangs1 %d",
						  (portal_name ? portal_name : "<unnamed>"),
						  list_length(allocatedReaderGangsN),
						  list_length(availableReaderGangsN),
						  list_length(allocatedReaderGangs1),
						  list_length(availableReaderGangs1));

	if (Gp_role != GP_ROLE_DISPATCH)
	{
		elog(FATAL, "dispatch process called with role %d", Gp_role);
	}

	insist_log(IsTransactionOrTransactionBlock(),
			   "cannot allocate segworker group outside of transaction");

	if (GangContext == NULL)
	{
		GangContext = AllocSetContextCreate(TopMemoryContext,
											"Gang Context",
											ALLOCSET_DEFAULT_MINSIZE,
											ALLOCSET_DEFAULT_INITSIZE,
											ALLOCSET_DEFAULT_MAXSIZE);
	}
	Assert(GangContext != NULL);
	callerContext = MemoryContextSwitchTo(GangContext);

	/* work out how many segments the gang spans and which content it targets */
	switch (type)
	{
		case GANGTYPE_ENTRYDB_READER:
			gangSize = 1;
			segContent = -1;
			break;

		case GANGTYPE_SINGLETON_READER:
			gangSize = 1;
			segContent = gp_singleton_segindex;
			break;

		case GANGTYPE_PRIMARY_READER:
			gangSize = getgpsegmentCount();
			segContent = 0;
			break;

		default:
			Assert(false);
	}

	/*
	 * First, we look for an unallocated but created gang of the right type if
	 * it exists, we return it.  Else, we create a new gang
	 */
	gang = getAvailableGang(type, gangSize, segContent);
	if (gang == NULL)
	{
		ELOG_DISPATCHER_DEBUG("Creating a new reader size %d gang for %s",
							  gangSize, (portal_name ? portal_name : "unnamed portal"));

		gang = createGang(type, gang_id_counter++, gangSize, segContent);
		gang->allocated = true;
	}

	/*
	 * make sure no memory is still allocated for previous portal name that
	 * this gang belonged to
	 */
	if (gang->portal_name)
	{
		pfree(gang->portal_name);
	}

	/* let the gang know which portal it is being assigned to */
	if (portal_name)
	{
		gang->portal_name = pstrdup(portal_name);
	}
	else
	{
		gang->portal_name = (char *) NULL;
	}

	/* sanity check the gang */
	insist_log(GangOK(gang), "could not connect to segment: initialization of segworker group failed");

	addGangToAllocated(gang);

	MemoryContextSwitchTo(callerContext);

	ELOG_DISPATCHER_DEBUG("on return: allocatedReaderGangs %d, availableReaderGangsN %d, "
						  "allocatedReaderGangs1 %d, availableReaderGangs1 %d",
						  list_length(allocatedReaderGangsN),
						  list_length(availableReaderGangsN),
						  list_length(allocatedReaderGangs1),
						  list_length(availableReaderGangs1));

	return gang;
}
/*
 * ExecWorkFile_Seek
 *		Result is 0 if OK, EOF if not.  Logical position is not moved if an
 *		impossible seek is attempted.
 *
 * The work file must carry EXEC_WORKFILE_RANDOM_ACCESS.  'whence' may be
 * SEEK_SET or SEEK_CUR; anything else is logged and yields EOF.
 *
 * A seek past the recorded size is only allowed on files flagged
 * EXEC_WORKFILE_CREATED, and only if the extra disk space can be reserved;
 * otherwise EOF is returned without seeking.  After such a seek the
 * reservation is committed and the in-progress size updated.
 */
int
ExecWorkFile_Seek(ExecWorkFile *workfile, uint64 offset, int whence)
{
	int			seekResult = 0;
	uint64		target;
	int64		additional_size = 0;

	Assert(workfile != NULL);
	Assert((workfile->flags & EXEC_WORKFILE_RANDOM_ACCESS) != 0);

	/* Translate (offset, whence) into an absolute position */
	switch (whence)
	{
		case SEEK_SET:
			target = offset;
			break;

		case SEEK_CUR:
			target = ExecWorkFile_Tell64(workfile) + offset;
			break;

		default:
			elog(LOG, "invalid whence: %d", whence);
			Assert(false);
			return EOF;
	}

	/* Determine if this seeks beyond EOF */
	if (target > workfile->size)
	{
		additional_size = target - workfile->size;
	}

	/* Reserve disk space if needed */
	if (additional_size > 0)
	{
		/*
		 * We only allow seeking beyond EOF for files opened for writing (i.e.
		 * files we created)
		 */
		if (!(workfile->flags & EXEC_WORKFILE_CREATED))
		{
			return EOF;
		}

		if (!WorkfileDiskspace_Reserve(additional_size))
		{
			/* Failed to reserve additional disk space, notify caller */
			return EOF;
		}
	}

	/* Do the actual seek */
	if (workfile->fileType == BUFFILE)
	{
		seekResult = BufFileSeek((BufFile *) workfile->file, offset, whence);
		if (additional_size > 0)
		{
			workfile->size = BufFileGetSize((BufFile *) workfile->file);
		}
	}
	else
	{
		insist_log(false, "invalid work file type: %d", workfile->fileType);
	}

	if (additional_size > 0)
	{
		WorkfileDiskspace_Commit(additional_size, additional_size, true /* update_query_size */ );
		workfile_update_in_progress_size(workfile, additional_size);
	}

	return seekResult;
}