/*
 * tuplestore_restorepos_pos - move the read position described by *pos
 * back to the position saved as its mark.
 *
 * Asserts that state->eflags includes EXEC_FLAG_MARK.  Raises ERROR if the
 * temp file cannot be repositioned or the store is in an unknown state.
 */
void
tuplestore_restorepos_pos(Tuplestorestate *state, TuplestorePos *pos)
{
	Assert(state->eflags & EXEC_FLAG_MARK);

	if (state->status == TSS_INMEM)
	{
		/* in memory: the mark is an index into the tuple array */
		pos->eof_reached = false;
		pos->current = pos->markpos_current;
	}
	else if (state->status == TSS_WRITEFILE)
	{
		/* writing: only the remembered read offset has to change */
		pos->eof_reached = false;
		pos->readpos_offset = pos->markpos_offset;
	}
	else if (state->status == TSS_READFILE)
	{
		/* reading: the temp file itself must be repositioned */
		pos->eof_reached = false;
		if (BufFileSeek(state->myfile, pos->markpos_offset, SEEK_SET) != 0)
			elog(ERROR, "tuplestore_restorepos failed");
	}
	else
		elog(ERROR, "invalid tuplestore state");
}
/*
 * tuplestore_rescan - put the active read pointer back at the first tuple
 *
 * Asserts that the active read pointer carries EXEC_FLAG_REWIND and that
 * the store has not been truncated.
 */
void
tuplestore_rescan(Tuplestorestate *state)
{
	TSReadPointer *active = &state->readptrs[state->activeptr];

	Assert(active->eflags & EXEC_FLAG_REWIND);
	Assert(!state->truncated);

	if (state->status == TSS_INMEM)
	{
		/* in memory: restart at array slot zero */
		active->eof_reached = false;
		active->current = 0;
	}
	else if (state->status == TSS_WRITEFILE)
	{
		/* writing: just remember that reading resumes at file start */
		active->eof_reached = false;
		active->file = 0;
		active->offset = 0L;
	}
	else if (state->status == TSS_READFILE)
	{
		/* reading: physically seek the temp file to its beginning */
		active->eof_reached = false;
		if (BufFileSeek(state->myfile, 0, 0L, SEEK_SET) != 0)
			elog(ERROR, "tuplestore seek to start failed");
	}
	else
		elog(ERROR, "invalid tuplestore state");
}
/*
 * tuplestore_rescan_pos - reset the read position in *pos to the start of
 * the stored tuple sequence.
 *
 * Asserts that state->eflags includes EXEC_FLAG_REWIND.
 */
void
tuplestore_rescan_pos(Tuplestorestate *state, TuplestorePos *pos)
{
	Assert(state->eflags & EXEC_FLAG_REWIND);

	if (state->status == TSS_INMEM)
	{
		/* in memory: restart at array slot zero */
		pos->eof_reached = false;
		pos->current = 0;
	}
	else if (state->status == TSS_WRITEFILE)
	{
		/* writing: only the saved read offset is reset */
		pos->eof_reached = false;
		pos->readpos_offset = 0L;
	}
	else if (state->status == TSS_READFILE)
	{
		/* reading: seek the temp file back to offset zero */
		pos->eof_reached = false;
		if (BufFileSeek(state->myfile, 0L /* offset */, SEEK_SET) != 0)
			elog(ERROR, "seek to start failed");
	}
	else
		elog(ERROR, "invalid tuplestore state");
}
/*
 * tuplestore_rescan - put the active read pointer back at the first tuple
 *
 * Asserts that the active read pointer carries EXEC_FLAG_REWIND and that
 * the store has not been truncated.  A failed seek on the temp file is
 * reported through ereport() with a file-access error code.
 */
void
tuplestore_rescan(Tuplestorestate *state)
{
	TSReadPointer *cur = &state->readptrs[state->activeptr];

	Assert(cur->eflags & EXEC_FLAG_REWIND);
	Assert(!state->truncated);

	if (state->status == TSS_INMEM)
	{
		/* in memory: restart at array slot zero */
		cur->eof_reached = false;
		cur->current = 0;
	}
	else if (state->status == TSS_WRITEFILE)
	{
		/* writing: just remember that reading resumes at file start */
		cur->eof_reached = false;
		cur->file = 0;
		cur->offset = 0L;
	}
	else if (state->status == TSS_READFILE)
	{
		/* reading: physically seek the temp file to its beginning */
		cur->eof_reached = false;
		if (BufFileSeek(state->myfile, 0, 0L, SEEK_SET) != 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not seek in tuplestore temporary file: %m")));
	}
	else
		elog(ERROR, "invalid tuplestore state");
}
/*
 * BufFileSeekBlock --- seek to the start of a given block
 *
 * Translates the block number into a (segment, byte offset) pair using
 * BUFFILE_SEG_SIZE blocks per segment and BLCKSZ bytes per block, then
 * performs an absolute seek there.  Callers whose files exceed
 * BLCKSZ * LONG_MAX bytes cannot be served by this interface.
 *
 * Returns whatever BufFileSeek returns: 0 if OK, EOF if not.  The logical
 * position is not moved if an impossible seek is attempted.
 */
int
BufFileSeekBlock(BufFile *file, long blknum)
{
	int			segno = (int) (blknum / BUFFILE_SEG_SIZE);
	off_t		segoff = (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ;

	return BufFileSeek(file, segno, segoff, SEEK_SET);
}
/*
 * BufFileSeekBlock --- seek to the start of a given block
 *
 * Translates the block number into a (segment, byte offset) pair using
 * RELSEG_SIZE blocks per segment and BLCKSZ bytes per block, then performs
 * an absolute seek there.  Callers whose files exceed BLCKSZ * LONG_MAX
 * bytes cannot be served by this interface.
 *
 * Returns whatever BufFileSeek returns: 0 if OK, EOF if not.  The logical
 * position is not moved if an impossible seek is attempted.
 */
int
BufFileSeekBlock(BufFile *file, long blknum)
{
	int			segno = (int) (blknum / RELSEG_SIZE);

	return BufFileSeek(file,
					   segno,
					   (blknum % RELSEG_SIZE) * BLCKSZ,
					   SEEK_SET);
}
/*
 * ExecWorkFile_Rewind
 *    Reposition the work file at its beginning.
 *
 * For a BUFFILE work file the result reflects whether the seek succeeded.
 * For a BFZ work file the append phase is ended, the recorded size is
 * adjusted, a scan is started, and true is returned.
 */
bool
ExecWorkFile_Rewind(ExecWorkFile *workfile)
{
	Assert(workfile != NULL);

	if (workfile->fileType == BUFFILE)
	{
		long		seek_rc;

		seek_rc = BufFileSeek((BufFile *) workfile->file, 0L /* offset */, SEEK_SET);

		/* BufFileSeek returns 0 if everything went OK */
		return (seek_rc == 0);
	}

	if (workfile->fileType == BFZ)
	{
		int64		final_size = bfz_append_end((bfz_t *) workfile->file);

		ExecWorkFile_AdjustBFZSize(workfile, final_size);
		bfz_scan_begin((bfz_t *) workfile->file);
		return true;
	}

	insist_log(false, "invalid work file type: %d", workfile->fileType);
	return true;
}
/*
 * BufFileSeekBlock --- seek to the start of a given block
 *
 * Converts the block number to a byte position (blknum * BLCKSZ) and
 * performs an absolute seek there.
 *
 * Returns whatever BufFileSeek returns: 0 if OK, EOF if not.  The logical
 * position is not moved if an impossible seek is attempted.
 */
int
BufFileSeekBlock(BufFile *file, int64 blknum)
{
	int64		byte_pos = blknum * BLCKSZ;

	return BufFileSeek(file, byte_pos, SEEK_SET);
}
/*
 * Fetch the next tuple in either forward or back direction.
 * Returns NULL if no more tuples.  If should_free is set, the
 * caller must pfree the returned tuple when done with it.
 *
 * Backward scan is only allowed if randomAccess was set true or
 * EXEC_FLAG_BACKWARD was specified to tuplestore_set_eflags().
 *
 * state:       the tuplestore; its active read pointer
 *              (readptrs[activeptr]) is the one that is advanced.
 * forward:     true to fetch the following tuple, false to step backward.
 * should_free: output; set to false when the tuple comes straight out of
 *              the in-memory array, true when it was read from the temp
 *              file via READTUP.
 *
 * Side effect: a store in TSS_WRITEFILE status is switched to TSS_READFILE
 * (remembering the current write position) unless the call can return NULL
 * immediately.
 */
static void *
tuplestore_gettuple(Tuplestorestate *state, bool forward,
					bool *should_free)
{
	TSReadPointer *readptr = &state->readptrs[state->activeptr];
	unsigned int tuplen;
	void	   *tup;

	Assert(forward || (readptr->eflags & EXEC_FLAG_BACKWARD));

	switch (state->status)
	{
		case TSS_INMEM:
			/* tuple is returned by reference into memtuples[] */
			*should_free = false;
			if (forward)
			{
				if (readptr->eof_reached)
					return NULL;
				if (readptr->current < state->memtupcount)
				{
					/* We have another tuple, so return it */
					return state->memtuples[readptr->current++];
				}
				readptr->eof_reached = true;
				return NULL;
			}
			else
			{
				/*
				 * if all tuples are fetched already then we return last
				 * tuple, else tuple before last returned.
				 */
				if (readptr->eof_reached)
				{
					readptr->current = state->memtupcount;
					readptr->eof_reached = false;
				}
				else
				{
					/* nothing before the first still-present tuple */
					if (readptr->current <= state->memtupdeleted)
					{
						Assert(!state->truncated);
						return NULL;
					}
					readptr->current--; /* last returned tuple */
				}
				if (readptr->current <= state->memtupdeleted)
				{
					Assert(!state->truncated);
					return NULL;
				}
				return state->memtuples[readptr->current - 1];
			}
			break;

		case TSS_WRITEFILE:
			/* Skip state change if we'll just return NULL */
			if (readptr->eof_reached && forward)
				return NULL;

			/*
			 * Switch from writing to reading.  Remember where writing
			 * stopped, then (unless the pointer is at EOF) move the file
			 * to the pointer's saved read position.
			 */
			BufFileTell(state->myfile,
						&state->writepos_file, &state->writepos_offset);
			if (!readptr->eof_reached)
				if (BufFileSeek(state->myfile,
								readptr->file, readptr->offset,
								SEEK_SET) != 0)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not seek in tuplestore temporary file: %m")));
			state->status = TSS_READFILE;
			/* FALL THRU into READFILE case */

		case TSS_READFILE:
			/* tuple is freshly read from the file, so caller owns it */
			*should_free = true;
			if (forward)
			{
				/* a zero length word marks end of data */
				if ((tuplen = getlen(state, true)) != 0)
				{
					tup = READTUP(state, tuplen);
					return tup;
				}
				else
				{
					readptr->eof_reached = true;
					return NULL;
				}
			}

			/*
			 * Backward.
			 *
			 * if all tuples are fetched already then we return last tuple,
			 * else tuple before last returned.
			 *
			 * Back up to fetch previously-returned tuple's ending length
			 * word.  If seek fails, assume we are at start of file.
			 */
			if (BufFileSeek(state->myfile, 0, -(long) sizeof(unsigned int),
							SEEK_CUR) != 0)
			{
				/* even a failed backwards fetch gets you out of eof state */
				readptr->eof_reached = false;
				Assert(!state->truncated);
				return NULL;
			}
			tuplen = getlen(state, false);

			if (readptr->eof_reached)
			{
				readptr->eof_reached = false;
				/* We will return the tuple returned before returning NULL */
			}
			else
			{
				/*
				 * Back up to get ending length word of tuple before it.
				 */
				if (BufFileSeek(state->myfile, 0,
								-(long) (tuplen + 2 * sizeof(unsigned int)),
								SEEK_CUR) != 0)
				{
					/*
					 * If that fails, presumably the prev tuple is the first
					 * in the file.  Back up so that it becomes next to read
					 * in forward direction (not obviously right, but that is
					 * what in-memory case does).
					 */
					if (BufFileSeek(state->myfile, 0,
									-(long) (tuplen + sizeof(unsigned int)),
									SEEK_CUR) != 0)
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not seek in tuplestore temporary file: %m")));
					Assert(!state->truncated);
					return NULL;
				}
				tuplen = getlen(state, false);
			}

			/*
			 * Now we have the length of the prior tuple, back up and read it.
			 * Note: READTUP expects we are positioned after the initial
			 * length word of the tuple, so back up to that point.
			 */
			if (BufFileSeek(state->myfile, 0,
							-(long) tuplen,
							SEEK_CUR) != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not seek in tuplestore temporary file: %m")));
			tup = READTUP(state, tuplen);
			return tup;

		default:
			elog(ERROR, "invalid tuplestore state");
			return NULL;		/* keep compiler quiet */
	}
}
/*
 * Append one tuple to the store; shared back end of the puttuple entry
 * points.
 *
 * While the store is in memory the tuple pointer is stashed in the
 * memtuples array; once the array or the memory budget is exhausted a temp
 * file is created and everything is dumped to it.  In the file-based states
 * the tuple is written with WRITETUP, first switching the file back to the
 * saved write position if it was being read.
 *
 * In every state, any non-active read pointer that was at EOF is taken out
 * of EOF state and positioned just before the tuple being added.
 */
static void
tuplestore_puttuple_common(Tuplestorestate *state, void *tuple)
{
	TSReadPointer *rp;
	int			idx;
	ResourceOwner saved_owner;

	state->tuples++;

	switch (state->status)
	{
		case TSS_INMEM:

			/*
			 * Update read pointers as needed; see API spec above.
			 */
			for (idx = 0; idx < state->readptrcount; idx++)
			{
				rp = &state->readptrs[idx];
				if (idx == state->activeptr || !rp->eof_reached)
					continue;
				rp->eof_reached = false;
				rp->current = state->memtupcount;
			}

			/*
			 * Try to enlarge the array while one free slot still remains:
			 * if that fails there is still room for this tuple, after which
			 * we fall through to the switch to tape-based operation.
			 */
			if (state->memtupcount >= state->memtupsize - 1)
			{
				(void) grow_memtuples(state);
				Assert(state->memtupcount < state->memtupsize);
			}

			/* Stash the tuple in the in-memory array */
			state->memtuples[state->memtupcount++] = tuple;

			/*
			 * Done if we still fit in available memory and have array slots.
			 */
			if (state->memtupcount < state->memtupsize && !LACKMEM(state))
				return;

			/*
			 * Out of room: go to tape.  Make sure that the temp file(s) are
			 * created in suitable temp tablespaces.
			 */
			PrepareTempTablespaces();

			/* create the file under the store's resource owner */
			saved_owner = CurrentResourceOwner;
			CurrentResourceOwner = state->resowner;
			state->myfile = BufFileCreateTemp(state->interXact);
			CurrentResourceOwner = saved_owner;

			/*
			 * Freeze the decision about whether trailing length words will
			 * be used.  We can't change this choice once data is on tape,
			 * even though callers might drop the requirement.
			 */
			state->backward = (state->eflags & EXEC_FLAG_BACKWARD) != 0;
			state->status = TSS_WRITEFILE;
			dumptuples(state);
			break;

		case TSS_WRITEFILE:

			/*
			 * Update read pointers as needed; see API spec above.  Note:
			 * BufFileTell is quite cheap, so not worth trying to avoid
			 * multiple calls.
			 */
			for (idx = 0; idx < state->readptrcount; idx++)
			{
				rp = &state->readptrs[idx];
				if (idx == state->activeptr || !rp->eof_reached)
					continue;
				rp->eof_reached = false;
				BufFileTell(state->myfile, &rp->file, &rp->offset);
			}

			WRITETUP(state, tuple);
			break;

		case TSS_READFILE:

			/*
			 * Switch from reading to writing: save the active pointer's
			 * file position (unless it is at EOF), then return to the
			 * remembered write position.
			 */
			rp = &state->readptrs[state->activeptr];
			if (!rp->eof_reached)
				BufFileTell(state->myfile, &rp->file, &rp->offset);
			if (BufFileSeek(state->myfile,
							state->writepos_file, state->writepos_offset,
							SEEK_SET) != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not seek in tuplestore temporary file: %m")));
			state->status = TSS_WRITEFILE;

			/*
			 * Update read pointers as needed; see API spec above.
			 */
			for (idx = 0; idx < state->readptrcount; idx++)
			{
				rp = &state->readptrs[idx];
				if (idx == state->activeptr || !rp->eof_reached)
					continue;
				rp->eof_reached = false;
				rp->file = state->writepos_file;
				rp->offset = state->writepos_offset;
			}

			WRITETUP(state, tuple);
			break;

		default:
			elog(ERROR, "invalid tuplestore state");
			break;
	}
}
/*
 * tuplestore_select_read_pointer - switch the active read pointer to 'ptr'
 *
 * Nothing but the activeptr field changes unless the store is currently
 * reading from its temp file.  In that state the file's seek position
 * stands in for the active pointer's position, so the outgoing pointer's
 * position is saved and the file is moved to the incoming pointer's
 * position.
 */
void
tuplestore_select_read_pointer(Tuplestorestate *state, int ptr)
{
	TSReadPointer *incoming;
	TSReadPointer *outgoing;

	Assert(ptr >= 0 && ptr < state->readptrcount);

	/* No work if already active */
	if (ptr == state->activeptr)
		return;

	incoming = &state->readptrs[ptr];
	outgoing = &state->readptrs[state->activeptr];

	if (state->status == TSS_READFILE)
	{
		int			seek_rc;

		/*
		 * First, save the current read position in the pointer about to
		 * become inactive.
		 */
		if (!outgoing->eof_reached)
			BufFileTell(state->myfile,
						&outgoing->file, &outgoing->offset);

		/*
		 * Make the file's seek position match the new pointer's logical
		 * position.  For a pointer at EOF that is the end of data, which
		 * is available as the saved write position.
		 */
		if (incoming->eof_reached)
			seek_rc = BufFileSeek(state->myfile,
								  state->writepos_file,
								  state->writepos_offset,
								  SEEK_SET);
		else
			seek_rc = BufFileSeek(state->myfile,
								  incoming->file,
								  incoming->offset,
								  SEEK_SET);

		if (seek_rc != 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not seek in tuplestore temporary file: %m")));
	}
	else if (state->status != TSS_INMEM && state->status != TSS_WRITEFILE)
		elog(ERROR, "invalid tuplestore state");

	state->activeptr = ptr;
}
/*
 * tuplestore_copy_read_pointer - make read pointer 'destptr' a copy of
 * read pointer 'srcptr'
 *
 * If the two pointers had different eflags, the store-wide eflags are
 * recomputed as the OR of all read pointers' eflags after the copy.  When
 * the store is reading from its temp file, the file position is kept
 * consistent with whichever of the two pointers is the active one.
 */
void
tuplestore_copy_read_pointer(Tuplestorestate *state,
							 int srcptr, int destptr)
{
	TSReadPointer *from = &state->readptrs[srcptr];
	TSReadPointer *to = &state->readptrs[destptr];
	bool		flags_differ;

	Assert(srcptr >= 0 && srcptr < state->readptrcount);
	Assert(destptr >= 0 && destptr < state->readptrcount);

	/* Assigning to self is a no-op */
	if (srcptr == destptr)
		return;

	/* must compare before the copy overwrites the destination's flags */
	flags_differ = (to->eflags != from->eflags);
	*to = *from;

	if (flags_differ)
	{
		/* Possible change of overall eflags, so recompute */
		int			combined = state->readptrs[0].eflags;
		int			k;

		for (k = 1; k < state->readptrcount; k++)
			combined |= state->readptrs[k].eflags;
		state->eflags = combined;
	}

	if (state->status == TSS_READFILE)
	{
		/*
		 * This case is a bit tricky since the active read pointer's
		 * position corresponds to the seek point, not what is in its
		 * variables.  Assigning to the active requires a seek, and
		 * assigning from the active requires a tell, except when
		 * eof_reached.
		 */
		if (destptr == state->activeptr)
		{
			int			seek_rc;

			if (to->eof_reached)
				seek_rc = BufFileSeek(state->myfile,
									  state->writepos_file,
									  state->writepos_offset,
									  SEEK_SET);
			else
				seek_rc = BufFileSeek(state->myfile,
									  to->file, to->offset,
									  SEEK_SET);

			if (seek_rc != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not seek in tuplestore temporary file: %m")));
		}
		else if (srcptr == state->activeptr)
		{
			if (!to->eof_reached)
				BufFileTell(state->myfile, &to->file, &to->offset);
		}
	}
	else if (state->status != TSS_INMEM && state->status != TSS_WRITEFILE)
		elog(ERROR, "invalid tuplestore state");
}
/* * Dump the shared local snapshot, so that the readers can pick it up. * * BufFileCreateTemp_ReaderWriter(filename, iswriter) */ void dumpSharedLocalSnapshot_forCursor(void) { SharedSnapshotSlot *src = NULL; char* fname = NULL; BufFile *f = NULL; Size count=0; TransactionId *xids = NULL; int64 sub_size; int64 size_read; ResourceOwner oldowner; MemoryContext oldcontext; Assert(Gp_role == GP_ROLE_DISPATCH || (Gp_role == GP_ROLE_EXECUTE && Gp_is_writer)); Assert(SharedLocalSnapshotSlot != NULL); src = (SharedSnapshotSlot *)SharedLocalSnapshotSlot; fname = sharedLocalSnapshot_filename(src->QDxid, src->QDcid, src->segmateSync); /* * Create our dump-file. Hold the reference to it in * the transaction's resource owner, so that it lives as long * as the cursor we're declaring. */ oldowner = CurrentResourceOwner; CurrentResourceOwner = TopTransactionResourceOwner; oldcontext = MemoryContextSwitchTo(TopTransactionContext); f = BufFileCreateTemp_ReaderWriter(fname, true, false); /* * Remember our file, so that we can close it at end of transaction. * The resource owner mechanism would do it for us as a backstop, but it * produces warnings at commit if some files haven't been closed. */ shared_snapshot_files = lappend(shared_snapshot_files, f); MemoryContextSwitchTo(oldcontext); CurrentResourceOwner = oldowner; /* we have our file. */ #define FileWriteOK(file, ptr, size) (BufFileWrite(file, ptr, size) == size) #define FileWriteFieldWithCount(count, file, field) \ if (BufFileWrite((file), &(field), sizeof(field)) != sizeof(field)) break; \ count += sizeof(field); do { /* Write our length as zero. (we'll fix it later). */ count = 0; /* * We write two counts here: One is count of first part, * second is size of subtransaction xids copied from * SharedLocalSnapshotSlot. This can be a big number. 
*/ FileWriteFieldWithCount(count, f, count); FileWriteFieldWithCount(count, f, src->total_subcnt); FileWriteFieldWithCount(count, f, src->pid); FileWriteFieldWithCount(count, f, src->xid); FileWriteFieldWithCount(count, f, src->cid); FileWriteFieldWithCount(count, f, src->startTimestamp); FileWriteFieldWithCount(count, f, src->combocidcnt); FileWriteFieldWithCount(count, f, src->combocids); FileWriteFieldWithCount(count, f, src->snapshot.xmin); FileWriteFieldWithCount(count, f, src->snapshot.xmax); FileWriteFieldWithCount(count, f, src->snapshot.xcnt); if (!FileWriteOK(f, &src->snapshot.xip, src->snapshot.xcnt * sizeof(TransactionId))) break; count += src->snapshot.xcnt * sizeof(TransactionId); FileWriteFieldWithCount(count, f, src->snapshot.curcid); /* * THE STUFF IN THE SHARED LOCAL VERSION OF * snapshot.distribSnapshotWithLocalMapping * APPEARS TO *NEVER* BE USED, SO THERE IS * NO POINT IN TRYING TO DUMP IT (IN FACT, * IT'S ALLOCATION STRATEGY ISN'T SHMEM-FRIENDLY). */ /* * THIS STUFF IS USED IN THE FILENAME * SO THE READER ALREADY HAS IT. * dst->QDcid = src->QDcid; dst->segmateSync = src->segmateSync; dst->QDxid = src->QDxid; dst->ready = src->ready; * */ if (src->total_subcnt > src->inmemory_subcnt) { Assert(subxip_file != 0); xids = palloc(MAX_XIDBUF_SIZE); FileSeek(subxip_file, 0, SEEK_SET); sub_size = (src->total_subcnt - src->inmemory_subcnt) * sizeof(TransactionId); while (sub_size > 0) { size_read = (sub_size > MAX_XIDBUF_SIZE) ? MAX_XIDBUF_SIZE : sub_size; if (size_read != FileRead(subxip_file, (char *)xids, size_read)) { elog(ERROR, "Error in reading subtransaction file."); } if (!FileWriteOK(f, xids, sub_size)) { break; } sub_size -= size_read; } pfree(xids); if (sub_size != 0) break; } if (src->inmemory_subcnt > 0) { sub_size = src->inmemory_subcnt * sizeof(TransactionId); if (!FileWriteOK(f, src->subxids, sub_size)) { break; } } /* * Now update our length field: seek to beginning and overwrite * our original zero-length. 
count does not include * subtransaction ids. */ if (BufFileSeek(f, 0 /* offset */, SEEK_SET) != 0) break; if (!FileWriteOK(f, &count, sizeof(count))) break; /* now flush and close. */ BufFileFlush(f); /* * Temp files get deleted on close! * * BufFileClose(f); */ return; } while (0); elog(ERROR, "Failed to write shared snapshot to temp-file"); }
/*
 * Load a cursor snapshot previously written by
 * dumpSharedLocalSnapshot_forCursor() into *snapshot.
 *
 * The dump file is located through QEDtxContextInfo (distributedXid, curcid,
 * segmateSync).  The fixed-size part is read in one go and unpacked in the
 * same field order the writer used; the subtransaction xids that follow are
 * streamed in MAX_XIDBUF_SIZE chunks into subxbuf.
 *
 * Side effects: replaces the backend's comboCids / usedComboCids, fills
 * snapshot->xmin/xmax/xcnt/xip/curcid, resets and refills subxbuf, and calls
 * SetSharedTransactionId_reader() with the writer's xid.
 *
 * Raises ERROR on any read/seek failure or if the length word does not
 * match on re-read.
 */
void
readSharedLocalSnapshot_forCursor(Snapshot snapshot)
{
	BufFile    *f;
	char	   *fname = NULL;
	Size		count = 0,
				sanity;
	uint8	   *p,
			   *buffer = NULL;

	pid_t		writerPid;
	TransactionId localXid;
	CommandId	localCid;
	TimestampTz localXactStartTimestamp;

	uint32		combocidcnt;
	ComboCidKeyData tmp_combocids[MaxComboCids];
	uint32		sub_size;
	uint32		read_size;
	int64		subcnt;
	TransactionId *subxids = NULL;

	Assert(Gp_role == GP_ROLE_EXECUTE);
	Assert(!Gp_is_writer);
	Assert(SharedLocalSnapshotSlot != NULL);
	Assert(snapshot->xip != NULL);
	Assert(snapshot->subxip != NULL);

	/*
	 * Open our dump-file, this will either return a valid file, or throw an
	 * error.
	 *
	 * NOTE: this is always run *after* the dump by the writer is guaranteed
	 * to have completed.
	 */
	fname = sharedLocalSnapshot_filename(QEDtxContextInfo.distributedXid,
										 QEDtxContextInfo.curcid,
										 QEDtxContextInfo.segmateSync);

	f = BufFileCreateTemp_ReaderWriter(fname, false, false);
	/* we have our file. */

#define FileReadOK(file, ptr, size) (BufFileRead(file, ptr, size) == size)

	/* Read the file-length info */
	if (!FileReadOK(f, &count, sizeof(count)))
		elog(ERROR, "Cursor snapshot: failed to read size");

	elog(DEBUG1, "Reading in cursor-snapshot %u bytes",
		 (unsigned int) count);

	buffer = palloc(count);

	/*
	 * Seek back to the beginning: We're going to read this all in one go,
	 * the size of this buffer should be more than a few hundred bytes.
	 */
	if (BufFileSeek(f, 0 /* offset */, SEEK_SET) != 0)
		elog(ERROR, "Cursor snapshot: failed to seek.");

	if (!FileReadOK(f, buffer, count))
		elog(ERROR, "Cursor snapshot: failed to read content");

	/* we've got the entire snapshot read into our buffer. */
	p = buffer;

	/* sanity check count */
	memcpy(&sanity, p, sizeof(sanity));
	if (sanity != count)
		elog(ERROR, "cursor snapshot failed sanity %u != %u",
			 (unsigned int) sanity, (unsigned int) count);
	p += sizeof(sanity);

	/* number of subtransaction xids that follow the fixed part */
	memcpy(&sub_size, p, sizeof(uint32));
	p += sizeof(uint32);

	/* see dumpSharedLocalSnapshot_forCursor() for the correct order here */

	memcpy(&writerPid, p, sizeof(writerPid));
	p += sizeof(writerPid);
	memcpy(&localXid, p, sizeof(localXid));
	p += sizeof(localXid);
	memcpy(&localCid, p, sizeof(localCid));
	p += sizeof(localCid);
	memcpy(&localXactStartTimestamp, p, sizeof(localXactStartTimestamp));
	p += sizeof(localXactStartTimestamp);

	memcpy(&combocidcnt, p, sizeof(combocidcnt));
	p += sizeof(combocidcnt);
	memcpy(tmp_combocids, p, sizeof(tmp_combocids));
	p += sizeof(tmp_combocids);

	/* handle the combocid stuff (same as in GetSnapshotData()) */
	if (usedComboCids != combocidcnt)
	{
		if (usedComboCids == 0)
		{
			MemoryContext oldCtx = MemoryContextSwitchTo(TopTransactionContext);

			comboCids = palloc(combocidcnt * sizeof(ComboCidKeyData));
			MemoryContextSwitchTo(oldCtx);
		}
		else
		{
			/*
			 * repalloc may move the block, so the result must be stored;
			 * discarding it would leave comboCids dangling.
			 */
			comboCids = repalloc(comboCids, combocidcnt * sizeof(ComboCidKeyData));
		}
	}

	/*
	 * Only MaxComboCids entries were read into tmp_combocids, so never copy
	 * more than that out of it, whatever count the file claims.
	 */
	usedComboCids = ((combocidcnt < MaxComboCids) ? combocidcnt : MaxComboCids);
	if (usedComboCids > 0)
		memcpy(comboCids, tmp_combocids, usedComboCids * sizeof(ComboCidKeyData));

	memcpy(&snapshot->xmin, p, sizeof(snapshot->xmin));
	p += sizeof(snapshot->xmin);
	memcpy(&snapshot->xmax, p, sizeof(snapshot->xmax));
	p += sizeof(snapshot->xmax);
	memcpy(&snapshot->xcnt, p, sizeof(snapshot->xcnt));
	p += sizeof(snapshot->xcnt);

	memcpy(snapshot->xip, p, snapshot->xcnt * sizeof(TransactionId));
	p += snapshot->xcnt * sizeof(TransactionId);

	/* zero out the slack in the xip-array */
	memset(snapshot->xip + snapshot->xcnt, 0, (xipEntryCount - snapshot->xcnt) * sizeof(TransactionId));

	memcpy(&snapshot->curcid, p, sizeof(snapshot->curcid));

	/* Now we're done with the buffer */
	pfree(buffer);

	/*
	 * Now read the subtransaction ids.  This can be a big number, so cannot
	 * allocate memory all at once.
	 */
	sub_size *= sizeof(TransactionId);	/* entries -> bytes */

	ResetXidBuffer(&subxbuf);

	if (sub_size)
	{
		subxids = palloc(MAX_XIDBUF_SIZE);
	}

	while (sub_size > 0)
	{
		read_size = sub_size > MAX_XIDBUF_SIZE ? MAX_XIDBUF_SIZE : sub_size;
		if (!FileReadOK(f, (char *) subxids, read_size))
		{
			elog(ERROR, "Error in Reading Subtransaction file.");
		}
		subcnt = read_size / sizeof(TransactionId);
		AddSortedToXidBuffer(&subxbuf, subxids, subcnt);
		sub_size -= read_size;
	}

	if (subxids)
	{
		pfree(subxids);
	}

	/* we're done with file. */
	BufFileClose(f);

	SetSharedTransactionId_reader(localXid, snapshot->curcid);

	return;
}
/*
 * Append one tuple to the store; shared back end of the puttuple entry
 * points.
 *
 * While the store is in memory the tuple pointer is stashed in the
 * memtuples array, doubling the array when allowed.  Once the array or the
 * memory budget is exhausted a temp file named after the current slice is
 * created and everything is dumped to it.  In the file-based states the
 * tuple is written with WRITETUP, first switching the file back to the
 * saved write position if it was being read.
 *
 * In every state, any non-active read pointer that was at EOF is taken out
 * of EOF state and positioned just before the tuple being added.
 */
static void
tuplestore_puttuple_common(Tuplestorestate *state, void *tuple)
{
	TSReadPointer *rp;
	int			idx;
	ResourceOwner saved_owner;
	char		prefix[50];

	switch (state->status)
	{
		case TSS_INMEM:

			/*
			 * Update read pointers as needed; see API spec above.
			 */
			for (idx = 0; idx < state->readptrcount; idx++)
			{
				rp = &state->readptrs[idx];
				if (idx == state->activeptr || !rp->eof_reached)
					continue;
				rp->eof_reached = false;
				rp->current = state->memtupcount;
			}

			/*
			 * Try to enlarge the array while one free slot still remains:
			 * if we cannot, there is still room for this tuple, after which
			 * we switch to tape-based operation.  See grow_memtuples() in
			 * tuplesort.c for the rationale behind the two size tests.
			 */
			if (state->memtupcount >= state->memtupsize - 1 &&
				state->availMem > (long) (state->memtupsize * sizeof(void *)) &&
				(Size) (state->memtupsize * 2) < MaxAllocSize / sizeof(void *))
			{
				FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
				state->memtupsize *= 2;
				state->memtuples = (void **)
					repalloc(state->memtuples,
							 state->memtupsize * sizeof(void *));
				USEMEM(state, GetMemoryChunkSpace(state->memtuples));
				if (LACKMEM(state))
					elog(ERROR, "unexpected out-of-memory situation in tuplestore");
			}

			/* Stash the tuple in the in-memory array */
			state->memtuples[state->memtupcount++] = tuple;

			/*
			 * Done if we still fit in available memory and have array slots.
			 */
			if (state->memtupcount < state->memtupsize && !LACKMEM(state))
				return;

			/*
			 * Out of room: go to tape.  Make sure that the temp file(s) are
			 * created in suitable temp tablespaces.
			 */
			PrepareTempTablespaces();

			/* create the file under the store's resource owner */
			saved_owner = CurrentResourceOwner;
			CurrentResourceOwner = state->resowner;

			snprintf(prefix, sizeof(prefix), "slice%d_tuplestore", currentSliceId);
			state->myfile = BufFileCreateTemp(prefix, state->interXact);

			CurrentResourceOwner = saved_owner;

			/*
			 * Freeze the decision about whether trailing length words will
			 * be used.  We can't change this choice once data is on tape,
			 * even though callers might drop the requirement.
			 */
			state->backward = (state->eflags & EXEC_FLAG_BACKWARD) != 0;
			state->status = TSS_WRITEFILE;
			dumptuples(state);
			break;

		case TSS_WRITEFILE:

			/*
			 * Update read pointers as needed; see API spec above.  Note:
			 * BufFileTell is quite cheap, so not worth trying to avoid
			 * multiple calls.
			 */
			for (idx = 0; idx < state->readptrcount; idx++)
			{
				rp = &state->readptrs[idx];
				if (idx == state->activeptr || !rp->eof_reached)
					continue;
				rp->eof_reached = false;
				BufFileTell(state->myfile, &rp->file, &rp->offset);
			}

			WRITETUP(state, tuple);
			break;

		case TSS_READFILE:

			/*
			 * Switch from reading to writing: save the active pointer's
			 * file position (unless it is at EOF), then return to the
			 * remembered write position.
			 */
			rp = &state->readptrs[state->activeptr];
			if (!rp->eof_reached)
				BufFileTell(state->myfile, &rp->file, &rp->offset);
			if (BufFileSeek(state->myfile,
							state->writepos_file, state->writepos_offset,
							SEEK_SET) != 0)
				elog(ERROR, "tuplestore seek to EOF failed");
			state->status = TSS_WRITEFILE;

			/*
			 * Update read pointers as needed; see API spec above.
			 */
			for (idx = 0; idx < state->readptrcount; idx++)
			{
				rp = &state->readptrs[idx];
				if (idx == state->activeptr || !rp->eof_reached)
					continue;
				rp->eof_reached = false;
				rp->file = state->writepos_file;
				rp->offset = state->writepos_offset;
			}

			WRITETUP(state, tuple);
			break;

		default:
			elog(ERROR, "invalid tuplestore state");
			break;
	}
}
/*
 * Append one tuple to the store, keeping the caller-supplied position
 * record *pos consistent; shared back end of the puttuple entry points.
 *
 * While the store is in memory the tuple pointer is stashed in the
 * memtuples array, doubling the array when allowed.  Once the array or the
 * memory budget is exhausted a temp file named after the current slice is
 * created and everything is dumped to it.  In the file-based states the
 * tuple is written with WRITETUP, first switching the file back to the
 * saved write offset if it was being read.
 */
static void
tuplestore_puttuple_common(Tuplestorestate *state, TuplestorePos *pos, void *tuple)
{
	ResourceOwner saved_owner;
	char		prefix[50];

	switch (state->status)
	{
		case TSS_INMEM:

			/*
			 * Try to enlarge the array while one free slot still remains:
			 * if we cannot, there is still room for this tuple, after which
			 * we switch to tape-based operation.  See grow_memtuples() in
			 * tuplesort.c for the rationale behind the two size tests.
			 */
			if (state->memtupcount >= state->memtupsize - 1 &&
				state->availMem > (long) (state->memtupsize * sizeof(void *)) &&
				(Size) (state->memtupsize * 2) < MaxAllocSize / sizeof(void *))
			{
				FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
				state->memtupsize *= 2;
				state->memtuples = (void **)
					repalloc(state->memtuples,
							 state->memtupsize * sizeof(void *));
				USEMEM(state, GetMemoryChunkSpace(state->memtuples));
			}

			/* Stash the tuple in the in-memory array */
			state->memtuples[state->memtupcount++] = tuple;

			/* If eof_reached, keep read position in sync */
			if (pos->eof_reached)
				pos->current = state->memtupcount;

			/*
			 * Done if we still fit in available memory and have array slots.
			 */
			if (state->memtupcount < state->memtupsize && !LACKMEM(state))
				return;

			/*
			 * Out of room: go to tape.  Make sure that the temp file(s) are
			 * created in suitable temp tablespaces.
			 */
			PrepareTempTablespaces();

			/* create the file under the store's resource owner */
			saved_owner = CurrentResourceOwner;
			CurrentResourceOwner = state->resowner;

			snprintf(prefix, sizeof(prefix), "slice%d_tuplestore", currentSliceId);
			state->myfile = BufFileCreateTemp(prefix, state->interXact);

			CurrentResourceOwner = saved_owner;

			state->status = TSS_WRITEFILE;
			dumptuples(state, pos);
			break;

		case TSS_WRITEFILE:
			WRITETUP(state, pos, tuple);
			break;

		case TSS_READFILE:

			/*
			 * Switch from reading to writing: save the current read offset
			 * (unless at EOF), then return to the remembered write offset.
			 */
			if (!pos->eof_reached)
				BufFileTell(state->myfile, &pos->readpos_offset);
			if (BufFileSeek(state->myfile, pos->writepos_offset, SEEK_SET) != 0)
				elog(ERROR, "seek to EOF failed");
			state->status = TSS_WRITEFILE;
			WRITETUP(state, pos, tuple);
			break;

		default:
			elog(ERROR, "invalid tuplestore state");
			break;
	}
}
/*
 * Fetch the next tuple in either forward or back direction.
 * Returns NULL if no more tuples.  If should_free is set, the
 * caller must pfree the returned tuple when done with it.
 *
 * Backward scan is only allowed if randomAccess was set true or
 * EXEC_FLAG_BACKWARD was specified to tuplestore_set_eflags().
 *
 * state:       the tuplestore.
 * pos:         caller-supplied position record; its current index,
 *              eof_reached flag and read/write offsets are updated here.
 * forward:     true to fetch the following tuple, false to step backward.
 * should_free: output; set to false when the tuple comes straight out of
 *              the in-memory array, true when it was read from the temp
 *              file via READTUP.
 *
 * Side effect: a store in TSS_WRITEFILE status is switched to TSS_READFILE
 * (remembering the current write offset in pos) unless the call can return
 * NULL immediately.
 */
static void *
tuplestore_gettuple(Tuplestorestate *state, TuplestorePos *pos,
					bool forward, bool *should_free)
{
	uint32		tuplen;
	void	   *tup;

	Assert(forward || (state->eflags & EXEC_FLAG_BACKWARD));

	switch (state->status)
	{
		case TSS_INMEM:
			/* tuple is returned by reference into memtuples[] */
			*should_free = false;
			if (forward)
			{
				if (pos->current < state->memtupcount)
					return state->memtuples[pos->current++];
				pos->eof_reached = true;
				return NULL;
			}
			else
			{
				/* already at the start: nothing to step back to */
				if (pos->current <= 0)
					return NULL;

				/*
				 * if all tuples are fetched already then we return last
				 * tuple, else - tuple before last returned.
				 */
				if (pos->eof_reached)
					pos->eof_reached = false;
				else
				{
					pos->current--;		/* last returned tuple */
					if (pos->current <= 0)
						return NULL;
				}
				return state->memtuples[pos->current - 1];
			}
			break;

		case TSS_WRITEFILE:
			/* Skip state change if we'll just return NULL */
			if (pos->eof_reached && forward)
				return NULL;

			/*
			 * Switch from writing to reading.  Remember where writing
			 * stopped, then (unless at EOF) move the file to the saved
			 * read offset.
			 */
			BufFileTell(state->myfile, &pos->writepos_offset);
			if (!pos->eof_reached)
				if (BufFileSeek(state->myfile, pos->readpos_offset, SEEK_SET) != 0)
					elog(ERROR, "seek failed");
			state->status = TSS_READFILE;
			/* FALL THRU into READFILE case */

		case TSS_READFILE:
			/* tuple is freshly read from the file, so caller owns it */
			*should_free = true;
			if (forward)
			{
				/* a zero length word marks end of data */
				if ((tuplen = getlen(state, pos, true)) != 0)
				{
					tup = READTUP(state, pos, tuplen);

					/* CDB XXX XXX XXX XXX */
					/*
					 * MPP-1347: EXPLAIN ANALYZE shows runaway memory usage.
					 * Readtup does a usemem, but the free happens in
					 * ExecStoreTuple.  Do a free so state->availMem
					 * doesn't go massively negative to screw up
					 * stats.  It would be better to interrogate the
					 * heap for actual memory usage than use this
					 * homemade accounting.
					 */
					FREEMEM(state, GetMemoryChunkSpace(tup));
					/* CDB XXX XXX XXX XXX */
					return tup;
				}
				else
				{
					pos->eof_reached = true;
					return NULL;
				}
			}

			/*
			 * Backward.
			 *
			 * if all tuples are fetched already then we return last tuple,
			 * else - tuple before last returned.
			 *
			 * Back up to fetch previously-returned tuple's ending length
			 * word.  If seek fails, assume we are at start of file.
			 *
			 * NOTE(review): the insist_log(false, ...) call below reports
			 * that backward scans of file-based stores are unsupported.  If
			 * insist_log does not return when its condition is false, the
			 * rest of this case is unreachable -- confirm against the
			 * insist_log definition.
			 */
			insist_log(false, "Backward scanning of tuplestores are not supported at this time");

			if (BufFileSeek(state->myfile, -(long) sizeof(uint32) /* offset */, SEEK_CUR) != 0)
				return NULL;
			tuplen = getlen(state, pos, false);

			if (pos->eof_reached)
			{
				pos->eof_reached = false;
				/* We will return the tuple returned before returning NULL */
			}
			else
			{
				/*
				 * Back up to get ending length word of tuple before it.
				 */
				if (BufFileSeek(state->myfile, -(long) (tuplen + 2 * sizeof(uint32)) /* offset */, SEEK_CUR) != 0)
				{
					/*
					 * If that fails, presumably the prev tuple is the first
					 * in the file.  Back up so that it becomes next to read
					 * in forward direction (not obviously right, but that is
					 * what in-memory case does).
					 */
					if (BufFileSeek(state->myfile, -(long) (tuplen + sizeof(uint32)) /* offset */, SEEK_CUR) != 0)
						elog(ERROR, "bogus tuple length in backward scan");
					return NULL;
				}
				tuplen = getlen(state, pos, false);
			}

			/*
			 * Now we have the length of the prior tuple, back up and read it.
			 * Note: READTUP expects we are positioned after the initial
			 * length word of the tuple, so back up to that point.
			 */
			if (BufFileSeek(state->myfile, -(long) tuplen /* offset */, SEEK_CUR) != 0)
				elog(ERROR, "bogus tuple length in backward scan");
			tup = READTUP(state, pos, tuplen);
			return tup;

		default:
			elog(ERROR, "invalid tuplestore state");
			return NULL;		/* keep compiler quiet */
	}
}
/*
 * ExecWorkFile_Seek
 *    Reposition a random-access work file.
 *
 * Only SEEK_SET and SEEK_CUR are accepted for 'whence'; anything else is
 * logged and EOF is returned.  When the target lies beyond the recorded
 * file size, the extra disk space is reserved before seeking; this is
 * allowed only for files flagged EXEC_WORKFILE_CREATED, otherwise EOF is
 * returned.  After the seek the reservation is committed and the
 * in-progress size is updated.
 *
 * Result is 0 if OK, EOF if not.  Logical position is not moved if an
 * impossible seek is attempted.
 */
int
ExecWorkFile_Seek(ExecWorkFile *workfile, uint64 offset, int whence)
{
	Assert(workfile != NULL);
	Assert((workfile->flags & EXEC_WORKFILE_RANDOM_ACCESS) != 0);

	int			seek_rc = 0;
	int64		grow_by = 0;	/* bytes by which the seek goes past EOF */
	bool		extends;

	/* Determine if this seeks beyond EOF */
	if (whence == SEEK_SET)
	{
		if (offset > workfile->size)
			grow_by = offset - workfile->size;
	}
	else if (whence == SEEK_CUR)
	{
		if (ExecWorkFile_Tell64(workfile) + offset > workfile->size)
			grow_by = ExecWorkFile_Tell64(workfile) + offset - workfile->size;
	}
	else
	{
		elog(LOG, "invalid whence: %d", whence);
		Assert(false);
		return EOF;
	}

	extends = (grow_by > 0);

	/* Reserve disk space if needed */
	if (extends)
	{
		/*
		 * We only allow seeking beyond EOF for files opened for writing
		 * (i.e. files we created)
		 */
		if (!(workfile->flags & EXEC_WORKFILE_CREATED))
			return EOF;

		/* Failed to reserve additional disk space, notify caller */
		if (!WorkfileDiskspace_Reserve(grow_by))
			return EOF;
	}

	/* Do the actual seek */
	if (workfile->fileType == BUFFILE)
	{
		seek_rc = BufFileSeek((BufFile *) workfile->file, offset, whence);
		if (extends)
			workfile->size = BufFileGetSize((BufFile *) workfile->file);
	}
	else
		insist_log(false, "invalid work file type: %d", workfile->fileType);

	if (extends)
	{
		WorkfileDiskspace_Commit(grow_by, grow_by, true /* update_query_size */);
		workfile_update_in_progress_size(workfile, grow_by);
	}

	return seek_rc;
}
/*
 * loadSharedComboCommandId
 *    Look up the cmin/cmax pair recorded for (xmin, combocid) in the
 *    combocid map file of the lock holder process.
 *
 * The whole map is scanned from the start. Every entry whose xmin matches
 * is entered into readerComboHash along the way, and the entry whose
 * combocid also matches supplies the values stored through cmin and cmax.
 *
 * xmin, combocid: identify the entry being looked up.
 * cmin, cmax:     output pointers; must not be NULL.
 *
 * Raises ERROR if there is no lock holder pointer, if seeking or reading
 * the map fails, or if no matching entry exists.
 */
void
loadSharedComboCommandId(TransactionId xmin, CommandId combocid, CommandId *cmin, CommandId *cmax)
{
	ComboCidEntryData rec;
	bool		matched = false;
	int			idx;

	Assert(Gp_role == GP_ROLE_EXECUTE);
	Assert(!Gp_is_writer);
	Assert(cmin != NULL);
	Assert(cmax != NULL);

	/* Nothing can be looked up without the lock holder. */
	if (lockHolderProcPtr == NULL)
	{
		elog(ERROR, "loadSharedComboCommandId: NO LOCK HOLDER POINTER.");
	}

	/* Open the map file on first use. */
	if (combocid_map == NULL)
	{
		char		map_path[MAXPGPATH];
		MemoryContext saved_ctx;

		ComboCidMapName(map_path, gp_session_id, lockHolderProcPtr->pid);

		/*
		 * The file handle is created while TopMemoryContext is current;
		 * a failed create is expected to raise an error by itself.
		 */
		saved_ctx = MemoryContextSwitchTo(TopMemoryContext);
		combocid_map = BufFileCreateTemp_ReaderWriter(map_path, false, true);
		MemoryContextSwitchTo(saved_ctx);
	}
	Assert(combocid_map != NULL);

	/* Every lookup rescans the map from its first entry. */
	if (BufFileSeek(combocid_map, 0 /* fileno */, 0 /* offset */, SEEK_SET) != 0)
	{
		elog(ERROR, "loadSharedComboCommandId: seek to beginning failed.");
	}

	/*
	 * Walk the entire table. The entry count is re-read from the lock
	 * holder on each iteration rather than sampled once up front.
	 */
	for (idx = 0; idx < lockHolderProcPtr->combocid_map_count; idx++)
	{
		readerComboCidKeyData cache_key;
		readerComboCidEntryData *cache_entry;
		bool		already_cached = false;

		if (BufFileRead(combocid_map, &rec, sizeof(rec)) != sizeof(rec))
		{
			elog(ERROR, "loadSharedComboCommandId: read failed I/O error.");
		}

		/* Entries belonging to other transactions are of no interest. */
		if (rec.key.xmin != xmin)
			continue;

		/* Zero the key first so any padding bytes are well defined. */
		memset(&cache_key, 0, sizeof(cache_key));
		cache_key.session = gp_session_id;
		cache_key.writer_pid = lockHolderProcPtr->pid;
		cache_key.xmin = rec.key.xmin;
		cache_key.combocid = rec.combocid;

		cache_entry = (readerComboCidEntryData *)
			hash_search(readerComboHash, &cache_key, HASH_ENTER, &already_cached);

		/* Fill in the payload only for entries that were just created. */
		if (!already_cached)
		{
			cache_entry->cmin = rec.key.cmin;
			cache_entry->cmax = rec.key.cmax;
		}

		/*
		 * Remember the requested entry, but keep scanning so the remaining
		 * entries for this xmin are cached as well.
		 */
		if (rec.combocid == combocid)
		{
			*cmin = rec.key.cmin;
			*cmax = rec.key.cmax;
			matched = true;
		}
	}

	if (!matched)
	{
		elog(ERROR, "loadSharedComboCommandId: no combocid entry found for %u/%u",
			 xmin, combocid);
	}
}