/* ----------------------------------------------------------------
 *		ExecHash
 *
 *		build hash table for hashjoin, doing partitioning if more
 *		than one batch is required.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecHash(HashState *node)
{
	EState	   *estate;
	PlanState  *outerNode;
	List	   *hashkeys;
	HashJoinTable hashtable;
	TupleTableSlot *slot;
	ExprContext *econtext;
	int			nbatch;
	int			i;

	/*
	 * get state info from node
	 */
	estate = node->ps.state;
	outerNode = outerPlanState(node);

	hashtable = node->hashtable;
	nbatch = hashtable->nbatch;

	if (nbatch > 0)
	{
		/*
		 * Open temp files for inner batches, if needed. Note that file
		 * buffers are palloc'd in regular executor context.
		 */
		for (i = 0; i < nbatch; i++)
			hashtable->innerBatchFile[i] = BufFileCreateTemp(false);
	}

	/*
	 * set expression context
	 */
	hashkeys = node->hashkeys;
	econtext = node->ps.ps_ExprContext;

	/*
	 * get all inner tuples and insert into the hash table (or temp files)
	 */
	for (;;)
	{
		slot = ExecProcNode(outerNode);
		if (TupIsNull(slot))
			break;
		hashtable->hashNonEmpty = true;
		econtext->ecxt_innertuple = slot;
		ExecHashTableInsert(hashtable, econtext, hashkeys);
		ExecClearTuple(slot);
	}

	/*
	 * Return the slot so that we have the tuple descriptor when we need
	 * to save/restore them.  -Jeff 11 July 1991
	 */
	return slot;
}
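ExecHash opens one temp file per inner batch up front; tuples that hash to a later batch are written out and replayed when that batch is processed. Below is a minimal sketch of that spill step, assuming the classic BufFileWrite that returns the number of bytes written; the helper name and length-word row layout are illustrative, not from the source above.

#include "postgres.h"
#include "storage/buffile.h"

/*
 * Hypothetical helper: spill one row to its batch file as a length word
 * followed by the payload, erroring out on a short write.
 */
static void
spill_row_to_batch(BufFile **batchFiles, int batchno,
				   const void *row, size_t rowlen)
{
	if (BufFileWrite(batchFiles[batchno], &rowlen, sizeof(rowlen)) != sizeof(rowlen) ||
		BufFileWrite(batchFiles[batchno], (void *) row, rowlen) != rowlen)
		elog(ERROR, "could not write to hash-join batch file");
}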
/*
 * Create a set of logical tapes in a temporary underlying file.
 *
 * Each tape is initialized in write state.
 */
LogicalTapeSet *
LogicalTapeSetCreate(int ntapes)
{
	LogicalTapeSet *lts;
	LogicalTape *lt;
	int			i;

	/*
	 * Create top-level struct including per-tape LogicalTape structs. First
	 * LogicalTape struct is already counted in sizeof(LogicalTapeSet).
	 */
	Assert(ntapes > 0);
	lts = (LogicalTapeSet *) palloc(sizeof(LogicalTapeSet) +
									(ntapes - 1) * sizeof(LogicalTape));
	lts->pfile = BufFileCreateTemp(false);
	lts->nFileBlocks = 0L;
	lts->forgetFreeSpace = false;
	lts->blocksSorted = true;	/* a zero-length array is sorted ... */
	lts->freeBlocksLen = 32;	/* reasonable initial guess */
	lts->freeBlocks = (long *) palloc(lts->freeBlocksLen * sizeof(long));
	lts->nFreeBlocks = 0;
	lts->nTapes = ntapes;

	/*
	 * Initialize per-tape structs. Note we allocate the I/O buffer and
	 * first-level indirect block for a tape only when it is first actually
	 * written to. This avoids wasting memory space when tuplesort.c
	 * overestimates the number of tapes needed.
	 */
	for (i = 0; i < ntapes; i++)
	{
		lt = &lts->tapes[i];
		lt->indirect = NULL;
		lt->writing = true;
		lt->frozen = false;
		lt->dirty = false;
		lt->numFullBlocks = 0L;
		lt->lastBlockBytes = 0;
		lt->buffer = NULL;
		lt->curBlockNumber = 0L;
		lt->pos = 0;
		lt->nbytes = 0;
	}
	return lts;
}
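A usage sketch for the single-argument version above, assuming the logical-tape API of the same vintage (LogicalTapeWrite, the three-argument LogicalTapeRewind, LogicalTapeRead, LogicalTapeSetClose); it round-trips one value through tape 0.

#include "postgres.h"
#include "utils/logtape.h"

static void
tape_roundtrip(void)
{
	LogicalTapeSet *lts = LogicalTapeSetCreate(4);
	int			val = 42;
	int			out = 0;

	LogicalTapeWrite(lts, 0, &val, sizeof(val));
	LogicalTapeRewind(lts, 0, false);	/* false = rewind for reading */
	if (LogicalTapeRead(lts, 0, &out, sizeof(out)) != sizeof(out))
		elog(ERROR, "unexpected end of logical tape");
	Assert(out == val);
	LogicalTapeSetClose(lts);
}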
/*
 * Open many bfz files to simulate running out of file handles.
 *
 * file_type values:
 *   0 - bfz, no compression
 *   1 - bfz, zlib compression
 *   2 - buffile
 */
static void
open_many_files(int file_type)
{
	char		file_name[MAXPGPATH];
	int			iter = 0;

	while (true)
	{
		CHECK_FOR_INTERRUPTS();

		snprintf(file_name, MAXPGPATH, "fake_file_%d", iter);

		switch (file_type)
		{
			case 0:
			case 1:
				;
#if USE_ASSERT_CHECKING
				bfz_t	   *bfz_file =
#endif
				bfz_create(file_name, true /* delOnClose */ , file_type);

				Assert(NULL != bfz_file);
				break;

			case 2:
				;
#if USE_ASSERT_CHECKING
				BufFile    *buf_file =
#endif
				BufFileCreateTemp(file_name, false /* interXact */ );

				Assert(NULL != buf_file);
				break;

			default:
				Assert(false && "argument for fault type not supported");
		}

		iter++;
	}

	return;
}
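Since the loop above only terminates by erroring out (or being interrupted), a caller would run it expecting failure. A sketch of such a wrapper, assuming the standard PG_TRY error-handling macros; the wrapper name is hypothetical.

#include "postgres.h"

/* Hypothetical test wrapper: returns true if the expected error occurred. */
static bool
expect_out_of_handles(int file_type)
{
	bool		got_error = false;

	PG_TRY();
	{
		open_many_files(file_type);		/* never returns normally */
	}
	PG_CATCH();
	{
		got_error = true;
		FlushErrorState();				/* discard the error and continue */
	}
	PG_END_TRY();

	return got_error;
}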
static void
tuplestore_puttuple_common(Tuplestorestate *state, void *tuple)
{
	TSReadPointer *readptr;
	int			i;
	ResourceOwner oldowner;

	state->tuples++;

	switch (state->status)
	{
		case TSS_INMEM:

			/*
			 * Update read pointers as needed; see API spec above.
			 */
			readptr = state->readptrs;
			for (i = 0; i < state->readptrcount; readptr++, i++)
			{
				if (readptr->eof_reached && i != state->activeptr)
				{
					readptr->eof_reached = false;
					readptr->current = state->memtupcount;
				}
			}

			/*
			 * Grow the array as needed. Note that we try to grow the array
			 * when there is still one free slot remaining --- if we fail,
			 * there'll still be room to store the incoming tuple, and then
			 * we'll switch to tape-based operation.
			 */
			if (state->memtupcount >= state->memtupsize - 1)
			{
				(void) grow_memtuples(state);
				Assert(state->memtupcount < state->memtupsize);
			}

			/* Stash the tuple in the in-memory array */
			state->memtuples[state->memtupcount++] = tuple;

			/*
			 * Done if we still fit in available memory and have array slots.
			 */
			if (state->memtupcount < state->memtupsize && !LACKMEM(state))
				return;

			/*
			 * Nope; time to switch to tape-based operation. Make sure that
			 * the temp file(s) are created in suitable temp tablespaces.
			 */
			PrepareTempTablespaces();

			/* associate the file with the store's resource owner */
			oldowner = CurrentResourceOwner;
			CurrentResourceOwner = state->resowner;

			state->myfile = BufFileCreateTemp(state->interXact);

			CurrentResourceOwner = oldowner;

			/*
			 * Freeze the decision about whether trailing length words will
			 * be used. We can't change this choice once data is on tape,
			 * even though callers might drop the requirement.
			 */
			state->backward = (state->eflags & EXEC_FLAG_BACKWARD) != 0;
			state->status = TSS_WRITEFILE;
			dumptuples(state);
			break;
		case TSS_WRITEFILE:

			/*
			 * Update read pointers as needed; see API spec above. Note:
			 * BufFileTell is quite cheap, so not worth trying to avoid
			 * multiple calls.
			 */
			readptr = state->readptrs;
			for (i = 0; i < state->readptrcount; readptr++, i++)
			{
				if (readptr->eof_reached && i != state->activeptr)
				{
					readptr->eof_reached = false;
					BufFileTell(state->myfile,
								&readptr->file, &readptr->offset);
				}
			}

			WRITETUP(state, tuple);
			break;
		case TSS_READFILE:

			/*
			 * Switch from reading to writing.
			 */
			if (!state->readptrs[state->activeptr].eof_reached)
				BufFileTell(state->myfile,
							&state->readptrs[state->activeptr].file,
							&state->readptrs[state->activeptr].offset);
			if (BufFileSeek(state->myfile,
							state->writepos_file, state->writepos_offset,
							SEEK_SET) != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not seek in tuplestore temporary file: %m")));
			state->status = TSS_WRITEFILE;

			/*
			 * Update read pointers as needed; see API spec above.
			 */
			readptr = state->readptrs;
			for (i = 0; i < state->readptrcount; readptr++, i++)
			{
				if (readptr->eof_reached && i != state->activeptr)
				{
					readptr->eof_reached = false;
					readptr->file = state->writepos_file;
					readptr->offset = state->writepos_offset;
				}
			}

			WRITETUP(state, tuple);
			break;
		default:
			elog(ERROR, "invalid tuplestore state");
			break;
	}
}
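For context, a sketch of the public tuplestore API that funnels into tuplestore_puttuple_common above (tuplestore_begin_heap and friends from PostgreSQL's tuplestore.h); the caller and slot are assumed. It fills a store and scans it back, letting the store spill through BufFileCreateTemp once the in-memory budget is exceeded.

#include "postgres.h"
#include "utils/tuplestore.h"

static void
store_and_replay(TupleTableSlot *slot)
{
	/* randomAccess = true, interXact = false, 64 kB memory budget */
	Tuplestorestate *ts = tuplestore_begin_heap(true, false, 64);

	tuplestore_puttupleslot(ts, slot);	/* may trigger the spill path above */

	while (tuplestore_gettupleslot(ts, true, false, slot))
	{
		/* process each stored tuple */
	}

	tuplestore_end(ts);
}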
/*
 * Initialize GiST build buffers.
 */
GISTBuildBuffers *
gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
{
	GISTBuildBuffers *gfbb;
	HASHCTL		hashCtl;

	gfbb = palloc(sizeof(GISTBuildBuffers));
	gfbb->pagesPerBuffer = pagesPerBuffer;
	gfbb->levelStep = levelStep;

	/*
	 * Create a temporary file to hold buffer pages that are swapped out of
	 * memory.
	 */
	gfbb->pfile = BufFileCreateTemp(false);
	gfbb->nFileBlocks = 0;

	/* Initialize free page management. */
	gfbb->nFreeBlocks = 0;
	gfbb->freeBlocksLen = 32;
	gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long));

	/*
	 * Current memory context will be used for all in-memory data structures
	 * of buffers which are persistent during buffering build.
	 */
	gfbb->context = CurrentMemoryContext;

	/*
	 * The nodeBuffersTab hash is an association between index blocks and
	 * their buffers.
	 */
	hashCtl.keysize = sizeof(BlockNumber);
	hashCtl.entrysize = sizeof(GISTNodeBuffer);
	hashCtl.hcxt = CurrentMemoryContext;
	hashCtl.hash = tag_hash;
	hashCtl.match = memcmp;
	gfbb->nodeBuffersTab = hash_create("gistbuildbuffers",
									   1024,
									   &hashCtl,
									   HASH_ELEM | HASH_CONTEXT |
									   HASH_FUNCTION | HASH_COMPARE);

	gfbb->bufferEmptyingQueue = NIL;

	/*
	 * Per-level node buffers lists for the final buffer-emptying process.
	 * Node buffers are inserted here when they are created.
	 */
	gfbb->buffersOnLevelsLen = 1;
	gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) *
											 gfbb->buffersOnLevelsLen);
	gfbb->buffersOnLevels[0] = NIL;

	/*
	 * Block numbers of node buffers whose last pages are currently loaded
	 * into main memory.
	 */
	gfbb->loadedBuffersLen = 32;
	gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen *
													 sizeof(GISTNodeBuffer *));
	gfbb->loadedBuffersCount = 0;

	gfbb->rootlevel = maxLevel;

	return gfbb;
}
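A sketch of how the nodeBuffersTab built above is typically consulted, using dynahash's hash_search; the GISTNodeBuffer field names initialized here are illustrative assumptions, not verbatim from the GiST build code.

#include "postgres.h"
#include "utils/hsearch.h"

/* Look up (or create) the node buffer for an index block. */
static GISTNodeBuffer *
get_node_buffer(GISTBuildBuffers *gfbb, BlockNumber blkno)
{
	bool		found;
	GISTNodeBuffer *nodeBuffer;

	nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab,
												&blkno, HASH_ENTER, &found);
	if (!found)
	{
		/* new entry: everything past the key is uninitialized */
		nodeBuffer->pageBuffer = NULL;	/* field names illustrative */
		nodeBuffer->blocksCount = 0;
	}
	return nodeBuffer;
}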
/*
 * Create a set of logical tapes in a temporary underlying file.
 *
 * Each tape is initialized in write state.  Serial callers pass ntapes,
 * NULL argument for shared, and -1 for worker.  Parallel worker callers
 * pass ntapes, a shared file handle, NULL shared argument, and their own
 * worker number.  Leader callers, which claim shared worker tapes here,
 * must supply non-sentinel values for all arguments except worker number,
 * which should be -1.
 *
 * Leader caller is passing back an array of metadata each worker captured
 * when LogicalTapeFreeze() was called for their final result tapes.  Passed
 * tapes array is actually sized ntapes - 1, because it includes only
 * worker tapes, whereas leader requires its own leader tape.  Note that we
 * rely on the assumption that reclaimed worker tapes will only be read
 * from once by leader, and never written to again (tapes are initialized
 * for writing, but that's only to be consistent).  Leader may not write to
 * its own tape purely due to a restriction in the shared buffile
 * infrastructure that may be lifted in the future.
 */
LogicalTapeSet *
LogicalTapeSetCreate(int ntapes, TapeShare *shared, SharedFileSet *fileset,
					 int worker)
{
	LogicalTapeSet *lts;
	LogicalTape *lt;
	int			i;

	/*
	 * Create top-level struct including per-tape LogicalTape structs.
	 */
	Assert(ntapes > 0);
	lts = (LogicalTapeSet *) palloc(offsetof(LogicalTapeSet, tapes) +
									ntapes * sizeof(LogicalTape));
	lts->nBlocksAllocated = 0L;
	lts->nBlocksWritten = 0L;
	lts->nHoleBlocks = 0L;
	lts->forgetFreeSpace = false;
	lts->blocksSorted = true;	/* a zero-length array is sorted ... */
	lts->freeBlocksLen = 32;	/* reasonable initial guess */
	lts->freeBlocks = (long *) palloc(lts->freeBlocksLen * sizeof(long));
	lts->nFreeBlocks = 0;
	lts->nTapes = ntapes;

	/*
	 * Initialize per-tape structs. Note we allocate the I/O buffer and the
	 * first block for a tape only when it is first actually written to. This
	 * avoids wasting memory space when tuplesort.c overestimates the number
	 * of tapes needed.
	 */
	for (i = 0; i < ntapes; i++)
	{
		lt = &lts->tapes[i];
		lt->writing = true;
		lt->frozen = false;
		lt->dirty = false;
		lt->firstBlockNumber = -1L;
		lt->curBlockNumber = -1L;
		lt->nextBlockNumber = -1L;
		lt->offsetBlockNumber = 0L;
		lt->buffer = NULL;
		lt->buffer_size = 0;
		/* palloc() larger than MaxAllocSize would fail */
		lt->max_size = MaxAllocSize;
		lt->pos = 0;
		lt->nbytes = 0;
	}

	/*
	 * Create temp BufFile storage as required.
	 *
	 * Leader concatenates worker tapes, which requires special adjustment to
	 * final tapeset data.  Things are simpler for the worker case and the
	 * serial case, though.  They are generally very similar -- workers use a
	 * shared fileset, whereas serial sorts use a conventional serial BufFile.
	 */
	if (shared)
		ltsConcatWorkerTapes(lts, shared, fileset);
	else if (fileset)
	{
		char		filename[MAXPGPATH];

		pg_itoa(worker, filename);
		lts->pfile = BufFileCreateShared(fileset, filename);
	}
	else
		lts->pfile = BufFileCreateTemp(false);

	return lts;
}
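The header comment above describes three call patterns. A sketch that makes them explicit; the dispatcher is hypothetical, and is_leader/worker_no are assumed to come from the caller's parallel-sort coordination code.

#include "postgres.h"
#include "utils/logtape.h"

/* Hypothetical dispatcher over the three documented call patterns. */
static LogicalTapeSet *
create_tapes_for_role(int ntapes, TapeShare *tapeshares,
					  SharedFileSet *fileset, int worker_no, bool is_leader)
{
	if (is_leader)
		return LogicalTapeSetCreate(ntapes, tapeshares, fileset, -1);	/* leader */
	else if (fileset != NULL)
		return LogicalTapeSetCreate(ntapes, NULL, fileset, worker_no);	/* worker */
	else
		return LogicalTapeSetCreate(ntapes, NULL, NULL, -1);			/* serial */
}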
static void
tuplestore_puttuple_common(Tuplestorestate *state, TuplestorePos *pos,
						   void *tuple)
{
	ResourceOwner oldowner;

	switch (state->status)
	{
		case TSS_INMEM:

			/*
			 * Grow the array as needed. Note that we try to grow the array
			 * when there is still one free slot remaining --- if we fail,
			 * there'll still be room to store the incoming tuple, and then
			 * we'll switch to tape-based operation.
			 */
			if (state->memtupcount >= state->memtupsize - 1)
			{
				/*
				 * See grow_memtuples() in tuplesort.c for the rationale
				 * behind these two tests.
				 */
				if (state->availMem > (long) (state->memtupsize * sizeof(void *)) &&
					(Size) (state->memtupsize * 2) < MaxAllocSize / sizeof(void *))
				{
					FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
					state->memtupsize *= 2;
					state->memtuples = (void **)
						repalloc(state->memtuples,
								 state->memtupsize * sizeof(void *));
					USEMEM(state, GetMemoryChunkSpace(state->memtuples));
				}
			}

			/* Stash the tuple in the in-memory array */
			state->memtuples[state->memtupcount++] = tuple;

			/* If eof_reached, keep read position in sync */
			if (pos->eof_reached)
				pos->current = state->memtupcount;

			/*
			 * Done if we still fit in available memory and have array slots.
			 */
			if (state->memtupcount < state->memtupsize && !LACKMEM(state))
				return;

			/*
			 * Nope; time to switch to tape-based operation. Make sure that
			 * the temp file(s) are created in suitable temp tablespaces.
			 */
			PrepareTempTablespaces();

			/* associate the file with the store's resource owner */
			oldowner = CurrentResourceOwner;
			CurrentResourceOwner = state->resowner;

			{
				char		tmpprefix[50];

				snprintf(tmpprefix, 50, "slice%d_tuplestore", currentSliceId);
				state->myfile = BufFileCreateTemp(tmpprefix, state->interXact);
			}

			CurrentResourceOwner = oldowner;

			state->status = TSS_WRITEFILE;
			dumptuples(state, pos);
			break;
		case TSS_WRITEFILE:
			WRITETUP(state, pos, tuple);
			break;
		case TSS_READFILE:

			/*
			 * Switch from reading to writing.
			 */
			if (!pos->eof_reached)
				BufFileTell(state->myfile, &pos->readpos_offset);
			if (BufFileSeek(state->myfile, pos->writepos_offset,
							SEEK_SET) != 0)
				elog(ERROR, "seek to EOF failed");
			state->status = TSS_WRITEFILE;
			WRITETUP(state, pos, tuple);
			break;
		default:
			elog(ERROR, "invalid tuplestore state");
			break;
	}
}
static void
tuplestore_puttuple_common(Tuplestorestate *state, void *tuple)
{
	TSReadPointer *readptr;
	int			i;
	ResourceOwner oldowner;

	switch (state->status)
	{
		case TSS_INMEM:

			/*
			 * Update read pointers as needed; see API spec above.
			 */
			readptr = state->readptrs;
			for (i = 0; i < state->readptrcount; readptr++, i++)
			{
				if (readptr->eof_reached && i != state->activeptr)
				{
					readptr->eof_reached = false;
					readptr->current = state->memtupcount;
				}
			}

			/*
			 * Grow the array as needed. Note that we try to grow the array
			 * when there is still one free slot remaining --- if we fail,
			 * there'll still be room to store the incoming tuple, and then
			 * we'll switch to tape-based operation.
			 */
			if (state->memtupcount >= state->memtupsize - 1)
			{
				/*
				 * See grow_memtuples() in tuplesort.c for the rationale
				 * behind these two tests.
				 */
				if (state->availMem > (long) (state->memtupsize * sizeof(void *)) &&
					(Size) (state->memtupsize * 2) < MaxAllocSize / sizeof(void *))
				{
					FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
					state->memtupsize *= 2;
					state->memtuples = (void **)
						repalloc(state->memtuples,
								 state->memtupsize * sizeof(void *));
					USEMEM(state, GetMemoryChunkSpace(state->memtuples));
					if (LACKMEM(state))
						elog(ERROR, "unexpected out-of-memory situation in tuplestore");
				}
			}

			/* Stash the tuple in the in-memory array */
			state->memtuples[state->memtupcount++] = tuple;

			/*
			 * Done if we still fit in available memory and have array slots.
			 */
			if (state->memtupcount < state->memtupsize && !LACKMEM(state))
				return;

			/*
			 * Nope; time to switch to tape-based operation. Make sure that
			 * the temp file(s) are created in suitable temp tablespaces.
			 */
			PrepareTempTablespaces();

			/* associate the file with the store's resource owner */
			oldowner = CurrentResourceOwner;
			CurrentResourceOwner = state->resowner;

			{
				char		tmpprefix[50];

				snprintf(tmpprefix, 50, "slice%d_tuplestore", currentSliceId);
				state->myfile = BufFileCreateTemp(tmpprefix, state->interXact);
			}

			CurrentResourceOwner = oldowner;

			/*
			 * Freeze the decision about whether trailing length words will
			 * be used. We can't change this choice once data is on tape,
			 * even though callers might drop the requirement.
			 */
			state->backward = (state->eflags & EXEC_FLAG_BACKWARD) != 0;
			state->status = TSS_WRITEFILE;
			dumptuples(state);
			break;
		case TSS_WRITEFILE:

			/*
			 * Update read pointers as needed; see API spec above. Note:
			 * BufFileTell is quite cheap, so not worth trying to avoid
			 * multiple calls.
			 */
			readptr = state->readptrs;
			for (i = 0; i < state->readptrcount; readptr++, i++)
			{
				if (readptr->eof_reached && i != state->activeptr)
				{
					readptr->eof_reached = false;
					BufFileTell(state->myfile,
								&readptr->file, &readptr->offset);
				}
			}

			WRITETUP(state, tuple);
			break;
		case TSS_READFILE:

			/*
			 * Switch from reading to writing.
			 */
			if (!state->readptrs[state->activeptr].eof_reached)
				BufFileTell(state->myfile,
							&state->readptrs[state->activeptr].file,
							&state->readptrs[state->activeptr].offset);
			if (BufFileSeek(state->myfile,
							state->writepos_file, state->writepos_offset,
							SEEK_SET) != 0)
				elog(ERROR, "tuplestore seek to EOF failed");
			state->status = TSS_WRITEFILE;

			/*
			 * Update read pointers as needed; see API spec above.
			 */
			readptr = state->readptrs;
			for (i = 0; i < state->readptrcount; readptr++, i++)
			{
				if (readptr->eof_reached && i != state->activeptr)
				{
					readptr->eof_reached = false;
					readptr->file = state->writepos_file;
					readptr->offset = state->writepos_offset;
				}
			}

			WRITETUP(state, tuple);
			break;
		default:
			elog(ERROR, "invalid tuplestore state");
			break;
	}
}
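The read-pointer bookkeeping in the loops above exists so that several consumers can scan one store independently. A sketch using PostgreSQL's read-pointer API (tuplestore_alloc_read_pointer and tuplestore_select_read_pointer), assuming a store and slot set up elsewhere: each pointer keeps its own position, so reading with one does not advance the other.

#include "postgres.h"
#include "utils/tuplestore.h"

static void
two_independent_readers(Tuplestorestate *ts, TupleTableSlot *slot)
{
	int			ptr2 = tuplestore_alloc_read_pointer(ts, 0);

	/* read pointer 0 (the default) advances past the first tuple */
	tuplestore_select_read_pointer(ts, 0);
	(void) tuplestore_gettupleslot(ts, true, false, slot);

	/* the second pointer is unaffected and still starts at the beginning */
	tuplestore_select_read_pointer(ts, ptr2);
	(void) tuplestore_gettupleslot(ts, true, false, slot);
}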