/* Combiner function for rbtree.c */
static void
ginCombineData(RBNode *existing, const RBNode *newdata, void *arg)
{
    EntryAccumulator *eo = (EntryAccumulator *) existing;
    const EntryAccumulator *en = (const EntryAccumulator *) newdata;
    BuildAccumulator *accum = (BuildAccumulator *) arg;

    /*
     * Note this code assumes that newdata contains only one itempointer.
     */
    if (eo->number >= eo->length)
    {
        accum->allocatedMemory -= GetMemoryChunkSpace(eo->list);
        eo->length *= 2;
        eo->list = (ItemPointerData *)
            repalloc(eo->list, sizeof(ItemPointerData) * eo->length);
        accum->allocatedMemory += GetMemoryChunkSpace(eo->list);
    }

    /* If item pointers are not ordered, they will need to be sorted. */
    if (eo->shouldSort == FALSE)
    {
        int         res;

        res = compareItemPointers(eo->list + eo->number - 1, en->list);
        Assert(res != 0);

        if (res > 0)
            eo->shouldSort = TRUE;
    }

    eo->list[eo->number] = en->list[0];
    eo->number++;
}
/*
 * tuplestore_begin_xxx
 *
 * Initialize for a tuple store operation.
 */
static Tuplestorestate *
tuplestore_begin_common(int eflags, bool interXact, int maxKBytes)
{
    Tuplestorestate *state;

    state = (Tuplestorestate *) palloc0(sizeof(Tuplestorestate));

    state->status = TSS_INMEM;
    state->eflags = eflags;
    state->interXact = interXact;
    state->availMem = maxKBytes * 1024L;
    state->availMemMin = state->availMem;
    state->allowedMem = state->availMem;
    state->myfile = NULL;

    state->context = CurrentMemoryContext;
    state->resowner = CurrentResourceOwner;

    state->memtupcount = 0;
    state->memtupsize = 1024;   /* initial guess */
    state->memtuples = (void **) palloc(state->memtupsize * sizeof(void *));

    state->pos.eof_reached = false;
    state->pos.current = 0;

    USEMEM(state, GetMemoryChunkSpace(state->memtuples));

    state->eof_reached = false;
    state->current = 0;

    return state;
}
static void
writetup_heap(Tuplestorestate *state, TuplestorePos *pos, void *tup)
{
    uint32      tuplen = 0;
    Size        memsize = 0;

    if (is_heaptuple_memtuple((HeapTuple) tup))
        tuplen = memtuple_get_size((MemTuple) tup, NULL);
    else
    {
        Assert(!is_heaptuple_splitter((HeapTuple) tup));
        tuplen = heaptuple_get_size((HeapTuple) tup);
    }

    if (BufFileWrite(state->myfile, (void *) tup, tuplen) != (size_t) tuplen)
        elog(ERROR, "write failed");
    if (state->eflags & EXEC_FLAG_BACKWARD)     /* need trailing length word? */
        if (BufFileWrite(state->myfile, (void *) &tuplen,
                         sizeof(tuplen)) != sizeof(tuplen))
            elog(ERROR, "write failed");

    memsize = GetMemoryChunkSpace(tup);

    state->spilledBytes += memsize;
    FREEMEM(state, memsize);

    pfree(tup);
}
static void
writetup_heap(Tuplestorestate *state, void *tup)
{
    MinimalTuple tuple = (MinimalTuple) tup;

    /* the part of the MinimalTuple we'll write: */
    char       *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
    unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET;

    /* total on-disk footprint: */
    unsigned int tuplen = tupbodylen + sizeof(int);

    if (BufFileWrite(state->myfile, (void *) &tuplen,
                     sizeof(tuplen)) != sizeof(tuplen))
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not write to tuplestore temporary file: %m")));
    if (BufFileWrite(state->myfile, (void *) tupbody,
                     tupbodylen) != (size_t) tupbodylen)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not write to tuplestore temporary file: %m")));
    if (state->backward)        /* need trailing length word? */
        if (BufFileWrite(state->myfile, (void *) &tuplen,
                         sizeof(tuplen)) != sizeof(tuplen))
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not write to tuplestore temporary file: %m")));

    FREEMEM(state, GetMemoryChunkSpace(tuple));
    heap_free_minimal_tuple(tuple);
}
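/*
 * Illustrative note (not part of the source): writetup_heap above lays each
 * record out as [uint32 tuplen][tuple body], with a second copy of tuplen
 * appended when state->backward is set.  The stored length word counts
 * itself plus the body.  Using hypothetical numbers, a MinimalTuple with
 * t_len = 56 and MINIMAL_TUPLE_DATA_OFFSET = 16 has tupbodylen = 40 and a
 * stored tuplen of 40 + sizeof(int) = 44; the matching readtup_heap inverts
 * this, recomputing tupbodylen = len - sizeof(int) = 40 and an in-memory
 * t_len of 40 + 16 = 56.
 */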
/*
 * tuplestore_clear
 *
 * Delete all the contents of a tuplestore, and reset its read pointers
 * to the start.
 */
void
tuplestore_clear(Tuplestorestate *state)
{
    int         i;
    TSReadPointer *readptr;

    if (state->myfile)
        BufFileClose(state->myfile);
    state->myfile = NULL;
    if (state->memtuples)
    {
        for (i = state->memtupdeleted; i < state->memtupcount; i++)
        {
            FREEMEM(state, GetMemoryChunkSpace(state->memtuples[i]));
            pfree(state->memtuples[i]);
        }
    }
    state->status = TSS_INMEM;
    state->truncated = false;
    state->memtupdeleted = 0;
    state->memtupcount = 0;
    state->tuples = 0;
    readptr = state->readptrs;
    for (i = 0; i < state->readptrcount; readptr++, i++)
    {
        readptr->eof_reached = false;
        readptr->current = 0;
    }
}
/*
 * tuplestore_begin_xxx
 *
 * Initialize for a tuple store operation.
 */
static Tuplestorestate *
tuplestore_begin_common(int eflags, bool interXact, int maxKBytes)
{
    Tuplestorestate *state;

    state = (Tuplestorestate *) palloc0(sizeof(Tuplestorestate));

    state->status = TSS_INMEM;
    state->eflags = eflags;
    state->interXact = interXact;
    state->truncated = false;
    state->availMem = maxKBytes * 1024L;
    state->myfile = NULL;
    state->context = CurrentMemoryContext;
    state->resowner = CurrentResourceOwner;

    state->memtupcount = 0;
    state->memtupsize = 1024;   /* initial guess */
    state->memtuples = (void **) palloc(state->memtupsize * sizeof(void *));

    USEMEM(state, GetMemoryChunkSpace(state->memtuples));

    state->activeptr = 0;
    state->readptrcount = 1;
    state->readptrsize = 8;     /* arbitrary */
    state->readptrs = (TSReadPointer *)
        palloc(state->readptrsize * sizeof(TSReadPointer));

    state->readptrs[0].eflags = eflags;
    state->readptrs[0].eof_reached = false;
    state->readptrs[0].current = 0;

    return state;
}
static void *
copytup_heap(Tuplestorestate *state, void *tup)
{
    MinimalTuple tuple;

    tuple = minimal_tuple_from_heap_tuple((HeapTuple) tup);
    USEMEM(state, GetMemoryChunkSpace(tuple));
    return (void *) tuple;
}
static void *
readtup_heap(Tuplestorestate *state, unsigned int len)
{
    void       *tup = NULL;
    uint32      tuplen = 0;

    if (is_len_memtuplen(len))
        tuplen = memtuple_size_from_uint32(len);
    else
    {
        /*
         * len is HeapTuple.t_len; the record size includes the rest of the
         * HeapTuple fields.
         */
        tuplen = len + HEAPTUPLESIZE;
    }

    tup = (void *) palloc(tuplen);
    USEMEM(state, GetMemoryChunkSpace(tup));

    if (is_len_memtuplen(len))
    {
        /* read in the tuple proper */
        memtuple_set_mtlen((MemTuple) tup, len);

        if (BufFileRead(state->myfile,
                        (void *) ((char *) tup + sizeof(uint32)),
                        tuplen - sizeof(uint32)) != (size_t) (tuplen - sizeof(uint32)))
            insist_log(false, "unexpected end of data");
    }
    else
    {
        HeapTuple   htup = (HeapTuple) tup;

        htup->t_len = tuplen - HEAPTUPLESIZE;

        if (BufFileRead(state->myfile,
                        (void *) ((char *) tup + sizeof(uint32)),
                        tuplen - sizeof(uint32)) != (size_t) (tuplen - sizeof(uint32)))
            insist_log(false, "unexpected end of data");

        htup->t_data = (HeapTupleHeader) ((char *) tup + HEAPTUPLESIZE);
    }

    if (state->backward)        /* need trailing length word? */
    {
        if (BufFileRead(state->myfile, (void *) &tuplen,
                        sizeof(tuplen)) != sizeof(tuplen))
            insist_log(false, "unexpected end of data");
    }

    return (void *) tup;
}
/* Combiner function for rbtree.c */
static void
ginCombineData(RBNode *existing, const RBNode *newdata, void *arg)
{
    GinEntryAccumulator *eo = (GinEntryAccumulator *) existing;
    const GinEntryAccumulator *en = (const GinEntryAccumulator *) newdata;
    BuildAccumulator *accum = (BuildAccumulator *) arg;

    /*
     * Note this code assumes that newdata contains only one itempointer.
     */
    if (eo->count >= eo->maxcount)
    {
        if (eo->maxcount > INT_MAX)
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("posting list is too long"),
                     errhint("Reduce maintenance_work_mem.")));

        accum->allocatedMemory -= GetMemoryChunkSpace(eo->list);
        eo->maxcount *= 2;
        eo->list = (ItemPointerData *)
            repalloc_huge(eo->list, sizeof(ItemPointerData) * eo->maxcount);
        accum->allocatedMemory += GetMemoryChunkSpace(eo->list);
    }

    /* If item pointers are not ordered, they will need to be sorted later */
    if (eo->shouldSort == FALSE)
    {
        int         res;

        res = ginCompareItemPointers(eo->list + eo->count - 1, en->list);
        Assert(res != 0);

        if (res > 0)
            eo->shouldSort = TRUE;
    }

    eo->list[eo->count] = en->list[0];
    eo->count++;
}
/*
 * Similar to tuplestore_puttuple(), but work from values + nulls arrays.
 * This avoids an extra tuple-construction operation.
 */
void
tuplestore_putvalues(Tuplestorestate *state, TupleDesc tdesc,
                     Datum *values, bool *isnull)
{
    MinimalTuple tuple;
    MemoryContext oldcxt = MemoryContextSwitchTo(state->context);

    tuple = heap_form_minimal_tuple(tdesc, values, isnull);
    USEMEM(state, GetMemoryChunkSpace(tuple));

    tuplestore_puttuple_common(state, (void *) tuple);

    MemoryContextSwitchTo(oldcxt);
}
/*
 * Accept one tuple and append it to the tuplestore.
 *
 * Note that the input tuple is always copied; the caller need not save it.
 *
 * If the active read pointer is currently "at EOF", it remains so (the read
 * pointer implicitly advances along with the write pointer); otherwise the
 * read pointer is unchanged.  Non-active read pointers do not move, which
 * means they are certain to not be "at EOF" immediately after puttuple.
 * This curious-seeming behavior is for the convenience of nodeMaterial.c and
 * nodeCtescan.c, which would otherwise need to do extra pointer
 * repositioning steps.
 *
 * tuplestore_puttupleslot() is a convenience routine to collect data from
 * a TupleTableSlot without an extra copy operation.
 */
void
tuplestore_puttupleslot(Tuplestorestate *state,
                        TupleTableSlot *slot)
{
    MinimalTuple tuple;

    /*
     * Form a MinimalTuple in working memory
     */
    tuple = ExecCopySlotMinimalTuple(slot);
    USEMEM(state, GetMemoryChunkSpace(tuple));

    tuplestore_puttuple_common(state, (void *) tuple);
}
/*
 * This is basically the same as datumCopy(), but extended to count
 * palloc'd space in accum->allocatedMemory.
 */
static Datum
getDatumCopy(BuildAccumulator *accum, OffsetNumber attnum, Datum value)
{
    Form_pg_attribute att = accum->ginstate->origTupdesc->attrs[attnum - 1];
    Datum       res;

    if (att->attbyval)
        res = value;
    else
    {
        res = datumCopy(value, false, att->attlen);
        accum->allocatedMemory += GetMemoryChunkSpace(DatumGetPointer(res));
    }
    return res;
}
/*
 * Accept one tuple and append it to the tuplestore.
 *
 * Note that the input tuple is always copied; the caller need not save it.
 *
 * If the read status is currently "AT EOF" then it remains so (the read
 * pointer advances along with the write pointer); otherwise the read
 * pointer is unchanged.  This is for the convenience of nodeMaterial.c.
 *
 * tuplestore_puttupleslot() is a convenience routine to collect data from
 * a TupleTableSlot without an extra copy operation.
 */
void
tuplestore_puttupleslot_pos(Tuplestorestate *state, TuplestorePos *pos,
                            TupleTableSlot *slot)
{
    MemTuple    tuple;
    MemoryContext oldcxt = MemoryContextSwitchTo(state->context);

    /*
     * Form a MemTuple in working memory
     */
    tuple = ExecCopySlotMemTuple(slot);
    USEMEM(state, GetMemoryChunkSpace(tuple));

    tuplestore_puttuple_common(state, pos, (void *) tuple);

    MemoryContextSwitchTo(oldcxt);
}
/*
 * tuplestore_begin_xxx
 *
 * Initialize for a tuple store operation.
 */
static Tuplestorestate *
tuplestore_begin_common(int eflags, bool interXact, int maxKBytes)
{
    Tuplestorestate *state;

    state = (Tuplestorestate *) palloc0(sizeof(Tuplestorestate));

    state->status = TSS_INMEM;
    state->eflags = eflags;
    state->interXact = interXact;
    state->truncated = false;
    state->allowedMem = maxKBytes * 1024L;
    state->availMem = state->allowedMem;
    state->myfile = NULL;
    state->context = CurrentMemoryContext;
    state->resowner = CurrentResourceOwner;

    state->memtupdeleted = 0;
    state->memtupcount = 0;
    state->tuples = 0;

    /*
     * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
     * see comments in grow_memtuples().
     */
    state->memtupsize = Max(16384 / sizeof(void *),
                            ALLOCSET_SEPARATE_THRESHOLD / sizeof(void *) + 1);

    state->growmemtuples = true;
    state->memtuples = (void **) palloc(state->memtupsize * sizeof(void *));

    USEMEM(state, GetMemoryChunkSpace(state->memtuples));

    state->activeptr = 0;
    state->readptrcount = 1;
    state->readptrsize = 8;     /* arbitrary */
    state->readptrs = (TSReadPointer *)
        palloc(state->readptrsize * sizeof(TSReadPointer));

    state->readptrs[0].eflags = eflags;
    state->readptrs[0].eof_reached = false;
    state->readptrs[0].current = 0;

    return state;
}
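/*
 * A minimal usage sketch (not from this file) of the public API built on
 * tuplestore_begin_common.  It assumes the standard entry points
 * tuplestore_begin_heap, tuplestore_putvalues, tuplestore_gettupleslot, and
 * tuplestore_end from this era of the code; tupdesc, values, and isnull
 * stand in for caller-supplied data.
 */
static void
tuplestore_usage_sketch(TupleDesc tupdesc, Datum *values, bool *isnull)
{
    Tuplestorestate *ts;
    TupleTableSlot *slot;

    /* randomAccess = false, interXact = false, spill past work_mem kB */
    ts = tuplestore_begin_heap(false, false, work_mem);

    /* append one row; switches to a temp BufFile once memory runs out */
    tuplestore_putvalues(ts, tupdesc, values, isnull);

    /* read everything back; copy = false returns direct tuple pointers */
    slot = MakeSingleTupleTableSlot(tupdesc);
    while (tuplestore_gettupleslot(ts, true, false, slot))
    {
        /* ... process slot ... */
    }

    ExecDropSingleTupleTableSlot(slot);
    tuplestore_end(ts);
}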
/*
 * Find/store one entry from indexed value.
 */
static void
ginInsertBAEntry(BuildAccumulator *accum,
                 ItemPointer heapptr, OffsetNumber attnum,
                 Datum key, GinNullCategory category)
{
    GinEntryAccumulator eatmp;
    GinEntryAccumulator *ea;
    bool        isNew;

    /*
     * For the moment, fill only the fields of eatmp that will be looked at
     * by cmpEntryAccumulator or ginCombineData.
     */
    eatmp.attnum = attnum;
    eatmp.key = key;
    eatmp.category = category;
    /* temporarily set up single-entry itempointer list */
    eatmp.list = heapptr;

    ea = (GinEntryAccumulator *) rb_insert(accum->tree, (RBNode *) &eatmp,
                                           &isNew);

    if (isNew)
    {
        /*
         * Finish initializing new tree entry, including making permanent
         * copies of the datum (if it's not null) and itempointer.
         */
        if (category == GIN_CAT_NORM_KEY)
            ea->key = getDatumCopy(accum, attnum, key);
        ea->maxcount = DEF_NPTR;
        ea->count = 1;
        ea->shouldSort = FALSE;
        ea->list =
            (ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR);
        ea->list[0] = *heapptr;
        accum->allocatedMemory += GetMemoryChunkSpace(ea->list);
    }
    else
    {
        /*
         * ginCombineData did everything needed.
         */
    }
}
/*
 * tuplestore_trim - remove all but ntuples tuples before current
 */
static void
tuplestore_trim(Tuplestorestate *state, int ntuples)
{
    int         nremove;
    int         i;

    /*
     * We don't bother trimming temp files since it usually would mean more
     * work than just letting them sit in kernel buffers until they age out.
     */
    if (state->status != TSS_INMEM)
        return;

    nremove = state->current - ntuples;
    if (nremove <= 0)
        return;                 /* nothing to do */
    Assert(nremove <= state->memtupcount);

    /* Release no-longer-needed tuples */
    for (i = 0; i < nremove; i++)
    {
        FREEMEM(state, GetMemoryChunkSpace(state->memtuples[i]));
        pfree(state->memtuples[i]);
    }

    /*
     * Slide the array down and readjust pointers.  This may look pretty
     * stupid, but we expect that there will usually not be very many
     * tuple-pointers to move, so this isn't that expensive; and it keeps a
     * lot of other logic simple.
     *
     * In fact, in the current usage for merge joins, it's demonstrable that
     * there will always be exactly one non-removed tuple; so optimize that
     * case.
     */
    if (nremove + 1 == state->memtupcount)
        state->memtuples[0] = state->memtuples[nremove];
    else
        memmove(state->memtuples, state->memtuples + nremove,
                (state->memtupcount - nremove) * sizeof(void *));

    state->memtupcount -= nremove;
    state->current -= nremove;
    state->markpos_current -= nremove;
}
/*
 * Similar to tuplestore_puttuple(), but work from values + nulls arrays.
 * This avoids an extra tuple-construction operation.
 */
void
tuplestore_putvalues(Tuplestorestate *state, TupleDesc tdesc,
                     Datum *values, bool *isnull)
{
    MemTuple    tuple;
    MemoryContext oldcxt = MemoryContextSwitchTo(state->context);

    if (!state->mt_bind)
    {
        state->mt_bind = create_memtuple_binding(tdesc);
        Assert(state->mt_bind);
    }

    tuple = memtuple_form_to(state->mt_bind, values, isnull, NULL, NULL, false);
    USEMEM(state, GetMemoryChunkSpace(tuple));

    tuplestore_puttuple_common(state, (void *) tuple);

    MemoryContextSwitchTo(oldcxt);
}
static void *
readtup_heap(Tuplestorestate *state, unsigned int len)
{
    unsigned int tupbodylen = len - sizeof(int);
    unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
    MinimalTuple tuple = (MinimalTuple) palloc(tuplen);
    char       *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;

    USEMEM(state, GetMemoryChunkSpace(tuple));
    /* read in the tuple proper */
    tuple->t_len = tuplen;
    if (BufFileRead(state->myfile, (void *) tupbody,
                    tupbodylen) != (size_t) tupbodylen)
        elog(ERROR, "unexpected end of data");
    if (state->backward)        /* need trailing length word? */
        if (BufFileRead(state->myfile, (void *) &tuplen,
                        sizeof(tuplen)) != sizeof(tuplen))
            elog(ERROR, "unexpected end of data");
    return (void *) tuple;
}
/*
 * Find/store one entry from indexed value.
 */
static void
ginInsertEntry(BuildAccumulator *accum, ItemPointer heapptr,
               OffsetNumber attnum, Datum entry)
{
    EntryAccumulator key;
    EntryAccumulator *ea;
    bool        isNew;

    /*
     * For the moment, fill only the fields of key that will be looked at by
     * cmpEntryAccumulator or ginCombineData.
     */
    key.attnum = attnum;
    key.value = entry;
    /* temporarily set up single-entry itempointer list */
    key.list = heapptr;

    ea = (EntryAccumulator *) rb_insert(accum->tree, (RBNode *) &key, &isNew);

    if (isNew)
    {
        /*
         * Finish initializing new tree entry, including making permanent
         * copies of the datum and itempointer.
         */
        ea->value = getDatumCopy(accum, attnum, entry);
        ea->length = DEF_NPTR;
        ea->number = 1;
        ea->shouldSort = FALSE;
        ea->list =
            (ItemPointerData *) palloc(sizeof(ItemPointerData) * DEF_NPTR);
        ea->list[0] = *heapptr;
        accum->allocatedMemory += GetMemoryChunkSpace(ea->list);
    }
    else
    {
        /*
         * ginCombineData did everything needed.
         */
    }
}
/* Allocator function for rbtree.c */
static RBNode *
ginAllocEntryAccumulator(void *arg)
{
    BuildAccumulator *accum = (BuildAccumulator *) arg;
    GinEntryAccumulator *ea;

    /*
     * Allocate memory by rather big chunks to decrease overhead.  We have
     * no need to reclaim RBNodes individually, so this costs nothing.
     */
    if (accum->entryallocator == NULL || accum->eas_used >= DEF_NENTRY)
    {
        accum->entryallocator = palloc(sizeof(GinEntryAccumulator) * DEF_NENTRY);
        accum->allocatedMemory += GetMemoryChunkSpace(accum->entryallocator);
        accum->eas_used = 0;
    }

    /* Allocate new RBNode from current chunk */
    ea = accum->entryallocator + accum->eas_used;
    accum->eas_used++;

    return (RBNode *) ea;
}
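/*
 * For context, a sketch of how the rbtree callbacks above get wired up
 * (modeled on the corresponding ginInitBA, which is not shown here; the
 * rb_create signature is assumed from the same-era rbtree.c).  The
 * BuildAccumulator is passed through as the opaque callback argument that
 * ginCombineData and ginAllocEntryAccumulator receive as "arg".
 */
static void
ginInitBA_sketch(BuildAccumulator *accum)
{
    accum->allocatedMemory = 0;
    accum->entryallocator = NULL;
    accum->eas_used = 0;
    accum->tree = rb_create(sizeof(GinEntryAccumulator),
                            cmpEntryAccumulator,
                            ginCombineData,
                            ginAllocEntryAccumulator,
                            NULL,       /* no freefunc needed */
                            (void *) accum);
}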
static void *
readtup_heap(Tuplestorestate *state, unsigned int len)
{
    unsigned int tupbodylen = len - sizeof(int);
    unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
    MinimalTuple tuple = (MinimalTuple) palloc(tuplen);
    char       *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;

    USEMEM(state, GetMemoryChunkSpace(tuple));
    /* read in the tuple proper */
    tuple->t_len = tuplen;
    if (BufFileRead(state->myfile, (void *) tupbody,
                    tupbodylen) != (size_t) tupbodylen)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not read from tuplestore temporary file: %m")));
    if (state->backward)        /* need trailing length word? */
        if (BufFileRead(state->myfile, (void *) &tuplen,
                        sizeof(tuplen)) != sizeof(tuplen))
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read from tuplestore temporary file: %m")));
    return (void *) tuple;
}
static void
tuplestore_puttuple_common(Tuplestorestate *state, void *tuple)
{
    TSReadPointer *readptr;
    int         i;
    ResourceOwner oldowner;

    switch (state->status)
    {
        case TSS_INMEM:

            /*
             * Update read pointers as needed; see API spec above.
             */
            readptr = state->readptrs;
            for (i = 0; i < state->readptrcount; readptr++, i++)
            {
                if (readptr->eof_reached && i != state->activeptr)
                {
                    readptr->eof_reached = false;
                    readptr->current = state->memtupcount;
                }
            }

            /*
             * Grow the array as needed.  Note that we try to grow the array
             * when there is still one free slot remaining --- if we fail,
             * there'll still be room to store the incoming tuple, and then
             * we'll switch to tape-based operation.
             */
            if (state->memtupcount >= state->memtupsize - 1)
            {
                /*
                 * See grow_memtuples() in tuplesort.c for the rationale
                 * behind these two tests.
                 */
                if (state->availMem > (long) (state->memtupsize * sizeof(void *)) &&
                    (Size) (state->memtupsize * 2) < MaxAllocSize / sizeof(void *))
                {
                    FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
                    state->memtupsize *= 2;
                    state->memtuples = (void **)
                        repalloc(state->memtuples,
                                 state->memtupsize * sizeof(void *));
                    USEMEM(state, GetMemoryChunkSpace(state->memtuples));
                }
            }

            /* Stash the tuple in the in-memory array */
            state->memtuples[state->memtupcount++] = tuple;

            /*
             * Done if we still fit in available memory and have array slots.
             */
            if (state->memtupcount < state->memtupsize && !LACKMEM(state))
                return;

            /*
             * Nope; time to switch to tape-based operation.  Make sure that
             * the temp file(s) are created in suitable temp tablespaces.
             */
            PrepareTempTablespaces();

            /* associate the file with the store's resource owner */
            oldowner = CurrentResourceOwner;
            CurrentResourceOwner = state->resowner;

            state->myfile = BufFileCreateTemp(state->interXact);

            CurrentResourceOwner = oldowner;

            /*
             * Freeze the decision about whether trailing length words will
             * be used.  We can't change this choice once data is on tape,
             * even though callers might drop the requirement.
             */
            state->backward = (state->eflags & EXEC_FLAG_BACKWARD) != 0;
            state->status = TSS_WRITEFILE;
            dumptuples(state);
            break;
        case TSS_WRITEFILE:

            /*
             * Update read pointers as needed; see API spec above.  Note:
             * BufFileTell is quite cheap, so not worth trying to avoid
             * multiple calls.
             */
            readptr = state->readptrs;
            for (i = 0; i < state->readptrcount; readptr++, i++)
            {
                if (readptr->eof_reached && i != state->activeptr)
                {
                    readptr->eof_reached = false;
                    BufFileTell(state->myfile,
                                &readptr->file, &readptr->offset);
                }
            }

            WRITETUP(state, tuple);
            break;
        case TSS_READFILE:

            /*
             * Switch from reading to writing.
             */
            if (!state->readptrs[state->activeptr].eof_reached)
                BufFileTell(state->myfile,
                            &state->readptrs[state->activeptr].file,
                            &state->readptrs[state->activeptr].offset);
            if (BufFileSeek(state->myfile,
                            state->writepos_file, state->writepos_offset,
                            SEEK_SET) != 0)
                elog(ERROR, "tuplestore seek to EOF failed");
            state->status = TSS_WRITEFILE;

            /*
             * Update read pointers as needed; see API spec above.
             */
            readptr = state->readptrs;
            for (i = 0; i < state->readptrcount; readptr++, i++)
            {
                if (readptr->eof_reached && i != state->activeptr)
                {
                    readptr->eof_reached = false;
                    readptr->file = state->writepos_file;
                    readptr->offset = state->writepos_offset;
                }
            }

            WRITETUP(state, tuple);
            break;
        default:
            elog(ERROR, "invalid tuplestore state");
            break;
    }
}
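/*
 * Summary note (not part of the source): tuplestore_puttuple_common drives
 * the store's state machine.  TSS_INMEM transitions one-way to
 * TSS_WRITEFILE when memory fills; after that, writes and reads flip
 * between TSS_WRITEFILE and TSS_READFILE, saving and restoring BufFile seek
 * positions as shown above.  Apart from tuplestore_clear, a spilled store
 * never returns to TSS_INMEM.
 */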
/*
 * Grow the memtuples[] array, if possible within our memory constraint.  We
 * must not exceed INT_MAX tuples in memory or the caller-provided memory
 * limit.  Return TRUE if we were able to enlarge the array, FALSE if not.
 *
 * Normally, at each increment we double the size of the array.  When doing
 * that would exceed a limit, we attempt one last, smaller increase (and then
 * clear the growmemtuples flag so we don't try any more).  That allows us to
 * use memory as fully as permitted; sticking to the pure doubling rule could
 * result in almost half going unused.  Because availMem moves around with
 * tuple addition/removal, we need some rule to prevent making repeated small
 * increases in memtupsize, which would just be useless thrashing.  The
 * growmemtuples flag accomplishes that and also prevents useless
 * recalculations in this function.
 */
static bool
grow_memtuples(Tuplestorestate *state)
{
    int         newmemtupsize;
    int         memtupsize = state->memtupsize;
    int64       memNowUsed = state->allowedMem - state->availMem;

    /* Forget it if we've already maxed out memtuples, per comment above */
    if (!state->growmemtuples)
        return false;

    /* Select new value of memtupsize */
    if (memNowUsed <= state->availMem)
    {
        /*
         * We've used no more than half of allowedMem; double our usage,
         * clamping at INT_MAX tuples.
         */
        if (memtupsize < INT_MAX / 2)
            newmemtupsize = memtupsize * 2;
        else
        {
            newmemtupsize = INT_MAX;
            state->growmemtuples = false;
        }
    }
    else
    {
        /*
         * This will be the last increment of memtupsize.  Abandon doubling
         * strategy and instead increase as much as we safely can.
         *
         * To stay within allowedMem, we can't increase memtupsize by more
         * than availMem / sizeof(void *) elements.  In practice, we want to
         * increase it by considerably less, because we need to leave some
         * space for the tuples to which the new array slots will refer.  We
         * assume the new tuples will be about the same size as the tuples
         * we've already seen, and thus we can extrapolate from the space
         * consumption so far to estimate an appropriate new size for the
         * memtuples array.  The optimal value might be higher or lower than
         * this estimate, but it's hard to know that in advance.  We again
         * clamp at INT_MAX tuples.
         *
         * This calculation is safe against enlarging the array so much that
         * LACKMEM becomes true, because the memory currently used includes
         * the present array; thus, there would be enough allowedMem for the
         * new array elements even if no other memory were currently used.
         *
         * We do the arithmetic in float8, because otherwise the product of
         * memtupsize and allowedMem could overflow.  Any inaccuracy in the
         * result should be insignificant; but even if we computed a
         * completely insane result, the checks below will prevent anything
         * really bad from happening.
         */
        double      grow_ratio;

        grow_ratio = (double) state->allowedMem / (double) memNowUsed;
        if (memtupsize * grow_ratio < INT_MAX)
            newmemtupsize = (int) (memtupsize * grow_ratio);
        else
            newmemtupsize = INT_MAX;

        /* We won't make any further enlargement attempts */
        state->growmemtuples = false;
    }

    /* Must enlarge array by at least one element, else report failure */
    if (newmemtupsize <= memtupsize)
        goto noalloc;

    /*
     * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize.  Clamp
     * to ensure our request won't be rejected.  Note that we can easily
     * exhaust address space before facing this outcome.  (This is presently
     * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but
     * don't rely on that at this distance.)
     */
    if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(void *))
    {
        newmemtupsize = (int) (MaxAllocHugeSize / sizeof(void *));
        state->growmemtuples = false;   /* can't grow any more */
    }

    /*
     * We need to be sure that we do not cause LACKMEM to become true, else
     * the space management algorithm will go nuts.  The code above should
     * never generate a dangerous request, but to be safe, check explicitly
     * that the array growth fits within availMem.  (We could still cause
     * LACKMEM if the memory chunk overhead associated with the memtuples
     * array were to increase.  That shouldn't happen because we chose the
     * initial array size large enough to ensure that palloc will be
     * treating both old and new arrays as separate chunks.  But we'll check
     * LACKMEM explicitly below just in case.)
     */
    if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(void *)))
        goto noalloc;

    /* OK, do it */
    FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
    state->memtupsize = newmemtupsize;
    state->memtuples = (void **)
        repalloc_huge(state->memtuples,
                      state->memtupsize * sizeof(void *));
    USEMEM(state, GetMemoryChunkSpace(state->memtuples));
    if (LACKMEM(state))
        elog(ERROR, "unexpected out-of-memory situation in tuplestore");
    return true;

noalloc:
    /* If for any reason we didn't realloc, shut off future attempts */
    state->growmemtuples = false;
    return false;
}
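/*
 * Worked example with illustrative numbers (not from the source): suppose
 * allowedMem = 4 MB, availMem = 1 MB (so memNowUsed = 3 MB), and
 * memtupsize = 100000.  Since memNowUsed > availMem, doubling would
 * overshoot, so grow_ratio = 4.0 / 3.0 and the final newmemtupsize is about
 * 133333 slots rather than 200000; growmemtuples is then cleared so no
 * further enlargement is attempted.
 */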
/*
 * tuplestore_trim - remove all no-longer-needed tuples
 *
 * Calling this function authorizes the tuplestore to delete all tuples
 * before the oldest read pointer, if no read pointer is marked as requiring
 * REWIND capability.
 *
 * Note: this is obviously safe if no pointer has BACKWARD capability either.
 * If a pointer is marked as BACKWARD but not REWIND capable, it means that
 * the pointer can be moved backward but not before the oldest other read
 * pointer.
 */
void
tuplestore_trim(Tuplestorestate *state)
{
    int         oldest;
    int         nremove;
    int         i;

    /*
     * Truncation is disallowed if any read pointer requires rewind
     * capability.
     */
    if (state->eflags & EXEC_FLAG_REWIND)
        return;

    /*
     * We don't bother trimming temp files since it usually would mean more
     * work than just letting them sit in kernel buffers until they age out.
     */
    if (state->status != TSS_INMEM)
        return;

    /* Find the oldest read pointer */
    oldest = state->memtupcount;
    for (i = 0; i < state->readptrcount; i++)
    {
        if (!state->readptrs[i].eof_reached)
            oldest = Min(oldest, state->readptrs[i].current);
    }

    /*
     * Note: you might think we could remove all the tuples before the
     * oldest "current", since that one is the next to be returned.
     * However, since tuplestore_gettuple returns a direct pointer to our
     * internal copy of the tuple, it's likely that the caller has still got
     * the tuple just before "current" referenced in a slot.  So we keep one
     * extra tuple before the oldest "current".  (Strictly speaking, we
     * could require such callers to use the "copy" flag to
     * tuplestore_gettupleslot, but for efficiency we allow this one case to
     * not use "copy".)
     */
    nremove = oldest - 1;
    if (nremove <= 0)
        return;                 /* nothing to do */

    Assert(nremove >= state->memtupdeleted);
    Assert(nremove <= state->memtupcount);

    /* Release no-longer-needed tuples */
    for (i = state->memtupdeleted; i < nremove; i++)
    {
        FREEMEM(state, GetMemoryChunkSpace(state->memtuples[i]));
        pfree(state->memtuples[i]);
        state->memtuples[i] = NULL;
    }
    state->memtupdeleted = nremove;

    /* mark tuplestore as truncated (used for Assert crosschecks only) */
    state->truncated = true;

    /*
     * If nremove is less than 1/8th memtupcount, just stop here, leaving
     * the "deleted" slots as NULL.  This prevents us from expending O(N^2)
     * time repeatedly memmove-ing a large pointer array.  The worst case
     * space wastage is pretty small, since it's just pointers and not whole
     * tuples.
     */
    if (nremove < state->memtupcount / 8)
        return;

    /*
     * Slide the array down and readjust pointers.
     *
     * In mergejoin's current usage, it's demonstrable that there will
     * always be exactly one non-removed tuple; so optimize that case.
     */
    if (nremove + 1 == state->memtupcount)
        state->memtuples[0] = state->memtuples[nremove];
    else
        memmove(state->memtuples, state->memtuples + nremove,
                (state->memtupcount - nremove) * sizeof(void *));

    state->memtupdeleted = 0;
    state->memtupcount -= nremove;
    for (i = 0; i < state->readptrcount; i++)
    {
        if (!state->readptrs[i].eof_reached)
            state->readptrs[i].current -= nremove;
    }
}
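/*
 * Illustrative note (hypothetical numbers): with memtupcount = 80000 and
 * nremove = 5000, nremove < memtupcount / 8 (= 10000), so the function
 * returns after NULLing the freed slots; the pointer array is compacted
 * only once at least an eighth of it is dead, which keeps total memmove
 * work linear instead of O(N^2) under repeated small trims.
 */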
static void
tuplestore_puttuple_common(Tuplestorestate *state, TuplestorePos *pos,
                           void *tuple)
{
    ResourceOwner oldowner;

    switch (state->status)
    {
        case TSS_INMEM:

            /*
             * Grow the array as needed.  Note that we try to grow the array
             * when there is still one free slot remaining --- if we fail,
             * there'll still be room to store the incoming tuple, and then
             * we'll switch to tape-based operation.
             */
            if (state->memtupcount >= state->memtupsize - 1)
            {
                /*
                 * See grow_memtuples() in tuplesort.c for the rationale
                 * behind these two tests.
                 */
                if (state->availMem > (long) (state->memtupsize * sizeof(void *)) &&
                    (Size) (state->memtupsize * 2) < MaxAllocSize / sizeof(void *))
                {
                    FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
                    state->memtupsize *= 2;
                    state->memtuples = (void **)
                        repalloc(state->memtuples,
                                 state->memtupsize * sizeof(void *));
                    USEMEM(state, GetMemoryChunkSpace(state->memtuples));
                }
            }

            /* Stash the tuple in the in-memory array */
            state->memtuples[state->memtupcount++] = tuple;

            /* If eof_reached, keep read position in sync */
            if (pos->eof_reached)
                pos->current = state->memtupcount;

            /*
             * Done if we still fit in available memory and have array slots.
             */
            if (state->memtupcount < state->memtupsize && !LACKMEM(state))
                return;

            /*
             * Nope; time to switch to tape-based operation.  Make sure that
             * the temp file(s) are created in suitable temp tablespaces.
             */
            PrepareTempTablespaces();

            /* associate the file with the store's resource owner */
            oldowner = CurrentResourceOwner;
            CurrentResourceOwner = state->resowner;

            {
                char        tmpprefix[50];

                snprintf(tmpprefix, 50, "slice%d_tuplestore", currentSliceId);
                state->myfile = BufFileCreateTemp(tmpprefix, state->interXact);
            }

            CurrentResourceOwner = oldowner;

            state->status = TSS_WRITEFILE;
            dumptuples(state, pos);
            break;
        case TSS_WRITEFILE:
            WRITETUP(state, pos, tuple);
            break;
        case TSS_READFILE:

            /*
             * Switch from reading to writing.
             */
            if (!pos->eof_reached)
                BufFileTell(state->myfile, &pos->readpos_offset);
            if (BufFileSeek(state->myfile, pos->writepos_offset,
                            SEEK_SET) != 0)
                elog(ERROR, "seek to EOF failed");
            state->status = TSS_WRITEFILE;
            WRITETUP(state, pos, tuple);
            break;
        default:
            elog(ERROR, "invalid tuplestore state");
            break;
    }
}
/*
 * Fetch the next tuple in either forward or back direction.
 * Returns NULL if no more tuples.  If should_free is set, the
 * caller must pfree the returned tuple when done with it.
 *
 * Backward scan is only allowed if randomAccess was set true or
 * EXEC_FLAG_BACKWARD was specified to tuplestore_set_eflags().
 */
static void *
tuplestore_gettuple(Tuplestorestate *state, TuplestorePos *pos, bool forward,
                    bool *should_free)
{
    uint32      tuplen;
    void       *tup;

    Assert(forward || (state->eflags & EXEC_FLAG_BACKWARD));

    switch (state->status)
    {
        case TSS_INMEM:
            *should_free = false;
            if (forward)
            {
                if (pos->current < state->memtupcount)
                    return state->memtuples[pos->current++];
                pos->eof_reached = true;
                return NULL;
            }
            else
            {
                if (pos->current <= 0)
                    return NULL;

                /*
                 * if all tuples are fetched already then we return last
                 * tuple, else - tuple before last returned.
                 */
                if (pos->eof_reached)
                    pos->eof_reached = false;
                else
                {
                    pos->current--; /* last returned tuple */
                    if (pos->current <= 0)
                        return NULL;
                }
                return state->memtuples[pos->current - 1];
            }
            break;
        case TSS_WRITEFILE:
            /* Skip state change if we'll just return NULL */
            if (pos->eof_reached && forward)
                return NULL;

            /*
             * Switch from writing to reading.
             */
            BufFileTell(state->myfile, &pos->writepos_offset);
            if (!pos->eof_reached)
                if (BufFileSeek(state->myfile, pos->readpos_offset,
                                SEEK_SET) != 0)
                    elog(ERROR, "seek failed");
            state->status = TSS_READFILE;
            /* FALL THRU into READFILE case */

        case TSS_READFILE:
            *should_free = true;
            if (forward)
            {
                if ((tuplen = getlen(state, pos, true)) != 0)
                {
                    tup = READTUP(state, pos, tuplen);

                    /* CDB XXX XXX XXX XXX */
                    /*
                     * MPP-1347: EXPLAIN ANALYZE shows runaway memory usage.
                     * Readtup does a usemem, but the free happens in
                     * ExecStoreTuple.  Do a free so state->availMem doesn't
                     * go massively negative to screw up stats.  It would be
                     * better to interrogate the heap for actual memory
                     * usage than use this homemade accounting.
                     */
                    FREEMEM(state, GetMemoryChunkSpace(tup));
                    /* CDB XXX XXX XXX XXX */
                    return tup;
                }
                else
                {
                    pos->eof_reached = true;
                    return NULL;
                }
            }

            /*
             * Backward.
             *
             * if all tuples are fetched already then we return last tuple,
             * else - tuple before last returned.
             *
             * Back up to fetch previously-returned tuple's ending length
             * word.  If seek fails, assume we are at start of file.
             */
            insist_log(false, "Backward scanning of tuplestores is not supported at this time");

            if (BufFileSeek(state->myfile,
                            -(long) sizeof(uint32) /* offset */,
                            SEEK_CUR) != 0)
                return NULL;
            tuplen = getlen(state, pos, false);

            if (pos->eof_reached)
            {
                pos->eof_reached = false;
                /* We will return the tuple returned before returning NULL */
            }
            else
            {
                /*
                 * Back up to get ending length word of tuple before it.
                 */
                if (BufFileSeek(state->myfile,
                                -(long) (tuplen + 2 * sizeof(uint32)) /* offset */,
                                SEEK_CUR) != 0)
                {
                    /*
                     * If that fails, presumably the prev tuple is the first
                     * in the file.  Back up so that it becomes next to read
                     * in forward direction (not obviously right, but that
                     * is what in-memory case does).
                     */
                    if (BufFileSeek(state->myfile,
                                    -(long) (tuplen + sizeof(uint32)) /* offset */,
                                    SEEK_CUR) != 0)
                        elog(ERROR, "bogus tuple length in backward scan");
                    return NULL;
                }
                tuplen = getlen(state, pos, false);
            }

            /*
             * Now we have the length of the prior tuple, back up and read
             * it.  Note: READTUP expects we are positioned after the
             * initial length word of the tuple, so back up to that point.
             */
            if (BufFileSeek(state->myfile,
                            -(long) tuplen /* offset */,
                            SEEK_CUR) != 0)
                elog(ERROR, "bogus tuple length in backward scan");
            tup = READTUP(state, pos, tuplen);
            return tup;

        default:
            elog(ERROR, "invalid tuplestore state");
            return NULL;        /* keep compiler quiet */
    }
}
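/*
 * Illustrative note (not part of the source): with trailing length words
 * enabled, each spilled record ends in a uint32 copy of its length.
 * Positioned at the end of record T, seeking back
 * tuplen + 2 * sizeof(uint32) bytes skips all of record T (its data plus
 * trailing word) and lands on the trailing length word of record T-1,
 * which the next getlen() call then reads.
 */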
/*
 * tuplestore_trim - remove all no-longer-needed tuples
 *
 * Calling this function authorizes the tuplestore to delete all tuples
 * before the oldest read pointer, if no read pointer is marked as requiring
 * REWIND capability.
 *
 * Note: this is obviously safe if no pointer has BACKWARD capability either.
 * If a pointer is marked as BACKWARD but not REWIND capable, it means that
 * the pointer can be moved backward but not before the oldest other read
 * pointer.
 */
void
tuplestore_trim(Tuplestorestate *state)
{
    int         oldest;
    int         nremove;
    int         i;

    /*
     * Truncation is disallowed if any read pointer requires rewind
     * capability.
     */
    if (state->eflags & EXEC_FLAG_REWIND)
        return;

    /*
     * We don't bother trimming temp files since it usually would mean more
     * work than just letting them sit in kernel buffers until they age out.
     */
    if (state->status != TSS_INMEM)
        return;

    /* Find the oldest read pointer */
    oldest = state->memtupcount;
    for (i = 0; i < state->readptrcount; i++)
    {
        if (!state->readptrs[i].eof_reached)
            oldest = Min(oldest, state->readptrs[i].current);
    }

    /*
     * Note: you might think we could remove all the tuples before the
     * oldest "current", since that one is the next to be returned.
     * However, since tuplestore_gettuple returns a direct pointer to our
     * internal copy of the tuple, it's likely that the caller has still got
     * the tuple just before "current" referenced in a slot.  So we keep one
     * extra tuple before the oldest "current".  (Strictly speaking, we
     * could require such callers to use the "copy" flag to
     * tuplestore_gettupleslot, but for efficiency we allow this one case to
     * not use "copy".)
     */
    nremove = oldest - 1;
    if (nremove <= 0)
        return;                 /* nothing to do */
    Assert(nremove <= state->memtupcount);

    /* Release no-longer-needed tuples */
    for (i = 0; i < nremove; i++)
    {
        FREEMEM(state, GetMemoryChunkSpace(state->memtuples[i]));
        pfree(state->memtuples[i]);
    }

    /*
     * Slide the array down and readjust pointers.  This may look pretty
     * stupid, but we expect that there will usually not be very many
     * tuple-pointers to move, so this isn't that expensive; and it keeps a
     * lot of other logic simple.
     *
     * In fact, in the current usage for merge joins, it's demonstrable that
     * there will always be exactly one non-removed tuple; so optimize that
     * case.
     */
    if (nremove + 1 == state->memtupcount)
        state->memtuples[0] = state->memtuples[nremove];
    else
        memmove(state->memtuples, state->memtuples + nremove,
                (state->memtupcount - nremove) * sizeof(void *));

    state->memtupcount -= nremove;
    for (i = 0; i < state->readptrcount; i++)
    {
        if (!state->readptrs[i].eof_reached)
            state->readptrs[i].current -= nremove;
    }

    /* mark tuplestore as truncated (used for Assert crosschecks only) */
    state->truncated = true;
}