/* ----------------------------------------------------------------
 *		WorkTableScanNext
 *
 *		Per-tuple fetch routine used by ExecWorkTableScan.
 *
 *		Reads the next tuple of the recursive union's working table into
 *		the node's scan slot and returns that slot.  The boolean result of
 *		tuplestore_gettupleslot is deliberately ignored: the scan slot is
 *		returned whether or not a tuple was fetched.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
WorkTableScanNext(WorkTableScanState *node)
{
	TupleTableSlot *scanslot = node->ss.ss_ScanTupleSlot;
	Tuplestorestate *worktable;

	/*
	 * Backward scan is intentionally unsupported.  Supporting it would cost
	 * only a few lines here, but nodeRecursiveunion.c would then have to
	 * create the tuplestore with backward scan enabled, which carries a
	 * performance cost.  A worktable plan node cannot appear high enough in
	 * the plan tree of a scrollable cursor to be exposed to a backward-scan
	 * request, so the feature would never be useful in practice.
	 *
	 * This node is also assumed to be the only reader of the worktable.
	 * Hence no private read pointer is set up for the tuplestore, and
	 * tuplestore_gettupleslot is not asked to copy the tuple.
	 */
	Assert(ScanDirectionIsForward(node->ss.ps.state->es_direction));

	worktable = node->rustate->working_table;

	/* Forward fetch, no copy; the slot carries the outcome to the caller. */
	(void) tuplestore_gettupleslot(worktable, true, false, scanslot);

	return scanslot;
}
/*
 * parquet_getnext
 *
 * Fetch the next tuple of a parquet scan into 'slot'.  Only forward scans
 * are supported (asserted).  When every split of the relation has been
 * consumed, the slot is cleared and the function returns.
 *
 * For each tuple produced, a synthetic tuple id is built from the segment
 * file number of the split currently being processed and a running row
 * counter, and stored into the slot as its ctid.
 */
void
parquet_getnext(ParquetScanDesc scan, ScanDirection direction, TupleTableSlot *slot)
{
	AOTupleId	rowId;

	Assert(ScanDirectionIsForward(direction));

	for (;;)
	{
		if (scan->bufferDone)
		{
			/*
			 * The current row group is used up.  Keep asking for the next
			 * one until we either obtain a row group to process or learn
			 * that all splits ('segment' files) have been read.
			 */
			while (!getNextRowGroup(scan))
			{
				if (!scan->pqs_done_all_splits)
					continue;

				/* whole relation has been read: report end of scan */
				ExecClearTuple(slot);
				return;
			}

			scan->bufferDone = false;
		}

		if (ParquetRowGroupReader_ScanNextTuple(scan->pqs_tupDesc,
												&scan->rowGroupReader,
												scan->hawqAttrToParquetColChunks,
												scan->proj,
												slot))
		{
			/* segment file number of the split currently being processed */
			int			segno = ((FileSplitNode *) list_nth(scan->splits,
															scan->pqs_splits_processed - 1))->segno;

			AOTupleIdInit_Init(&rowId);
			AOTupleIdInit_segmentFileNum(&rowId, segno);

			/* row number is the running count within the current segment */
			scan->cur_seg_row += 1;
			AOTupleIdInit_rowNum(&rowId, scan->cur_seg_row);

			/* expose the tuple id through the slot as a fake ctid */
			scan->cdb_fake_ctid = *((ItemPointer) &rowId);
			slot_set_ctid(slot, &(scan->cdb_fake_ctid));
			return;
		}

		/* no more items in this row group; fetch a new one next time round */
		scan->bufferDone = true;
	}
}
/* ------------------------------------------------------------------
 * 	ExecShareInputScan
 * 	Retrieve a tuple from the ShareInputScan
 *
 * 	On the first call (node->ts_state == NULL) the tuplestore state is
 * 	initialized.  Each call then fetches one tuple, in the executor's
 * 	current scan direction, into the node's result slot.
 *
 * 	Returns the result slot when a tuple was fetched, or NULL when the
 * 	underlying store reports that no tuple is available.
 * ------------------------------------------------------------------
 */
TupleTableSlot *
ExecShareInputScan(ShareInputScanState *node)
{
	ShareInputScan *sisc = (ShareInputScan *) node->ss.ps.plan;
	ShareType	share_type = sisc->share_type;
	bool		forward = ScanDirectionIsForward(node->ss.ps.state->es_direction);
	TupleTableSlot *slot;
	bool		gotOK;

	/* if first time call, need to initialize the tuplestore state.  */
	if (node->ts_state == NULL)
	{
		elog(DEBUG1, "SISC (shareid=%d, slice=%d): No tuplestore yet, initializing tuplestore",
			 sisc->share_id, currentSliceId);
		init_tuplestore_state(node);
	}

	slot = node->ss.ps.ps_ResultTupleSlot;

	if (share_type == SHARE_MATERIAL || share_type == SHARE_MATERIAL_XSLICE)
	{
		/* materialized share: move the accessor one step, then read there */
		ntuplestore_acc_advance((NTupleStoreAccessor *) node->ts_pos, forward ? 1 : -1);
		gotOK = ntuplestore_acc_current_tupleslot((NTupleStoreAccessor *) node->ts_pos, slot);
	}
	else
	{
		/* otherwise the shared data lives in a tuplesort */
		gotOK = tuplesort_gettupleslot_pos(node->ts_state->sortstore,
										   (TuplesortPos *) node->ts_pos,
										   forward, slot, CurrentMemoryContext);
	}

	if (!gotOK)
		return NULL;

	SIMPLE_FAULT_INJECTOR(ExecShareInputNext);

	return slot;
}
/* ----------------------------------------------------------------
 *		FunctionNext
 *
 *		Per-tuple fetch routine used by ExecFunctionScan.
 *
 *		The first call runs the function and captures its whole result in
 *		a tuplestore; every call (including the first) then reads one
 *		tuple from that tuplestore in the current scan direction.
 *
 *		Returns the scan slot holding the tuple, or the cleared scan slot
 *		when the tuplestore has no further tuple.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
FunctionNext(FunctionScanState *node)
{
	ScanDirection scandir = node->ss.ps.state->es_direction;
	Tuplestorestate *store = node->tuplestorestate;
	TupleTableSlot *scanslot;
	HeapTuple	tup;
	bool		free_tup;

	/* No tuplestore yet: evaluate the function and materialize its output. */
	if (store == NULL)
	{
		TupleDesc	returned_desc;

		store = ExecMakeTableFunctionResult(node->funcexpr,
											node->ss.ps.ps_ExprContext,
											node->tupdesc,
											&returned_desc);
		node->tuplestorestate = store;

		/*
		 * Cross-check the tupdesc if the function supplied one.  Strictly
		 * this matters only for functions returning RECORD, but doing it
		 * unconditionally is harmless.
		 */
		if (returned_desc)
			tupledesc_match(node->tupdesc, returned_desc);
	}

	/* Pull the next tuple in the requested direction. */
	tup = tuplestore_getheaptuple(store,
								  ScanDirectionIsForward(scandir),
								  &free_tup);
	scanslot = node->ss.ss_ScanTupleSlot;

	if (!tup)
		return ExecClearTuple(scanslot);

	return ExecStoreTuple(tup, scanslot, InvalidBuffer, free_tup);
}
/* ----------------------------------------------------------------
 *		FunctionNext
 *
 *		Per-tuple fetch routine used by ExecFunctionScan.
 *
 *		The first call runs the function and captures its whole result in
 *		a tuplestore; every call then reads one tuple from that tuplestore
 *		in the current scan direction and hands it to ExecStoreTuple
 *		together with the scan slot.
 *
 *		Raises ERRCODE_DATATYPE_MISMATCH if the function reports a tuple
 *		descriptor that tupledesc_match rejects.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
FunctionNext(FunctionScanState *node)
{
	ScanDirection scandir = node->ss.ps.state->es_direction;
	Tuplestorestate *store = node->tuplestorestate;
	TupleTableSlot *scanslot;
	HeapTuple	tup;
	bool		free_tup;

	/* No tuplestore yet: evaluate the function and materialize its output. */
	if (store == NULL)
	{
		TupleDesc	returned_desc;

		store = ExecMakeTableFunctionResult(node->funcexpr,
											node->ss.ps.ps_ExprContext,
											node->tupdesc,
											&returned_desc);
		node->tuplestorestate = store;

		/*
		 * Cross-check the tupdesc if the function supplied one.  Strictly
		 * this matters only for functions returning RECORD, but doing it
		 * unconditionally is harmless.
		 */
		if (returned_desc)
		{
			if (!tupledesc_match(node->tupdesc, returned_desc))
				ereport(ERROR,
						(errcode(ERRCODE_DATATYPE_MISMATCH),
						 errmsg("query-specified return row and actual function return row do not match")));
		}
	}

	/*
	 * Pull the next tuple in the requested direction.  The fetch result is
	 * passed to ExecStoreTuple unconditionally, including when no tuple
	 * came back.
	 */
	tup = tuplestore_getheaptuple(store,
								  ScanDirectionIsForward(scandir),
								  &free_tup);
	scanslot = node->ss.ss_ScanTupleSlot;

	return ExecStoreTuple(tup, scanslot, InvalidBuffer, free_tup);
}
/*
 * Step from current page.
 *
 * Moves from 'buffer' to its right sibling (forward scan) or left sibling
 * (any other direction), following the rightlink/leftlink stored in the
 * page's opaque area.
 *
 * The next page is read before 'buffer' is unlocked and released; the new
 * buffer is then locked with 'lockmode'.
 *
 * Returns the locked sibling buffer, or InvalidBuffer when there is no
 * sibling in that direction (in which case 'buffer' has still been
 * unlocked and released).  Raises an error if the sibling's leaf/data
 * kind differs from the starting page or if the sibling is marked deleted.
 */
Buffer
rumStep(Buffer buffer, Relation index, int lockmode,
		ScanDirection scanDirection)
{
	Buffer		nextbuffer;
	Page		page = BufferGetPage(buffer);
	bool		isLeaf = RumPageIsLeaf(page);
	bool		isData = RumPageIsData(page);
	bool		forward = ScanDirectionIsForward(scanDirection);
	const char *side = forward ? "right" : "left";	/* for error messages */
	BlockNumber blkno;

	blkno = forward ?
		RumPageGetOpaque(page)->rightlink :
		RumPageGetOpaque(page)->leftlink;

	if (blkno == InvalidBlockNumber)
	{
		UnlockReleaseBuffer(buffer);
		return InvalidBuffer;
	}

	/* Read the sibling first, then let go of the page we came from. */
	nextbuffer = ReadBuffer(index, blkno);
	UnlockReleaseBuffer(buffer);
	LockBuffer(nextbuffer, lockmode);

	/*
	 * Sanity check that the page we stepped to is of similar kind.  Name the
	 * side we actually stepped to, so that a failure during a leftward step
	 * is not misreported as a problem with the right sibling.
	 */
	page = BufferGetPage(nextbuffer);
	if (isLeaf != RumPageIsLeaf(page) || isData != RumPageIsData(page))
		elog(ERROR, "%s sibling of RUM page is of different type", side);

	/*
	 * Given the proper lock sequence above, we should never land on a deleted
	 * page.
	 */
	if (RumPageIsDeleted(page))
		elog(ERROR, "%s sibling of RUM page was deleted", side);

	return nextbuffer;
}
/* ----------------------------------------------------------------
 *	   ExecAppend
 *
 *		Return the next tuple from the sequence of subplans.
 *
 *		Tuples are drawn from the current subplan until it is exhausted;
 *		then the next (or, for a backward scan, previous) subplan is
 *		selected.  When no subplan remains, the cleared result slot that
 *		ExecInitAppend set up is returned.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecAppend(AppendState *node)
{
	TupleTableSlot *tuple;

	/* Try the subplan we are currently positioned on. */
	tuple = ExecProcNode(node->appendplans[node->as_whichplan]);

	while (TupIsNull(tuple))
	{
		/*
		 * Current subplan is exhausted: step to the neighbouring subplan in
		 * the direction of the scan.
		 */
		if (ScanDirectionIsForward(node->ps.state->es_direction))
			node->as_whichplan += 1;
		else
			node->as_whichplan -= 1;

		/* Ran off either end of the subplan list: we are done. */
		if (!exec_append_initialize_next(node))
			return ExecClearTuple(node->ps.ps_ResultTupleSlot);

		/* Otherwise try to get a tuple from the newly selected subplan. */
		tuple = ExecProcNode(node->appendplans[node->as_whichplan]);
	}

	/*
	 * Hand the subplan's slot back untouched.  The result slot created by
	 * ExecInitAppend is NOT used for this: there is no reason to, and it may
	 * carry the wrong tuple descriptor in inherited-UPDATE cases.
	 */
	return tuple;
}
/*
 *	_bt_next() -- Get the next item in a scan.
 *
 *		On entry, so->currPos describes the current page, which is pinned
 *		but not locked, and so->currPos.itemIndex identifies which item was
 *		previously returned.
 *
 *		On successful exit, scan->xs_ctup.t_self is set to the TID of the
 *		next heap tuple, and if requested, scan->xs_itup points to a copy of
 *		the index tuple.  so->currPos is updated as needed.
 *
 *		On failure exit (no more tuples), we release pin and set
 *		so->currPos.buf to InvalidBuffer.
 */
bool
_bt_next(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTScanPosItem *item;
	bool		page_exhausted;

	/*
	 * Move itemIndex one step in the scan direction and note whether that
	 * took us past the last (or before the first) item saved for this page.
	 */
	if (ScanDirectionIsForward(dir))
	{
		so->currPos.itemIndex++;
		page_exhausted = (so->currPos.itemIndex > so->currPos.lastItem);
	}
	else
	{
		so->currPos.itemIndex--;
		page_exhausted = (so->currPos.itemIndex < so->currPos.firstItem);
	}

	/* Nothing left on this page: step to the next page with data. */
	if (page_exhausted)
	{
		/* _bt_steppage requires the current buffer to be locked */
		Assert(BufferIsValid(so->currPos.buf));
		LockBuffer(so->currPos.buf, BT_READ);

		if (!_bt_steppage(scan, dir))
			return false;

		/* Keep the pin on the new page, but give up the lock */
		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
	}

	/* itemIndex now designates the item to return */
	item = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_ctup.t_self = item->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + item->tupleOffset);

	return true;
}
/* ----------------------------------------------------------------
 *		FunctionNext
 *
 *		Per-tuple fetch routine used by ExecFunctionScan.
 *
 *		The first call runs the function and captures its whole result in
 *		a tuplestore; every call then reads one tuple from that tuplestore
 *		into the scan slot, in the current scan direction.  The scan slot
 *		is returned whether or not a tuple was fetched; the boolean result
 *		of tuplestore_gettupleslot is deliberately ignored.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
FunctionNext(FunctionScanState *node)
{
	ScanDirection scandir = node->ss.ps.state->es_direction;
	Tuplestorestate *store = node->tuplestorestate;
	TupleTableSlot *scanslot;

	/*
	 * No tuplestore yet: evaluate the function and materialize its output.
	 * The last argument passes along whether EXEC_FLAG_BACKWARD was set in
	 * the node's eflags.
	 */
	if (store == NULL)
	{
		store = ExecMakeTableFunctionResult(node->funcexpr,
											node->ss.ps.ps_ExprContext,
											node->tupdesc,
											node->eflags & EXEC_FLAG_BACKWARD);
		node->tuplestorestate = store;
	}

	/* Fetch (without copying) the next tuple in the requested direction. */
	scanslot = node->ss.ss_ScanTupleSlot;
	(void) tuplestore_gettupleslot(store,
								   ScanDirectionIsForward(scandir),
								   false,
								   scanslot);

	return scanslot;
}
/*
 *	_bt_first() -- Find the first item in a scan.
 *
 *		We need to be clever about the type of scan, the operation it's
 *		performing, and the tree ordering.  We find the
 *		first item in the tree that satisfies the qualification
 *		associated with the scan descriptor.  On exit, the page containing
 *		the current index tuple is read locked and pinned, and the scan's
 *		opaque data entry is updated to include the buffer.
 *
 *		scan: index scan descriptor; its opaque area, currentItemData and
 *			xs_ctup.t_self are updated here.
 *		dir: direction of the scan.
 *
 *		Returns true if a matching item was found (scan->xs_ctup.t_self
 *		holds its heap TID), false if there is none.  When no usable
 *		starting boundary key exists, or when stepping forward runs off
 *		the index during a backward-oriented positioning, the work is
 *		delegated to _bt_endpoint() and its result is returned.
 */
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Buffer		buf;
	Page		page;
	BTStack		stack;
	OffsetNumber offnum;
	BTItem		btitem;
	IndexTuple	itup;
	ItemPointer current;
	BlockNumber blkno;
	StrategyNumber strat;
	bool		res;
	int32		result;
	bool		scanFromEnd;
	bool		continuescan;
	ScanKey		scankeys = NULL;	/* insertion-type keys built for the descent */
	int			keysCount = 0;	/* number of boundary keys chosen so far */
	int		   *nKeyIs = NULL;	/* indexes into so->keyData of chosen keys */
	int			i,
				j;
	StrategyNumber strat_total;	/* overall strategy implied by chosen keys */

	/*
	 * Order the scan keys in our canonical fashion and eliminate any
	 * redundant keys.
	 */
	_bt_orderkeys(scan);

	/*
	 * Quit now if _bt_orderkeys() discovered that the scan keys can never
	 * be satisfied (eg, x == 1 AND x > 2).
	 */
	if (!so->qual_ok)
		return false;

	/*
	 * Examine the scan keys to discover where we need to start the scan.
	 */
	scanFromEnd = false;
	strat_total = BTEqualStrategyNumber;
	if (so->numberOfKeys > 0)
	{
		nKeyIs = (int *) palloc(so->numberOfKeys * sizeof(int));
		for (i = 0; i < so->numberOfKeys; i++)
		{
			AttrNumber	attno = so->keyData[i].sk_attno;

			/* ignore keys for already-determined attrs */
			if (attno <= keysCount)
				continue;
			/* if we didn't find a boundary for the preceding attr, quit */
			if (attno > keysCount + 1)
				break;
			strat = _bt_getstrat(rel, attno,
								 so->keyData[i].sk_procedure);

			/*
			 * Can we use this key as a starting boundary for this attr?
			 *
			 * We can use multiple keys if they look like, say, = >= = but we
			 * have to stop after accepting a > or < boundary.
			 */
			if (strat == strat_total ||
				strat == BTEqualStrategyNumber)
				nKeyIs[keysCount++] = i;
			else if (ScanDirectionIsBackward(dir) &&
					 (strat == BTLessStrategyNumber ||
					  strat == BTLessEqualStrategyNumber))
			{
				/* upper bound is the starting boundary for a backward scan */
				nKeyIs[keysCount++] = i;
				strat_total = strat;
				if (strat == BTLessStrategyNumber)
					break;
			}
			else if (ScanDirectionIsForward(dir) &&
					 (strat == BTGreaterStrategyNumber ||
					  strat == BTGreaterEqualStrategyNumber))
			{
				/* lower bound is the starting boundary for a forward scan */
				nKeyIs[keysCount++] = i;
				strat_total = strat;
				if (strat == BTGreaterStrategyNumber)
					break;
			}
		}
		/* no key was usable as a boundary: must start at an index endpoint */
		if (keysCount == 0)
			scanFromEnd = true;
	}
	else
		scanFromEnd = true;

	/* if we just need to walk down one edge of the tree, do that */
	if (scanFromEnd)
	{
		if (nKeyIs)
			pfree(nKeyIs);
		return _bt_endpoint(scan, dir);
	}

	/*
	 * We want to start the scan somewhere within the index.  Set up a
	 * scankey we can use to search for the correct starting point.
	 */
	scankeys = (ScanKey) palloc(keysCount * sizeof(ScanKeyData));
	for (i = 0; i < keysCount; i++)
	{
		FmgrInfo   *procinfo;

		j = nKeyIs[i];

		/*
		 * _bt_orderkeys disallows NULL keys, but this is the place to add
		 * code for them later.
		 */
		if (so->keyData[j].sk_flags & SK_ISNULL)
		{
			pfree(nKeyIs);
			pfree(scankeys);
			elog(ERROR, "btree doesn't support is(not)null, yet");
			/* NOTE(review): presumably not reached after elog(ERROR) */
			return false;
		}
		/* the i'th chosen key is applied to index column i + 1 */
		procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
		ScanKeyEntryInitializeWithInfo(scankeys + i,
									   so->keyData[j].sk_flags,
									   i + 1,
									   procinfo,
									   CurrentMemoryContext,
									   so->keyData[j].sk_argument);
	}
	if (nKeyIs)
		pfree(nKeyIs);

	current = &(scan->currentItemData);

	/*
	 * Use the manufactured scan key to descend the tree and position
	 * ourselves on the target leaf page.
	 */
	stack = _bt_search(rel, keysCount, scankeys, &buf, BT_READ);

	/* don't need to keep the stack around... */
	_bt_freestack(stack);

	if (!BufferIsValid(buf))
	{
		/* Only get here if index is completely empty */
		ItemPointerSetInvalid(current);
		so->btso_curbuf = InvalidBuffer;
		pfree(scankeys);
		return false;
	}

	/* remember which buffer we have pinned */
	so->btso_curbuf = buf;
	blkno = BufferGetBlockNumber(buf);
	page = BufferGetPage(buf);

	/* position to the precise item on the page */
	offnum = _bt_binsrch(rel, buf, keysCount, scankeys);

	ItemPointerSet(current, blkno, offnum);

	/*
	 * At this point we are positioned at the first item >= scan key, or
	 * possibly at the end of a page on which all the existing items are
	 * less than the scan key and we know that everything on later pages
	 * is greater than or equal to scan key.
	 *
	 * We could step forward in the latter case, but that'd be a waste of
	 * time if we want to scan backwards.  So, it's now time to examine
	 * the scan strategy to find the exact place to start the scan.
	 *
	 * Note: if _bt_step fails (meaning we fell off the end of the index in
	 * one direction or the other), we either return false (no matches) or
	 * call _bt_endpoint() to set up a scan starting at that index
	 * endpoint, as appropriate for the desired scan type.
	 *
	 * This is yet another place to add code later for is(not)null ...
	 */

	switch (strat_total)
	{
		case BTLessStrategyNumber:

			/*
			 * Back up one to arrive at last item < scankey
			 */
			if (!_bt_step(scan, &buf, BackwardScanDirection))
			{
				pfree(scankeys);
				return false;
			}
			break;

		case BTLessEqualStrategyNumber:

			/*
			 * We need to find the last item <= scankey, so step forward
			 * till we find one > scankey, then step back one.
			 */
			if (offnum > PageGetMaxOffsetNumber(page))
			{
				if (!_bt_step(scan, &buf, ForwardScanDirection))
				{
					pfree(scankeys);
					return _bt_endpoint(scan, dir);
				}
			}
			for (;;)
			{
				offnum = ItemPointerGetOffsetNumber(current);
				page = BufferGetPage(buf);
				result = _bt_compare(rel, keysCount, scankeys, page, offnum);
				/* negative result: stop stepping forward at this item */
				if (result < 0)
					break;
				if (!_bt_step(scan, &buf, ForwardScanDirection))
				{
					pfree(scankeys);
					return _bt_endpoint(scan, dir);
				}
			}
			if (!_bt_step(scan, &buf, BackwardScanDirection))
			{
				pfree(scankeys);
				return false;
			}
			break;

		case BTEqualStrategyNumber:

			/*
			 * Make sure we are on the first equal item; might have to
			 * step forward if currently at end of page.
			 */
			if (offnum > PageGetMaxOffsetNumber(page))
			{
				if (!_bt_step(scan, &buf, ForwardScanDirection))
				{
					pfree(scankeys);
					return false;
				}
				offnum = ItemPointerGetOffsetNumber(current);
				page = BufferGetPage(buf);
			}
			result = _bt_compare(rel, keysCount, scankeys, page, offnum);
			if (result != 0)
				goto nomatches; /* no equal items! */

			/*
			 * If a backward scan was specified, need to start with last
			 * equal item not first one.
			 */
			if (ScanDirectionIsBackward(dir))
			{
				/* walk forward past every equal item, then back up one */
				do
				{
					if (!_bt_step(scan, &buf, ForwardScanDirection))
					{
						pfree(scankeys);
						return _bt_endpoint(scan, dir);
					}
					offnum = ItemPointerGetOffsetNumber(current);
					page = BufferGetPage(buf);
					result = _bt_compare(rel, keysCount, scankeys, page, offnum);
				} while (result == 0);
				if (!_bt_step(scan, &buf, BackwardScanDirection))
					elog(ERROR, "equal items disappeared?");
			}
			break;

		case BTGreaterEqualStrategyNumber:

			/*
			 * We want the first item >= scankey, which is where we are...
			 * unless we're not anywhere at all...
			 */
			if (offnum > PageGetMaxOffsetNumber(page))
			{
				if (!_bt_step(scan, &buf, ForwardScanDirection))
				{
					pfree(scankeys);
					return false;
				}
			}
			break;

		case BTGreaterStrategyNumber:

			/*
			 * We want the first item > scankey, so make sure we are on an
			 * item and then step over any equal items.
			 */
			if (offnum > PageGetMaxOffsetNumber(page))
			{
				if (!_bt_step(scan, &buf, ForwardScanDirection))
				{
					pfree(scankeys);
					return false;
				}
				offnum = ItemPointerGetOffsetNumber(current);
				page = BufferGetPage(buf);
			}
			result = _bt_compare(rel, keysCount, scankeys, page, offnum);
			while (result == 0)
			{
				if (!_bt_step(scan, &buf, ForwardScanDirection))
				{
					pfree(scankeys);
					return false;
				}
				offnum = ItemPointerGetOffsetNumber(current);
				page = BufferGetPage(buf);
				result = _bt_compare(rel, keysCount, scankeys, page, offnum);
			}
			break;
	}

	/* okay, current item pointer for the scan is right */
	offnum = ItemPointerGetOffsetNumber(current);
	page = BufferGetPage(buf);
	btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
	itup = &btitem->bti_itup;

	/* is the first item actually acceptable? */
	if (_bt_checkkeys(scan, itup, dir, &continuescan))
	{
		/* yes, return it */
		scan->xs_ctup.t_self = itup->t_tid;
		res = true;
	}
	else if (continuescan)
	{
		/* no, but there might be another one that is */
		res = _bt_next(scan, dir);
	}
	else
	{
		/* no tuples in the index match this scan key */
		/* (also the target of the goto in the equality case above) */
nomatches:
		ItemPointerSetInvalid(current);
		so->btso_curbuf = InvalidBuffer;
		_bt_relbuf(rel, buf);
		res = false;
	}

	pfree(scankeys);

	return res;
}
/*
 *	_bt_endpoint() -- Find the first or last key in the index.
 *
 * Called by _bt_first() once it has determined that the scan has to begin
 * at the very start of the index (forward scan) or at its very end
 * (backward scan).
 *
 * Returns true when a matching item was found; scan->xs_ctup.t_self then
 * holds its heap TID.  Returns false when the index is empty, when stepping
 * off an empty endpoint page fails, or when no item can match.
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
	Relation	index = scan->indexRelation;
	ItemPointer curpos = &(scan->currentItemData);
	BTScanOpaque scanstate = (BTScanOpaque) scan->opaque;
	Buffer		buf;
	Page		page;
	BTPageOpaque pageop;
	BlockNumber blkno;
	OffsetNumber lastoff;
	OffsetNumber startoff;
	BTItem		item;
	IndexTuple	tuple;
	bool		keepgoing;

	/*
	 * Descend to the leftmost or rightmost leaf page.  This is a simplified
	 * form of _bt_search(); no stack is kept because none will be needed.
	 */
	buf = _bt_get_endpoint(index, 0, ScanDirectionIsBackward(dir));

	if (!BufferIsValid(buf))
	{
		/* the index is empty */
		ItemPointerSetInvalid(curpos);
		scanstate->btso_curbuf = InvalidBuffer;
		return false;
	}

	blkno = BufferGetBlockNumber(buf);
	page = BufferGetPage(buf);
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(P_ISLEAF(pageop));

	lastoff = PageGetMaxOffsetNumber(page);

	if (ScanDirectionIsForward(dir))
	{
		/*
		 * Do not assert P_LEFTMOST here: dead pages may still exist to the
		 * left of this one.
		 */
		startoff = P_FIRSTDATAKEY(pageop);
	}
	else if (ScanDirectionIsBackward(dir))
	{
		Assert(P_RIGHTMOST(pageop));

		/* start at the last item, but never before the first data key */
		startoff = lastoff;
		if (startoff < P_FIRSTDATAKEY(pageop))
			startoff = P_FIRSTDATAKEY(pageop);
	}
	else
	{
		elog(ERROR, "invalid scan direction: %d", (int) dir);
		startoff = 0;			/* keep compiler quiet */
	}

	ItemPointerSet(curpos, blkno, startoff);

	/* record the buffer we are holding a pin on */
	scanstate->btso_curbuf = buf;

	/*
	 * Deletions may have left the endpoint page empty; in that case step in
	 * the scan direction until a nonempty page is found.
	 */
	if (startoff > lastoff)
	{
		if (!_bt_step(scan, &buf, dir))
			return false;
		startoff = ItemPointerGetOffsetNumber(curpos);
		page = BufferGetPage(buf);
	}

	item = (BTItem) PageGetItem(page, PageGetItemId(page, startoff));
	tuple = &(item->bti_itup);

	/* Does the item we landed on satisfy the scan keys? */
	if (_bt_checkkeys(scan, tuple, dir, &keepgoing))
	{
		scan->xs_ctup.t_self = tuple->t_tid;
		return true;
	}

	/* Not this one, but a later item might still qualify. */
	if (keepgoing)
		return _bt_next(scan, dir);

	/* Nothing in the index can match: reset position and drop the buffer. */
	ItemPointerSetInvalid(curpos);
	scanstate->btso_curbuf = InvalidBuffer;
	_bt_relbuf(index, buf);
	return false;
}
/* ----------------------------------------------------------------
 *		ExecMaterial
 *
 *		While positioned at the end of the data gathered so far, each call
 *		pulls one fresh row from the subplan, saves it in the tuplestore
 *		(if there is one) and returns it.  The tuplestore itself is read
 *		only when scanning backwards, rescanning, or after mark/restore.
 *
 *		Returns the result slot holding the tuple, the cleared result slot
 *		when no tuple was obtained, or NULL when the subplan is exhausted
 *		or a backward step finds the tuplestore empty.
 * ----------------------------------------------------------------
 */
TupleTableSlot *				/* result tuple from subplan */
ExecMaterial(MaterialState *node)
{
	bool		forward = ScanDirectionIsForward(node->ss.ps.state->es_direction);
	Tuplestorestate *store = (Tuplestorestate *) node->tuplestorestate;
	HeapTuple	tup = NULL;
	bool		free_tup = false;
	bool		at_store_eof;
	TupleTableSlot *resultslot;

	/* Create the tuplestore lazily, and only when random access is needed. */
	if (store == NULL && node->randomAccess)
	{
		store = tuplestore_begin_heap(true, false, work_mem);
		node->tuplestorestate = (void *) store;
	}

	/* Having no tuplestore at all counts as being at its end. */
	at_store_eof = (store == NULL) || tuplestore_ateof(store);

	if (at_store_eof && !forward)
	{
		if (!node->eof_underlying)
		{
			/*
			 * Direction was reversed while sitting at tuplestore EOF.  The
			 * first backward fetch yields the tuple added most recently,
			 * whereas the caller wants the one before it; so fetch once
			 * and discard.
			 */
			tup = tuplestore_getheaptuple(store, forward, &free_tup);
			if (tup == NULL)
				return NULL;	/* the tuplestore must be empty */
			if (free_tup)
				heap_freetuple(tup);
		}
		at_store_eof = false;
	}

	/* Not at EOF (or going backwards): read from the tuplestore. */
	if (!at_store_eof)
	{
		tup = tuplestore_getheaptuple(store, forward, &free_tup);
		at_store_eof = (tup == NULL && forward);
	}

	/*
	 * Fetch another row from the subplan if that is still possible.
	 *
	 * eof_underlying short-circuits further subplan calls.  It is required,
	 * not merely an optimization, because some plan node types misbehave if
	 * called again after having returned NULL.
	 */
	if (at_store_eof && !node->eof_underlying)
	{
		/*
		 * This point is reached only with forward == true, so the subplan's
		 * direction needs no special care.
		 */
		TupleTableSlot *subslot = ExecProcNode(outerPlanState(node));

		if (TupIsNull(subslot))
		{
			node->eof_underlying = true;
			return NULL;
		}

		tup = ExecFetchSlotTuple(subslot);
		free_tup = false;

		/*
		 * Save the tuple in the tuplestore as well.  The tuplestore is
		 * certainly at EOF, so its read position advances over the new
		 * tuple, which is the desired effect.
		 */
		if (store)
			tuplestore_puttuple(store, (void *) tup);
	}

	/* Deliver whatever we ended up with. */
	resultslot = (TupleTableSlot *) node->ss.ps.ps_ResultTupleSlot;
	if (tup == NULL)
		return ExecClearTuple(resultslot);

	return ExecStoreTuple(tup, resultslot, InvalidBuffer, free_tup);
}
/* ----------------------------------------------------------------
 *		CteScanNext
 *
 *		Per-tuple fetch routine used by ExecCteScan.
 *
 *		Tuples already collected in the leader's shared tuplestore are
 *		returned from there through this node's own read pointer.  Once
 *		that pointer reaches the end during a forward scan, one more row
 *		is pulled from the CTE query, appended to the tuplestore, and
 *		returned as a copy.
 *
 *		Returns the scan slot holding the tuple, the cleared scan slot
 *		when nothing is left, or NULL when the CTE query is exhausted or
 *		a backward step finds the tuplestore empty.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
CteScanNext(CteScanState *node)
{
	bool		forward = ScanDirectionIsForward(node->ss.ps.state->es_direction);
	Tuplestorestate *store = node->leader->cte_table;
	TupleTableSlot *outslot;
	TupleTableSlot *produced;
	bool		at_store_eof;

	/* Work through this node's own read pointer. */
	tuplestore_select_read_pointer(store, node->readptr);
	outslot = node->ss.ss_ScanTupleSlot;

	at_store_eof = tuplestore_ateof(store);

	if (at_store_eof && !forward)
	{
		/*
		 * Direction was reversed while sitting at tuplestore EOF.  The first
		 * backward fetch would yield the tuple added most recently, whereas
		 * the caller wants the one before it; so step back once first.  If
		 * that step fails, the tuplestore must be empty.
		 */
		if (!node->leader->eof_cte &&
			!tuplestore_advance(store, forward))
			return NULL;

		at_store_eof = false;
	}

	/*
	 * Return a tuple from the tuplestore if one is available.
	 *
	 * copy=true is required: the tuplestore is shared with other nodes that
	 * may write into it before this node is called again.
	 */
	if (!at_store_eof)
	{
		if (tuplestore_gettupleslot(store, forward, true, outslot))
			return outslot;

		at_store_eof = forward;
	}

	/* Nothing more can be produced: hand back an empty slot. */
	if (!at_store_eof || node->leader->eof_cte)
		return ExecClearTuple(outslot);

	/*
	 * Pull another row from the CTE query.
	 *
	 * eof_cte short-circuits further calls of the CTE plan.  It is required,
	 * not merely an optimization, because some plan node types misbehave if
	 * called again after having returned NULL.
	 *
	 * This point is reached only with forward == true, so the subplan's
	 * direction needs no special care.
	 */
	produced = ExecProcNode(node->cteplanstate);
	if (TupIsNull(produced))
	{
		node->leader->eof_cte = true;
		return NULL;
	}

	/*
	 * In corner cases the subplan can switch the active tuplestore read
	 * pointer, so select ours again before storing the new tuple.
	 */
	tuplestore_select_read_pointer(store, node->readptr);

	/*
	 * Append a copy of the tuple to the tuplestore.  Our read pointer is
	 * certainly at EOF, so it advances over the new tuple as desired; other
	 * readers do *not* move past it, which is what they want.
	 */
	tuplestore_puttupleslot(store, produced);

	/*
	 * The CTE query's output MUST be copied into our own slot: other CteScan
	 * nodes may advance the CTE query before we are called again, and our
	 * output tuple has to remain stable across that.
	 */
	return ExecCopySlot(outslot, produced);
}
/*
 *	_bt_steppage() -- Step to next page containing valid data for scan
 *
 * On entry, so->currPos.buf must be pinned and read-locked.  We'll drop
 * the lock and pin before moving to next page.
 *
 * On success exit, we hold pin and read-lock on the next interesting page,
 * and so->currPos is updated to contain data from that page.
 *
 * If there are no more matching records in the given direction, we drop all
 * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
 *
 * scan: index scan descriptor whose opaque state (currPos, markPos, kill
 *		list) is read and updated here.
 * dir: direction in which to step; forward walks right through saved
 *		next-page links, anything else walks left via _bt_walk_left().
 */
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Relation	rel;
	Page		page;
	BTPageOpaque opaque;

	/* we must have the buffer pinned and locked */
	Assert(BufferIsValid(so->currPos.buf));

	/* Before leaving current page, deal with any killed items */
	if (so->numKilled > 0)
		_bt_killitems(scan, true);

	/*
	 * Before we modify currPos, make a copy of the page data if there was a
	 * mark position that needs it.
	 */
	if (so->markItemIndex >= 0)
	{
		/* bump pin on current buffer for assignment to mark buffer */
		IncrBufferRefCount(so->currPos.buf);
		/* copy the fixed part of currPos plus items[0 .. lastItem] */
		memcpy(&so->markPos, &so->currPos,
			   offsetof(BTScanPosData, items[1]) +
			   so->currPos.lastItem * sizeof(BTScanPosItem));
		if (so->markTuples)
			memcpy(so->markTuples, so->currTuples,
				   so->currPos.nextTupleOffset);
		so->markPos.itemIndex = so->markItemIndex;
		/* the pending mark has now been materialized into markPos */
		so->markItemIndex = -1;
	}

	rel = scan->indexRelation;

	if (ScanDirectionIsForward(dir))
	{
		/* Walk right to the next page with data */
		/* We must rely on the previously saved nextPage link! */
		BlockNumber blkno = so->currPos.nextPage;

		/* Remember we left a page with data */
		so->currPos.moreLeft = true;

		for (;;)
		{
			/* release the previous buffer */
			_bt_relbuf(rel, so->currPos.buf);
			so->currPos.buf = InvalidBuffer;
			/* if we're at end of scan, give up */
			if (blkno == P_NONE || !so->currPos.moreRight)
				return false;
			/* check for interrupts while we're not holding any buffer lock */
			CHECK_FOR_INTERRUPTS();
			/* step right one page */
			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			/* check for deleted page */
			page = BufferGetPage(so->currPos.buf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, blkno, scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreRight if we can stop */
				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
					break;
			}
			/* nope, keep going */
			blkno = opaque->btpo_next;
		}
	}
	else
	{
		/* Remember we left a page with data */
		so->currPos.moreRight = true;

		/*
		 * Walk left to the next page with data.  This is much more complex
		 * than the walk-right case because of the possibility that the page
		 * to our left splits while we are in flight to it, plus the
		 * possibility that the page we were on gets deleted after we leave
		 * it.  See nbtree/README for details.
		 */
		for (;;)
		{
			/* Done if we know there are no matching keys to the left */
			if (!so->currPos.moreLeft)
			{
				_bt_relbuf(rel, so->currPos.buf);
				so->currPos.buf = InvalidBuffer;
				return false;
			}

			/* Step to next physical page */
			so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);

			/* if we're physically at end of index, return failure */
			if (so->currPos.buf == InvalidBuffer)
				return false;

			/*
			 * Okay, we managed to move left to a non-deleted page. Done if
			 * it's not half-dead and contains matching tuples. Else loop back
			 * and do it all again.
			 */
			page = BufferGetPage(so->currPos.buf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreLeft if we can stop */
				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
					break;
			}
		}
	}

	/* both loops exit via break only after _bt_readpage found matches */
	return true;
}
/* ----------------------------------------------------------------
 *      IndexOnlyNext
 *
 *      Produce the next tuple of an IndexOnlyScan node, using the index
 *      entry itself as the data source.
 *
 *      Returns the node's scan tuple slot filled from the index tuple, or
 *      that same slot cleared once the index has no more entries.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
IndexOnlyNext(IndexOnlyScanState *node)
{
    EState     *estate = node->ss.ps.state;
    ScanDirection direction = estate->es_direction;
    IndexOnlyScan *plan = (IndexOnlyScan *) node->ss.ps.plan;
    IndexScanDesc scandesc;
    ExprContext *econtext;
    TupleTableSlot *slot;
    ItemPointer tid;

    /*
     * A plan that orders the index backward inverts whatever direction the
     * executor asked for; a "no movement" direction is left alone.
     */
    if (ScanDirectionIsBackward(plan->indexorderdir))
    {
        if (ScanDirectionIsForward(direction))
            direction = BackwardScanDirection;
        else if (ScanDirectionIsBackward(direction))
            direction = ForwardScanDirection;
    }

    scandesc = node->ioss_ScanDesc;
    econtext = node->ss.ps.ps_ExprContext;
    slot = node->ss.ss_ScanTupleSlot;

    /*
     * Pull TIDs from the index until one yields a tuple we can return.
     */
    for (;;)
    {
        HeapTuple   heaptup = NULL;
        bool        all_visible;

        tid = index_getnext_tid(scandesc, direction);
        if (tid == NULL)
            break;              /* index exhausted */

        /*
         * If the visibility map says every tuple on the TID's heap page is
         * visible to everybody, the heap visit can be skipped.  Either way
         * the index tuple, not the heap tuple, supplies the data.
         *
         * Memory ordering: visibilitymap_test takes no lock on the VM
         * buffer, so its answer may be slightly stale, but never stale
         * enough to matter:
         *
         * - Inserts.  A VM bit cleared by an insert must be noticed at once,
         *   because the new TID is in the index but not visible.  Our read
         *   of the TID (shared lock on the index buffer) is serialized with
         *   the insertion of the TID (exclusive lock on that buffer).  The
         *   VM bit is cleared before the index is updated, and locking and
         *   unlocking the index page is a full memory barrier, so seeing
         *   the new TID implies seeing the cleared bit.
         *
         * - Deletes.  These do not touch the index page (only VACUUM removes
         *   the TID), so the clearing is not serialized with this test and
         *   the value read may be significantly stale.  That is harmless:
         *   the tuple stays visible until the deleter commits, or until the
         *   statement ends if it is our own transaction.  By then the VM
         *   buffer lock has been released after clearing the bit (a write
         *   barrier), and for our snapshot to include the deleter we must
         *   have taken ProcArrayLock afterwards (a read barrier).
         *
         * The complexity is worthwhile because locking the VM buffer could
         * cause significant contention.
         */
        all_visible = visibilitymap_test(scandesc->heapRelation,
                                         ItemPointerGetBlockNumber(tid),
                                         &node->ioss_VMBuffer);
        if (!all_visible)
        {
            /* No luck: the heap must be consulted for visibility. */
            node->ioss_HeapFetches++;
            heaptup = index_fetch_heap(scandesc);
            if (heaptup == NULL)
                continue;       /* nothing visible here, try next entry */

            /*
             * Only MVCC snapshots are supported, so once a visible member
             * of the HOT chain is found there is no reason to keep
             * following the chain.  Supporting that would need extra state
             * to suppress the next index_getnext_tid call.
             */
            if (scandesc->xs_continue_hot)
                elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");

            /*
             * The heap page is still pinned via scandesc->xs_cbuf here.
             * The pin could be dropped now, but the next index entry may
             * well point at the same page, so it is kept.
             */
        }

        /* Load the slot from the index tuple. */
        StoreIndexTuple(slot, scandesc->xs_itup, scandesc->xs_itupdesc);

        /*
         * A lossy index match requires re-evaluating the index quals.
         * (This cannot happen at present, but the case is supported for
         * possible future use, e.g. with GiST indexes.)
         */
        if (scandesc->xs_recheck)
        {
            econtext->ecxt_scantuple = slot;
            ResetExprContext(econtext);
            if (!ExecQual(node->indexqual, econtext, false))
            {
                /* Recheck failed: count it and move to the next entry. */
                InstrCountFiltered2(node, 1);
                continue;
            }
        }

        /*
         * Rechecking ORDER BY distances is not supported.  In principle an
         * index able to return the originally indexed value should also be
         * able to compute an exact distance, so recheck/re-sort code here
         * is probably not worth it; but at least fail loudly if asked.
         */
        if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("lossy distance functions are not supported in index-only scans")));

        /*
         * When the heap was not visited, the predicate lock has to be taken
         * at page level, because a tuple-level predicate lock needs the
         * tuple's xmin.  When the heap was visited, the tuple-level lock is
         * already held and the page lock is unnecessary.
         */
        if (heaptup == NULL)
            PredicateLockPage(scandesc->heapRelation,
                              ItemPointerGetBlockNumber(tid),
                              estate->es_snapshot);

        return slot;
    }

    /*
     * The index returned no further entries, so the scan is finished.
     */
    return ExecClearTuple(slot);
}
/* ----------------------------------------------------------------
 *      IndexNext
 *
 *      Fetch the next tuple for an IndexScan node from its current
 *      relation, driven by the index recorded in the IndexScanState.
 *
 *      Returns the node's scan tuple slot holding the tuple, or that slot
 *      cleared when nothing (more) qualifies.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
IndexNext(IndexScanState *node)
{
    EState     *estate = node->ss.ps.state;
    ScanDirection direction = estate->es_direction;
    IndexScan  *plan;
    IndexScanDesc scandesc;
    ExprContext *econtext;
    TupleTableSlot *slot;
    HeapTuple   tuple;
    Index       scanrelid;
    Index       evalIdx;

    initScanDesc(node);

    plan = (IndexScan *) node->ss.ps.plan;

    /* an overall backward scan inverts the requested direction */
    if (ScanDirectionIsBackward(plan->indexorderdir))
    {
        if (ScanDirectionIsForward(direction))
            direction = BackwardScanDirection;
        else if (ScanDirectionIsBackward(direction))
            direction = ForwardScanDirection;
    }

    scandesc = node->iss_ScanDesc;
    econtext = node->ss.ps.ps_ExprContext;
    slot = node->ss.ss_ScanTupleSlot;
    scanrelid = plan->scan.scanrelid;
    evalIdx = scanrelid - 1;    /* es_evTuple[] is indexed from zero */

    /*
     * PlanQual evaluation for a tuple of this relation: hand back the
     * pre-fetched tuple instead of scanning.  The extra check is unlovely,
     * but there is no alternative for now; dedicated plan nodes with
     * IndexScan --> NewNode switching in Init/ReScan would be one.
     */
    if (estate->es_evTuple != NULL && estate->es_evTuple[evalIdx] != NULL)
    {
        /* the single tuple was already delivered on an earlier call */
        if (estate->es_evTupleNull[evalIdx])
        {
            if (!node->ss.ps.delayEagerFree)
            {
                ExecEagerFreeIndexScan(node);
            }
            return ExecClearTuple(slot);
        }

        ExecStoreGenericTuple(estate->es_evTuple[evalIdx], slot, false);

        /* check the tuple against the original index quals */
        econtext->ecxt_scantuple = slot;
        ResetExprContext(econtext);
        if (!ExecQual(node->indexqualorig, econtext, false))
        {
            if (!node->ss.ps.delayEagerFree)
            {
                ExecEagerFreeIndexScan(node);
            }
            ExecClearTuple(slot);   /* the scan would not have returned it */
        }

        /* tell the next call that there is nothing further */
        estate->es_evTupleNull[evalIdx] = true;

        Gpmon_M_Incr_Rows_Out(GpmonPktFromIndexScanState(node));
        CheckSendPlanStateGpmonPkt(&node->ss.ps);
        return slot;
    }

    /*
     * Regular path: ask the index for the next tuple.
     */
    tuple = index_getnext(scandesc, direction);
    if (tuple == NULL)
    {
        /* the index scan is exhausted, so this is the end of the scan */
        if (!node->ss.ps.delayEagerFree)
        {
            ExecEagerFreeIndexScan(node);
        }
        return ExecClearTuple(slot);
    }

    /*
     * Put the tuple into the scan slot.  'false' is passed because tuples
     * returned by amgetnext point into disk pages and must not be pfree()'d.
     */
    ExecStoreHeapTuple(tuple,               /* tuple to store */
                       slot,                /* slot to store in */
                       scandesc->xs_cbuf,   /* buffer containing tuple */
                       false);              /* don't pfree */

    Gpmon_M_Incr_Rows_Out(GpmonPktFromIndexScanState(node));
    CheckSendPlanStateGpmonPkt(&node->ss.ps);
    return slot;
}
/* ----------------------------------------------------------------
 *      FunctionNext
 *
 *      Worker routine behind ExecFunctionScan.  On the first call the
 *      function's whole result is materialised into a tuplestore; every
 *      call then hands out one tuple from that store.
 *
 *      Returns the node's scan tuple slot (empty when no tuple was
 *      fetched).
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
FunctionNext(FunctionScanState *node)
{
    ScanDirection scandir = node->ss.ps.state->es_direction;
    Tuplestorestate *store = node->tuplestorestate;
    TupleTableSlot *resultSlot;

    /*
     * First call: run the function to completion and keep its output.
     */
    if (store == NULL)
    {
        store = ExecMakeTableFunctionResult(
                node->funcexpr,
                node->ss.ps.ps_ExprContext,
                node->tupdesc,
                PlanStateOperatorMemKB( (PlanState *) node));
        node->tuplestorestate = store;

        /* CDB: supply extra detail for EXPLAIN ANALYZE. */
        if (node->ss.ps.instrument)
        {
            /* The tuplestore reports into our Instrumentation object. */
            tuplestore_set_instrument(store, node->ss.ps.instrument);

            /* Ask to be called back when the query ends. */
            node->ss.ps.cdbexplainfun = ExecFunctionScanExplainEnd;
        }
    }

    /*
     * Fetch the next tuple from the store into the scan slot.
     */
    resultSlot = node->ss.ss_ScanTupleSlot;
    if (tuplestore_gettupleslot(store,
                                ScanDirectionIsForward(scandir),
                                resultSlot) &&
        node->cdb_want_ctid)
    {
        /* CDB: stamp the row with a synthetic ctid for subquery dedup. */
        HeapTuple   heaptup = ExecFetchSlotHeapTuple(resultSlot);

        /*
         * Advance the 48-bit row counter: ip_posid is the low part, and
         * when it wraps to zero the block number takes the carry.
         */
        node->cdb_fake_ctid.ip_posid++;
        if (node->cdb_fake_ctid.ip_posid == 0)
            ItemPointerSetBlockNumber(&node->cdb_fake_ctid,
                                      1 + ItemPointerGetBlockNumber(&node->cdb_fake_ctid));

        heaptup->t_self = node->cdb_fake_ctid;
    }

    if (TupIsNull(resultSlot))
    {
        /* nothing returned: release resources early unless told to wait */
        if (!node->ss.ps.delayEagerFree)
        {
            ExecEagerFreeFunctionScan((FunctionScanState *)(&node->ss.ps));
        }
    }
    else
    {
        Gpmon_M_Incr_Rows_Out(GpmonPktFromFuncScanState(node));
        CheckSendPlanStateGpmonPkt(&node->ss.ps);
    }

    return resultSlot;
}
/* ----------------------------------------------------------------
 *      ExecMaterial
 *
 *      While positioned at the end of what the tuplestore holds, each call
 *      pulls one more row from the subplan, saves it in the tuplestore and
 *      returns it.  The tuplestore itself is read only when scanning
 *      backwards, rescanning, or using mark/restore.
 *
 *      Returns a slot with the next tuple; NULL when the subplan (or, going
 *      backward, the tuplestore) has nothing to give; or the cleared result
 *      slot when nothing is left.
 * ----------------------------------------------------------------
 */
TupleTableSlot *                /* result tuple from subplan */
ExecMaterial(MaterialState *node)
{
    ScanDirection dir = node->ss.ps.state->es_direction;
    bool        forward = ScanDirectionIsForward(dir);
    Tuplestorestate *store = node->tuplestorestate;
    bool        at_store_eof;
    TupleTableSlot *resultSlot;
    TupleTableSlot *outerslot;

    /*
     * Create the tuplestore lazily, and only when some executor flag says
     * it will be needed.
     */
    if (store == NULL && node->eflags != 0)
    {
        store = tuplestore_begin_heap(true, false, work_mem);
        tuplestore_set_eflags(store, node->eflags);
        if (node->eflags & EXEC_FLAG_MARK)
        {
            /*
             * The mark gets its own read pointer.  It is necessarily
             * pointer number 1, so the number is not remembered.
             */
            int         ptrno PG_USED_FOR_ASSERTS_ONLY;

            ptrno = tuplestore_alloc_read_pointer(store, node->eflags);
            Assert(ptrno == 1);
        }
        node->tuplestorestate = store;
    }

    /*
     * Work out whether the tuplestore can supply the tuple: it can when we
     * are not at its end, or when we are moving backward.
     */
    at_store_eof = (store == NULL) || tuplestore_ateof(store);

    if (!forward && at_store_eof)
    {
        /*
         * Turning around at tuplestore EOF: the first gettupleslot would
         * return the most recently added tuple, whereas the caller wants
         * the one before it if there is one.  Skip one tuple first; failure
         * means the tuplestore is empty.
         */
        if (!node->eof_underlying && !tuplestore_advance(store, forward))
            return NULL;

        at_store_eof = false;
    }

    /*
     * Serve the request from the tuplestore when possible.
     */
    resultSlot = node->ss.ps.ps_ResultTupleSlot;
    if (!at_store_eof)
    {
        if (tuplestore_gettupleslot(store, forward, false, resultSlot))
            return resultSlot;
        if (forward)
            at_store_eof = true;
    }

    /*
     * Nothing left unless the subplan can still be asked.
     *
     * eof_underlying short-circuits further subplan calls.  It is required
     * rather than optional, because some plan node types do not tolerate
     * being called again after they have returned NULL.
     */
    if (!at_store_eof || node->eof_underlying)
        return ExecClearTuple(resultSlot);

    /*
     * Reaching this point implies forward == true, so the direction the
     * subplan moves in is not a concern.
     */
    outerslot = ExecProcNode(outerPlanState(node));
    if (TupIsNull(outerslot))
    {
        node->eof_underlying = true;
        return NULL;
    }

    /*
     * Save a copy in the tuplestore.  The store is known to be at EOF, so
     * its read position moves past the new tuple, which is what is wanted.
     */
    if (store)
        tuplestore_puttupleslot(store, outerslot);

    /* The subplan's own slot can be returned as is, with no copy. */
    return outerslot;
}
/* ----------------------------------------------------------------
 *      ExecProcAppend
 *
 *      Iterates over the node's subplans, returning tuples from the
 *      current one and moving on to the next when it runs dry.
 *
 *      Returns the node's result slot holding the subplan's tuple, or that
 *      slot cleared once every subplan has been exhausted.
 *
 *      NOTE: Can't call this ExecAppend, that name is used in execMain.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecProcAppend(AppendState *node)
{
    for (;;)
    {
        /* pick up the node's current state on every pass */
        EState     *estate = node->ps.state;
        ScanDirection direction = estate->es_direction;
        TupleTableSlot *result_slot = node->ps.ps_ResultTupleSlot;
        PlanState  *subnode = node->appendplans[node->as_whichplan];
        TupleTableSlot *result;

        /* ask the subplan currently being processed for a tuple */
        result = ExecProcNode(subnode);

        if (!TupIsNull(result))
        {
            /*
             * The subplan produced a tuple: expose it through our result
             * slot.  The tuple is not copied; the subplan is relied upon
             * to keep owning it for as long as we need it.
             */
            return ExecStoreTuple(result->val, result_slot, InvalidBuffer, false);
        }

        /*
         * This subplan is done; step to its neighbour in the scan
         * direction.
         */
        if (ScanDirectionIsForward(direction))
            node->as_whichplan++;
        else
            node->as_whichplan--;

        /*
         * With no subplan left to set up, report end of data with an empty
         * slot.
         */
        if (!exec_append_initialize_next(node))
            return ExecClearTuple(result_slot);

        /* otherwise flag the descriptor as new and try the next subplan */
        ExecSetSlotDescriptorIsNew(result_slot, true);
    }
}
/*
 *  _bt_step() -- Step one item in the requested direction in a scan on
 *                the tree.
 *
 * Parameters:
 *  scan - index scan descriptor; scan->currentItemData is the current
 *         position and is overwritten with the new one.
 *  bufP - in/out: the current buffer (read-locked and pinned).  If we
 *         change pages, it's updated appropriately.
 *  dir  - a forward direction steps right; any other value is treated as
 *         a backward scan.
 *
 * If successful, update scan's currentItemData and return true.
 * If no adjacent record exists in the requested direction,
 * release buffer pin/locks and return false.  In that case the current
 * item pointer is set invalid and so->btso_curbuf is InvalidBuffer.
 */
bool
_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    ItemPointer current = &(scan->currentItemData);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber offnum,
                maxoff;
    BlockNumber blkno;

    /*
     * Don't use ItemPointerGetOffsetNumber here: ip_posid may legitimately
     * be 0, which would trip that accessor's assertion.
     */
    offnum = current->ip_posid;

    page = BufferGetPage(*bufP);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);

    if (ScanDirectionIsForward(dir))
    {
        /* easy case: another item follows on the same page */
        if (!PageIsEmpty(page) && offnum < maxoff)
            offnum = OffsetNumberNext(offnum);
        else
        {
            /* Walk right to the next page with data */
            for (;;)
            {
                /* if we're at end of scan, release the buffer and return */
                if (P_RIGHTMOST(opaque))
                {
                    _bt_relbuf(rel, *bufP);
                    ItemPointerSetInvalid(current);
                    *bufP = so->btso_curbuf = InvalidBuffer;
                    return false;
                }
                /*
                 * step right one page; the right-link is read before the
                 * current buffer is released
                 */
                blkno = opaque->btpo_next;
                _bt_relbuf(rel, *bufP);
                *bufP = _bt_getbuf(rel, blkno, BT_READ);
                page = BufferGetPage(*bufP);
                opaque = (BTPageOpaque) PageGetSpecialPointer(page);
                /* pages flagged by P_IGNORE are skipped */
                if (!P_IGNORE(opaque))
                {
                    maxoff = PageGetMaxOffsetNumber(page);
                    /* done if it's not empty */
                    offnum = P_FIRSTDATAKEY(opaque);
                    if (!PageIsEmpty(page) && offnum <= maxoff)
                        break;
                }
            }
        }
    }
    else                        /* backwards scan */
    {
        /* easy case: an earlier data item exists on the same page */
        if (offnum > P_FIRSTDATAKEY(opaque))
            offnum = OffsetNumberPrev(offnum);
        else
        {
            /*
             * Walk left to the next page with data.  This is much more
             * complex than the walk-right case because of the possibility
             * that the page to our left splits while we are in flight to
             * it, plus the possibility that the page we were on gets
             * deleted after we leave it.  See nbtree/README for details.
             */
            for (;;)
            {
                *bufP = _bt_walk_left(rel, *bufP);

                /*
                 * if we're at end of scan, return failure
                 * (NOTE(review): no buffer is released here, so presumably
                 * _bt_walk_left has already dropped the old one -- confirm)
                 */
                if (*bufP == InvalidBuffer)
                {
                    ItemPointerSetInvalid(current);
                    so->btso_curbuf = InvalidBuffer;
                    return false;
                }
                page = BufferGetPage(*bufP);
                opaque = (BTPageOpaque) PageGetSpecialPointer(page);

                /*
                 * Okay, we managed to move left to a non-deleted page.
                 * Done if it's not half-dead and not empty.  Else loop
                 * back and do it all again.
                 */
                if (!P_IGNORE(opaque))
                {
                    maxoff = PageGetMaxOffsetNumber(page);
                    /* a backward scan resumes at the page's last item */
                    offnum = maxoff;
                    if (!PageIsEmpty(page) &&
                        maxoff >= P_FIRSTDATAKEY(opaque))
                        break;
                }
            }
        }
    }

    /* Update scan state */
    so->btso_curbuf = *bufP;
    blkno = BufferGetBlockNumber(*bufP);
    ItemPointerSet(current, blkno, offnum);

    return true;
}
/*
 * hidden_getnext
 *
 * Hand out the next tuple of the hidden data in the given direction.  It is
 * meant to be called once the heap/index scan has run past its end.
 *
 * Forward and backward scans walk hdn_tuples[] and return the first entry
 * that passes the scan keys (every entry passes when hdn_nkeys is zero);
 * NULL is returned, and the remembered tuple forgotten, once the array is
 * exhausted.  Any other direction ("no movement") returns the tuple handed
 * out by the previous call.
 *
 * NOTE(review): hdn_idx is left one step beyond the returned entry in the
 * direction of travel, so a direction reversal does not resume at the
 * neighbour of the last returned tuple -- confirm callers never rely on that.
 */
HeapTuple
hidden_getnext(HiddenScanDesc hscan, ScanDirection direction)
{
    int         step;

    if (ScanDirectionIsForward(direction))
    {
        /* clamp a cursor that fell off the front */
        if (hscan->hdn_idx < 0)
            hscan->hdn_idx = 0;
        step = 1;
    }
    else if (ScanDirectionIsBackward(direction))
    {
        /* clamp a cursor that ran off the back */
        if (hscan->hdn_idx >= hscan->hdn_len)
            hscan->hdn_idx = hscan->hdn_len - 1;
        step = -1;
    }
    else
    {
        /*
         * ``no movement'' scan direction: refetch prior tuple
         */
        return hscan->hdn_lasttuple;
    }

    while (hscan->hdn_idx >= 0 && hscan->hdn_idx < hscan->hdn_len)
    {
        HeapTuple   candidate;
        bool        matches = true;

        /* take the entry under the cursor, then move the cursor on */
        candidate = hscan->hdn_tuples[hscan->hdn_idx];
        hscan->hdn_idx += step;

        /* apply the scan keys, if there are any */
        if (hscan->hdn_nkeys > 0)
            HeapKeyTest(candidate, RelationGetDescr(hscan->hdn_rel),
                        hscan->hdn_nkeys, hscan->hdn_key, matches);

        if (!matches)
            continue;

        /* keep it around so a "no movement" call can return it again */
        hscan->hdn_lasttuple = candidate;
        return candidate;
    }

    /* ran out of entries */
    hscan->hdn_lasttuple = NULL;
    return NULL;
}
/*
 *  _bt_first() -- Find the first item in a scan.
 *
 *      We need to be clever about the direction of scan, the search
 *      conditions, and the tree ordering.  We find the first item (or,
 *      if backwards scan, the last item) in the tree that satisfies the
 *      qualifications in the scan key.  On success exit, the page containing
 *      the current index tuple is pinned but not locked, and data about
 *      the matching tuple(s) on the page has been loaded into so->currPos.
 *      scan->xs_ctup.t_self is set to the heap TID of the current tuple,
 *      and if requested, scan->xs_itup points to a copy of the index tuple.
 *
 * Parameters:
 *  scan - index scan descriptor; its BTScanOpaque supplies the preprocessed
 *         keys and receives the resulting position in so->currPos.
 *  dir  - scan direction; decides which keys can serve as start boundaries
 *         and which end of the matching range we position on.
 *
 * Returns true when a first matching item was found.
 *
 * If there are no matching items in the index, we return FALSE, with no
 * pins or locks held.
 *
 * Note that scan->keyData[], and the so->keyData[] scankey built from it,
 * are both search-type scankeys (see nbtree/README for more about this).
 * Within this routine, we build a temporary insertion-type scankey to use
 * in locating the scan start position.
 */
bool
_bt_first(IndexScanDesc scan, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Buffer      buf;
    BTStack     stack;
    OffsetNumber offnum;
    StrategyNumber strat;
    bool        nextkey;
    bool        goback;
    ScanKey     startKeys[INDEX_MAX_KEYS];
    ScanKeyData scankeys[INDEX_MAX_KEYS];
    ScanKeyData notnullkeys[INDEX_MAX_KEYS];
    int         keysCount = 0;
    int         i;
    StrategyNumber strat_total;
    BTScanPosItem *currItem;

    pgstat_count_index_scan(rel);

    /*
     * Examine the scan keys and eliminate any redundant keys; also mark the
     * keys that must be matched to continue the scan.
     */
    _bt_preprocess_keys(scan);

    /*
     * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     * never be satisfied (eg, x == 1 AND x > 2).
     */
    if (!so->qual_ok)
        return false;

    /*----------
     * Examine the scan keys to discover where we need to start the scan.
     *
     * We want to identify the keys that can be used as starting boundaries;
     * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     * a backwards scan.  We can use keys for multiple attributes so long as
     * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     * a > or < boundary or find an attribute with no boundary (which can be
     * thought of as the same as "> -infinity"), we can't use keys for any
     * attributes to its right, because it would break our simplistic notion
     * of what initial positioning strategy to use.
     *
     * When the scan keys include cross-type operators, _bt_preprocess_keys
     * may not be able to eliminate redundant keys; in such cases we will
     * arbitrarily pick a usable one for each attribute.  This is correct
     * but possibly not optimal behavior.  (For example, with keys like
     * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     * x=5 would be more efficient.)  Since the situation only arises given
     * a poorly-worded query plus an incomplete opfamily, live with it.
     *
     * When both equality and inequality keys appear for a single attribute
     * (again, only possible when cross-type operators appear), we *must*
     * select one of the equality keys for the starting point, because
     * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     * start at x=4, we will fail and stop before reaching x=10.  If multiple
     * equality quals survive preprocessing, however, it doesn't matter which
     * one we use --- by definition, they are either redundant or
     * contradictory.
     *
     * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
     * If the index stores nulls at the end of the index we'll be starting
     * from, and we have no boundary key for the column (which means the key
     * we deduced NOT NULL from is an inequality key that constrains the other
     * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
     * use as a boundary key.  If we didn't do this, we might find ourselves
     * traversing a lot of null entries at the start of the scan.
     *
     * In this loop, row-comparison keys are treated the same as keys on their
     * first (leftmost) columns.  We'll add on lower-order columns of the row
     * comparison below, if possible.
     *
     * The selected scan keys (at most one per index column) are remembered by
     * storing their addresses into the local startKeys[] array.
     *----------
     */
    strat_total = BTEqualStrategyNumber;
    if (so->numberOfKeys > 0)
    {
        AttrNumber  curattr;
        ScanKey     chosen;
        ScanKey     impliesNN;
        ScanKey     cur;

        /*
         * chosen is the so-far-chosen key for the current attribute, if any.
         * We don't cast the decision in stone until we reach keys for the
         * next attribute.
         */
        curattr = 1;
        chosen = NULL;
        /* Also remember any scankey that implies a NOT NULL constraint */
        impliesNN = NULL;

        /*
         * Loop iterates from 0 to numberOfKeys inclusive; we use the last
         * pass to handle after-last-key processing.  Actual exit from the
         * loop is at one of the "break" statements below.
         */
        for (cur = so->keyData, i = 0;; cur++, i++)
        {
            if (i >= so->numberOfKeys || cur->sk_attno != curattr)
            {
                /*
                 * Done looking at keys for curattr.  If we didn't find a
                 * usable boundary key, see if we can deduce a NOT NULL key.
                 */
                if (chosen == NULL && impliesNN != NULL &&
                    ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                     ScanDirectionIsForward(dir) :
                     ScanDirectionIsBackward(dir)))
                {
                    /* Yes, so build the key in notnullkeys[keysCount] */
                    chosen = &notnullkeys[keysCount];
                    ScanKeyEntryInitialize(chosen,
                                           (SK_SEARCHNOTNULL | SK_ISNULL |
                                            (impliesNN->sk_flags &
                                             (SK_BT_DESC | SK_BT_NULLS_FIRST))),
                                           curattr,
                                           ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                                            BTGreaterStrategyNumber :
                                            BTLessStrategyNumber),
                                           InvalidOid,
                                           InvalidOid,
                                           InvalidOid,
                                           (Datum) 0);
                }

                /*
                 * If we still didn't find a usable boundary key, quit; else
                 * save the boundary key pointer in startKeys.
                 */
                if (chosen == NULL)
                    break;
                startKeys[keysCount++] = chosen;

                /*
                 * Adjust strat_total, and quit if we have stored a > or <
                 * key.
                 */
                strat = chosen->sk_strategy;
                if (strat != BTEqualStrategyNumber)
                {
                    strat_total = strat;
                    if (strat == BTGreaterStrategyNumber ||
                        strat == BTLessStrategyNumber)
                        break;
                }

                /*
                 * Done if that was the last attribute, or if next key is not
                 * in sequence (implying no boundary key is available for the
                 * next attribute).
                 */
                if (i >= so->numberOfKeys ||
                    cur->sk_attno != curattr + 1)
                    break;

                /*
                 * Reset for next attr.
                 */
                curattr = cur->sk_attno;
                chosen = NULL;
                impliesNN = NULL;
            }

            /*
             * Can we use this key as a starting boundary for this attr?
             *
             * If not, does it imply a NOT NULL constraint?  (Because
             * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
             * *any* inequality key works for that; we need not test.)
             */
            switch (cur->sk_strategy)
            {
                case BTLessStrategyNumber:
                case BTLessEqualStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsBackward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
                case BTEqualStrategyNumber:
                    /* override any non-equality choice */
                    chosen = cur;
                    break;
                case BTGreaterEqualStrategyNumber:
                case BTGreaterStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsForward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
            }
        }
    }

    /*
     * If we found no usable boundary keys, we have to start from one end of
     * the tree.  Walk down that edge to the first or last key, and scan from
     * there.
     */
    if (keysCount == 0)
        return _bt_endpoint(scan, dir);

    /*
     * We want to start the scan somewhere within the index.  Set up an
     * insertion scankey we can use to search for the boundary point we
     * identified above.  The insertion scankey is built in the local
     * scankeys[] array, using the keys identified by startKeys[].
     */
    Assert(keysCount <= INDEX_MAX_KEYS);
    for (i = 0; i < keysCount; i++)
    {
        ScanKey     cur = startKeys[i];

        Assert(cur->sk_attno == i + 1);

        if (cur->sk_flags & SK_ROW_HEADER)
        {
            /*
             * Row comparison header: look to the first row member instead.
             *
             * The member scankeys are already in insertion format (ie, they
             * have sk_func = 3-way-comparison function), but we have to watch
             * out for nulls, which _bt_preprocess_keys didn't check. A null
             * in the first row member makes the condition unmatchable, just
             * like qual_ok = false.
             */
            ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);

            Assert(subkey->sk_flags & SK_ROW_MEMBER);
            if (subkey->sk_flags & SK_ISNULL)
                return false;
            memcpy(scankeys + i, subkey, sizeof(ScanKeyData));

            /*
             * If the row comparison is the last positioning key we accepted,
             * try to add additional keys from the lower-order row members.
             * (If we accepted independent conditions on additional index
             * columns, we use those instead --- doesn't seem worth trying to
             * determine which is more restrictive.)  Note that this is OK
             * even if the row comparison is of ">" or "<" type, because the
             * condition applied to all but the last row member is effectively
             * ">=" or "<=", and so the extra keys don't break the positioning
             * scheme.  But, by the same token, if we aren't able to use all
             * the row members, then the part of the row comparison that we
             * did use has to be treated as just a ">=" or "<=" condition, and
             * so we'd better adjust strat_total accordingly.
             */
            if (i == keysCount - 1)
            {
                bool        used_all_subkeys = false;

                Assert(!(subkey->sk_flags & SK_ROW_END));
                for (;;)
                {
                    subkey++;
                    Assert(subkey->sk_flags & SK_ROW_MEMBER);
                    if (subkey->sk_attno != keysCount + 1)
                        break;  /* out-of-sequence, can't use it */
                    if (subkey->sk_strategy != cur->sk_strategy)
                        break;  /* wrong direction, can't use it */
                    if (subkey->sk_flags & SK_ISNULL)
                        break;  /* can't use null keys */
                    Assert(keysCount < INDEX_MAX_KEYS);
                    memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
                    keysCount++;
                    if (subkey->sk_flags & SK_ROW_END)
                    {
                        used_all_subkeys = true;
                        break;
                    }
                }
                if (!used_all_subkeys)
                {
                    switch (strat_total)
                    {
                        case BTLessStrategyNumber:
                            strat_total = BTLessEqualStrategyNumber;
                            break;
                        case BTGreaterStrategyNumber:
                            strat_total = BTGreaterEqualStrategyNumber;
                            break;
                    }
                }
                break;          /* done with outer loop */
            }
        }
        else
        {
            /*
             * Ordinary comparison key.  Transform the search-style scan key
             * to an insertion scan key by replacing the sk_func with the
             * appropriate btree comparison function.
             *
             * If scankey operator is not a cross-type comparison, we can use
             * the cached comparison function; otherwise gotta look it up in
             * the catalogs.  (That can't lead to infinite recursion, since no
             * indexscan initiated by syscache lookup will use cross-data-type
             * operators.)
             *
             * We support the convention that sk_subtype == InvalidOid means
             * the opclass input type; this is a hack to simplify life for
             * ScanKeyInit().
             */
            if (cur->sk_subtype == rel->rd_opcintype[i] ||
                cur->sk_subtype == InvalidOid)
            {
                FmgrInfo   *procinfo;

                procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
                ScanKeyEntryInitializeWithInfo(scankeys + i,
                                               cur->sk_flags,
                                               cur->sk_attno,
                                               InvalidStrategy,
                                               cur->sk_subtype,
                                               cur->sk_collation,
                                               procinfo,
                                               cur->sk_argument);
            }
            else
            {
                RegProcedure cmp_proc;

                cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
                                             rel->rd_opcintype[i],
                                             cur->sk_subtype,
                                             BTORDER_PROC);
                if (!RegProcedureIsValid(cmp_proc))
                    elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
                         BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
                         cur->sk_attno, RelationGetRelationName(rel));
                ScanKeyEntryInitialize(scankeys + i,
                                       cur->sk_flags,
                                       cur->sk_attno,
                                       InvalidStrategy,
                                       cur->sk_subtype,
                                       cur->sk_collation,
                                       cmp_proc,
                                       cur->sk_argument);
            }
        }
    }

    /*----------
     * Examine the selected initial-positioning strategy to determine exactly
     * where we need to start the scan, and set flag variables to control the
     * code below.
     *
     * If nextkey = false, _bt_search and _bt_binsrch will locate the first
     * item >= scan key.  If nextkey = true, they will locate the first
     * item > scan key.
     *
     * If goback = true, we will then step back one item, while if
     * goback = false, we will start the scan on the located item.
     *----------
     */
    switch (strat_total)
    {
        case BTLessStrategyNumber:

            /*
             * Find first item >= scankey, then back up one to arrive at last
             * item < scankey.  (Note: this positioning strategy is only used
             * for a backward scan, so that is always the correct starting
             * position.)
             */
            nextkey = false;
            goback = true;
            break;

        case BTLessEqualStrategyNumber:

            /*
             * Find first item > scankey, then back up one to arrive at last
             * item <= scankey.  (Note: this positioning strategy is only used
             * for a backward scan, so that is always the correct starting
             * position.)
             */
            nextkey = true;
            goback = true;
            break;

        case BTEqualStrategyNumber:

            /*
             * If a backward scan was specified, need to start with last equal
             * item not first one.
             */
            if (ScanDirectionIsBackward(dir))
            {
                /*
                 * This is the same as the <= strategy.  We will check at the
                 * end whether the found item is actually =.
                 */
                nextkey = true;
                goback = true;
            }
            else
            {
                /*
                 * This is the same as the >= strategy.  We will check at the
                 * end whether the found item is actually =.
                 */
                nextkey = false;
                goback = false;
            }
            break;

        case BTGreaterEqualStrategyNumber:

            /*
             * Find first item >= scankey.  (This is only used for forward
             * scans.)
             */
            nextkey = false;
            goback = false;
            break;

        case BTGreaterStrategyNumber:

            /*
             * Find first item > scankey.  (This is only used for forward
             * scans.)
             */
            nextkey = true;
            goback = false;
            break;

        default:
            /* can't get here, but keep compiler quiet */
            elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
            return false;
    }

    /*
     * Use the manufactured insertion scan key to descend the tree and
     * position ourselves on the target leaf page.
     */
    stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);

    /* don't need to keep the stack around... */
    _bt_freestack(stack);

    /* remember which buffer we have pinned, if any */
    so->currPos.buf = buf;

    if (!BufferIsValid(buf))
    {
        /*
         * We only get here if the index is completely empty. Lock relation
         * because nothing finer to lock exists.
         */
        PredicateLockRelation(rel, scan->xs_snapshot);
        return false;
    }
    else
        PredicateLockPage(rel, BufferGetBlockNumber(buf),
                          scan->xs_snapshot);

    /* initialize moreLeft/moreRight appropriately for scan direction */
    if (ScanDirectionIsForward(dir))
    {
        so->currPos.moreLeft = false;
        so->currPos.moreRight = true;
    }
    else
    {
        so->currPos.moreLeft = true;
        so->currPos.moreRight = false;
    }
    so->numKilled = 0;          /* just paranoia */
    so->markItemIndex = -1;     /* ditto */

    /* position to the precise item on the page */
    offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);

    /*
     * If nextkey = false, we are positioned at the first item >= scan key, or
     * possibly at the end of a page on which all the existing items are less
     * than the scan key and we know that everything on later pages is greater
     * than or equal to scan key.
     *
     * If nextkey = true, we are positioned at the first item > scan key, or
     * possibly at the end of a page on which all the existing items are less
     * than or equal to the scan key and we know that everything on later
     * pages is greater than scan key.
     *
     * The actually desired starting point is either this item or the prior
     * one, or in the end-of-page case it's the first item on the next page or
     * the last item on this page.  Adjust the starting offset if needed. (If
     * this results in an offset before the first item or after the last one,
     * _bt_readpage will report no items found, and then we'll step to the
     * next page as needed.)
     */
    if (goback)
        offnum = OffsetNumberPrev(offnum);

    /*
     * Now load data from the first page of the scan.
     */
    if (!_bt_readpage(scan, dir, offnum))
    {
        /*
         * There's no actually-matching data on this page.  Try to advance to
         * the next page.  Return false if there's no matching data at all.
         */
        if (!_bt_steppage(scan, dir))
            return false;
    }

    /* Drop the lock, but not pin, on the current page */
    LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_ctup.t_self = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}
/*
 *	_bt_readpage() -- Collect the matching items of the current index page
 *
 * On entry so->currPos.buf must be pinned and read-locked; this routine does
 * not alter the buffer's pin or lock state.  currPos.moreLeft and
 * currPos.moreRight must already be valid and are adjusted here; the other
 * currPos fields touched below are rebuilt from scratch.
 *
 * Beginning at offnum, the page is walked in direction dir and every item
 * accepted by _bt_checkkeys is stored in currPos.items.  As soon as
 * _bt_checkkeys reports that the scan cannot continue in this direction,
 * the corresponding moreLeft/moreRight flag is cleared and the walk stops.
 *
 * The result is true if at least one item was stored, false otherwise.
 */
static bool
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber firstoff;
	OffsetNumber lastoff;
	int			nextslot;
	bool		keepgoing;

	/* caller is responsible for holding pin and lock on the buffer */
	Assert(BufferIsValid(so->currPos.buf));

	page = BufferGetPage(so->currPos.buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	firstoff = P_FIRSTDATAKEY(opaque);
	lastoff = PageGetMaxOffsetNumber(page);

	/*
	 * Remember the right-link now, so we know where to step right once these
	 * items are consumed.  The left-link needs no such treatment because
	 * page splits always move data to the right.
	 */
	so->currPos.nextPage = opaque->btpo_next;

	/* the tuple workspace starts out empty */
	so->currPos.nextTupleOffset = 0;

	if (!ScanDirectionIsForward(dir))
	{
		/* fill items[] from the top downwards */
		nextslot = MaxIndexTuplesPerPage;

		for (offnum = Min(offnum, lastoff);
			 offnum >= firstoff;
			 offnum = OffsetNumberPrev(offnum))
		{
			IndexTuple	match;

			match = _bt_checkkeys(scan, page, offnum, dir, &keepgoing);
			if (match != NULL)
			{
				/* every scan key condition holds: keep this item */
				nextslot--;
				_bt_saveitem(so, nextslot, offnum, match);
			}
			if (!keepgoing)
			{
				/* nothing further to the left can match */
				so->currPos.moreLeft = false;
				break;
			}
		}

		Assert(nextslot >= 0);
		so->currPos.firstItem = nextslot;
		so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
		so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
	}
	else
	{
		/* fill items[] from the bottom upwards */
		nextslot = 0;

		for (offnum = Max(offnum, firstoff);
			 offnum <= lastoff;
			 offnum = OffsetNumberNext(offnum))
		{
			IndexTuple	match;

			match = _bt_checkkeys(scan, page, offnum, dir, &keepgoing);
			if (match != NULL)
			{
				/* every scan key condition holds: keep this item */
				_bt_saveitem(so, nextslot, offnum, match);
				nextslot++;
			}
			if (!keepgoing)
			{
				/* nothing further to the right can match */
				so->currPos.moreRight = false;
				break;
			}
		}

		Assert(nextslot <= MaxIndexTuplesPerPage);
		so->currPos.firstItem = 0;
		so->currPos.lastItem = nextslot - 1;
		so->currPos.itemIndex = 0;
	}

	/* a non-empty [firstItem, lastItem] range means something matched */
	return so->currPos.firstItem <= so->currPos.lastItem;
}
/* ----------------------------------------------------------------
 *		ValuesNext
 *
 *		This is a workhorse for ExecValuesScan.
 *
 *		Advances node->curr_idx by one step in the executor's current scan
 *		direction, evaluates the expression list at that position into the
 *		scan tuple slot as a virtual tuple, and returns the slot.  When the
 *		position moves off either end of node->exprlists, the slot is
 *		returned empty (cleared).
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
ValuesNext(ValuesScanState *node)
{
	TupleTableSlot *slot;
	EState	   *estate;
	ExprContext *econtext;
	ScanDirection direction;
	List	   *exprlist;

	/*
	 * get information from the estate and scan state
	 */
	estate = node->ss.ps.state;
	direction = estate->es_direction;
	slot = node->ss.ss_ScanTupleSlot;
	econtext = node->rowcontext;

	/*
	 * Step to the next row in the requested direction.  curr_idx is only
	 * moved while it is still inside (or one step before/after) the array,
	 * so repeated calls past the end do not run away.
	 */
	if (ScanDirectionIsForward(direction))
	{
		if (node->curr_idx < node->array_len)
			node->curr_idx++;
		if (node->curr_idx < node->array_len)
			exprlist = node->exprlists[node->curr_idx];
		else
			exprlist = NIL;
	}
	else
	{
		if (node->curr_idx >= 0)
			node->curr_idx--;
		if (node->curr_idx >= 0)
			exprlist = node->exprlists[node->curr_idx];
		else
			exprlist = NIL;
	}

	/*
	 * Always clear the result slot; this is appropriate if we are at the end
	 * of the data, and if we're not, we still need it as the first step of
	 * the store-virtual-tuple protocol.  It seems wise to clear the slot
	 * before we reset the context it might have pointers into.
	 */
	ExecClearTuple(slot);

	if (exprlist)
	{
		MemoryContext oldContext;
		List	   *exprstatelist;
		Datum	   *values;
		bool	   *isnull;
		ListCell   *lc;
		int			resind;

		/*
		 * Get rid of any prior cycle's leftovers.  We use ReScanExprContext
		 * not just ResetExprContext because we want any registered shutdown
		 * callbacks to be called.
		 */
		ReScanExprContext(econtext);

		/*
		 * Build the expression eval state in the econtext's per-tuple memory.
		 * This is a tad unusual, but we want to delete the eval state again
		 * when we move to the next row, to avoid growth of memory
		 * requirements over a long values list.
		 */
		oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);

		/*
		 * Pass NULL, not my plan node, because we don't want anything in this
		 * transient state linking into permanent state.  The only possibility
		 * is a SubPlan, and there shouldn't be any (any subselects in the
		 * VALUES list should be InitPlans).
		 */
		exprstatelist = (List *) ExecInitExpr((Expr *) exprlist, NULL);

		/* parser should have checked all sublists are the same length */
		Assert(list_length(exprstatelist) == slot->tts_tupleDescriptor->natts);

		/*
		 * Compute the expressions and build a virtual result tuple.  We
		 * already did ExecClearTuple(slot).
		 */
		values = slot->tts_values;
		isnull = slot->tts_isnull;

		resind = 0;
		foreach(lc, exprstatelist)
		{
			/* named so as not to shadow the function-level EState pointer */
			ExprState  *exprstate = (ExprState *) lfirst(lc);

			values[resind] = ExecEvalExpr(exprstate,
										  econtext,
										  &isnull[resind],
										  NULL);
			resind++;
		}

		MemoryContextSwitchTo(oldContext);

		/*
		 * And return the virtual tuple.
		 */
		ExecStoreVirtualTuple(slot);
	}

	/* either the freshly stored virtual tuple or the cleared (empty) slot */
	return slot;
}
/*
 *	_bt_endpoint() -- Start a scan from the first or last leaf page
 *
 * _bt_first() calls this once it has decided that the scan has to begin at
 * the very start of the index (forward scan) or the very end (backward
 * scan).  We locate that leaf page and then read forward/backward from it
 * until an item satisfying all the quals is found.  Exit conditions are the
 * same as for _bt_first().
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Buffer		leafbuf;
	Page		leafpage;
	BTPageOpaque leafopaque;
	OffsetNumber firstoff;
	BTScanPosItem *firstitem;

	/*
	 * Descend to the leftmost or rightmost leaf page.  This is a cut-down
	 * _bt_search(); no stack is kept because none will be needed.
	 */
	leafbuf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));

	if (!BufferIsValid(leafbuf))
	{
		/*
		 * The index is empty.  There is nothing finer-grained to lock, so
		 * take the predicate lock on the whole relation.
		 */
		PredicateLockRelation(rel, scan->xs_snapshot);
		so->currPos.buf = InvalidBuffer;
		return false;
	}

	PredicateLockPage(rel, BufferGetBlockNumber(leafbuf), scan->xs_snapshot);

	leafpage = BufferGetPage(leafbuf);
	leafopaque = (BTPageOpaque) PageGetSpecialPointer(leafpage);
	Assert(P_ISLEAF(leafopaque));

	/* pick the starting offset on the page for this direction */
	if (ScanDirectionIsForward(dir))
	{
		/*
		 * Dead pages may exist to the left of this one, so we deliberately
		 * do not assert P_LEFTMOST here.
		 */
		firstoff = P_FIRSTDATAKEY(leafopaque);
	}
	else if (ScanDirectionIsBackward(dir))
	{
		Assert(P_RIGHTMOST(leafopaque));
		firstoff = PageGetMaxOffsetNumber(leafpage);
	}
	else
	{
		elog(ERROR, "invalid scan direction: %d", (int) dir);
		firstoff = 0;			/* keep compiler quiet */
	}

	/* record the pinned buffer and reset the per-scan position state */
	so->currPos.buf = leafbuf;
	so->currPos.moreRight = ScanDirectionIsForward(dir);
	so->currPos.moreLeft = !ScanDirectionIsForward(dir);
	so->numKilled = 0;			/* just paranoia */
	so->markItemIndex = -1;		/* ditto */

	/*
	 * Load the first page's matches.  If this page holds none, move on to
	 * the following page(s); give up only when no page has a match.
	 */
	if (!_bt_readpage(scan, dir, firstoff) && !_bt_steppage(scan, dir))
		return false;

	/* release the lock on the current page but keep it pinned */
	LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

	/* currPos.itemIndex identifies the item to hand back */
	firstitem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_ctup.t_self = firstitem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + firstitem->tupleOffset);

	return true;
}
/* ----------------------------------------------------------------
 *		ExecSort
 *
 *		On the first call, drains the outer subplan into a tuplesort
 *		(which keeps the data in memory or a temporary file) and performs
 *		the sort.  That call and every later one return one tuple from
 *		the sorted result, in the executor's current scan direction.
 *
 *		Conditions:
 *		  -- none.
 *
 *		Initial States:
 *		  -- the outer child is prepared to return the first tuple.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecSort(SortState *node)
{
	EState	   *estate;
	ScanDirection dir;
	Tuplesortstate *sorter;
	TupleTableSlot *slot;

	SO1_printf("ExecSort: %s\n",
			   "entering routine");

	/* pick up the executor state and the caller's scan direction */
	estate = node->ss.ps.state;
	dir = estate->es_direction;

	if (node->sort_Done)
	{
		/* input was sorted by an earlier call; just reuse that state */
		sorter = (Tuplesortstate *) node->tuplesortstate;
	}
	else
	{
		Sort	   *sortplan = (Sort *) node->ss.ps.plan;
		PlanState  *child;
		TupleDesc	childdesc;

		SO1_printf("ExecSort: %s\n",
				   "sorting subplan");

		/* the subplan must be read forwards while loading the sort */
		estate->es_direction = ForwardScanDirection;

		SO1_printf("ExecSort: %s\n",
				   "calling tuplesort_begin");

		/* set up the tuplesort according to the plan node's sort keys */
		child = outerPlanState(node);
		childdesc = ExecGetResultType(child);

		sorter = tuplesort_begin_heap(childdesc,
									  sortplan->numCols,
									  sortplan->sortColIdx,
									  sortplan->sortOperators,
									  sortplan->collations,
									  sortplan->nullsFirst,
									  work_mem,
									  node->randomAccess);
		if (node->bounded)
			tuplesort_set_bound(sorter, node->bound);
		node->tuplesortstate = (void *) sorter;

		/* pull every tuple out of the subplan and hand it to tuplesort */
		slot = ExecProcNode(child);
		while (!TupIsNull(slot))
		{
			tuplesort_puttupleslot(sorter, slot);
			slot = ExecProcNode(child);
		}

		/* all input is in; do the actual sort */
		tuplesort_performsort(sorter);

		/* put back the direction the caller asked for */
		estate->es_direction = dir;

		/* note that sorting is finished, and with which bound settings */
		node->sort_Done = true;
		node->bounded_Done = node->bounded;
		node->bound_Done = node->bound;

		SO1_printf("ExecSort: %s\n", "sorting done");
	}

	SO1_printf("ExecSort: %s\n",
			   "retrieving tuple from tuplesort");

	/*
	 * Fetch the first or next sorted tuple into the result slot.  The
	 * fetch's own return value is ignored; the slot is returned either way.
	 */
	slot = node->ss.ps.ps_ResultTupleSlot;
	(void) tuplesort_gettupleslot(sorter,
								  ScanDirectionIsForward(dir),
								  slot);
	return slot;
}
/* ----------------------------------------------------------------
 *		ExecMaterial
 *
 *		As long as we are at the end of the data collected in the tuplestore,
 *		we collect one new row from the subplan on each call, and stash it
 *		aside in the tuplestore before returning it.  The tuplestore is
 *		only read if we are asked to scan backwards, rescan, or mark/restore.
 *
 *		Overview of the paths below:
 *		  - First call needing a store (shared Material, or randomAccess):
 *			create the tuplestore and accessor, optionally reusing cached
 *			workfiles, and, for cdb_strict or shared Materials, load the
 *			whole subplan output into it up front.
 *		  - Shared Material (share_type != SHARE_NOTSHARED): always returns
 *			NULL; the node only populates the store.
 *		  - Otherwise: return the next tuple from the store if there is one,
 *			else fetch one row from the subplan, save it (when an accessor
 *			exists) and return a copy; NULL when everything is exhausted.
 * ----------------------------------------------------------------
 */
TupleTableSlot *				/* result tuple from subplan */
ExecMaterial(MaterialState *node)
{
	EState *estate;
	ScanDirection dir;
	bool forward;

	NTupleStore *ts;
	NTupleStoreAccessor *tsa;

	bool eof_tuplestore;
	TupleTableSlot *slot;
	Material *ma;

	/*
	 * get state info from node
	 */
	estate = node->ss.ps.state;
	dir = estate->es_direction;
	forward = ScanDirectionIsForward(dir);

	ts = node->ts_state->matstore;
	tsa = (NTupleStoreAccessor *) node->ts_pos;

	ma = (Material *) node->ss.ps.plan;
	Assert(IsA(ma, Material));

	/*
	 * If first time through, and we need a tuplestore, initialize it.
	 *
	 * Note that a non-shared Material without randomAccess never enters this
	 * block, so ts (and whatever node->ts_pos held) is left untouched for it.
	 */
	if (ts == NULL && (ma->share_type != SHARE_NOTSHARED || node->randomAccess))
	{
		/*
		 * For cross slice material, we only run ExecMaterial on DriverSlice
		 */
		if(ma->share_type == SHARE_MATERIAL_XSLICE)
		{
			char rwfile_prefix[100];

			/* any slice other than the driver slice does no work here */
			if(ma->driver_slice != currentSliceId)
			{
				elog(LOG, "Material Exec on CrossSlice, current slice %d", currentSliceId);
				return NULL;
			}

			shareinput_create_bufname_prefix(rwfile_prefix, sizeof(rwfile_prefix), ma->share_id);
			elog(LOG, "Material node creates shareinput rwfile %s", rwfile_prefix);

			ts = ntuplestore_create_readerwriter(rwfile_prefix, PlanStateOperatorMemKB((PlanState *)node) * 1024, true);
			tsa = ntuplestore_create_accessor(ts, true);
		}
		else
		{
			/* Non-shared Materialize node */
			bool isWriter = true;
			workfile_set *work_set = NULL;

			if (gp_workfile_caching)
			{
				work_set = workfile_mgr_find_set( &node->ss.ps);

				if (NULL != work_set)
				{
					/* Reusing cached workfiles. Tell subplan we won't be needing any tuples */
					elog(gp_workfile_caching_loglevel, "Materialize reusing cached workfiles, initiating Squelch walker");

					isWriter = false;
					ExecSquelchNode(outerPlanState(node));
					/* mark the subplan as exhausted so it is never executed */
					node->eof_underlying = true;
					node->cached_workfiles_found = true;

					if (node->ss.ps.instrument)
					{
						node->ss.ps.instrument->workfileReused = true;
					}
				}
			}

			if (NULL == work_set)
			{
				/*
				 * No work_set found, this is because:
				 *  a. workfile caching is enabled but we didn't find any reusable set
				 *  b. workfile caching is disabled
				 * Creating new empty workset
				 */
				Assert(!node->cached_workfiles_found);

				/* Don't try to cache when running under a ShareInputScan node */
				bool can_reuse = (ma->share_type == SHARE_NOTSHARED);

				work_set = workfile_mgr_create_set(BUFFILE, can_reuse, &node->ss.ps, NULL_SNAPSHOT);
				isWriter = true;
			}

			Assert(NULL != work_set);
			/* we write the store exactly when no cached workfiles were found */
			AssertEquivalent(node->cached_workfiles_found, !isWriter);

			ts = ntuplestore_create_workset(work_set, node->cached_workfiles_found,
					PlanStateOperatorMemKB((PlanState *) node) * 1024);
			tsa = ntuplestore_create_accessor(ts, isWriter);
		}

		Assert(ts && tsa);
		/* publish the new store and accessor in the node for later calls */
		node->ts_state->matstore = ts;
		node->ts_pos = (void *) tsa;

		/* CDB: Offer extra info for EXPLAIN ANALYZE. */
		if (node->ss.ps.instrument)
		{
			/* Let the tuplestore share our Instrumentation object. */
			ntuplestore_setinstrument(ts, node->ss.ps.instrument);

			/* Request a callback at end of query. */
			node->ss.ps.cdbexplainfun = ExecMaterialExplainEnd;
		}

		/*
		 * MPP: If requested, fetch all rows from subplan and put them
		 * in the tuplestore.  This decouples a middle slice's receiving
		 * and sending Motion operators to neutralize a deadlock hazard.
		 * MPP TODO: Remove when a better solution is implemented.
		 *
		 * ShareInput: if the material node
		 * is used to share input, we will need to fetch all rows and put
		 * them in tuple store
		 *
		 * The loop condition does not change inside the loop; the loop is
		 * left only through one of the break statements below.
		 */
		while (((Material *) node->ss.ps.plan)->cdb_strict
				|| ma->share_type != SHARE_NOTSHARED)
		{
			/*
			 * When reusing cached workfiles, we already have all the tuples,
			 * and we don't need to read anything from subplan.
			 */
			if (node->cached_workfiles_found)
			{
				break;
			}

			TupleTableSlot *outerslot = ExecProcNode(outerPlanState(node));

			if (TupIsNull(outerslot))
			{
				/* subplan exhausted: finalize the store and rewind it */
				node->eof_underlying = true;

				if (ntuplestore_created_reusable_workfiles(ts))
				{
					ntuplestore_flush(ts);
					ntuplestore_mark_workset_complete(ts);
				}

				ntuplestore_acc_seek_bof(tsa);

				break;
			}

			Gpmon_M_Incr(GpmonPktFromMaterialState(node), GPMON_QEXEC_M_ROWSIN);

			ntuplestore_acc_put_tupleslot(tsa, outerslot);
		}

		CheckSendPlanStateGpmonPkt(&node->ss.ps);

		/* position the accessor at the end the scan direction starts from */
		if(forward)
			ntuplestore_acc_seek_bof(tsa);
		else
			ntuplestore_acc_seek_eof(tsa);

		/* for share input, the material node does not need to return any tuple */
		if(ma->share_type != SHARE_NOTSHARED)
		{
			Assert(ma->share_type == SHARE_MATERIAL || ma->share_type == SHARE_MATERIAL_XSLICE);
			/*
			 * if the material is shared across slice, notify consumers that
			 * it is ready.
			 */
			if(ma->share_type == SHARE_MATERIAL_XSLICE)
			{
				if (ma->driver_slice == currentSliceId)
				{
					ntuplestore_flush(ts);

					node->share_lk_ctxt = shareinput_writer_notifyready(ma->share_id, ma->nsharer_xslice,
							estate->es_plannedstmt->planGen);
				}
			}
			return NULL;
		}
	}

	/* a shared Material never hands tuples back from here, on any call */
	if(ma->share_type != SHARE_NOTSHARED)
		return NULL;

	/*
	 * If we can fetch another tuple from the tuplestore, return it.
	 *
	 * With no accessor at all (tsa == NULL) the store is treated as being
	 * at EOF, which sends us to the subplan below.
	 */
	slot = node->ss.ps.ps_ResultTupleSlot;

	if(forward)
		eof_tuplestore = (tsa == NULL) || !ntuplestore_acc_advance(tsa, 1);
	else
		eof_tuplestore = (tsa == NULL) || !ntuplestore_acc_advance(tsa, -1);

	if(tsa!=NULL && ntuplestore_acc_tell(tsa, NULL))
	{
		ntuplestore_acc_current_tupleslot(tsa, slot);

		/* only count the row as output if the slot actually holds a tuple */
		if (!TupIsNull(slot))
		{
			Gpmon_M_Incr_Rows_Out(GpmonPktFromMaterialState(node));
			CheckSendPlanStateGpmonPkt(&node->ss.ps);
		}

		return slot;
	}

	/*
	 * If necessary, try to fetch another row from the subplan.
	 *
	 * Note: the eof_underlying state variable exists to short-circuit further
	 * subplan calls.  It's not optional, unfortunately, because some plan
	 * node types are not robust about being called again when they've already
	 * returned NULL.
	 * If reusing cached workfiles, there is no need to execute subplan at all.
	 */
	if (eof_tuplestore && !node->eof_underlying)
	{
		PlanState *outerNode;
		TupleTableSlot *outerslot;

		Assert(!node->cached_workfiles_found && "we shouldn't get here when using cached workfiles");

		/*
		 * We can only get here with forward==true, so no need to worry about
		 * which direction the subplan will go.
		 */
		outerNode = outerPlanState(node);
		outerslot = ExecProcNode(outerNode);
		if (TupIsNull(outerslot))
		{
			node->eof_underlying = true;

			/*
			 * NOTE(review): ts can still be NULL on this path (no tuplestore
			 * is created for a non-shared Material without randomAccess, and
			 * tsa is NULL-checked nearby but ts is not) -- confirm that
			 * ntuplestore_created_reusable_workfiles() tolerates a NULL store.
			 */
			if (ntuplestore_created_reusable_workfiles(ts))
			{
				ntuplestore_flush(ts);
				ntuplestore_mark_workset_complete(ts);
			}

			if (!node->ss.ps.delayEagerFree)
			{
				ExecEagerFreeMaterial(node);
			}

			return NULL;
		}

		Gpmon_M_Incr(GpmonPktFromMaterialState(node), GPMON_QEXEC_M_ROWSIN);

		/* remember the row only when a tuplestore accessor exists */
		if (tsa)
			ntuplestore_acc_put_tupleslot(tsa, outerslot);

		/*
		 * And return a copy of the tuple.  (XXX couldn't we just return the
		 * outerslot?)
		 */
		Gpmon_M_Incr_Rows_Out(GpmonPktFromMaterialState(node));
		CheckSendPlanStateGpmonPkt(&node->ss.ps);
		return ExecCopySlot(slot, outerslot);
	}

	/* both the store and the subplan are exhausted: release resources early */
	if (!node->ss.ps.delayEagerFree)
	{
		ExecEagerFreeMaterial(node);
	}

	/*
	 * Nothing left ...
	 */
	return NULL;
}
/* * Test whether an indextuple satisfies all the scankey conditions. * * If so, copy its TID into scan->xs_ctup.t_self, and return TRUE. * If not, return FALSE (xs_ctup is not changed). * * If the tuple fails to pass the qual, we also determine whether there's * any need to continue the scan beyond this tuple, and set *continuescan * accordingly. See comments for _bt_preprocess_keys(), above, about how * this is done. * * scan: index scan descriptor (containing a search-type scankey) * page: buffer page containing index tuple * offnum: offset number of index tuple (must be a valid item!) * dir: direction we are scanning in * continuescan: output parameter (will be set correctly in all cases) */ bool _bt_checkkeys(IndexScanDesc scan, Page page, OffsetNumber offnum, ScanDirection dir, bool *continuescan) { ItemId iid = PageGetItemId(page, offnum); bool tuple_valid; IndexTuple tuple; TupleDesc tupdesc; BTScanOpaque so; int keysz; int ikey; ScanKey key; *continuescan = true; /* default assumption */ /* * If the scan specifies not to return killed tuples, then we treat a * killed tuple as not passing the qual. Most of the time, it's a win to * not bother examining the tuple's index keys, but just return * immediately with continuescan = true to proceed to the next tuple. * However, if this is the last tuple on the page, we should check the * index keys to prevent uselessly advancing to the next page. */ if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) { /* return immediately if there are more tuples on the page */ if (ScanDirectionIsForward(dir)) { if (offnum < PageGetMaxOffsetNumber(page)) return false; } else { BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (offnum > P_FIRSTDATAKEY(opaque)) return false; } /* * OK, we want to check the keys, but we'll return FALSE even if the * tuple passes the key tests. 
*/ tuple_valid = false; } else tuple_valid = true; tuple = (IndexTuple) PageGetItem(page, iid); IncrIndexProcessed(); tupdesc = RelationGetDescr(scan->indexRelation); so = (BTScanOpaque) scan->opaque; keysz = so->numberOfKeys; for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++) { Datum datum; bool isNull; Datum test; /* row-comparison keys need special processing */ if (key->sk_flags & SK_ROW_HEADER) { if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan)) continue; return false; } datum = index_getattr(tuple, key->sk_attno, tupdesc, &isNull); if (key->sk_flags & SK_ISNULL) { /* Handle IS NULL tests */ Assert(key->sk_flags & SK_SEARCHNULL); if (isNull) continue; /* tuple satisfies this qual */ /* * Tuple fails this qual. If it's a required qual for the current * scan direction, then we can conclude no further tuples will * pass, either. */ if ((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) *continuescan = false; else if ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) *continuescan = false; /* * In any case, this indextuple doesn't match the qual. */ return false; } if (isNull) { if (key->sk_flags & SK_BT_NULLS_FIRST) { /* * Since NULLs are sorted before non-NULLs, we know we have * reached the lower limit of the range of values for this * index attr. On a backward scan, we can stop if this qual * is one of the "must match" subset. On a forward scan, * however, we should keep going. */ if ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) *continuescan = false; } else { /* * Since NULLs are sorted after non-NULLs, we know we have * reached the upper limit of the range of values for this * index attr. On a forward scan, we can stop if this qual is * one of the "must match" subset. On a backward scan, * however, we should keep going. */ if ((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) *continuescan = false; } /* * In any case, this indextuple doesn't match the qual. 
*/ return false; } test = FunctionCall2(&key->sk_func, datum, key->sk_argument); if (!DatumGetBool(test)) { /* * Tuple fails this qual. If it's a required qual for the current * scan direction, then we can conclude no further tuples will * pass, either. * * Note: because we stop the scan as soon as any required equality * qual fails, it is critical that equality quals be used for the * initial positioning in _bt_first() when they are available. See * comments in _bt_first(). */ if ((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) *continuescan = false; else if ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) *continuescan = false; /* * In any case, this indextuple doesn't match the qual. */ return false; } } /* If we get here, the tuple passes all index quals. */ if (tuple_valid) scan->xs_ctup.t_self = tuple->t_tid; return tuple_valid; }
/* ----------------------------------------------------------------
 *		IndexNext
 *
 *		Fetch the next tuple of the IndexScan node's relation through the
 *		index described by the IndexScanState, store it in the scan tuple
 *		slot and return that slot.  The slot is returned cleared once the
 *		index has no more tuples.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
IndexNext(IndexScanState *node)
{
	IndexScan  *plan = (IndexScan *) node->ss.ps.plan;
	ScanDirection scandir = node->ss.ps.state->es_direction;
	IndexScanDesc indexscan = node->iss_ScanDesc;
	ExprContext *qualcontext = node->ss.ps.ps_ExprContext;
	TupleTableSlot *result = node->ss.ss_ScanTupleSlot;

	/* a plan that scans the index backwards inverts the executor direction */
	if (ScanDirectionIsBackward(plan->indexorderdir))
	{
		if (ScanDirectionIsForward(scandir))
			scandir = BackwardScanDirection;
		else if (ScanDirectionIsBackward(scandir))
			scandir = ForwardScanDirection;
	}

	/* keep asking the index for tuples until one is acceptable */
	for (;;)
	{
		HeapTuple	candidate = index_getnext(indexscan, scandir);

		if (candidate == NULL)
			break;

		/*
		 * Put the tuple into the scan slot.  'false' is passed because what
		 * amgetnext returns points into a disk page and must not be
		 * pfree()'d.
		 */
		ExecStoreTuple(candidate,	/* tuple to store */
					   result,		/* slot to store in */
					   indexscan->xs_cbuf,	/* buffer containing tuple */
					   false);		/* don't pfree */

		/* an exact index match needs no further checking */
		if (!indexscan->xs_recheck)
			return result;

		/*
		 * The index was lossy: evaluate the original index quals against
		 * the real tuple, and go round again if they do not hold.
		 */
		qualcontext->ecxt_scantuple = result;
		ResetExprContext(qualcontext);
		if (ExecQual(node->indexqualorig, qualcontext, false))
			return result;
	}

	/* the index is exhausted, so the scan is over: hand back an empty slot */
	return ExecClearTuple(result);
}
/* * Test whether an indextuple satisfies a row-comparison scan condition. * * Return true if so, false if not. If not, also clear *continuescan if * it's not possible for any future tuples in the current scan direction * to pass the qual. * * This is a subroutine for _bt_checkkeys, which see for more info. */ static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc, ScanDirection dir, bool *continuescan) { ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); int32 cmpresult = 0; bool result; /* First subkey should be same as the header says */ Assert(subkey->sk_attno == skey->sk_attno); /* Loop over columns of the row condition */ for (;;) { Datum datum; bool isNull; Assert(subkey->sk_flags & SK_ROW_MEMBER); datum = index_getattr(tuple, subkey->sk_attno, tupdesc, &isNull); if (isNull) { if (subkey->sk_flags & SK_BT_NULLS_FIRST) { /* * Since NULLs are sorted before non-NULLs, we know we have * reached the lower limit of the range of values for this * index attr. On a backward scan, we can stop if this qual is * one of the "must match" subset. On a forward scan, * however, we should keep going. */ if ((subkey->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) *continuescan = false; } else { /* * Since NULLs are sorted after non-NULLs, we know we have * reached the upper limit of the range of values for this * index attr. On a forward scan, we can stop if this qual is * one of the "must match" subset. On a backward scan, * however, we should keep going. */ if ((subkey->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) *continuescan = false; } /* * In any case, this indextuple doesn't match the qual. */ return false; } if (subkey->sk_flags & SK_ISNULL) { /* * Unlike the simple-scankey case, this isn't a disallowed case. * But it can never match. If all the earlier row comparison * columns are required for the scan direction, we can stop the * scan, because there can't be another tuple that will succeed. 
*/ if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument)) subkey--; if ((subkey->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) *continuescan = false; else if ((subkey->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) *continuescan = false; return false; } /* Perform the test --- three-way comparison not bool operator */ cmpresult = DatumGetInt32(FunctionCall2(&subkey->sk_func, datum, subkey->sk_argument)); if (subkey->sk_flags & SK_BT_DESC) cmpresult = -cmpresult; /* Done comparing if unequal, else advance to next column */ if (cmpresult != 0) break; if (subkey->sk_flags & SK_ROW_END) break; subkey++; } /* * At this point cmpresult indicates the overall result of the row * comparison, and subkey points to the deciding column (or the last * column if the result is "="). */ switch (subkey->sk_strategy) { /* EQ and NE cases aren't allowed here */ case BTLessStrategyNumber: result = (cmpresult < 0); break; case BTLessEqualStrategyNumber: result = (cmpresult <= 0); break; case BTGreaterEqualStrategyNumber: result = (cmpresult >= 0); break; case BTGreaterStrategyNumber: result = (cmpresult > 0); break; default: elog(ERROR, "unrecognized RowCompareType: %d", (int) subkey->sk_strategy); result = 0; /* keep compiler quiet */ break; } if (!result) { /* * Tuple fails this qual. If it's a required qual for the current * scan direction, then we can conclude no further tuples will pass, * either. Note we have to look at the deciding column, not * necessarily the first or last column of the row condition. */ if ((subkey->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) *continuescan = false; else if ((subkey->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) *continuescan = false; } return result; }