/*
 * get_operator_work_mem
 *		Report the operator memory budget (in KB) that applies to 'ps'.
 *
 * A HashJoin does not own the budget itself: the value is taken from its
 * right child, which is expected to be a Hash node.  Every other node type
 * reports its own value.
 *
 * A NULL 'ps' (e.g. when spilling from a built-in function) yields 0.
 */
static uint64
get_operator_work_mem(PlanState *ps)
{
	/* No plan state at all: nothing to report. */
	if (ps == NULL)
	{
		return 0;
	}

	/* Ordinary operators carry their own memory quota. */
	if (!IsA(ps, HashJoinState))
	{
		return PlanStateOperatorMemKB(ps);
	}

	/* HashJoin: the quota lives on the inner (right) Hash node. */
	Assert(IsA(ps->righttree, HashState));
	return PlanStateOperatorMemKB(ps->righttree);
}
/* ----------------------------------------------------------------
 *		ExecMaterial
 *
 *		As long as we are at the end of the data collected in the tuplestore,
 *		we collect one new row from the subplan on each call, and stash it
 *		aside in the tuplestore before returning it.  The tuplestore is
 *		only read if we are asked to scan backwards, rescan, or mark/restore.
 *
 *		Overview of the paths through this function:
 *
 *		1. First call that needs a store (shared material, or randomAccess):
 *		   create the tuplestore and its accessor.  For a cross-slice shared
 *		   material, only the driver slice proceeds; any other slice logs and
 *		   returns NULL.  For the other cases a workfile set is looked up
 *		   (when gp_workfile_caching is on) or created.
 *		2. If the plan is cdb_strict or the material is shared, the whole
 *		   subplan is drained into the store right away, unless cached
 *		   workfiles were found.
 *		3. A shared material (any share_type other than SHARE_NOTSHARED)
 *		   always returns NULL from this function.
 *		4. Otherwise one tuple is returned per call, either read back from
 *		   the store or fetched from the subplan (and stashed in the store
 *		   when an accessor exists).
 *
 *		Returns the next tuple slot, or NULL when there is nothing (more)
 *		to return.
 * ----------------------------------------------------------------
 */
TupleTableSlot *				/* result tuple from subplan */
ExecMaterial(MaterialState *node)
{
	EState	   *estate;
	ScanDirection dir;
	bool		forward;
	NTupleStore *ts;
	NTupleStoreAccessor *tsa;
	bool		eof_tuplestore;
	TupleTableSlot *slot;
	Material *ma;

	/*
	 * get state info from node
	 */
	estate = node->ss.ps.state;
	dir = estate->es_direction;
	forward = ScanDirectionIsForward(dir);
	ts = node->ts_state->matstore;
	tsa = (NTupleStoreAccessor *) node->ts_pos;
	ma = (Material *) node->ss.ps.plan;
	Assert(IsA(ma, Material));

	/*
	 * If first time through, and we need a tuplestore, initialize it.
	 * A store is needed when the node is shared or random access was
	 * requested; otherwise ts and tsa stay NULL for the rest of the call.
	 */
	if (ts == NULL && (ma->share_type != SHARE_NOTSHARED || node->randomAccess))
	{
		/*
		 * For cross slice material, we only run ExecMaterial on DriverSlice
		 */
		if(ma->share_type == SHARE_MATERIAL_XSLICE)
		{
			char rwfile_prefix[100];

			/* Not the driver slice: nothing is created here, just bail out. */
			if(ma->driver_slice != currentSliceId)
			{
				elog(LOG, "Material Exec on CrossSlice, current slice %d", currentSliceId);
				return NULL;
			}

			/* The file name prefix is derived from the share id. */
			shareinput_create_bufname_prefix(rwfile_prefix, sizeof(rwfile_prefix), ma->share_id);
			elog(LOG, "Material node creates shareinput rwfile %s", rwfile_prefix);

			/* Operator memory is tracked in KB; the store takes bytes. */
			ts = ntuplestore_create_readerwriter(rwfile_prefix, PlanStateOperatorMemKB((PlanState *)node) * 1024, true);
			tsa = ntuplestore_create_accessor(ts, true);
		}
		else
		{
			/* Non-shared Materialize node */
			bool isWriter = true;
			workfile_set *work_set = NULL;

			if (gp_workfile_caching)
			{
				work_set = workfile_mgr_find_set( &node->ss.ps);

				if (NULL != work_set)
				{
					/* Reusing cached workfiles. Tell subplan we won't be needing any tuples */
					elog(gp_workfile_caching_loglevel, "Materialize reusing cached workfiles, initiating Squelch walker");

					isWriter = false;
					ExecSquelchNode(outerPlanState(node));
					/* Mark the subplan exhausted so it is never pulled from below. */
					node->eof_underlying = true;
					node->cached_workfiles_found = true;

					if (node->ss.ps.instrument)
					{
						node->ss.ps.instrument->workfileReused = true;
					}
				}
			}

			if (NULL == work_set)
			{
				/*
				 * No work_set found, this is because:
				 *  a. workfile caching is enabled but we didn't find any reusable set
				 *  b. workfile caching is disabled
				 * Creating new empty workset
				 */
				Assert(!node->cached_workfiles_found);

				/* Don't try to cache when running under a ShareInputScan node */
				bool can_reuse = (ma->share_type == SHARE_NOTSHARED);

				work_set = workfile_mgr_create_set(BUFFILE, can_reuse, &node->ss.ps, NULL_SNAPSHOT);
				isWriter = true;
			}

			Assert(NULL != work_set);
			/* We write exactly when we did not find cached workfiles. */
			AssertEquivalent(node->cached_workfiles_found, !isWriter);

			ts = ntuplestore_create_workset(work_set, node->cached_workfiles_found, PlanStateOperatorMemKB((PlanState *) node) * 1024);
			tsa = ntuplestore_create_accessor(ts, isWriter);
		}

		/* Publish the new store and accessor in the node state. */
		Assert(ts && tsa);
		node->ts_state->matstore = ts;
		node->ts_pos = (void *) tsa;

		/* CDB: Offer extra info for EXPLAIN ANALYZE. */
		if (node->ss.ps.instrument)
		{
			/* Let the tuplestore share our Instrumentation object. */
			ntuplestore_setinstrument(ts, node->ss.ps.instrument);

			/* Request a callback at end of query. */
			node->ss.ps.cdbexplainfun = ExecMaterialExplainEnd;
		}

		/*
		 * MPP: If requested, fetch all rows from subplan and put them
		 * in the tuplestore.  This decouples a middle slice's receiving
		 * and sending Motion operators to neutralize a deadlock hazard.
		 * MPP TODO: Remove when a better solution is implemented.
		 *
		 * ShareInput: if the material node
		 * is used to share input, we will need to fetch all rows and put
		 * them in tuple store
		 */
		while (((Material *) node->ss.ps.plan)->cdb_strict || ma->share_type != SHARE_NOTSHARED)
		{
			/*
			 * When reusing cached workfiles, we already have all the tuples,
			 * and we don't need to read anything from subplan.
			 */
			if (node->cached_workfiles_found)
			{
				break;
			}

			TupleTableSlot *outerslot = ExecProcNode(outerPlanState(node));

			if (TupIsNull(outerslot))
			{
				/* Subplan exhausted: remember it and finalize the store. */
				node->eof_underlying = true;

				if (ntuplestore_created_reusable_workfiles(ts))
				{
					ntuplestore_flush(ts);
					ntuplestore_mark_workset_complete(ts);
				}

				ntuplestore_acc_seek_bof(tsa);

				break;
			}

			Gpmon_M_Incr(GpmonPktFromMaterialState(node), GPMON_QEXEC_M_ROWSIN);

			ntuplestore_acc_put_tupleslot(tsa, outerslot);
		}

		CheckSendPlanStateGpmonPkt(&node->ss.ps);

		/* Position the accessor at the end we will start reading from. */
		if(forward)
			ntuplestore_acc_seek_bof(tsa);
		else
			ntuplestore_acc_seek_eof(tsa);

		/* for share input, material do not need to return any tuple */
		if(ma->share_type != SHARE_NOTSHARED)
		{
			Assert(ma->share_type == SHARE_MATERIAL || ma->share_type == SHARE_MATERIAL_XSLICE);
			/*
			 * if the material is shared across slice, notify consumers that
			 * it is ready.
			 */
			if(ma->share_type == SHARE_MATERIAL_XSLICE)
			{
				if (ma->driver_slice == currentSliceId)
				{
					ntuplestore_flush(ts);

					/* The returned context is kept on the node in share_lk_ctxt. */
					node->share_lk_ctxt = shareinput_writer_notifyready(ma->share_id, ma->nsharer_xslice,
							estate->es_plannedstmt->planGen);
				}
			}
			return NULL;
		}
	}

	/* A shared material never hands tuples back through this function. */
	if(ma->share_type != SHARE_NOTSHARED)
		return NULL;

	/*
	 * If we can fetch another tuple from the tuplestore, return it.
	 * Without an accessor (tsa == NULL) the store is treated as being at EOF.
	 */
	slot = node->ss.ps.ps_ResultTupleSlot;

	if(forward)
		eof_tuplestore = (tsa == NULL) || !ntuplestore_acc_advance(tsa, 1);
	else
		eof_tuplestore = (tsa == NULL) || !ntuplestore_acc_advance(tsa, -1);

	if(tsa!=NULL && ntuplestore_acc_tell(tsa, NULL))
	{
		ntuplestore_acc_current_tupleslot(tsa, slot);
		if (!TupIsNull(slot))
		{
			Gpmon_M_Incr_Rows_Out(GpmonPktFromMaterialState(node));
			CheckSendPlanStateGpmonPkt(&node->ss.ps);
		}
		return slot;
	}

	/*
	 * If necessary, try to fetch another row from the subplan.
	 *
	 * Note: the eof_underlying state variable exists to short-circuit further
	 * subplan calls.  It's not optional, unfortunately, because some plan
	 * node types are not robust about being called again when they've already
	 * returned NULL.
	 * If reusing cached workfiles, there is no need to execute subplan at all.
	 */
	if (eof_tuplestore && !node->eof_underlying)
	{
		PlanState  *outerNode;
		TupleTableSlot *outerslot;

		Assert(!node->cached_workfiles_found && "we shouldn't get here when using cached workfiles");

		/*
		 * We can only get here with forward==true, so no need to worry about
		 * which direction the subplan will go.
		 */
		outerNode = outerPlanState(node);
		outerslot = ExecProcNode(outerNode);
		if (TupIsNull(outerslot))
		{
			node->eof_underlying = true;

			/*
			 * NOTE(review): ts is NULL here when no tuplestore was needed
			 * (not shared and !randomAccess), so this call relies on
			 * ntuplestore_created_reusable_workfiles() tolerating a NULL
			 * argument -- confirm against its implementation.
			 */
			if (ntuplestore_created_reusable_workfiles(ts))
			{
				ntuplestore_flush(ts);
				ntuplestore_mark_workset_complete(ts);
			}

			if (!node->ss.ps.delayEagerFree)
			{
				ExecEagerFreeMaterial(node);
			}

			return NULL;
		}

		Gpmon_M_Incr(GpmonPktFromMaterialState(node), GPMON_QEXEC_M_ROWSIN);

		/* Only stash the row when a tuplestore accessor exists. */
		if (tsa)
			ntuplestore_acc_put_tupleslot(tsa, outerslot);

		/*
		 * And return a copy of the tuple.  (XXX couldn't we just return the
		 * outerslot?)
		 */
		Gpmon_M_Incr_Rows_Out(GpmonPktFromMaterialState(node));
		CheckSendPlanStateGpmonPkt(&node->ss.ps);
		return ExecCopySlot(slot, outerslot);
	}

	/* Both the store and the subplan are exhausted. */
	if (!node->ss.ps.delayEagerFree)
	{
		ExecEagerFreeMaterial(node);
	}

	/*
	 * Nothing left ...
	 */
	return NULL;
}
/* ----------------------------------------------------------------
 *		FunctionNext
 *
 *		Workhorse for ExecFunctionScan: hands back one row of the
 *		function's result per call.
 *
 *		The first call evaluates the function expression into a
 *		tuplestore, which is cached on the node; every call (including
 *		the first) then reads the next row from that tuplestore in the
 *		current scan direction.
 *
 *		Returns the node's scan tuple slot.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
FunctionNext(FunctionScanState *node)
{
	ScanDirection	scandir = node->ss.ps.state->es_direction;
	Tuplestorestate *tstore = node->tuplestorestate;
	TupleTableSlot *result;

	/*
	 * Materialize the function result on first use; later calls reuse the
	 * tuplestore cached on the node.
	 */
	if (tstore == NULL)
	{
		tstore = ExecMakeTableFunctionResult(node->funcexpr,
											 node->ss.ps.ps_ExprContext,
											 node->tupdesc,
											 PlanStateOperatorMemKB((PlanState *) node));
		node->tuplestorestate = tstore;

		/* CDB: when instrumented, expose extra info for EXPLAIN ANALYZE. */
		if (node->ss.ps.instrument)
		{
			/* The tuplestore reports into our Instrumentation object. */
			tuplestore_set_instrument(tstore, node->ss.ps.instrument);

			/* Ask for a callback at end of query. */
			node->ss.ps.cdbexplainfun = ExecFunctionScanExplainEnd;
		}
	}

	/*
	 * Pull the next row into the scan slot.  When a row was fetched and the
	 * node wants it, stamp the row with a synthetic ctid (used for subquery
	 * dedup).
	 */
	result = node->ss.ss_ScanTupleSlot;
	if (tuplestore_gettupleslot(tstore, ScanDirectionIsForward(scandir), result) &&
		node->cdb_want_ctid)
	{
		HeapTuple	htup = ExecFetchSlotHeapTuple(result);

		/* Bump the 48-bit row counter; carry into the block number on wrap. */
		node->cdb_fake_ctid.ip_posid++;
		if (node->cdb_fake_ctid.ip_posid == 0)
		{
			ItemPointerSetBlockNumber(&node->cdb_fake_ctid,
									  1 + ItemPointerGetBlockNumber(&node->cdb_fake_ctid));
		}

		htup->t_self = node->cdb_fake_ctid;
	}

	if (TupIsNull(result))
	{
		/* Out of rows: release resources now unless asked to hold off. */
		if (!node->ss.ps.delayEagerFree)
		{
			ExecEagerFreeFunctionScan((FunctionScanState *)(&node->ss.ps));
		}
	}
	else
	{
		Gpmon_M_Incr_Rows_Out(GpmonPktFromFuncScanState(node));
		CheckSendPlanStateGpmonPkt(&node->ss.ps);
	}

	return result;
}
/* ----------------------------------------------------------------
 *		ExecSort
 *
 *		Sorts tuples from the outer subtree of the node using tuplesort,
 *		which saves the results in a temporary file or memory. After the
 *		initial call, returns a tuple from the file with each call.
 *
 *		Conditions:
 *		  -- none.
 *
 *		Initial States:
 *		  -- the outer child is prepared to return the first tuple.
 *
 *		Outline:
 *		  1. On the first call (sort_Done false) the tuplesort state is
 *			 created (regular or "mk" flavor, depending on
 *			 gp_enable_mk_sort) and configured with limit/offset/unique.
 *		  2. With mk sort and workfile caching, a matching cached workfile
 *			 set is loaded instead of sorting; the subplan is squelched.
 *		  3. Otherwise the subplan is drained into the tuplesort and the
 *			 sort is performed.
 *		  4. A shared sort (share_type != SHARE_NOTSHARED) always returns
 *			 NULL; a cross-slice shared sort only does work on the driver
 *			 slice and notifies readers when the data is ready.
 *		  5. For an unshared sort, one tuple is returned per call.
 *
 *		The executor scan direction is forced to forward while the sort is
 *		being set up and fed.  It is restored to the caller's direction on
 *		every path that leaves that phase, including the early return on a
 *		non-driver slice and the cached-workfile path.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
ExecSort(SortState *node)
{
	EState	   *estate;
	ScanDirection dir;
	Tuplesortstate *tuplesortstate = NULL;
	Tuplesortstate_mk *tuplesortstate_mk = NULL;
	TupleTableSlot *slot = NULL;
	Sort	   *plannode = NULL;
	PlanState  *outerNode = NULL;
	TupleDesc	tupDesc = NULL;
	workfile_set *work_set = NULL;

	/*
	 * get state info from node
	 */
	SO1_printf("ExecSort: %s\n", "entering routine");

	estate = node->ss.ps.state;
	dir = estate->es_direction;

	if (gp_enable_mk_sort)
	{
		tuplesortstate_mk = node->tuplesortstate->sortstore_mk;
	}
	else
	{
		tuplesortstate = node->tuplesortstate->sortstore;
	}

	/*
	 * In Window node, we might need to call ExecSort again even when
	 * the last tuple in the Sort has been retrieved. Since we might
	 * eager free the tuplestore, the tuplestorestate could be NULL.
	 * We simply return NULL in this case.
	 */
	if (node->sort_Done &&
		((gp_enable_mk_sort && tuplesortstate_mk == NULL) ||
		 (!gp_enable_mk_sort && tuplesortstate == NULL)))
	{
		return NULL;
	}

	plannode = (Sort *) node->ss.ps.plan;

	/*
	 * If called for the first time, initialize tuplesort_state
	 */
	if (!node->sort_Done)
	{
		SO1_printf("ExecSort: %s\n", "sorting subplan");

		if (gp_workfile_caching)
		{
			/* Look for cached workfile set. Mark here if found */
			work_set = workfile_mgr_find_set(&node->ss.ps);
			if (work_set != NULL)
			{
				elog(gp_workfile_caching_loglevel, "Sort found matching cached workfile set");
				node->cached_workfiles_found = true;
			}
		}

		/*
		 * Want to scan subplan in the forward direction while creating the
		 * sorted data.  Every exit from the setup/sort phase must put the
		 * caller's direction ('dir') back.
		 */
		estate->es_direction = ForwardScanDirection;

		/*
		 * Initialize tuplesort module.
		 */
		SO1_printf("ExecSort: %s\n", "calling tuplesort_begin");

		outerNode = outerPlanState(node);
		tupDesc = ExecGetResultType(outerNode);

		if (plannode->share_type == SHARE_SORT_XSLICE)
		{
			char		rwfile_prefix[100];

			/* Cross-slice shared sort only runs on the driver slice. */
			if (plannode->driver_slice != currentSliceId)
			{
				elog(LOG, "Sort exec on CrossSlice, current slice %d", currentSliceId);

				/* Don't leave the forced forward direction behind. */
				estate->es_direction = dir;
				return NULL;
			}

			/* The file name prefix is derived from the share id. */
			shareinput_create_bufname_prefix(rwfile_prefix, sizeof(rwfile_prefix), plannode->share_id);
			elog(LOG, "Sort node create shareinput rwfile %s", rwfile_prefix);

			if (gp_enable_mk_sort)
			{
				tuplesortstate_mk = tuplesort_begin_heap_file_readerwriter_mk(
					& node->ss,
					rwfile_prefix, true,
					tupDesc,
					plannode->numCols,
					plannode->sortOperators,
					plannode->sortColIdx,
					PlanStateOperatorMemKB((PlanState *) node),
					true
					);
			}
			else
			{
				tuplesortstate = tuplesort_begin_heap_file_readerwriter(
					rwfile_prefix, true,
					tupDesc,
					plannode->numCols,
					plannode->sortOperators,
					plannode->sortColIdx,
					PlanStateOperatorMemKB((PlanState *) node),
					true
					);
			}
		}
		else
		{
			if (gp_enable_mk_sort)
			{
				tuplesortstate_mk = tuplesort_begin_heap_mk(& node->ss,
															tupDesc,
															plannode->numCols,
															plannode->sortOperators,
															plannode->sortColIdx,
															PlanStateOperatorMemKB((PlanState *) node),
															node->randomAccess);
			}
			else
			{
				tuplesortstate = tuplesort_begin_heap(tupDesc,
													  plannode->numCols,
													  plannode->sortOperators,
													  plannode->sortColIdx,
													  PlanStateOperatorMemKB((PlanState *) node),
													  node->randomAccess);
			}
		}

		/* Remember the freshly created sort state on the node. */
		if (gp_enable_mk_sort)
		{
			node->tuplesortstate->sortstore_mk = tuplesortstate_mk;
		}
		else
		{
			node->tuplesortstate->sortstore = tuplesortstate;
		}

		/* CDB: pass limit/offset/unique settings and GUCs down to the sort. */
		{
			ExprContext *econtext = node->ss.ps.ps_ExprContext;
			bool		isNull;
			int64		limit = 0;
			int64		offset = 0;
			int			unique = 0;
			int			sort_flags = gp_sort_flags;				/* get the guc */
			int			maxdistinct = gp_sort_max_distinct;		/* get the guc */

			if (node->limitCount)
			{
				limit = DatumGetInt64(
						ExecEvalExprSwitchContext(node->limitCount,
												  econtext,
												  &isNull,
												  NULL));
				/* Interpret NULL or negative limit as no limit */
				if (isNull || limit < 0)
					limit = 0;
			}

			if (node->limitOffset)
			{
				offset = DatumGetInt64(
						ExecEvalExprSwitchContext(node->limitOffset,
												  econtext,
												  &isNull,
												  NULL));
				/* Interpret NULL or negative offset as no offset */
				if (isNull || offset < 0)
					offset = 0;
			}

			if (node->noduplicates)
				unique = 1;

			if (gp_enable_mk_sort)
				cdb_tuplesort_init_mk(tuplesortstate_mk, offset, limit, unique, sort_flags, maxdistinct);
			else
				cdb_tuplesort_init(tuplesortstate, offset, limit, unique, sort_flags, maxdistinct);
		}

		/* If EXPLAIN ANALYZE, share our Instrumentation object with sort. */
		if (gp_enable_mk_sort)
		{
			if (node->ss.ps.instrument)
				tuplesort_set_instrument_mk(tuplesortstate_mk,
											node->ss.ps.instrument,
											node->ss.ps.cdbexplainbuf);

			tuplesort_set_gpmon_mk(tuplesortstate_mk, &node->ss.ps.gpmon_pkt,
								   &node->ss.ps.gpmon_plan_tick);
		}
		else
		{
			if (node->ss.ps.instrument)
				tuplesort_set_instrument(tuplesortstate,
										 node->ss.ps.instrument,
										 node->ss.ps.cdbexplainbuf);

			tuplesort_set_gpmon(tuplesortstate, &node->ss.ps.gpmon_pkt,
								&node->ss.ps.gpmon_plan_tick);
		}
	}

	/*
	 * Before reading any tuples from below, check if we can re-use
	 * existing spill files.
	 * Only mk_sort supports spill file caching.
	 */
	if (!node->sort_Done && gp_enable_mk_sort && gp_workfile_caching)
	{
		Assert(tuplesortstate_mk != NULL);

		if (node->cached_workfiles_found && !node->cached_workfiles_loaded)
		{
			Assert(work_set != NULL);
			elog(gp_workfile_caching_loglevel, "nodeSort: loading cached workfile metadata");
			tuplesort_set_spillfile_set_mk(tuplesortstate_mk, work_set);
			tuplesort_read_spill_metadata_mk(tuplesortstate_mk);
			node->cached_workfiles_loaded = true;

			if (node->ss.ps.instrument)
			{
				node->ss.ps.instrument->workfileReused = true;
			}

			/*
			 * Loaded sorted data from cached workfile, therefore
			 * no need to sort anymore!
			 */
			node->sort_Done = true;

			elog(gp_workfile_caching_loglevel, "Sort reusing cached workfiles, initiating Squelch walker");
			ExecSquelchNode(outerNode);

			/*
			 * The sorting branch below is skipped on this path, and that is
			 * the only other place the direction gets restored -- so do it
			 * here, now that we are done with the subplan.
			 */
			estate->es_direction = dir;
		}
	}

	/*
	 * If first time through and no cached workfiles can be used,
	 * read all tuples from outer plan and pass them to
	 * tuplesort.c. Subsequent calls just fetch tuples from tuplesort.
	 */
	if (!node->sort_Done)
	{
		Assert(outerNode != NULL);

		/*
		 * Scan the subplan and feed all the tuples to tuplesort.
		 */
		for (;;)
		{
			slot = ExecProcNode(outerNode);

			if (TupIsNull(slot))
			{
				break;
			}

			CheckSendPlanStateGpmonPkt(&node->ss.ps);

			if (gp_enable_mk_sort)
				tuplesort_puttupleslot_mk(tuplesortstate_mk, slot);
			else
				tuplesort_puttupleslot(tuplesortstate, slot);
		}

#ifdef FAULT_INJECTOR
		FaultInjector_InjectFaultIfSet(
				ExecSortBeforeSorting,
				DDLNotSpecified,
				"" /* databaseName */,
				"" /* tableName */
				);
#endif

		/*
		 * Complete the sort.
		 */
		if (gp_enable_mk_sort)
		{
			tuplesort_performsort_mk(tuplesortstate_mk);
		}
		else
		{
			tuplesort_performsort(tuplesortstate);
		}

		CheckSendPlanStateGpmonPkt(&node->ss.ps);

		/*
		 * restore to user specified direction
		 */
		estate->es_direction = dir;

		/*
		 * finally set the sorted flag to true
		 */
		node->sort_Done = true;
		SO1_printf("ExecSort: %s\n", "sorting done");

		/* for share input, do not need to return any tuple */
		if (plannode->share_type != SHARE_NOTSHARED)
		{
			Assert(plannode->share_type == SHARE_SORT ||
				   plannode->share_type == SHARE_SORT_XSLICE);

			/* Cross-slice: flush the data and tell the readers it is ready. */
			if (plannode->share_type == SHARE_SORT_XSLICE)
			{
				if (plannode->driver_slice == currentSliceId)
				{
					if (gp_enable_mk_sort)
						tuplesort_flush_mk(tuplesortstate_mk);
					else
						tuplesort_flush(tuplesortstate);

					node->share_lk_ctxt = shareinput_writer_notifyready(plannode->share_id, plannode->nsharer_xslice,
							estate->es_plannedstmt->planGen);
				}
			}

			return NULL;
		}

	} /* if (!node->sort_Done) */

	/* A shared sort never hands tuples back through this function. */
	if (plannode->share_type != SHARE_NOTSHARED)
		return NULL;

	SO1_printf("ExecSort: %s\n", "retrieving tuple from tuplesort");

	/*
	 * Get the first or next tuple from tuplesort. Returns NULL if no more
	 * tuples.  The direction used is the one captured on entry ('dir').
	 */
	slot = node->ss.ps.ps_ResultTupleSlot;
	if (gp_enable_mk_sort)
		(void) tuplesort_gettupleslot_mk(tuplesortstate_mk, ScanDirectionIsForward(dir), slot);
	else
		(void) tuplesort_gettupleslot(tuplesortstate, ScanDirectionIsForward(dir), slot);

	/* Out of tuples: release resources now unless asked to hold off. */
	if (TupIsNull(slot) && !node->ss.ps.delayEagerFree)
	{
		ExecEagerFreeSort(node);
	}

	return slot;
}
/*
 * init_tuplestore_state
 *		Set up node->ts_state and node->ts_pos for a ShareInputScan whose
 *		tuplestore state has not been initialized yet.
 *
 * If the shared node's state is available in this EState, it is executed
 * once first.  What happens next depends on the share type:
 *
 *	SHARE_MATERIAL_XSLICE / SHARE_SORT_XSLICE
 *		A new GenericTupStore is allocated and a reader is opened on the
 *		store named after the share id.
 *	SHARE_MATERIAL / SHARE_SORT
 *		The store owned by the shared Material / Sort node is borrowed.
 *
 * In every case a position (accessor) is created on the store and rewound
 * to the beginning.
 */
static void
init_tuplestore_state(ShareInputScanState *node)
{
	EState		   *estate;
	ShareInputScan *sisc;
	ShareNodeEntry *entry;
	PlanState	   *sharedState = NULL;
	ShareType		kind;

	Assert(node->ts_state == NULL);

	estate = node->ss.ps.state;
	sisc = (ShareInputScan *) node->ss.ps.plan;
	entry = ExecGetShareNodeEntry(estate, sisc->share_id, false);
	kind = sisc->share_type;

	/* Run the shared node, when its state is available in this EState. */
	if (entry)
	{
		sharedState = (PlanState *) entry->shareState;
		if (sharedState)
		{
			ExecProcNode(sharedState);
		}
		else
		{
			/* Only cross-slice sharing may lack a local shared state. */
			Assert(kind == SHARE_MATERIAL_XSLICE || kind == SHARE_SORT_XSLICE);
		}
	}

	if (kind == SHARE_MATERIAL_XSLICE || kind == SHARE_SORT_XSLICE)
	{
		/*
		 * Cross-slice: open our own reader on the store identified by the
		 * share id.
		 */
		char		prefix[100];

		shareinput_create_bufname_prefix(prefix, sizeof(prefix), sisc->share_id);
		node->ts_state = palloc0(sizeof(GenericTupStore));

		if (kind == SHARE_MATERIAL_XSLICE)
		{
			NTupleStoreAccessor *acc;

			node->ts_state->matstore = ntuplestore_create_readerwriter(prefix, 0, false);
			acc = ntuplestore_create_accessor(node->ts_state->matstore, false);
			node->ts_pos = (void *) acc;
			ntuplestore_acc_seek_bof(acc);
		}
		else if (gp_enable_mk_sort)
		{
			node->ts_state->sortstore_mk = tuplesort_begin_heap_file_readerwriter_mk(
				& node->ss,
				prefix, false,
				NULL, 0, NULL, NULL,
				PlanStateOperatorMemKB((PlanState *) node), true);

			tuplesort_begin_pos_mk(node->ts_state->sortstore_mk, (TuplesortPos_mk **)(&node->ts_pos));
			tuplesort_rescan_pos_mk(node->ts_state->sortstore_mk, (TuplesortPos_mk *)node->ts_pos);
		}
		else
		{
			node->ts_state->sortstore = tuplesort_begin_heap_file_readerwriter(
				prefix, false,
				NULL, 0, NULL, NULL,
				PlanStateOperatorMemKB((PlanState *) node), true);

			tuplesort_begin_pos(node->ts_state->sortstore, (TuplesortPos **)(&node->ts_pos));
			tuplesort_rescan_pos(node->ts_state->sortstore, (TuplesortPos *)node->ts_pos);
		}
	}
	else if (kind == SHARE_MATERIAL)
	{
		NTupleStoreAccessor *acc;

		/*
		 * The materialstate->ts_state structure should have been initialized
		 * already, during init of material node.
		 */
		node->ts_state = ((MaterialState *)sharedState)->ts_state;
		Assert(NULL != node->ts_state->matstore);

		acc = ntuplestore_create_accessor(node->ts_state->matstore, false);
		node->ts_pos = (void *) acc;
		ntuplestore_acc_seek_bof(acc);
	}
	else
	{
		/* Same-slice shared sort: borrow the Sort node's store. */
		Assert(sisc->share_type == SHARE_SORT);
		Assert(sharedState != NULL);

		node->ts_state = ((SortState *)sharedState)->tuplesortstate;

		if (gp_enable_mk_sort)
		{
			Assert(NULL != node->ts_state->sortstore_mk);
			tuplesort_begin_pos_mk(node->ts_state->sortstore_mk, (TuplesortPos_mk **)(&node->ts_pos));
			tuplesort_rescan_pos_mk(node->ts_state->sortstore_mk, (TuplesortPos_mk *)node->ts_pos);
		}
		else
		{
			Assert(NULL != node->ts_state->sortstore);
			tuplesort_begin_pos(node->ts_state->sortstore, (TuplesortPos **)(&node->ts_pos));
			tuplesort_rescan_pos(node->ts_state->sortstore, (TuplesortPos *)node->ts_pos);
		}
	}

	/* Whatever the path, we must now hold some kind of store. */
	Assert(NULL != node->ts_state);
	Assert(NULL != node->ts_state->matstore ||
		   NULL != node->ts_state->sortstore ||
		   NULL != node->ts_state->sortstore_mk);
}