//--------------------------------------------------------------------------- // @function: // CCostContext::DRowsPerHost // // @doc: // Return the number of rows per host // //--------------------------------------------------------------------------- CDouble CCostContext::DRowsPerHost() const { DOUBLE dRows = Pstats()->DRows().DVal(); COptCtxt *poptctxt = COptCtxt::PoctxtFromTLS(); const ULONG ulHosts = poptctxt->Pcm()->UlHosts(); CDistributionSpec *pds = Pdpplan()->Pds(); if (CDistributionSpec::EdtHashed == pds->Edt()) { CDistributionSpecHashed *pdshashed = CDistributionSpecHashed::PdsConvert(pds); DrgPexpr *pdrgpexpr = pdshashed->Pdrgpexpr(); CColRefSet *pcrsUsed = CUtils::PcrsExtractColumns(m_pmp, pdrgpexpr); const CColRefSet *pcrsReqdStats = this->Poc()->Prprel()->PcrsStat(); if (!pcrsReqdStats->FSubset(pcrsUsed)) { // statistics not available for distribution columns, therefore // assume uniform distribution across hosts // clean up pcrsUsed->Release(); return CDouble(dRows / ulHosts); } DrgPul *pdrgpul = GPOS_NEW(m_pmp) DrgPul(m_pmp); pcrsUsed->ExtractColIds(m_pmp, pdrgpul); pcrsUsed->Release(); CStatisticsConfig *pstatsconf = poptctxt->Poconf()->Pstatsconf(); CDouble dNDVs = CStatisticsUtils::DGroups(m_pmp, Pstats(), pstatsconf, pdrgpul, NULL /*pbsKeys*/); pdrgpul->Release(); if (dNDVs < ulHosts) { // estimated number of distinct values of distribution columns is smaller than number of hosts. // We assume data is distributed across a subset of hosts in this case. This results in a larger // number of rows per host compared to the uniform case, allowing us to capture data skew in // cost computation return CDouble(dRows / dNDVs.DVal()); } } return CDouble(dRows / ulHosts); }
//--------------------------------------------------------------------------- // @function: // CQueryContext::PqcGenerate // // @doc: // Generate the query context for the given expression and array of // output column ref ids // //--------------------------------------------------------------------------- CQueryContext * CQueryContext::PqcGenerate ( IMemoryPool *pmp, CExpression * pexpr, DrgPul *pdrgpulQueryOutputColRefId, DrgPmdname *pdrgpmdname, BOOL fDeriveStats ) { GPOS_ASSERT(NULL != pexpr && NULL != pdrgpulQueryOutputColRefId); CColRefSet *pcrs = GPOS_NEW(pmp) CColRefSet(pmp); DrgPcr *pdrgpcr = GPOS_NEW(pmp) DrgPcr(pmp); COptCtxt *poptctxt = COptCtxt::PoctxtFromTLS(); CColumnFactory *pcf = poptctxt->Pcf(); GPOS_ASSERT(NULL != pcf); const ULONG ulLen = pdrgpulQueryOutputColRefId->UlLength(); for (ULONG ul = 0; ul < ulLen; ul++) { ULONG *pul = (*pdrgpulQueryOutputColRefId)[ul]; GPOS_ASSERT(NULL != pul); CColRef *pcr = pcf->PcrLookup(*pul); GPOS_ASSERT(NULL != pcr); pcrs->Include(pcr); pdrgpcr->Append(pcr); } COrderSpec *pos = NULL; CExpression *pexprResult = pexpr; COperator *popTop = PopTop(pexpr); if (COperator::EopLogicalLimit == popTop->Eopid()) { // top level operator is a limit, copy order spec to query context pos = CLogicalLimit::PopConvert(popTop)->Pos(); pos->AddRef(); } else { // no order required pos = GPOS_NEW(pmp) COrderSpec(pmp); } CDistributionSpec *pds = NULL; BOOL fDML = CUtils::FLogicalDML(pexpr->Pop()); poptctxt->MarkDMLQuery(fDML); if (fDML) { pds = GPOS_NEW(pmp) CDistributionSpecAny(COperator::EopSentinel); } else { pds = GPOS_NEW(pmp) CDistributionSpecSingleton(CDistributionSpecSingleton::EstMaster); } CRewindabilitySpec *prs = GPOS_NEW(pmp) CRewindabilitySpec(CRewindabilitySpec::ErtNone /*ert*/); CEnfdOrder *peo = GPOS_NEW(pmp) CEnfdOrder(pos, CEnfdOrder::EomSatisfy); // we require satisfy matching on distribution since final query results must be sent to master CEnfdDistribution *ped = GPOS_NEW(pmp) CEnfdDistribution(pds, CEnfdDistribution::EdmSatisfy); CEnfdRewindability *per = GPOS_NEW(pmp) CEnfdRewindability(prs, CEnfdRewindability::ErmSatisfy); CCTEReq *pcter = poptctxt->Pcteinfo()->PcterProducers(pmp); CReqdPropPlan *prpp = GPOS_NEW(pmp) CReqdPropPlan(pcrs, peo, ped, per, pcter); pdrgpmdname->AddRef(); return GPOS_NEW(pmp) CQueryContext(pmp, pexprResult, prpp, pdrgpcr, pdrgpmdname, fDeriveStats); }
//--------------------------------------------------------------------------- // @function: // CQueryContext::PqcGenerate // // @doc: // Generate the query context for the given expression and array of // output column ref ids // //--------------------------------------------------------------------------- CQueryContext * CQueryContext::PqcGenerate ( IMemoryPool *mp, CExpression * pexpr, ULongPtrArray *pdrgpulQueryOutputColRefId, CMDNameArray *pdrgpmdname, BOOL fDeriveStats ) { GPOS_ASSERT(NULL != pexpr && NULL != pdrgpulQueryOutputColRefId); CColRefSet *pcrs = GPOS_NEW(mp) CColRefSet(mp); CColRefArray *colref_array = GPOS_NEW(mp) CColRefArray(mp); COptCtxt *poptctxt = COptCtxt::PoctxtFromTLS(); CColumnFactory *col_factory = poptctxt->Pcf(); GPOS_ASSERT(NULL != col_factory); // Collect required column references (colref_array) const ULONG length = pdrgpulQueryOutputColRefId->Size(); for (ULONG ul = 0; ul < length; ul++) { ULONG *pul = (*pdrgpulQueryOutputColRefId)[ul]; GPOS_ASSERT(NULL != pul); CColRef *colref = col_factory->LookupColRef(*pul); GPOS_ASSERT(NULL != colref); pcrs->Include(colref); colref_array->Append(colref); } // Collect required properties (prpp) at the top level: // By default no sort order requirement is added, unless the root operator in // the input logical expression is a LIMIT. This is because Orca always // attaches top level Sort to a LIMIT node. COrderSpec *pos = NULL; CExpression *pexprResult = pexpr; COperator *popTop = PopTop(pexpr); if (COperator::EopLogicalLimit == popTop->Eopid()) { // top level operator is a limit, copy order spec to query context pos = CLogicalLimit::PopConvert(popTop)->Pos(); pos->AddRef(); } else { // no order required pos = GPOS_NEW(mp) COrderSpec(mp); } CDistributionSpec *pds = NULL; BOOL fDML = CUtils::FLogicalDML(pexpr->Pop()); poptctxt->MarkDMLQuery(fDML); // DML commands do not have distribution requirement. Otherwise the // distribution requirement is Singleton. if (fDML) { pds = GPOS_NEW(mp) CDistributionSpecAny(COperator::EopSentinel); } else { pds = GPOS_NEW(mp) CDistributionSpecSingleton(CDistributionSpecSingleton::EstMaster); } // By default, no rewindability requirement needs to be satisfied at the top level CRewindabilitySpec *prs = GPOS_NEW(mp) CRewindabilitySpec(CRewindabilitySpec::ErtNotRewindable, CRewindabilitySpec::EmhtNoMotion); // Ensure order, distribution and rewindability meet 'satisfy' matching at the top level CEnfdOrder *peo = GPOS_NEW(mp) CEnfdOrder(pos, CEnfdOrder::EomSatisfy); CEnfdDistribution *ped = GPOS_NEW(mp) CEnfdDistribution(pds, CEnfdDistribution::EdmSatisfy); CEnfdRewindability *per = GPOS_NEW(mp) CEnfdRewindability(prs, CEnfdRewindability::ErmSatisfy); // Required CTEs are obtained from the CTEInfo global information in the optimizer context CCTEReq *pcter = poptctxt->Pcteinfo()->PcterProducers(mp); // NB: Partition propagation requirements are not initialized here. They are // constructed later based on derived relation properties (CPartInfo) by // CReqdPropPlan::InitReqdPartitionPropagation(). CReqdPropPlan *prpp = GPOS_NEW(mp) CReqdPropPlan(pcrs, peo, ped, per, pcter); // Finally, create the CQueryContext pdrgpmdname->AddRef(); return GPOS_NEW(mp) CQueryContext(mp, pexprResult, prpp, colref_array, pdrgpmdname, fDeriveStats); }