//--------------------------------------------------------------------------- // @function: // CLogicalConstTableGet::PstatsDerive // // @doc: // Derive statistics // //--------------------------------------------------------------------------- IStatistics * CLogicalConstTableGet::PstatsDerive ( IMemoryPool *pmp, CExpressionHandle &exprhdl, DrgPstat * // not used ) const { GPOS_ASSERT(Esp(exprhdl) > EspNone); CReqdPropRelational *prprel = CReqdPropRelational::Prprel(exprhdl.Prp()); CColRefSet *pcrs = prprel->PcrsStat(); DrgPul *pdrgpulColIds = GPOS_NEW(pmp) DrgPul(pmp); pcrs->ExtractColIds(pmp, pdrgpulColIds); DrgPul *pdrgpulColWidth = CUtils::Pdrgpul(pmp, m_pdrgpcrOutput); IStatistics *pstats = CStatistics::PstatsDummy ( pmp, pdrgpulColIds, pdrgpulColWidth, m_pdrgpdrgpdatum->UlLength() ); // clean up pdrgpulColIds->Release(); pdrgpulColWidth->Release(); return pstats; }
//--------------------------------------------------------------------------- // @function: // CLogical::PstatsDeriveDummy // // @doc: // Derive dummy statistics // //--------------------------------------------------------------------------- IStatistics * CLogical::PstatsDeriveDummy ( IMemoryPool *pmp, CExpressionHandle &exprhdl, CDouble dRows ) const { GPOS_CHECK_ABORT; // return a dummy stats object that has a histogram for every // required-stats column GPOS_ASSERT(Esp(exprhdl) > EspNone); CReqdPropRelational *prprel = CReqdPropRelational::Prprel(exprhdl.Prp()); CColRefSet *pcrs = prprel->PcrsStat(); DrgPul *pdrgpulColIds = GPOS_NEW(pmp) DrgPul(pmp); pcrs->ExtractColIds(pmp, pdrgpulColIds); IStatistics *pstats = CStatistics::PstatsDummy(pmp, pdrgpulColIds, dRows); // clean up pdrgpulColIds->Release(); return pstats; }
//--------------------------------------------------------------------------- // @function: // CCostContext::DRowsPerHost // // @doc: // Return the number of rows per host // //--------------------------------------------------------------------------- CDouble CCostContext::DRowsPerHost() const { DOUBLE dRows = Pstats()->DRows().DVal(); COptCtxt *poptctxt = COptCtxt::PoctxtFromTLS(); const ULONG ulHosts = poptctxt->Pcm()->UlHosts(); CDistributionSpec *pds = Pdpplan()->Pds(); if (CDistributionSpec::EdtHashed == pds->Edt()) { CDistributionSpecHashed *pdshashed = CDistributionSpecHashed::PdsConvert(pds); DrgPexpr *pdrgpexpr = pdshashed->Pdrgpexpr(); CColRefSet *pcrsUsed = CUtils::PcrsExtractColumns(m_pmp, pdrgpexpr); const CColRefSet *pcrsReqdStats = this->Poc()->Prprel()->PcrsStat(); if (!pcrsReqdStats->FSubset(pcrsUsed)) { // statistics not available for distribution columns, therefore // assume uniform distribution across hosts // clean up pcrsUsed->Release(); return CDouble(dRows / ulHosts); } DrgPul *pdrgpul = GPOS_NEW(m_pmp) DrgPul(m_pmp); pcrsUsed->ExtractColIds(m_pmp, pdrgpul); pcrsUsed->Release(); CStatisticsConfig *pstatsconf = poptctxt->Poconf()->Pstatsconf(); CDouble dNDVs = CStatisticsUtils::DGroups(m_pmp, Pstats(), pstatsconf, pdrgpul, NULL /*pbsKeys*/); pdrgpul->Release(); if (dNDVs < ulHosts) { // estimated number of distinct values of distribution columns is smaller than number of hosts. // We assume data is distributed across a subset of hosts in this case. This results in a larger // number of rows per host compared to the uniform case, allowing us to capture data skew in // cost computation return CDouble(dRows / dNDVs.DVal()); } } return CDouble(dRows / ulHosts); }