//--------------------------------------------------------------------------- // @function: // CPhysicalInnerHashJoin::PdsDeriveFromHashedChildren // // @doc: // Derive hash join distribution from hashed children; // return NULL if derivation failed // //--------------------------------------------------------------------------- CDistributionSpec * CPhysicalInnerHashJoin::PdsDeriveFromHashedChildren ( IMemoryPool *pmp, CDistributionSpec *pdsOuter, CDistributionSpec *pdsInner ) const { GPOS_ASSERT(NULL != pdsOuter); GPOS_ASSERT(NULL != pdsInner); CDistributionSpecHashed *pdshashedOuter = CDistributionSpecHashed::PdsConvert(pdsOuter); CDistributionSpecHashed *pdshashedInner = CDistributionSpecHashed::PdsConvert(pdsInner); if (CUtils::FContains(PdrgpexprOuterKeys(), pdshashedOuter->Pdrgpexpr()) && CUtils::FContains(PdrgpexprInnerKeys(), pdshashedInner->Pdrgpexpr())) { // if both sides are hashed on subsets of hash join keys, join's output can be // seen as distributed on outer spec or (equivalently) on inner spec, // in this case, we create a new spec based on outer side and mark inner // side as an equivalent one, return PdshashedCreateMatching(pmp, pdshashedOuter, 0 /*ulSourceChild*/); } return NULL; }
//--------------------------------------------------------------------------- // @function: // CPhysical::GetSkew // // @doc: // Helper to compute skew estimate based on given stats and // distribution spec // //--------------------------------------------------------------------------- CDouble CPhysical::GetSkew ( IStatistics *stats, CDistributionSpec *pds ) { CDouble dSkew = 1.0; if (CDistributionSpec::EdtHashed == pds->Edt()) { CDistributionSpecHashed *pdshashed = CDistributionSpecHashed::PdsConvert(pds); const CExpressionArray *pdrgpexpr = pdshashed->Pdrgpexpr(); const ULONG size = pdrgpexpr->Size(); for (ULONG ul = 0; ul < size; ul++) { CExpression *pexpr = (*pdrgpexpr)[ul]; if (COperator::EopScalarIdent == pexpr->Pop()->Eopid()) { // consider only hashed distribution direct columns for now CScalarIdent *popScId = CScalarIdent::PopConvert(pexpr->Pop()); ULONG colid = popScId->Pcr()->Id(); CDouble dSkewCol = stats->GetSkew(colid); if (dSkewCol > dSkew) { dSkew = dSkewCol; } } } } return CDouble(dSkew); }
//--------------------------------------------------------------------------- // @function: // CPhysicalInnerHashJoin::PdsDeriveFromHashedOuter // // @doc: // Derive hash join distribution from a hashed outer child; // return NULL if derivation failed // //--------------------------------------------------------------------------- CDistributionSpec * CPhysicalInnerHashJoin::PdsDeriveFromHashedOuter ( IMemoryPool *pmp, CDistributionSpec *pdsOuter, CDistributionSpec * #ifdef GPOS_DEBUG pdsInner #endif // GPOS_DEBUG ) const { GPOS_ASSERT(NULL != pdsOuter); GPOS_ASSERT(NULL != pdsInner); GPOS_ASSERT(CDistributionSpec::EdtHashed == pdsOuter->Edt()); CDistributionSpecHashed *pdshashedOuter = CDistributionSpecHashed::PdsConvert(pdsOuter); if (CUtils::FContains(PdrgpexprOuterKeys(), pdshashedOuter->Pdrgpexpr())) { // outer child is hashed on a subset of outer hashkeys, // return a hashed distribution equivalent to a matching outer distribution return PdshashedCreateMatching(pmp, pdshashedOuter, 0 /*ulSourceChild*/); } return NULL; }
//--------------------------------------------------------------------------- // @function: // CPhysicalInnerHashJoin::PdsDeriveFromReplicatedOuter // // @doc: // Derive hash join distribution from a replicated outer child; // //--------------------------------------------------------------------------- CDistributionSpec * CPhysicalInnerHashJoin::PdsDeriveFromReplicatedOuter ( IMemoryPool *pmp, CDistributionSpec * #ifdef GPOS_DEBUG pdsOuter #endif // GPOS_DEBUG , CDistributionSpec *pdsInner ) const { GPOS_ASSERT(NULL != pdsOuter); GPOS_ASSERT(NULL != pdsInner); GPOS_ASSERT(CDistributionSpec::EdtReplicated == pdsOuter->Edt()); // if outer child is replicated, join results distribution is defined by inner child if (CDistributionSpec::EdtHashed == pdsInner->Edt()) { CDistributionSpecHashed *pdshashedInner = CDistributionSpecHashed::PdsConvert(pdsInner); if (CUtils::FContains(PdrgpexprInnerKeys(), pdshashedInner->Pdrgpexpr())) { // inner child is hashed on a subset of inner hashkeys, // return a hashed distribution equivalent to a matching outer distribution return PdshashedCreateMatching(pmp, pdshashedInner, 1 /*ulSourceChild*/); } } // otherwise, pass-through inner distribution pdsInner->AddRef(); return pdsInner; }
void CDistributionSpecHashedNoOp::AppendEnforcers ( IMemoryPool *pmp, CExpressionHandle &exprhdl, CReqdPropPlan *, DrgPexpr *pdrgpexpr, CExpression *pexpr ) { CDrvdProp *pdp = exprhdl.Pdp(); CDistributionSpec *pdsChild = CDrvdPropPlan::Pdpplan(pdp)->Pds(); CDistributionSpecHashed *pdsChildHashed = dynamic_cast<CDistributionSpecHashed *>(pdsChild); if (NULL == pdsChildHashed) { return; } DrgPexpr *pdrgpexprNoOpRedistributionColumns = pdsChildHashed->Pdrgpexpr(); pdrgpexprNoOpRedistributionColumns->AddRef(); CDistributionSpecHashedNoOp* pdsNoOp = GPOS_NEW(pmp) CDistributionSpecHashedNoOp(pdrgpexprNoOpRedistributionColumns); pexpr->AddRef(); CExpression *pexprMotion = GPOS_NEW(pmp) CExpression ( pmp, GPOS_NEW(pmp) CPhysicalMotionHashDistribute(pmp, pdsNoOp), pexpr ); pdrgpexpr->Append(pexprMotion); }
//--------------------------------------------------------------------------- // @function: // CCostContext::DRowsPerHost // // @doc: // Return the number of rows per host // //--------------------------------------------------------------------------- CDouble CCostContext::DRowsPerHost() const { DOUBLE dRows = Pstats()->DRows().DVal(); COptCtxt *poptctxt = COptCtxt::PoctxtFromTLS(); const ULONG ulHosts = poptctxt->Pcm()->UlHosts(); CDistributionSpec *pds = Pdpplan()->Pds(); if (CDistributionSpec::EdtHashed == pds->Edt()) { CDistributionSpecHashed *pdshashed = CDistributionSpecHashed::PdsConvert(pds); DrgPexpr *pdrgpexpr = pdshashed->Pdrgpexpr(); CColRefSet *pcrsUsed = CUtils::PcrsExtractColumns(m_pmp, pdrgpexpr); const CColRefSet *pcrsReqdStats = this->Poc()->Prprel()->PcrsStat(); if (!pcrsReqdStats->FSubset(pcrsUsed)) { // statistics not available for distribution columns, therefore // assume uniform distribution across hosts // clean up pcrsUsed->Release(); return CDouble(dRows / ulHosts); } DrgPul *pdrgpul = GPOS_NEW(m_pmp) DrgPul(m_pmp); pcrsUsed->ExtractColIds(m_pmp, pdrgpul); pcrsUsed->Release(); CStatisticsConfig *pstatsconf = poptctxt->Poconf()->Pstatsconf(); CDouble dNDVs = CStatisticsUtils::DGroups(m_pmp, Pstats(), pstatsconf, pdrgpul, NULL /*pbsKeys*/); pdrgpul->Release(); if (dNDVs < ulHosts) { // estimated number of distinct values of distribution columns is smaller than number of hosts. // We assume data is distributed across a subset of hosts in this case. This results in a larger // number of rows per host compared to the uniform case, allowing us to capture data skew in // cost computation return CDouble(dRows / dNDVs.DVal()); } } return CDouble(dRows / ulHosts); }
//--------------------------------------------------------------------------- // @function: // CPhysicalSequenceProject::CreateOrderSpec // // @doc: // Create local order spec that we request relational child to satisfy // //--------------------------------------------------------------------------- void CPhysicalSequenceProject::CreateOrderSpec ( IMemoryPool *pmp ) { GPOS_ASSERT(NULL == m_pos); GPOS_ASSERT(NULL != m_pds); GPOS_ASSERT(NULL != m_pdrgpos); m_pos = GPOS_NEW(pmp) COrderSpec(pmp); // add partition by keys to order spec if (CDistributionSpec::EdtHashed == m_pds->Edt()) { CDistributionSpecHashed *pdshashed = CDistributionSpecHashed::PdsConvert(m_pds); const DrgPexpr *pdrgpexpr = pdshashed->Pdrgpexpr(); const ULONG ulSize = pdrgpexpr->UlLength(); for (ULONG ul = 0; ul < ulSize; ul++) { CExpression *pexpr = (*pdrgpexpr)[ul]; // we assume partition-by keys are always scalar idents CScalarIdent *popScId = CScalarIdent::PopConvert(pexpr->Pop()); const CColRef *pcr = popScId->Pcr(); gpmd::IMDId *pmdid = pcr->Pmdtype()->PmdidCmp(IMDType::EcmptL); pmdid->AddRef(); m_pos->Append(pmdid, pcr, COrderSpec::EntLast); } } if (0 == m_pdrgpos->UlLength()) { return; } COrderSpec *posFirst = (*m_pdrgpos)[0]; #ifdef GPOS_DEBUG const ULONG ulLength = m_pdrgpos->UlLength(); for (ULONG ul = 1; ul < ulLength; ul++) { COrderSpec *posCurrent = (*m_pdrgpos)[ul]; GPOS_ASSERT(posFirst->FSatisfies(posCurrent) && "first order spec must satisfy all other order specs"); } #endif // GPOS_DEBUG // we assume here that the first order spec in the children array satisfies all other // order specs in the array, this happens as part of the initial normalization // so we need to add columns only from the first order spec const ULONG ulSize = posFirst->UlSortColumns(); for (ULONG ul = 0; ul < ulSize; ul++) { const CColRef *pcr = posFirst->Pcr(ul); gpmd::IMDId *pmdid = posFirst->PmdidSortOp(ul); pmdid->AddRef(); COrderSpec::ENullTreatment ent = posFirst->Ent(ul); m_pos->Append(pmdid, pcr, ent); } }