Ejemplo n.º 1
0
//---------------------------------------------------------------------------
//	@function:
//		CPhysical::GetSkew
//
//	@doc:
//		Helper to compute skew estimate based on given stats and
//		distribution spec
//
//---------------------------------------------------------------------------
CDouble
CPhysical::GetSkew
	(
	IStatistics *stats,
	CDistributionSpec *pds
	)
{
	CDouble dSkew = 1.0;
	if (CDistributionSpec::EdtHashed == pds->Edt())
	{
		CDistributionSpecHashed *pdshashed = CDistributionSpecHashed::PdsConvert(pds);
		const CExpressionArray *pdrgpexpr = pdshashed->Pdrgpexpr();
		const ULONG size = pdrgpexpr->Size();
		for (ULONG ul = 0; ul < size; ul++)
		{
			CExpression *pexpr = (*pdrgpexpr)[ul];
			if (COperator::EopScalarIdent == pexpr->Pop()->Eopid())
			{
				// consider only hashed distribution direct columns for now
				CScalarIdent *popScId = CScalarIdent::PopConvert(pexpr->Pop());
				ULONG colid = popScId->Pcr()->Id();
				CDouble dSkewCol = stats->GetSkew(colid);
				if (dSkewCol > dSkew)
				{
					dSkew = dSkewCol;
				}
			}
		}
	}

	return CDouble(dSkew);
}
Ejemplo n.º 2
0
//---------------------------------------------------------------------------
//	@function:
//		CPhysicalInnerHashJoin::PdsDeriveFromHashedOuter
//
//	@doc:
//		Derive hash join distribution from a hashed outer child;
//		return NULL if derivation failed
//
//---------------------------------------------------------------------------
CDistributionSpec *
CPhysicalInnerHashJoin::PdsDeriveFromHashedOuter
	(
	IMemoryPool *pmp,
	CDistributionSpec *pdsOuter,
	CDistributionSpec *
#ifdef GPOS_DEBUG
	pdsInner
#endif // GPOS_DEBUG
	)
	const
{
	GPOS_ASSERT(NULL != pdsOuter);
	GPOS_ASSERT(NULL != pdsInner);

	GPOS_ASSERT(CDistributionSpec::EdtHashed == pdsOuter->Edt());

	 CDistributionSpecHashed *pdshashedOuter = CDistributionSpecHashed::PdsConvert(pdsOuter);
	 if (CUtils::FContains(PdrgpexprOuterKeys(), pdshashedOuter->Pdrgpexpr()))
	 {
	 	// outer child is hashed on a subset of outer hashkeys,
	 	// return a hashed distribution equivalent to a matching outer distribution
		return PdshashedCreateMatching(pmp, pdshashedOuter, 0 /*ulSourceChild*/);
	 }

	 return NULL;
}
Ejemplo n.º 3
0
//---------------------------------------------------------------------------
//	@function:
//		CPhysicalInnerHashJoin::PdsDeriveFromHashedChildren
//
//	@doc:
//		Derive hash join distribution from hashed children;
//		return NULL if derivation failed
//
//---------------------------------------------------------------------------
CDistributionSpec *
CPhysicalInnerHashJoin::PdsDeriveFromHashedChildren
	(
	IMemoryPool *pmp,
	CDistributionSpec *pdsOuter,
	CDistributionSpec *pdsInner
	)
	const
{
	GPOS_ASSERT(NULL != pdsOuter);
	GPOS_ASSERT(NULL != pdsInner);

	CDistributionSpecHashed *pdshashedOuter = CDistributionSpecHashed::PdsConvert(pdsOuter);
 	CDistributionSpecHashed *pdshashedInner = CDistributionSpecHashed::PdsConvert(pdsInner);

	if (CUtils::FContains(PdrgpexprOuterKeys(), pdshashedOuter->Pdrgpexpr()) &&
 		CUtils::FContains(PdrgpexprInnerKeys(), pdshashedInner->Pdrgpexpr()))
 	{
 	 	// if both sides are hashed on subsets of hash join keys, join's output can be
 		// seen as distributed on outer spec or (equivalently) on inner spec,
 	 	// in this case, we create a new spec based on outer side and mark inner
 		// side as an equivalent one,

		return PdshashedCreateMatching(pmp, pdshashedOuter, 0 /*ulSourceChild*/);
 	}

	return NULL;
}
Ejemplo n.º 4
0
//---------------------------------------------------------------------------
//	@function:
//		CPhysicalInnerHashJoin::PdsDeriveFromReplicatedOuter
//
//	@doc:
//		Derive hash join distribution from a replicated outer child;
//
//---------------------------------------------------------------------------
CDistributionSpec *
CPhysicalInnerHashJoin::PdsDeriveFromReplicatedOuter
	(
	IMemoryPool *pmp,
	CDistributionSpec *
#ifdef GPOS_DEBUG
	pdsOuter
#endif // GPOS_DEBUG
	,
	CDistributionSpec *pdsInner
	)
	const
{
	GPOS_ASSERT(NULL != pdsOuter);
	GPOS_ASSERT(NULL != pdsInner);
	GPOS_ASSERT(CDistributionSpec::EdtReplicated == pdsOuter->Edt());

	// if outer child is replicated, join results distribution is defined by inner child
	if (CDistributionSpec::EdtHashed == pdsInner->Edt())
	{
		CDistributionSpecHashed *pdshashedInner = CDistributionSpecHashed::PdsConvert(pdsInner);
		if (CUtils::FContains(PdrgpexprInnerKeys(), pdshashedInner->Pdrgpexpr()))
		{
			// inner child is hashed on a subset of inner hashkeys,
		 	// return a hashed distribution equivalent to a matching outer distribution
			return PdshashedCreateMatching(pmp, pdshashedInner, 1 /*ulSourceChild*/);
		}
	}

	// otherwise, pass-through inner distribution
	pdsInner->AddRef();
	return pdsInner;
}
Ejemplo n.º 5
0
void
CDistributionSpecHashedNoOp::AppendEnforcers
	(
	IMemoryPool *pmp,
	CExpressionHandle &exprhdl,
	CReqdPropPlan *,
	DrgPexpr *pdrgpexpr,
	CExpression *pexpr
	)
{
	CDrvdProp *pdp = exprhdl.Pdp();
	CDistributionSpec *pdsChild = CDrvdPropPlan::Pdpplan(pdp)->Pds();
	CDistributionSpecHashed *pdsChildHashed = dynamic_cast<CDistributionSpecHashed *>(pdsChild);
	if (NULL == pdsChildHashed)
	{
		return;
	}
	
	DrgPexpr *pdrgpexprNoOpRedistributionColumns = pdsChildHashed->Pdrgpexpr();
	pdrgpexprNoOpRedistributionColumns->AddRef();
	CDistributionSpecHashedNoOp* pdsNoOp = GPOS_NEW(pmp) CDistributionSpecHashedNoOp(pdrgpexprNoOpRedistributionColumns);
	pexpr->AddRef();
	CExpression *pexprMotion = GPOS_NEW(pmp) CExpression
			(
					pmp,
					GPOS_NEW(pmp) CPhysicalMotionHashDistribute(pmp, pdsNoOp),
					pexpr
			);
	pdrgpexpr->Append(pexprMotion);
}
Ejemplo n.º 6
0
//---------------------------------------------------------------------------
//	@function:
//		CCostContext::DRowsPerHost
//
//	@doc:
//		Return the number of rows per host
//
//---------------------------------------------------------------------------
CDouble
CCostContext::DRowsPerHost() const
{
	DOUBLE dRows = Pstats()->DRows().DVal();
	COptCtxt *poptctxt = COptCtxt::PoctxtFromTLS();
	const ULONG ulHosts = poptctxt->Pcm()->UlHosts();

	CDistributionSpec *pds =  Pdpplan()->Pds();
	if (CDistributionSpec::EdtHashed == pds->Edt())
	{
		CDistributionSpecHashed *pdshashed = CDistributionSpecHashed::PdsConvert(pds);
		DrgPexpr *pdrgpexpr = pdshashed->Pdrgpexpr();
		CColRefSet *pcrsUsed = CUtils::PcrsExtractColumns(m_pmp, pdrgpexpr);

		const CColRefSet *pcrsReqdStats = this->Poc()->Prprel()->PcrsStat();
		if (!pcrsReqdStats->FSubset(pcrsUsed))
		{
			// statistics not available for distribution columns, therefore
			// assume uniform distribution across hosts
			// clean up
			pcrsUsed->Release();

			return CDouble(dRows / ulHosts);
		}

		DrgPul *pdrgpul = GPOS_NEW(m_pmp) DrgPul(m_pmp);
		pcrsUsed->ExtractColIds(m_pmp, pdrgpul);
		pcrsUsed->Release();

		CStatisticsConfig *pstatsconf = poptctxt->Poconf()->Pstatsconf();
		CDouble dNDVs = CStatisticsUtils::DGroups(m_pmp, Pstats(), pstatsconf, pdrgpul, NULL /*pbsKeys*/);
		pdrgpul->Release();

		if (dNDVs < ulHosts)
		{
			// estimated number of distinct values of distribution columns is smaller than number of hosts.
			// We assume data is distributed across a subset of hosts in this case. This results in a larger
			// number of rows per host compared to the uniform case, allowing us to capture data skew in
			// cost computation
			return CDouble(dRows / dNDVs.DVal());
		}
	}

	return CDouble(dRows / ulHosts);
}
//---------------------------------------------------------------------------
//	@function:
//		CPhysicalPartitionSelectorDML::PdsRequired
//
//	@doc:
//		Compute required distribution of the n-th child
//
//---------------------------------------------------------------------------
CDistributionSpec *
CPhysicalPartitionSelectorDML::PdsRequired
	(
	IMemoryPool *mp,
	CExpressionHandle &exprhdl,
	CDistributionSpec *pdsInput,
	ULONG child_index,
	CDrvdProp2dArray *, // pdrgpdpCtxt
	ULONG // ulOptReq
	)
	const
{
	GPOS_ASSERT(0 == child_index);

	// if required distribution uses any defined column, it has to be enforced on top,
	// in this case, we request Any distribution from the child
	CColRefSet *pcrs = NULL;
	CDistributionSpec::EDistributionType edtRequired = pdsInput->Edt();
	if (CDistributionSpec::EdtHashed == edtRequired)
	{
		CDistributionSpecHashed *pdshashed = CDistributionSpecHashed::PdsConvert(pdsInput);
		pcrs = pdshashed->PcrsUsed(m_mp);
	}

	if (CDistributionSpec::EdtRouted == edtRequired)
	{
		CDistributionSpecRouted *pdsrouted = CDistributionSpecRouted::PdsConvert(pdsInput);
		pcrs = GPOS_NEW(m_mp) CColRefSet(m_mp);
		pcrs->Include(pdsrouted->Pcr());
	}

	BOOL fUsesDefinedCols = (NULL != pcrs && pcrs->FMember(m_pcrOid));
	CRefCount::SafeRelease(pcrs);
	if (fUsesDefinedCols)
	{
		return GPOS_NEW(mp) CDistributionSpecAny(this->Eopid());
	}

	return PdsPassThru(mp, exprhdl, pdsInput, child_index);
}
Ejemplo n.º 8
0
//---------------------------------------------------------------------------
//	@function:
//		CPhysicalComputeScalar::PdsRequired
//
//	@doc:
//		Compute required distribution of the n-th child
//
//---------------------------------------------------------------------------
CDistributionSpec *
CPhysicalComputeScalar::PdsRequired
	(
	IMemoryPool *pmp,
	CExpressionHandle &exprhdl,
	CDistributionSpec *pdsRequired,
	ULONG ulChildIndex,
	DrgPdp *, // pdrgpdpCtxt
	ULONG ulOptReq
	)
	const
{
	GPOS_ASSERT(0 == ulChildIndex);
	GPOS_ASSERT(2 > ulOptReq);

	// check if master-only/replicated distribution needs to be requested
	CDistributionSpec *pds = PdsMasterOnlyOrReplicated(pmp, exprhdl, pdsRequired, ulChildIndex, ulOptReq);
	if (NULL != pds)
	{
		return pds;
	}

	CDrvdPropScalar *pdpscalar = exprhdl.Pdpscalar(1 /*ulChildIndex*/);

	// if a Project operator has a call to a set function, passing a Random distribution through this
	// Project may have the effect of not distributing the results of the set function to all nodes,
	// but only to the nodes on which first child of the Project is distributed.
	// to avoid that, we don't push the distribution requirement in this case and thus, for a random
	// distribution, the result of the set function is spread uniformly over all nodes
	if (pdpscalar->FHasNonScalarFunction())
	{
		return GPOS_NEW(pmp) CDistributionSpecAny();
	}

	// if required distribution uses any defined column, it has to be enforced on top of ComputeScalar,
	// in this case, we request Any distribution from the child
	CDistributionSpec::EDistributionType edtRequired = pdsRequired->Edt();
	if (CDistributionSpec::EdtHashed == edtRequired)
	{
		CDistributionSpecHashed *pdshashed = CDistributionSpecHashed::PdsConvert(pdsRequired);
		CColRefSet *pcrs = pdshashed->PcrsUsed(m_pmp);
		BOOL fUsesDefinedCols = FUnaryUsesDefinedColumns(pcrs, exprhdl);
		pcrs->Release();
		if (fUsesDefinedCols)
		{
			return GPOS_NEW(pmp) CDistributionSpecAny();
		}
	}

	if (CDistributionSpec::EdtRouted == edtRequired)
	{
		CDistributionSpecRouted *pdsrouted = CDistributionSpecRouted::PdsConvert(pdsRequired);
		CColRefSet *pcrs = GPOS_NEW(m_pmp) CColRefSet(m_pmp);
		pcrs->Include(pdsrouted->Pcr());
		BOOL fUsesDefinedCols = FUnaryUsesDefinedColumns(pcrs, exprhdl);
		pcrs->Release();
		if (fUsesDefinedCols)
		{
			return GPOS_NEW(pmp) CDistributionSpecAny();
		}
	}

	if (0 == ulOptReq)
	{
		// Req0: required distribution will be enforced on top of ComputeScalar
		return GPOS_NEW(pmp) CDistributionSpecAny();
	}

	// Req1: required distribution will be enforced on top of ComputeScalar's child
	return PdsPassThru(pmp, exprhdl, pdsRequired, ulChildIndex);
}
Ejemplo n.º 9
0
//---------------------------------------------------------------------------
//	@function:
//		CPhysicalHashJoin::PdshashedPassThru
//
//	@doc:
//		Create a child hashed distribution request based on input hashed
//		distribution,
//		return NULL if no such request can be created
//
//
//---------------------------------------------------------------------------
CDistributionSpecHashed *
CPhysicalHashJoin::PdshashedPassThru
	(
	IMemoryPool *pmp,
	CExpressionHandle  &exprhdl,
	CDistributionSpecHashed *pdshashedInput,
	ULONG  , // ulChildIndex
	DrgPdp *, // pdrgpdpCtxt
	ULONG
#ifdef GPOS_DEBUG
	 ulOptReq
#endif // GPOS_DEBUG
	)
	const
{
	GPOS_ASSERT(ulOptReq == m_pdrgpdsRedistributeRequests->UlLength());
	GPOS_ASSERT(NULL != pdshashedInput);

	if (!GPOS_FTRACE(EopttraceEnableRedistributeBroadcastHashJoin))
	{
		// this option is disabled
		return NULL;
	}

	// since incoming request is hashed, we attempt here to propagate this request to outer child
	CColRefSet *pcrsOuterOutput = exprhdl.Pdprel(0 /*ulChildIndex*/)->PcrsOutput();
	DrgPexpr *pdrgpexprIncomingRequest = pdshashedInput->Pdrgpexpr();
	CColRefSet *pcrsAllUsed = CUtils::PcrsExtractColumns(pmp, pdrgpexprIncomingRequest);
	BOOL fSubset = pcrsOuterOutput->FSubset(pcrsAllUsed);
	BOOL fDisjoint = pcrsOuterOutput->FDisjoint(pcrsAllUsed);
	pcrsAllUsed->Release();
	if (fSubset)
	{
		// incoming request uses columns from outer child only, pass it through
		pdshashedInput->AddRef();
		return pdshashedInput;
	}

	if (!fDisjoint)
	{
		 // incoming request intersects with columns from outer child,
		 // we restrict the request to outer child columns only, then we pass it through
		 DrgPexpr *pdrgpexprChildRequest = GPOS_NEW(pmp) DrgPexpr(pmp);
		 const ULONG ulSize = pdrgpexprIncomingRequest->UlLength();
		 for (ULONG ul = 0; ul < ulSize; ul++)
		 {
			 CExpression *pexpr = (*pdrgpexprIncomingRequest)[ul];
			 CColRefSet *pcrsUsed = CDrvdPropScalar::Pdpscalar(pexpr->PdpDerive())->PcrsUsed();
			 if (pcrsOuterOutput->FSubset(pcrsUsed))
			 {
				 // hashed expression uses columns from outer child only, add it to request
				 pexpr->AddRef();
				 pdrgpexprChildRequest->Append(pexpr);
			 }
		 }
		 GPOS_ASSERT(0 < pdrgpexprChildRequest->UlLength());

		 CDistributionSpecHashed *pdshashed = GPOS_NEW(pmp) CDistributionSpecHashed(pdrgpexprChildRequest, pdshashedInput->FNullsColocated());

		 // since the other child of the join is replicated, we need to enforce hashed-distribution across segments here
		 pdshashed->MarkUnsatisfiableBySingleton();

		 return pdshashed;
	}

	return NULL;
}
Ejemplo n.º 10
0
//---------------------------------------------------------------------------
//	@function:
//		CPhysicalSequenceProject::CreateOrderSpec
//
//	@doc:
//		Create local order spec that we request relational child to satisfy
//
//---------------------------------------------------------------------------
void
CPhysicalSequenceProject::CreateOrderSpec
	(
	IMemoryPool *pmp
	)
{
	GPOS_ASSERT(NULL == m_pos);
	GPOS_ASSERT(NULL != m_pds);
	GPOS_ASSERT(NULL != m_pdrgpos);

	m_pos = GPOS_NEW(pmp) COrderSpec(pmp);

	// add partition by keys to order spec
	if (CDistributionSpec::EdtHashed == m_pds->Edt())
	{
		CDistributionSpecHashed *pdshashed = CDistributionSpecHashed::PdsConvert(m_pds);

		const DrgPexpr *pdrgpexpr = pdshashed->Pdrgpexpr();
		const ULONG ulSize = pdrgpexpr->UlLength();
		for (ULONG ul = 0; ul < ulSize; ul++)
		{
			CExpression *pexpr = (*pdrgpexpr)[ul];
			// we assume partition-by keys are always scalar idents
			CScalarIdent *popScId = CScalarIdent::PopConvert(pexpr->Pop());
			const CColRef *pcr = popScId->Pcr();

			gpmd::IMDId *pmdid = pcr->Pmdtype()->PmdidCmp(IMDType::EcmptL);
			pmdid->AddRef();

			m_pos->Append(pmdid, pcr, COrderSpec::EntLast);
		}
	}

	if (0 == m_pdrgpos->UlLength())
	{
		return;
	}

	COrderSpec *posFirst = (*m_pdrgpos)[0];
#ifdef GPOS_DEBUG
	const ULONG ulLength = m_pdrgpos->UlLength();
	for (ULONG ul = 1; ul < ulLength; ul++)
	{
		COrderSpec *posCurrent = (*m_pdrgpos)[ul];
		GPOS_ASSERT(posFirst->FSatisfies(posCurrent) &&
				"first order spec must satisfy all other order specs");
	}
#endif // GPOS_DEBUG

	// we assume here that the first order spec in the children array satisfies all other
	// order specs in the array, this happens as part of the initial normalization
	// so we need to add columns only from the first order spec
	const ULONG ulSize = posFirst->UlSortColumns();
	for (ULONG ul = 0; ul < ulSize; ul++)
	{
		const CColRef *pcr = posFirst->Pcr(ul);
		gpmd::IMDId *pmdid = posFirst->PmdidSortOp(ul);
		pmdid->AddRef();
		COrderSpec::ENullTreatment ent = posFirst->Ent(ul);
		m_pos->Append(pmdid, pcr, ent);
	}
}