greenplumn CHistogram 源码

  • 2022-08-18
  • 浏览 (383)

greenplumn CHistogram 代码

文件路径:/src/backend/gporca/libnaucrates/include/naucrates/statistics/CHistogram.h

//---------------------------------------------------------------------------
//	Greenplum Database
//	Copyright (C) 2018 VMware, Inc. or its affiliates.
//
//	@filename:
//		CHistogram.h
//
//	@doc:
//		Histogram implementation
//---------------------------------------------------------------------------
#ifndef GPNAUCRATES_CHistogram_H
#define GPNAUCRATES_CHistogram_H

#include "gpos/base.h"
#include "gpos/common/DbgPrintMixin.h"

#include "gpopt/base/CKHeap.h"
#include "naucrates/statistics/CBucket.h"
#include "naucrates/statistics/CStatsPred.h"

namespace gpopt
{
class CColRef;
class CStatisticsConfig;
class CColumnFactory;
}  // namespace gpopt

namespace gpnaucrates
{
// type definitions
// array of doubles
using CDoubleArray = CDynamicPtrArray<CDouble, CleanupDelete>;

//---------------------------------------------------------------------------
//	@class:
//		CHistogram
//
//	@doc:
//
//---------------------------------------------------------------------------
class CHistogram : public gpos::DbgPrintMixin<CHistogram>
{
	// hash map from column id to a histogram
	using UlongToHistogramMap =
		CHashMap<ULONG, CHistogram, gpos::HashValue<ULONG>, gpos::Equals<ULONG>,
				 CleanupDelete<ULONG>, CleanupDelete<CHistogram>>;

	// iterator
	using UlongToHistogramMapIter =
		CHashMapIter<ULONG, CHistogram, gpos::HashValue<ULONG>,
					 gpos::Equals<ULONG>, CleanupDelete<ULONG>,
					 CleanupDelete<CHistogram>>;

	// hash map from column ULONG to CDouble
	using UlongToDoubleMap =
		CHashMap<ULONG, CDouble, gpos::HashValue<ULONG>, gpos::Equals<ULONG>,
				 CleanupDelete<ULONG>, CleanupDelete<CDouble>>;

	// iterator
	using UlongToDoubleMapIter =
		CHashMapIter<ULONG, CDouble, gpos::HashValue<ULONG>,
					 gpos::Equals<ULONG>, CleanupDelete<ULONG>,
					 CleanupDelete<CDouble>>;

private:
	// shared memory pool
	CMemoryPool *m_mp;

	// all the buckets in the histogram. This is shared among histograms,
	// so must not be modified unless we first make a new copy. We do not copy
	// histograms unless required, as it is an expensive operation in memory and time.
	CBucketArray *m_histogram_buckets;

	// well-defined histogram. if false, then bounds are unknown
	BOOL m_is_well_defined;

	// null fraction
	CDouble m_null_freq;

	// ndistinct of tuples not covered in the buckets
	CDouble m_distinct_remaining;

	// frequency of tuples not covered in the buckets
	CDouble m_freq_remaining;

	// has histogram skew been measures
	BOOL m_skew_was_measured;

	// skew estimate
	CDouble m_skew;

	// was the NDVs in histogram scaled
	BOOL m_NDVs_were_scaled;

	// is column statistics missing in the database
	BOOL m_is_col_stats_missing;

	// return an array buckets after applying equality filter on the histogram buckets
	CBucketArray *MakeBucketsWithEqualityFilter(CPoint *point) const;

	// return an array buckets after applying non equality filter on the histogram buckets
	CBucketArray *MakeBucketsWithInequalityFilter(CPoint *point) const;

	// less than or less than equal filter
	CHistogram *MakeHistogramLessThanOrLessThanEqualFilter(
		CStatsPred::EStatsCmpType stats_cmp_type, CPoint *point) const;

	// greater than or greater than equal filter
	CHistogram *MakeHistogramGreaterThanOrGreaterThanEqualFilter(
		CStatsPred::EStatsCmpType stats_cmp_type, CPoint *point) const;

	// equal filter
	CHistogram *MakeHistogramEqualFilter(CPoint *point) const;

	// not equal filter
	CHistogram *MakeHistogramInequalityFilter(CPoint *point) const;

	// IDF filter
	CHistogram *MakeHistogramIDFFilter(CPoint *point) const;

	// INDF filter
	CHistogram *MakeHistogramINDFFilter(CPoint *point) const;

	// equality join
	CHistogram *MakeJoinHistogramEqualityFilter(
		const CHistogram *histogram) const;

	// generate histogram based on NDV
	CHistogram *MakeNDVBasedJoinHistogramEqualityFilter(
		const CHistogram *histogram) const;

	// construct a new histogram for an INDF join predicate
	CHistogram *MakeJoinHistogramINDFFilter(const CHistogram *histogram) const;

	// accessor for n-th bucket
	CBucket *operator[](ULONG) const;

	// compute skew estimate
	void ComputeSkew();

	// helper to add buckets from one histogram to another
	static void AddBuckets(CMemoryPool *mp, const CBucketArray *src_buckets,
						   CBucketArray *dest_buckets, CDouble rows_old,
						   CDouble rows_new, ULONG begin, ULONG end);

	static void AddBuckets(CMemoryPool *mp, const CBucketArray *src_buckets,
						   CBucketArray *dest_buckets, CDouble rows,
						   CDoubleArray *dest_bucket_freqs, ULONG begin,
						   ULONG end);

	// helper to combine histogram buckets to reduce total buckets
	static CBucketArray *CombineBuckets(CMemoryPool *mp, CBucketArray *buckets,
										ULONG desired_num_buckets);

	// check if we can compute NDVRemain for JOIN histogram for the given input histograms
	static BOOL CanComputeJoinNDVRemain(const CHistogram *histogram1,
										const CHistogram *histogram2);

	// compute the effects of the NDV and frequency of the tuples not captured
	// by the histogram
	static void ComputeJoinNDVRemainInfo(
		const CHistogram *histogram1, const CHistogram *histogram2,
		const CBucketArray *join_buckets,  // join buckets
		CDouble
			hist1_buckets_freq,	 // frequency of the buckets in input1 that contributed to the join
		CDouble
			hist2_buckets_freq,	 // frequency of the buckets in input2 that contributed to the join
		CDouble *result_distinct_remain, CDouble *result_freq_remain);


	// check if the cardinality estimation should be done only via NDVs
	static BOOL NeedsNDVBasedCardEstimationForEq(const CHistogram *histogram);

	BOOL IsHistogramForTextRelatedTypes() const;

	// add residual union all buckets after the merge
	ULONG AddResidualUnionAllBucket(CBucketArray *histogram_buckets,
									CBucket *bucket, CDouble rows_old,
									CDouble rows_new, BOOL bucket_is_residual,
									ULONG index) const;

	// add residual union buckets after the merge
	ULONG AddResidualUnionBucket(CBucketArray *histogram_buckets,
								 CBucket *bucket, CDouble rows,
								 BOOL bucket_is_residual, ULONG index,
								 CDoubleArray *dest_bucket_freqs) const;

	// used to keep track of adjacent stats buckets and how similar
	// they are in terms of distribution
	struct SAdjBucketBoundary
	{
		// boundary_index 0 refers to boundary between b[0] and b[1]
		ULONG m_boundary_index;
		// similarity factor between two adjacent buckets calculated as (freq0/ndv0 - freq1/ndv1) + (freq0/width0 - freq1/width1)
		CDouble m_similarity_factor;

		SAdjBucketBoundary(ULONG index, CDouble similarity_factor)
			: m_boundary_index(index), m_similarity_factor(similarity_factor)
		{
		}

		// used for sorting in the binary heap
		CDouble
		DCost() const
		{
			return m_similarity_factor;
		}
		CDouble
		GetCostForHeap() const
		{
			return DCost();
		}
	};
	using SAdjBucketBoundaryArray =
		CDynamicPtrArray<SAdjBucketBoundary, CleanupDelete<SAdjBucketBoundary>>;

public:
	CHistogram &operator=(const CHistogram &) = delete;

	CHistogram(const CHistogram &) = delete;

	// ctors
	explicit CHistogram(CMemoryPool *mp, CBucketArray *histogram_buckets,
						BOOL is_well_defined = true);

	explicit CHistogram(CMemoryPool *mp, BOOL is_well_defined = true);

	CHistogram(CMemoryPool *mp, CBucketArray *histogram_buckets,
			   BOOL is_well_defined, CDouble null_freq,
			   CDouble distinct_remaining, CDouble freq_remaining,
			   BOOL is_col_stats_missing = false);

	// set null frequency
	void SetNullFrequency(CDouble null_freq);

	// set information about the scaling of NDVs
	void
	SetNDVScaled()
	{
		m_NDVs_were_scaled = true;
	}

	// have the NDVs been scaled
	BOOL
	WereNDVsScaled() const
	{
		return m_NDVs_were_scaled;
	}

	// filter by comparing with point
	CHistogram *MakeHistogramFilter(CStatsPred::EStatsCmpType stats_cmp_type,
									CPoint *point) const;

	// filter by comparing with point and normalize
	CHistogram *MakeHistogramFilterNormalize(
		CStatsPred::EStatsCmpType stats_cmp_type, CPoint *point,
		CDouble *scale_factor) const;

	// join with another histogram
	CHistogram *MakeJoinHistogram(CStatsPred::EStatsCmpType stats_cmp_type,
								  const CHistogram *histogram) const;

	// LASJ with another histogram
	CHistogram *MakeLASJHistogram(CStatsPred::EStatsCmpType stats_cmp_type,
								  const CHistogram *histogram) const;

	// join with another histogram and normalize it.
	// If the join is not an equality join the function returns an empty histogram
	CHistogram *MakeJoinHistogramNormalize(
		CStatsPred::EStatsCmpType stats_cmp_type, CDouble rows,
		const CHistogram *other_histogram, CDouble rows_other,
		CDouble *scale_factor) const;

	// scale factor of inequality (!=) join
	CDouble GetInequalityJoinScaleFactor(CDouble rows,
										 const CHistogram *other_histogram,
										 CDouble rows_other) const;

	// left anti semi join with another histogram and normalize
	CHistogram *MakeLASJHistogramNormalize(
		CStatsPred::EStatsCmpType stats_cmp_type, CDouble rows,
		const CHistogram *other_histogram, CDouble *scale_factor,
		BOOL
			DoIgnoreLASJHistComputation	 // except for the case of LOJ cardinality estimation this flag is always
		// "true" since LASJ stats computation is very aggressive
	) const;

	// group by and normalize
	CHistogram *MakeGroupByHistogramNormalize(CDouble rows,
											  CDouble *scale_factor) const;

	// union all and normalize
	CHistogram *MakeUnionAllHistogramNormalize(
		CDouble rows, const CHistogram *other_histogram,
		CDouble rows_other) const;

	// union and normalize
	CHistogram *MakeUnionHistogramNormalize(CDouble rows,
											const CHistogram *other_histogram,
											CDouble rows_other,
											CDouble *num_output_rows) const;

	// cleanup residual buckets
	static void CleanupResidualBucket(CBucket *bucket, BOOL bucket_is_residual);

	// get the next bucket for union / union all
	static CBucket *GetNextBucket(const CHistogram *histogram,
								  CBucket *new_bucket,
								  BOOL *target_bucket_is_residual,
								  ULONG *current_bucket_index);

	// number of buckets
	ULONG
	GetNumBuckets() const
	{
		GPOS_ASSERT(m_histogram_buckets != nullptr);
		return m_histogram_buckets->Size();
	}

	// buckets accessor
	const CBucketArray *
	GetBuckets() const
	{
		return m_histogram_buckets;
	}

	// well defined
	BOOL
	IsWellDefined() const
	{
		return m_is_well_defined;
	}

	BOOL ContainsOnlySingletonBuckets() const;

	// is the column statistics missing in the database
	BOOL
	IsColStatsMissing() const
	{
		return m_is_col_stats_missing;
	}

	// print function
	IOstream &OsPrint(IOstream &os) const;

	// total frequency from buckets and null fraction
	CDouble GetFrequency() const;

	// total number of distinct values
	CDouble GetNumDistinct() const;

	// is histogram well formed
	BOOL IsValid() const;

	// return copy of histogram
	CHistogram *CopyHistogram() const;

	// destructor
	virtual ~CHistogram()
	{
		m_histogram_buckets->Release();
	}

	// normalize histogram and return scaling factor
	CDouble NormalizeHistogram();

	// is histogram normalized
	BOOL IsNormalized() const;

	// translate the histogram into a derived column stats
	CDXLStatsDerivedColumn *TranslateToDXLDerivedColumnStats(
		CMDAccessor *md_accessor, ULONG colid, CDouble width) const;

	// randomly pick a bucket index
	ULONG GetRandomBucketIndex(ULONG *seed) const;

	// estimate of data skew
	CDouble
	GetSkew()
	{
		if (!m_skew_was_measured)
		{
			ComputeSkew();
		}

		return m_skew;
	}

	// accessor of null fraction
	CDouble
	GetNullFreq() const
	{
		return m_null_freq;
	}

	// accessor of remaining number of tuples
	CDouble
	GetDistinctRemain() const
	{
		return m_distinct_remaining;
	}

	// accessor of remaining frequency
	CDouble
	GetFreqRemain() const
	{
		return m_freq_remaining;
	}

	// check if histogram is empty
	BOOL IsEmpty() const;

	// cap the total number of distinct values (NDVs) in buckets to the number of rows
	void CapNDVs(CDouble rows);

	// is comparison type supported for filters for text columns
	static BOOL IsOpSupportedForTextFilter(
		CStatsPred::EStatsCmpType stats_cmp_type);

	// is comparison type supported for filters
	static BOOL IsOpSupportedForFilter(
		CStatsPred::EStatsCmpType stats_cmp_type);

	// is the join predicate's comparison type supported
	static BOOL JoinPredCmpTypeIsSupported(
		CStatsPred::EStatsCmpType stats_cmp_type);

	// create the default histogram for a given column reference
	static CHistogram *MakeDefaultHistogram(CMemoryPool *mp, CColRef *col_ref,
											BOOL is_empty);

	// create the default non empty histogram for a boolean column
	static CHistogram *MakeDefaultBoolHistogram(CMemoryPool *mp);

	// helper method to append histograms from one map to the other
	static void AddHistograms(CMemoryPool *mp,
							  UlongToHistogramMap *src_histograms,
							  UlongToHistogramMap *dest_histograms);

	// add dummy histogram buckets and column width for the array of columns
	static void AddDummyHistogramAndWidthInfo(
		CMemoryPool *mp, CColumnFactory *col_factory,
		UlongToHistogramMap *output_histograms,
		UlongToDoubleMap *output_col_widths, const ULongPtrArray *columns,
		BOOL is_empty);

	// add dummy histogram buckets for the columns in the input histogram
	static void AddEmptyHistogram(CMemoryPool *mp,
								  UlongToHistogramMap *output_histograms,
								  UlongToHistogramMap *input_histograms);

	// create a deep copy of m_histogram_buckets
	static CBucketArray *DeepCopyHistogramBuckets(CMemoryPool *mp,
												  const CBucketArray *buckets);

	// default histogram selectivity
	static const CDouble DefaultSelectivity;

	// minimum number of distinct values in a column
	static const CDouble MinDistinct;

	// default scale factor when there is no filtering of input
	static const CDouble NeutralScaleFactor;

	// default Null frequency
	static const CDouble DefaultNullFreq;

	// default NDV remain
	static const CDouble DefaultNDVRemain;

	// default frequency of NDV remain
	static const CDouble DefaultNDVFreqRemain;
};	// class CHistogram

}  // namespace gpnaucrates

#endif	// !GPNAUCRATES_CHistogram_H

// EOF

相关信息

greenplumn 源码目录

相关文章

greenplumn CBucket 源码

greenplumn CFilterStatsProcessor 源码

greenplumn CGroupByStatsProcessor 源码

greenplumn CInnerJoinStatsProcessor 源码

greenplumn CJoinStatsProcessor 源码

greenplumn CLeftAntiSemiJoinStatsProcessor 源码

greenplumn CLeftOuterJoinStatsProcessor 源码

greenplumn CLeftSemiJoinStatsProcessor 源码

greenplumn CLimitStatsProcessor 源码

greenplumn CPoint 源码

0  赞