greenplumn CStatisticsUtils 源码

  • 2022-08-18
  • 浏览 (277)

greenplumn CStatisticsUtils 代码

文件路径:/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatisticsUtils.h

//---------------------------------------------------------------------------
//	Greenplum Database
//	Copyright 2018 VMware, Inc. or its affiliates.
//
//	@filename:
//		CStatisticsUtils.h
//
//	@doc:
//		Utility functions for statistics
//---------------------------------------------------------------------------
#ifndef GPOPT_CStatisticsUtils_H
#define GPOPT_CStatisticsUtils_H

#include "gpos/base.h"

#include "gpopt/base/CColRef.h"
#include "gpopt/engine/CStatisticsConfig.h"
#include "gpopt/mdcache/CMDAccessor.h"
#include "gpopt/operators/CExpression.h"
#include "gpopt/operators/CLogical.h"
#include "gpopt/operators/CScalarBoolOp.h"
#include "naucrates/base/IDatum.h"
#include "naucrates/statistics/CStatistics.h"
#include "naucrates/statistics/CStatsPredDisj.h"
#include "naucrates/statistics/CStatsPredUnsupported.h"
#include "naucrates/statistics/CStatsPredUtils.h"

namespace gpopt
{
class CTableDescriptor;
}

namespace gpnaucrates
{
using namespace gpos;
using namespace gpopt;

//---------------------------------------------------------------------------
//	@class:
//		CStatisticsUtils
//
//	@doc:
//		Utility functions for statistics
//
//---------------------------------------------------------------------------
class CStatisticsUtils
{
private:
	// MCV value and its corresponding frequency, used for sorting MCVs
	struct SMcvPair
	{
		// MCV datum
		IDatum *m_datum_mcv;

		// the frequency of this MCV
		CDouble m_mcv_freq;

		// ctor
		SMcvPair(IDatum *datum_mcv, CDouble mcv_freq)
			: m_datum_mcv(datum_mcv), m_mcv_freq(mcv_freq)
		{
		}

		// dtor
		~SMcvPair()
		{
			m_datum_mcv->Release();
		}
	};

	// array of SMcvVPairs
	using SMcvPairPtrArray = CDynamicPtrArray<SMcvPair, CleanupDelete>;

	// given MCVs and histogram buckets, merge them into buckets of a single histogram
	static CBucketArray *MergeMcvHistBucket(
		CMemoryPool *mp, const CBucketArray *mcv_buckets,
		const CBucketArray *histogram_buckets);

	// split a histogram bucket given an MCV bucket
	static CBucketArray *SplitHistBucketGivenMcvBuckets(
		CMemoryPool *mp, const CBucket *histogram_bucket,
		const CBucketArray *mcv_buckets);

	// given lower and upper bound information and their closedness, return a bucket if they can form a valid bucket
	static CBucket *CreateValidBucket(CMemoryPool *mp,
									  CPoint *bucket_lower_bound,
									  CPoint *bucket_upper_bound,
									  BOOL is_lower_closed,
									  BOOL is_upper_closed);

	// given lower and upper bound information and their closedness, test if they can form a valid bucket
	static BOOL IsValidBucket(CPoint *bucket_lower_bound,
							  CPoint *bucket_upper_bound, BOOL is_lower_closed,
							  BOOL is_upper_closed);

	// find the MCVs that fall within the same histogram bucket and perform the split
	static void SplitHistDriver(CMemoryPool *mp,
								const CBucket *histogram_bucket,
								const CBucketArray *mcv_buckets,
								CBucketArray *merged_buckets, ULONG *mcv_index,
								ULONG mcv);

	// distribute total distinct and frequency of the histogram bucket into the new buckets
	static CBucketArray *DistributeBucketProperties(
		CMemoryPool *mp, CDouble total_frequency, CDouble total_distinct_values,
		const CBucketArray *buckets);

	// add the NDVs for all of the grouping columns
	static void AddNdvForAllGrpCols(
		CMemoryPool *mp, const CStatistics *input_stats,
		const ULongPtrArray *grouping_columns,
		CDoubleArray *output_ndvs  // output array of NDV
	);

	// compute max number of groups when grouping on columns from the given source
	static CDouble MaxNumGroupsForGivenSrcGprCols(
		CMemoryPool *mp, const CStatisticsConfig *stats_config,
		CStatistics *input_stats, const ULongPtrArray *src_grouping_cols);

	// check to see if any one of the grouping columns has been capped
	static BOOL CappedGrpColExists(const CStatistics *stats,
								   const ULongPtrArray *grouping_columns);

	// return the maximum NDV given an array of grouping columns
	static CDouble MaxNdv(const CStatistics *stats,
						  const ULongPtrArray *grouping_columns);

public:
	// private dtor
	virtual ~CStatisticsUtils() = delete;

	// private ctor
	CStatisticsUtils() = delete;

	CStatisticsUtils(const CStatisticsUtils &) = delete;

	// get the next data point for generating new bucket boundary
	static CPoint *NextPoint(CMemoryPool *mp, CMDAccessor *md_accessor,
							 CPoint *point);

	// transform mcv information to optimizer's histogram structure
	static CHistogram *TransformMCVToHist(CMemoryPool *mp,
										  const IMDType *mdtype,
										  IDatumArray *mcv_datums,
										  CDoubleArray *freq_array,
										  ULONG num_mcv_values);

	// merge MCVs and histogram
	static CHistogram *MergeMCVHist(CMemoryPool *mp,
									const CHistogram *mcv_histogram,
									const CHistogram *histogram);

	// comparison function for sorting MCVs
	static inline INT GetMcvPairCmpFunc(const void *val1, const void *val2);

	// utility function to print column stats before/after applying filter
	static void PrintColStats(CMemoryPool *mp, CStatsPred *pred_stats,
							  ULONG cond_colid, CHistogram *histogram,
							  CDouble last_scale_factor,
							  BOOL is_filter_applied_before);

	// extract all the column identifiers used in the statistics filter
	static void ExtractUsedColIds(CMemoryPool *mp, CBitSet *colids_bitset,
								  CStatsPred *pred_stats,
								  ULongPtrArray *colids);

	// given the previously generated histogram, update the intermediate
	// result of the disjunction
	static void UpdateDisjStatistics(
		CMemoryPool *mp, CBitSet *non_updatable_cols,
		CDouble input_disjunct_rows, CDouble local_rows,
		CHistogram *previous_histogram,
		UlongToHistogramMap *disjunctive_result_histograms, ULONG last_colid);

	// given a disjunction filter, generate a bit set of columns whose
	// histogram buckets cannot be changed by applying the predicates in the
	// disjunction
	static CBitSet *GetColsNonUpdatableHistForDisj(CMemoryPool *mp,
												   CStatsPredDisj *pred_stats);

	// helper method to add a histogram to a map
	static void AddHistogram(CMemoryPool *mp, ULONG colid,
							 const CHistogram *histogram,
							 UlongToHistogramMap *col_histogram_mapping,
							 BOOL replace_old = false);

	// create a new hash map of histograms after merging
	// the histograms generated by the child of disjunctive predicate
	static UlongToHistogramMap *MergeHistogramMapsForDisjPreds(
		CMemoryPool *mp, CBitSet *non_updatable_cols,
		UlongToHistogramMap *hmap1, UlongToHistogramMap *hmap2, CDouble rows1,
		CDouble rows2);

	// helper method to copy the hash map of histograms
	static UlongToHistogramMap *CopyHistHashMap(
		CMemoryPool *mp, UlongToHistogramMap *col_histogram_mapping);

	// return the column identifier of the filter if the predicate is
	// on a single column else	return gpos::ulong_max
	static ULONG GetColId(const CStatsPredPtrArry *stats_preds_array);

	// add remaining buckets from one array of buckets to the other
	static void AddRemainingBuckets(CMemoryPool *mp,
									const CBucketArray *src_buckets,
									CBucketArray *dest_buckets,
									ULONG *start_val);

	// generate a null datum with the type of passed colref
	static IDatum *DatumNull(const CColRef *colref);

#ifdef GPOS_DEBUG
	// helper method to print the hash map of histograms
	static void PrintHistogramMap(IOstream &os,
								  UlongToHistogramMap *col_histogram_mapping);
#endif	// GPOS_DEBUG

	// derive statistics of dynamic scan based on part-selector stats in the given map
	static IStatistics *DeriveStatsForDynamicScan(
		CMemoryPool *mp, CExpressionHandle &exprhdl, ULONG scan_id,
		CPartitionPropagationSpec *pps_reqd);

	// derive statistics of (dynamic) index-get
	static IStatistics *DeriveStatsForIndexGet(
		CMemoryPool *mp, CExpressionHandle &exprhdl,
		IStatisticsArray *stats_contexts);

	// derive statistics of bitmap table-get
	static IStatistics *DeriveStatsForBitmapTableGet(
		CMemoryPool *mp, CExpressionHandle &exprhdl,
		IStatisticsArray *stats_contexts);

	// compute the cumulative number of distinct values (NDV) of the group by operator
	// from the array of NDV of the individual grouping columns
	static CDouble GetCumulativeNDVs(const CStatisticsConfig *stats_config,
									 CDoubleArray *ndv_array);

	// return the mapping between the table column used for grouping to the logical operator id where it was defined.
	// If the grouping column is not a table column then the logical op id is initialized to gpos::ulong_max
	static UlongToUlongPtrArrayMap *GetGrpColIdToUpperBoundNDVIdxMap(
		CMemoryPool *mp, CStatistics *stats,
		const CColRefSet *grouping_cols_refset, CBitSet *keys);

	// extract NDVs for the given array of grouping columns
	static CDoubleArray *ExtractNDVForGrpCols(
		CMemoryPool *mp, const CStatisticsConfig *stats_config,
		const IStatistics *stats,
		CColRefSet *grp_cols_refset,  // grouping columns
		CBitSet *keys				  // keys derived during optimization
	);

	// compute the cumulative number of groups for the given set of grouping columns
	static CDouble Groups(CMemoryPool *mp, IStatistics *stats,
						  const CStatisticsConfig *stats_config,
						  ULongPtrArray *grouping_cols, CBitSet *keys);

	// return the default number of distinct values
	static CDouble
	DefaultDistinctVals(CDouble rows)
	{
		return std::min(CStatistics::DefaultDistinctValues.Get(), rows.Get());
	}

	// add the statistics (histogram and width) of the grouping columns
	static void AddGrpColStats(CMemoryPool *mp, const CStatistics *input_stats,
							   CColRefSet *grp_cols_refset,
							   UlongToHistogramMap *output_histograms,
							   UlongToDoubleMap *output_col_widths);

	// return the set of grouping columns for statistics computation;
	static CColRefSet *MakeGroupByColsForStats(
		CMemoryPool *mp, const ULongPtrArray *grouping_columns,
		CColRefSet *
			computed_groupby_cols  // output set of grouping columns that are computed attributes
	);



	// return the total number of distinct values in the given array of buckets
	static CDouble GetNumDistinct(const CBucketArray *histogram_buckets);

	// return the cumulative frequency in the given array of buckets
	static CDouble GetFrequency(const CBucketArray *histogram_buckets);

	// true if the given operator increases risk of cardinality misestimation
	static BOOL IncreasesRisk(CLogical *logical_op);

	// return the default column width
	static CDouble DefaultColumnWidth(const IMDType *mdtype);

	// helper method to add width information
	static void AddWidthInfo(CMemoryPool *mp, UlongToDoubleMap *src_width,
							 UlongToDoubleMap *dest_width);


	// for the output stats object, compute its upper bound cardinality mapping based on the bounding method
	// estimated output cardinality and information maintained in the current stats object
	static void ComputeCardUpperBounds(
		CMemoryPool *mp,  // memory pool
		const CStatistics *input_stats,
		CStatistics
			*output_stats,	  // output statistics object that is to be updated
		CDouble rows_output,  // estimated output cardinality of the operator
		CStatistics::ECardBoundingMethod
			card_bounding_method  // technique used to estimate max source cardinality in the output stats object
	);

	static BOOL IsStatsCmpTypeNdvEq(CStatsPred::EStatsCmpType stats_cmp_type);

};	// class CStatisticsUtils

// comparison function for sorting MCVs
INT
CStatisticsUtils::GetMcvPairCmpFunc(const void *val1, const void *val2)
{
	GPOS_ASSERT(nullptr != val1);
	GPOS_ASSERT(nullptr != val2);
	const SMcvPair *mcv_pair1 = *(const SMcvPair **) (val1);
	const SMcvPair *mcv_pair2 = *(const SMcvPair **) (val2);

	const IDatum *datum1 = mcv_pair1->m_datum_mcv;
	const IDatum *datum2 = mcv_pair2->m_datum_mcv;

	if (datum1->StatsAreEqual(datum2))
	{
		return 0;
	}

	if (datum1->StatsAreComparable(datum2) && datum1->StatsAreLessThan(datum2))
	{
		return -1;
	}

	return 1;
}
}  // namespace gpnaucrates


#endif	// !GPOPT_CStatisticsUtils_H

// EOF

相关信息

greenplumn 源码目录

相关文章

greenplumn CBucket 源码

greenplumn CFilterStatsProcessor 源码

greenplumn CGroupByStatsProcessor 源码

greenplumn CHistogram 源码

greenplumn CInnerJoinStatsProcessor 源码

greenplumn CJoinStatsProcessor 源码

greenplumn CLeftAntiSemiJoinStatsProcessor 源码

greenplumn CLeftOuterJoinStatsProcessor 源码

greenplumn CLeftSemiJoinStatsProcessor 源码

greenplumn CLimitStatsProcessor 源码

0  赞