greenplumn CStatisticsUtils 源码
greenplumn CStatisticsUtils 代码
文件路径:/src/backend/gporca/libnaucrates/include/naucrates/statistics/CStatisticsUtils.h
//---------------------------------------------------------------------------
// Greenplum Database
// Copyright 2018 VMware, Inc. or its affiliates.
//
// @filename:
// CStatisticsUtils.h
//
// @doc:
// Utility functions for statistics
//---------------------------------------------------------------------------
#ifndef GPOPT_CStatisticsUtils_H
#define GPOPT_CStatisticsUtils_H
#include "gpos/base.h"
#include "gpopt/base/CColRef.h"
#include "gpopt/engine/CStatisticsConfig.h"
#include "gpopt/mdcache/CMDAccessor.h"
#include "gpopt/operators/CExpression.h"
#include "gpopt/operators/CLogical.h"
#include "gpopt/operators/CScalarBoolOp.h"
#include "naucrates/base/IDatum.h"
#include "naucrates/statistics/CStatistics.h"
#include "naucrates/statistics/CStatsPredDisj.h"
#include "naucrates/statistics/CStatsPredUnsupported.h"
#include "naucrates/statistics/CStatsPredUtils.h"
namespace gpopt
{
class CTableDescriptor;
}
namespace gpnaucrates
{
using namespace gpos;
using namespace gpopt;
//---------------------------------------------------------------------------
// @class:
// CStatisticsUtils
//
// @doc:
// Utility functions for statistics
//
//---------------------------------------------------------------------------
class CStatisticsUtils
{
private:
// MCV value and its corresponding frequency, used for sorting MCVs
struct SMcvPair
{
// MCV datum
IDatum *m_datum_mcv;
// the frequency of this MCV
CDouble m_mcv_freq;
// ctor
SMcvPair(IDatum *datum_mcv, CDouble mcv_freq)
: m_datum_mcv(datum_mcv), m_mcv_freq(mcv_freq)
{
}
// dtor
~SMcvPair()
{
m_datum_mcv->Release();
}
};
// array of SMcvVPairs
using SMcvPairPtrArray = CDynamicPtrArray<SMcvPair, CleanupDelete>;
// given MCVs and histogram buckets, merge them into buckets of a single histogram
static CBucketArray *MergeMcvHistBucket(
CMemoryPool *mp, const CBucketArray *mcv_buckets,
const CBucketArray *histogram_buckets);
// split a histogram bucket given an MCV bucket
static CBucketArray *SplitHistBucketGivenMcvBuckets(
CMemoryPool *mp, const CBucket *histogram_bucket,
const CBucketArray *mcv_buckets);
// given lower and upper bound information and their closedness, return a bucket if they can form a valid bucket
static CBucket *CreateValidBucket(CMemoryPool *mp,
CPoint *bucket_lower_bound,
CPoint *bucket_upper_bound,
BOOL is_lower_closed,
BOOL is_upper_closed);
// given lower and upper bound information and their closedness, test if they can form a valid bucket
static BOOL IsValidBucket(CPoint *bucket_lower_bound,
CPoint *bucket_upper_bound, BOOL is_lower_closed,
BOOL is_upper_closed);
// find the MCVs that fall within the same histogram bucket and perform the split
static void SplitHistDriver(CMemoryPool *mp,
const CBucket *histogram_bucket,
const CBucketArray *mcv_buckets,
CBucketArray *merged_buckets, ULONG *mcv_index,
ULONG mcv);
// distribute total distinct and frequency of the histogram bucket into the new buckets
static CBucketArray *DistributeBucketProperties(
CMemoryPool *mp, CDouble total_frequency, CDouble total_distinct_values,
const CBucketArray *buckets);
// add the NDVs for all of the grouping columns
static void AddNdvForAllGrpCols(
CMemoryPool *mp, const CStatistics *input_stats,
const ULongPtrArray *grouping_columns,
CDoubleArray *output_ndvs // output array of NDV
);
// compute max number of groups when grouping on columns from the given source
static CDouble MaxNumGroupsForGivenSrcGprCols(
CMemoryPool *mp, const CStatisticsConfig *stats_config,
CStatistics *input_stats, const ULongPtrArray *src_grouping_cols);
// check to see if any one of the grouping columns has been capped
static BOOL CappedGrpColExists(const CStatistics *stats,
const ULongPtrArray *grouping_columns);
// return the maximum NDV given an array of grouping columns
static CDouble MaxNdv(const CStatistics *stats,
const ULongPtrArray *grouping_columns);
public:
// private dtor
virtual ~CStatisticsUtils() = delete;
// private ctor
CStatisticsUtils() = delete;
CStatisticsUtils(const CStatisticsUtils &) = delete;
// get the next data point for generating new bucket boundary
static CPoint *NextPoint(CMemoryPool *mp, CMDAccessor *md_accessor,
CPoint *point);
// transform mcv information to optimizer's histogram structure
static CHistogram *TransformMCVToHist(CMemoryPool *mp,
const IMDType *mdtype,
IDatumArray *mcv_datums,
CDoubleArray *freq_array,
ULONG num_mcv_values);
// merge MCVs and histogram
static CHistogram *MergeMCVHist(CMemoryPool *mp,
const CHistogram *mcv_histogram,
const CHistogram *histogram);
// comparison function for sorting MCVs
static inline INT GetMcvPairCmpFunc(const void *val1, const void *val2);
// utility function to print column stats before/after applying filter
static void PrintColStats(CMemoryPool *mp, CStatsPred *pred_stats,
ULONG cond_colid, CHistogram *histogram,
CDouble last_scale_factor,
BOOL is_filter_applied_before);
// extract all the column identifiers used in the statistics filter
static void ExtractUsedColIds(CMemoryPool *mp, CBitSet *colids_bitset,
CStatsPred *pred_stats,
ULongPtrArray *colids);
// given the previously generated histogram, update the intermediate
// result of the disjunction
static void UpdateDisjStatistics(
CMemoryPool *mp, CBitSet *non_updatable_cols,
CDouble input_disjunct_rows, CDouble local_rows,
CHistogram *previous_histogram,
UlongToHistogramMap *disjunctive_result_histograms, ULONG last_colid);
// given a disjunction filter, generate a bit set of columns whose
// histogram buckets cannot be changed by applying the predicates in the
// disjunction
static CBitSet *GetColsNonUpdatableHistForDisj(CMemoryPool *mp,
CStatsPredDisj *pred_stats);
// helper method to add a histogram to a map
static void AddHistogram(CMemoryPool *mp, ULONG colid,
const CHistogram *histogram,
UlongToHistogramMap *col_histogram_mapping,
BOOL replace_old = false);
// create a new hash map of histograms after merging
// the histograms generated by the child of disjunctive predicate
static UlongToHistogramMap *MergeHistogramMapsForDisjPreds(
CMemoryPool *mp, CBitSet *non_updatable_cols,
UlongToHistogramMap *hmap1, UlongToHistogramMap *hmap2, CDouble rows1,
CDouble rows2);
// helper method to copy the hash map of histograms
static UlongToHistogramMap *CopyHistHashMap(
CMemoryPool *mp, UlongToHistogramMap *col_histogram_mapping);
// return the column identifier of the filter if the predicate is
// on a single column else return gpos::ulong_max
static ULONG GetColId(const CStatsPredPtrArry *stats_preds_array);
// add remaining buckets from one array of buckets to the other
static void AddRemainingBuckets(CMemoryPool *mp,
const CBucketArray *src_buckets,
CBucketArray *dest_buckets,
ULONG *start_val);
// generate a null datum with the type of passed colref
static IDatum *DatumNull(const CColRef *colref);
#ifdef GPOS_DEBUG
// helper method to print the hash map of histograms
static void PrintHistogramMap(IOstream &os,
UlongToHistogramMap *col_histogram_mapping);
#endif // GPOS_DEBUG
// derive statistics of dynamic scan based on part-selector stats in the given map
static IStatistics *DeriveStatsForDynamicScan(
CMemoryPool *mp, CExpressionHandle &exprhdl, ULONG scan_id,
CPartitionPropagationSpec *pps_reqd);
// derive statistics of (dynamic) index-get
static IStatistics *DeriveStatsForIndexGet(
CMemoryPool *mp, CExpressionHandle &exprhdl,
IStatisticsArray *stats_contexts);
// derive statistics of bitmap table-get
static IStatistics *DeriveStatsForBitmapTableGet(
CMemoryPool *mp, CExpressionHandle &exprhdl,
IStatisticsArray *stats_contexts);
// compute the cumulative number of distinct values (NDV) of the group by operator
// from the array of NDV of the individual grouping columns
static CDouble GetCumulativeNDVs(const CStatisticsConfig *stats_config,
CDoubleArray *ndv_array);
// return the mapping between the table column used for grouping to the logical operator id where it was defined.
// If the grouping column is not a table column then the logical op id is initialized to gpos::ulong_max
static UlongToUlongPtrArrayMap *GetGrpColIdToUpperBoundNDVIdxMap(
CMemoryPool *mp, CStatistics *stats,
const CColRefSet *grouping_cols_refset, CBitSet *keys);
// extract NDVs for the given array of grouping columns
static CDoubleArray *ExtractNDVForGrpCols(
CMemoryPool *mp, const CStatisticsConfig *stats_config,
const IStatistics *stats,
CColRefSet *grp_cols_refset, // grouping columns
CBitSet *keys // keys derived during optimization
);
// compute the cumulative number of groups for the given set of grouping columns
static CDouble Groups(CMemoryPool *mp, IStatistics *stats,
const CStatisticsConfig *stats_config,
ULongPtrArray *grouping_cols, CBitSet *keys);
// return the default number of distinct values
static CDouble
DefaultDistinctVals(CDouble rows)
{
return std::min(CStatistics::DefaultDistinctValues.Get(), rows.Get());
}
// add the statistics (histogram and width) of the grouping columns
static void AddGrpColStats(CMemoryPool *mp, const CStatistics *input_stats,
CColRefSet *grp_cols_refset,
UlongToHistogramMap *output_histograms,
UlongToDoubleMap *output_col_widths);
// return the set of grouping columns for statistics computation;
static CColRefSet *MakeGroupByColsForStats(
CMemoryPool *mp, const ULongPtrArray *grouping_columns,
CColRefSet *
computed_groupby_cols // output set of grouping columns that are computed attributes
);
// return the total number of distinct values in the given array of buckets
static CDouble GetNumDistinct(const CBucketArray *histogram_buckets);
// return the cumulative frequency in the given array of buckets
static CDouble GetFrequency(const CBucketArray *histogram_buckets);
// true if the given operator increases risk of cardinality misestimation
static BOOL IncreasesRisk(CLogical *logical_op);
// return the default column width
static CDouble DefaultColumnWidth(const IMDType *mdtype);
// helper method to add width information
static void AddWidthInfo(CMemoryPool *mp, UlongToDoubleMap *src_width,
UlongToDoubleMap *dest_width);
// for the output stats object, compute its upper bound cardinality mapping based on the bounding method
// estimated output cardinality and information maintained in the current stats object
static void ComputeCardUpperBounds(
CMemoryPool *mp, // memory pool
const CStatistics *input_stats,
CStatistics
*output_stats, // output statistics object that is to be updated
CDouble rows_output, // estimated output cardinality of the operator
CStatistics::ECardBoundingMethod
card_bounding_method // technique used to estimate max source cardinality in the output stats object
);
static BOOL IsStatsCmpTypeNdvEq(CStatsPred::EStatsCmpType stats_cmp_type);
}; // class CStatisticsUtils
// comparison function for sorting MCVs
INT
CStatisticsUtils::GetMcvPairCmpFunc(const void *val1, const void *val2)
{
GPOS_ASSERT(nullptr != val1);
GPOS_ASSERT(nullptr != val2);
const SMcvPair *mcv_pair1 = *(const SMcvPair **) (val1);
const SMcvPair *mcv_pair2 = *(const SMcvPair **) (val2);
const IDatum *datum1 = mcv_pair1->m_datum_mcv;
const IDatum *datum2 = mcv_pair2->m_datum_mcv;
if (datum1->StatsAreEqual(datum2))
{
return 0;
}
if (datum1->StatsAreComparable(datum2) && datum1->StatsAreLessThan(datum2))
{
return -1;
}
return 1;
}
} // namespace gpnaucrates
#endif // !GPOPT_CStatisticsUtils_H
// EOF
相关信息
相关文章
greenplumn CFilterStatsProcessor 源码
greenplumn CGroupByStatsProcessor 源码
greenplumn CInnerJoinStatsProcessor 源码
greenplumn CJoinStatsProcessor 源码
greenplumn CLeftAntiSemiJoinStatsProcessor 源码
greenplumn CLeftOuterJoinStatsProcessor 源码
0
赞
热门推荐
-
2、 - 优质文章
-
3、 gate.io
-
7、 golang
-
9、 openharmony
-
10、 Vue中input框自动聚焦