统计信息准确性校验【绽放吧!GaussDB(DWS)云原生数仓】
当前OLAP数据库查询性能对客户起着至关重要的作用,而查询性能的好坏之一在于统计信息的准确性,本文针对统计信息的准确性给出校验方法,供大家参考。
一:创建基表并收集统计信息
1,创建基表导入数据
drop table if exists tmptable;
create table tmptable(
i_int int,
i_text text
);
insert into tmptable values(generate_series(1,10),'testabc');
insert into tmptable values(generate_series(1,10),'testbcd');
insert into tmptable values(generate_series(10,20),'testbcd');
insert into tmptable values(generate_series(10,20),'testcde');
2,创建表达式统计信息
create STATISTICS s1 on btrim(i_text,'test') from tmptable;
3,表analyze
analyze tmptable;
4,查看统计信息
表reltuples
select reltuples,relpages from pg_class where relname='tmptable' ;
列统计信息
select tablename,n_distinct,most_common_vals,most_common_freqs from pg_stats where tablename='tmptable';
表达式统计信息
select tablename,statistics_name,expr, n_distinct,most_common_vals, most_common_freqs from pg_stats_ext_exprs where tablename='tmptable';
二:校验统计信息方式如下:
1,校验列的distinct
--------------------------------------------------列的distinct 估算值校验--------------------------------------------------
create or replace function test_distinct(in tablename1 text,in attname1 text)
returns table(a_distinct numeric,e_distinct numeric,percent numeric)
as $$
declare
a numeric;
b numeric;
sql_1 text;
BEGIN
sql_1:='select n_distinct from pg_stats where tablename = '''|| tablename1 || ''' and attname ='''|| attname1 || ''';';
EXECUTE IMMEDIATE sql_1 into e_distinct;
sql_1:='select count(distinct '||attname1||') from '||tablename1;
EXECUTE IMMEDIATE sql_1 into a;
sql_1:='select count(*) from '||tablename1||' where '||attname1||' is not null ';
EXECUTE IMMEDIATE sql_1 into b ;
if e_distinct<0 then
a_distinct =a/b;
ELSE
a_distinct=a;
end if;
percent=a_distinct/e_distinct;
return query select a_distinct,e_distinct,percent;
end;
$$
LANGUAGE plpgsql;
--参数:表名,列名
--返回 :真实值,估算值,准确率(越接近1表示越准确)
select test_distinct('tmptable','i_int');
2,校验mcf值估算
--------------------------------------mcf值估算校验----------------------------------
3,表达式统计信息distinct值估算
-------------------------表达式统计信息distinct准确性计算-------------------------------
create or replace function test_exp_distinct(in tablename1 text,in statistics_name1 text)
returns table(a_distinct numeric,e_distinct numeric,percent numeric)
as $$
declare
a numeric;
b numeric;
sql_1 text;
expr1 text;
expr2 text;
BEGIN
expr2:='select expr from pg_stats_ext_exprs where tablename='''|| tablename1 || ''' and statistics_name ='''|| statistics_name1 || ''' ;';
EXECUTE IMMEDIATE expr2 into expr1;
sql_1:='select n_distinct from pg_stats_ext_exprs where tablename = '''|| tablename1 || ''' and statistics_name ='''|| statistics_name1 || ''';';
EXECUTE IMMEDIATE sql_1 into e_distinct;
sql_1:='select count(distinct '||expr1||') from '||tablename1;
EXECUTE IMMEDIATE sql_1 into a;
sql_1:='select count(*) from '||tablename1||' where '||expr1||' is not null ';
EXECUTE IMMEDIATE sql_1 into b ;
if e_distinct<0 then
a_distinct =a/b;
ELSE
a_distinct=a;
end if;
percent=a_distinct/e_distinct;
return query select a_distinct,e_distinct,percent;
end;
$$
LANGUAGE plpgsql;
--参数:表名,表达式统计信息名
--出参,实际值,估算值,准确率(越接近1表示越准确)
select test_exp_distinct('tmptable','s1');
4,校验表达式统计信息mcf估算
我正在参加【有奖征文 第27期】绽放吧!GaussDB(DWS)云原生数仓!https://bbs.huaweicloud.com/blogs/412962
- 点赞
- 收藏
- 关注作者
评论(0)