统计信息准确性校验【绽放吧!GaussDB(DWS)云原生数仓】
【摘要】 当前OLAP数据库查询性能对客户起着至关重要的作用,而查询性能的好坏之一在于统计信息的准确性,本文针对统计信息的准确性给出校验方法,供大家参考。
当前OLAP数据库查询性能对客户起着至关重要的作用,而查询性能的好坏之一在于统计信息的准确性,本文针对统计信息的准确性给出校验方法,供大家参考。
一:创建基表并收集统计信息
1,创建基表导入数据
drop table if exists tmptable;
create table tmptable(
i_int int,
i_text text
);
insert into tmptable values(generate_series(1,10),'testabc');
insert into tmptable values(generate_series(1,10),'testbcd');
insert into tmptable values(generate_series(10,20),'testbcd');
insert into tmptable values(generate_series(10,20),'testcde');
2,创建表达式统计信息
create STATISTICS s1 on btrim(i_text,'test') from tmptable;
3,表analyze
analyze tmptable;
4,查看统计信息
表reltuples
select reltuples,relpages from pg_class where relname='tmptable' ;
列统计信息
select tablename,n_distinct,most_common_vals,most_common_freqs from pg_stats where tablename='tmptable';
表达式统计信息
select tablename,statistics_name,expr, n_distinct,most_common_vals, most_common_freqs from pg_stats_ext_exprs where tablename='tmptable';
二:校验统计信息方式如下:
1,校验列的distinct
--------------------------------------------------列的distinct 估算值校验--------------------------------------------------
create or replace function test_distinct(in tablename1 text,in attname1 text)
returns table(a_distinct numeric,e_distinct numeric,percent numeric)
as $$
declare
a numeric;
b numeric;
sql_1 text;
BEGIN
sql_1:='select n_distinct from pg_stats where tablename = '''|| tablename1 || ''' and attname ='''|| attname1 || ''';';
EXECUTE IMMEDIATE sql_1 into e_distinct;
sql_1:='select count(distinct '||attname1||') from '||tablename1;
EXECUTE IMMEDIATE sql_1 into a;
sql_1:='select count(*) from '||tablename1||' where '||attname1||' is not null ';
EXECUTE IMMEDIATE sql_1 into b ;
if e_distinct<0 then
a_distinct =a/b;
ELSE
a_distinct=a;
end if;
percent=a_distinct/e_distinct;
return query select a_distinct,e_distinct,percent;
end;
$$
LANGUAGE plpgsql;
--参数:表名,列名
--返回 :真实值,估算值,准确率(越接近1表示越准确)
select test_distinct('tmptable','i_int');
2,校验mcf值估算
--------------------------------------mcf值估算校验----------------------------------
create or replace function test_mcf2(in tablename1 text,in attname1 text)
returns table(col int,a_f numeric ,e_f numeric,percent numeric)
as $$
declare
a bigint;
b numeric;
sql_1 text;
sql_0 text;
BEGIN
sql_0:='select count(*) from '|| tablename1 || '';
EXECUTE sql_0 into a;
sql_1:='with cte1 as
(select regexp_split_to_table(array_to_string(most_common_vals, '',''), '','') as mcv,
regexp_split_to_table(array_to_string(most_common_freqs, '',''), '','') as most_common_freqs,
attname
from pg_stats
where tablename = '''|| tablename1 || '''
and attname = '''|| attname1 || ''')
select '||attname1||',mcf::numeric as a_f ,most_common_freqs::numeric as e_f ,(mcf/most_common_freqs)::numeric as rate
from (select count(*) / '|| a || ' as mcf, '||attname1||'
from '|| tablename1 || '
join cte1
on '|| tablename1 || '.'||attname1||' = cte1.mcv
group by '||attname1||') tb1
join cte1
on tb1.'||attname1||' = cte1.mcv ;';
return query EXECUTE sql_1 ;
end;
$$
LANGUAGE plpgsql;
--参数:表名,列名
--返回值:mcv值,实际mcf,估算mcf,准确率(越接近1表示越准确)
select test_mcf2('tmptable','i_int');
3,表达式统计信息distinct值估算
-------------------------表达式统计信息distinct准确性计算-------------------------------
create or replace function test_exp_distinct(in tablename1 text,in statistics_name1 text)
returns table(a_distinct numeric,e_distinct numeric,percent numeric)
as $$
declare
a numeric;
b numeric;
sql_1 text;
expr1 text;
expr2 text;
BEGIN
expr2:='select expr from pg_stats_ext_exprs where tablename='''|| tablename1 || ''' and statistics_name ='''|| statistics_name1 || ''' ;';
EXECUTE IMMEDIATE expr2 into expr1;
sql_1:='select n_distinct from pg_stats_ext_exprs where tablename = '''|| tablename1 || ''' and statistics_name ='''|| statistics_name1 || ''';';
EXECUTE IMMEDIATE sql_1 into e_distinct;
sql_1:='select count(distinct '||expr1||') from '||tablename1;
EXECUTE IMMEDIATE sql_1 into a;
sql_1:='select count(*) from '||tablename1||' where '||expr1||' is not null ';
EXECUTE IMMEDIATE sql_1 into b ;
if e_distinct<0 then
a_distinct =a/b;
ELSE
a_distinct=a;
end if;
percent=a_distinct/e_distinct;
return query select a_distinct,e_distinct,percent;
end;
$$
LANGUAGE plpgsql;
--参数:表名,表达式统计信息名
--出参,实际值,估算值,准确率(越接近1表示越准确)
select test_exp_distinct('tmptable','s1');
4,校验表达式统计信息mcf估算
-------------------------表达式统计信息mcf准确性计算-------------------------------
create or replace function test_exp_mcf(in tablename1 text,statistics_name1 text)
returns table(tablename text,col text,a_f numeric ,e_f numeric,percent numeric)
as $$
declare
a bigint;
b numeric;
sql_1 text;
sql_0 text;
exprtext text;
expr1 text;
BEGIN
expr1:='select expr from pg_stats_ext_exprs where tablename='''|| tablename1 || ''' and statistics_name ='''|| statistics_name1 || ''' ;';
EXECUTE IMMEDIATE expr1 into exprtext;
sql_0:='select count(*) from '|| tablename1 || '';
EXECUTE sql_0 into a;
sql_1:='with cte1 as
(select regexp_split_to_table(array_to_string(most_common_vals, '',''), '','') as mcv,
regexp_split_to_table(array_to_string(most_common_freqs, '',''), '','') as most_common_freqs,
statistics_name,expr
from pg_stats_ext_exprs
where tablename = '''|| tablename1 || '''
and statistics_name = '''||statistics_name1||''')
select '''|| tablename1 || ''',mcv,a_mcf::numeric,most_common_freqs::numeric as e_mcf,most_common_freqs/a_mcf::numeric as rate
from (select count(*) / '|| a || ' as a_mcf, expr,'||exprtext||'as c1
from '|| tablename1 || '
join cte1
on '||exprtext||' = cte1.mcv group by mcv,expr,'||exprtext||'
) tb1
join cte1
on c1=cte1.mcv ;
';
return query EXECUTE sql_1 ;
end;
$$
LANGUAGE plpgsql;
----入参:表名,表达式统计信息名
--出参,实际值,估算值,准确率(越接近1表示越准确)
select test_exp_mcf('tmptable','s1');
我正在参加【有奖征文 第27期】绽放吧!GaussDB(DWS)云原生数仓!https://bbs.huaweicloud.com/blogs/412962
【声明】本内容来自华为云开发者社区博主,不代表华为云及华为云开发者社区的观点和立场。转载时必须标注文章的来源(华为云社区)、文章链接、文章作者等基本信息,否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
热门文章
评论(0)