- 微信
- 微博
  
  分享文章到微博
- 复制链接
  
  复制链接到剪贴板

统计信息准确性校验【绽放吧！GaussDB(DWS)云原生数仓】

yd_249256431 发表于 2023/10/24 09:52:42 2023/10/24

【摘要】当前OLAP数据库查询性能对客户起着至关重要的作用，而查询性能的好坏之一在于统计信息的准确性，本文针对统计信息的准确性给出校验方法，供大家参考。

当前OLAP数据库查询性能对客户起着至关重要的作用，而查询性能的好坏之一在于统计信息的准确性，本文针对统计信息的准确性给出校验方法，供大家参考。

一：创建基表并收集统计信息

1，创建基表导入数据

drop table if exists tmptable;
create table tmptable(
i_int int,
i_text text
);

insert into tmptable values(generate_series(1,10),'testabc');
insert into tmptable values(generate_series(1,10),'testbcd');
insert into tmptable values(generate_series(10,20),'testbcd');
insert into tmptable values(generate_series(10,20),'testcde');

2，创建表达式统计信息

create STATISTICS s1 on btrim(i_text,'test') from tmptable;

3，表analyze

analyze tmptable;

4,查看统计信息

表reltuples

select reltuples,relpages from pg_class where relname='tmptable' ;

列统计信息

select tablename,n_distinct,most_common_vals,most_common_freqs from pg_stats where tablename='tmptable';

表达式统计信息

select tablename,statistics_name,expr, n_distinct,most_common_vals, most_common_freqs from pg_stats_ext_exprs where tablename='tmptable';

二：校验统计信息方式如下：

1，校验列的distinct

--------------------------------------------------列的distinct 估算值校验--------------------------------------------------

create or replace function test_distinct(in tablename1 text,in attname1 text)
returns table(a_distinct numeric,e_distinct numeric,percent numeric)
as $$
declare
a numeric;
b numeric;
sql_1 text;
BEGIN
sql_1:='select n_distinct from pg_stats where tablename = '''|| tablename1 || ''' and attname ='''|| attname1 || ''';';
EXECUTE IMMEDIATE sql_1 into e_distinct;
sql_1:='select count(distinct '||attname1||') from '||tablename1;
EXECUTE IMMEDIATE sql_1 into a;
sql_1:='select count(*) from '||tablename1||' where '||attname1||' is not null ';
EXECUTE IMMEDIATE sql_1 into b ;
if e_distinct<0 then
a_distinct =a/b;
ELSE
a_distinct=a;
end if;
percent=a_distinct/e_distinct;
return query select a_distinct,e_distinct,percent;
end;
$$
LANGUAGE plpgsql;

--参数：表名，列名
--返回：真实值，估算值，准确率（越接近1表示越准确）
select test_distinct('tmptable','i_int');

2，校验mcf值估算

--------------------------------------mcf值估算校验----------------------------------

create or replace function test_mcf2(in tablename1 text,in attname1 text)

returns table(col int,a_f numeric ,e_f numeric,percent numeric)

as $$

declare

a bigint;

b numeric;

sql_1 text;

sql_0 text;

BEGIN

sql_0:='select count(*) from '|| tablename1 || '';

EXECUTE sql_0 into a;

sql_1:='with cte1 as

(select regexp_split_to_table(array_to_string(most_common_vals, '',''), '','') as mcv,

regexp_split_to_table(array_to_string(most_common_freqs, '',''), '','') as most_common_freqs,

attname

from pg_stats

where tablename = '''|| tablename1 || '''

and attname = '''|| attname1 || ''')

select '||attname1||',mcf::numeric as a_f ,most_common_freqs::numeric as e_f ,(mcf/most_common_freqs)::numeric as rate

from (select count(*) / '|| a || ' as mcf, '||attname1||'

from '|| tablename1 || '

join cte1

on '|| tablename1 || '.'||attname1||' = cte1.mcv

group by '||attname1||') tb1

join cte1

on tb1.'||attname1||' = cte1.mcv ;';

return query EXECUTE sql_1 ;

end;

LANGUAGE plpgsql;

--参数：表名，列名

--返回值：mcv值，实际mcf，估算mcf，准确率（越接近1表示越准确）

select test_mcf2('tmptable','i_int');

3，表达式统计信息distinct值估算

-------------------------表达式统计信息distinct准确性计算-------------------------------

create or replace function test_exp_distinct(in tablename1 text,in statistics_name1 text)
returns table(a_distinct numeric,e_distinct numeric,percent numeric)
as $$
declare
a numeric;
b numeric;
sql_1 text;
expr1 text;
expr2 text;
BEGIN
expr2:='select expr from pg_stats_ext_exprs where tablename='''|| tablename1 || ''' and statistics_name ='''|| statistics_name1 || ''' ;';
EXECUTE IMMEDIATE expr2 into expr1;
sql_1:='select n_distinct from pg_stats_ext_exprs where tablename = '''|| tablename1 || ''' and statistics_name ='''|| statistics_name1 || ''';';
EXECUTE IMMEDIATE sql_1 into e_distinct;
sql_1:='select count(distinct '||expr1||') from '||tablename1;
EXECUTE IMMEDIATE sql_1 into a;
sql_1:='select count(*) from '||tablename1||' where '||expr1||' is not null ';
EXECUTE IMMEDIATE sql_1 into b ;
if e_distinct<0 then
a_distinct =a/b;
ELSE
a_distinct=a;
end if;
percent=a_distinct/e_distinct;
return query select a_distinct,e_distinct,percent;
end;
$$
LANGUAGE plpgsql;

--参数：表名，表达式统计信息名
--出参，实际值，估算值，准确率（越接近1表示越准确）
select test_exp_distinct('tmptable','s1');

4，校验表达式统计信息mcf估算

-------------------------表达式统计信息mcf准确性计算-------------------------------

create or replace function test_exp_mcf(in tablename1 text,statistics_name1 text)

returns table(tablename text,col text,a_f numeric ,e_f numeric,percent numeric)

as $$

declare

a bigint;

b numeric;

sql_1 text;

sql_0 text;

exprtext text;

expr1 text;

BEGIN

expr1:='select expr from pg_stats_ext_exprs where tablename='''|| tablename1 || ''' and statistics_name ='''|| statistics_name1 || ''' ;';

EXECUTE IMMEDIATE expr1 into exprtext;

sql_0:='select count(*) from '|| tablename1 || '';

EXECUTE sql_0 into a;

sql_1:='with cte1 as

(select regexp_split_to_table(array_to_string(most_common_vals, '',''), '','') as mcv,

regexp_split_to_table(array_to_string(most_common_freqs, '',''), '','') as most_common_freqs,

statistics_name,expr

from pg_stats_ext_exprs

where tablename = '''|| tablename1 || '''

and statistics_name = '''||statistics_name1||''')

select '''|| tablename1 || ''',mcv,a_mcf::numeric,most_common_freqs::numeric as e_mcf,most_common_freqs/a_mcf::numeric as rate

from (select count(*) / '|| a || ' as a_mcf, expr,'||exprtext||'as c1

from '|| tablename1 || '

join cte1

on '||exprtext||' = cte1.mcv group by mcv,expr,'||exprtext||'

) tb1

join cte1

on c1=cte1.mcv ;

return query EXECUTE sql_1 ;

end;

LANGUAGE plpgsql;

----入参：表名,表达式统计信息名

--出参，实际值，估算值，准确率（越接近1表示越准确）

select test_exp_mcf('tmptable','s1');

我正在参加【有奖征文第27期】绽放吧！GaussDB(DWS)云原生数仓！https://bbs.huaweicloud.com/blogs/412962

【声明】本内容来自华为云开发者社区博主，不代表华为云及华为云开发者社区的观点和立场。转载时必须标注文章的来源（华为云社区）、文章链接、文章作者等基本信息，否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容，欢迎发送邮件进行举报，并提供相关证据，一经查实，本社区将立刻删除涉嫌侵权内容，举报邮箱： cloudbbs@huaweicloud.com

点赞
收藏
关注作者

0/1000

抱歉，系统识别当前为高风险访问，暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称，即可参与社区互动！

*长度不超过10个汉字或20个英文字符，设置后3个月内不可修改。

确认取消

加入云驻计划，成为创作者

华为云周边好礼
免费体验产品
特殊身份标识
线下官方门票
内部专家零距离
与10000+优质创作者共同成长

立即加入

统计信息准确性校验【绽放吧！GaussDB(DWS)云原生数仓】

全部回复

设置昵称

关于作者

目录

加入云驻计划，成为创作者

统计信息准确性校验【绽放吧！GaussDB(DWS)云原生数仓】

全部回复

设置昵称

关于作者

目录

加入云驻计划，成为创作者

推荐阅读

相关产品