结果解读深度指南:P值、置信区间与统计显著性
【摘要】 I. 统计推断基础概念 1.1 假设检验框架统计推断的核心是假设检验,这是一个基于数据对总体参数做出推论的系统性方法。在A/B测试中,我们通过比较两个组别的差异来做出决策。 1.1.1 零假设与备择假设每个A/B测试都始于两个相互竞争的假设:零假设 (H₀):实验组和对照组之间没有真实差异,观察到的差异只是随机波动的结果备择假设 (H₁):实验组和对照组之间存在真实差异import num...
I. 统计推断基础概念
1.1 假设检验框架
统计推断的核心是假设检验,这是一个基于数据对总体参数做出推论的系统性方法。在A/B测试中,我们通过比较两个组别的差异来做出决策。
1.1.1 零假设与备择假设
每个A/B测试都始于两个相互竞争的假设:
- 零假设 (H₀):实验组和对照组之间没有真实差异,观察到的差异只是随机波动的结果
- 备择假设 (H₁):实验组和对照组之间存在真实差异
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
def demonstrate_hypothesis_testing():
"""可视化假设检验的基本概念"""
# 设置参数
np.random.seed(42)
mu_0 = 0 # 零假设的均值
mu_1 = 0.5 # 备择假设的均值
sigma = 1 # 标准差
n = 100 # 样本量
alpha = 0.05 # 显著性水平
# 生成样本数据
control_data = np.random.normal(mu_0, sigma, n)
treatment_data = np.random.normal(mu_1, sigma, n)
# 执行t检验
t_stat, p_value = stats.ttest_ind(treatment_data, control_data)
# 可视化结果
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# 左侧:数据分布
ax1.hist(control_data, alpha=0.7, label='对照组', bins=20)
ax1.hist(treatment_data, alpha=0.7, label='实验组', bins=20)
ax1.axvline(np.mean(control_data), color='blue', linestyle='--', linewidth=2)
ax1.axvline(np.mean(treatment_data), color='orange', linestyle='--', linewidth=2)
ax1.set_xlabel('指标值')
ax1.set_ylabel频数')
ax1.set_title('数据分布比较')
ax1.legend()
# 右侧:假设检验可视化
x = np.linspace(-3, 3, 1000)
y_h0 = stats.norm.pdf(x, mu_0, sigma/np.sqrt(n))
y_h1 = stats.norm.pdf(x, mu_1, sigma/np.sqrt(n))
ax2.plot(x, y_h0, 'b-', label='零假设分布 (H₀)')
ax2.plot(x, y_h1, 'r-', label='备择假设分布 (H₁)')
# 标记临界区域
critical_value = stats.norm.ppf(1 - alpha/2) * sigma/np.sqrt(n)
ax2.axvline(critical_value, color='gray', linestyle='--', label=f'临界值')
ax2.axvline(-critical_value, color='gray', linestyle='--')
# 填充拒绝域
x_fill_left = np.linspace(-3, -critical_value, 100)
x_fill_right = np.linspace(critical_value, 3, 100)
ax2.fill_between(x_fill_left, stats.norm.pdf(x_fill_left, mu_0, sigma/np.sqrt(n)),
alpha=0.3, color='blue', label='拒绝域')
ax2.fill_between(x_fill_right, stats.norm.pdf(x_fill_right, mu_0, sigma/np.sqrt(n)),
alpha=0.3, color='blue')
ax2.set_xlabel('样本均值差异')
ax2.set_ylabel('概率密度')
ax2.set_title('假设检验可视化')
ax2.legend()
plt.tight_layout()
plt.show()
print(f"t统计量: {t_stat:.4f}")
print(f"P值: {p_value:.4f}")
print(f"显著性水平: {alpha}")
print(f"结果: {'统计显著' if p_value < alpha else '不显著'}")
return t_stat, p_value
# 运行演示
t_stat, p_value = demonstrate_hypothesis_testing()
1.1.2 决策错误类型
在假设检验中,我们可能犯两种类型的错误:
错误类型 | 定义 | 概率表示 | 业务影响 |
---|---|---|---|
第一类错误 (Type I Error) | 错误拒绝真零假设 | α | 假阳性,推出无效的改动 |
第二类错误 (Type II Error) | 错误接受假零假设 | β | 假阴性,错过有价值的优化 |
统计功效 (Statistical Power) | 正确拒绝假零假设 | 1-β | 检测真实效应的能力 |
1.2 抽样分布与中心极限定理
理解抽样分布是解读统计结果的关键。中心极限定理告诉我们,无论总体分布如何,样本均值的分布都近似正态分布。
def demonstrate_sampling_distribution():
"""演示抽样分布的概念"""
np.random.seed(42)
# 创建一个偏态的总体分布(比如网站停留时间)
population = np.random.exponential(scale=300, size=10000)
# 不同样本量下的抽样分布
sample_sizes = [10, 30, 100, 500]
n_samples = 1000
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()
for i, n in enumerate(sample_sizes):
sample_means = []
for _ in range(n_samples):
sample = np.random.choice(population, size=n, replace=False)
sample_means.append(np.mean(sample))
axes[i].hist(sample_means, bins=30, density=True, alpha=0.7)
axes[i].set_title(f'样本量 n = {n}\n均值 = {np.mean(sample_means):.1f}, 标准误 = {np.std(sample_means):.1f}')
axes[i].set_xlabel('样本均值')
axes[i].set_ylabel('密度')
# 叠加正态分布曲线
xmin, xmax = axes[i].get_xlim()
x = np.linspace(xmin, xmax, 100)
normal_curve = stats.norm.pdf(x, np.mean(sample_means), np.std(sample_means))
axes[i].plot(x, normal_curve, 'r-', linewidth=2, label='正态分布')
axes[i].legend()
plt.tight_layout()
plt.show()
# 计算总体参数
print(f"总体均值: {np.mean(population):.2f}")
print(f"总体标准差: {np.std(population):.2f}")
# 运行演示
demonstrate_sampling_distribution()
Lexical error on line 6. Unrecognized text.
... B --> B1[零假设 H₀] B --> B2[备择假设
----------------------^
II. P值的深入理解
2.1 P值的真实含义
P值是统计学中最容易被误解的概念之一。正确的理解是:P值是在零假设成立的前提下,观察到当前数据或更极端数据的概率。
2.1.1 P值的计算原理
class PValueExplainer:
"""P值解释器"""
def __init__(self):
self.interpretation_guidelines = {
(0, 0.001): "极强的证据反对零假设",
(0.001, 0.01): "非常强的证据反对零假设",
(0.01, 0.05): "强的证据反对零假设",
(0.05, 0.1): "弱的证据反对零假设",
(0.1, 1): "几乎没有证据反对零假设"
}
def calculate_p_value(self, control_data, treatment_data, test_type='t_test'):
"""计算P值"""
if test_type == 't_test':
# 独立样本t检验
t_stat, p_value = stats.ttest_ind(treatment_data, control_data)
return t_stat, p_value
elif test_type == 'proportion':
# 比例检验
from statsmodels.stats.proportion import proportions_ztest
count = [np.sum(treatment_data), np.sum(control_data)]
nobs = [len(treatment_data), len(control_data)]
z_stat, p_value = proportions_ztest(count, nobs)
return z_stat, p_value
else:
raise ValueError("不支持的检验类型")
def interpret_p_value(self, p_value):
"""解释P值的含义"""
for range_key, interpretation in self.interpretation_guidelines.items():
if range_key[0] <= p_value < range_key[1]:
return interpretation
return "无法解释的P值范围"
def demonstrate_p_value_simulation(self, true_effect=0, n=100, n_simulations=10000):
"""通过模拟展示P值的分布"""
np.random.seed(42)
p_values = []
for _ in range(n_simulations):
# 模拟A/B测试数据
control = np.random.normal(0, 1, n)
treatment = np.random.normal(true_effect, 1, n)
# 计算P值
_, p_value = stats.ttest_ind(treatment, control)
p_values.append(p_value)
# 绘制P值分布
plt.figure(figsize=(10, 6))
plt.hist(p_values, bins=50, density=True, alpha=0.7)
plt.axhline(1, color='red', linestyle='--', label='均匀分布参考线')
plt.xlabel('P值')
plt.ylabel('密度')
plt.title(f'P值分布模拟 (真实效应 = {true_effect}, 样本量 = {n})')
plt.legend()
plt.show()
# 计算不同显著性水平下的拒绝比例
alpha_levels = [0.001, 0.01, 0.05, 0.1]
rejection_rates = {}
for alpha in alpha_levels:
rejection_rate = np.mean(np.array(p_values) < alpha)
rejection_rates[alpha] = rejection_rate
print("不同显著性水平下的拒绝比例:")
for alpha, rate in rejection_rates.items():
print(f"α = {alpha}: {rate:.3f} (期望: {alpha})")
return p_values, rejection_rates
# 使用P值解释器
explainer = PValueExplainer()
# 模拟无真实效应的情况
print("情况1: 零假设为真 (无真实效应)")
p_values_null, rates_null = explainer.demonstrate_p_value_simulation(true_effect=0)
# 模拟有真实效应的情况
print("\n情况2: 备择假设为真 (有真实效应)")
p_values_effect, rates_effect = explainer.demonstrate_p_value_simulation(true_effect=0.3)
# 计算并解释具体示例
print("\n具体示例解释:")
np.random.seed(123)
control_example = np.random.normal(10, 2, 200)
treatment_example = np.random.normal(10.5, 2, 200) # 有真实效应
t_stat, p_value = explainer.calculate_p_value(control_example, treatment_example)
interpretation = explainer.interpret_p_value(p_value)
print(f"t统计量: {t_stat:.4f}")
print(f"P值: {p_value:.4f}")
print(f"解释: {interpretation}")
2.2 P值的常见误解
在实践中,P值经常被错误理解和应用。以下是几个关键的澄清:
误解 | 正确理解 | 示例说明 |
---|---|---|
P值代表零假设为真的概率 | P值是基于零假设为真计算的概率 | P=0.04不意味着零假设有4%的概率为真 |
小P值意味着大效应 | P值受样本量影响,小样本的大效应可能不显著 | 大样本的小效应可能产生小P值 |
P>0.05意味着"没有差异" | 只能说不拒绝零假设,不是证明没有差异 | 可能是样本量不足或效应太小 |
P值是实验重复成功的概率 | P值不是复制概率,每次实验都是独立的 | 95%置信度不意味着95%复制成功率 |
2.3 P值的局限性
P值作为一个单一的统计量,有其固有的局限性:
def demonstrate_p_value_limitations():
"""展示P值的局限性"""
np.random.seed(42)
# 场景1: 大样本量下的微小效应
print("场景1: 大样本量下的微小效应")
n_large = 10000
control_large = np.random.normal(10, 2, n_large)
treatment_large = np.random.normal(10.01, 2, n_large) # 微小效应
t_large, p_large = stats.ttest_ind(treatment_large, control_large)
effect_size_large = (np.mean(treatment_large) - np.mean(control_large)) / np.std(control_large)
print(f"样本量: {n_large}")
print(f"效应大小 (Cohen's d): {effect_size_large:.4f}")
print(f"P值: {p_large:.6f}")
print(f"统计显著: {p_large < 0.05}")
print(f"业务意义: {'可能无意义' if abs(effect_size_large) < 0.1 else '可能有意义'}")
print("\n" + "="*50)
# 场景2: 小样本量下的中等效应
print("场景2: 小样本量下的中等效应")
n_small = 20
control_small = np.random.normal(10, 2, n_small)
treatment_small = np.random.normal(11, 2, n_small) # 中等效应
t_small, p_small = stats.ttest_ind(treatment_small, control_small)
effect_size_small = (np.mean(treatment_small) - np.mean(control_small)) / np.std(control_small)
print(f"样本量: {n_small}")
print(f"效应大小 (Cohen's d): {effect_size_small:.4f}")
print(f"P值: {p_small:.4f}")
print(f"统计显著: {p_small < 0.05}")
print(f"业务意义: {'可能无意义' if abs(effect_size_small) < 0.1 else '可能有意义'}")
# 可视化比较
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# 场景1可视化
ax1.hist(control_large, alpha=0.7, bins=30, label='对照组')
ax1.hist(treatment_large, alpha=0.7, bins=30, label='实验组')
ax1.set_xlabel('指标值')
ax1.set_ylabel('频数')
ax1.set_title(f'大样本微小效应\nP值={p_large:.6f}, 效应大小={effect_size_large:.4f}')
ax1.legend()
# 场景2可视化
ax2.hist(control_small, alpha=0.7, bins=15, label='对照组')
ax2.hist(treatment_small, alpha=0.7, bins=15, label='实验组')
ax2.set_xlabel('指标值')
ax2.set_ylabel('频数')
ax2.set_title(f'小样本中等效应\nP值={p_small:.4f}, 效应大小={effect_size_small:.4f}')
ax2.legend()
plt.tight_layout()
plt.show()
return {
'large_sample': {'n': n_large, 'p': p_large, 'effect_size': effect_size_large},
'small_sample': {'n': n_small, 'p': p_small, 'effect_size': effect_size_small}
}
# 运行演示
limitation_results = demonstrate_p_value_limitations()
Lexical error on line 20. Unrecognized text.
... B1 --> B11[基于H₀的计算] B2 --> B21
----------------------^
III. 置信区间的原理与应用
3.1 置信区间的统计学基础
置信区间提供了参数估计的不确定性范围,是对P值的重要补充。
3.1.1 置信区间的计算原理
class ConfidenceIntervalCalculator:
"""置信区间计算器"""
def __init__(self, confidence_level=0.95):
self.confidence_level = confidence_level
self.alpha = 1 - confidence_level
def calculate_mean_ci(self, data, method='t_distribution'):
"""计算均值的置信区间"""
n = len(data)
mean = np.mean(data)
std = np.std(data, ddof=1) # 样本标准差
if method == 't_distribution' and n < 30:
# 小样本使用t分布
t_value = stats.t.ppf(1 - self.alpha/2, df=n-1)
margin_of_error = t_value * (std / np.sqrt(n))
else:
# 大样本使用正态近似
z_value = stats.norm.ppf(1 - self.alpha/2)
margin_of_error = z_value * (std / np.sqrt(n))
ci_lower = mean - margin_of_error
ci_upper = mean + margin_of_error
return {
'mean': mean,
'std': std,
'n': n,
'margin_of_error': margin_of_error,
'ci_lower': ci_lower,
'ci_upper': ci_upper,
'ci_length': ci_upper - ci_lower
}
def calculate_proportion_ci(self, successes, trials, method='normal'):
"""计算比例的置信区间"""
p = successes / trials
if method == 'normal':
# 正态近似方法
z_value = stats.norm.ppf(1 - self.alpha/2)
margin_of_error = z_value * np.sqrt(p * (1 - p) / trials)
elif method == 'wilson':
# Wilson score区间
z = stats.norm.ppf(1 - self.alpha/2)
denominator = 1 + z**2 / trials
centre_adjusted_probability = p + z**2 / (2 * trials)
adjusted_standard_deviation = np.sqrt(p * (1 - p) / trials + z**2 / (4 * trials**2))
ci_lower = (centre_adjusted_probability - z * adjusted_standard_deviation) / denominator
ci_upper = (centre_adjusted_probability + z * adjusted_standard_deviation) / denominator
margin_of_error = (ci_upper - ci_lower) / 2
return {
'proportion': p,
'successes': successes,
'trials': trials,
'margin_of_error': margin_of_error,
'ci_lower': ci_lower,
'ci_upper': ci_upper,
'method': 'wilson'
}
ci_lower = max(0, p - margin_of_error)
ci_upper = min(1, p + margin_of_error)
return {
'proportion': p,
'successes': successes,
'trials': trials,
'margin_of_error': margin_of_error,
'ci_lower': ci_lower,
'ci_upper': ci_upper,
'method': 'normal'
}
def calculate_difference_ci(self, control_data, treatment_data):
"""计算两组差异的置信区间"""
control_mean = np.mean(control_data)
treatment_mean = np.mean(treatment_data)
mean_difference = treatment_mean - control_mean
# 合并标准差
n_control = len(control_data)
n_treatment = len(treatment_data)
std_control = np.std(control_data, ddof=1)
std_treatment = np.std(treatment_data, ddof=1)
pooled_std = np.sqrt(((n_control-1)*std_control**2 + (n_treatment-1)*std_treatment**2) /
(n_control + n_treatment - 2))
# 标准误
se_difference = pooled_std * np.sqrt(1/n_control + 1/n_treatment)
# t值
df = n_control + n_treatment - 2
t_value = stats.t.ppf(1 - self.alpha/2, df)
margin_of_error = t_value * se_difference
return {
'mean_difference': mean_difference,
'relative_difference': mean_difference / control_mean,
'ci_lower': mean_difference - margin_of_error,
'ci_upper': mean_difference + margin_of_error,
'margin_of_error': margin_of_error,
'standard_error': se_difference
}
def demonstrate_confidence_interval_interpretation():
"""演示置信区间的正确解释"""
np.random.seed(42)
# 模拟多个实验
n_experiments = 100
true_mean = 10
sample_size = 30
confidence_level = 0.95
ci_calculator = ConfidenceIntervalCalculator(confidence_level)
# 存储结果
results = []
for i in range(n_experiments):
# 生成样本数据
sample_data = np.random.normal(true_mean, 2, sample_size)
# 计算置信区间
ci_result = ci_calculator.calculate_mean_ci(sample_data)
# 检查是否包含真实均值
contains_true_mean = ci_result['ci_lower'] <= true_mean <= ci_result['ci_upper']
results.append({
'experiment': i+1,
'sample_mean': ci_result['mean'],
'ci_lower': ci_result['ci_lower'],
'ci_upper': ci_result['ci_upper'],
'contains_true_mean': contains_true_mean
})
# 可视化结果
plt.figure(figsize=(12, 8))
# 选择前20个实验进行可视化
display_results = results[:20]
for i, result in enumerate(display_results):
color = 'green' if result['contains_true_mean'] else 'red'
plt.plot([result['ci_lower'], result['ci_upper']], [i, i], color=color, linewidth=2)
plt.plot(result['sample_mean'], i, 'ko', markersize=4)
plt.axvline(true_mean, color='blue', linestyle='--', linewidth=2, label='真实均值')
plt.xlabel('指标值')
plt.ylabel('实验编号')
plt.title(f'置信区间演示 ({n_experiments}个实验,置信水平{confidence_level:.0%})')
plt.legend()
# 计算包含真实均值的比例
coverage_rate = np.mean([r['contains_true_mean'] for r in results])
print(f"置信区间包含真实均值的比例: {coverage_rate:.3f} (期望: {confidence_level})")
plt.show()
return results, coverage_rate
# 运行演示
ci_results, coverage_rate = demonstrate_confidence_interval_interpretation()
# 实际数据示例
print("\n实际数据示例:")
calculator = ConfidenceIntervalCalculator(confidence_level=0.95)
# 比例数据示例
prop_result = calculator.calculate_proportion_ci(successes=150, trials=1000)
print(f"转化率置信区间: {prop_result['proportion']:.3f} [{prop_result['ci_lower']:.3f}, {prop_result['ci_upper']:.3f}]")
# 均值数据示例
np.random.seed(123)
data_example = np.random.normal(100, 15, 50)
mean_result = calculator.calculate_mean_ci(data_example)
print(f"均值置信区间: {mean_result['mean']:.2f} [{mean_result['ci_lower']:.2f}, {mean_result['ci_upper']:.2f}]")
3.2 置信区间的正确解读
置信区间的解读需要特别注意其概率含义:
解读方面 | 正确理解 | 错误理解 |
---|---|---|
概率解释 | 95%的类似构造的区间会包含真实参数 | 真实参数有95%的概率落在此区间内 |
区间宽度 | 宽度反映估计精度,受样本量影响 | 宽度与效应大小直接相关 |
与零的关系 | 区间包含零意味着不拒绝零假设 | 区间包含零意味着没有效应 |
多重比较 | 同时看多个区间需要校正 | 每个区间独立解释 |
3.3 置信区间的业务应用
在业务场景中,置信区间提供了比P值更丰富的信息:
def business_decision_framework():
"""基于置信区间的业务决策框架"""
# 模拟多个A/B测试场景
scenarios = [
{
'name': '明确的正面效果',
'control_rate': 0.10,
'treatment_rate': 0.13,
'n_per_group': 1000,
'business_impact': '高'
},
{
'name': '不确定的小效应',
'control_rate': 0.10,
'treatment_rate': 0.105,
'n_per_group': 1000,
'business_impact': '中'
},
{
'name': '统计显著但业务意义小',
'control_rate': 0.50,
'treatment_rate': 0.51,
'n_per_group': 10000,
'business_impact': '低'
},
{
'name': '负面效果风险',
'control_rate': 0.10,
'treatment_rate': 0.09,
'n_per_group': 500,
'business_impact': '高'
}
]
calculator = ConfidenceIntervalCalculator(confidence_level=0.95)
results = []
for scenario in scenarios:
# 模拟数据
np.random.seed(42)
control_data = np.random.binomial(1, scenario['control_rate'], scenario['n_per_group'])
treatment_data = np.random.binomial(1, scenario['treatment_rate'], scenario['n_per_group'])
# 计算差异的置信区间
control_success = np.sum(control_data)
treatment_success = np.sum(treatment_data)
control_ci = calculator.calculate_proportion_ci(control_success, scenario['n_per_group'])
treatment_ci = calculator.calculate_proportion_ci(treatment_success, scenario['n_per_group'])
# 计算相对提升的置信区间
relative_difference = (treatment_ci['proportion'] - control_ci['proportion']) / control_ci['proportion']
# 使用bootstrap计算相对差异的CI
bootstrap_differences = []
n_bootstrap = 1000
for _ in range(n_bootstrap):
control_bootstrap = np.random.choice(control_data, size=len(control_data), replace=True)
treatment_bootstrap = np.random.choice(treatment_data, size=len(treatment_data), replace=True)
control_rate_bs = np.mean(control_bootstrap)
treatment_rate_bs = np.mean(treatment_bootstrap)
if control_rate_bs > 0:
rel_diff_bs = (treatment_rate_bs - control_rate_bs) / control_rate_bs
bootstrap_differences.append(rel_diff_bs)
rel_ci_lower = np.percentile(bootstrap_differences, 2.5)
rel_ci_upper = np.percentile(bootstrap_differences, 97.5)
# 统计检验
from statsmodels.stats.proportion import proportions_ztest
count = [treatment_success, control_success]
nobs = [scenario['n_per_group'], scenario['n_per_group']]
z_stat, p_value = proportions_ztest(count, nobs)
# 业务决策
min_meaningful_effect = 0.05 # 5%最小业务意义效应
if rel_ci_lower > min_meaningful_effect:
decision = "发布 - 明确的正面效果"
confidence = "高"
elif rel_ci_lower > 0 and rel_ci_upper > min_meaningful_effect:
decision = "继续测试 - 可能有正面效果"
confidence = "中"
elif rel_ci_upper < 0:
decision = "停止 - 负面效果"
confidence = "高"
else:
decision = "保持 - 效果不明确"
confidence = "低"
results.append({
'场景': scenario['name'],
'对照组转化率': f"{control_ci['proportion']:.3f} [{control_ci['ci_lower']:.3f}, {control_ci['ci_upper']:.3f}]",
'实验组转化率': f"{treatment_ci['proportion']:.3f} [{treatment_ci['ci_lower']:.3f}, {treatment_ci['ci_upper']:.3f}]",
'相对提升': f"{relative_difference:+.1%} [{rel_ci_lower:+.1%}, {rel_ci_upper:+.1%}]",
'P值': f"{p_value:.4f}",
'业务影响': scenario['business_impact'],
'决策建议': decision,
'置信度': confidence
})
# 创建结果表格
import pandas as pd
df_results = pd.DataFrame(results)
return df_results
# 运行业务决策框架
decision_results = business_decision_framework()
print("基于置信区间的业务决策分析")
print(decision_results.to_string(index=False))
Lexical error on line 20. Unrecognized text.
... B1 --> B11[点估计±误差范围] B2 --> B2
----------------------^
IV. 统计显著性的误解与正解
4.1 统计显著性的真实含义
统计显著性是一个被广泛使用但经常被误解的概念。正确理解其含义对于避免决策错误至关重要。
4.1.1 显著性水平的业务含义
class StatisticalSignificanceExplainer:
"""统计显著性解释器"""
def __init__(self, alpha=0.05):
self.alpha = alpha
def demonstrate_significance_errors(self, n_simulations=10000):
"""演示显著性检验中的错误"""
np.random.seed(42)
# 模拟无真实效应的情况
false_positives = 0
false_negatives = 0
true_effect_scenarios = [0, 0.1, 0.2, 0.3] # 不同的真实效应大小
sample_sizes = [50, 100, 500, 1000]
results = {}
for true_effect in true_effect_scenarios:
effect_results = {}
for n in sample_sizes:
significant_count = 0
for _ in range(n_simulations):
control_data = np.random.normal(0, 1, n)
treatment_data = np.random.normal(true_effect, 1, n)
t_stat, p_value = stats.ttest_ind(treatment_data, control_data)
if p_value < self.alpha:
significant_count += 1
power = significant_count / n_simulations
effect_results[n] = power
# 记录错误情况
if true_effect == 0:
false_positives = significant_count / n_simulations
else:
false_negatives = 1 - power
results[true_effect] = effect_results
# 可视化结果
plt.figure(figsize=(12, 8))
for true_effect, power_dict in results.items():
sample_list = list(power_dict.keys())
power_list = list(power_dict.values())
if true_effect == 0:
plt.plot(sample_list, power_list, 'r--', linewidth=2,
label=f'真实效应=0 (第一类错误)')
else:
plt.plot(sample_list, power_list, '-', linewidth=2,
label=f'真实效应={true_effect}')
plt.axhline(self.alpha, color='red', linestyle=':', alpha=0.7, label='α水平')
plt.xlabel('样本量 (每组)')
plt.ylabel('拒绝零假设的比例')
plt.title('统计功效与样本量、真实效应的关系')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
return results
def p_hacking_simulation(self, n_experiments=1000, max_sample_size=1000):
"""演示P值操纵(p-hacking)的影响"""
np.random.seed(42)
# 模拟无真实效应的情况
p_values = []
sample_sizes_used = []
for _ in range(n_experiments):
# 研究人员不断收集数据直到得到显著结果
current_n = 10
found_significant = False
while current_n <= max_sample_size and not found_significant:
control_data = np.random.normal(0, 1, current_n)
treatment_data = np.random.normal(0, 1, current_n) # 无真实效应
t_stat, p_value = stats.ttest_ind(treatment_data, control_data)
if p_value < self.alpha:
found_significant = True
p_values.append(p_value)
sample_sizes_used.append(current_n)
current_n += 10
if not found_significant:
p_values.append(p_value)
sample_sizes_used.append(max_sample_size)
# 计算假阳性率
false_positive_rate = np.mean(np.array(p_values) < self.alpha)
# 可视化
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
# P值分布
ax1.hist(p_values, bins=30, alpha=0.7)
ax1.axvline(self.alpha, color='red', linestyle='--', label=f'α={self.alpha}')
ax1.set_xlabel('P值')
ax1.set_ylabel('频数')
ax1.set_title(f'P值分布 (假阳性率={false_positive_rate:.3f})')
ax1.legend()
# 样本量分布
ax2.hist(sample_sizes_used, bins=30, alpha=0.7)
ax2.set_xlabel('达到显著性时的样本量')
ax2.set_ylabel('频数')
ax2.set_title('P值操纵对样本量的影响')
plt.tight_layout()
plt.show()
return false_positive_rate, p_values, sample_sizes_used
# 使用统计显著性解释器
significance_explainer = StatisticalSignificanceExplainer(alpha=0.05)
# 演示统计功效
print("统计功效分析:")
power_results = significance_explainer.demonstrate_significance_errors()
# 演示P值操纵
print("\nP值操纵演示:")
false_positive_rate, p_values, sample_sizes = significance_explainer.p_hacking_simulation()
print(f"P值操纵后的假阳性率: {false_positive_rate:.3f} (期望: {significance_explainer.alpha})")
4.2 统计显著性的常见陷阱
在实践中,统计显著性经常被误用和误解。以下是几个关键陷阱:
陷阱类型 | 描述 | 影响 | 避免方法 |
---|---|---|---|
P值操纵 | 不断收集数据直到得到显著结果 | 假阳性率大幅增加 | 预先确定样本量,避免中途查看 |
多重比较 | 同时进行多个检验而不校正 | 家族wise错误率膨胀 | 使用Bonferroni或FDR校正 |
显著性追逐 | 只报告显著结果,忽略不显著结果 | 发表偏倚,扭曲真实证据 | 报告所有检验结果 |
误解P值 | 将P值视为效应大小或重要性的度量 | 过度解读微小效应 | 结合置信区间和效应大小 |
4.3 超越统计显著性
现代统计学强调要超越单纯的"显著/不显著"二分法,采用更全面的结果解读方法:
def comprehensive_results_interpretation():
"""综合结果解读框架"""
# 模拟多个A/B测试场景
scenarios = [
{
'name': '传统显著结果',
'control_mean': 10,
'treatment_mean': 11,
'std': 2,
'n_per_group': 100,
'business_context': '高风险决策'
},
{
'name': '边界显著结果',
'control_mean': 10,
'treatment_mean': 10.3,
'std': 2,
'n_per_group': 1000,
'business_context': '中等风险决策'
},
{
'name': '不显著但有效应',
'control_mean': 10,
'treatment_mean': 10.5,
'std': 3,
'n_per_group': 50,
'business_context': '探索性测试'
},
{
'name': '显著但无业务意义',
'control_mean': 10,
'treatment_mean': 10.05,
'std': 1,
'n_per_group': 10000,
'business_context': '大规模监控'
}
]
results = []
for scenario in scenarios:
# 生成数据
np.random.seed(42)
control_data = np.random.normal(scenario['control_mean'], scenario['std'], scenario['n_per_group'])
treatment_data = np.random.normal(scenario['treatment_mean'], scenario['std'], scenario['n_per_group'])
# 计算各种统计量
t_stat, p_value = stats.ttest_ind(treatment_data, control_data)
# 效应大小
cohens_d = (np.mean(treatment_data) - np.mean(control_data)) / np.std(control_data)
# 置信区间
ci_calculator = ConfidenceIntervalCalculator(confidence_level=0.95)
diff_ci = ci_calculator.calculate_difference_ci(control_data, treatment_data)
# 贝叶斯因子(近似)
bayes_factor = np.exp((t_stat**2 / 2) * np.log1p(scenario['n_per_group']/2) - t_stat**2/2)
# 综合评估
statistical_significance = p_value < 0.05
practical_significance = abs(cohens_d) > 0.2 # 小效应阈值
precise_estimate = (diff_ci['ci_upper'] - diff_ci['ci_lower']) < 1 # 区间宽度阈值
# 决策建议
if statistical_significance and practical_significance and precise_estimate:
decision = "强烈推荐实施"
confidence = "高"
elif statistical_significance and practical_significance:
decision = "推荐实施,但需要更多数据确认"
confidence = "中"
elif not statistical_significance but practical_significance and precise_estimate:
decision = "可能有效,建议进一步测试"
confidence = "中低"
elif statistical_significance but not practical_significance:
decision = "统计显著但业务影响小,可能不值得实施"
confidence = "低"
else:
decision = "证据不足,不建议实施"
confidence = "低"
results.append({
'场景': scenario['name'],
'P值': f"{p_value:.4f}",
'统计显著': '是' if statistical_significance else '否',
'Cohen\'s d': f"{cohens_d:.3f}",
'效应大小': '小' if abs(cohens_d) < 0.2 else '中' if abs(cohens_d) < 0.5 else '大',
'均值差异CI': f"[{diff_ci['ci_lower']:.2f}, {diff_ci['ci_upper']:.2f}]",
'贝叶斯因子': f"{bayes_factor:.2f}",
'业务背景': scenario['business_context'],
'决策建议': decision,
'置信度': confidence
})
# 创建综合结果表
import pandas as pd
df_comprehensive = pd.DataFrame(results)
return df_comprehensive
# 运行综合解读框架
comprehensive_results = comprehensive_results_interpretation()
print("综合结果解读分析")
print(comprehensive_results.to_string(index=False))
# 可视化多维度评估
def visualize_multidimensional_assessment():
"""可视化多维度评估结果"""
# 创建雷达图数据
categories = ['统计显著性', '效应大小', '估计精度', '业务重要性', '实施成本']
scenario_scores = {
'传统显著结果': [9, 7, 8, 8, 6],
'边界显著结果': [6, 5, 9, 6, 7],
'不显著但有效应': [4, 6, 5, 7, 5],
'显著但无业务意义': [8, 3, 9, 2, 4]
}
# 创建雷达图
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, polar=True)
angles = np.linspace(0, 2*np.pi, len(categories), endpoint=False).tolist()
angles += angles[:1] # 闭合图形
for scenario, scores in scenario_scores.items():
values = scores + scores[:1] # 闭合图形
ax.plot(angles, values, 'o-', linewidth=2, label=scenario)
ax.fill(angles, values, alpha=0.1)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)
ax.set_ylim(0, 10)
ax.set_title('A/B测试结果多维度评估', size=14, y=1.05)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
plt.tight_layout()
plt.show()
# 运行可视化
visualize_multidimensional_assessment()
V. 实际案例:A/B测试结果解读
5.1 电商转化率优化案例
让我们通过一个真实的电商案例来演示完整的A/B测试结果解读流程。
class ABTestAnalyzer:
"""A/B测试结果分析器"""
def __init__(self, confidence_level=0.95):
self.confidence_level = confidence_level
self.alpha = 1 - confidence_level
self.ci_calculator = ConfidenceIntervalCalculator(confidence_level)
def analyze_conversion_test(self, control_data, treatment_data, test_parameters):
"""分析转化率测试结果"""
# 基本统计量
control_conversions = np.sum(control_data)
control_total = len(control_data)
treatment_conversions = np.sum(treatment_data)
treatment_total = len(treatment_data)
control_rate = control_conversions / control_total
treatment_rate = treatment_conversions / treatment_total
absolute_difference = treatment_rate - control_rate
relative_difference = absolute_difference / control_rate
# 统计检验
from statsmodels.stats.proportion import proportions_ztest
count = [treatment_conversions, control_conversions]
nobs = [treatment_total, control_total]
z_stat, p_value = proportions_ztest(count, nobs)
# 置信区间
control_ci = self.ci_calculator.calculate_proportion_ci(control_conversions, control_total)
treatment_ci = self.ci_calculator.calculate_proportion_ci(treatment_conversions, treatment_total)
# 差异的置信区间 (使用bootstrap)
bootstrap_differences = []
n_bootstrap = 10000
for _ in range(n_bootstrap):
control_bs = np.random.choice(control_data, size=len(control_data), replace=True)
treatment_bs = np.random.choice(treatment_data, size=len(treatment_data), replace=True)
control_rate_bs = np.mean(control_bs)
treatment_rate_bs = np.mean(treatment_bs)
abs_diff_bs = treatment_rate_bs - control_rate_bs
bootstrap_differences.append(abs_diff_bs)
abs_diff_ci_lower = np.percentile(bootstrap_differences, (1-self.confidence_level)/2 * 100)
abs_diff_ci_upper = np.percentile(bootstrap_differences, (1 - (1-self.confidence_level)/2) * 100)
# 相对差异的置信区间
bootstrap_relative_differences = []
for _ in range(n_bootstrap):
control_bs = np.random.choice(control_data, size=len(control_data), replace=True)
treatment_bs = np.random.choice(treatment_data, size=len(treatment_data), replace=True)
control_rate_bs = np.mean(control_bs)
treatment_rate_bs = np.mean(treatment_bs)
if control_rate_bs > 0:
rel_diff_bs = (treatment_rate_bs - control_rate_bs) / control_rate_bs
bootstrap_relative_differences.append(rel_diff_bs)
rel_diff_ci_lower = np.percentile(bootstrap_relative_differences, (1-self.confidence_level)/2 * 100)
rel_diff_ci_upper = np.percentile(bootstrap_relative_differences, (1 - (1-self.confidence_level)/2) * 100)
# 统计功效分析
from statsmodels.stats.power import NormalIndPower
power_analysis = NormalIndPower()
effect_size = self.ci_calculator.cohens_h(control_rate, treatment_rate)
achieved_power = power_analysis.solve_power(
effect_size=effect_size,
nobs1=treatment_total,
alpha=self.alpha,
ratio=control_total/treatment_total
)
# 业务影响评估
business_impact = self.assess_business_impact(
relative_difference, rel_diff_ci_lower, rel_diff_ci_upper,
test_parameters
)
return {
'control': {
'conversions': control_conversions,
'total': control_total,
'rate': control_rate,
'ci_lower': control_ci['ci_lower'],
'ci_upper': control_ci['ci_upper']
},
'treatment': {
'conversions': treatment_conversions,
'total': treatment_total,
'rate': treatment_rate,
'ci_lower': treatment_ci['ci_lower'],
'ci_upper': treatment_ci['ci_upper']
},
'difference': {
'absolute': absolute_difference,
'absolute_ci_lower': abs_diff_ci_lower,
'absolute_ci_upper': abs_diff_ci_upper,
'relative': relative_difference,
'relative_ci_lower': rel_diff_ci_lower,
'relative_ci_upper': rel_diff_ci_upper
},
'statistics': {
'z_statistic': z_stat,
'p_value': p_value,
'significant': p_value < self.alpha,
'effect_size': effect_size,
'achieved_power': achieved_power
},
'business_impact': business_impact
}
def assess_business_impact(self, relative_effect, ci_lower, ci_upper, parameters):
"""评估业务影响"""
min_meaningful_effect = parameters.get('min_meaningful_effect', 0.01)
implementation_cost = parameters.get('implementation_cost', 'low')
strategic_alignment = parameters.get('strategic_alignment', 'medium')
# 确定性评估
if ci_lower > min_meaningful_effect:
certainty = '高'
recommendation = '强烈推荐实施'
elif ci_lower > 0 and ci_upper > min_meaningful_effect:
certainty = '中'
recommendation = '推荐实施,但需要监控'
elif ci_upper < 0:
certainty = '高'
recommendation = '停止,有负面影响风险'
else:
certainty = '低'
recommendation = '继续测试或放弃'
# 预期收益计算
baseline_conversions = parameters.get('baseline_conversions', 1000)
monthly_visitors = parameters.get('monthly_visitors', 100000)
average_order_value = parameters.get('average_order_value', 100)
expected_monthly_improvement = (relative_effect * baseline_conversions * average_order_value)
conservative_estimate = (ci_lower * baseline_conversions * average_order_value)
return {
'certainty': certainty,
'recommendation': recommendation,
'expected_monthly_impact': expected_monthly_improvement,
'conservative_monthly_impact': conservative_estimate,
'min_meaningful_effect': min_meaningful_effect,
'implementation_cost': implementation_cost,
'strategic_alignment': strategic_alignment
}
def generate_comprehensive_report(self, analysis_results, experiment_name):
"""生成综合报告"""
report = [
f"A/B测试综合分析报告: {experiment_name}",
"=" * 60,
f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M')}",
f"置信水平: {self.confidence_level:.0%}",
""
]
# 基本结果
report.append("基本结果:")
report.append(f" 对照组: {analysis_results['control']['conversions']}/{analysis_results['control']['total']} "
f"({analysis_results['control']['rate']:.3f}) "
f"[{analysis_results['control']['ci_lower']:.3f}, {analysis_results['control']['ci_upper']:.3f}]")
report.append(f" 实验组: {analysis_results['treatment']['conversions']}/{analysis_results['treatment']['total']} "
f"({analysis_results['treatment']['rate']:.3f}) "
f"[{analysis_results['treatment']['ci_lower']:.3f}, {analysis_results['treatment']['ci_upper']:.3f}]")
report.append(f" 绝对差异: {analysis_results['difference']['absolute']:.4f} "
f"[{analysis_results['difference']['absolute_ci_lower']:.4f}, {analysis_results['difference']['absolute_ci_upper']:.4f}]")
report.append(f" 相对差异: {analysis_results['difference']['relative']:.2%} "
f"[{analysis_results['difference']['relative_ci_lower']:.2%}, {analysis_results['difference']['relative_ci_upper']:.2%}]")
# 统计检验结果
report.append("\n统计检验:")
report.append(f" Z统计量: {analysis_results['statistics']['z_statistic']:.4f}")
report.append(f" P值: {analysis_results['statistics']['p_value']:.4f}")
report.append(f" 统计显著性: {'是' if analysis_results['statistics']['significant'] else '否'}")
report.append(f" 效应大小 (Cohen\'s h): {analysis_results['statistics']['effect_size']:.4f}")
report.append(f" 统计功效: {analysis_results['statistics']['achieved_power']:.3f}")
# 业务影响
report.append("\n业务影响评估:")
report.append(f" 确定性: {analysis_results['business_impact']['certainty']}")
report.append(f" 推荐决策: {analysis_results['business_impact']['recommendation']}")
report.append(f" 预期月收益: ${analysis_results['business_impact']['expected_monthly_impact']:,.2f}")
report.append(f" 保守月收益: ${analysis_results['business_impact']['conservative_monthly_impact']:,.2f}")
report.append(f" 最小有意义效应: {analysis_results['business_impact']['min_meaningful_effect']:.1%}")
report.append(f" 实施成本: {analysis_results['business_impact']['implementation_cost']}")
report.append(f" 战略对齐: {analysis_results['business_impact']['strategic_alignment']}")
# 建议
report.append("\n建议:")
if analysis_results['business_impact']['certainty'] == '高':
if analysis_results['difference']['relative'] > 0:
report.append(" • 立即实施新版本")
report.append(" • 监控长期效果")
report.append(" • 评估对其他指标的影响")
else:
report.append(" • 停止实验")
report.append(" • 分析负面原因")
report.append(" • 考虑回滚更改")
elif analysis_results['business_impact']['certainty'] == '中':
report.append(" • 考虑延长实验时间")
report.append(" • 收集更多用户反馈")
report.append(" • 评估实施风险")
else:
report.append(" • 需要更多数据")
report.append(" • 重新评估实验设计")
report.append(" • 考虑其他优化方向")
return "\n".join(report)
# 实际案例演示
def ecommerce_case_study():
"""电商案例研究"""
# 模拟电商A/B测试数据
np.random.seed(42)
# 场景:网站结账流程优化
control_data = np.random.binomial(1, 0.08, 5000) # 当前转化率8%
treatment_data = np.random.binomial(1, 0.092, 5000) # 新版本转化率9.2% (15%提升)
# 测试参数
test_parameters = {
'min_meaningful_effect': 0.05, # 5%最小业务意义提升
'baseline_conversions': 8000, # 月转化数
'monthly_visitors': 100000, # 月访问量
'average_order_value': 150, # 平均订单价值
'implementation_cost': 'low',
'strategic_alignment': 'high'
}
# 分析结果
analyzer = ABTestAnalyzer(confidence_level=0.95)
results = analyzer.analyze_conversion_test(control_data, treatment_data, test_parameters)
# 生成报告
report = analyzer.generate_comprehensive_report(results, "结账流程优化测试")
print(report)
# 可视化结果
visualize_ab_test_results(results)
return results
def visualize_ab_test_results(results):
"""可视化A/B测试结果"""
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
# 1. 转化率比较
groups = ['对照组', '实验组']
rates = [results['control']['rate'], results['treatment']['rate']]
ci_lower = [results['control']['ci_lower'], results['treatment']['ci_lower']]
ci_upper = [results['control']['ci_upper'], results['treatment']['ci_upper']]
bars = ax1.bar(groups, rates, yerr=[rates[i]-ci_lower[i] for i in range(2)],
capsize=5, alpha=0.7, color=['blue', 'orange'])
ax1.set_ylabel('转化率')
ax1.set_title('转化率比较')
ax1.grid(True, alpha=0.3)
# 在柱状图上添加数值标签
for i, bar in enumerate(bars):
height = bar.get_height()
ax1.text(bar.get_x() + bar.get_width()/2., height + 0.001,
f'{rates[i]:.3f}', ha='center', va='bottom')
# 2. 相对提升森林图
relative_effect = results['difference']['relative']
ci_lower_rel = results['difference']['relative_ci_lower']
ci_upper_rel = results['difference']['relative_ci_upper']
ax2.axvline(0, color='gray', linestyle='-', alpha=0.3)
ax2.errorbar(relative_effect, 0, xerr=[[abs(relative_effect - ci_lower_rel)],
[abs(ci_upper_rel - relative_effect)]],
fmt='o', capsize=5, color='red', markersize=8)
ax2.set_xlabel('相对提升')
ax2.set_title('相对提升估计')
ax2.set_yticks([])
ax2.grid(True, alpha=0.3)
# 添加业务意义阈值
min_meaningful = results['business_impact']['min_meaningful_effect']
ax2.axvline(min_meaningful, color='green', linestyle='--', alpha=0.7,
label=f'最小业务意义({min_meaningful:.1%})')
ax2.legend()
# 3. 统计显著性可视化
p_value = results['statistics']['p_value']
alpha = 0.05
x = np.linspace(-3, 3, 1000)
y = stats.norm.pdf(x)
ax3.plot(x, y, 'b-', linewidth=2, label='零假设分布')
# 标记观察到的统计量
z_stat = results['statistics']['z_statistic']
ax3.axvline(z_stat, color='red', linestyle='--', linewidth=2,
label=f'观察到的Z值({z_stat:.2f})')
# 标记拒绝域
critical_value = stats.norm.ppf(1 - alpha/2)
x_fill_left = np.linspace(-3, -critical_value, 100)
x_fill_right = np.linspace(critical_value, 3, 100)
ax3.fill_between(x_fill_left, stats.norm.pdf(x_fill_left), alpha=0.3, color='red')
ax3.fill_between(x_fill_right, stats.norm.pdf(x_fill_right), alpha=0.3, color='red')
ax3.set_xlabel('Z统计量')
ax3.set_ylabel('概率密度')
ax3.set_title(f'统计显著性 (P值={p_value:.4f})')
ax3.legend()
ax3.grid(True, alpha=0.3)
# 4. 业务影响评估
categories = ['统计确定性', '预期收益', '实施可行性', '战略对齐']
scores = [0.8 if results['business_impact']['certainty'] == '高' else 0.5,
min(1.0, results['business_impact']['expected_monthly_impact'] / 10000),
0.9 if results['business_impact']['implementation_cost'] == 'low' else 0.5,
0.8 if results['business_impact']['strategic_alignment'] == 'high' else 0.5]
ax4.bar(categories, scores, color=['skyblue', 'lightgreen', 'gold', 'lightcoral'])
ax4.set_ylim(0, 1)
ax4.set_ylabel('评分')
ax4.set_title('业务影响多维度评估')
ax4.grid(True, alpha=0.3)
# 在柱状图上添加评分标签
for i, score in enumerate(scores):
ax4.text(i, score + 0.02, f'{score:.2f}', ha='center', va='bottom')
plt.tight_layout()
plt.show()
# 运行电商案例研究
print("电商A/B测试案例研究")
case_study_results = ecommerce_case_study()
5.2 敏感性分析与稳健性检验
为了确保结果的可靠性,我们需要进行敏感性分析:
def sensitivity_analysis(analysis_results, n_simulations=1000):
"""敏感性分析"""
print("进行敏感性分析...")
# 1. 样本量敏感性
original_sample_size = analysis_results['control']['total']
sample_sizes = [int(original_sample_size * 0.5), original_sample_size,
int(original_sample_size * 1.5), int(original_sample_size * 2)]
sensitivity_results = []
for n in sample_sizes:
# 通过bootstrap重采样模拟不同样本量
np.random.seed(42)
control_rates = []
treatment_rates = []
p_values = []
for _ in range(n_simulations):
control_bootstrap = np.random.choice([0, 1], size=n,
p=[1-analysis_results['control']['rate'],
analysis_results['control']['rate']])
treatment_bootstrap = np.random.choice([0, 1], size=n,
p=[1-analysis_results['treatment']['rate'],
analysis_results['treatment']['rate']])
control_rate = np.mean(control_bootstrap)
treatment_rate = np.mean(treatment_bootstrap)
from statsmodels.stats.proportion import proportions_ztest
count = [np.sum(treatment_bootstrap), np.sum(control_bootstrap)]
nobs = [n, n]
_, p_value = proportions_ztest(count, nobs)
control_rates.append(control_rate)
treatment_rates.append(treatment_rate)
p_values.append(p_value)
significant_proportion = np.mean(np.array(p_values) < 0.05)
sensitivity_results.append({
'sample_size': n,
'mean_control_rate': np.mean(control_rates),
'mean_treatment_rate': np.mean(treatment_rates),
'significant_proportion': significant_proportion,
'power': significant_proportion
})
# 可视化敏感性分析结果
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# 样本量对功效的影响
sample_sizes_plot = [r['sample_size'] for r in sensitivity_results]
power_plot = [r['power'] for r in sensitivity_results]
ax1.plot(sample_sizes_plot, power_plot, 'bo-', linewidth=2, markersize=8)
ax1.axhline(0.8, color='red', linestyle='--', label='期望功效(80%)')
ax1.set_xlabel('样本量')
ax1.set_ylabel('统计功效')
ax1.set_title('样本量对统计功效的影响')
ax1.legend()
ax1.grid(True, alpha=0.3)
# 2. 异常值敏感性
# 模拟不同异常值比例的影响
outlier_ratios = [0.00, 0.01, 0.05, 0.10]
outlier_results = []
base_control_rate = analysis_results['control']['rate']
base_treatment_rate = analysis_results['treatment']['rate']
for outlier_ratio in outlier_ratios:
# 在数据中引入异常值
n_outliers = int(original_sample_size * outlier_ratio)
n_normal = original_sample_size - n_outliers
# 正常数据
control_normal = np.random.binomial(1, base_control_rate, n_normal)
treatment_normal = np.random.binomial(1, base_treatment_rate, n_normal)
# 异常值(极端高转化)
control_outliers = np.ones(n_outliers)
treatment_outliers = np.ones(n_outliers)
# 合并数据
control_with_outliers = np.concatenate([control_normal, control_outliers])
treatment_with_outliers = np.concatenate([treatment_normal, treatment_outliers])
# 重新分析
from statsmodels.stats.proportion import proportions_ztest
count = [np.sum(treatment_with_outliers), np.sum(control_with_outliers)]
nobs = [len(treatment_with_outliers), len(control_with_outliers)]
_, p_value = proportions_ztest(count, nobs)
control_rate_outlier = np.mean(control_with_outliers)
treatment_rate_outlier = np.mean(treatment_with_outliers)
relative_effect_outlier = (treatment_rate_outlier - control_rate_outlier) / control_rate_outlier
outlier_results.append({
'outlier_ratio': outlier_ratio,
'p_value': p_value,
'relative_effect': relative_effect_outlier,
'control_rate': control_rate_outlier,
'treatment_rate': treatment_rate_outlier
})
outlier_ratios_plot = [r['outlier_ratio'] for r in outlier_results]
p_values_plot = [r['p_value'] for r in outlier_results]
effects_plot = [r['relative_effect'] for r in outlier_results]
ax2.plot(outlier_ratios_plot, p_values_plot, 'ro-', linewidth=2, markersize=8, label='P值')
ax2_twin = ax2.twinx()
ax2_twin.plot(outlier_ratios_plot, effects_plot, 'go--', linewidth=2, markersize=8, label='相对效应')
ax2.axhline(0.05, color='red', linestyle=':', alpha=0.7, label='显著性阈值')
ax2.set_xlabel('异常值比例')
ax2.set_ylabel('P值')
ax2_twin.set_ylabel('相对效应')
ax2.set_title('异常值对结果的影响')
# 合并图例
lines1, labels1 = ax2.get_legend_handles_labels()
lines2, labels2 = ax2_twin.get_legend_handles_labels()
ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper left')
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print("\n敏感性分析总结:")
print(f"原始样本量: {original_sample_size}")
print(f"原始P值: {analysis_results['statistics']['p_value']:.4f}")
print(f"原始相对效应: {analysis_results['difference']['relative']:.2%}")
print(f"\n样本量敏感性:")
for result in sensitivity_results:
print(f" 样本量 {result['sample_size']}: 功效 = {result['power']:.3f}")
print(f"\n异常值敏感性:")
for result in outlier_results:
print(f" 异常值比例 {result['outlier_ratio']:.0%}: P值 = {result['p_value']:.4f}, "
f"相对效应 = {result['relative_effect']:.2%}")
return sensitivity_results, outlier_results
# 运行敏感性分析
sensitivity_results, outlier_results = sensitivity_analysis(case_study_results)
【声明】本内容来自华为云开发者社区博主,不代表华为云及华为云开发者社区的观点和立场。转载时必须标注文章的来源(华为云社区)、文章链接、文章作者等基本信息,否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱:
cloudbbs@huaweicloud.com
- 点赞
- 收藏
- 关注作者
评论(0)