2026-03-29 23:47:20 +08:00

808 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
模拟数据生成模块
本模块提供用于测试因果推断的模拟数据生成功能,支持多种数据生成场景:
- 简单 ATE 场景
- 协变量场景
- 交互效应场景
- CATE 场景
作者CausalInferenceAgent
版本1.0.0
"""
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any, Union
from pathlib import Path
class BaseDataSimulator(ABC):
    """Abstract base class shared by all data simulators.

    Stores the common simulation settings and declares the interface
    (``generate`` and ``get_description``) that concrete simulators implement.
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        random_state: Optional[int] = None
    ):
        """Validate and store the common simulation settings.

        Args:
            n_samples: Number of samples; must not be negative.
            treatment_rate: Treatment proportion; must lie within [0, 1].
            noise_level: Noise level; must not be negative.
            effect_size: Effect size.
            random_state: Optional seed used for reproducibility.

        Raises:
            ValueError: If ``n_samples`` or ``noise_level`` is negative, or if
                ``treatment_rate`` is outside [0, 1] (NaN included).
        """
        # Fail fast with a clear message instead of surfacing an opaque NumPy
        # error later, when a subclass draws samples with these settings.
        if n_samples < 0:
            raise ValueError(f"n_samples 必须为非负数,当前值:{n_samples}")
        if not 0 <= treatment_rate <= 1:
            raise ValueError(
                f"treatment_rate 必须在 0 到 1 之间,当前值:{treatment_rate}"
            )
        if noise_level < 0:
            raise ValueError(f"noise_level 必须为非负数,当前值:{noise_level}")

        self.n_samples = n_samples
        self.treatment_rate = treatment_rate
        self.noise_level = noise_level
        self.effect_size = effect_size
        self.random_state = random_state

        # NOTE: this seeds NumPy's *global* random state, and only once, at
        # construction time. Any use of np.random between construction and a
        # later draw therefore changes what that draw produces.
        if random_state is not None:
            np.random.seed(random_state)

    @abstractmethod
    def generate(self) -> pd.DataFrame:
        """Generate the simulated data.

        Returns:
            A DataFrame containing the simulated data.
        """
        pass

    def save_to_csv(self, filepath: Union[str, Path]) -> None:
        """Generate a dataset and write it to a CSV file.

        Each call invokes ``generate()`` again before writing.

        Args:
            filepath: Destination file path.
        """
        df = self.generate()
        df.to_csv(filepath, index=False, encoding='utf-8')

    def save_to_excel(self, filepath: Union[str, Path]) -> None:
        """Generate a dataset and write it to an Excel file.

        Each call invokes ``generate()`` again before writing.

        Args:
            filepath: Destination file path.
        """
        df = self.generate()
        df.to_excel(filepath, index=False)

    @abstractmethod
    def get_description(self) -> str:
        """Describe the data-generating scenario.

        Returns:
            The description string.
        """
        pass
class SimpleATESimulator(BaseDataSimulator):
    """Simulator for the simple average-treatment-effect (ATE) scenario.

    Produces a single treatment variable ``T`` and a single outcome ``Y``
    following ``Y = intercept + effect_size * T + noise``.
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        intercept: float = 0.0,
        random_state: Optional[int] = None
    ):
        """Set up the simple ATE simulator.

        Args:
            n_samples: Number of samples.
            treatment_rate: Treatment proportion.
            noise_level: Noise level (standard deviation of the noise term).
            effect_size: Average treatment effect.
            intercept: Intercept term of the outcome model.
            random_state: Optional random seed.
        """
        common_settings = dict(
            n_samples=n_samples,
            treatment_rate=treatment_rate,
            noise_level=noise_level,
            effect_size=effect_size,
            random_state=random_state,
        )
        super().__init__(**common_settings)
        self.intercept = intercept

    def generate(self) -> pd.DataFrame:
        """Draw one dataset for the simple ATE scenario.

        Returns:
            A DataFrame with columns 'T' (treatment) and 'Y' (outcome).
        """
        size = self.n_samples
        # Bernoulli treatment assignment first, Gaussian noise second.
        treatment = np.random.binomial(1, self.treatment_rate, size)
        epsilon = np.random.normal(0, self.noise_level, size)
        outcome = self.intercept + self.effect_size * treatment + epsilon
        return pd.DataFrame({'T': treatment, 'Y': outcome})

    def get_description(self) -> str:
        """Return a multi-line description of the scenario settings."""
        lines = [
            "简单 ATE 场景:",
            f"- 样本数量:{self.n_samples}",
            f"- 处理比例:{self.treatment_rate:.2%}",
            f"- 噪声水平:{self.noise_level}",
            f"- 平均处理效应:{self.effect_size}",
            f"- 截距项:{self.intercept}",
            f"模型Y = {self.intercept} + {self.effect_size}*T + ε",
        ]
        return "\n".join(lines)
class CovariateSimulator(BaseDataSimulator):
    """Simulator for the covariate scenario.

    Generates covariates ``X`` alongside the treatment ``T`` and outcome ``Y``
    following ``Y = beta . X + effect_size * T + noise``. There is no
    intercept term, and ``T`` is drawn independently of ``X``.
    """

    # Distribution names understood by ``_generate_covariates``.
    _SUPPORTED_DISTRIBUTIONS = ('normal', 'uniform', 'beta')

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        n_covariates: int = 3,
        covariate_distribution: str = 'normal',
        covariate_params: Optional[Dict[str, Any]] = None,
        random_state: Optional[int] = None
    ):
        """Set up the covariate simulator.

        Args:
            n_samples: Number of samples.
            treatment_rate: Treatment proportion.
            noise_level: Noise level (standard deviation of the noise term).
            effect_size: Treatment effect size.
            n_covariates: Number of covariates.
            covariate_distribution: One of 'normal', 'uniform' or 'beta'.
            covariate_params: Distribution parameters. Recognised keys are
                'mean'/'std' (normal), 'low'/'high' (uniform) and 'a'/'b'
                (beta); missing keys fall back to defaults.
            random_state: Optional random seed.

        Raises:
            ValueError: If ``covariate_distribution`` is not supported.
        """
        super().__init__(
            n_samples=n_samples,
            treatment_rate=treatment_rate,
            noise_level=noise_level,
            effect_size=effect_size,
            random_state=random_state
        )
        # Fail fast: reject an unknown distribution at construction time
        # rather than on the first generate() call.
        if covariate_distribution not in self._SUPPORTED_DISTRIBUTIONS:
            raise ValueError(f"未知的协变量分布类型:{covariate_distribution}")
        self.n_covariates = n_covariates
        self.covariate_distribution = covariate_distribution
        self.covariate_params = covariate_params or {}

    def _generate_covariates(self) -> np.ndarray:
        """Draw the covariate matrix.

        Returns:
            An array of shape (n_samples, n_covariates).

        Raises:
            ValueError: If ``covariate_distribution`` is not supported.
        """
        shape = (self.n_samples, self.n_covariates)
        params = self.covariate_params
        if self.covariate_distribution == 'normal':
            return np.random.normal(
                params.get('mean', 0), params.get('std', 1), shape
            )
        if self.covariate_distribution == 'uniform':
            return np.random.uniform(
                params.get('low', -1), params.get('high', 1), shape
            )
        if self.covariate_distribution == 'beta':
            return np.random.beta(params.get('a', 2), params.get('b', 2), shape)
        # Still reachable when the attribute is reassigned after construction.
        raise ValueError(f"未知的协变量分布类型:{self.covariate_distribution}")

    def generate(self) -> pd.DataFrame:
        """Draw one dataset for the covariate scenario.

        Returns:
            A DataFrame with columns 'X0'..'X{k-1}', 'T' and 'Y'.
        """
        # Draw order (covariates, treatment, noise, coefficients) is kept
        # fixed so that seeded runs stay reproducible.
        X = self._generate_covariates()
        T = np.random.binomial(1, self.treatment_rate, self.n_samples)
        noise = np.random.normal(0, self.noise_level, self.n_samples)
        # Covariate coefficients are redrawn on every call.
        beta = np.random.uniform(-1, 1, self.n_covariates)

        Y = np.dot(X, beta) + self.effect_size * T + noise

        columns = {f'X{i}': X[:, i] for i in range(self.n_covariates)}
        columns['T'] = T
        columns['Y'] = Y
        return pd.DataFrame(columns)

    def get_description(self) -> str:
        """Return a multi-line description of the scenario settings."""
        return (
            f"协变量场景:\n"
            f"- 样本数量:{self.n_samples}\n"
            f"- 处理比例:{self.treatment_rate:.2%}\n"
            f"- 噪声水平:{self.noise_level}\n"
            f"- 处理效应:{self.effect_size}\n"
            f"- 协变量数量:{self.n_covariates}\n"
            f"- 协变量分布:{self.covariate_distribution}\n"
            f"模型Y = β*X + {self.effect_size}*T + ε"
        )
class InteractionEffectSimulator(BaseDataSimulator):
    """Simulator for the interaction-effect scenario.

    The treatment effect varies with the covariates:
    ``Y = beta . X + effect_size * T + (gamma . X) * T + noise``,
    where ``gamma`` holds the interaction coefficients. There is no
    intercept term.
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        interaction_strength: float = 0.5,
        n_covariates: int = 2,
        random_state: Optional[int] = None
    ):
        """Set up the interaction-effect simulator.

        Args:
            n_samples: Number of samples.
            treatment_rate: Treatment proportion.
            noise_level: Noise level (standard deviation of the noise term).
            effect_size: Base treatment effect.
            interaction_strength: Bound on the magnitude of the interaction
                coefficients, which are drawn uniformly from
                [-|interaction_strength|, |interaction_strength|].
            n_covariates: Number of covariates.
            random_state: Optional random seed.
        """
        super().__init__(
            n_samples=n_samples,
            treatment_rate=treatment_rate,
            noise_level=noise_level,
            effect_size=effect_size,
            random_state=random_state
        )
        self.interaction_strength = interaction_strength
        self.n_covariates = n_covariates

    def generate(self) -> pd.DataFrame:
        """Draw one dataset for the interaction-effect scenario.

        Returns:
            A DataFrame with columns 'X0'..'X{k-1}', 'T' and 'Y'.
        """
        # Draw order (covariates, treatment, noise, beta, gamma) is kept
        # fixed so that seeded runs stay reproducible.
        X = np.random.normal(0, 1, (self.n_samples, self.n_covariates))
        T = np.random.binomial(1, self.treatment_rate, self.n_samples)
        noise = np.random.normal(0, self.noise_level, self.n_samples)
        beta = np.random.uniform(-0.5, 0.5, self.n_covariates)

        # The interaction coefficients honour ``interaction_strength``; this
        # range used to be hard-coded to 0.5, so the parameter had no effect.
        # With the default of 0.5 the draws are the same as before.
        bound = abs(self.interaction_strength)
        gamma = np.random.uniform(-bound, bound, self.n_covariates)

        # Y = beta.X + tau*T + (gamma.X)*T + noise
        Y = (
            np.dot(X, beta)
            + self.effect_size * T
            + np.dot(X, gamma) * T
            + noise
        )

        columns = {f'X{i}': X[:, i] for i in range(self.n_covariates)}
        columns['T'] = T
        columns['Y'] = Y
        return pd.DataFrame(columns)

    def get_description(self) -> str:
        """Return a multi-line description of the scenario settings."""
        return (
            f"交互效应场景:\n"
            f"- 样本数量:{self.n_samples}\n"
            f"- 处理比例:{self.treatment_rate:.2%}\n"
            f"- 噪声水平:{self.noise_level}\n"
            f"- 基础处理效应:{self.effect_size}\n"
            f"- 交互效应强度:{self.interaction_strength}\n"
            f"- 协变量数量:{self.n_covariates}\n"
            f"模型Y = β*X + {self.effect_size}*T + γ*X*T + ε"
        )
class CATESimulator(BaseDataSimulator):
    """Simulator for the conditional average treatment effect (CATE) scenario.

    The treatment effect depends on individual features:
    ``Y = beta . X + tau(X) * T + noise``, where ``tau(X)`` is selected by
    ``cate_function``.
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        n_features: int = 5,
        cate_function: str = 'linear',
        random_state: Optional[int] = None
    ):
        """Set up the CATE simulator.

        Args:
            n_samples: Number of samples.
            treatment_rate: Treatment proportion.
            noise_level: Noise level (standard deviation of the noise term).
            effect_size: Baseline treatment effect added to every CATE value.
            n_features: Number of features.
            cate_function: CATE form: 'linear', 'nonlinear' or 'threshold'.
            random_state: Optional random seed.
        """
        common_settings = dict(
            n_samples=n_samples,
            treatment_rate=treatment_rate,
            noise_level=noise_level,
            effect_size=effect_size,
            random_state=random_state,
        )
        super().__init__(**common_settings)
        self.n_features = n_features
        self.cate_function = cate_function

    def _compute_cate(self, X: np.ndarray) -> np.ndarray:
        """Evaluate the conditional treatment effect for each row of ``X``.

        Args:
            X: Feature matrix of shape (n_samples, n_features).

        Returns:
            Array of conditional treatment effects.

        Raises:
            ValueError: If ``cate_function`` is not a known type.
        """
        kind = self.cate_function
        baseline = self.effect_size

        if kind == 'linear':
            # tau(X) = tau0 + w.X
            weights = np.random.uniform(-0.5, 0.5, self.n_features)
            return baseline + np.dot(X, weights)

        if kind == 'nonlinear':
            # tau(X) = tau0 + sin(w.X)
            weights = np.random.uniform(-1, 1, self.n_features)
            return baseline + np.sin(np.dot(X, weights))

        if kind == 'threshold':
            # tau(X) = tau0 + (w.X) * I(w.X > cutoff)
            weights = np.random.uniform(-0.5, 0.5, self.n_features)
            cutoff = np.random.uniform(-1, 1)
            score = np.dot(X, weights)
            return baseline + score * (score > cutoff).astype(float)

        raise ValueError(f"未知的 CATE 函数类型:{self.cate_function}")

    def generate(self) -> pd.DataFrame:
        """Draw one dataset for the CATE scenario.

        Returns:
            A DataFrame with columns 'X0'..'X{k-1}', 'T', 'Y' and 'CATE'
            (the true per-row effect, included for validation).
        """
        n_rows, n_cols = self.n_samples, self.n_features

        # Draw order: features, treatment, noise, CATE weights, outcome weights.
        features = np.random.normal(0, 1, (n_rows, n_cols))
        treatment = np.random.binomial(1, self.treatment_rate, n_rows)
        epsilon = np.random.normal(0, self.noise_level, n_rows)
        tau = self._compute_cate(features)
        outcome_weights = np.random.uniform(-0.5, 0.5, n_cols)

        outcome = np.dot(features, outcome_weights) + tau * treatment + epsilon

        columns = {f'X{j}': features[:, j] for j in range(n_cols)}
        columns.update({'T': treatment, 'Y': outcome, 'CATE': tau})
        return pd.DataFrame(columns)

    def get_description(self) -> str:
        """Return a multi-line description of the scenario settings."""
        lines = [
            "CATE 场景:",
            f"- 样本数量:{self.n_samples}",
            f"- 处理比例:{self.treatment_rate:.2%}",
            f"- 噪声水平:{self.noise_level}",
            f"- 平均处理效应基准:{self.effect_size}",
            f"- 特征数量:{self.n_features}",
            f"- CATE 函数类型:{self.cate_function}",
            "模型Y = β*X + τ(X)*T + ε",
        ]
        return "\n".join(lines)
class DataSimulatorFactory:
    """Factory that creates simulator instances by type name."""

    # Registry mapping each type name to its simulator class.
    _simulators = {
        'simple_ate': SimpleATESimulator,
        'covariate': CovariateSimulator,
        'interaction': InteractionEffectSimulator,
        'cate': CATESimulator
    }

    @classmethod
    def create(cls, simulator_type: str, **kwargs) -> BaseDataSimulator:
        """Create a simulator of the requested type.

        Args:
            simulator_type: One of 'simple_ate', 'covariate', 'interaction'
                or 'cate'.
            **kwargs: Forwarded unchanged to the simulator's constructor.

        Returns:
            The new simulator instance.

        Raises:
            ValueError: If ``simulator_type`` is not registered.
        """
        try:
            simulator_cls = cls._simulators[simulator_type]
        except KeyError:
            available_types = ', '.join(cls._simulators)
            # The two parts are separated by a full stop; previously they ran
            # together into a single unreadable token.
            raise ValueError(
                f"未知的模拟器类型:{simulator_type}。"
                f"可用类型:{available_types}"
            ) from None
        return simulator_cls(**kwargs)

    @classmethod
    def list_available_types(cls) -> list:
        """Return the names of all registered simulator types.

        Returns:
            A list of type names, in registration order.
        """
        return list(cls._simulators)
# 便捷函数
def generate_simple_ate_data(
    n_samples: int = 1000,
    treatment_rate: float = 0.5,
    noise_level: float = 1.0,
    effect_size: float = 1.0,
    intercept: float = 0.0,
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """Generate simple-ATE data in one call (convenience wrapper).

    Builds a ``SimpleATESimulator`` with the given settings and returns the
    result of a single ``generate()`` call.

    Args:
        n_samples: Number of samples.
        treatment_rate: Treatment proportion.
        noise_level: Noise level.
        effect_size: Average treatment effect.
        intercept: Intercept term.
        random_state: Optional random seed.

    Returns:
        A DataFrame with columns 'T' and 'Y'.
    """
    settings = dict(
        n_samples=n_samples,
        treatment_rate=treatment_rate,
        noise_level=noise_level,
        effect_size=effect_size,
        intercept=intercept,
        random_state=random_state,
    )
    return SimpleATESimulator(**settings).generate()
def generate_covariate_data(
    n_samples: int = 1000,
    treatment_rate: float = 0.5,
    noise_level: float = 1.0,
    effect_size: float = 1.0,
    n_covariates: int = 3,
    covariate_distribution: str = 'normal',
    random_state: Optional[int] = None,
    covariate_params: Optional[Dict[str, Any]] = None
) -> pd.DataFrame:
    """Generate covariate-scenario data in one call (convenience wrapper).

    Builds a ``CovariateSimulator`` with the given settings and returns the
    result of a single ``generate()`` call.

    Args:
        n_samples: Number of samples.
        treatment_rate: Treatment proportion.
        noise_level: Noise level.
        effect_size: Treatment effect.
        n_covariates: Number of covariates.
        covariate_distribution: Covariate distribution type.
        random_state: Optional random seed.
        covariate_params: Optional distribution parameters forwarded to
            ``CovariateSimulator``. Placed last so existing positional
            callers keep working.

    Returns:
        A DataFrame with the covariate columns plus 'T' and 'Y'.
    """
    simulator = CovariateSimulator(
        n_samples=n_samples,
        treatment_rate=treatment_rate,
        noise_level=noise_level,
        effect_size=effect_size,
        n_covariates=n_covariates,
        covariate_distribution=covariate_distribution,
        covariate_params=covariate_params,
        random_state=random_state
    )
    return simulator.generate()
def generate_interaction_data(
    n_samples: int = 1000,
    treatment_rate: float = 0.5,
    noise_level: float = 1.0,
    effect_size: float = 1.0,
    interaction_strength: float = 0.5,
    n_covariates: int = 2,
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """Generate interaction-effect data in one call (convenience wrapper).

    Builds an ``InteractionEffectSimulator`` with the given settings and
    returns the result of a single ``generate()`` call.

    Args:
        n_samples: Number of samples.
        treatment_rate: Treatment proportion.
        noise_level: Noise level.
        effect_size: Base treatment effect.
        interaction_strength: Interaction-effect strength.
        n_covariates: Number of covariates.
        random_state: Optional random seed.

    Returns:
        A DataFrame with the covariate columns plus 'T' and 'Y'.
    """
    settings = dict(
        n_samples=n_samples,
        treatment_rate=treatment_rate,
        noise_level=noise_level,
        effect_size=effect_size,
        interaction_strength=interaction_strength,
        n_covariates=n_covariates,
        random_state=random_state,
    )
    return InteractionEffectSimulator(**settings).generate()
def generate_cate_data(
    n_samples: int = 1000,
    treatment_rate: float = 0.5,
    noise_level: float = 1.0,
    effect_size: float = 1.0,
    n_features: int = 5,
    cate_function: str = 'linear',
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """Generate CATE-scenario data in one call (convenience wrapper).

    Builds a ``CATESimulator`` with the given settings and returns the result
    of a single ``generate()`` call.

    Args:
        n_samples: Number of samples.
        treatment_rate: Treatment proportion.
        noise_level: Noise level.
        effect_size: Baseline average treatment effect.
        n_features: Number of features.
        cate_function: CATE function type.
        random_state: Optional random seed.

    Returns:
        A DataFrame with the feature columns plus 'T', 'Y' and 'CATE'.
    """
    settings = dict(
        n_samples=n_samples,
        treatment_rate=treatment_rate,
        noise_level=noise_level,
        effect_size=effect_size,
        n_features=n_features,
        cate_function=cate_function,
        random_state=random_state,
    )
    return CATESimulator(**settings).generate()
# 示例用法
if __name__ == '__main__':
    # Demo script: exercises every simulator, the factory, file export and
    # one convenience function, printing a short summary for each step.
    print("=" * 60)
    print("模拟数据生成模块示例")
    print("=" * 60)

    # 1. Simple ATE scenario
    print("\n1. 简单 ATE 场景")
    print("-" * 40)
    simple_simulator = SimpleATESimulator(
        n_samples=500,
        treatment_rate=0.5,
        noise_level=1.0,
        effect_size=2.0,
        intercept=5.0,
        random_state=42
    )
    print(simple_simulator.get_description())
    simple_df = simple_simulator.generate()
    print(f"\n数据形状:{simple_df.shape}")
    print(f"前 5 行:\n{simple_df.head()}")

    # 2. Covariate scenario
    print("\n\n2. 协变量场景")
    print("-" * 40)
    covariate_simulator = CovariateSimulator(
        n_samples=500,
        treatment_rate=0.4,
        noise_level=1.5,
        effect_size=1.5,
        n_covariates=3,
        covariate_distribution='normal',
        random_state=42
    )
    print(covariate_simulator.get_description())
    covariate_df = covariate_simulator.generate()
    print(f"\n数据形状:{covariate_df.shape}")
    print(f"前 5 行:\n{covariate_df.head()}")

    # 3. Interaction-effect scenario
    print("\n\n3. 交互效应场景")
    print("-" * 40)
    interaction_simulator = InteractionEffectSimulator(
        n_samples=500,
        treatment_rate=0.5,
        noise_level=1.0,
        effect_size=1.0,
        interaction_strength=0.5,
        n_covariates=2,
        random_state=42
    )
    print(interaction_simulator.get_description())
    interaction_df = interaction_simulator.generate()
    print(f"\n数据形状:{interaction_df.shape}")
    print(f"前 5 行:\n{interaction_df.head()}")

    # 4. CATE scenario
    print("\n\n4. CATE 场景")
    print("-" * 40)
    cate_simulator = CATESimulator(
        n_samples=500,
        treatment_rate=0.5,
        noise_level=1.0,
        effect_size=1.5,
        n_features=4,
        cate_function='linear',
        random_state=42
    )
    print(cate_simulator.get_description())
    cate_df = cate_simulator.generate()
    print(f"\n数据形状:{cate_df.shape}")
    print(f"前 5 行:\n{cate_df.head()}")

    # 5. Create a simulator through the factory
    print("\n\n5. 使用工厂创建模拟器")
    print("-" * 40)
    print(f"可用模拟器类型:{DataSimulatorFactory.list_available_types()}")
    factory_simulator = DataSimulatorFactory.create(
        'simple_ate',
        n_samples=300,
        treatment_rate=0.6,
        effect_size=2.5,
        random_state=42
    )
    print(factory_simulator.get_description())

    # 6. Save data to disk
    print("\n\n6. 保存数据示例")
    print("-" * 40)
    output_dir = Path('data/output')
    # parents=True: 'data/' itself may not exist yet; without it mkdir raises
    # FileNotFoundError on a fresh checkout.
    output_dir.mkdir(parents=True, exist_ok=True)

    csv_path = output_dir / 'simple_ate_data.csv'
    simple_simulator.save_to_csv(csv_path)
    print(f"已保存:{csv_path}")

    excel_path = output_dir / 'simple_ate_data.xlsx'
    try:
        simple_simulator.save_to_excel(excel_path)
    except ImportError as exc:
        # pandas needs an optional Excel engine; skip the export rather than
        # abort the rest of the demo when it is not installed.
        print(f"跳过 Excel 导出(缺少可选依赖):{exc}")
    else:
        print(f"已保存:{excel_path}")

    # 7. Use a convenience function
    print("\n\n7. 使用便捷函数")
    print("-" * 40)
    simple_df = generate_simple_ate_data(n_samples=200, effect_size=3.0, random_state=123)
    print(f"便捷函数生成数据形状:{simple_df.shape}")
    print(f"前 5 行:\n{simple_df.head()}")

    print("\n" + "=" * 60)
    print("示例完成!")
    print("=" * 60)