"""Simulated-data generation module.

Provides synthetic data sets for exercising causal-inference code. Supported
data-generating scenarios:

- simple ATE (average treatment effect)
- covariates
- interaction effects
- CATE (conditional average treatment effect)

Author: CausalInferenceAgent
Version: 1.0.0
"""

from abc import ABC, abstractmethod
from pathlib import Path
from typing import Optional, Dict, Any, Union

import numpy as np
import pandas as pd


class BaseDataSimulator(ABC):
    """Abstract base class defining the interface shared by all simulators.

    Subclasses implement :meth:`generate` and :meth:`get_description`. The
    base class stores the common parameters, owns the random source, and
    offers CSV / Excel export.
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        random_state: Optional[int] = None
    ):
        """Store the common simulation parameters and set up the random source.

        Args:
            n_samples: Number of rows to generate. Defaults to 1000.
            treatment_rate: Probability of treatment (between 0 and 1).
                Defaults to 0.5.
            noise_level: Standard deviation of the Gaussian noise added to the
                outcome. Defaults to 1.0.
            effect_size: Size of the treatment effect. Defaults to 1.0.
            random_state: Seed for reproducibility. ``None`` means "use
                NumPy's global generator".
        """
        self.n_samples = n_samples
        self.treatment_rate = treatment_rate
        self.noise_level = noise_level
        self.effect_size = effect_size
        self.random_state = random_state

        # A private legacy RandomState yields exactly the stream that
        # ``np.random.seed(random_state)`` followed by ``np.random.*`` calls
        # would, but without reseeding the process-wide generator. Seeding the
        # global state here would let the construction of one simulator change
        # the data produced by another. Without a seed we keep drawing from
        # the global generator so callers who seed it themselves see no change.
        if random_state is None:
            self._rng = np.random
        else:
            self._rng = np.random.RandomState(random_state)

    def _draw_treatment(self) -> np.ndarray:
        """Draw the Bernoulli(treatment_rate) treatment indicator."""
        return self._rng.binomial(1, self.treatment_rate, self.n_samples)

    def _draw_noise(self) -> np.ndarray:
        """Draw zero-mean Gaussian outcome noise with std ``noise_level``."""
        return self._rng.normal(0, self.noise_level, self.n_samples)

    @staticmethod
    def _build_frame(X: np.ndarray, T: np.ndarray, Y: np.ndarray) -> pd.DataFrame:
        """Assemble columns ``X0..X{k-1}``, ``T`` and ``Y`` into a DataFrame."""
        columns = {f'X{i}': X[:, i] for i in range(X.shape[1])}
        columns['T'] = T
        columns['Y'] = Y
        return pd.DataFrame(columns)

    @abstractmethod
    def generate(self) -> pd.DataFrame:
        """Generate one simulated data set.

        Returns:
            A DataFrame containing the simulated data.
        """
        pass

    def save_to_csv(self, filepath: Union[str, Path]) -> None:
        """Generate a data set and write it to a UTF-8 CSV file.

        Note that this calls :meth:`generate`, so each save writes a freshly
        drawn data set.

        Args:
            filepath: Destination path.
        """
        df = self.generate()
        df.to_csv(filepath, index=False, encoding='utf-8')

    def save_to_excel(self, filepath: Union[str, Path]) -> None:
        """Generate a data set and write it to an Excel file.

        Note that this calls :meth:`generate`, so each save writes a freshly
        drawn data set.

        Args:
            filepath: Destination path.
        """
        df = self.generate()
        df.to_excel(filepath, index=False)

    @abstractmethod
    def get_description(self) -> str:
        """Describe the data-generating scenario.

        Returns:
            A human-readable description string.
        """
        pass


class SimpleATESimulator(BaseDataSimulator):
    """Simple ATE (average treatment effect) scenario.

    Generates data with a single treatment variable T and a single outcome Y.

    Model: Y = α + τ*T + ε, where τ is the average treatment effect.
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        intercept: float = 0.0,
        random_state: Optional[int] = None
    ):
        """Initialise the simple ATE simulator.

        Args:
            n_samples: Number of rows to generate.
            treatment_rate: Probability of treatment.
            noise_level: Standard deviation of the outcome noise.
            effect_size: Average treatment effect τ.
            intercept: Intercept α.
            random_state: Seed for reproducibility.
        """
        super().__init__(
            n_samples=n_samples,
            treatment_rate=treatment_rate,
            noise_level=noise_level,
            effect_size=effect_size,
            random_state=random_state
        )
        self.intercept = intercept

    def generate(self) -> pd.DataFrame:
        """Generate data for the simple ATE scenario.

        Returns:
            A DataFrame with columns 'T' (treatment) and 'Y' (outcome).
        """
        # Draw order (treatment, then noise) is kept fixed so a given seed
        # always reproduces the same data.
        T = self._draw_treatment()
        noise = self._draw_noise()

        # Y = intercept + effect_size * T + noise
        Y = self.intercept + self.effect_size * T + noise

        return pd.DataFrame({'T': T, 'Y': Y})

    def get_description(self) -> str:
        """Return the scenario description."""
        return (
            f"简单 ATE 场景:\n"
            f"- 样本数量:{self.n_samples}\n"
            f"- 处理比例:{self.treatment_rate:.2%}\n"
            f"- 噪声水平:{self.noise_level}\n"
            f"- 平均处理效应:{self.effect_size}\n"
            f"- 截距项:{self.intercept}\n"
            f"模型:Y = {self.intercept} + {self.effect_size}*T + ε"
        )


class CovariateSimulator(BaseDataSimulator):
    """Covariate scenario.

    Generates data that includes covariates X alongside the treatment.

    Model: Y = β*X + τ*T + ε (no intercept term is added).
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        n_covariates: int = 3,
        covariate_distribution: str = 'normal',
        covariate_params: Optional[Dict[str, Any]] = None,
        random_state: Optional[int] = None
    ):
        """Initialise the covariate simulator.

        Args:
            n_samples: Number of rows to generate.
            treatment_rate: Probability of treatment.
            noise_level: Standard deviation of the outcome noise.
            effect_size: Treatment effect τ.
            n_covariates: Number of covariate columns.
            covariate_distribution: One of 'normal', 'uniform', 'beta'.
            covariate_params: Distribution parameters. Recognised keys are
                'mean'/'std' (normal), 'low'/'high' (uniform) and 'a'/'b'
                (beta); missing keys fall back to defaults.
            random_state: Seed for reproducibility.
        """
        super().__init__(
            n_samples=n_samples,
            treatment_rate=treatment_rate,
            noise_level=noise_level,
            effect_size=effect_size,
            random_state=random_state
        )
        self.n_covariates = n_covariates
        self.covariate_distribution = covariate_distribution
        self.covariate_params = covariate_params or {}

    def _generate_covariates(self) -> np.ndarray:
        """Draw the covariate matrix from the configured distribution.

        Returns:
            Array of shape (n_samples, n_covariates).

        Raises:
            ValueError: If ``covariate_distribution`` is not recognised.
        """
        params = self.covariate_params
        shape = (self.n_samples, self.n_covariates)

        if self.covariate_distribution == 'normal':
            return self._rng.normal(params.get('mean', 0), params.get('std', 1), shape)
        if self.covariate_distribution == 'uniform':
            return self._rng.uniform(params.get('low', -1), params.get('high', 1), shape)
        if self.covariate_distribution == 'beta':
            return self._rng.beta(params.get('a', 2), params.get('b', 2), shape)

        raise ValueError(f"未知的协变量分布类型:{self.covariate_distribution}")

    def generate(self) -> pd.DataFrame:
        """Generate data for the covariate scenario.

        Returns:
            A DataFrame with covariates 'X0'..., treatment 'T' and outcome 'Y'.
        """
        # Draw order is fixed (X, T, noise, beta) for seed reproducibility.
        X = self._generate_covariates()
        T = self._draw_treatment()
        noise = self._draw_noise()

        # Covariate coefficients are themselves random, drawn from U(-1, 1).
        beta = self._rng.uniform(-1, 1, self.n_covariates)

        # Y = β*X + τ*T + ε
        Y = np.dot(X, beta) + self.effect_size * T + noise

        return self._build_frame(X, T, Y)

    def get_description(self) -> str:
        """Return the scenario description."""
        return (
            f"协变量场景:\n"
            f"- 样本数量:{self.n_samples}\n"
            f"- 处理比例:{self.treatment_rate:.2%}\n"
            f"- 噪声水平:{self.noise_level}\n"
            f"- 处理效应:{self.effect_size}\n"
            f"- 协变量数量:{self.n_covariates}\n"
            f"- 协变量分布:{self.covariate_distribution}\n"
            f"模型:Y = β*X + {self.effect_size}*T + ε"
        )


class InteractionEffectSimulator(BaseDataSimulator):
    """Interaction-effect scenario.

    Generates data in which the treatment effect varies with the covariates.

    Model: Y = β*X + τ*T + γ*X*T + ε, where γ holds the interaction
    coefficients (no intercept term is added).
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        interaction_strength: float = 0.5,
        n_covariates: int = 2,
        random_state: Optional[int] = None
    ):
        """Initialise the interaction-effect simulator.

        Args:
            n_samples: Number of rows to generate.
            treatment_rate: Probability of treatment.
            noise_level: Standard deviation of the outcome noise.
            effect_size: Baseline treatment effect τ.
            interaction_strength: Half-width of the uniform range the
                interaction coefficients γ are drawn from, i.e.
                γ ~ U(-|strength|, |strength|).
            n_covariates: Number of covariate columns.
            random_state: Seed for reproducibility.
        """
        super().__init__(
            n_samples=n_samples,
            treatment_rate=treatment_rate,
            noise_level=noise_level,
            effect_size=effect_size,
            random_state=random_state
        )
        self.interaction_strength = interaction_strength
        self.n_covariates = n_covariates

    def generate(self) -> pd.DataFrame:
        """Generate data for the interaction-effect scenario.

        Returns:
            A DataFrame with covariates 'X0'..., treatment 'T' and outcome 'Y'.
        """
        # Draw order is fixed (X, T, noise, beta, gamma) for reproducibility.
        X = self._rng.normal(0, 1, (self.n_samples, self.n_covariates))
        T = self._draw_treatment()
        noise = self._draw_noise()

        # Main-effect coefficients of the covariates.
        beta = self._rng.uniform(-0.5, 0.5, self.n_covariates)

        # Interaction coefficients scale with ``interaction_strength``; the
        # default of 0.5 draws from U(-0.5, 0.5). abs() keeps low <= high for
        # a negative strength.
        half_width = abs(self.interaction_strength)
        gamma = self._rng.uniform(-half_width, half_width, self.n_covariates)

        # Y = β*X + τ*T + (γ*X)*T + ε
        Y = np.dot(X, beta) + self.effect_size * T + np.dot(X, gamma) * T + noise

        return self._build_frame(X, T, Y)

    def get_description(self) -> str:
        """Return the scenario description."""
        return (
            f"交互效应场景:\n"
            f"- 样本数量:{self.n_samples}\n"
            f"- 处理比例:{self.treatment_rate:.2%}\n"
            f"- 噪声水平:{self.noise_level}\n"
            f"- 基础处理效应:{self.effect_size}\n"
            f"- 交互效应强度:{self.interaction_strength}\n"
            f"- 协变量数量:{self.n_covariates}\n"
            f"模型:Y = β*X + {self.effect_size}*T + γ*X*T + ε"
        )


class CATESimulator(BaseDataSimulator):
    """Conditional average treatment effect (CATE) scenario.

    Generates data in which the treatment effect depends on individual
    features.

    Model: Y = β*X + τ(X)*T + ε, where τ(X) is the conditional treatment
    effect function (no intercept term is added).
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        n_features: int = 5,
        cate_function: str = 'linear',
        random_state: Optional[int] = None
    ):
        """Initialise the CATE simulator.

        Args:
            n_samples: Number of rows to generate.
            treatment_rate: Probability of treatment.
            noise_level: Standard deviation of the outcome noise.
            effect_size: Baseline treatment effect τ0.
            n_features: Number of feature columns.
            cate_function: One of 'linear', 'nonlinear', 'threshold'.
            random_state: Seed for reproducibility.
        """
        super().__init__(
            n_samples=n_samples,
            treatment_rate=treatment_rate,
            noise_level=noise_level,
            effect_size=effect_size,
            random_state=random_state
        )
        self.n_features = n_features
        self.cate_function = cate_function

    def _compute_cate(self, X: np.ndarray) -> np.ndarray:
        """Compute the per-row conditional treatment effect.

        The coefficients (and the threshold, where used) are drawn at random
        on every call.

        Args:
            X: Feature matrix of shape (n_samples, n_features).

        Returns:
            Array of conditional treatment effects, one per row of ``X``.

        Raises:
            ValueError: If ``cate_function`` is not recognised.
        """
        if self.cate_function == 'linear':
            # τ(X) = τ0 + β·X
            beta_cate = self._rng.uniform(-0.5, 0.5, self.n_features)
            return self.effect_size + np.dot(X, beta_cate)

        if self.cate_function == 'nonlinear':
            # τ(X) = τ0 + sin(β·X)
            beta_cate = self._rng.uniform(-1, 1, self.n_features)
            return self.effect_size + np.sin(np.dot(X, beta_cate))

        if self.cate_function == 'threshold':
            # τ(X) = τ0 + (β·X) * I(β·X > threshold): the indicator is applied
            # to the linear combination, not to the raw features.
            beta_cate = self._rng.uniform(-0.5, 0.5, self.n_features)
            threshold = self._rng.uniform(-1, 1)
            linear_comb = np.dot(X, beta_cate)
            return self.effect_size + linear_comb * (linear_comb > threshold).astype(float)

        raise ValueError(f"未知的 CATE 函数类型:{self.cate_function}")

    def generate(self) -> pd.DataFrame:
        """Generate data for the CATE scenario.

        Returns:
            A DataFrame with features 'X0'..., treatment 'T', outcome 'Y' and
            the true effect 'CATE' (included for validation).
        """
        # Draw order is fixed (X, T, noise, CATE coefficients, beta_y) for
        # seed reproducibility.
        X = self._rng.normal(0, 1, (self.n_samples, self.n_features))
        T = self._draw_treatment()
        noise = self._draw_noise()

        cate = self._compute_cate(X)

        # Feature coefficients of the outcome model.
        beta_y = self._rng.uniform(-0.5, 0.5, self.n_features)

        # Y = β*X + τ(X)*T + ε
        Y = np.dot(X, beta_y) + cate * T + noise

        df = self._build_frame(X, T, Y)
        # Ground-truth effect, kept last so estimators can be checked against it.
        df['CATE'] = cate
        return df

    def get_description(self) -> str:
        """Return the scenario description."""
        return (
            f"CATE 场景:\n"
            f"- 样本数量:{self.n_samples}\n"
            f"- 处理比例:{self.treatment_rate:.2%}\n"
            f"- 噪声水平:{self.noise_level}\n"
            f"- 平均处理效应基准:{self.effect_size}\n"
            f"- 特征数量:{self.n_features}\n"
            f"- CATE 函数类型:{self.cate_function}\n"
            f"模型:Y = β*X + τ(X)*T + ε"
        )


class DataSimulatorFactory:
    """Factory that creates simulator instances by type name."""

    # Registry mapping public type names to simulator classes.
    _simulators = {
        'simple_ate': SimpleATESimulator,
        'covariate': CovariateSimulator,
        'interaction': InteractionEffectSimulator,
        'cate': CATESimulator
    }

    @classmethod
    def create(cls, simulator_type: str, **kwargs) -> BaseDataSimulator:
        """Create a simulator of the given type.

        Args:
            simulator_type: One of 'simple_ate', 'covariate', 'interaction',
                'cate'.
            **kwargs: Forwarded to the simulator's constructor.

        Returns:
            The simulator instance.

        Raises:
            ValueError: If ``simulator_type`` is not registered.
        """
        simulator_cls = cls._simulators.get(simulator_type)
        if simulator_cls is None:
            available_types = ', '.join(cls._simulators)
            raise ValueError(
                f"未知的模拟器类型:{simulator_type}。"
                f"可用类型:{available_types}"
            )
        return simulator_cls(**kwargs)

    @classmethod
    def list_available_types(cls) -> list:
        """Return the list of registered simulator type names."""
        return list(cls._simulators)


# Convenience functions
def generate_simple_ate_data(
    n_samples: int = 1000,
    treatment_rate: float = 0.5,
    noise_level: float = 1.0,
    effect_size: float = 1.0,
    intercept: float = 0.0,
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """Generate simple ATE data in one call.

    Args:
        n_samples: Number of rows to generate.
        treatment_rate: Probability of treatment.
        noise_level: Standard deviation of the outcome noise.
        effect_size: Average treatment effect.
        intercept: Intercept term.
        random_state: Seed for reproducibility.

    Returns:
        A DataFrame with columns 'T' and 'Y'.
    """
    simulator = SimpleATESimulator(
        n_samples=n_samples,
        treatment_rate=treatment_rate,
        noise_level=noise_level,
        effect_size=effect_size,
        intercept=intercept,
        random_state=random_state
    )
    return simulator.generate()


def generate_covariate_data(
    n_samples: int = 1000,
    treatment_rate: float = 0.5,
    noise_level: float = 1.0,
    effect_size: float = 1.0,
    n_covariates: int = 3,
    covariate_distribution: str = 'normal',
    random_state: Optional[int] = None,
    covariate_params: Optional[Dict[str, Any]] = None
) -> pd.DataFrame:
    """Generate covariate-scenario data in one call.

    Args:
        n_samples: Number of rows to generate.
        treatment_rate: Probability of treatment.
        noise_level: Standard deviation of the outcome noise.
        effect_size: Treatment effect.
        n_covariates: Number of covariate columns.
        covariate_distribution: One of 'normal', 'uniform', 'beta'.
        random_state: Seed for reproducibility.
        covariate_params: Optional distribution parameters forwarded to
            :class:`CovariateSimulator`. Placed last so existing positional
            calls keep working.

    Returns:
        A DataFrame with the covariates, 'T' and 'Y'.
    """
    simulator = CovariateSimulator(
        n_samples=n_samples,
        treatment_rate=treatment_rate,
        noise_level=noise_level,
        effect_size=effect_size,
        n_covariates=n_covariates,
        covariate_distribution=covariate_distribution,
        covariate_params=covariate_params,
        random_state=random_state
    )
    return simulator.generate()


def generate_interaction_data(
    n_samples: int = 1000,
    treatment_rate: float = 0.5,
    noise_level: float = 1.0,
    effect_size: float = 1.0,
    interaction_strength: float = 0.5,
    n_covariates: int = 2,
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """Generate interaction-effect data in one call.

    Args:
        n_samples: Number of rows to generate.
        treatment_rate: Probability of treatment.
        noise_level: Standard deviation of the outcome noise.
        effect_size: Baseline treatment effect.
        interaction_strength: Half-width of the range the interaction
            coefficients are drawn from.
        n_covariates: Number of covariate columns.
        random_state: Seed for reproducibility.

    Returns:
        A DataFrame with the covariates, 'T' and 'Y'.
    """
    simulator = InteractionEffectSimulator(
        n_samples=n_samples,
        treatment_rate=treatment_rate,
        noise_level=noise_level,
        effect_size=effect_size,
        interaction_strength=interaction_strength,
        n_covariates=n_covariates,
        random_state=random_state
    )
    return simulator.generate()


def generate_cate_data(
    n_samples: int = 1000,
    treatment_rate: float = 0.5,
    noise_level: float = 1.0,
    effect_size: float = 1.0,
    n_features: int = 5,
    cate_function: str = 'linear',
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """Generate CATE-scenario data in one call.

    Args:
        n_samples: Number of rows to generate.
        treatment_rate: Probability of treatment.
        noise_level: Standard deviation of the outcome noise.
        effect_size: Baseline treatment effect.
        n_features: Number of feature columns.
        cate_function: One of 'linear', 'nonlinear', 'threshold'.
        random_state: Seed for reproducibility.

    Returns:
        A DataFrame with the features, 'T', 'Y' and 'CATE'.
    """
    simulator = CATESimulator(
        n_samples=n_samples,
        treatment_rate=treatment_rate,
        noise_level=noise_level,
        effect_size=effect_size,
        n_features=n_features,
        cate_function=cate_function,
        random_state=random_state
    )
    return simulator.generate()


def _run_demo() -> None:
    """Run the example walkthrough executed when the module is a script."""
    print("=" * 60)
    print("模拟数据生成模块示例")
    print("=" * 60)

    # 1. Simple ATE scenario
    print("\n1. 简单 ATE 场景")
    print("-" * 40)
    simple_simulator = SimpleATESimulator(
        n_samples=500,
        treatment_rate=0.5,
        noise_level=1.0,
        effect_size=2.0,
        intercept=5.0,
        random_state=42
    )
    print(simple_simulator.get_description())
    simple_df = simple_simulator.generate()
    print(f"\n数据形状:{simple_df.shape}")
    print(f"前 5 行:\n{simple_df.head()}")

    # 2. Covariate scenario
    print("\n\n2. 协变量场景")
    print("-" * 40)
    covariate_simulator = CovariateSimulator(
        n_samples=500,
        treatment_rate=0.4,
        noise_level=1.5,
        effect_size=1.5,
        n_covariates=3,
        covariate_distribution='normal',
        random_state=42
    )
    print(covariate_simulator.get_description())
    covariate_df = covariate_simulator.generate()
    print(f"\n数据形状:{covariate_df.shape}")
    print(f"前 5 行:\n{covariate_df.head()}")

    # 3. Interaction-effect scenario
    print("\n\n3. 交互效应场景")
    print("-" * 40)
    interaction_simulator = InteractionEffectSimulator(
        n_samples=500,
        treatment_rate=0.5,
        noise_level=1.0,
        effect_size=1.0,
        interaction_strength=0.5,
        n_covariates=2,
        random_state=42
    )
    print(interaction_simulator.get_description())
    interaction_df = interaction_simulator.generate()
    print(f"\n数据形状:{interaction_df.shape}")
    print(f"前 5 行:\n{interaction_df.head()}")

    # 4. CATE scenario
    print("\n\n4. CATE 场景")
    print("-" * 40)
    cate_simulator = CATESimulator(
        n_samples=500,
        treatment_rate=0.5,
        noise_level=1.0,
        effect_size=1.5,
        n_features=4,
        cate_function='linear',
        random_state=42
    )
    print(cate_simulator.get_description())
    cate_df = cate_simulator.generate()
    print(f"\n数据形状:{cate_df.shape}")
    print(f"前 5 行:\n{cate_df.head()}")

    # 5. Creating a simulator through the factory
    print("\n\n5. 使用工厂创建模拟器")
    print("-" * 40)
    print(f"可用模拟器类型:{DataSimulatorFactory.list_available_types()}")
    factory_simulator = DataSimulatorFactory.create(
        'simple_ate',
        n_samples=300,
        treatment_rate=0.6,
        effect_size=2.5,
        random_state=42
    )
    print(factory_simulator.get_description())

    # 6. Saving data
    print("\n\n6. 保存数据示例")
    print("-" * 40)
    output_dir = Path('data/output')
    # parents=True: the nested path must be creatable even when 'data/' does
    # not exist yet.
    output_dir.mkdir(parents=True, exist_ok=True)
    simple_simulator.save_to_csv(output_dir / 'simple_ate_data.csv')
    print(f"已保存:{output_dir / 'simple_ate_data.csv'}")
    simple_simulator.save_to_excel(output_dir / 'simple_ate_data.xlsx')
    print(f"已保存:{output_dir / 'simple_ate_data.xlsx'}")

    # 7. Using the convenience functions
    print("\n\n7. 使用便捷函数")
    print("-" * 40)
    simple_df = generate_simple_ate_data(n_samples=200, effect_size=3.0, random_state=123)
    print(f"便捷函数生成数据形状:{simple_df.shape}")
    print(f"前 5 行:\n{simple_df.head()}")

    print("\n" + "=" * 60)
    print("示例完成!")
    print("=" * 60)


# Example usage
if __name__ == '__main__':
    _run_demo()