808 lines
24 KiB
Python
808 lines
24 KiB
Python
"""
|
||
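Simulated data generation module.

This module generates simulated data for testing causal inference and
supports several data-generating scenarios:
- Simple ATE scenario
- Covariate scenario
- Interaction-effect scenario
- CATE scenario

Author: CausalInferenceAgent
Version: 1.0.0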
|
||
"""
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
from abc import ABC, abstractmethod
|
||
from typing import Optional, Dict, Any, Union
|
||
from pathlib import Path
|
||
|
||
|
||
class BaseDataSimulator(ABC):
    """Abstract base class defining the interface shared by all simulators.

    Stores the common simulation settings, optionally seeds NumPy's global
    random generator, and provides CSV / Excel export helpers built on the
    subclass-provided ``generate``.
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        random_state: Optional[int] = None
    ):
        """Validate and store the common simulation settings.

        Args:
            n_samples: Number of rows to simulate. Must not be negative.
            treatment_rate: Probability of treatment; must lie in [0, 1].
            noise_level: Noise scale; must not be negative.
            effect_size: Size of the treatment effect.
            random_state: Optional seed. When given, NumPy's *global* random
                generator is seeded at construction time.

        Raises:
            ValueError: If ``n_samples`` or ``noise_level`` is negative, or
                ``treatment_rate`` is outside [0, 1].
        """
        # Fail fast with a clear message instead of letting NumPy reject the
        # value later, deep inside generate().
        if n_samples < 0:
            raise ValueError(f"n_samples 必须为非负数,当前值:{n_samples}")
        if not 0 <= treatment_rate <= 1:
            raise ValueError(
                f"treatment_rate 必须在 0 到 1 之间,当前值:{treatment_rate}"
            )
        if noise_level < 0:
            raise ValueError(f"noise_level 必须为非负数,当前值:{noise_level}")

        self.n_samples = n_samples
        self.treatment_rate = treatment_rate
        self.noise_level = noise_level
        self.effect_size = effect_size
        self.random_state = random_state

        # Seeding happens once, here, and affects the process-wide NumPy RNG;
        # later generate() calls continue from wherever that stream is.
        if random_state is not None:
            np.random.seed(random_state)

    @abstractmethod
    def generate(self) -> pd.DataFrame:
        """Generate the simulated data.

        Returns:
            A DataFrame containing the simulated data.
        """
        pass

    def save_to_csv(self, filepath: Union[str, Path]) -> None:
        """Generate a data set and write it to a CSV file.

        ``generate`` is called by this method, so the file holds a fresh
        draw rather than a previously returned DataFrame.

        Args:
            filepath: Destination file path.
        """
        df = self.generate()
        df.to_csv(filepath, index=False, encoding='utf-8')

    def save_to_excel(self, filepath: Union[str, Path]) -> None:
        """Generate a data set and write it to an Excel file.

        ``generate`` is called by this method, so the file holds a fresh
        draw rather than a previously returned DataFrame.

        Args:
            filepath: Destination file path.
        """
        # NOTE(review): pandas needs an Excel writer engine installed for
        # this call (presumably openpyxl for .xlsx) -- confirm in deployment.
        df = self.generate()
        df.to_excel(filepath, index=False)

    @abstractmethod
    def get_description(self) -> str:
        """Describe the data-generating scenario.

        Returns:
            A human-readable description string.
        """
        pass
|
||
|
||
|
||
class SimpleATESimulator(BaseDataSimulator):
    """Simulator for the simple ATE (average treatment effect) scenario.

    Produces data with one treatment variable T and one outcome variable Y.
    Model: Y = alpha + tau * T + epsilon, where tau is the average
    treatment effect.
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        intercept: float = 0.0,
        random_state: Optional[int] = None
    ):
        """Set up the simple ATE simulator.

        Args:
            n_samples: Number of rows to simulate.
            treatment_rate: Probability of treatment.
            noise_level: Standard deviation of the Gaussian noise.
            effect_size: Average treatment effect (tau).
            intercept: Intercept term (alpha).
            random_state: Optional random seed.
        """
        super().__init__(
            n_samples, treatment_rate, noise_level, effect_size, random_state
        )
        self.intercept = intercept

    def generate(self) -> pd.DataFrame:
        """Draw one simple-ATE data set.

        Returns:
            A DataFrame with columns 'T' (treatment) and 'Y' (outcome).
        """
        size = self.n_samples
        # Bernoulli treatment assignment, then Gaussian noise.
        treatment = np.random.binomial(1, self.treatment_rate, size)
        epsilon = np.random.normal(0, self.noise_level, size)
        # Y = intercept + effect_size * T + noise
        outcome = self.intercept + self.effect_size * treatment + epsilon
        return pd.DataFrame({'T': treatment, 'Y': outcome})

    def get_description(self) -> str:
        """Return a description of this scenario's settings and model."""
        parts = [
            "简单 ATE 场景:",
            f"- 样本数量:{self.n_samples}",
            f"- 处理比例:{self.treatment_rate:.2%}",
            f"- 噪声水平:{self.noise_level}",
            f"- 平均处理效应:{self.effect_size}",
            f"- 截距项:{self.intercept}",
            f"模型:Y = {self.intercept} + {self.effect_size}*T + ε",
        ]
        return "\n".join(parts)
|
||
|
||
|
||
class CovariateSimulator(BaseDataSimulator):
    """Simulator for the covariate scenario.

    Produces data that includes covariates X alongside treatment T and
    outcome Y. The outcome computed by ``generate`` is
    Y = beta . X + tau * T + epsilon.
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        n_covariates: int = 3,
        covariate_distribution: str = 'normal',
        covariate_params: Optional[Dict[str, Any]] = None,
        random_state: Optional[int] = None
    ):
        """Set up the covariate simulator.

        Args:
            n_samples: Number of rows to simulate.
            treatment_rate: Probability of treatment.
            noise_level: Standard deviation of the Gaussian noise.
            effect_size: Treatment effect (tau).
            n_covariates: Number of covariate columns.
            covariate_distribution: One of 'normal', 'uniform', 'beta'.
            covariate_params: Distribution parameters. Keys read are
                'mean'/'std' (normal), 'low'/'high' (uniform), 'a'/'b' (beta).
            random_state: Optional random seed.
        """
        super().__init__(
            n_samples, treatment_rate, noise_level, effect_size, random_state
        )
        self.n_covariates = n_covariates
        self.covariate_distribution = covariate_distribution
        # A missing (or empty) mapping means "use the per-distribution defaults".
        self.covariate_params = covariate_params or {}

    def _generate_covariates(self) -> np.ndarray:
        """Draw the covariate matrix from the configured distribution.

        Returns:
            Array of shape (n_samples, n_covariates).

        Raises:
            ValueError: If ``covariate_distribution`` is not recognised.
        """
        shape = (self.n_samples, self.n_covariates)
        params = self.covariate_params
        kind = self.covariate_distribution

        if kind == 'normal':
            return np.random.normal(
                params.get('mean', 0), params.get('std', 1), shape
            )
        if kind == 'uniform':
            return np.random.uniform(
                params.get('low', -1), params.get('high', 1), shape
            )
        if kind == 'beta':
            return np.random.beta(params.get('a', 2), params.get('b', 2), shape)

        raise ValueError(f"未知的协变量分布类型:{self.covariate_distribution}")

    def generate(self) -> pd.DataFrame:
        """Draw one covariate-scenario data set.

        Returns:
            A DataFrame with columns X0..X{k-1}, 'T' and 'Y'.
        """
        size = self.n_samples
        covariates = self._generate_covariates()
        treatment = np.random.binomial(1, self.treatment_rate, size)
        epsilon = np.random.normal(0, self.noise_level, size)
        # Covariate coefficients are redrawn from U(-1, 1) on every call.
        coefs = np.random.uniform(-1, 1, self.n_covariates)

        # Y = beta . X + tau * T + epsilon
        outcome = covariates.dot(coefs) + self.effect_size * treatment + epsilon

        columns = {
            f'X{j}': covariates[:, j] for j in range(self.n_covariates)
        }
        columns['T'] = treatment
        columns['Y'] = outcome
        return pd.DataFrame(columns)

    def get_description(self) -> str:
        """Return a description of this scenario's settings and model."""
        parts = [
            "协变量场景:",
            f"- 样本数量:{self.n_samples}",
            f"- 处理比例:{self.treatment_rate:.2%}",
            f"- 噪声水平:{self.noise_level}",
            f"- 处理效应:{self.effect_size}",
            f"- 协变量数量:{self.n_covariates}",
            f"- 协变量分布:{self.covariate_distribution}",
            f"模型:Y = β*X + {self.effect_size}*T + ε",
        ]
        return "\n".join(parts)
|
||
|
||
|
||
class InteractionEffectSimulator(BaseDataSimulator):
    """Simulator for the interaction-effect scenario.

    Produces data in which the treatment effect varies with the covariates.
    The outcome computed by ``generate`` is
    Y = beta . X + tau * T + sum_j gamma_j * X_j * T + epsilon,
    where gamma holds the interaction coefficients.
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        interaction_strength: float = 0.5,
        n_covariates: int = 2,
        random_state: Optional[int] = None
    ):
        """Set up the interaction-effect simulator.

        Args:
            n_samples: Number of rows to simulate.
            treatment_rate: Probability of treatment.
            noise_level: Standard deviation of the Gaussian noise.
            effect_size: Baseline treatment effect (tau).
            interaction_strength: Half-width of the uniform range the
                interaction coefficients are drawn from; its absolute value
                is used.
            n_covariates: Number of covariate columns.
            random_state: Optional random seed.
        """
        super().__init__(
            n_samples=n_samples,
            treatment_rate=treatment_rate,
            noise_level=noise_level,
            effect_size=effect_size,
            random_state=random_state
        )
        self.interaction_strength = interaction_strength
        self.n_covariates = n_covariates

    def generate(self) -> pd.DataFrame:
        """Draw one interaction-effect data set.

        Returns:
            A DataFrame with columns X0..X{k-1}, 'T' and 'Y'.
        """
        # Standard-normal covariates, Bernoulli treatment, Gaussian noise.
        X = np.random.normal(0, 1, (self.n_samples, self.n_covariates))
        T = np.random.binomial(1, self.treatment_rate, self.n_samples)
        noise = np.random.normal(0, self.noise_level, self.n_samples)

        # Main-effect coefficients, redrawn on every call.
        beta = np.random.uniform(-0.5, 0.5, self.n_covariates)

        # Interaction coefficients scale with interaction_strength. The
        # original code hard-coded +/-0.5 here and silently ignored the
        # parameter; the default of 0.5 reproduces the previous draws.
        bound = abs(self.interaction_strength)
        gamma = np.random.uniform(-bound, bound, self.n_covariates)

        # Y = beta . X + tau * T + sum_j gamma_j * X_j * T + epsilon
        Y = np.dot(X, beta) + self.effect_size * T
        for gamma_j, column in zip(gamma, X.T):
            Y += gamma_j * column * T
        Y += noise

        df_dict = {f'X{i}': X[:, i] for i in range(self.n_covariates)}
        df_dict['T'] = T
        df_dict['Y'] = Y

        return pd.DataFrame(df_dict)

    def get_description(self) -> str:
        """Return a description of this scenario's settings and model."""
        return (
            f"交互效应场景:\n"
            f"- 样本数量:{self.n_samples}\n"
            f"- 处理比例:{self.treatment_rate:.2%}\n"
            f"- 噪声水平:{self.noise_level}\n"
            f"- 基础处理效应:{self.effect_size}\n"
            f"- 交互效应强度:{self.interaction_strength}\n"
            f"- 协变量数量:{self.n_covariates}\n"
            f"模型:Y = β*X + {self.effect_size}*T + γ*X*T + ε"
        )
|
||
|
||
|
||
class CATESimulator(BaseDataSimulator):
    """Simulator for the conditional average treatment effect (CATE) scenario.

    Produces data in which the treatment effect depends on individual
    features. The outcome computed by ``generate`` is
    Y = beta . X + tau(X) * T + epsilon, where tau(X) is the conditional
    treatment-effect function.
    """

    def __init__(
        self,
        n_samples: int = 1000,
        treatment_rate: float = 0.5,
        noise_level: float = 1.0,
        effect_size: float = 1.0,
        n_features: int = 5,
        cate_function: str = 'linear',
        random_state: Optional[int] = None
    ):
        """Set up the CATE simulator.

        Args:
            n_samples: Number of rows to simulate.
            treatment_rate: Probability of treatment.
            noise_level: Standard deviation of the Gaussian noise.
            effect_size: Baseline (tau0) added to every conditional effect.
            n_features: Number of feature columns.
            cate_function: One of 'linear', 'nonlinear', 'threshold'.
            random_state: Optional random seed.
        """
        super().__init__(
            n_samples, treatment_rate, noise_level, effect_size, random_state
        )
        self.n_features = n_features
        self.cate_function = cate_function

    def _compute_cate(self, X: np.ndarray) -> np.ndarray:
        """Compute the conditional treatment effect for every row of X.

        Args:
            X: Feature matrix of shape (n_samples, n_features).

        Returns:
            Array of conditional treatment effects, one per row.

        Raises:
            ValueError: If ``cate_function`` is not recognised.
        """
        kind = self.cate_function
        if kind not in ('linear', 'nonlinear', 'threshold'):
            raise ValueError(f"未知的 CATE 函数类型:{self.cate_function}")

        # Weights come from U(-1, 1) for the sine form, U(-0.5, 0.5) otherwise.
        half_width = 1 if kind == 'nonlinear' else 0.5
        weights = np.random.uniform(-half_width, half_width, self.n_features)

        if kind == 'threshold':
            # The cutoff is drawn right after the weights.
            cutoff = np.random.uniform(-1, 1)
            score = np.dot(X, weights)
            # tau(X) = tau0 + score where score exceeds the cutoff, else tau0.
            return self.effect_size + score * (score > cutoff).astype(float)

        score = np.dot(X, weights)
        if kind == 'nonlinear':
            # tau(X) = tau0 + sin(w . X)
            return self.effect_size + np.sin(score)
        # Linear form: tau(X) = tau0 + w . X
        return self.effect_size + score

    def generate(self) -> pd.DataFrame:
        """Draw one CATE data set.

        Returns:
            A DataFrame with columns X0..X{k-1}, 'T', 'Y' and 'CATE'; the
            last column holds the true conditional effect for validation.
        """
        size = self.n_samples
        features = np.random.normal(0, 1, (size, self.n_features))
        treatment = np.random.binomial(1, self.treatment_rate, size)
        epsilon = np.random.normal(0, self.noise_level, size)

        effect = self._compute_cate(features)

        # Outcome-model coefficients, drawn after the CATE weights.
        outcome_coefs = np.random.uniform(-0.5, 0.5, self.n_features)

        # Y = beta . X + tau(X) * T + epsilon
        outcome = np.dot(features, outcome_coefs) + effect * treatment + epsilon

        columns = {f'X{j}': features[:, j] for j in range(self.n_features)}
        columns['T'] = treatment
        columns['Y'] = outcome
        columns['CATE'] = effect
        return pd.DataFrame(columns)

    def get_description(self) -> str:
        """Return a description of this scenario's settings and model."""
        parts = [
            "CATE 场景:",
            f"- 样本数量:{self.n_samples}",
            f"- 处理比例:{self.treatment_rate:.2%}",
            f"- 噪声水平:{self.noise_level}",
            f"- 平均处理效应基准:{self.effect_size}",
            f"- 特征数量:{self.n_features}",
            f"- CATE 函数类型:{self.cate_function}",
            "模型:Y = β*X + τ(X)*T + ε",
        ]
        return "\n".join(parts)
|
||
|
||
|
||
class DataSimulatorFactory:
    """Factory that creates simulator instances from a type name."""

    # Registry mapping public type names to simulator classes.
    _simulators = {
        'simple_ate': SimpleATESimulator,
        'covariate': CovariateSimulator,
        'interaction': InteractionEffectSimulator,
        'cate': CATESimulator
    }

    @classmethod
    def create(cls, simulator_type: str, **kwargs) -> BaseDataSimulator:
        """Create a simulator of the requested type.

        Args:
            simulator_type: One of 'simple_ate', 'covariate', 'interaction',
                'cate'.
            **kwargs: Forwarded unchanged to the simulator's constructor.

        Returns:
            The newly constructed simulator instance.

        Raises:
            ValueError: If ``simulator_type`` is not registered.
        """
        simulator_cls = cls._simulators.get(simulator_type)
        if simulator_cls is not None:
            return simulator_cls(**kwargs)

        available_types = ', '.join(cls._simulators)
        raise ValueError(
            f"未知的模拟器类型:{simulator_type}。"
            f"可用类型:{available_types}"
        )

    @classmethod
    def list_available_types(cls) -> list:
        """Return the registered simulator type names.

        Returns:
            A list of type-name strings, in registration order.
        """
        return [*cls._simulators]
|
||
|
||
|
||
# 便捷函数
|
||
def generate_simple_ate_data(
    n_samples: int = 1000,
    treatment_rate: float = 0.5,
    noise_level: float = 1.0,
    effect_size: float = 1.0,
    intercept: float = 0.0,
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """Generate simple-ATE data in one call (convenience wrapper).

    Builds a ``SimpleATESimulator`` with the given settings and returns the
    result of its ``generate`` method.

    Args:
        n_samples: Number of rows to simulate.
        treatment_rate: Probability of treatment.
        noise_level: Noise level.
        effect_size: Average treatment effect.
        intercept: Intercept term.
        random_state: Optional random seed.

    Returns:
        A DataFrame with columns 'T' and 'Y'.
    """
    settings = {
        'n_samples': n_samples,
        'treatment_rate': treatment_rate,
        'noise_level': noise_level,
        'effect_size': effect_size,
        'intercept': intercept,
        'random_state': random_state,
    }
    return SimpleATESimulator(**settings).generate()
|
||
|
||
|
||
def generate_covariate_data(
    n_samples: int = 1000,
    treatment_rate: float = 0.5,
    noise_level: float = 1.0,
    effect_size: float = 1.0,
    n_covariates: int = 3,
    covariate_distribution: str = 'normal',
    random_state: Optional[int] = None,
    covariate_params: Optional[Dict[str, Any]] = None
) -> pd.DataFrame:
    """Generate covariate-scenario data in one call (convenience wrapper).

    Builds a ``CovariateSimulator`` with the given settings and returns the
    result of its ``generate`` method.

    Args:
        n_samples: Number of rows to simulate.
        treatment_rate: Probability of treatment.
        noise_level: Noise level.
        effect_size: Treatment effect.
        n_covariates: Number of covariate columns.
        covariate_distribution: Covariate distribution type.
        random_state: Optional random seed.
        covariate_params: Optional distribution parameters forwarded to the
            simulator. Placed last so existing positional callers are
            unaffected; ``None`` keeps the simulator's defaults.

    Returns:
        A DataFrame with the covariate columns plus 'T' and 'Y'.
    """
    simulator = CovariateSimulator(
        n_samples=n_samples,
        treatment_rate=treatment_rate,
        noise_level=noise_level,
        effect_size=effect_size,
        n_covariates=n_covariates,
        covariate_distribution=covariate_distribution,
        # Previously there was no way to pass this through the wrapper.
        covariate_params=covariate_params,
        random_state=random_state
    )
    return simulator.generate()
|
||
|
||
|
||
def generate_interaction_data(
    n_samples: int = 1000,
    treatment_rate: float = 0.5,
    noise_level: float = 1.0,
    effect_size: float = 1.0,
    interaction_strength: float = 0.5,
    n_covariates: int = 2,
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """Generate interaction-effect data in one call (convenience wrapper).

    Builds an ``InteractionEffectSimulator`` with the given settings and
    returns the result of its ``generate`` method.

    Args:
        n_samples: Number of rows to simulate.
        treatment_rate: Probability of treatment.
        noise_level: Noise level.
        effect_size: Baseline treatment effect.
        interaction_strength: Interaction-effect strength.
        n_covariates: Number of covariate columns.
        random_state: Optional random seed.

    Returns:
        A DataFrame with the covariate columns plus 'T' and 'Y'.
    """
    settings = {
        'n_samples': n_samples,
        'treatment_rate': treatment_rate,
        'noise_level': noise_level,
        'effect_size': effect_size,
        'interaction_strength': interaction_strength,
        'n_covariates': n_covariates,
        'random_state': random_state,
    }
    return InteractionEffectSimulator(**settings).generate()
|
||
|
||
|
||
def generate_cate_data(
    n_samples: int = 1000,
    treatment_rate: float = 0.5,
    noise_level: float = 1.0,
    effect_size: float = 1.0,
    n_features: int = 5,
    cate_function: str = 'linear',
    random_state: Optional[int] = None
) -> pd.DataFrame:
    """Generate CATE-scenario data in one call (convenience wrapper).

    Builds a ``CATESimulator`` with the given settings and returns the
    result of its ``generate`` method.

    Args:
        n_samples: Number of rows to simulate.
        treatment_rate: Probability of treatment.
        noise_level: Noise level.
        effect_size: Baseline average treatment effect.
        n_features: Number of feature columns.
        cate_function: CATE function type.
        random_state: Optional random seed.

    Returns:
        A DataFrame with the feature columns plus 'T', 'Y' and 'CATE'.
    """
    settings = {
        'n_samples': n_samples,
        'treatment_rate': treatment_rate,
        'noise_level': noise_level,
        'effect_size': effect_size,
        'n_features': n_features,
        'cate_function': cate_function,
        'random_state': random_state,
    }
    return CATESimulator(**settings).generate()
|
||
|
||
|
||
# 示例用法
|
||
if __name__ == '__main__':
    # Demo script: exercises every simulator, the factory, the export
    # helpers and one convenience function, printing a short summary of each.
    print("=" * 60)
    print("模拟数据生成模块示例")
    print("=" * 60)

    # 1. Simple ATE scenario
    print("\n1. 简单 ATE 场景")
    print("-" * 40)
    simple_simulator = SimpleATESimulator(
        n_samples=500,
        treatment_rate=0.5,
        noise_level=1.0,
        effect_size=2.0,
        intercept=5.0,
        random_state=42
    )
    print(simple_simulator.get_description())
    simple_df = simple_simulator.generate()
    print(f"\n数据形状:{simple_df.shape}")
    print(f"前 5 行:\n{simple_df.head()}")

    # 2. Covariate scenario
    print("\n\n2. 协变量场景")
    print("-" * 40)
    covariate_simulator = CovariateSimulator(
        n_samples=500,
        treatment_rate=0.4,
        noise_level=1.5,
        effect_size=1.5,
        n_covariates=3,
        covariate_distribution='normal',
        random_state=42
    )
    print(covariate_simulator.get_description())
    covariate_df = covariate_simulator.generate()
    print(f"\n数据形状:{covariate_df.shape}")
    print(f"前 5 行:\n{covariate_df.head()}")

    # 3. Interaction-effect scenario
    print("\n\n3. 交互效应场景")
    print("-" * 40)
    interaction_simulator = InteractionEffectSimulator(
        n_samples=500,
        treatment_rate=0.5,
        noise_level=1.0,
        effect_size=1.0,
        interaction_strength=0.5,
        n_covariates=2,
        random_state=42
    )
    print(interaction_simulator.get_description())
    interaction_df = interaction_simulator.generate()
    print(f"\n数据形状:{interaction_df.shape}")
    print(f"前 5 行:\n{interaction_df.head()}")

    # 4. CATE scenario
    print("\n\n4. CATE 场景")
    print("-" * 40)
    cate_simulator = CATESimulator(
        n_samples=500,
        treatment_rate=0.5,
        noise_level=1.0,
        effect_size=1.5,
        n_features=4,
        cate_function='linear',
        random_state=42
    )
    print(cate_simulator.get_description())
    cate_df = cate_simulator.generate()
    print(f"\n数据形状:{cate_df.shape}")
    print(f"前 5 行:\n{cate_df.head()}")

    # 5. Create a simulator through the factory
    print("\n\n5. 使用工厂创建模拟器")
    print("-" * 40)
    print(f"可用模拟器类型:{DataSimulatorFactory.list_available_types()}")
    factory_simulator = DataSimulatorFactory.create(
        'simple_ate',
        n_samples=300,
        treatment_rate=0.6,
        effect_size=2.5,
        random_state=42
    )
    print(factory_simulator.get_description())

    # 6. Save data to disk
    print("\n\n6. 保存数据示例")
    print("-" * 40)
    output_dir = Path('data/output')
    # parents=True is required: without it mkdir raises FileNotFoundError
    # whenever the intermediate 'data' directory does not exist yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    simple_simulator.save_to_csv(output_dir / 'simple_ate_data.csv')
    print(f"已保存:{output_dir / 'simple_ate_data.csv'}")

    simple_simulator.save_to_excel(output_dir / 'simple_ate_data.xlsx')
    print(f"已保存:{output_dir / 'simple_ate_data.xlsx'}")

    # 7. Use a convenience function
    print("\n\n7. 使用便捷函数")
    print("-" * 40)
    simple_df = generate_simple_ate_data(n_samples=200, effect_size=3.0, random_state=123)
    print(f"便捷函数生成数据形状:{simple_df.shape}")
    print(f"前 5 行:\n{simple_df.head()}")

    print("\n" + "=" * 60)
    print("示例完成!")
    print("=" * 60)
|