diff --git a/ml-augmentation-toolkit_project/README.md b/ml-augmentation-toolkit_project/README.md
new file mode 100644
index 0000000..a610d44
--- /dev/null
+++ b/ml-augmentation-toolkit_project/README.md
@@ -0,0 +1,35 @@
+# alloyxai
+
+> **A modular machine learning pipeline for data augmentation and explainable modeling in superalloy design**
+> 面向高温合金设计的数据增强与可解释性建模一体化机器学习框架
+
+---
+
+## 🔬 Project Overview | 项目概述
+
+**`alloyxai`** is a research-oriented Python toolkit that integrates *data generation*, *imbalance handling*, and *model interpretability* into a unified machine learning pipeline, specifically designed for **superalloy composition optimization and microstructure-performance prediction**.
+
+该项目融合了多种数据增强手段(MCMC、WGAN-GP、SMOGN)与可解释性分析(SHAP),适用于**高温合金成分设计、相粗化行为建模及高温性能预测等典型材料科学问题**。
+
+---
+
+## 🧩 Core Modules | 核心模块
+
+| 模块名 | 描述 |
+|-------------------|------|
+| `MCMCSampler` | 基于贝叶斯推断的元素比例生成器(Dirichlet + TruncatedNormal) |
+| `WGANGPRegressor` | 面向回归问题的小样本数据生成器,集成条件判别与梯度惩罚机制 |
+| `SMOGNAugmentor` | 用于不平衡目标分布的回归型过采样(适合长尾、高偏态分布) |
+| `SHAPAnalyzer` | 提供主效应、交互项、蜂群图与依赖图等多层次模型解释能力 |
+
+---
+
+## 🚀 Example Workflow | 示例工作流
+
+```bash
+# 安装依赖
+pip install -r requirements.txt
+
+# 运行主流程(默认启用 MCMC + WGAN + SHAP)
+python pipeline.py
+```
diff --git a/ml-augmentation-toolkit_project/ml-augmentation-toolkit/__init__.py b/ml-augmentation-toolkit_project/ml-augmentation-toolkit/__init__.py
new file mode 100644
index 0000000..99a8091
--- /dev/null
+++ b/ml-augmentation-toolkit_project/ml-augmentation-toolkit/__init__.py
@@ -0,0 +1,2 @@
+
+
diff --git a/ml-augmentation-toolkit_project/ml-augmentation-toolkit/mcmc_sampler.py b/ml-augmentation-toolkit_project/ml-augmentation-toolkit/mcmc_sampler.py
new file mode 100644
index 0000000..dddf553
--- /dev/null
+++ b/ml-augmentation-toolkit_project/ml-augmentation-toolkit/mcmc_sampler.py
@@ -0,0 +1,160 @@
+import os
+import pandas as pd
+import numpy as np
+import pymc as pm
+import arviz as az
+import matplotlib.pyplot as plt
+import seaborn as sns + + +class MCMCSampler: + """ + 使用 PyMC 对高温合金元素组成与温度进行 MCMC 采样。 + + - 元素组成建模为 Dirichlet 分布(强约束:总和为100%) + - 温度建模为 Truncated Normal 分布 + """ + + def __init__(self, + data_path, + trace_save_path, + sample_save_path, + elements_cols=None, + t_col='T', + draws=4000, + tune=1000, + chains=4, + cores=4, + seed=42, + concentration=100): + """ + 初始化采样器 + + Parameters: + data_path (str): 原始CSV数据路径 + trace_save_path (str): 轨迹保存路径 + sample_save_path (str): 生成样本保存路径 + elements_cols (list): 元素列名(默认10种常见元素) + t_col (str): 温度列名 + draws (int): 每条链的采样步数 + tune (int): 调优步数 + chains (int): 链数 + cores (int): 并行核数 + seed (int): 随机种子 + concentration (float): Dirichlet浓度参数 + """ + self.data_path = data_path + self.trace_save_path = trace_save_path + self.sample_save_path = sample_save_path + self.elements_cols = elements_cols or ['Co', 'Al', 'W', 'Ta', 'Ti', 'Nb', 'Ni', 'Cr', 'V', 'Mo'] + self.t_col = t_col + self.draws = draws + self.tune = tune + self.chains = chains + self.cores = cores + self.seed = seed + self.concentration = concentration + self.EPSILON = 1e-6 + + def load_data(self): + """读取数据并检查列合法性""" + if not os.path.exists(self.data_path): + raise FileNotFoundError(f"找不到数据文件: {self.data_path}") + self.data = pd.read_csv(self.data_path) + + for col in self.elements_cols + [self.t_col]: + if col not in self.data.columns: + raise ValueError(f"缺失列: {col},请检查数据文件格式。") + + self.elements_data = self.data[self.elements_cols].replace(0, 1e-5) + self.t_data = self.data[self.t_col] + + def _compute_dirichlet_alpha(self): + """根据元素均值计算 Dirichlet 参数 α""" + mean_props = self.elements_data.mean(axis=0) / 100.0 + alpha = np.maximum(mean_props * self.concentration, self.EPSILON) + return alpha + + def build_model(self): + """构建 PyMC 模型并进行采样""" + alpha = self._compute_dirichlet_alpha() + t_mu, t_sigma = self.t_data.mean(), self.t_data.std() + t_min, t_max = self.t_data.min(), self.t_data.max() + + with pm.Model() as self.model: + proportions = pm.Dirichlet("proportions", 
a=alpha, shape=(len(self.elements_cols),)) + elements_generated = pm.Deterministic("elements_generated", proportions * 100) + t_prior = pm.TruncatedNormal("T_prior", mu=t_mu, sigma=t_sigma, + lower=t_min - 10, upper=t_max + 10) + + self.trace = pm.sample( + draws=self.draws, + tune=self.tune, + chains=self.chains, + cores=self.cores, + target_accept=0.95, + random_seed=self.seed, + return_inferencedata=True + ) + + def check_convergence(self): + """使用ArviZ进行收敛性诊断""" + summary = az.summary(self.trace, var_names=["proportions", "T_prior"]) + if summary["r_hat"].max() > 1.05: + print("⚠️ 警告:存在未收敛参数,建议增加采样步数或调整模型!") + return summary + + def save_trace(self): + """保存 MCMC 轨迹数据为 CSV""" + proportions_trace = self.trace.posterior['proportions'].stack(sample=("chain", "draw")).values.transpose(1, 0) + t_trace = self.trace.posterior['T_prior'].stack(sample=("chain", "draw")).values.flatten() + trace_df = pd.DataFrame(proportions_trace, columns=[f"proportions_{el}" for el in self.elements_cols]) + trace_df["T_prior"] = t_trace + + os.makedirs(os.path.dirname(self.trace_save_path), exist_ok=True) + trace_df.to_csv(self.trace_save_path, index=False) + + def extract_samples(self): + """提取生成的后验样本""" + posterior = self.trace.posterior + self.samples_df = pd.DataFrame({ + col: posterior['elements_generated'][..., i].values.flatten() + for i, col in enumerate(self.elements_cols) + }) + self.samples_df['T'] = posterior['T_prior'].values.flatten() + + def save_samples(self): + """保存后验样本""" + os.makedirs(os.path.dirname(self.sample_save_path), exist_ok=True) + self.samples_df.to_csv(self.sample_save_path, index=False) + + def plot_distributions(self, save_dir=None): + """原始与生成数据分布对比图(可选保存)""" + for col in self.elements_cols + ['T']: + plt.figure(figsize=(8, 4)) + sns.kdeplot(self.data[col], label="原始数据", fill=True) + sns.kdeplot(self.samples_df[col], label="生成数据", fill=True) + plt.title(f"{col} 分布对比") + plt.xlabel("值") + plt.ylabel("密度") + plt.legend() + plt.tight_layout() + if 
save_dir: + os.makedirs(save_dir, exist_ok=True) + plt.savefig(os.path.join(save_dir, f"{col}_kde.png")) + plt.show() + + def run(self, plot=True, save_plot_dir=None): + """执行完整 MCMC 流程""" + print("🔄 开始 MCMC 流程...") + self.load_data() + self.build_model() + self.check_convergence() + self.save_trace() + self.extract_samples() + self.save_samples() + if plot: + self.plot_distributions(save_dir=save_plot_dir) + print("✅ MCMC流程完成!") + return self.samples_df, self.trace + diff --git a/ml-augmentation-toolkit_project/ml-augmentation-toolkit/shap_analyzer.py b/ml-augmentation-toolkit_project/ml-augmentation-toolkit/shap_analyzer.py new file mode 100644 index 0000000..68055b1 --- /dev/null +++ b/ml-augmentation-toolkit_project/ml-augmentation-toolkit/shap_analyzer.py @@ -0,0 +1,129 @@ +import os +import pandas as pd +import numpy as np +import shap +import matplotlib.pyplot as plt +from xgboost import XGBRegressor +from sklearn.model_selection import cross_val_predict, KFold +from sklearn.metrics import r2_score + + +class SHAPAnalyzer: + """ + 使用XGBoost + SHAP进行特征重要性分析和交互作用分析。 + """ + + def __init__(self, target_col, feature_name_mapping=None, random_state=42): + self.target_col = target_col + self.feature_name_mapping = feature_name_mapping or {} + self.random_state = random_state + + def fit(self, train_data, test_data, model_params=None): + self.train_data = train_data + self.test_data = test_data + + self.X_train = self.train_data.drop(columns=[self.target_col]) + self.y_train = self.train_data[self.target_col] + self.X_test = self.test_data.drop(columns=[self.target_col], errors='ignore') + + self.features = self.X_train.columns.tolist() + self.feature_display_names = [self.feature_name_mapping.get(col, col) for col in self.features] + + self.model_params = model_params or { + 'colsample_bytree': 1.0, + 'gamma': 2.0, + 'learning_rate': 0.1, + 'max_depth': 10, + 'n_estimators': 50, + 'subsample': 0.7, + 'eval_metric': 'rmse', + 'n_jobs': -1, + 'random_state': 
self.random_state + } + + xgb_model = XGBRegressor(**self.model_params) + kf = KFold(n_splits=10, shuffle=True, random_state=self.random_state) + y_pred = cross_val_predict(xgb_model, self.X_train, self.y_train, cv=kf) + + self.r2_score_cv = r2_score(self.y_train, y_pred) + self.y_cv_pred = y_pred # 保存交叉验证预测 + print(f"Cross-validated R²: {self.r2_score_cv:.4f}") + + self.final_model = xgb_model.fit(self.X_train, self.y_train) + + self.explainer = shap.TreeExplainer(self.final_model, feature_perturbation='tree_path_dependent') + self.shap_values = self.explainer(self.X_test).values + self.shap_interaction_values = self.explainer.shap_interaction_values(self.X_test) + + def save_feature_importance(self, path): + xgb_importance = self.final_model.feature_importances_ + shap_importance = np.abs(self.shap_values).mean(axis=0) + + importance_df = pd.DataFrame({ + 'Feature': self.features, + 'DisplayName': self.feature_display_names, + 'XGBoost_Importance': xgb_importance, + 'SHAP_Importance': shap_importance + }).sort_values('SHAP_Importance', ascending=False) + + os.makedirs(os.path.dirname(path), exist_ok=True) + importance_df.to_csv(path, index=False, float_format="%.6f") + print(f"✅ 特征重要性保存到: {path}") + + def save_shap_values(self, path): + shap_df = pd.DataFrame(self.shap_values, columns=self.features) + os.makedirs(os.path.dirname(path), exist_ok=True) + shap_df.to_csv(path, index=False, float_format="%.6f") + print(f"✅ SHAP值保存到: {path}") + + def save_shap_summary_plot(self, path): + plt.figure(figsize=(10, 8)) + shap.summary_plot(self.shap_values, self.X_test, feature_names=self.feature_display_names, show=False) + plt.title("SHAP Summary Plot") + plt.tight_layout() + plt.savefig(path, dpi=300, bbox_inches='tight') + plt.close() + print(f"✅ SHAP蜂群图保存到: {path}") + + def save_interaction_heatmap(self, path): + plt.figure(figsize=(10, 8)) + shap.summary_plot(self.shap_interaction_values, self.X_test, plot_type="compact_dot", show=False) + plt.title("SHAP Interaction 
Heatmap") + plt.tight_layout() + plt.savefig(path, dpi=300, bbox_inches='tight') + plt.close() + print(f"✅ 交互热力图保存到: {path}") + + def save_interaction_strengths(self, path): + strength = np.mean(np.abs(self.shap_interaction_values), axis=0) + + interaction_records = [] + for i in range(len(self.features)): + for j in range(i+1, len(self.features)): + interaction_records.append({ + 'Feature_A': self.features[i], + 'Feature_B': self.features[j], + 'Interaction_Strength': strength[i, j] + }) + + interaction_df = pd.DataFrame(interaction_records).sort_values('Interaction_Strength', ascending=False) + os.makedirs(os.path.dirname(path), exist_ok=True) + interaction_df.to_csv(path, index=False, float_format="%.6f") + print(f"✅ 全局交互强度保存到: {path}") + + def plot_dependence(self, feature, interaction_feature=None, path=None): + shap.dependence_plot( + feature, + self.shap_values, + self.X_test, + interaction_index=interaction_feature, + show=False + ) + plt.title(f"{feature} Interaction with {interaction_feature}") + if path: + os.makedirs(os.path.dirname(path), exist_ok=True) + plt.savefig(path, dpi=300, bbox_inches='tight') + plt.close() + print(f"✅ 依赖图保存到: {path}") + else: + plt.show() diff --git a/ml-augmentation-toolkit_project/ml-augmentation-toolkit/smogn_augmentor.py b/ml-augmentation-toolkit_project/ml-augmentation-toolkit/smogn_augmentor.py new file mode 100644 index 0000000..2a001e3 --- /dev/null +++ b/ml-augmentation-toolkit_project/ml-augmentation-toolkit/smogn_augmentor.py @@ -0,0 +1,77 @@ +import os +import smogn +import pandas as pd +import matplotlib.pyplot as plt + + +class SMOGNAugmentor: + """ + 使用 SMOGN 对回归数据进行增强,解决目标变量分布不平衡问题。 + """ + + def __init__(self, target_col, samp_method="balance", save_path=None): + """ + 初始化增强器 + + 参数: + target_col (str): 目标变量名(如 'K', 'Vol') + samp_method (str): 采样方法,可选 'balance' 或 'extreme' + save_path (str): 增强后数据的保存路径(可选) + """ + self.target_col = target_col + self.samp_method = samp_method + self.save_path = save_path + 
self.original_df = None + self.enhanced_df = None + + def fit_transform(self, df): + """ + 对输入 DataFrame 执行 SMOGN 增强 + + 参数: + df (pd.DataFrame): 原始数据 + + 返回: + pd.DataFrame: 增强后的数据 + """ + self.original_df = df.copy() + self.enhanced_df = smogn.smoter( + data=df, + y=self.target_col, + samp_method=self.samp_method + ) + return self.enhanced_df + + def plot_distribution(self, bins=30): + """ + 可视化增强前后目标变量的分布对比图 + + 参数: + bins (int): 直方图分箱数 + """ + if self.original_df is None or self.enhanced_df is None: + raise ValueError("请先运行 fit_transform()") + + plt.figure(figsize=(8, 5)) + plt.hist(self.original_df[self.target_col], bins=bins, alpha=0.5, label="原始数据", edgecolor="black") + plt.hist(self.enhanced_df[self.target_col], bins=bins, alpha=0.5, label="SMOGN 增强数据", edgecolor="black") + plt.xlabel(self.target_col) + plt.ylabel("频数") + plt.title(f"SMOGN 增强前后 {self.target_col} 的分布对比") + plt.legend() + plt.tight_layout() + plt.show() + + def save(self, path=None): + """ + 保存增强后的数据为 CSV + + 参数: + path (str): 指定保存路径;如为空则使用初始化时的 save_path + """ + path = path or self.save_path + if path is None: + raise ValueError("未指定保存路径,请传入 path 或设置 save_path") + os.makedirs(os.path.dirname(path), exist_ok=True) + self.enhanced_df.to_csv(path, index=False) + print(f"✅ 增强后的数据已保存至:{path}") diff --git a/ml-augmentation-toolkit_project/ml-augmentation-toolkit/wgan_gp_generator.py b/ml-augmentation-toolkit_project/ml-augmentation-toolkit/wgan_gp_generator.py new file mode 100644 index 0000000..df9d327 --- /dev/null +++ b/ml-augmentation-toolkit_project/ml-augmentation-toolkit/wgan_gp_generator.py @@ -0,0 +1,187 @@ +import os +import torch +import torch.nn as nn +import torch.optim as optim +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + + +def seed_everything(seed=42): + """确保结果可复现""" + import random + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + 
torch.backends.cudnn.benchmark = False + + +class Generator(nn.Module): + def __init__(self, latent_dim, condition_dim, output_dim): + super().__init__() + self.model = nn.Sequential( + nn.Linear(latent_dim + condition_dim, 256), # 输入:噪声 + 条件变量 + nn.LeakyReLU(0.2), + nn.Linear(256, 512), + nn.LeakyReLU(0.2), + nn.Linear(512, 1024), + nn.LeakyReLU(0.2), + nn.Linear(1024, output_dim) # 输出:回归特征 + ) + + def forward(self, z, conditions): + input_combined = torch.cat((z, conditions), dim=1) + return self.model(input_combined) + + +class Discriminator(nn.Module): + def __init__(self, input_dim, condition_dim): + super().__init__() + self.model = nn.Sequential( + nn.Linear(input_dim + condition_dim, 512), + nn.LeakyReLU(0.2), + nn.Linear(512, 256), + nn.LeakyReLU(0.2), + nn.Linear(256, 1) # 输出:真实性分数 + ) + + def forward(self, x, conditions): + input_combined = torch.cat((x, conditions), dim=1) + return self.model(input_combined) + + +class WGANGPRegressor: + def __init__(self, latent_dim=11, lambda_gp=10, device=None): + self.latent_dim = latent_dim + self.lambda_gp = lambda_gp + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self.generator = None + self.discriminator = None + + def fit(self, X, conditions, n_epochs=3000, batch_size=32, n_critic=5, lr=1e-5, save_log_path=None): + """ + 训练 WGAN-GP 模型 + + 参数: + X: np.ndarray, shape=(n_samples, n_features),回归特征 + conditions: np.ndarray, shape=(n_samples, n_condition_features),条件变量 + """ + seed_everything() + + X = torch.tensor(X, dtype=torch.float32).to(self.device) + conditions = torch.tensor(conditions, dtype=torch.float32).to(self.device) + + self.output_dim = X.shape[1] + self.condition_dim = conditions.shape[1] + + self.generator = Generator(self.latent_dim, self.condition_dim, self.output_dim).to(self.device) + self.discriminator = Discriminator(self.output_dim, self.condition_dim).to(self.device) + + optimizer_G = optim.Adam(self.generator.parameters(), lr=lr, betas=(0.5, 0.9)) + optimizer_D 
= optim.Adam(self.discriminator.parameters(), lr=lr, betas=(0.5, 0.9)) + + self.g_losses = [] + self.d_losses = [] + + for epoch in range(n_epochs): + for _ in range(n_critic): + optimizer_D.zero_grad() + idx = torch.randint(0, X.shape[0], (batch_size,)) + real_x = X[idx] + real_c = conditions[idx] + + z = torch.randn(batch_size, self.latent_dim).to(self.device) + fake_x = self.generator(z, real_c).detach() + + d_real = self.discriminator(real_x, real_c) + d_fake = self.discriminator(fake_x, real_c) + gp = self._gradient_penalty(real_x, fake_x, real_c) + + d_loss = -torch.mean(d_real) + torch.mean(d_fake) + gp + d_loss.backward() + optimizer_D.step() + + # 训练生成器 + optimizer_G.zero_grad() + z = torch.randn(batch_size, self.latent_dim).to(self.device) + fake_x = self.generator(z, real_c) + d_fake = self.discriminator(fake_x, real_c) + g_loss = -torch.mean(d_fake) + g_loss.backward() + optimizer_G.step() + + self.d_losses.append(d_loss.item()) + self.g_losses.append(g_loss.item()) + + if epoch % 100 == 0: + print(f"[{epoch}/{n_epochs}] D_loss: {d_loss.item():.4f}, G_loss: {g_loss.item():.4f}") + + # 日志保存 + if save_log_path: + df_log = pd.DataFrame({'D_loss': self.d_losses, 'G_loss': self.g_losses}) + os.makedirs(os.path.dirname(save_log_path), exist_ok=True) + df_log.to_csv(save_log_path, index=False) + + def _gradient_penalty(self, real_x, fake_x, condition): + alpha = torch.rand(real_x.size(0), 1).to(self.device) + interpolated = (alpha * real_x + (1 - alpha) * fake_x).requires_grad_(True) + d_interpolated = self.discriminator(interpolated, condition) + gradients = torch.autograd.grad(outputs=d_interpolated, + inputs=interpolated, + grad_outputs=torch.ones_like(d_interpolated), + create_graph=True, retain_graph=True)[0] + grad_norm = gradients.view(gradients.size(0), -1).norm(2, dim=1) + return self.lambda_gp * ((grad_norm - 1) ** 2).mean() + + def generate(self, condition_array, n_samples=None, z=None): + """ + 生成模拟数据 + + 参数: + condition_array: np.ndarray, 条件变量数组 + 
n_samples: int, 要生成的样本数量(若 z 提供则可省略) + z: torch.Tensor, 自定义潜变量张量 + + 返回: + np.ndarray: 生成数据 + """ + self.generator.eval() + condition_array = np.array(condition_array) + + if z is None: + if n_samples is None: + n_samples = condition_array.shape[0] + z = torch.randn(n_samples, self.latent_dim).to(self.device) + else: + z = z.to(self.device) + n_samples = z.size(0) + + conditions = torch.tensor(condition_array, dtype=torch.float32).to(self.device) + if conditions.shape[0] != n_samples: + raise ValueError("生成样本数与条件变量数量不一致。") + + with torch.no_grad(): + fake_data = self.generator(z, conditions).cpu().numpy() + return fake_data + + def plot_loss(self): + plt.figure(figsize=(10, 5)) + plt.plot(self.d_losses, label="Discriminator Loss") + plt.plot(self.g_losses, label="Generator Loss") + plt.xlabel("Iterations") + plt.ylabel("Loss") + plt.legend() + plt.title("WGAN-GP Training Loss") + plt.tight_layout() + plt.show() + + def save_model(self, path_prefix): + os.makedirs(os.path.dirname(path_prefix), exist_ok=True) + torch.save(self.generator.state_dict(), path_prefix + "_G.pth") + torch.save(self.discriminator.state_dict(), path_prefix + "_D.pth") + + def load_model(self, path_prefix): + self.generator.load_state_dict(torch.load(path_prefix + "_G.pth", map_location=self.device)) + self.discriminator.load_state_dict(torch.load(path_prefix + "_D.pth", map_location=self.device)) diff --git a/ml-augmentation-toolkit_project/pipeline.py b/ml-augmentation-toolkit_project/pipeline.py new file mode 100644 index 0000000..b6a32be --- /dev/null +++ b/ml-augmentation-toolkit_project/pipeline.py @@ -0,0 +1,155 @@ +import os +import pandas as pd +from ml-augmentation-toolkit.mcmc_sampler import MCMCSampler +from ml-augmentation-toolkit.wgan_gp_generator import WGANGPRegressor +from ml-augmentation-toolkit.smogn_augmentor import SMOGNAugmentor +from ml-augmentation-toolkit.shap_analyzer import SHAPAnalyzer +from sklearn.preprocessing import StandardScaler + + +def main(config): + # Step 
0: 加载原始数据 + print("\n🔵 加载原始数据...") + original_df = pd.read_csv(config["original_data_path"]) + + # 初始化增强数据列表 + enhanced_datasets = [] + + # Step 1: MCMC采样(可选) + if config["use_mcmc"]: + print("\n🚀 Step 1: MCMC Sampling...") + mcmc_sampler = MCMCSampler( + data_path=config["original_data_path"], + trace_save_path=config["mcmc"]["trace_save_path"], + sample_save_path=config["mcmc"]["sample_save_path"], + draws=config["mcmc"]["draws"], + chains=config["mcmc"]["chains"], + cores=config["mcmc"]["cores"] + ) + mcmc_samples, _ = mcmc_sampler.run(plot=False) + enhanced_datasets.append(mcmc_samples) + print("✅ MCMC采样完成。") + + # Step 2: WGAN-GP生成(可选) + if config["use_wgan"]: + print("\n🚀 Step 2: WGAN-GP Generation...") + scaler_X = StandardScaler() + scaler_y = StandardScaler() + + X = original_df.drop(columns=[config["target_col"]]).values + y = original_df[config["target_col"]].values.reshape(-1, 1) + + X_scaled = scaler_X.fit_transform(X) + y_scaled = scaler_y.fit_transform(y) + + wgan_gp = WGANGPRegressor(latent_dim=config["wgan"]["latent_dim"]) + wgan_gp.fit(X_scaled, y_scaled, n_epochs=config["wgan"]["n_epochs"], batch_size=config["wgan"]["batch_size"]) + + generated_scaled = wgan_gp.generate(y_scaled, n_samples=config["wgan"]["n_generated_samples"]) + generated_X = scaler_X.inverse_transform(generated_scaled) + + generated_df = pd.DataFrame(generated_X, columns=original_df.columns.drop(config["target_col"])) + generated_df[config["target_col"]] = scaler_y.inverse_transform(y_scaled[:generated_df.shape[0]]).flatten() + + os.makedirs(os.path.dirname(config["wgan"]["save_path"]), exist_ok=True) + generated_df.to_csv(config["wgan"]["save_path"], index=False) + enhanced_datasets.append(generated_df) + print("✅ WGAN-GP生成完成。") + + # Step 3: SMOGN增强(可选) + if config["use_smogn"]: + print("\n🚀 Step 3: SMOGN Data Augmentation...") + smogn_augmentor = SMOGNAugmentor( + target_col=config["target_col"], + samp_method="balance", + save_path=config["smogn"]["save_path"] + ) + 
smogn_df = smogn_augmentor.fit_transform(original_df) + smogn_augmentor.save() + enhanced_datasets.append(smogn_df) + print("✅ SMOGN增强完成。") + + # Step 4: 整合所有增强数据 + print("\n🔵 整合增强数据...") + if enhanced_datasets: + all_data = pd.concat(enhanced_datasets, axis=0).drop_duplicates().reset_index(drop=True) + else: + print("⚠️ 未选择任何数据增强方法,仅使用原始数据。") + all_data = original_df.copy() + + print(f"总数据量: {all_data.shape[0]} 条") + + # Step 5: SHAP 可解释性分析 + print("\n🚀 Step 4: SHAP Analysis...") + shap_analyzer = SHAPAnalyzer( + target_col=config["target_col"], + feature_name_mapping=config.get("feature_name_mapping", {}), + random_state=42 + ) + test_data = pd.read_csv(config["shap"]["test_data_path"]) + + shap_analyzer.fit(train_data=all_data, test_data=test_data) + + shap_analyzer.save_feature_importance(config["shap"]["feature_importance_path"]) + shap_analyzer.save_shap_values(config["shap"]["shap_values_path"]) + shap_analyzer.save_shap_summary_plot(config["shap"]["shap_summary_plot_path"]) + shap_analyzer.save_interaction_heatmap(config["shap"]["interaction_heatmap_path"]) + shap_analyzer.save_interaction_strengths(config["shap"]["interaction_strength_path"]) + shap_analyzer.plot_dependence( + feature=config["shap"]["dependence_plot_feature"], + interaction_feature=config["shap"]["dependence_plot_interaction"], + path=config["shap"]["dependence_plot_path"] + ) + + print("\n🎯 Pipeline 完成!") + + +if __name__ == "__main__": + config = { + "original_data_path": "data/原始实验数据.csv", # 原始实验数据路径 + + "use_mcmc": True, # 是否启用 MCMC + "use_wgan": True, # 是否启用 WGAN + "use_smogn": False, # 是否启用 SMOGN + + "target_col": "Vol", + + "mcmc": { + "trace_save_path": "outputs/mcmc_trace.csv", + "sample_save_path": "outputs/mcmc_samples.csv", + "draws": 4000, + "chains": 4, + "cores": 8 + }, + + "wgan": { + "latent_dim": 11, + "n_epochs": 3000, + "batch_size": 64, + "n_generated_samples": 1000, + "save_path": "outputs/wgan_generated.csv" + }, + + "smogn": { + "save_path": 
"outputs/smogn_augmented.csv" + }, + + "shap": { + "test_data_path": "data/原始实验数据.csv", + "feature_importance_path": "outputs/shap_feature_importance.csv", + "shap_values_path": "outputs/shap_values.csv", + "shap_summary_plot_path": "outputs/shap_summary_plot.png", + "interaction_heatmap_path": "outputs/interaction_heatmap.png", + "interaction_strength_path": "outputs/global_interaction_strength.csv", + "dependence_plot_feature": "Ti", + "dependence_plot_interaction": "Ta", + "dependence_plot_path": "outputs/Ti_Ta_dependence_plot.png" + }, + + "feature_name_mapping": { + "Co": "Co", "Al": "Al", "W": "W", "Ta": "Ta", "Ti": "Ti", "Nb": "Nb", "Ni": "Ni", "Cr": "Cr", "V": "V", "Mo": "Mo", + "Tage": r"$T_{\mathrm{age}}$", "tage": r"$t_{\mathrm{age}}$" + } + } + + main(config) diff --git a/ml-augmentation-toolkit_project/requirements.txt b/ml-augmentation-toolkit_project/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/ml-augmentation-toolkit_project/setup.py b/ml-augmentation-toolkit_project/setup.py new file mode 100644 index 0000000..349871e --- /dev/null +++ b/ml-augmentation-toolkit_project/setup.py @@ -0,0 +1,31 @@ +from setuptools import setup, find_packages + +setup( + name='alloyxai', + version='0.1.0', + description="An integrated machine learning pipeline for advanced data augmentation and model interpretability in high-temperature alloy research.", + author_email='linlinsun1010@163.com', + url='https://github.com/003sunshine/alloyxai', + packages=find_packages(), + include_package_data=True, + install_requires=[ + 'numpy>=1.20.0', + 'pandas>=1.3.0', + 'matplotlib>=3.4.0', + 'scikit-learn>=1.0.0', + 'xgboost>=1.5.0', + 'shap>=0.41.0', + 'smogn>=0.1.2', + 'pymc>=5.0.0', + 'arviz>=0.12.0', + 'torch>=1.9.0' + ], + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: 
Artificial Intelligence", + ], + python_requires='>=3.8', +)