import numpy as np import matplotlib.pyplot as plt import time from typing import Dict, List, Tuple from utils import train_test_split, normalize_data, accuracy_score, cross_validation from feature_extraction import PCA, FeatureSelector from improved_bp import ImprovedBPNetwork, StandardBPNetwork from classifiers import NaiveBayesClassifier, KNNClassifier, DecisionTreeClassifier from ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier class ExperimentRunner: """实验运行器""" def __init__(self): self.results = {} def generate_synthetic_data(self, n_samples: int = 1000, n_features: int = 20, n_classes: int = 3, random_state: int = 42) -> Tuple[np.ndarray, np.ndarray]: """生成合成数据集""" np.random.seed(random_state) # 为每个类生成不同的均值和协方差 class_means = np.random.randn(n_classes, n_features) * 2 X = [] y = [] samples_per_class = n_samples // n_classes for class_idx in range(n_classes): # 生成该类的数据 class_data = np.random.randn(samples_per_class, n_features) + class_means[class_idx] X.append(class_data) y.extend([class_idx] * samples_per_class) X = np.vstack(X) y = np.array(y) # 添加噪声特征 noise_features = np.random.randn(len(X), n_features // 2) X = np.hstack([X, noise_features]) return X, y def run_bp_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str): """运行BP算法比较实验""" print(f"\n=== BP算法比较实验 - {dataset_name} ===") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) X_train_norm, X_test_norm = normalize_data(X_train, X_test) # 改进的BP网络 print("训练改进的BP网络...") start_time = time.time() improved_bp = ImprovedBPNetwork(hidden_layers=[10, 5], learning_rate=0.01, max_epochs=500) improved_bp.fit(X_train_norm, y_train) improved_train_time = time.time() - start_time y_pred_improved = improved_bp.predict(X_test_norm) improved_accuracy = accuracy_score(y_test, y_pred_improved) # 标准BP网络 print("训练标准BP网络...") start_time = time.time() standard_bp = StandardBPNetwork(hidden_layers=[10, 5], learning_rate=0.01, max_epochs=500) standard_bp.fit(X_train_norm, y_train) standard_train_time = time.time() - start_time y_pred_standard = standard_bp.predict(X_test_norm) standard_accuracy = accuracy_score(y_test, y_pred_standard) # 结果 print(f"改进BP - 准确率: {improved_accuracy:.4f}, 训练时间: {improved_train_time:.2f}s") print(f"标准BP - 准确率: {standard_accuracy:.4f}, 训练时间: {standard_train_time:.2f}s") # 绘制损失曲线 plt.figure(figsize=(10, 6)) plt.plot(improved_bp.loss_history, label='改进BP', alpha=0.8) plt.plot(standard_bp.loss_history, label='标准BP', alpha=0.8) plt.xlabel('训练轮次') plt.ylabel('损失') plt.title(f'BP算法损失曲线对比 - {dataset_name}') plt.legend() plt.grid(True) plt.savefig(f'c:/Users/grtsi/ml-homework/bp_comparison_{dataset_name.lower()}.png') plt.show() return { 'improved_bp': {'accuracy': improved_accuracy, 'time': improved_train_time}, 'standard_bp': {'accuracy': standard_accuracy, 'time': standard_train_time} } def run_feature_extraction_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str): """运行特征提取比较实验""" print(f"\n=== 特征提取比较实验 - {dataset_name} ===") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) X_train_norm, X_test_norm = normalize_data(X_train, X_test) classifiers = { 'NaiveBayes': NaiveBayesClassifier(), 'KNN': KNNClassifier(k=5), 'DecisionTree': DecisionTreeClassifier(max_depth=8) } results = {} for clf_name, clf in classifiers.items(): print(f"\n{clf_name} 分类器:") # 无特征提取 clf_no_fe = type(clf)() if clf_name != 'KNN' else KNNClassifier(k=5) clf_no_fe.fit(X_train_norm, y_train) y_pred_no_fe = clf_no_fe.predict(X_test_norm) acc_no_fe = accuracy_score(y_test, y_pred_no_fe) # PCA特征提取 pca = PCA(n_components=min(10, X.shape[1] // 2)) X_train_pca = pca.fit_transform(X_train_norm) X_test_pca = pca.transform(X_test_norm) clf_pca = type(clf)() if clf_name != 'KNN' else KNNClassifier(k=5) clf_pca.fit(X_train_pca, y_train) y_pred_pca = clf_pca.predict(X_test_pca) acc_pca = accuracy_score(y_test, y_pred_pca) # 特征选择 feature_selector = FeatureSelector(k=min(10, X.shape[1] // 2)) X_train_fs = feature_selector.fit_transform(X_train_norm, y_train) X_test_fs = feature_selector.transform(X_test_norm) clf_fs = type(clf)() if clf_name != 'KNN' else KNNClassifier(k=5) clf_fs.fit(X_train_fs, y_train) y_pred_fs = clf_fs.predict(X_test_fs) acc_fs = accuracy_score(y_test, y_pred_fs) print(f" 无特征提取: {acc_no_fe:.4f}") print(f" PCA特征提取: {acc_pca:.4f}") print(f" 特征选择: {acc_fs:.4f}") results[clf_name] = { 'no_feature_extraction': acc_no_fe, 'pca': acc_pca, 'feature_selection': acc_fs } return results def run_classifier_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str): """运行分类器比较实验""" print(f"\n=== 分类器比较实验 - {dataset_name} ===") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) X_train_norm, X_test_norm = normalize_data(X_train, X_test) classifiers = { 'NaiveBayes': NaiveBayesClassifier(), 'KNN': KNNClassifier(k=5), 'DecisionTree': DecisionTreeClassifier(max_depth=8) } results = {} for clf_name, clf in classifiers.items(): print(f"\n{clf_name} 分类器:") # 训练和测试 start_time = time.time() clf.fit(X_train_norm, y_train) train_time = time.time() - start_time y_pred = clf.predict(X_test_norm) accuracy = accuracy_score(y_test, y_pred) # 交叉验证 cv_scores = cross_validation(type(clf)() if clf_name != 'KNN' else KNNClassifier(k=5), X_train_norm, y_train, k=5) print(f" 准确率: {accuracy:.4f}") print(f" 训练时间: {train_time:.4f}s") print(f" 交叉验证均值: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}") results[clf_name] = { 'accuracy': accuracy, 'train_time': train_time, 'cv_mean': np.mean(cv_scores), 'cv_std': np.std(cv_scores) } return results def run_ensemble_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str): """运行集成算法比较实验""" print(f"\n=== 集成算法比较实验 - {dataset_name} ===") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) X_train_norm, X_test_norm = normalize_data(X_train, X_test) # 基础分类器 base_classifiers = { 'DecisionTree': DecisionTreeClassifier(max_depth=8), 'NaiveBayes': NaiveBayesClassifier(), 'KNN': KNNClassifier(k=5) } # 集成分类器 ensemble_classifiers = { 'Bagging_DT': BaggingClassifier('decision_tree', n_estimators=10), 'Voting': VotingClassifier([ DecisionTreeClassifier(max_depth=8), NaiveBayesClassifier(), KNNClassifier(k=5) ]) } results = {} # 测试基础分类器 print("基础分类器:") for clf_name, clf in base_classifiers.items(): start_time = time.time() clf.fit(X_train_norm, y_train) train_time = time.time() - start_time y_pred = clf.predict(X_test_norm) accuracy = accuracy_score(y_test, y_pred) print(f" {clf_name}: {accuracy:.4f} (训练时间: {train_time:.4f}s)") results[clf_name] = {'accuracy': accuracy, 'train_time': train_time} # 测试集成分类器 print("\n集成分类器:") for clf_name, clf in ensemble_classifiers.items(): start_time = time.time() clf.fit(X_train_norm, y_train) train_time = time.time() - start_time y_pred = clf.predict(X_test_norm) accuracy = accuracy_score(y_test, y_pred) print(f" {clf_name}: {accuracy:.4f} (训练时间: {train_time:.4f}s)") results[clf_name] = {'accuracy': accuracy, 'train_time': train_time} return results def run_all_experiments(self): """运行所有实验""" print("开始机器学习算法比较实验...") # 生成两个不同的数据集 print("生成数据集...") X1, y1 = self.generate_synthetic_data(n_samples=800, n_features=20, n_classes=3, random_state=42) X2, y2 = self.generate_synthetic_data(n_samples=1000, n_features=25, n_classes=4, random_state=123) datasets = [ (X1, y1, "Dataset1"), (X2, y2, "Dataset2") ] all_results = {} for X, y, dataset_name in datasets: print(f"\n{'='*50}") print(f"处理数据集: {dataset_name}") print(f"样本数: {X.shape[0]}, 特征数: {X.shape[1]}, 类别数: {len(np.unique(y))}") # 运行各种实验 bp_results = self.run_bp_comparison(X, y, dataset_name) fe_results = self.run_feature_extraction_comparison(X, y, dataset_name) clf_results = self.run_classifier_comparison(X, y, dataset_name) ensemble_results = self.run_ensemble_comparison(X, y, dataset_name) all_results[dataset_name] = { 'bp_comparison': bp_results, 'feature_extraction': fe_results, 'classifier_comparison': clf_results, 'ensemble_comparison': ensemble_results } # 生成总结报告 self.generate_summary_report(all_results) return all_results def generate_summary_report(self, results: Dict): """生成总结报告""" print(f"\n{'='*60}") print("实验总结报告") print(f"{'='*60}") for dataset_name, dataset_results in results.items(): print(f"\n{dataset_name} 结果总结:") print("-" * 40) # BP算法比较 bp_results = dataset_results['bp_comparison'] print(f"BP算法比较:") print(f" 改进BP: 准确率 {bp_results['improved_bp']['accuracy']:.4f}, 时间 {bp_results['improved_bp']['time']:.2f}s") print(f" 标准BP: 准确率 {bp_results['standard_bp']['accuracy']:.4f}, 时间 {bp_results['standard_bp']['time']:.2f}s") # 特征提取比较 fe_results = dataset_results['feature_extraction'] print(f"\n特征提取效果 (最佳结果):") for clf_name, clf_results in fe_results.items(): best_method = max(clf_results, key=clf_results.get) best_acc = clf_results[best_method] print(f" {clf_name}: {best_method} ({best_acc:.4f})") # 集成算法比较 ensemble_results = dataset_results['ensemble_comparison'] print(f"\n分类器性能排名:") sorted_classifiers = sorted(ensemble_results.items(), key=lambda x: x[1]['accuracy'], reverse=True) for i, (clf_name, clf_result) in enumerate(sorted_classifiers[:5]): print(f" {i+1}. {clf_name}: {clf_result['accuracy']:.4f}") if __name__ == "__main__": runner = ExperimentRunner() results = runner.run_all_experiments()