# ml-homework/experiments.py — machine-learning algorithm comparison experiments
# (file metadata: ~320 lines, ~13 KiB, Python)
import numpy as np
import matplotlib.pyplot as plt
import time
from typing import Dict, List, Tuple
from utils import train_test_split, normalize_data, accuracy_score, cross_validation
from feature_extraction import PCA, FeatureSelector
from improved_bp import ImprovedBPNetwork, StandardBPNetwork
from classifiers import NaiveBayesClassifier, KNNClassifier, DecisionTreeClassifier
from ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier
class ExperimentRunner:
    """Runs and summarizes the machine-learning comparison experiments.

    Four studies are executed per dataset: improved vs. standard BP
    networks, feature extraction (PCA / feature selection), individual
    classifiers, and ensemble methods. Progress is printed to stdout
    (the Chinese report labels from the original are kept verbatim).
    """

    def __init__(self):
        # Reserved for cross-run bookkeeping; the run_* methods currently
        # return their results directly instead of storing them here.
        self.results = {}

    @staticmethod
    def _fresh_classifier(clf_name: str, clf):
        """Return a new, untrained classifier with the experiment's hyperparameters.

        The original code used ``type(clf)()`` for everything except KNN,
        which silently reset DecisionTree's ``max_depth=8`` back to its
        default. Reconstruct each classifier with the same settings used
        throughout the experiments.
        """
        if clf_name == 'KNN':
            return KNNClassifier(k=5)
        if clf_name == 'DecisionTree':
            return DecisionTreeClassifier(max_depth=8)
        return type(clf)()

    def generate_synthetic_data(self, n_samples: int = 1000, n_features: int = 20,
                                n_classes: int = 3,
                                random_state: int = 42) -> Tuple[np.ndarray, np.ndarray]:
        """Generate a Gaussian-blob classification dataset with noise features.

        Each class is an isotropic Gaussian around a random mean vector;
        ``n_features // 2`` pure-noise columns are appended, so the returned
        X has ``n_features + n_features // 2`` columns. When ``n_samples``
        is not divisible by ``n_classes`` the remainder is dropped.

        Returns:
            (X, y): feature matrix and integer class labels in [0, n_classes).
        """
        np.random.seed(random_state)
        # One random mean vector per class, spread by a factor of 2 so the
        # blobs are separable but overlapping.
        class_means = np.random.randn(n_classes, n_features) * 2
        X = []
        y = []
        samples_per_class = n_samples // n_classes
        for class_idx in range(n_classes):
            class_data = np.random.randn(samples_per_class, n_features) + class_means[class_idx]
            X.append(class_data)
            y.extend([class_idx] * samples_per_class)
        X = np.vstack(X)
        y = np.array(y)
        # Append uninformative noise features so feature extraction has
        # something meaningful to remove.
        noise_features = np.random.randn(len(X), n_features // 2)
        X = np.hstack([X, noise_features])
        return X, y

    def run_bp_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str) -> Dict:
        """Compare the improved vs. standard BP network on one dataset.

        Trains both networks on a 70/30 split, prints accuracy/time, saves
        a loss-curve figure, and returns
        ``{'improved_bp'|'standard_bp': {'accuracy', 'time'}}``.
        """
        print(f"\n=== BP算法比较实验 - {dataset_name} ===")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        # Improved BP network.
        print("训练改进的BP网络...")
        start_time = time.time()
        improved_bp = ImprovedBPNetwork(hidden_layers=[10, 5], learning_rate=0.01, max_epochs=500)
        improved_bp.fit(X_train_norm, y_train)
        improved_train_time = time.time() - start_time
        y_pred_improved = improved_bp.predict(X_test_norm)
        improved_accuracy = accuracy_score(y_test, y_pred_improved)

        # Standard BP network (same architecture/hyperparameters for a fair comparison).
        print("训练标准BP网络...")
        start_time = time.time()
        standard_bp = StandardBPNetwork(hidden_layers=[10, 5], learning_rate=0.01, max_epochs=500)
        standard_bp.fit(X_train_norm, y_train)
        standard_train_time = time.time() - start_time
        y_pred_standard = standard_bp.predict(X_test_norm)
        standard_accuracy = accuracy_score(y_test, y_pred_standard)

        print(f"改进BP - 准确率: {improved_accuracy:.4f}, 训练时间: {improved_train_time:.2f}s")
        print(f"标准BP - 准确率: {standard_accuracy:.4f}, 训练时间: {standard_train_time:.2f}s")

        # Plot both training-loss histories for a visual convergence comparison.
        plt.figure(figsize=(10, 6))
        plt.plot(improved_bp.loss_history, label='改进BP', alpha=0.8)
        plt.plot(standard_bp.loss_history, label='标准BP', alpha=0.8)
        plt.xlabel('训练轮次')
        plt.ylabel('损失')
        plt.title(f'BP算法损失曲线对比 - {dataset_name}')
        plt.legend()
        plt.grid(True)
        # FIX: the original hard-coded an absolute per-user Windows path
        # ('c:/Users/grtsi/...'); save relative to the working directory instead.
        plt.savefig(f'bp_comparison_{dataset_name.lower()}.png')
        plt.show()

        return {
            'improved_bp': {'accuracy': improved_accuracy, 'time': improved_train_time},
            'standard_bp': {'accuracy': standard_accuracy, 'time': standard_train_time}
        }

    def run_feature_extraction_comparison(self, X: np.ndarray, y: np.ndarray,
                                          dataset_name: str) -> Dict:
        """Measure each classifier's accuracy with no/PCA/selected features.

        Returns ``{clf_name: {'no_feature_extraction', 'pca', 'feature_selection'}}``
        mapping to test-set accuracies.
        """
        print(f"\n=== 特征提取比较实验 - {dataset_name} ===")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        classifiers = {
            'NaiveBayes': NaiveBayesClassifier(),
            'KNN': KNNClassifier(k=5),
            'DecisionTree': DecisionTreeClassifier(max_depth=8)
        }
        results = {}
        for clf_name, clf in classifiers.items():
            print(f"\n{clf_name} 分类器:")

            # Baseline: all (normalized) features, no extraction.
            clf_no_fe = self._fresh_classifier(clf_name, clf)
            clf_no_fe.fit(X_train_norm, y_train)
            y_pred_no_fe = clf_no_fe.predict(X_test_norm)
            acc_no_fe = accuracy_score(y_test, y_pred_no_fe)

            # PCA projection (fit on train only to avoid test leakage).
            pca = PCA(n_components=min(10, X.shape[1] // 2))
            X_train_pca = pca.fit_transform(X_train_norm)
            X_test_pca = pca.transform(X_test_norm)
            clf_pca = self._fresh_classifier(clf_name, clf)
            clf_pca.fit(X_train_pca, y_train)
            y_pred_pca = clf_pca.predict(X_test_pca)
            acc_pca = accuracy_score(y_test, y_pred_pca)

            # Supervised feature selection (also fit on train only).
            feature_selector = FeatureSelector(k=min(10, X.shape[1] // 2))
            X_train_fs = feature_selector.fit_transform(X_train_norm, y_train)
            X_test_fs = feature_selector.transform(X_test_norm)
            clf_fs = self._fresh_classifier(clf_name, clf)
            clf_fs.fit(X_train_fs, y_train)
            y_pred_fs = clf_fs.predict(X_test_fs)
            acc_fs = accuracy_score(y_test, y_pred_fs)

            print(f" 无特征提取: {acc_no_fe:.4f}")
            print(f" PCA特征提取: {acc_pca:.4f}")
            print(f" 特征选择: {acc_fs:.4f}")
            results[clf_name] = {
                'no_feature_extraction': acc_no_fe,
                'pca': acc_pca,
                'feature_selection': acc_fs
            }
        return results

    def run_classifier_comparison(self, X: np.ndarray, y: np.ndarray,
                                  dataset_name: str) -> Dict:
        """Benchmark the three base classifiers: holdout accuracy, time, 5-fold CV.

        Returns ``{clf_name: {'accuracy', 'train_time', 'cv_mean', 'cv_std'}}``.
        """
        print(f"\n=== 分类器比较实验 - {dataset_name} ===")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        classifiers = {
            'NaiveBayes': NaiveBayesClassifier(),
            'KNN': KNNClassifier(k=5),
            'DecisionTree': DecisionTreeClassifier(max_depth=8)
        }
        results = {}
        for clf_name, clf in classifiers.items():
            print(f"\n{clf_name} 分类器:")

            # Holdout train/test timing and accuracy.
            start_time = time.time()
            clf.fit(X_train_norm, y_train)
            train_time = time.time() - start_time
            y_pred = clf.predict(X_test_norm)
            accuracy = accuracy_score(y_test, y_pred)

            # Cross-validate a fresh clone so the fitted model above is untouched.
            cv_scores = cross_validation(self._fresh_classifier(clf_name, clf),
                                         X_train_norm, y_train, k=5)

            print(f" 准确率: {accuracy:.4f}")
            print(f" 训练时间: {train_time:.4f}s")
            print(f" 交叉验证均值: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
            results[clf_name] = {
                'accuracy': accuracy,
                'train_time': train_time,
                'cv_mean': np.mean(cv_scores),
                'cv_std': np.std(cv_scores)
            }
        return results

    def run_ensemble_comparison(self, X: np.ndarray, y: np.ndarray,
                                dataset_name: str) -> Dict:
        """Compare base classifiers against bagging and voting ensembles.

        Returns ``{clf_name: {'accuracy', 'train_time'}}`` for every base and
        ensemble classifier tested.
        """
        print(f"\n=== 集成算法比较实验 - {dataset_name} ===")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        base_classifiers = {
            'DecisionTree': DecisionTreeClassifier(max_depth=8),
            'NaiveBayes': NaiveBayesClassifier(),
            'KNN': KNNClassifier(k=5)
        }
        ensemble_classifiers = {
            'Bagging_DT': BaggingClassifier('decision_tree', n_estimators=10),
            'Voting': VotingClassifier([
                DecisionTreeClassifier(max_depth=8),
                NaiveBayesClassifier(),
                KNNClassifier(k=5)
            ])
        }

        results = {}

        def _evaluate(clf) -> Tuple[float, float]:
            # Fit/predict once; return (accuracy, train_time).
            start_time = time.time()
            clf.fit(X_train_norm, y_train)
            train_time = time.time() - start_time
            y_pred = clf.predict(X_test_norm)
            return accuracy_score(y_test, y_pred), train_time

        print("基础分类器:")
        for clf_name, clf in base_classifiers.items():
            accuracy, train_time = _evaluate(clf)
            print(f" {clf_name}: {accuracy:.4f} (训练时间: {train_time:.4f}s)")
            results[clf_name] = {'accuracy': accuracy, 'train_time': train_time}

        print("\n集成分类器:")
        for clf_name, clf in ensemble_classifiers.items():
            accuracy, train_time = _evaluate(clf)
            print(f" {clf_name}: {accuracy:.4f} (训练时间: {train_time:.4f}s)")
            results[clf_name] = {'accuracy': accuracy, 'train_time': train_time}

        return results

    def run_all_experiments(self) -> Dict:
        """Generate two synthetic datasets and run every experiment on each.

        Returns ``{dataset_name: {experiment_key: results}}`` and prints a
        summary report at the end.
        """
        print("开始机器学习算法比较实验...")

        print("生成数据集...")
        X1, y1 = self.generate_synthetic_data(n_samples=800, n_features=20, n_classes=3, random_state=42)
        X2, y2 = self.generate_synthetic_data(n_samples=1000, n_features=25, n_classes=4, random_state=123)
        datasets = [
            (X1, y1, "Dataset1"),
            (X2, y2, "Dataset2")
        ]

        all_results = {}
        for X, y, dataset_name in datasets:
            print(f"\n{'='*50}")
            print(f"处理数据集: {dataset_name}")
            print(f"样本数: {X.shape[0]}, 特征数: {X.shape[1]}, 类别数: {len(np.unique(y))}")

            bp_results = self.run_bp_comparison(X, y, dataset_name)
            fe_results = self.run_feature_extraction_comparison(X, y, dataset_name)
            clf_results = self.run_classifier_comparison(X, y, dataset_name)
            ensemble_results = self.run_ensemble_comparison(X, y, dataset_name)

            all_results[dataset_name] = {
                'bp_comparison': bp_results,
                'feature_extraction': fe_results,
                'classifier_comparison': clf_results,
                'ensemble_comparison': ensemble_results
            }

        self.generate_summary_report(all_results)
        return all_results

    def generate_summary_report(self, results: Dict) -> None:
        """Print a per-dataset summary: BP comparison, best feature-extraction
        method per classifier, and a top-5 ranking of the ensemble-experiment
        classifiers by accuracy."""
        print(f"\n{'='*60}")
        print("实验总结报告")
        print(f"{'='*60}")

        for dataset_name, dataset_results in results.items():
            print(f"\n{dataset_name} 结果总结:")
            print("-" * 40)

            bp_results = dataset_results['bp_comparison']
            print(f"BP算法比较:")
            print(f" 改进BP: 准确率 {bp_results['improved_bp']['accuracy']:.4f}, 时间 {bp_results['improved_bp']['time']:.2f}s")
            print(f" 标准BP: 准确率 {bp_results['standard_bp']['accuracy']:.4f}, 时间 {bp_results['standard_bp']['time']:.2f}s")

            fe_results = dataset_results['feature_extraction']
            print(f"\n特征提取效果 (最佳结果):")
            for clf_name, clf_results in fe_results.items():
                # Pick the method (dict key) with the highest accuracy.
                best_method = max(clf_results, key=clf_results.get)
                best_acc = clf_results[best_method]
                print(f" {clf_name}: {best_method} ({best_acc:.4f})")

            ensemble_results = dataset_results['ensemble_comparison']
            print(f"\n分类器性能排名:")
            sorted_classifiers = sorted(ensemble_results.items(),
                                        key=lambda x: x[1]['accuracy'], reverse=True)
            for i, (clf_name, clf_result) in enumerate(sorted_classifiers[:5]):
                print(f" {i+1}. {clf_name}: {clf_result['accuracy']:.4f}")
if __name__ == "__main__":
    # Script entry point: run every comparison experiment in sequence.
    results = ExperimentRunner().run_all_experiments()