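"""Experiment driver: compares BP network variants, feature extraction methods,
individual classifiers, and ensemble methods on synthetic datasets.

The model and helper classes come from the accompanying course modules
(utils, feature_extraction, improved_bp, classifiers, ensemble).
"""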
import numpy as np
import matplotlib.pyplot as plt
import time
from typing import Dict, List, Tuple

from utils import train_test_split, normalize_data, accuracy_score, cross_validation
from feature_extraction import PCA, FeatureSelector
from improved_bp import ImprovedBPNetwork, StandardBPNetwork
from classifiers import NaiveBayesClassifier, KNNClassifier, DecisionTreeClassifier
from ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier


class ExperimentRunner:
    """Experiment runner for the full suite of comparisons."""

    def __init__(self):
        self.results = {}

    def generate_synthetic_data(self, n_samples: int = 1000, n_features: int = 20, n_classes: int = 3,
                                random_state: int = 42) -> Tuple[np.ndarray, np.ndarray]:
        """Generate a synthetic dataset: one Gaussian cluster per class, plus noise features."""
        np.random.seed(random_state)

        # Draw a distinct mean for each class (all classes share identity covariance)
        class_means = np.random.randn(n_classes, n_features) * 2
        X = []
        y = []

        samples_per_class = n_samples // n_classes

        for class_idx in range(n_classes):
            # Sample this class's points around its mean
            class_data = np.random.randn(samples_per_class, n_features) + class_means[class_idx]
            X.append(class_data)
            y.extend([class_idx] * samples_per_class)

        X = np.vstack(X)
        y = np.array(y)

        # Append uninformative noise features
        noise_features = np.random.randn(len(X), n_features // 2)
        X = np.hstack([X, noise_features])

        return X, y

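    # Shape note: the returned X has n_features + n_features // 2 columns, and
    # n_samples is rounded down to a multiple of n_classes. For example,
    # generate_synthetic_data(n_samples=900, n_features=20, n_classes=3)
    # yields X of shape (900, 30) with 300 samples per class.
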
    def run_bp_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str):
        """Compare the improved BP network against the standard BP network."""
        print(f"\n=== BP Algorithm Comparison - {dataset_name} ===")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        # Improved BP network
        print("Training the improved BP network...")
        start_time = time.time()
        improved_bp = ImprovedBPNetwork(hidden_layers=[10, 5], learning_rate=0.01, max_epochs=500)
        improved_bp.fit(X_train_norm, y_train)
        improved_train_time = time.time() - start_time

        y_pred_improved = improved_bp.predict(X_test_norm)
        improved_accuracy = accuracy_score(y_test, y_pred_improved)

        # Standard BP network
        print("Training the standard BP network...")
        start_time = time.time()
        standard_bp = StandardBPNetwork(hidden_layers=[10, 5], learning_rate=0.01, max_epochs=500)
        standard_bp.fit(X_train_norm, y_train)
        standard_train_time = time.time() - start_time

        y_pred_standard = standard_bp.predict(X_test_norm)
        standard_accuracy = accuracy_score(y_test, y_pred_standard)

        # Results
        print(f"Improved BP - accuracy: {improved_accuracy:.4f}, training time: {improved_train_time:.2f}s")
        print(f"Standard BP - accuracy: {standard_accuracy:.4f}, training time: {standard_train_time:.2f}s")

        # Plot the training loss curves
        plt.figure(figsize=(10, 6))
        plt.plot(improved_bp.loss_history, label='Improved BP', alpha=0.8)
        plt.plot(standard_bp.loss_history, label='Standard BP', alpha=0.8)
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title(f'BP Loss Curve Comparison - {dataset_name}')
        plt.legend()
        plt.grid(True)
        # Save into the current working directory instead of a machine-specific absolute path
        plt.savefig(f'bp_comparison_{dataset_name.lower()}.png')
        plt.show()

        return {
            'improved_bp': {'accuracy': improved_accuracy, 'time': improved_train_time},
            'standard_bp': {'accuracy': standard_accuracy, 'time': standard_train_time}
        }

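    # Note: the plot in run_bp_comparison relies on both network classes
    # recording their per-epoch training loss in a `loss_history` attribute
    # populated during fit().
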
    def run_feature_extraction_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str):
        """Compare classifier accuracy with and without feature extraction."""
        print(f"\n=== Feature Extraction Comparison - {dataset_name} ===")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        # Factories so each variant trains a freshly initialised classifier;
        # re-instantiating via type(clf)() would silently drop constructor
        # arguments such as max_depth=8.
        classifier_factories = {
            'NaiveBayes': lambda: NaiveBayesClassifier(),
            'KNN': lambda: KNNClassifier(k=5),
            'DecisionTree': lambda: DecisionTreeClassifier(max_depth=8)
        }

        results = {}

        for clf_name, make_clf in classifier_factories.items():
            print(f"\n{clf_name} classifier:")

            # No feature extraction
            clf_no_fe = make_clf()
            clf_no_fe.fit(X_train_norm, y_train)
            y_pred_no_fe = clf_no_fe.predict(X_test_norm)
            acc_no_fe = accuracy_score(y_test, y_pred_no_fe)

            # PCA feature extraction
            pca = PCA(n_components=min(10, X.shape[1] // 2))
            X_train_pca = pca.fit_transform(X_train_norm)
            X_test_pca = pca.transform(X_test_norm)

            clf_pca = make_clf()
            clf_pca.fit(X_train_pca, y_train)
            y_pred_pca = clf_pca.predict(X_test_pca)
            acc_pca = accuracy_score(y_test, y_pred_pca)

            # Feature selection
            feature_selector = FeatureSelector(k=min(10, X.shape[1] // 2))
            X_train_fs = feature_selector.fit_transform(X_train_norm, y_train)
            X_test_fs = feature_selector.transform(X_test_norm)

            clf_fs = make_clf()
            clf_fs.fit(X_train_fs, y_train)
            y_pred_fs = clf_fs.predict(X_test_fs)
            acc_fs = accuracy_score(y_test, y_pred_fs)

            print(f"  No feature extraction: {acc_no_fe:.4f}")
            print(f"  PCA: {acc_pca:.4f}")
            print(f"  Feature selection: {acc_fs:.4f}")

            results[clf_name] = {
                'no_feature_extraction': acc_no_fe,
                'pca': acc_pca,
                'feature_selection': acc_fs
            }

        return results

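    # PCA and FeatureSelector above are configured with the same output
    # dimensionality, min(10, n_features // 2), so the three accuracies per
    # classifier are directly comparable.
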
    def run_classifier_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str):
        """Compare the individual classifiers on a held-out split and via cross-validation."""
        print(f"\n=== Classifier Comparison - {dataset_name} ===")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        # Factories keep constructor arguments (e.g. max_depth=8) when a fresh
        # instance is needed for cross-validation
        classifier_factories = {
            'NaiveBayes': lambda: NaiveBayesClassifier(),
            'KNN': lambda: KNNClassifier(k=5),
            'DecisionTree': lambda: DecisionTreeClassifier(max_depth=8)
        }

        results = {}

        for clf_name, make_clf in classifier_factories.items():
            print(f"\n{clf_name} classifier:")

            # Train and test
            clf = make_clf()
            start_time = time.time()
            clf.fit(X_train_norm, y_train)
            train_time = time.time() - start_time

            y_pred = clf.predict(X_test_norm)
            accuracy = accuracy_score(y_test, y_pred)

            # Cross-validation on a fresh instance
            cv_scores = cross_validation(make_clf(), X_train_norm, y_train, k=5)

            print(f"  Accuracy: {accuracy:.4f}")
            print(f"  Training time: {train_time:.4f}s")
            print(f"  Cross-validation mean: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

            results[clf_name] = {
                'accuracy': accuracy,
                'train_time': train_time,
                'cv_mean': np.mean(cv_scores),
                'cv_std': np.std(cv_scores)
            }

        return results

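    # Cross-validation in run_classifier_comparison uses only the training
    # split, so the test split stays held out for the reported accuracy.
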
    def run_ensemble_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str):
        """Compare ensemble methods against their base classifiers."""
        print(f"\n=== Ensemble Comparison - {dataset_name} ===")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        # Base classifiers
        base_classifiers = {
            'DecisionTree': DecisionTreeClassifier(max_depth=8),
            'NaiveBayes': NaiveBayesClassifier(),
            'KNN': KNNClassifier(k=5)
        }

        # Ensemble classifiers
        ensemble_classifiers = {
            'Bagging_DT': BaggingClassifier('decision_tree', n_estimators=10),
            'Voting': VotingClassifier([
                DecisionTreeClassifier(max_depth=8),
                NaiveBayesClassifier(),
                KNNClassifier(k=5)
            ])
        }

        results = {}

        # Evaluate the base classifiers
        print("Base classifiers:")
        for clf_name, clf in base_classifiers.items():
            start_time = time.time()
            clf.fit(X_train_norm, y_train)
            train_time = time.time() - start_time

            y_pred = clf.predict(X_test_norm)
            accuracy = accuracy_score(y_test, y_pred)

            print(f"  {clf_name}: {accuracy:.4f} (training time: {train_time:.4f}s)")
            results[clf_name] = {'accuracy': accuracy, 'train_time': train_time}

        # Evaluate the ensemble classifiers
        print("\nEnsemble classifiers:")
        for clf_name, clf in ensemble_classifiers.items():
            start_time = time.time()
            clf.fit(X_train_norm, y_train)
            train_time = time.time() - start_time

            y_pred = clf.predict(X_test_norm)
            accuracy = accuracy_score(y_test, y_pred)

            print(f"  {clf_name}: {accuracy:.4f} (training time: {train_time:.4f}s)")
            results[clf_name] = {'accuracy': accuracy, 'train_time': train_time}

        return results

    def run_all_experiments(self):
        """Run every experiment on both synthetic datasets."""
        print("Starting the machine learning algorithm comparison experiments...")

        # Generate two different datasets
        print("Generating datasets...")
        X1, y1 = self.generate_synthetic_data(n_samples=800, n_features=20, n_classes=3, random_state=42)
        X2, y2 = self.generate_synthetic_data(n_samples=1000, n_features=25, n_classes=4, random_state=123)

        datasets = [
            (X1, y1, "Dataset1"),
            (X2, y2, "Dataset2")
        ]

        all_results = {}

        for X, y, dataset_name in datasets:
            print(f"\n{'='*50}")
            print(f"Processing dataset: {dataset_name}")
            print(f"Samples: {X.shape[0]}, features: {X.shape[1]}, classes: {len(np.unique(y))}")

            # Run each experiment
            bp_results = self.run_bp_comparison(X, y, dataset_name)
            fe_results = self.run_feature_extraction_comparison(X, y, dataset_name)
            clf_results = self.run_classifier_comparison(X, y, dataset_name)
            ensemble_results = self.run_ensemble_comparison(X, y, dataset_name)

            all_results[dataset_name] = {
                'bp_comparison': bp_results,
                'feature_extraction': fe_results,
                'classifier_comparison': clf_results,
                'ensemble_comparison': ensemble_results
            }

        # Generate the summary report
        self.generate_summary_report(all_results)

        return all_results

    def generate_summary_report(self, results: Dict):
        """Print a summary report across all datasets."""
        print(f"\n{'='*60}")
        print("Experiment Summary Report")
        print(f"{'='*60}")

        for dataset_name, dataset_results in results.items():
            print(f"\n{dataset_name} summary:")
            print("-" * 40)

            # BP algorithm comparison
            bp_results = dataset_results['bp_comparison']
            print("BP algorithm comparison:")
            print(f"  Improved BP: accuracy {bp_results['improved_bp']['accuracy']:.4f}, time {bp_results['improved_bp']['time']:.2f}s")
            print(f"  Standard BP: accuracy {bp_results['standard_bp']['accuracy']:.4f}, time {bp_results['standard_bp']['time']:.2f}s")

            # Feature extraction comparison
            fe_results = dataset_results['feature_extraction']
            print("\nBest feature extraction result per classifier:")
            for clf_name, clf_results in fe_results.items():
                best_method = max(clf_results, key=clf_results.get)
                best_acc = clf_results[best_method]
                print(f"  {clf_name}: {best_method} ({best_acc:.4f})")

            # Ensemble comparison (base and ensemble classifiers together)
            ensemble_results = dataset_results['ensemble_comparison']
            print("\nClassifier accuracy ranking:")
            sorted_classifiers = sorted(ensemble_results.items(),
                                        key=lambda x: x[1]['accuracy'], reverse=True)
            for i, (clf_name, clf_result) in enumerate(sorted_classifiers[:5]):
                print(f"  {i + 1}. {clf_name}: {clf_result['accuracy']:.4f}")

if __name__ == "__main__":
    runner = ExperimentRunner()
    results = runner.run_all_experiments()
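    # Hypothetical standalone usage of a single experiment on smaller data:
    #   runner = ExperimentRunner()
    #   X, y = runner.generate_synthetic_data(n_samples=600, n_features=10, n_classes=2)
    #   runner.run_classifier_comparison(X, y, "Demo")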