feat: Add initial implementation of core modules
This commit is contained in: parent c7160505c6, commit 047b03a590
classifiers.py (new file, 220 lines)
@@ -0,0 +1,220 @@
import numpy as np
from typing import Tuple
from collections import Counter


class NaiveBayesClassifier:
    """Gaussian naive Bayes classifier."""

    def __init__(self):
        self.class_priors = {}
        self.feature_likelihoods = {}
        self.classes = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the naive Bayes classifier."""
        self.classes = np.unique(y)
        n_samples, n_features = X.shape

        # Class prior probabilities
        for c in self.classes:
            self.class_priors[c] = np.sum(y == c) / n_samples

        # Per-class feature likelihoods (Gaussian mean and variance)
        self.feature_likelihoods = {}
        for c in self.classes:
            class_data = X[y == c]
            self.feature_likelihoods[c] = {
                'mean': np.mean(class_data, axis=0),
                'var': np.var(class_data, axis=0) + 1e-10  # avoid division by zero
            }

    def _gaussian_probability(self, x: float, mean: float, var: float) -> float:
        """Gaussian probability density."""
        return (1 / np.sqrt(2 * np.pi * var)) * np.exp(-0.5 * ((x - mean) ** 2) / var)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels."""
        predictions = []

        for sample in X:
            class_scores = {}

            for c in self.classes:
                # Posterior in log space to avoid underflow
                log_prob = np.log(self.class_priors[c])

                for i, feature_value in enumerate(sample):
                    mean = self.feature_likelihoods[c]['mean'][i]
                    var = self.feature_likelihoods[c]['var'][i]
                    log_prob += np.log(self._gaussian_probability(feature_value, mean, var))

                class_scores[c] = log_prob

            # Pick the class with the highest posterior
            predicted_class = max(class_scores, key=class_scores.get)
            predictions.append(predicted_class)

        return np.array(predictions)


class KNNClassifier:
    """k-nearest-neighbors classifier."""

    def __init__(self, k: int = 3):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the KNN classifier (stores the training data)."""
        self.X_train = X
        self.y_train = y

    def _euclidean_distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
        """Euclidean distance between two samples."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels."""
        predictions = []

        for sample in X:
            # Distances to every training sample
            distances = []
            for i, train_sample in enumerate(self.X_train):
                dist = self._euclidean_distance(sample, train_sample)
                distances.append((dist, self.y_train[i]))

            # Take the k nearest neighbors
            distances.sort(key=lambda x: x[0])
            k_nearest = distances[:self.k]

            # Majority vote decides the class
            votes = [label for _, label in k_nearest]
            predicted_class = Counter(votes).most_common(1)[0][0]
            predictions.append(predicted_class)

        return np.array(predictions)


class DecisionTreeNode:
    """Decision tree node."""

    def __init__(self):
        self.feature_idx = None
        self.threshold = None
        self.left = None
        self.right = None
        self.prediction = None
        self.is_leaf = False


class DecisionTreeClassifier:
    """Decision tree classifier (Gini criterion)."""

    def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def _gini_impurity(self, y: np.ndarray) -> float:
        """Gini impurity of a label vector."""
        if len(y) == 0:
            return 0

        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def _information_gain(self, y: np.ndarray, y_left: np.ndarray, y_right: np.ndarray) -> float:
        """Impurity reduction achieved by a candidate split."""
        n = len(y)
        n_left, n_right = len(y_left), len(y_right)

        if n_left == 0 or n_right == 0:
            return 0

        gini_parent = self._gini_impurity(y)
        gini_children = (n_left / n) * self._gini_impurity(y_left) + (n_right / n) * self._gini_impurity(y_right)

        return gini_parent - gini_children

    def _best_split(self, X: np.ndarray, y: np.ndarray) -> Tuple[int, float, float]:
        """Find the best split over all features and thresholds."""
        best_gain = 0
        best_feature_idx = None
        best_threshold = None

        n_features = X.shape[1]

        for feature_idx in range(n_features):
            feature_values = X[:, feature_idx]
            thresholds = np.unique(feature_values)

            for threshold in thresholds:
                left_mask = feature_values <= threshold
                right_mask = ~left_mask

                if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                    continue

                y_left, y_right = y[left_mask], y[right_mask]
                gain = self._information_gain(y, y_left, y_right)

                if gain > best_gain:
                    best_gain = gain
                    best_feature_idx = feature_idx
                    best_threshold = threshold

        return best_feature_idx, best_threshold, best_gain

    def _build_tree(self, X: np.ndarray, y: np.ndarray, depth: int = 0) -> DecisionTreeNode:
        """Recursively build the tree."""
        node = DecisionTreeNode()

        # Stopping conditions
        if (depth >= self.max_depth or
                len(np.unique(y)) == 1 or
                len(y) < self.min_samples_split):
            node.is_leaf = True
            node.prediction = Counter(y).most_common(1)[0][0]
            return node

        # Find the best split
        feature_idx, threshold, gain = self._best_split(X, y)

        if gain == 0:
            node.is_leaf = True
            node.prediction = Counter(y).most_common(1)[0][0]
            return node

        # Partition the data
        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask

        node.feature_idx = feature_idx
        node.threshold = threshold
        node.left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        node.right = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return node

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the decision tree."""
        self.root = self._build_tree(X, y)

    def _predict_sample(self, sample: np.ndarray, node: DecisionTreeNode):
        """Predict a single sample by walking the tree."""
        if node.is_leaf:
            return node.prediction

        if sample[node.feature_idx] <= node.threshold:
            return self._predict_sample(sample, node.left)
        else:
            return self._predict_sample(sample, node.right)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels."""
        predictions = []
        for sample in X:
            prediction = self._predict_sample(sample, self.root)
            predictions.append(prediction)

        return np.array(predictions)
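A minimal smoke test for these classifiers, not part of the commit; the two Gaussian blobs below are made up for illustration:

import numpy as np
from classifiers import NaiveBayesClassifier, KNNClassifier, DecisionTreeClassifier

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(3, 1, (50, 2))])  # two separated blobs
y = np.array([0] * 50 + [1] * 50)

for clf in (NaiveBayesClassifier(), KNNClassifier(k=3), DecisionTreeClassifier(max_depth=4)):
    clf.fit(X, y)
    print(type(clf).__name__, (clf.predict(X) == y).mean())  # training accuracy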
ensemble.py (new file, 142 lines)
@@ -0,0 +1,142 @@
import numpy as np
from typing import List, Any
from classifiers import DecisionTreeClassifier, NaiveBayesClassifier, KNNClassifier


class BaggingClassifier:
    """Bagging ensemble classifier."""

    def __init__(self, base_classifier: str, n_estimators: int = 10, random_state: int = 42):
        self.base_classifier = base_classifier
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.estimators = []

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the bagging ensemble."""
        np.random.seed(self.random_state)
        n_samples = X.shape[0]

        self.estimators = []

        for i in range(self.n_estimators):
            # Bootstrap sampling
            bootstrap_indices = np.random.choice(n_samples, size=n_samples, replace=True)
            X_bootstrap = X[bootstrap_indices]
            y_bootstrap = y[bootstrap_indices]

            # Train a base classifier
            if self.base_classifier == 'decision_tree':
                estimator = DecisionTreeClassifier(max_depth=8)
            elif self.base_classifier == 'naive_bayes':
                estimator = NaiveBayesClassifier()
            elif self.base_classifier == 'knn':
                estimator = KNNClassifier(k=5)
            else:
                raise ValueError(f"Unknown base classifier: {self.base_classifier}")

            estimator.fit(X_bootstrap, y_bootstrap)
            self.estimators.append(estimator)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels."""
        # Collect the predictions of every base classifier
        predictions = np.zeros((X.shape[0], self.n_estimators))

        for i, estimator in enumerate(self.estimators):
            predictions[:, i] = estimator.predict(X)

        # Majority vote decides the final prediction
        final_predictions = []
        for i in range(X.shape[0]):
            votes = predictions[i, :]
            prediction = max(set(votes), key=list(votes).count)
            final_predictions.append(prediction)

        return np.array(final_predictions)


class AdaBoostClassifier:
    """AdaBoost ensemble classifier (assumes binary labels in {-1, +1})."""

    def __init__(self, n_estimators: int = 10, random_state: int = 42):
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.estimators = []
        self.estimator_weights = []

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the AdaBoost ensemble."""
        np.random.seed(self.random_state)
        n_samples = X.shape[0]

        # Initialize uniform sample weights
        sample_weights = np.ones(n_samples) / n_samples

        self.estimators = []
        self.estimator_weights = []

        for i in range(self.n_estimators):
            # Resample according to the current sample weights
            sample_indices = np.random.choice(
                n_samples, size=n_samples, replace=True, p=sample_weights
            )
            X_weighted = X[sample_indices]
            y_weighted = y[sample_indices]

            # Train a weak learner (decision stump)
            estimator = DecisionTreeClassifier(max_depth=1)
            estimator.fit(X_weighted, y_weighted)

            # Weighted error rate
            y_pred = estimator.predict(X)
            error_mask = y_pred != y
            error_rate = np.average(error_mask, weights=sample_weights)

            # Stop if the weak learner is no better than chance
            if error_rate >= 0.5:
                break

            # Estimator weight
            alpha = 0.5 * np.log((1 - error_rate) / (error_rate + 1e-10))

            # Update sample weights (relies on y and y_pred being in {-1, +1})
            sample_weights *= np.exp(-alpha * y * y_pred)
            sample_weights /= np.sum(sample_weights)

            self.estimators.append(estimator)
            self.estimator_weights.append(alpha)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels in {-1, +1}."""
        n_samples = X.shape[0]
        predictions = np.zeros(n_samples)

        for estimator, weight in zip(self.estimators, self.estimator_weights):
            y_pred = estimator.predict(X)
            predictions += weight * y_pred

        return np.sign(predictions)


class VotingClassifier:
    """Voting ensemble classifier."""

    def __init__(self, estimators: List[Any]):
        self.estimators = estimators

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit every member classifier."""
        for estimator in self.estimators:
            estimator.fit(X, y)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels."""
        predictions = np.zeros((X.shape[0], len(self.estimators)))

        for i, estimator in enumerate(self.estimators):
            predictions[:, i] = estimator.predict(X)

        # Majority vote decides the final prediction
        final_predictions = []
        for i in range(X.shape[0]):
            votes = predictions[i, :]
            prediction = max(set(votes), key=list(votes).count)
            final_predictions.append(prediction)

        return np.array(final_predictions)
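Since AdaBoostClassifier's weight update and sign-based prediction assume labels in {-1, +1}, a caller holding 0/1 labels needs to remap them first. A sketch of that assumption (the data here is invented):

import numpy as np
from ensemble import AdaBoostClassifier

X = np.random.RandomState(0).randn(100, 3)
y01 = (X[:, 0] + X[:, 1] > 0).astype(int)  # 0/1 labels
y = 2 * y01 - 1                            # remap {0, 1} -> {-1, +1}

ada = AdaBoostClassifier(n_estimators=5)
ada.fit(X, y)
print((ada.predict(X) == y).mean())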
experiments.py (new file, 319 lines)
@@ -0,0 +1,319 @@
import numpy as np
import matplotlib.pyplot as plt
import time
from typing import Dict, Tuple

from utils import train_test_split, normalize_data, accuracy_score, cross_validation
from feature_extraction import PCA, FeatureSelector
from improved_bp import ImprovedBPNetwork, StandardBPNetwork
from classifiers import NaiveBayesClassifier, KNNClassifier, DecisionTreeClassifier
from ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier


class ExperimentRunner:
    """Experiment runner."""

    def __init__(self):
        self.results = {}

    def generate_synthetic_data(self, n_samples: int = 1000, n_features: int = 20, n_classes: int = 3,
                                random_state: int = 42) -> Tuple[np.ndarray, np.ndarray]:
        """Generate a synthetic dataset."""
        np.random.seed(random_state)

        # A distinct mean vector per class
        class_means = np.random.randn(n_classes, n_features) * 2
        X = []
        y = []

        samples_per_class = n_samples // n_classes

        for class_idx in range(n_classes):
            # Samples for this class
            class_data = np.random.randn(samples_per_class, n_features) + class_means[class_idx]
            X.append(class_data)
            y.extend([class_idx] * samples_per_class)

        X = np.vstack(X)
        y = np.array(y)

        # Append pure-noise features
        noise_features = np.random.randn(len(X), n_features // 2)
        X = np.hstack([X, noise_features])

        return X, y

    def run_bp_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str):
        """Compare the improved and standard BP networks."""
        print(f"\n=== BP comparison - {dataset_name} ===")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        # Improved BP network
        print("Training the improved BP network...")
        start_time = time.time()
        improved_bp = ImprovedBPNetwork(hidden_layers=[10, 5], learning_rate=0.01, max_epochs=500)
        improved_bp.fit(X_train_norm, y_train)
        improved_train_time = time.time() - start_time

        y_pred_improved = improved_bp.predict(X_test_norm)
        improved_accuracy = accuracy_score(y_test, y_pred_improved)

        # Standard BP network
        print("Training the standard BP network...")
        start_time = time.time()
        standard_bp = StandardBPNetwork(hidden_layers=[10, 5], learning_rate=0.01, max_epochs=500)
        standard_bp.fit(X_train_norm, y_train)
        standard_train_time = time.time() - start_time

        y_pred_standard = standard_bp.predict(X_test_norm)
        standard_accuracy = accuracy_score(y_test, y_pred_standard)

        # Results
        print(f"Improved BP - accuracy: {improved_accuracy:.4f}, training time: {improved_train_time:.2f}s")
        print(f"Standard BP - accuracy: {standard_accuracy:.4f}, training time: {standard_train_time:.2f}s")

        # Plot the loss curves
        plt.figure(figsize=(10, 6))
        plt.plot(improved_bp.loss_history, label='Improved BP', alpha=0.8)
        plt.plot(standard_bp.loss_history, label='Standard BP', alpha=0.8)
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title(f'BP loss curves - {dataset_name}')
        plt.legend()
        plt.grid(True)
        plt.savefig(f'bp_comparison_{dataset_name.lower()}.png')  # relative path, portable across machines
        plt.show()

        return {
            'improved_bp': {'accuracy': improved_accuracy, 'time': improved_train_time},
            'standard_bp': {'accuracy': standard_accuracy, 'time': standard_train_time}
        }

    def run_feature_extraction_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str):
        """Compare feature-extraction methods."""
        print(f"\n=== Feature extraction comparison - {dataset_name} ===")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        classifiers = {
            'NaiveBayes': NaiveBayesClassifier(),
            'KNN': KNNClassifier(k=5),
            'DecisionTree': DecisionTreeClassifier(max_depth=8)
        }

        results = {}

        for clf_name, clf in classifiers.items():
            print(f"\n{clf_name} classifier:")

            # No feature extraction (fresh instance; type(clf)() uses default hyperparameters)
            clf_no_fe = type(clf)() if clf_name != 'KNN' else KNNClassifier(k=5)
            clf_no_fe.fit(X_train_norm, y_train)
            y_pred_no_fe = clf_no_fe.predict(X_test_norm)
            acc_no_fe = accuracy_score(y_test, y_pred_no_fe)

            # PCA
            pca = PCA(n_components=min(10, X.shape[1] // 2))
            X_train_pca = pca.fit_transform(X_train_norm)
            X_test_pca = pca.transform(X_test_norm)

            clf_pca = type(clf)() if clf_name != 'KNN' else KNNClassifier(k=5)
            clf_pca.fit(X_train_pca, y_train)
            y_pred_pca = clf_pca.predict(X_test_pca)
            acc_pca = accuracy_score(y_test, y_pred_pca)

            # Feature selection
            feature_selector = FeatureSelector(k=min(10, X.shape[1] // 2))
            X_train_fs = feature_selector.fit_transform(X_train_norm, y_train)
            X_test_fs = feature_selector.transform(X_test_norm)

            clf_fs = type(clf)() if clf_name != 'KNN' else KNNClassifier(k=5)
            clf_fs.fit(X_train_fs, y_train)
            y_pred_fs = clf_fs.predict(X_test_fs)
            acc_fs = accuracy_score(y_test, y_pred_fs)

            print(f"  No feature extraction: {acc_no_fe:.4f}")
            print(f"  PCA: {acc_pca:.4f}")
            print(f"  Feature selection: {acc_fs:.4f}")

            results[clf_name] = {
                'no_feature_extraction': acc_no_fe,
                'pca': acc_pca,
                'feature_selection': acc_fs
            }

        return results

    def run_classifier_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str):
        """Compare the individual classifiers."""
        print(f"\n=== Classifier comparison - {dataset_name} ===")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        classifiers = {
            'NaiveBayes': NaiveBayesClassifier(),
            'KNN': KNNClassifier(k=5),
            'DecisionTree': DecisionTreeClassifier(max_depth=8)
        }

        results = {}

        for clf_name, clf in classifiers.items():
            print(f"\n{clf_name} classifier:")

            # Train and test
            start_time = time.time()
            clf.fit(X_train_norm, y_train)
            train_time = time.time() - start_time

            y_pred = clf.predict(X_test_norm)
            accuracy = accuracy_score(y_test, y_pred)

            # Cross-validation
            cv_scores = cross_validation(type(clf)() if clf_name != 'KNN' else KNNClassifier(k=5),
                                         X_train_norm, y_train, k=5)

            print(f"  Accuracy: {accuracy:.4f}")
            print(f"  Training time: {train_time:.4f}s")
            print(f"  CV mean: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

            results[clf_name] = {
                'accuracy': accuracy,
                'train_time': train_time,
                'cv_mean': np.mean(cv_scores),
                'cv_std': np.std(cv_scores)
            }

        return results

    def run_ensemble_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str):
        """Compare the ensemble methods."""
        print(f"\n=== Ensemble comparison - {dataset_name} ===")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        # Base classifiers
        base_classifiers = {
            'DecisionTree': DecisionTreeClassifier(max_depth=8),
            'NaiveBayes': NaiveBayesClassifier(),
            'KNN': KNNClassifier(k=5)
        }

        # Ensemble classifiers
        ensemble_classifiers = {
            'Bagging_DT': BaggingClassifier('decision_tree', n_estimators=10),
            'Voting': VotingClassifier([
                DecisionTreeClassifier(max_depth=8),
                NaiveBayesClassifier(),
                KNNClassifier(k=5)
            ])
        }

        results = {}

        # Evaluate the base classifiers
        print("Base classifiers:")
        for clf_name, clf in base_classifiers.items():
            start_time = time.time()
            clf.fit(X_train_norm, y_train)
            train_time = time.time() - start_time

            y_pred = clf.predict(X_test_norm)
            accuracy = accuracy_score(y_test, y_pred)

            print(f"  {clf_name}: {accuracy:.4f} (training time: {train_time:.4f}s)")
            results[clf_name] = {'accuracy': accuracy, 'train_time': train_time}

        # Evaluate the ensemble classifiers
        print("\nEnsemble classifiers:")
        for clf_name, clf in ensemble_classifiers.items():
            start_time = time.time()
            clf.fit(X_train_norm, y_train)
            train_time = time.time() - start_time

            y_pred = clf.predict(X_test_norm)
            accuracy = accuracy_score(y_test, y_pred)

            print(f"  {clf_name}: {accuracy:.4f} (training time: {train_time:.4f}s)")
            results[clf_name] = {'accuracy': accuracy, 'train_time': train_time}

        return results

    def run_all_experiments(self):
        """Run every experiment."""
        print("Starting the machine-learning comparison experiments...")

        # Generate two different datasets
        print("Generating datasets...")
        X1, y1 = self.generate_synthetic_data(n_samples=800, n_features=20, n_classes=3, random_state=42)
        X2, y2 = self.generate_synthetic_data(n_samples=1000, n_features=25, n_classes=4, random_state=123)

        datasets = [
            (X1, y1, "Dataset1"),
            (X2, y2, "Dataset2")
        ]

        all_results = {}

        for X, y, dataset_name in datasets:
            print(f"\n{'='*50}")
            print(f"Processing dataset: {dataset_name}")
            print(f"Samples: {X.shape[0]}, features: {X.shape[1]}, classes: {len(np.unique(y))}")

            # Run every experiment
            bp_results = self.run_bp_comparison(X, y, dataset_name)
            fe_results = self.run_feature_extraction_comparison(X, y, dataset_name)
            clf_results = self.run_classifier_comparison(X, y, dataset_name)
            ensemble_results = self.run_ensemble_comparison(X, y, dataset_name)

            all_results[dataset_name] = {
                'bp_comparison': bp_results,
                'feature_extraction': fe_results,
                'classifier_comparison': clf_results,
                'ensemble_comparison': ensemble_results
            }

        # Summary report
        self.generate_summary_report(all_results)

        return all_results

    def generate_summary_report(self, results: Dict):
        """Generate the summary report."""
        print(f"\n{'='*60}")
        print("Experiment summary report")
        print(f"{'='*60}")

        for dataset_name, dataset_results in results.items():
            print(f"\n{dataset_name} summary:")
            print("-" * 40)

            # BP comparison
            bp_results = dataset_results['bp_comparison']
            print("BP comparison:")
            print(f"  Improved BP: accuracy {bp_results['improved_bp']['accuracy']:.4f}, time {bp_results['improved_bp']['time']:.2f}s")
            print(f"  Standard BP: accuracy {bp_results['standard_bp']['accuracy']:.4f}, time {bp_results['standard_bp']['time']:.2f}s")

            # Feature extraction
            fe_results = dataset_results['feature_extraction']
            print("\nBest feature-extraction result per classifier:")
            for clf_name, clf_results in fe_results.items():
                best_method = max(clf_results, key=clf_results.get)
                best_acc = clf_results[best_method]
                print(f"  {clf_name}: {best_method} ({best_acc:.4f})")

            # Ensemble comparison
            ensemble_results = dataset_results['ensemble_comparison']
            print("\nClassifier ranking:")
            sorted_classifiers = sorted(ensemble_results.items(),
                                        key=lambda x: x[1]['accuracy'], reverse=True)
            for i, (clf_name, clf_result) in enumerate(sorted_classifiers[:5]):
                print(f"  {i+1}. {clf_name}: {clf_result['accuracy']:.4f}")


if __name__ == "__main__":
    runner = ExperimentRunner()
    results = runner.run_all_experiments()
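To run a single experiment rather than the full suite, the runner's methods can also be called directly; a sketch (parameters chosen arbitrarily):

from experiments import ExperimentRunner

runner = ExperimentRunner()
X, y = runner.generate_synthetic_data(n_samples=300, n_features=10, n_classes=2, random_state=7)
results = runner.run_classifier_comparison(X, y, "Demo")
print(results)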
feature_extraction.py (new file, 96 lines)
@@ -0,0 +1,96 @@
import numpy as np


class PCA:
    """Principal component analysis."""

    def __init__(self, n_components: int):
        self.n_components = n_components
        self.components_ = None
        self.mean_ = None
        self.explained_variance_ratio_ = None

    def fit(self, X: np.ndarray) -> 'PCA':
        """Fit the PCA model."""
        self.mean_ = np.mean(X, axis=0)
        X_centered = X - self.mean_

        # Covariance matrix
        cov_matrix = np.cov(X_centered, rowvar=False)

        # Eigendecomposition (eigh, since the covariance matrix is symmetric)
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # Sort by eigenvalue, descending
        idx = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]

        # Keep the first n_components principal components
        self.components_ = eigenvectors[:, :self.n_components].T
        self.explained_variance_ratio_ = eigenvalues[:self.n_components] / np.sum(eigenvalues)

        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Project the data onto the principal components."""
        X_centered = X - self.mean_
        return np.dot(X_centered, self.components_.T)

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """Fit and transform."""
        return self.fit(X).transform(X)


class FeatureSelector:
    """Information-gain-based feature selection."""

    def __init__(self, k: int):
        self.k = k
        self.selected_features_ = None

    def _entropy(self, y: np.ndarray) -> float:
        """Entropy of a label vector."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities + 1e-10))

    def _information_gain(self, X_feature: np.ndarray, y: np.ndarray) -> float:
        """Information gain of a single feature."""
        # Discretize continuous features
        if len(np.unique(X_feature)) > 10:
            bins = np.linspace(np.min(X_feature), np.max(X_feature), 11)
            X_feature = np.digitize(X_feature, bins)

        total_entropy = self._entropy(y)

        values, counts = np.unique(X_feature, return_counts=True)
        weighted_entropy = 0

        for value, count in zip(values, counts):
            subset_y = y[X_feature == value]
            weighted_entropy += (count / len(y)) * self._entropy(subset_y)

        return total_entropy - weighted_entropy

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'FeatureSelector':
        """Fit the feature selector."""
        n_features = X.shape[1]
        feature_scores = []

        for i in range(n_features):
            score = self._information_gain(X[:, i], y)
            feature_scores.append((i, score))

        # Rank features by information gain
        feature_scores.sort(key=lambda x: x[1], reverse=True)
        self.selected_features_ = [idx for idx, _ in feature_scores[:self.k]]

        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Keep only the selected features."""
        return X[:, self.selected_features_]

    def fit_transform(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Fit and transform."""
        return self.fit(X, y).transform(X)
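A quick sanity check of the PCA class via its explained_variance_ratio_ attribute, on made-up correlated data (the extra columns are exact linear combinations, so two components should carry essentially all the variance):

import numpy as np
from feature_extraction import PCA

rng = np.random.RandomState(1)
base = rng.randn(200, 2)
X = np.hstack([base, base @ rng.randn(2, 3)])  # 3 extra columns, linear in the first 2

pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)
print(X_reduced.shape)                      # (200, 2)
print(pca.explained_variance_ratio_.sum())  # close to 1.0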
improved_bp.py (new file, 248 lines)
@@ -0,0 +1,248 @@
import numpy as np
from typing import List, Tuple


class ImprovedBPNetwork:
    """Improved BP neural network with adaptive learning-rate adjustment."""

    def __init__(self, hidden_layers: List[int], learning_rate: float = 0.01,
                 max_epochs: int = 1000, tolerance: float = 1e-6):
        self.hidden_layers = hidden_layers
        self.initial_lr = learning_rate
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.tolerance = tolerance
        self.weights = []
        self.biases = []
        self.loss_history = []

    def _sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Sigmoid activation."""
        x = np.clip(x, -500, 500)  # guard against overflow
        return 1 / (1 + np.exp(-x))

    def _sigmoid_derivative(self, x: np.ndarray) -> np.ndarray:
        """Derivative of the sigmoid."""
        s = self._sigmoid(x)
        return s * (1 - s)

    def _initialize_weights(self, input_size: int, output_size: int):
        """Initialize weights and biases."""
        self.weights = []
        self.biases = []

        # Layer sizes
        layers = [input_size] + self.hidden_layers + [output_size]

        # Xavier initialization
        for i in range(len(layers) - 1):
            w = np.random.normal(0, np.sqrt(2.0 / (layers[i] + layers[i+1])),
                                 (layers[i], layers[i+1]))
            b = np.zeros((1, layers[i+1]))
            self.weights.append(w)
            self.biases.append(b)

    def _forward_pass(self, X: np.ndarray) -> List[np.ndarray]:
        """Forward pass."""
        activations = [X]

        for i in range(len(self.weights)):
            z = np.dot(activations[-1], self.weights[i]) + self.biases[i]
            a = self._sigmoid(z)
            activations.append(a)

        return activations

    def _backward_pass(self, X: np.ndarray, y: np.ndarray, activations: List[np.ndarray]) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Backward pass."""
        m = X.shape[0]
        dw = [np.zeros_like(w) for w in self.weights]
        db = [np.zeros_like(b) for b in self.biases]

        # Output-layer error
        delta = activations[-1] - y

        # Propagate the error from the output layer back to the input layer
        for i in range(len(self.weights) - 1, -1, -1):
            dw[i] = np.dot(activations[i].T, delta) / m
            db[i] = np.mean(delta, axis=0, keepdims=True)

            if i > 0:
                # sigmoid'(z_i) expressed via the stored activation: a_i * (1 - a_i)
                delta = np.dot(delta, self.weights[i].T) * activations[i] * (1 - activations[i])

        return dw, db

    def _adaptive_learning_rate(self, epoch: int, current_loss: float, prev_loss: float):
        """Adapt the learning rate based on the loss trend."""
        if epoch > 0:
            if current_loss > prev_loss:
                # Loss went up: shrink the learning rate
                self.learning_rate *= 0.9
            elif (prev_loss - current_loss) / prev_loss < 0.001:
                # Loss is falling slowly: grow the learning rate
                self.learning_rate *= 1.05

            # Keep the learning rate inside a fixed range
            self.learning_rate = np.clip(self.learning_rate,
                                         self.initial_lr * 0.01,
                                         self.initial_lr * 10)

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Train the network."""
        # One-hot encode 1-D labels
        if len(y.shape) == 1:
            y_encoded = np.zeros((len(y), len(np.unique(y))))
            for i, label in enumerate(np.unique(y)):
                y_encoded[y == label, i] = 1
            y = y_encoded

        self._initialize_weights(X.shape[1], y.shape[1])

        prev_loss = float('inf')

        for epoch in range(self.max_epochs):
            # Forward pass
            activations = self._forward_pass(X)

            # Loss
            loss = np.mean((activations[-1] - y) ** 2)
            self.loss_history.append(loss)

            # Adapt the learning rate
            self._adaptive_learning_rate(epoch, loss, prev_loss)

            # Backward pass
            dw, db = self._backward_pass(X, y, activations)

            # Update weights and biases
            for i in range(len(self.weights)):
                self.weights[i] -= self.learning_rate * dw[i]
                self.biases[i] -= self.learning_rate * db[i]

            # Convergence check
            if abs(prev_loss - loss) < self.tolerance:
                print(f"Training converged at epoch {epoch+1}")
                break

            prev_loss = loss

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.6f}, LR: {self.learning_rate:.6f}")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class indices."""
        activations = self._forward_pass(X)
        return np.argmax(activations[-1], axis=1)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict class scores."""
        activations = self._forward_pass(X)
        return activations[-1]


class StandardBPNetwork:
    """Standard BP neural network (fixed learning rate)."""

    def __init__(self, hidden_layers: List[int], learning_rate: float = 0.01,
                 max_epochs: int = 1000, tolerance: float = 1e-6):
        self.hidden_layers = hidden_layers
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.tolerance = tolerance
        self.weights = []
        self.biases = []
        self.loss_history = []

    def _sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Sigmoid activation."""
        x = np.clip(x, -500, 500)
        return 1 / (1 + np.exp(-x))

    def _sigmoid_derivative(self, x: np.ndarray) -> np.ndarray:
        """Derivative of the sigmoid."""
        s = self._sigmoid(x)
        return s * (1 - s)

    def _initialize_weights(self, input_size: int, output_size: int):
        """Initialize weights and biases."""
        self.weights = []
        self.biases = []

        layers = [input_size] + self.hidden_layers + [output_size]

        for i in range(len(layers) - 1):
            w = np.random.normal(0, np.sqrt(2.0 / (layers[i] + layers[i+1])),
                                 (layers[i], layers[i+1]))
            b = np.zeros((1, layers[i+1]))
            self.weights.append(w)
            self.biases.append(b)

    def _forward_pass(self, X: np.ndarray) -> List[np.ndarray]:
        """Forward pass."""
        activations = [X]

        for i in range(len(self.weights)):
            z = np.dot(activations[-1], self.weights[i]) + self.biases[i]
            a = self._sigmoid(z)
            activations.append(a)

        return activations

    def _backward_pass(self, X: np.ndarray, y: np.ndarray, activations: List[np.ndarray]) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Backward pass."""
        m = X.shape[0]
        dw = [np.zeros_like(w) for w in self.weights]
        db = [np.zeros_like(b) for b in self.biases]

        delta = activations[-1] - y

        for i in range(len(self.weights) - 1, -1, -1):
            dw[i] = np.dot(activations[i].T, delta) / m
            db[i] = np.mean(delta, axis=0, keepdims=True)

            if i > 0:
                # sigmoid'(z_i) expressed via the stored activation: a_i * (1 - a_i)
                delta = np.dot(delta, self.weights[i].T) * activations[i] * (1 - activations[i])

        return dw, db

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Train the network."""
        if len(y.shape) == 1:
            y_encoded = np.zeros((len(y), len(np.unique(y))))
            for i, label in enumerate(np.unique(y)):
                y_encoded[y == label, i] = 1
            y = y_encoded

        self._initialize_weights(X.shape[1], y.shape[1])

        prev_loss = float('inf')

        for epoch in range(self.max_epochs):
            activations = self._forward_pass(X)
            loss = np.mean((activations[-1] - y) ** 2)
            self.loss_history.append(loss)

            dw, db = self._backward_pass(X, y, activations)

            for i in range(len(self.weights)):
                self.weights[i] -= self.learning_rate * dw[i]
                self.biases[i] -= self.learning_rate * db[i]

            if abs(prev_loss - loss) < self.tolerance:
                print(f"Standard BP training converged at epoch {epoch+1}")
                break

            prev_loss = loss

            if epoch % 100 == 0:
                print(f"Standard BP Epoch {epoch}, Loss: {loss:.6f}")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class indices."""
        activations = self._forward_pass(X)
        return np.argmax(activations[-1], axis=1)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict class scores."""
        activations = self._forward_pass(X)
        return activations[-1]
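A minimal sketch for training both networks on the same data and comparing their final losses (invented blobs, small epoch budget; not part of the commit):

import numpy as np
from improved_bp import ImprovedBPNetwork, StandardBPNetwork

rng = np.random.RandomState(2)
X = np.vstack([rng.randn(60, 4), rng.randn(60, 4) + 2.0])
y = np.array([0] * 60 + [1] * 60)

for net in (ImprovedBPNetwork([8], learning_rate=0.1, max_epochs=200),
            StandardBPNetwork([8], learning_rate=0.1, max_epochs=200)):
    net.fit(X, y)
    print(type(net).__name__, net.loss_history[-1])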
main.ipynb (186 lines)
File diff suppressed because one or more lines are too long
main.py (new file, 29 lines)
@@ -0,0 +1,29 @@
import matplotlib.pyplot as plt
from experiments import ExperimentRunner


def main():
    """Entry point."""
    print("Machine-learning algorithm implementation and comparison")
    print("="*50)
    print("This system covers:")
    print("1. Improved BP neural network vs. standard BP network")
    print("2. Effect of feature extraction (PCA, feature selection) on classification")
    print("3. Comparison of classifiers (naive Bayes, KNN, decision tree)")
    print("4. Ensemble learning (Bagging, Voting)")
    print("5. Every algorithm implemented from scratch, without any ML library")
    print("="*50)

    # Matplotlib setup (SimHei supplies CJK glyphs for localized plot labels)
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Run the experiments
    runner = ExperimentRunner()
    results = runner.run_all_experiments()

    print("\nDone! Results are collected in the experiment report.")
    print("Plots have been saved to local files.")


if __name__ == "__main__":
    main()
utils.py (new file, 72 lines)
@@ -0,0 +1,72 @@
import numpy as np
import pandas as pd
from typing import Tuple, List


def load_data(filepath: str) -> Tuple[np.ndarray, np.ndarray]:
    """Load a CSV dataset (features in all but the last column, labels in the last)."""
    data = pd.read_csv(filepath)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values
    return X, y


def train_test_split(X: np.ndarray, y: np.ndarray, test_size: float = 0.3, random_state: int = 42) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Split a dataset into train and test sets."""
    np.random.seed(random_state)
    n_samples = X.shape[0]
    n_test = int(n_samples * test_size)

    indices = np.random.permutation(n_samples)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]


def normalize_data(X_train: np.ndarray, X_test: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Standardize features using training-set statistics."""
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    std[std == 0] = 1  # avoid division by zero

    X_train_norm = (X_train - mean) / std
    X_test_norm = (X_test - mean) / std

    return X_train_norm, X_test_norm


def accuracy_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Classification accuracy."""
    return np.mean(y_true == y_pred)


def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    """Confusion matrix."""
    classes = np.unique(np.concatenate([y_true, y_pred]))
    n_classes = len(classes)
    matrix = np.zeros((n_classes, n_classes), dtype=int)

    for i, true_class in enumerate(classes):
        for j, pred_class in enumerate(classes):
            matrix[i, j] = np.sum((y_true == true_class) & (y_pred == pred_class))

    return matrix


def cross_validation(classifier, X: np.ndarray, y: np.ndarray, k: int = 5) -> List[float]:
    """K-fold cross-validation."""
    n_samples = X.shape[0]
    fold_size = n_samples // k
    scores = []

    for i in range(k):
        start_idx = i * fold_size
        end_idx = start_idx + fold_size if i < k - 1 else n_samples

        test_indices = np.arange(start_idx, end_idx)
        train_indices = np.concatenate([np.arange(0, start_idx), np.arange(end_idx, n_samples)])

        X_train_fold, X_test_fold = X[train_indices], X[test_indices]
        y_train_fold, y_test_fold = y[train_indices], y[test_indices]

        classifier.fit(X_train_fold, y_train_fold)
        y_pred = classifier.predict(X_test_fold)
        scores.append(accuracy_score(y_test_fold, y_pred))

    return scores
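A short example wiring the utilities together (invented data; load_data is skipped since no CSV ships with this commit):

import numpy as np
from utils import train_test_split, normalize_data, accuracy_score, cross_validation
from classifiers import KNNClassifier

X = np.random.RandomState(3).randn(150, 5)
y = (X[:, 0] > 0).astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)
X_tr, X_te = normalize_data(X_tr, X_te)

clf = KNNClassifier(k=5)
clf.fit(X_tr, y_tr)
print("holdout:", accuracy_score(y_te, clf.predict(X_te)))
print("5-fold:", np.mean(cross_validation(KNNClassifier(k=5), X_tr, y_tr, k=5)))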