# ml-homework/ensemble.py

import numpy as np
from typing import List, Any
from classifiers import DecisionTreeClassifier, NaiveBayesClassifier, KNNClassifier

class BaggingClassifier:
    """Bagging (bootstrap aggregating) ensemble classifier.

    Trains ``n_estimators`` copies of a base classifier, each on a bootstrap
    resample of the training set, and predicts by majority vote.

    Args:
        base_classifier: Either one of the names ``'decision_tree'``,
            ``'naive_bayes'`` or ``'knn'``, or a zero-argument callable
            (a class or factory) returning a fresh, unfitted estimator that
            exposes ``fit(X, y)`` and ``predict(X)``.
        n_estimators: Number of base estimators to train.
        random_state: Seed passed to ``np.random.seed`` at the start of
            ``fit`` so the bootstrap resamples are reproducible.
    """

    def __init__(self, base_classifier, n_estimators: int = 10, random_state: int = 42):
        self.base_classifier = base_classifier
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.estimators = []

    def _make_estimator(self):
        """Return a fresh, unfitted base estimator.

        Raises:
            ValueError: If ``base_classifier`` is neither a callable nor one
                of the supported names.
        """
        if callable(self.base_classifier):
            return self.base_classifier()
        if self.base_classifier == 'decision_tree':
            return DecisionTreeClassifier(max_depth=8)
        if self.base_classifier == 'naive_bayes':
            return NaiveBayesClassifier()
        if self.base_classifier == 'knn':
            return KNNClassifier(k=5)
        raise ValueError(
            f"Unknown base_classifier {self.base_classifier!r}; expected "
            "'decision_tree', 'naive_bayes', 'knn' or a zero-argument callable"
        )

    @staticmethod
    def _majority_vote(votes: np.ndarray) -> np.ndarray:
        """Return the most frequent label per sample.

        Args:
            votes: Array of shape ``(n_estimators, n_samples)``.

        Returns:
            Array of shape ``(n_samples,)`` with the labels' own dtype. Ties
            are resolved in favour of the smallest label (sorted order).
        """
        # Encode labels as integer codes so that counting works for any
        # sortable label type (ints, floats, strings).
        labels, codes = np.unique(votes.ravel(), return_inverse=True)
        codes = codes.reshape(votes.shape)
        winners = np.array(
            [np.bincount(column, minlength=labels.size).argmax() for column in codes.T],
            dtype=int,
        )
        return labels[winners]

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Train the ensemble on bootstrap resamples of ``(X, y)``.

        Args:
            X: Training samples, indexed by sample along the first axis.
            y: Training labels, one per sample.

        Raises:
            ValueError: If the training set is empty, ``X`` and ``y`` differ
                in length, or ``base_classifier`` is not recognised.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty training set")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y have inconsistent lengths: {n_samples} != {y.shape[0]}"
            )

        np.random.seed(self.random_state)

        # Build into a local list so a failure mid-way does not leave a
        # partially trained ensemble behind.
        fitted = []
        for _ in range(self.n_estimators):
            # Bootstrap: draw n_samples indices with replacement.
            bootstrap_indices = np.random.choice(n_samples, size=n_samples, replace=True)
            estimator = self._make_estimator()
            estimator.fit(X[bootstrap_indices], y[bootstrap_indices])
            fitted.append(estimator)

        self.estimators = fitted

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict labels for ``X`` by majority vote of the fitted estimators.

        Args:
            X: Samples to classify, indexed by sample along the first axis.

        Returns:
            Array of predicted labels, one per sample, in the dtype produced
            by the base estimators (labels are not coerced to float).

        Raises:
            RuntimeError: If no estimators have been fitted yet.
        """
        if not self.estimators:
            raise RuntimeError("BaggingClassifier has no fitted estimators; call fit() first")

        # One row per estimator; sized by the estimators actually present
        # rather than by the n_estimators setting.
        votes = np.asarray([np.ravel(estimator.predict(X)) for estimator in self.estimators])
        return self._majority_vote(votes)

class AdaBoostClassifier:
    """Discrete AdaBoost ensemble for binary classification.

    Each round draws a resample of the training set according to the current
    sample weights, trains a weak learner on it, and then increases the
    weight of the samples that learner misclassifies. Any two label values
    are supported (for example ``{-1, 1}``, ``{0, 1}`` or two strings).

    Args:
        n_estimators: Maximum number of boosting rounds.
        random_state: Seed passed to ``np.random.seed`` at the start of
            ``fit`` so the weighted resampling is reproducible.
        base_estimator: Optional zero-argument callable returning a fresh,
            unfitted weak learner with ``fit(X, y)`` and ``predict(X)``.
            Defaults to ``DecisionTreeClassifier(max_depth=1)`` (a stump).
    """

    def __init__(self, n_estimators: int = 10, random_state: int = 42,
                 base_estimator: Any = None):
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.base_estimator = base_estimator
        self.estimators = []
        self.estimator_weights = []
        # Sorted distinct training labels; set by fit().
        self.classes_ = None

    def _make_estimator(self):
        """Return a fresh, unfitted weak learner."""
        if self.base_estimator is not None:
            return self.base_estimator()
        return DecisionTreeClassifier(max_depth=1)

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Train the boosted ensemble.

        Boosting stops early as soon as a weak learner's weighted error
        reaches 0.5 (no better than chance); that learner is discarded.

        Args:
            X: Training samples, indexed by sample along the first axis.
            y: Training labels, one per sample, with at most two distinct
                values.

        Raises:
            ValueError: If the training set is empty, ``X`` and ``y`` differ
                in length, or ``y`` contains more than two classes.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty training set")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y have inconsistent lengths: {n_samples} != {y.shape[0]}"
            )
        classes = np.unique(y)
        if classes.size > 2:
            raise ValueError(
                f"AdaBoostClassifier supports binary labels only, got {classes.size} classes"
            )

        np.random.seed(self.random_state)

        # Start from uniform sample weights.
        sample_weights = np.ones(n_samples) / n_samples

        estimators = []
        estimator_weights = []

        for _ in range(self.n_estimators):
            # Resample the training set according to the current weights.
            sample_indices = np.random.choice(
                n_samples, size=n_samples, replace=True, p=sample_weights
            )

            estimator = self._make_estimator()
            estimator.fit(X[sample_indices], y[sample_indices])

            # Weighted error is measured on the full training set.
            y_pred = np.asarray(estimator.predict(X))
            error_mask = y_pred != y
            error_rate = np.average(error_mask, weights=sample_weights)

            # A learner no better than chance cannot help; stop boosting.
            if error_rate >= 0.5:
                break

            # The small constant keeps alpha finite for a perfect learner.
            alpha = 0.5 * np.log((1 - error_rate) / (error_rate + 1e-10))

            # Up-weight mistakes and down-weight hits. Using the error mask
            # (rather than the product y * y_pred) makes the update valid
            # for any label encoding, not only {-1, +1}.
            sample_weights *= np.where(error_mask, np.exp(alpha), np.exp(-alpha))
            sample_weights /= np.sum(sample_weights)

            estimators.append(estimator)
            estimator_weights.append(alpha)

        self.classes_ = classes
        self.estimators = estimators
        self.estimator_weights = estimator_weights

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict labels by the weighted vote of the boosted learners.

        Args:
            X: Samples to classify, indexed by sample along the first axis.

        Returns:
            Array of predicted labels, one per sample, drawn from the
            training labels. A tied vote resolves to the smaller label.

        Raises:
            RuntimeError: If the ensemble holds no estimators (``fit`` was
                not called, or every weak learner was rejected).
        """
        if not self.estimators:
            raise RuntimeError(
                "AdaBoostClassifier has no fitted estimators; call fit() first "
                "(or no weak learner beat chance during fit)"
            )

        X = np.asarray(X)
        classes = getattr(self, "classes_", None)
        if classes is None:
            # Estimators were supplied without fit(); assume -1/+1 labels.
            classes = np.array([-1, 1])
        negative, positive = classes[0], classes[-1]

        scores = np.zeros(X.shape[0])
        for estimator, weight in zip(self.estimators, self.estimator_weights):
            # Map each learner's labels to -1/+1 before weighting.
            signed = np.where(np.asarray(estimator.predict(X)) == positive, 1.0, -1.0)
            scores += weight * signed

        return np.where(scores > 0, positive, negative)

class VotingClassifier:
    """Hard-voting ensemble over a fixed list of estimators.

    Args:
        estimators: Estimator objects exposing ``fit(X, y)`` and
            ``predict(X)``. They are fitted in place by ``fit``.
    """

    def __init__(self, estimators: List[Any]):
        self.estimators = estimators

    @staticmethod
    def _majority_vote(votes: np.ndarray) -> np.ndarray:
        """Return the most frequent label per sample.

        Args:
            votes: Array of shape ``(n_estimators, n_samples)``.

        Returns:
            Array of shape ``(n_samples,)`` with the labels' own dtype. Ties
            are resolved in favour of the smallest label (sorted order).
        """
        # Encode labels as integer codes so that counting works for any
        # sortable label type (ints, floats, strings).
        labels, codes = np.unique(votes.ravel(), return_inverse=True)
        codes = codes.reshape(votes.shape)
        winners = np.array(
            [np.bincount(column, minlength=labels.size).argmax() for column in codes.T],
            dtype=int,
        )
        return labels[winners]

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit every estimator on the full training set ``(X, y)``."""
        for estimator in self.estimators:
            estimator.fit(X, y)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict labels for ``X`` by majority vote of all estimators.

        Args:
            X: Samples to classify, indexed by sample along the first axis.

        Returns:
            Array of predicted labels, one per sample, in the dtype produced
            by the estimators (labels are not coerced to float).

        Raises:
            ValueError: If the ensemble contains no estimators.
        """
        if not self.estimators:
            raise ValueError("VotingClassifier requires at least one estimator")

        # One row of predictions per estimator.
        votes = np.asarray([np.ravel(estimator.predict(X)) for estimator in self.estimators])
        return self._majority_vote(votes)