ml-homework/ensemble.py

143 lines
5.0 KiB
Python

import numpy as np
from typing import List, Any
from classifiers import DecisionTreeClassifier, NaiveBayesClassifier, KNNClassifier
class BaggingClassifier:
    """Bagging (bootstrap aggregating) ensemble classifier.

    Trains ``n_estimators`` base classifiers, each on a bootstrap resample
    (sampling with replacement, same size as the training set), and predicts
    by unweighted majority vote.

    Args:
        base_classifier: Which base learner to build for every round. Either
            one of the names ``'decision_tree'``
            (``DecisionTreeClassifier(max_depth=8)``), ``'naive_bayes'``
            (``NaiveBayesClassifier()``) or ``'knn'`` (``KNNClassifier(k=5)``),
            or a zero-argument callable returning a new estimator that
            exposes ``fit(X, y)`` and ``predict(X)``.
        n_estimators: Number of base classifiers to train.
        random_state: Seed handed to ``np.random.seed`` at the start of
            ``fit``.
    """

    def __init__(self, base_classifier, n_estimators: int = 10, random_state: int = 42):
        self.base_classifier = base_classifier
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.estimators = []  # fitted base classifiers; filled by fit()

    def _resolve_factory(self):
        """Return a zero-argument callable that builds one base estimator.

        Raises:
            ValueError: If ``base_classifier`` is neither a known name nor a
                callable.
        """
        spec = self.base_classifier
        if isinstance(spec, str):
            # Lambdas keep the lookup of the classifier classes lazy, so a
            # name is only resolved when that kind of estimator is built.
            builders = {
                'decision_tree': lambda: DecisionTreeClassifier(max_depth=8),
                'naive_bayes': lambda: NaiveBayesClassifier(),
                'knn': lambda: KNNClassifier(k=5),
            }
            if spec in builders:
                return builders[spec]
        elif callable(spec):
            return spec
        raise ValueError(
            f"Unknown base_classifier {spec!r}; expected 'decision_tree', "
            "'naive_bayes', 'knn' or a zero-argument callable"
        )

    @staticmethod
    def _majority_vote(votes: np.ndarray) -> np.ndarray:
        """Return the most frequent label of every row of ``votes``.

        ``votes`` has one row per sample and one column per estimator.
        ``np.unique`` returns labels sorted, and ``argmax`` picks the first
        maximum, so a tie is resolved in favour of the smallest label.
        """
        winners = []
        for row in votes:
            labels, counts = np.unique(row, return_counts=True)
            winners.append(labels[np.argmax(counts)])
        return np.array(winners)

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Train ``n_estimators`` base classifiers on bootstrap resamples.

        Args:
            X: Training samples, indexed by row.
            y: Training labels, one per row of ``X``.

        Raises:
            ValueError: If ``base_classifier`` is not supported.
        """
        # Fail before doing any work (the original hit an unbound local on
        # the first iteration when the name was not recognised).
        make_estimator = self._resolve_factory()

        # NOTE: this seeds NumPy's *global* generator, not a private one.
        np.random.seed(self.random_state)
        n_samples = X.shape[0]

        fitted = []
        for _ in range(self.n_estimators):
            # Bootstrap sample: draw n_samples row indices with replacement.
            bootstrap_indices = np.random.choice(n_samples, size=n_samples, replace=True)
            estimator = make_estimator()
            estimator.fit(X[bootstrap_indices], y[bootstrap_indices])
            fitted.append(estimator)
        # Publish only a complete ensemble.
        self.estimators = fitted

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict labels for ``X`` by majority vote of the fitted estimators.

        Labels keep the dtype produced by the base estimators (they are no
        longer coerced to float), so non-numeric labels work too. Ties go to
        the smallest label.

        Raises:
            RuntimeError: If no estimator has been fitted yet.
        """
        if not self.estimators:
            raise RuntimeError("BaggingClassifier is not fitted; call fit() first")
        # One column of votes per estimator.
        votes = np.column_stack(
            [np.asarray(estimator.predict(X)) for estimator in self.estimators]
        )
        return self._majority_vote(votes)
class AdaBoostClassifier:
    """AdaBoost ensemble built from depth-1 decision trees (stumps).

    Each round trains a stump on a resample drawn according to the current
    sample weights, scores it on the full training set, and then raises the
    weight of the samples it got wrong.

    Args:
        n_estimators: Maximum number of boosting rounds.
        random_state: Seed handed to ``np.random.seed`` at the start of
            ``fit``.
    """

    def __init__(self, n_estimators: int = 10, random_state: int = 42):
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.estimators = []         # weak learners kept by fit()
        self.estimator_weights = []  # alpha of each kept learner
        self.classes_ = None         # sorted distinct training labels, set by fit()

    def _make_weak_learner(self):
        """Build one unfitted weak learner (a decision stump)."""
        return DecisionTreeClassifier(max_depth=1)

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Run up to ``n_estimators`` boosting rounds on ``(X, y)``.

        Boosting stops early as soon as a weak learner's weighted training
        error reaches 0.5; that learner is discarded.

        Args:
            X: Training samples, indexed by row.
            y: Training labels, one per row of ``X``.
        """
        # NOTE: this seeds NumPy's *global* generator, not a private one.
        np.random.seed(self.random_state)
        y = np.asarray(y)
        n_samples = X.shape[0]

        # Start from a uniform distribution over the samples.
        sample_weights = np.ones(n_samples) / n_samples
        self.estimators = []
        self.estimator_weights = []
        self.classes_ = np.unique(y)

        for _ in range(self.n_estimators):
            # Resample according to the current weights.
            sample_indices = np.random.choice(
                n_samples, size=n_samples, replace=True, p=sample_weights
            )
            estimator = self._make_weak_learner()
            estimator.fit(X[sample_indices], y[sample_indices])

            # Weighted error on the full training set.
            y_pred = estimator.predict(X)
            error_mask = y_pred != y
            error_rate = np.average(error_mask, weights=sample_weights)
            if error_rate >= 0.5:
                break

            # The 1e-10 keeps alpha finite when the learner is perfect.
            alpha = 0.5 * np.log((1 - error_rate) / (error_rate + 1e-10))

            # Up-weight mistakes, down-weight hits. Driving this from the
            # mask instead of ``y * y_pred`` gives the same factors for
            # labels in {-1, +1} and stays correct for any other encoding
            # (with 0/1 labels the product is 0 for every mistake).
            sample_weights *= np.exp(np.where(error_mask, alpha, -alpha))
            sample_weights /= np.sum(sample_weights)

            self.estimators.append(estimator)
            self.estimator_weights.append(alpha)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict labels for ``X`` from the alpha-weighted learner votes.

        When the training labels were a subset of {-1, +1} (or ``classes_``
        is unset because the learners were assigned by hand), the result is
        ``np.sign`` of the weighted sum of predictions, as before: a float
        array that is 0 on an exact tie. For any other label set, each class
        accumulates the alphas of the learners voting for it and the
        heaviest class is returned; ties go to the smallest label.

        Raises:
            RuntimeError: If there are no learners, i.e. ``fit`` was not
                called or kept none.
        """
        if not self.estimators:
            raise RuntimeError(
                "AdaBoostClassifier has no fitted estimators; call fit() first"
            )

        n_samples = X.shape[0]
        classes = self.classes_
        if classes is None or set(classes.tolist()) <= {-1, 1}:
            scores = np.zeros(n_samples)
            for estimator, weight in zip(self.estimators, self.estimator_weights):
                scores += weight * estimator.predict(X)
            return np.sign(scores)

        # General labels: one column of accumulated weight per class.
        class_scores = np.zeros((n_samples, len(classes)))
        for estimator, weight in zip(self.estimators, self.estimator_weights):
            y_pred = np.asarray(estimator.predict(X))
            class_scores += weight * (y_pred[:, None] == classes[None, :])
        return classes[np.argmax(class_scores, axis=1)]
class VotingClassifier:
    """Hard-voting ensemble over a caller-supplied list of estimators.

    Args:
        estimators: Estimator objects exposing ``fit(X, y)`` and
            ``predict(X)``. They are fitted in place by ``fit``.
    """

    def __init__(self, estimators: List[Any]):
        self.estimators = estimators

    @staticmethod
    def _majority_vote(votes: np.ndarray) -> np.ndarray:
        """Return the most frequent label of every row of ``votes``.

        ``votes`` has one row per sample and one column per estimator.
        ``np.unique`` returns labels sorted, and ``argmax`` picks the first
        maximum, so a tie is resolved in favour of the smallest label.
        """
        winners = []
        for row in votes:
            labels, counts = np.unique(row, return_counts=True)
            winners.append(labels[np.argmax(counts)])
        return np.array(winners)

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit every estimator on the same ``(X, y)``."""
        for estimator in self.estimators:
            estimator.fit(X, y)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict labels for ``X`` by unweighted majority vote.

        Labels keep the dtype produced by the estimators (they are no longer
        coerced to float), so non-numeric labels work too. Ties go to the
        smallest label.

        Raises:
            ValueError: If the ensemble holds no estimators.
        """
        if not self.estimators:
            raise ValueError("VotingClassifier needs at least one estimator to predict")
        # One column of votes per estimator.
        votes = np.column_stack(
            [np.asarray(estimator.predict(X)) for estimator in self.estimators]
        )
        return self._majority_vote(votes)