# ml-homework/classifiers.py

import numpy as np
from typing import Optional, Tuple
from collections import Counter


class NaiveBayesClassifier:
    """Gaussian naive Bayes classifier."""

    def __init__(self):
        self.class_priors = {}
        self.feature_likelihoods = {}
        self.classes = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Estimate class priors and per-feature Gaussian parameters."""
        self.classes = np.unique(y)
        n_samples, n_features = X.shape
        # Class prior probabilities
        for c in self.classes:
            self.class_priors[c] = np.sum(y == c) / n_samples
        # Per-class feature likelihoods: mean and variance of each feature
        self.feature_likelihoods = {}
        for c in self.classes:
            class_data = X[y == c]
            self.feature_likelihoods[c] = {
                'mean': np.mean(class_data, axis=0),
                'var': np.var(class_data, axis=0) + 1e-10,  # avoid division by zero
            }

    def _log_gaussian_probability(self, x: float, mean: float, var: float) -> float:
        """Log of the Gaussian probability density."""
        return -0.5 * np.log(2 * np.pi * var) - 0.5 * ((x - mean) ** 2) / var
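
    # Note: the density is evaluated directly in log space,
    #   log N(x; mean, var) = -0.5 * log(2 * pi * var) - (x - mean)**2 / (2 * var),
    # so that vanishingly small densities do not underflow to zero before
    # predict() sums the per-feature log terms.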

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict the class with the highest posterior for each sample."""
        predictions = []
        for sample in X:
            class_scores = {}
            for c in self.classes:
                # Posterior in log form to avoid numerical underflow
                log_prob = np.log(self.class_priors[c])
                for i, feature_value in enumerate(sample):
                    mean = self.feature_likelihoods[c]['mean'][i]
                    var = self.feature_likelihoods[c]['var'][i]
                    log_prob += self._log_gaussian_probability(feature_value, mean, var)
                class_scores[c] = log_prob
            # Pick the class with the highest log posterior
            predicted_class = max(class_scores, key=class_scores.get)
            predictions.append(predicted_class)
        return np.array(predictions)
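
# A minimal usage sketch for NaiveBayesClassifier (illustrative only; the
# synthetic blobs below are invented for demonstration, not assignment data):
#
#   rng = np.random.default_rng(0)
#   X = np.vstack([rng.normal(0, 1, (20, 2)), rng.normal(3, 1, (20, 2))])
#   y = np.array([0] * 20 + [1] * 20)
#   nb = NaiveBayesClassifier()
#   nb.fit(X, y)
#   nb.predict(np.array([[0.0, 0.0], [3.0, 3.0]]))  # expected: array([0, 1])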


class KNNClassifier:
    """k-nearest-neighbors classifier."""

    def __init__(self, k: int = 3):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Store the training data (KNN is a lazy learner; no fitting happens here)."""
        self.X_train = X
        self.y_train = y

    def _euclidean_distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
        """Euclidean distance between two feature vectors."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict by majority vote among the k nearest training samples."""
        predictions = []
        for sample in X:
            # Distance from this sample to every training sample
            distances = []
            for i, train_sample in enumerate(self.X_train):
                dist = self._euclidean_distance(sample, train_sample)
                distances.append((dist, self.y_train[i]))
            # Keep the k nearest neighbors
            distances.sort(key=lambda x: x[0])
            k_nearest = distances[:self.k]
            # Majority vote over the neighbors' labels
            votes = [label for _, label in k_nearest]
            predicted_class = Counter(votes).most_common(1)[0][0]
            predictions.append(predicted_class)
        return np.array(predictions)
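
# For reference, the per-sample distance loop in KNNClassifier.predict could be
# replaced by a vectorized equivalent (a sketch, assuming X_train is an (n, d)
# float array; it selects the same k nearest neighbors):
#
#   dists = np.linalg.norm(self.X_train - sample, axis=1)
#   k_idx = np.argsort(dists)[:self.k]
#   votes = list(self.y_train[k_idx])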


class DecisionTreeNode:
    """A single node in a decision tree."""

    def __init__(self):
        self.feature_idx = None
        self.threshold = None
        self.left = None
        self.right = None
        self.prediction = None
        self.is_leaf = False


class DecisionTreeClassifier:
    """Decision tree classifier using Gini-impurity splits."""

    def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def _gini_impurity(self, y: np.ndarray) -> float:
        """Gini impurity of a label vector."""
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def _information_gain(self, y: np.ndarray, y_left: np.ndarray, y_right: np.ndarray) -> float:
        """Impurity reduction (Gini gain) achieved by a split."""
        n = len(y)
        n_left, n_right = len(y_left), len(y_right)
        if n_left == 0 or n_right == 0:
            return 0.0
        gini_parent = self._gini_impurity(y)
        gini_children = ((n_left / n) * self._gini_impurity(y_left)
                         + (n_right / n) * self._gini_impurity(y_right))
        return gini_parent - gini_children
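
    # Worked example: for y = [0, 0, 1, 1], Gini = 1 - (0.5**2 + 0.5**2) = 0.5.
    # A split into [0, 0] and [1, 1] has zero impurity on both sides, so its
    # gain is 0.5 - 0 = 0.5, the maximum possible for a balanced binary parent.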

    def _best_split(self, X: np.ndarray, y: np.ndarray) -> Tuple[Optional[int], Optional[float], float]:
        """Search every feature and threshold for the split with the highest gain."""
        best_gain = 0.0
        best_feature_idx = None
        best_threshold = None
        n_features = X.shape[1]
        for feature_idx in range(n_features):
            feature_values = X[:, feature_idx]
            thresholds = np.unique(feature_values)
            for threshold in thresholds:
                left_mask = feature_values <= threshold
                right_mask = ~left_mask
                if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                    continue
                y_left, y_right = y[left_mask], y[right_mask]
                gain = self._information_gain(y, y_left, y_right)
                if gain > best_gain:
                    best_gain = gain
                    best_feature_idx = feature_idx
                    best_threshold = threshold
        return best_feature_idx, best_threshold, best_gain

    def _build_tree(self, X: np.ndarray, y: np.ndarray, depth: int = 0) -> DecisionTreeNode:
        """Recursively grow the tree."""
        node = DecisionTreeNode()
        # Stopping conditions: depth limit reached, node is pure, or too few samples
        if (depth >= self.max_depth or
                len(np.unique(y)) == 1 or
                len(y) < self.min_samples_split):
            node.is_leaf = True
            node.prediction = Counter(y).most_common(1)[0][0]
            return node
        # Find the best split
        feature_idx, threshold, gain = self._best_split(X, y)
        if gain == 0:
            node.is_leaf = True
            node.prediction = Counter(y).most_common(1)[0][0]
            return node
        # Split the data and recurse
        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask
        node.feature_idx = feature_idx
        node.threshold = threshold
        node.left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        node.right = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        return node

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the decision tree to the training data."""
        self.root = self._build_tree(X, y)

    def _predict_sample(self, sample: np.ndarray, node: DecisionTreeNode):
        """Route a single sample down the tree to a leaf prediction."""
        if node.is_leaf:
            return node.prediction
        if sample[node.feature_idx] <= node.threshold:
            return self._predict_sample(sample, node.left)
        return self._predict_sample(sample, node.right)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict the class of each sample."""
        predictions = []
        for sample in X:
            predictions.append(self._predict_sample(sample, self.root))
        return np.array(predictions)
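

if __name__ == "__main__":
    # Quick smoke test on synthetic data (a sketch for local sanity checking,
    # not part of the assignment interface): two Gaussian blobs, one per class.
    rng = np.random.default_rng(42)
    X_train = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(4, 1, (50, 2))])
    y_train = np.array([0] * 50 + [1] * 50)
    X_test = np.vstack([rng.normal(0, 1, (10, 2)), rng.normal(4, 1, (10, 2))])
    y_test = np.array([0] * 10 + [1] * 10)

    classifiers = [
        ("NaiveBayes", NaiveBayesClassifier()),
        ("KNN(k=3)", KNNClassifier(k=3)),
        ("DecisionTree", DecisionTreeClassifier(max_depth=5)),
    ]
    for name, clf in classifiers:
        clf.fit(X_train, y_train)
        accuracy = np.mean(clf.predict(X_test) == y_test)
        print(f"{name}: accuracy = {accuracy:.2f}")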