import numpy as np
from collections import Counter
from typing import Optional, Tuple


class NaiveBayesClassifier:
    """Gaussian Naive Bayes classifier (continuous features modeled as normals)."""

    def __init__(self):
        self.class_priors = {}         # P(c) for each class label c
        self.feature_likelihoods = {}  # per-class mean/variance of each feature
        self.classes = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the classifier by estimating class priors and per-feature Gaussian parameters."""
        self.classes = np.unique(y)
        n_samples, n_features = X.shape

        # Class prior probabilities: P(c) = (# samples in class c) / n_samples
        for c in self.classes:
            self.class_priors[c] = np.sum(y == c) / n_samples

        # Feature likelihoods: mean and variance of each feature within each class
        self.feature_likelihoods = {}
        for c in self.classes:
            class_data = X[y == c]
            self.feature_likelihoods[c] = {
                'mean': np.mean(class_data, axis=0),
                'var': np.var(class_data, axis=0) + 1e-10  # avoid division by zero
            }

    def _log_gaussian_probability(self, x: float, mean: float, var: float) -> float:
        """Log of the Gaussian density, computed directly so extreme values cannot underflow to 0."""
        return -0.5 * np.log(2 * np.pi * var) - 0.5 * ((x - mean) ** 2) / var

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict the most probable class for each sample."""
        predictions = []

        for sample in X:
            class_scores = {}

            for c in self.classes:
                # Log-posterior (up to a shared constant): log P(c) + sum_i log P(x_i | c)
                log_prob = np.log(self.class_priors[c])

                for i, feature_value in enumerate(sample):
                    mean = self.feature_likelihoods[c]['mean'][i]
                    var = self.feature_likelihoods[c]['var'][i]
                    log_prob += self._log_gaussian_probability(feature_value, mean, var)

                class_scores[c] = log_prob

            # Pick the class with the highest posterior score
            predicted_class = max(class_scores, key=class_scores.get)
            predictions.append(predicted_class)

        return np.array(predictions)
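

# Illustrative usage sketch, not part of the original class: fits the Gaussian
# Naive Bayes model on small synthetic blobs. The data shape, centers, and seed
# are arbitrary assumptions chosen only to exercise fit/predict.
def _demo_naive_bayes() -> None:
    rng = np.random.default_rng(0)
    # Two Gaussian blobs centered at (0, 0) and (3, 3)
    X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(3, 1, (50, 2))])
    y = np.array([0] * 50 + [1] * 50)

    clf = NaiveBayesClassifier()
    clf.fit(X, y)
    accuracy = np.mean(clf.predict(X) == y)
    print(f"NaiveBayes training accuracy: {accuracy:.2f}")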


class KNNClassifier:
    """k-nearest-neighbors classifier (brute-force search, majority vote)."""

    def __init__(self, k: int = 3):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Store the training data; KNN defers all computation to prediction time."""
        self.X_train = X
        self.y_train = y

    def _euclidean_distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
        """Euclidean distance between two feature vectors."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict by majority vote among the k nearest training samples."""
        predictions = []

        for sample in X:
            # Distance from this sample to every training sample
            distances = []
            for i, train_sample in enumerate(self.X_train):
                dist = self._euclidean_distance(sample, train_sample)
                distances.append((dist, self.y_train[i]))

            # Keep the k nearest neighbors
            distances.sort(key=lambda x: x[0])
            k_nearest = distances[:self.k]

            # Majority vote over the neighbors' labels
            votes = [label for _, label in k_nearest]
            predicted_class = Counter(votes).most_common(1)[0][0]
            predictions.append(predicted_class)

        return np.array(predictions)
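

# Illustrative usage sketch (an assumption, not original code): KNN on the same
# kind of synthetic two-blob data. Note that predict() is O(n_train) per query,
# so this brute-force implementation only suits small datasets.
def _demo_knn() -> None:
    rng = np.random.default_rng(1)
    X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(3, 1, (50, 2))])
    y = np.array([0] * 50 + [1] * 50)

    clf = KNNClassifier(k=5)
    clf.fit(X, y)
    queries = np.array([[0.0, 0.0], [3.0, 3.0]])
    print("KNN predictions for [0,0] and [3,3]:", clf.predict(queries))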


class DecisionTreeNode:
    """A single node of the decision tree: either an internal split or a leaf."""

    def __init__(self):
        self.feature_idx = None   # index of the feature this node splits on
        self.threshold = None     # split threshold (go left if value <= threshold)
        self.left = None
        self.right = None
        self.prediction = None    # class label stored at a leaf
        self.is_leaf = False


class DecisionTreeClassifier:
    """CART-style decision tree classifier that splits on Gini impurity."""

    def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def _gini_impurity(self, y: np.ndarray) -> float:
        """Gini impurity: 1 - sum over classes of p_c ** 2."""
        if len(y) == 0:
            return 0

        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)
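
    # Worked example of the formula above: for labels [0, 0, 1, 1] the class
    # probabilities are p_0 = p_1 = 0.5, so Gini = 1 - (0.25 + 0.25) = 0.5,
    # the maximum impurity for two classes; a pure node has Gini = 0.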

    def _information_gain(self, y: np.ndarray, y_left: np.ndarray, y_right: np.ndarray) -> float:
        """Impurity reduction achieved by splitting y into y_left and y_right."""
        n = len(y)
        n_left, n_right = len(y_left), len(y_right)

        if n_left == 0 or n_right == 0:
            return 0

        gini_parent = self._gini_impurity(y)
        gini_children = (n_left / n) * self._gini_impurity(y_left) + (n_right / n) * self._gini_impurity(y_right)

        return gini_parent - gini_children

    def _best_split(self, X: np.ndarray, y: np.ndarray) -> Tuple[Optional[int], Optional[float], float]:
        """Exhaustively search for the (feature, threshold) split with the highest gain.

        Returns (None, None, 0) when no split improves on the parent's impurity.
        """
        best_gain = 0
        best_feature_idx = None
        best_threshold = None

        n_features = X.shape[1]

        for feature_idx in range(n_features):
            feature_values = X[:, feature_idx]
            thresholds = np.unique(feature_values)

            for threshold in thresholds:
                left_mask = feature_values <= threshold
                right_mask = ~left_mask

                # Skip degenerate splits that leave one side empty
                if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                    continue

                y_left, y_right = y[left_mask], y[right_mask]
                gain = self._information_gain(y, y_left, y_right)

                if gain > best_gain:
                    best_gain = gain
                    best_feature_idx = feature_idx
                    best_threshold = threshold

        return best_feature_idx, best_threshold, best_gain

    def _build_tree(self, X: np.ndarray, y: np.ndarray, depth: int = 0) -> DecisionTreeNode:
        """Recursively grow the tree until a stopping condition is met."""
        node = DecisionTreeNode()

        # Stopping conditions: depth limit reached, node is pure, or too few samples to split
        if (depth >= self.max_depth or
                len(np.unique(y)) == 1 or
                len(y) < self.min_samples_split):
            node.is_leaf = True
            node.prediction = Counter(y).most_common(1)[0][0]  # majority class
            return node

        # Find the best split
        feature_idx, threshold, gain = self._best_split(X, y)

        # No split improves purity: make this node a leaf
        if gain == 0:
            node.is_leaf = True
            node.prediction = Counter(y).most_common(1)[0][0]
            return node

        # Partition the data and recurse into both children
        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask

        node.feature_idx = feature_idx
        node.threshold = threshold
        node.left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        node.right = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return node

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the decision tree to the training data."""
        self.root = self._build_tree(X, y)

    def _predict_sample(self, sample: np.ndarray, node: DecisionTreeNode):
        """Route a single sample down the tree until it reaches a leaf."""
        if node.is_leaf:
            return node.prediction

        if sample[node.feature_idx] <= node.threshold:
            return self._predict_sample(sample, node.left)
        else:
            return self._predict_sample(sample, node.right)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict a class label for each sample in X."""
        predictions = []
        for sample in X:
            prediction = self._predict_sample(sample, self.root)
            predictions.append(prediction)

        return np.array(predictions)
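

# Illustrative usage sketch (an assumption, not original code): a tiny
# axis-aligned problem the tree can separate exactly, plus a __main__ guard
# that runs all three demos without executing anything on import.
def _demo_decision_tree() -> None:
    # Label is 1 exactly when the first feature exceeds 0.5
    X = np.array([[0.1, 1.0], [0.4, 0.2], [0.6, 0.8], [0.9, 0.3]])
    y = np.array([0, 0, 1, 1])

    clf = DecisionTreeClassifier(max_depth=3)
    clf.fit(X, y)
    print("DecisionTree predictions:", clf.predict(X))  # expected: [0 0 1 1]


if __name__ == "__main__":
    _demo_naive_bayes()
    _demo_knn()
    _demo_decision_tree()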