import numpy as np
from typing import Tuple
from collections import Counter


class NaiveBayesClassifier:
    """Gaussian naive Bayes classifier."""

    def __init__(self):
        self.class_priors = {}
        self.feature_likelihoods = {}
        self.classes = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the classifier by estimating class priors and per-feature Gaussians."""
        self.classes = np.unique(y)
        n_samples, n_features = X.shape

        # Class prior probabilities P(c).
        for c in self.classes:
            self.class_priors[c] = np.sum(y == c) / n_samples

        # Per-class feature likelihood parameters: mean and variance of each feature.
        self.feature_likelihoods = {}
        for c in self.classes:
            class_data = X[y == c]
            self.feature_likelihoods[c] = {
                'mean': np.mean(class_data, axis=0),
                'var': np.var(class_data, axis=0) + 1e-10  # avoid division by zero
            }

    def _gaussian_probability(self, x: float, mean: float, var: float) -> float:
        """Gaussian probability density of x under N(mean, var)."""
        return (1 / np.sqrt(2 * np.pi * var)) * np.exp(-0.5 * ((x - mean) ** 2) / var)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels for the samples in X."""
        predictions = []
        for sample in X:
            class_scores = {}
            for c in self.classes:
                # Accumulate the log posterior (summing logs avoids underflow
                # from multiplying many small densities).
                log_prob = np.log(self.class_priors[c])
                for i, feature_value in enumerate(sample):
                    mean = self.feature_likelihoods[c]['mean'][i]
                    var = self.feature_likelihoods[c]['var'][i]
                    log_prob += np.log(self._gaussian_probability(feature_value, mean, var))
                class_scores[c] = log_prob

            # Choose the class with the highest posterior score.
            predicted_class = max(class_scores, key=class_scores.get)
            predictions.append(predicted_class)
        return np.array(predictions)


class KNNClassifier:
    """k-nearest-neighbors classifier."""

    def __init__(self, k: int = 3):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """KNN is a lazy learner: fitting just stores the training data."""
        self.X_train = X
        self.y_train = y

    def _euclidean_distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
        """Euclidean distance between two feature vectors."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels for the samples in X."""
        predictions = []
        for sample in X:
            # Distances to every training sample.
            distances = []
            for i, train_sample in enumerate(self.X_train):
                dist = self._euclidean_distance(sample, train_sample)
                distances.append((dist, self.y_train[i]))

            # Keep the k nearest neighbors.
            distances.sort(key=lambda x: x[0])
            k_nearest = distances[:self.k]

            # Majority vote among the neighbors.
            votes = [label for _, label in k_nearest]
            predicted_class = Counter(votes).most_common(1)[0][0]
            predictions.append(predicted_class)
        return np.array(predictions)


class DecisionTreeNode:
    """A single node in the decision tree."""

    def __init__(self):
        self.feature_idx = None
        self.threshold = None
        self.left = None
        self.right = None
        self.prediction = None
        self.is_leaf = False


class DecisionTreeClassifier:
    """CART-style decision tree classifier using Gini impurity."""

    def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def _gini_impurity(self, y: np.ndarray) -> float:
        """Gini impurity of a label array."""
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def _information_gain(self, y: np.ndarray, y_left: np.ndarray, y_right: np.ndarray) -> float:
        """Impurity reduction (Gini gain) achieved by a split."""
        n = len(y)
        n_left, n_right = len(y_left), len(y_right)
        if n_left == 0 or n_right == 0:
            return 0.0
        gini_parent = self._gini_impurity(y)
        gini_children = (n_left / n) * self._gini_impurity(y_left) + (n_right / n) * self._gini_impurity(y_right)
        return gini_parent - gini_children

    def _best_split(self, X: np.ndarray, y: np.ndarray) -> Tuple[int, float, float]:
        """Find the (feature index, threshold) split with the largest Gini gain."""
        best_gain = 0
        best_feature_idx = None
        best_threshold = None
        n_features = X.shape[1]

        for feature_idx in range(n_features):
            feature_values = X[:, feature_idx]
            thresholds = np.unique(feature_values)
            for threshold in thresholds:
                left_mask = feature_values <= threshold
                right_mask = ~left_mask
                if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                    continue
                y_left, y_right = y[left_mask], y[right_mask]
                gain = self._information_gain(y, y_left, y_right)
                if gain > best_gain:
                    best_gain = gain
                    best_feature_idx = feature_idx
                    best_threshold = threshold
        return best_feature_idx, best_threshold, best_gain

    def _build_tree(self, X: np.ndarray, y: np.ndarray, depth: int = 0) -> DecisionTreeNode:
        """Recursively grow the tree."""
        node = DecisionTreeNode()

        # Stopping conditions: maximum depth reached, node is pure,
        # or too few samples to split.
        if (depth >= self.max_depth
                or len(np.unique(y)) == 1
                or len(y) < self.min_samples_split):
            node.is_leaf = True
            node.prediction = Counter(y).most_common(1)[0][0]
            return node

        # Find the best split.
        feature_idx, threshold, gain = self._best_split(X, y)
        if gain == 0:
            node.is_leaf = True
            node.prediction = Counter(y).most_common(1)[0][0]
            return node

        # Split the data and recurse into the children.
        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask
        node.feature_idx = feature_idx
        node.threshold = threshold
        node.left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        node.right = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        return node

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the decision tree."""
        self.root = self._build_tree(X, y)

    def _predict_sample(self, sample: np.ndarray, node: DecisionTreeNode):
        """Predict a single sample by walking the tree from the given node."""
        if node.is_leaf:
            return node.prediction
        if sample[node.feature_idx] <= node.threshold:
            return self._predict_sample(sample, node.left)
        else:
            return self._predict_sample(sample, node.right)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels for the samples in X."""
        predictions = []
        for sample in X:
            prediction = self._predict_sample(sample, self.root)
            predictions.append(prediction)
        return np.array(predictions)
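

# --- Usage sketch -------------------------------------------------------------
# A minimal, hedged example of training and comparing the three classifiers
# above. The synthetic two-blob dataset, the train/test split sizes, and the
# accuracy check are illustrative assumptions, not part of the module's API.
if __name__ == "__main__":
    rng = np.random.default_rng(0)

    # Two well-separated 2-D Gaussian blobs as a toy binary classification task.
    X0 = rng.normal(loc=[0.0, 0.0], scale=1.0, size=(50, 2))
    X1 = rng.normal(loc=[3.0, 3.0], scale=1.0, size=(50, 2))
    X = np.vstack([X0, X1])
    y = np.array([0] * 50 + [1] * 50)

    # Shuffle, then split into training and test sets.
    idx = rng.permutation(len(X))
    X, y = X[idx], y[idx]
    X_train, X_test = X[:70], X[70:]
    y_train, y_test = y[:70], y[70:]

    for name, clf in [
        ("NaiveBayes", NaiveBayesClassifier()),
        ("KNN(k=3)", KNNClassifier(k=3)),
        ("DecisionTree", DecisionTreeClassifier(max_depth=5)),
    ]:
        clf.fit(X_train, y_train)
        acc = np.mean(clf.predict(X_test) == y_test)
        print(f"{name}: test accuracy = {acc:.2f}")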