# ml-homework/classifiers.py

import numpy as np
from typing import Optional, Tuple
from collections import Counter


class NaiveBayesClassifier:
    """Gaussian naive Bayes classifier."""

    def __init__(self):
        self.class_priors = {}
        self.feature_likelihoods = {}
        self.classes = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Estimate class priors and per-feature Gaussian parameters."""
        self.classes = np.unique(y)
        n_samples, n_features = X.shape
        # Class prior probabilities
        for c in self.classes:
            self.class_priors[c] = np.sum(y == c) / n_samples
        # Per-class feature likelihoods: mean and variance of each feature
        self.feature_likelihoods = {}
        for c in self.classes:
            class_data = X[y == c]
            self.feature_likelihoods[c] = {
                'mean': np.mean(class_data, axis=0),
                'var': np.var(class_data, axis=0) + 1e-10,  # avoid division by zero
            }

    def _log_gaussian_probability(self, x: float, mean: float, var: float) -> float:
        """Log of the Gaussian probability density."""
        return -0.5 * np.log(2 * np.pi * var) - 0.5 * ((x - mean) ** 2) / var
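
    # Note: the density is evaluated directly in log space,
    #   log N(x; mean, var) = -0.5 * log(2 * pi * var) - (x - mean)**2 / (2 * var),
    # so that vanishingly small densities do not underflow to zero before
    # predict() sums the per-feature log terms.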

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict the class with the highest posterior for each sample."""
        predictions = []
        for sample in X:
            class_scores = {}
            for c in self.classes:
                # Posterior in log form to avoid numerical underflow
                log_prob = np.log(self.class_priors[c])
                for i, feature_value in enumerate(sample):
                    mean = self.feature_likelihoods[c]['mean'][i]
                    var = self.feature_likelihoods[c]['var'][i]
                    log_prob += self._log_gaussian_probability(feature_value, mean, var)
                class_scores[c] = log_prob
            # Pick the class with the highest log posterior
            predicted_class = max(class_scores, key=class_scores.get)
            predictions.append(predicted_class)
        return np.array(predictions)
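
# A minimal usage sketch for NaiveBayesClassifier (illustrative only; the
# synthetic blobs below are invented for demonstration, not assignment data):
#
#   rng = np.random.default_rng(0)
#   X = np.vstack([rng.normal(0, 1, (20, 2)), rng.normal(3, 1, (20, 2))])
#   y = np.array([0] * 20 + [1] * 20)
#   nb = NaiveBayesClassifier()
#   nb.fit(X, y)
#   nb.predict(np.array([[0.0, 0.0], [3.0, 3.0]]))  # expected: array([0, 1])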


class KNNClassifier:
    """k-nearest-neighbors classifier."""

    def __init__(self, k: int = 3):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Store the training data (KNN is a lazy learner; no fitting happens here)."""
        self.X_train = X
        self.y_train = y

    def _euclidean_distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
        """Euclidean distance between two feature vectors."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict by majority vote among the k nearest training samples."""
        predictions = []
        for sample in X:
            # Distance from this sample to every training sample
            distances = []
            for i, train_sample in enumerate(self.X_train):
                dist = self._euclidean_distance(sample, train_sample)
                distances.append((dist, self.y_train[i]))
            # Keep the k nearest neighbors
            distances.sort(key=lambda x: x[0])
            k_nearest = distances[:self.k]
            # Majority vote over the neighbors' labels
            votes = [label for _, label in k_nearest]
            predicted_class = Counter(votes).most_common(1)[0][0]
            predictions.append(predicted_class)
        return np.array(predictions)
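
# For reference, the per-sample distance loop in KNNClassifier.predict could be
# replaced by a vectorized equivalent (a sketch, assuming X_train is an (n, d)
# float array; it selects the same k nearest neighbors):
#
#   dists = np.linalg.norm(self.X_train - sample, axis=1)
#   k_idx = np.argsort(dists)[:self.k]
#   votes = list(self.y_train[k_idx])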


class DecisionTreeNode:
    """A single node in a decision tree."""

    def __init__(self):
        self.feature_idx = None
        self.threshold = None
        self.left = None
        self.right = None
        self.prediction = None
        self.is_leaf = False


class DecisionTreeClassifier:
    """Decision tree classifier using Gini-impurity splits."""

    def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def _gini_impurity(self, y: np.ndarray) -> float:
        """Gini impurity of a label vector."""
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def _information_gain(self, y: np.ndarray, y_left: np.ndarray, y_right: np.ndarray) -> float:
        """Impurity reduction (Gini gain) achieved by a split."""
        n = len(y)
        n_left, n_right = len(y_left), len(y_right)
        if n_left == 0 or n_right == 0:
            return 0.0
        gini_parent = self._gini_impurity(y)
        gini_children = ((n_left / n) * self._gini_impurity(y_left)
                         + (n_right / n) * self._gini_impurity(y_right))
        return gini_parent - gini_children
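
    # Worked example: for y = [0, 0, 1, 1], Gini = 1 - (0.5**2 + 0.5**2) = 0.5.
    # A split into [0, 0] and [1, 1] has zero impurity on both sides, so its
    # gain is 0.5 - 0 = 0.5, the maximum possible for a balanced binary parent.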

    def _best_split(self, X: np.ndarray, y: np.ndarray) -> Tuple[Optional[int], Optional[float], float]:
        """Search every feature and threshold for the split with the highest gain."""
        best_gain = 0.0
        best_feature_idx = None
        best_threshold = None
        n_features = X.shape[1]
        for feature_idx in range(n_features):
            feature_values = X[:, feature_idx]
            thresholds = np.unique(feature_values)
            for threshold in thresholds:
                left_mask = feature_values <= threshold
                right_mask = ~left_mask
                if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                    continue
                y_left, y_right = y[left_mask], y[right_mask]
                gain = self._information_gain(y, y_left, y_right)
                if gain > best_gain:
                    best_gain = gain
                    best_feature_idx = feature_idx
                    best_threshold = threshold
        return best_feature_idx, best_threshold, best_gain

    def _build_tree(self, X: np.ndarray, y: np.ndarray, depth: int = 0) -> DecisionTreeNode:
        """Recursively grow the tree."""
        node = DecisionTreeNode()
        # Stopping conditions: depth limit reached, node is pure, or too few samples
        if (depth >= self.max_depth or
                len(np.unique(y)) == 1 or
                len(y) < self.min_samples_split):
            node.is_leaf = True
            node.prediction = Counter(y).most_common(1)[0][0]
            return node
        # Find the best split
        feature_idx, threshold, gain = self._best_split(X, y)
        if gain == 0:
            node.is_leaf = True
            node.prediction = Counter(y).most_common(1)[0][0]
            return node
        # Split the data and recurse
        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask
        node.feature_idx = feature_idx
        node.threshold = threshold
        node.left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        node.right = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        return node

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the decision tree to the training data."""
        self.root = self._build_tree(X, y)

    def _predict_sample(self, sample: np.ndarray, node: DecisionTreeNode):
        """Route a single sample down the tree to a leaf prediction."""
        if node.is_leaf:
            return node.prediction
        if sample[node.feature_idx] <= node.threshold:
            return self._predict_sample(sample, node.left)
        return self._predict_sample(sample, node.right)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict the class of each sample."""
        predictions = []
        for sample in X:
            predictions.append(self._predict_sample(sample, self.root))
        return np.array(predictions)
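

if __name__ == "__main__":
    # Quick smoke test on synthetic data (a sketch for local sanity checking,
    # not part of the assignment interface): two Gaussian blobs, one per class.
    rng = np.random.default_rng(42)
    X_train = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(4, 1, (50, 2))])
    y_train = np.array([0] * 50 + [1] * 50)
    X_test = np.vstack([rng.normal(0, 1, (10, 2)), rng.normal(4, 1, (10, 2))])
    y_test = np.array([0] * 10 + [1] * 10)

    classifiers = [
        ("NaiveBayes", NaiveBayesClassifier()),
        ("KNN(k=3)", KNNClassifier(k=3)),
        ("DecisionTree", DecisionTreeClassifier(max_depth=5)),
    ]
    for name, clf in classifiers:
        clf.fit(X_train, y_train)
        accuracy = np.mean(clf.predict(X_test) == y_test)
        print(f"{name}: accuracy = {accuracy:.2f}")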