import math
from typing import List, Tuple

import numpy as np
import pandas as pd


def load_data(filepath: str) -> Tuple[np.ndarray, np.ndarray]:
    """Load a CSV dataset and split it into features and labels.

    The file is read with ``pandas.read_csv`` using its default options.
    Every column except the last is treated as a feature; the last column
    is treated as the label.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A tuple ``(X, y)`` where ``X`` holds all columns but the last and
        ``y`` holds the last column.
    """
    data = pd.read_csv(filepath)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values
    return X, y


def train_test_split(
    X: np.ndarray,
    y: np.ndarray,
    test_size: float = 0.3,
    random_state: int = 42,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Randomly split a dataset into a training part and a test part.

    Args:
        X: Samples, indexed along the first axis.
        y: Labels, one per sample.
        test_size: Fraction of the samples placed in the test part; the
            test part gets ``int(n_samples * test_size)`` samples.
        random_state: Seed for the shuffle, so the split is reproducible.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]`` or ``X`` and
            ``y`` do not contain the same number of samples.
    """
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be in [0, 1], got {test_size!r}")

    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            f"X and y have different numbers of samples: "
            f"{n_samples} != {len(y)}"
        )

    n_test = int(n_samples * test_size)

    # A private RandomState yields the same permutation that
    # np.random.seed(random_state) + np.random.permutation would, but does
    # not reseed the process-wide generator behind the caller's back.
    rng = np.random.RandomState(random_state)
    indices = rng.permutation(n_samples)

    test_indices = indices[:n_test]
    train_indices = indices[n_test:]
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]


def normalize_data(
    X_train: np.ndarray, X_test: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
    """Standardize features using statistics of the training data only.

    The mean and standard deviation are computed along axis 0 of
    ``X_train`` and applied to both arrays.

    Args:
        X_train: Training samples; the statistics are derived from it.
        X_test: Test samples, scaled with the training statistics.

    Returns:
        ``(X_train_norm, X_test_norm)``.
    """
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    # Constant features have zero spread; divide by 1 instead to avoid a
    # division by zero. np.where also works when std is a scalar, which
    # in-place masking does not.
    std = np.where(std == 0, 1.0, std)
    X_train_norm = (X_train - mean) / std
    X_test_norm = (X_test - mean) / std
    return X_train_norm, X_test_norm


def accuracy_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Ground-truth labels (array or sequence).
        y_pred: Predicted labels, same shape as ``y_true``.

    Returns:
        The accuracy as a Python ``float``; NaN when the inputs are empty.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Without this check, shapes such as (n, 1) vs (n,) would broadcast
    # to (n, n) and silently produce a meaningless score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        return math.nan
    return float(np.mean(y_true == y_pred))


def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    """Compute the confusion matrix of a set of predictions.

    The classes are the sorted unique values found in either input.
    Entry ``[i, j]`` counts the samples whose true class is ``classes[i]``
    and whose predicted class is ``classes[j]``.

    Args:
        y_true: Ground-truth labels (array or sequence).
        y_pred: Predicted labels, one per entry of ``y_true``.

    Returns:
        An integer array of shape ``(n_classes, n_classes)``.

    Raises:
        ValueError: If the inputs do not contain the same number of labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.size} and {y_pred.size}"
        )

    # Map every label to its position in the sorted class list in one pass.
    classes, codes = np.unique(
        np.concatenate([y_true, y_pred]), return_inverse=True
    )
    codes = codes.ravel()
    true_codes = codes[: y_true.size]
    pred_codes = codes[y_true.size:]

    matrix = np.zeros((classes.size, classes.size), dtype=int)
    # np.add.at accumulates correctly when an index pair repeats.
    np.add.at(matrix, (true_codes, pred_codes), 1)
    return matrix


def cross_validation(
    classifier, X: np.ndarray, y: np.ndarray, k: int = 5
) -> List[float]:
    """Evaluate a classifier with K-fold cross-validation.

    The samples are split, in their given order (no shuffling), into ``k``
    contiguous folds of ``n_samples // k`` samples each; the last fold
    also receives any remainder. For each fold the classifier is fitted on
    the remaining samples and scored on the fold with ``accuracy_score``.

    Args:
        classifier: Object providing ``fit(X, y)`` and ``predict(X)``.
        X: Samples, indexed along the first axis.
        y: Labels, one per sample.
        k: Number of folds.

    Returns:
        The accuracy obtained on each fold, in fold order.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, or if ``k`` is
            not between 2 and the number of samples (which would produce
            an empty training set or empty test folds).
    """
    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            f"X and y have different numbers of samples: "
            f"{n_samples} != {len(y)}"
        )
    if not 2 <= k <= n_samples:
        raise ValueError(
            f"k must be between 2 and the number of samples "
            f"({n_samples}), got {k!r}"
        )

    fold_size = n_samples // k
    scores = []
    for fold in range(k):
        start = fold * fold_size
        # The last fold absorbs the samples left over by the division.
        stop = start + fold_size if fold < k - 1 else n_samples

        in_test = np.zeros(n_samples, dtype=bool)
        in_test[start:stop] = True

        classifier.fit(X[~in_test], y[~in_test])
        y_pred = classifier.predict(X[in_test])
        scores.append(accuracy_score(y[in_test], y_pred))
    return scores