ml-homework/utils.py

73 lines
2.5 KiB
Python

import numpy as np
import pandas as pd
from typing import Tuple, List
import math
def load_data(filepath: str) -> Tuple[np.ndarray, np.ndarray]:
    """Load a CSV dataset and separate features from labels.

    The file is parsed with ``pandas.read_csv`` using its default options.
    The last column is treated as the label; every column before it is
    treated as a feature.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A ``(X, y)`` tuple: ``X`` contains all columns except the last and
        ``y`` contains the last column, both taken from ``.values``.
    """
    frame = pd.read_csv(filepath)
    feature_columns = frame.iloc[:, :-1]
    label_column = frame.iloc[:, -1]
    return feature_columns.values, label_column.values
def train_test_split(
    X: np.ndarray,
    y: np.ndarray,
    test_size: float = 0.3,
    random_state: int = 42,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Randomly split a dataset into training and test subsets.

    Rows are shuffled with a generator seeded by ``random_state``; the first
    ``int(n_samples * test_size)`` shuffled rows form the test set and the
    remaining rows form the training set.

    Args:
        X: Feature array whose first axis indexes samples.
        y: Label array with one entry per sample in ``X``.
        test_size: Fraction of samples assigned to the test set; must lie
            in ``[0, 1]``.
        random_state: Seed for the shuffle. The same seed always yields the
            same split.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]`` or ``X`` and ``y``
            have a different number of samples.
    """
    # A negative fraction would slice from the end of the permutation and a
    # fraction above 1 would leave no training data, so reject both.
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be in [0, 1], got {test_size!r}")

    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, "
            f"got {n_samples} and {len(y)}"
        )

    # A private RandomState yields the same permutation as seeding the global
    # generator with this value, but does not reset np.random for the rest of
    # the program.
    rng = np.random.RandomState(random_state)
    indices = rng.permutation(n_samples)

    n_test = int(n_samples * test_size)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]
def normalize_data(X_train: np.ndarray, X_test: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Standardize features using statistics of the training set.

    The per-column mean and standard deviation are computed from ``X_train``
    only and then applied to both arrays, so the test set is scaled with the
    training statistics.

    Args:
        X_train: Training feature array; statistics are taken along axis 0.
        X_test: Test feature array, scaled with the training statistics.

    Returns:
        ``(X_train_norm, X_test_norm)``, each computed as
        ``(X - mean) / std``.
    """
    center = np.mean(X_train, axis=0)
    scale = np.std(X_train, axis=0)

    # Constant columns have zero spread; divide them by 1 instead of 0.
    constant_columns = scale == 0
    scale[constant_columns] = 1

    def _standardize(block):
        return (block - center) / scale

    return _standardize(X_train), _standardize(X_test)
def accuracy_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Ground-truth labels (array or any array-like sequence).
        y_pred: Predicted labels, with the same shape as ``y_true``.

    Returns:
        The mean of the element-wise equality as a Python ``float``. Empty
        input yields ``nan``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different shapes.
    """
    # Coerce to arrays so that ``==`` is element-wise; comparing two plain
    # lists would produce a single bool and an accuracy of exactly 0 or 1.
    true_labels = np.asarray(y_true)
    predicted_labels = np.asarray(y_pred)

    # Without this check, shapes such as (n,) and (n, 1) would broadcast to
    # an (n, n) comparison and silently give a meaningless score.
    if true_labels.shape != predicted_labels.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {true_labels.shape} and {predicted_labels.shape}"
        )

    return float(np.mean(true_labels == predicted_labels))
def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    """Compute the confusion matrix of a set of predictions.

    The classes are the sorted unique values found in either ``y_true`` or
    ``y_pred``. Entry ``[i, j]`` counts the samples whose true label is the
    ``i``-th class and whose predicted label is the ``j``-th class.

    Args:
        y_true: Ground-truth labels (array or any array-like sequence).
        y_pred: Predicted labels, with the same shape as ``y_true``.

    Returns:
        An integer array of shape ``(n_classes, n_classes)``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different shapes.
    """
    true_labels = np.asarray(y_true)
    predicted_labels = np.asarray(y_pred)
    if true_labels.shape != predicted_labels.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {true_labels.shape} and {predicted_labels.shape}"
        )

    # Map every label to its position in the sorted class list in one pass,
    # instead of rescanning the data once per (true, predicted) class pair.
    combined = np.concatenate([true_labels.ravel(), predicted_labels.ravel()])
    classes, class_index = np.unique(combined, return_inverse=True)
    class_index = class_index.ravel()

    n_samples = true_labels.size
    true_index = class_index[:n_samples]
    pred_index = class_index[n_samples:]

    n_classes = len(classes)
    matrix = np.zeros((n_classes, n_classes), dtype=int)
    # np.add.at accumulates repeated (row, col) pairs; plain fancy-index
    # assignment would count each distinct pair only once.
    np.add.at(matrix, (true_index, pred_index), 1)
    return matrix
def cross_validation(classifier, X: np.ndarray, y: np.ndarray, k: int = 5) -> List[float]:
    """Evaluate a classifier with K-fold cross-validation.

    The samples are cut, in their existing order and without shuffling, into
    ``k`` contiguous folds of ``n_samples // k`` rows; the last fold also
    takes any leftover rows. For each fold, ``classifier`` is fitted on the
    remaining rows and scored on the fold with ``accuracy_score``.

    Args:
        classifier: Object providing ``fit(X, y)`` and ``predict(X)``. The
            same object is refitted for every fold.
        X: Feature array whose first axis indexes samples.
        y: Label array with one entry per sample in ``X``.
        k: Number of folds; must satisfy ``2 <= k <= n_samples``.

    Returns:
        A list with the accuracy obtained on each of the ``k`` folds.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, or if ``k`` is
            outside ``[2, n_samples]``.
    """
    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, "
            f"got {n_samples} and {len(y)}"
        )
    # k < 2 leaves no training data (k == 0 would divide by zero), and
    # k > n_samples produces empty test folds whose accuracy is undefined.
    if not 2 <= k <= n_samples:
        raise ValueError(
            f"k must be between 2 and the number of samples ({n_samples}), got {k!r}"
        )

    fold_size = n_samples // k
    scores = []
    for fold in range(k):
        start_idx = fold * fold_size
        # The final fold absorbs the remainder when k does not divide n_samples.
        end_idx = start_idx + fold_size if fold < k - 1 else n_samples

        in_test_fold = np.zeros(n_samples, dtype=bool)
        in_test_fold[start_idx:end_idx] = True

        X_train_fold, X_test_fold = X[~in_test_fold], X[in_test_fold]
        y_train_fold, y_test_fold = y[~in_test_fold], y[in_test_fold]

        classifier.fit(X_train_fold, y_train_fold)
        y_pred = classifier.predict(X_test_fold)
        scores.append(accuracy_score(y_test_fold, y_pred))
    return scores