73 lines
2.5 KiB
Python
73 lines
2.5 KiB
Python
import numpy as np
|
|
import pandas as pd
|
|
from typing import Tuple, List
|
|
import math
|
|
|
|
def load_data(filepath: str) -> Tuple[np.ndarray, np.ndarray]:
    """Read a CSV file and split it into features and labels.

    Every column except the last one is returned as the feature matrix;
    the last column is returned as the label vector.

    Args:
        filepath: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A ``(X, y)`` pair of NumPy arrays.
    """
    frame = pd.read_csv(filepath)
    features, labels = frame.iloc[:, :-1], frame.iloc[:, -1]
    return features.values, labels.values
|
|
|
|
def train_test_split(X: np.ndarray, y: np.ndarray, test_size: float = 0.3, random_state: int = 42) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Shuffle the samples and split them into a training and a test part.

    Args:
        X: Feature array whose first axis indexes the samples.
        y: Label array with one entry per sample in ``X``.
        test_size: Fraction of the samples placed in the test part. The
            test count is ``int(n_samples * test_size)``, i.e. truncated.
        random_state: Seed for the shuffle; the same seed always produces
            the same split.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of
            samples.
    """
    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, "
            f"got {n_samples} and {y.shape[0]}"
        )
    n_test = int(n_samples * test_size)

    # A private RandomState produces the same permutation that seeding the
    # global generator would, but leaves the caller's global random stream
    # untouched.
    rng = np.random.RandomState(random_state)
    indices = rng.permutation(n_samples)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]
|
|
|
|
def normalize_data(X_train: np.ndarray, X_test: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Standardize both splits using statistics of the training split.

    The per-column mean and standard deviation are computed from
    ``X_train`` only and then applied to both arrays.

    Args:
        X_train: Training features; the statistics are taken along axis 0.
        X_test: Test features, scaled with the training statistics.

    Returns:
        ``(X_train_norm, X_test_norm)``.
    """
    center = np.mean(X_train, axis=0)
    scale = np.std(X_train, axis=0)

    # Constant columns have zero spread; dividing by 1 instead avoids a
    # division by zero and leaves them at their centered value.
    constant_columns = scale == 0
    scale[constant_columns] = 1

    scaled_train, scaled_test = (
        (part - center) / scale for part in (X_train, X_test)
    )
    return scaled_train, scaled_test
|
|
|
|
def accuracy_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Ground-truth labels (array or any array-like sequence).
        y_pred: Predicted labels with the same shape as ``y_true``.

    Returns:
        The mean of the element-wise equality as a built-in ``float``.
        For empty inputs the mean is undefined and the result is ``nan``.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Plain Python lists compare as whole objects with ``==``, which would
    # collapse the score to a single True/False; convert first so the
    # comparison is always element-wise.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Refuse mismatched shapes instead of letting broadcasting produce a
    # meaningless average.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    return float(np.mean(y_true == y_pred))
|
|
|
|
def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    """Build the confusion matrix of true versus predicted labels.

    The classes are the sorted unique values found in either input.
    Entry ``[i, j]`` counts the samples whose true label is class ``i``
    and whose predicted label is class ``j``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        A square integer array with one row and one column per class.
    """
    labels = np.unique(np.concatenate([y_true, y_pred]))
    size = len(labels)
    counts = np.zeros((size, size), dtype=int)

    # One membership mask per class for each input, computed up front.
    is_true = [y_true == label for label in labels]
    is_pred = [y_pred == label for label in labels]

    for row, true_mask in enumerate(is_true):
        for col, pred_mask in enumerate(is_pred):
            counts[row, col] = np.sum(true_mask & pred_mask)

    return counts
|
|
|
|
def cross_validation(classifier, X: np.ndarray, y: np.ndarray, k: int = 5) -> List[float]:
    """Evaluate a classifier with K-fold cross-validation.

    The samples are cut into ``k`` contiguous folds in their given order
    (no shuffling is done here). Each fold is used once as the test set
    while the classifier is re-fitted on the remaining samples.

    Args:
        classifier: Object providing ``fit(X, y)`` and ``predict(X)``.
            The same instance is re-fitted for every fold.
        X: Feature array whose first axis indexes the samples.
        y: Label array indexed the same way as ``X``.
        k: Number of folds.

    Returns:
        The accuracy on each fold, in fold order.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than the number
            of samples, since that would leave a fold with an empty
            training or test set.
    """
    n_samples = X.shape[0]
    if k < 2 or k > n_samples:
        raise ValueError(
            f"k must be between 2 and the number of samples "
            f"({n_samples}), got {k}"
        )

    fold_size = n_samples // k
    all_indices = np.arange(n_samples)
    scores = []

    for i in range(k):
        start_idx = i * fold_size
        # The last fold also takes the samples left over when n_samples
        # is not divisible by k.
        end_idx = start_idx + fold_size if i < k - 1 else n_samples

        test_indices = all_indices[start_idx:end_idx]
        train_indices = np.concatenate([all_indices[:start_idx], all_indices[end_idx:]])

        X_train_fold, X_test_fold = X[train_indices], X[test_indices]
        y_train_fold, y_test_fold = y[train_indices], y[test_indices]

        classifier.fit(X_train_fold, y_train_fold)
        y_pred = classifier.predict(X_test_fold)
        scores.append(accuracy_score(y_test_fold, y_pred))

    return scores
|