feat: Implement comprehensive machine learning models and evaluations for Online Shoppers Intention and Breast Cancer datasets
This commit is contained in:
parent 047b03a590, commit 8e06e86972
220 classifiers.py
@@ -1,220 +0,0 @@
import numpy as np
from typing import Dict, List, Tuple
from collections import Counter


class NaiveBayesClassifier:
    """Gaussian naive Bayes classifier."""

    def __init__(self):
        self.class_priors = {}
        self.feature_likelihoods = {}
        self.classes = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the naive Bayes classifier."""
        self.classes = np.unique(y)
        n_samples, n_features = X.shape

        # Class prior probabilities
        for c in self.classes:
            self.class_priors[c] = np.sum(y == c) / n_samples

        # Per-class feature likelihoods (Gaussian mean and variance)
        self.feature_likelihoods = {}
        for c in self.classes:
            class_data = X[y == c]
            self.feature_likelihoods[c] = {
                'mean': np.mean(class_data, axis=0),
                'var': np.var(class_data, axis=0) + 1e-10  # avoid division by zero
            }

    def _gaussian_probability(self, x: float, mean: float, var: float) -> float:
        """Gaussian probability density."""
        return (1 / np.sqrt(2 * np.pi * var)) * np.exp(-0.5 * ((x - mean) ** 2) / var)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels."""
        predictions = []

        for sample in X:
            class_scores = {}

            for c in self.classes:
                # Posterior probability in log space to avoid underflow
                log_prob = np.log(self.class_priors[c])

                for i, feature_value in enumerate(sample):
                    mean = self.feature_likelihoods[c]['mean'][i]
                    var = self.feature_likelihoods[c]['var'][i]
                    log_prob += np.log(self._gaussian_probability(feature_value, mean, var))

                class_scores[c] = log_prob

            # Pick the class with the highest score
            predicted_class = max(class_scores, key=class_scores.get)
            predictions.append(predicted_class)

        return np.array(predictions)


class KNNClassifier:
    """k-nearest-neighbors classifier."""

    def __init__(self, k: int = 3):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the KNN classifier (a lazy learner: just store the data)."""
        self.X_train = X
        self.y_train = y

    def _euclidean_distance(self, x1: np.ndarray, x2: np.ndarray) -> float:
        """Euclidean distance between two vectors."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels."""
        predictions = []

        for sample in X:
            # Distance to every training sample
            distances = []
            for i, train_sample in enumerate(self.X_train):
                dist = self._euclidean_distance(sample, train_sample)
                distances.append((dist, self.y_train[i]))

            # Take the k nearest neighbors
            distances.sort(key=lambda x: x[0])
            k_nearest = distances[:self.k]

            # Majority vote decides the class
            votes = [label for _, label in k_nearest]
            predicted_class = max(set(votes), key=votes.count)
            predictions.append(predicted_class)

        return np.array(predictions)


class DecisionTreeNode:
    """Decision tree node."""

    def __init__(self):
        self.feature_idx = None
        self.threshold = None
        self.left = None
        self.right = None
        self.prediction = None
        self.is_leaf = False


class DecisionTreeClassifier:
    """Decision tree classifier (binary splits, Gini impurity)."""

    def __init__(self, max_depth: int = 10, min_samples_split: int = 2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def _gini_impurity(self, y: np.ndarray) -> float:
        """Gini impurity of a label set."""
        if len(y) == 0:
            return 0

        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def _information_gain(self, y: np.ndarray, y_left: np.ndarray, y_right: np.ndarray) -> float:
        """Impurity reduction achieved by a split."""
        n = len(y)
        n_left, n_right = len(y_left), len(y_right)

        if n_left == 0 or n_right == 0:
            return 0

        gini_parent = self._gini_impurity(y)
        gini_children = (n_left / n) * self._gini_impurity(y_left) + (n_right / n) * self._gini_impurity(y_right)

        return gini_parent - gini_children

    def _best_split(self, X: np.ndarray, y: np.ndarray) -> Tuple[int, float, float]:
        """Find the best (feature, threshold) split."""
        best_gain = 0
        best_feature_idx = None
        best_threshold = None

        n_features = X.shape[1]

        for feature_idx in range(n_features):
            feature_values = X[:, feature_idx]
            thresholds = np.unique(feature_values)

            for threshold in thresholds:
                left_mask = feature_values <= threshold
                right_mask = ~left_mask

                if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                    continue

                y_left, y_right = y[left_mask], y[right_mask]
                gain = self._information_gain(y, y_left, y_right)

                if gain > best_gain:
                    best_gain = gain
                    best_feature_idx = feature_idx
                    best_threshold = threshold

        return best_feature_idx, best_threshold, best_gain

    def _build_tree(self, X: np.ndarray, y: np.ndarray, depth: int = 0) -> DecisionTreeNode:
        """Recursively build the decision tree."""
        node = DecisionTreeNode()

        # Stopping conditions
        if (depth >= self.max_depth or
                len(np.unique(y)) == 1 or
                len(y) < self.min_samples_split):
            node.is_leaf = True
            node.prediction = max(set(y), key=list(y).count)
            return node

        # Find the best split
        feature_idx, threshold, gain = self._best_split(X, y)

        if gain == 0:
            node.is_leaf = True
            node.prediction = max(set(y), key=list(y).count)
            return node

        # Split the data
        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask

        node.feature_idx = feature_idx
        node.threshold = threshold
        node.left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        node.right = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return node

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the decision tree."""
        self.root = self._build_tree(X, y)

    def _predict_sample(self, sample: np.ndarray, node: DecisionTreeNode):
        """Predict a single sample by walking the tree."""
        if node.is_leaf:
            return node.prediction

        if sample[node.feature_idx] <= node.threshold:
            return self._predict_sample(sample, node.left)
        else:
            return self._predict_sample(sample, node.right)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels."""
        predictions = []
        for sample in X:
            prediction = self._predict_sample(sample, self.root)
            predictions.append(prediction)

        return np.array(predictions)
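Usage sketch (illustrative, not part of the commit): a minimal smoke test of the three classifiers above on synthetic toy data; the seed, shapes, and class means are arbitrary assumptions.

import numpy as np
from classifiers import NaiveBayesClassifier, KNNClassifier, DecisionTreeClassifier

# Toy two-class data (illustrative only)
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (50, 4)), rng.normal(3, 1, (50, 4))])
y = np.array([0] * 50 + [1] * 50)

for clf in (NaiveBayesClassifier(), KNNClassifier(k=5), DecisionTreeClassifier(max_depth=4)):
    clf.fit(X, y)
    acc = np.mean(clf.predict(X) == y)
    print(type(clf).__name__, f"train accuracy: {acc:.3f}")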
142 ensemble.py
@@ -1,142 +0,0 @@
import numpy as np
from typing import List, Any
from classifiers import DecisionTreeClassifier, NaiveBayesClassifier, KNNClassifier


class BaggingClassifier:
    """Bagging ensemble classifier."""

    def __init__(self, base_classifier, n_estimators: int = 10, random_state: int = 42):
        self.base_classifier = base_classifier
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.estimators = []

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the bagging ensemble."""
        np.random.seed(self.random_state)
        n_samples = X.shape[0]

        self.estimators = []

        for i in range(self.n_estimators):
            # Bootstrap sampling (with replacement)
            bootstrap_indices = np.random.choice(n_samples, size=n_samples, replace=True)
            X_bootstrap = X[bootstrap_indices]
            y_bootstrap = y[bootstrap_indices]

            # Build and train the base classifier
            if self.base_classifier == 'decision_tree':
                estimator = DecisionTreeClassifier(max_depth=8)
            elif self.base_classifier == 'naive_bayes':
                estimator = NaiveBayesClassifier()
            elif self.base_classifier == 'knn':
                estimator = KNNClassifier(k=5)
            else:
                raise ValueError(f"Unknown base classifier: {self.base_classifier}")

            estimator.fit(X_bootstrap, y_bootstrap)
            self.estimators.append(estimator)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict by majority vote over the base classifiers."""
        # Collect the predictions of every base classifier
        predictions = np.zeros((X.shape[0], self.n_estimators))

        for i, estimator in enumerate(self.estimators):
            predictions[:, i] = estimator.predict(X)

        # Majority vote decides the final prediction
        final_predictions = []
        for i in range(X.shape[0]):
            votes = predictions[i, :]
            prediction = max(set(votes), key=list(votes).count)
            final_predictions.append(prediction)

        return np.array(final_predictions)


class AdaBoostClassifier:
    """AdaBoost ensemble classifier (labels are assumed to be -1/+1)."""

    def __init__(self, n_estimators: int = 10, random_state: int = 42):
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.estimators = []
        self.estimator_weights = []

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the AdaBoost ensemble."""
        np.random.seed(self.random_state)
        n_samples = X.shape[0]

        # Initialize sample weights uniformly
        sample_weights = np.ones(n_samples) / n_samples

        self.estimators = []
        self.estimator_weights = []

        for i in range(self.n_estimators):
            # Resample according to the current sample weights
            sample_indices = np.random.choice(
                n_samples, size=n_samples, replace=True, p=sample_weights
            )
            X_weighted = X[sample_indices]
            y_weighted = y[sample_indices]

            # Train a weak learner (decision stump)
            estimator = DecisionTreeClassifier(max_depth=1)
            estimator.fit(X_weighted, y_weighted)

            # Weighted error rate on the full training set
            y_pred = estimator.predict(X)
            error_mask = y_pred != y
            error_rate = np.average(error_mask, weights=sample_weights)

            # Stop if the weak learner is no better than chance
            if error_rate >= 0.5:
                break

            # Classifier weight
            alpha = 0.5 * np.log((1 - error_rate) / (error_rate + 1e-10))

            # Update sample weights (relies on y and y_pred being in {-1, +1})
            sample_weights *= np.exp(-alpha * y * y_pred)
            sample_weights /= np.sum(sample_weights)

            self.estimators.append(estimator)
            self.estimator_weights.append(alpha)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict by the sign of the weighted vote."""
        n_samples = X.shape[0]
        predictions = np.zeros(n_samples)

        for estimator, weight in zip(self.estimators, self.estimator_weights):
            y_pred = estimator.predict(X)
            predictions += weight * y_pred

        return np.sign(predictions)


class VotingClassifier:
    """Voting ensemble classifier."""

    def __init__(self, estimators: List[Any]):
        self.estimators = estimators

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit every member classifier."""
        for estimator in self.estimators:
            estimator.fit(X, y)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict by majority vote."""
        predictions = np.zeros((X.shape[0], len(self.estimators)))

        for i, estimator in enumerate(self.estimators):
            predictions[:, i] = estimator.predict(X)

        # Majority vote decides the final prediction
        final_predictions = []
        for i in range(X.shape[0]):
            votes = predictions[i, :]
            prediction = max(set(votes), key=list(votes).count)
            final_predictions.append(prediction)

        return np.array(final_predictions)
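Usage sketch (illustrative, not part of the commit): exercising the bagging and boosting ensembles above on toy data. Note the -1/+1 labels, which the AdaBoost weight update and sign-based prediction rely on; the seed and shapes are arbitrary assumptions.

import numpy as np
from ensemble import BaggingClassifier, AdaBoostClassifier

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(-1, 1, (40, 3)), rng.normal(1, 1, (40, 3))])
y = np.array([-1] * 40 + [1] * 40)  # AdaBoost expects -1/+1 labels

bag = BaggingClassifier('decision_tree', n_estimators=5)
bag.fit(X, y)
print("bagging train acc:", np.mean(bag.predict(X) == y))

ada = AdaBoostClassifier(n_estimators=5)
ada.fit(X, y)
print("adaboost train acc:", np.mean(ada.predict(X) == y))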
2430 exper.ipynb (Normal file)
File diff suppressed because one or more lines are too long
319 experiments.py
@@ -1,319 +0,0 @@
import numpy as np
import matplotlib.pyplot as plt
import time
from typing import Dict, List, Tuple

from utils import train_test_split, normalize_data, accuracy_score, cross_validation
from feature_extraction import PCA, FeatureSelector
from improved_bp import ImprovedBPNetwork, StandardBPNetwork
from classifiers import NaiveBayesClassifier, KNNClassifier, DecisionTreeClassifier
from ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier


class ExperimentRunner:
    """Experiment runner."""

    def __init__(self):
        self.results = {}

    def generate_synthetic_data(self, n_samples: int = 1000, n_features: int = 20, n_classes: int = 3,
                                random_state: int = 42) -> Tuple[np.ndarray, np.ndarray]:
        """Generate a synthetic dataset."""
        np.random.seed(random_state)

        # A different mean vector for each class
        class_means = np.random.randn(n_classes, n_features) * 2
        X = []
        y = []

        samples_per_class = n_samples // n_classes

        for class_idx in range(n_classes):
            # Generate the samples for this class
            class_data = np.random.randn(samples_per_class, n_features) + class_means[class_idx]
            X.append(class_data)
            y.extend([class_idx] * samples_per_class)

        X = np.vstack(X)
        y = np.array(y)

        # Append uninformative noise features
        noise_features = np.random.randn(len(X), n_features // 2)
        X = np.hstack([X, noise_features])

        return X, y

    def run_bp_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str):
        """Compare the improved and standard BP networks."""
        print(f"\n=== BP comparison - {dataset_name} ===")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        # Improved BP network
        print("Training the improved BP network...")
        start_time = time.time()
        improved_bp = ImprovedBPNetwork(hidden_layers=[10, 5], learning_rate=0.01, max_epochs=500)
        improved_bp.fit(X_train_norm, y_train)
        improved_train_time = time.time() - start_time

        y_pred_improved = improved_bp.predict(X_test_norm)
        improved_accuracy = accuracy_score(y_test, y_pred_improved)

        # Standard BP network
        print("Training the standard BP network...")
        start_time = time.time()
        standard_bp = StandardBPNetwork(hidden_layers=[10, 5], learning_rate=0.01, max_epochs=500)
        standard_bp.fit(X_train_norm, y_train)
        standard_train_time = time.time() - start_time

        y_pred_standard = standard_bp.predict(X_test_norm)
        standard_accuracy = accuracy_score(y_test, y_pred_standard)

        # Results
        print(f"Improved BP - accuracy: {improved_accuracy:.4f}, training time: {improved_train_time:.2f}s")
        print(f"Standard BP - accuracy: {standard_accuracy:.4f}, training time: {standard_train_time:.2f}s")

        # Plot the loss curves
        plt.figure(figsize=(10, 6))
        plt.plot(improved_bp.loss_history, label='Improved BP', alpha=0.8)
        plt.plot(standard_bp.loss_history, label='Standard BP', alpha=0.8)
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title(f'BP loss curves - {dataset_name}')
        plt.legend()
        plt.grid(True)
        # Note: hard-coded output path
        plt.savefig(f'c:/Users/grtsi/ml-homework/bp_comparison_{dataset_name.lower()}.png')
        plt.show()

        return {
            'improved_bp': {'accuracy': improved_accuracy, 'time': improved_train_time},
            'standard_bp': {'accuracy': standard_accuracy, 'time': standard_train_time}
        }

    def run_feature_extraction_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str):
        """Compare classifiers with and without feature extraction."""
        print(f"\n=== Feature extraction comparison - {dataset_name} ===")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        classifiers = {
            'NaiveBayes': NaiveBayesClassifier(),
            'KNN': KNNClassifier(k=5),
            'DecisionTree': DecisionTreeClassifier(max_depth=8)
        }

        results = {}

        for clf_name, clf in classifiers.items():
            print(f"\n{clf_name} classifier:")

            # No feature extraction
            clf_no_fe = type(clf)() if clf_name != 'KNN' else KNNClassifier(k=5)
            clf_no_fe.fit(X_train_norm, y_train)
            y_pred_no_fe = clf_no_fe.predict(X_test_norm)
            acc_no_fe = accuracy_score(y_test, y_pred_no_fe)

            # PCA feature extraction
            pca = PCA(n_components=min(10, X.shape[1] // 2))
            X_train_pca = pca.fit_transform(X_train_norm)
            X_test_pca = pca.transform(X_test_norm)

            clf_pca = type(clf)() if clf_name != 'KNN' else KNNClassifier(k=5)
            clf_pca.fit(X_train_pca, y_train)
            y_pred_pca = clf_pca.predict(X_test_pca)
            acc_pca = accuracy_score(y_test, y_pred_pca)

            # Feature selection
            feature_selector = FeatureSelector(k=min(10, X.shape[1] // 2))
            X_train_fs = feature_selector.fit_transform(X_train_norm, y_train)
            X_test_fs = feature_selector.transform(X_test_norm)

            clf_fs = type(clf)() if clf_name != 'KNN' else KNNClassifier(k=5)
            clf_fs.fit(X_train_fs, y_train)
            y_pred_fs = clf_fs.predict(X_test_fs)
            acc_fs = accuracy_score(y_test, y_pred_fs)

            print(f"  No feature extraction: {acc_no_fe:.4f}")
            print(f"  PCA: {acc_pca:.4f}")
            print(f"  Feature selection: {acc_fs:.4f}")

            results[clf_name] = {
                'no_feature_extraction': acc_no_fe,
                'pca': acc_pca,
                'feature_selection': acc_fs
            }

        return results

    def run_classifier_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str):
        """Compare the base classifiers."""
        print(f"\n=== Classifier comparison - {dataset_name} ===")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        classifiers = {
            'NaiveBayes': NaiveBayesClassifier(),
            'KNN': KNNClassifier(k=5),
            'DecisionTree': DecisionTreeClassifier(max_depth=8)
        }

        results = {}

        for clf_name, clf in classifiers.items():
            print(f"\n{clf_name} classifier:")

            # Train and test
            start_time = time.time()
            clf.fit(X_train_norm, y_train)
            train_time = time.time() - start_time

            y_pred = clf.predict(X_test_norm)
            accuracy = accuracy_score(y_test, y_pred)

            # Cross-validation
            cv_scores = cross_validation(type(clf)() if clf_name != 'KNN' else KNNClassifier(k=5),
                                         X_train_norm, y_train, k=5)

            print(f"  Accuracy: {accuracy:.4f}")
            print(f"  Training time: {train_time:.4f}s")
            print(f"  Cross-validation mean: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

            results[clf_name] = {
                'accuracy': accuracy,
                'train_time': train_time,
                'cv_mean': np.mean(cv_scores),
                'cv_std': np.std(cv_scores)
            }

        return results

    def run_ensemble_comparison(self, X: np.ndarray, y: np.ndarray, dataset_name: str):
        """Compare base classifiers against ensemble methods."""
        print(f"\n=== Ensemble comparison - {dataset_name} ===")

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        X_train_norm, X_test_norm = normalize_data(X_train, X_test)

        # Base classifiers
        base_classifiers = {
            'DecisionTree': DecisionTreeClassifier(max_depth=8),
            'NaiveBayes': NaiveBayesClassifier(),
            'KNN': KNNClassifier(k=5)
        }

        # Ensemble classifiers
        ensemble_classifiers = {
            'Bagging_DT': BaggingClassifier('decision_tree', n_estimators=10),
            'Voting': VotingClassifier([
                DecisionTreeClassifier(max_depth=8),
                NaiveBayesClassifier(),
                KNNClassifier(k=5)
            ])
        }

        results = {}

        # Evaluate the base classifiers
        print("Base classifiers:")
        for clf_name, clf in base_classifiers.items():
            start_time = time.time()
            clf.fit(X_train_norm, y_train)
            train_time = time.time() - start_time

            y_pred = clf.predict(X_test_norm)
            accuracy = accuracy_score(y_test, y_pred)

            print(f"  {clf_name}: {accuracy:.4f} (training time: {train_time:.4f}s)")
            results[clf_name] = {'accuracy': accuracy, 'train_time': train_time}

        # Evaluate the ensemble classifiers
        print("\nEnsemble classifiers:")
        for clf_name, clf in ensemble_classifiers.items():
            start_time = time.time()
            clf.fit(X_train_norm, y_train)
            train_time = time.time() - start_time

            y_pred = clf.predict(X_test_norm)
            accuracy = accuracy_score(y_test, y_pred)

            print(f"  {clf_name}: {accuracy:.4f} (training time: {train_time:.4f}s)")
            results[clf_name] = {'accuracy': accuracy, 'train_time': train_time}

        return results

    def run_all_experiments(self):
        """Run every experiment."""
        print("Starting the machine learning comparison experiments...")

        # Generate two different datasets
        print("Generating datasets...")
        X1, y1 = self.generate_synthetic_data(n_samples=800, n_features=20, n_classes=3, random_state=42)
        X2, y2 = self.generate_synthetic_data(n_samples=1000, n_features=25, n_classes=4, random_state=123)

        datasets = [
            (X1, y1, "Dataset1"),
            (X2, y2, "Dataset2")
        ]

        all_results = {}

        for X, y, dataset_name in datasets:
            print(f"\n{'='*50}")
            print(f"Processing dataset: {dataset_name}")
            print(f"Samples: {X.shape[0]}, features: {X.shape[1]}, classes: {len(np.unique(y))}")

            # Run each experiment
            bp_results = self.run_bp_comparison(X, y, dataset_name)
            fe_results = self.run_feature_extraction_comparison(X, y, dataset_name)
            clf_results = self.run_classifier_comparison(X, y, dataset_name)
            ensemble_results = self.run_ensemble_comparison(X, y, dataset_name)

            all_results[dataset_name] = {
                'bp_comparison': bp_results,
                'feature_extraction': fe_results,
                'classifier_comparison': clf_results,
                'ensemble_comparison': ensemble_results
            }

        # Produce the summary report
        self.generate_summary_report(all_results)

        return all_results

    def generate_summary_report(self, results: Dict):
        """Print a summary report."""
        print(f"\n{'='*60}")
        print("Experiment summary")
        print(f"{'='*60}")

        for dataset_name, dataset_results in results.items():
            print(f"\n{dataset_name} summary:")
            print("-" * 40)

            # BP comparison
            bp_results = dataset_results['bp_comparison']
            print("BP comparison:")
            print(f"  Improved BP: accuracy {bp_results['improved_bp']['accuracy']:.4f}, time {bp_results['improved_bp']['time']:.2f}s")
            print(f"  Standard BP: accuracy {bp_results['standard_bp']['accuracy']:.4f}, time {bp_results['standard_bp']['time']:.2f}s")

            # Feature extraction comparison
            fe_results = dataset_results['feature_extraction']
            print("\nFeature extraction (best result per classifier):")
            for clf_name, clf_results in fe_results.items():
                best_method = max(clf_results, key=clf_results.get)
                best_acc = clf_results[best_method]
                print(f"  {clf_name}: {best_method} ({best_acc:.4f})")

            # Ensemble comparison
            ensemble_results = dataset_results['ensemble_comparison']
            print("\nClassifier performance ranking:")
            sorted_classifiers = sorted(ensemble_results.items(),
                                        key=lambda x: x[1]['accuracy'], reverse=True)
            for i, (clf_name, clf_result) in enumerate(sorted_classifiers[:5]):
                print(f"  {i+1}. {clf_name}: {clf_result['accuracy']:.4f}")


if __name__ == "__main__":
    runner = ExperimentRunner()
    results = runner.run_all_experiments()
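Usage sketch (illustrative, not part of the commit): the runner can also be driven one experiment at a time rather than via run_all_experiments; the dataset sizes and the "SmokeTest" name are arbitrary assumptions.

from experiments import ExperimentRunner

runner = ExperimentRunner()
X, y = runner.generate_synthetic_data(n_samples=300, n_features=10, n_classes=2)
clf_results = runner.run_classifier_comparison(X, y, "SmokeTest")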
96 feature_extraction.py
@@ -1,96 +0,0 @@
import numpy as np
from typing import Tuple


class PCA:
    """Principal component analysis."""

    def __init__(self, n_components: int):
        self.n_components = n_components
        self.components_ = None
        self.mean_ = None
        self.explained_variance_ratio_ = None

    def fit(self, X: np.ndarray) -> 'PCA':
        """Fit the PCA model."""
        self.mean_ = np.mean(X, axis=0)
        X_centered = X - self.mean_

        # Covariance matrix
        cov_matrix = np.cov(X_centered, rowvar=False)

        # Eigenvalues and eigenvectors
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # Sort by eigenvalue, descending
        idx = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]

        # Keep the first n_components principal components
        self.components_ = eigenvectors[:, :self.n_components].T
        self.explained_variance_ratio_ = eigenvalues[:self.n_components] / np.sum(eigenvalues)

        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Project X onto the principal components."""
        X_centered = X - self.mean_
        return np.dot(X_centered, self.components_.T)

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """Fit, then transform."""
        return self.fit(X).transform(X)


class FeatureSelector:
    """Information-gain-based feature selection."""

    def __init__(self, k: int):
        self.k = k
        self.selected_features_ = None

    def _entropy(self, y: np.ndarray) -> float:
        """Shannon entropy of a label set."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities + 1e-10))

    def _information_gain(self, X_feature: np.ndarray, y: np.ndarray) -> float:
        """Information gain of a single feature."""
        # Discretize continuous features into bins
        if len(np.unique(X_feature)) > 10:
            bins = np.linspace(np.min(X_feature), np.max(X_feature), 11)
            X_feature = np.digitize(X_feature, bins)

        total_entropy = self._entropy(y)

        values, counts = np.unique(X_feature, return_counts=True)
        weighted_entropy = 0

        for value, count in zip(values, counts):
            subset_y = y[X_feature == value]
            weighted_entropy += (count / len(y)) * self._entropy(subset_y)

        return total_entropy - weighted_entropy

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'FeatureSelector':
        """Score every feature and keep the top k."""
        n_features = X.shape[1]
        feature_scores = []

        for i in range(n_features):
            score = self._information_gain(X[:, i], y)
            feature_scores.append((i, score))

        # Sort by information gain, descending
        feature_scores.sort(key=lambda x: x[1], reverse=True)
        self.selected_features_ = [idx for idx, _ in feature_scores[:self.k]]

        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Keep only the selected features."""
        return X[:, self.selected_features_]

    def fit_transform(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Fit, then transform."""
        return self.fit(X, y).transform(X)
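Usage sketch (illustrative, not part of the commit): reducing an 8-feature toy matrix with both extractors above; the seed, shapes, and label rule are arbitrary assumptions.

import numpy as np
from feature_extraction import PCA, FeatureSelector

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 8))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)
print(X_pca.shape, pca.explained_variance_ratio_)

fs = FeatureSelector(k=3)
X_fs = fs.fit_transform(X, y)
print(fs.selected_features_, X_fs.shape)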
248 improved_bp.py
@@ -1,248 +0,0 @@
import numpy as np
from typing import List, Tuple


class ImprovedBPNetwork:
    """Improved BP neural network with dynamic learning-rate adjustment."""

    def __init__(self, hidden_layers: List[int], learning_rate: float = 0.01,
                 max_epochs: int = 1000, tolerance: float = 1e-6):
        self.hidden_layers = hidden_layers
        self.initial_lr = learning_rate
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.tolerance = tolerance
        self.weights = []
        self.biases = []
        self.loss_history = []

    def _sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Sigmoid activation."""
        x = np.clip(x, -500, 500)  # prevent overflow
        return 1 / (1 + np.exp(-x))

    def _sigmoid_derivative(self, x: np.ndarray) -> np.ndarray:
        """Derivative of the sigmoid."""
        s = self._sigmoid(x)
        return s * (1 - s)

    def _initialize_weights(self, input_size: int, output_size: int):
        """Initialize weights and biases."""
        self.weights = []
        self.biases = []

        # Network layout
        layers = [input_size] + self.hidden_layers + [output_size]

        # Xavier initialization
        for i in range(len(layers) - 1):
            w = np.random.normal(0, np.sqrt(2.0 / (layers[i] + layers[i+1])),
                                 (layers[i], layers[i+1]))
            b = np.zeros((1, layers[i+1]))
            self.weights.append(w)
            self.biases.append(b)

    def _forward_pass(self, X: np.ndarray) -> List[np.ndarray]:
        """Forward pass; returns the activations of every layer."""
        activations = [X]

        for i in range(len(self.weights)):
            z = np.dot(activations[-1], self.weights[i]) + self.biases[i]
            a = self._sigmoid(z)
            activations.append(a)

        return activations

    def _backward_pass(self, X: np.ndarray, y: np.ndarray, activations: List[np.ndarray]) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Backpropagation; returns the weight and bias gradients."""
        m = X.shape[0]
        dw = [np.zeros_like(w) for w in self.weights]
        db = [np.zeros_like(b) for b in self.biases]

        # Output-layer error
        delta = activations[-1] - y

        # Propagate the error back towards the input layer
        for i in range(len(self.weights) - 1, -1, -1):
            dw[i] = np.dot(activations[i].T, delta) / m
            db[i] = np.mean(delta, axis=0, keepdims=True)

            if i > 0:
                # Error term for the previous layer; since activations[i]
                # is sigmoid of its pre-activation, sigma'(z) = a * (1 - a)
                delta = np.dot(delta, self.weights[i].T) * activations[i] * (1 - activations[i])

        return dw, db

    def _adaptive_learning_rate(self, epoch: int, current_loss: float, prev_loss: float):
        """Adjust the learning rate based on loss progress."""
        if epoch > 0:
            if current_loss > prev_loss:
                # Loss increased: shrink the learning rate
                self.learning_rate *= 0.9
            elif (prev_loss - current_loss) / prev_loss < 0.001:
                # Loss is falling slowly: grow the learning rate
                self.learning_rate *= 1.05

            # Keep the learning rate within a fixed range
            self.learning_rate = np.clip(self.learning_rate,
                                         self.initial_lr * 0.01,
                                         self.initial_lr * 10)

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Train the network."""
        # One-hot encode integer labels
        if len(y.shape) == 1:
            y_encoded = np.zeros((len(y), len(np.unique(y))))
            for i, label in enumerate(np.unique(y)):
                y_encoded[y == label, i] = 1
            y = y_encoded

        self._initialize_weights(X.shape[1], y.shape[1])

        prev_loss = float('inf')

        for epoch in range(self.max_epochs):
            # Forward pass
            activations = self._forward_pass(X)

            # Mean squared error loss
            loss = np.mean((activations[-1] - y) ** 2)
            self.loss_history.append(loss)

            # Dynamically adjust the learning rate
            self._adaptive_learning_rate(epoch, loss, prev_loss)

            # Backward pass
            dw, db = self._backward_pass(X, y, activations)

            # Update weights and biases
            for i in range(len(self.weights)):
                self.weights[i] -= self.learning_rate * dw[i]
                self.biases[i] -= self.learning_rate * db[i]

            # Convergence check
            if abs(prev_loss - loss) < self.tolerance:
                print(f"Training converged at epoch {epoch+1}")
                break

            prev_loss = loss

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.6f}, LR: {self.learning_rate:.6f}")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class indices."""
        activations = self._forward_pass(X)
        return np.argmax(activations[-1], axis=1)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict class scores."""
        activations = self._forward_pass(X)
        return activations[-1]


class StandardBPNetwork:
    """Standard BP neural network (fixed learning rate)."""

    def __init__(self, hidden_layers: List[int], learning_rate: float = 0.01,
                 max_epochs: int = 1000, tolerance: float = 1e-6):
        self.hidden_layers = hidden_layers
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.tolerance = tolerance
        self.weights = []
        self.biases = []
        self.loss_history = []

    def _sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Sigmoid activation."""
        x = np.clip(x, -500, 500)  # prevent overflow
        return 1 / (1 + np.exp(-x))

    def _sigmoid_derivative(self, x: np.ndarray) -> np.ndarray:
        """Derivative of the sigmoid."""
        s = self._sigmoid(x)
        return s * (1 - s)

    def _initialize_weights(self, input_size: int, output_size: int):
        """Initialize weights and biases."""
        self.weights = []
        self.biases = []

        layers = [input_size] + self.hidden_layers + [output_size]

        for i in range(len(layers) - 1):
            w = np.random.normal(0, np.sqrt(2.0 / (layers[i] + layers[i+1])),
                                 (layers[i], layers[i+1]))
            b = np.zeros((1, layers[i+1]))
            self.weights.append(w)
            self.biases.append(b)

    def _forward_pass(self, X: np.ndarray) -> List[np.ndarray]:
        """Forward pass; returns the activations of every layer."""
        activations = [X]

        for i in range(len(self.weights)):
            z = np.dot(activations[-1], self.weights[i]) + self.biases[i]
            a = self._sigmoid(z)
            activations.append(a)

        return activations

    def _backward_pass(self, X: np.ndarray, y: np.ndarray, activations: List[np.ndarray]) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Backpropagation; returns the weight and bias gradients."""
        m = X.shape[0]
        dw = [np.zeros_like(w) for w in self.weights]
        db = [np.zeros_like(b) for b in self.biases]

        delta = activations[-1] - y

        for i in range(len(self.weights) - 1, -1, -1):
            dw[i] = np.dot(activations[i].T, delta) / m
            db[i] = np.mean(delta, axis=0, keepdims=True)

            if i > 0:
                # Error term for the previous layer; sigma'(z) = a * (1 - a)
                delta = np.dot(delta, self.weights[i].T) * activations[i] * (1 - activations[i])

        return dw, db

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Train the network."""
        if len(y.shape) == 1:
            y_encoded = np.zeros((len(y), len(np.unique(y))))
            for i, label in enumerate(np.unique(y)):
                y_encoded[y == label, i] = 1
            y = y_encoded

        self._initialize_weights(X.shape[1], y.shape[1])

        prev_loss = float('inf')

        for epoch in range(self.max_epochs):
            activations = self._forward_pass(X)
            loss = np.mean((activations[-1] - y) ** 2)
            self.loss_history.append(loss)

            dw, db = self._backward_pass(X, y, activations)

            for i in range(len(self.weights)):
                self.weights[i] -= self.learning_rate * dw[i]
                self.biases[i] -= self.learning_rate * db[i]

            if abs(prev_loss - loss) < self.tolerance:
                print(f"Standard BP training converged at epoch {epoch+1}")
                break

            prev_loss = loss

            if epoch % 100 == 0:
                print(f"Standard BP Epoch {epoch}, Loss: {loss:.6f}")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class indices."""
        activations = self._forward_pass(X)
        return np.argmax(activations[-1], axis=1)

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Predict class scores."""
        activations = self._forward_pass(X)
        return activations[-1]
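Usage sketch (illustrative, not part of the commit): training the improved BP network above on a two-class toy problem; the seed, layer size, and learning rate are arbitrary assumptions. predict() returns class indices in np.unique(y) order, so 0/1 labels map back directly.

import numpy as np
from improved_bp import ImprovedBPNetwork

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (60, 5)), rng.normal(2, 1, (60, 5))])
y = np.array([0] * 60 + [1] * 60)

net = ImprovedBPNetwork(hidden_layers=[8], learning_rate=0.1, max_epochs=300)
net.fit(X, y)
print("train acc:", np.mean(net.predict(X) == y))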
981 main.ipynb
File diff suppressed because one or more lines are too long
29 main.py
@@ -1,29 +0,0 @@
import numpy as np
import matplotlib.pyplot as plt
from experiments import ExperimentRunner


def main():
    """Entry point."""
    print("Machine learning algorithm implementation and comparison system")
    print("="*50)
    print("This system implements the following:")
    print("1. Improved BP neural network vs. standard BP network")
    print("2. Effect of feature extraction (PCA, feature selection) on classification performance")
    print("3. Comparison of several classifiers (naive Bayes, KNN, decision tree)")
    print("4. Ensemble learning (Bagging, Voting)")
    print("5. All algorithms are implemented from scratch, without any ML libraries")
    print("="*50)

    # Configure matplotlib fonts (SimHei enables CJK glyphs in plots)
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Run the experiments
    runner = ExperimentRunner()
    results = runner.run_all_experiments()

    print("\nExperiments complete! Results are summarized in the experiment report.")
    print("Plots have been saved to local files.")


if __name__ == "__main__":
    main()
72 utils.py
@@ -1,72 +0,0 @@
import numpy as np
import pandas as pd
from typing import Tuple, List
import math


def load_data(filepath: str) -> Tuple[np.ndarray, np.ndarray]:
    """Load a dataset from CSV; the last column is the label."""
    data = pd.read_csv(filepath)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values
    return X, y


def train_test_split(X: np.ndarray, y: np.ndarray, test_size: float = 0.3, random_state: int = 42) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Split a dataset into train and test sets."""
    np.random.seed(random_state)
    n_samples = X.shape[0]
    n_test = int(n_samples * test_size)

    indices = np.random.permutation(n_samples)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]


def normalize_data(X_train: np.ndarray, X_test: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Standardize features using the training-set statistics."""
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    std[std == 0] = 1  # avoid division by zero

    X_train_norm = (X_train - mean) / std
    X_test_norm = (X_test - mean) / std

    return X_train_norm, X_test_norm


def accuracy_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Classification accuracy."""
    return np.mean(y_true == y_pred)


def confusion_matrix(y_true: np.ndarray, y_pred: np.ndarray) -> np.ndarray:
    """Confusion matrix over the union of observed classes."""
    classes = np.unique(np.concatenate([y_true, y_pred]))
    n_classes = len(classes)
    matrix = np.zeros((n_classes, n_classes), dtype=int)

    for i, true_class in enumerate(classes):
        for j, pred_class in enumerate(classes):
            matrix[i, j] = np.sum((y_true == true_class) & (y_pred == pred_class))

    return matrix


def cross_validation(classifier, X: np.ndarray, y: np.ndarray, k: int = 5) -> List[float]:
    """k-fold cross-validation; returns the accuracy of each fold."""
    n_samples = X.shape[0]
    fold_size = n_samples // k
    scores = []

    for i in range(k):
        start_idx = i * fold_size
        end_idx = start_idx + fold_size if i < k - 1 else n_samples

        test_indices = np.arange(start_idx, end_idx)
        train_indices = np.concatenate([np.arange(0, start_idx), np.arange(end_idx, n_samples)])

        X_train_fold, X_test_fold = X[train_indices], X[test_indices]
        y_train_fold, y_test_fold = y[train_indices], y[test_indices]

        classifier.fit(X_train_fold, y_train_fold)
        y_pred = classifier.predict(X_test_fold)
        scores.append(accuracy_score(y_test_fold, y_pred))

    return scores
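Usage sketch (illustrative, not part of the commit): chaining the utilities above into a small pipeline; the seed, shapes, and label rule are arbitrary assumptions.

import numpy as np
from utils import train_test_split, normalize_data, cross_validation
from classifiers import KNNClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(90, 4))
y = (X[:, 0] > 0).astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3)
X_tr, X_te = normalize_data(X_tr, X_te)
print(cross_validation(KNNClassifier(k=3), X_tr, y_tr, k=3))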