import numpy as np


class PCA:
    """Principal component analysis (PCA) via eigendecomposition of the covariance matrix."""

    def __init__(self, n_components: int):
        self.n_components = n_components
        self.components_ = None
        self.mean_ = None
        self.explained_variance_ratio_ = None

    def fit(self, X: np.ndarray) -> 'PCA':
        """Fit the PCA model on X (shape: n_samples x n_features)."""
        # Center the data
        self.mean_ = np.mean(X, axis=0)
        X_centered = X - self.mean_

        # Compute the covariance matrix of the features
        cov_matrix = np.cov(X_centered, rowvar=False)

        # Eigendecomposition (eigh is appropriate for the symmetric covariance matrix)
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # Sort by eigenvalue in descending order
        idx = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]

        # Keep the top n_components principal components
        self.components_ = eigenvectors[:, :self.n_components].T
        self.explained_variance_ratio_ = eigenvalues[:self.n_components] / np.sum(eigenvalues)

        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Project X onto the principal components."""
        X_centered = X - self.mean_
        return np.dot(X_centered, self.components_.T)

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """Fit the model, then transform X."""
        return self.fit(X).transform(X)


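# Example PCA usage (an illustrative sketch only; the random seed and data shape
# below are assumptions, not part of this module):
#
#     rng = np.random.default_rng(0)
#     X = rng.normal(size=(100, 5))
#     pca = PCA(n_components=2)
#     X_reduced = pca.fit_transform(X)       # shape (100, 2)
#     print(pca.explained_variance_ratio_)   # variance explained by each component

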
class FeatureSelector:
    """Feature selection based on information gain."""

    def __init__(self, k: int):
        self.k = k
        self.selected_features_ = None

    def _entropy(self, y: np.ndarray) -> float:
        """Compute the Shannon entropy of the labels."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities + 1e-10))

    def _information_gain(self, X_feature: np.ndarray, y: np.ndarray) -> float:
        """Compute the information gain of a single feature with respect to y."""
        # Discretize continuous features into roughly 10 bins
        if len(np.unique(X_feature)) > 10:
            bins = np.linspace(np.min(X_feature), np.max(X_feature), 11)
            X_feature = np.digitize(X_feature, bins)

        total_entropy = self._entropy(y)

        # Weighted average of the label entropy within each feature value
        values, counts = np.unique(X_feature, return_counts=True)
        weighted_entropy = 0.0

        for value, count in zip(values, counts):
            subset_y = y[X_feature == value]
            weighted_entropy += (count / len(y)) * self._entropy(subset_y)

        return total_entropy - weighted_entropy

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'FeatureSelector':
        """Score every feature and keep the indices of the top k."""
        n_features = X.shape[1]
        feature_scores = []

        for i in range(n_features):
            score = self._information_gain(X[:, i], y)
            feature_scores.append((i, score))

        # Sort by information gain, descending
        feature_scores.sort(key=lambda x: x[1], reverse=True)
        self.selected_features_ = [idx for idx, _ in feature_scores[:self.k]]

        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Keep only the selected feature columns."""
        return X[:, self.selected_features_]

    def fit_transform(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Fit the selector, then transform X."""
        return self.fit(X, y).transform(X)
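

# A minimal end-to-end sketch on synthetic data (the shapes, seed, and the rule
# used to build the labels are assumptions for illustration, not part of the module).
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 6))
    # Tie the labels loosely to the first feature so it carries information gain
    y = (X[:, 0] > 0).astype(int)

    # Keep the 3 features with the highest information gain
    selector = FeatureSelector(k=3)
    X_selected = selector.fit_transform(X, y)
    print("Selected feature indices:", selector.selected_features_)

    # Reduce the selected features to 2 principal components
    pca = PCA(n_components=2)
    X_reduced = pca.fit_transform(X_selected)
    print("Reduced shape:", X_reduced.shape)
    print("Explained variance ratio:", pca.explained_variance_ratio_)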