import numpy as np
from typing import Tuple


class PCA:
    """Principal component analysis via eigendecomposition of the covariance.

    After ``fit`` the following attributes are populated:

    * ``components_`` -- array of shape ``(n_kept, n_features)`` whose rows are
      the principal axes, ordered by decreasing eigenvalue.
    * ``mean_`` -- per-feature mean of the training data.
    * ``explained_variance_ratio_`` -- fraction of the total variance carried
      by each kept component.

    ``n_kept`` is ``min(n_components, n_features)`` because the components are
    obtained by slicing the eigenvector matrix.
    """

    def __init__(self, n_components: int):
        self.n_components = n_components
        self.components_ = None
        self.mean_ = None
        self.explained_variance_ratio_ = None

    def fit(self, X: np.ndarray) -> 'PCA':
        """Fit the model on ``X`` and return ``self``.

        Args:
            X: 2-D array with one row per sample and one column per feature.

        Raises:
            ValueError: if ``X`` is not 2-D or has fewer than two samples
                (the sample covariance is undefined for a single sample).
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got ndim={X.ndim}"
            )
        if X.shape[0] < 2:
            raise ValueError("PCA needs at least 2 samples to estimate covariance")

        self.mean_ = np.mean(X, axis=0)
        X_centered = X - self.mean_

        # np.cov collapses the result to a 0-d array when there is a single
        # feature; eigh needs a square 2-D matrix, so restore the (1, 1) shape.
        cov_matrix = np.atleast_2d(np.cov(X_centered, rowvar=False))

        # eigh is the right solver for a symmetric matrix; it returns the
        # eigenvalues in ascending order, so reverse them.
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
        order = np.argsort(eigenvalues)[::-1]
        # A covariance matrix is positive semi-definite; tiny negative
        # eigenvalues are round-off and would distort the ratios.
        eigenvalues = np.clip(eigenvalues[order], 0.0, None)
        eigenvectors = eigenvectors[:, order]

        # Keep the leading eigenvectors, stored one component per row.
        self.components_ = eigenvectors[:, :self.n_components].T

        kept = eigenvalues[:self.n_components]
        total_variance = np.sum(eigenvalues)
        if total_variance > 0:
            self.explained_variance_ratio_ = kept / total_variance
        else:
            # Constant data: there is no variance to explain; avoid 0/0 = NaN.
            self.explained_variance_ratio_ = np.zeros_like(kept)
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Project ``X`` onto the fitted principal components.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.components_ is None or self.mean_ is None:
            raise RuntimeError("PCA is not fitted yet; call fit() first")
        X_centered = np.asarray(X) - self.mean_
        return np.dot(X_centered, self.components_.T)

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """Fit on ``X`` and return its projection."""
        return self.fit(X).transform(X)


class FeatureSelector:
    """Select the ``k`` features with the highest information gain w.r.t. ``y``.

    Features with more than ``_MAX_DISCRETE_VALUES`` distinct values are
    treated as continuous and split into ``_N_BINS`` equal-width bins before
    the gain is computed.
    """

    # A feature with more distinct values than this is binned first.
    _MAX_DISCRETE_VALUES = 10
    # Number of equal-width bins used for continuous features.
    _N_BINS = 10

    def __init__(self, k: int):
        self.k = k
        self.selected_features_ = None

    def _entropy(self, y: np.ndarray) -> float:
        """Return the Shannon entropy (in bits) of the label array ``y``.

        An empty array has entropy 0.
        """
        _, counts = np.unique(y, return_counts=True)
        if counts.size == 0:
            return 0.0
        probabilities = counts / counts.sum()
        # Every probability is > 0 (np.unique only reports observed labels),
        # so log2 is safe without an epsilon. "+ 0.0" turns -0.0 into 0.0.
        return float(-np.sum(probabilities * np.log2(probabilities))) + 0.0

    def _discretize(self, X_feature: np.ndarray) -> np.ndarray:
        """Bin a continuous feature into ``_N_BINS`` equal-width bins.

        Features with at most ``_MAX_DISCRETE_VALUES`` distinct values are
        returned unchanged. Otherwise integer bin labels ``0 .. _N_BINS - 1``
        are returned.
        """
        if len(np.unique(X_feature)) <= self._MAX_DISCRETE_VALUES:
            return X_feature
        edges = np.linspace(
            np.min(X_feature), np.max(X_feature), self._N_BINS + 1
        )
        # Only the interior edges are passed to digitize: including the outer
        # edges would put the maximum value alone in an extra bin.
        return np.digitize(X_feature, edges[1:-1])

    def _information_gain(self, X_feature: np.ndarray, y: np.ndarray) -> float:
        """Return the information gain of ``y`` given one feature column.

        The gain is ``H(y)`` minus the entropy of ``y`` within each group of
        equal (possibly binned) feature values, weighted by group size.
        """
        y = np.asarray(y)
        X_feature = self._discretize(np.asarray(X_feature))

        total_entropy = self._entropy(y)
        values, counts = np.unique(X_feature, return_counts=True)
        n_samples = len(y)

        weighted_entropy = 0.0
        for value, count in zip(values, counts):
            subset_y = y[X_feature == value]
            weighted_entropy += (count / n_samples) * self._entropy(subset_y)
        return total_entropy - weighted_entropy

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'FeatureSelector':
        """Score every column of ``X`` against ``y`` and keep the best ``k``.

        ``selected_features_`` is set to a list of column indices ordered by
        decreasing information gain; ties keep the lower column index first.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        scores = [
            self._information_gain(X[:, i], y) for i in range(X.shape[1])
        ]
        # sorted() is stable even with reverse=True, so tied features stay in
        # their original column order.
        ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
        self.selected_features_ = ranked[:self.k]
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Return only the selected columns of ``X``.

        Raises:
            RuntimeError: if called before ``fit``. Indexing with the unset
                ``None`` would otherwise silently add a new axis.
        """
        if self.selected_features_ is None:
            raise RuntimeError(
                "FeatureSelector is not fitted yet; call fit() first"
            )
        return np.asarray(X)[:, self.selected_features_]

    def fit_transform(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Fit on ``(X, y)`` and return the selected columns of ``X``."""
        return self.fit(X, y).transform(X)