ml-homework/feature_extraction.py

import numpy as np
from typing import Tuple
class PCA:
    """Principal Component Analysis."""

    def __init__(self, n_components: int):
        self.n_components = n_components
        self.components_ = None
        self.mean_ = None
        self.explained_variance_ratio_ = None

    def fit(self, X: np.ndarray) -> 'PCA':
        """Fit the PCA model on X of shape (n_samples, n_features)."""
        self.mean_ = np.mean(X, axis=0)
        X_centered = X - self.mean_
        # Compute the covariance matrix of the centered data
        cov_matrix = np.cov(X_centered, rowvar=False)
        # Eigendecomposition (eigh, since the covariance matrix is symmetric)
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
        # Sort by eigenvalue in descending order
        idx = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]
        # Keep the top n_components principal components (one per row)
        self.components_ = eigenvectors[:, :self.n_components].T
        self.explained_variance_ratio_ = eigenvalues[:self.n_components] / np.sum(eigenvalues)
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Project X onto the principal components."""
        X_centered = X - self.mean_
        return np.dot(X_centered, self.components_.T)

    def fit_transform(self, X: np.ndarray) -> np.ndarray:
        """Fit the model, then transform X."""
        return self.fit(X).transform(X)
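
# A minimal usage sketch for PCA. The random data and the choice of
# n_components below are illustrative assumptions, not part of the assignment:
#
#   rng = np.random.default_rng(0)
#   X = rng.normal(size=(100, 5))
#   pca = PCA(n_components=2)
#   X_reduced = pca.fit_transform(X)        # shape: (100, 2)
#   print(pca.explained_variance_ratio_)    # variance explained by each component
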
class FeatureSelector:
    """Feature selection based on information gain."""

    def __init__(self, k: int):
        self.k = k
        self.selected_features_ = None

    def _entropy(self, y: np.ndarray) -> float:
        """Compute the Shannon entropy of a label vector."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities + 1e-10))

    def _information_gain(self, X_feature: np.ndarray, y: np.ndarray) -> float:
        """Compute the information gain of a single feature with respect to y."""
        # Discretize continuous features into 10 equal-width bins
        if len(np.unique(X_feature)) > 10:
            bins = np.linspace(np.min(X_feature), np.max(X_feature), 11)
            X_feature = np.digitize(X_feature, bins)
        total_entropy = self._entropy(y)
        values, counts = np.unique(X_feature, return_counts=True)
        # Weighted average entropy of the label subsets induced by each feature value
        weighted_entropy = 0.0
        for value, count in zip(values, counts):
            subset_y = y[X_feature == value]
            weighted_entropy += (count / len(y)) * self._entropy(subset_y)
        return total_entropy - weighted_entropy

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'FeatureSelector':
        """Score every feature and keep the indices of the top k."""
        n_features = X.shape[1]
        feature_scores = []
        for i in range(n_features):
            score = self._information_gain(X[:, i], y)
            feature_scores.append((i, score))
        # Sort by information gain, descending
        feature_scores.sort(key=lambda x: x[1], reverse=True)
        self.selected_features_ = [idx for idx, _ in feature_scores[:self.k]]
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Keep only the selected feature columns."""
        return X[:, self.selected_features_]

    def fit_transform(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Fit the selector, then transform X."""
        return self.fit(X, y).transform(X)
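

if __name__ == "__main__":
    # Minimal end-to-end sketch on synthetic data. The data and the chosen
    # n_components / k values are illustrative assumptions, not part of the
    # assignment spec.
    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 10))
    y = (X[:, 0] + X[:, 3] > 0).astype(int)  # labels depend on features 0 and 3

    pca = PCA(n_components=3)
    X_pca = pca.fit_transform(X)
    print("PCA output shape:", X_pca.shape)                    # (200, 3)
    print("Explained variance ratio:", pca.explained_variance_ratio_)

    selector = FeatureSelector(k=2)
    X_selected = selector.fit_transform(X, y)
    print("Selected feature indices:", selector.selected_features_)
    print("Selected output shape:", X_selected.shape)          # (200, 2)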