import numpy as np
from typing import List, Tuple

class ImprovedBPNetwork:
    """Fully connected back-propagation network with an adaptive learning rate.

    Every layer (hidden and output) uses a sigmoid activation. Training is
    full-batch gradient descent. After each epoch the learning rate is
    shrunk when the loss went up and grown when the loss has plateaued,
    always staying within ``[0.01 * initial_lr, 10 * initial_lr]``.

    Note on the objective: the output error term used for the update is
    ``output - target`` (the gradient of sigmoid cross-entropy with respect
    to the pre-activation), while the value stored in ``loss_history`` and
    used for learning-rate adaptation and the convergence test is the mean
    squared error.

    Attributes:
        hidden_layers: Width of each hidden layer, input side first.
        initial_lr: Learning rate given at construction; anchors the clip range.
        learning_rate: Current (adapted) learning rate.
        max_epochs: Upper bound on the number of training epochs.
        tolerance: Training stops when the absolute loss change between two
            consecutive epochs falls below this value.
        weights: One ``(fan_in, fan_out)`` matrix per layer.
        biases: One ``(1, fan_out)`` row vector per layer.
        loss_history: Mean squared error recorded at every epoch of the most
            recent ``fit`` call.
        classes_: Sorted unique labels seen by ``fit`` when ``y`` was 1-D,
            otherwise ``None``.
    """

    def __init__(self, hidden_layers: List[int], learning_rate: float = 0.01,
                 max_epochs: int = 1000, tolerance: float = 1e-6):
        self.hidden_layers = hidden_layers
        self.initial_lr = learning_rate
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.tolerance = tolerance
        self.weights = []
        self.biases = []
        self.loss_history = []
        self.classes_ = None

    def _sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Return the element-wise logistic sigmoid of ``x``."""
        # Clip first so that np.exp cannot overflow for large |x|.
        x = np.clip(x, -500, 500)
        return 1 / (1 + np.exp(-x))

    def _sigmoid_derivative(self, x: np.ndarray) -> np.ndarray:
        """Return the sigmoid derivative evaluated at pre-activation ``x``."""
        s = self._sigmoid(x)
        return s * (1 - s)

    def _initialize_weights(self, input_size: int, output_size: int):
        """Create fresh weights and biases for the configured architecture.

        Weights are drawn from a zero-mean normal distribution with standard
        deviation ``sqrt(2 / (fan_in + fan_out))`` (Xavier/Glorot scaling);
        biases start at zero. Any previously stored parameters are discarded.

        Args:
            input_size: Number of input features.
            output_size: Number of output units.
        """
        self.weights = []
        self.biases = []

        # list(...) lets hidden_layers be any sequence (e.g. a tuple).
        layers = [input_size] + list(self.hidden_layers) + [output_size]

        for fan_in, fan_out in zip(layers[:-1], layers[1:]):
            std = np.sqrt(2.0 / (fan_in + fan_out))
            self.weights.append(np.random.normal(0, std, (fan_in, fan_out)))
            self.biases.append(np.zeros((1, fan_out)))

    def _forward_pass(self, X: np.ndarray) -> List[np.ndarray]:
        """Propagate ``X`` through the network.

        Args:
            X: Input matrix with one sample per row.

        Returns:
            A list whose first element is ``X`` itself followed by the
            sigmoid output of every layer; the last element is the network
            output.
        """
        activations = [X]

        for w, b in zip(self.weights, self.biases):
            z = np.dot(activations[-1], w) + b
            activations.append(self._sigmoid(z))

        return activations

    def _backward_pass(self, X: np.ndarray, y: np.ndarray, activations: List[np.ndarray]) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Compute batch-averaged gradients for all weights and biases.

        Args:
            X: Input matrix; only its row count (batch size) is used.
            y: Target matrix with the same shape as the network output.
            activations: The list returned by ``_forward_pass`` for ``X``.

        Returns:
            ``(dw, db)`` where ``dw[i]`` and ``db[i]`` have the same shapes
            as ``weights[i]`` and ``biases[i]``.
        """
        m = X.shape[0]
        n_layers = len(self.weights)
        dw = [None] * n_layers
        db = [None] * n_layers

        # Error term at the output layer (see the class docstring).
        delta = activations[-1] - y

        # Walk from the output layer back towards the input.
        for i in reversed(range(n_layers)):
            dw[i] = np.dot(activations[i].T, delta) / m
            db[i] = np.mean(delta, axis=0, keepdims=True)

            if i > 0:
                # Push the error through weights[i] and then through the
                # sigmoid of the layer that produced activations[i]. Its
                # derivative is a * (1 - a), which has the same shape as
                # the back-propagated error, so layers of different widths
                # work correctly.
                a = activations[i]
                delta = np.dot(delta, self.weights[i].T) * (a * (1.0 - a))

        return dw, db

    def _adaptive_learning_rate(self, epoch: int, current_loss: float, prev_loss: float):
        """Adjust ``self.learning_rate`` from the most recent loss change.

        From the second epoch onward the rate is multiplied by 0.9 when the
        loss increased, and by 1.05 when the relative improvement was below
        0.1 %. The result is always clipped to
        ``[0.01 * initial_lr, 10 * initial_lr]``.

        Args:
            epoch: Zero-based index of the current epoch.
            current_loss: Loss measured in the current epoch.
            prev_loss: Loss measured in the previous epoch.
        """
        if epoch > 0:
            if current_loss > prev_loss:
                # Loss went up: take smaller steps.
                self.learning_rate *= 0.9
            elif prev_loss > 0 and (prev_loss - current_loss) / prev_loss < 0.001:
                # Loss barely moved: take larger steps. The prev_loss > 0
                # guard avoids dividing by zero once the loss is exactly 0.
                self.learning_rate *= 1.05

        # Keep the rate within a fixed band around the initial value.
        self.learning_rate = float(np.clip(self.learning_rate,
                                           self.initial_lr * 0.01,
                                           self.initial_lr * 10))

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Train the network on ``X`` and ``y`` from a fresh initialization.

        Every call re-initializes the weights, clears ``loss_history`` and
        resets the learning rate to ``initial_lr``. Progress is printed every
        100 epochs and when the convergence criterion is met.

        Args:
            X: 2-D array-like of shape ``(n_samples, n_features)``.
            y: Either a 1-D array-like of class labels, which is one-hot
                encoded over its sorted unique values, or a 2-D target
                matrix of shape ``(n_samples, n_outputs)`` used as is.

        Raises:
            ValueError: If ``X`` is not a non-empty 2-D array, or ``y`` does
                not describe one target row per sample.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError("X must be a non-empty 2-D array")

        if y.ndim == 1:
            # One-hot encode; column j corresponds to classes_[j].
            self.classes_ = np.unique(y)
            y = (y[:, None] == self.classes_[None, :]).astype(float)
        else:
            self.classes_ = None

        if y.ndim != 2 or y.shape[0] != X.shape[0]:
            raise ValueError("y must provide exactly one target row per sample in X")

        self._initialize_weights(X.shape[1], y.shape[1])

        # Drop state left over from any earlier fit on this instance.
        self.loss_history = []
        self.learning_rate = self.initial_lr

        prev_loss = float('inf')

        for epoch in range(self.max_epochs):
            activations = self._forward_pass(X)

            loss = np.mean((activations[-1] - y) ** 2)
            self.loss_history.append(loss)

            # Adapt the step size before applying this epoch's update.
            self._adaptive_learning_rate(epoch, loss, prev_loss)

            dw, db = self._backward_pass(X, y, activations)

            for i, (grad_w, grad_b) in enumerate(zip(dw, db)):
                self.weights[i] -= self.learning_rate * grad_w
                self.biases[i] -= self.learning_rate * grad_b

            if abs(prev_loss - loss) < self.tolerance:
                print(f"训练在第{epoch+1}轮收敛")
                break

            prev_loss = loss

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.6f}, LR: {self.learning_rate:.6f}")

    def _require_fitted(self):
        """Raise ``RuntimeError`` when the network has no parameters yet."""
        if not self.weights:
            raise RuntimeError("The network has no weights; call fit() first")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the predicted class for every row of ``X``.

        When ``fit`` received 1-D labels, the original label values are
        returned; otherwise the index of the largest output unit is returned.

        Raises:
            RuntimeError: If the network has not been fitted.
        """
        self._require_fitted()
        winners = np.argmax(self._forward_pass(np.asarray(X, dtype=float))[-1], axis=1)
        classes = getattr(self, "classes_", None)
        if classes is not None:
            return classes[winners]
        return winners

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the raw sigmoid outputs of the last layer for ``X``.

        Each output unit is an independent sigmoid, so rows are not
        normalized to sum to one.

        Raises:
            RuntimeError: If the network has not been fitted.
        """
        self._require_fitted()
        return self._forward_pass(np.asarray(X, dtype=float))[-1]

class StandardBPNetwork:
    """Fully connected back-propagation network with a fixed learning rate.

    Every layer (hidden and output) uses a sigmoid activation and training
    is full-batch gradient descent with a constant step size.

    Note on the objective: the output error term used for the update is
    ``output - target`` (the gradient of sigmoid cross-entropy with respect
    to the pre-activation), while the value stored in ``loss_history`` and
    used for the convergence test is the mean squared error.

    Attributes:
        hidden_layers: Width of each hidden layer, input side first.
        learning_rate: Constant gradient-descent step size.
        max_epochs: Upper bound on the number of training epochs.
        tolerance: Training stops when the absolute loss change between two
            consecutive epochs falls below this value.
        weights: One ``(fan_in, fan_out)`` matrix per layer.
        biases: One ``(1, fan_out)`` row vector per layer.
        loss_history: Mean squared error recorded at every epoch of the most
            recent ``fit`` call.
        classes_: Sorted unique labels seen by ``fit`` when ``y`` was 1-D,
            otherwise ``None``.
    """

    def __init__(self, hidden_layers: List[int], learning_rate: float = 0.01,
                 max_epochs: int = 1000, tolerance: float = 1e-6):
        self.hidden_layers = hidden_layers
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        self.tolerance = tolerance
        self.weights = []
        self.biases = []
        self.loss_history = []
        self.classes_ = None

    def _sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Return the element-wise logistic sigmoid of ``x``."""
        # Clip first so that np.exp cannot overflow for large |x|.
        x = np.clip(x, -500, 500)
        return 1 / (1 + np.exp(-x))

    def _sigmoid_derivative(self, x: np.ndarray) -> np.ndarray:
        """Return the sigmoid derivative evaluated at pre-activation ``x``."""
        s = self._sigmoid(x)
        return s * (1 - s)

    def _initialize_weights(self, input_size: int, output_size: int):
        """Create fresh weights and biases for the configured architecture.

        Weights are drawn from a zero-mean normal distribution with standard
        deviation ``sqrt(2 / (fan_in + fan_out))`` (Xavier/Glorot scaling);
        biases start at zero. Any previously stored parameters are discarded.

        Args:
            input_size: Number of input features.
            output_size: Number of output units.
        """
        self.weights = []
        self.biases = []

        # list(...) lets hidden_layers be any sequence (e.g. a tuple).
        layers = [input_size] + list(self.hidden_layers) + [output_size]

        for fan_in, fan_out in zip(layers[:-1], layers[1:]):
            std = np.sqrt(2.0 / (fan_in + fan_out))
            self.weights.append(np.random.normal(0, std, (fan_in, fan_out)))
            self.biases.append(np.zeros((1, fan_out)))

    def _forward_pass(self, X: np.ndarray) -> List[np.ndarray]:
        """Propagate ``X`` through the network.

        Args:
            X: Input matrix with one sample per row.

        Returns:
            A list whose first element is ``X`` itself followed by the
            sigmoid output of every layer; the last element is the network
            output.
        """
        activations = [X]

        for w, b in zip(self.weights, self.biases):
            z = np.dot(activations[-1], w) + b
            activations.append(self._sigmoid(z))

        return activations

    def _backward_pass(self, X: np.ndarray, y: np.ndarray, activations: List[np.ndarray]) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Compute batch-averaged gradients for all weights and biases.

        Args:
            X: Input matrix; only its row count (batch size) is used.
            y: Target matrix with the same shape as the network output.
            activations: The list returned by ``_forward_pass`` for ``X``.

        Returns:
            ``(dw, db)`` where ``dw[i]`` and ``db[i]`` have the same shapes
            as ``weights[i]`` and ``biases[i]``.
        """
        m = X.shape[0]
        n_layers = len(self.weights)
        dw = [None] * n_layers
        db = [None] * n_layers

        # Error term at the output layer (see the class docstring).
        delta = activations[-1] - y

        # Walk from the output layer back towards the input.
        for i in reversed(range(n_layers)):
            dw[i] = np.dot(activations[i].T, delta) / m
            db[i] = np.mean(delta, axis=0, keepdims=True)

            if i > 0:
                # Push the error through weights[i] and then through the
                # sigmoid of the layer that produced activations[i]. Its
                # derivative is a * (1 - a), which has the same shape as
                # the back-propagated error, so layers of different widths
                # work correctly.
                a = activations[i]
                delta = np.dot(delta, self.weights[i].T) * (a * (1.0 - a))

        return dw, db

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Train the network on ``X`` and ``y`` from a fresh initialization.

        Every call re-initializes the weights and clears ``loss_history``.
        Progress is printed every 100 epochs and when the convergence
        criterion is met.

        Args:
            X: 2-D array-like of shape ``(n_samples, n_features)``.
            y: Either a 1-D array-like of class labels, which is one-hot
                encoded over its sorted unique values, or a 2-D target
                matrix of shape ``(n_samples, n_outputs)`` used as is.

        Raises:
            ValueError: If ``X`` is not a non-empty 2-D array, or ``y`` does
                not describe one target row per sample.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError("X must be a non-empty 2-D array")

        if y.ndim == 1:
            # One-hot encode; column j corresponds to classes_[j].
            self.classes_ = np.unique(y)
            y = (y[:, None] == self.classes_[None, :]).astype(float)
        else:
            self.classes_ = None

        if y.ndim != 2 or y.shape[0] != X.shape[0]:
            raise ValueError("y must provide exactly one target row per sample in X")

        self._initialize_weights(X.shape[1], y.shape[1])

        # Drop the history left over from any earlier fit on this instance.
        self.loss_history = []

        prev_loss = float('inf')

        for epoch in range(self.max_epochs):
            activations = self._forward_pass(X)
            loss = np.mean((activations[-1] - y) ** 2)
            self.loss_history.append(loss)

            dw, db = self._backward_pass(X, y, activations)

            for i, (grad_w, grad_b) in enumerate(zip(dw, db)):
                self.weights[i] -= self.learning_rate * grad_w
                self.biases[i] -= self.learning_rate * grad_b

            if abs(prev_loss - loss) < self.tolerance:
                print(f"标准BP训练在第{epoch+1}轮收敛")
                break

            prev_loss = loss

            if epoch % 100 == 0:
                print(f"Standard BP Epoch {epoch}, Loss: {loss:.6f}")

    def _require_fitted(self):
        """Raise ``RuntimeError`` when the network has no parameters yet."""
        if not self.weights:
            raise RuntimeError("The network has no weights; call fit() first")

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the predicted class for every row of ``X``.

        When ``fit`` received 1-D labels, the original label values are
        returned; otherwise the index of the largest output unit is returned.

        Raises:
            RuntimeError: If the network has not been fitted.
        """
        self._require_fitted()
        winners = np.argmax(self._forward_pass(np.asarray(X, dtype=float))[-1], axis=1)
        classes = getattr(self, "classes_", None)
        if classes is not None:
            return classes[winners]
        return winners

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return the raw sigmoid outputs of the last layer for ``X``.

        Each output unit is an independent sigmoid, so rows are not
        normalized to sum to one.

        Raises:
            RuntimeError: If the network has not been fitted.
        """
        self._require_fitted()
        return self._forward_pass(np.asarray(X, dtype=float))[-1]